Python Web Data Scraping (xpath version)
Compared with the earlier "regular expression version", this version has the following main improvements:
(1) Fetched HTML pages are cached in SQLite, which greatly speeds up reprocessing of the data. The first run takes about 6 hours; after that, the whole job finishes in roughly 3 minutes.
(2) xpath replaces the regular expressions previously used for HTML parsing. xpath makes locating elements much simpler and more convenient, and lxml's HTML parser automatically repairs broken markup (see the short demo after this list). xpath is really powerful!
(3) Duplicate results are removed.
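As a quick illustration of point (2), here is a minimal sketch (the sample markup is mine, not from the target site) of lxml repairing broken HTML:

# coding:utf-8
from lxml import etree

broken = "<table class='n_table'><tr><td>Toronto"   # no closing tags at all
tree = etree.HTML(broken)                           # lxml builds a repaired tree
print etree.tostring(tree)
# -> <html><body><table class="n_table"><tr><td>Toronto</td></tr></table></body></html>
print tree.xpath("//table[@class='n_table']//td/text()")
# -> ['Toronto']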
The program code is as follows:
# coding:utf-8
# Practice of scraping web data with xpath
# by redice 2010.11.05
import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import urllib2
from urllib2 import URLError, HTTPError
import zlib
import sqlite3
try:
    import cPickle as pickle
except ImportError:
    import pickle
conn = sqlite3.connect("html_cache.db")
conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace')
curs = conn.cursor()
# if the htmls table does not exist, create it
#curs.execute("CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE, content BLOB, size INTEGER);")
curs.execute("CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE, content TEXT, size INTEGER);")
conn.commit()
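# Note: the UNIQUE constraint on url ensures each page is cached at most once;
# inserting the same url twice would raise sqlite3.IntegrityError.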
def serialize(value):
    """convert an object to a string to save in the db (currently a pass-through)"""
    #return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))
    #return sqlite3.Binary(value)
    return value

def deserialize(value):
    """convert a string from the database back into an object (currently a pass-through)"""
    #return pickle.loads(zlib.decompress(value)) if value else value
    return value
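# The commented-out lines above are the compressed-pickle variant. A sketch of
# that round trip (it assumes the BLOB column from the commented-out CREATE
# TABLE statement, not the TEXT column actually used here):
#   blob  = sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))
#   value = pickle.loads(zlib.decompress(str(blob)))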
# Fetch the target html
def gethtml(url):
    """Fetch the target html"""
    try:
        # look up html_cache.db first
        curs.execute("select * from htmls where url=?;", (url,))
        row = curs.fetchone()
        if row:
            # cache hit: return the stored page
            #print deserialize(str(row[1]))
            return deserialize(str(row[1]))
        response = urllib2.urlopen(url)
        result = response.read()
        # cache miss: store the freshly fetched page
        curs.execute("insert into htmls values(?,?,?);", (url, serialize(result), len(result)))
        conn.commit()
        print "saved %s into html_cache.db" % (url)
        return result
    except URLError, e:
        if hasattr(e, 'reason'):
            print "Failed to reach a server."
            print "Reason: ", e.reason
            return None
        elif hasattr(e, 'code'):
            print "The server couldn't fulfill the request."
            print "Error code: ", e.code
            return None
    #except:
    #    return None
# end def gethtml
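# Example (hypothetical URL): the first call fetches over HTTP and caches the
# page; any later call with the same url is served straight from html_cache.db:
#   page = gethtml("http://example.com/")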
import re
# Fetch all strings matched by a pattern. Return a list.
def regexmatch(rule, text):
    """Fetch all strings matched by a pattern. Return a list."""
    p = re.compile(rule)
    return p.findall(text)
# end def regexmatch
# decodeHtmlEntity
def decodeHtmlEntity(s):
    """Normalize a scraped string: strip UTF-8 non-breaking spaces (0xC2 0xA0)."""
    if not s:
        return ''
    import locale
    enc = locale.getdefaultlocale()[1]
    result = s.decode(enc, "ignore").encode(enc).replace("\xc2\xa0", " ")
    return result
# end def decodeHtmlEntity
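# Example (assuming a UTF-8 locale): UTF-8 pages encode &nbsp; as the two bytes
# \xc2\xa0, which str.strip() does not remove; this helper converts them to
# plain spaces, e.g.
#   decodeHtmlEntity("Toronto\xc2\xa0") -> "Toronto "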
# final result
dining_db = []
total = 0
# debug switch
debug = 0
# Fetch menupalace.com's html
print "Fetching html from http://menupalace.com ..."
html = gethtml("http://menupalace.com")
from lxml import etree

if not html:
    print "Can't get the html from http://menupalace.com"
    sys.exit()
try:
    tree = etree.HTML(html)
    nodes = tree.xpath("//table[@class='n_table']")
except:
    # keep a copy of the page that failed to parse
    f = open("log.txt", "a")
    f.write(html)
    f.close()
    print "error to resolve the html http://menupalace.com"
    sys.exit()
for node in nodes:
    if debug and total >= 10:
        break
    n = node.xpath("./tr[1]/td[1]/img")
    # Fetch the country name (the text following the <img> element)
    country = ""
    if len(n) > 0:
        country = decodeHtmlEntity(n[0].tail)
        country = country.strip()
    # Fetch all links
    ls = node.xpath(".//a")
    # Walk through all the links
    for l in ls:
        if debug and total >= 10:
            break
        # city
        city = decodeHtmlEntity(l.text)
        city = city.strip()
        prelink = l.get("href")
        link = prelink + "restaurants/restaurants.aspx"
        #print "Fetching html from " + link + " ..."
        html = gethtml(link)
        if not html:
            print "Can't get the html from " + link
            continue
        try:
            subtree = etree.HTML(html)
            subnodes = subtree.xpath("//td[@class='frame_style_padding']")
        except:
            if debug:
                print "error to resolve the html " + link
            continue
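Whatever the rest of the loop extracts, the cache layer can be checked on its own. Here is a minimal standalone sketch (the helper below is mine, not part of the original program) that lists what html_cache.db currently holds:

# coding:utf-8
# List the pages cached in html_cache.db, largest first.
import sqlite3

conn = sqlite3.connect("html_cache.db")
curs = conn.cursor()
curs.execute("select url, size from htmls order by size desc;")
for url, size in curs.fetchall():
    print "%8d bytes  %s" % (size, url)
conn.close()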