Python抓取电影来了(dianying.fm)
/* title: Python抓取电影来了(dianying.fm) | blog: http://yxmhero1989.blog.163.com/blog/static/11215795620132441055127/ | author: insun */
小虾的电影FM(readself.com)在2月中旬之前还是老版本的布局,当时写的代码是这样的:
#!/usr/bin/env python#-*- coding=utf-8 -*-import urllib2,urllib,reimport pymongoimport sys,osimport jsonimport chardetreload(sys)sys.setdefaultencoding('utf-8')db = pymongo.Connection().testif(os.path.exists('Allsmall')==False):os.mkdir('Allsmall')if(os.path.exists('Allbig')==False):os.mkdir('Allbig')#http://dianying.fm/genre/%E7%88%B1%E6%83%85 调整后的分类是http://movie.readself.com/category/filter_play-genre_%E5%96%9C%E5%89%A7#下拉加载和点击加载是JS附加了一段json 要json {"result": -1, "error": "unknown error"}#一个json加载28个 29*28 = 812 > 500for i in range(1,49):#http://movie.readself.com/print "-----------第"+str(i)+"个链接开始抓取--------------"url = 'http://movie.readself.com/reflect/category/eyJzb3J0IjogIiIsICJmaWx0ZXIiOiAiIiwgImNhdGFsbCI6ICIiLCAia2V5IjogIiIsICJ5ZWFyIjogIiIsICJnZW5yZSI6ICIiLCAicmVnaW9uIjogIiIsICJzaG93IjogIiIsICJjbGFzcyI6ICIifQ==/'+str(i)try:proxies = {'http':'http://219.239.26.23:80'}#或者proxies = {'':'211.167.112.14:80'}opener = urllib.FancyURLopener(proxies)f = opener.open(url)html = f.read()print htmlsys.exit(-1)#print chardet.detect(html)#{'confidence': 1.0, 'encoding': 'ascii'}#html = html.replace('\\n','') #这个导致了28个变成了5个 说明后来的正则已经不适合下面的了html = json.loads(html,encoding='utf-8')html = str(html)reg = re.compile(r'<div class="x-movie-entry">.+?<a href="(.*?)".+?title="(.*?)">.+?<img.+?src="(.*?)">.+?</a>.+?<div class="x-movie-caption">.+?</div>',re.S)#reg = re.compile(r'<div class=\\\"x-movie-entry\\\">.+?<a href=\\\"(.*?)\\\".+?title=\\\"(.*?)\\\">.+?<img.+?src=\\\"(.*?)\\\">.+?</a>.+?<div class=\\\"x-movie-caption\\\">.+?</div>',re.S)groups = re.findall(reg,html)print "*****----本链接抓取了"+str(len(groups))+"个影片信息****----------"for detail in groups:shortTitle = detail[1]#三傻大闹宝莱坞 \u4e09\u50bb\u5927\u95f9\u5b9d\u83b1\u575eshortTitle = shortTitle.decode('unicode_escape').encode('utf-8')print shortTitle + ' 
下载中'#\u8ba9\u5b50\u5f39\u98de \u后边的四位十六进制数是汉字的UNICODE编码#\xe8\xae\xa9\xe5\xad\x90\xe5\xbc\xb9\xe9\xa3\x9e gb2312 编码#b'\xDE\xD5\xB4\xF8' 二进制smallImage = detail[2]detailUrl = 'http://movie.readself.com'+detail[0]text = opener.open(detailUrl).read()IMDB_stars_need = '.+?<td class="span2"><span class="x-m-label">IMDB评分</span></td>.+?<b>(.*?)</b>'IMDB_link_need = '.+?<a rel="nofollow" href="(.*?)" target="_blank">IMDB链接</a>.+?'alias_need = '.+?<td class="span2"><span class="x-m-label">别名</span></td>.+?<td>(.*?)</td>'if text.count('别名') == 0:alias_need = '(.*?)'if text.count('IMDB评分') == 0:IMDB_stars_need = '(.*?)'if text.count('IMDB链接') == 0 :IMDB_link_need = '(.*?)'detailReg = re.compile('<div class="x-m-title">.+?<h3>(.*?)<span class="x-m-year">(.*?)</span>.+?</h3>.+?</div>'+'.+?<div class="x-m-poster span3".+?<img.+?src="(.*?)">.+?</div>'+'.+?<td class="span2"><span class="x-m-label">导演</span></td>.+?<td>(.*?)</td>'+'.+?<td class="span2"><span class="x-m-label">主演</span></td>.+?<td>(.*?)</td>'+'.+?<td class="span2"><span class="x-m-label">类型</span></td>.+?<td>(.*?)</td>'+'.+?<td class="span2"><span class="x-m-label">地区</span></td>.+?<td>(.*?)</td>'+'.+?<td class="span2"><span class="x-m-label">上映时间</span></td>.+?<td>(.*?)</td>'+'.+?<td class="span2"><span class="x-m-label">片长</span></td>.+?<td&g补充:Web开发 , Python ,
上一个:使用Python读取和写入CSV文件
下一个:Python与Django的时区问题