当前位置:编程学习 > 网站相关 >>

唐茶字节社与Python的故事(I)

     万历年间 小太阳先森 先是下载了唐茶的App 书生意气 勤于读书 苦于没官府白花花的银两  然先生习得一手好python 
一步步走向不归路
礼记有云:凡事预则立 不预则废 故而先派个侦察兵去探测地形 这也是小螃蟹先生的常用手段
 
主页:http://www.tangcha.tc/books
新书上架 http://www.tangcha.tc/books/latest 82*5 = 410
排行榜 http://www.tangcha.tc/books/top 410
本期推荐 http://www.tangcha.tc/books/recommendation
唐茶字节社与Python的故事(I) - InSun - Minghacker is Insun
有书籍分类 出版社 和 书目排行 
找准下手点,射人先射马:拿排行榜下手 http://www.tangcha.tc/books/top ,页面结构简单,没什么变化,高手过招,3 秒钟解决。
 

 

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author :insun

import urllib2
import re

# Fetch the e-book list, capturing id, link, thumb_cover, title, author.
def get_booklist():
    """Download the ranking page and extract one tuple per listed book.

    Returns:
        The list produced by ``re.findall``: one 5-tuple of captured strings
        per ``<li class="book-cell">`` entry, in the order (span text, link
        href, image src, title, author). Empty when nothing matches.
    """
    url = 'http://www.tangcha.tc/books/top'

    # Close the HTTP response explicitly so the connection is not leaked.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    # re.S lets '.' cross newlines, so a single pattern spans a whole cell.
    # The unescaped '.' before </span> consumes the last character of the
    # span text, so the first group excludes it.
    reg = re.compile(r'<li class="book-cell">.+?<span>(.*?).</span>' +
                     '.+?<a href="(.*?)" class="cell-item boxable">' +
                     '.+?<img.+?src="(.*?)" />.+?</figure>' +
                     '.+?<p class="book-title">(.*?)</p>' +
                     '.+?<p class="book-author">(.*?)</p>', re.S)
    return reg.findall(html)
获得电子书列表,抓取 id,link,thumb_cover,title,author
这里的 id 是假 id,我们存为 bid;真 id 是 href = "/books/906" 后面的整型数字,我们存为 rid。
根据rid入手到下一页
一阵血杀 这是个强悍的敌人 善于变换和分身术 
    #href = "/books/906"  /books/626  /books/707  /books/472
    #/books/414  <img alt="Thumb_cover_thumbnail"   alt="Thumb_cover"
    #/books/596 没有<p class="book-publisher book-info-entry">
    ##/books/429  和 /books/424  <div class="no-related-items">  <div class="related-items">
斩了这些荆棘后 道路就明朗了 
这样初步代码如下:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author :insun

import urllib2
import re

# Fetch the e-book list, capturing id, link, thumb_cover, title, author.
def get_booklist():
    """Download the ranking page and extract one tuple per listed book.

    Returns:
        The list produced by ``re.findall``: one 5-tuple of captured strings
        per ``<li class="book-cell">`` entry, in the order (span text, link
        href, image src, title, author). Empty when nothing matches.
    """
    url = 'http://www.tangcha.tc/books/top'

    # Close the HTTP response explicitly so the connection is not leaked.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    # re.S lets '.' cross newlines, so a single pattern spans a whole cell.
    # The unescaped '.' before </span> consumes the last character of the
    # span text, so the first group excludes it.
    reg = re.compile(r'<li class="book-cell">.+?<span>(.*?).</span>' +
                     '.+?<a href="(.*?)" class="cell-item boxable">' +
                     '.+?<img.+?src="(.*?)" />.+?</figure>' +
                     '.+?<p class="book-title">(.*?)</p>' +
                     '.+?<p class="book-author">(.*?)</p>', re.S)
    return reg.findall(html)


# Fetch one e-book's detail page and capture its cover, title, author,
# publisher, douban rating and description sections.
def get_bookdetail(href):
    """Download a book detail page and extract its fields with one regex.

    Args:
        href: Site-relative path of the book page, e.g. "/books/906"; it is
            appended to 'http://www.tangcha.tc'.

    Returns:
        The list produced by ``re.findall``. Each match is a 6-tuple:
        cover image src, title, author, then one group each from the
        publisher, douban and description sub-patterns. Empty when the page
        does not match.
    """
    detailurl = 'http://www.tangcha.tc' + href

    # Close the HTTP response explicitly so the connection is not leaked.
    response = urllib2.urlopen(detailurl)
    try:
        detailhtml = response.read()
    finally:
        response.close()

    # Not every page has a publisher paragraph. Both variants contribute
    # exactly one group, so the result tuple keeps the same length.
    if re.search('book-publisher book-info-entry', detailhtml) is not None:
        publish = '.+?<p class="book-publisher book-info-entry">.+?<a .+?>(.*?)</a>.+?</p>.+?<p class="book-device">.+?'
    else:
        publish = '(.*?)<p class="book-device">'

    # Likewise the douban rating block is optional; the description pattern
    # is chosen to pair with whichever douban pattern is used.
    if re.search('douban-rating-number', detailhtml) is not None:
        douban = '.+?<div class="douban-rating-number">(.*?)</div></div></a>'
        description = '(.*?)<div class=".+?">'
    else:
        douban = '(.*?)'
        description = '.+?<section id="book-description">(.*?)<div class=".+?">'

    # re.S lets '.' cross newlines so the pattern can span the whole page.
    dreg = re.compile(r'<figure class="book-cover">.+?<img alt=".+?" src="(.*?)" />.+?</figure>' +
                      '.+?<p class="book-title">(.*?)</p>' +
                      '.+?<p class="book-author book-info-entry">.+?<a .+?>(.*?)</a>.+?</p>' +
                      publish + douban + description, re.S)

    return dreg.findall(detailhtml)


# Walk the ranking list and print the first detail match for every book.
for i in get_booklist():
    href = i[1]  # second captured group is the link to the detail page
    details = get_bookdetail(href)
    # findall returns an empty list when the detail page does not match;
    # skip such pages instead of failing with IndexError.
    if details:
        print(details[0])
 

为存入背囊,我们引入mongodb 自己去安装 我们前面也有介绍:MongoDB实战入门
python的pymongo
top的

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author :insun
import urllib
import urllib2,sys
import re
import pymongo
import os

# Handle to the "test" database on the default MongoDB connection.
# NOTE(review): pymongo.Connection was removed in PyMongo 3.0; on newer
# versions pymongo.MongoClient is the replacement -- confirm the installed
# version before switching.
db = pymongo.Connection().test

# Create the 'thumb' and 'cover' directories on first run (presumably for
# downloaded thumbnail and cover images -- verify against the rest of the
# script).
if not os.path.exists('thumb'):
    os.mkdir('thumb')
if not os.path.exists('cover'):
    os.mkdir('cover')
# New arrivals: http://www.tangcha.tc/books/latest 82*5 = 410
# Ranking: http://www.tangcha.tc/books/top 410
# Recommended this issue: http://www.tangcha.tc/books/recommendation
# Fetch the e-book list, capturing id, link, thumb_cover, title, author.
def get_booklist():
    """Download the ranking page and extract one tuple per listed book.

    Returns:
        The list produced by ``re.findall``: one 5-tuple of captured strings
        per ``<li class="book-cell">`` entry, in the order (span text, link
        href, image src, title, author). Empty when nothing matches.
    """
    url = 'http://www.tangcha.tc/books/top'

    # Close the HTTP response explicitly so the connection is not leaked.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    # re.S lets '.' cross newlines, so a single pattern spans a whole cell.
    # The unescaped '.' before </span> consumes the last character of the
    # span text, so the first group excludes it.
    reg = re.compile(r'<li class="book-cell">.+?<span>(.*?).</span>' +
                     '.+?<a href="(.*?)" class="cell-item boxable">' +
                     '.+?<img.+?src="(.*?)" />.+?</figure>' +
                     '.+?<p class="book-title">(.*?)</p>' +
                     '.+?<p class="book-author">(.*?)</p>', re.S)
    return reg.findall(html)


#获取电子书详细信息,抓取cover,title,author,publisher,douban_rate,content,author_intro
def get_bookdetail(href):
detailurl = 'http://www.tangcha.tc' + href
detailhtml = urllib2.urlopen(detailurl).read()

if re.search('book-publisher book-info-entry',detailhtml) != None:
publish = '.+?<p class="book-publisher book-info-entry">.+?<a .+?>(.*?)</a>.+?</p>.+?<p class="book-device">.+?'
else:
publish = '(.*?)<p class="book-device">'
if re.search('douban-rating-number',detailhtml) != None:
douban = '.+?<div class="douban-rating-number">(.*?)</div></div></a>'
description = '(.*?)<div class=".+?">'
else:
douban = '(.*?)'
descriptio

补充:Web开发 , Python ,
CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部份技术文章来自网络,