
Recursively Crawling a Site's Links and Extracting Images

 


Just something I wrote for fun.

How the code works:

1. Start from the site's home page and recursively collect links. list1 (HAS_MEET_PAGES in the code) holds every link encountered so far; list2 (LEFT_PAGES) holds the links that still need to be requested.

In the initial state, list1 and list2 each contain only the home page URL (size = 1).

The recursion ends when list2 is empty. (A minimal sketch of this traversal follows this list.)

2. When a link is requested, check the response's Content-Type; if it is an image type, save the body to a folder.
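
To make the bookkeeping concrete, here is a minimal sketch of that two-list traversal, written as a plain loop instead of recursion. fetch_and_extract() is a hypothetical placeholder for the request-and-parse step, not a function from the real code below:

[python]
# Iterative sketch of the two-list traversal described above.
def fetch_and_extract(url):
    # Hypothetical placeholder: request `url` and return the links found
    # in the response; the real request/parse logic is in the code below.
    return []

seen = ["http://www.renrendai.com/"]      # list1: every URL met so far
frontier = ["http://www.renrendai.com/"]  # list2: URLs still to request

while frontier:                           # the crawl ends when list2 is empty
    batch, frontier = frontier, []        # take the current round's work
    for url in batch:
        for link in fetch_and_extract(url):
            if link not in seen:          # queue each URL at most once
                seen.append(link)
                frontier.append(link)
print "visited %s pages" % len(seen)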

 

 

[python] 
# coding:utf-8
import re, urllib2, os, datetime, urlparse

def main(LEFT_PAGES):
    global MAIN_CNT
    if len(LEFT_PAGES) == 0:
        print "No pages left to visit, END"
        return
    else:
        print "...recursion #%s, entering main()..." % MAIN_CNT
        # Snapshot the work list so items can be removed from LEFT_PAGES
        # while we iterate.
        tmp_pages = list(LEFT_PAGES)

        for each in tmp_pages:
            print "Fetching page: %s" % each
            try:
                resp = urllib2.urlopen(each)
            except urllib2.URLError as err:  # HTTPError is a subclass of URLError
                print err, each
                continue
            finally:
                LEFT_PAGES.remove(each)

            source = resp.read()
            current_url = resp.geturl()
            content_type = resp.headers.get("Content-Type")
            resp.close()

            if content_type is None:
                continue
            # e.g. "image/jpeg; charset=utf-8" -> ("image", "jpeg")
            parts = content_type.split(";")[0].split("/")
            if len(parts) != 2:
                continue
            type1, type2 = parts
            if type1.lower() == "image":
                # Save the image body into a "source" folder next to this script
                src_dir = os.path.join(os.path.dirname(__file__), "source")
                if not os.path.isdir(src_dir):
                    os.makedirs(src_dir)
                filename = os.path.join(src_dir, datetime.datetime.now().strftime("%Y%m%d.%H%M%S%f") + "." + type2)
                fp = open(filename, "wb")
                fp.write(source)
                fp.close()
            else:
                # Extract links (only from non-image responses)
                for match in re.findall(PATTERN, source):
                    href = urlparse.urljoin(current_url, match[1])
                    # Python 2's urljoin can leave excess "../" segments in
                    # place; this is a crude cleanup for them.
                    href = href.replace("/../", "/")
                    if href not in HAS_MEET_PAGES:
                        HAS_MEET_PAGES.append(href)
                        # Only queue links that stay on the target site
                        hostname = urlparse.urlparse(href).hostname
                        if hostname is not None and "renrendai.com" in hostname:
                            LEFT_PAGES.append(href)

    MAIN_CNT += 1
    main(LEFT_PAGES)

if __name__ == '__main__':
    VISIT_SITE = "http://www.renrendai.com/"
    HAS_MEET_PAGES = [VISIT_SITE]  # list1: every URL met so far
    LEFT_PAGES = [VISIT_SITE]      # list2: URLs still to request
    MAIN_CNT = 1
    PATTERN = re.compile(r'(href|src|area)="([^\s;]+)"')
    main(LEFT_PAGES)
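
Two caveats if you adapt this: every round of fetching adds one level of recursion, so on a large site the call stack can hit Python's default recursion limit (around 1000); the iterative loop sketched above avoids that. Also, `href not in HAS_MEET_PAGES` rescans the whole list for every link, so keeping the seen URLs in a set instead (e.g. `HAS_MEET_PAGES = set([VISIT_SITE])` with `HAS_MEET_PAGES.add(href)`) makes that check constant-time.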
