
Recursively Crawling a Site's Links and Extracting Images

 


Just something I wrote for fun.

How the code works:

1. Start from the site's home page and recursively collect links. list1 (HAS_MEET_PAGES in the code) holds every link encountered so far; list2 (LEFT_PAGES) holds the links that still need to be requested.

In the initial state, list1 and list2 each contain only the home page URL (size = 1).

The recursion ends when list2 is empty. (A minimal sketch of this traversal follows this list.)

2. When a link is requested, check the response's Content-Type; if it is an image type, save the body to a folder.
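
To make the bookkeeping concrete, here is a minimal sketch of that two-list traversal, written as a plain loop instead of recursion. fetch_and_extract() is a hypothetical placeholder for the request-and-parse step, not a function from the real code below:

[python]
# Iterative sketch of the two-list traversal described above.
def fetch_and_extract(url):
    # Hypothetical placeholder: request `url` and return the links found
    # in the response; the real request/parse logic is in the code below.
    return []

seen = ["http://www.renrendai.com/"]      # list1: every URL met so far
frontier = ["http://www.renrendai.com/"]  # list2: URLs still to request

while frontier:                           # the crawl ends when list2 is empty
    batch, frontier = frontier, []        # take the current round's work
    for url in batch:
        for link in fetch_and_extract(url):
            if link not in seen:          # queue each URL at most once
                seen.append(link)
                frontier.append(link)
print "visited %s pages" % len(seen)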

 

 

[python] 
# coding:utf-8
import re, urllib2, os, datetime, urlparse

def main(LEFT_PAGES):
    global MAIN_CNT
    if len(LEFT_PAGES) == 0:
        print "No pages left to visit, END"
        return
    else:
        print "...recursion #%s, entering main()..." % MAIN_CNT
        # Snapshot the work list so items can be removed from LEFT_PAGES
        # while we iterate.
        tmp_pages = list(LEFT_PAGES)

        for each in tmp_pages:
            print "Fetching page: %s" % each
            try:
                resp = urllib2.urlopen(each)
            except urllib2.URLError as err:  # HTTPError is a subclass of URLError
                print err, each
                continue
            finally:
                LEFT_PAGES.remove(each)

            source = resp.read()
            current_url = resp.geturl()
            content_type = resp.headers.get("Content-Type")
            resp.close()

            if content_type is None:
                continue
            # e.g. "image/jpeg; charset=utf-8" -> ("image", "jpeg")
            parts = content_type.split(";")[0].split("/")
            if len(parts) != 2:
                continue
            type1, type2 = parts
            if type1.lower() == "image":
                # Save the image body into a "source" folder next to this script
                src_dir = os.path.join(os.path.dirname(__file__), "source")
                if not os.path.isdir(src_dir):
                    os.makedirs(src_dir)
                filename = os.path.join(src_dir, datetime.datetime.now().strftime("%Y%m%d.%H%M%S%f") + "." + type2)
                fp = open(filename, "wb")
                fp.write(source)
                fp.close()
            else:
                # Extract links (only from non-image responses)
                for match in re.findall(PATTERN, source):
                    href = urlparse.urljoin(current_url, match[1])
                    # Python 2's urljoin can leave excess "../" segments in
                    # place; this is a crude cleanup for them.
                    href = href.replace("/../", "/")
                    if href not in HAS_MEET_PAGES:
                        HAS_MEET_PAGES.append(href)
                        # Only queue links that stay on the target site
                        hostname = urlparse.urlparse(href).hostname
                        if hostname is not None and "renrendai.com" in hostname:
                            LEFT_PAGES.append(href)

    MAIN_CNT += 1
    main(LEFT_PAGES)

if __name__ == '__main__':
    VISIT_SITE = "http://www.renrendai.com/"
    HAS_MEET_PAGES = [VISIT_SITE]  # list1: every URL met so far
    LEFT_PAGES = [VISIT_SITE]      # list2: URLs still to request
    MAIN_CNT = 1
    PATTERN = re.compile(r'(href|src|area)="([^\s;]+)"')
    main(LEFT_PAGES)
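
Two caveats if you adapt this: every round of fetching adds one level of recursion, so on a large site the call stack can hit Python's default recursion limit (around 1000); the iterative loop sketched above avoids that. Also, `href not in HAS_MEET_PAGES` rescans the whole list for every link, so keeping the seen URLs in a set instead (e.g. `HAS_MEET_PAGES = set([VISIT_SITE])` with `HAS_MEET_PAGES.add(href)`) makes that check constant-time.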
