python扫目录程序

by：90sec maxs98 申请论坛邀请码的帖子，现在补上。

写了2个版本，先贴单线程的。程序通过HTTP响应状态码判断页面是否存在，速度较快。使用时需要指定字典文件，要在程序里改一下字典路径。（相信你懂的）

#!/usr/bin/python
#encoding=utf-8
import sys
import httplib
import re
import time

def Usage():
    """Print the command-line synopsis and terminate the program.

    The script requires exactly one argument, the host to scan, so the
    synopsis names it explicitly.

    Raises:
        SystemExit: always, via ``sys.exit()``.
    """
    print('Usage: python scan.py <target>')
    sys.exit()

# Exactly one positional argument is required: the host to scan.
if len(sys.argv) != 2:
    Usage()

start = time.time()
target = sys.argv[1]
port = 80
dict_path = "/media/sf_TDDOWNLOAD/dict.txt"
# Response codes that are reported as "found".
found_statuses = (200, 500, 403, 301)
request_headers = {
    "Host": target,
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
    "Accept": "text/plain",
}

# Iterating the file object ends the loop at EOF, and the with-block closes
# the file even if a line fails to decode.
with open(dict_path) as f:
    for line in f:
        # Drop the line terminator (CRLF or LF) and surrounding blanks, then
        # re-encode the GBK dictionary entry as UTF-8 for the request line.
        path = line.strip().decode("gbk").encode("utf-8")
        if not path:
            # A blank line is not a path; skip it instead of requesting it.
            continue

        conn = httplib.HTTPConnection(target, port)
        try:
            conn.request('GET', path, headers=request_headers)
            # Only the status line is needed; the body is never read.
            ret = conn.getresponse().status
        except (httplib.HTTPException, IOError) as exc:
            # socket.error derives from IOError; one failed request should
            # not abort the rest of the scan.
            print("%s%s error: %s" % (target, path, exc))
        else:
            if ret in found_statuses:
                print("%s%s found! status: %s" % (target, path, ret))
            else:
                print("%s%s not found!" % (target, path))
        finally:
            # Release the socket before moving on to the next path.
            conn.close()

print("done...")
print("Elapsed Time: %s" % (time.time() - start))

==========================================
下面是多线程版本，使用了一个队列来分发要扫描的路径。注意线程不要开得太多，不然会出莫名其妙的错误。
#!/usr/bin/env python
import Queue
import threading
import httplib
import time
import re

# Work queue shared between main(), which fills it with paths, and the
# ThreadUrl workers, which consume them.
queue = Queue.Queue()

class ThreadUrl(threading.Thread):
    """Worker thread that probes URL paths pulled from a shared queue.

    Each path is requested with an HTTP GET against ``target`` and reported
    as found when the response status is 200, 500, 403 or 301.

    Args:
        queue: Queue of path strings to probe. A ``None`` item tells the
            worker to stop.
        target: Host name to scan.
        port: TCP port of the HTTP server.
        timeout: Socket timeout in seconds for each request, so that a
            stalled server cannot block the worker (and therefore
            ``queue.join()``) forever.
    """

    # Response codes that are reported as "found".
    FOUND_STATUSES = (200, 500, 403, 301)
    USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"

    def __init__(self, queue, target="www.xjbaihe.com", port=80, timeout=10):
        threading.Thread.__init__(self)
        self.queue = queue
        self.target = target
        self.port = port
        self.timeout = timeout

    def _fetch_status(self, path):
        """Return the HTTP status code of a GET request for *path*."""
        conn = httplib.HTTPConnection(self.target, self.port,
                                      timeout=self.timeout)
        try:
            conn.request('GET', path, headers={
                "Host": self.target,
                "User-Agent": self.USER_AGENT,
                "Accept": "text/plain",
            })
            # Only the status line is needed; the body is never read.
            return conn.getresponse().status
        finally:
            # Release the socket whether or not the request succeeded.
            conn.close()

    def run(self):
        """Process queued paths until a ``None`` sentinel is received."""
        while True:
            path = self.queue.get()
            try:
                if path is None:
                    break
                try:
                    ret = self._fetch_status(path)
                except Exception as exc:
                    # Worker-loop boundary: a failed request must not kill
                    # the thread, otherwise the remaining items are never
                    # consumed and queue.join() blocks forever.
                    print("%s error: %s" % (path, exc))
                else:
                    if ret in self.FOUND_STATUSES:
                        print("%s found! status: %s" % (path, ret))
                    else:
                        print("%s not found!" % path)
            finally:
                # Always signal completion, including for the sentinel and
                # for failed requests, so queue.join() can return.
                self.queue.task_done()

# Timestamp taken when the module is loaded; the __main__ guard below uses it
# to report the elapsed time.
start = time.time()
def main(dict_path="/media/sf_TDDOWNLOAD/dictest.txt", num_threads=5):
    """Start the worker pool, enqueue every dictionary path, and wait.

    Args:
        dict_path: Path of the GBK-encoded dictionary file, one URL path
            per line.
        num_threads: Number of ThreadUrl workers to start.
    """
    # Spawn a pool of daemon threads that all consume the shared queue;
    # daemon threads do not keep the process alive once main() returns.
    for _ in range(num_threads):
        t = ThreadUrl(queue)
        t.setDaemon(True)
        t.start()

    # Populate the queue. Iterating the file stops exactly at EOF, so blank
    # lines in the middle of the dictionary no longer end the scan early.
    print("reading dict...")
    with open(dict_path) as f:
        for line in f:
            # Drop the line terminator (CRLF or LF) and re-encode the GBK
            # entry as UTF-8 for the request line.
            path = line.strip().decode("gbk").encode("utf-8")
            if path:
                queue.put(path)
    print("done...")

    # Wait on the queue until everything has been processed.
    queue.join()
if __name__ == '__main__':
    # Run the scan, then report the time elapsed since the module-level
    # `start` timestamp.
    main()
    elapsed = time.time() - start
    print("Elapsed Time: %s" % elapsed)

=================

补充：Web开发 , Python ,