Scraper code 1

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys, urllib2, socket
import re
import time
from threading import Thread
from Queue import Queue

DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')  # save location
socket.setdefaulttimeout(30)
THREAD_COUNT = 5  # number of worker threads

def md5sum(s):
    try:
        import hashlib
        m = hashlib.md5()
        m.update(s)
        return m.hexdigest()
    except ImportError:
        import md5
        m = md5.new()
        m.update(s)
        return m.hexdigest()

class spiderList(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        pages = []
        # the listing on this site runs from page 1 to page 117 (range end is exclusive)
        for i in range(1, 118):
            pages.append('http://xxx.com/?page=%s' % i)
        self.queue.put(pages)

class spiderDetail(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'
        }

    def run(self):
        urls = self.queue.get()
        self.page = 1
        for url in urls:
            rq = urllib2.urlopen(urllib2.Request(url=url, headers=self.header))
            # grab every thumbnail address held in an _src="..." attribute
            result = re.findall(r'_src="([\w\W]+?)"', rq.read())
            for src in result:
                bigImage = self.__getBigImage(src)
                if bigImage != '':
                    img = urllib2.urlopen(bigImage).read()
                    fileName = self.__getFileName(bigImage)
                    file(fileName, 'wb').write(img)
            self.page += 1
        self.queue.task_done()

    def __getDir(self):
        import datetime
        now = datetime.datetime.now()
        dateDir = now.strftime('%Y-%m-%d')
        saveDir = os.path.join(DOWNLOAD_BASEDIR, dateDir)
        pageDir = 'page_%d' % self.page
        saveDir = os.path.join(saveDir, pageDir)
        if not os.path.isdir(saveDir):
            os.makedirs(saveDir)
        return saveDir

    def __getBigImage(self, url):
        if url == '':
            return ''
        # strip the "-<size>" suffix from the thumbnail name to get the full-size URL
        args = re.split(r"-([0-9a-zA-Z]+)\.", url)
        return args[0] + '.' + args[2]

    def __getFileName(self, url):
        baseName = os.path.basename(url)
        args = os.path.splitext(baseName)
        fileName = md5sum(args[0]) + args[1]
        return os.path.join(self.__getDir(), fileName)

if __name__ == '__main__':
    queue = Queue()
    threads = []
    # each spiderList thread queues the full page list; each spiderDetail thread consumes one list
    for i in range(THREAD_COUNT):
        lt = spiderList(queue)
        lt.setDaemon(True)
        lt.start()
        dt = spiderDetail(queue)
        dt.setDaemon(True)
        dt.start()
        threads.extend([lt, dt])
    # wait for the workers to finish instead of busy-waiting forever
    for t in threads:
        t.join()
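
For reference, a minimal sketch of the URL rewrite that __getBigImage performs, assuming thumbnail URLs of the hypothetical form http://xxx.com/upload/abc123-small.jpg (the real site's naming scheme may differ):

import re

def get_big_image(url):
    # split "...abc123-small.jpg" into ["...abc123", "small", "jpg"]
    args = re.split(r"-([0-9a-zA-Z]+)\.", url)
    return args[0] + '.' + args[2]

print(get_big_image('http://xxx.com/upload/abc123-small.jpg'))
# -> http://xxx.com/upload/abc123.jpg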

Scraper code 2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# download web content through the urllib/urllib2 modules
import urllib, urllib2, gevent
# regular expression and time modules
import re, time
from gevent import monkey
monkey.patch_all()

def geturllist(url):
    url_list = []
    print url
    s = urllib2.urlopen(url)
    text = s.read()
    # regex match: pull the image URLs out of the <ol> block
    html = re.search(r'<ol.*</ol>', text, re.S)
    urls = re.finditer(r'<p><img src="(.+?)jpg" /></p>', html.group(), re.I)
    for i in urls:
        url = i.group(1).strip() + "jpg"
        url_list.append(url)
    return url_list

def download(down_url):
    name = str(time.time())[:-3] + "_" + re.sub('.+?/', '', down_url)
    print name
    urllib.urlretrieve(down_url, "D:\\TEMP\\" + name)

def getpageurl():
    page_list = []
    # loop over the listing pages
    for page in range(1, 700):
        url = "http://jandan.net/ooxx/page-" + str(page) + "#comments"
        # append the generated url to page_list
        page_list.append(url)
    print page_list
    return page_list

if __name__ == '__main__':
    jobs = []
    pageurl = getpageurl()[::-1]
    # download the images, one greenlet per image
    for i in pageurl:
        for downurl in geturllist(i):
            jobs.append(gevent.spawn(download, downurl))
    gevent.joinall(jobs)
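
monkey.patch_all() swaps the blocking socket calls used by urllib/urllib2 for cooperative ones, so the greenlets spawned above overlap their network I/O. Spawning one greenlet per image across ~700 pages can open a very large number of simultaneous connections; below is a minimal sketch (assuming the download and geturllist helpers defined above) of capping concurrency with a gevent Pool instead:

from gevent.pool import Pool

pool = Pool(20)                        # at most 20 downloads in flight
for page in getpageurl()[::-1]:
    for img_url in geturllist(page):
        pool.spawn(download, img_url)  # blocks while the pool is full
pool.join()                            # wait for every greenlet to finish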

Scraper code 3

import os, time, sys, re, threading
import urllib

DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')
DOWNLOAD_BASEURL = './download/'
if not os.path.isdir(DOWNLOAD_BASEDIR):
    os.mkdir(DOWNLOAD_BASEDIR)

def md5sum(s):
    try:
        import hashlib
        m = hashlib.md5()
        m.update(s)
        return m.hexdigest()
    except ImportError:
        import md5
        m = md5.new()
        m.update(s)
        return m.hexdigest()

class Download(threading.Thread):
    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url

    def run(self):
##        print "downloading %s " % self.url
        f = urllib.urlopen(self.url)
        content_type, extension = f.headers.get('content-type', 'image/jpeg').split('/')
        if extension in ('jpeg', 'html'):
            extension = 'jpg'
        # name the local copy after the MD5 of its URL
        basename = "%s.%s" % (md5sum(self.url), extension)
        self.filename = os.path.join(DOWNLOAD_BASEDIR, basename)
        self.local_url = DOWNLOAD_BASEURL + basename
        file(self.filename, 'wb').write(f.read())

# pull every absolute src="..." URL out of content.html
content = file(os.path.join(os.path.dirname(__file__), 'content.html')).read()
pt = re.compile(r"""src=['"]?(http://.*?)[ '"]""")
urls = []
for url in pt.findall(content):
    urls.append(url)

print time.ctime()
thread_pools = []
for url in urls:
    current = Download(url)
    thread_pools.append(current)
    current.start()

# rewrite the HTML so every successfully downloaded URL points at its local copy
result_text = content
for result in thread_pools:
    print "%s threads running" % threading.activeCount()
    result.join(5)
    if not result.isAlive():
##        print "url %s saved to %s" % (result.url, result.filename)
        result_text = result_text.replace(result.url, result.local_url)
file(os.path.join(os.path.dirname(__file__), 'result.html'), 'wb').write(result_text)
print "%s threads running" % threading.activeCount()
if threading.activeCount():
    print "Can not stop"
print time.ctime()
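
The final loop is what turns content.html into a self-contained page: each remote URL that finished downloading is replaced by the path of its local copy. A minimal sketch of that rename-and-replace step, reusing md5sum and DOWNLOAD_BASEURL from the script above with a hypothetical URL:

remote = 'http://example.com/pic/001.jpg'            # hypothetical image URL
local = DOWNLOAD_BASEURL + md5sum(remote) + '.jpg'   # e.g. ./download/<md5>.jpg
html = '<p><img src="%s"></p>' % remote
print(html.replace(remote, local))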