Three Examples of Scraping Website Images with Multiple Threads
Scraper Code One
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys, urllib2, socket
import re
import time
from threading import Thread
from Queue import Queue

DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')  # save directory
socket.setdefaulttimeout(30)
THREAD_COUNT = 5  # number of threads

def md5sum(s):
    try:
        import hashlib
        m = hashlib.md5()
    except ImportError:  # fall back to the old md5 module on ancient Pythons
        import md5
        m = md5.new()
    m.update(s)
    return m.hexdigest()

class spiderList(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        pages = []
        # this site's list pages run from page 1 to page 117
        for i in range(1, 118):
            pages.append('http://xxx.com/?page=%s' % i)
        self.queue.put(pages)

class spiderDetail(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'
        }

    def run(self):
        urls = self.queue.get()
        self.page = 1
        for url in urls:
            rq = urllib2.urlopen(urllib2.Request(url=url, headers=self.header))
            result = re.findall(r'_src="([\w\W]+?)"', rq.read())
            for src in result:
                bigImage = self.__getBigImage(src)
                if bigImage != '':
                    img = urllib2.urlopen(bigImage).read()
                    fileName = self.__getFileName(bigImage)
                    with open(fileName, 'wb') as f:
                        f.write(img)
            self.page += 1
        self.queue.task_done()

    def __getDir(self):
        import datetime
        now = datetime.datetime.now()
        dateDir = now.strftime('%Y-%m-%d')
        saveDir = os.path.join(DOWNLOAD_BASEDIR, dateDir, 'page_%d' % self.page)
        if not os.path.isdir(saveDir):
            os.makedirs(saveDir)
        return saveDir

    def __getBigImage(self, url):
        if url == '':
            return ''
        args = re.split(r'-([0-9a-zA-Z]+)\.', url)
        return args[0] + '.' + args[2]

    def __getFileName(self, url):
        baseName = os.path.basename(url)
        args = os.path.splitext(baseName)
        fileName = md5sum(args[0]) + args[1]
        return os.path.join(self.__getDir(), fileName)

if __name__ == '__main__':
    queue = Queue()
    # note: each spiderList thread queues the full page list, so every
    # producer/consumer pair walks all 117 pages
    for i in range(THREAD_COUNT):
        lt = spiderList(queue)
        lt.setDaemon(True)
        lt.start()
        dt = spiderDetail(queue)
        dt.setDaemon(True)
        dt.start()
    queue.join()  # block until every queued page list has been processed
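Two fixes in the listing above are worth calling out: task_done() belongs on the consumer side, one call per get(), and the original busy-wait while 1: pass is replaced by queue.join(). A minimal sketch of that Queue shutdown pattern, with a placeholder URL standing in for the real site:

from threading import Thread
from Queue import Queue

def worker(queue):
    while True:
        item = queue.get()          # blocks until a task is available
        print 'processing %s' % item
        queue.task_done()           # exactly one task_done() per get()

if __name__ == '__main__':
    queue = Queue()
    for i in range(5):
        t = Thread(target=worker, args=(queue,))
        t.setDaemon(True)           # daemon threads exit with the main thread
        t.start()
    for page in range(1, 118):
        queue.put('http://example.com/?page=%d' % page)
    queue.join()                    # returns once every put() has a matching task_done()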
Scraper Code Two
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# download web content via the urllib/urllib2 modules
import urllib, urllib2, gevent
# regular-expression and time modules
import re, time
from gevent import monkey
monkey.patch_all()

def geturllist(url):
    url_list = []
    print url
    s = urllib2.urlopen(url)
    text = s.read()
    # regex match for the images inside the <ol> block
    html = re.search(r'<ol.*</ol>', text, re.S)
    urls = re.finditer(r'<p><img src="(.+?)jpg" /></p>', html.group(), re.I)
    for i in urls:
        url = i.group(1).strip() + "jpg"
        url_list.append(url)
    return url_list

def download(down_url):
    name = str(time.time())[:-3] + "_" + re.sub('.+?/', '', down_url)
    print name
    urllib.urlretrieve(down_url, "D:\\TEMP\\" + name)

def getpageurl():
    page_list = []
    # loop over the list pages
    for page in range(1, 700):
        url = "http://jandan.net/ooxx/page-" + str(page) + "#comments"
        # append each generated url to page_list
        page_list.append(url)
    print page_list
    return page_list

if __name__ == '__main__':
    jobs = []
    pageurl = getpageurl()[::-1]
    # download the images
    for i in pageurl:
        for downurl in geturllist(i):
            jobs.append(gevent.spawn(download, downurl))
    gevent.joinall(jobs)
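The key line here is monkey.patch_all(): it swaps Python's blocking socket internals for gevent's cooperative versions, so each gevent.spawn(download, ...) greenlet yields to the others while it waits on the network. A minimal sketch of the effect, using a hypothetical URL (example.com stands in for a real host):

import gevent
from gevent import monkey
monkey.patch_all()   # patch sockets before any network calls are made
import urllib2, time

def fetch(url):
    start = time.time()
    body = urllib2.urlopen(url).read()
    print '%s: %d bytes in %.2fs' % (url, len(body), time.time() - start)

if __name__ == '__main__':
    # the three fetches overlap instead of running back to back
    urls = ['http://example.com/'] * 3
    gevent.joinall([gevent.spawn(fetch, u) for u in urls])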
Scraper Code Three
import os, time, sys, re, threading
import urllib

DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')
DOWNLOAD_BASEURL = './download/'
if not os.path.isdir(DOWNLOAD_BASEDIR):
    os.mkdir(DOWNLOAD_BASEDIR)

def md5sum(s):
    try:
        import hashlib
        m = hashlib.md5()
    except ImportError:  # fall back to the old md5 module on ancient Pythons
        import md5
        m = md5.new()
    m.update(s)
    return m.hexdigest()

class Download(threading.Thread):
    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url

    def run(self):
        f = urllib.urlopen(self.url)
        # derive the file extension from the Content-Type header
        content_type, extention = f.headers.get('content-type', 'image/jpeg').split('/')
        if extention in ('jpeg', 'html'):
            extention = 'jpg'
        # name the local copy after the md5 of its URL
        basename = "%s.%s" % (md5sum(self.url), extention)
        self.filename = os.path.join(DOWNLOAD_BASEDIR, basename)
        self.local_url = DOWNLOAD_BASEURL + basename
        with open(self.filename, 'wb') as out:
            out.write(f.read())

content = open(os.path.join(os.path.dirname(__file__), 'content.html')).read()
pt = re.compile(r"""src=['"]?(http://.*?)[ '"]""")
urls = pt.findall(content)
print time.ctime()
thread_pools = []
for url in urls:
    current = Download(url)
    thread_pools.append(current)
    current.start()
result_text = content
for result in thread_pools:
    print "%s threads running" % threading.activeCount()
    result.join(5)  # wait at most 5 seconds for this download
    if not result.isAlive():
        # the download finished, so point the page at the local copy
        result_text = result_text.replace(result.url, result.local_url)
open(os.path.join(os.path.dirname(__file__), 'result.html'), 'wb').write(result_text)
print "%s threads running" % threading.activeCount()
if threading.activeCount() > 1:  # the main thread always counts as one
    print "Can not stop"
print time.ctime()
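The result.join(5) / isAlive() pair is the interesting bit: join with a timeout returns after at most five seconds whether or not the thread finished, and isAlive() then decides whether the URL is safe to rewrite. A minimal sketch of that idiom, with a deliberately slow stand-in task:

import threading, time

def slow_task():
    time.sleep(10)   # stand-in for a download that exceeds the timeout

t = threading.Thread(target=slow_task)
t.start()
t.join(5)            # returns after at most 5 seconds, finished or not
if t.isAlive():
    print "timed out -- skip rewriting this URL"
else:
    print "finished -- safe to use the result"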
- Author: shisekong
- Link: https://blog.361way.com/python-threading-spider-img/3910.html
- License: This work is under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. Kindly fulfill the requirements of the aforementioned License when adapting or creating a derivative of this work.