Python image scraping
This example parses a page with the BeautifulSoup module and downloads the img resources it finds; the download directory can be specified on the command line. The code is as follows:
# ImageDownloader.py
# Finds and downloads all images from any given URL recursively.
# FB - 20140223
import sys
import os
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup  # for HTML parsing

urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level):  # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return
    global urlList
    if url in urlList:  # prevent using the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        return
    soup = BeautifulSoup(''.join(urlContent))
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        imgUrl = imgTag['src']
        # make relative image URLs absolute
        imgUrl = url[:url.find(".com") + 4] + imgUrl if (imgUrl[:4] != "http") else imgUrl
        # download only the proper image files
        if imgUrl.lower().endswith('.jpeg') or \
           imgUrl.lower().endswith('.jpg') or \
           imgUrl.lower().endswith('.gif') or \
           imgUrl.lower().endswith('.png') or \
           imgUrl.lower().endswith('.bmp'):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                global minImageFileSize
                if len(imgData) >= minImageFileSize:
                    print " " + imgUrl
                    fileName = basename(urlparse.urlsplit(imgUrl)[2])
                    output = open(os.path.join(downloadLocationPath, fileName), 'wb')
                    output.write(imgData)
                    output.close()
            except Exception, e:
                print str(e)
    print
    print
    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    downloadImages(linkUrl, level - 1)
                except Exception, e:
                    print str(e)

# MAIN
cla = sys.argv  # command line arguments
if len(cla) != 5:
    print "USAGE:"
    print "[python] ImageDownloader.py URL MaxRecursionDepth DownloadLocationPath MinImageFileSize"
    os._exit(1)

rootUrl = cla[1]
maxRecursionDepth = int(cla[2])
downloadLocationPath = cla[3]  # absolute path
if not os.path.isdir(downloadLocationPath):
    print downloadLocationPath + " is not an existing directory!"
    os._exit(2)
minImageFileSize = long(cla[4])  # in bytes

netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]

downloadImages(rootUrl, maxRecursionDepth)
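With the script saved as ImageDownloader.py, an invocation following the USAGE string above might look like this (the URL, download directory, and 10000-byte minimum size are placeholder values, not from the original post):

python ImageDownloader.py http://example.com 1 /tmp/images 10000

This crawls the root URL plus the pages it links to (recursion depth 1) and saves every image of at least 10000 bytes into /tmp/images.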
This example uses the BeautifulSoup module; the same job can also be done with lxml.html from the lxml module. The key line looks like this:
imageUrl = xhtml.xpath('//img[@alt="something"]/@src')
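For a fuller picture, here is a minimal sketch of the lxml.html approach, assuming Python 2 with lxml installed; the page URL and the simple //img/@src query are illustrative placeholders rather than part of the original script:

# list_images.py - sketch: extract image URLs from one page with lxml.html
import urllib2
import urlparse
import lxml.html

pageUrl = 'http://example.com/'          # placeholder URL
html = urllib2.urlopen(pageUrl).read()   # fetch the raw HTML
xhtml = lxml.html.fromstring(html)       # parse into an element tree

# the XPath query returns the src attribute of every <img> element
for src in xhtml.xpath('//img/@src'):
    imgUrl = urlparse.urljoin(pageUrl, src)  # resolve relative paths
    print imgUrl

Compared with BeautifulSoup's findAll('img'), an XPath query lets you filter by attribute (e.g. //img[@alt="something"]/@src, as above) in a single expression.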
- Author: shisekong
- Link: https://blog.361way.com/python-spider-img/3907.html
- License: This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. Kindly fulfill the requirements of this license when adapting or creating a derivative of this work.