Python image scraping
This example parses a page with the BeautifulSoup module and downloads the img resources it finds; the download directory can be specified on the command line. The code is as follows:
# ImageDownloader.py
# Finds and downloads all images from any given URL recursively.
# FB - 20140223
import sys
import os
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup  # for HTML parsing

urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level):  # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return
    global urlList
    if url in urlList:  # prevent using the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        return
    soup = BeautifulSoup(''.join(urlContent))
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        imgUrl = imgTag['src']
        # make relative image URLs absolute
        imgUrl = url[:url.find(".com") + 4] + imgUrl if (imgUrl[:4] != "http") else imgUrl
        # download only the proper image files
        if imgUrl.lower().endswith('.jpeg') or \
           imgUrl.lower().endswith('.jpg') or \
           imgUrl.lower().endswith('.gif') or \
           imgUrl.lower().endswith('.png') or \
           imgUrl.lower().endswith('.bmp'):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                global minImageFileSize
                if len(imgData) >= minImageFileSize:
                    print " " + imgUrl
                    fileName = basename(urlparse.urlsplit(imgUrl)[2])
                    output = open(os.path.join(downloadLocationPath, fileName), 'wb')
                    output.write(imgData)
                    output.close()
            except Exception, e:
                print str(e)
    print
    print
    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    downloadImages(linkUrl, level - 1)
                except Exception, e:
                    print str(e)

# MAIN
cla = sys.argv  # command line arguments
if len(cla) != 5:
    print "USAGE:"
    print "[python] ImageDownloader.py URL MaxRecursionDepth DownloadLocationPath MinImageFileSize"
    os._exit(1)

rootUrl = cla[1]
maxRecursionDepth = int(cla[2])
downloadLocationPath = cla[3]  # absolute path
if not os.path.isdir(downloadLocationPath):
    print downloadLocationPath + " is not an existing directory!"
    os._exit(2)
minImageFileSize = long(cla[4])  # in bytes

netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]

downloadImages(rootUrl, maxRecursionDepth)
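With the script saved as ImageDownloader.py, an invocation following the USAGE string above might look like this (the URL, download directory, and 10000-byte minimum size are placeholder values, not from the original post):

python ImageDownloader.py http://example.com 1 /tmp/images 10000

This crawls the root URL plus the pages it links to (recursion depth 1) and saves every image of at least 10000 bytes into /tmp/images.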
This example uses the BeautifulSoup module; the same job can also be done with lxml.html from the lxml module. The key line looks like this:
imageUrl = xhtml.xpath('//img[@alt="something"]/@src')
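For a fuller picture, here is a minimal sketch of the lxml.html approach, assuming Python 2 with lxml installed; the page URL and the simple //img/@src query are illustrative placeholders rather than part of the original script:

# list_images.py - sketch: extract image URLs from one page with lxml.html
import urllib2
import urlparse
import lxml.html

pageUrl = 'http://example.com/'          # placeholder URL
html = urllib2.urlopen(pageUrl).read()   # fetch the raw HTML
xhtml = lxml.html.fromstring(html)       # parse into an element tree

# the XPath query returns the src attribute of every <img> element
for src in xhtml.xpath('//img/@src'):
    imgUrl = urlparse.urljoin(pageUrl, src)  # resolve relative paths
    print imgUrl

Compared with BeautifulSoup's findAll('img'), an XPath query lets you filter by attribute (e.g. //img[@alt="something"]/@src, as above) in a single expression.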
- Author: shisekong
- Link: https://blog.361way.com/python-spider-img/3907.html
- License: This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. Kindly fulfill the requirements of this license when adapting or creating a derivative of this work.