您的位置：首页 > 编程语言 > Python开发

用python实现的一个抓取图片的爬虫

2015-01-25 20:10 1231 查看

最近学习python时学到了正则表达式，于是心血来潮用python写了一个抓取网站图片的简单小爬虫，在此贴上代码来记录自己学习python的过程，同时也希望大家多提出改进爬虫的意见：此爬虫只能抓取以http://开头的图片，而像百度图库的图片大多数都不是以http://开头，希望大家能给出意见。

废话不多说，上代码：

version：1.0

#!/usr/bin/python

import re
import urllib

def getHtml():
    """Prompt for a URL on stdin and return the body fetched from it.

    Returns:
        Whatever ``read()`` on the opened URL yields (the raw page content).
    """
    # Stray spaces around the typed address are never part of a valid URL.
    url = raw_input('Enter url:').strip()
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even if read() fails.
        page.close()

def getImgUrl(html, imgForm=None):
    """Return every absolute ``http://`` image URL found in *html*.

    The wanted formats are chosen with a bit-flag number: 1 = .jpg,
    2 = .png, 4 = .gif; sums select several formats (e.g. 3 = .jpg+.png).
    Any value outside 1..6 selects all three formats.

    Args:
        html: Text to scan for image URLs.
        imgForm: Format selector as described above.  When ``None`` (the
            default) the menu is printed and the selector is read from
            stdin.

    Returns:
        A list of the matched URL strings, in order of appearance.  Only
        URLs starting with ``http://`` and ending in a selected extension
        (matched case-insensitively) are returned.

    Raises:
        ValueError: If the value typed at the prompt is not an integer.
    """
    if imgForm is None:
        menu = "\n".join([
            "",
            "1 represents .jpg",
            "2 represents .png",
            "4 represents .gif",
            "3 represents .jpg+.png",
            "5 represents .jpg+.gif",
            "6 represents .png+.gif",
            "7 represents .jpg+.png+.gif",
            "",
        ])
        print(menu)
        imgForm = int(raw_input('Enter the Form of Img:'))

    # Anything that is not an explicit 1..6 choice means "all formats".
    if not 1 <= imgForm <= 6:
        imgForm = 7

    flagToExt = ((1, 'jpg'), (2, 'png'), (4, 'gif'))
    wanted = [ext for flag, ext in flagToExt if imgForm & flag]

    # Non-capturing group so findall() yields whole URLs; \S keeps a match
    # from running across whitespace into unrelated text.
    imgre = re.compile(r'http://\S+?\.(?:%s)' % '|'.join(wanted), re.I)
    return imgre.findall(html)

def downloadImg(url):
    """Download each image URL in *url* into the current directory.

    Files are named with a per-format running counter: ``0.jpg``,
    ``1.jpg``, ... ``0.png``, ... ``0.gif``, ...  A URL whose path does not
    end in .jpg/.png/.gif (case-insensitive, optionally followed by a query
    string or fragment) is skipped with a message.

    Args:
        url: Iterable of image URL strings.
    """
    # Anchor the extension at the end of the URL so that ".jpg" appearing
    # elsewhere (e.g. in the host name) does not decide the file type, and
    # ignore case so ".JPG" links are handled too.
    extPattern = re.compile(r'\.(jpg|png|gif)(?:[?#].*)?$', re.I)
    counters = {'jpg': 0, 'png': 0, 'gif': 0}

    for imgurl in url:
        found = extPattern.search(imgurl)
        if not found:
            print("not picture captured")
            continue
        ext = found.group(1).lower()
        urllib.urlretrieve(imgurl, "%s.%s" % (counters[ext], ext))
        counters[ext] += 1

# Script flow: fetch the page, collect the image links, save every image,
# then echo the list of links that were collected.
html = getHtml()
imgurl = getImgUrl(html)
downloadImg(imgurl)
print(imgurl)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： python 正则表达式爬虫

相关文章推荐

新的分享

章节导航