您的位置：首页 > 编程语言 > Python开发
百度空间相册下载器 python实现 by Gods_巨蚁（原创）

2011-04-20 15:08 375 查看
最近学习python中，感觉python确实挺好用

昨晚加今天实现了一个百度空间相册下载器

下面开放源代码，作者:Gods_巨蚁，转载注明出处

新QQ:1443561883
#coding: UTF-8

import urllib, re, os

__metaclass__ = type	# Python 2: make classes defined in this module new-style

class AntAlbumDownload(object):
    '''
    Download the photos of a Baidu Space (hi.baidu.com) album.

    Typical use: call setAttr() to choose the output directory and the
    album URL, then download() to scrape the album index pages and save
    the photos as 0000.jpg, 0001.jpg, ... inside that directory.

    The network and prompt calls are the Python 2 ones (urllib.urlopen,
    urllib.urlretrieve, raw_input).  print is always written with a single
    parenthesised argument so that the statements parse the same way under
    Python 2 and Python 3.
    '''

    # Shape of one photo entry on an album index page:
    #
    # imgarr[len]={purl:"/zhongji/album/item/8177718de7d67312b21bba72.html", psrc:"http://hiphotos.baidu.com/zhongji/abpic/item/8177718de7d67312b21bba72.jpg",
    #		psize:"300*200 61K", pcmtNum:0, pname:"移动.gif",
    #		pedit:  '' ,
    #		pid:"8177718de7d67312b21bba72",
    #isMobileUp:0,
    #isLocked:0				};

    # Album index page: captures (photo page path, thumbnail URL, photo name, photo id).
    patPage = re.compile(
        r'''
        imgarr\[len\]={purl:"(.*?)",
        .*?
        psrc:"(.*?)",
        .*?
        pname:"(.*?)"
        .*?
        pid:"(.*?)"
        .*?};
        ''',
        re.VERBOSE
    )
    # Loose probe for an album index page (debugging aid, unused by the class itself).
    pat = re.compile(r'imgarr\[len\]={purl:".*?",.*?psrc:".*?",.*?')

    # Shape of the Session object embedded in a single-photo page:
    #
    #var Session = {
    #spaceURL: "/zhongji",
    #isHost: false,         // whether the visitor owns the space
    #isLogin: false,
    #isActive: false,
    #isShowVcode: true,
    #userName: "饥饿蚂蚁",   // user name of the space owner
    #userNameEnc:    "%BC%A2%B6%F6%C2%EC%D2%CF",
    #visitorName:    "",
    #visitorURL: "\/index.html",        //
    #refer: "http:\/\/hi.baidu.com\/zhongji\/album\/%D7%CA%C1%CF%D6%D0%B5%C4%CD%BC%C6%AC\/index\/2",
    #spaceDomain: 'http://hi.baidu.com',
    #spaceStaticDomain: 'http://hi.bdimg.com',
    #portraitDomain: 'http://tx.bdimg.com',
    #photoDomain: 'http://hiphotos.baidu.com',
    #hiupDomain: 'http://hiup.baidu.com',
    #spToken: 'd3981061a624c51023d46bcdc8336fd4'
    #};

    # Single-photo page: captures (spaceURL, userName, photoDomain).
    patImage = re.compile(
        r'''
        var\ Session\ =\ {
        .*?
        spaceURL:\ "(.*?)",
        .*?
        userName:\ "(.*?)",	# 空间主人用户名
        .*?
        photoDomain:\ '(.*?)',
        .*?};
        ''',
        re.VERBOSE
    )
    # Loose probe for a single-photo page (debugging aid, unused by the class itself).
    pat2 = re.compile(r'''
        var\ Session\ =\ {
        .*?
        spaceURL.*?
        userName:.*?
        photoDomain:
        ''',
        re.VERBOSE
    )

    # Splits an album URL into (everything before '/index/', page number).
    # Compiled once here instead of on every _getIndexPageUrl() call.
    _patIndexPage = re.compile(
        r'''
        (.*?)/index/([0-9]+)
        ''',
        re.VERBOSE
    )

    # "Last page" link of the pager; group 1 is the highest page index, e.g.
    # <a  href="#/zhongji/album/%D7%CA%C1%CF%D6%D0%B5%C4%CD%BC%C6%AC/index/3">[尾页]</a>
    # NOTE(review): the sample Session block above encodes a four-character
    # user name as eight bytes in userNameEnc, which suggests the pages are
    # GBK-encoded.  If this source file is saved as UTF-8, the Chinese
    # literal below may never match the raw page bytes - confirm.
    _patLastPage = re.compile(
        r'''
        <a.+?href="\#.+?/index/([0-9]+?)">\[尾页\]</a>
        ''',
        re.VERBOSE
    )

    def __init__(self):
        # Real values are assigned by setAttr(); these defaults only make
        # sure the attributes always exist.
        self.nameAlbum = ''
        self.urlAlbum = ''
        self.countAnalysisPage = 0

    def _getPageText(self, url):
        '''
        Fetch url and return its body with line breaks replaced by spaces.

        The page patterns are compiled without re.DOTALL, so '.' cannot
        cross a newline; flattening the text lets a single match span what
        were several lines in the page.
        '''
        page = urllib.urlopen(url)
        try:
            text = page.read()
        finally:
            # Close the response even when the read fails part-way.
            page.close()

        return text.replace('\r\n', ' ').replace('\n', ' ')

    def setAttr(self, name = '', url = ''):
        '''
        Choose the output directory and the album to download.

        name -- directory that will receive the photos; asked for on stdin
                when empty.  It is created if it does not exist yet.
        url  -- address of any page of the album; asked for on stdin when
                empty.  It is normalised to the album's first index page.

        Raises OSError when the directory cannot be created.
        '''
        self.nameAlbum = name or raw_input('I will create the album directory, Input the name:')
        self.urlAlbum = url or raw_input('Input the URL of the album:')
        self.countAnalysisPage = 0

        # Baidu Space quirk: when the URL contains '#', the part after it is
        # the real album path, relative to the site root.
        loc = self.urlAlbum.find('#')
        if loc != -1:
            self.urlAlbum = 'http://hi.baidu.com' + self.urlAlbum[loc+1:]

        # Point at page 0 whether or not the URL already carried '/index/N'.
        self.urlAlbum = self._getIndexPageUrl(0)

        print('解析到相册首页URL为:  %s' % self.urlAlbum)

        # Create the directory directly rather than through a shell command:
        # works on every platform and the name is never parsed by a shell.
        try:
            os.makedirs(self.nameAlbum)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(self.nameAlbum):
                raise

    def analysisImagePage(self, url, imageId):
        '''
        Read one photo page and return the URL of the full-size picture.

        url     -- address of the photo page.
        imageId -- photo id (the pid field of the album index entry).

        Also increments self.countAnalysisPage, the running count of photo
        pages processed.

        Raises ValueError when the page does not contain the expected
        "var Session = {...}" block.
        '''
        text = self._getPageText(url)

        print('分析图片页 当前页:%3d' % self.countAnalysisPage)
        self.countAnalysisPage += 1

        session = self.patImage.search(text)
        if session is None:
            raise ValueError('no Session block found in photo page: %s' % url)

        # photoDomain + spaceURL + '/pic/item/' + id + '.jpg'
        return session.group(3) + session.group(1) + '/pic/item/' + imageId + '.jpg'

    def _getIndexPageUrl(self, iPage):
        '''Return the URL of album index page number iPage (0-based).'''
        found = self._patIndexPage.search(self.urlAlbum)
        # Replace an existing '/index/N' suffix, otherwise append one.
        base = found.group(1) if found else self.urlAlbum
        return base + '/index/' + str(iPage)

    def _collectImageUrls(self, countPage):
        '''Scrape up to countPage index pages (0 = all) and return the full-size photo URLs.'''
        images = []

        # The first index page also carries the pager with the last page number.
        textPage = self._getPageText(self.urlAlbum)

        lastPage = self._patLastPage.search(textPage)
        if lastPage:
            print('尾页匹配成功')
            maxPage = int(lastPage.group(1))
        else:
            # No "last page" link found: treat the album as a single page.
            print('尾页匹配失败')
            maxPage = 0

        # Page indices run 0..maxPage, so there are maxPage + 1 pages.
        # 0 means "all", and a request beyond the end is clamped.
        if countPage == 0 or countPage > maxPage + 1:
            countPage = maxPage + 1

        for iPage in range(countPage):
            print('分析相册 当前页:%3d' % (iPage))
            if iPage != 0:
                # Page 0 was already fetched above.
                textPage = self._getPageText(self._getIndexPageUrl(iPage))

            for imagePage, _thumb, _title, imageId in self.patPage.findall(textPage):
                # imagePage is site-relative; the photo page holds the data
                # needed to build the full-size picture URL.
                urlImagePage = 'http://hi.baidu.com' + imagePage
                images.append(self.analysisImagePage(urlImagePage, imageId))

        return images

    def _downloadImages(self, images):
        '''Save each URL in images as NNNN.jpg inside self.nameAlbum, printing progress.'''
        total = len(images)

        for index, image in enumerate(images):
            pathImage = os.path.join(self.nameAlbum, '%04d.jpg' % index)

            urllib.urlretrieve(image, pathImage)

            print(pathImage)
            # NOTE(review): this prints e.g. "50.0:%"; the colon placement
            # looks unintended but the message is kept as it was.
            print('下载完成%.1f:%%' % ((index+1)*100.0/total))

    def analysis(self, countPage):
        '''
        Scrape the album index pages, then download every photo found.

        countPage -- number of index pages to process; 0 means all pages,
                     and a value larger than the album is clamped.
        '''
        print('开始分析页面')
        images = self._collectImageUrls(countPage)

        print('分析完成，开始下载')
        self._downloadImages(images)

    def download(self, countPage = 0):
        '''
        Download the album's photos.

        countPage -- number of index pages to download; 0 means everything.
        '''
        self.analysis(countPage)

def main():
    '''Run the downloader interactively and fetch the whole album.'''
    album = AntAlbumDownload()
    # Empty name and URL make setAttr() ask for both on stdin.
    album.setAttr('', '')
    # 0 = download every page of the album.
    album.download(0)


# Only start the interactive download when executed as a script, so that
# importing this module has no side effects.
if __name__ == '__main__':
    main()
内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理
标签：
相关文章推荐
新的分享
章节导航