您的位置:首页 > 编程语言 > Python开发

Python爬虫初学(2)豆瓣电影top250评论数

2014-03-28 11:36 1126 查看
#作者:Nonikka
#版本:0.3
#2014年3月28日
import os,urllib.request,re

# URL of the first page of the Douban Top 250 listing.
# NOTE: only this first listing page is fetched, exactly as in the original
# script; later listing pages are not followed.
TOP250_URL = 'http://movie.douban.com/top250'

# File that receives the ranking, one "(name, votes)" tuple per line.
OUTPUT_PATH = 'data.txt'

# Captures the detail-page URL of every movie on the listing page.
re250 = re.compile(r'<div class="info">\s+<div class="hd">\s+<a href="(.+?)" class="">', re.DOTALL)
# Captures the rating count on a movie detail page.
# NOTE: the pattern only accepts counts of 4 to 7 digits.
r_number = re.compile(r'<a href.+?(\d{4,7})</span>人评价', re.DOTALL)
# Captures the movie title on a movie detail page.
r_name = re.compile(r'<span property="v:itemreviewed">(.+?)</span>', re.DOTALL)


def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8.

    Raises:
        OSError: if the request fails (``urllib.error.URLError`` is a
            subclass of ``OSError``).
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def extract_movie_links(listing_html):
    """Return the list of movie detail-page URLs found in *listing_html*."""
    return re250.findall(listing_html)


def extract_name_and_votes(movie_html):
    """Return ``(name, votes)`` parsed from a movie detail page.

    ``votes`` is converted to ``int``. Returns ``None`` when either the
    title or the rating count cannot be found, instead of raising
    ``IndexError`` on the missing match.
    """
    names = r_name.findall(movie_html)
    counts = r_number.findall(movie_html)
    if not names or not counts:
        return None
    # Only the first match of each pattern is used, as in the original script.
    return names[0], int(counts[0])


def rank_movies(entries):
    """Return ``(name, votes)`` pairs sorted by votes, highest first.

    *entries* is an iterable of ``(name, votes)`` pairs. Pairs are collected
    into a dict first, so when a name occurs more than once the last
    occurrence wins.
    """
    votes_by_name = dict(entries)
    return sorted(votes_by_name.items(), key=lambda item: item[1], reverse=True)


def main():
    """Scrape the listing page, rank movies by rating count, write the file."""
    try:
        listing_html = fetch_page(TOP250_URL)
    except OSError:
        # Stop here: without the listing page there is nothing to process.
        print("link Error")
        return

    entries = []
    for link in extract_movie_links(listing_html):
        try:
            movie_html = fetch_page(link)
        except OSError:
            # One unreachable page should not abort the whole run.
            print("link Error", link)
            continue
        parsed = extract_name_and_votes(movie_html)
        if parsed is None:
            print("parse Error", link)
            continue
        print(parsed[0], parsed[1])
        entries.append(parsed)

    # Every collected entry is ranked. The original converted its lists with
    # pop(0) while iterating over them, which silently dropped about half of
    # the entries and truncated the output.
    ranking = rank_movies(entries)

    # Explicit UTF-8 so titles never fail to encode under the locale codec.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        for entry in ranking:
            out.write(str(entry) + '\n')


if __name__ == "__main__":
    main()
    # "pause" is a Windows cmd builtin; it keeps the console window open.
    if os.name == 'nt':
        os.system("pause")
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: