您的位置:首页 > 理论基础 > 计算机网络

简易的网络爬虫代码-python

2018-01-10 16:50 399 查看
#encoding:utf-8

# Example: scrape the Mtime (时光网) movie box-office ranking pages.
# author:hou

from bs4 import BeautifulSoup
import urllib
import urlparse
import sys
# Python 2-only hack: re-expose sys.setdefaultencoding (hidden by site.py)
# and switch the implicit str<->unicode codec to UTF-8 so the Chinese text
# scraped below does not raise UnicodeDecodeError on implicit conversions.
reload(sys)
sys.setdefaultencoding('utf8')

# Extract the query-string parameters from a URL.
def geturlparams(url):
    """Return the query string of *url* as a dict of parameter -> value.

    ``parse_qs`` maps each key to a *list* of values; only the first
    value is kept, e.g. 'http://x/?a=1&b=2' -> {'a': '1', 'b': '2'}.
    An URL without a query string yields an empty dict.
    """
    # Local guarded import keeps this working on both Python 3
    # (urllib.parse) and Python 2 (the urlparse module imported above).
    try:
        from urllib.parse import urlparse as _urlparse, parse_qs
    except ImportError:  # Python 2
        from urlparse import urlparse as _urlparse, parse_qs
    query = _urlparse(url).query
    return {k: v[0] for k, v in parse_qs(query).items()}

# Build the URL of one Mtime box-office ranking page.
def geturl(year, page, area):
    """Return the ranking-page URL for the given *year*, *page* and *area*.

    The ``timestamp``/``version`` values are fixed tokens copied from the
    site's own requests.  BUG FIX: in the original source the
    '&timestamp=' parameter had been garbled to '×tamp=' by an
    HTML-entity bug ('&times' rendered as the multiplication sign).
    """
    return ('http://movie.mtime.com/boxoffice/?year=%s&area=%s'
            '&type=MovieRankingYear&category=all&page=%s'
            '&display=list&timestamp=1515392838844'
            '&version=07bb781100018dd58eafc3b35d42686804c6df8d'
            '&dataType=json') % (year, area, page)

# Parse one ranking page and export the film data it contains.
def export(file, url):
    """Fetch *url*, parse every <dd> entry and append its data to *file*.

    Each <dd> holds one film: <i> = rank, <h3> = title, <h4> = box-office
    figure, plus one or more <p> detail lines.  Each value is printed to
    stdout and appended to *file* via record(); a row of asterisks
    separates films.  A page with no <dd> tags writes nothing.

    NOTE(review): urllib.urlopen is the Python 2 API; on Python 3 this
    would be urllib.request.urlopen.
    """
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    for entry in soup.find_all('dd'):
        print(entry.i.string)
        print(entry.h3.string)
        print(entry.h4.string)
        record(file, entry.i.string)
        record(file, entry.h3.string)
        record(file, entry.h4.string)
        for detail in entry.findAll('p'):
            text = detail.get_text().encode('utf-8')
            print(text)
            record(file, text)
        print("************************************************")
        # The original source was mangled by HTML residue at this point;
        # the intended call is the separator write below.
        record(file, "************************************************")

# 记录数据到文件中
def record(file,param):
file.write(param + '\r\n')

# Scrape every (page, year) combination for one area.
def scraping(file, pagecount, minyear, maxyear, area):
    """Export pages 0..pagecount-1 for years minyear..maxyear-1 in *area*.

    Builds each page URL with geturl() and writes its data with export().
    """
    page = 0
    while page < pagecount:
        year = minyear
        while year < maxyear:
            target = geturl(year, page, area)
            export(file, target)
            year += 1
        page += 1

# # 定义全局参数
# area = ['china','NorthAmerica','global']
# # 定义抓取页数
# pagecount = 1
# # 定义抓取起始年份、终止年份
# minyear = 2016
# maxyear = 2017
# print "开始抓取数据,所需时间较长,请等待……"
#
# # 在脚本当前目录下创建保存的文件txt
# timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
# filename = 'Mtime film ranking_%s' % timestamp + '.txt'
# file_new = open (filename, 'w')
#
# # 导出中国排名
# record(file_new,"开始抓取内地票房,请等待……")
# scraping(file_new,pagecount,minyear,maxyear,area[0])
# record(file_new,"抓取内地票房完成!!!!!")
# record(file_new, "************************************************")
#
# # 导出北美排名
# record(file_new,"开始抓取北美票房,请等待……")
# scraping(file_new,pagecount,minyear,maxyear,area[1])
# record(file_new,"抓取北美票房完成!!!!!")
# record(file_new, "************************************************")
#
# # 导出全球排名
# record(file_new,"开始抓取全球票房,请等待……")
# scraping(file_new,pagecount,minyear,maxyear,area[2])
# record(file_new,"抓取全球票房完成!!!!!")
# record(file_new, "************************************************")
#
# file_new.close()
# print "全部数据已经抓取完成!!!可前去查看生成的文件:%s" % filename

# Demo: print the parsed query parameters of a sample ranking URL.
# BUG FIX: '&timestamp=' had been garbled to '×tamp=' in the original
# source by an HTML-entity bug ('&times' rendered as the multiplication
# sign); parenthesized print works on both Python 2 and 3.
print(geturlparams('http://movie.mtime.com/boxoffice/?year=2016&area=china&type=MovieRankingYear&category=all&page=0&display=list&timestamp=1515573624641&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json'))
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: