您的位置:首页 > 编程语言 > Python开发

python3 爬取猫眼榜单top100(requests+beautifulsoup)

2018-01-04 15:36 806 查看
初学python,记录学习过程。

爬取的url:http://maoyan.com/board/4

共十页,每页10部电影。第二页的url:http://maoyan.com/board/4?offset=10 ,之后每翻一页 offset 递增10,以此类推。

源码如下:

#猫眼电影TOP100

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
def find_last(string, str):
    """Return the index of the last occurrence of ``str`` in ``string``.

    Args:
        string: The text to search in.
        str: The substring to look for. The name shadows the builtin inside
            this function only; it is kept so existing callers that pass it
            by keyword keep working.

    Returns:
        The zero-based index where the last occurrence starts, or -1 when
        ``str`` does not occur in ``string``.
    """
    # str.rfind performs the same right-most search the manual loop did,
    # including for overlapping matches and the empty substring.
    return string.rfind(str)

def get_html(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200. ``None`` for any
        other status code, or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    # Keep the try body to the single call that can raise a request error.
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None

    if response.status_code == 200:
        return response.text
    return None

def parse_html(html):
    """Extract names, cast, release dates and scores from one board page.

    Args:
        html: Markup of a board page. ``None`` or an empty string is treated
            as a page without entries instead of raising.

    Returns:
        A 4-tuple ``(names, star, time, score)`` of lists of strings:
        movie names, cast with the '主演' label removed, release dates with
        the '上映时间' label removed, and scores built by joining the integer
        and fraction parts. The lists line up by position only as long as
        every movie on the page carries all of these elements.
    """
    if not html:
        return [], [], [], []

    soup = BeautifulSoup(html, "html.parser")

    names = [
        _tag_text(item.a)
        for item in soup.select(".movie-item-info > .name")
    ]
    star = [
        _after_label(_tag_text(tag), '主演')
        for tag in soup.select(".movie-item-info > .star")
    ]
    time = [
        _after_label(_tag_text(tag), '上映时间')
        for tag in soup.select(".movie-item-info > .releasetime")
    ]
    # The score is rendered as two sibling elements, e.g. "9." and "5".
    score = [
        _tag_text(whole) + _tag_text(part)
        for whole, part in zip(
            soup.select(".score > .integer"),
            soup.select(".score > .fraction"),
        )
    ]
    return names, star, time, score


def _tag_text(tag):
    """Return the text inside ``tag``, or '' when the tag is missing."""
    # get_text() always yields a string, whereas .string is None for tags
    # with more than one child.
    return tag.get_text() if tag is not None else ''


def _after_label(text, label):
    """Return ``text`` without surrounding whitespace and a leading label.

    Everything up to and including ``label`` and the colon that follows it
    is dropped. When ``label`` is absent the trimmed text is returned whole
    rather than being cut at an arbitrary position.
    """
    cleaned = text.strip()
    start = cleaned.find(label)
    if start == -1:
        return cleaned
    # Accept both the full-width and the ASCII colon after the label.
    return cleaned[start + len(label):].lstrip('：:').strip()

def write_to_file(names, stars, times, scores, path='result.txt'):
    """Append one formatted line per movie to a UTF-8 text file.

    Args:
        names: Movie names.
        stars: Cast strings, one per movie.
        times: Release-date strings, one per movie.
        scores: Score strings, one per movie.
        path: File to append to; defaults to ``result.txt`` in the current
            working directory.

    The four sequences are combined with ``zip``, so output stops at the
    shortest one. No header row is written.
    """
    lines = [
        ('%-15s %-20s %-17s %-3s' % (name, cast, released, rating)) + '\n'
        for name, cast, released, rating in zip(names, stars, times, scores)
    ]
    # The context manager closes the file even when a write fails.
    with open(path, 'a', encoding='utf-8') as out:
        out.writelines(lines)

def main(offset):
    """Scrape the board page at ``offset`` and append its entries to disk.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.

    A page that cannot be fetched is reported and skipped instead of being
    passed on to the parser.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_html(url)
    if html is None:
        # get_html returns None on a non-200 status or a request error.
        print('Skipping offset %s: page could not be fetched' % (offset,))
        return

    names, stars, times, scores = parse_html(html)
    write_to_file(names, stars, times, scores)

if __name__ == '__main__':
    # Walk the ten board pages: offsets 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(page_offset)

结果如下:




内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 爬虫 猫眼 电影