您的位置:首页 > 编程语言

今天写的一个用爬虫爬猫眼电影top100的完整代码

2018-04-02 17:03 711 查看
这是今天写的一份爬取猫眼电影 TOP100 榜单的完整代码:

# coding:utf-8
import json
import requests
from bs4 import  BeautifulSoup

def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    A desktop Firefox ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, or any other request
        failure).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (DNS, refused connection, timeout, bad URL):
        # callers treat None as "page unavailable".
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_one_page(html):
    """Extract the movie entries from one board page.

    Args:
        html: Page markup as ``str`` or ``bytes``. ``None`` or an empty
            value (e.g. a failed download) is accepted.

    Returns:
        A list with one dict per ``<dd>`` inside the
        ``<dl class="board-wrapper">`` element, with the keys ``rate``,
        ``figure``, ``movie``, ``cast``, ``releasetime`` and ``mark``.
        An empty list is returned when there is no markup or the page
        has no ``board-wrapper`` element.
    """
    if not html:
        return []

    # from_encoding only applies to byte input; passing it together with
    # str markup makes BeautifulSoup emit a warning and ignore it.
    if isinstance(html, bytes):
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    else:
        soup = BeautifulSoup(html, 'html.parser')

    board_wrapper = soup.find('dl', class_='board-wrapper')
    if board_wrapper is None:
        return []

    movies = []
    for entry in board_wrapper.find_all('dd'):
        # Text of the first <i> in the entry -- presumably the ranking
        # position on the board; confirm against the live page.
        rate = entry.find('i').get_text(strip=True)
        figure = entry.find('a', class_='image-link').find('img', class_='board-img')['data-src']
        movie = entry.find('p', class_='name').find('a', attrs={'data-act': 'boarditem-click'}).get_text(strip=True)
        # The slices drop a fixed-length leading label (3 and 5 characters
        # respectively) -- presumably "主演：" / "上映时间："; verify if the
        # page layout changes.
        cast = entry.find('p', class_='star').get_text(strip=True)[3:]
        releasetime = entry.find('p', class_='releasetime').get_text(strip=True)[5:]
        # The score is split across two <i> elements (integer + fraction).
        # get_text() is used instead of .string so an element without a
        # single text child yields '' rather than None.
        mark = entry.find('i', class_='integer').get_text() + entry.find('i', class_='fraction').get_text(strip=True)
        movies.append({
            'rate': rate,
            'figure': figure,
            'movie': movie,
            'cast': cast,
            'releasetime': releasetime,
            'mark': mark,
        })
    return movies

def write_content(content, path='D:/猫眼电影5.txt'):
    """Append each item of *content* to a text file as one JSON line.

    Args:
        content: Iterable of JSON-serialisable objects (the dicts produced
            by the page parser).
        path: Destination file. It is opened in append mode with UTF-8
            encoding, so repeated calls accumulate lines. Defaults to the
            location the script has always written to.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Chinese titles)
    # readable in the file instead of \uXXXX escapes. The with-block
    # closes the file, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in content)

if __name__ == '__main__':
    # Walk the board by advancing the ``offset`` query parameter in steps
    # of 10 for ten pages (offsets 0, 10, ..., 90).
    for page in range(10):
        url = 'http://maoyan.com/board/4?offset={}'.format(page * 10)
        html = get_one_page(url)
        if html is None:
            # get_one_page signals any download failure with None; skip
            # this page instead of letting the parser abort the crawl.
            print('skipped {}: download failed'.format(url))
            continue
        content = parse_one_page(html)
        write_content(content)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python爬虫