您的位置:首页 > 产品设计 > UI/UE

第一个爬虫程序,基于requests和BeautifulSoup

2017-12-26 15:53 741 查看
断断续续学了1年多python,最近总算感觉自己入门了,记录下这几天用requests和BeautifulSoup写的爬虫。

python的环境是anaconda+pycharm。

直接上代码

# (stray '@requires_authorization' removed — blog-platform paste artifact, not valid code here)
"""
作者:西瓜不是我的
日期:2017年12月26日
功能:爬取全国每个城市各个监测点的AQI
版本:v11.0
"""
import requests
from bs4 import BeautifulSoup
import csv

def get_city_area_aqi(url):
    """Fetch one city's AQI table and pair each column header with its value.

    Scrapes the page's single <thead> for the column names and its single
    <tbody> for one <tr> per monitoring station, then returns a flat list of
    (column_name, value) tuples — one group of pairs per station row.
    """
    response = requests.get(url, timeout=20)
    soup = BeautifulSoup(response.text, 'lxml')
    # Column names come from the <th> cells of the (single) <thead>.
    header_cells = soup.find_all('thead')[0].find_all('th')
    column_names = [cell.text for cell in header_cells]
    # One <tr> per monitoring station inside the (single) <tbody>.
    station_rows = soup.find_all('tbody')[0].find_all('tr')

    pairs = []
    for station_row in station_rows:
        # The row's text, split on newlines, yields the cell values in order.
        values = station_row.text.strip().split('\n')
        for idx, column_name in enumerate(column_names):
            pairs.append((column_name, values[idx]))
    return pairs

def write_to_csv(final_list, city_name):
    """Append one city's AQI data rows to aqi.csv.

    ``final_list`` is a flat list of (column_name, value) tuples in which
    every consecutive run of 11 values belongs to one monitoring station,
    so a CSV row (city name + 11 values) is flushed after each run of 11.

    NOTE(fix): the original source had a stray ``4000`` artifact line (a
    blog-platform paste corruption) inside the ``if`` body, which broke the
    function; it has been removed.
    """
    with open('aqi.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        row = [city_name]
        for i, pair in enumerate(final_list):
            row.append(pair[1])
            # Every 11 values complete one station's record: write it out
            # and start a fresh row for the next station.
            if (i + 1) % 11 == 0:
                writer.writerow(row)
                row = [city_name]

def get_all_city_name(url):
    """Scrape the front page and return (chinese_name, pinyin) for each city.

    The second ``div.bottom`` container holds the full list of city links;
    each <a>'s text is the Chinese name, and its href (minus the leading
    '/') is the pinyin slug used to build the per-city URL.
    """
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')
    link_container = soup.find_all('div', {'class': 'bottom'})[1]
    return [(anchor.text, anchor['href'][1:])
            for anchor in link_container.find_all('a')]

def main():
    """Crawl pm25.in: write a CSV header row, then append the station-level
    AQI data for the first 10 cities listed on the front page."""
    base_url = 'http://www.pm25.in'
    all_city_list = get_all_city_name(base_url)
    # Header (title) row of the CSV file: city name + 11 data columns,
    # matching the 11-value groups flushed by write_to_csv.
    header = ['city', '监测点', 'AQI', '空气质量', '首要污染物', 'PM2.5',
              'PM10', 'CO', 'NO2', 'O3-1', 'O3-8', 'SO2']
    with open('aqi.csv', 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(header)
    # For each city: build its URL from the pinyin slug, fetch the
    # station-level AQI pairs, and append them to the CSV file.
    # FIX: only crawl 10 cities as intended — the original
    # `if i == 10: break` ran *after* processing index 10, so it
    # actually crawled 11 cities (off by one).
    for city_name, city_pinyin in all_city_list[:10]:
        city_url = 'http://www.pm25.in/' + city_pinyin
        write_to_csv(get_city_area_aqi(city_url), city_name)

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()




第一次用markdown,就写这么多。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: