您的位置:首页 > 编程语言 > Python开发

【Python】抓取人人都是产品经理的文章

2017-10-09 12:03 741 查看

简介

使用python3.5

支持自动切换User-Agent(基于fake_useragent)

支持增量爬取(基于pybloom)

支持中断续爬

代码

# -*- coding: utf-8 -*-
#-------------------------------------
# author: maqingxiong
# date:   2017-10-09
# desc:   抓取人人都是产品经理最新文章
#-------------------------------------

import requests
import sys
import io
# Re-wrap stdout so everything printed by this script is encoded as GB18030.
# NOTE(review): presumably a workaround for UnicodeEncodeError when printing
# Chinese titles on a Chinese-locale Windows console -- confirm before
# running on a UTF-8 terminal, where this will garble the output.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

from pybloom import ScalableBloomFilter
from fake_useragent import UserAgent

class PmSpider(object):
def __init__(self):
self.session = requests.session()
self.ua = UserAgent()
self.sbf = ScalableBloomFilter()
self.load_url_in_bloomfilter()

def load_url_in_bloomfilter(self):
with open('crawled_url.txt', 'r') as f:
for line in f:
self.sbf.add(line.replace('\n', ''))

def construct_all_url(self):
url_list = []
base_url = 'http://www.woshipm.com/__api/v1/stream-list?paged={}&action=laodpost'
for i in range(1, 11):
url_list.append(base_url.format(str(i)))
return url_list

def get_req_headers(self):
headers = {
'User-Agent': self.ua.random,
'Host': 'www.woshipm.com',
'Referer': 'http://www.woshipm.com/'
}
return headers

def get_html(self, url):
headers = self.get_req_headers()
try:
response = self.session.get(url=url, headers=headers)
if response.status_code == 200:
return response.json()
else:
return None
except Exception as e:
return self.get_html(url=url)

def parse_html(self, html):
for item in html['payload']:
print(item['id'])
print(item['title'])
print(item['permalink'])
print(item['date'])
print(item['image'])

def run(self):
f = open("crawled_url.txt", 'w')
url_list = self.construct_all_url()
for url in url_list:
if url in self.sbf:
continue
else:
f.write(url+'\n')
html = self.get_html(url=url)
if html:
self.parse_html(html)
f.close()

# Script entry point: build the spider and crawl all listing pages once.
if __name__ == "__main__":
    PmSpider().run()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: