您的位置:首页 > 编程语言 > Python开发

python爬取知乎话题的精华问题下的用户信息

2016-01-12 21:22 676 查看
今天试着用自己的爬虫代码爬取了知乎【同性恋】话题下的所有精华问题的用户位置信息

代码:

__author__ = 'yang'
# -*- coding: utf-8 -*-

import configparser
import requests
import time
import re
import string

def curTime():
    """Return the current local time wrapped in an HTML comment.

    Example: '\n<!--2016-01-12 21:22:33-->'.  Used as a marker line at
    the top of the scraped output files.
    """
    # Bind to a distinct local name: the original rebound 'curTime',
    # shadowing the function inside its own body.
    now = time.strftime('%Y-%m-%d %H:%M:%S')
    return '\n<!--' + now + '-->'

def loginInfo():
    """Load the Zhihu account name, password and browser cookies.

    Reads 'test.ini', which holds the account in a [USER] section
    (username/password) and the browser cookies in a [COOKIES] section.

    Returns:
        (username, password, cookies) where cookies is a plain dict.
    """
    parser = configparser.ConfigParser()
    parser.read('test.ini')  # test.ini holds the Zhihu account, password and browser cookies
    cookie_dict = dict(parser.items('COOKIES'))
    user = parser.get("USER", "username")
    pwd = parser.get("USER", "password")
    return user, pwd, cookie_dict

def create_session():
    """Log in to Zhihu and return a (requests.Session, cookie dict) pair.

    Tries an email/password login first; when the server rejects it
    (JSON field 'r' == 1), falls back to the browser cookies loaded
    from the config file.  The final reply page is dumped to
    'login.html' for debugging.

    Raises:
        ValueError: if the password login fails and no non-empty
            cookie is configured either.
    """
    username, password, cookies = loginInfo()
    session = requests.session()
    login_data = {'email': username, 'password': password}

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/'
    }

    r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header)

    if r.json()['r'] == 1:
        # 'r' == 1 means the password login was rejected; report why,
        # then retry with the saved browser cookies.
        # (Py3 print() calls: the file imports the Py3 'configparser',
        # so the original Py2 print statements were syntax errors.)
        print('Login Failed, reason is:')
        for m in r.json()['data']:
            print(r.json()['data'][m])
        print('Use cookies to login...')
        # Only fall back when at least one real cookie value exists.
        has_cookies = any(key != '__name__' and value != ''
                          for key, value in cookies.items())
        if not has_cookies:
            raise ValueError('请填写config.ini文件中的cookies项')
        r = session.get('http://www.zhihu.com/login/email', cookies=cookies)

    # r.content is bytes in Py3, so the debug dump must be binary-mode.
    with open('login.html', 'wb') as fp:
        fp.write(r.content)

    return session, cookies

def writeFile(name, content):
    """Write *content* to the file called *name*, replacing any previous contents."""
    with open(name, 'w') as out:
        out.write(content)

if __name__ == '__main__':

    requests_session, requests_cookies = create_session()

    # Start the topic dump with a timestamp marker.
    with open('tong.html', 'w') as fp:
        fp.write(curTime())

    # Fetch every page of the topic's top answers and append to tong.html.
    for page_no in range(0, 49):
        # NOTE(review): the original appended the bare number after '?';
        # the Zhihu pager expects 'page=<n>' — confirm against the site.
        url = 'https://www.zhihu.com/topic/19552984/top-answers?page=' + str(page_no)
        html = requests_session.get(url, cookies=requests_cookies).text
        # 'with' replaces the Py2-only, never-closed file() handles.
        with open('tong.html', 'a+') as fp:
            fp.write(html)

    # Match the question links in the fetched pages.  (The original read
    # 'url.html', which is never written — the pages live in 'tong.html'.
    # It also rebound the builtin 'str' to this regex, which broke the
    # later 'str(num)' calls.)
    questionRegex = re.compile(r'<a class="question_link.*?href="(.*?)">')
    with open('tong.html') as fp:
        content = fp.read()
    questionLinks = questionRegex.findall(content)
    print(questionLinks)
    with open('resultLink.html', 'w') as fp:
        fp.write('\n'.join(questionLinks))

    with open('resultLink.html') as fp:
        questionLinks = fp.readlines()

    # Visit each question and collect the answer authors' profile links.
    usrRegex = re.compile(r'<a class="author-link.*?href="(.*?)">')
    for link in questionLinks:
        question_path = link.strip()
        url = 'https://www.zhihu.com' + question_path
        html = requests_session.get(url, cookies=requests_cookies).text  # question page
        usrLinks = usrRegex.findall(html)
        with open('usrLinks.html', 'a+') as fp:
            fp.write('\n'.join(usrLinks))

    with open('usrLinks.html') as fp:
        user_lines = fp.readlines()

    # De-duplicate the user profile links.
    links = list(set(line.strip() for line in user_lines))
    #print(len(links))

    # Fetch each user's profile page and save the location field.
    locationRegex = re.compile(r'<span class="location item.*?title="(.*?)"')
    for link in links:
        url = 'https://www.zhihu.com' + link
        html = requests_session.get(url, cookies=requests_cookies).text

        # findall() returns a list; the original did list + '\n', which
        # raises TypeError.  Join the matches, one location per line.
        found = locationRegex.findall(html)
        if found:
            with open('locations.html', 'a+') as fp:
                fp.write('\n'.join(found) + '\n')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: