
Fetching your own Zhihu followee list with Python and batch-inserting it into a local MySQL database

2017-01-12 11:26
First simulate a login and save the cookies locally. The URL the code uses to fetch the followee list dates from before the Zhihu desktop-site redesign in 2016: it returns a batch of JSON data that the page appends as you scroll down, and the request has to carry an _xsrf token and a hash_id. After the redesign around November 2016 there is a new API that needs neither _xsrf nor hash_id, only the user's internal name; however, its response no longer includes upvote counts or question counts.
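
The login step itself is not included in the script below; what follows is a rough sketch of how the cookies file that getsession() loads could have been produced. The /login/email endpoint and its form fields reflect my reading of the pre-redesign site and are assumptions here (no captcha handling), so treat it as an outline rather than working code:

import http.cookiejar
import re
import requests

def login(email, password):
    session = requests.session()
    session.cookies = http.cookiejar.LWPCookieJar(filename="cookies")
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The _xsrf token must accompany the login POST, just as in get_xsrf() below
    html = session.get('https://www.zhihu.com', headers=headers).text
    xsrf = re.findall(r'name="_xsrf" value="(.*?)"', html)[0]
    # Hypothetical pre-redesign login form; field names are assumptions
    postdata = {'_xsrf': xsrf, 'email': email, 'password': password, 'remember_me': 'true'}
    session.post('https://www.zhihu.com/login/email', data=postdata, headers=headers)
    # Persist the cookies so getsession() can reload them later
    session.cookies.save(ignore_discard=True, ignore_expires=True)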

The MySQL batch insert uses pymysql's executemany method.
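
As a standalone illustration: executemany takes one parameterized statement plus a sequence of tuples and runs the insert once per tuple. A minimal sketch against a hypothetical demo table (the table name, columns, and credentials are placeholders):

import pymysql

connection = pymysql.connect(host='localhost', user='root', password='secret',
                             db='test', charset='utf8')
rows = [(1, 'ada'), (2, 'bob'), (3, 'eve')]
with connection.cursor() as cursor:
    # One parameterized statement, executed once per tuple in rows
    cursor.executemany("insert into demo(id, name) values (%s, %s)", rows)
connection.commit()
connection.close()

The complete script: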

import http.cookiejar
import requests
import re
import json
import math
import time
import pymysql.cursors
from zhihu.author import Author
from bs4 import BeautifulSoup
from collections import deque

authors = deque()  # module-level queue that getfollwer() fills with Author objects
agent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) " \
        "Chrome/46.0.2490.76 Mobile Safari/537.36"
headers = {
    'User-Agent': agent
}
# Target user: zui-jiu-qing-niu-4
def getsession():
    # Reload the cookies saved by the login step so we stay signed in
    session = requests.session()
    session.cookies = http.cookiejar.LWPCookieJar(filename="cookies")
    session.cookies.load(ignore_discard=True)
    return session

def followers_num(session, id):
    # Parse the followee count from the sidebar of the user's followees page
    res = session.get('https://www.zhihu.com/people/' + id + '/followees', headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    fonum = bs.find("div", {'class': 'zu-main-sidebar'}).find('a', {'class': 'item'}).find('strong').text
    fonum = int(fonum)
    return fonum

def get_xsrf(session):
    '''_xsrf is a dynamically generated token that must accompany POST requests'''
    index_url = 'http://www.zhihu.com'
    # Fetch the homepage and extract the _xsrf value from the login form
    index_page = session.get(index_url, headers=headers)
    html = index_page.text
    pattern = r'name="_xsrf" value="(.*?)"'
    # re.findall returns a list; the first match is the token
    _xsrf = re.findall(pattern, html)
    return str(_xsrf[0])

def getfollwer(fonum, session, xsrf):
    end = math.ceil(fonum / 20)  # the old endpoint returns 20 followees per page
    num = 1
    for x in range(0, end):
        beginnum = str(x * 20)
        postdata = {'method': 'next',
                    'params': '{"offset":' + beginnum + ',"order_by":"created","hash_id":"29d75b4013b4631aaf7fe5848f3f6113"}',
                    '_xsrf': xsrf}
        ress = session.post('https://www.zhihu.com/node/ProfileFolloweesListV2', data=postdata, headers=headers)
        jsons = json.loads(ress.content.decode('utf-8'))
        print(jsons['msg'])
        time.sleep(5)  # throttle requests to avoid being rate-limited
        # 'msg' is a list of HTML snippets, one per followee card
        for a in jsons['msg']:
            card = BeautifulSoup(a, 'html.parser')
            print(num)
            name = card.find('a', {'class': 'zg-link author-link'}).text
            print("User: " + name)
            homepage = card.find('a', {'class': 'zg-link author-link'})['href']
            id = homepage[29:]  # strip the 'https://www.zhihu.com/people/' prefix
            print(id)
            # The trailing slices strip the Chinese unit suffixes from the link texts
            follower_num = int(card.find('a', {'href': '/people/' + id + '/followers'}).text[:-4])
            print(follower_num)
            question_num = int(card.find('a', {'href': '/people/' + id + '/asks'}).text[:-3])
            print(question_num)
            answer_num = int(card.find('a', {'href': '/people/' + id + '/answers'}).text[:-3])
            print(answer_num)
            agree_num = int(card.find('a', {'href': '/people/' + id, 'class': 'zg-link-gray-normal'}).text[:-3])
            print(agree_num)
            author = Author(id, name, homepage, follower_num, question_num, answer_num, agree_num)
            authors.append(author)
            print(author.name)
            print(author.homepage)
            num = num + 1
        print("================================================================================================")
    return authors

def insertzhihu(authors):
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='159366',
                                 db='zhihu',
                                 charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    values = []
    for author in authors:
        value = (author.id, author.name, author.homepage, author.follower_num, author.question_num,
                 author.answer_num, author.agree_num)
        values.append(value)
    cursor = connection.cursor()
    # executemany runs the parameterized INSERT once per tuple in values
    cursor.executemany("insert into zhihu_author values(%s,%s,%s,%s,%s,%s,%s)", values)
    connection.commit()
    connection.close()

if __name__ == "__main__":
    session = getsession()
    xsrf = get_xsrf(session)
    fnum = followers_num(session, "zui-jiu-qing-niu-4")
    authors = getfollwer(fnum, session, xsrf)
    insertzhihu(authors)