python获取本人关注列表并批量存入本地mysql数据库
2017-01-12 11:26
513 查看
先模拟登陆,将cookies保存到本地。代码中获得知乎关注列表的链接是16年知乎电脑网页版改版以前的,返回一组json数据,下拉自动填充网页,需要传xsrf 、hash_id。2016年11月左右知乎改版后有了新的api,新的api不需要获取xsrf和hashid,只要有内部的name就可以,不过返回的数据中没有了赞同数和提问问题数。
mysql批量插入用的是pymysql的executemany方法。
# -*- coding: utf-8 -*-
"""Crawl the followee list of a Zhihu user and bulk-insert it into MySQL.

Flow: load previously saved cookies -> fetch the dynamic _xsrf token ->
page through the (pre-2016-redesign) ProfileFolloweesListV2 JSON endpoint
20 entries at a time -> parse each followee's HTML snippet with
BeautifulSoup -> batch-insert the rows via pymysql's executemany.
"""
import http.cookiejar
import json
import math
import re
import time
from collections import deque

import pymysql.cursors
import requests
from bs4 import BeautifulSoup

from zhihu.author import Author

# Mobile UA — Zhihu serves a simpler page layout to mobile browsers.
agent = ("Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
         "AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/46.0.2490.76 Mobile Safari/537.36")
headers = {'User-Agent': agent}


def getsession():
    """Return a requests session with cookies loaded from the local "cookies" file.

    Assumes a prior simulated login has already saved the LWP cookie jar.
    """
    session = requests.session()
    session.cookies = http.cookiejar.LWPCookieJar(filename="cookies")
    session.cookies.load(ignore_discard=True)
    return session


def followers_num(session, id):
    """Return how many people user *id* follows, scraped from the profile sidebar."""
    res = session.get('https://www.zhihu.com/people/' + id + '/followees',
                      headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    fonum = bs.find("div", {'class': 'zu-main-sidebar'}) \
              .find('a', {'class': 'item'}).find('strong').text
    return int(fonum)


def get_xsrf(session):
    """Fetch the dynamic _xsrf CSRF token from the Zhihu home page."""
    index_page = session.get('http://www.zhihu.com', headers=headers)
    # re.findall returns a list; the first hit is the hidden form field value.
    matches = re.findall(r'name="_xsrf" value="(.*?)"', index_page.text)
    return str(matches[0])


def getfollwer(fonum, session, xsrf):
    """Page through the followee endpoint and return a deque of Author objects.

    fonum   -- total followee count (the endpoint returns 20 per page).
    session -- logged-in requests session.
    xsrf    -- CSRF token required by the POST endpoint.
    """
    # Local accumulator (the original mutated a module-level deque that
    # shadowed collections.deque, so repeated calls kept growing it).
    authors = deque()
    pages = math.ceil(fonum / 20)
    num = 1
    for page in range(pages):
        offset = str(page * 20)
        postdata = {
            'method': 'next',
            'params': '{"offset":' + offset +
                      ',"order_by":"created",'
                      '"hash_id":"29d75b4013b4631aaf7fe5848f3f6113"}',
            '_xsrf': xsrf,
        }
        ress = session.post('https://www.zhihu.com/node/ProfileFolloweesListV2',
                            data=postdata, headers=headers)
        jsons = json.loads(ress.content.decode('utf-8'))
        print(jsons['msg'])
        time.sleep(5)  # throttle requests to avoid the anti-crawler ban
        # jsons['msg'] is a list of HTML snippets, one per followee card.
        for snippet in jsons['msg']:
            soup = BeautifulSoup(snippet, 'html.parser')
            print(num)
            link = soup.find('a', {'class': 'zg-link author-link'})
            name = link.text
            print("用户:" + name)
            homepage = link['href']
            # Strip the 'https://www.zhihu.com/people/' prefix (29 chars).
            user_id = homepage[29:]
            print(user_id)
            # The trailing slices drop the Chinese unit suffix of each count
            # (e.g. " 关注者" / " 提问" / " 回答" / " 赞同").
            follower_num = int(soup.find(
                'a', {'href': '/people/' + user_id + '/followers'}).text[:-4])
            print(follower_num)
            question_num = int(soup.find(
                'a', {'href': '/people/' + user_id + '/asks'}).text[:-3])
            print(question_num)
            answer_num = int(soup.find(
                'a', {'href': '/people/' + user_id + '/answers'}).text[:-3])
            print(answer_num)
            agree_num = int(soup.find(
                'a', {'href': '/people/' + user_id,
                      'class': 'zg-link-gray-normal'}).text[:-3])
            print(agree_num)
            author = Author(user_id, name, homepage, follower_num,
                            question_num, answer_num, agree_num)
            authors.append(author)
            print(author.name)
            print(author.homepage)
            num = num + 1
        print("================================================================================================")
    return authors


def insertzhihu(deque):
    """Bulk-insert the crawled Author objects into the MySQL table zhihu_author.

    Uses a single parameterized executemany call (one round trip, no SQL
    string concatenation), then commits and always closes the connection.
    """
    connection = pymysql.connect(host='localhost', user='root',
                                 password='159366', db='zhihu',
                                 charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        values = [(author.id, author.name, author.homepage,
                   author.follower_num, author.question_num,
                   author.answer_num, author.agree_num)
                  for author in deque]
        with connection.cursor() as cursor:
            cursor.executemany(
                "insert into zhihu_author values(%s,%s,%s,%s,%s,%s,%s)",
                values)
        connection.commit()
    finally:
        connection.close()


if __name__ == "__main__":
    session = getsession()
    xsrf = get_xsrf(session)
    fnum = followers_num(session, "zui-jiu-qing-niu-4")
    authors = getfollwer(fnum, session, xsrf)
    insertzhihu(authors)
相关文章推荐
- 怎么获取本地时间存入mysql数据库,数据库类型是datetime
- 【原创】python爬虫获取网站数据并存入本地数据库
- Python批量获取京东商品列表信息
- python+微博API获取我的粉丝列表和关注列表信息(只能得到最新的30%)
- python爬虫获取数据后存入MySQL数据库中
- Python批量获取京东商品列表信息
- 新浪微博Python SDK笔记——获取粉丝列表或关注列表
- Python 获取本地IP
- Python:获取新浪微博用户的收听列表和粉丝列表
- python 获取网络时间及修改本地时间
- Python:获取新浪微博用户的收听列表和粉丝列表
- Python 获取目录下的文件列表与内容
- python 获取一定范围内不重复的多个随机数列表
- python获取本地的IP地址及mac地址
- Python - 获取校内(人人网)的所有好友照片存储到本地
- 使用Python模拟登录QQ邮箱获取QQ好友列表
- python模拟163登陆获取邮件列表
- C# 获取本地安装的软件列表
- python 获取一定范围内不重复的多个随机数列表
- 将CSDN600W用户及密码帐号存入本地MySql数据库