您的位置:首页 > 编程语言 > Python开发

Python模拟登陆知乎,获取收藏夹内容

2017-08-30 20:52 337 查看

最近在研究模拟登录,这里以模拟登录知乎为例,参考了知乎大神 xchaoinfo 的代码。

PS. 知乎可以通过手机号和邮箱两种方式登录,两者方法相同,这里模拟的是通过手机号登录。

主要用到了 requests 等库,用 requests 处理 cookies 更方便一点。requests.Session 能够跨请求保持 cookies:同一个 Session 实例发出的所有请求共享同一份 cookies,由 requests 模块自动处理。

PC 端的验证码需要选出倒立的汉字,处理起来比较麻烦,所以这里模拟手机端的登录过程,只需输入验证码即可。

import re
import requests
from bs4 import BeautifulSoup
from PIL import Image
import http.cookiejar
import time
import json

# A single shared Session so that cookies persist across every request
# made by the functions below.
session = requests.Session()
# Back the session with a cookie jar that loads from / saves to the local
# file named 'cookies'.
session.cookies = http.cookiejar.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except OSError:
    # A missing or unreadable cookie file raises OSError, and
    # http.cookiejar.LoadError (malformed file) is a subclass of it.
    # Catching only OSError keeps the best-effort behaviour without
    # swallowing KeyboardInterrupt or programming errors.
    print("Cookie 未能加载")

home_url = 'https://www.zhihu.com/'
# Request headers sent with every call; the User-Agent is that of an
# Android mobile browser.
headers = {
    'User-Agent':'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Referer':'https://www.zhihu.com/',
    'Host':'www.zhihu.com'
}

def islogin():
    """Return True when the current session is already logged in.

    Requests the profile settings page without following redirects and
    reports whether the response status code is exactly 200.
    """
    profile_url = 'https://www.zhihu.com/settings/profile'
    # Redirects are disabled on purpose, so only a direct 200 counts.
    response = session.get(profile_url, headers=headers, allow_redirects=False)
    return response.status_code == 200

def get_xsrf(url = 'https://www.zhihu.com/explore'):
    """Fetch *url* and return the value of its ``_xsrf`` form field.

    Args:
        url: Page to download and search for the token.

    Returns:
        The text captured from the ``_xsrf" value="..."`` attribute.

    Raises:
        RuntimeError: If the page contains no ``_xsrf`` field, instead of
            the opaque ``AttributeError`` from calling ``.group`` on None.
    """
    html = session.get(url, headers=headers)
    match = re.search(r'_xsrf" value="(.*?)"/>', html.text, re.S)
    if match is None:
        raise RuntimeError('_xsrf token not found in ' + url)
    return match.group(1)

def get_captcha():
    """Download the login captcha, display it, and return the user's answer.

    The image is written to ``captcha.jpg`` in the working directory, shown
    with PIL's ``Image.show``, and the text typed at the prompt is returned.
    """
    # Millisecond timestamp used as the ``r`` query parameter.
    timestamp = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + timestamp + "&type=login"
    r = session.get(captcha_url, headers=headers)
    # Save the captcha locally; the ``with`` block closes the file itself,
    # so no explicit close() is needed.
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    # The context manager closes the image even if show() raises.
    with Image.open('captcha.jpg') as im:
        im.show()
    return input('请输入验证码:')

def login(user_name,password):
    """Log in with a phone number and password, then save the cookies.

    A first attempt is made without a captcha. If the JSON reply has
    ``r == 1`` the attempt failed, so the captcha is requested from the
    user and the login is posted again with it.

    Args:
        user_name: Phone number, sent as the ``phone_num`` form field.
        password: Account password.
    """
    _xsrf = get_xsrf()
    post_url = 'https://www.zhihu.com/login/phone_num'
    post_data = {
        '_xsrf' : _xsrf,
        'password' : password,
        'phone_num' : user_name,
    }
    # Try to log in without entering a captcha first.
    response = session.post(post_url, data=post_data, headers=headers)
    if json.loads(response.text)['r'] == 1:
        # Login failed: a captcha is required, so ask for it and retry.
        post_data['captcha'] = get_captcha()
        response = session.post(post_url, data=post_data, headers=headers)
    # Print the login message from the final attempt. ``.get`` avoids a
    # KeyError (and therefore skipping the cookie save) if 'msg' is absent.
    print(json.loads(response.text).get('msg'))
    # Save the cookies so the next run can skip the login.
    session.cookies.save()

def show_mine():
    """Open the user's collections page and list one collection.

    Parses the page, takes the link inside the second ``h2`` element with
    class ``zm-item-title``, and passes its absolute URL to
    ``show_collection``.
    """
    mine_url = 'https://www.zhihu.com/collections/mine'
    page = session.get(mine_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html5lib')
    titles = soup.find_all('h2', class_='zm-item-title')
    # Index 1 selects the second matching heading on the page.
    link = titles[1].find('a')
    collection_url = 'https://www.zhihu.com' + str(link['href'])
    show_collection(collection_url)

def down_answer(url):
    """Fetch *url* and print the text of the page's ``<title>`` element."""
    response = session.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html5lib')
    page_title = soup.find('title').get_text()
    print(page_title)

def get_collection(url):
    """Print *url*, then print the title of every answer linked from it.

    Each ``<a class="toggle-expand">`` element on the page is resolved to
    an absolute URL and handed to ``down_answer``.

    Args:
        url: URL of one page of a collection.
    """
    print(url)
    content = session.get(url, headers=headers)
    soup = BeautifulSoup(content.text, 'html5lib')
    for answer in soup.find_all('a', class_="toggle-expand"):
        answer_href = answer['href']
        # Links beginning with https://zhuanlan are already absolute;
        # everything else is a path relative to www.zhihu.com.
        if answer_href.startswith('https://zhuanlan'):
            answer_url = answer_href
        else:
            answer_url = 'https://www.zhihu.com' + answer_href
        down_answer(answer_url)

def show_collection(url):
    """Walk every page of the collection at *url*.

    Reads the page count from the ``zm-invite-pager`` element, prints it,
    and calls ``get_collection`` for pages 1 through that count inclusive.

    Args:
        url: Base URL of the collection (without a ``?page=`` query).
    """
    coll_html = session.get(url, headers=headers)
    pager = BeautifulSoup(coll_html.text, 'html5lib').find('div', class_='zm-invite-pager')
    if pager is None:
        # No pager element on the page: treat the collection as one page
        # instead of failing with AttributeError on None.
        page_num = '1'
    else:
        # NOTE(review): presumably the second-to-last <span> holds the last
        # page number (the final one being the "next" link) - verify
        # against the page markup.
        page_num = pager.find_all('span')[-2].get_text()
    print(page_num)
    # +1 so the last page is included; range() excludes its upper bound.
    for page in range(1, int(page_num) + 1):
        page_url = url + '?page=' + str(page)
        get_collection(page_url)

if __name__ == '__main__':
    # Script entry point: log in only when the saved cookies are no longer
    # valid, then list the collection.
    if not islogin():
        user_name = input('请输入用户名:')
        password = input('请输入密码:')
        login(user_name, password)
    else:
        print('已登录')
    show_mine()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: