python模拟登陆篇——requests & urllib2方式 &有图片验证码情况
2015-11-22 19:50
519 查看
(一)基于python的requests库,模拟登陆1 爬取公司信息,不用保持cookie的情况,代码如下:
# -*- coding: utf-8 -*- import requests import requests import re import MySQLdb import mysql.connector import MySQLdb as mdb Conn = MySQLdb.connect(user='wenrui', passwd='wenrui', host='localhost', db='companydata', charset="utf8") cur = Conn.cursor() cur.execute( "CREATE TABLE CompanyData(name VARCHAR(70),realnam VARCHAR(20),phone VARCHAR(20),mail VARCHAR(20),Bigaddress varchar(10),Smladdress varchar(70))") Conn.commit() class Company: def __init__(self): self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6', 'Cookie': 'Hm_lvt_3ac4a19eb5f266a8046f9f5b29c52a00=1447818297; Hm_lpvt_3ac4a19eb5f266a8046f9f5b29c52a00=1447818297; so=296ef4827c1eabe98da0650c74e0bc7e; vr_1447818247=296ef4827c1eabe98da0650c74e0bc7e; un=2af45194a1467fca68d2ab4b6b0c083a58ebf139; zh_CN=zh_CN; PLAY_SESSION="74b2f9223df0e04bd8d69cc881072bb46fce9e63-userId=418920"' #直接把cookie写入请求头即可 } self.rname = ur'target="_blank">( .*?)</a>' self.rrealname = ur'联系人:(.{1,6})&' # 匹配联系人:后面任意一个1-4次,遇到&后结束 self.rphone = r'<label id=".*?">(.*?)</label>' self.rmail = ur'href="mailto.*>(.*?)</a>' self.raddress = ur"<font color='red'>(.*?)</font>(.*?)</p>" # self.raddress=ur'地.*址:(.*)' def getPage(self): for n in range(1, 21): # 提取前20页信息 try: siteurl = 'http://www.czvv.com/kp%dc210000ccs0m0e0f0d0.html' % n content = requests.get(url=siteurl, headers=self.headers, ).content.decode('utf-8').encode('gbk', 'ignore').decode( 'gbk') name = re.findall(self.rname, content) realname = re.findall(self.rrealname, content) phone = re.findall(self.rphone, content, re.S) mail = re.findall(self.rmail, content) address = re.findall(self.raddress, content, re.S) for i in range(0, 10): data = (name[i], realname[i], phone[i].strip(), mail[i], address[i][0], address[i][1]) cur.execute("INSERT INTO CompanyData VALUES (%s, %s, %s, %s, %s, %s)", data) Conn.commit() except: continue company = Company() 
company.getPage()2模拟登录CSDN博客,自动保存cookie,并用requests.Session()来保持cookie,后续可以直接访问其他网页
# -*- coding: utf-8 -*- import requests import os import re from cookielib import LWPCookieJar def toJson(str): #提取lt流水号,将数据化为一个字典 reg1 = r'<input type="hidden" name="lt" value="(.*?)" />' execution = re.findall(reg1,str) print execution return execution # cookie setting s = requests.Session() s.cookies = LWPCookieJar("cookiejar ") header = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0"} if not os.path.exists('cookiejar'): print "there is no cookie,setting" r = s.get("http://passport.csdn.net/account/login") soup = toJson(r.text) payload ={'username':'***','password':'***','lt':soup[0],'execution':'e1s1','_eventId':'submit'} print payload r = s.post("http://passport.csdn.net/account/login",data=payload,headers=header) s.cookies.save(ignore_discard=True) print r.text else: print "cookie exists,restore" s.cookies.load(ignore_discard=True) r = s.get("http://write.blog.csdn.net/postlist",headers=header) print r.text(二)基于python的urllib2库,模拟登陆1、爬取拉勾网——先登录,在进入我的简历的投递记录的链接(比较简单)
# -*- coding: utf-8 -*- import HTMLParser import urlparse import urllib import urllib2 import cookielib import string import re import sys reload(sys) sys.setdefaultencoding('utf-8') print sys.getdefaultencoding() # hosturl = 'http://passport.lagou.com/login/login.html' posturl = 'http://passport.lagou.com/login/login.json' cj = cookielib.LWPCookieJar() cookie_support = urllib2.HTTPCookieProcessor(cj) opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler) urllib2.install_opener(opener) #h = urllib2.urlopen(hosturl) headers = { 'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'} # 构造Post数据,他也是从抓大的包里分析得出的。 postData = {"isValidate": "true", "username": "932018594@qq.com", "password": "eae82fc22c9608fda4157ce770297228", "request_form_verifyCode": ""} # 需要给Post数据编码 postData = urllib.urlencode(postData) # 通过urllib2提供的request方法来向指定Url发送我们构造的数据,并完成登录过程 request = urllib2.Request(posturl, postData, headers) response = urllib2.urlopen(request) text = response.read() print text.decode("utf-8").encode("gbk") #打印登录后提交成功的那个页面 myemail = urllib2.urlopen(urllib2.Request('http://www.lagou.com/mycenter/delivery.html?tag=-1')).read() print myemail.encode('gbk', 'ignore') #打印我的简历投递记录的界面 # f = open('thefile.txt', 'w') # f.write(text) # f.close()2、爬取豆瓣网——先登录,在转到个人信息界面,需要手动输入验证码
# -*- coding: utf-8 -*-
import urllib
import urllib2
import cookielib
import re
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class DB(object):
    """Log into douban.com, solving the image captcha by hand, then open a
    profile page to confirm the login succeeded."""

    def __init__(self, email, passwd):
        self.url = "http://www.douban.com/accounts/login"
        self.post = {
            'form_email': email,
            'form_password': passwd,
            'source': 'index_nav'
        }
        cookie = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        self.opener.addheaders = [
            ("User-Agent",
             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36")]
        # First POST without a captcha; douban bounces back to the login
        # URL when a captcha is required.
        self.response = self.opener.open(self.url, urllib.urlencode(self.post))

    def login(self):
        # Still sitting on the login URL means the first POST was rejected
        # and the returned page carries a captcha challenge.
        if self.response.geturl() == self.url:
            print('logining...')
            html = self.response.read()
            reg = r'<img id="captcha_image" src="(.*?)" alt="captcha" class="captcha_image"/>'
            imglist = re.findall(reg, html)
            # captcha-id is a hidden form field that must be echoed back
            # with the solution.
            regid = r'<input type="hidden" name="captcha-id" value="(.*?)"/>'
            ids = re.findall(regid, html)
            if not imglist or not ids:
                # Robustness fix: the original indexed imglist[0]/ids[0]
                # unconditionally and crashed with IndexError when the
                # login page layout changed.
                print('captcha image or captcha-id not found; page layout may have changed')
                return
            urllib.urlretrieve(imglist[0], 'captcha.jpg')  # save captcha image
            captcha = raw_input('captcha is: ')  # typed in by the user
            self.post["captcha-solution"] = captcha
            self.post["captcha-id"] = ids[0]
            self.post["user_login"] = "登录"
            self.post["redir"] = 'http://www.douban.com/people/81157996/'
            self.response = self.opener.open(self.url, urllib.urlencode(self.post))
            print(self.response.read().encode("gbk", "ignore"))
        if self.response.geturl() == "http://www.douban.com/people/81157996/":
            print('login success !')


email = "*****"
passwd = "****"
my = DB(email, passwd)
my.login()
相关文章推荐
- python 数据库查询为字典是取对应的值
- python3 BIF里的并发与并行处理(IPC ITC)=>LTS
- [No000054] Windows 下Python3.5, NoteBook增强版安装
- python Class
- Head Frist Python 读书笔记 第五章 处理数据
- python常用package下载地址
- sympy —— Python 符号运算
- Python:unorderable types: str() > int()
- python之模块 os
- 简单解决Python文件中文编码问题
- Python制作简单的网页爬虫
- 在ubuntu下安装python imagine library
- Python——文档
- python模块之linecache
- Python带你轻松进行网页爬虫
- Python Coroutine 初探
- S折交叉验证 in Python
- 用python 爬糗事百科的段子-1
- python3爬虫问题 POST data should be bytes or an iterable of bytes
- Python抓取淘女郎网页信息以及代码下载