Python 模拟浏览器登录淘宝并抓取内容
2012-12-18 09:38
579 查看
# -*- coding: utf-8 -*-
"""Log in to Taobao through a browser-like HTTP session.

Importing this module installs a global urllib opener that keeps cookies
between requests and sets the process-wide default socket timeout.
Running it as a script prompts for a password and attempts one login.

The module runs on both Python 2 and Python 3: the Python 3 equivalents of
``urllib2`` / ``cookielib`` / ``raw_input`` are bound to the Python 2 names.
"""
import urllib
import socket
import json
import re
import os
import time
import datetime

try:  # Python 2
    import urllib2
    import cookielib
    from urllib import urlencode as _urlencode
except ImportError:  # Python 3
    import urllib.request as urllib2
    import http.cookiejar as cookielib
    from urllib.parse import urlencode as _urlencode

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input

# NOTE: gzip support (gzipSupport.ContentEncodingProcessor) is intentionally
# not wired in; "Accept-Encoding" is therefore not sent by get_headers().

# Default socket timeout (seconds) applied to every connection in the process.
timeout = 20
timesleep = 10
socket.setdefaulttimeout(timeout)

httpHandler = urllib2.HTTPHandler()
httpsHandler = urllib2.HTTPSHandler()

# Cookie support: one shared jar so the login session survives across calls.
cookie = cookielib.CookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cookie)

opener = urllib2.build_opener(cookie_support, httpHandler, httpsHandler)
urllib2.install_opener(opener)


def get_headers():
    """Return a fresh dict of HTTP headers imitating Firefox 3.6 on Windows."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-cn,zh;q=0.5",
        "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "115",
        "Connection": "keep-alive",
    }
    return headers


def get_login_data():
    """Return a fresh dict with the static fields of the login form.

    ``_tb_token_`` and ``TPL_password`` are not included; ``login`` adds them.
    The user name is a placeholder and is sent GBK-encoded.
    """
    login_data = {
        'TPL_username': u'用户名'.encode('gbk'),
        'action': 'Authenticator',
        'event_submit_do_login': 'anything',
        'TPL_redirect_url': '',
        'from': 'tb',
        'fc': '2',
        'style': 'default',
        'css_style': '',
        'tid': '',
        'support': '000001',
        'CtrlVersion': '1,0,0,7',
        'loginType': '3',
        'minititle': '',
        'minipara': '',
        'pstrong': '3',
        'longLogin': '-1',
        'llnick': '',
        'sign': '',
        'need_sign': '',
        'isIgnore': '',
        'popid': '',
        'callback': '',
        'guf': '',
        'not_duplite_str': '',
        'need_user_id': '',
        'poy': '',
        'gvfdcname': '10',
        'from_encoding': '',
    }
    return login_data


def _as_text(body):
    """Return *body* as a native ``str`` so ``str`` regexes can search it.

    On Python 2 the response body already is a ``str``. On Python 3 it is
    ``bytes`` and gets decoded; GBK is assumed here because the form itself
    is submitted in GBK (TODO confirm against the page's declared charset).
    Undecodable bytes are replaced rather than raising, since only ASCII
    fragments of the page are matched.
    """
    if isinstance(body, str):
        return body
    return body.decode('gbk', 'replace')


def login(source=None):
    """Submit the login form and follow the redirect it answers with.

    Args:
        source: Already-downloaded HTML of the login page. When falsy, the
            page is fetched first.

    Returns:
        True when the server's answer contains a ``window.location``
        redirect (which is then requested), False otherwise - including
        when either HTTP request fails.
    """
    url = 'https://login.taobao.com/member/login.jhtml'
    if not source:
        source = request(url=url)
    if source is None:
        # request() already reported the failure; without the page there is
        # no token to extract, so stop instead of crashing in re.findall.
        print("login failed, could not load the login page")
        return False

    token_list = re.findall(
        r"input name='_tb_token_' type='hidden' value='([a-zA-Z0-9]+)'",
        _as_text(source),
    )
    login_data = get_login_data()
    login_data['_tb_token_'] = token_list[0] if token_list else ''
    login_data['TPL_password'] = _read_line("input password:")

    source = request(url=url, data=login_data)
    if source is None:
        print("login failed, no response to the login request")
        return False

    # Stop at the closing quote of the first assignment; a greedy
    # "anything" group would run on to the last '";' in the page.
    match = re.search(r'window\.location = "([^"]+)";', _as_text(source))
    if match is None:
        print("login failed, valid password and try again")
        return False

    request(url=match.group(1))
    return True


def request(url, headers=None, data=None):
    """Fetch *url* through the installed opener and return the raw body.

    Args:
        url: Address to request.
        headers: Header dict; defaults to ``get_headers()``.
        data: Mapping of form fields. When non-empty it is URL-encoded and
            the request is sent as a POST; otherwise a GET is sent.

    Returns:
        The response body as returned by ``read()``, or None when opening
        or reading the URL fails (the failure is printed, not raised).
    """
    if headers is None:
        headers = get_headers()

    payload = _urlencode(data) if data else None
    if payload is not None and not isinstance(payload, bytes):
        # Python 3: urlencode() returns text but Request needs bytes.
        # The encoded form is pure ASCII by construction.
        payload = payload.encode('ascii')

    req = urllib2.Request(url=url, data=payload, headers=headers)
    try:
        response = urllib2.urlopen(req)
        try:
            source = response.read()
        finally:
            response.close()
    except Exception as exc:
        # Deliberately best-effort: callers test for None. Unlike a bare
        # "except:", Ctrl-C and SystemExit still propagate.
        print("request failed: %s" % (exc,))
        return None
    return source


if __name__ == "__main__":
    login()
相关文章推荐
- Python模拟浏览器登录淘宝
- Python使用Selenium模块实现模拟浏览器抓取淘宝商品美食信息功能示例
- python 爬照片 模拟浏览器 先登录账号
- Python项目模拟登录学校正方教务系统抓取课程表。
- python+selenium模拟浏览器登录shibboleth登录的模拟
- Python爬虫实战03:用Selenium模拟浏览器爬取淘宝美食
- Python爬虫实战(5):模拟登录淘宝并获取所有订单(1)
- Python网页抓取、模拟登录
- 使用Python实现自动化抓取浏览器内容、提交结果
- python模拟浏览器webdriver登陆网站后抓取页面并输出
- python 模拟抓取手机访问页面内容今天
- PHP CURL模拟登录新浪微博抓取页面内容 基于EaglePHP框架开发
- Python爬虫,抓取淘宝商品评论内容
- Python爬虫实战(5):模拟登录淘宝并获取所有订单(1)
- python 利用浏览器 Cookie 模拟登录的用户访问知乎
- python3实现网络爬虫(5)--模拟浏览器抓取网页
- python实现简单爬虫模拟登录淘宝(cookie简介)
- 【网络爬虫】【python】网络爬虫(三):模拟登录——伪装浏览器登录爬取过程
- python登录新浪微博并抓取内容