您的位置：首页 > 编程语言 > Python开发

Python3中urllib学习笔记

2017-06-19 15:34 267 查看

get请求

post请求
cookie的使用

代理服务器设置

异常处理以及日志输出

多线程

get请求

1.urllib.request.urlopen最简单的方式

# Simplest GET request: build a search URL and fetch it with urlopen().
import urllib.parse
import urllib.request

keyword = '哈哈'
# Percent-encode the keyword so it can be embedded safely in the query string.
kw = urllib.parse.quote(keyword)
url = 'http://www.baidu.com/s?ie=UTF-8&wd=' + kw
# The context manager closes the response even if read() raises.
with urllib.request.urlopen(url) as response:
    buff = response.read()
# read() returns bytes; decode them to text before printing.
html = buff.decode("utf8")
print(html)

2.使用Request的方式

# GET request issued through an explicit Request object.
import urllib.request

url = 'https://www.baidu.com/'
# Wrap the URL in a Request object; urlopen() accepts it in place of a URL string.
req = urllib.request.Request(url)
# The context manager closes the response even if read() raises.
with urllib.request.urlopen(req) as response:
    buff = response.read()
# read() returns bytes; decode them to text before printing.
html = buff.decode("utf8")
print(html)

3.自己创建build_opener

# GET request sent through a custom opener that carries a browser User-Agent.
import urllib.request

header = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36')]
# Create the opener and attach the default headers it sends with every request.
opener = urllib.request.build_opener()
opener.addheaders = header
# Install the opener globally so that urlopen() uses it from now on.
urllib.request.install_opener(opener)
# The context manager closes the response even if read() raises.
with urllib.request.urlopen('http://www.baidu.com/') as response:
    buff = response.read()
# read() returns bytes; decode them to text before printing.
html = buff.decode("utf8")
print(html)

4.urllib.request.urlretrieve远程下载

# Download a page straight to disk with urlretrieve(), using a custom User-Agent.
import urllib.request

header = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36')]
# Create the opener and attach the default headers it sends with every request.
opener = urllib.request.build_opener()
opener.addheaders = header
# Install the opener globally so that urlopen()/urlretrieve() use it.
urllib.request.install_opener(opener)
# Save the page as baidu.html in the current working directory.
urllib.request.urlretrieve('http://www.baidu.com/', 'baidu.html')
# Remove temporary files that earlier urlretrieve() calls may have left behind.
urllib.request.urlcleanup()

post请求

# POST request: send url-encoded form data to a test endpoint.
import urllib.request
import urllib.parse

url = 'http://www.iqianyue.com/mypost/'
# urlencode() turns the form fields into "key=value&key=value" text;
# encode() then converts that text to the UTF-8 bytes a request body requires.
postdata = urllib.parse.urlencode({'name': '测试名', 'pass': "123456"}).encode('utf-8')
# urllib.parse.quote() takes a single string, whereas
# urllib.parse.urlencode() takes a dict or a sequence of 2-tuples
# [(a, b), (c, d)] and joins the resulting key=value pairs with "&".
req = urllib.request.Request(url, postdata)
# Signature: urllib.request.Request(url, data=None, headers={},
#                                   origin_req_host=None, unverifiable=False,
#                                   method=None)
#   url:     string containing the URL.
#   data:    request body; when given, a POST is sent instead of a GET.
#   headers: dict of header names to values.
#   origin_req_host / unverifiable: related to third-party cookie handling.
# add_header() takes exactly two arguments: the header name and its value.
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
# The context manager closes the response even if read() raises.
with urllib.request.urlopen(req) as response:
    data = response.read()
# urlopen()'s own data parameter defaults to None; because the Request above
# already carries a body, the request is submitted as POST.

cookie的使用

1.获取Cookie保存到变量

# Capture the cookies a site sets into an in-memory CookieJar.
import urllib.request
import http.cookiejar

# CookieJar instance that will hold the cookies received from the server.
cookie = http.cookiejar.CookieJar()
# HTTPCookieProcessor stores response cookies in the jar and sends them back.
handler = urllib.request.HTTPCookieProcessor(cookie)
# Build an opener that routes requests through the cookie handler.
opener = urllib.request.build_opener(handler)
# Install the opener globally so that urlopen()/urlretrieve() use it.
urllib.request.install_opener(opener)
# Fetch the site; the cookies it sets end up in `cookie`.
urllib.request.urlretrieve('http://www.baidu.com/', 'baidu.html')
# Alternative: data = urllib.request.urlopen('http://www.baidu.com/')

2.保存cookies到文件

# Persist the cookies received from a site to a Mozilla-format text file.
import http.cookiejar
import urllib.request

# File the cookies are written to: cookie.txt in the current directory.
filename = 'cookie.txt'
# MozillaCookieJar keeps cookies in memory and can save them to `filename`.
cookie = http.cookiejar.MozillaCookieJar(filename)
# HTTPCookieProcessor stores response cookies in the jar and sends them back.
handler = urllib.request.HTTPCookieProcessor(cookie)
# Build an opener that routes requests through the cookie handler.
opener = urllib.request.build_opener(handler)
# opener.open() works like urllib.request.urlopen(); the response body is not
# needed here, so the response is simply closed again.
with opener.open("http://www.baidu.com"):
    pass
# Write the cookies to the file, including session and expired cookies.
cookie.save(ignore_discard=True, ignore_expires=True)

3.从文件中获取cookies并访问

# Load previously saved cookies from a file and send them with a request.
import http.cookiejar
import urllib.request

# Empty MozillaCookieJar that will receive the cookies read from disk.
cookie = http.cookiejar.MozillaCookieJar()
# Read the cookies from the file, including session and expired cookies.
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
# Request object for the page to visit.
req = urllib.request.Request("http://www.baidu.com")
# Opener whose cookie handler attaches the loaded cookies to the request.
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
# The context manager closes the response even if read() raises.
with opener.open(req) as response:
    print(response.read())

代理服务器设置

# Send requests through an HTTP/HTTPS proxy and time the round trip.
import socket
import time
import urllib.request

# Default socket timeout in seconds; it also bounds how long urlopen() waits.
socket.setdefaulttimeout(1)
# Record the start time so the request delay can be measured.
startime = time.time()
# Proxy server address, used for both http and https traffic.
proxy = urllib.request.ProxyHandler({'https': '175.155.25.91:808', 'http': '175.155.25.91:808'})
opener = urllib.request.build_opener(proxy)
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0'),
    # Optional extra headers, left disabled:
    # ("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
    # ("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"),
    # ("Accept-Encoding","gzip, deflate, br"),
    # ("Connection","keep-alive"),
    # ("Pragma","no-cache"),
    # ("Cache-Control","no-cache")
]
# Install the opener globally so that urlopen() goes through the proxy.
urllib.request.install_opener(opener)
# Alternative target: data = urllib.request.urlopen('https://www.iplocation.net/find-ip-address').read()
data = urllib.request.urlopen('http://www.ipip.net/').read().decode('utf-8')
# If the body is gzip-compressed, read() the raw bytes without decoding and use:
# data = gzip.decompress(data).decode('utf-8', 'ignore')
endtime = time.time()
# Elapsed time of the proxied request, in seconds.
delay = endtime - startime
print(data)

有时对urlopen返回的数据直接decode('utf-8')会失败，这通常是因为服务器返回的是gzip压缩后的内容，必须先用gzip.decompress(data)解压，再调用decode('utf-8','ignore')才能得到文本；猜测应该是请求header的问题（例如带了Accept-Encoding），换一个header有时会好

异常处理以及日志输出

# Enable urllib debug logging and handle request errors.
import urllib.error
import urllib.request

# debuglevel=1 on the HTTP and HTTPS handlers turns on their debug output.
httphd = urllib.request.HTTPHandler(debuglevel=1)
httpshd = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httphd, httpshd)
# Install the opener globally so that urlopen() uses the logging handlers.
urllib.request.install_opener(opener)
try:
    urllib.request.urlopen('http://www.baidu.com/')
except urllib.error.URLError as e:
    # Only some URLError instances carry an HTTP status code, so probe for
    # each attribute before printing it.
    if hasattr(e, 'code'):
        print('异常代码为：' + str(e.code))
    if hasattr(e, 'reason'):
        print('异常原因为：' + str(e.reason))
except Exception as e:
    # Top-level fallback for anything that is not a URLError.
    print('Exception错误为：' + str(e))

多线程

这里使用自带的threading

# Two Thread subclasses that run concurrently and print their own label.
import threading


class A(threading.Thread):
    """Thread that prints its label ten times."""

    def __init__(self):
        # Thread.__init__ must run before start() can be called.
        super().__init__()

    def run(self):
        # The body of run() executes in the new thread once start() is called.
        for _ in range(10):
            print('我是线程A')


class B(threading.Thread):
    """Thread that prints its label ten times."""

    def __init__(self):
        # Thread.__init__ must run before start() can be called.
        super().__init__()

    def run(self):
        # The body of run() executes in the new thread once start() is called.
        for _ in range(10):
            print('我是线程B')


def main():
    """Start one A thread and one B thread, then wait for both to finish."""
    t1 = A()
    # Start thread t1.
    t1.start()
    t2 = B()
    # Start thread t2.
    t2.start()
    # Wait for both threads so the caller knows all output has been produced.
    t1.join()
    t2.join()


if __name__ == '__main__':
    main()

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： python-爬虫 urllib python3

相关文章推荐

新的分享

章节导航