您的位置:首页 > 编程语言 > Python开发

Python3 urllib库爬虫 基础

2017-10-25 18:02 621 查看

基础

add_header()添加报头

import urllib.request

# Attach a browser-like User-Agent header with add_header() before sending
# the request.
url = "http://blog.csdn.net/yudiyanwang/article/details/78322039"
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0")
# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urllib.request.urlopen(req) as resp:
    data = resp.read()
print(data)


GET请求

import urllib.parse
import urllib.request

keyword = "hello"  # keyword to search for
# Build the query URL from the keyword instead of hard-coding it.
url = "http://www.baidu.com/s?wd=" + keyword
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as resp:
    data = resp.read()
with open("./result.txt", "wb") as fd:
    fd.write(data)

# The plain concatenation above hits an encoding error when the keyword is
# Chinese, so percent-encode the keyword before putting it in the URL.
keyword = "你好"
key_code = urllib.parse.quote(keyword)  # percent-encode the keyword
url = "http://www.baidu.com/s?wd=" + key_code
print(url)  # http://www.baidu.com/s?wd=%E4%BD%A0%E5%A5%BD
# A new Request must be built for the encoded URL; otherwise the previous
# request (for "hello") would simply be sent again.
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as resp:
    data = resp.read()
with open("./result.txt", "wb") as fd:
    fd.write(data)


post请求

# PHP页面
<form action="" method="post">
<input name="name" type="text" /><br>
<input name="pass" type="text" /><br>
<input name="" type="submit" value="submit"/>
</form>

import urllib.parse
import urllib.request

# Address the form is submitted to.
url = "http://192.168.1.108/login.html"
# Build the form fields and encode them: urlencode() produces the
# "name=...&pass=..." string, and encode() turns it into the bytes that
# Request expects as its data argument.
postdata = urllib.parse.urlencode({
    "name": "abcdef",
    "pass": "123456",
}).encode("utf-8")
# Create the Request from the URL and the data to send; supplying data makes
# this a POST request.
req = urllib.request.Request(url, postdata)
# Add the header information.
req.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0")

# Close the response once its body has been read.
with urllib.request.urlopen(req) as resp:
    data = resp.read()
with open("./post.txt", 'wb') as fd:
    fd.write(data)


一边运行 一边打印日志 开启DebugLog

import urllib.request

# Handlers created with debuglevel=1 print the HTTP / HTTPS exchange while
# the request runs.
httpd = urllib.request.HTTPHandler(debuglevel=1)
httpsd = urllib.request.HTTPSHandler(debuglevel=1)
# Install an opener built from both handlers globally, so that every later
# urllib.request.urlopen() call goes through them.
opener = urllib.request.build_opener(httpd, httpsd)
urllib.request.install_opener(opener)
# Read the body and close the response instead of keeping the open response
# object around under the name `data`.
with urllib.request.urlopen("http://edu.jd.com") as resp:
    data = resp.read()


异常

import urllib.error
import urllib.request

# URLError is raised when: 1. the remote server cannot be reached,
# 2. the remote URL does not exist, 3. there is no network,
# 4. an HTTPError was triggered.
try:
    with urllib.request.urlopen("http://blog.csdn1.net") as resp:
        data = resp.read()
except urllib.error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be caught first.
    # Only HTTPError carries a status code; a plain URLError has no `code`.
    print(e.code)
    print(e.reason)
except urllib.error.URLError as e:
    # A request to a non-existent host never gets an HTTP response, so it
    # cannot be handled as HTTPError; it is handled by the parent class
    # URLError instead.
    print(e.reason)
else:
    print(data)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: