
HTTP Protocol Network Programming

2017-08-14 11:01
This article is based on Chapter 4 of *Python Network Programming Cookbook*, rewritten for Python 3.

It mainly covers the following topics:

Python's HTTP libraries

The requests library

Downloading data from an HTTP server

In Python 3, the httplib module has been renamed to http.client.

For a higher-level interface, the third-party requests library is recommended.

#!/usr/bin/env python3

import argparse
import http.client

REMOTE_SERVER_HOST = 'www.python.org'
REMOTE_SERVER_PATH = '/'

class HTTPClient:

    def __init__(self, host):
        self.host = host

    def fetch(self, path):
        h = http.client.HTTPConnection(self.host)
        # skip_host=True because we send the Host header explicitly below;
        # otherwise putrequest() would add a duplicate Host header.
        h.putrequest('GET', path, skip_host=True)
        h.putheader('User-Agent', __file__)
        h.putheader('Host', self.host)
        h.putheader('Accept', '*/*')
        h.endheaders()

        try:
            r = h.getresponse()
        except Exception as e:
            print(e)
            return None
        else:
            print('Got homepage from', self.host)
            return r.read()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='HTTP Client Example')
    parser.add_argument('--host', '-H', action='store', dest='host',
                        default=REMOTE_SERVER_HOST)
    parser.add_argument('--path', '-P', action='store', dest='path',
                        default=REMOTE_SERVER_PATH)
    given_args = parser.parse_args()
    host, path = given_args.host, given_args.path
    client = HTTPClient(host)
    print(client.fetch(path))


OUTPUT

$ python3 download_data.py -H www.vlight.me
Got homepage from www.vlight.me

[...]
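
For comparison, the same page can be fetched with the higher-level requests library recommended above; a minimal sketch:

import requests

# One call replaces the manual connection and header handling above;
# requests also follows redirects by default.
r = requests.get('https://www.python.org/')
print(r.status_code)
print(r.text[:200])  # first 200 characters of the homepage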


Serving HTTP requests from your machine

In Python 3, the BaseHTTPServer module has been merged into http.server.

#!/usr/bin/env python3

import argparse
from http.server import BaseHTTPRequestHandler, HTTPServer

DEFAULT_HOST = 'localhost'
DEFAULT_PORT = 8800

class RequestHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        # Answer every GET with a fixed plain response.
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()
        self.wfile.write(b'Hello from server!')

class CustomHTTPServer(HTTPServer):

    def __init__(self, host, port):
        server_address = (host, port)
        HTTPServer.__init__(self, server_address, RequestHandler)

def run_server(port):
    try:
        server = CustomHTTPServer(DEFAULT_HOST, port)
        print('Custom HTTP server started on port:', port)
        server.serve_forever()
    except KeyboardInterrupt:
        print('Server interrupted and is shutting down...')
        server.socket.close()
    except Exception as e:
        print(e)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Simple HTTP Server Example')
    parser.add_argument('--port', '-P', action='store', dest='port',
                        type=int, default=DEFAULT_PORT)
    given_args = parser.parse_args()
    port = given_args.port
    run_server(port)


OUTPUT

$ python3 simple_http_server.py
Custom HTTP server started on port: 8800
127.0.0.1 - - [05/Aug/2017 20:50:33] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [05/Aug/2017 20:50:34] "GET /favicon.ico HTTP/1.1" 200 -
127.0.0.1 - - [05/Aug/2017 20:51:30] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [05/Aug/2017 20:51:30] "GET /favicon.ico HTTP/1.1" 200 -
^CServer interrupted and is shutting down...
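
To exercise the server from another terminal, a minimal sketch assuming it is running on the default port 8800:

import requests

r = requests.get('http://localhost:8800/')
print(r.status_code)  # expected: 200
print(r.text)         # expected: Hello from server!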




Extracting cookie information after visiting a website

Many websites use cookies to store various kinds of information on your local disk. You may want to inspect what is saved in those cookies, or use them to log in to a site automatically.

In Python 3, cookielib has been renamed to http.cookiejar.

In Python 3, urllib2 has been split into urllib.request and urllib.error.

#!/usr/bin/env python3

import http.cookiejar
import urllib.request

URL = 'https://www.baidu.com'

def extract_cookie_info():
    # Collect any cookies the server sets into an in-memory jar.
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    r = opener.open(URL)

    print('---First time cookie:---')
    for cookie in cj:
        print('  %s --> %s' % (cookie.name, cookie.value))

    # A second visit reuses the cookies stored in the jar.
    r = opener.open(URL)

    print('***Second time cookie:***')
    for cookie in cj:
        print('  %s --> %s' % (cookie.name, cookie.value))

if __name__ == '__main__':
    extract_cookie_info()


OUTPUT

$ python3 extract_cookie_info.py
---First time cookie:---
BIDUPSID --> A4AC988092E044FEDC73F0C8B28EA687
PSTM --> 1502023270
BD_NOT_HTTPS --> 1
***Second time cookie:***
BIDUPSID --> A4AC988092E044FEDC73F0C8B28EA687
PSTM --> 1502023270
BD_NOT_HTTPS --> 1


Submitting web forms

This requires the third-party requests module.
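
A minimal sketch of a form submission with requests, using httpbin.org/post as a stand-in endpoint; the form fields here are purely illustrative:

#!/usr/bin/env python3

import requests

# Illustrative form fields; httpbin.org/post echoes back whatever it receives.
payload = {'username': 'alice', 'password': 'secret'}

if __name__ == '__main__':
    r = requests.post('http://httpbin.org/post', data=payload)
    print(r.status_code)
    print(r.json()['form'])  # the submitted fields, echoed back by the server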



Sending web requests through a proxy server

#!/usr/bin/env python3

import requests

URL = 'https://www.github.com'
PROXY_ADDRESS = '138.197.211.31:80'

if __name__ == '__main__':
    # Register the proxy for both schemes: the target URL is https,
    # so an 'http'-only mapping would silently bypass the proxy.
    proxies = {'http': PROXY_ADDRESS, 'https': PROXY_ADDRESS}
    r = requests.get(URL, proxies=proxies)
    print('Proxy server returns response headers:\n{}'.format(r.headers))


OUTPUT

$ python3 proxy_web_request.py
Proxy server returns response headers:
{'Content-Encoding': 'gzip', 'X-Frame-Options': 'deny', 'X-Runtime-rack': '0.048141', 'Date': 'Sun, 06 Aug 2017 13:39:30 GMT', 'X-UA-Compatible': 'IE=Edge,chrome=1', 'Transfer-Encoding': 'chunked', 'Content-Type': 'text/html; charset=utf-8', 'X-Content-Type-Options': 'nosniff', 'Set-Cookie': '_octo=GH1.1.962502759.1502026770; domain=.github.com; path=/; expires=Tue, 06 Aug 2019 13:39:30 -0000, logged_in=no; domain=.github.com; path=/; expires=Thu, 06 Aug 2037 13:39:30 -0000; secure; HttpOnly, _gh_sess=eyJzZXNzaW9uX2lkIjoiZTM2OGNjOWM1YjI3ZjYzZWNjNzcyMjA1ZWRlZjUwMDUiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwMjAyNjc3MDM3MCwiX2NzcmZfdG9rZW4iOiI3N29acmdJb25vNnhGd1pCMmRieEc4bDV3cktvbTg4bTFGK3NyMVhVTnJzPSJ9--796fbdfa9eee35eba79bc34d665314a727a6f093; path=/; secure; HttpOnly', 'Vary': 'X-PJAX, Accept-Encoding', 'Content-Security-Policy': "default-src 'none'; base-uri 'self'; block-all-mixed-content; child-src render.githubusercontent.com; connect-src 'self' uploads.github.com status.github.com collector.githubapp.com api.github.com www.google-analytics.com github-cloud.s3.amazonaws.com github-production-repository-file-5c1aeb.s3.amazonaws.com github-production-upload-manifest-file-7fdce7.s3.amazonaws.com github-production-user-asset-6210df.s3.amazonaws.com wss://live.github.com; font-src assets-cdn.github.com; form-action 'self' github.com gist.github.com; frame-ancestors 'none'; img-src 'self' data: assets-cdn.github.com identicons.github.com collector.githubapp.com github-cloud.s3.amazonaws.com *.githubusercontent.com; media-src 'none'; script-src assets-cdn.github.com; style-src 'unsafe-inline' assets-cdn.github.com", 'X-Request-Id': '145e355481dbb87c3836afe4cc15e54d', 'X-GitHub-Request-Id': '6A81:3C2E:6D8B2FB:A52673B:59871C11', 'X-XSS-Protection': '1; mode=block', 'Cache-Control': 'no-cache', 'Public-Key-Pins': 'max-age=5184000; pin-sha256="WoiWRyIOVNa9ihaBciRSC7XHjliYS9VwUGOIud4PB18="; pin-sha256="RRM1dGqnDFsCJXBTHky16vi1obOlCgFFn/yOhI/y+ho="; pin-sha256="k2v657xBsOVe1PQRwOsHsw3bsGT2VzIqz5K+59sNQws="; pin-sha256="K87oWBWM9UZfyddvDfoxL+8lpNyoUB2ptGtn0fv6G2Q="; pin-sha256="IQBnNBEiFuhj+8x6X8XLgh01V9Ic5/V3IRQLNFFc7v4="; pin-sha256="iie1VXtL7HzAMF+/PVPR9xzT80kQxdZeJ+zduCB3uj0="; pin-sha256="LvRiGEjRqfzurezaWuj8Wie2gyHMrW5Q06LspMnox7A="; includeSubDomains', 'X-Runtime': '0.042197', 'Server': 'GitHub.com', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'Status': '200 OK'}
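
To confirm that traffic actually flows through the proxy, one option is to compare your apparent IP address with and without it; a sketch using httpbin.org/ip, which echoes the caller's address (reusing the proxy address from the example above):

import requests

PROXIES = {'http': '138.197.211.31:80', 'https': '138.197.211.31:80'}

print(requests.get('http://httpbin.org/ip').json())                   # your own address
print(requests.get('http://httpbin.org/ip', proxies=PROXIES).json())  # the proxy's address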


Using a HEAD request to check whether a web page exists

#!/usr/bin/env python3

import argparse
import requests
from requests import codes as sc

DEFAULT_URL = 'http://www.python.org'
HTTP_GOOD_CODES = [sc.OK, sc.FOUND, sc.MOVED_PERMANENTLY]

def get_server_status_code(url):
    # A HEAD request transfers only the status line and headers, not the
    # body; follow redirects so we see the final status code.
    r = requests.head(url, allow_redirects=True)
    return r.status_code

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Example HEAD Request')
    parser.add_argument('--url', '-u', action='store', dest='url',
                        default=DEFAULT_URL)
    given_args = parser.parse_args()
    url = given_args.url

    if get_server_status_code(url) in HTTP_GOOD_CODES:
        print('Server: [{}] status is OK!'.format(url))
    else:
        print('Server: [{}] status is NOT OK!'.format(url))


OUTPUT

$ python3 checking_webpage_with_HEAD_request.py -u https://www.github.com
Server: [https://www.github.com] status is OK!

$ python3 checking_webpage_with_HEAD_request.py -u https://www.github.com/py.py
Server: [https://www.github.com/py.py] status is NOT OK!


Spoofing the client as Mozilla Firefox

#!/usr/bin/env python3

import requests

# Present a Firefox User-Agent string instead of the default
# 'python-requests/...' one.
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}
URL = 'http://httpbin.org/user-agent'

def spoof_firefox():
    r = requests.get(URL, headers=HEADERS)
    print(r.json())

if __name__ == '__main__':
    spoof_firefox()


OUTPUT

$ python3 spoof_mozilla_firefox_in_client_code.py
{'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}


Saving bandwidth in web requests with HTTP compression

#!/usr/bin/env python3

import argparse
import gzip
import io
from http.server import BaseHTTPRequestHandler, HTTPServer

DEFAULT_HOST = '127.0.0.1'
DEFAULT_PORT = 8800
HTML_CONTENT = b"""<html><body><h1>Compressed Hello World!</h1></body></html>"""

class RequestHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        # Compress the body once, then advertise the encoding and the
        # compressed length in the response headers.
        zbuf = self.compress_buffer(HTML_CONTENT)
        self.send_response(200)
        self.send_header('Content-Type', 'text/html')
        self.send_header('Content-Encoding', 'gzip')
        self.send_header('Content-Length', str(len(zbuf)))
        self.end_headers()
        self.wfile.write(zbuf)

        # Log the compression details to the server console.
        print('Content-Encoding: gzip')
        print('Content-Length: {}'.format(len(zbuf)))

    def compress_buffer(self, buf):
        # gzip into an in-memory buffer rather than a file on disk.
        zbuf = io.BytesIO()
        zfile = gzip.GzipFile(mode='wb', fileobj=zbuf, compresslevel=6)
        zfile.write(buf)
        zfile.close()
        return zbuf.getvalue()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Simple HTTP Server Example')
    parser.add_argument('--port', '-p', action='store', dest='port',
                        type=int, default=DEFAULT_PORT)
    given_args = parser.parse_args()
    port = given_args.port
    server_address = (DEFAULT_HOST, port)
    server = HTTPServer(server_address, RequestHandler)
    server.serve_forever()


OUTPUT

$ python3 http_compression.py
127.0.0.1 - - [07/Aug/2017 11:16:59] "GET / HTTP/1.1" 200 -
Content-Encoding: gzip
Content-Length: 70

127.0.0.1 - - [07/Aug/2017 11:16:59] "GET /favicon.ico HTTP/1.1" 200 -
Content-Encoding: gzip
Content-Length: 70
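
To verify from the client side that the response really is gzip-compressed, a sketch using requests, which decompresses gzip transparently, so we inspect the headers rather than the body:

import requests

r = requests.get('http://127.0.0.1:8800/')
print(r.headers.get('Content-Encoding'))  # expected: gzip
print(r.text)  # requests has already decompressed the body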




Writing a fault-tolerant HTTP client that supports resumable downloads

#!/usr/bin/env python3

import argparse
import os
import requests

TARGET_URL = 'https://www.python.org/ftp/python/3.6.2/Python-3.6.2.tar.xz'

def download_file(url):
    local_file_name = url.split('/')[-1]

    while True:
        # Resume from however many bytes are already on disk.
        if os.path.exists(local_file_name):
            local_file_size = os.path.getsize(local_file_name)
        else:
            local_file_size = 0

        # Ask the server for only the remaining bytes.
        headers = {'Range': 'bytes={}-'.format(local_file_size)}
        r = requests.get(url, stream=True, headers=headers)

        # An empty remaining range means the download is complete.
        if int(r.headers['Content-Length']) == 0:
            print('File: {} already downloaded!'.format(local_file_name))
            break

        # Append the newly received bytes to the local file.
        with open(local_file_name, 'ab') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Simple Downloader')
    parser.add_argument('--url', '-u', action='store', dest='url',
                        default=TARGET_URL)
    given_args = parser.parse_args()
    url = given_args.url

    download_file(url)


OUTPUT



Here we manually interrupt the download partway through (Ctrl-C); running the script again sends a new Range request starting at the current file size, so the download resumes where it left off and completes.
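
Resuming only works when the server honours Range requests; a quick check with a HEAD request (servers that support byte ranges typically advertise Accept-Ranges: bytes):

import requests

r = requests.head('https://www.python.org/ftp/python/3.6.2/Python-3.6.2.tar.xz')
print(r.headers.get('Accept-Ranges'))   # 'bytes' means byte-range requests are supported
print(r.headers.get('Content-Length'))  # total size of the file in bytes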