您的位置：首页 > 移动开发 > 微信开发

使用代理的爬虫小程序

2017-12-06 14:39 197 查看

使用 IP 117.135.250.134、端口 80 作为代理服务器，爬取了百度首页的代码。

import urllib.request
import os
import sys
import re

def testArgument(url):
    """Run the proxy check for *url*.

    Constructing ``TestProxy`` performs the whole check (format
    validation, then a request through the proxy), so the instance
    itself is not kept and nothing is returned.
    """
    TestProxy(url)

def tipUse():
    """Print the usage hint shown when the proxy argument is rejected.

    The example addresses use the ``scheme://ip:port`` form, which is
    the form the proxy validator requires.
    """
    usageLines = (
        '该程序只能输入一个参数，这个参数必须是可用的proxy',
        'usage:python test Urllib2WithProxy.py http://1.2.3.4:5',
        'usage:python test Urllib2WithProxy.py https://1.2.3.4:5',
    )
    for line in usageLines:
        print(line)

class TestProxy(object):
    """Validate a proxy address and fetch a page through it.

    Constructing an instance runs the whole check: the proxy string is
    validated, then ``self.url`` is requested through the proxy and the
    decoded response body is printed.  An invalid proxy or a failed
    request terminates the process by raising ``SystemExit``.
    """

    # scheme://a.b.c.d:port, ASCII digits only.
    _PROXY_PATTERN = re.compile(
        r'(https?)://([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})'
        r':([0-9]{1,5})'
    )

    def __init__(self, proxy):
        """Check *proxy* and immediately use it to download ``self.url``.

        Args:
            proxy: Proxy address of the form ``http://1.2.3.4:5`` or
                ``https://1.2.3.4:5``.

        Raises:
            SystemExit: If the proxy is malformed or the request fails.
        """
        self.proxy = proxy
        self.checkProxyFormat(self.proxy)
        self.url = 'http://www.baidu.com'
        self.timeout = 5
        # Stored for callers; not consulted anywhere inside this class.
        self.flagWord = '百度'
        self.useProxy(self.proxy)

    @staticmethod
    def _inRange(text, low, high):
        """Return True if *text* is a decimal number within [low, high].

        Numbers with leading zeros (``'01'``) are rejected so that only
        the canonical spelling of each value is accepted.
        """
        number = int(text)
        return text == str(number) and low <= number <= high

    @classmethod
    def _isValidProxy(cls, proxy):
        """Return True if *proxy* looks like ``http[s]://a.b.c.d:port``.

        The first octet must be 1-255, the middle octets 0-255, the last
        octet 1-254 and the port 1-65535.
        """
        if not isinstance(proxy, str):
            return False
        matched = cls._PROXY_PATTERN.fullmatch(proxy)
        if matched is None:
            return False
        _, first, second, third, fourth, port = matched.groups()
        return (
            cls._inRange(first, 1, 255)
            and cls._inRange(second, 0, 255)
            and cls._inRange(third, 0, 255)
            and cls._inRange(fourth, 1, 254)
            and cls._inRange(port, 1, 65535)
        )

    def checkProxyFormat(self, proxy):
        """Accept a well-formed proxy address or print usage and exit.

        Args:
            proxy: Candidate proxy address.

        Raises:
            SystemExit: If *proxy* is not a valid ``scheme://ip:port``.
        """
        if not self._isValidProxy(proxy):
            tipUse()
            sys.exit()
        print('输入的http代理服务器符合标准')

    def useProxy(self, proxy):
        """Fetch ``self.url`` through *proxy* and print the decoded body.

        The opener is installed globally with
        ``urllib.request.install_opener``, so later ``urlopen`` calls in
        the process go through the same proxy.

        Args:
            proxy: Proxy address; only the ``host:port`` part after
                ``//`` is handed to the proxy handler.

        Raises:
            SystemExit: If the request cannot be completed.
        """
        import http.client
        from urllib.parse import urlsplit

        hostPort = proxy.split('//', 1)[-1]
        # ProxyHandler selects a proxy by the scheme of the URL being
        # requested, so the mapping key must be the target's scheme and
        # not the scheme written in the proxy address.
        targetScheme = urlsplit(self.url).scheme or 'http'
        handler = urllib.request.ProxyHandler({targetScheme: hostPort})
        urllib.request.install_opener(urllib.request.build_opener(handler))
        try:
            # The context manager closes the connection even when
            # reading the body fails.
            with urllib.request.urlopen(self.url, timeout=self.timeout) as response:
                data = response.read()
        except (OSError, http.client.HTTPException):
            print('连接错误，退出程序')
            sys.exit()

        print(data.decode('UTF-8'))

# Demo run: check the sample proxy 117.135.250.134 on port 80.
testArgument(url='https://117.135.250.134:80')

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 数据爬虫 python

相关文章推荐

新的分享

章节导航