您的位置:首页 > 编程语言 > Python开发

python小爬虫

2016-02-05 17:53 423 查看
从一个起始网页出发,广度优先遍历所有能到达的链接,并把发现的链接写入文件(参考网上的教程写的)。

import http.client
import re
import urllib
import urllib.request
from collections import deque

START_URL = 'http://news.dbanotes.net/'
OUTPUT_PATH = 'G:/1.txt'

# Compiled once at import time instead of on every fetched page.
HREF_PATTERN = re.compile(r'href=\"(.+?)\"')


def extract_links(html):
    """Return the double-quoted ``href`` values in *html* that contain 'http'.

    Relative links, anchors and other values without 'http' are dropped.
    Order of appearance is preserved and duplicates are NOT removed here;
    de-duplication is the caller's job.
    """
    return [link for link in HREF_PATTERN.findall(html) if 'http' in link]


def fetch_html(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Returns ``None`` (instead of raising) when the page cannot be used:
    the request fails, the response is not HTML, or the body is not valid
    UTF-8. The crawl is best-effort, so one bad page must not stop it.
    """
    try:
        # ``with`` closes the connection even when we bail out early.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            # The header may be absent; treat that the same as "not HTML".
            content_type = response.getheader('Content-Type') or ''
            if 'html' not in content_type:
                return None
            return response.read().decode('utf-8')
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError/timeouts; ValueError covers
        # malformed URLs and UnicodeDecodeError. KeyboardInterrupt is
        # deliberately not caught so the crawl can be stopped with Ctrl-C.
        return None


def crawl(start_url, out, fetch=fetch_html):
    """Breadth-first crawl from *start_url*, writing each new link to *out*.

    Args:
        start_url: URL of the first page to fetch.
        out: Writable text stream; every newly discovered link is written
            to it once, followed by a newline.
        fetch: Callable mapping a URL to its HTML text, or ``None`` when
            the page should be skipped. Defaults to :func:`fetch_html`.
    """
    queue = deque([start_url])
    # Links are marked as seen when they are enqueued (not when they are
    # dequeued) so a URL found on several pages is queued only once.
    seen = {start_url}

    while queue:
        page = fetch(queue.popleft())
        if page is None:
            continue
        for link in extract_links(page):
            if link in seen:
                continue
            seen.add(link)
            queue.append(link)
            out.write(link + '\n')


if __name__ == '__main__':
    # Explicit UTF-8 so links with non-ASCII characters cannot crash the
    # write on platforms whose default encoding is narrower.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        crawl(START_URL, f)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: