您的位置:首页 > 编程语言 > Python开发

Python 实现爬虫——以 CSDN 为例

2016-08-24 21:51 363 查看
#! /usr/bin/env python
# -*- coding=utf-8 -*-
# author tmq
import urllib2
import time
import re
import random
# Scan settings
# min_star     view-count threshold: articles whose count is at or below it are collected
# blog_name    CSDN blog (account) name, used to build the article-list URL
# isscan       whether to request the collected articles to raise their view counts
# isprintbolg  whether to print the collected articles (the name is spelled this way in the code)
# aircle_time  fixed delay in seconds between requests, used when isrand_time is False
# isrand_time  whether to sleep a random 5-10 seconds between requests instead of aircle_time
min_star = 100
blog_name = "qq_18661257"
isscan = True
isprintbolg = False
isrand_time = True
aircle_time = 3
# ---------------------

# HTTP headers sent with every request. The User-Agent imitates an old IE
# browser; the original literal was missing its closing parenthesis.
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"}

# Site root, and the article-list URL prefix (a page number is appended to it).
url_head = "http://blog.csdn.net/"
init_url = url_head + blog_name + "/article/list/"

# Regex sources, kept as plain (byte) raw strings so they match the byte
# strings returned by urlopen().read() without implicit unicode coercion.
# blog_name is escaped so that characters such as "." are matched literally.

# Pagination links: captures the page number at the end of each list link.
url_num_pe = (
    r'<a.*?href="/' + re.escape(blog_name) + r'/article/list/(.*?)">.*?</a>'
)
# Per-article view counter: captures (article href, view count).
url_read_pe = (
    r'<span class="link_view".*?>.*?<a href="(.*?)".*?>.*?</a>\((\d*?)\)</span>'
)
# Per-article title: captures the raw link text (may contain whitespace).
web_data_list = (
    r'<span class="link_title">.*?<a href=".*?">([\s\S]*?)</a></span>'
)
# Fetch the first article-list page and work out how many list pages exist.
rq = urllib2.Request(init_url + str(1), headers=headers)
source = urllib2.urlopen(rq)
try:
    ft = source.read()
finally:
    # Release the connection even if reading the body fails.
    source.close()

co = re.compile(url_num_pe)
pAr = co.findall(ft)
# The page number captured from the last pagination link is used as the total
# (presumably the "last page" link - confirm against the page markup).
# A blog without pagination links yields no match; treat that as one page
# instead of failing with IndexError.
pageTotal = pAr[-1] if pAr else "1"

# Collected articles, each stored as [title, url, view_count].
need_urlArr = []
print(pageTotal)
# Walk every article-list page and collect the articles whose view count is
# at or below min_star, appending [title, url, view_count] to need_urlArr.
# The patterns are compiled once, outside the page loop.
read_co = re.compile(url_read_pe)
name_co = re.compile(web_data_list)
# List pages are numbered from 1. The original printed page i + 1 but
# requested page i, which fetched a page 0 and never scanned the last page.
for page in range(1, int(pageTotal) + 1):
    page_url = init_url + str(page)
    print(page_url)
    rq = urllib2.Request(page_url, headers=headers)
    source = urllib2.urlopen(rq)
    try:
        ft = source.read()
    finally:
        # Release the connection even if reading the body fails.
        source.close()

    pAr = read_co.findall(ft)       # [(href, view_count_text), ...]
    pArName = name_co.findall(ft)   # [raw_title_text, ...]

    # Pair each counter match with the title found at the same position.
    # zip() stops at the shorter list, so a markup mismatch cannot raise
    # IndexError.
    for (link, views), title in zip(pAr, pArName):
        views = int(views)
        if views <= min_star:
            # Titles come with layout whitespace; strip tabs, newlines, spaces.
            hstr = title.replace('\t', '').replace('\n', '').replace(' ', '')
            need_urlArr.append([hstr, url_head + link, views])
# Optional report: the number of collected articles, followed by the title,
# address and view count of each one.
if isprintbolg:
    print(len(need_urlArr))
    for title, link, views in need_urlArr:
        report = "".join(
            ["博客名称:", title, "\n博客地址:", link, "\n访问次数:", str(views)]
        )
        print(report)

# Refresh pass: keep requesting the collected articles, round after round,
# until the locally tracked view count of every one of them exceeds min_star.
flag = True
if isscan:
    while flag:
        print("web access ...")
        # Set back to True below if any article still needs another round.
        flag = False
        for entry in need_urlArr:
            # entry is [title, url, view_count]
            if entry[2] > min_star:
                # Already above the threshold: do not spend further requests
                # on it while other articles are still catching up.
                continue
            rq = urllib2.Request(entry[1], headers=headers)
            source = urllib2.urlopen(rq)
            # The body is not used; close at once so sockets do not pile up.
            source.close()
            if isrand_time:
                time.sleep(random.randint(5, 10))
            else:
                time.sleep(aircle_time)
            # The count is tracked locally and assumes each request adds
            # exactly one view; it is not re-read from the site.
            entry[2] += 1
            if entry[2] <= min_star:
                flag = True
# NOTE(review): the source lost its indentation; this final message is placed
# at top level (printed whether or not isscan is set) - confirm.
print("web compete!")
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: