
Had nothing to do just now, so I dashed off a quick Python script to scrape cnblogs (博客园)

2013-06-17 20:45
It only grabs the first list page; there's no duplicate-crawl check and no sleep. Haha, if you want to scrape more, just call call_me again with other URLs. (The formatting is mildly painful.)

#!/usr/bin/python
# -*- coding: utf-8 -*-

from pyquery import PyQuery as pq
import sys
import urllib2
import cookielib
import writetemp  # the author's own helper module that persists the scraped posts
# Crawl entry point: fetch a cnblogs list page, then every post it links to.
def call_me(url):
    if url is None:
        print 'no url given'
        return

    try:
        # Use a cookie-aware opener so requests behave like one browser session.
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        p = opener.open(url)

        if p.getcode() != 200:
            sys.exit()
        html = p.read()
        blog_list_url = []
        blogs_info = []
        doc = pq(html)
        # Every post title on the list page links to the full article.
        blog_list = doc('#post_list a.titlelnk')
        for m in blog_list:
            blog_list_url.append(pq(m).attr('href'))
        # Drop duplicate links while preserving the original order.
        blog_list_url = sorted(set(blog_list_url), key=blog_list_url.index)

        for m_url in blog_list_url:
            try:
                new_p = opener.open(m_url)
                if new_p.getcode() != 200:
                    continue  # skip this post instead of killing the whole run

                m_html = new_p.read()
                db_blog = pq(m_html)

                info = {
                    'title': db_blog('#cb_post_title_url').text(),
                    'time': db_blog('#post-date').text(),
                    'info': db_blog('#cnblogs_post_body').html(),
                    'link': m_url
                }
                print info
                blogs_info.append(info)
            except Exception:
                continue  # one broken post should not abort the crawl

        writetemp.write_temp(blogs_info)
    except Exception as e:
        print e
    finally:
        print 'original page done'

#demo
#url = 'http://www.cnblogs.com/'
#call_me(url)
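writetemp is my own module and isn't shown above; here's a minimal sketch of what its write_temp might look like, assuming it just dumps the collected posts to a local JSON file (the filename and format are my assumptions, not part of the original):

# writetemp.py -- hypothetical stand-in for the author's module.
# Assumes write_temp simply serializes the scraped posts to a JSON file.
# -*- coding: utf-8 -*-
import json
import codecs

def write_temp(blogs_info, path='blogs_temp.json'):
    # ensure_ascii=False keeps the Chinese titles and bodies readable
    with codecs.open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(blogs_info, ensure_ascii=False, indent=2))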
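As the intro says, to scrape more than the first page you just call call_me in a loop. A minimal sketch with the sleep the original lacks, assuming list pages live at /sitehome/p/<n> (that URL pattern is my assumption; adjust it to whatever pages you actually want):

# Hypothetical multi-page usage: loop over list pages with a polite pause.
import time

for page in range(1, 4):
    # /sitehome/p/<n> is an assumed pagination pattern, not from the original
    call_me('http://www.cnblogs.com/sitehome/p/%d/' % page)
    time.sleep(2)  # the original never sleeps; pausing is kinder to the site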