您的位置:首页 > 数据库 > Mongodb

使用 Python 抓取伯乐在线的全部文章，对标题分词后存入 MongoDB

2014-10-06 16:03 656 查看
依赖包:

1.pymongo

2.jieba

# -*- coding: utf-8 -*-

"""

@author: jiangfuqiang

"""

from HTMLParser import HTMLParser

import urllib2

import sys

import pymongo

import time

import jieba

import traceback

# Python 2 only: make implicit str <-> unicode conversions use UTF-8 so that
# the Chinese page text handled below does not raise UnicodeDecodeError.
default_encoding = 'utf-8'

if sys.getdefaultencoding() != default_encoding:
    # sys.setdefaultencoding is normally removed from the sys module at
    # interpreter start-up; reloading sys makes it callable again.
    reload(sys)
    sys.setdefaultencoding(default_encoding)

class FetchJobble(HTMLParser):
    """Collect article summaries from an article-listing HTML page.

    The parser is a small state machine driven by boolean flags.  A record
    starts at ``<div class="post-thumb">`` and is completed by the ``href``
    of the ``<a>`` that follows a ``<span class="read-more">``.  Each
    completed record is a dict that may contain the keys ``imgSrc``,
    ``title``, ``tag`` (comma-joined categories), ``comment``, ``desc``,
    ``redmoreLink`` and ``keyword`` (comma-joined tokens of the title).

    Args:
        tokenizer: optional callable taking the title string and returning
            an iterable of strings.  When omitted, ``jieba.cut`` is used.
    """

    def __init__(self, tokenizer=None):
        # Explicit base-class call (rather than super()) keeps this working
        # with the Python 2 HTMLParser module the file imports.
        HTMLParser.__init__(self)
        self._tokenizer = tokenizer
        # Never set to True anywhere in this class; only read by the <img>
        # branch of handle_starttag.
        self.isPostMeta = False
        self.result = []
        self._begin_record()

    def _begin_record(self):
        """Clear the per-record flags and start an empty record dict."""
        self.isPostThumb = False
        self.isMetaTitle = False
        self.isCategoryTag = False
        self.isComment = False
        self.isexcerpt = False
        self.isReadMore = False
        self.isPicture = False
        self.data = {}

    def _finish_record(self, link):
        """Store the read-more link and keywords, then emit the record."""
        self.data['redmoreLink'] = link
        tokenize = self._tokenizer if self._tokenizer is not None else jieba.cut
        # A page fragment may reach the read-more link without a title having
        # been captured; fall back to an empty title instead of a KeyError.
        self.data['keyword'] = ",".join(tokenize(self.data.get('title', '')))
        self.result.append(self.data)
        self._begin_record()

    def _handle_anchor(self, attrs):
        """Process an ``<a>`` tag seen inside the current record."""
        if self.isReadMore:
            for key, value in attrs:
                if key == 'href':
                    self._finish_record(value)
                    # Stop here: the remaining attributes belong to the link
                    # that closed this record and must not set flags on the
                    # next one.
                    return
            return

        for key, value in attrs:
            if key == 'class':
                if value == 'meta-title':
                    self.isMetaTitle = True
            elif key == 'rel':
                if value == 'category tag':
                    self.isCategoryTag = True
            elif key == 'href':
                # value is None for a valueless attribute such as <a href>.
                if value and value.find('#respond') > 0:
                    self.isComment = True

    def handle_starttag(self, tag, attrs):
        """Update the parser state for an opening tag."""
        if tag == 'div':
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'post-thumb':
                    self.isPostThumb = True
                elif value == 'meta-title':
                    self.isMetaTitle = True
        elif tag == 'a' and self.isPostThumb:
            self._handle_anchor(attrs)
        elif tag == 'span' and self.isComment:
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'excerpt':
                    self.isexcerpt = True
                elif value == 'read-more':
                    self.isReadMore = True
        elif tag == 'img' and self.isPostThumb and not self.isPostMeta:
            for key, value in attrs:
                if key == 'src':
                    self.data['imgSrc'] = value

    def handle_endtag(self, tag):
        """Closing tags carry no information for this parser."""
        pass

    def handle_data(self, data):
        """Route a text node to the field selected by the active flag."""
        if self.isMetaTitle:
            self.data['title'] = data
            self.isMetaTitle = False
        elif self.isCategoryTag:
            if 'tag' in self.data:
                self.data['tag'] = self.data['tag'] + "," + data
            else:
                self.data['tag'] = data
            self.isCategoryTag = False
        elif self.isComment and 'comment' not in self.data:
            # Keep only the first space-separated token of the link text.
            self.data['comment'] = data.split(" ")[0]
        elif self.isexcerpt:
            self.data['desc'] = data
            self.isexcerpt = False

    def getResult(self):
        """Return the list of records completed so far."""
        return self.result

def main():
    """Crawl the listing pages one by one and store every record in MongoDB.

    Pages are requested in increasing order starting at 1.  The crawl stops
    when a page yields no records, or after ``max_failures`` consecutive
    errors on the same page.  Records go to the ``fetch_blog`` collection of
    the ``blog`` database on localhost:27017.
    """
    url = "http://blog.jobbole.com/all-posts/page/%d"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    delay = 5          # seconds to wait between requests, and before a retry
    max_failures = 3   # consecutive errors tolerated before giving up

    # MongoClient replaces the legacy pymongo.Connection class.
    client = pymongo.MongoClient('localhost', 27017)
    try:
        fetchblog = client.blog.fetch_blog
        page = 1
        failures = 0
        while True:
            try:
                response = urllib2.urlopen(
                    urllib2.Request(url % page, headers=headers))
                try:
                    html = response.read()
                finally:
                    # Release the connection even if reading fails.
                    response.close()

                parser = FetchJobble()
                parser.feed(html)
                posts = parser.getResult()
                if not posts:
                    # An empty page marks the end of the listing.
                    break
                for doc in posts:
                    fetchblog.insert(doc)
            except Exception as e:
                traceback.print_exc()
                print("parse error %s" % (e,))
                failures += 1
                if failures >= max_failures:
                    # Without this bound a persistent error would retry the
                    # same page forever with no pause.
                    print("giving up on page %d after %d failures"
                          % (page, failures))
                    break
                time.sleep(delay)
                continue

            failures = 0
            print("page is %d" % page)
            page += 1
            time.sleep(delay)
    finally:
        client.close()


if __name__ == "__main__":
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: