您的位置:首页 > 编程语言 > Python开发

贡献一段学习过程中写的爬取糗事百科(糗百)的 Python 代码

2016-06-30 14:05 573 查看
# coding=utf-8
"""Interactive console reader for the "hot" listing of qiushibaike.com.

Pages are fetched over HTTP, (author, content, vote count) triples are pulled
out with a regular expression, and one story is printed each time the user
presses Enter.  Typing ``Q`` followed by Enter quits.

The script was written for Python 2; the import fallbacks below only exist so
the same file also loads on Python 3.
"""
import re
import time    # imported by the original script; not used in this file
import urllib  # imported by the original script; not used in this file

try:  # Python 2 module names, as used by the original script
    import thread  # imported by the original script; not used in this file
    import urllib2
except ImportError:  # Python 3 equivalents
    import _thread as thread
    import urllib.request as urllib2

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input


class QSBK:
    """Fetch, parse and interactively display stories page by page.

    Attributes:
        pageIndex: number of the next page to request (starts at 1).
        user_agent: value sent in the ``User-Agent`` request header.
        headers: HTTP headers passed with every request.
        stories: queue of parsed pages; each entry is a list of
            ``[author, content, votes]`` lists.
        enable: True while the reader is running; set to False to stop.
    """

    # Captures author (<h2>), story text (class="content") and vote count
    # (class="number"); re.S lets ".*?" span the newlines between the tags.
    _STORY_PATTERN = re.compile(
        r'h2>(.*?)</h2.*?content">(.*?)</.*?number">(.*?)</', re.S)

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        self.headers = {'User-Agent': self.user_agent}
        self.stories = []
        self.enable = False

    def getPage(self, pageIndex):
        """Download one listing page.

        Returns:
            The page body decoded as UTF-8, or None when the request fails
            with ``URLError`` (the failure reason is printed).
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read/decode raises.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print("error %s" % (e.reason,))
            return None

    def getPageItems(self, pageIndex):
        """Download page ``pageIndex`` and extract its stories.

        Returns:
            A list of ``[author, content, votes]`` lists with surrounding
            whitespace stripped (empty when nothing matches), or None when
            the page could not be downloaded.
        """
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            print("page load error")
            return None
        return [[field.strip() for field in match]
                for match in self._STORY_PATTERN.findall(pageCode)]

    def loadPage(self):
        """Prefetch the next page while fewer than two pages are queued.

        Does nothing when the reader is disabled.  ``pageIndex`` only
        advances after a page that produced at least one story, so a failed
        page is requested again on the next call.
        """
        if not self.enable or len(self.stories) >= 2:
            return
        pageStories = self.getPageItems(self.pageIndex)
        if pageStories:
            self.stories.append(pageStories)
            self.pageIndex += 1

    def getOneStory(self, pageStories, page):
        """Print the stories of one page, one per line read from stdin.

        Args:
            pageStories: list of ``[author, content, votes]`` lists.
            page: page number shown in front of every story.

        Reading "Q" (or reaching end-of-input) clears ``self.enable`` and
        returns without printing the remaining stories.
        """
        for story in pageStories:
            try:
                command = _read_line()
            except EOFError:
                # stdin was closed: behave as if the user asked to quit.
                command = "Q"
            # Top up the queue before reacting to the input, like the
            # original did, so the next page is ready when this one ends.
            self.loadPage()
            if command == "Q":
                self.enable = False
                return
            print(u"第%d页\t发布人:%s\t 赞:%s\n%s"
                  % (page, story[0], story[2], story[1]))

    def start(self):
        """Run the interactive loop until the user quits or pages run out."""
        print(u'正在读取,回车查看,Q退出')
        self.enable = True
        nowPage = 0
        while self.enable:
            if not self.stories:
                # Covers the initial load and any failed prefetch.  Without
                # the bail-out below an empty queue would spin this loop
                # forever at full CPU.
                self.loadPage()
                if not self.stories:
                    print("no stories available, exiting")
                    self.enable = False
                    break
            pageStories = self.stories.pop(0)
            nowPage += 1
            self.getOneStory(pageStories, nowPage)


if __name__ == '__main__':
    spider = QSBK()
    spider.start()

# Sample session posted with the original script:
#
#   C:\python.exe C:/python_test/qiubai.py
#   正在读取,回车查看,Q退出
#   第1页	发布人:匿名用户	 赞:1909
#   跟着后面,感觉压力好大
#   第1页	发布人:花殇随风飞	 赞:440
#   我只是忘了给你带夜宵,你用得着用这样的眼神看我么……
                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  爬虫