Python 爬虫模块 bs4 实战一:获取百度贴吧内容
2017-09-15 20:10
871 查看
getCommentInfo.py:
mylog.py
from bs4 import BeautifulSoup
import requests

from mylog import MyLog as mylog


# Reference: "Python 网络爬虫实战", Hu Songtao, p. 196
class Item:
    """Container for the scraped fields of one Tieba thread."""

    def __init__(self):
        # Instance attributes instead of the original shared *class*
        # attributes, which risked cross-instance state leakage.
        self.title = None        # thread title
        self.firstAuthor = None  # original poster
        self.firstTime = None    # creation time
        self.reNum = None        # reply count
        self.content = None      # abstract / first line of the post
        self.lastAuthor = None   # last replier
        self.lastTime = None     # time of the last reply


class GetTiebaInfo:
    """Crawl a Baidu Tieba forum listing and save thread summaries to a file."""

    def __init__(self, url):
        """url: a listing URL whose last '='-separated field is the pn= offset."""
        self.url = url
        self.log = mylog()
        self.pageSum = 1  # number of listing pages to crawl
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    # Tieba paginates in steps of 50:
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100 ...
    def getUrls(self, pageSum):
        """Return pageSum listing-page URLs, rewriting the trailing pn= offset."""
        urls = []
        # pn is the last '='-separated field of the URL, so replacing
        # the final element and re-joining rebuilds the URL safely.
        parts = self.url.split("=")
        for page in range(pageSum):
            parts[-1] = str(page * 50)
            url = "=".join(parts)
            print(url)
            urls.append(url)
        return urls

    def spider(self, urls):
        """Parse every listing page and extract one Item per thread row."""
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if not htmlContent:
                # Request failed (getResponseContent returned None);
                # skip the page instead of crashing BeautifulSoup.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            # NOTE: the leading space in the class string is intentional —
            # it must match Tieba's markup exactly for attrs matching.
            tagsli = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
            for tag in tagsli:
                try:
                    item = Item()
                    item.title = tag.find('a', attrs={'class': 'j_th_tit'}).get_text().strip()
                    # .a: the author name sits in an <a> inside the span
                    item.firstAuthor = tag.find('span', attrs={'class': 'frs-author-name-wrap'}).a.get_text().strip()
                    item.firstTime = tag.find('span', attrs={'title': '创建时间'}).get_text().strip()
                    item.reNum = tag.find('span', attrs={'title': '回复'}).get_text().strip()
                    # NOTE: trailing space in this class name is intentional.
                    item.content = tag.find('div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}).get_text().strip()
                    item.lastAuthor = tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'}).get_text().strip()
                    item.lastTime = tag.find('span', attrs={'title': '最后回复时间'}).get_text().strip()
                except AttributeError:
                    # Some find() returned None (ad rows / changed markup);
                    # skip this entry instead of aborting the whole crawl.
                    self.log.error('解析 URL:%s 中的某一项失败,已跳过' % url)
                    continue
                items.append(item)
                self.log.info('获取标题为《%s》的项成功 ...' % item.title)
        return items

    def pipelines(self, items):
        """Write all scraped items to a UTF-8 text file, one record per block."""
        fileName = '百度贴吧_python.txt'
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write(
                    'title:{} \t author:{} \t firstTime:{} \ncontent:{} \n return:{} \n lastAuthor:{} \t lastTime:{} \n\n\n\n'
                    .format(item.title, item.firstAuthor, item.firstTime,
                            item.content, item.reNum, item.lastAuthor, item.lastTime))

    def getResponseContent(self, url):
        """GET the page and return its text, or None on any request failure."""
        try:
            # timeout keeps a dead connection from hanging the crawler;
            # raise_for_status turns HTTP 4xx/5xx into an exception.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            # Was a bare `except:` that silently fell through and let the
            # function implicitly return None with no explicit signal.
            self.log.error('Python 返回 URL:%s 数据失败' % url)
            return None
        else:
            self.log.info('Python 返回 URL:%s 数据成功' % url)
            return response.text


if __name__ == '__main__':
    url = 'http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50'
    GTI = GetTiebaInfo(url)
mylog.py
import logging
import getpass
import sys


class MyLog:
    """Logger that writes to both a per-script .log file and the console.

    The underlying logger is named after the current OS user; the log file
    is the running script's path with its last 3 characters ('.py' assumed)
    replaced by '.log'.
    """

    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # sys.argv[0] is assumed to end in '.py' (3 characters stripped)
        self.logFile = sys.argv[0][0:-3] + '.log'
        self.formatter = logging.Formatter(
            '%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
        self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
        self.logHand.setFormatter(self.formatter)
        self.logHand.setLevel(logging.DEBUG)
        self.logHandst = logging.StreamHandler()
        self.logHandst.setFormatter(self.formatter)
        self.logHandst.setLevel(logging.DEBUG)
        # logging.getLogger() returns the SAME logger for the same name, so
        # the original unconditionally re-added handlers on every MyLog()
        # instantiation, duplicating every log line. Add them only once.
        if not self.logger.handlers:
            self.logger.addHandler(self.logHand)
            self.logger.addHandler(self.logHandst)

    # One convenience method per standard logging level.
    def debug(self, msg):
        self.logger.debug(msg)

    def info(self, msg):
        self.logger.info(msg)

    def warn(self, msg):
        # Logger.warn is deprecated; delegate to warning() while keeping
        # this method's name for existing callers.
        self.logger.warning(msg)

    def error(self, msg):
        self.logger.error(msg)

    def critical(self, msg):
        self.logger.critical(msg)


if __name__ == '__main__':
    mylog = MyLog()
    mylog.debug(u"I'm debug 测试中文")
    mylog.info("I'm info")
    mylog.warn("I'm warm")
    mylog.error(u"I'm error 测试中文")
    mylog.critical("I'm critical")
相关文章推荐
- Python爬虫实战入门五:获取JS动态内容—爬取今日头条
- 从零开始写Python爬虫 --- 1.5 爬虫实践: 获取百度贴吧内容
- Python爬虫实战(1)——百度贴吧抓取帖子并保存内容和图片
- Python爬虫实战入门五:获取JS动态内容—爬取今日头条
- Python爬虫实战二之爬取百度贴吧帖子
- python-70:使用BS4获取正文内容
- [Python]实战——百度贴吧爬虫
- Python爬虫实战二:下载百度贴吧帖子内的壁纸
- python爬虫实战(六)--------新浪微博(爬取微博帐号所发内容,不爬取历史内容)
- [Python爬虫之路2]爬取百度贴吧内容
- Python爬虫实战一之使用Beautiful Soup抓取‘谣言百科’的分类内容
- Python爬虫_简单获取百度贴吧图片
- python爬虫——获取正文内容
- Python爬虫实战--(三)获取网页中的动态数据
- [python3]爬虫实战二之爬取百度贴吧帖子
- Python爬虫实战二之爬取百度贴吧帖子
- Python爬虫实战(五) :下载百度贴吧帖子里的所有图片
- Python爬虫_获取贴吧内容
- python爬虫学习(一)通过urllib2模块获取html,设置用户代理
- python3 [爬虫入门实战] 爬虫之使用selenium 爬取百度招聘内容并存mongodb