Python豆瓣爬虫,指定文件行数写入到文件中
2014-05-13 15:12
489 查看
__author__ = 'huafeng' import os import re import time import codecs import logging import urllib2 import random from math import ceil from bs4 import BeautifulSoup PATH = os.path.dirname(os.path.abspath(__file__)) SLEEP_INTERVAL = random.randint(2, 5) class MovieActor: def __init__(self): self._gen_log() self.proxy_list = [] # self._gen_proxy() self.actor_content_text_list = [] self.timeout_url_list = [] def _gen_log(self): logfile = os.path.join(PATH, 'log', 'douban_actor_cralwer.log') self.logger = logging.getLogger(__name__) self.logger.setLevel(logging.DEBUG) log_file = logging.FileHandler(logfile) log_file.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') log_file.setFormatter(formatter) self.logger.addHandler(log_file) def _gen_proxy(self): filename = os.path.join(PATH, 'sys', 'xici_proxy') with codecs.open(filename, encoding='utf-8') as f: self.proxy_list.extend([item.strip() for item in f.readlines()]) def parse_actor_content_url(self, url): try: html = urllib2.urlopen(url, timeout=15).read() except: try: html = urllib2.urlopen(url, timeout=15).read() except: self.timeout_url_list.append(url) self.logger.debug('request timeout in item_url:%s'%url) return soup = BeautifulSoup(html) div_level_str = soup.find('div', id='content') if not div_level_str: self.logger.error('div_level do not match regular expression in url:%s'%url) return actor_summary = div_level_str.find('div', class_='bd') if not div_level_str.find('span', class_='all hidden') else div_level_str.find('span', class_='all hidden') if not actor_summary: self.logger.debug('actor_summary do not match re in item_url:%s'%url) actor_content_text = actor_summary.text.strip() if actor_content_text: print actor_content_text self.actor_content_text_list.append(actor_content_text) def write_content_into_file(self): timestamp = time.strftime('%Y_%m_%d_%H%M%S.txt') filename = os.path.join(PATH, 'out', 'actor_con_out', timestamp) actor_content_text_list = ["".join((item, '\n')) for item in self.actor_content_text_list] with codecs.open(filename, mode='a', encoding='utf-8') as wf: wf.writelines(actor_content_text_list) def write_timeout_url(self): filename = os.path.join(PATH, 'log', 'actor_info_timeout_url') timeout_url_list = [item+'\n' for item in self.timeout_url_list] with codecs.open(filename, mode='a', encoding='utf-8') as f: f.writelines(timeout_url_list) def main(self): range_from = 1000000 range_to = 1339959 url_pattern = 'http://movie.douban.com/celebrity/%d/' url_list = [url_pattern%item for item in range(range_from, range_to+1)] # print len(url_list), url_list[-1] url_count = range_to - range_from con_count_write_in_file = 2000 file_count = url_count/2000#338000 for file_point in range(file_count): for url_point in range(file_point*con_count_write_in_file, (file_point+1)*con_count_write_in_file): url = url_list[url_point] self.parse_actor_content_url(url) time.sleep(SLEEP_INTERVAL) self.write_content_into_file() self.actor_content_text_list[:] = [] if self.timeout_url_list: self.write_timeout_url() url_left = url_count - file_count*con_count_write_in_file#339959-338000 for i in range(1,url_left+1): url = url_list[-i] self.parse_actor_content_url(url) time.sleep(SLEEP_INTERVAL) self.write_content_into_file() self.actor_content_text_list[:] = [] if self.timeout_url_list: self.write_timeout_url() if __name__ == '__main__': actor = MovieActor() actor.main()main函数中,指定每2000条数据写入到时间戳文件中。不用每次append操作是判定该时数据行数已经达到具体的设定值,而且如果在append时判断其行数来决定是否要写入文件的方法,会漏掉最多2000条数据。所以该方法可以避免数据的丢失。。。
相关文章推荐
- C++读取写入.txt文件(ifstream/ofstream)—读取指定行,修改指定行,复制文件,清除文件,统计文件行数
- python计算文件的行数和读取指定行的内容
- Python爬虫系列:爬取小说并写入txt文件
- Python爬虫-爬取 ygdy8.com 站点的所有电影并写入文件和数据库
- 第三百四十一节,Python分布式爬虫打造搜索引擎Scrapy精讲—编写spiders爬虫文件循环抓取内容—meta属性返回指定值给回调函数—Scrapy内置图片下载器
- python写的整本书的小说爬虫(并写入txt文件)
- python 脚本(获取指定文件夹、指定文件格式、的代码行数、注释行数)
- python 脚本(获取指定文件夹、指定文件格式、的代码行数、注释行数)
- python指定写入文件时的编码格式
- Python爬虫-爬取51job.com 招聘信息并写入文件和数据库mysql
- python读取指定文件指定行数内容
- Python3 爬取豆瓣书籍 Xpath bs4 写入文件
- Python爬虫——FileNotFoundError: [WinError 2] 系统找不到指定的文件。
- Python爬虫-爬取集思录的金融信息,并写入文件和检测数据变化发送邮件通知
- C++读取写入.txt文件(ifstream/ofstream)—读取指定行,修改指定行,复制文件,清除文件,统计文件行数
- python 获取指定文件夹下所有文件名称并写入列表的实例
- python (1)一个简单的爬虫: python 在windows下 创建文件夹并写入文件
- Python3爬虫豆瓣电影TOP250将电影名写入到EXCEL
- python爬虫(爬取糗事百科段子)_get_text() , 文件写入
- python从页面获取中文,写入指定文件中