python爬duitang的摄影类图片
2014-05-08 17:32
239 查看
这几天闲着没事,写了个python爬虫,专把堆糖上的摄影类图片爬下来。
废话不多说,直接上代码,不用解释应该也能看懂。
# coding: utf-8
"""Crawl photography images from duitang.com.

Starting from the photography category page, the crawler pulls listing pages
from a FIFO queue, follows ``/category/photography/`` links to further
listing pages and ``/people/mblog/`` links to (presumably) post detail pages,
and saves every ``<img>`` whose declared width and height both exceed
``MIN_IMAGE_SIDE``.

Run as a script to start crawling; importing the module has no side effects.
"""
from html.parser import HTMLParser
import urllib.parse
import urllib.request
import string
import queue
from datetime import datetime
import os

queue_url = queue.Queue()
site = 'http://www.duitang.com'
savePath = '/home/michael/Pictures/'  # prefix of the path images are saved under

# logs
url_log = 'urls.log'
img_log = 'imgs.log'
err_log = 'errors.log'

action = ("connecting", "downloading", "parsing")

# An image is saved only when both declared dimensions exceed this value.
MIN_IMAGE_SIDE = 300

# Every page/image URL already queued or fetched.  Listing pages link back to
# each other, so without this set the queue would never drain.
visited_urls = set()

# Log handles; they stay None until main() opens them, which lets the parser
# and get_html() be used (silently, without logging) outside a crawl.
url_file = None
img_file = None
err_file = None

# Listing page currently being parsed; written to the image log.
current_url = None

ua = {
    'User-agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'
}


def _log(handle, text):
    """Write *text* to *handle*; do nothing when the log is not open."""
    if handle is not None:
        handle.write(text)


def _log_error(url_address, stage, exc):
    """Append one failure record (time, url, stage, exception) to the error log."""
    _log(err_file, "%s\t%s\t%s\t%r\n" % (datetime.now(), url_address, stage, exc))


def _open_log(path, columns):
    """Open *path* for appending, writing the header row if the file is new."""
    is_new = not os.path.isfile(path)
    handle = open(path, 'a', encoding='utf-8')
    if is_new:
        handle.write("\t".join(columns) + "\n")
    return handle


def _first_visit(url_address):
    """Record *url_address* as seen and return True only the first time."""
    if url_address in visited_urls:
        return False
    visited_urls.add(url_address)
    return True


def _parse_dimension(value):
    """Return a width/height attribute as a float, or None when unusable.

    Accepts plain numbers and a trailing ``px``; percentages, ``auto`` and
    valueless attributes yield None so the caller can skip the image instead
    of crashing.
    """
    if value is None:
        return None
    text = str(value).strip().lower()
    if text.endswith('px'):
        text = text[:-2]
    try:
        return float(text)
    except ValueError:
        return None


def _save_image(imgurl, width, height):
    """Download *imgurl* into ``savePath`` and record it in the image log."""
    # Resolve root-relative and protocol-relative sources against the site.
    full_url = urllib.parse.urljoin(site, imgurl)
    name = os.path.basename(urllib.parse.urlsplit(full_url).path)
    if not name or not _first_visit(full_url):
        return
    print(imgurl)
    try:
        urllib.request.urlretrieve(full_url, os.path.join(savePath, name))
    except Exception as exc:
        # Best-effort crawl: one broken image must not stop the run, but the
        # failure is recorded rather than swallowed.
        _log_error(full_url, action[1], exc)
        return
    _log(img_file, "%s\t%s\t%s\t%s\t%s\n"
         % (datetime.now(), current_url, name, width, height))


class MyHTMLParser(HTMLParser):
    """Parser that downloads large images and collects links to crawl.

    Args:
        strict: Accepted for compatibility with existing callers; ignored.
        follow: When true, ``<a>`` tags are followed (category links are
            queued, ``/people/mblog/`` links are fetched immediately).  When
            false only ``<img>`` tags are handled.
    """

    def __init__(self, strict, follow=True):
        super().__init__()
        self.follow = follow

    def handle_starttag(self, tag, attrs):
        """Dispatch ``<img>`` and (when following) ``<a>`` start tags."""
        if tag == "img":
            self._handle_image(attrs)
        elif tag == "a" and self.follow:
            self._handle_link(attrs)

    def _handle_image(self, attrs):
        """Save the image when both declared dimensions are large enough."""
        imgurl = None
        # Images that do not declare a size fall below the threshold.
        width = 200
        height = 200
        for name, value in attrs:
            if name == 'src':
                imgurl = value
            elif name == 'width':
                width = value
                print('width=%s' % width)
            elif name == 'height':
                height = value
                print('height=%s' % height)

        if not imgurl:
            return
        w = _parse_dimension(width)
        h = _parse_dimension(height)
        if w is None or h is None:
            return
        if w > MIN_IMAGE_SIDE and h > MIN_IMAGE_SIDE:
            # The raw attribute strings are what gets logged.
            _save_image(imgurl, width, height)

    def _handle_link(self, attrs):
        """Queue category links and fetch images from mblog links, once each."""
        hrefs = [value for name, value in attrs if name == 'href' and value]
        if not hrefs:
            return
        href = hrefs[0]
        target = "%s%s" % (site, href)
        if href.startswith("/people/mblog/"):
            # Fetch the large image from the linked page right away.
            if _first_visit(target):
                get_img_in_url(target)
        elif href.startswith("/category/photography/"):
            if _first_visit(target):
                queue_url.put(target)


def get_html(url_address, timeout=30):
    """Fetch *url_address* and return its body decoded as UTF-8.

    Args:
        url_address: Absolute URL to fetch.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the crawl.

    Returns:
        The decoded page text, or None when the request or decoding failed.
        The outcome is written to the URL log ("YES"/"NO") and failures are
        also written to the error log, when those logs are open.
    """
    _log(url_file, "%s\t%s\t%s\t" % (datetime.now(), url_address, action[2]))
    try:
        req = urllib.request.Request(url_address, headers=ua)
        # A single request per page; the response is closed on exit.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html = response.read().decode('utf-8')
    except Exception as exc:
        # Best-effort crawl boundary: record the failure and move on.
        _log(url_file, "%s\n" % ("NO"))
        _log_error(url_address, action[0], exc)
        return None
    _log(url_file, "%s\n" % ("YES"))
    return html


def get_img_in_url(url_address):
    """Fetch *url_address* and download its large images without following links."""
    html = get_html(url_address)
    if html:
        p = MyHTMLParser(strict=False, follow=False)
        p.feed(html)


parser = MyHTMLParser(strict=False)
url = "http://www.duitang.com/category/photography/"


def main():
    """Open the logs, seed the queue and crawl until it is empty."""
    global url_file, img_file, err_file, current_url

    os.makedirs(savePath, exist_ok=True)
    try:
        url_file = _open_log(url_log, ("time", "url", "action", "success"))
        img_file = _open_log(img_log, ("time", "url", "name", "width", "height"))
        err_file = _open_log(err_log, ("time", "url", "action", "error"))

        if _first_visit(url):
            queue_url.put(url)

        while not queue_url.empty():
            current_url = queue_url.get()
            html = get_html(current_url)
            if html:
                # Drop any unterminated markup left over from the last page.
                parser.reset()
                parser.feed(html)
    finally:
        for handle in (url_file, img_file, err_file):
            if handle is not None:
                handle.close()
        url_file = img_file = err_file = None


if __name__ == "__main__":
    main()
相关文章推荐
- Python实例讲解 -- 定时播放 (闹钟+音乐)
- 与Monkeyrunner初接触-基本测试
- Python 以一个指定的间隔定时循环执行任务
- windows下python脚本程序的运行
- python 发送html邮件
- Python API 翻译-HTMLParser模块
- Hbase Python接口
- Python的数据类型(内置类型)
- Python 安装第三方包方法
- 个人python开发环境搭建
- python的range()函数
- Valid Parentheses @Leetcode -- Python
- Longest Valid Parentheses @Leetcode -- Python
- Python中threading模块的join函数
- python列表里__setslices__方法函数解析a
- python十个项目之----即时标记
- python os.path模块
- python os.path模块
- python 解析html文档模块HTMLParser
- Python第四章