python 爬虫图片
2017-09-26 11:47
363 查看
#!/usr/bin/env python
# encoding: utf-8
"""Multithreaded image crawler for mm.taobao.com model galleries.

One producer thread (getUrl) scrapes paginated listing pages and pushes
per-model profile URLs onto a queue; a pool of eight consumer threads
(getImg) pulls profile URLs, fetches every <img> on the profile page and
writes the files to <cwd>/<page title>/<n>.<ext>.

@author: caopeng
@license: (C) Copyright 2013-2017, Node Supply Chain Manager Corporation Limited.
@contact: deamoncao100@gmail.com
@software: garner
@file: movie1.py
@time: 2017/9/16 0016 14:49
"""
import os
import queue
import re
import threading
import time
import urllib.request

from bs4 import BeautifulSoup


def getUrl(name, hostUrls, girlsUrls, flag):
    """Producer: turn listing pages into per-model profile URLs.

    Pops listing-page URLs from `hostUrls`, extracts every
    <a class="lady-avatar"> link and pushes it (scheme-prefixed) onto
    `girlsUrls`.  Exits when `flag` is set or when `hostUrls` stays
    empty for 2 seconds.

    :param name: thread label used in log output
    :param hostUrls: queue.Queue of listing-page URLs (consumed)
    :param girlsUrls: queue.Queue of profile URLs (produced)
    :param flag: threading.Event signalling shutdown
    """
    # Hoisted out of the loop: identical on every request.
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    while not flag.is_set():  # is_set(): isSet() is a deprecated alias
        try:
            hostUrl = hostUrls.get(timeout=2)
        except queue.Empty:
            print("queue empty")
            return
        request = urllib.request.Request(hostUrl, headers=headers)
        response = urllib.request.urlopen(request)
        # The listing pages are served in GBK, not UTF-8.
        data = response.read().decode('gbk')
        # Explicit parser: BeautifulSoup(data) alone warns and picks
        # whichever parser happens to be installed.
        soup = BeautifulSoup(data, 'html.parser')
        for tag_href in soup.find_all("a", attrs={"class": "lady-avatar"}):
            girlsUrls.put("https:" + tag_href['href'])
            print("录入:https:" + tag_href['href'])
        hostUrls.task_done()
        print("getUrl is working")


def getImg(name, girlsUrls, flag):
    """Consumer: download all images from one model profile per queue item.

    Extracts the numeric user id from the queued URL, fetches the aiShow
    profile page, and saves each image under a directory named after the
    page title (relative to the module-global `cdir`, set in __main__).
    Exits when `flag` is set or when `girlsUrls` stays empty for 5 seconds.

    :param name: thread label used in log output
    :param girlsUrls: queue.Queue of profile URLs (consumed)
    :param flag: threading.Event signalling shutdown
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240'
    headers = {'User-Agent': user_agent}
    # Hoisted: compiled once instead of per queue item.
    pattern = re.compile(r"/(\d+).htm")
    while not flag.is_set():
        try:
            ur = girlsUrls.get(timeout=5)
        except queue.Empty:
            print(name + " imgqueue empty")
            return
        items = pattern.findall(ur)
        girlUrl = "https://mm.taobao.com/self/aiShow.htm?userId=" + items[0]
        request = urllib.request.Request(girlUrl, headers=headers)
        response = urllib.request.urlopen(request)
        data = response.read()
        soup = BeautifulSoup(data, 'html.parser')
        # Page title doubles as the destination directory name.
        fileName = soup.head.title.contents
        fileName[0] = fileName[0].rstrip()
        tag_div = soup.find('div', attrs={"class": "mm-aixiu-content"})
        imgs = tag_div.find_all("img", attrs={})
        if len(imgs) == 0:
            girlsUrls.task_done()
            # BUGFIX: was `return`, which killed this worker for good on
            # the first empty profile and could leave girlsUrls.join()
            # waiting forever.  Keep consuming instead.
            continue
        path = os.path.join(cdir, str(fileName[0]))
        if not os.path.exists(path):
            os.makedirs(path)
        # enumerate from 1: image files are named 1.<ext>, 2.<ext>, ...
        for n, img in enumerate(imgs, start=1):
            link = img.get('src')
            if not link:
                continue
            s = "http:" + str(link)
            i = link[link.rfind('.'):]  # file extension, dot included
            try:
                request = urllib.request.Request(s)
                response = urllib.request.urlopen(request)
                imgData = response.read()
                pathfile = path + r'/' + str(n) + i
                # `with` closes the file; the original's extra f.close()
                # inside the with-block was redundant.
                with open(pathfile, 'wb') as f:
                    f.write(imgData)
                print("thread " + name + " write:" + pathfile)
            # Narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit are no longer swallowed; a failed download of
            # one image is still only logged, best-effort as before.
            except Exception:
                print(str(name) + " thread write false:" + s)
        girlsUrls.task_done()


if __name__ == '__main__':
    start = time.time()
    hostUrls = queue.Queue()
    girlsUrls = queue.Queue()
    cdir = os.getcwd()  # read by getImg as a module global
    url = 'https://mm.taobao.com/json/request_top_list.htm?page='
    flag_girl = threading.Event()
    flag_img = threading.Event()
    # Seed the producer with listing pages 1 and 2.
    for i in range(1, 3):
        hostUrls.put(url + str(i))
    threads_girl = threading.Thread(target=getUrl,
                                    args=(str(1), hostUrls, girlsUrls, flag_girl))
    threads_img = [threading.Thread(target=getImg,
                                    args=(str(i + 1), girlsUrls, flag_img))
                   for i in range(8)]
    threads_girl.start()
    # Don't start consumers until at least one profile URL exists,
    # otherwise their 5 s get() timeout could fire before the producer
    # has parsed anything.
    while girlsUrls.empty():
        print("wait..")
        time.sleep(0.1)
    for t in threads_img:
        t.start()
    hostUrls.join()   # all listing pages fully processed
    flag_girl.set()   # let the producer exit its loop
    girlsUrls.join()  # all profile pages fully processed
    flag_img.set()    # let the consumers exit their loops
    for t in threads_img:
        t.join()
    end = time.time()
    print("run time:" + str(end - start))
相关文章推荐
- Python爬虫_简单获取百度贴吧图片
- Python爬虫--抓取单一页面上的图片文件学习
- Python3.4网页爬虫,提取图片
- [Python]使用Scrapy爬虫框架简单爬取图片并保存本地
- python shell 爬虫 妹子图片
- python爬虫实战(一)--爬取知乎话题图片
- Python简单知乎爬虫--爬取页面的图片并下载到本地
- [Python爬虫] 之二十六:Selenium +phantomjs 利用 pyquery抓取智能电视网站图片信息
- python简易爬虫编写--图片获取
- Python爬虫小实践:下载妹子图www.mzitu.com网站上所有的妹子图片,并按相册名字建立文件夹分好文件名
- Python爬虫设置代理IP爬取知乎图片
- python爬虫之图片爬虫
- python实现爬虫下载美女图片
- python 实现网站图片抓取小爬虫
- python爬虫 分页获取图片并下载
- python爬虫之抓取网页中的图片到本地
- python爬虫(1) 之下载图片
- 【python爬虫】游民星空福利和壁纸帖图片爬虫
- 爬虫实战---python图片验证码破解,PIL和安装
- Python3 爬虫(三) -- 爬取豆瓣首页图片