python爬取妹纸图片
2018-02-13 11:37
127 查看
源代码:
程序说明:
功能:爬取www.mzitu.com网页中1-14页中妹子图片
注意:随着www.mzitu.com网页结构的改变,此代码也需要作出相应的变化
程序抓取结果:
from urllib import request,parse
import re
import os
import time
# Get the number of picture pages in a gallery.
def getPicNum(links):
    """Return how many picture pages the gallery at *links* contains.

    Fetches the gallery front page and scrapes its pagination
    ``<span>`` tags; on mzitu.com's layout the 5th numeric span
    holds the last page number.

    :param links: gallery front-page URL, e.g. ``http://www.mzitu.com/119965``
    :return: page count as an ``int``
    :raises IndexError: if the site layout changed and fewer than five
        numeric ``<span>`` tags are present.
    """
    headers = {
        "Host": " www.mzitu.com",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "Upgrade-Insecure-Requests": " 1",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    req = request.Request(url=links, headers=headers)
    # Close the connection deterministically instead of leaking it.
    with request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    # Raw string: '\d' in a plain string is an invalid escape (DeprecationWarning).
    pattern = re.compile(r'<span>(\d*)</span>')
    res = pattern.findall(html)
    # Index 4 is the last-page number in the site's pagination markup.
    pageNum = int(res[4])
    return pageNum
# Build the list of gallery links from index pages 1-14.
def getList():
    """Collect gallery URLs from index pages 1..14 of www.mzitu.com.

    Each index page is fetched with a 0.5 s delay between requests and
    scraped for ``href="..." target="_blank"`` anchors.

    :return: list of gallery URLs (duplicates possible if the site
        repeats a link across index pages).
    """
    base_url = "http://www.mzitu.com/page/{0}/"
    headers = {
        "Host": " www.mzitu.com",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "Upgrade-Insecure-Requests": " 1",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Referer": "http://www.mzitu.com/119965",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    # Compile once outside the loop; raw string fixes the '\s' escape.
    pattern = re.compile(r'href="(.*?)"\starget="_blank"><')
    allLinks = list()
    for one in range(1, 15):
        time.sleep(0.5)  # be polite: throttle requests
        fullPath = base_url.format(one)
        req = request.Request(url=fullPath, headers=headers)
        # Close each connection instead of leaking it.
        with request.urlopen(req) as response:
            html = response.read().decode("utf-8")
        allLinks += pattern.findall(html)
    return allLinks
# Download one picture.
def downloadPic(picPath, fileName, dirName):
    """Download the image at *picPath* into the gallery's folder.

    The file is written under the fixed download root
    ``c:/Users/Michael/Desktop/爬取内容/<dirName>/<fileName>``.

    :param picPath: absolute image URL (i.meizitu.net)
    :param fileName: file name to save under (last URL path segment)
    :param dirName: gallery folder name; must already exist (created by the caller)
    """
    print("downloading...." + fileName)
    headers = {
        "Host": " i.meizitu.net",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " image/webp,image/apng,image/*,*/*;q=0.8",
        "Referer": "http://www.mzitu.com/120012",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    req = request.Request(url=picPath, headers=headers)
    # Close the HTTP connection deterministically instead of leaking it.
    with request.urlopen(req) as response:
        context = response.read()
    with open("c:/Users/Michael/Desktop/爬取内容/{dirName}/{fileName}".format(dirName=dirName, fileName=fileName), "wb") as fp:
        fp.write(context)
# Walk every picture page of one gallery and download its image.
def getPic(base):
    """Download every picture of the gallery rooted at *base*.

    Creates the per-gallery folder, asks :func:`getPicNum` for the page
    count, then fetches page ``base/1`` .. ``base/<count>`` (0.5 s apart),
    scraping the first ``<img src>`` of each and handing it to
    :func:`downloadPic`.

    :param base: gallery front-page URL without trailing slash,
        e.g. ``http://www.mzitu.com/119965``
    """
    base_url = base + "/%d"
    headers = {
        "Host": " www.mzitu.com",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "Upgrade-Insecure-Requests": " 1",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    # base_url ends with "/%d", so [-2] is the gallery id segment.
    dirName = base_url.split("/")[-2]
    targetDir = "c:/Users/Michael/Desktop/爬取内容/{dirName}/".format(dirName=dirName)
    if not os.path.exists(targetDir):
        os.makedirs(targetDir)
    count = getPicNum(base)
    # Compile once outside the loop; raw string for the regex.
    pattern = re.compile(r'img.*?src="(.*?)"')
    # Pages run 1..count inclusive — range(1, count) skipped the last
    # page (off-by-one), fixed with count + 1.
    for one in range(1, count + 1):
        time.sleep(0.5)  # throttle requests
        fullUrl = base_url % one
        req = request.Request(url=fullUrl, headers=headers)
        # Close each connection instead of leaking it.
        with request.urlopen(req) as response:
            html = response.read().decode("utf-8")
        res = pattern.search(html)
        if res is not None:
            picPath = res.group(1)
            fileName = picPath.split("/")[-1]
            downloadPic(picPath, fileName, dirName)
        else:
            print("第{0}页没有图片".format(one))
        print(one, "------------------------------------------------------------------------")
    print("下载完毕")
if __name__ == '__main__':
    # Crawl the index pages for gallery links, then download each gallery.
    for link in getList():
        getPic(link)
程序说明:
功能:爬取www.mzitu.com网页中1-14页中妹子图片
注意:随着www.mzitu.com网页结构的改变,此代码也需要作出相应的变化
程序抓取结果:
from urllib import request,parse
import re
import os
import time
# Get the number of picture pages in a gallery.
def getPicNum(links):
    """Return how many picture pages the gallery at *links* contains.

    Fetches the gallery front page and scrapes its pagination
    ``<span>`` tags; on mzitu.com's layout the 5th numeric span
    holds the last page number.

    :param links: gallery front-page URL, e.g. ``http://www.mzitu.com/119965``
    :return: page count as an ``int``
    :raises IndexError: if the site layout changed and fewer than five
        numeric ``<span>`` tags are present.
    """
    headers = {
        "Host": " www.mzitu.com",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "Upgrade-Insecure-Requests": " 1",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    req = request.Request(url=links, headers=headers)
    # Close the connection deterministically instead of leaking it.
    with request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    # Raw string: '\d' in a plain string is an invalid escape (DeprecationWarning).
    pattern = re.compile(r'<span>(\d*)</span>')
    res = pattern.findall(html)
    # Index 4 is the last-page number in the site's pagination markup.
    pageNum = int(res[4])
    return pageNum
# Build the list of gallery links from index pages 1-14.
def getList():
    """Collect gallery URLs from index pages 1..14 of www.mzitu.com.

    Each index page is fetched with a 0.5 s delay between requests and
    scraped for ``href="..." target="_blank"`` anchors.

    :return: list of gallery URLs (duplicates possible if the site
        repeats a link across index pages).
    """
    base_url = "http://www.mzitu.com/page/{0}/"
    headers = {
        "Host": " www.mzitu.com",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "Upgrade-Insecure-Requests": " 1",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Referer": "http://www.mzitu.com/119965",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    # Compile once outside the loop; raw string fixes the '\s' escape.
    pattern = re.compile(r'href="(.*?)"\starget="_blank"><')
    allLinks = list()
    for one in range(1, 15):
        time.sleep(0.5)  # be polite: throttle requests
        fullPath = base_url.format(one)
        req = request.Request(url=fullPath, headers=headers)
        # Close each connection instead of leaking it.
        with request.urlopen(req) as response:
            html = response.read().decode("utf-8")
        allLinks += pattern.findall(html)
    return allLinks
# Download one picture.
def downloadPic(picPath, fileName, dirName):
    """Download the image at *picPath* into the gallery's folder.

    The file is written under the fixed download root
    ``c:/Users/Michael/Desktop/爬取内容/<dirName>/<fileName>``.

    :param picPath: absolute image URL (i.meizitu.net)
    :param fileName: file name to save under (last URL path segment)
    :param dirName: gallery folder name; must already exist (created by the caller)
    """
    print("downloading...." + fileName)
    headers = {
        "Host": " i.meizitu.net",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " image/webp,image/apng,image/*,*/*;q=0.8",
        "Referer": "http://www.mzitu.com/120012",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    req = request.Request(url=picPath, headers=headers)
    # Close the HTTP connection deterministically instead of leaking it.
    with request.urlopen(req) as response:
        context = response.read()
    with open("c:/Users/Michael/Desktop/爬取内容/{dirName}/{fileName}".format(dirName=dirName, fileName=fileName), "wb") as fp:
        fp.write(context)
# Walk every picture page of one gallery and download its image.
def getPic(base):
    """Download every picture of the gallery rooted at *base*.

    Creates the per-gallery folder, asks :func:`getPicNum` for the page
    count, then fetches page ``base/1`` .. ``base/<count>`` (0.5 s apart),
    scraping the first ``<img src>`` of each and handing it to
    :func:`downloadPic`.

    :param base: gallery front-page URL without trailing slash,
        e.g. ``http://www.mzitu.com/119965``
    """
    base_url = base + "/%d"
    headers = {
        "Host": " www.mzitu.com",
        "Connection": " keep-alive",
        "Pragma": " no-cache",
        "Cache-Control": " no-cache",
        "Upgrade-Insecure-Requests": " 1",
        "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": " zh-CN,zh;q=0.9",
    }
    # base_url ends with "/%d", so [-2] is the gallery id segment.
    dirName = base_url.split("/")[-2]
    targetDir = "c:/Users/Michael/Desktop/爬取内容/{dirName}/".format(dirName=dirName)
    if not os.path.exists(targetDir):
        os.makedirs(targetDir)
    count = getPicNum(base)
    # Compile once outside the loop; raw string for the regex.
    pattern = re.compile(r'img.*?src="(.*?)"')
    # Pages run 1..count inclusive — range(1, count) skipped the last
    # page (off-by-one), fixed with count + 1.
    for one in range(1, count + 1):
        time.sleep(0.5)  # throttle requests
        fullUrl = base_url % one
        req = request.Request(url=fullUrl, headers=headers)
        # Close each connection instead of leaking it.
        with request.urlopen(req) as response:
            html = response.read().decode("utf-8")
        res = pattern.search(html)
        if res is not None:
            picPath = res.group(1)
            fileName = picPath.split("/")[-1]
            downloadPic(picPath, fileName, dirName)
        else:
            print("第{0}页没有图片".format(one))
        print(one, "------------------------------------------------------------------------")
    print("下载完毕")
if __name__ == '__main__':
    # Crawl the index pages for gallery links, then download each gallery.
    for link in getList():
        getPic(link)
相关文章推荐
- 手把手教你用 Python 爬虫煎蛋妹纸海量图片
- 手把手教你用Python爬虫煎蛋妹纸海量图片
- 手把手教你用Python爬虫煎蛋妹纸海量图片
- python 老司机开车之二爬取福利妹纸图片
- 手把手教你用Python爬虫煎蛋妹纸海量图片
- Python 老司机开车之三爬取福利妹纸图片(多线程学习)
- 使用Python爬取煎蛋网妹纸图片
- 用Python批量爬取妹纸图片
- 利用Python PIL、cPickle将图片读取和保存为pkl格式文件
- Python徒手实现识别手写数字—简易图片数据库
- python学习笔记(一)爬虫实战:图片自动下载器
- Python拼接多张图片
- Python的PIL库实现验证码图片
- Python+selenium实现截图图片并保存截取的图片
- Python爬虫之——爬取妹子图片
- python用10行代码实现黄色图片检测
- 使用python爬取豆瓣电影图片(一)
- python制作花瓣网美女图片爬虫
- Python 将图片转化为 HTML 页面
- Python网络爬虫(4)煎蛋网妹子图片抓取