从某图片论坛中自动下载每页的每个子post的图片
2012-12-15 09:04
330 查看
I often go to picture BBS forums to look at pictures, and I need to download the pictures from some good posts.
But I hate saving them to the local disk one by one through the web browser. WE NEED TO DO IT AUTOMATICALLY.
A picture BBS forum usually has a list of posts, and each post contains pictures, with their URLs in its HTML source.
So we
download the list
grab each post
analyse each post to grab the pictures' URLs
download the pictures by URL
Here is the code in python do step 1-2 (htmladdr.py):
#!/usr/bin/python
import re
import os
import sys
import funimg
#BBS url (might need to change)
url_pre = 'http://xxxxx.com/bbs'
#post-link pattern inside the list page (might need to change).
#Raw string so the backslashes reach the regex engine untouched; the dot
#before "html" is escaped so it matches only a literal '.' (the original
#unescaped dot also matched e.g. "thread-1-2-3Xhtml").
p = r'<a\shref="thread-\d+-\d+-\d+\.html">'
def usage():
    """Print a one-line usage hint for this script."""
    print(sys.argv[0] + ' filepath')
def getpicfromhtml(htmlpath, pattern=None, base_url=None):
    """Scan a saved forum list page for post links and fetch each post's pictures.

    For every matching post link the function derives a directory name from
    the post's relative url, creates it, chdir()s into it, downloads the post
    html with the external ``GET`` command, and hands the saved html to
    funimg.downloadpic() to fetch the pictures, then chdir()s back.

    Parameters
    ----------
    htmlpath : str
        Path of the already-downloaded list page to scan.
    pattern : str, optional
        Regex for post links; defaults to the module-level ``p``.
    base_url : str, optional
        Prefix for the relative post urls; defaults to module ``url_pre``.

    Returns None; works purely by side effects (dirs, files, downloads).
    """
    if pattern is None:
        pattern = p
    if base_url is None:
        base_url = url_pre
    print('try to analyst ' + htmlpath)
    with open(htmlpath, 'r') as fh:
        lines = fh.readlines()
    for element in lines:
        for element_m in re.findall(pattern, element):
            #strip the 9-char '<a href="' prefix and the trailing '">' to
            #recover the relative url (might need to change)
            url = element_m[9:-2]
            print(url)
            dirname = url
            try:
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                os.chdir(dirname)
                try:
                    url_sub = base_url + dirname
                    print('get ' + url_sub)
                    #NOTE(review): shelling out with an unquoted url is unsafe
                    #for untrusted input; urllib would be more robust.
                    os.system('GET ' + url_sub + ' HTTP/1.1' + ' > ' + 'htmlfile')
                    funimg.downloadpic('./htmlfile')
                    print(url_sub)
                finally:
                    #always return to the parent directory, even on failure,
                    #so one bad post does not strand later iterations
                    os.chdir('..')
            except Exception as exc:
                #report and continue instead of silently swallowing errors
                #(the original used a bare ``pass``)
                print('failed on ' + dirname + ': ' + str(exc))
#get max 5 pages from bbs
#Main driver: fetch list pages 1..5, cache each as a local file
#page1..page5, and harvest the pictures referenced by every page.
for htmlindex in range(1, 6):
    htmlfilename = 'page' + str(htmlindex)
    #forum list url (might need to change)
    html_url = url_pre + 'forum-4-' + str(htmlindex) + '.html'
    print(html_url)
    if not os.path.exists(htmlfilename):
        #download the pagefile to a file name: pagex (x is stand for page index)
        command = 'GET ' + html_url + ' HTTP/1.1' + ' > ' + htmlfilename
        print(command)
        os.system(command)
    getpicfromhtml('./' + htmlfilename)
Here is the code in python do step 3-4 (funimg.py):
#!/usr/bin/python
import re
import os
import string
import sys
#case-insensitive matcher for lines that embed a .jpg <img> tag
#(might need to change).  Raw string; the dot before "jpg" is escaped so
#only a literal '.' matches (the original '.jpg' also matched e.g. "xjpg").
p = re.compile(r'.*<img.*src=".*\.jpg"', re.I)
def downloadpic(filename, pattern=None):
    """Download every .jpg referenced by <img src="..."> lines in *filename*.

    Each matching line is parsed for the image url; the picture is fetched
    with ``wget`` into the current directory unless a file with the same
    name already exists.

    Parameters
    ----------
    filename : str
        Path of the saved post html to scan.
    pattern : compiled regex, optional
        Matcher for image lines; defaults to the module-level ``p``.

    Returns None; works by side effect (spawns wget, prints progress).
    """
    if pattern is None:
        pattern = p
    with open(filename, 'r') as fh:
        lines = fh.readlines()
    for element in lines:
        m = pattern.match(element)
        if not m:
            continue
        matched = m.group()
        #the match ends at the closing quote, so [:-1] strips it;
        #skip the 5 chars of 'src="' to reach the url itself
        start = matched.find('src=')
        url_sub = matched[start + 5:-1]
        jpgfile = matched[matched.rfind('/') + 1:-1]
        if os.path.exists(jpgfile):
            print('exists already ' + jpgfile)
        else:
            #NOTE(review): url comes from scraped html and is passed to a
            #shell unquoted; quoting is needed for untrusted input.
            #max retry 3 times; '&' runs wget in the background
            os.system('wget ' + url_sub + ' --tries=3 &')
            print(url_sub)
EOF
But I hate saving them to the local disk one by one through the web browser. WE NEED TO DO IT AUTOMATICALLY.
A picture BBS forum usually has a list of posts, and each post contains pictures, with their URLs in its HTML source.
So we
download the list
grab each post
analyse each post to grab the pictures' URLs
download the pictures by URL
Here is the code in python do step 1-2 (htmladdr.py):
#!/usr/bin/python
"""htmladdr.py -- steps 1-2: download the forum list pages, then grab each
sub post and hand its html to funimg to pull the pictures out."""
import re
import os
import sys

import funimg

#BBS url (might need to change)
url_pre = 'http://xxxxx.com/bbs'
#post-link pattern inside the list page (might need to change).
#Raw string; the dot before "html" is escaped to match only a literal '.'.
p = r'<a\shref="thread-\d+-\d+-\d+\.html">'


def usage():
    """Print a one-line usage hint for this script."""
    print(sys.argv[0] + ' filepath')


def getpicfromhtml(htmlpath, pattern=None, base_url=None):
    """Scan a saved list page for post links and download each post's pictures.

    htmlpath -- path of the already-downloaded list page.
    pattern  -- regex for post links (defaults to module-level ``p``).
    base_url -- prefix for the relative post urls (defaults to ``url_pre``).

    Works by side effect: makes one directory per post, downloads the post
    html into it and lets funimg.downloadpic fetch the pictures.
    """
    if pattern is None:
        pattern = p
    if base_url is None:
        base_url = url_pre
    print('try to analyst ' + htmlpath)
    with open(htmlpath, 'r') as fh:
        lines = fh.readlines()
    for element in lines:
        for element_m in re.findall(pattern, element):
            #strip the 9-char '<a href="' prefix and trailing '">' to get
            #the relative url (might need to change)
            url = element_m[9:-2]
            print(url)
            dirname = url
            try:
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                os.chdir(dirname)
                try:
                    url_sub = base_url + dirname
                    print('get ' + url_sub)
                    #NOTE(review): shelling out with an unquoted url is unsafe
                    #for untrusted input; urllib would be more robust.
                    os.system('GET ' + url_sub + ' HTTP/1.1' + ' > ' + 'htmlfile')
                    funimg.downloadpic('./htmlfile')
                    print(url_sub)
                finally:
                    #always return to the parent directory, even on failure
                    os.chdir('..')
            except Exception as exc:
                #report and continue instead of silently swallowing errors
                print('failed on ' + dirname + ': ' + str(exc))


#get max 5 pages from bbs
for htmlindex in range(1, 6):
    htmlfilename = 'page' + str(htmlindex)
    #forum list url (might need to change)
    html_url = url_pre + 'forum-4-' + str(htmlindex) + '.html'
    print(html_url)
    if not os.path.exists(htmlfilename):
        #download the page to a file named pagex (x is the page index)
        command = 'GET ' + html_url + ' HTTP/1.1' + ' > ' + htmlfilename
        print(command)
        os.system(command)
    getpicfromhtml('./' + htmlfilename)
Here is the code in python do step 3-4 (funimg.py):
#!/usr/bin/python
"""funimg.py -- steps 3-4: find picture urls in a post page and wget them."""
import re
import os
import string  # retained from the original; no longer used after the fix
import sys

#case-insensitive matcher for lines embedding a .jpg <img> tag
#(might need to change).  Raw string; the dot before "jpg" is escaped so
#only a literal '.' matches.
p = re.compile(r'.*<img.*src=".*\.jpg"', re.I)


def downloadpic(filename, pattern=None):
    """Download every .jpg referenced by <img src="..."> lines in *filename*.

    filename -- path of the saved post html to scan.
    pattern  -- compiled regex for image lines (defaults to module ``p``).

    Pictures are fetched with wget into the current directory unless a file
    of the same name already exists.
    """
    if pattern is None:
        pattern = p
    with open(filename, 'r') as fh:
        lines = fh.readlines()
    for element in lines:
        m = pattern.match(element)
        if not m:
            continue
        matched = m.group()
        #the match ends at the closing quote, so [:-1] strips it;
        #skip the 5 chars of 'src="' to reach the url itself
        start = matched.find('src=')
        url_sub = matched[start + 5:-1]
        jpgfile = matched[matched.rfind('/') + 1:-1]
        if os.path.exists(jpgfile):
            print('exists already ' + jpgfile)
        else:
            #max retry 3 times; '&' runs wget in the background
            os.system('wget ' + url_sub + ' --tries=3 &')
            print(url_sub)
EOF
相关文章推荐
- 从某图片论坛中自动下载每页的每个子post的图片 - 邓维 - 博客园
- iOS UI 15 网络编程下载 图片 音乐 大文件 视频 get/ post方法
- Python的系统管理_07_python_自动下载图片示例
- iOS UI 15 网络编程下载 图片 音乐 大文件 视频 get/ post方法
- PYTHON实现DISCUZ论坛的自动POST登录发贴回帖
- scrapy 自动下载图片
- PYTHON实现DISCUZ论坛的自动POST登录发贴回帖
- 用正则表达式自动下载网页中的图片
- Android HttpURLConnection获取网络下载图片POST请求方式
- 如何让redmine中问题或论坛中图片附件自动预览
- jQuery自动与手动图片切换效果下载
- HttpUrlConnection 下的post, get , 和图片下载(工具类)
- HTML5画渐变背景图片并自动下载实现步骤
- Discuz! 5.0.0论坛程序中加入一段js代码,让会员点击下载附件前自动弹出提示窗口
- 关于给出url自动下载图片的demo
- 从XKCD网站下载自动所有漫画图片---python实现
- 从指定网页自动下载其中所有的图片
- 自动识别HTML代码里的图片链接,并下载到服务器的指定目录源码
- C#实体图片下载与批量下载(自动保存功能)
- java下载远程http地址的图片文件到本地-自动处理图片是否经过服务器gzip压缩的问题