
Automatically downloading the pictures of every sub-post on each page of a picture forum

2012-12-15 09:04
I often visit picture BBS forums, and sometimes I want to download all the pictures from a good post.

But saving them one by one through the web browser is tedious. WE NEED TO DO IT AUTOMATICALLY.

A picture BBS forum usually has a list page of posts, and each post page contains the pictures, with their URLs embedded in its HTML source.

So we:

1. download the list page

2. grab each post

3. parse each post to extract the picture URLs

4. download the pictures by their URLs

(A compact one-file sketch of this pipeline is shown right below; the two full scripts follow it.)
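The sketch uses urllib2 only, so it needs no external tools; the forum URL and both regular expressions are placeholders and would have to be adapted to the actual forum's HTML:

#!/usr/bin/python
# Minimal sketch of the four steps, using urllib2 only.
# url_pre and both regexes are placeholders (might need to change).
import re
import urllib2

url_pre = 'http://xxxxx.com/bbs/'
post_pat = re.compile(r'href="(thread-\d+-\d+-\d+\.html)"')
img_pat = re.compile(r'<img[^>]*src="([^"]+\.jpg)"', re.I)

for page in range(1, 6):                                  # step 1: fetch the list pages
    listing = urllib2.urlopen(url_pre + 'forum-4-%d.html' % page).read()
    for post in sorted(set(post_pat.findall(listing))):   # step 2: fetch each post
        html = urllib2.urlopen(url_pre + post).read()
        for pic in sorted(set(img_pat.findall(html))):    # step 3: extract the picture urls
            jpgfile = pic.split('/')[-1]
            open(jpgfile, 'wb').write(urllib2.urlopen(pic).read())  # step 4: download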

Here is the Python code for steps 1-2 (htmladdr.py):

#!/usr/bin/python
import re
import os
import sys

import funimg

# BBS base url (might need to change)
url_pre = 'http://xxxxx.com/bbs'

# pattern of the sub-post links in the list page (might need to change)
p = r'<a\shref="thread-\d+-\d+-\d+\.html">'

def usage():
    print sys.argv[0] + ' filepath'

def getpicfromhtml(htmlpath):
    text = open(htmlpath, 'r').readlines()
    print "try to analyse", htmlpath
    for element in text:
        m = re.findall(p, element)
        if m:
            try:
                for element_m in m:
                    # cut the real relative url out of element_m (might need to change)
                    url = element_m[9:-2]
                    print url
                    # one directory per post, named after its relative url
                    dirname = url
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    os.chdir(dirname)
                    url_sub = url_pre + dirname
                    print "get", url_sub
                    # fetch the post page with the external GET tool and save it as 'htmlfile'
                    os.system('GET ' + url_sub + ' HTTP/1.1' + ' > ' + 'htmlfile')
                    funimg.downloadpic('./htmlfile')
                    print url_sub
                    # go back to parent (might need to change)
                    os.chdir('..')
            except Exception:
                pass
        # else:
        #     print 'find no match'

# get at most 5 pages from the bbs
htmlindex = 1
while htmlindex <= 5:
    htmlfilename = 'page' + str(htmlindex)
    # forum list url (might need to change)
    html_url = url_pre + 'forum-4-' + str(htmlindex) + '.html'
    print html_url
    if not os.path.exists(htmlfilename):
        # download the list page to a file named pagex (x stands for the page index)
        print 'GET ' + html_url + ' HTTP/1.1' + ' > ' + htmlfilename
        os.system('GET ' + html_url + ' HTTP/1.1' + ' > ' + htmlfilename)
    getpicfromhtml('./page' + str(htmlindex))
    htmlindex = htmlindex + 1
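Note that htmladdr.py depends on an external GET command being on the PATH (on most Linux systems this is the lwp-request tool from libwww-perl). If it is not available, the os.system('GET ...') lines can be swapped for a small urllib2 helper; this is only a sketch, not part of the original script:

#!/usr/bin/python
import urllib2

def fetch(url, localname):
    # save the page at url into localname, roughly what "GET url > localname" does
    data = urllib2.urlopen(url).read()
    open(localname, 'w').write(data)

# e.g. instead of os.system('GET ' + url_sub + ' HTTP/1.1' + ' > ' + 'htmlfile'):
# fetch(url_sub, 'htmlfile')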

Here is the Python code for steps 3-4 (funimg.py):

#!/usr/bin/python
import re
import os
import string
import sys

# pattern of <img> tags that point to jpg files (might need to change)
p = re.compile(r'.*<img.*src=".*\.jpg"', re.I)

def downloadpic(filename):
    # read the saved post page line by line
    text = open(filename, 'r').readlines()
    for element in text:
        m = re.match(p, element)
        if m:
            # cut the real url of the pic out of the match
            url = m.group()
            i = string.find(url, "src=")
            url_sub = url[i + 4 + 1:-1]
            j = string.rfind(url, "/")
            jpgfile = url[j + 1:-1]
            if not os.path.exists(jpgfile):
                # download in the background with wget, retrying at most 3 times
                os.system('wget ' + url_sub + ' --tries=3 &')
                print url_sub
            else:
                print "exists already", jpgfile
EOF
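funimg.py hands each picture to wget and backgrounds it with '&', so several downloads can run in parallel, but the script never learns whether they succeeded. If wget is not installed, urllib.urlretrieve can do the same job in-process; here is a synchronous sketch with the same 3-retry behaviour (downloadjpg is a hypothetical helper, not part of the original script):

#!/usr/bin/python
import urllib

def downloadjpg(url_sub, jpgfile):
    # fetch url_sub and save it as jpgfile, retrying up to 3 times like wget --tries=3
    for attempt in range(3):
        try:
            urllib.urlretrieve(url_sub, jpgfile)
            return True
        except IOError:
            pass
    return False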