py 爬取页面http://m.sohu.com 并存储
2015-10-27 20:21
826 查看
1 #思路 : 利用 BeautifulSoup 省去了正则这个麻烦事,把页面抓取下来,然后提取 js、css、img;命令行参数用 getopt 解析,很方便。使用前需要确保已经安装了 BeautifulSoup,如没有安装,请到 http://www.crummy.com/software/BeautifulSoup/ 下载
2 from bs4 import BeautifulSoup
3 import urllib, urllib2,time
4 import sys,os
5 import getopt
# Python 2 only: force the implicit str<->unicode conversion codec to UTF-8.
# reload(sys) is needed because site.py removes sys.setdefaultencoding at
# interpreter start-up.  Python 3 has no reload() builtin and already defaults
# to UTF-8, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Default settings; main() overrides them from the -d / -u / -o options.
clock_time = 60                     # snapshot interval (usage shows "-d 60")
target_url = "http://m.sohu.com"    # page to mirror
target_lib = "/tmp/backup"          # root folder that receives the snapshots
13
def usage():
    """Print a one-line example of how to invoke the script."""
    print("simple like this :")
    # Forward slashes on purpose: written as "\tmp\backup" the literal would
    # contain a TAB (\t) and a BACKSPACE (\b) escape instead of a path.
    print("main.py -d 60 -u http://m.sohu.com -o /tmp/backup")
17
def _ensure_dir(path):
    """Create directory *path* (with parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _open_url(url):
    """Open *url* with the urlopen of whichever Python major version runs."""
    try:
        opener = urllib.urlopen  # Python 2 location
    except AttributeError:
        from urllib.request import urlopen as opener  # Python 3 location
    return opener(url)


def _read_url(url):
    """Return the raw body of *url*, always closing the response."""
    response = _open_url(url)
    try:
        return response.read()
    finally:
        response.close()


def _write_bytes(path, data):
    """Write *data* to *path* in binary mode, closing the file even on error."""
    with open(path, "wb") as out:
        out.write(data)


def _save_asset(page_url, link, folder, fallback_name):
    """Download *link* (resolved against *page_url*) into *folder*.

    Failures are printed and swallowed so that one broken asset does not
    abort the rest of the snapshot.
    """
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin  # Python 3
    try:
        # Last path segment without query string / fragment; a URL ending in
        # "/" yields an empty segment, hence the fallback name.
        name = link.split("#")[0].split("?")[0].split("/")[-1] or fallback_name
        data = _read_url(urljoin(page_url, link))
        _write_bytes(os.path.join(folder, name), data)
    except Exception as e:
        print(str(e))


def getHtml(target_url, target_lib, time):
    """Save a snapshot of a page together with its images, scripts and CSS.

    The snapshot is written to ``target_lib/time`` as ``index.html`` plus the
    sub-folders ``images``, ``js`` and ``css``.

    Args:
        target_url: URL of the page to fetch.
        target_lib: Root folder that holds all snapshots.
        time: Text used as the name of this snapshot's sub-folder (main()
            passes a formatted timestamp).  Note that it shadows the ``time``
            module inside this function.

    Raises:
        Whatever urlopen raises when the page itself cannot be fetched.
        Errors on individual assets are printed and skipped.
    """
    Html = _read_url(target_url)
    target_lib = target_lib + '/' + time
    _ensure_dir(target_lib)

    # save html
    print(target_lib)
    try:
        _write_bytes(target_lib + "/index.html", Html)
        print("save index.html ok!")
    except Exception as e:
        print(str(e))

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(Html, "html.parser")

    # save picture
    image_dir = target_lib + "/images"
    _ensure_dir(image_dir)
    for index, tag in enumerate(soup.find_all('img')):
        src = tag.get('src')
        if src:  # <img> without a usable src has nothing to download
            _save_asset(target_url, src, image_dir, "image" + str(index))
    print("save picture ok!")

    # save js
    js_dir = target_lib + "/js"
    _ensure_dir(js_dir)
    noName = 0
    for index, tag in enumerate(soup.find_all('script')):
        src = tag.get('src')
        if src is None:
            # Inline script: stored as wuming<N>.js ("wuming" = unnamed).
            # tag.string is None for an empty <script> or one with several
            # child nodes; there is nothing to write in that case.
            if tag.string is None:
                continue
            try:
                _write_bytes(js_dir + "/" + "wuming" + str(noName) + ".js",
                             tag.string.encode("utf-8"))
            except Exception as e:
                print(str(e))
            noName += 1
        elif src:
            _save_asset(target_url, src, js_dir, "script" + str(index) + ".js")
    print("save js ok!")

    # save css
    css_dir = target_lib + "/css"
    _ensure_dir(css_dir)
    for index, tag in enumerate(soup.find_all('link')):
        href = tag.get('href')
        if tag.get('type') == "text/css" and href:
            _save_asset(target_url, href, css_dir, "style" + str(index) + ".css")
    print("save css ok!")
93
def main():
    """Parse the command line, take one snapshot, then repeat periodically.

    Options (each overrides the module-level default of the same meaning):
        -d  seconds between two snapshots (clock_time)
        -u  URL of the page to mirror (target_url)
        -o  root folder for the snapshots (target_lib)

    With no arguments the usage text is printed and the defaults are used.
    An invalid option or a non-positive / non-numeric -d value prints the
    usage text and exits with status 2.  Otherwise the function never
    returns.
    """
    global clock_time
    global target_url
    global target_lib

    if not sys.argv[1:]:
        usage()
    try:
        opts, _ = getopt.getopt(sys.argv[1:], "d:u:o:", [])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        # Without this exit the loop below would hit an undefined "opts".
        sys.exit(2)

    for o, a in opts:
        # Plain equality: `o in ("-d")` is a substring test on the string
        # "-d", not a tuple membership test.
        if o == "-d":
            try:
                clock_time = int(a)
                if clock_time <= 0:
                    raise ValueError(a)
            except ValueError:
                print("-d expects a positive number of seconds, got: " + a)
                usage()
                sys.exit(2)
        elif o == "-u":
            target_url = a
        elif o == "-o":
            target_lib = a

    # Folder names have minute resolution; add seconds when snapshots are
    # taken more often than once a minute so that names cannot repeat.
    stamp_format = "%Y%m%d%H%M"
    if clock_time < 60:
        stamp_format += "%S"

    lastTime = int(time.time())
    otherStyleTime = time.strftime(stamp_format, time.localtime(lastTime))
    getHtml(target_url, target_lib, otherStyleTime)

    while True:
        # Sleep until the next snapshot is due instead of spinning the CPU.
        remaining = clock_time - (int(time.time()) - lastTime)
        if remaining > 0:
            time.sleep(remaining)
            continue
        lastTime = int(time.time())
        otherStyleTime = time.strftime(stamp_format, time.localtime(lastTime))
        getHtml(target_url, target_lib, otherStyleTime)
        print("update at time" + otherStyleTime)


if __name__ == "__main__":
    main()
2 from bs4 import BeautifulSoup
3 import urllib, urllib2,time
4 import sys,os
5 import getopt
# Python 2 only: force the implicit str<->unicode conversion codec to UTF-8.
# reload(sys) is needed because site.py removes sys.setdefaultencoding at
# interpreter start-up.  Python 3 has no reload() builtin and already defaults
# to UTF-8, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Default settings; main() overrides them from the -d / -u / -o options.
clock_time = 60                     # snapshot interval (usage shows "-d 60")
target_url = "http://m.sohu.com"    # page to mirror
target_lib = "/tmp/backup"          # root folder that receives the snapshots
13
def usage():
    """Print a one-line example of how to invoke the script."""
    print("simple like this :")
    # Forward slashes on purpose: written as "\tmp\backup" the literal would
    # contain a TAB (\t) and a BACKSPACE (\b) escape instead of a path.
    print("main.py -d 60 -u http://m.sohu.com -o /tmp/backup")
17
def _ensure_dir(path):
    """Create directory *path* (with parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _open_url(url):
    """Open *url* with the urlopen of whichever Python major version runs."""
    try:
        opener = urllib.urlopen  # Python 2 location
    except AttributeError:
        from urllib.request import urlopen as opener  # Python 3 location
    return opener(url)


def _read_url(url):
    """Return the raw body of *url*, always closing the response."""
    response = _open_url(url)
    try:
        return response.read()
    finally:
        response.close()


def _write_bytes(path, data):
    """Write *data* to *path* in binary mode, closing the file even on error."""
    with open(path, "wb") as out:
        out.write(data)


def _save_asset(page_url, link, folder, fallback_name):
    """Download *link* (resolved against *page_url*) into *folder*.

    Failures are printed and swallowed so that one broken asset does not
    abort the rest of the snapshot.
    """
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin  # Python 3
    try:
        # Last path segment without query string / fragment; a URL ending in
        # "/" yields an empty segment, hence the fallback name.
        name = link.split("#")[0].split("?")[0].split("/")[-1] or fallback_name
        data = _read_url(urljoin(page_url, link))
        _write_bytes(os.path.join(folder, name), data)
    except Exception as e:
        print(str(e))


def getHtml(target_url, target_lib, time):
    """Save a snapshot of a page together with its images, scripts and CSS.

    The snapshot is written to ``target_lib/time`` as ``index.html`` plus the
    sub-folders ``images``, ``js`` and ``css``.

    Args:
        target_url: URL of the page to fetch.
        target_lib: Root folder that holds all snapshots.
        time: Text used as the name of this snapshot's sub-folder (main()
            passes a formatted timestamp).  Note that it shadows the ``time``
            module inside this function.

    Raises:
        Whatever urlopen raises when the page itself cannot be fetched.
        Errors on individual assets are printed and skipped.
    """
    Html = _read_url(target_url)
    target_lib = target_lib + '/' + time
    _ensure_dir(target_lib)

    # save html
    print(target_lib)
    try:
        _write_bytes(target_lib + "/index.html", Html)
        print("save index.html ok!")
    except Exception as e:
        print(str(e))

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(Html, "html.parser")

    # save picture
    image_dir = target_lib + "/images"
    _ensure_dir(image_dir)
    for index, tag in enumerate(soup.find_all('img')):
        src = tag.get('src')
        if src:  # <img> without a usable src has nothing to download
            _save_asset(target_url, src, image_dir, "image" + str(index))
    print("save picture ok!")

    # save js
    js_dir = target_lib + "/js"
    _ensure_dir(js_dir)
    noName = 0
    for index, tag in enumerate(soup.find_all('script')):
        src = tag.get('src')
        if src is None:
            # Inline script: stored as wuming<N>.js ("wuming" = unnamed).
            # tag.string is None for an empty <script> or one with several
            # child nodes; there is nothing to write in that case.
            if tag.string is None:
                continue
            try:
                _write_bytes(js_dir + "/" + "wuming" + str(noName) + ".js",
                             tag.string.encode("utf-8"))
            except Exception as e:
                print(str(e))
            noName += 1
        elif src:
            _save_asset(target_url, src, js_dir, "script" + str(index) + ".js")
    print("save js ok!")

    # save css
    css_dir = target_lib + "/css"
    _ensure_dir(css_dir)
    for index, tag in enumerate(soup.find_all('link')):
        href = tag.get('href')
        if tag.get('type') == "text/css" and href:
            _save_asset(target_url, href, css_dir, "style" + str(index) + ".css")
    print("save css ok!")
93
def main():
    """Parse the command line, take one snapshot, then repeat periodically.

    Options (each overrides the module-level default of the same meaning):
        -d  seconds between two snapshots (clock_time)
        -u  URL of the page to mirror (target_url)
        -o  root folder for the snapshots (target_lib)

    With no arguments the usage text is printed and the defaults are used.
    An invalid option or a non-positive / non-numeric -d value prints the
    usage text and exits with status 2.  Otherwise the function never
    returns.
    """
    global clock_time
    global target_url
    global target_lib

    if not sys.argv[1:]:
        usage()
    try:
        opts, _ = getopt.getopt(sys.argv[1:], "d:u:o:", [])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        # Without this exit the loop below would hit an undefined "opts".
        sys.exit(2)

    for o, a in opts:
        # Plain equality: `o in ("-d")` is a substring test on the string
        # "-d", not a tuple membership test.
        if o == "-d":
            try:
                clock_time = int(a)
                if clock_time <= 0:
                    raise ValueError(a)
            except ValueError:
                print("-d expects a positive number of seconds, got: " + a)
                usage()
                sys.exit(2)
        elif o == "-u":
            target_url = a
        elif o == "-o":
            target_lib = a

    # Folder names have minute resolution; add seconds when snapshots are
    # taken more often than once a minute so that names cannot repeat.
    stamp_format = "%Y%m%d%H%M"
    if clock_time < 60:
        stamp_format += "%S"

    lastTime = int(time.time())
    otherStyleTime = time.strftime(stamp_format, time.localtime(lastTime))
    getHtml(target_url, target_lib, otherStyleTime)

    while True:
        # Sleep until the next snapshot is due instead of spinning the CPU.
        remaining = clock_time - (int(time.time()) - lastTime)
        if remaining > 0:
            time.sleep(remaining)
            continue
        lastTime = int(time.time())
        otherStyleTime = time.strftime(stamp_format, time.localtime(lastTime))
        getHtml(target_url, target_lib, otherStyleTime)
        print("update at time" + otherStyleTime)


if __name__ == "__main__":
    main()
相关文章推荐
- Unity--动态加载网络图片和本地图片文件
- http中content-type头值-(MIME类型)
- RESTful Web API中的Http协议语义
- 【Android网络编程】获取网络图片,具有缓存功能
- 【转】HTTP Response Header 的 Content-Disposition
- 变更管理、信息系统安全管理及项目风险管理作业
- HTTP/1.1协议(中文归纳版)
- ubuntu限制本地网速
- IOS网络笔记--地图内容3(点击获取当前地址的具体坐标)
- 使用TCP从客户端上传图片到服务器端
- netstat查看网络信息
- httpclient4.3设置代理请求
- IOS网络笔记--地图内容2(正反向编码)
- 【工具类】安卓开发 HttpPost和HttpGet请求
- HttpClient使用详解
- 通过URLHttpConnection方式来取得图片,并且显示在ImageView上
- Java IO: 网络
- 同行不同命:中美两国的“网络水军”有哪些差异
- openfire因为网络不稳定而造成消息丢失的解决方案
- TCP/IP