python 爬虫的简单示例
2015-11-02 16:13
465 查看
最近在学习 Python,语法学完之后,出于兴趣写了一个简单的爬虫,现将代码粘贴如下:
#!/usr/bin/python
# coding=utf-8
import re
import urllib
import sys
import os
# Read the run label from the first command-line argument; it names the
# sub-directory of the data directory that downloads are written to.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an unexplained IndexError.
    sys.exit("Usage: %s <times>" % sys.argv[0])
times = sys.argv[1]
print(times)
def getHtml(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch; handed to ``urlopen`` unchanged.

    Returns:
        Whatever ``read()`` on the opened response yields (the undecoded
        page content).

    Raises:
        IOError: (or a subclass) propagated from ``urlopen``/``read`` when
            the request fails.
    """
    import contextlib

    try:
        from urllib import urlopen  # Python 2 location
    except ImportError:
        from urllib.request import urlopen  # Python 3 location

    # closing() releases the connection even when read() raises; the
    # Python 2 response object is not a context manager by itself.
    with contextlib.closing(urlopen(url)) as page:
        return page.read()
# Check whether the given URL can currently be visited.
def isable2visit(url):
    """Return True when *url* answers with status code 200, else False.

    Args:
        url: Address to probe; handed to ``urlopen`` unchanged.

    Returns:
        True only if the opened response reports status 200. False for any
        other status, and also when ``urlopen`` raises ``IOError`` (for
        example an unreachable host), so callers can use this as a plain
        yes/no check.
    """
    import contextlib

    try:
        from urllib import urlopen  # Python 2 location
    except ImportError:
        from urllib.request import urlopen  # Python 3 location

    try:
        # closing() makes sure the probe connection is released.
        with contextlib.closing(urlopen(url)) as response:
            return response.getcode() == 200
    except IOError:
        # A URL that cannot be opened at all is simply "not visitable".
        return False
# Create the output directory.
def createDir(dir):
    """Make sure directory *dir* exists and return its path.

    Args:
        dir: Path of the directory to create; missing parents are created
            too. (The name shadows the builtin but is kept so existing
            callers are unaffected.)

    Returns:
        The same path that was passed in.

    Raises:
        OSError: If the directory cannot be created, including when the
            path already exists but is not a directory.
    """
    import errno

    try:
        # Create first and inspect the failure afterwards: this avoids the
        # gap between an exists() check and the makedirs() call.
        os.makedirs(dir)
    except OSError as exc:
        # Only "already there, and it is a directory" is acceptable.
        if exc.errno != errno.EEXIST or not os.path.isdir(dir):
            raise
    else:
        print("Success to create file " + dir)
    return dir
# Download the images referenced by a page and save them locally.
def getImg(html, x, times):
    """Save every ``.jpg`` matched in *html* and return the next file index.

    Args:
        html: Page source to scan for ``src="....jpg" pic_ext`` attributes.
        x: Index used as the file name of the first saved image
            (``<x>.jpg``); incremented once per image.
        times: Directory the images are written into. The parameter name is
            historical; the caller in this script passes the storage
            directory here.

    Returns:
        The index following the last image saved (``x`` unchanged when the
        page contains no match).

    Raises:
        IOError: (or a subclass) propagated from ``urlretrieve`` when a
            download or the local write fails.
    """
    try:
        from urllib import urlretrieve  # Python 2 location
    except ImportError:
        from urllib.request import urlretrieve  # Python 3 location

    # On Python 3 a downloaded page arrives as bytes, which a text pattern
    # cannot search; on Python 2 bytes is str and this branch is skipped.
    # Assumes the image URLs themselves are ASCII - TODO confirm.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode("utf-8", "replace")

    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for imgurl in imgre.findall(html):
        # Use the directory that was passed in rather than a module global,
        # so the function also works outside this script.
        local = os.path.join(times, str(x) + '.jpg')
        urlretrieve(imgurl, local)
        x += 1
    return x
# Ask for the URL prefix; page numbers 0..99 are appended to it below.
try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3: input() already returns the raw string

urls = read_line("Enter the preFix of the url:")
if not urls:
    # Nothing entered: fall back to the built-in example prefix.
    urls = "http://tieba.baidu.com/p/41254316"
print(urls)

x = 0  # running image counter; also used as the next file name
storeDir = "/home/liyong/python/spider/data/" + str(times)
# The module-level name `dir` shadows the builtin, but it is kept because
# other code in this file may read it as a global.
dir = createDir(storeDir)

PAGE_COUNT = 100
for i in range(PAGE_COUNT):
    url = urls + str(i)
    print("Done %.2f%%" % (float(i) / PAGE_COUNT * 100))
    if isable2visit(url):
        # getImg returns the next free index, so numbering continues
        # across pages.
        x = getImg(getHtml(url), x, dir)
print("Done 100%")
爬取结果如下:
#!/usr/bin/python
# coding=utf-8
import re
import urllib
import sys
import os
# Read the run label from the first command-line argument; it names the
# sub-directory of the data directory that downloads are written to.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an unexplained IndexError.
    sys.exit("Usage: %s <times>" % sys.argv[0])
times = sys.argv[1]
print(times)
def getHtml(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch; handed to ``urlopen`` unchanged.

    Returns:
        Whatever ``read()`` on the opened response yields (the undecoded
        page content).

    Raises:
        IOError: (or a subclass) propagated from ``urlopen``/``read`` when
            the request fails.
    """
    import contextlib

    try:
        from urllib import urlopen  # Python 2 location
    except ImportError:
        from urllib.request import urlopen  # Python 3 location

    # closing() releases the connection even when read() raises; the
    # Python 2 response object is not a context manager by itself.
    with contextlib.closing(urlopen(url)) as page:
        return page.read()
# Check whether the given URL can currently be visited.
def isable2visit(url):
    """Return True when *url* answers with status code 200, else False.

    Args:
        url: Address to probe; handed to ``urlopen`` unchanged.

    Returns:
        True only if the opened response reports status 200. False for any
        other status, and also when ``urlopen`` raises ``IOError`` (for
        example an unreachable host), so callers can use this as a plain
        yes/no check.
    """
    import contextlib

    try:
        from urllib import urlopen  # Python 2 location
    except ImportError:
        from urllib.request import urlopen  # Python 3 location

    try:
        # closing() makes sure the probe connection is released.
        with contextlib.closing(urlopen(url)) as response:
            return response.getcode() == 200
    except IOError:
        # A URL that cannot be opened at all is simply "not visitable".
        return False
# Create the output directory.
def createDir(dir):
    """Make sure directory *dir* exists and return its path.

    Args:
        dir: Path of the directory to create; missing parents are created
            too. (The name shadows the builtin but is kept so existing
            callers are unaffected.)

    Returns:
        The same path that was passed in.

    Raises:
        OSError: If the directory cannot be created, including when the
            path already exists but is not a directory.
    """
    import errno

    try:
        # Create first and inspect the failure afterwards: this avoids the
        # gap between an exists() check and the makedirs() call.
        os.makedirs(dir)
    except OSError as exc:
        # Only "already there, and it is a directory" is acceptable.
        if exc.errno != errno.EEXIST or not os.path.isdir(dir):
            raise
    else:
        print("Success to create file " + dir)
    return dir
# Download the images referenced by a page and save them locally.
def getImg(html, x, times):
    """Save every ``.jpg`` matched in *html* and return the next file index.

    Args:
        html: Page source to scan for ``src="....jpg" pic_ext`` attributes.
        x: Index used as the file name of the first saved image
            (``<x>.jpg``); incremented once per image.
        times: Directory the images are written into. The parameter name is
            historical; the caller in this script passes the storage
            directory here.

    Returns:
        The index following the last image saved (``x`` unchanged when the
        page contains no match).

    Raises:
        IOError: (or a subclass) propagated from ``urlretrieve`` when a
            download or the local write fails.
    """
    try:
        from urllib import urlretrieve  # Python 2 location
    except ImportError:
        from urllib.request import urlretrieve  # Python 3 location

    # On Python 3 a downloaded page arrives as bytes, which a text pattern
    # cannot search; on Python 2 bytes is str and this branch is skipped.
    # Assumes the image URLs themselves are ASCII - TODO confirm.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode("utf-8", "replace")

    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for imgurl in imgre.findall(html):
        # Use the directory that was passed in rather than a module global,
        # so the function also works outside this script.
        local = os.path.join(times, str(x) + '.jpg')
        urlretrieve(imgurl, local)
        x += 1
    return x
# Ask for the URL prefix; page numbers 0..99 are appended to it below.
try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3: input() already returns the raw string

urls = read_line("Enter the preFix of the url:")
if not urls:
    # Nothing entered: fall back to the built-in example prefix.
    urls = "http://tieba.baidu.com/p/41254316"
print(urls)

x = 0  # running image counter; also used as the next file name
storeDir = "/home/liyong/python/spider/data/" + str(times)
# The module-level name `dir` shadows the builtin, but it is kept because
# other code in this file may read it as a global.
dir = createDir(storeDir)

PAGE_COUNT = 100
for i in range(PAGE_COUNT):
    url = urls + str(i)
    print("Done %.2f%%" % (float(i) / PAGE_COUNT * 100))
    if isable2visit(url):
        # getImg returns the next free index, so numbering continues
        # across pages.
        x = getImg(getHtml(url), x, dir)
print("Done 100%")
爬取结果如下:
相关文章推荐
- python实现拷贝指定文件到指定目录
- python 类型转换函数
- python字符编码错误的解决方案
- Python - Json
- 以正确的方式开源 Python 项目(转)
- python 2.6 替换安装Python2.7
- python中关于装饰器的理解
- python , angular js 学习记录【3】
- 一些实用的python小脚本
- Python installation
- windows下搭建python+NLTK开发环境
- Python/scikit-learn机器学习库(线性、二次判别分析)
- Python单例模式终极版
- python-swap函数复制与赋值
- Python单例模式的4种实现方法
- Python--函数return后面的语句不执行
- window下从python开始安装科学计算环境
- python 小记
- leetcode之Find Median from Data Stream
- python小贴士之strip() split()