您的位置:首页 > 编程语言 > Python开发

Python 3 爬虫学习:正则表达式与 urllib 的使用

2015-12-04 09:12 531 查看
#coding=utf8
__author__ = 'Administrator'

import os
import re
import urllib.request
import pymysql

class Spider:
    """Download a page, extract regex matches from it and store the results.

    Instance attributes (all set in ``__init__``):
        url:    address of the page to download.
        retext: regular-expression source applied to the decoded page.
        path:   directory that receives ``url.txt`` and downloaded images.
    """

    def __init__(self, url, retext, path):
        """Remember the target URL, the extraction pattern and the output directory."""
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self, path):
        """Create ``path`` (with any missing parents) and return it unchanged."""
        # exist_ok replaces the racy "check os.path.exists, then create" sequence.
        os.makedirs(path, exist_ok=True)
        return path

    def getData(self):
        """Fetch ``self.url`` and return ``re.findall`` of ``self.retext`` on it.

        The body is decoded as UTF-8 with undecodable bytes dropped. The raw
        bytes are echoed to stdout before decoding, as the original did.
        """
        request = urllib.request.Request(self.url)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
        print(raw)
        html = raw.decode('utf-8', 'ignore')
        return re.compile(self.retext).findall(html)

    def saveImg(self, imgurl, imgname):
        """Download ``imgurl`` and write it to ``<self.path>/<imgname>.jpg``.

        A failure to open the URL is printed and the method returns without
        writing anything (best-effort download). Errors while reading the
        response or writing the file propagate to the caller.
        """
        try:
            response = urllib.request.urlopen(imgurl)
        except Exception as e:
            print(e)
            return
        with response:
            img = response.read()
        # Build the target from the same path that mkdir() creates, so that an
        # absolute self.path works too (the old "./%s/..." form broke on it).
        directory = self.mkdir(self.path)
        target = os.path.join(directory, "%s.jpg" % imgname)
        with open(target, 'wb') as f:
            f.write(img)

    def saveMysql(self, title, url, catogary, content):
        """Insert one row into the ``pic`` table.

        Any exception (connection, SQL, ...) is caught and printed; the method
        never raises. The connection settings are the placeholders from the
        original file.
        """
        # Driver-side placeholders: pymysql escapes the values itself, which
        # both prevents SQL injection and fixes the original statement, whose
        # three '%s' slots could never be filled with four values.
        insert = "insert into pic(title,url,catogary,content) values (%s,%s,%s,%s)"
        try:
            con = pymysql.connect(host='qdm***w.com', user='q****46', passwd='*******', db='qd*****db', port=3306, charset='utf8')
            try:
                cur = con.cursor()
                try:
                    cur.execute(insert, (title, url, catogary, content))
                finally:
                    cur.close()  # close the cursor
                # Autocommit is off by default in pymysql; without this the
                # insert is discarded when the connection closes.
                con.commit()
            finally:
                con.close()  # release the database connection
        except Exception as e:
            print("发生异常:%s" % e)

    def getContent(self):
        """Run the crawl: print every match and write it to ``<path>/url.txt``.

        Matches are written back to back, with no separator, exactly as the
        original did.
        NOTE(review): a newline between entries was probably intended - confirm
        before changing the file format.
        """
        path = self.mkdir(self.path)
        data = self.getData()
        # The original had a disabled self.saveImg(d, x) call here for matches
        # shorter than 80 characters; it stays disabled.
        with open(os.path.join(path, 'url.txt'), 'w+') as fp:
            for d in data:
                print(d)
                fp.write(d)

# Candidate pages; only `url` is passed to the spider below.
url = 'http://www.zhihu.com/question/29649162'
url2 = "http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1"

# Pattern for absolute .jpg / .png links (defined here, not used by the run below).
retext = r"http://.*?\.jpg|http://.*?\.png"
# Pattern capturing the inner text of <h2 class="zm-item-title..."> elements.
retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'

# Crawl `url` with the heading pattern; output goes to the "赵丽颖" directory.
spider = Spider(url=url, retext=retext2, path='赵丽颖')
spider.getContent()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: