python3学习爬虫 正则以及url
2015-12-04 09:12
531 查看
# coding=utf8
# Small regex-based scraper: download a page, extract every match of a
# pattern, and store the matches (and optionally images / database rows).
__author__ = 'Administrator'

import os
import re
import urllib.request


class Spider:
    """Fetch one URL and extract every match of a regular expression.

    Attributes:
        url: Address of the page to download.
        retext: Regular-expression source applied to the decoded page.
        path: Directory that receives ``url.txt`` and downloaded images.
    """

    def __init__(self, url, retext, path):
        """Store the page URL, the regex source and the output directory."""
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self, path):
        """Create ``path`` (including parents) if it is missing; return it."""
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(path, exist_ok=True)
        return path

    def getData(self):
        """Download ``self.url`` and return ``re.findall`` results for it.

        The body is decoded as UTF-8 with undecodable bytes dropped. The
        result follows ``re.findall`` rules: a list of strings when the
        pattern has zero or one group, a list of tuples otherwise.
        """
        request = urllib.request.Request(self.url)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            html = response.read()
        print(html)
        html = html.decode('utf-8', 'ignore')
        return re.compile(self.retext).findall(html)

    def saveImg(self, imgurl, imgname):
        """Download ``imgurl`` and store it as ``<self.path>/<imgname>.jpg``.

        A failure to open the URL is printed and the method returns without
        writing anything (best effort). The target directory must exist.
        """
        try:
            response = urllib.request.urlopen(imgurl)
        except Exception as e:
            print(e)
            return
        with response:
            img = response.read()
        # os.path.join keeps the target consistent with the directory that
        # mkdir() creates, for relative and absolute paths alike.
        target = os.path.join(self.path, "%s.jpg" % imgname)
        with open(target, 'wb') as f:
            f.write(img)

    def saveMysql(self, title, url, catogary, content):
        """Insert one row into the ``pic`` table.

        Database errors are printed rather than raised (best effort).
        NOTE(review): the connection settings below contain masked
        placeholder values; replace them with real ones before use.
        """
        # Imported here because this is the only method that needs the
        # database driver; the scraper itself works without it installed.
        import pymysql

        # Parameterized statement: the driver escapes the values, and the
        # number of placeholders now matches the four columns.
        insert = ("insert into pic(title,url,catogary,content) "
                  "values (%s,%s,%s,%s)")
        con = None
        try:
            con = pymysql.connect(host='qdm***w.com', user='q****46',
                                  passwd='*******', db='qd*****db',
                                  port=3306, charset='utf8')
            cur = con.cursor()
            try:
                cur.execute(insert, (title, url, catogary, content))
            finally:
                cur.close()
            # PyMySQL does not autocommit by default; without this the
            # insert is discarded when the connection closes.
            con.commit()
        except Exception as e:
            print("发生异常:%s"%e)
        finally:
            if con is not None:
                con.close()

    def getContent(self):
        """Scrape the page, print each match and list them in ``url.txt``.

        ``url.txt`` is (re)created inside ``self.path`` as UTF-8 with one
        match per line.
        """
        path = self.mkdir(self.path)
        data = self.getData()
        with open(os.path.join(path, 'url.txt'), 'w', encoding='utf-8') as fp:
            for d in data:
                print(d)
                fp.write(d + '\n')
        # NOTE: the original listing had a disabled hook here that would
        # call self.saveImg(d, index) for matches shorter than 80
        # characters; images are therefore not downloaded.


if __name__ == "__main__":
    # Guarded so that importing this module performs no network access.
    url = "http://www.zhihu.com/question/29649162"
    url2 = 'http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1'
    retext = r'http://.*?\.jpg|http://.*?\.png'
    retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'
    spider = Spider(url, retext2, "赵丽颖")
    spider.getContent()
相关文章推荐
- python --面向对象
- python --函数
- python --for循环
- python --if语句
- 学习Python (九)
- 饿了么黑客马拉松参赛杂谈
- python核心编程-默认函数对象参数
- python 爬取天气温度
- python 爬取贴吧图片
- python操作excel的技巧整理
- python解析XML方法
- Conque ERROR: Python interface cannot be loaded
- Python.h: No such file or directory
- Python结构化编程
- python requests 高级用法 -- 包括SSL 证书错误的解决方案
- python execl转xml应用
- 使用Python开发windows GUI程序入门实例
- Python脚本自动化编译RPM包
- Python之路(二)
- 17.2. socket — Low-level networking interface Python