python爬虫
2016-07-13 17:09
453 查看
1.控制器:
2.url管理
3.下载
上面有问题,如下改进:
def startLoad(self, url):
    """Fetch *url* and return the response body, or None on a non-200 status.

    The 1-second timeout keeps an unresponsive host from stalling the
    crawl, and the response is closed on every path -- the original
    leaked the connection when the status check failed.
    """
    res = urllib2.urlopen(url, timeout=1)
    try:
        # Bail out on anything but a successful response.
        if res.getcode() != 200:
            return
        return res.read()
    finally:
        res.close()
4.解析:
5.存储展示:
测试线程;
#coding=utf-8 ''' Created on 2016��7��12�� @author: wenwen.huang ''' from spider import html_url, html_dw, html_parse, html_collect class SpiderMain(object): def __init__(self): self.urlManager = html_url.Urlmanager() self.downloader = html_dw.DownLoader() self.paser = html_parse.Parser() self.collecter = html_collect.Collect() def start(self, rootUrl): count = 1 self.urlManager.addUrl(rootUrl) while self.urlManager.hasUrl(): try: newUrl = self.urlManager.getUrl() print 'start download %d -- %s' % (count, newUrl) html = self.downloader.startLoad(newUrl) newUrls, newData = self.paser.parse(html, newUrl) self.urlManager.addUrls(newUrls) self.collecter.collect(newData) if count > 100: break count = count + 1 except Exception, e: print e print "parse fail" self.collecter.writeFile() if __name__ == '__main__': rootUrl = "http://baike.baidu.com/view/21087.htm" spider = SpiderMain() spider.start(rootUrl)
2.url管理
#coding=utf-8
'''
Created on 2016-07-12

URL manager: tracks which URLs are still pending and which have already
been handed out, so every page is fetched at most once.

@author: wenwen.huang
'''


class Urlmanager(object):

    def __init__(self):
        # Frontier of URLs not yet crawled.
        self.newUrls = set()
        # URLs already handed out via getUrl().
        self.oldurls = set()

    def addUrl(self, url):
        """Queue *url* unless it is None or already known (pending or done)."""
        if url is None:
            return
        if url in self.newUrls or url in self.oldurls:
            return
        self.newUrls.add(url)

    def hasUrl(self):
        """Return True while there are URLs left to crawl."""
        return bool(self.newUrls)

    def getUrl(self):
        """Pop an arbitrary pending URL and mark it as seen."""
        url = self.newUrls.pop()
        self.oldurls.add(url)
        return url

    def addUrls(self, urls):
        """Queue every URL in *urls*; None or an empty iterable is a no-op."""
        if not urls:
            return
        for url in urls:
            self.addUrl(url)
3.下载
#coding=utf-8
'''
Created on 2016-07-12

Page downloader built on urllib2.

@author: wenwen.huang
'''
import urllib2


class DownLoader(object):

    def startLoad(self, url):
        """Fetch *url*; return the body, or None on a non-200 status.

        Fixes the two problems of the first version: a 1-second timeout
        keeps an unresponsive host from hanging the crawl, and the
        response object is closed on every path instead of leaking.
        """
        res = urllib2.urlopen(url, timeout=1)
        try:
            if res.getcode() != 200:
                return
            return res.read()
        finally:
            res.close()
上面有问题,如下改进:
def startLoad(self, url):
    """Fetch *url* and return the response body, or None on a non-200 status.

    Keeps the 1-second timeout of the improved version, and additionally
    closes the response on the non-200 early-return path, which the
    snippet above still leaked.
    """
    res = urllib2.urlopen(url, timeout=1)
    try:
        if res.getcode() != 200:
            return
        return res.read()
    finally:
        # finally guarantees the connection is released on every path.
        res.close()
4.解析:
#coding=utf-8
'''
Created on 2016-07-12

HTML parser: extracts follow-up /view/ links and the title/summary
data from a downloaded Baidu Baike page.

@author: wenwen.huang
'''
from bs4 import BeautifulSoup
import re
import urlparse


class Parser(object):
    # Markup being scraped, for reference:
    #   <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
    #   <a target="_blank" href="/view/20965.htm">related entry</a>
    #   <div class="para" label-module="para">first paragraph</div>

    def __parseUrl(self, soup, parseUrl):
        """Collect absolute URLs of every /view/NNN.htm link on the page."""
        newUrls = set()
        links = soup.find_all("a", href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            url = link['href']
            # Links are site-relative; resolve them against the page URL.
            url = urlparse.urljoin(parseUrl, url)
            newUrls.add(url)
        return newUrls

    def __parseData(self, soup, parseUrl):
        """Extract title, source URL and first paragraph of the entry."""
        datas = {}
        titleNode = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        datas['title'] = titleNode.get_text()
        datas['url'] = parseUrl
        descNode = soup.find("div", class_="para")
        datas['desc'] = descNode.get_text()
        return datas

    def parse(self, html, parseUrl):
        """Return (set_of_new_urls, data_dict) for the given page.

        Returns (set(), None) when either argument is missing, so a
        caller that tuple-unpacks the result does not hit a TypeError
        (the original returned a bare None here).
        """
        if html is None or parseUrl is None:
            return set(), None
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        newUrls = self.__parseUrl(soup, parseUrl)
        data = self.__parseData(soup, parseUrl)
        return newUrls, data
5.存储展示:
#coding=utf-8 ''' Created on 2016��7��12�� @author: wenwen.huang ''' class Collect(object): def __init__(self): self.list = [] def collect(self, data): if data is None: return self.list.append(data) def writeFile(self): fout = open("out.html", "w") fout.write("<html>") fout.write("<head><meta charset='utf-8'>") fout.write('<link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet">') fout.write("</head>") fout.write("<body>") fout.write('<div class="container">') fout.write('<table class="table table-bordered">') for data in self.list: # print data fout.write("<tr border='1'>") fout.write('<td>%s</td>' % data['url']) title = data['title'].encode('utf-8') print title fout.write('<td>%s</td>' % title) fout.write('<td>%s</td>' % data['desc'].encode('utf-8')) # fout.write("<td>") # fout.write(data['title'].encode("utf8")) # # print data['title'] # fout.write("</td>") # fout.write("<td>") # fout.write(data['url']) # fout.write("</td>") # fout.write("<td>") # fout.write(data['desc']) # fout.write("</td>") fout.write("</tr>") fout.write("</table>") fout.write("</div>") fout.write("</body>") fout.write("</html>")
#以下容易被忽略: fout.close()
测试线程;
#coding=utf-8 ''' Created on 2016年7月13日 @author: wenwen.huang ''' from time import sleep, ctime import threading class MyThread(object): def __init__(self): self.threads = [] def music(self, name): for i in range(2): print " 我在听歌, 听 %s in %s \n" % (name, ctime()) sleep(1) def movie(self, name): for i in range(2): print "我在看电影 ,看 %s in %s \n" % (name, ctime()) sleep(3) def useThread(self): t1 = threading.Thread(target=self.music, args=('爱情买卖',)) self.threads.append(t1) t2 = threading.Thread(target=self.movie, args=('阿凡达',)) self.threads.append(t2) for t in self.threads: # t.setDaemon(True) //打开导致子线程没执行完,就跟着主线程死亡! t.start() t.join() //保证子线程执行完后,再执行主线程; if __name__ == '__main__': test = MyThread() test.useThread() print 'start all threads over !' # test.music('黑色幽默') # test.movie('血色浪漫')
相关文章推荐
- python 发送邮件实例
- python存储对象到文件
- Python 变量类型
- python文件管理
- python while()语句
- python logging模块 自定义输出
- 关于灰帽python一书中提到的debugger
- python文件路径操作
- Python input()
- 利用python分析日志生成图表
- python 学习
- Spark学习笔记#1-快速入门
- python线程锁实践实例
- Python实现根据IP地址和子网掩码算出网段的方法
- 笨方法学Python出现问题script, user_name=argv ValueError: need more than 1 value to unpack
- python模块之os模块
- Python入门:Django错误(1146,Table 'blog.django_session' doesn't exist")
- Python自动化测试工具Splinter简介和使用实例
- Python学习总结(一) 双Python版本环境配置
- mysql数据表自动导为python sqlalchemy可操作对象