python 爬虫爬取腾讯新闻科技类的企鹅智酷系列(1)
2015-05-19 00:19
441 查看
废话不多说,直接贴代码,主要采用 BeautifulSoup 编写(注意:以下代码基于 Python 2,Python 3 下需改用 urllib.request 并去掉 reload(sys) 相关语句)。
# -*- coding: utf-8 -*-
"""
Created on Mon May 18 19:12:06 2015
@author: Administrator
"""
import urllib
import os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
i = 0
j = 0
list_a = []
def gettext(href):
    """Download one article page and append its paragraphs to a text file.

    The output file is named after ``list_a[j]`` (the numbered title
    recorded by ``gethref`` in the same order the links are visited);
    the module-level counter ``j`` is advanced by one per call.

    :param href: URL of the article page to fetch (decoded as gb18030).
    """
    global j, list_a
    page = urllib.urlopen(href).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    div = soup.find_all("div", class_="content")
    p_text = div[0].find_all("p")
    # Open the output file once for the whole article. The original
    # re-opened it with file() on every paragraph and never closed it,
    # leaking one file handle per paragraph; "with" guarantees the
    # handle is closed even if a write fails.
    with open("%s.txt" % list_a[j], "a") as fp:
        for p in p_text:
            fp.write(' ')
            fp.write(p.get_text())
            fp.write(" \n")
    j += 1
def gethref(url):
    """Collect all article links from the index page.

    For each ``<li>`` of the ``ul.row1`` listing this writes a numbered
    title / summary / link record to AllTitle.txt, remembers the
    numbered title in ``list_a`` (used by ``gettext`` as the output
    filename), and immediately downloads the article body via
    ``gettext``.

    :param url: URL of the index page to scrape (decoded as gb18030).
    """
    global i, list_a
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    ul = soup.find_all("ul", class_="row1")
    li = ul[0].find_all("li")
    # "with" closes (and flushes) the index file even if one of the
    # per-article fetches raises; the original file() handle was never
    # closed at all.
    with open("AllTitle.txt", "w+") as fp:
        for lia in li:
            list_a.append(("%s、" % (i + 1)) + lia.h3.get_text())
            href = lia.a.get('href')
            # Write title, summary and link in a regular record format.
            fp.write("%s、" % (i + 1))
            i += 1
            fp.write("标题:")
            fp.write(lia.h3.get_text())
            fp.write("\n 简介:")
            fp.write(lia.p.get_text())
            fp.write("\n 链接:")
            fp.write(lia.a.get("href"))
            fp.write("\n")
            gettext(href)
if "__main__"==__name__:
url ="http://re.qq.com/biznext/zkht.htm"
gethref(url)
print "All Is OK!"
# -*- coding: utf-8 -*-
"""
Created on Mon May 18 19:12:06 2015
@author: Administrator
"""
import urllib
import os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
i = 0
j = 0
list_a = []
def gettext(href):
    """Download one article page and append its paragraphs to a text file.

    The output file is named after ``list_a[j]`` (the numbered title
    recorded by ``gethref`` in the same order the links are visited);
    the module-level counter ``j`` is advanced by one per call.

    :param href: URL of the article page to fetch (decoded as gb18030).
    """
    global j, list_a
    page = urllib.urlopen(href).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    div = soup.find_all("div", class_="content")
    p_text = div[0].find_all("p")
    # Open the output file once for the whole article. The original
    # re-opened it with file() on every paragraph and never closed it,
    # leaking one file handle per paragraph; "with" guarantees the
    # handle is closed even if a write fails.
    with open("%s.txt" % list_a[j], "a") as fp:
        for p in p_text:
            fp.write(' ')
            fp.write(p.get_text())
            fp.write(" \n")
    j += 1
def gethref(url):
    """Collect all article links from the index page.

    For each ``<li>`` of the ``ul.row1`` listing this writes a numbered
    title / summary / link record to AllTitle.txt, remembers the
    numbered title in ``list_a`` (used by ``gettext`` as the output
    filename), and immediately downloads the article body via
    ``gettext``.

    :param url: URL of the index page to scrape (decoded as gb18030).
    """
    global i, list_a
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    ul = soup.find_all("ul", class_="row1")
    li = ul[0].find_all("li")
    # "with" closes (and flushes) the index file even if one of the
    # per-article fetches raises; the original file() handle was never
    # closed at all.
    with open("AllTitle.txt", "w+") as fp:
        for lia in li:
            list_a.append(("%s、" % (i + 1)) + lia.h3.get_text())
            href = lia.a.get('href')
            # Write title, summary and link in a regular record format.
            fp.write("%s、" % (i + 1))
            i += 1
            fp.write("标题:")
            fp.write(lia.h3.get_text())
            fp.write("\n 简介:")
            fp.write(lia.p.get_text())
            fp.write("\n 链接:")
            fp.write(lia.a.get("href"))
            fp.write("\n")
            gettext(href)
if "__main__"==__name__:
url ="http://re.qq.com/biznext/zkht.htm"
gethref(url)
print "All Is OK!"
相关文章推荐
- Python3.5爬虫urllib系列之三
- Python爬虫学习系列教程(2.7)
- Python爬虫UrlError和HttpError系列之五
- Python爬虫Csdn系列III
- python爬虫之爬取腾讯新闻
- Python爬虫系列之----Scrapy(四)一个简单的示例
- Python 爬虫学习系列教程
- 【Python爬虫系列】内容解析之BeautifulSoup
- Python爬虫请求与响应过程系列之二
- Python爬虫学习系列教程
- Python爬虫Csdn系列II
- 纯代码系列:Python实现验证码图片(PIL库经典用法用法,爬虫12306思路)
- python爬虫系列
- Python爬虫系列之----Scrapy(五)网页提取的三种方式(正则,Beautiful Soup,Lxml)
- python 爬虫系列03 认识 BeautifulSoup
- Python爬虫系列:开端
- Python爬虫系列(三)多线程爬取斗图网站(皮皮虾,我们上车)
- Python爬虫利器系列文章
- Python爬虫瞎玩系列(2)—— Bilibili视频最新投稿实时跟踪
- Python爬虫系列(一):从零开始,安装环境