您的位置:首页 > 编程语言 > Python开发

python 爬虫爬取腾讯新闻科技类的企鹅智酷系列(1)

2015-05-19 00:19 441 查看
废话不多说,直接贴代码。代码主要使用 BeautifulSoup 编写。

# -*- coding: utf-8 -*-
"""
Created on Mon May 18 19:12:06 2015

@author: Administrator
"""

import urllib
import os
from bs4 import BeautifulSoup
import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec so that the
# implicit str/unicode conversions in this script (e.g. passing BeautifulSoup
# text straight to fp.write) do not fail on non-ASCII characters.
# reload(sys) comes first because sys.setdefaultencoding is normally removed
# from the sys module at interpreter start-up; reloading makes it callable again.
reload(sys)
sys.setdefaultencoding("utf-8")

# Crawl state shared by gethref() and gettext() via `global` statements.
# i: number of index entries written to AllTitle.txt so far.
# j: number of articles already processed by gettext(); indexes list_a.
i = j = 0
# Numbered article titles in crawl order; list_a[j] names the j-th article's
# output file.
list_a = list()

def gettext(href):
    """Download one article page and append its paragraphs to a text file.

    The page at ``href`` is fetched and parsed as GB18030. Every ``<p>``
    found inside the first ``<div class="content">`` is appended to the file
    ``"<list_a[j]>.txt"``, where ``list_a[j]`` is the numbered title recorded
    by ``gethref``. The module-level counter ``j`` is then advanced so that
    the next call uses the next title.

    Args:
        href: URL of the article page.

    Raises:
        IndexError: if the page has no ``div.content`` element, or if
            paragraphs were found but ``list_a`` has no entry at index ``j``.
    """
    global j
    page = urllib.urlopen(href).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    content_divs = soup.find_all("div", class_="content")
    paragraphs = content_divs[0].find_all("p")
    # Only touch the file system when there is something to write, so a page
    # without paragraphs does not leave an empty file behind.
    if paragraphs:
        # Open the output once and let `with` close it; reopening the file
        # for every paragraph without closing it leaks file handles.
        with open("%s.txt" % list_a[j], "a") as fp:
            for paragraph in paragraphs:
                fp.write(' ')
                fp.write(paragraph.get_text())
                fp.write(" \n")
    # One title per article: advance once per call, not once per paragraph.
    j += 1

def gethref(url):
    """Collect every article listed on the index page and save each one.

    The index page at ``url`` is fetched and parsed as GB18030. For every
    ``<li>`` in the first ``<ul class="row1">`` the numbered title, the
    summary and the link are written to ``AllTitle.txt``; the numbered title
    is also appended to ``list_a`` and the article itself is saved by calling
    ``gettext`` with its link. The module-level counter ``i`` is incremented
    once per entry.

    Args:
        url: URL of the index page listing the articles.

    Raises:
        IndexError: if the page has no ``ul.row1`` element.
    """
    global i
    # `with` guarantees AllTitle.txt is flushed and closed even when fetching
    # or parsing one of the articles raises part-way through the crawl.
    with open("AllTitle.txt", "w+") as fp:
        page = urllib.urlopen(url).read()
        soup = BeautifulSoup(page, from_encoding="gb18030")
        row_lists = soup.find_all("ul", class_="row1")
        entries = row_lists[0].find_all("li")
        for lia in entries:
            number = "%s、" % (i + 1)
            title = lia.h3.get_text()
            href = lia.a.get("href")
            # gettext() looks this title up by position to name its file,
            # so it must be recorded before gettext() is called.
            list_a.append(number + title)
            # Write the title, summary and link to the file in a fixed layout.
            fp.write(number)
            i += 1
            fp.write("标题:")
            fp.write(title)
            fp.write("\n 简介:")
            fp.write(lia.p.get_text())
            fp.write("\n 链接:")
            fp.write(href)
            fp.write("\n")

            gettext(href)

# Script entry point: crawl the index page, then report completion.
if __name__ == "__main__":
    url = "http://re.qq.com/biznext/zkht.htm"
    gethref(url)
    print("All Is OK!")
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: