您的位置:首页 > 编程语言 > Python开发

Python解析已下载的HTML文件

2015-12-06 22:22 639 查看
离线解析百度百科中的“百度百科”,提取各级标题:

#encoding:UTF-8
#_Author_:Ibsen

import urllib2
from sgmllib import SGMLParser

class ListName(SGMLParser):
    """Collect heading text from an HTML document.

    Text found inside ``<h1>`` elements and inside
    ``<span class="title-text">`` elements is appended, in document order,
    to the ``name`` list.
    """

    def __init__(self):
        # Explicit base-class call, as in the original script.
        SGMLParser.__init__(self)
        self.is_h1 = False    # True while inside <h1>
        self.flag = False     # True while inside <span class="title-text">
        self.getdata = False  # never read by this class; kept for compatibility
        self.name = []        # collected title strings

    def start_h1(self, attrs):
        """Mark that an ``<h1>`` element has been opened."""
        self.is_h1 = True

    def end_h1(self):
        """Mark that the ``<h1>`` element has been closed."""
        self.is_h1 = False

    def start_span(self, attrs):
        """Start capturing when the span carries ``class="title-text"``.

        ``attrs`` is the list of ``(name, value)`` pairs for the tag.
        """
        if ('class', 'title-text') in attrs:
            self.flag = True

    def end_span(self):
        """Stop capturing at the end of any ``<span>``."""
        self.flag = False

    def handle_data(self, text):
        """Store ``text`` if it lies inside a tracked element.

        A single combined condition is used so that a chunk of text is
        stored only once even when a title span is nested inside ``<h1>``.
        """
        if self.is_h1 or self.flag:
            self.name.append(text)

# Read the locally saved page; close the handle even if read() fails.
response = urllib2.urlopen('file:///C:/Users/John/Desktop/1.html')
try:
    content = response.read()
finally:
    response.close()

# Parse once (the original created the parser twice) and flush any data
# still buffered in the parser before reading the results.
listname = ListName()
listname.feed(content)
listname.close()

# Print every collected title, one per line.
for item in listname.name:
    print(item)


运行结果:



提取主标题和前两个<h2>标签下的内容:

#encoding:UTF-8
#_Author_:Ibsen

import sys
import urllib2
from sgmllib import SGMLParser

class ListName(SGMLParser):
    """Extract the main title and the leading paragraphs of an HTML page.

    Collected into ``name``, in document order:

    * all text inside ``<h1>``;
    * text inside ``<div class="para">`` for as long as no more than
      ``max_sections`` headings ``<h2 class="para-title level-2">`` have
      been seen, skipping text inside ``<div class="description">`` and
      inside ``<sup>``.

    ``max_sections`` defaults to 2, the limit the original script
    hard-coded.
    """

    def __init__(self, max_sections=2):
        # Explicit base-class call, as in the original script.
        SGMLParser.__init__(self)
        self.max_sections = max_sections  # number of <h2> sections to keep
        self.is_h1 = False   # inside <h1>
        self.f_div = False   # inside at least one <div>
        self.div_cnt = 0     # current <div> nesting depth
        self.f_divp = False  # inside <div class="para">: text is kept
        self.id = 0          # nesting depth at which the para div opened
        self.f_divd = False  # inside <div class="description">: text dropped
        self.idd = 0         # nesting depth at which the description div opened
        self.f_h2 = False    # inside a counted <h2>
        self.cnt = 0         # number of counted <h2> headings seen so far
        self.f_sup = False   # inside <sup>: text is dropped
        self.name = []       # extracted text chunks

    # --- <h1>: main title -------------------------------------------------
    def start_h1(self, attrs):
        """Mark that an ``<h1>`` element has been opened."""
        self.is_h1 = True

    def end_h1(self):
        """Mark that the ``<h1>`` element has been closed."""
        self.is_h1 = False

    # --- <div>: track nesting and the para / description containers -------
    def start_div(self, attrs):
        """Enter a ``<div>`` and note whether it is a para/description div.

        ``attrs`` is the list of ``(name, value)`` pairs for the tag.
        """
        self.f_div = True
        self.div_cnt += 1
        for key, value in attrs:
            if key != 'class':
                continue
            if value == 'para':
                self.f_divp = True
                self.id = self.div_cnt
            elif value == 'description':
                self.f_divd = True
                self.idd = self.div_cnt

    def end_div(self):
        """Leave a ``<div>``, clearing flags owned by the closing level."""
        if self.div_cnt == self.id:
            self.f_divp = False
        if self.div_cnt == self.idd:
            self.f_divd = False
        # Ignore surplus </div> tags instead of letting the depth go negative.
        if self.div_cnt > 0:
            self.div_cnt -= 1
        # Clear the flag as soon as the outermost div closes; the original
        # only cleared it on a surplus </div>.
        if self.div_cnt == 0:
            self.f_div = False

    # --- <h2>: count section headings --------------------------------------
    def start_h2(self, attrs):
        """Count an ``<h2>`` whose class is exactly ``para-title level-2``."""
        for key, value in attrs:
            if key == 'class' and value == 'para-title level-2':
                self.cnt += 1
                self.f_h2 = True

    def end_h2(self):
        """Mark that the ``<h2>`` element has been closed."""
        self.f_h2 = False

    # --- <sup>: content is discarded ----------------------------------------
    def start_sup(self, attrs):
        """Start suppressing text inside ``<sup>``."""
        # Only the sup flag is toggled; f_divp is left alone so that a <sup>
        # outside any para div cannot switch paragraph capture on when it
        # closes.
        self.f_sup = True

    def end_sup(self):
        """Stop suppressing text at the end of ``<sup>``."""
        self.f_sup = False

    def handle_data(self, text):
        """Store ``text`` when it belongs to the title or a kept paragraph."""
        if self.is_h1:
            self.name.append(text)
        if self.cnt > self.max_sections:
            return
        if self.f_divp and not self.f_divd and not self.f_sup:
            self.name.append(text)

# Read the locally saved page; close the handle even if read() fails.
response = urllib2.urlopen('file:///C:/Users/John/Desktop/1.html')
try:
    content = response.read()
finally:
    response.close()

# Parse the page and flush any data still buffered in the parser.
listname = ListName()
listname.feed(content)
listname.close()

# Write one extracted chunk per line. The file is written directly rather
# than by swapping sys.stdout, so an error cannot leave stdout redirected,
# and the with-block always closes the file. The raw string keeps the
# backslashes in the Windows path literal.
with open(r'C:\Users\John\Desktop\oput.txt', 'w') as outputfile:
    for item in listname.name:
        outputfile.write(item + '\n')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: