您的位置:首页 > 编程语言 > Python开发

使用python3 解析html对称标签

2017-11-21 14:17 381 查看
写了一个类,主要用于解析html文本的对称的标签结构。

通过输入tag名称,解析对应HTML文本,查找对应tag的层级数,并可以通过层级数得出对应的tag内容。写的比较粗糙,后续如果用到再慢慢改进。

代码如下:

#!/usr/bin/python3
#encoding = UTF-8
import re

####################################
#通过解析HTML文本,获取指定tag的层数
###################################
class htmltaganalysis(object):
    """Find balanced ``<tag>...</tag>`` pairs in HTML text and their nesting depth.

    The analysis is a lightweight regex scan, not a real HTML parser: tags
    that appear inside comments, scripts or attribute values are treated
    like any other tag, and the input is expected to use matching
    open/close pairs for the chosen tag.

    Attributes:
        html: The HTML text to scan.
        tag: The tag name to look for (for example ``'div'``). It is matched
            literally and case-insensitively.
    """

    def __init__(self, html, tag):
        self.html = html
        self.tag = tag

    def tagdec(self, html, tag):
        """Return an iterator over all matches of the regex *tag* in *html*.

        Args:
            html: Text to search.
            tag: Regular-expression source; compiled with the IGNORECASE,
                DOTALL and MULTILINE flags.

        Returns:
            An iterator of ``re.Match`` objects in left-to-right order.
        """
        pattern = re.compile(tag, re.I | re.S | re.M)
        return pattern.finditer(html)

    def GetTagContent(self):
        """Return every balanced occurrence of the tag with its nesting level.

        Opening and closing tags are scanned in document order and paired
        with a stack, so adjacent siblings and arbitrary nesting are matched
        correctly. A closing tag with no pending opening tag is ignored, and
        opening tags that are never closed produce no entry.

        Returns:
            A list of ``{'content': str, 'layer': int}`` dicts ordered by the
            position of the opening tag. ``content`` is the text from the
            ``<`` of the opening tag through the ``>`` of its closing tag;
            ``layer`` is 1 for outermost occurrences and increases by one per
            enclosing occurrence of the same tag.
        """
        if not self.html or not self.tag:
            return []

        # Escape the name so it is matched literally, and require a boundary
        # after it so that e.g. 'div' does not match '<divider'.
        name = re.escape(self.tag)
        token = '</' + name + r'\s*>|<' + name + r'(?=[\s/>])'

        open_starts = []  # offsets of opening tags still waiting for a close
        spans = []        # (start, end, layer) for each matched pair
        for match in self.tagdec(self.html, token):
            if match.group(0).startswith('</'):
                if not open_starts:
                    # Stray closing tag: nothing to pair it with.
                    continue
                start = open_starts.pop()
                # After the pop, the stack holds exactly the enclosing tags.
                spans.append((start, match.end(), len(open_starts) + 1))
            else:
                open_starts.append(match.start())

        # Pairs are recorded in closing order; report them in opening order.
        spans.sort()
        return [
            {'content': self.html[start:end], 'layer': layer}
            for start, end, layer in spans
        ]

    def Getlayer(self, arr, i, layer):
        """Compute the nesting level of ``arr[i]`` among the spans in *arr*.

        Args:
            arr: Sequence of ``[start, end]`` pairs sorted by ``start``.
            i: Index of the span whose level is wanted.
            layer: Level to assign to a span that has no enclosing span.

        Returns:
            *layer* plus the number of enclosing spans found by repeatedly
            walking to the nearest preceding span that strictly contains the
            current one. Returns *layer* unchanged when *i* is out of range.
        """
        if not 0 <= i < len(arr):
            return layer
        while i > 0:
            for j in range(i - 1, -1, -1):
                if arr[j][0] < arr[i][0] and arr[i][1] < arr[j][1]:
                    # arr[j] is the nearest enclosing span; continue from it.
                    layer += 1
                    i = j
                    break
            else:
                # No earlier span contains arr[i]: reached the outermost one.
                break
        return layer

    def GetContentForLayer(self, layer=1):
        """Return the text of every tag occurrence at nesting level *layer*."""
        return [
            item['content']
            for item in self.GetTagContent()
            if item['layer'] == layer
        ]

    def GetTopLayer(self):
        """Return the deepest nesting level found, or 0 if the tag is absent."""
        return max((item['layer'] for item in self.GetTagContent()), default=0)


使用示例:

html = '<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>'
# Bind the instance to its own name; reusing the class name here would
# shadow the class and make it impossible to create further instances.
analyzer = htmltaganalysis(html, 'div')
print(analyzer.GetTopLayer())           # deepest nesting level of <div>
print(analyzer.GetContentForLayer(1))   # outermost <div> blocks
print(analyzer.GetContentForLayer(2))   # <div> blocks nested one level down


结果:

2
['<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>']
['<div class="x-wiki-content x-content"></div>']


备注:欢迎任何形式的转载,但请务必注明出处。
限于本人水平,如果文章和代码有表述不当之处,还请不吝赐教。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: