您的位置：首页 > 编程语言 > Python开发
python模块之HTMLParser(原理很大程度上就是对类构造的熟练运用)

2015-11-21 15:47 716 查看
# -*- coding: utf-8 -*-
#python 2.7
#xiaodeng
#python模块之HTMLParser(原理很大程度上就是对类构造的熟练运用)

import HTMLParser
#tag是html标签名，attrs是由(属性名, 属性值)元组(tuple)组成的列表(list)。
#HTMLParser自动将tag名和attrs中的属性名转为小写(属性值的大小写保持不变)

'''
>>> help(HTMLParser)
Help on module HTMLParser:
CLASSES
exceptions.Exception(exceptions.BaseException)
HTMLParseError
markupbase.ParserBase
HTMLParser

class HTMLParser(markupbase.ParserBase)
|  Find tags and other markup and call handler functions.
|
|  Usage:
|      p = HTMLParser()#初始化
|      p.feed(data)#feed()方法可以多次调用，也就是不一定一次把整个HTML字符串都塞进去，可以一部分一部分塞进去
#向解析器提供一段文本：其中已经完整的元素会被立即处理，不完整的数据会先被缓冲，直到提供了更多数据或者调用了close()
|      ...
|      p.close()
|
|  Methods defined here:
|
|  __init__(self)
|      Initialize and reset this instance.
|
|  check_for_whole_start_tag(self, i)
|      # Internal -- check to see if we have a complete starttag; return end
|      # or -1 if incomplete.
|
|  clear_cdata_mode(self)
|
|  close(self)
|      Handle any buffered data.
|
|  error(self, message)
|
|  feed(self, data)            #向分析器提供数据。
|      Feed data to the parser.
|
|      Call this as often as you want, with as little or as much text
|      as you want (may include '\n').
|
|  get_starttag_text(self)
|      Return full source of start tag: '<...>'.
|
|  goahead(self, end)
|      # Internal -- handle data as far as reasonable.  May leave state
|      # and data to be processed by a subsequent call.  If 'end' is
|      # true, force handling all data as if followed by EOF marker.
|
|  handle_charref(self, name)              #处理字符引用，即以&#开头的(如&#62;或&#x3E;)，用字符编码数值表示的字符
|      # Overridable -- handle character reference
|
|  handle_comment(self, data)              #处理注释，处理<!--comment-->内的内容
|      # Overridable -- handle comment
|
|  handle_data(self, data)                 #处理数据，就是<xx>data</xx>中间的那些数据
|      # Overridable -- handle data
|
|  handle_decl(self, decl)                 #处理以<!开头的声明，比如<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|                                          #即文档类型声明
|      # Overridable -- handle declaration
|
|  handle_endtag(self, tag)                #处理结束标签，</xx>
|      # Overridable -- handle end tag
|
|  handle_entityref(self, name)            #处理实体引用，即以&开头、形如&name;的具名字符(如&gt;，此时name为gt)
|      # Overridable -- handle entity reference
|
|  handle_pi(self, data)                   #处理形如<?instruction>的东西
|      # Overridable -- handle processing instruction
|
|  handle_startendtag(self, tag, attrs)    #处理自闭合标签(形如<tag .../>)，默认实现会依次调用handle_starttag()和handle_endtag()
|      # Overridable -- finish processing of start+end tag: <tag.../>
|
|  handle_starttag(self, tag, attrs)       # 处理开始标签，比如<xx>
|      # Overridable -- handle start tag
|
|  parse_bogus_comment(self, i, report=1)
|      # Internal -- parse bogus comment, return length or -1 if not terminated
|      # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
|  parse_endtag(self, i)
|      # Internal -- parse endtag, return end or -1 if incomplete
|
|  parse_html_declaration(self, i)
|      # Internal -- parse html declarations, return length or -1 if not terminated
|      # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
|      # See also parse_declaration in _markupbase
|
|  parse_pi(self, i)
|      # Internal -- parse processing instr, return end or -1 if not terminated
|
|  parse_starttag(self, i)
|      # Internal -- handle starttag, return end or -1 if not terminated
|
|  reset(self)
|      Reset this instance.  Loses all unprocessed data.
|
|  set_cdata_mode(self, elem)
|
|  unescape(self, s)
|
|  unknown_decl(self, data)
|
|  ----------------------------------------------------------------------
|  Data and other attributes defined here:
|
|  CDATA_CONTENT_ELEMENTS = ('script', 'style')
|
|  entitydefs = None
|
|  ----------------------------------------------------------------------
|  Methods inherited from markupbase.ParserBase:
|
|  getpos(self)
|      Return current line number and offset.
|
|  parse_comment(self, i, report=1)
|      # Internal -- parse comment, return length or -1 if not terminated
|
|  parse_declaration(self, i)
|      # Internal -- parse declaration (for use by subclasses).
|
|  parse_marked_section(self, i, report=1)
|      # Internal -- parse a marked section
|      # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
|
|  updatepos(self, i, j)
|      # Internal -- update line number and offset.  This should be
|      # called for each piece of data exactly once, in order -- in other
|      # words the concatenation of all the input strings to this
|      # function should be exactly the entire input.

>>>
'''
内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理
标签：
相关文章推荐
新的分享
章节导航