您的位置:首页 > 编程语言 > Delphi

Delphi 通过MSHTML实现一个HTML解析类

2010-01-05 09:14 344 查看
最近经常需要模拟网页提交、取回网页源码,然后获得网页中相应的元素,于是常常要解析 HTML 中的各种元素。网络是个好东西,搜索一番,就找到了好几个 Delphi 版本的 HtmlParser 类库,试着使用了几个,发现解析起来都不完整,或多或少会出现一些问题!于是想到:如果界面上有一个浏览器,我们可以通过 WebBrowser 的 Document 接口对网页元素进行操作,很是方便!但是模拟网页提交时,界面上不一定要出现 WebBrowser,肯定有办法不通过 WebBrowser 就直接解析 HTML——那便是不要 WebBrowser 这个外壳,只要它里面的 Document 文档接口对象,就能实现对 HTML 的解析了。查找了一番 MSDN,然后 Google 一下,果然可行,构建方法如下:

// 创建 IHTMLDocument2 接口

CoCreateInstance(CLASS_HTMLDocument, nil,
CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, FHtmlDoc);

接口创建好了之后就能够对文档元素进行解析了,很是爽快!

结合我自己的特有操作,我对 Combobox、Table、Frame 等一些网页元素做了相应的封装,实现了一个 HTMLParser,大致代码如下:

这里只给出声明,代码请在最后下载。





代码

(*
****************************************************
*)

(*
得闲工作

*)

(*
网页元素操作
类库
*)

(*

*)

(*
DxHtmlElement Unit
*)

(*
Copyright(c) 2008-2010 不得

*)

(*
email:appleak46@yahoo.com.cn QQ:75492895
*)

(*
****************************************************
*)

unit
DxHtmlElement;

interface

uses
Windows,sysUtils,Clipbrd,MSHTML,ActiveX,OleCtrls,Graphics,TypInfo;

{ Element-type predicates.
  Every routine takes a single IHTMLElement and returns a Boolean.
  Only the declarations appear in this listing, so what each routine actually
  tests is inferred from its name alone -- verify against the implementation. }
function IsSelectElement(eleElement: IHTMLElement): Boolean;
function IsPwdElement(eleElement: IHTMLElement): Boolean;
function IsTextElement(element: IHTMLElement): Boolean;
function IsTableElement(element: IHTMLElement): Boolean;
function IsElementCollection(element: IHTMLElement): Boolean;
function IsChkElement(element: IHTMLElement): Boolean;
function IsRadioBtnElement(element: IHTMLElement): Boolean;
function IsMemoElement(element: IHTMLElement): Boolean;
function IsFormElement(element: IHTMLElement): Boolean;
function IsIMGElement(element: IHTMLElement): Boolean;
function IsInIMGElement(element: IHTMLElement): Boolean;
function IsLabelElement(element: IHTMLElement): Boolean;
function IsLinkElement(element: IHTMLElement): Boolean;
function IsListElement(element: IHTMLElement): Boolean;
function IsControlElement(element: IHTMLElement): Boolean;
function IsObjectElement(element: IHTMLElement): Boolean;
function IsFrameElement(element: IHTMLElement): Boolean;
function IsInPutBtnElement(element: IHTMLElement): Boolean;
function IsInHiddenElement(element: IHTMLElement): Boolean;
function IsSubmitElement(element: IHTMLElement): Boolean;

{ Image-element helpers (declarations only; the bodies are not part of this
  listing).
  NOTE(review): the GetRegCodePic overloads return a TPicture; who owns and
  frees that object cannot be determined from here -- confirm in the
  implementation before relying on it. }

{ Returns an Integer index for an image identified by Src / Alt. }
function GetPicIndex(doc: IHTMLDocument2; Src, Alt: string): Integer;

{ Returns the IHTMLImgElement identified by imgName / src / Alt. }
function GetPicElement(doc: IHTMLDocument2; imgName, src, Alt: string): IHTMLImgElement;

{ Three ways of selecting the image: by name/src/alt, by index, or by passing
  the image element directly. }
function GetRegCodePic(doc: IHTMLDocument2; ImgName, Src, Alt: string): TPicture; overload;
function GetRegCodePic(doc: IHTMLDocument2; Index: Integer): TPicture; overload;
function GetRegCodePic(doc: IHTMLDocument2; element: IHTMLIMGElement): TPicture; overload;

type
  { Procedural type for a stdcall routine that converts an LRESULT into an
    interface of the requested IID.
    NOTE(review): the signature looks like oleacc.dll's ObjectFromLresult --
    confirm where it is bound. }
  TObjectFromLResult = function(LRESULT: lResult; const IID: TIID;
    WPARAM: wParam; out pObject): HRESULT; stdcall;

  { Classification of an HTML element as reported by GetElementType.
    The identifier ELE_UNKNOW (sic) is part of the public interface and is
    kept as spelled. }
  TEleMentType = (
    ELE_UNKNOW, ELE_TEXT, ELE_PWD, ELE_SELECT, ELE_CHECKBOX, ELE_RADIOBTN,
    ELE_MEMO, ELE_FORM, ELE_IMAGE, ELE_LABEL, ELE_LINK, ELE_LIST,
    ELE_CONTROL, ELE_OBJECT, ELE_FRAME, ELE_INPUTBTN, ELE_INIMAGE,
    ELE_INHIDDEN);

{ Maps an element to its TEleMentType value. }
function GetElementType(element: IHTMLELEMENT): TEleMentType;

{ Returns a string describing the element's type (exact text is decided by
  the implementation, which is not part of this listing). }
function GetElementTypeName(element: IHTMLELEMENT): string;

{ HTML table helpers (declarations only).
  The GetWebBrowserHtmlTable* routines return a Boolean and hand their
  textual result back through the var parameter ResValue; presumably the
  Boolean signals success -- confirm in the implementation.
  Whether the indexes are 0- or 1-based cannot be determined from here. }

function GetHtmlTableCell(aTable: IHTMLTable; aRow, aCol: Integer): IHTMLElement;

function GetHtmlTable(aDoc: IHTMLDocument2; aIndex: Integer): IHTMLTable;

function GetWebBrowserHtmlTableCellText(Doc: IHTMLDocument2;
  const TableIndex, RowIndex, ColIndex: Integer;
  var ResValue: string): Boolean;

function GetHtmlTableRowHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;

function GetWebBrowserHtmlTableCellHtml(Doc: IHTMLDocument2;
  const TableIndex, RowIndex, ColIndex: Integer;
  var ResValue: string): Boolean;

{ The name "GeHtmlTableHtml" (sic) is kept as spelled because it is part of
  the unit's public interface. }
function GeHtmlTableHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;

function GetWebBrowserHtmlTableHtml(Doc: IHTMLDocument2;
  const TableIndex, RowIndex: Integer;
  var ResValue: string): Boolean;

type
  { Forward declarations: TDxWebFrame refers to both collection classes
    before their full declarations appear. }
  TDxWebFrameCollection = class;
  TDxWebElementCollection = class;

  { Load state reported by TDxWebFrame.ReadyState. }
  TLoadState = (Doc_Loading, Doc_Completed, Doc_Invalidate);

{ Wrapper around a single frame, held as an IHTMLWindow2 reference.
  Exposes the frame's source, name, document, nested frames and element
  collection through read-only properties; only Frame itself is writable.
  NOTE(review): whether the Frames / ElementCollections objects are owned and
  freed by this class is decided in the implementation, which is not part of
  this listing. }
TDxWebFrame = class
private
  FFrame: IHTMLWINDOW2;
  FElementCollections: TDxWebElementCollection;
  FWebFrameCollections: TDxWebFrameCollection;
  function GetSrc: string;
  function GetElementCount: Integer;
  function GetWebFrameCollections: TDxWebFrameCollection;
  function GetElementCollections: TDxWebElementCollection;
  function GetDocument: IHTMLDOCUMENT2;
  function GetReadState: TLoadState;
  function GetIsLoaded: Boolean;
  procedure SetFrame(const Value: IHTMLWINDOW2);
  function GetName: string;
public
  constructor Create(IFrame: IHTMLWINDOW2);
  destructor Destroy; override;

  { The wrapped frame window; assignment goes through SetFrame. }
  property Frame: IHTMLWINDOW2 read FFrame write SetFrame;
  property Src: string read GetSrc;
  property Document: IHTMLDOCUMENT2 read GetDocument;
  property Name: string read GetName;
  property Frames: TDxWebFrameCollection read GetWebFrameCollections;
  property ElementCount: Integer read GetElementCount;
  property ElementCollections: TDxWebElementCollection read GetElementCollections;
  property ReadyState: TLoadState read GetReadState;
  property IsLoaded: Boolean read GetIsLoaded;
end;

{ Wrapper around an IHTMLFramesCollection2.
  Frames can be fetched by index or by name, either as the raw IHTMLWindow2
  interface or as a TDxWebFrame object.
  NOTE(review): the class holds a single private TDxWebFrame field (Frame);
  it looks as if FrameByIndex / FrameByName may hand out that one shared
  instance -- confirm in the implementation. }
TDxWebFrameCollection = class
private
  FFrameCollection: IHTMLFramesCollection2;
  Frame: TDxWebFrame;
  function GetCount: Integer;
  function GetFrameInterfaceByIndex(index: Integer): IHTMLWINDOW2;
  function GetFrameInterfaceByName(Name: string): IHTMLWINDOW2;
  function GetFrameByIndex(index: Integer): TDxWebFrame;
  function GetFrameByName(Name: string): TDxWebFrame;
  procedure SetFrameCollection(const Value: IHTMLFramesCollection2);
public
  constructor Create(ACollection: IHTMLFramesCollection2);
  destructor Destroy; override;

  { The wrapped collection; assignment goes through SetFrameCollection. }
  property FrameCollection: IHTMLFramesCollection2
    read FFrameCollection write SetFrameCollection;
  property Count: Integer read GetCount;
  property FrameInterfaceByIndex[index: Integer]: IHTMLWINDOW2
    read GetFrameInterfaceByIndex;
  property FrameInterfaceByName[Name: string]: IHTMLWINDOW2
    read GetFrameInterfaceByName;
  property FrameByIndex[index: Integer]: TDxWebFrame read GetFrameByIndex;
  property FrameByName[Name: string]: TDxWebFrame read GetFrameByName;
end;

{ Wrapper around an IHTMLElementCollection.
  Elements can be looked up by name, by index, or by name plus index.
  ChildElementCollection returns another TDxWebElementCollection for a string
  key.
  NOTE(review): there is one private FChildCollection field, so the child
  collection is presumably a single reused instance -- confirm in the
  implementation. }
TDxWebElementCollection = class
private
  FCollection: IHTMLElementCollection;
  FChildCollection: TDxWebElementCollection;
  function GetCollection(index: string): TDxWebElementCollection;
  function GetCount: Integer;
  function GetElement(itemName: string; index: Integer): IHTMLElement;
  function GetElementByName(itemName: string): IHTMLELEMENT;
  function GetElementByIndex(index: Integer): IHTMLELEMENT;
  procedure SetCollection(const Value: IHTMLElementCollection);
public
  constructor Create(ACollection: IHTMLElementCollection);
  destructor Destroy; override;

  { The wrapped collection; assignment goes through SetCollection. }
  property Collection: IHTMLElementCollection
    read FCollection write SetCollection;
  property ChildElementCollection[index: string]: TDxWebElementCollection
    read GetCollection;
  property ElementCount: Integer read GetCount;
  property Element[itemName: string; index: Integer]: IHTMLElement
    read GetElement;
  property ElementByName[itemName: string]: IHTMLELEMENT
    read GetElementByName;
  property ElementByIndex[index: Integer]: IHTMLELEMENT
    read GetElementByIndex;
end;

{ Element collection subtype that adds no members of its own; it only gives
  TDxWebElementCollection a distinct type name. }
TLinkCollection = class(TDxWebElementCollection)
end;

{ Forward declaration: TDxTableCollection refers to TDxWebTable before the
  full class declaration appears. }
TDxWebTable = class;

{ Access to the tables of an IHTMLDocument2.
  A table can be fetched by name or by index, either as the raw IHTMLTable
  interface or as a TDxWebTable object.
  NOTE(review): the class holds a single private FWebTable field, so
  TableByName / TableByIndex presumably return one shared wrapper that is
  re-pointed on each access -- confirm in the implementation. }
TDxTableCollection = class
private
  FTableCollection: IHTMLElementCollection;
  FDocument: IHTMLDOCUMENT2;
  FWebTable: TDxWebTable;
  function GetTableInterfaceByName(AName: string): IHTMLTABLE;
  procedure SetDocument(Value: IHTMLDOCUMENT2);
  function GetTableInterfaceByIndex(index: Integer): IHTMLTABLE;
  function GetCount: Integer;
  function GetTableByIndex(index: Integer): TDxWebTable;
  function GetTableByName(AName: string): TDxWebTable;
public
  constructor Create(Doc: IHTMLDOCUMENT2);
  destructor Destroy; override;

  property TableInterfaceByName[AName: string]: IHTMLTABLE
    read GetTableInterfaceByName;
  property TableInterfaceByIndex[index: Integer]: IHTMLTABLE
    read GetTableInterfaceByIndex;
  property TableByName[AName: string]: TDxWebTable read GetTableByName;
  property TableByIndex[index: Integer]: TDxWebTable read GetTableByIndex;
  { The document whose tables are exposed; assignment goes through
    SetDocument. }
  property Document: IHTMLDOCUMENT2 read FDocument write SetDocument;
  property Count: Integer read GetCount;
end;

{ Wrapper around a single IHTMLTable.
  Gives access to the row count, the number of columns in a given row, a
  cell's content as a string or as an IHTMLTableCell, and the table's inner
  HTML / inner text.
  Cell and CellElement take the column first and the row second.
  Whether the indexes are 0- or 1-based cannot be determined from this
  declaration. }
TDxWebTable = class
private
  FTableInterface: IHTMLTABLE;
  function GetRowCount: Integer;
  procedure SetTableInterface(const Value: IHTMLTABLE);
  function GetCell(ACol, ARow: Integer): string;
  function GetRowColCount(RowIndex: Integer): Integer;
  function GetInnerHtml: string;
  function GetInnerText: string;
  function GetCellElement(ACol, ARow: Integer): IHTMLTableCell;
public
  constructor Create(ATable: IHTMLTABLE);

  { The wrapped table; assignment goes through SetTableInterface. }
  property TableInterface: IHTMLTABLE
    read FTableInterface write SetTableInterface;
  property RowCount: Integer read GetRowCount;
  property Cell[ACol, ARow: Integer]: string read GetCell;
  property CellElement[ACol, ARow: Integer]: IHTMLTableCell
    read GetCellElement;
  property RowColCount[RowIndex: Integer]: Integer read GetRowColCount;
  property InnerHtml: string read GetInnerHtml;
  property InnerText: string read GetInnerText;
end;

{ Wrapper around an IHTMLSelectElement.
  Offers Add / Insert / Remove for items, a readable and writable ItemIndex,
  Name and value, and read-only item lookup by index, by name and by
  attribute name.
  NOTE(review): what ItemByIndex / ItemByName return (option text or option
  value) is decided by the implementation, which is not part of this
  listing. }
TDxWebCombobox = class
private
  FHtmlSelect: IHTMLSelectElement;
  function GetCount: Integer;
  procedure SetItemIndex(const Value: Integer);
  function GetItemIndex: Integer;
  function GetName: string;
  procedure SetName(const Value: string);
  function GetValue: string;
  procedure SetValue(const Value: string);
  procedure SetCombInterface(const Value: IHTMLSelectElement);
  function GetItemByName(EleName: string): string;
  function GetItemByIndex(index: Integer): string;
  function GetItemAttribute(index: Integer; AttribName: string): OleVariant;
public
  constructor Create(AWebCombo: IHTMLSelectElement);

  procedure Add(Ele: IHTMLElement);
  procedure Insert(Ele: IHTMLElement; Index: Integer);
  procedure Remove(index: Integer);

  { The wrapped select element; assignment goes through SetCombInterface. }
  property CombInterface: IHTMLSelectElement
    read FHtmlSelect write SetCombInterface;
  property Count: Integer read GetCount;
  property ItemIndex: Integer read GetItemIndex write SetItemIndex;
  property ItemByIndex[index: Integer]: string read GetItemByIndex;
  property ItemByName[EleName: string]: string read GetItemByName;
  property ItemAttribute[index: Integer; AttribName: string]: OleVariant
    read GetItemAttribute;
  property Name: string read GetName write SetName;
  property value: string read GetValue write SetValue;
end;

implementation

end
.

HTMLParser解析类的代码实现单元





代码

(*
****************************************************
*)

(*
得闲工作

*)

(*
HTML解析
单元库
*)

(*

*)

(*
DxHtmlParser Unit
*)

(*
Copyright(c) 2008-2010 不得

*)

(*
email:appleak46@yahoo.com.cn QQ:75492895
*)

(*
****************************************************
*)

unit
DxHtmlParser;

interface

uses
Windows,MSHTML,ActiveX,DxHtmlElement,Forms;

type

{ HTML parser built on an MSHTML document (IHTMLDocument2) that is created
  directly via COM, without a WebBrowser control.
  Assigning HTML loads the markup into the document body and refreshes
  WebElements. WebTables and WebElements are created in the constructor and
  freed in the destructor.
  WebCombobox[Name] returns the parser's single internal TDxWebCombobox,
  re-pointed at the named element on every access, so a previously obtained
  result changes when the property is read again; callers must not free it. }
TDxHtmlParser = class
private
  FHtmlDoc: IHTMLDocument2;
  FHTML: string;
  FWebTables: TDxTableCollection;
  FWebElements: TDxWebElementCollection;
  FWebComb: TDxWebCombobox;
  procedure SetHTML(const Value: string);
  function GetWebCombobox(AName: string): TDxWebCombobox;
public
  constructor Create;
  destructor Destroy; override;

  property HTML: string read FHTML write SetHTML;
  property WebTables: TDxTableCollection read FWebTables;
  property WebElements: TDxWebElementCollection read FWebElements;
  property WebCombobox[Name: string]: TDxWebCombobox read GetWebCombobox;
end;

implementation

{
TDxHtmlParser
}

{ Initialises COM, creates the MSHTML document object, switches it to design
  mode, waits until it reports readyState 'complete', and then creates the
  table, element and combobox wrappers.
  NOTE(review): the HRESULTs of CoInitialize and CoCreateInstance are not
  checked; a failed creation is only caught by the Assert, which has no
  effect when assertions are compiled out.
  NOTE(review): the wait loop has no timeout and pumps the application's
  message queue, so other event handlers can run while the constructor is
  still executing. }
constructor TDxHtmlParser.Create;
begin
  CoInitialize(nil);

  // Create the IHTMLDocument2 interface.
  CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER,
    IID_IHTMLDocument2, FHtmlDoc);
  Assert(FHtmlDoc <> nil, '构建HTMLDocument接口失败');

  // Design mode, so that scripts are not executed (per the original author).
  FHtmlDoc.Set_designMode('On');

  // Poll until the document has finished initialising.
  while FHtmlDoc.readyState <> 'complete' do
  begin
    Sleep(1);
    Application.ProcessMessages;
  end;

  FWebTables := TDxTableCollection.Create(FHtmlDoc);
  // Starts without a collection; SetHTML supplies one.
  FWebElements := TDxWebElementCollection.Create(nil);
  // Starts unbound; GetWebCombobox binds it to a select element.
  FWebComb := TDxWebCombobox.Create(nil);
end;

{ Frees the wrapper objects, releases the MSHTML document and then
  uninitialises COM.
  FHtmlDoc is an interface field. Without the explicit nil assignment Delphi
  would release it only during instance cleanup, i.e. after this destructor
  body (and therefore after CoUninitialize) has run, so the final Release
  would reach a COM object whose apartment is already torn down. }
destructor TDxHtmlParser.Destroy;
begin
  // Free is nil-safe, so this also works after a partially failed Create.
  FWebTables.Free;
  FWebElements.Free;
  FWebComb.Free;

  // Drop the last COM reference while COM is still initialised.
  FHtmlDoc := nil;

  CoUninitialize;
  inherited;
end;

{ Getter for WebCombobox[Name].
  Returns nil while no element collection is available (i.e. before HTML has
  been assigned). Otherwise binds the parser's single FWebComb wrapper to the
  element named AName and returns that wrapper.
  The "as" cast raises an interface-cast error if the element found does not
  support IHTMLSelectElement.
  NOTE(review): what ElementByName yields for an unknown name is not visible
  here; if it yields nil, the returned wrapper ends up bound to a nil
  interface. }
function TDxHtmlParser.GetWebCombobox(AName: string): TDxWebCombobox;
begin
  Result := nil;
  if FWebElements.Collection = nil then
    Exit;

  FWebComb.CombInterface :=
    FWebElements.ElementByName[AName] as IHTMLSelectElement;
  Result := FWebComb;
end;

{ Setter for the HTML property.
  Does nothing when Value equals the markup already stored. Otherwise stores
  the text, assigns it to the document body's innerHTML and points
  FWebElements at the document's "all" collection.
  NOTE(review): the markup is loaded as body content only, and
  FHtmlDoc.body is dereferenced without a nil check. }
procedure TDxHtmlParser.SetHTML(const Value: string);
begin
  if FHTML = Value then
    Exit;

  FHTML := Value;
  FHtmlDoc.body.innerHTML := FHTML;
  FWebElements.Collection := FHtmlDoc.all;
end;

end
.


全部代码下载
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐