您的位置:首页 > Web前端 > HTML

使用mshtml解析html

2010-08-11 16:42 507 查看
测试用例

代码

// TestMSHTML.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include "TestMSHTML.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#endif

// The one and only application object

CWinApp theApp;
// Output file for the DOM dump: assigned in _tmain, written by PrintTabs and VisitNode.
FILE * fout;
using namespace std;
//OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");

// Key type of the map parameter taken by FindAllElementHavingBg.
typedef int BorderAttribute;
// Stub for collecting elements by border/background attribute.
// At present it only fetches the document body and releases it again;
// borderValue2ElementMap is never read or written.
//
// pNewDoc                 document to inspect; a null pointer is ignored.
// borderValue2ElementMap  intended output map (left untouched for now).
void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
{
    if (pNewDoc == NULL)
    {
        return;
    }

    // Start from NULL so a failed call, or a document without a body,
    // cannot leave us releasing an indeterminate pointer.
    IHTMLElement * pBody = NULL;
    if (SUCCEEDED(pNewDoc->get_body(&pBody)) && pBody != NULL)
    {
        pBody->Release();
    }
}

// Writes n tab characters to the global output file `fout`; used to indent
// one line of the DOM dump. Nothing is written when n <= 0.
void PrintTabs(int n)
{
    int remaining = n;
    while (remaining > 0)
    {
        fwprintf(fout,_T("\t"));
        --remaining;
    }
}

void VisitNode(IHTMLElement* pElement,int level)
{
BSTR strName,strId,strTag;
PrintTabs(level);
pElement->get_className(&strName);
pElement->get_id(&strId);
pElement->get_tagName(&strTag);
if (strTag!=NULL)
{
fwprintf(fout,_T("TagName:%s "),strTag);
}
if (strName!=NULL)
{
fwprintf(fout,_T("className:%s "),strName);
}
if (strId != NULL)
{
fwprintf(fout,_T("Id:%s "),strId);
}
SysFreeString(strName);
SysFreeString(strId);
SysFreeString(strTag);
BSTR strAttrName1 = _T("border");
BSTR strAttrName2 = _T("bgcolor");
VARIANT val;

pElement->getAttribute(strAttrName1,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("border:%s "),val.bstrVal);
}
}

pElement->getAttribute(strAttrName2,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("bgcolor:%s "),val.bstrVal);
}
}

fwprintf(fout,_T("\n"));
}
// Prints the DOM subtree rooted at pElement: the element itself via
// VisitNode, then each child recursively at level + 1.
//
// pElement  root of the subtree; a null pointer is ignored.
// level     nesting depth of pElement, forwarded to VisitNode.
//
// Any COM call that fails simply ends the walk of that branch instead of
// dereferencing an unset interface pointer.
void Run(IHTMLElement * pElement,int level)
{
    if (pElement == NULL)
    {
        return;
    }

    VisitNode(pElement,level);

    IDispatch* pDisp = NULL;
    if (FAILED(pElement->get_children(&pDisp)) || pDisp == NULL)
    {
        return;
    }

    IHTMLElementCollection * children = NULL;
    HRESULT hr = pDisp->QueryInterface(IID_IHTMLElementCollection,(void**)&children);
    pDisp->Release();
    if (FAILED(hr) || children == NULL)
    {
        return;
    }

    long len = 0;
    if (FAILED(children->get_length(&len)))
    {
        len = 0;
    }

    // The same VT_I4 variant is passed as both arguments of item(),
    // exactly as the original code did.
    VARIANT index;
    index.vt = VT_I4;
    for (long i = 0;i < len;i++)
    {
        index.lVal = i;   // lVal is the union member that belongs to VT_I4

        IDispatch* pChildDisp = NULL;
        if (FAILED(children->item(index,index,&pChildDisp)) || pChildDisp == NULL)
        {
            continue;
        }

        IHTMLElement* child = NULL;
        hr = pChildDisp->QueryInterface(IID_IHTMLElement,(void**)&child);
        pChildDisp->Release();
        if (SUCCEEDED(hr) && child != NULL)
        {
            Run(child,level + 1);
            child->Release();
        }
    }
    children->Release();
}
// Exercises a parsed document: prints the body's inner text and the document
// title to stdout, then dumps the DOM tree below <body> via Run (which
// writes to `fout`), bracketed by "Run begin...." / "Run end...." on stdout.
//
// pNewDoc  parsed document; a null pointer is ignored.
//
// A document without a body skips the body text and the tree dump rather
// than dereferencing a null element. A NULL BSTR is printed as an empty
// string.
void TestParse(IHTMLDocument2 * pNewDoc)
{
    if (pNewDoc == NULL)
    {
        return;
    }

    IHTMLElement *pBody = NULL;
    if (FAILED(pNewDoc->get_body(&pBody)))
    {
        pBody = NULL;
    }

    if (pBody != NULL)
    {
        BSTR strBodyText = NULL;
        pBody->get_innerText(&strBodyText);
        wprintf(_T("%s\n"),strBodyText != NULL ? strBodyText : L"");
        SysFreeString(strBodyText);
    }

    BSTR strTitle = NULL;
    pNewDoc->get_title(&strTitle);
    wprintf(_T("%s\n"),strTitle != NULL ? strTitle : L"");
    SysFreeString(strTitle);

    if (pBody != NULL)
    {
        cout << "Run begin...."<<endl;
        Run(pBody,0);
        cout << "Run end...."<<endl;

        pBody->Release();
    }

    //FindAllElementHavingBg(pNewDoc);
}
void TestMSHTML(wchar_t * wcontent)
{
IHTMLDocument2 *pDoc = NULL;
CoInitialize(NULL);
CoCreateInstance(CLSID_HTMLDocument,
NULL,
CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2,
(LPVOID *) &pDoc);

if (pDoc)
{
IPersistStreamInit *pPersist = NULL;
pDoc->QueryInterface(IID_IPersistStreamInit,
(LPVOID *) &pPersist);
if (pPersist)
{
IMarkupServices *pMS = NULL;
pPersist->InitNew();
pPersist->Release();
pDoc->QueryInterface(IID_IMarkupServices,
(LPVOID *) &pMS);

if (pMS)
{
IMarkupContainer *pMC = NULL;
IMarkupPointer *pMkStart = NULL;
IMarkupPointer *pMkFinish = NULL;
pMS->CreateMarkupPointer(&pMkStart);
pMS->CreateMarkupPointer(&pMkFinish);
pMS->ParseString(wcontent,
0,
&pMC,
pMkStart,
pMkFinish);

if (pMC)
{
IHTMLDocument2 *pNewDoc = NULL;

pMC->QueryInterface(IID_IHTMLDocument,
(LPVOID *) &pNewDoc);

if (pNewDoc)
{
// do anything with pNewDoc, in this case
// get the body innerText.
TestParse(pNewDoc);

pNewDoc->Release();
}

pMC->Release();
}

if (pMkStart)
pMkStart->Release();

if (pMkFinish)
pMkFinish->Release();

pMS->Release();
}
}

pDoc->Release();
}

CoUninitialize();

}

// Converts a NUL-terminated string in the system ANSI code page (CP_ACP) to
// a newly allocated wide-character string.
//
// szStr    source string; NULL yields NULL.
// returns  buffer allocated with new[] that the caller must delete[], or
//          NULL when the conversion fails.
inline wchar_t* AnsiToUnicode( const char* szStr )
{
    if (szStr == NULL)
    {
        return NULL;
    }

    // First call only measures: with a zero-sized output buffer it returns
    // the number of wide characters required.
    const int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
    if (nLen <= 0)
    {
        return NULL;
    }

    wchar_t* pResult = new wchar_t[nLen+1];
    // A failed conversion would leave the buffer uninitialised; do not hand
    // that back to the caller.
    if (MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, pResult, nLen ) == 0)
    {
        delete[] pResult;
        return NULL;
    }
    pResult[nLen] = L'\0';
    return pResult;
}

// Reads a text file line by line, appends its text to `content`, and returns
// the accumulated `content` converted to wide characters by AnsiToUnicode.
//
// str      path of the file to read.
// content  in/out: the file's text is appended to whatever it already holds.
// returns  the AnsiToUnicode result for `content`; the caller is responsible
//          for delete[]-ing it.
//
// If the file cannot be opened nothing is appended and the existing
// `content` is converted as-is.
wchar_t * ReadFromHtmlFile(string str,string & content)
{
    ifstream fin(str.c_str());
    string line;
    while(getline(fin,line))
    {
        // getline drops the line terminator; put a newline back so words on
        // consecutive lines are not glued together in the parsed HTML.
        content += line;
        content += '\n';
    }
    fin.close();

    return AnsiToUnicode(content.c_str());
}

int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
int nRetCode = 0;

// 初始化 MFC 并在失败时显示错误
if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
// TODO: 更改错误代码以符合您的需要
_tprintf(_T("错误: MFC 初始化失败\n"));
nRetCode = 1;
}
else
{
fout = fopen("out.txt","w");
string str = "test.html";
string content;
wchar_t * wcontent = ReadFromHtmlFile(str,content);
int len = wcslen(wcontent);
//cout << len << endl;

TestMSHTML(wcontent);
delete[] wcontent;
fclose(fout);
}

return nRetCode;
}
输出结果
TagName:BODY
TagName:DIV
TagName:TABLE bgcolor:#ff0000
TagName:TBODY
TagName:TR
TagName:TD border:2 bgcolor:#ffff00
TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
TagName:TR
TagName:TD
TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
TagName:TD
TagName:TR
TagName:TD
TagName:TD
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: