您的位置:首页 > 运维架构 > 网站架构

在Delhpi中,巧用WebBrowser 和Excel摘取网站内容(数据库)

2007-10-29 15:07 495 查看
有时候我们在浏览网站的时候, 经常会看到一些重要的数据,想把它全部保存下来,但又没有什么好的工具可以实现。其实我们自己动手,开发一个有针对型的小工具,是很容易的。

现在就以http://219.142.101.91/jzqy/ 网站为例,



可以看出,该系统共有6527条记录,我们要全部下载下来。

[实现思路]:用WebBrowser打开该网页,然后一条一条地复制数据。然后在用WebBrowser自动浏览下一页,直至全部复制完为止。

[具要实现方法]:

1、初始化工作


procedure TForm1.Button1Click(Sender: TObject);


begin


ProgressBar1.Max:= SumCount;


ProgressBar1.Min:= Count;




httpaddress1:=Edit1.Text;


httpaddress2:=Edit2.Text;


Count:=StrToIntDef(Edit3.Text,1);


SumCount:=StrToIntDef(Edit4.Text,1);




httpaddress:=httpaddress1+inttostr(Count)+httpaddress2;


WebBrowser1.Navigate(httpaddress);


end;

数理数据,先把整个网页复制下来,把整个网页全部复制在Excel中,然后在Excel中去掉不需要的东西,再从Excel中提取需要的数据,并保存进自己的数据库中。


function GetHtml(const WebBrowser:TWebBrowser): string;


const


BufSize = $10000;


var


Size: Int64;


Stream: IStream;


hHTMLText: HGLOBAL;


psi: IPersistStreamInit;


begin


if not Assigned(WebBrowser.Document) then Exit;


OleCheck(WebBrowser.Document.QueryInterface(IPersistStreamInit, psi));


try


hHTMLText := GlobalAlloc(GPTR, BufSize);


if 0 = hHTMLText then Exit;// RaiseLastWin32Error;


OleCheck(CreateStreamOnHGlobal(hHTMLText, True, Stream));


try


OleCheck(psi.Save(Stream, False));


Size := StrLen(PChar(hHTMLText));


SetLength(Result, Size);


CopyMemory(PChar(Result), Pointer(hHTMLText), Size);


finally


Stream := nil;


end;


finally


psi := nil;


end;


end;




procedure TForm1.OutExcel(const WebBrowser:TWebBrowser); //导出为excel


const


//行列的分别起止


rc = 3;


rs = 12;


cc = 1;


cs = 7;


str='EXCEL.EXE';


var


Excelid :variant;


ri,ci :Integer; //当前行和当前列


abc :array[cc..cs] of string;


sqlstr :String;


H :THandle;


P :DWORD;


begin


try


Excelid:=CreateOleObject( 'Excel.Application' );


except


on Exception do raise exception.Create('无法创建Xls文件,请确认是否安装EXCEL')


end;


Excelid.Visible := False;


Excelid.WorkBooks.Add;


WebBrowser.ExecWB(OLECMDID_SELECTALL,OLECMDEXECOPT_DODEFAULT);


WebBrowser.ExecWB(OLECMDID_COPY, OLECMDEXECOPT_DODEFAULT); //复制网页


Excelid.worksheets[1].Paste; //excel文档粘贴


WebBrowser.ExecWB(OLECMDID_UNDO, OLECMDEXECOPT_DODEFAULT); //取消全选




//Excel文件操作


Excelid.Range[Excelid.cells[1,1],Excelid.cells[100,20]].select;


Excelid.selection.MergeCells := False; //取消合并


ri:=4;


ci:=0;


while ri < 100 do


begin


if ci > 70 then Break;


if Trim(AnsiReplaceText(Excelid.Cells[ri,1].Value,'?','')) = '' then


begin


Excelid.ActiveSheet.Rows[ri].Delete; //删除行


inc(ci);


end else


begin


Inc(ri);


end;


end;


ri:=2;


ci:=0;


while ri < 20 do


begin


if ci > 20 then Break;


if Trim(AnsiReplaceText(Excelid.Cells[2,ri].Value,'?','')) = '' then


begin


Excelid.ActiveSheet.Columns[ri].Delete; //删除行


inc(ci);


end else Inc(ri);


end;




for ri:= rc to rs do


begin


for ci:=cc to cs do


begin


abc[ci]:= Trim(AnsiReplaceText(Excelid.Cells[ri,ci].Value,'?',''));


end;


sqlstr:='Insert Into 建筑业企业资质数据库(序号,企业名称,'+


'资质证书编号,主项资质,增项资质,原发证日期,主管部门'+


') Values ('+QuotedStr(abc[1])+','+QuotedStr(abc[2])+


','+QuotedStr(abc[3])+','+QuotedStr(abc[4])+','


+QuotedStr(abc[5])+','+QuotedStr(abc[6])+','


+QuotedStr(abc[7])+')';


// ShowMessage(sqlstr);


if not od.SetExecSql(sqlstr) then


begin


od.SetExecSql('Insert into 日志表(日志) values ('


+QuotedStr('系统在取第'+inttostr(Count)+'页,第'+abc[1]+'行时遇到错误!')+')');


Break;


// ShowMessage('第'+inttostr(Count)+'页。第'+abc[1]+'行');


end;


ProgressBar1.StepIt;


end;




Excelid.ActiveWorkBook.Saved := True;


Excelid.WorkBooks.Close;


Excelid.quit;




//杀死进程


H:=FindWindow(nil,pchar(Str));


if H<>0 then


begin


GetWindowThreadProcessId(H,@P);


if P<>0 then


TerminateProcess(OpenProcess(PROCESS_TERMINATE,False,P),$FFFFFFFF);


end;


end;

当一页处理完毕后,自动处理下一页


procedure TForm1.WebBrowser1DocumentComplete(Sender: TObject;


const pDisp: IDispatch; var URL: OleVariant);


begin


Label6.Caption:=IntToStr(count);


if Count < SumCount then


begin


// htmlstr:= GetHtml(WebBrowser1); //取得HTML源代码


OutExcel(WebBrowser1);


Inc(Count);


httpaddress:=httpaddress1+inttostr(Count)+httpaddress2;


WebBrowser1.Navigate(httpaddress);


end;


end;

全部源代码如下:


unit Unit1;




interface




uses


Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,


Dialogs, StdCtrls, OleCtrls, SHDocVw, ExtCtrls,MSHTML, ActiveX,comobj,


StrUtils, DB, ADODB,UOperationData, ComCtrls;




type


TForm1 = class(TForm)


Panel1: TPanel;


Panel2: TPanel;


WebBrowser1: TWebBrowser;


Button1: TButton;


GroupBox1: TGroupBox;


Label1: TLabel;


Edit1: TEdit;


Label2: TLabel;


Edit2: TEdit;


Edit3: TEdit;


Label3: TLabel;


Edit4: TEdit;


Label4: TLabel;


Label5: TLabel;


ADOConnection1: TADOConnection;


ProgressBar1: TProgressBar;


Label6: TLabel;


procedure Button1Click(Sender: TObject);


procedure WebBrowser1DocumentComplete(Sender: TObject;


const pDisp: IDispatch; var URL: OleVariant);


procedure FormShow(Sender: TObject);


private


procedure OutExcel(const WebBrowser:TWebBrowser); //导出为excel




...{ Private declarations }


public




...{ Public declarations }


end;




var


Form1 : TForm1;




httpaddress1 :string;


httpaddress2 :String;


SumCount :Integer;


Count : Integer; //次数


M_Bool : Boolean; //鼠标模拟是否有效


httpaddress : String;


od :TOperationData;




implementation






...{$R *.dfm}


function GetHtml(const WebBrowser:TWebBrowser): string;


const


BufSize = $10000;


var


Size: Int64;


Stream: IStream;


hHTMLText: HGLOBAL;


psi: IPersistStreamInit;


begin


if not Assigned(WebBrowser.Document) then Exit;


OleCheck(WebBrowser.Document.QueryInterface(IPersistStreamInit, psi));


try


hHTMLText := GlobalAlloc(GPTR, BufSize);


if 0 = hHTMLText then Exit;// RaiseLastWin32Error;


OleCheck(CreateStreamOnHGlobal(hHTMLText, True, Stream));


try


OleCheck(psi.Save(Stream, False));


Size := StrLen(PChar(hHTMLText));


SetLength(Result, Size);


CopyMemory(PChar(Result), Pointer(hHTMLText), Size);


finally


Stream := nil;


end;


finally


psi := nil;


end;


end;




procedure TForm1.OutExcel(const WebBrowser:TWebBrowser); //导出为excel


const


//行列的分别起止


rc = 3;


rs = 12;


cc = 1;


cs = 7;


str='EXCEL.EXE';


var


Excelid :variant;


ri,ci :Integer; //当前行和当前列


abc :array[cc..cs] of string;


sqlstr :String;


H :THandle;


P :DWORD;


begin


try


Excelid:=CreateOleObject( 'Excel.Application' );


except


on Exception do raise exception.Create('无法创建Xls文件,请确认是否安装EXCEL')


end;


Excelid.Visible := False;


Excelid.WorkBooks.Add;


WebBrowser.ExecWB(OLECMDID_SELECTALL,OLECMDEXECOPT_DODEFAULT);


WebBrowser.ExecWB(OLECMDID_COPY, OLECMDEXECOPT_DODEFAULT); //复制网页


Excelid.worksheets[1].Paste; //excel文档粘贴


WebBrowser.ExecWB(OLECMDID_UNDO, OLECMDEXECOPT_DODEFAULT); //取消全选




//Excel文件操作


Excelid.Range[Excelid.cells[1,1],Excelid.cells[100,20]].select;


Excelid.selection.MergeCells := False; //取消合并


ri:=4;


ci:=0;


while ri < 100 do


begin


if ci > 70 then Break;


if Trim(AnsiReplaceText(Excelid.Cells[ri,1].Value,'?','')) = '' then


begin


Excelid.ActiveSheet.Rows[ri].Delete; //删除行


inc(ci);


end else


begin


Inc(ri);


end;


end;


ri:=2;


ci:=0;


while ri < 20 do


begin


if ci > 20 then Break;


if Trim(AnsiReplaceText(Excelid.Cells[2,ri].Value,'?','')) = '' then


begin


Excelid.ActiveSheet.Columns[ri].Delete; //删除行


inc(ci);


end else Inc(ri);


end;




for ri:= rc to rs do


begin


for ci:=cc to cs do


begin


abc[ci]:= Trim(AnsiReplaceText(Excelid.Cells[ri,ci].Value,'?',''));


end;


sqlstr:='Insert Into 建筑业企业资质数据库(序号,企业名称,'+


'资质证书编号,主项资质,增项资质,原发证日期,主管部门'+


') Values ('+QuotedStr(abc[1])+','+QuotedStr(abc[2])+


','+QuotedStr(abc[3])+','+QuotedStr(abc[4])+','


+QuotedStr(abc[5])+','+QuotedStr(abc[6])+','


+QuotedStr(abc[7])+')';


// ShowMessage(sqlstr);


if not od.SetExecSql(sqlstr) then


begin


od.SetExecSql('Insert into 日志表(日志) values ('


+QuotedStr('系统在取第'+inttostr(Count)+'页,第'+abc[1]+'行时遇到错误!')+')');


Break;


// ShowMessage('第'+inttostr(Count)+'页。第'+abc[1]+'行');


end;


ProgressBar1.StepIt;


end;




Excelid.ActiveWorkBook.Saved := True;


Excelid.WorkBooks.Close;


Excelid.quit;




//杀死进程


H:=FindWindow(nil,pchar(Str));


if H<>0 then


begin


GetWindowThreadProcessId(H,@P);


if P<>0 then


TerminateProcess(OpenProcess(PROCESS_TERMINATE,False,P),$FFFFFFFF);


end;


end;




procedure TForm1.Button1Click(Sender: TObject);


begin


ProgressBar1.Max:= SumCount;


ProgressBar1.Min:= Count;




httpaddress1:=Edit1.Text;


httpaddress2:=Edit2.Text;


Count:=StrToIntDef(Edit3.Text,1);


SumCount:=StrToIntDef(Edit4.Text,1);




httpaddress:=httpaddress1+inttostr(Count)+httpaddress2;


WebBrowser1.Navigate(httpaddress);


end;




procedure TForm1.WebBrowser1DocumentComplete(Sender: TObject;


const pDisp: IDispatch; var URL: OleVariant);


begin


Label6.Caption:=IntToStr(count);


if Count < SumCount then


begin


// htmlstr:= GetHtml(WebBrowser1); //取得HTML源代码


OutExcel(WebBrowser1);


Inc(Count);


httpaddress:=httpaddress1+inttostr(Count)+httpaddress2;


WebBrowser1.Navigate(httpaddress);


end;


end;




procedure TForm1.FormShow(Sender: TObject);


begin


od:= TOperationData.Create(ADOConnection1);


end;




initialization


OleInitialize(nil);


finalization


try


OleUninitialize;


except


end;




end.

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: