您的位置:首页 > Web前端 > HTML

关于html-x1.1的发布

2010-02-23 22:40 197 查看
忙了一晚上,修正了许多 bug。最重要的改进是可以直接根据 URL 解析网页文件:我增加了 parseurl 这个方法(代码中的实际拼写为 praseurl),它用 InternetReadFile 这个 API 读取网页内容,用 string 类存储内容,最后调用 load 方法来解析。此外还增加了在特定节点内部按标签名查找的功能(类似 getElementsByTagName,对应 node::getbytagname)。好了,具体看代码:

#include<iostream>
#include<string>
#include "windows.h"
#include "wininet.h"
using namespace std;
#pragma comment(lib,"wininet.lib")
// Converts a UTF-8 encoded, NUL-terminated byte string to a wide string.
// Conversion stops at the first NUL in `str`. An empty wstring is returned
// when MultiByteToWideChar reports failure (returns 0).
wstring UTF8ToUnicode( const string& str )
    {
        // With cbMultiByte == -1 the returned size includes the terminating NUL.
        const int unicodeLen = ::MultiByteToWideChar( CP_UTF8, 0, str.c_str(), -1, NULL, 0 );
        if( unicodeLen <= 0 )
        {
            return wstring();
        }

        wchar_t * pUnicode = new wchar_t[unicodeLen+1];
        memset(pUnicode,0,(unicodeLen+1)*sizeof(wchar_t));

        ::MultiByteToWideChar( CP_UTF8, 0, str.c_str(), -1, pUnicode, unicodeLen );

        wstring rt = pUnicode;
        // The buffer came from new[], so it must be released with delete[].
        delete[] pUnicode;

        return rt;
    }

 // Converts a wide string to the system ANSI code page (CP_ACP).
 // The result is cut at the first NUL produced by the conversion.
 string UnicodeToANSI( const wstring& str )
    {
        // Size query; with cchWideChar == -1 the count includes the NUL.
        const int bytesNeeded = WideCharToMultiByte( CP_ACP, 0, str.c_str(), -1, NULL, 0, NULL, NULL );

        // One spare byte, zero-filled, so the buffer is always terminated.
        char* ansiBuffer = new char[bytesNeeded + 1];
        memset( ansiBuffer, 0, sizeof( char ) * ( bytesNeeded + 1 ) );

        WideCharToMultiByte( CP_ACP, 0, str.c_str(), -1, ansiBuffer, bytesNeeded, NULL, NULL );

        string converted( ansiBuffer );
        delete[] ansiBuffer;

        return converted;
    }

class node;
class dom;
// A collection of nodes returned by the lookup functions.
// add() stores a private copy of each node in a linked list, so the
// collection does not share node objects with the dom it was built from.
class nodecollect{
private:
	// Head of the linked list of copied nodes (0 when empty).
	node *n;
	// Number of nodes added so far.
	int length;
public:
	// Creates an empty collection.
	nodecollect();
	// Releases the copied nodes.
	~nodecollect();
	// Returns the number of nodes in the collection.
    int getlength();
	// Appends a copy of *nn to the end of the collection.
	void add(node *nn);
	// Returns the i-th node (0-based), or 0 when the list ends first.
	node* item(int i);

};
// One parsed tag (element, comment or doctype) inside a dom.
// A node does not own any text: it records an offset and a length into the
// text held by its dom and extracts substrings on demand.
class node{
private:
	// Offset of the tag's '<' from the beginning of the dom text.
	int start;
	// Number of characters covered by the node, starting at `start`.
	int len;
	// Tag name; "!" is used for nodes that begin with "<!".
	char name[20];
public:
	// Returns a newly allocated copy of the node's opening tag.
	char* nodehtml();
	// Returns a newly allocated copy of the text between the opening and closing tags.
	char* innerhtml();
	// Returns a newly allocated copy of the `len` characters starting at `start`.
	char* outerhtml();
	// Returns a newly allocated copy of the inner html with tags removed.
	char* innertext();
	// Returns a newly allocated copy of the value of attribute `str` (empty when not found).
	char* getattr(char* str);
	// Returns a pointer to the internal tag-name buffer.
	char* tagname();
	// Sets the tag name from text that starts at the first character of the name.
	void setname(char *str);
	// Returns the enclosing node, or 0 when there is none.
	node* getparent();
	// Returns a new collection with the nodes located inside this node.
	nodecollect* getchild();	
	// Returns the first node that starts at or after the end of this node, or 0.
	node* getnext();
	// Returns the last node that ends at or before the start of this node, or 0.
	node* getprevious();
	// Returns a new collection with the nodes inside this node whose tag name matches.
	nodecollect* getbytagname(char *tagname);
	// Next node in the list this node is linked into (0 at the tail).
	node *next;
	// Previous node in the dom's list (0 at the head).
	node *previous;
	// Sets the offset of the node within the dom text.
	void setstart(int i);
	// Sets the length of the node in characters.
	void setlen(int i);
	// Returns the offset of the node within the dom text.
	int getstart();
	// Returns the length of the node in characters.
	int getlen();
	// The dom whose text and node list this node refers to.
	dom *d;
};
class dom{
private:
	char *text;
	node *start;
	node *end;
	int count;
	int parse(char *s);
public:
	~dom();
	char *gettext();
	void load(char *str);
	node* getitem(int i);
	int getcount();
	node *getbyid(char* id);
	void praseurl(char *url);
	nodecollect* getbytagname(char *tagname);
};
// Takes a private copy of the markup in `str` and parses it.
// The node list is reset first; parse() then appends one node per tag.
void dom::load(char* str){
	count=0;
	end=0;
	start=0;
	// The nodes store offsets into this copy, so it must outlive them.
	text=new char[strlen(str)+1];
	strcpy(text,str);
	parse(text);
}
int dom::getcount(){
	return count;
}
char *dom::gettext(){
	return text;
}
// Returns the node `i` links after the head of the list, or 0 when the list
// ends before that many steps have been taken.
node* dom::getitem(int i){
	node *cursor=start;
	for(;i!=0;--i){
		if(cursor==0){
			return 0;
		}
		cursor=cursor->next;
	}
	return cursor;
}
// Returns the first node, in document order, whose "id" attribute equals `id`
// (compared case-insensitively), or 0 when no node matches.
node *dom::getbyid(char *id){
	// getattr() takes a non-const char*, so keep the name in a writable array.
	char attrname[]="id";
	// Walk the list directly; calling getitem(i) for every i restarts from the head each time.
	for(node *cur=start;cur!=0;cur=cur->next){
		// getattr() hands back a buffer allocated with new[]; release it once compared.
		char *value=cur->getattr(attrname);
		const bool matches=(::stricmp(value,id)==0);
		delete[] value;
		if(matches){
			return cur;
		}
	}
	return 0;
}
// Returns a newly allocated collection holding a copy of every node whose tag
// name equals `tagname` (case-insensitive). The caller deletes the collection.
nodecollect* dom::getbytagname(char *tagname){
	nodecollect *found=new nodecollect;
	// Single pass over the list instead of an O(n) getitem(i) lookup per index.
	for(node *cur=start;cur!=0;cur=cur->next){
		if(::stricmp(cur->tagname(),tagname)==0){
			found->add(cur);
		}
	}
	return found;
}
// Releases the document text and every node in the list.
dom::~dom(){
	delete[] text;
	// Delete all nodes including the tail; stopping when next==0 would leak the last one.
	node *cur=start;
	while(cur!=0){
		node *doomed=cur;
		cur=cur->next;
		delete doomed;
	}
}
void dom::praseurl(char *url){
	HINTERNET ie=InternetOpen("sx",INTERNET_OPEN_TYPE_PRECONFIG,0,0,0);
HINTERNET ieo=InternetOpenUrl(ie,url,0,0,INTERNET_FLAG_RELOAD,0);
DWORD read=0;
char b[100]={0};
char *bb=0;
string str="";
while(1){
	InternetReadFile(ieo,b,99,&read);
	if(read==0){
		break;
	}
str+=b;
memset(b,0,100);
}
string temp=UnicodeToANSI( UTF8ToUnicode(str));
bb=(char*)temp.c_str();
load(bb);
InternetCloseHandle(ieo);
InternetCloseHandle(ie);
}
int dom::parse(char *s){
	int i1=0,i2=0,i3=0,i4=0;
	while(*s!=0){
		
		if(*s==0){
			return (long)s;
		}
		if(i3==1 || i4==1){
			if(*s=='/"' && *(s-1)!='//'){
				if(i1==0){
					i1=1;
				}else{
					i1=0;
				}
			}
			if(*s=='/'' && *(s-1)!='//'){
				if(i2==0){
					i2=1;
				}else{
					i2=0;
				}
			}
		}
		if(*s=='<' && *(s+1)=='!'){
			
			if(i1==0 && i2==0 && i4==0){
				i3=1;
				node *nn=new node;
				nn->setstart(s-text);
				nn->setlen(0);
				nn->d=this;
				nn->setname(s+1);
				nn->next=0;
				nn->previous=0;
				if(start){
					node *n1=start;
					while(n1->next!=0){
						n1=n1->next;
					}
					n1->next=nn;
					nn->previous=n1;
					end=nn;
				}else{
					start=nn;
					end=nn;
				}
				int s1=(long)s;
				while(*s){
					if(*s=='/"'){
						if(i1==0){
							i1=1;
						}else{
							i1=0;
						}
					}
					if(*s=='/''){
						if(i2==0){
							i2=1;
						}else{
							i2=0;
						}
					}
					if(*s=='>' && ((*(s-1)=='-' && *(s-2)=='-') ||  (*(s-1)=='/"' && *(s-2)=='d' && *(s-3)=='t' && *(s-4)=='d' && *(s-5)=='.') || (*(s-1)=='l' && *(s-2)=='m' && *(s-3)=='t' && *(s-4)=='h' && *(s-5)==' '))){
						if(i1==0 && i2==0){
							//cout<<(long)s+1-s1<<endl;
							nn->setlen((long)s+1-s1);
							s++;
							break;
						}
						
					}
					s++;
				}
				count++;
			}
		}
		if(*s=='<' && *(s+1)!='/' && *(s+1)!='!'){
			
			if(i1==0 && i2==0 && i4==0){
				i3=1;
				node *nn=new node;
				//cout<<s-text<<endl;
				nn->setstart(s-text);
				nn->setlen(0);
				nn->d=this;
				nn->setname(s+1);
				nn->next=0;
				nn->previous=0;
				if(start){
					node *n1=start;
					while(n1->next!=0){
						n1=n1->next;
					}
					n1->next=nn;
					nn->previous=n1;
					end=nn;
				}else{
					start=nn;
					end=nn;
				}
				if((*(s+1)=='s' || *(s+1)=='S') && (*(s+2)=='c' || *(s+2)=='C') && (*(s+3)=='r' || *(s+3)=='R') && (*(s+4)=='i' || *(s+4)=='I') && (*(s+5)=='p' || *(s+5)=='P') && (*(s+6)=='t' || *(s+6)=='T')){
					i4=1;
				}
				if((*(s+1)=='s' || *(s+1)=='S') && (*(s+2)=='t' || *(s+2)=='T') && (*(s+3)=='y' || *(s+3)=='Y') && (*(s+4)=='l' || *(s+4)=='L') && (*(s+5)=='e' || *(s+5)=='E') ){
					i4=1;
				}
				count++;
			}
		}
		if(*s=='>'){
			if(i1==0 && i2==0){
				i3=0;
				if(i4==0){
					node *n1=end;
					// cout<<(long)s+2-(n1->getstart())-(long)text<<endl;
					n1->setlen((long)s+1-(n1->getstart())-(long)text);
					
				}
				if(stricmp(strlwr(end->tagname()),"script")==0){
					i4=1;
				}
				if(stricmp(strlwr(end->tagname()),"style")==0){
					i4=1;
				}
			}
		}
		
		if(*s=='<' && *(s+1)=='/'){
			if(i1==0 && i2==0){
				i3=0;
				if(i4=1){
					i4=0;
				}
				char temp[20]={0};
				s=s+2;
				int i=0;
				while(*s!='>'){
					temp[i++]=*s;
					s++;
				}
				node *n1=end;
				node* min;
				while(n1!=0){
					if(stricmp(n1->tagname(),temp)==0 && strlen(n1->nodehtml())==n1->getlen()){
						min=n1;
						break;
						// cout<<min->getstart()<<"*"<<i<<endl;
					}
					n1=n1->previous;
				}
				n1=min;
				// cout<<strlen(n1->nodehtml())<<"      "<<n1->getlen()<<endl;
				// cout<<(long)s-(long)text-n1->getstart()<<endl;
				
				n1->setlen((long)s+1-(n1->getstart())-(long)text);
				
			}
		}
		
		s++;
 }
}
// Sets the tag name from `str`, which points at the first character after
// the '<'. The name ends at whitespace, '/', '>' or the end of the string; a
// '!' makes the name "!" (comments and doctypes). Names longer than the
// buffer are truncated.
void node::setname(char *str){
	memset(name,0,sizeof(name));
	if(str==0){
		return;
	}
	int i=0;
	// Stop at the terminator too: the input is not guaranteed to contain a delimiter.
	while(*str!=0){
		if(*str=='!'){
			name[0]='!';
			break;
		}
		if(*str==' ' || *str=='\t' || *str=='/' || *str=='>' || *str==13 || *str==10){
			break;
		}
		if(i>=(int)sizeof(name)-1){
			// Keep the final byte as the NUL terminator.
			break;
		}
		name[i++]=*str;
		str++;
	}
}
// Returns the innermost node that starts before this node and ends after it,
// or 0 when no node encloses this one.
node *node::getparent(){
	node *parent=0;
	// Nodes are stored in document order, so a later enclosing candidate is
	// nested deeper than an earlier one. Walk the list once instead of
	// calling getitem(i) for every index.
	for(node *cur=d->getitem(0);cur!=0;cur=cur->next){
		if(cur->getstart()>=start){
			break;
		}
		if(cur->getlen()+cur->getstart()>start+len){
			parent=cur;
		}
	}
	return parent;
}
// Returns a newly allocated collection with copies of the nodes that start
// after this node's start and end before this node's end. Collection stops at
// the first later node that is not contained. The caller deletes the result.
nodecollect* node::getchild(){
	nodecollect *inside=new nodecollect;
	// Skip to the first node that starts after this one (single list walk).
	node *cur=d->getitem(0);
	while(cur!=0 && cur->getstart()<=start){
		cur=cur->next;
	}
	for(;cur!=0;cur=cur->next){
		if(start+len>cur->getlen()+cur->getstart()){
			inside->add(cur);
		}else{
			break;
		}
	}
	return inside;
}
nodecollect* node::getbytagname(char *tagname){
nodecollect *nn=new nodecollect;
nodecollect *nn1=getchild();
for(int i=0;i<nn1->getlength();i++){
	if(stricmp(nn1->item(i)->tagname(),tagname)==0){
	nn->add(nn1->item(i));
	}
}
return nn;
}
char *node::tagname(){
	return name;
}
void node::setstart(int i){
	start=i;
}
void node::setlen(int i){
	len=i;
}
int node::getstart(){
	return start;
}
int node::getlen(){
	return len;
}
// Returns a newly allocated (new[]) copy of the node's opening tag: the outer
// html up to and including the first '>' that is not inside a quoted string.
// When no such '>' exists the whole outer html is returned. The caller
// releases the result with delete[].
char *node::nodehtml(){
	char *outer=outerhtml();
	const int capacity=(int)strlen(outer)+1;
	char *v=new char[capacity];
	::memset(v,0,capacity);
	int indq=0,insq=0;
	// Previous character, tracked explicitly so the first iteration never
	// looks in front of the buffer.
	char prev=0;
	for(int i=0;outer[i]!=0;i++){
		const char c=outer[i];
		// A quote preceded by a backslash does not open or close a string.
		if(c=='\"' && prev!='\\'){
			indq=!indq;
		}
		if(c=='\'' && prev!='\\'){
			insq=!insq;
		}
		v[i]=c;
		if(c=='>' && indq==0 && insq==0){
			break;
		}
		prev=c;
	}
	// outerhtml() allocated its buffer with new[].
	delete[] outer;
	return v;
}
// Returns a newly allocated, NUL-terminated copy of the `len` characters of
// the dom text that begin at offset `start`.
char *node::outerhtml(){
	char *copy=new char[len+1];
	const char *source=d->gettext()+start;
	int done=0;
	while(done<len){
		copy[done]=source[done];
		++done;
	}
	copy[len]=0;
	return copy;
}
// Returns a newly allocated (new[]) copy of the value of attribute `str` in
// the node's opening tag, without surrounding quotes. The attribute name is
// matched case-sensitively, must follow whitespace and must be directly
// followed by '='. An empty string is returned when the attribute is missing.
// The caller releases the result with delete[].
char *node::getattr(char* str){
	char *tag=nodehtml();
	const int capacity=(int)strlen(tag)+1;
	char *attr=new char[capacity];
	memset(attr,0,capacity);
	const int namelen=(int)strlen(str);

	// Locate "<whitespace>name=" inside the opening tag.
	char *index=tag;
	bool found=false;
	// An empty name can never match and would make strstr() succeed at every position.
	while(namelen>0 && (index=strstr(index,str))!=0){
		// index>tag keeps the look-behind inside the buffer.
		if(index>tag && (*(index-1)==' ' || *(index-1)=='\t' || *(index-1)==10 || *(index-1)==13) && *(index+namelen)=='='){
			index=index+namelen+1;
			found=true;
			break;
		}
		index++;
	}

	if(found){
		int indq=0,insq=0;
		if(*index=='\"'){
			indq=1;
			index++;
		}
		if(*index=='\''){
			insq=1;
			index++;
		}
		int i=0;
		while(*index!=0){
			if(*index=='\"' && *(index-1)!='\\'){
				indq=!indq;
			}
			if(*index=='\'' && *(index-1)!='\\'){
				insq=!insq;
			}
			// The value ends at the first space, '>' or '/' outside quotes.
			if((*index==' ' || *index=='>' || *index=='/') && indq==0 && insq==0){
				// The closing quote was copied as part of the value; drop it.
				if(i>0 && (*(index-1)=='\'' || *(index-1)=='\"')){
					attr[i-1]=0;
				}
				break;
			}
			attr[i++]=*(index++);
		}
	}

	// nodehtml() allocated its buffer with new[].
	delete[] tag;
	return attr;
}
// Returns a newly allocated (new[]) copy of the text between the node's
// opening tag and its closing tag "</name>". The result is empty when the
// node has no closing tag (void elements, comments, unclosed tags). The
// caller releases the result with delete[].
char *node::innerhtml(){
	char *outer=outerhtml();
	const int total=(int)strlen(outer);
	char *inner=new char[total+1];
	::memset(inner,0,total+1);

	// Find the '>' that ends the opening tag, using the same quote rules as
	// the parser and nodehtml() (a backslash-escaped quote does not count).
	int indq=0,insq=0;
	int gt=0;
	char prev=0;
	while(outer[gt]!=0){
		const char c=outer[gt];
		if(c=='\"' && prev!='\\'){
			indq=!indq;
		}
		if(c=='\'' && prev!='\\'){
			insq=!insq;
		}
		if(c=='>' && indq==0 && insq==0){
			break;
		}
		prev=c;
		gt++;
	}

	// Total length minus the closing tag ("</" + name + ">") minus the opening tag.
	const int innerlen=total-((int)strlen(tagname())+3)-(gt+1);
	// A non-positive length leaves the zero-filled buffer as the empty string.
	for(int i=0;i<innerlen;i++){
		inner[i]=outer[gt+1+i];
	}

	// outerhtml() allocated its buffer with new[].
	delete[] outer;
	return inner;
}
char *node::innertext(){
	char *h=innerhtml();
	char *h1=h;
	char *inner;
	if(h[0]==0){
		inner=new char;
		*inner=0;
		return inner;
	}else if(stricmp(strlwr(this->tagname()),"script")==0){
		inner=new char[strlen(h)+1];
		strcpy(inner,h);
		return inner;
	}else{
		inner=new char[strlen(h)+1];
		::memset(inner,0,strlen(h)+1);
	}
	int i=0,i1=0,i2=0,i3=0;
	for(;*h!=0;h++){
		
		if(*h==0){
			return inner;
		}
		if(*h=='<'){
			if(i3==0){
				if((*(h+1)=='s' || *(h+1)=='S') && (*(h+2)=='c' || *(h+2)=='C') && (*(h+3)=='r' || *(h+3)=='R') && (*(h+4)=='i' || *(h+4)=='I') && (*(h+5)=='p' || *(h+5)=='P') && (*(h+6)=='t' || *(h+6)=='T')){
					int l=start+strlen(nodehtml())+(h-h1);
					for(int i1=0;i1<d->getcount();i1++){
						if(d->getitem(i1)->getstart()==l){
							strcat(inner,d->getitem(i1)->innertext());
							h=h+strlen(d->getitem(i1)->outerhtml());
							i=i+strlen(d->getitem(i1)->innertext());
							i3=0;
							break;
						}
					}
				}else{
					i3=1;
				}
				
				
				
			}
		}
		if(i3==1){
			if(*h=='/"' && *(h-1)!='//'){
				if(i1==0){
					i1=1;
				}else{
					i1=0;
				}
				
			}
			if(*h=='/'' && *(h-1)!='//'){
				if(i2==0){
					i2=1;
				}else{
					i2=0;
				}
				
			}
			if(*h=='>'){
				if(i1==0 && i2==0){
					i3=0;
					
				}
			}
		}else{
			//cout<<*h;
			*(inner+i)=*h;
			i++;
		}
		
		
	}
	return inner;
}

// Returns the last node, in document order before this node, that ends at or
// before this node's start; 0 when there is none.
node* node::getprevious(){
	node *before=0;
	// One pass over the list instead of an O(n) getitem(i) lookup per index.
	for(node *cur=d->getitem(0);cur!=0;cur=cur->next){
		// Stop at this node itself (identified by its extent).
		if(cur->getstart()==start && cur->getlen()==len){
			break;
		}
		if(start>=cur->getstart()+cur->getlen()){
			before=cur;
		}
	}
	return before;
}
// Returns the first node, in document order, that starts at or after the end
// of this node; 0 when there is none.
node* node::getnext(){
	// One pass over the list instead of an O(n) getitem(i) lookup per index.
	for(node *cur=d->getitem(0);cur!=0;cur=cur->next){
		if(start+len<=cur->getstart()){
			return cur;
		}
	}
	return 0;
}
// Creates an empty collection: no head node, zero length.
nodecollect::nodecollect():n(0),length(0){
}
int nodecollect::getlength(){
	return length;
}
// Returns the node `i` links after the head of the collection, or 0 when the
// list ends before that many steps have been taken.
node *nodecollect::item(int i){
	node *cursor=n;
	for(;i!=0;--i){
		if(cursor==0){
			return 0;
		}
		cursor=cursor->next;
	}
	return cursor;
}

// Appends a copy of *nn (offset, length, tag name and owning dom) to the end
// of the collection. A null pointer is ignored.
void nodecollect::add(node *nn){
	if(nn==0){
		return;
	}
	node *copy=new node;
	copy->setstart(nn->getstart());
	copy->setlen(nn->getlen());
	copy->d=nn->d;
	copy->next=0;
	// Never leave the back link indeterminate.
	copy->previous=0;

	// setname() reads up to a delimiter, so hand it the name followed by a space.
	char *padded=new char[strlen(nn->tagname())+2];
	strcpy(padded,nn->tagname());
	strcat(padded," ");
	copy->setname(padded);
	delete[] padded;

	if(n){
		node *tail=n;
		while(tail->next){
			tail=tail->next;
		}
		tail->next=copy;
		copy->previous=tail;
	}else{
		n=copy;
	}
	length++;
}
// Releases every node copy owned by the collection.
nodecollect::~nodecollect(){
	// Delete all nodes including the tail; stopping when next==0 would leak the last one.
	node *cur=n;
	while(cur!=0){
		node *doomed=cur;
		cur=cur->next;
		delete doomed;
	}
}






给出个例子:



main(){
dom d;
d.praseurl("http://g.cn");
cout<<d.getbytagname("div")->item(0)->outerhtml();
}




好了,不写不知道,一写就发现了许多问题:UTF-8 到 GB2312 编码的转换(这一段代码是借用网上的)、InternetReadFile 读取的内容不是以 null 结尾的字符串,等等。

不管怎么说,问题总算得到解决了,我测试了许多网页,都可以正常解析,心里感觉挺欣慰的。

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: