您的位置:首页 > 理论基础 > 计算机网络

用C++解析HTTP下载下来的HTML文档

2013-11-03 20:23 423 查看
最近跟朋友一起写了一个 批量网站查询工具 BlueCatTools,其中,需要用C++解析HTTP下载下来的HTML文档。



懂的人不用我多说,不懂的我也没能力说到你懂,看代码吧。


BlueCatTools 百度收录批量查询工具

//////////////////////////////--caller.cpp--////////////////////////////////////////////////
// To run the program, make sure there is a file named "NIKE新浪竞技风暴_新浪网.htm" in your working directory.
// The program's run time can be cut roughly in half with a better implementation of the "ofile <<" statement.
#include "HtmlParser.h"
#include <ctime>
#include <iomanip>
using namespace std;
void main()
{
 clock_t start = clock();
 map<string, link_info> LinkInfo;
 multimap<float, link_info, greater<float> > Sorted;
 string FileName = "NIKE新浪竞技风暴_新浪网.htm";
 HtmlParser(FileName, LinkInfo);
 
 string Result;
 for(map<string, link_info>::iterator miter = LinkInfo.begin(); miter != LinkInfo.end(); miter++)
 {
  Sorted.insert(make_pair(miter->second.Value, miter->second));
 }
 ofstream ofile;
 ofile.open("a.txt");
 for(multimap<float, link_info, greater<float> >::iterator miter = Sorted.begin(); miter != Sorted.end(); miter++)
 {
  ofile << miter->first << "\t"
   <<setw(50) << left << miter->second.Title << "\t"
   << miter->second.Link << endl;
 }
 ofile.close();
 cout << clock() - start << endl;
}
 
//////////////////////////////--HtmlParser.h--/////////////////////////////////////////////
#pragma once
#include <cstdio>
#include <iostream>
#include <fstream>
#include <string>
#include <map>
using namespace std;
// One hyperlink extracted from an HTML document by HtmlParser.
struct link_info
{
 // Ranking score assigned by HtmlParser: it starts high and decreases with
 // every anchor processed, so anchors found earlier get larger values.
 // caller.cpp sorts the links by this field in descending order.
 float  Value;
 // Target of the anchor, copied from its href attribute.
 string Link;
 // Text of the anchor after RepairTitle has replaced whitespace runs by '_'.
 string Title;
};
// Number of bytes HtmlParser requests from the file with each fread call.
const int BUFFERSIZE = 10000;
// Number of bytes HtmlParser seeks backwards after each read, so that
// consecutive chunks overlap by this amount.
const int LOOKUP = 100;
// Assumed maximum length of one anchor ("<a ...>...</a>"); HtmlParser skips
// anchors whose length reaches this limit.
const int ASIZE = 300;
// Normalizes an anchor title so that it fits on a single line.
//
// Every run of whitespace characters (space, tab, CR, LF) is replaced by one
// '_'. A whitespace character that directly follows a literal '_' already in
// the output is dropped as well, so the result never contains "__" produced
// by this function.
//
// Title  - the raw anchor text; it is not modified.
// return - the normalized copy (empty when Title is empty).
//
// Declared inline because it is defined in a header.
inline std::string RepairTitle(const std::string& Title)
{
 std::string Result;
 Result.reserve(Title.size());
 for(std::string::const_iterator siter = Title.begin(); siter != Title.end(); ++siter)
 {
  const char ch = *siter;
  const bool IsBlank = (ch == '\r' || ch == '\n' || ch == ' ' || ch == '\t');
  if(!IsBlank)
  {
   Result.push_back(ch);
  }
  // The empty() test matters: looking at the last character of an empty
  // string is undefined behaviour, which happened for titles that start
  // with whitespace.
  else if(Result.empty() || Result[Result.size() - 1] != '_')
  {
   Result.push_back('_');
  }
 }
 return Result;
}

bool HtmlParser(const string& FileName, map<string, link_info>& LinkInfo)
{
 int i = 2000;
 FILE *fp;
 size_t ReadIn;
 char Dst[ASIZE];
 char buffer[BUFFERSIZE + 1];
 string Modified_Line;
 fp = fopen(FileName.c_str(), "rb");
 while(fp)
 {
  ReadIn = fread(buffer, 1, BUFFERSIZE, fp);
  fseek(fp, - LOOKUP, SEEK_CUR);
  if(ReadIn == LOOKUP) break;
  buffer[ReadIn] = 0;
  Modified_Line.clear();
  char *p = buffer ;
  while(*p)
  {
   unsigned ch = *p;
   if(ch >= 'A' && ch <= 'Z') Modified_Line.push_back(ch + 32);
   else Modified_Line.push_back(ch);
   p++;
  }
  string::size_type pos0;
  string::size_type pos1 = 0;
  while((pos0 = Modified_Line.find("<a", pos1)) != string::npos)
  {
   string Atag, LAtag;
   pos1 = Modified_Line.find("</a", pos0);
   if(pos1 != string::npos){ 
    if(pos1 - pos0 + 4 >= ASIZE)                                                //make sure that Atag.size() < Asize
     continue;
    memset(Dst, 0, ASIZE);
    Atag = strncpy(Dst, buffer + pos0, pos1 - pos0 + 4);  
    LAtag = Modified_Line.substr(pos0, pos1 - pos0 + 4);
    link_info tmpLink;
    {
     string::size_type pos0, pos1;
     pos1 = LAtag.find("</a");
     while(LAtag[pos1 - 1] == '>')
     {
      pos1 = LAtag.find_last_of("<", pos1 - 1);
      if(pos1 == 0) break;
     }
     pos0 = LAtag.find_last_of(">", pos1);
     string tmpstr = Atag.substr(pos0 + 1, pos1 - pos0 - 1);
     tmpLink.Title = RepairTitle(tmpstr);;          
    }
    {
     string::size_type pos0, pos1;
     pos0 = LAtag.find("href",0);
     pos0 = LAtag.find_first_not_of("=\"\' ",pos0 + 4);              // ",', ,=
     pos1 = LAtag.find_first_of("\"\' >", pos0 + 1);                 // ",', ,>
     tmpLink.Link = Atag.substr(pos0, pos1 - pos0);      
    }
    tmpLink.Value = (i--) * 0.0005;
    if(tmpLink.Title.size() > 3 && tmpLink.Link.size() > 3)             //filter: the filename.size() at least 3
     LinkInfo.insert(make_pair(tmpLink.Link, tmpLink));              //filter: the Link must be unique
   }
  }
 }
 return true;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: