用C++解析HTTP下载下来的HTML文档
2013-11-03 20:23
423 查看
最近跟朋友一起写了一个 批量网站查询工具 BlueCatTools,其中,需要用C++解析HTTP下载下来的HTML文档。
懂的人不用我多说,不懂的我也没能力说到你懂,看代码吧。
BlueCatTools 百度收录批量查询工具
//////////////////////////////--caller.cpp--//////////////////////////////////////////////// // to run the program, you should make sure that, there is a "NIKE新浪竞技风暴_新浪网.htm" in your working directory. // The program run time can be saved about a half if you give a better implementation of the "ofile <<" stament; #include "HtmlParser.h" #include <ctime> #include <iomanip> using namespace std; void main() { clock_t start = clock(); map<string, link_info> LinkInfo; multimap<float, link_info, greater<float> > Sorted; string FileName = "NIKE新浪竞技风暴_新浪网.htm"; HtmlParser(FileName, LinkInfo); string Result; for(map<string, link_info>::iterator miter = LinkInfo.begin(); miter != LinkInfo.end(); miter++) { Sorted.insert(make_pair(miter->second.Value, miter->second)); } ofstream ofile; ofile.open("a.txt"); for(multimap<float, link_info, greater<float> >::iterator miter = Sorted.begin(); miter != Sorted.end(); miter++) { ofile << miter->first << "\t" <<setw(50) << left << miter->second.Title << "\t" << miter->second.Link << endl; } ofile.close(); cout << clock() - start << endl; } //////////////////////////////--HtmlParser.h--///////////////////////////////////////////// #pragma once #include <cstdio> #include <iostream> #include <fstream> #include <string> #include <map> using namespace std; struct link_info { float Value; string Link; string Title; }; const int BUFFERSIZE = 10000; const int LOOKUP = 100; const int ASIZE = 300; //max length assumed of <a tag, string RepairTitle(string& Title) { string Result = ""; for(string::iterator siter = Title.begin(); siter != Title.end(); siter++) { unsigned char ch = *siter; if(ch == 0x0d || ch == 0x0a || ch == ' ' || ch == '\t') { if(*Result.rbegin() != '_') Result.push_back('_'); } else Result.push_back(ch); } return Result; } bool HtmlParser(const string& FileName, map<string, link_info>& LinkInfo) { int i = 2000; FILE *fp; size_t ReadIn; char Dst[ASIZE]; char buffer[BUFFERSIZE + 1]; string Modified_Line; fp = 
fopen(FileName.c_str(), "rb"); while(fp) { ReadIn = fread(buffer, 1, BUFFERSIZE, fp); fseek(fp, - LOOKUP, SEEK_CUR); if(ReadIn == LOOKUP) break; buffer[ReadIn] = 0; Modified_Line.clear(); char *p = buffer ; while(*p) { unsigned ch = *p; if(ch >= 'A' && ch <= 'Z') Modified_Line.push_back(ch + 32); else Modified_Line.push_back(ch); p++; } string::size_type pos0; string::size_type pos1 = 0; while((pos0 = Modified_Line.find("<a", pos1)) != string::npos) { string Atag, LAtag; pos1 = Modified_Line.find("</a", pos0); if(pos1 != string::npos){ if(pos1 - pos0 + 4 >= ASIZE) //make sure that Atag.size() < Asize continue; memset(Dst, 0, ASIZE); Atag = strncpy(Dst, buffer + pos0, pos1 - pos0 + 4); LAtag = Modified_Line.substr(pos0, pos1 - pos0 + 4); link_info tmpLink; { string::size_type pos0, pos1; pos1 = LAtag.find("</a"); while(LAtag[pos1 - 1] == '>') { pos1 = LAtag.find_last_of("<", pos1 - 1); if(pos1 == 0) break; } pos0 = LAtag.find_last_of(">", pos1); string tmpstr = Atag.substr(pos0 + 1, pos1 - pos0 - 1); tmpLink.Title = RepairTitle(tmpstr);; } { string::size_type pos0, pos1; pos0 = LAtag.find("href",0); pos0 = LAtag.find_first_not_of("=\"\' ",pos0 + 4); // ",', ,= pos1 = LAtag.find_first_of("\"\' >", pos0 + 1); // ",', ,> tmpLink.Link = Atag.substr(pos0, pos1 - pos0); } tmpLink.Value = (i--) * 0.0005; if(tmpLink.Title.size() > 3 && tmpLink.Link.size() > 3) //filter: the filename.size() at least 3 LinkInfo.insert(make_pair(tmpLink.Link, tmpLink)); //filter: the Link must be unique } } } return true; }
相关文章推荐
- 浏览器解析HTML文档的资源并下载
- andriod gson 解析:jar下载(及帮助文档 ) http://code.google.com/p/google-gson/
- 常见设计模式的解析和实现(C++)文档及源码打包下载
- VC C++ 断点续传 http 多线程 下载 源代码
- vbs 解析html文档的方法(htmlfile)
- 解析html网页及下载连接
- [转载]python模块学习---HTMLParser(解析HTML文档元素)
- C++ http get下载网页源码
- c/c++编译过程http://www.cnblogs.com/hktk/archive/2012/09/11/2680495.html
- vs2010环境 c++ 使用htmlcxx解析html
- 解析html中链接url,并下载在指定目录
- SharePoint 文档库打开HTML 直接浏览而不是打开下载对话框
- Java的XML创建、解析文档(转载自http://blog.csdn.net/psyuhen/article/details/7539228)
- HttpHandler解析并展示PDF文档内容
- 怎样学好hibernate!(http://edu.csdn.net/news/2012822/240ca26f0d7e.shtml)没事下载下来看哦!
- C++ 输入输出流和文件流 (2010-01-17 13:34:03)http://blog.sina.com.cn/s/blog_700a65cc0100mi5o.html
- 浏览器如何解析HTML文档
- 使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies .
- 微视频/音乐专辑批量打包解析下载门户http://www.boloo.com.cn/
- [HTTP]_[C/C++]_[获取html页面里的image src属性值的实际图片路径]