使用.NET提取网页中的文本
2011-05-10 14:05
162 查看
/****************************** Module Header ******************************\
* Module Name:    Default.aspx.cs
* Project:        CSASPNETStripHtmlCode
* Copyright (c) Microsoft Corporation
*
* This page retrieves the entire html code from SourcePage.aspx.
* The user can strip or parse many parts of the html code, such as pure
* text, images, links, script code, etc.
* The code sample can be used in many web applications. For example,
* search engines need to check short pieces of information about web pages,
* like titles, pure text, images and so on.
*
* This source is subject to the Microsoft Public License.
* See http://www.microsoft.com/opensource/licenses.mspx#Ms-PL.
* All other rights reserved.
*
* THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
* EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
\*****************************************************************************/

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace CSASPNETStripHtmlCode
{
    /// <summary>
    /// Code-behind of Default.aspx. Downloads the html of SourcePage.aspx and
    /// shows selected parts of it (whole html, pure text, scripts, images,
    /// links) in the <c>tbResult</c> text box.
    /// </summary>
    public partial class Default : System.Web.UI.Page
    {
        string strUrl = String.Empty;
        string strWholeHtml = string.Empty;
        const string MsgPageRetrieveFailed = "Sorry, the web page is not run successful";
        bool flgPageRetrieved = true;

        // [\s\S] matches any character including newlines; it replaces the
        // original (\w|\W) alternation, which matches the same text but forces
        // the regex engine to push a backtracking frame for every character.
        const string RegexBody = @"<body[^>]*>[\s\S]*?</body[^>]*>";
        const string RegexScript = @"<script[^>]*>[\s\S]*?</script[^>]*>";
        const string RegexAnyTag = @"<[^>]*>";
        const string RegexImage = @"(?is)<img.*?>";
        // \s (instead of a literal space) also matches anchors whose tag name
        // is followed by a tab or a line break.
        const string RegexLink = @"(?is)<a\s.*?>";

        /// <summary>
        /// Derives the url of SourcePage.aspx from the current request url and
        /// clears the result text box.
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void Page_Load(object sender, EventArgs e)
        {
            strUrl = this.Page.Request.Url.ToString().Replace("Default", "SourcePage");
            tbResult.Text = string.Empty;
        }

        /// <summary>
        /// Show the entire html code of the source page.
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void btnRetrieveAll_Click(object sender, EventArgs e)
        {
            if (TryLoadSourceHtml())
            {
                tbResult.Text = strWholeHtml;
            }
        }

        /// <summary>
        /// Retrieve the entire html code of the given url with WebRequest and
        /// WebResponse. The response is decoded as utf-8.
        /// </summary>
        /// <param name="url">Absolute url of the page to download.</param>
        /// <returns>
        /// The html code on success. On failure the retrieved flag is cleared
        /// and either the failure message (non-OK status) or the exception
        /// message is returned.
        /// </returns>
        public string GetWholeHtmlCode(string url)
        {
            // Start every retrieval from a clean state so that an earlier
            // failure on this instance does not leak into this call.
            flgPageRetrieved = true;

            try
            {
                // Use the url argument; the original ignored it and always
                // requested the strUrl field.
                HttpWebRequest wrqContent = (HttpWebRequest)WebRequest.Create(url);
                wrqContent.Timeout = 300000;

                using (HttpWebResponse wrpContent = (HttpWebResponse)wrqContent.GetResponse())
                {
                    if (wrpContent.StatusCode != HttpStatusCode.OK)
                    {
                        // Return immediately so the failure message is not
                        // overwritten by the response body.
                        flgPageRetrieved = false;
                        return MsgPageRetrieveFailed;
                    }

                    using (Stream responseStream = wrpContent.GetResponseStream())
                    using (StreamReader strReader = new StreamReader(responseStream, Encoding.UTF8))
                    {
                        return strReader.ReadToEnd();
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: any failure (bad url, timeout, http error) is
                // reported through the flag and the exception message.
                flgPageRetrieved = false;
                return e.Message;
            }
        }

        /// <summary>
        /// Retrieve the pure text from html code, this pure text include
        /// only the content of the Body tag of the html, with all tags removed.
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void btnRetrievePureText_Click(object sender, EventArgs e)
        {
            if (TryLoadSourceHtml())
            {
                // Value is the empty string when the page has no body element.
                string strBody = Regex.Match(strWholeHtml, RegexBody, RegexOptions.IgnoreCase).Value;
                tbResult.Text = StripTags(strBody);
            }
        }

        /// <summary>
        /// Retrieve the script code from html code.
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void btnRetrieveSriptCode_Click(object sender, EventArgs e)
        {
            if (TryLoadSourceHtml())
            {
                // Only the script text is shown, not the script tags themselves.
                tbResult.Text = JoinMatches(strWholeHtml, RegexScript, true);
            }
        }

        /// <summary>
        /// Retrieve the image information from html code
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void btnRetrieveImage_Click(object sender, EventArgs e)
        {
            if (TryLoadSourceHtml())
            {
                tbResult.Text = JoinMatches(strWholeHtml, RegexImage, false);
            }
        }

        /// <summary>
        /// Retrieve the links from html code
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void btnRetrievelink_Click(object sender, EventArgs e)
        {
            if (TryLoadSourceHtml())
            {
                tbResult.Text = JoinMatches(strWholeHtml, RegexLink, false);
            }
        }

        /// <summary>
        /// Downloads the source page into strWholeHtml. When the download
        /// fails, the failure message is written to the result text box.
        /// </summary>
        /// <returns>true when the page was retrieved; otherwise false.</returns>
        private bool TryLoadSourceHtml()
        {
            strWholeHtml = this.GetWholeHtmlCode(strUrl);
            if (!flgPageRetrieved)
            {
                tbResult.Text = MsgPageRetrieveFailed;
            }
            return flgPageRetrieved;
        }

        /// <summary>
        /// Removes every html tag from the given text.
        /// </summary>
        private static string StripTags(string html)
        {
            return Regex.Replace(html, RegexAnyTag, string.Empty, RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Collects every match of the pattern in the html, one per line
        /// (each followed by "\r\n"), optionally with the tags stripped.
        /// </summary>
        private static string JoinMatches(string html, string pattern, bool stripTags)
        {
            StringBuilder strbResult = new StringBuilder();
            foreach (Match matchSingle in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
            {
                strbResult.Append(stripTags ? StripTags(matchSingle.Value) : matchSingle.Value);
                strbResult.Append("\r\n");
            }
            return strbResult.ToString();
        }
    }
}
相关文章推荐
- 黄聪:使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies(二)
- 在.NET中使用JQuery 选择器精确提取网页内容
- 使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies(二)(转)
- 使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies .
- java使用htmlparser提取网页纯文本例子
- 使用jsoup从网页中提取非脚本文本内容
- 【python】使用HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies
- 使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies
- 黄聪:使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies(二)
- 使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies(二)
- spark 使用lda算法提取中文文档文本主题
- 【搜索引擎Jediael开发笔记3】使用HtmlParser提取网页中的链接
- Python使用xslt提取网页数据
- 网页去噪,网页正文文本提取方案二(goose)
- 微信公众帐号开发教程第8篇-文本消息中使用网页超链接
- Hive汇总统计数据自动化传输到Mysql数据库-跑批参数文本配置及提取使用
- python爬虫(9)——使用XPath提取网页信息
- 微信公众帐号开发教程第8篇-文本消息中使用网页超链接
- 牛腩 之HTML创建使用网页的文本对齐与格式化
- .net中 网页抓取数据(提取html中的数据,提取table中的数据)