您的位置：首页 > 其它

获取指定URL源码内容

2011-03-22 16:39 211 查看

附代码：

/// <summary>
/// Downloads a web page and extracts the element that most likely holds its main body text.
/// </summary>
/// <remarks>
/// The page is rendered into a WinForms <c>WebBrowser</c> on a dedicated STA thread (the control
/// requires a single-threaded apartment), so <see cref="GetContent"/> blocks the caller until the
/// extraction has finished.
/// </remarks>
public class WebContent
{
    /// <summary>
    /// Elements whose visible text is not longer than this are ignored; such short fragments
    /// are not meaningful candidates for the main body.
    /// </summary>
    private const int MinimumTextLength = 100;

    /// <summary>
    /// Matches <c>charset=xxx</c> in both the legacy form
    /// (<c>content="text/html; charset=gb2312"</c>) and the HTML5 form (<c>&lt;meta charset="utf-8"&gt;</c>),
    /// as well as in an HTTP Content-Type header value.
    /// </summary>
    private static readonly System.Text.RegularExpressions.Regex CharsetPattern =
        new System.Text.RegularExpressions.Regex(
            @"charset\s*=\s*[""']?\s*(?<code>[A-Za-z0-9_\-.:]+)",
            System.Text.RegularExpressions.RegexOptions.Compiled | System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    /// <summary>Creates an instance without a default URL.</summary>
    public WebContent() { }

    /// <summary>Creates an instance with a default URL.</summary>
    /// <param name="url">URL used by <see cref="GetContent"/> when it is called with a null or empty argument.</param>
    public WebContent(string url)
    {
        this.Url = url;
    }

    private string Url { get; set; }
    private string Content { get; set; }

    /// <summary>
    /// Entry point: fetches the page and returns the text of its most content-dense element.
    /// </summary>
    /// <param name="url">
    /// Address of the page. When null or empty, the URL given to the constructor is used instead.
    /// </param>
    /// <returns>
    /// The extracted text; an empty string when no element has more than
    /// <see cref="MinimumTextLength"/> characters of text; <c>null</c> when no URL is available
    /// or the download/rendering failed (the failure is written to the trace listeners).
    /// </returns>
    public string GetContent(string url)
    {
        string target = string.IsNullOrEmpty(url) ? this.Url : url;

        // Clear the previous result first so that a failing call can never return the text
        // extracted by an earlier, unrelated call.
        this.Content = null;

        if (string.IsNullOrEmpty(target))
        {
            return this.Content;
        }

        ThreadWebBrowser(target);
        return this.Content;
    }

    /// <summary>
    /// Runs the extraction on a fresh STA thread and waits for it to finish.
    /// </summary>
    private void ThreadWebBrowser(string url)
    {
        Thread worker = new Thread(new ParameterizedThreadStart(BeginCatch));
        worker.SetApartmentState(ApartmentState.STA);
        worker.Start(url);

        // Join returns only after the worker has terminated, so no additional
        // IsAlive/DoEvents polling is needed afterwards.
        worker.Join();
    }

    /// <summary>
    /// Thread body: downloads the page, renders it and stores the best candidate in <see cref="Content"/>.
    /// Never throws; an unhandled exception on this thread would take the whole process down.
    /// </summary>
    /// <param name="obj">The URL, passed as the thread start argument.</param>
    private void BeginCatch(object obj)
    {
        string url = obj as string;

        try
        {
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            string html = GetHtmlCode(url);

            using (WebBrowser webBrowser = new WebBrowser())
            {
                webBrowser.ScriptErrorsSuppressed = true;
                webBrowser.Navigate("about:blank");

                HtmlDocument document = webBrowser.Document;
                if (document == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "WebContent: the browser document was not available for '{0}'.", url);
                    return;
                }

                document.Write(html);

                HtmlElement body = document.Body;
                this.Content = body == null ? string.Empty : SelectMainText(body.All);
            }
        }
        catch (System.Exception ex)
        {
            // Extraction is best effort: report the failure instead of silently discarding it,
            // and leave Content as null so the caller can tell that nothing was extracted.
            System.Diagnostics.Trace.TraceWarning(
                "WebContent: failed to extract content from '{0}': {1}", url, ex);
        }
    }

    /// <summary>
    /// Picks the text of the element with the highest score, where
    /// score = textLength * textLength / outerHtmlLength (text density weighted by text length).
    /// </summary>
    /// <remarks>
    /// Ranking by length alone or by density alone proved inaccurate; multiplying the density by
    /// the text length gives noticeably better results, although unusual pages may still be
    /// misjudged.
    /// </remarks>
    /// <returns>The winning element's text, or an empty string when no element qualifies.</returns>
    private static string SelectMainText(HtmlElementCollection elements)
    {
        HashSet<string> seenMarkup = new HashSet<string>();
        string bestText = string.Empty;
        long bestScore = long.MinValue;

        for (int i = 0; i < elements.Count; i++)
        {
            // Read each property once; every access goes through the underlying browser DOM.
            HtmlElement element = elements[i];
            string outerHtml = element.OuterHtml;
            string innerText = element.InnerText;

            if (string.IsNullOrEmpty(outerHtml) || innerText == null || innerText.Length <= MinimumTextLength)
            {
                continue;
            }

            // Identical markup is considered only once; the first occurrence wins.
            if (!seenMarkup.Add(outerHtml))
            {
                continue;
            }

            // 64-bit arithmetic: the squared length exceeds Int32 for texts longer than ~46k characters.
            long score = (long)innerText.Length * innerText.Length / outerHtml.Length;

            // Strict comparison keeps the earliest element among equal scores.
            if (score > bestScore)
            {
                bestScore = score;
                bestText = innerText;
            }
        }

        return bestText;
    }

    /// <summary>
    /// Downloads the page once and decodes it with the charset it declares.
    /// </summary>
    /// <remarks>
    /// Decoding with the wrong code page turns non-ASCII text (for example Chinese) into garbage.
    /// The charset is looked up in the markup first, then in the HTTP Content-Type header;
    /// when neither names an encoding known to this platform, <c>Encoding.Default</c> is used.
    /// </remarks>
    private static string GetHtmlCode(string url)
    {
        byte[] payload;
        string contentType;

        System.Net.WebRequest webRequest = System.Net.WebRequest.Create(url);
        using (System.Net.WebResponse webResponse = webRequest.GetResponse())
        using (System.IO.Stream stream = webResponse.GetResponseStream())
        {
            contentType = webResponse.ContentType;
            payload = ReadAllBytes(stream);
        }

        // The charset declaration itself is plain ASCII, so a provisional decode with the
        // default code page is sufficient to find it.
        string provisional = Decode(payload, Encoding.Default);

        Encoding encoding = FindEncoding(SniffCharset(provisional));
        if (encoding == null)
        {
            encoding = FindEncoding(SniffCharset(contentType));
        }

        if (encoding == null || encoding.Equals(Encoding.Default))
        {
            return provisional;
        }

        return Decode(payload, encoding);
    }

    /// <summary>Reads a stream to its end and returns the bytes.</summary>
    private static byte[] ReadAllBytes(System.IO.Stream stream)
    {
        using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }

            return buffer.ToArray();
        }
    }

    /// <summary>Decodes bytes as text through a <c>StreamReader</c> created with the given encoding.</summary>
    private static string Decode(byte[] payload, Encoding encoding)
    {
        using (System.IO.StreamReader reader = new System.IO.StreamReader(new System.IO.MemoryStream(payload), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Returns the first charset name declared in <paramref name="text"/>, or an empty string when there is none.
    /// </summary>
    private static string SniffCharset(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        System.Text.RegularExpressions.Match match = CharsetPattern.Match(text);
        return match.Success ? match.Groups["code"].Value : string.Empty;
    }

    /// <summary>
    /// Maps a charset name to an <c>Encoding</c>; returns null when the name is empty or not supported.
    /// </summary>
    private static Encoding FindEncoding(string charsetName)
    {
        if (string.IsNullOrEmpty(charsetName))
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(charsetName);
        }
        catch (System.ArgumentException)
        {
            // Unknown or unsupported charset name: let the caller fall back.
            return null;
        }
    }
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航