
How to fetch a web page's source code and transcode it (news title scraping)

2013-04-08 17:27
/**
 * An additional transcoding approach, appended here: detect whether a file is
 * UTF-8 and choose the StreamReader encoding accordingly.
 * (Unicode.Utf8Checker is an external helper class not shown in this post.)
 **/
var checker = new Unicode.Utf8Checker();
var isUtf8 = checker.Check(path);

System.IO.StreamReader reader;
if (isUtf8)
    reader = new System.IO.StreamReader(path);
else
    reader = new System.IO.StreamReader(path, System.Text.Encoding.GetEncoding("GB2312"));
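
Since the Unicode.Utf8Checker class itself is not included in the post, here is a minimal sketch of what such a check could look like using the BCL's strict UTF-8 decoder; the class and method names (Utf8Probe, LooksLikeUtf8) are illustrative, not part of the original code.

using System.IO;
using System.Text;

static class Utf8Probe
{
    // Returns true if the file's bytes decode as strict UTF-8 without errors.
    public static bool LooksLikeUtf8(string path)
    {
        byte[] bytes = File.ReadAllBytes(path);
        try
        {
            // throwOnInvalidBytes: true makes the decoder reject malformed
            // sequences instead of silently substituting U+FFFD.
            new UTF8Encoding(false, true).GetString(bytes);
            return true;
        }
        catch (DecoderFallbackException)
        {
            return false;
        }
    }
}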

/**
 * Fetches the source of the requested page.
 * Requires: using System; using System.IO; using System.IO.Compression;
 * using System.Net; using System.Text; using System.Text.RegularExpressions;
 **/
private String getPageSource(String url)
{
    String Source = "";
    try
    {
        // Connect to the URL and download the raw response bytes
        WebClient web = new WebClient();
        Byte[] data = web.DownloadData(url);
        // Read the Content-Encoding response header
        String sContentEncoding = web.ResponseHeaders["Content-Encoding"];
        // Decompress the body if the server returned it gzip-compressed
        if (sContentEncoding == "gzip")
        {
            MemoryStream ms = new MemoryStream(data);
            MemoryStream msTemp = new MemoryStream();
            int count = 0;
            GZipStream gzip = new GZipStream(ms, CompressionMode.Decompress);
            byte[] buf = new byte[10000];
            while ((count = gzip.Read(buf, 0, buf.Length)) > 0)
            {
                msTemp.Write(buf, 0, count);
            }
            gzip.Close();
            data = msTemp.ToArray();
        }
        // Determine the page encoding from the Content-Type header
        // (an earlier version read the bytes through a StreamReader instead)
        String Content_Type = web.ResponseHeaders.Get("Content-Type");
        Regex typeReg = new Regex("charset=.+", RegexOptions.IgnoreCase);
        if (Content_Type != null && typeReg.IsMatch(Content_Type))
        {
            String CodeType = Content_Type.Substring(Content_Type.IndexOf("=") + 1).ToUpper();
            Source = Encoding.GetEncoding(CodeType).GetString(data);
        }
        else
        {
            // No charset declared: try UTF-8 first and fall back to GBK
            // if the decoded text contains replacement characters
            Source = Encoding.UTF8.GetString(data);
            if (isGarbled(Source))
            {
                Source = Encoding.GetEncoding("GBK").GetString(data);
            }
        }
        // Strip <script> and <style> blocks from the page source
        Regex reg = new Regex(@"<(script|style)\s*.*?>\s*.*?\s*</(script|style)>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        Source = reg.Replace(Source, "");
    }
    catch (WebException)
    {
        // Rethrow without resetting the stack trace (the original used "throw e;")
        throw;
    }
    return Source.Replace("\"", "'");
}
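
As a side note, the manual GZipStream branch can be avoided by letting the framework decompress the response itself. A minimal sketch (my own addition, not from the original post; the class name DecompressingWebClient is illustrative):

using System;
using System.Net;

class DecompressingWebClient : WebClient
{
    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest request = base.GetWebRequest(address);
        HttpWebRequest http = request as HttpWebRequest;
        if (http != null)
        {
            // Ask HttpWebRequest to transparently inflate gzip/deflate bodies,
            // so DownloadData already returns decompressed bytes.
            http.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        }
        return request;
    }
}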

/**
 * Collects all news titles from the page at the given URL.
 **/
private List<String> getNewsTitle(String url)
{
    List<String> strList = new List<String>();
    try
    {
        // Download the main page
        string mainPage = getPageSource(url);
        // Earlier patterns kept for reference:
        //   <a\s*?(?=[^>]*?targer=){1}.*?href='[^']*?\.(html|shtml)'\s*?>.*?</a>
        //   whole <li>/<h*>/<p> block (one match may contain several titles):
        //   <(li|h[1-9]?|p)\s*[^i]*?>\s*?(\[?<a\s*?(target='.*?')?\s*?href='http://.*?'\s*?(target='.*?')?\s*?>.*?</a>\]?)*?\s*?</\s*(li|h[1-9]?|p)\s*>
        //   single anchor:
        //   <a\s*?(target='_blank')?\s*?href='[^']*?news\.[^']*?\.(html|shtml)'\s*?(target='_blank')?[^>]*?>[^<]{3,}?</a>
        // Pattern used here: one <li><a ...>title</a></li> entry per match
        Regex titleReg = new Regex(@"<li>\s*?<a\s*?(target='?_blank'?)?\s*?href='[^']*?news\.[^']*?\.(htm|html|shtml)'\s*?" +
            @"(target='?_blank'?)?[^>]*?>[^&<]{5,}?</a>\s*?</li>", RegexOptions.IgnoreCase);
        // Iterate over the matches and collect the unique titles
        MatchCollection titleMc = titleReg.Matches(mainPage);
        foreach (Match t in titleMc)
        {
            if (!strList.Contains(t.Value))
            {
                strList.Add(t.Value);
            }
        }
    }
    catch (WebException)
    {
        // Rethrow without resetting the stack trace (the original used "throw e;")
        throw;
    }
    return strList;
}
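
For context, a small usage sketch (the URL and the tag-stripping regex are illustrative, not from the original post): getNewsTitle returns the raw matched <li> fragments, so the markup still has to be removed to obtain the plain title text.

// Hypothetical news index URL; any page with <li><a href='...news...'>title</a></li>
// entries would do.
List<String> fragments = getNewsTitle("http://news.example.com/");
foreach (String fragment in fragments)
{
    // Strip the <li>/<a> markup, leaving only the visible title text.
    String title = Regex.Replace(fragment, "<[^>]+>", "").Trim();
    Console.WriteLine(title);
}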

/**
 * Checks whether the string contains garbled text.
 * (No detailed explanation in the original; adapted from code found online.)
 **/
private Boolean isGarbled(String val)
{
    var bytes = Encoding.UTF8.GetBytes(val);
    // The byte sequence 239 191 189 (0xEF 0xBF 0xBD) is the UTF-8 encoding of the
    // U+FFFD replacement character, which the decoder substitutes for invalid bytes.
    for (var i = 0; i + 2 < bytes.Length; i++)
    {
        if (bytes[i] == 239 && bytes[i + 1] == 191 && bytes[i + 2] == 189)
        {
            return true;
        }
    }
    return false;
}
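
An equivalent and simpler check (my own assumption, not from the original post) is to look for the U+FFFD replacement character directly in the decoded string, since that is exactly the character whose UTF-8 bytes are 239 191 189; the method name isGarbledSimple is illustrative.

// Equivalent sketch: Encoding.UTF8 substitutes U+FFFD for undecodable bytes,
// so its presence in the decoded string already signals garbled input.
private Boolean isGarbledSimple(String val)
{
    return val.IndexOf('\uFFFD') >= 0;
}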