您的位置:首页 > 编程语言 > ASP

asp.net中抓取远程页面,自动采集示例

2008-05-11 09:54 411 查看
由于远程网页可能采用多种编码,并且有可能采用 Gzip 格式来压缩数据,让我们采集页面时头疼不已。这里借鉴了一些网络上的资料,写了一段小程序,用来自动分析编码,并自动解压缩 gzip 数据。


/// <summary>
/// Determines the character set to use when decoding the page at <paramref name="url"/>.
/// </summary>
/// <remarks>
/// Downloads the page through <c>getHTML</c> and looks for a <c>charset=...</c>
/// declaration in the markup. Values containing "gb2312" or "utf-8" are normalized
/// to exactly those names.
/// </remarks>
/// <param name="url">Address of the remote page to inspect.</param>
/// <returns>
/// The declared charset name, or the web name of <see cref="Encoding.Default"/> when
/// the page could not be downloaded or declares no charset.
/// </returns>
private string GetChartset(string url)
{
    // Fallback: WebName (e.g. "gb2312", "windows-1252") is the form that
    // Encoding.GetEncoding accepts; EncodingName is only a human-readable label.
    string encodingName = Encoding.Default.WebName;

    // getHTML returns null when the request fails; Regex would throw on null input.
    string html = getHTML(url);
    if (string.IsNullOrEmpty(html))
    {
        return encodingName;
    }

    // Accepts both `content="text/html; charset=gb2312"` and `<meta charset="utf-8">`:
    // an optional opening quote is skipped, then the name runs until a quote,
    // whitespace, ';', '/' or '>'.
    Regex charsetPattern = new Regex(
        @"charset\s*=\s*[""']?(?<charset>[^""'\s;/>]+)",
        RegexOptions.IgnoreCase);

    Match match = charsetPattern.Match(html);
    if (match.Success)
    {
        encodingName = match.Groups["charset"].Value;
    }

    // Normalize the two charsets this scraper cares about; ordinal comparison
    // avoids culture-dependent lower-casing.
    if (encodingName.IndexOf("gb2312", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        encodingName = "gb2312";
    }

    if (encodingName.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        encodingName = "utf-8";
    }

    return encodingName;
}




/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as ASCII.
/// </summary>
/// <remarks>
/// The ASCII decoding is lossy for non-ASCII pages; within this file the result is
/// only used to search the markup for a <c>charset=</c> declaration.
/// </remarks>
/// <param name="url">Address of the remote page.</param>
/// <returns>
/// The page text, or <c>null</c> when the URL is malformed or the request fails
/// (the exception message is written to the console in both cases).
/// </returns>
private string getHTML(string url)
{
    try
    {
        WebRequest webRequest = WebRequest.Create(url);

        // Let the framework transparently inflate compressed bodies; otherwise a
        // gzip-encoded page would be read as raw bytes and no charset would be found.
        HttpWebRequest httpRequest = webRequest as HttpWebRequest;
        if (httpRequest != null)
        {
            httpRequest.AutomaticDecompression =
                DecompressionMethods.GZip | DecompressionMethods.Deflate;
        }

        // Dispose response, stream and reader so the connection is released
        // even if reading fails.
        using (WebResponse webResponse = webRequest.GetResponse())
        using (Stream stream = webResponse.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.ASCII))
        {
            return reader.ReadToEnd();
        }
    }
    catch (UriFormatException ex)
    {
        Console.WriteLine(ex.Message);
        return null;
    }
    catch (WebException ex)
    {
        Console.WriteLine(ex.Message);
        return null;
    }
}




/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its text, decoded with
/// the charset reported by <c>GetChartset</c> and gunzipped when the response's
/// Content-Encoding is "gzip".
/// </summary>
/// <remarks>
/// <c>GetChartset</c> is called with the same URL, in addition to the request made here.
/// </remarks>
/// <param name="Url">Address of the remote page.</param>
/// <returns>
/// The decoded page text, or an empty string when any step fails (the exception
/// text is written to the current HTTP response).
/// </returns>
private string Html(string Url)
{
    string strResult = "";

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        // Dispose the response so the underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Stream body = response.GetResponseStream();
            Encoding encoding = Encoding.GetEncoding(GetChartset(Url));

            // Compressed content: wrap the raw stream in a decompressor.
            // Uncompressed content is read directly (previously it was never read,
            // so ordinary pages came back as an empty string).
            bool isGzip = string.Equals(
                response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase);
            if (isGzip)
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            // Disposing the reader also disposes the stream(s) it wraps.
            using (StreamReader reader = new StreamReader(body, encoding))
            {
                strResult = reader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        // NOTE(review): this emits the full exception text, unencoded, into the page;
        // kept as in the original sample, but consider logging instead.
        HttpContext.Current.Response.Write(ex.ToString());
    }

    return strResult;
}








/// <summary>
/// Click handler: fetches the page whose address is typed into <c>TextBox1</c>
/// and writes the fetched text into the current response.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // Nothing to fetch for blank input; avoids dumping a UriFormatException into the page.
    string url = TextBox1.Text.Trim();
    if (url.Length == 0)
    {
        return;
    }

    // NOTE(review): the remote markup is written to the response as-is.
    Response.Write(Html(url));
}










/*


使用起来很简单,下面的程序将字符串压缩入文件:




using (GZipStream gzip = new GZipStream(fs, CompressionMode.Compress))


{


byte[] buf = Encoding.UTF8.GetBytes(this.txbSource.Text);


gzip.Write(buf, 0, buf.Length);


gzip.Flush();


}




解压只需要这样:




gzip = new GZipStream(new MemoryStream(buf), CompressionMode.Decompress);


using (StreamReader reader = new StreamReader(gzip))


{


this.txbTarget.Text = reader.ReadToEnd();


}




如果从文件解压,只需要把MemoryStream换成一个FileStream就行了。






*/
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: