您的位置:首页 > 理论基础 > 计算机网络

网络蜘蛛(网络爬虫)核心C#源代码

2011-03-08 10:28 267 查看
// A web spider (crawler) needs to be able to download pages, images (streams) and login
// cookies; the C# code below is a practical core for doing that.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.IO.Compression;
using System.Xml;
using System.Web;
using System.Collections;
using System.Runtime.InteropServices;
using System.Net;
using System.Net.Security;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;

namespace Common
{
    /// <summary>
    /// Callback invoked with a fully configured request just before it is sent.
    /// </summary>
    /// <param name="httpRequest">The request that is about to be sent.</param>
    public delegate void OnGetPostReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Callback intended for the moment a response has been obtained.
    /// </summary>
    /// <param name="httpRequest">The request whose response was received.</param>
    public delegate void OnGetResponseReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Thin wrapper around <see cref="HttpWebRequest"/> that keeps a shared cookie container
    /// across calls and buffers every response body into a <see cref="MemoryStream"/>.
    /// </summary>
    public class HttpWebHelper
    {
        protected HttpWebRequest httpRequest;
        protected HttpWebResponse httpResponse;
        protected CookieContainer cookieContainer;
        protected CredentialCache credentialCache;
        protected bool certificatedMode = false;
        protected string certFilepath = string.Empty;

        /// <summary>Invoked after the request is configured and before it is sent.</summary>
        public OnGetPostReady OnGetPostReadyHandler = null;

        /// <summary>
        /// Invoked after the response has been obtained and before its body is read.
        /// NOTE(review): declared as <see cref="OnGetPostReady"/> rather than
        /// <see cref="OnGetResponseReady"/>; the type is kept so existing callers still compile.
        /// </summary>
        public OnGetPostReady OnGetResponseReadyHandler = null;

        protected readonly int DEFAULT_BUFFER_SIZE = 4096;
        public WebProxy webProxySrv = null;
        private static readonly int MyConnectionLimit = 300;

        /// <summary>Caller-controlled flag; only read by <see cref="AbortHttpRequest"/>.</summary>
        public bool CheckGotoRecv { get; set; }

        /// <summary>
        /// Set to true once a response has been obtained and body download is about to start.
        /// It is never cleared by this class; callers reset it themselves.
        /// </summary>
        public bool DoBetIsGotoRecv { get; set; }

        /// <summary>
        /// True when the most recent request (or one of its callbacks) raised an exception.
        /// Reset to false at the start of every request.
        /// </summary>
        public bool LastAccessError { private set; get; }

        /// <summary>
        /// URL of the current request after automatic redirection.
        /// </summary>
        public string CurrentUrl { private set; get; }

        /// <summary>Value of the <c>Location</c> header of the last response (null when absent).</summary>
        public string CurrentLocation { private set; get; }

        /// <summary>Raw <c>Set-Cookie</c> header of the last response that carried one.</summary>
        public string CurSetCookie { set; get; }

        /// <summary>Spare slot for callers; not written by this class.</summary>
        public string CurSetCookie2 { set; get; }

        /// <summary>
        /// Default constructor. Creates an empty cookie container.
        /// Note that it also changes process-wide <see cref="ServicePointManager"/> settings
        /// (connection limit, Expect100Continue, idle time).
        /// </summary>
        public HttpWebHelper()
        {
            this.cookieContainer = new CookieContainer();
            ServicePointManager.DefaultConnectionLimit = MyConnectionLimit;
            ServicePointManager.Expect100Continue = false;
            ServicePointManager.MaxServicePointIdleTime = 10000;
        }

        /// <summary>
        /// Constructor that routes every request through a proxy.
        /// </summary>
        /// <param name="wp">Proxy to use; null means no explicit proxy is set.</param>
        public HttpWebHelper(WebProxy wp)
            : this()
        {
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor for sites that need authentication.
        /// </summary>
        /// <param name="cred">
        /// When true, https requests get credentials, the optional client certificate, and
        /// server-certificate validation is bypassed.
        /// </param>
        public HttpWebHelper(bool cred)
            : this()
        {
            this.certificatedMode = cred;
        }

        /// <summary>
        /// Constructor combining the authentication switch with a proxy.
        /// </summary>
        /// <param name="cred">See <see cref="HttpWebHelper(bool)"/>.</param>
        /// <param name="wp">Proxy to use; null means no explicit proxy is set.</param>
        public HttpWebHelper(bool cred, WebProxy wp)
            : this()
        {
            this.certificatedMode = cred;
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor for authentication plus a client certificate.
        /// </summary>
        /// <param name="cred">See <see cref="HttpWebHelper(bool)"/>.</param>
        /// <param name="certFilepath">Path of the certificate file attached to https requests.</param>
        public HttpWebHelper(bool cred, string certFilepath)
            : this(cred)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor for authentication, a proxy and a client certificate.
        /// </summary>
        /// <param name="cred">See <see cref="HttpWebHelper(bool)"/>.</param>
        /// <param name="wp">Proxy to use; null means no explicit proxy is set.</param>
        /// <param name="certFilepath">Path of the certificate file attached to https requests.</param>
        public HttpWebHelper(bool cred, WebProxy wp, string certFilepath)
            : this(cred, wp)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor that registers a user name and password for a URI.
        /// Authentication mode is switched on.
        /// </summary>
        /// <param name="uri">URI prefix the credential applies to.</param>
        /// <param name="method">
        /// Passed to <see cref="CredentialCache.Add(Uri, string, NetworkCredential)"/> as the
        /// authentication type (e.g. "Basic"); it is not an HTTP verb.
        /// </param>
        /// <param name="username">User name.</param>
        /// <param name="password">Password.</param>
        public HttpWebHelper(string uri, string method, string username, string password)
            : this(true)
        {
            this.credentialCache = new CredentialCache();
            this.credentialCache.Add(new Uri(uri), method, new NetworkCredential(username, password));
        }

        /// <summary>
        /// Server-certificate validation callback that accepts every certificate.
        /// SECURITY: this disables TLS server verification; see
        /// <see cref="SetHttpRequestOptions(string, string, CookieCollection, string, bool, DecompressionMethods)"/>
        /// for where it is installed.
        /// </summary>
        /// <param name="sender">Unused.</param>
        /// <param name="certificate">Unused.</param>
        /// <param name="chain">Unused.</param>
        /// <param name="errors">Unused.</param>
        /// <returns>Always true.</returns>
        public bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true;
        }

        /// <summary>
        /// Configures <see cref="httpRequest"/> and then sets its Accept header.
        /// The Accept header is applied after <see cref="OnGetPostReadyHandler"/> has run.
        /// </summary>
        private void SetHttpRequestOptions_Accept(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm, string httpAccept)
        {
            this.SetHttpRequestOptions(url, method, cc, referUrl, nocache, dm);
            this.httpRequest.Accept = httpAccept;
        }

        /// <summary>
        /// Creates and configures the <see cref="HttpWebRequest"/> for one call.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="method">HTTP verb.</param>
        /// <param name="cc">Extra cookies to add to the shared container; may be null.</param>
        /// <param name="referUrl">Referer header value; skipped when null or empty.</param>
        /// <param name="nocache">When true, sends <c>Cache-Control: no-cache</c>.</param>
        /// <param name="dm">Automatic decompression methods to enable.</param>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm)
        {
            httpRequest = (HttpWebRequest)HttpWebRequest.Create(url);
            httpRequest.UnsafeAuthenticatedConnectionSharing = true;
            httpRequest.ServicePoint.ConnectionLimit = MyConnectionLimit;

            if (null != this.webProxySrv)
            {
                httpRequest.Proxy = this.webProxySrv;
            }

            // Ordinal, case-insensitive scheme test; avoids the culture-sensitive ToLower()
            // and the temporary strings of the Substring-based check.
            if (this.certificatedMode && url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                // SECURITY: this callback is process-wide and accepts any server certificate,
                // so it affects every HTTPS request in the application, not just this one.
                ServicePointManager.ServerCertificateValidationCallback =
                    new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);

                if (null == this.credentialCache)
                {
                    httpRequest.UseDefaultCredentials = true;
                }
                else
                {
                    httpRequest.Credentials = this.credentialCache;
                }

                if (!string.IsNullOrEmpty(this.certFilepath))
                {
                    httpRequest.ClientCertificates.Add(X509Certificate.CreateFromCertFile(this.certFilepath));
                }
            }

            httpRequest.CookieContainer = this.cookieContainer;
            if (!string.IsNullOrEmpty(referUrl))
            {
                httpRequest.Referer = referUrl;
            }

            httpRequest.AutomaticDecompression = dm;
            httpRequest.ServicePoint.Expect100Continue = false;
            httpRequest.ServicePoint.UseNagleAlgorithm = false;
            httpRequest.ContentType = "application/x-www-form-urlencoded";

            // AllowWriteStreamBuffering and AllowAutoRedirect are left at their defaults (true),
            // MaximumAutomaticRedirections at its default (50), ReadWriteTimeout at its default.
            httpRequest.Method = method;
            httpRequest.Timeout = ApplicationConfig.HTTP_REQUEST_TIMEOUT;
            httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
            httpRequest.Headers.Add("Accept-Language", "zh-cn");
            httpRequest.Headers.Add("UA-CPU", "x86");

            // Accept-Encoding is not added by hand; AutomaticDecompression takes care of it.
            if (nocache)
            {
                httpRequest.Headers.Add("Cache-Control", "no-cache");
            }

            if (null != cc)
            {
                httpRequest.CookieContainer.Add(cc);
            }

            // Give the caller a chance to tweak the request before it is sent.
            if (null != this.OnGetPostReadyHandler)
            {
                try
                {
                    this.OnGetPostReadyHandler(this.httpRequest);
                }
                catch (System.Exception ex)
                {
                    this.LastAccessError = true;
                    BaseDebug.DebugPrint(ex.ToString());
                }
            }
        }

        /// <summary>
        /// Configures the request with caching allowed, gzip/deflate decompression and the
        /// given Accept header.
        /// </summary>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            this.SetHttpRequestOptions_Accept(url, method, cc, referUrl, false, DecompressionMethods.GZip | DecompressionMethods.Deflate, httpAccept);
        }

        /// <summary>
        /// Refreshes the members derived from the current request/response pair.
        /// </summary>
        private void ManualResetMember()
        {
            this.cookieContainer = httpRequest.CookieContainer;
            this.CurrentUrl = httpRequest.Address.OriginalString;
            this.CurrentLocation = httpResponse.Headers["Location"];
        }

        /// <summary>
        /// Shared implementation of both <c>GetMemoryStream</c> overloads: configures the
        /// request, optionally uploads a body, and buffers the response body.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="method">HTTP verb.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer header value; may be null.</param>
        /// <param name="httpAccept">Accept header value.</param>
        /// <param name="sendBody">True to upload <paramref name="data"/> before reading the response.</param>
        /// <param name="data">Request body text (only used when <paramref name="sendBody"/> is true).</param>
        /// <param name="coding">Encoding of the request body (only used when <paramref name="sendBody"/> is true).</param>
        /// <returns>
        /// A stream positioned at 0 holding the response body on success. On failure the
        /// exception is logged, <see cref="LastAccessError"/> is set and the stream is returned
        /// without being rewound, so reading it yields no data.
        /// </returns>
        private MemoryStream FetchToMemory(string url, string method, CookieCollection cc, string referUrl, string httpAccept, bool sendBody, string data, Encoding coding)
        {
            MemoryStream ms = new MemoryStream();

            // The flag describes the *last* access; without this reset it would stay true
            // forever after the first failure because nothing else can clear it.
            this.LastAccessError = false;

            try
            {
                this.SetHttpRequestOptions(url, method, cc, referUrl, httpAccept);

                if (sendBody)
                {
                    byte[] bytesData = coding.GetBytes(data);
                    using (Stream requestStream = this.httpRequest.GetRequestStream())
                    {
                        requestStream.Write(bytesData, 0, bytesData.Length);
                        requestStream.Flush();
                    }
                }

                this.httpResponse = (HttpWebResponse)this.httpRequest.GetResponse();
                try
                {
                    // Was a response actually received?
                    if (!this.httpRequest.HaveResponse)
                    {
                        this.httpRequest.Abort();
                        return ms;
                    }

                    this.ManualResetMember();

                    if (null != this.OnGetResponseReadyHandler)
                    {
                        try
                        {
                            this.OnGetResponseReadyHandler(this.httpRequest);
                        }
                        catch (System.Exception ex)
                        {
                            this.LastAccessError = true;
                            BaseDebug.DebugPrint(ex.ToString());
                        }
                    }

                    this.DoBetIsGotoRecv = true;

                    using (Stream responseStream = this.httpResponse.GetResponseStream())
                    {
                        if (null != responseStream && responseStream.CanRead)
                        {
                            byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
                            int read;
                            while ((read = responseStream.Read(buffer, 0, buffer.Length)) > 0)
                            {
                                ms.Write(buffer, 0, read);
                            }
                        }
                    }

                    string setCookie = this.httpResponse.Headers["Set-Cookie"];
                    if (null != setCookie)
                    {
                        this.CurSetCookie = setCookie;
                    }
                }
                finally
                {
                    // Always release the connection, even when the download fails half-way.
                    this.httpResponse.Close();
                }

                // Very important: rewind so the caller reads from the beginning.
                ms.Seek(0, SeekOrigin.Begin);
            }
            catch (System.Exception ex)
            {
                this.LastAccessError = true;
                BaseDebug.DebugPrint("異常網址:" + url);
                BaseDebug.DebugPrint(ex.ToString());

                // A protocol error (4xx/5xx) carries its own response that must be closed too.
                WebException webEx = ex as WebException;
                if (null != webEx && null != webEx.Response)
                {
                    webEx.Response.Close();
                }

                if (null != httpRequest)
                {
                    httpRequest.Abort();
                }
            }

            return ms;
        }

        /// <summary>
        /// Sends a request without a body and returns the response body.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="method">HTTP verb.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer header value; may be null.</param>
        /// <param name="httpAccept">Accept header value.</param>
        /// <returns>The buffered response; the caller is responsible for closing it.</returns>
        public MemoryStream GetMemoryStream(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            return this.FetchToMemory(url, method, cc, referUrl, httpAccept, false, null, null);
        }

        /// <summary>Body-less request with Accept "text/html", no extra cookies and no referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method)
        {
            return this.GetMemoryStream(url, method, null, null, "text/html");
        }

        /// <summary>Body-less request with a custom Accept header, no extra cookies and no referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method, string httpAccept)
        {
            return this.GetMemoryStream(url, method, null, null, httpAccept);
        }

        /// <summary>
        /// Sends a request without a body and returns the whole response as text.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="method">HTTP verb.</param>
        /// <param name="coding">Encoding used to decode the response; null uses the StreamReader default.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer header value; may be null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string SimpleDoPostWrapper(string url, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            using (MemoryStream sm = this.GetMemoryStream(url, method, cc, referUrl, "text/html"))
            using (StreamReader sr = (null == coding) ? new StreamReader(sm) : new StreamReader(sm, coding))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>See <see cref="SimpleDoPostWrapper(string, string, Encoding, CookieCollection, string)"/>.</summary>
        public string SimpleDoPostWrapper(string url, string method)
        {
            return this.SimpleDoPostWrapper(url, method, null, null, null);
        }

        /// <summary>See <see cref="SimpleDoPostWrapper(string, string, Encoding, CookieCollection, string)"/>.</summary>
        public string SimpleDoPostWrapper(string url, string method, CookieCollection cc)
        {
            return this.SimpleDoPostWrapper(url, method, null, cc, null);
        }

        /// <summary>See <see cref="SimpleDoPostWrapper(string, string, Encoding, CookieCollection, string)"/>.</summary>
        public string SimpleDoPostWrapper(string url, string method, string referUrl)
        {
            return this.SimpleDoPostWrapper(url, method, null, null, referUrl);
        }

        /// <summary>
        /// Uploads data and returns the response body. The Accept header is "text/html".
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="data">Request body text.</param>
        /// <param name="method">HTTP verb.</param>
        /// <param name="coding">Encoding used to turn <paramref name="data"/> into bytes.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer header value; may be null.</param>
        /// <returns>The buffered response; the caller is responsible for closing it.</returns>
        public MemoryStream GetMemoryStream(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            return this.FetchToMemory(url, method, cc, referUrl, "text/html", true, data, coding);
        }

        /// <summary>Upload variant without extra cookies or referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding)
        {
            return this.GetMemoryStream(url, data, method, coding, null, null);
        }

        /// <summary>Upload variant with a referer and without extra cookies.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.GetMemoryStream(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Uploads data and returns the whole response as text.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="data">Request body text.</param>
        /// <param name="method">HTTP verb.</param>
        /// <param name="coding">Encoding of the request body.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer header value; may be null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            // NOTE(review): the response is decoded with StreamReader's default encoding, not
            // with `coding` (which only applies to the request body) - confirm this is intended.
            using (MemoryStream sm = this.GetMemoryStream(url, data, method, coding, cc, referUrl))
            using (StreamReader sr = new StreamReader(sm))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>See <see cref="DoPostWrapper(string, string, string, Encoding, CookieCollection, string)"/>.</summary>
        public string DoPostWrapper(string url, string data, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, data, method, coding, null, null);
        }

        /// <summary>See <see cref="DoPostWrapper(string, string, string, Encoding, CookieCollection, string)"/>.</summary>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, data, method, coding, cc, null);
        }

        /// <summary>See <see cref="DoPostWrapper(string, string, string, Encoding, CookieCollection, string)"/>.</summary>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Uploads form arguments taken from a dictionary and returns the whole response as text.
        /// The arguments are URL-encoded with the HttpUtility default encoding.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="dicArguments">Form fields to send.</param>
        /// <param name="method">HTTP verb.</param>
        /// <param name="coding">Encoding of the request body bytes.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer header value; may be null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            string data = this.BuildRequestArguments(dicArguments);
            return this.DoPostWrapper(url, data, method, coding, cc, referUrl);
        }

        /// <summary>See <see cref="DoPostWrapper(string, Dictionary{string, string}, string, Encoding, CookieCollection, string)"/>.</summary>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, null);
        }

        /// <summary>See <see cref="DoPostWrapper(string, Dictionary{string, string}, string, Encoding, CookieCollection, string)"/>.</summary>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, cc, null);
        }

        /// <summary>See <see cref="DoPostWrapper(string, Dictionary{string, string}, string, Encoding, CookieCollection, string)"/>.</summary>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, referUrl);
        }

        /// <summary>
        /// Downloads a resource (e.g. a captcha image) with Accept "*/*".
        /// Only the memory stream is returned; the caller must close it.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="method">HTTP verb.</param>
        /// <returns>The buffered response body.</returns>
        public MemoryStream DownloadStream(string url, string method)
        {
            return this.SimpleGetMemoryStream(url, method, "*/*");
        }

        /// <summary>
        /// Builds a URL-encoded <c>key=value&amp;key=value</c> string from a dictionary.
        /// </summary>
        /// <param name="dicArguments">Form fields; null or empty yields an empty string.</param>
        /// <param name="coding">Encoding used for URL-encoding; null uses the HttpUtility default.</param>
        /// <returns>The encoded argument string without a trailing separator.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments, Encoding coding)
        {
            if (null == dicArguments || 0 == dicArguments.Count)
            {
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder();
            foreach (KeyValuePair<string, string> kvp in dicArguments)
            {
                // Separator goes between pairs, so nothing has to be trimmed afterwards.
                // Every pair contributes at least "=", so Length is 0 only before the first one.
                if (sb.Length > 0)
                {
                    sb.Append("&");
                }

                if (null != coding)
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key, coding));
                    sb.Append("=");
                    sb.Append(HttpUtility.UrlEncode(kvp.Value, coding));
                }
                else
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key));
                    sb.Append("=");
                    sb.Append(HttpUtility.UrlEncode(kvp.Value));
                }
            }

            return sb.ToString();
        }

        /// <summary>
        /// Builds a URL-encoded argument string using the HttpUtility default encoding.
        /// </summary>
        /// <param name="dicArguments">Form fields; null or empty yields an empty string.</param>
        /// <returns>The encoded argument string.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments)
        {
            return this.BuildRequestArguments(dicArguments, null);
        }

        /// <summary>
        /// Looks up the value of one cookie in the shared container.
        /// </summary>
        /// <param name="key">Cookie name.</param>
        /// <param name="domain">Absolute URI whose cookies are searched (passed to <see cref="Uri"/>).</param>
        /// <returns>The cookie value, or an empty string when no such cookie exists.</returns>
        public string GetCookieValue(string key, string domain)
        {
            if (null == this.cookieContainer || 0 == this.cookieContainer.Count)
            {
                return string.Empty;
            }

            CookieCollection cc = this.cookieContainer.GetCookies(new Uri(domain));

            // The indexer yields null for an unknown name; do not dereference it blindly.
            Cookie cookie = cc[key];
            return (null == cookie) ? string.Empty : cookie.Value;
        }

        /// <summary>
        /// Replaces the shared cookie container.
        /// </summary>
        /// <param name="cc">Container used by subsequent requests.</param>
        public void SetCookieContainer(CookieContainer cc)
        {
            this.cookieContainer = cc;
        }

        /// <summary>
        /// Aborts the current request, if there is one.
        /// </summary>
        /// <returns>True when both <see cref="CheckGotoRecv"/> and <see cref="DoBetIsGotoRecv"/> are set.</returns>
        public bool AbortHttpRequest()
        {
            if (null != this.httpRequest)
            {
                this.httpRequest.Abort();
            }

            return this.CheckGotoRecv && this.DoBetIsGotoRecv;
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: