您的位置:首页 > 其它

网页抓取实战总结

2012-02-08 14:55 155 查看
网页抓取无外乎两种请求方式:POST 和 GET。抓取前需要安装 HttpWatch Professional 插件,用来分析目标网页的请求。

/// <summary>
/// Downloads a web page by sending an HTTP POST request whose body is the
/// given parameters serialized as application/x-www-form-urlencoded data.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <param name="param">
/// Form fields to send, as raw (not yet percent-encoded) names and values.
/// May be null, in which case an empty body is posted.
/// </param>
/// <param name="encode">
/// Encoding used to percent-encode and serialize the request body, and to
/// decode the response body.
/// </param>
/// <returns>The response body decoded as text.</returns>
public static string DoPost(string url, IDictionary<string, string> param, Encoding encode)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // after the request has already been created.
    if (encode == null)
    {
        throw new System.ArgumentNullException("encode");
    }

    StringBuilder paramBuilder = new StringBuilder();
    if (param != null)
    {
        foreach (KeyValuePair<string, string> pair in param)
        {
            if (paramBuilder.Length > 0)
            {
                paramBuilder.Append('&');
            }

            // The body is declared as form-urlencoded, so names and values must be
            // URL-encoded (not HTML-encoded); otherwise '&', '=', '+', spaces and
            // non-ASCII characters corrupt the posted fields.
            paramBuilder.Append(HttpUtility.UrlEncode(pair.Key, encode));
            paramBuilder.Append('=');
            paramBuilder.Append(HttpUtility.UrlEncode(pair.Value, encode));
        }
    }

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "POST";

    byte[] postData = encode.GetBytes(paramBuilder.ToString());
    request.ContentLength = postData.Length;

    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(postData, 0, postData.Length);
    }

    // Dispose the response, its stream and the reader so the underlying
    // connection is released even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, encode))
    {
        return reader.ReadToEnd();
    }
}

需要特别注意上述方法中 param 参数的设置:所传的键和值必须与 HttpWatch 插件 Stream 中显示的内容一致,否则会出现无法连接的异常。

提前分析网页主要就是确认传进去的编码格式与传出来的编码格式。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: