网页抓取实战总结
2012-02-08 14:55
155 查看
网页抓取无外乎POST和GET两种方式。分析请求时需要安装HttpWatch Professional插件。
/// <summary>
/// Downloads a web page via HTTP POST.
/// </summary>
/// <param name="url">Target URL.</param>
/// <param name="param">Form parameters to send; may be null or empty for a bodyless POST.</param>
/// <param name="encode">Encoding used both to encode the request body and to decode the response.</param>
/// <returns>The response body (HTML) as a string.</returns>
public static string DoPost(string url, IDictionary<string, string> param, Encoding encode)
{
    // Build the application/x-www-form-urlencoded body.
    // BUGFIX: form data must be URL-encoded, not HTML-encoded. The original used
    // HttpUtility.HtmlEncode, which emits entities like &amp; and corrupts the payload.
    StringBuilder paramBuilder = new StringBuilder();
    if (param != null)
    {
        foreach (KeyValuePair<string, string> pair in param)
        {
            if (paramBuilder.Length > 0)
            {
                paramBuilder.Append('&');
            }
            paramBuilder.AppendFormat("{0}={1}",
                Uri.EscapeDataString(pair.Key),
                Uri.EscapeDataString(pair.Value ?? string.Empty)); // tolerate null values
        }
    }

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "POST";

    byte[] postData = encode.GetBytes(paramBuilder.ToString());
    request.ContentLength = postData.Length;
    using (Stream newStream = request.GetRequestStream())
    {
        newStream.Write(postData, 0, postData.Length);
    }

    // BUGFIX: the original leaked the response and the reader; dispose all three.
    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, encode))
    {
        return reader.ReadToEnd();
    }
}
需要强调上述方法中param参数的设置:所传的键值必须与HttpWatch插件Stream标签中显示的内容一致,否则会出现无法连接的异常。
提前分析网页,主要是确认请求发送时使用的编码格式与响应返回时使用的编码格式。
/// <summary>
/// Downloads a web page via HTTP POST.
/// </summary>
/// <param name="url">Target URL.</param>
/// <param name="param">Form parameters to send; may be null or empty for a bodyless POST.</param>
/// <param name="encode">Encoding used both to encode the request body and to decode the response.</param>
/// <returns>The response body (HTML) as a string.</returns>
public static string DoPost(string url, IDictionary<string, string> param, Encoding encode)
{
    // Build the application/x-www-form-urlencoded body.
    // BUGFIX: form parameters require URL encoding; the original applied
    // HttpUtility.HtmlEncode, which produces HTML entities (&amp;, &lt;, ...)
    // and breaks the request body.
    StringBuilder body = new StringBuilder();
    if (param != null)
    {
        foreach (KeyValuePair<string, string> pair in param)
        {
            if (body.Length > 0)
            {
                body.Append('&');
            }
            body.Append(Uri.EscapeDataString(pair.Key))
                .Append('=')
                .Append(Uri.EscapeDataString(pair.Value ?? string.Empty)); // tolerate null values
        }
    }

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "POST";

    byte[] postData = encode.GetBytes(body.ToString());
    request.ContentLength = postData.Length;
    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(postData, 0, postData.Length);
    }

    // BUGFIX: wrap the response, its stream, and the reader in using blocks —
    // the original never disposed the response or reader (connection/handle leak).
    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, encode))
    {
        return reader.ReadToEnd();
    }
}
需要强调上述方法中param参数的设置:所传的键值必须与HttpWatch插件Stream标签中显示的内容一致,否则会出现无法连接的异常。
提前分析网页,主要是确认请求发送时使用的编码格式与响应返回时使用的编码格式。
相关文章推荐
- 有关网页抓取问题的一些经验总结
- SEO优化实战经验总结:网页减肥
- ZH奶酪:PHP抓取网页方法总结
- 网页抓取总结(一)
- 一淘搜索之网页抓取系统分析与实现(4)- 实现&总结
- 分享:用php抓取网页内容方法总结
- 使用perl脚本抓取网页总结
- SEO优化实战经验总结:简洁而实用的SEO网页工具
- 爬取银行网页数据的项目实战总结
- python爬虫实战(1)抓取网页图片自动保存
- 网页抓取 总结
- python爬虫selenium+firefox抓取动态网页--表情包爬虫实战
- ajax动态网页抓取学习总结
- 【转】用php抓取网页内容方法总结
- Python实战(二)—— urllib2 下载网页的方式总结
- scrapy入门实战练习(三)----抓取AJAX异步加载网页
- Scrapy抓取网页相关问题解决以及注意事项总结
- python访问抓取网页常用命令总结
- Java网页抓取错误总结
- PHP抓取网页、解析HTML常用的方法总结