网页采集c#
2014-03-07 19:19
197 查看
/// <summary>
/// Downloads the page at <paramref name="Url"/> over HTTP and returns its body
/// decoded as GB2312 text.
/// </summary>
/// <param name="Url">Address of the page to fetch; it must yield an <see cref="HttpWebRequest"/>.</param>
/// <returns>
/// The decoded response body, or an empty string when anything goes wrong
/// (in which case a message box has already been shown).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        // Build the HTTP request for the target address.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Request timeout: 30000 ms.
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // The response, its stream and the reader all hold the underlying
        // connection; dispose them deterministically so it is released
        // even when reading fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader =
            new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort by design: callers get "" on failure. Trace the details
        // so the cause is not lost, then tell the user as before.
        System.Diagnostics.Debug.WriteLine(ex);
        MessageBox.Show("出错");
    }
    return strResult;
}
为了使用 HttpWebRequest 和 HttpWebResponse,需添加下面的名字空间引用(代码中的 Stream、StreamReader 和 Encoding 还分别需要 System.IO 与 System.Text):
using System.Net;
以下是程序具体实现过程:
/// <summary>
/// Click handler: downloads the chart page, shows its raw source in
/// <c>richTextBox1</c>, extracts the first table that follows the
/// "歌曲TOP500" heading, turns every row into up to three
/// (id, name, singer) records via <c>AddLine</c>, then stores the result with
/// <c>InsertData</c> and binds it to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Address of the page to scrape (a plain URL, not HTML anchor markup).
    string Url = "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1#top2";

    // Fetch the page source and display it as-is.
    string strWebContent = GetWebContent(Url);
    richTextBox1.Text = strWebContent;

    // Locate the data table. Each search starts where the previous match was
    // found; a miss (-1) is propagated instead of being fed back into IndexOf,
    // which would throw ArgumentOutOfRangeException.
    int iBodyStart = strWebContent.IndexOf("<body", 0, StringComparison.Ordinal);
    int iStart = iBodyStart < 0
        ? -1
        : strWebContent.IndexOf("歌曲TOP500", iBodyStart, StringComparison.Ordinal);
    int iTableStart = iStart < 0
        ? -1
        : strWebContent.IndexOf("<table", iStart, StringComparison.Ordinal);
    int iTableEnd = iTableStart < 0
        ? -1
        : strWebContent.IndexOf("</table>", iTableStart, StringComparison.Ordinal);
    if (iTableEnd < 0)
    {
        // An empty page means GetWebContent failed and already told the user;
        // only report here when the page arrived but lacks the expected markup.
        if (strWebContent.Length > 0)
        {
            MessageBox.Show("出错");
        }
        return;
    }

    // Keep the closing tag as part of the fragment.
    string strWeb = strWebContent.Substring(
        iTableStart, iTableEnd - iTableStart + "</table>".Length);

    // Parse the fragment with an off-screen browser document. The control is
    // disposed once every value has been copied out of the DOM.
    using (WebBrowser webb = new WebBrowser())
    {
        webb.Navigate("about:blank");
        // NOTE(review): assumes Document is available right after navigating
        // to about:blank, as the original code did - confirm.
        HtmlDocument htmldoc = webb.Document.OpenNew(true);
        htmldoc.Write(strWeb);

        foreach (HtmlElement tr in htmldoc.GetElementsByTagName("TR"))
        {
            HtmlElementCollection cells = tr.GetElementsByTagName("TD");

            // A row holds up to three (id cell, name/singer cell) pairs in
            // cells 0-1, 2-3 and 4-5. Rows with fewer cells contribute only
            // their complete pairs instead of throwing.
            for (int i = 0; i < 6 && i + 1 < cells.Count; i += 2)
            {
                string strID = cells[i].InnerText;
                if (strID == null)
                {
                    continue; // empty cell: nothing to record
                }
                strID = strID.Replace(".", "");

                string strInfo = cells[i + 1].InnerText;
                string strName = SplitName(strInfo, "MusicName");
                string strSinger = SplitName(strInfo, "Singer");

                // Add the record to the DataTable.
                AddLine(strID, strName, strSinger, "0");
            }
        }
    }

    // Persist to the database and show the result.
    InsertData(dt);
    dataGridView1.DataSource = dt.DefaultView;
}
string GetWebContent(string Url)
{
    // Downloads the page at Url over HTTP and returns its body decoded as
    // GB2312 text. Returns an empty string when anything goes wrong; in that
    // case a message box has already been shown.
    string strResult = "";
    try
    {
        // Build the HTTP request for the target address.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Request timeout: 30000 ms.
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // The response, its stream and the reader all hold the underlying
        // connection; dispose them deterministically so it is released
        // even when reading fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader =
            new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort by design: callers get "" on failure. Trace the details
        // so the cause is not lost, then tell the user as before.
        System.Diagnostics.Debug.WriteLine(ex);
        MessageBox.Show("出错");
    }
    return strResult;
}
为了使用 HttpWebRequest 和 HttpWebResponse,需添加下面的名字空间引用(代码中的 Stream、StreamReader 和 Encoding 还分别需要 System.IO 与 System.Text):
using System.Net;
以下是程序具体实现过程:
/// <summary>
/// Click handler: downloads the chart page, shows its raw source in
/// <c>richTextBox1</c>, extracts the first table that follows the
/// "歌曲TOP500" heading, turns every row into up to three
/// (id, name, singer) records via <c>AddLine</c>, then stores the result with
/// <c>InsertData</c> and binds it to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Address of the page to scrape (a plain URL, not HTML anchor markup).
    string Url = "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1#top2";

    // Fetch the page source and display it as-is.
    string strWebContent = GetWebContent(Url);
    richTextBox1.Text = strWebContent;

    // Locate the data table. Each search starts where the previous match was
    // found; a miss (-1) is propagated instead of being fed back into IndexOf,
    // which would throw ArgumentOutOfRangeException.
    int iBodyStart = strWebContent.IndexOf("<body", 0, StringComparison.Ordinal);
    int iStart = iBodyStart < 0
        ? -1
        : strWebContent.IndexOf("歌曲TOP500", iBodyStart, StringComparison.Ordinal);
    int iTableStart = iStart < 0
        ? -1
        : strWebContent.IndexOf("<table", iStart, StringComparison.Ordinal);
    int iTableEnd = iTableStart < 0
        ? -1
        : strWebContent.IndexOf("</table>", iTableStart, StringComparison.Ordinal);
    if (iTableEnd < 0)
    {
        // An empty page means GetWebContent failed and already told the user;
        // only report here when the page arrived but lacks the expected markup.
        if (strWebContent.Length > 0)
        {
            MessageBox.Show("出错");
        }
        return;
    }

    // Keep the closing tag as part of the fragment.
    string strWeb = strWebContent.Substring(
        iTableStart, iTableEnd - iTableStart + "</table>".Length);

    // Parse the fragment with an off-screen browser document. The control is
    // disposed once every value has been copied out of the DOM.
    using (WebBrowser webb = new WebBrowser())
    {
        webb.Navigate("about:blank");
        // NOTE(review): assumes Document is available right after navigating
        // to about:blank, as the original code did - confirm.
        HtmlDocument htmldoc = webb.Document.OpenNew(true);
        htmldoc.Write(strWeb);

        foreach (HtmlElement tr in htmldoc.GetElementsByTagName("TR"))
        {
            HtmlElementCollection cells = tr.GetElementsByTagName("TD");

            // A row holds up to three (id cell, name/singer cell) pairs in
            // cells 0-1, 2-3 and 4-5. Rows with fewer cells contribute only
            // their complete pairs instead of throwing.
            for (int i = 0; i < 6 && i + 1 < cells.Count; i += 2)
            {
                string strID = cells[i].InnerText;
                if (strID == null)
                {
                    continue; // empty cell: nothing to record
                }
                strID = strID.Replace(".", "");

                string strInfo = cells[i + 1].InnerText;
                string strName = SplitName(strInfo, "MusicName");
                string strSinger = SplitName(strInfo, "Singer");

                // Add the record to the DataTable.
                AddLine(strID, strName, strSinger, "0");
            }
        }
    }

    // Persist to the database and show the result.
    InsertData(dt);
    dataGridView1.DataSource = dt.DefaultView;
}
相关文章推荐
- C# 网页信息采集(Form.cs)
- C#网页数据采集(一)HtmlAgilityPack
- asp.net(c#)做一个网页数据采集工具
- asp.net(c#)做一个网页数据采集工具
- 浅析基于ASP.NET网页的C#数据采集
- c#采集网页用得几个函数
- C# WEB网页内容采集
- C#网页数据采集(三)HttpWebRequest
- C# WEB网页内容采集
- asp.net/C#网页数据采集
- C# 网页图片采集
- c#采集网页用得几个函数
- asp.net/C#网页数据采集
- C# 网页信息采集(Form.cs)
- C# 网页信息采集 核心代码收集
- c# 采集 获取网页数据内容 一会超时的问题
- 巧用C#webbrowser以及Application.DoEvents()实现采集动态网页的爬虫机器人
- 多个账户模拟登录---c#异步模拟登录网站并采集网页
- 网络采集软件核心技术剖析系列(4)---使用C#语言如何将html网页转换成pdf(html2pdf)
- C#网页采集数据的几种方式(WebClient、WebBrowser和HttpWebRequest/HttpWebResponse)