爬一个网页,输出全部<table>(C#Console)
2014-04-17 15:36
573 查看
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace LoginkCreditCenter.WebSpiderConsole
{
    /// <summary>
    /// Console tool that downloads one web page and prints every &lt;table&gt; element found in it.
    /// </summary>
    class Program
    {
        // Tag markers used to locate tables. Matching is done case-insensitively,
        // and the opening marker has no '>' so that tags carrying attributes are found too.
        private const string TableOpenTag = "<table";
        private const string TableCloseTag = "</table>";

        // Charset assumed when neither the caller nor the page declares a usable one.
        private const string FallbackCharSet = "utf-8";

        // Captures the charset name from either <meta charset="utf-8"> or
        // <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">.
        // Group 1 is the bare name, without surrounding quotes.
        private static readonly Regex MetaCharSetRegex = new Regex(
            @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*([\w.:-]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Prompts for a URL, downloads the page and writes each top-level table to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("\r\n-------Please Enter URl--------\r\n");
            string url = Console.ReadLine();
            string strResponse = GetPageData(url, "");

            int position = 0;
            int tableCount = 0;
            string table;
            // A plain while loop (not do/while) so a page without any table is handled
            // gracefully instead of failing on a Substring call with a negative index.
            while (TryReadNextTable(strResponse, ref position, out table))
            {
                Console.WriteLine("\r\n-------The IS A Table--------\r\n");
                Console.WriteLine(table);
                tableCount++;
            }

            if (tableCount == 0)
            {
                Console.WriteLine("\r\n-------No Table Found--------\r\n");
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Finds the next complete table at or after <paramref name="position"/>.
        /// Nested tables are kept inside their outermost table rather than cutting it short.
        /// </summary>
        /// <param name="html">Markup to scan; may be null or empty.</param>
        /// <param name="position">Index to start scanning from; advanced past the returned table on success.</param>
        /// <param name="table">The table markup from its opening tag through its matching closing tag, or null.</param>
        /// <returns>True when a complete table was found; false when none remains or the table is unterminated.</returns>
        private static bool TryReadNextTable(string html, ref int position, out string table)
        {
            table = null;
            if (string.IsNullOrEmpty(html) || position < 0 || position > html.Length)
            {
                return false;
            }

            int start = html.IndexOf(TableOpenTag, position, StringComparison.OrdinalIgnoreCase);
            if (start < 0)
            {
                return false;
            }

            int depth = 1;
            int cursor = start + TableOpenTag.Length;
            while (depth > 0)
            {
                int nextOpen = html.IndexOf(TableOpenTag, cursor, StringComparison.OrdinalIgnoreCase);
                int nextClose = html.IndexOf(TableCloseTag, cursor, StringComparison.OrdinalIgnoreCase);
                if (nextClose < 0)
                {
                    // Unterminated table: nothing complete is left to report.
                    return false;
                }

                if (nextOpen >= 0 && nextOpen < nextClose)
                {
                    // Another table opens before this one closes: go one level deeper.
                    depth++;
                    cursor = nextOpen + TableOpenTag.Length;
                }
                else
                {
                    depth--;
                    cursor = nextClose + TableCloseTag.Length;
                }
            }

            table = html.Substring(start, cursor - start);
            position = cursor;
            return true;
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and decodes it to text.
        /// </summary>
        /// <param name="url">Address of the page; null or blank yields an empty string.</param>
        /// <param name="charSet">
        /// Encoding name to decode with. When null or empty, the charset declared in the page's
        /// meta tag is used, and UTF-8 when the page declares none.
        /// </param>
        /// <returns>
        /// The decoded page text. When the download fails, the exception message is returned
        /// instead (the method does not throw).
        /// </returns>
        private static string GetPageData(string url, string charSet)
        {
            try
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    return string.Empty;
                }

                byte[] dataBuffer;
                using (WebClient wc = new WebClient())
                {
                    // To send cookies, add a "Cookie" header through wc.Headers before downloading.
                    // For servers that require a user name and password, assign a
                    // NetworkCredential to wc.Credentials instead of the default credentials.
                    wc.Credentials = CredentialCache.DefaultCredentials;
                    dataBuffer = wc.DownloadData(url);
                }

                // First pass with the default encoding, only to be able to read the meta tag.
                string strWebData = Encoding.Default.GetString(dataBuffer);

                if (string.IsNullOrEmpty(charSet))
                {
                    Match charSetMatch = MetaCharSetRegex.Match(strWebData);
                    charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : FallbackCharSet;
                }

                Encoding pageEncoding = ResolveEncoding(charSet);
                if (pageEncoding.CodePage != Encoding.Default.CodePage)
                {
                    // Decode again from the raw bytes with the encoding the page really uses.
                    strWebData = pageEncoding.GetString(dataBuffer);
                }

                return strWebData;
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported as the returned text.
                return ex.Message;
            }
        }

        /// <summary>
        /// Maps a charset name to an <see cref="Encoding"/>, falling back to UTF-8 when the
        /// name is unknown or not supported on this platform.
        /// </summary>
        /// <param name="charSet">Encoding name such as "utf-8" or "gb2312".</param>
        /// <returns>The matching encoding, or UTF-8 as the fallback.</returns>
        private static Encoding ResolveEncoding(string charSet)
        {
            try
            {
                return Encoding.GetEncoding(charSet.Trim());
            }
            catch (ArgumentException)
            {
                return Encoding.UTF8;
            }
            catch (NotSupportedException)
            {
                return Encoding.UTF8;
            }
        }
    }
}
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace LoginkCreditCenter.WebSpiderConsole
{
    /// <summary>
    /// Console tool that downloads one web page and prints every &lt;table&gt; element found in it.
    /// </summary>
    class Program
    {
        // Tag markers used to locate tables. Matching is done case-insensitively,
        // and the opening marker has no '>' so that tags carrying attributes are found too.
        private const string TableOpenTag = "<table";
        private const string TableCloseTag = "</table>";

        // Charset assumed when neither the caller nor the page declares a usable one.
        private const string FallbackCharSet = "utf-8";

        // Captures the charset name from either <meta charset="utf-8"> or
        // <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">.
        // Group 1 is the bare name, without surrounding quotes.
        private static readonly Regex MetaCharSetRegex = new Regex(
            @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*([\w.:-]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Prompts for a URL, downloads the page and writes each top-level table to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("\r\n-------Please Enter URl--------\r\n");
            string url = Console.ReadLine();
            string strResponse = GetPageData(url, "");

            int position = 0;
            int tableCount = 0;
            string table;
            // A plain while loop (not do/while) so a page without any table is handled
            // gracefully instead of failing on a Substring call with a negative index.
            while (TryReadNextTable(strResponse, ref position, out table))
            {
                Console.WriteLine("\r\n-------The IS A Table--------\r\n");
                Console.WriteLine(table);
                tableCount++;
            }

            if (tableCount == 0)
            {
                Console.WriteLine("\r\n-------No Table Found--------\r\n");
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Finds the next complete table at or after <paramref name="position"/>.
        /// Nested tables are kept inside their outermost table rather than cutting it short.
        /// </summary>
        /// <param name="html">Markup to scan; may be null or empty.</param>
        /// <param name="position">Index to start scanning from; advanced past the returned table on success.</param>
        /// <param name="table">The table markup from its opening tag through its matching closing tag, or null.</param>
        /// <returns>True when a complete table was found; false when none remains or the table is unterminated.</returns>
        private static bool TryReadNextTable(string html, ref int position, out string table)
        {
            table = null;
            if (string.IsNullOrEmpty(html) || position < 0 || position > html.Length)
            {
                return false;
            }

            int start = html.IndexOf(TableOpenTag, position, StringComparison.OrdinalIgnoreCase);
            if (start < 0)
            {
                return false;
            }

            int depth = 1;
            int cursor = start + TableOpenTag.Length;
            while (depth > 0)
            {
                int nextOpen = html.IndexOf(TableOpenTag, cursor, StringComparison.OrdinalIgnoreCase);
                int nextClose = html.IndexOf(TableCloseTag, cursor, StringComparison.OrdinalIgnoreCase);
                if (nextClose < 0)
                {
                    // Unterminated table: nothing complete is left to report.
                    return false;
                }

                if (nextOpen >= 0 && nextOpen < nextClose)
                {
                    // Another table opens before this one closes: go one level deeper.
                    depth++;
                    cursor = nextOpen + TableOpenTag.Length;
                }
                else
                {
                    depth--;
                    cursor = nextClose + TableCloseTag.Length;
                }
            }

            table = html.Substring(start, cursor - start);
            position = cursor;
            return true;
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and decodes it to text.
        /// </summary>
        /// <param name="url">Address of the page; null or blank yields an empty string.</param>
        /// <param name="charSet">
        /// Encoding name to decode with. When null or empty, the charset declared in the page's
        /// meta tag is used, and UTF-8 when the page declares none.
        /// </param>
        /// <returns>
        /// The decoded page text. When the download fails, the exception message is returned
        /// instead (the method does not throw).
        /// </returns>
        private static string GetPageData(string url, string charSet)
        {
            try
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    return string.Empty;
                }

                byte[] dataBuffer;
                using (WebClient wc = new WebClient())
                {
                    // To send cookies, add a "Cookie" header through wc.Headers before downloading.
                    // For servers that require a user name and password, assign a
                    // NetworkCredential to wc.Credentials instead of the default credentials.
                    wc.Credentials = CredentialCache.DefaultCredentials;
                    dataBuffer = wc.DownloadData(url);
                }

                // First pass with the default encoding, only to be able to read the meta tag.
                string strWebData = Encoding.Default.GetString(dataBuffer);

                if (string.IsNullOrEmpty(charSet))
                {
                    Match charSetMatch = MetaCharSetRegex.Match(strWebData);
                    charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : FallbackCharSet;
                }

                Encoding pageEncoding = ResolveEncoding(charSet);
                if (pageEncoding.CodePage != Encoding.Default.CodePage)
                {
                    // Decode again from the raw bytes with the encoding the page really uses.
                    strWebData = pageEncoding.GetString(dataBuffer);
                }

                return strWebData;
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported as the returned text.
                return ex.Message;
            }
        }

        /// <summary>
        /// Maps a charset name to an <see cref="Encoding"/>, falling back to UTF-8 when the
        /// name is unknown or not supported on this platform.
        /// </summary>
        /// <param name="charSet">Encoding name such as "utf-8" or "gb2312".</param>
        /// <returns>The matching encoding, or UTF-8 as the fallback.</returns>
        private static Encoding ResolveEncoding(string charSet)
        {
            try
            {
                return Encoding.GetEncoding(charSet.Trim());
            }
            catch (ArgumentException)
            {
                return Encoding.UTF8;
            }
            catch (NotSupportedException)
            {
                return Encoding.UTF8;
            }
        }
    }
}
相关文章推荐
- C++中输入输出<< 和>>重载,以便适应输出输入一个对象
- 【c语言】有一函数:x < 0 ,y = -1;x = 0,y = 0;x > 0,y = 1,编程输入一个x值,要求输出对应的y
- c# 快速将LIST<> 输出到EXCEL中
- 螺旋输出一个方形的二维数组<java版>
- C# List<>.Add一个细节
- 【C#】对异步请求处理程序IHttpAsyncHandler的理解和分享一个易用性封装 【手记】走近科学之为什么明明实现了IEnumerable<T>的类型却不能调用LINQ扩展方法 【手记】手机网页弹出层后屏蔽底层的滑动响应 【手记】ASP.NET提示“未能创建类型”处理 【Web】一个非常简单的移动web消息框 【手记】解决EXCEL跑SQL遇“查询无法运行或数据库表无法打开...”
- <实例>输出给定字符串的全部连续子序列
- Asp.Net 无限分类生成表格 <后台自定义输出table>
- c语言:有一函数:当x<0,y=-1;x=0,y=0;x>0,y=1;编一程序,输入一个x值,要求输出相应的y值。
- html第三天 如何使用<table>制作一个网页
- C#操作字符串方法总结<转>
- c# 泛型集合Dictionary<TKey,TValue>
- .Net/C# 实现 中国移动 CMPP v3.0 ISMG <-> SP 收发短信的 SP 客户端 (CMPP SP Client)
- AR--未来技术提前探索<3>[在汉明码上叠加一个3D模型]
- struts2利用<s:fielderror/>等标签详细地控制错误消息输出格式
- <<Accelerated C# 2008>>笔记3容器&&数组&&迭代器
- Tips: 解决XSLT中table内容<br/>换行问题
- 在一个文件中有 10G 个整数,乱序排列,要求找出中位数。内存限制为 2G。 -- Shirley对比编程珠玑 in<Shirley>
- HTML中的链接到另外一个页面的标签<a>的用法与规则
- <如何创建一个实体历史> 时间维度 让 1:n的 产生新的实体 或者关系