您的位置:首页 > 编程语言 > C#

爬一个网页,输出全部<table>(C#Console)

2014-04-17 15:36 573 查看
using System;
using System.Collections.Generic;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace LoginkCreditCenter.WebSpiderConsole

{

/// <summary>
/// Console tool that downloads a web page and prints every outermost
/// &lt;table&gt; element found in its markup.
/// </summary>
class Program
{
    /// <summary>Prefix shared by every opening table tag.</summary>
    private const string TableOpenPrefix = "<table";

    /// <summary>Closing table tag.</summary>
    private const string TableCloseTag = "</table>";

    /// <summary>Charset assumed when neither the caller nor the page names one.</summary>
    private const string FallbackCharSet = "utf-8";

    /// <summary>
    /// Finds the charset declared by a &lt;meta&gt; tag. Handles both the legacy
    /// form (content="text/html; charset=UTF-8") and the HTML5 form
    /// (charset="utf-8"); group 1 holds the bare charset name without quotes.
    /// </summary>
    private static readonly Regex MetaCharSetRegex = new Regex(
        "<meta\\b[^>]*?\\bcharset\\s*=\\s*[\"']?([A-Za-z0-9_.:-]+)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Entry point: reads a URL from standard input, downloads the page and
    /// writes each outermost table to standard output, then waits for Enter.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.WriteLine("\r\n-------Please Enter URl--------\r\n");
        string url = Console.ReadLine();

        string strResponse = GetPageData(url, "");
        List<string> tables = ExtractTables(strResponse);

        if (tables.Count == 0)
        {
            // Tell the user explicitly instead of failing on a page without tables.
            Console.WriteLine("\r\n-------No Table Found--------\r\n");
        }

        foreach (string table in tables)
        {
            Console.WriteLine("\r\n-------The IS A Table--------\r\n");
            Console.WriteLine(table);
        }

        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }

    /// <summary>
    /// Collects every outermost &lt;table&gt;...&lt;/table&gt; element of
    /// <paramref name="html"/> in document order. Nested tables are returned as
    /// part of their enclosing table rather than as separate entries.
    /// </summary>
    /// <remarks>
    /// This is a plain text scan, not an HTML parser: tag names are matched
    /// case-insensitively, but markup inside comments or scripts is not told
    /// apart from real markup. A closing tag without a matching opening tag is
    /// skipped, and a table that is never closed is not returned.
    /// </remarks>
    /// <param name="html">Page markup; may be null or empty.</param>
    /// <returns>The matched table fragments; empty when there are none.</returns>
    private static List<string> ExtractTables(string html)
    {
        List<string> tables = new List<string>();
        if (string.IsNullOrEmpty(html))
        {
            return tables;
        }

        int depth = 0;       // number of currently open tables
        int tableStart = -1; // index of the outermost open table, valid while depth > 0
        int position = 0;

        while (position < html.Length)
        {
            int open = IndexOfOpenTag(html, position);
            int close = html.IndexOf(TableCloseTag, position, StringComparison.OrdinalIgnoreCase);

            if (close < 0)
            {
                // Without another closing tag no further table can be completed.
                break;
            }

            if (open >= 0 && open < close)
            {
                if (depth == 0)
                {
                    tableStart = open;
                }

                depth++;
                position = open + TableOpenPrefix.Length;
                continue;
            }

            int afterClose = close + TableCloseTag.Length;
            if (depth > 0)
            {
                depth--;
                if (depth == 0)
                {
                    tables.Add(html.Substring(tableStart, afterClose - tableStart));
                }
            }

            position = afterClose;
        }

        return tables;
    }

    /// <summary>
    /// Finds the next opening table tag at or after <paramref name="startIndex"/>.
    /// The character following "&lt;table" must be whitespace, '&gt;' or '/', so
    /// other tag names that merely start with "table" are not matched.
    /// </summary>
    /// <param name="html">Markup to search; must not be null.</param>
    /// <param name="startIndex">Index at which the search starts.</param>
    /// <returns>Index of the tag's '&lt;', or -1 when there is none.</returns>
    private static int IndexOfOpenTag(string html, int startIndex)
    {
        int index = html.IndexOf(TableOpenPrefix, startIndex, StringComparison.OrdinalIgnoreCase);
        while (index >= 0)
        {
            int next = index + TableOpenPrefix.Length;
            if (next < html.Length
                && (html[next] == '>' || html[next] == '/' || char.IsWhiteSpace(html[next])))
            {
                return index;
            }

            index = html.IndexOf(TableOpenPrefix, next, StringComparison.OrdinalIgnoreCase);
        }

        return -1;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes the response body to text.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">
    /// Name of the encoding to decode with. When null or empty, the charset
    /// declared in the page's &lt;meta&gt; tag is used, falling back to UTF-8.
    /// </param>
    /// <returns>
    /// The decoded page, or an empty string when <paramref name="url"/> is
    /// blank or the download fails (the failure is reported on standard error).
    /// </returns>
    private static string GetPageData(string url, string charSet)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return string.Empty;
        }

        byte[] dataBuffer;
        try
        {
            // WebClient is IDisposable; release it as soon as the bytes are in memory.
            using (WebClient wc = new WebClient())
            {
                // Authenticate with the current user's default credentials if the server asks.
                wc.Credentials = CredentialCache.DefaultCredentials;
                dataBuffer = wc.DownloadData(url);
            }
        }
        catch (Exception ex)
        {
            // WebClient reports malformed addresses and network failures through
            // several exception types. Report the failure separately instead of
            // returning the message as if it were page content.
            Console.Error.WriteLine("Failed to download '" + url + "': " + ex.Message);
            return string.Empty;
        }

        if (string.IsNullOrEmpty(charSet))
        {
            // Decode once with the default encoding only to locate the <meta>
            // charset declaration, then decode for real with that charset below.
            string sniffed = Encoding.Default.GetString(dataBuffer);
            Match charSetMatch = MetaCharSetRegex.Match(sniffed);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : FallbackCharSet;
        }

        return ResolveEncoding(charSet).GetString(dataBuffer);
    }

    /// <summary>
    /// Maps a charset name to an <see cref="Encoding"/>, falling back to UTF-8
    /// when the name is unknown or unsupported on this platform.
    /// </summary>
    /// <param name="charSet">Charset name, e.g. "utf-8" or "gb2312".</param>
    /// <returns>The matching encoding, or UTF-8 as a fallback.</returns>
    private static Encoding ResolveEncoding(string charSet)
    {
        try
        {
            return Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            // Pages sometimes declare misspelled or exotic charsets; do not lose the page over it.
            return Encoding.UTF8;
        }
    }
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐