.NET RSS新闻自动采集
2008-05-14 08:58
337 查看
创建应用程序
/// <summary>
/// Application start-up hook (Global.asax): configures the RSS collector
/// and launches it.
/// </summary>
/// <param name="sender">Event source supplied by ASP.NET.</param>
/// <param name="e">Event data supplied by ASP.NET.</param>
void Application_Start(object sender, EventArgs e)
{
    // The static settings are assigned before the collector is constructed
    // because the AppLine constructor starts the worker thread that reads them.
    AppLine.RssUrl = @"http://news.163.com/special/00011K6L/rss_newstop.xml";
    AppLine.NewPath = Server.MapPath("~/Get_New/");

    // Constructing the object is what starts the collection thread; the
    // instance itself is not needed afterwards, so no local is kept.
    new AppLine();
}
AppLine 类
[align=left]using System;[/align]
[align=left]using System.Data;[/align]
[align=left]using System.Configuration;[/align]
[align=left]using System.Web;[/align]
[align=left]using System.Web.Security;[/align]
[align=left]using System.Web.UI;[/align]
[align=left]using System.Web.UI.WebControls;[/align]
[align=left]using System.Web.UI.WebControls.WebParts;[/align]
[align=left]using System.Web.UI.HtmlControls;[/align]
[align=left]using System.IO;[/align]
[align=left]using System.Net;[/align]
[align=left]using System.Text;[/align]
[align=left]using System.Threading;[/align]
[align=left]using System.Text.RegularExpressions;[/align]
[align=left] [/align]
[align=left] [/align]
[align=left]///<summary>[/align]
[align=left]/// AppLine 的摘要说明[/align]
[align=left]///</summary>[/align]
/// <summary>
/// Background RSS collector. Constructing an instance starts a worker thread
/// that periodically reads the feed at <see cref="RssUrl"/>, downloads every
/// item whose link contains "news", extracts the article fragment and saves
/// it as an HTML file under <see cref="NewPath"/> in a per-day sub-directory.
/// </summary>
public class AppLine : Page
{
    /// <summary>Set to true to ask the worker loop to exit; it is checked once per polling cycle.</summary>
    public static bool bStop;

    /// <summary>The DataSet used by the most recent polling cycle (reset when the cycle ends).</summary>
    public static DataSet ds;

    /// <summary>URL of the RSS feed to poll.</summary>
    public static string RssUrl;

    /// <summary>Text of the most recently downloaded page, or its extracted fragment once formatted.</summary>
    public static string htmltext;

    /// <summary>Root directory under which the per-day article folders are created.</summary>
    public static string NewPath;

    // One hour between polling cycles.
    private const int PollIntervalMilliseconds = 60 * 60 * 1000;

    // Markers that delimit the article body inside a downloaded page.
    private const string ArticleStartMarker = "<span id=\"digest\">";
    private const string ArticleEndMarker = "<!-- 分页 -->";

    // The same encoding is used for reading and writing so the result does not
    // depend on the server's default ANSI code page.
    private static readonly Encoding PageEncoding = Encoding.GetEncoding("GB2312");

    // Non-greedy so that several tags on one line are matched individually
    // instead of swallowing the text between the first and the last tag.
    private static readonly Regex HeadingPattern =
        new Regex("(<h1 id=\"endTitle\">.*?</h1>)", RegexOptions.IgnoreCase);

    private static readonly Regex AnchorPattern =
        new Regex("(<a href.*?</a>)", RegexOptions.IgnoreCase);

    private static readonly char[] InvalidFileNameChars = Path.GetInvalidFileNameChars();

    /// <summary>
    /// Starts the collector thread. <see cref="RssUrl"/> and <see cref="NewPath"/>
    /// must be assigned before an instance is created.
    /// </summary>
    public AppLine()
    {
        Thread worker = new Thread(new ThreadStart(ThreadProc));

        // A background thread does not keep the host process alive on shutdown.
        worker.IsBackground = true;
        worker.Start();
    }

    /// <summary>
    /// Worker loop: runs one collection cycle, sleeps for the polling interval,
    /// and repeats until <see cref="bStop"/> is set.
    /// </summary>
    public void ThreadProc()
    {
        while (!bStop)
        {
            try
            {
                CollectOnce();
            }
            catch (Exception ex)
            {
                // A transient failure (feed unreachable, malformed XML, ...) must
                // neither end the collector for good nor escape the thread, so it
                // is logged and the next cycle retries.
                System.Diagnostics.Trace.TraceError("AppLine: RSS collection cycle failed: " + ex);
            }

            Thread.Sleep(PollIntervalMilliseconds);
        }
    }

    /// <summary>Reads the feed once and saves every matching article that is not stored yet.</summary>
    private void CollectOnce()
    {
        DataSet feed = new DataSet();
        ds = feed;

        try
        {
            feed.ReadXml(RssUrl);

            // A feed without items produces no "item" table (or lacks the columns).
            DataTable items = feed.Tables["item"];
            if (items == null || !items.Columns.Contains("link") || !items.Columns.Contains("title"))
            {
                return;
            }

            foreach (DataRow row in items.Rows)
            {
                string link = row["link"].ToString();
                if (link.IndexOf("news", StringComparison.Ordinal) == -1)
                {
                    continue;
                }

                try
                {
                    Get_New_163(link, row["title"].ToString());
                }
                catch (Exception ex)
                {
                    // One article that cannot be fetched or written should not
                    // prevent the remaining items from being collected.
                    System.Diagnostics.Trace.TraceError("AppLine: failed to collect " + link + ": " + ex);
                }
            }
        }
        finally
        {
            feed.Reset();
        }
    }

    /// <summary>
    /// Downloads one article page and stores its extracted fragment as
    /// "&lt;NewPath&gt;/&lt;yyyyMMdd&gt;/&lt;title&gt;.html". Does nothing when the
    /// file already exists or the page does not contain the expected markers.
    /// </summary>
    /// <param name="this_url">URL of the article page.</param>
    /// <param name="title">Article title; used (sanitised) as the file name.</param>
    private void Get_New_163(string this_url, string title)
    {
        string directoryName = Path.Combine(NewPath, DateTime.Now.ToString("yyyyMMdd"));
        string fileName = Path.Combine(directoryName, SanitizeFileName(title) + ".html");

        // CreateDirectory is a no-op when the directory already exists.
        Directory.CreateDirectory(directoryName);

        if (File.Exists(fileName))
        {
            return;
        }

        string page = DownloadPage(this_url);
        htmltext = page;

        string article = ExtractArticle(page);
        if (article == null)
        {
            return;
        }

        htmltext = article;
        File.WriteAllText(fileName, article, PageEncoding);
    }

    /// <summary>Removes every character that is not allowed in a file name.</summary>
    private static string SanitizeFileName(string title)
    {
        StringBuilder cleaned = new StringBuilder(title.Length);
        foreach (char c in title)
        {
            if (Array.IndexOf(InvalidFileNameChars, c) < 0)
            {
                cleaned.Append(c);
            }
        }

        return cleaned.ToString();
    }

    /// <summary>Fetches a page and returns its text, releasing the response and streams afterwards.</summary>
    private static string DownloadPage(string url)
    {
        WebRequest request = WebRequest.Create(url);
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, PageEncoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Builds the stored fragment: the last title heading found in the page
    /// followed by the text between the digest marker and the pagination
    /// marker, with anchor elements removed.
    /// </summary>
    /// <returns>The fragment, or null when either marker is missing.</returns>
    private static string ExtractArticle(string page)
    {
        int start = page.IndexOf(ArticleStartMarker, StringComparison.Ordinal);
        if (start == -1)
        {
            return null;
        }

        // Searching from the start marker guarantees end >= start, so the
        // Substring call below cannot receive a negative length.
        int end = page.IndexOf(ArticleEndMarker, start, StringComparison.Ordinal);
        if (end == -1)
        {
            return null;
        }

        string heading = "";
        foreach (Match match in HeadingPattern.Matches(page))
        {
            heading = match.Value;
        }

        string fragment = heading + page.Substring(start, end - start);
        return AnchorPattern.Replace(fragment, "");
    }
}
实例下载地址:http://d.download.csdn.net/down/454200/sunchaohuang
/// <summary>
/// Application start-up hook (Global.asax): configures the RSS collector
/// and launches it.
/// </summary>
/// <param name="sender">Event source supplied by ASP.NET.</param>
/// <param name="e">Event data supplied by ASP.NET.</param>
void Application_Start(object sender, EventArgs e)
{
    // The static settings are assigned before the collector is constructed
    // because the AppLine constructor starts the worker thread that reads them.
    AppLine.RssUrl = @"http://news.163.com/special/00011K6L/rss_newstop.xml";
    AppLine.NewPath = Server.MapPath("~/Get_New/");

    // Constructing the object is what starts the collection thread; the
    // instance itself is not needed afterwards, so no local is kept.
    new AppLine();
}
AppLine 类
[align=left]using System;[/align]
[align=left]using System.Data;[/align]
[align=left]using System.Configuration;[/align]
[align=left]using System.Web;[/align]
[align=left]using System.Web.Security;[/align]
[align=left]using System.Web.UI;[/align]
[align=left]using System.Web.UI.WebControls;[/align]
[align=left]using System.Web.UI.WebControls.WebParts;[/align]
[align=left]using System.Web.UI.HtmlControls;[/align]
[align=left]using System.IO;[/align]
[align=left]using System.Net;[/align]
[align=left]using System.Text;[/align]
[align=left]using System.Threading;[/align]
[align=left]using System.Text.RegularExpressions;[/align]
[align=left] [/align]
[align=left] [/align]
[align=left]///<summary>[/align]
[align=left]/// AppLine 的摘要说明[/align]
[align=left]///</summary>[/align]
/// <summary>
/// Background RSS collector. Constructing an instance starts a worker thread
/// that periodically reads the feed at <see cref="RssUrl"/>, downloads every
/// item whose link contains "news", extracts the article fragment and saves
/// it as an HTML file under <see cref="NewPath"/> in a per-day sub-directory.
/// </summary>
public class AppLine : Page
{
    /// <summary>Set to true to ask the worker loop to exit; it is checked once per polling cycle.</summary>
    public static bool bStop;

    /// <summary>The DataSet used by the most recent polling cycle (reset when the cycle ends).</summary>
    public static DataSet ds;

    /// <summary>URL of the RSS feed to poll.</summary>
    public static string RssUrl;

    /// <summary>Text of the most recently downloaded page, or its extracted fragment once formatted.</summary>
    public static string htmltext;

    /// <summary>Root directory under which the per-day article folders are created.</summary>
    public static string NewPath;

    // One hour between polling cycles.
    private const int PollIntervalMilliseconds = 60 * 60 * 1000;

    // Markers that delimit the article body inside a downloaded page.
    private const string ArticleStartMarker = "<span id=\"digest\">";
    private const string ArticleEndMarker = "<!-- 分页 -->";

    // The same encoding is used for reading and writing so the result does not
    // depend on the server's default ANSI code page.
    private static readonly Encoding PageEncoding = Encoding.GetEncoding("GB2312");

    // Non-greedy so that several tags on one line are matched individually
    // instead of swallowing the text between the first and the last tag.
    private static readonly Regex HeadingPattern =
        new Regex("(<h1 id=\"endTitle\">.*?</h1>)", RegexOptions.IgnoreCase);

    private static readonly Regex AnchorPattern =
        new Regex("(<a href.*?</a>)", RegexOptions.IgnoreCase);

    private static readonly char[] InvalidFileNameChars = Path.GetInvalidFileNameChars();

    /// <summary>
    /// Starts the collector thread. <see cref="RssUrl"/> and <see cref="NewPath"/>
    /// must be assigned before an instance is created.
    /// </summary>
    public AppLine()
    {
        Thread worker = new Thread(new ThreadStart(ThreadProc));

        // A background thread does not keep the host process alive on shutdown.
        worker.IsBackground = true;
        worker.Start();
    }

    /// <summary>
    /// Worker loop: runs one collection cycle, sleeps for the polling interval,
    /// and repeats until <see cref="bStop"/> is set.
    /// </summary>
    public void ThreadProc()
    {
        while (!bStop)
        {
            try
            {
                CollectOnce();
            }
            catch (Exception ex)
            {
                // A transient failure (feed unreachable, malformed XML, ...) must
                // neither end the collector for good nor escape the thread, so it
                // is logged and the next cycle retries.
                System.Diagnostics.Trace.TraceError("AppLine: RSS collection cycle failed: " + ex);
            }

            Thread.Sleep(PollIntervalMilliseconds);
        }
    }

    /// <summary>Reads the feed once and saves every matching article that is not stored yet.</summary>
    private void CollectOnce()
    {
        DataSet feed = new DataSet();
        ds = feed;

        try
        {
            feed.ReadXml(RssUrl);

            // A feed without items produces no "item" table (or lacks the columns).
            DataTable items = feed.Tables["item"];
            if (items == null || !items.Columns.Contains("link") || !items.Columns.Contains("title"))
            {
                return;
            }

            foreach (DataRow row in items.Rows)
            {
                string link = row["link"].ToString();
                if (link.IndexOf("news", StringComparison.Ordinal) == -1)
                {
                    continue;
                }

                try
                {
                    Get_New_163(link, row["title"].ToString());
                }
                catch (Exception ex)
                {
                    // One article that cannot be fetched or written should not
                    // prevent the remaining items from being collected.
                    System.Diagnostics.Trace.TraceError("AppLine: failed to collect " + link + ": " + ex);
                }
            }
        }
        finally
        {
            feed.Reset();
        }
    }

    /// <summary>
    /// Downloads one article page and stores its extracted fragment as
    /// "&lt;NewPath&gt;/&lt;yyyyMMdd&gt;/&lt;title&gt;.html". Does nothing when the
    /// file already exists or the page does not contain the expected markers.
    /// </summary>
    /// <param name="this_url">URL of the article page.</param>
    /// <param name="title">Article title; used (sanitised) as the file name.</param>
    private void Get_New_163(string this_url, string title)
    {
        string directoryName = Path.Combine(NewPath, DateTime.Now.ToString("yyyyMMdd"));
        string fileName = Path.Combine(directoryName, SanitizeFileName(title) + ".html");

        // CreateDirectory is a no-op when the directory already exists.
        Directory.CreateDirectory(directoryName);

        if (File.Exists(fileName))
        {
            return;
        }

        string page = DownloadPage(this_url);
        htmltext = page;

        string article = ExtractArticle(page);
        if (article == null)
        {
            return;
        }

        htmltext = article;
        File.WriteAllText(fileName, article, PageEncoding);
    }

    /// <summary>Removes every character that is not allowed in a file name.</summary>
    private static string SanitizeFileName(string title)
    {
        StringBuilder cleaned = new StringBuilder(title.Length);
        foreach (char c in title)
        {
            if (Array.IndexOf(InvalidFileNameChars, c) < 0)
            {
                cleaned.Append(c);
            }
        }

        return cleaned.ToString();
    }

    /// <summary>Fetches a page and returns its text, releasing the response and streams afterwards.</summary>
    private static string DownloadPage(string url)
    {
        WebRequest request = WebRequest.Create(url);
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, PageEncoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Builds the stored fragment: the last title heading found in the page
    /// followed by the text between the digest marker and the pagination
    /// marker, with anchor elements removed.
    /// </summary>
    /// <returns>The fragment, or null when either marker is missing.</returns>
    private static string ExtractArticle(string page)
    {
        int start = page.IndexOf(ArticleStartMarker, StringComparison.Ordinal);
        if (start == -1)
        {
            return null;
        }

        // Searching from the start marker guarantees end >= start, so the
        // Substring call below cannot receive a negative length.
        int end = page.IndexOf(ArticleEndMarker, start, StringComparison.Ordinal);
        if (end == -1)
        {
            return null;
        }

        string heading = "";
        foreach (Match match in HeadingPattern.Matches(page))
        {
            heading = match.Value;
        }

        string fragment = heading + page.Substring(start, end - start);
        return AnchorPattern.Replace(fragment, "");
    }
}
实例下载地址:http://d.download.csdn.net/down/454200/sunchaohuang
相关文章推荐
- Python多篇新闻自动采集
- 新闻自动采集系统
- 个性新闻自动采集系统
- 分享C#源代码,自动排课系统 V1.1(源码), 新闻自动采集系统(源码)
- Java基础系列6:计时器Timer与新闻的定时自动采集
- 求助,怎么实现新闻自动采集并更新内容
- 分享C#源代码:自动排课,新闻自动采集等
- Python多篇新闻自动采集
- 自动采集程序
- php curl自动采集远程服务器图片方法
- asp微信公众号自动回复开发案列之新闻查询机器人
- 发现一款强大的自动采集软件myspider
- form表单数据的自动采集的js方法
- DEDE自动采集插件演示版发布
- Wordpress下的自动采集爬虫插件
- 模拟HTTP请求实现网页自动操作及数据采集的方法
- 【开源】开发者新闻APP+新闻Restful服务+博客园新闻采集程序+infoq新闻采集程序+36kr新闻采集程序+oschina新闻采集程序+51cto新闻采集程序+csdn新闻采集程序
- TF-IDF与余弦相似性的应用——完成新闻的相似性检测和自动摘要
- web编程之内容自动采集器模块(PHP+Mysql)
- php采集csdn首页新闻