C#【爬虫】明星 微博+贴吧+格式化+发送
2016-05-21 17:32
525 查看
using ResourceLib;
using ResourceLib.ResourceService;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using VCard;
using System.Xml.Linq;
using System.Threading;

namespace ConsoleApplication7
{
    /// <summary>Latest Weibo post scraped for one star.</summary>
    class STAR_MSG
    {
        public DateTime time { get; set; }      // publication time of the post
        public string msg { get; set; }         // text of the post
        public string img_list { get; set; }    // ';'-terminated list of image URLs
    }

    /// <summary>Latest Tieba thread scraped for one star.</summary>
    class SPOT_MSG
    {
        public DateTime time { get; set; }      // publication time of the thread's first post
        public string msg { get; set; }         // text of the first post
        public string img_list { get; set; }    // ';'-terminated list of image URLs
    }

    /// <summary>Remembers the timestamp of the last Weibo post forwarded for a star.</summary>
    class STRUCT_STAR
    {
        public string name { get; set; }
        public DateTime time { get; set; }
    }

    /// <summary>Remembers the timestamp of the last Tieba thread forwarded for a star.</summary>
    class STRUCT_SPOT
    {
        public string name { get; set; }
        public DateTime time { get; set; }
    }

    /// <summary>
    /// Periodically scrapes the Weibo page and the Tieba forum of every star listed in
    /// GroupIDToUrl.xml, and posts the newest entry of each to the backend when its
    /// timestamp differs from the one remembered from the previous cycle.
    /// </summary>
    class Program
    {
        const string ConfigFile = "../../GroupIDToUrl.xml";
        const string StarStateFile = "../../star.txt";
        const string SpotStateFile = "../../spot.txt";

        // Fixed, culture-independent format used when persisting timestamps.
        const string StateTimeFormat = "yyyy-MM-dd HH:mm:ss";

        // Last forwarded timestamps, used to avoid posting the same entry twice.
        static List<STRUCT_STAR> lstar = new List<STRUCT_STAR>();
        static List<STRUCT_SPOT> lspot = new List<STRUCT_SPOT>();

        // NOTE(review): these are captured browser session cookies hard-coded in source.
        // They expire and arguably belong in configuration; values are kept unchanged here.
        static readonly string[,] WeiboCookies = new string[,]
        {
            { "SINAGLOBAL", "2363872622988.9253.1462504935307" },
            { "UOR", "www.php100.com%2Cwidget.weibo.com%2Cbbs.miercn.com" },
            { "SUB", "_2AkMgcMoRf8NhqwJRmP0Xy2viaYV3zAHEiebDAHzsJxJjHlVJ7T9lqCQ8rzNGKzIiK52m-nxxOM_n1QZ7pw.." },
            { "SUBP", "0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5GY1xBTkV.w-ucMxfmFxY-" },
            { "YF-Page-G0", "b9004652c3bb1711215bacc0d9b6f2b5" },
            { "_s_tentry", "-" },
            { "Apache", "4656317907979.732.1462753760391" },
            { "ULV", "1462753760468:3:3:1:4656317907979.732.1462753760391:1462598305284" },
            { "YF-V5-G0", "694581d81c495bd4b6d62b3ba4f9f1c8" },
            { "YF-Ugrow-G0", "1eba44dbebf62c27ae66e16d40e02964" },
            { "WBtopGlobal_register_version", "60539f809b40ed0d" }
        };

        static readonly string[,] TiebaCookies = new string[,]
        {
            { "TIEBA_USERTYPE", "3b218806e82b3aa0fd2ae7dc" },
            { "bdshare_firstime", "1461572415344" },
            { "TIEBAUID", "ab202a741c5a937664dafb46" },
            { "BAIDUID", "AA783FFD52E477A7C8D25BBBFA49313D:FG=1" },
            { "PSTM", "1462440736" },
            { "BIDUPSID", "878F94DD4B53B4C29C2452AC7256AD78" },
            { "LONGID", "1759463301" },
            { "H_PS_PSSID", "19636_20023_19685_1436_12897_17948_19570_19805_19558_19808_19842_17001_15294_11963" },
            { "wise_device", "0" }
        };

        // Same as TiebaCookies plus the extra cookie sent when opening a single thread.
        static readonly string[,] TieziCookies = new string[,]
        {
            { "userFromPsNeedShowTab", "1" },
            { "TIEBA_USERTYPE", "3b218806e82b3aa0fd2ae7dc" },
            { "bdshare_firstime", "1461572415344" },
            { "TIEBAUID", "ab202a741c5a937664dafb46" },
            { "BAIDUID", "AA783FFD52E477A7C8D25BBBFA49313D:FG=1" },
            { "PSTM", "1462440736" },
            { "BIDUPSID", "878F94DD4B53B4C29C2452AC7256AD78" },
            { "LONGID", "1759463301" },
            { "H_PS_PSSID", "19636_20023_19685_1436_12897_17948_19570_19805_19558_19808_19842_17001_15294_11963" },
            { "wise_device", "0" }
        };

        //=================================================================== shared helpers

        /// <summary>Writes a failure message together with the exception text to the console.</summary>
        static void log_error(string what, Exception ex)
        {
            Console.WriteLine(what + " (" + ex.Message + ")");
        }

        /// <summary>
        /// Performs a GET request with the given Host header and cookies and returns the
        /// response body decoded as UTF-8. Throws on any network error; callers decide how
        /// to report it.
        /// </summary>
        static string download_html(string url, string host, string cookieDomain, string[,] cookies)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.Host = host;
            request.KeepAlive = true;

            CookieContainer jar = new CookieContainer();
            for (int i = 0; i < cookies.GetLength(0); i++)
            {
                jar.Add(new Cookie(cookies[i, 0], cookies[i, 1], "/", cookieDomain));
            }
            request.CookieContainer = jar;
            request.ContentType = "application/x-www-form-urlencoded";

            // 'using' guarantees the response and reader are released even if reading fails.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Uploads every image URL through ResourceHelper and returns the resulting paths as
        /// a ';'-terminated list. When an upload is rejected or throws, the original URL is
        /// used for that entry instead.
        /// </summary>
        static string upload_images(IEnumerable<string> urls)
        {
            StringBuilder joined = new StringBuilder();
            foreach (string url in urls)
            {
                // Fallback entry; the original code forgot the ';' here, which glued
                // consecutive URLs together.
                string entry = url + ";";
                try
                {
                    UploadResourceInfoUrl uinfo = new UploadResourceInfoUrl();
                    uinfo.Authority = "xunjian.o2o.com.cn";
                    uinfo.Domain = ".o2o.com.cn";
                    uinfo.FileName = "tu.jpg";
                    uinfo.ResourceUrl = url;
                    uinfo.ResourceLength = 0;
                    uinfo.ResourceType = 0;
                    var r = ResourceHelper.UploadResource(uinfo);
                    if (r.IsOK)
                    {
                        entry = r.FilePath + ";";
                    }
                }
                catch (Exception ex)
                {
                    log_error("图片上传出错", ex);
                }
                joined.Append(entry);
            }
            return joined.ToString();
        }

        /// <summary>
        /// Builds the form body sent to the backend.
        /// </summary>
        /// <remarks>
        /// NOTE(review): values are concatenated without JSON or URL escaping, exactly as
        /// before. Scraped text containing '"', '&amp;', '+' or '%' may corrupt the payload;
        /// the format is left untouched because the server-side parser is not visible here.
        /// </remarks>
        static string build_postdata(string id, string content, string images, string groupid, DateTime time, int type)
        {
            return "info={id:\"" + id + "\",content:\"" + content + "\",image:\"" + images
                + "\",groupid:" + groupid + ",datetime:\"" + time + "\",type:" + type + "}";
        }

        //=================================================================== Weibo

        /// <summary>
        /// Extracts the n-th post (1-based, counted by occurrences of the post header marker)
        /// from a Weibo page.
        /// </summary>
        /// <param name="s">Raw page source as returned by get_weibo_html.</param>
        /// <param name="n">Index of the post to extract.</param>
        /// <returns>
        /// A STAR_MSG whose fields are filled as far as parsing succeeded. A field whose
        /// parsing fails keeps its default (time = DateTime.MinValue, msg = null).
        /// </returns>
        static STAR_MSG get_star_msg(string s, int n)
        {
            STAR_MSG star = new STAR_MSG();
            string str = s;

            // --- publication time
            try
            {
                string date = Regex.Split(str, "WB_from S_txt2")[n];
                date = Regex.Split(date, "WB_text W_f14")[0];
                date = Regex.Split(date, "S_txt2")[1];
                date = Regex.Split(date, ">")[1].Trim();
                date = Regex.Split(date, "<")[0].Trim();

                if (Regex.IsMatch(date, "月"))
                {
                    // Dates of the current year are shown as "M月d日 HH:mm" without a year.
                    string y = DateTime.Now.Year.ToString();
                    string m = Regex.Split(date, "月")[0];
                    string d = Regex.Split(date, "日")[0];
                    d = Regex.Split(d, "月")[1];
                    string hm = Regex.Split(date, " ")[1];
                    DateTime parsed = DateTime.Parse(y + "-" + m + "-" + d + " " + hm, CultureInfo.InvariantCulture);

                    // Around New Year a year-less date can land in the future; it then
                    // belongs to the previous year.
                    if (parsed > DateTime.Now.AddDays(1))
                    {
                        parsed = parsed.AddYears(-1);
                    }
                    star.time = parsed;
                }
                else
                {
                    star.time = DateTime.Parse(date, CultureInfo.InvariantCulture);
                }
                Console.WriteLine("date成功");
            }
            catch (Exception ex)
            {
                log_error("时间解析出错", ex);
            }

            // --- text
            try
            {
                string msg = Regex.Split(str, "WB_text W_f14[\\s\\S]{10,200}\">\\\\n")[n].Trim();
                msg = Regex.Split(msg, @"<\\/div>")[0].Trim();
                msg = Regex.Replace(msg, @"<a[\s\S]{10,450}/a>", "").Trim();
                msg = Regex.Replace(msg, @"<img[\s\S]{100,300}>", "");

                // NOTE(review): the published source showed Replace(" ", " "), a no-op that
                // looks like a decoded "&nbsp;". Non-breaking spaces are normalised here.
                msg = msg.Replace("&nbsp;", " ").Replace("\u00A0", " ").Trim();

                star.msg = msg;
                Console.WriteLine("msg成功");
            }
            catch (Exception ex)
            {
                log_error("消息解析出错", ex);
            }

            // --- images
            try
            {
                string ct = Regex.Split(str, "<!--判断图片的个数,渲染图片-->")[n];
                ct = Regex.Split(ct, @"<\\/div>")[0].Trim();

                List<string> picURL = new List<string>();
                foreach (Match hit in Regex.Matches(ct, @"http[\s\S]{50,80}.jpg"))
                {
                    // URLs are embedded in a script string with "\/" escapes.
                    picURL.Add(hit.Value.Replace(@"\", ""));
                }
                star.img_list = upload_images(picURL);
                Console.WriteLine("img_list成功");
            }
            catch (Exception ex)
            {
                log_error("图片解析出错", ex);
            }

            return star;
        }

        /// <summary>Downloads a Weibo page; returns "" when the request fails.</summary>
        static string get_weibo_html(string url)
        {
            try
            {
                return download_html(url, "weibo.com", ".weibo.com", WeiboCookies);
            }
            catch (Exception ex)
            {
                log_error("获取数据出错", ex);
                return "";
            }
        }

        //=================================================================== Tieba

        /// <summary>Downloads a Tieba forum index page; returns "" when the request fails.</summary>
        static string get_tieba_html(string url)
        {
            try
            {
                return download_html(url, "tieba.baidu.com", ".tieba.baidu.com", TiebaCookies);
            }
            catch (Exception ex)
            {
                log_error("获取明星贴吧html页信息出错!", ex);
                return "";
            }
        }

        /// <summary>
        /// Extracts the thread URLs from a Tieba forum index page. Entries whose id cannot
        /// be located are skipped.
        /// </summary>
        static List<string> GetURL(string ss)
        {
            List<string> urllist = new List<string>();
            try
            {
                foreach (string chunk in Regex.Split(ss, " j_thread_list clearfix").Skip(1))
                {
                    // The thread id is the value of the first "key:value" pair after the marker.
                    string[] firstField = chunk.Split(',')[0].Split(':');
                    if (firstField.Length > 1)
                    {
                        urllist.Add("http://tieba.baidu.com/p/" + firstField[1]);
                    }
                }
            }
            catch (Exception ex)
            {
                log_error("解析获取每个贴吧的所有帖子url出错!", ex);
            }
            return urllist;
        }

        /// <summary>Downloads a single Tieba thread page; returns "" when the request fails.</summary>
        static string get_tiezi_html(string url)
        {
            try
            {
                return download_html(url, "tieba.baidu.com", ".tieba.baidu.com", TieziCookies);
            }
            catch (Exception ex)
            {
                log_error("获取帖子html信息失败!", ex);
                return "";
            }
        }

        /// <summary>
        /// Extracts the publication time of the first post of a thread page.
        /// </summary>
        /// <returns>
        /// The parsed time, or DateTime.MinValue when it cannot be parsed. (Previously
        /// DateTime.Now was returned, which made every broken page look like the newest.)
        /// </returns>
        static DateTime get_time(string ss)
        {
            try
            {
                string s = Regex.Split(ss, "l_post j_l_post l_post_bright noborder ")[1];

                // The post metadata is HTML-attribute-encoded JSON, so quotes appear as &quot;.
                // NOTE(review): the published source had this literal mangled to "date":"";
                // restored from the subsequent split on '&'.
                s = Regex.Split(s, "date&quot;:&quot;")[1];
                s = Regex.Split(s, "&")[0].Trim();
                return DateTime.Parse(s, CultureInfo.InvariantCulture);
            }
            catch (Exception ex)
            {
                log_error("正则解析获取时间出错!", ex);
                return DateTime.MinValue;
            }
        }

        /// <summary>
        /// Extracts the text of the first post of a thread page, with images and links
        /// stripped and &lt;br&gt; turned into the two-character sequence \n.
        /// Returns "" when the page cannot be parsed (previously the whole page was returned).
        /// </summary>
        static string get_msg(string ss)
        {
            try
            {
                string s = Regex.Split(ss, "d_post_content j_d_post_content clearfix\">")[1];
                s = Regex.Split(s, "</div>")[0].Trim().Replace("<br>", "\\n");
                s = Regex.Replace(s, "<img[\\S\\s]{50,280}>", "");
                s = Regex.Replace(s, "<a[\\S\\s]{50,300}\">", "");
                s = Regex.Replace(s, "</a>", "");
                return s;
            }
            catch (Exception ex)
            {
                log_error("正则解析获取msg出错!", ex);
                return "";
            }
        }

        /// <summary>
        /// Extracts the images of the first post of a thread page, uploads them and returns
        /// the ';'-terminated list of resulting URLs ("" when none or on parse failure).
        /// </summary>
        static string get_img_list(string ss)
        {
            try
            {
                string s = Regex.Split(ss, "d_post_content j_d_post_content clearfix\">")[1];
                s = Regex.Split(s, "</div>")[0].Trim().Replace("<br>", "\\n");

                List<string> img_list = new List<string>();
                foreach (Match hit in Regex.Matches(s, "http://imgsrc.baidu.com[\\S\\s]{50,240}g\""))
                {
                    // The pattern includes the closing quote of the src attribute; drop it.
                    img_list.Add(hit.Value.Replace("\"", ""));
                }
                return upload_images(img_list);
            }
            catch (Exception ex)
            {
                log_error("正则解析获取img_list出错!", ex);
                return "";
            }
        }

        //=================================================================== backend post

        /// <summary>
        /// Posts the given form body to the backend and returns the response text,
        /// or "" when the request fails.
        /// </summary>
        static string star_getstatus(string postdata)
        {
            string s = "";
            try
            {
                using (MyWebClient client = new MyWebClient())
                {
                    client.Encoding = Encoding.UTF8;
                    client.Headers.Add("Content-Type: application/x-www-form-urlencoded");
                    byte[] payload = Encoding.UTF8.GetBytes(postdata);
                    byte[] reply = client.UploadData("http://wkfind.o2o.com.cn:8080/api/webapi/addfoot", payload);
                    s = Encoding.UTF8.GetString(reply);
                }
            }
            catch (Exception ex)
            {
                log_error("post姜志远失败!", ex);
            }
            return s;
        }

        //=================================================================== de-duplication

        /// <summary>
        /// Returns true when the Weibo timestamp differs from the one remembered for
        /// <paramref name="name"/> (or none is remembered yet) and remembers the new one;
        /// returns false when it is unchanged.
        /// </summary>
        static bool check1(string name, DateTime time)
        {
            foreach (STRUCT_STAR known in lstar)
            {
                if (!name.Equals(known.name))
                {
                    continue;
                }
                if (DateTime.Compare(known.time, time) == 0)
                {
                    return false;
                }
                known.time = time;
                return true;
            }
            lstar.Add(new STRUCT_STAR { name = name, time = time });
            return true;
        }

        /// <summary>
        /// Returns true when the Tieba timestamp differs from the one remembered for
        /// <paramref name="name"/> (or none is remembered yet) and remembers the new one;
        /// returns false when it is unchanged.
        /// </summary>
        static bool check2(string name, DateTime time)
        {
            foreach (STRUCT_SPOT known in lspot)
            {
                if (!name.Equals(known.name))
                {
                    continue;
                }
                if (DateTime.Compare(known.time, time) == 0)
                {
                    return false;
                }
                known.time = time;
                return true;
            }
            lspot.Add(new STRUCT_SPOT { name = name, time = time });
            return true;
        }

        //=================================================================== main cycle

        /// <summary>Scrapes and forwards the newest Weibo post and Tieba thread of one configured star.</summary>
        static void process_item(XElement item)
        {
            string name = item.Element("name").Value;
            string groupid = item.Element("groupid").Value;
            string id = item.Element("id").Value;
            string weibourl = item.Element("weibourl").Value;
            string tiebaurl = item.Element("tiebaurl").Value;
            string status = "";
            string status2 = "";

            // ---- Weibo
            STAR_MSG star_msg = get_star_msg(get_weibo_html(weibourl), 1);

            // An undated result means the fetch or the parse failed. Feeding it to check1
            // would overwrite the remembered timestamp and cause a duplicate post later.
            if (star_msg.time != DateTime.MinValue && check1(name, star_msg.time))
            {
                status = star_getstatus(build_postdata(id, star_msg.msg, star_msg.img_list, groupid, star_msg.time, 3));
            }

            // ---- Tieba: pick the thread on the first index page with the newest first post.
            // Only the best candidate is kept in memory instead of every downloaded page.
            DateTime time_tmp = DateTime.MinValue;
            string res2 = "";
            foreach (string _url in GetURL(get_tieba_html(tiebaurl)))
            {
                string html = get_tiezi_html(_url);
                Console.WriteLine("一个帖子已找到");
                DateTime dt = get_time(html);
                if (DateTime.Compare(time_tmp, dt) < 0)
                {
                    time_tmp = dt;
                    res2 = html;
                }
            }
            Console.WriteLine("所有帖子的html信息获取完毕");
            Console.WriteLine("选出最大时间 以及对应的帖子");

            // res2 stays empty when no thread could be fetched and dated; nothing to post then.
            if (res2.Length > 0)
            {
                SPOT_MSG no1 = new SPOT_MSG();
                no1.time = time_tmp;
                no1.msg = get_msg(res2);
                no1.img_list = get_img_list(res2);
                if (check2(name, no1.time))
                {
                    status2 = star_getstatus(build_postdata(id, no1.msg, no1.img_list, groupid, no1.time, 1));
                }
            }

            Console.WriteLine(status);
            Console.WriteLine(status2);
            Console.WriteLine("==================================================" + name);
        }

        /// <summary>Writes the remembered timestamps to the two state files as "name$time" lines.</summary>
        static void save_state()
        {
            using (StreamWriter sw = new StreamWriter(StarStateFile, false))
            {
                foreach (STRUCT_STAR s in lstar)
                {
                    sw.WriteLine(s.name + "$" + s.time.ToString(StateTimeFormat, CultureInfo.InvariantCulture));
                }
            }
            using (StreamWriter sw = new StreamWriter(SpotStateFile, false))
            {
                foreach (STRUCT_SPOT s in lspot)
                {
                    sw.WriteLine(s.name + "$" + s.time.ToString(StateTimeFormat, CultureInfo.InvariantCulture));
                }
            }
        }

        /// <summary>
        /// Runs one full cycle: processes every item of the configuration file, then
        /// persists the remembered timestamps. A failure on one item does not stop the
        /// others, and state is saved even when some items failed.
        /// </summary>
        static void run()
        {
            try
            {
                XDocument doc = XDocument.Load(ConfigFile);
                foreach (XElement item in doc.Element("root").Elements("item").ToList())
                {
                    try
                    {
                        process_item(item);
                    }
                    catch (Exception ex)
                    {
                        log_error("处理单个明星出错", ex);
                    }
                }
            }
            catch (Exception ex)
            {
                log_error("多线程运行失败", ex);
            }

            try
            {
                save_state();
            }
            catch (Exception ex)
            {
                log_error("保存状态文件出错", ex);
            }
        }

        //=================================================================== state files

        /// <summary>
        /// Reads a UTF-8 text file and returns its lines. Returns an empty list when the
        /// file does not exist (expected on first run) or cannot be read.
        /// </summary>
        static List<string> txt_to_list(string path)
        {
            List<string> l = new List<string>();
            if (!File.Exists(path))
            {
                return l;
            }
            try
            {
                using (StreamReader sr = new StreamReader(path, Encoding.UTF8))
                {
                    string s;
                    while ((s = sr.ReadLine()) != null)
                    {
                        l.Add(s);
                    }
                }
            }
            catch (Exception ex)
            {
                log_error("读取文件出错", ex);
            }
            return l;
        }

        /// <summary>
        /// Parses one "name$time" state line. Accepts the fixed format written by
        /// save_state as well as culture-formatted timestamps written by older versions.
        /// Returns false for malformed lines.
        /// </summary>
        static bool try_parse_state_line(string line, out string name, out DateTime time)
        {
            name = null;
            time = DateTime.MinValue;

            // Split on the last '$' so that a '$' inside the name does not break parsing.
            int cut = line.LastIndexOf('$');
            if (cut < 0)
            {
                return false;
            }
            name = line.Substring(0, cut);
            string stamp = line.Substring(cut + 1);

            return DateTime.TryParseExact(stamp, StateTimeFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out time)
                || DateTime.TryParse(stamp, out time)
                || DateTime.TryParse(stamp, CultureInfo.InvariantCulture, DateTimeStyles.None, out time);
        }

        /// <summary>Loads the remembered timestamps from the state files, skipping malformed lines.</summary>
        static void load_txt()
        {
            string name;
            DateTime time;

            foreach (string line in txt_to_list(StarStateFile))
            {
                if (try_parse_state_line(line, out name, out time))
                {
                    lstar.Add(new STRUCT_STAR { name = name, time = time });
                }
            }
            foreach (string line in txt_to_list(SpotStateFile))
            {
                if (try_parse_state_line(line, out name, out time))
                {
                    lspot.Add(new STRUCT_SPOT { name = name, time = time });
                }
            }
        }

        //=================================================================== entry point

        /// <summary>Loads the persisted state, then runs one scraping cycle every four hours, forever.</summary>
        static void Main(string[] args)
        {
            load_txt();
            while (true)
            {
                run();
                Thread.Sleep(TimeSpan.FromHours(4));
            }
        }
    }
}
相关文章推荐
- Python3写爬虫(四)多线程实现数据爬取
- Scrapy的架构介绍
- 爬虫笔记
- c#调用COM组件
- C#实现把指定数据写入串口
- C#动态创建button的方法
- C#中抽象方法与虚拟方法的区别
- c#中虚函数的相关使用方法
- C#实现给图片加水印的方法
- C#使用加边法计算行列式的值
- C#实现多线程的同步方法实例分析
- C#中尾递归的使用、优化及编译器优化
- C#中的delegate委托类型基本学习教程
- C#实现子窗体与父窗体通信方法实例总结
- C#通用邮件发送类分享
- 举例讲解C#中自动实现的属性
- C#中this的用法集锦
- C#数据结构之顺序表(SeqList)实例详解
- C#.NET获取拨号连接的宽带连接方法
- C#异步绑定数据实现方法