您的位置:首页 > 编程语言 > C#

C#【爬虫】明星 微博+贴吧+格式化+发送

2016-05-21 17:32 525 查看
using ResourceLib;
using ResourceLib.ResourceService;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using VCard;
using System.Xml.Linq;
using System.Threading;

namespace ConsoleApplication7
{
/// <summary>
/// One "star update" entry: publish time, text and image list.
/// </summary>
internal class STAR_MSG
{
    /// <summary>Time the update was published.</summary>
    public DateTime time { get; set; }

    /// <summary>Text content of the update.</summary>
    public string msg { get; set; }

    /// <summary>Images contained in the update, held as a single string.</summary>
    public string img_list { get; set; }
}
/// <summary>
/// One "on-the-spot update" entry: publish time, text and image list.
/// </summary>
internal class SPOT_MSG
{
    /// <summary>Time the update was published.</summary>
    public DateTime time { get; set; }

    /// <summary>Text content of the update.</summary>
    public string msg { get; set; }

    /// <summary>Images of the update, held as a single string.</summary>
    public string img_list { get; set; }
}
/// <summary>
/// A (name, time) pair kept per star.
/// </summary>
internal class STRUCT_STAR
{
    /// <summary>Name.</summary>
    public string name { get; set; }

    /// <summary>Time associated with the name.</summary>
    public DateTime time { get; set; }
}
/// <summary>
/// A (name, time) pair kept per on-the-spot feed.
/// </summary>
internal class STRUCT_SPOT
{
    /// <summary>Name.</summary>
    public string name { get; set; }

    /// <summary>Time associated with the name.</summary>
    public DateTime time { get; set; }
}
class Program
{
static List<STRUCT_STAR> lstar = new List<STRUCT_STAR>();
static List<STRUCT_SPOT> lspot = new List<STRUCT_SPOT>();
//===================================================================================记录上次的时间需要去重
/// <summary>
/// Extracts the publish time, the text and the image list of one post from
/// the HTML of a Weibo page and returns them as a <see cref="STAR_MSG"/>.
/// Each of the three parts is parsed in its own try block; when one fails a
/// message is printed and the corresponding property keeps its default value.
/// </summary>
/// <param name="s">Page HTML to parse.</param>
/// <param name="n">
/// Index of the segment to use after splitting the page on each section
/// marker (segment 0 is the text before the first marker).
/// </param>
/// <returns>The populated message object (never null).</returns>
static STAR_MSG get_star_msg(string s, int n)
{
    STAR_MSG star = new STAR_MSG();
    string str = s;
    string date = "";
    string msg = "";
    List<string> picURL = new List<string>();

    Regex rgx = new Regex("WB_from S_txt2");

    // ---- publish time ----
    try
    {
        // Regex.Split returns string[]; the element indexer was missing here,
        // which did not compile. The parameter n was otherwise unused, so it is
        // taken to be the intended index.
        date = rgx.Split(str)[n];
        rgx = new Regex("WB_text W_f14");
        date = rgx.Split(date)[0];
        rgx = new Regex("S_txt2");
        date = rgx.Split(date)[1];
        rgx = new Regex(">");
        date = rgx.Split(date)[1].Trim();
        rgx = new Regex("<");
        date = rgx.Split(date)[0].Trim();
        if ((new Regex("月")).IsMatch(date))
        {
            // Month/day form without a year ("M月d日 HH:mm"): the current year is assumed.
            string y = DateTime.Now.Year.ToString();
            string m = (new Regex("月")).Split(date)[0];
            string d = (new Regex("日")).Split(date)[0];
            d = (new Regex("月")).Split(d)[1];
            string hm = (new Regex(" ")).Split(date)[1];
            star.time = DateTime.Parse(y + "-" + m + "-" + d + " " + hm);
        }
        else
        {
            star.time = DateTime.Parse(date);
        }
        Console.WriteLine("date成功");
    }
    catch
    {
        Console.WriteLine("时间解析出错");
    }

    // ---- text ----
    try
    {
        rgx = new Regex("WB_text W_f14[\\s\\S]{10,200}\">\\\\n");
        msg = rgx.Split(str)[n].Trim();
        rgx = new Regex(@"<\\/div>");
        msg = rgx.Split(msg)[0].Trim();
        // Strip anchors and inline images from the text.
        rgx = new Regex(@"<a[\s\S]{10,450}/a>");
        msg = rgx.Replace(msg, "").Trim();
        rgx = new Regex(@"<img[\s\S]{100,300}>");
        // NOTE(review): as written this Replace swaps a space for a space; an
        // HTML entity or non-breaking space may have been lost — confirm.
        msg = rgx.Replace(msg, "").Replace(" ", " ").Trim();
        star.msg = msg;
        Console.WriteLine("msg成功");
    }
    catch
    {
        Console.WriteLine("消息解析出错");
    }

    // ---- image list ----
    try
    {
        rgx = new Regex("<!--判断图片的个数,渲染图片-->");
        string ct = rgx.Split(str)[n];
        rgx = new Regex(@"<\\/div>");
        ct = rgx.Split(ct)[0].Trim();

        rgx = new Regex(@"http[\s\S]{50,80}.jpg");
        MatchCollection mc = rgx.Matches(ct);
        foreach (var imgstr in mc)
        {
            // Remove the backslashes that escape the URL inside the page source.
            picURL.Add(imgstr.ToString().Replace(@"\", ""));
        }
        foreach (string Url in picURL)
        {
            UploadResourceInfoUrl uinfo = new UploadResourceInfoUrl();
            uinfo.Authority = "xunjian.o2o.com.cn";
            uinfo.Domain = ".o2o.com.cn";
            uinfo.FileName = "tu.jpg";
            uinfo.ResourceUrl = Url;
            uinfo.ResourceLength = 0;
            uinfo.ResourceType = 0;
            var r = ResourceHelper.UploadResource(uinfo);
            if (r.IsOK)
            {
                star.img_list += r.FilePath + ";";
            }
            else
            {
                // Fall back to the source URL; keep the ';' separator so the
                // entry does not fuse with the next one.
                star.img_list += Url + ";";
            }
        }
        Console.WriteLine("img_list成功");
    }
    catch
    {
        Console.WriteLine("图片解析出错");
    }

    return star;
}
/// <summary>
/// Downloads <paramref name="url"/> with a GET request sent to host
/// "weibo.com" using a fixed set of cookies, and returns the body decoded as UTF-8.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>The response body, or an empty string when the request fails.</returns>
static string get_weibo_html(string url)
{
    string s = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.Host = "weibo.com";
        request.KeepAlive = true;
        CookieContainer cn = new CookieContainer();
        cn.Add(new Cookie("SINAGLOBAL", "2363872622988.9253.1462504935307", "/", ".weibo.com"));
        cn.Add(new Cookie("UOR", "www.php100.com%2Cwidget.weibo.com%2Cbbs.miercn.com", "/", ".weibo.com"));
        cn.Add(new Cookie("SUB", "_2AkMgcMoRf8NhqwJRmP0Xy2viaYV3zAHEiebDAHzsJxJjHlVJ7T9lqCQ8rzNGKzIiK52m-nxxOM_n1QZ7pw..", "/", ".weibo.com"));
        cn.Add(new Cookie("SUBP", "0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5GY1xBTkV.w-ucMxfmFxY-", "/", ".weibo.com"));
        cn.Add(new Cookie("YF-Page-G0", "b9004652c3bb1711215bacc0d9b6f2b5", "/", ".weibo.com"));
        cn.Add(new Cookie("_s_tentry", "-", "/", ".weibo.com"));
        cn.Add(new Cookie("Apache", "4656317907979.732.1462753760391", "/", ".weibo.com"));
        cn.Add(new Cookie("ULV", "1462753760468:3:3:1:4656317907979.732.1462753760391:1462598305284", "/", ".weibo.com"));
        cn.Add(new Cookie("YF-V5-G0", "694581d81c495bd4b6d62b3ba4f9f1c8", "/", ".weibo.com"));
        cn.Add(new Cookie("YF-Ugrow-G0", "1eba44dbebf62c27ae66e16d40e02964", "/", ".weibo.com"));
        cn.Add(new Cookie("WBtopGlobal_register_version", "60539f809b40ed0d", "/", ".weibo.com"));
        request.CookieContainer = cn;
        request.ContentType = "application/x-www-form-urlencoded";

        // using blocks release the response and the reader even when reading
        // throws; previously they were only closed on the success path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            s = sr.ReadToEnd();
        }
        // Kept from the original implementation.
        request.Abort();
    }
    catch
    {
        Console.WriteLine("获取数据出错");
    }
    return s;
}
//===================================================================================微博方法
/// <summary>
/// Downloads <paramref name="url"/> with a GET request sent to host
/// "tieba.baidu.com" using a fixed set of cookies, and returns the body decoded as UTF-8.
/// </summary>
/// <param name="url">Address of the forum page to fetch.</param>
/// <returns>The response body, or an empty string when the request fails.</returns>
static string get_tieba_html(string url)
{
    string s = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.Host = "tieba.baidu.com";
        request.KeepAlive = true;
        CookieContainer cn = new CookieContainer();
        cn.Add(new Cookie("TIEBA_USERTYPE", "3b218806e82b3aa0fd2ae7dc", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("bdshare_firstime", "1461572415344", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("TIEBAUID", "ab202a741c5a937664dafb46", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("BAIDUID", "AA783FFD52E477A7C8D25BBBFA49313D:FG=1", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("PSTM", "1462440736", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("BIDUPSID", "878F94DD4B53B4C29C2452AC7256AD78", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("LONGID", "1759463301", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("H_PS_PSSID", "19636_20023_19685_1436_12897_17948_19570_19805_19558_19808_19842_17001_15294_11963", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("wise_device", "0", "/", ".tieba.baidu.com"));
        request.CookieContainer = cn;
        request.ContentType = "application/x-www-form-urlencoded";

        // using blocks release the response and the reader even when reading
        // throws; previously they were only closed on the success path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            s = sr.ReadToEnd();
        }
        // Kept from the original implementation.
        request.Abort();
    }
    catch
    {
        Console.WriteLine("获取明星贴吧html页信息出错!");
    }
    return s;
}
/// <summary>
/// Builds the list of thread URLs found in a forum listing page.
/// The page is cut at every " j_thread_list clearfix" marker; for each piece
/// after the first, the text between the first ':' and the first ',' is
/// appended to "http://tieba.baidu.com/p/".
/// </summary>
/// <param name="ss">HTML of the listing page.</param>
/// <returns>
/// The URLs collected so far; if a piece cannot be parsed a message is printed
/// and the URLs gathered before the failure are returned.
/// </returns>
static List<string> GetURL(string ss)
{
    const string prefix = "http://tieba.baidu.com/p/";
    List<string> urllist = new List<string>();
    try
    {
        string[] pieces = Regex.Split(ss, " j_thread_list clearfix");
        // pieces[0] is the text before the first marker and is ignored.
        for (int i = 1; i < pieces.Length; i++)
        {
            string firstField = pieces[i].Split(',')[0];
            string threadId = firstField.Split(':')[1];
            urllist.Add(prefix + threadId);
        }
    }
    catch
    {
        Console.WriteLine("解析获取每个贴吧的所有帖子url出错!");
    }
    return urllist;
}
/// <summary>
/// Downloads a single thread page: GET <paramref name="url"/> sent to host
/// "tieba.baidu.com" with a fixed set of cookies, body decoded as UTF-8.
/// </summary>
/// <param name="url">Address of the thread page to fetch.</param>
/// <returns>The response body, or an empty string when the request fails.</returns>
static string get_tiezi_html(string url)
{
    string s = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.Host = "tieba.baidu.com";
        request.KeepAlive = true;
        CookieContainer cn = new CookieContainer();
        cn.Add(new Cookie("userFromPsNeedShowTab", "1", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("TIEBA_USERTYPE", "3b218806e82b3aa0fd2ae7dc", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("bdshare_firstime", "1461572415344", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("TIEBAUID", "ab202a741c5a937664dafb46", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("BAIDUID", "AA783FFD52E477A7C8D25BBBFA49313D:FG=1", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("PSTM", "1462440736", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("BIDUPSID", "878F94DD4B53B4C29C2452AC7256AD78", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("LONGID", "1759463301", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("H_PS_PSSID", "19636_20023_19685_1436_12897_17948_19570_19805_19558_19808_19842_17001_15294_11963", "/", ".tieba.baidu.com"));
        cn.Add(new Cookie("wise_device", "0", "/", ".tieba.baidu.com"));
        request.CookieContainer = cn;
        request.ContentType = "application/x-www-form-urlencoded";

        // using blocks release the response and the reader even when reading
        // throws; previously they were only closed on the success path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            s = sr.ReadToEnd();
        }
        // Kept from the original implementation.
        request.Abort();
    }
    catch
    {
        Console.WriteLine("获取帖子html信息失败!");
    }
    return s;
}
/// <summary>
/// Reads the post date out of a thread page: takes the text after the first
/// "l_post j_l_post l_post_bright noborder " marker, then the text after the
/// first entity-encoded <c>date":"</c> key, up to the next '&amp;', and parses it.
/// </summary>
/// <param name="ss">HTML of the thread page.</param>
/// <returns>
/// The parsed date; when any step fails a message is printed and the current
/// local time captured at entry is returned instead.
/// </returns>
static DateTime get_time(string ss)
{
    string s = ss;
    // NOTE(review): falling back to "now" makes an unparsable page look like
    // the newest post to callers that pick the maximum time — confirm intended.
    DateTime time = DateTime.Now;
    try
    {
        Regex rgx = new Regex("l_post j_l_post l_post_bright noborder ");
        s = rgx.Split(s)[1];
        // The key is matched in its entity-encoded form (&quot; for '"'). The
        // previous literal had raw quotes, which is not a valid C# string, and
        // the value is cut at the next '&' below.
        rgx = new Regex("date&quot;:&quot;");
        s = rgx.Split(s)[1];
        rgx = new Regex("&");
        s = rgx.Split(s)[0].Trim();
        time = DateTime.Parse(s);
    }
    catch
    {
        Console.WriteLine("正则解析获取时间出错!");
    }
    return time;
}
/// <summary>
/// Extracts the text of the first post body from a thread page: the content
/// between the post-content marker and the next "&lt;/div&gt;", trimmed, with
/// "&lt;br&gt;" turned into the two characters backslash + 'n' and image and
/// anchor tags removed.
/// </summary>
/// <param name="ss">HTML of the thread page.</param>
/// <returns>
/// The cleaned text; if the marker is not found a message is printed and the
/// input is returned as it stood when the failure occurred.
/// </returns>
static string get_msg(string ss)
{
    string text = ss;
    try
    {
        text = Regex.Split(text, "d_post_content j_d_post_content  clearfix\">")[1];
        text = Regex.Split(text, "</div>")[0].Trim().Replace("<br>", "\\n");

        // Tags are stripped in this fixed order: images, anchor openers, anchor closers.
        string[] tagPatterns =
        {
            "<img[\\S\\s]{50,280}>",
            "<a[\\S\\s]{50,300}\">",
            "</a>"
        };
        foreach (string pattern in tagPatterns)
        {
            text = Regex.Replace(text, pattern, "");
        }
    }
    catch
    {
        Console.WriteLine("正则解析获取msg出错!");
    }
    return text;
}
/// <summary>
/// Collects the image URLs of the first post body of a thread page, passes
/// each to <c>ResourceHelper.UploadResource</c>, and returns the resulting
/// paths joined into one ';'-terminated list.
/// </summary>
/// <param name="ss">HTML of the thread page.</param>
/// <returns>
/// One entry per image, each followed by ';': the uploaded file path when the
/// upload reports success, otherwise the original image URL. When parsing
/// fails a message is printed and whatever was collected so far is returned.
/// </returns>
static string get_img_list(string ss)
{
    string needurl = "";
    List<string> img_list = new List<string>();
    string s = ss;
    try
    {
        Regex rgx = new Regex("d_post_content j_d_post_content  clearfix\">");
        s = rgx.Split(s)[1];
        rgx = new Regex("</div>");
        s = rgx.Split(s)[0].Trim().Replace("<br>", "\\n");

        rgx = new Regex("http://imgsrc.baidu.com[\\S\\s]{50,240}g\"");
        foreach (Match m in rgx.Matches(s))
        {
            // Drop the quote characters captured with the URL.
            img_list.Add(m.Value.Replace("\"", ""));
        }

        foreach (string Url in img_list)
        {
            UploadResourceInfoUrl uinfo = new UploadResourceInfoUrl();
            uinfo.Authority = "xunjian.o2o.com.cn";
            uinfo.Domain = ".o2o.com.cn";
            uinfo.FileName = "tu.jpg";
            uinfo.ResourceUrl = Url;
            uinfo.ResourceLength = 0;
            uinfo.ResourceType = 0;
            var r = ResourceHelper.UploadResource(uinfo);
            if (r.IsOK)
            {
                needurl += r.FilePath + ";";
            }
            else
            {
                // Fall back to the source URL; keep the ';' separator so the
                // entry does not fuse with the next one.
                needurl += Url + ";";
            }
        }
    }
    catch
    {
        Console.WriteLine("正则解析获取img_list出错!");
    }

    return needurl;
}
//===================================================================================贴吧方法
/// <summary>
/// POSTs <paramref name="postdata"/> (UTF-8 bytes, form-urlencoded content
/// type) to the fixed "addfoot" endpoint and returns the reply decoded as UTF-8.
/// </summary>
/// <param name="postdata">Request body to send.</param>
/// <returns>The reply text, or an empty string when the request fails.</returns>
static string star_getstatus(string postdata)
{
    const string endpoint = "http://wkfind.o2o.com.cn:8080/api/webapi/addfoot";
    string reply = "";
    try
    {
        // MyWebClient is declared outside this file — presumably a WebClient
        // subclass; verify against its definition.
        using (MyWebClient http = new MyWebClient())
        {
            http.Encoding = Encoding.UTF8;
            http.Headers.Add("Content-Type: application/x-www-form-urlencoded");
            byte[] payload = Encoding.UTF8.GetBytes(postdata);
            byte[] answer = http.UploadData(endpoint, payload);
            reply = Encoding.UTF8.GetString(answer);
        }
    }
    catch
    {
        Console.WriteLine("post姜志远失败!");
    }
    return reply;
}
//===================================================================================post返回结果
/// <summary>
/// Records <paramref name="time"/> for <paramref name="name"/> in the star
/// list and reports whether it differs from what was stored.
/// </summary>
/// <param name="name">Name to look up.</param>
/// <param name="time">Time to compare against the stored one.</param>
/// <returns>
/// <c>false</c> when the first entry with this name already holds exactly this
/// time; otherwise <c>true</c>, after updating that entry or adding a new one.
/// </returns>
static bool check1(string name, DateTime time)
{
    foreach (STRUCT_STAR entry in lstar)
    {
        if (!name.Equals(entry.name))
        {
            continue;
        }

        // Only the first entry with a matching name is ever considered.
        if (entry.time == time)
        {
            return false;
        }
        entry.time = time;
        return true;
    }

    lstar.Add(new STRUCT_STAR { name = name, time = time });
    return true;
}
/// <summary>
/// Records <paramref name="time"/> for <paramref name="name"/> in the spot
/// list and reports whether it differs from what was stored.
/// </summary>
/// <param name="name">Name to look up.</param>
/// <param name="time">Time to compare against the stored one.</param>
/// <returns>
/// <c>false</c> when the first entry with this name already holds exactly this
/// time; otherwise <c>true</c>, after updating that entry or adding a new one.
/// </returns>
static bool check2(string name, DateTime time)
{
    foreach (STRUCT_SPOT entry in lspot)
    {
        if (!name.Equals(entry.name))
        {
            continue;
        }

        // Only the first entry with a matching name is ever considered.
        if (entry.time == time)
        {
            return false;
        }
        entry.time = time;
        return true;
    }

    lspot.Add(new STRUCT_SPOT { name = name, time = time });
    return true;
}
/// <summary>
/// Builds the request body sent for one update.
/// </summary>
// NOTE(review): values are concatenated without any escaping or URL-encoding;
// text containing '"' or '&' will alter the body — confirm the server contract.
private static string build_postdata(string id, string content, string images, string groupid, DateTime time, int type)
{
    return "info={id:\"" + id + "\",content:\"" + content + "\",image:\"" + images + "\",groupid:" + groupid + ",datetime:\"" + time + "\",type:" + type + "}";
}

/// <summary>
/// Performs one full pass over the items of "../../GroupIDToUrl.xml": for each
/// item the Weibo page is fetched and its post sent (type 3) if check1 reports
/// a change, then every thread of the forum page is fetched, the one with the
/// latest time is selected and sent (type 1) if check2 reports a change.
/// After the loop both lists are written to "../../star.txt" and
/// "../../spot.txt" as "name$time" lines.
/// </summary>
/// <remarks>
/// The XML file is loaded outside the try block, so a load failure propagates
/// to the caller. Any exception inside the loop ends the pass and skips the
/// two file writes.
/// </remarks>
static void run()
{
    XDocument config = XDocument.Load("../../GroupIDToUrl.xml");
    List<XElement> entries = config.Element("root").Elements("item").ToList();
    try
    {
        foreach (XElement entry in entries)
        {
            string name = entry.Element("name").Value;
            string groupid = entry.Element("groupid").Value;
            string id = entry.Element("id").Value;
            string weibourl = entry.Element("weibourl").Value;
            string tiebaurl = entry.Element("tiebaurl").Value;
            string weiboReply = "";
            string tiebaReply = "";

            // ---- Weibo ----
            STAR_MSG starPost = get_star_msg(get_weibo_html(weibourl), 1);
            if (check1(name, starPost.time))
            {
                weiboReply = star_getstatus(build_postdata(id, starPost.msg, starPost.img_list, groupid, starPost.time, 3));
            }

            // ---- Tieba: download every thread listed on the forum page ----
            List<string> threadUrls = GetURL(get_tieba_html(tiebaurl));
            List<string> threadPages = new List<string>();
            foreach (string threadUrl in threadUrls)
            {
                threadPages.Add(get_tiezi_html(threadUrl));
                Console.WriteLine("一个帖子已找到");
            }
            Console.WriteLine("所有帖子的html信息获取完毕");

            // Pick the page with the greatest time; on ties the earliest one wins.
            DateTime latestTime = DateTime.MinValue;
            string latestPage = "";
            foreach (string page in threadPages)
            {
                DateTime pageTime = get_time(page);
                if (pageTime > latestTime)
                {
                    latestTime = pageTime;
                    latestPage = page;
                }
            }
            Console.WriteLine("选出最大时间 以及对应的帖子");

            SPOT_MSG spotPost = new SPOT_MSG
            {
                time = latestTime,
                msg = get_msg(latestPage),
                img_list = get_img_list(latestPage)
            };
            if (check2(name, spotPost.time))
            {
                tiebaReply = star_getstatus(build_postdata(id, spotPost.msg, spotPost.img_list, groupid, spotPost.time, 1));
            }

            // ---- print the replies ----
            Console.WriteLine(weiboReply);
            Console.WriteLine(tiebaReply);
            Console.WriteLine("==================================================" + name);
        }

        // Persist both lists, overwriting the previous files.
        using (StreamWriter starFile = new StreamWriter("../../star.txt", false))
        {
            foreach (STRUCT_STAR star in lstar)
            {
                starFile.WriteLine(star.name + "$" + star.time);
            }
        }
        using (StreamWriter spotFile = new StreamWriter("../../spot.txt", false))
        {
            foreach (STRUCT_SPOT spot in lspot)
            {
                spotFile.WriteLine(spot.name + "$" + spot.time);
            }
        }
    }
    catch
    {
        Console.WriteLine("多线程运行失败");
    }
}
/// <summary>
/// Reads a UTF-8 text file and returns its lines.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>
/// The lines read. If the file cannot be opened or read, the failure is
/// reported on the console and the lines read so far (possibly none) are
/// returned; no exception escapes.
/// </returns>
static List<string> txt_to_list(string path)
{
    List<string> l = new List<string>();
    try
    {
        // using guarantees the file handle is released even if a read fails.
        using (StreamReader sr = new StreamReader(path, Encoding.UTF8))
        {
            string s;
            while ((s = sr.ReadLine()) != null)
            {
                l.Add(s);
            }
        }
    }
    catch (Exception ex)
    {
        // Still best-effort, but no longer silent.
        Console.WriteLine("读取文件出错: " + path + " (" + ex.Message + ")");
    }
    return l;
}
/// <summary>
/// Fills the star and spot lists from "../../star.txt" and "../../spot.txt".
/// Each line is expected to be "name$time"; the part after the last '$' is
/// parsed as the time. Lines without '$' or with an unparsable time are
/// reported and skipped instead of aborting startup.
/// </summary>
static void load_txt()
{
    List<string> l1 = txt_to_list("../../star.txt");
    List<string> l2 = txt_to_list("../../spot.txt");

    foreach (string str in l1)
    {
        int sep = str.LastIndexOf('$');
        DateTime t;
        if (sep < 0 || !DateTime.TryParse(str.Substring(sep + 1), out t))
        {
            Console.WriteLine("star.txt 行格式错误, 已跳过: " + str);
            continue;
        }
        lstar.Add(new STRUCT_STAR { name = str.Substring(0, sep), time = t });
    }

    foreach (string str in l2)
    {
        int sep = str.LastIndexOf('$');
        DateTime t;
        if (sep < 0 || !DateTime.TryParse(str.Substring(sep + 1), out t))
        {
            Console.WriteLine("spot.txt 行格式错误, 已跳过: " + str);
            continue;
        }
        lspot.Add(new STRUCT_SPOT { name = str.Substring(0, sep), time = t });
    }
}
/// <summary>
/// Entry point: loads the saved lists once, then calls run() forever with a
/// four-hour sleep after each pass.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const int pauseMilliseconds = 4 * 60 * 60 * 1000; // 4 hours

    load_txt();
    for (;;)
    {
        run();
        Thread.Sleep(pauseMilliseconds);
    }
}
//===================================================================================入口
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  C# 爬虫