您的位置:首页 > Web前端 > HTML

通过HtmlAgilityPack插件和xpath解析html完成爬虫抓取数据

2020-02-16 19:18 696 查看
  1. 通过HtmlAgilityPack插件和xpath解析html完成爬虫抓取数据
    /// <summary>
    /// Scrapes a product ranking page and appends one row per product to <paramref name="dt"/>.
    /// </summary>
    /// <remarks>
    /// For each of the first few ranked products the method reads the rank, name, price and
    /// detail link from the listing page, then downloads the product's detail page to read the
    /// first-available date and the sales-rank text. Detail-page fields are optional: when they
    /// are absent the corresponding columns are simply left unset.
    /// Any exception is reported through a message box and stops the crawl; rows that were
    /// already added stay in the table.
    /// </remarks>
    /// <param name="url">Address of the ranking page; it is fetched with <c>WebRequestPost</c>.</param>
    /// <param name="dt">
    /// Table that receives the rows. It must already contain the columns
    /// "排名", "商品名称", "售价", "首次上架日期" and "排名信息".
    /// </param>
    public static void GetData(string url, ref DataTable dt)
    {
        // Only the top products are scraped; every product costs one extra detail-page request.
        const int maxItems = 3;
        const string detailUrlPrefix = "https://www.amazon.com";

        try
        {
            // Per the original author's notes: WebClient.DownloadString got Amazon's verification
            // page back, and HtmlWeb.Load stopped returning HTML after repeated calls, so the
            // page is fetched through the HttpWebRequest-based helper instead.
            string html = WebRequestPost(url);
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // One div with class zg_itemImmersion per ranked product.
            HtmlNodeCollection allDivs = doc.DocumentNode.SelectNodes("//div[@class='zg_itemImmersion']");
            if (allDivs == null)
            {
                // SelectNodes returns null (not an empty collection) when nothing matches.
                throw new InvalidOperationException("No node matched //div[@class='zg_itemImmersion'] on the ranking page.");
            }

            int itemCount = Math.Min(allDivs.Count, maxItems);
            for (int i = 0; i < itemCount; i++)
            {
                // Relative XPath (".//") keeps each query inside the current product node.
                // An absolute "//" query would search the whole document and always hit the
                // first product, which is why the node used to be re-parsed with CreateNode.
                HtmlNode node = allDivs[i];
                DataRow dr = dt.NewRow();

                // Rank: text such as "1." with surrounding whitespace/newlines.
                HtmlNode rankNode = SelectRequiredNode(node, ".//span[@class='zg_rankNumber']", "rank");
                string indexText = rankNode.InnerText.Replace(".", "").Replace("\n", "").Trim();
                int rank;
                if (!int.TryParse(indexText, out rank))
                {
                    throw new FormatException("Rank text '" + indexText + "' is not a valid integer.");
                }
                dr["排名"] = rank;

                // Product name.
                HtmlNode nameNode = SelectRequiredNode(
                    node,
                    ".//div[@class='p13n-sc-truncate p13n-sc-truncated-hyphen p13n-sc-line-clamp-2']",
                    "product name");
                dr["商品名称"] = nameNode.InnerText.Replace("\n", "").Trim();

                // Price.
                HtmlNode priceNode = SelectRequiredNode(node, ".//span[@class='p13n-sc-price']", "price");
                dr["售价"] = priceNode.InnerText.Replace("\n", "");

                // Detail link: an a.a-link-normal that is the first <a> child of its parent
                // (XPath position() is 1-based).
                HtmlNode linkNode = SelectRequiredNode(node, ".//a[@class='a-link-normal' and position()=1]", "detail link");
                HtmlAttribute hrefAttribute = linkNode.Attributes["href"];
                if (hrefAttribute == null)
                {
                    throw new InvalidOperationException("The detail link of item " + (i + 1) + " has no href attribute.");
                }
                string href = detailUrlPrefix + hrefAttribute.Value;

                string htmlDetail = WebRequestPost(href);
                HtmlAgilityPack.HtmlDocument docDetail = new HtmlAgilityPack.HtmlDocument();
                docDetail.LoadHtml(htmlDetail);
                HtmlNode detailRoot = docDetail.DocumentNode;

                // The detail fields are only read when the detail-bullets wrapper is present.
                // The queries below are absolute, i.e. they search the whole detail document;
                // that is what the original "//" queries did as well, it is just explicit now.
                if (detailRoot.SelectSingleNode("//div[@id='detailBulletsWrapper_feature_div']") != null)
                {
                    // The date cannot be taken from a fixed <li> position because the number
                    // of list items differs between products; instead take the first sibling
                    // span after the span holding the label text.
                    HtmlNode dateNode = detailRoot.SelectSingleNode(
                        "//span[contains(text(), 'Date first available at Amazon.com')]/following-sibling::span[1]");
                    if (dateNode != null)
                    {
                        dr["首次上架日期"] = dateNode.InnerText;
                    }

                    // Main-category rank: first text node following the <b> inside li#SalesRank.
                    HtmlNode categoryRankNode = detailRoot.SelectSingleNode("//li[@id='SalesRank']/b/following::text()[1]");
                    // Sub-category ranks: the ul.zg_hrsr inside li#SalesRank.
                    HtmlNode detailRankNode = detailRoot.SelectSingleNode("//li[@id='SalesRank']/ul[@class='zg_hrsr']");
                    if (categoryRankNode != null || detailRankNode != null)
                    {
                        string categoryRank = categoryRankNode == null
                            ? string.Empty
                            : categoryRankNode.InnerText.Replace("(", "");
                        string detailRank = detailRankNode == null
                            ? string.Empty
                            : detailRankNode.InnerText.Replace("&nbsp;", " ").Replace("&gt;", ">");
                        dr["排名信息"] = categoryRank + detailRank;
                    }
                }

                dt.Rows.Add(dr);
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show("爬虫抓取失败,失败信息:" + ex.Message);
        }
    }

    /// <summary>
    /// Runs <paramref name="xpath"/> against <paramref name="scope"/> and returns the first match,
    /// throwing a descriptive exception instead of returning null when nothing matches.
    /// </summary>
    /// <param name="scope">Node the XPath expression is evaluated on.</param>
    /// <param name="xpath">XPath expression to evaluate.</param>
    /// <param name="description">Short name of the expected element, used in the error message.</param>
    /// <returns>The first node matched by the expression.</returns>
    /// <exception cref="InvalidOperationException">No node matched the expression.</exception>
    private static HtmlNode SelectRequiredNode(HtmlNode scope, string xpath, string description)
    {
        HtmlNode found = scope.SelectSingleNode(xpath);
        if (found == null)
        {
            throw new InvalidOperationException("Could not find the " + description + " node (XPath: " + xpath + ").");
        }
        return found;
    }
  • 点赞
  • 收藏
  • 分享
  • 文章举报
qq_43196691 发布了0 篇原创文章 · 获赞 0 · 访问量 63 私信 关注
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: