爬取京东评论、分词+词频统计、词云图展示
2017-06-28 11:59
190 查看
一、爬取京东评论
京东评论竟然全部对外开放

// Crawls the review pages of one JD product via the public (unauthenticated)
// comment endpoint and persists comments + the product record.
public class CommentCrawler {

    // Shared pooled connection manager so repeated crawls reuse HTTP connections.
    final static PoolingHttpClientConnectionManager httpClientConnectionManager =
            new PoolingHttpClientConnectionManager();

    // Upper bound on pages fetched per product, to keep a single crawl bounded.
    final static int MAX_PAGE = 50;

    static HttpClient getClient() {
        return HttpClients.custom().setConnectionManager(httpClientConnectionManager).build();
    }

    // Builds the paged comment-list URL for a product (pageSize fixed at 10,
    // score=0 / sortType=3 mirror the site's default comment listing).
    static String getUrl(String productId, int page) {
        return String.format(
                "http://sclub.jd.com/comment/productPageComments.action?productId=%s&score=0&sortType=3&page=%d&pageSize=10",
                productId, page);
    }

    // Maps one JSON comment object onto the persistence model.
    static Comment commentFromJson(JSONObject json, String productId) {
        return new Comment(json.getLongValue("id"), productId, json.getString("score"),
                json.getString("content"));
    }

    /**
     * Crawls every comment page of the given product, stores the comments and
     * the product record, then triggers judging.
     *
     * @param productId JD product id
     * @return true when the whole crawl succeeded, false on empty result or error
     */
    public static boolean crawlComments(String productId) {
        try {
            int maxPage = 1;
            int nowPage = 0;
            HttpClient client = getClient();
            // Hoisted out of the loop: one service instance serves the whole crawl
            // (the original constructed a new CommentService per page).
            CommentService ser = new CommentService();
            while (nowPage < maxPage) {
                String url = getUrl(productId, nowPage);
                HttpGet get = new HttpGet(url);
                HttpResponse resp = client.execute(get);
                JSONObject json = JSON.parseObject(EntityUtils.toString(resp.getEntity()));
                JSONArray comments = json.getJSONArray("comments");
                // Guard against a missing array as well as an empty one (the
                // original would NPE when the key was absent).
                if (comments == null || comments.isEmpty()) {
                    return false;
                }
                for (int i = 0; i < comments.size(); i++) {
                    ser.insertComment(commentFromJson(comments.getJSONObject(i), productId));
                }
                if (nowPage == 0) {
                    // getIntValue avoids the NPE that auto-unboxing getInteger()
                    // would throw if "maxPage" were missing; cap with MAX_PAGE
                    // so one product cannot keep the crawler busy forever
                    // (the constant was previously declared but never used).
                    maxPage = Math.min(json.getIntValue("maxPage"), MAX_PAGE);
                    ser.insertProduct(new Product(productId,
                            comments.getJSONObject(0).getString("referenceName")));
                }
                nowPage++;
            }
            ProductJudger.judge(productId);
            return true;
        } catch (Exception e) {
            // Best-effort crawl: log and report failure instead of propagating.
            e.printStackTrace();
        }
        return false;
    }
}
二、结巴分词
jieba分词原本是Python版的,有人把它改成了Java版,名字也改成了jieba-analysis

// Segment the sentence with jieba, then count token frequencies into `map`,
// skipping any token that appears in the stop-word set.
JiebaSegmenter segmenter = new JiebaSegmenter();
List<Word> list = segmenter.sentenceProcess(str);
for (Word i : list) {
    String token = i.getToken();
    if (stopWords.contains(token)) {
        continue; // ignore stop words (punctuation, function words, ...)
    }
    // getOrDefault + autoboxing replaces the null check and the deprecated
    // `new Integer(...)` boxing constructor used originally.
    Integer cnt = map.getOrDefault(token, 0);
    map.put(token, cnt + 1);
}
三、词云图
用到d3.js,d3-cloud.js这两个库,d3.js是“Data-Driven Documents”,d3-cloud这个库还是比较难用的,主要是官方示例代码太少了。这里给出一个例子:每一个词云图都对应一个字典,这个字典就是“词语:频率”这样的键值对。给定多个字典,每一个字典都要渲染成一个词云图。
<div id="word-clouds" style="text-align: center"></div>
<script>
    // Server-rendered JSON: a collection of {productId, words} dictionaries,
    // one per product ("words" maps word -> frequency).
    var wordClouds = <%=request.getAttribute("wordClouds")%>;
    $(document).ready(function () {
        // NOTE(review): for...in assumes wordClouds enumerates cleanly; if it
        // is a plain array an index loop would be safer — confirm its shape.
        for (var i in wordClouds) {
            var it = wordClouds[i];
            var divId = "product_wordcloud" + it['productId'];
            // Heading plus an empty container div the cloud is rendered into.
            $("#word-clouds")
                .append("<h3 align='center'>商品" + it['productId'] + "词云图</h3>")
                .append("<div id='" + divId + "'></div>");
            createWordCloud(transformWordFraquency(it['words']), "#" + divId);
        }
    });
</script>
还需要编写如下JS代码
// 20-color categorical palette shared by all word clouds (d3 v3 API).
var fill = d3.scale.category20();

// Rendering configuration.
var wordCloudWidth = 800, wordCloudHeight = 400;
var font_name = "楷体", font_weight = "bold", max_font_size = 50; // NOTE: max_font_size is currently unused
var word_count = 50;    // number of words shown per cloud
var word_max_size = 60; // largest font size (px)
var word_min_size = 10; // smallest font size (px)

/**
 * Converts a word-frequency dictionary ({word1: cnt1, word2: cnt2, ...}) into
 * the [{text, size}] array expected by d3-cloud: keeps the `word_count` most
 * frequent words and maps rank linearly onto [word_min_size, word_max_size].
 * NOTE: "Fraquency" is a typo, kept because existing pages call this name.
 */
function transformWordFraquency(words) {
    var ar = [];
    for (var i in words) {
        ar.push({ "text": i, "size": words[i] });
    }
    // Sort by frequency descending, keep only the top `word_count` entries.
    ar.sort(function (x, y) { return y['size'] - x['size']; });
    ar = ar.slice(0, Math.min(word_count, ar.length));
    // Replace raw counts with font sizes that decrease linearly by rank.
    for (var i = 0; i < ar.length; i++) {
        ar[i]['size'] = word_max_size - (word_max_size - word_min_size) / ar.length * i;
    }
    return ar;
}

/**
 * Runs the d3-cloud layout over wordMap ([{text, size}]) and renders the
 * result into `selector` once the layout emits its "end" event.
 */
function createWordCloud(wordMap, selector) {
    d3.layout.cloud()
        .size([wordCloudWidth * 2 - 100, wordCloudHeight * 2 - 100])
        .words(wordMap)
        .font(font_name)
        .fontWeight(font_weight)
        .fontSize(function (d) { return d.size; })
        .rotate(function () { return 0; }) // keep every word horizontal
        .on("end", function (words) { renderWordCloud(words, selector); })
        .start();
}

/**
 * Appends an SVG to `selector` and draws the laid-out words. The order of
 * these chained d3 calls must not change.
 */
function renderWordCloud(words, selector) {
    d3.select(selector).append("svg")
        .attr("width", wordCloudWidth)
        .attr("height", wordCloudHeight)
        .append("g")
        // Center the coordinate system: d3-cloud positions words around (0, 0).
        .attr("transform",
            "translate(" + wordCloudWidth / 2 + "," + wordCloudHeight / 2 + ")")
        .selectAll("text").data(words).enter() // one <text> element per word
        .append("text")
        .style("font-family", font_name)
        .style("font-weight", font_weight)
        // .attr("text-anchor", "middle")
        .style("font-size", function (d) { // font size from the layout
            return d.size + "px";
        })
        .style("fill", function (d, i) {   // color cycles through the palette
            return fill(i);
        })
        .attr("transform", function (d) {
            return "translate(" + [d.x, d.y] + ") rotate(" + d.rotate + ")";
        })
        .text(function (d) { return d.text; });
}
相关文章推荐
- 爬取京东评论、分词+词频统计、词云图展示
- 中文分词,词频统计,词云图制作
- Hadoop的改进实验(中文分词词频统计及英文词频统计)(4/4)
- 使用ES对中文文章进行分词,并进行词频统计排序
- Hadoop IK分词 词频统计
- PYTHON3.6对中文文本分词、去停用词以及词频统计
- Python jieba 中文分词与词频统计
- python jieba分词并统计词频后输出结果到Excel和txt文档方法
- 使用Spark、Ansj分词进行词频统计
- Python进行文本预处理(文本分词,过滤停用词,词频统计,特征选择,文本表示)
- python3结巴分词分行拆分统计词频
- Python分词并进行词频统计
- nltk学习之统计词频和分词nltk.word_tokenize nltk.FreqDist
- scrapy爬虫之抓取京东机械键盘评论量并画图展示
- 【中文分词】使用IKAnalyzer分词统计词频
- 使用 wordcloud, jieba, PIL, matplotlib, numpy 进行分词,统计词频,并绘制词云的一次尝试
- 分词统计词频
- python结巴分词以及词频统计实例
- 【自然语言处理入门】01:利用jieba对数据集进行分词,并统计词频
- 【python 编程】网页中文过滤分词及词频统计