基于模糊K-Means算法的新闻聚类
2016-02-27 11:31
267 查看
<strong>/*** * @author YangXin * @info 基于模糊K-Means算法的新闻聚类 */ package unitNine; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.lucene.analysis.Analyzer; import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.classify.WeightedVectorWritable; import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.distance.ManhattanDistanceMeasure; import org.apache.mahout.common.distance.TanimotoDistanceMeasure; import org.apache.mahout.vectorizer.DictionaryVectorizer; import org.apache.mahout.vectorizer.DocumentProcessor; import org.apache.mahout.vectorizer.tfidf.TFIDFConverter; public class NewsFuzzyKMeansClustering { public static void main(String args[]) throws Exception { int minSupport = 5; int minDf = 10; int maxDFPercent = 70; int maxNGramSize = 1; int minLLRValue = 200; int reduceTasks = 1; int chunkSize = 200; int norm = 2; boolean sequentialAccessOutput = true; String inputDir = "inputDir"; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); /* SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(inputDir, "documents.seq"), Text.class, Text.class); for (Document d : Database) { writer.append(new Text(d.getID()), new Text(d.contents())); } writer.close();*/ String outputDir = "newsClusters"; HadoopUtil.delete(conf, new Path(outputDir)); Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); MyAnalyzer analyzer = new MyAnalyzer(); DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer.getClass() .asSubclass(Analyzer.class), tokenizedPath, conf); DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue, 2, true, reduceTasks, 
chunkSize, sequentialAccessOutput, false); TFIDFConverter.processTfIdf( new Path(outputDir , DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), new Path(outputDir), conf, chunkSize, minDf, maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks); String vectorsFolder = outputDir + "/tfidf-vectors"; String canopyCentroids = outputDir + "/canopy-centroids"; String clusterOutput = outputDir + "/clusters/"; CanopyDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids), new ManhattanDistanceMeasure(), 3000.0, 2000.0, false, false); FuzzyKMeansDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids, "clusters-0"), new Path(clusterOutput), new TanimotoDistanceMeasure(), 0.01, 20, 2.0f, true, true, 0.0, false); SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path( clusterOutput + Cluster.CLUSTERED_POINTS_DIR +"/part-m-00000"), conf); IntWritable key = new IntWritable(); WeightedVectorWritable value = new WeightedVectorWritable(); while (reader.next(key, value)) { System.out.println("Cluster: " + key.toString() + " " + value.getVector().asFormatString()); } reader.close(); } } </strong>
相关文章推荐
- iOS 加密的3种方法
- 严重: Exception starting filter struts2解决方法!
- 手把手教你做开源项目MyMeiZi 二(使用RecyclerView+Glide打造瀑布流)
- 如何用javascript实现 网页标题的滚动效果
- 详解Java中对象序列化与反序列化
- 欢迎使用CSDN-markdown编辑器
- [Locked] Paint Fence
- GB2312和ASCII码点阵字库HZK, ASC说明使用心得,全
- 《Effective C++》Rule36:绝不重新定义继承而来的non-virtual函数
- Php-Redis 邮件队列实现总结
- sigsegv 問題的調試
- in-memory形式的Mahout K-Means聚类
- PHP写的一个轻量级的DI容器类(转)
- IDEA maven 下载 源码和javadoc命令
- java 中的数学计算函数
- Angular2教程(一)
- 事件委托
- 229. Majority Element II My Submissions Question
- halcon之共线连接union_collinear_contours_xld
- Java 解析json数据