
News Clustering Based on the Fuzzy K-Means Algorithm

2016-02-27 11:31
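
The following Apache Mahout program clusters news documents with the Fuzzy K-Means algorithm. It proceeds in five steps: tokenize the raw documents with a Lucene Analyzer, build term-frequency vectors, convert them to TF-IDF vectors, seed the initial centroids with a Canopy clustering pass, then run Fuzzy K-Means and print each point's cluster assignment.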
/**
 * @author YangXin
 * News clustering based on the Fuzzy K-Means algorithm (Apache Mahout).
 */
package unitNine;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text; // needed if the SequenceFile writer block below is enabled
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.CanopyDriver; // missing in the original; CanopyDriver.run is called below
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.common.distance.TanimotoDistanceMeasure;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
public class NewsFuzzyKMeansClustering {
	public static void main(String[] args) throws Exception {
	    
	    // Vectorization parameters
	    int minSupport = 5;       // minimum overall term frequency
	    int minDf = 10;           // minimum document frequency
	    int maxDFPercent = 70;    // drop terms appearing in more than 70% of documents
	    int maxNGramSize = 1;     // unigrams only
	    int minLLRValue = 200;    // log-likelihood threshold for n-grams (no effect for unigrams)
	    int reduceTasks = 1;      // number of reduce tasks
	    int chunkSize = 200;      // dictionary chunk size, in MB
	    int norm = 2;             // use the L2 norm for TF-IDF vectors
	    boolean sequentialAccessOutput = true;
	    
	    String inputDir = "inputDir"; // SequenceFile input: (document id, text) pairs
	    
	    Configuration conf = new Configuration();
	    FileSystem fs = FileSystem.get(conf);
	    /*
	     * Optional: write the raw documents into a SequenceFile of
	     * (document id, document text) pairs. "Document" and "Database"
	     * stand in for a user-supplied data source.
	     *
	     * SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
	     *     new Path(inputDir, "documents.seq"), Text.class, Text.class);
	     * for (Document d : Database) {
	     *   writer.append(new Text(d.getID()), new Text(d.contents()));
	     * }
	     * writer.close();
	     */
	     
	    String outputDir = "newsClusters";
	    HadoopUtil.delete(conf, new Path(outputDir)); // clear output from any previous run
	    
	    // Tokenize the documents. MyAnalyzer is a user-defined Lucene Analyzer
	    // (a possible sketch is given after the listing).
	    Path tokenizedPath = new Path(outputDir,
	        DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
	    MyAnalyzer analyzer = new MyAnalyzer();
	    DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer.getClass()
	        .asSubclass(Analyzer.class), tokenizedPath, conf);
	    
	    // Build term-frequency vectors from the tokenized documents...
	    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
	      new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue, 2, true, reduceTasks,
	      chunkSize, sequentialAccessOutput, false);
	    // ...and convert them to TF-IDF vectors.
	    TFIDFConverter.processTfIdf(
	      new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
	      new Path(outputDir), conf, chunkSize, minDf,
	      maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks);
	    String vectorsFolder = outputDir + "/tfidf-vectors";
	    String canopyCentroids = outputDir + "/canopy-centroids";
	    String clusterOutput = outputDir + "/clusters/";
	    
	    // Seed the initial centroids with a Canopy clustering pass (T1 = 3000, T2 = 2000).
	    CanopyDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids),
	      new ManhattanDistanceMeasure(), 3000.0, 2000.0, false, false);
	    
	    // Run Fuzzy K-Means from the canopy centroids: convergence delta 0.01,
	    // at most 20 iterations, fuzziness factor m = 2.0.
	    FuzzyKMeansDriver.run(conf, new Path(vectorsFolder),
	      new Path(canopyCentroids, "clusters-0"), new Path(clusterOutput),
	      new TanimotoDistanceMeasure(), 0.01, 20, 2.0f, true, true, 0.0, false);
	    
	    // Read back the clustered points and print each point's cluster assignment.
	    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
	      clusterOutput + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf);
	    
	    IntWritable key = new IntWritable();
	    WeightedVectorWritable value = new WeightedVectorWritable();
	    while (reader.next(key, value)) {
	      System.out.println("Cluster: " + key.toString() + " "
	                         + value.getVector().asFormatString());
	    }
	    reader.close();
	  }
}
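The listing references MyAnalyzer, which the original post never defines. Below is a minimal sketch of such a custom Analyzer, assuming the Lucene 3.x API bundled with Mahout 0.5; the filter chain (standard tokenizer, lowercasing, English stop-word removal) is an illustrative assumption, not the original author's implementation.

package unitNine;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

// Assumed implementation of the MyAnalyzer class used in the listing.
public class MyAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // Tokenize on standard word boundaries, lowercase each token,
    // and drop common English stop words.
    TokenStream result = new StandardTokenizer(Version.LUCENE_31, reader);
    result = new LowerCaseFilter(Version.LUCENE_31, result);
    result = new StopFilter(Version.LUCENE_31, result,
        StandardAnalyzer.STOP_WORDS_SET);
    return result;
  }
}

Mahout instantiates the analyzer class by reflection inside its mappers, so the class must be public with a public no-argument constructor (the implicit default constructor suffices here) and must be on the job's classpath.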
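The commented-out writer block in the listing relies on a Document/Database abstraction that is not shown. As a concrete starting point, the hypothetical helper below (the class name and the "rawNews" folder are illustrative assumptions) packs a local directory of plain-text news files into the (id, contents) SequenceFile that the clustering job expects in inputDir.

package unitNine;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Hypothetical helper: writes one (file name, file contents) record per article.
public class DocumentsToSequenceFile {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
        new Path("inputDir", "documents.seq"), Text.class, Text.class);
    try {
      for (File f : new File("rawNews").listFiles()) { // "rawNews" is an assumed local folder
        StringBuilder contents = new StringBuilder();
        BufferedReader in = new BufferedReader(new FileReader(f));
        try {
          String line;
          while ((line = in.readLine()) != null) {
            contents.append(line).append('\n');
          }
        } finally {
          in.close();
        }
        writer.append(new Text(f.getName()), new Text(contents.toString()));
      }
    } finally {
      writer.close();
    }
  }
}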