TF-IDF Computation Based on Paoding Word Segmentation
2014-12-12 16:19
Recent testing showed that the Paoding (庖丁) analyzer gives good results for Chinese word segmentation, and TF-IDF is a standard method for term weighting; the derivation of TF-IDF itself will not be covered in detail here.
Straight to the code:
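For reference, here is a sketch of the quantities the code below actually computes, where N is the number of documents and df(w) is the number of documents containing word w:

    tf(w, d) = count(w, d) / |d|      (term count normalized by document length)
    idf(w) = log(N / df(w))           (no smoothing; a word in every document gets 0)
    tfidf(w, d) = tf(w, d) * idf(w)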
package com.util;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TFIDFMeasure {

    private static List<String> FileList = new ArrayList<String>(); // the list of files

    // Get the list of files in a directory, including its sub-directories.
    public static List<String> readDirs(String filepath)
            throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("The input path is not a directory");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else {
                String[] flist = file.list();
                for (int i = 0; i < flist.length; i++) {
                    File newfile = new File(filepath + File.separator + flist[i]);
                    if (!newfile.isDirectory()) {
                        FileList.add(newfile.getAbsolutePath());
                    } else { // if it is a directory, recurse
                        readDirs(filepath + File.separator + flist[i]);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e.getMessage());
        }
        return FileList;
    }

    // Read a file into a string.
    public static String readFile(String file)
            throws FileNotFoundException, IOException {
        StringBuffer strSb = new StringBuffer();
        InputStreamReader inStrR = new InputStreamReader(
                new FileInputStream(file), "UTF-8"); // byte stream to character stream
        BufferedReader br = new BufferedReader(inStrR);
        String line = br.readLine();
        while (line != null) {
            strSb.append(line).append("\r\n");
            line = br.readLine();
        }
        br.close();
        return strSb.toString();
    }

    // Word segmentation with the Paoding analyzer (IKAnalyzer is an alternative).
    public static List<String> cutWords(String file) throws IOException {
        List<String> words = new ArrayList<String>();
        String text = TFIDFMeasure.readFile(file);
        Analyzer analyzer = new PaodingAnalyzer(); // Paoding segmentation
        TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken() in recent Lucene versions
        while (ts.incrementToken()) {
            words.add(termAtt.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
        return words;
    }

    // Raw term frequency in one file: occurrence count of each word.
    public static Map<String, Integer> normalTF(List<String> cutwords) {
        Map<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
            } else {
                resTF.put(word, resTF.get(word) + 1);
            }
        }
        return resTF;
    }

    // Normalized term frequency in one file: count divided by document length.
    public static Map<String, Double> tf(List<String> cutwords) {
        Map<String, Double> resTF = new HashMap<String, Double>();
        int wordLen = cutwords.size();
        Map<String, Integer> intTF = TFIDFMeasure.normalTF(cutwords);
        for (Entry<String, Integer> entry : intTF.entrySet()) {
            resTF.put(entry.getKey(), entry.getValue().doubleValue() / wordLen);
        }
        return resTF;
    }

    // Raw term counts for every file in a directory.
    public static Map<String, Map<String, Integer>> normalTFAllFiles(String dirc)
            throws IOException {
        Map<String, Map<String, Integer>> allNormalTF = new HashMap<String, Map<String, Integer>>();
        List<String> filelist = TFIDFMeasure.readDirs(dirc);
        for (String file : filelist) {
            List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file
            allNormalTF.put(file, TFIDFMeasure.normalTF(cutwords));
        }
        return allNormalTF;
    }

    // Normalized TF for every file in a directory.
    public static Map<String, Map<String, Double>> tfAllFiles(String dirc)
            throws IOException {
        Map<String, Map<String, Double>> allTF = new HashMap<String, Map<String, Double>>();
        List<String> filelist = TFIDFMeasure.readDirs(dirc);
        for (String file : filelist) {
            List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file
            allTF.put(file, TFIDFMeasure.tf(cutwords));
        }
        return allTF;
    }

    // IDF: log(docNum / documentFrequency) for every word.
    public static Map<String, Double> idf(Map<String, Map<String, Double>> all_tf) {
        Map<String, Double> resIdf = new HashMap<String, Double>();
        Map<String, Integer> dict = new HashMap<String, Integer>(); // document frequency
        int docNum = FileList.size();
        for (int i = 0; i < docNum; i++) {
            Map<String, Double> temp = all_tf.get(FileList.get(i));
            for (Entry<String, Double> entry : temp.entrySet()) {
                String word = entry.getKey();
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println("IDF for every word is:");
        for (Entry<String, Integer> entry : dict.entrySet()) {
            double value = Math.log(docNum / entry.getValue().doubleValue());
            resIdf.put(entry.getKey(), value);
        }
        return resIdf;
    }

    // TF-IDF = TF * IDF for every word in every file.
    public static Map<String, Map<String, Double>> tf_idf(
            Map<String, Map<String, Double>> all_tf, Map<String, Double> idfs) {
        Map<String, Map<String, Double>> resTfIdf = new HashMap<String, Map<String, Double>>();
        int docNum = FileList.size();
        for (int i = 0; i < docNum; i++) {
            String filepath = FileList.get(i);
            Map<String, Double> tfidf = new HashMap<String, Double>();
            Map<String, Double> temp = all_tf.get(filepath);
            for (Entry<String, Double> entry : temp.entrySet()) {
                String word = entry.getKey();
                tfidf.put(word, entry.getValue() * idfs.get(word));
            }
            resTfIdf.put(filepath, tfidf);
        }
        return resTfIdf;
    }

    // Print TF-IDF for every file in descending order, then rank the files
    // by the sum of their two highest word scores (assumes >= 2 words per file).
    public static void disTfIdf(Map<String, Map<String, Double>> tfidf)
            throws IOException {
        System.out.println("TF-IDF for every file is:");
        Map<String, Double> resultMap = new HashMap<String, Double>();
        for (Entry<String, Map<String, Double>> entrys : tfidf.entrySet()) {
            System.out.println("FileName: " + entrys.getKey());
            System.out.println("{");
            Map<String, Double> temp = entrys.getValue();
            ArrayList<Map.Entry<String, Double>> infoIds =
                    new ArrayList<Map.Entry<String, Double>>(temp.entrySet());
            Collections.sort(infoIds, new Comparator<Map.Entry<String, Double>>() {
                public int compare(Map.Entry<String, Double> o1,
                        Map.Entry<String, Double> o2) {
                    return o2.getValue().compareTo(o1.getValue());
                }
            });
            for (Entry<String, Double> entry : infoIds) {
                System.out.println(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
            // store the sum of the two highest-scoring words for this file
            resultMap.put(entrys.getKey(),
                    infoIds.get(0).getValue() + infoIds.get(1).getValue());
        }
        ArrayList<Map.Entry<String, Double>> infoIds2 =
                new ArrayList<Map.Entry<String, Double>>(resultMap.entrySet());
        Collections.sort(infoIds2, new Comparator<Map.Entry<String, Double>>() {
            public int compare(Map.Entry<String, Double> o1,
                    Map.Entry<String, Double> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        for (Entry<String, Double> entry : infoIds2) {
            System.out.println(readFile(entry.getKey()) + " = " + entry.getValue() + ", ");
        }
    }

    public static void main(String[] args) throws IOException {
        String file = "DataMiningSample/text/test1";
        Map<String, Map<String, Double>> all_tf = tfAllFiles(file);
        System.out.println();
        Map<String, Double> idfs = idf(all_tf);
        System.out.println();
        Map<String, Map<String, Double>> tf_idf = tf_idf(all_tf, idfs);
        disTfIdf(tf_idf);
    }
}
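A note on running this: Paoding must be able to locate its dictionary directory, which, as far as I recall, it resolves from the PAODING_DIC_HOME environment variable or a paoding.dic.home property. The property key and path below are illustrative assumptions, not verified configuration:

    // Hypothetical setup before constructing PaodingAnalyzer;
    // the key "paoding.dic.home" and the path are assumptions for illustration.
    System.setProperty("paoding.dic.home", "/opt/paoding/dic");

The classpath also needs the paoding-analysis jar plus a compatible lucene-core; CharTermAttribute requires Lucene 3.1 or later.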