您的位置:首页 > 其它

统计一TXT文档中单词出现频率,输出频率最高的10个单词

2014-10-18 21:17 771 查看
实验过程

主要思路:首先去掉标点符号,并过滤掉常用的冠词、介词、代词等没有实际意义的单词;然后利用哈希表统计每个单词出现的次数,再按次数排序,输出出现频率最高的前十个单词。

代码如下

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * Counts how often each word occurs in a text file and prints the most
 * frequent ones, ignoring common English function words (pronouns,
 * prepositions, auxiliary verbs, ...).
 *
 * <p>Usage: {@code java test [path]}. When no path is given the historical
 * default {@code E:/TEST.txt} is read.
 */
public class test {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_PATH = "E:/TEST.txt";

    /** Maximum number of words printed in the ranking. */
    private static final int TOP_N = 10;

    /** Punctuation characters removed from the text before it is split into words. */
    private static final String PUNCTUATION = "[.,\"\\?!:;\\(\\)]";

    /** Lower-case words that carry no meaning of their own and are left out of the ranking. */
    private static final Set<String> STOP_WORDS = new HashSet<String>(Arrays.asList(
            "you", "i", "he", "she", "me", "him", "her", "it",
            "they", "them", "we", "us", "your", "our", "his",
            "its", "my", "in", "into", "on", "for", "out", "up",
            "down", "at", "to", "too", "with", "by", "about", "among", "between",
            "over", "from", "be", "been", "am", "is", "are", "was", "were",
            "without", "the", "of", "and", "a", "an", "that", "this",
            "or", "as", "will", "would", "can", "could", "may", "might",
            "shall", "should", "must", "has", "have", "had", "than"));

    /**
     * Reads the input file, counts its words and prints a short report: file
     * name and size, total word count, the most frequent words, elapsed time.
     *
     * @param args optional; {@code args[0]} is the path of the file to analyse
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        long start = System.currentTimeMillis();

        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        File file = new File(path);

        // State is local so that repeated calls never accumulate old counts.
        Map<String, Integer> wordMap = new HashMap<String, Integer>();
        int count = countWords(readText(file), wordMap);

        System.out.println("\t\t--文件信息--");
        System.out.println("     名称: " + file.getName() + "    大小: "
                + file.length() / 1024 + "KB");
        System.out.println("\t\t--文件信息--");
        System.out.println();
        System.out.println("■■■■ " + count + " 个单词中出现频率最高的 10 个单词如下■■■■");

        int rank = 0;
        for (Entry<String, Integer> entry : sortByFrequency(wordMap)) {
            if (rank == TOP_N) {
                break;
            }
            rank++;
            System.out.printf("\t%2d、 %8s \t %4d次\n", rank,
                    entry.getKey(), entry.getValue());
        }

        // Reported unconditionally, also when fewer than TOP_N words exist.
        long end = System.currentTimeMillis();
        System.out.println("■■■■■■■■■■■■■■■ 耗时 " + (end - start)
                + " ms" + " ■■■■■■■■■■■■■■■■");
    }

    /**
     * Reads the whole file into one string, keeping a line break between
     * lines so that words on adjacent lines are not glued together.
     */
    private static String readText(File file) throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Splits {@code text} into words and records how often every non-stop
     * word occurs. Words are compared case-insensitively.
     *
     * @param text        raw text to analyse
     * @param frequencies receives lower-cased word -> number of occurrences
     * @return total number of words seen, stop words included
     */
    private static int countWords(String text, Map<String, Integer> frequencies) {
        String cleaned = text.replaceAll(PUNCTUATION, "");
        int total = 0;
        for (String token : cleaned.split("\\s+")) {
            if (token.isEmpty()) {
                continue; // leading whitespace yields one empty token
            }
            total++;
            // Normalise first so that lookup, stop-word check and storage
            // all use the same key.
            String word = token.toLowerCase(Locale.ROOT);
            if (STOP_WORDS.contains(word)) {
                continue;
            }
            Integer seen = frequencies.get(word);
            frequencies.put(word, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
        }
        return total;
    }

    /**
     * Returns the entries ordered by descending count; words with equal
     * counts are ordered alphabetically so the output is deterministic.
     */
    private static List<Entry<String, Integer>> sortByFrequency(Map<String, Integer> frequencies) {
        List<Entry<String, Integer>> list =
                new ArrayList<Entry<String, Integer>>(frequencies.entrySet());
        Collections.sort(list, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> e1, Entry<String, Integer> e2) {
                int byCount = e2.getValue().compareTo(e1.getValue());
                return byCount != 0 ? byCount : e1.getKey().compareTo(e2.getKey());
            }
        });
        return list;
    }

}

运行结果如图




另外,使用 JDK 自带的 VisualVM 工具对程序进行了性能测试,测试结果见下方截图



内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐