您的位置：首页 > 其它

一个自己用lucene写的敏感词查询

2015-12-10 22:32 274 查看

lucenepackage ddd.cms.util;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.PrintStream;

import java.util.Date;

import java.util.HashMap;

import java.util.Map;

import java.nio.charset.StandardCharsets;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.LongField;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.Term;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.queryparser.classic.ParseException;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

/**

* 敏感词管理，查询所有的敏感词，查看是否存在

* @author Administrator

*

*/

public class SensitiveWordUtil {

   private static String docsPath = "D:\\CMSDocs";

   private static String fileName = "analysis.txt";

   private static String indexPath = "D:\\CMSIndex";

   private static double baseScore = 0.02;



   public static void main(String[] args) throws Exception {

       /*String docsPath = "D:\\CMSDocs";

       String fileName = "analysis.txt";

       String indexPath = "D:\\CMSIndex";*/



       //startWork( indexPath,docsPath,fileName, content);

       String [] allSensitiveWord = {"使命","持续创新","没出事","十二号","辈数儿","否存在"};

       /*String sensitiveWord = null;

       for (int i = 0; i < allSensitiveWord.length; i++) {

           sensitiveWord = allSensitiveWord[i];

           judeIsExsist(sensitiveWord,indexPath);

       }*/

       //judeIsExsist(sensitiveWord,indexPath);

       String content = "bseurhg87e略后来持续改了良好社会了十二和谁零售额好了粉色，哦然否存在后首尔玫我们的使命：让IT更简单数据是未来商业的核心，IT是企业数据存储和分析的基础，SMARTX让IT更简单，帮助企业挖掘数据价值，让商业更有效率。我们相信： 1. 持续创新 SMARTX由热爱挑战的团队组建，永远解决用户最痛、最难的问题，不断创新为用户创造价值。 2. 简胜于繁不论是产品的设计实现，还是用户体验，我们把简单原则贯彻如一。让企业用更直觉、更简单的红色还是摩尔佛爱问还是没饿坏了分数而肥瘦儿";



       Map<String,Boolean> resMap = getSearchResult(allSensitiveWord,content);

       System.out.println(resMap);



   }

   /**

   * 将所有的敏感词纳入文件中查询，返回所有的敏感词的存在情况

   * @param allSensitiveWord

   * @param content

   * @return

   * @throws IOException

   * @throws ParseException

   */



   public static Map<String,Boolean> getSearchResult(String []allSensitiveWord,String content) throws IOException, ParseException{

       Map<String,Boolean> resultMap = new HashMap<String,Boolean>();

       if (content == null) {

           return null;

       }

       else{

           startWork( indexPath,docsPath,fileName, content);   //先做准备工作

           String sensitiveWord = null;

           for (int i = 0; i < allSensitiveWord.length; i++) {

               boolean isFind = false;

               sensitiveWord = allSensitiveWord[i];

               Map<String, Object> resMap= judeIsExsist(sensitiveWord,indexPath);

               ScoreDoc scoreDoc = (ScoreDoc) resMap.get("scoreDoc");

               int totalHits = (int) resMap.get("totalHits");

               if (totalHits >= 1 && scoreDoc.score >= baseScore) {

                   isFind = true;

               }

               else{

                   isFind = false;

               }

               resultMap.put(sensitiveWord, isFind);

           }

       }

       return resultMap;

   }



   /**

   * 做一些准备工作，将内容写入txt文件，并建立索引

   * @param indexPath

   * @param docsPath

   * @param fileName

   * @param content

   * @return

   */

   private static boolean startWork(String indexPath, String docsPath,String fileName,String content) {

       boolean create = true;

       String getContent = writeContentTotxt(docsPath, fileName, content);

       if(getContent != null){

           createIndex(indexPath,docsPath,create);

           return true;

       }

       else{

           System.out.println("写入文件失败！");

       }

       return false;

   }



   /**

   * 将文章写入txt文件的方法

   * @param filePath

   * @param fileName

   * @param content

   * @return

   */



   private static String writeContentTotxt(String filePath, String fileName,String content) {

       File fileFolder = new File(filePath);

       if (!fileFolder.exists() || !fileFolder.isDirectory()) {

           createFolder(filePath);

       }

       File contentFile = new File(filePath + "\\" + fileName);

       if (!contentFile.exists() || contentFile.isDirectory()) {

           try {

               createFile(filePath, fileName);

           } catch (IOException e) {

               e.printStackTrace();

           }

       }

       else{

           //如果没有该文件就要先创建该文件，如果有了就直接进行交给下一步

       }

       try {

           boolean isSuccess = printStream(filePath + "\\" + fileName, content, false); //将要搜索的文章先写入到txt临时文件中

           if (isSuccess) {

               return content;

           }

           else{

               return null;

           }

       } catch (IOException e) {

           e.printStackTrace();

       }

       return null;

   }

   /**

   * 用字符流写入的具体实现

   * @param filePath

   * @param content

   * @param isTrue

   * @return

   * @throws IOException

   */

   private static boolean printStream(String filePath, String content,boolean isTrue) throws IOException {

       boolean isSuccess = false;

       FileOutputStream out = new FileOutputStream(filePath, isTrue); // true意为是在其后追加，false会将原有的内容覆盖

       PrintStream p = new PrintStream(out, true);

       if (content != null) {

           p.println(content);

           isSuccess = true;

       } else {

           System.out.println("对不起你的要写入的内容为空");

       }

       out.close();

       p.close();

       return isSuccess;

   }

   /**

   * 建立索引的具体实现

   * @param indexPath

   * @param docsPath

   * @param create

   */

   private static void createIndex(String indexPath,String docsPath,boolean create){

        final File docDir = new File(docsPath);

        if (!docDir.exists() || !docDir.canRead()) {

            System.out.println("资源文件目录 '" + docDir.getAbsolutePath() + "' 不存在或不可读，请检查！");

            System.exit(1);

        }else{

           Date start = new Date();

            try {

                System.out.println("建立索引文件到该目录 '" + indexPath + "'...");



                Directory dir = FSDirectory.open(new File(indexPath));

                Analyzer analyzer = new StandardAnalyzer();

                IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_2, analyzer);

                if (create) {

                    // 创建新的索引文件，删除所有其他的索引文件

                    //（指的是该资源文件目录下的旧的索引文件，其他资源的索引文件不影响）

                    iwc.setOpenMode(OpenMode.CREATE);

                } else {

                    // 如果有旧的索引文件，则更新索引文件

                    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

                }

                IndexWriter writer = new IndexWriter(dir, iwc);

                indexDocs(writer, docDir);

                writer.close();

                Date end = new Date();

                System.out.println(end.getTime() - start.getTime() + " total milliseconds");

            } catch (IOException e) {

                System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());

            }

        }

   }



   /**

   * 索引文件的义务处理

   * @param writer

   * @param file

   * @throws IOException

   */

   private static void indexDocs(IndexWriter writer, File file) throws IOException {

        if (!file.canRead()) {

            return;

        }

        else if (file.isDirectory()) {

            String[] files = file.list();

            if (files != null) {

                for (int i = 0; i < files.length; i++) {

                    indexDocs(writer, new File(file, files[i]));

                }

            }

        } else {

            FileInputStream fis;

            try {

                fis = new FileInputStream(file);

            } catch (FileNotFoundException fnfe) {

                return;

            }

            try {

                // 每一个文档最终被封装成了一个 Document 对象

                // Document 是用来描述文档的，这里的文档可以指一个 HTML 页面，一封电子邮件，或者是一个文本文件。

                // 一个 Document 对象由多个 Field 对象组成的。

                // 可以把一个 Document 对象想象成数据库中的一个记录，而每个 Field 对象就是记录的一个字段。

                Document doc = new Document();

                // Field 对象是用来描述一个文档的某个属性的，比如一封电子邮件的标题和内容可以用两个 Field 对象分别描述。

                Field pathField = new StringField("path", file.getPath(),Field.Store.YES);

                // pathField指的是资源文件的路径的field

                doc.add(pathField);

                // 这个field指的是最后的修改时间

                doc.add(new LongField("modified", file.lastModified(),Field.Store.NO));

                // 把资源文件中的内容分词后，索引到索引文件中，指定为UTF-8编码

                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {

                    System.out.println("adding " + file);

                    writer.addDocument(doc);

                } else {

                    System.out.println("updating " + file);

                    writer.updateDocument(new Term("path", file.getPath()), doc);

                }

            } finally {

                fis.close();

            }

            }

        }

   /**

   * 创建文件

   * @param filePath

   * @param fileName

   * @throws IOException

   */

   private static void createFile(String filePath, String fileName) throws IOException {

       File file = new File(filePath+"\\"+fileName);

       if (!file.exists())

           file.createNewFile();

       else{

           return;

       }

   }

   /**

   * 创建文件夹

   * @param filePath

   * @return

   */

   private static String createFolder(String filePath) {

       File indexFile = new File(filePath);

       if (!indexFile.exists() || !indexFile.isDirectory()) {

           indexFile.mkdir();

       }

       else{

           //文件夹已经存在，直接返回

       }

       return filePath;

   }



   /**

   * lucene的具体查询方法

   * @param in

   * @param searcher

   * @param query

   * @param hitsPerPage

   * @param raw

   * @param interactive

   * @return

   * @throws IOException

   */

   public static Map<String, Object> doPagingSearch(BufferedReader in,IndexSearcher searcher, Query query, int hitsPerPage, boolean raw,boolean interactive) throws IOException {

       Map<String,Object> resMap = new HashMap<String,Object>();

       TopDocs results = searcher.search(query, 5 * hitsPerPage); // 对应的每一页

       ScoreDoc[] hitDocs = results.scoreDocs;

       int numTotalHits = results.totalHits;

       resMap.put("totalHits", numTotalHits);

       if (hitDocs.length!=0 && numTotalHits > 0 && hitDocs[0]!=null) {

           resMap.put("scoreDoc", hitDocs[0]);

           System.out.println(hitDocs[0].toString());

           System.out.println(numTotalHits + " total matching documents");

       }else{

           resMap.put("scoreDoc", null);

       }

       return resMap;

   }

   /**

   * 根据索引与关键字来查询是否存在

   * @param sensitiveWord

   * @param indexPath

   * @return

   * @throws IOException

   * @throws ParseException

   */

   private static Map<String,Object> judeIsExsist(String sensitiveWord, String indexPath)throws IOException, ParseException {

       String field = "contents";

       String queries = null;

       int repeat = 0;

       boolean raw = false;

       String queryString = null;

       int hitsPerPage = 10;



       IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));

       IndexSearcher searcher = new IndexSearcher(reader);

       Analyzer analyzer = new StandardAnalyzer();

       BufferedReader in = null;

       in = new BufferedReader(new InputStreamReader(System.in,StandardCharsets.UTF_8));

       QueryParser parser = new QueryParser(field, analyzer);

       sensitiveWord = sensitiveWord.trim(); // 去除两边的看的空格

       Query query = parser.parse(sensitiveWord); // 创建查询器

       System.out.println("Searching for: " + query.toString(field));

       if (repeat > 0) { // repeat & time as benchmark

           Date start = new Date();

           for (int j = 0; j < repeat; j++) {

               searcher.search(query, null, 100);

           }

           Date end = new Date();

           System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");

       }

       Map<String, Object> resMap = doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

       reader.close();

       return resMap;

   }

}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航