您的位置:首页 > 编程语言 > Java开发

基于spring boot架构和word分词器的分词检索,排序,分页实现

2018-03-15 09:58 507 查看
       本文不适合Java初学者,适合对spring boot有一定了解的同学。 文中可能涉及到一些实体类、dao类、工具类文中没有这些类大家不必在意,不影响本文的核心内容,本文重在对方法的梳理。    word分词器maven依赖<dependency><groupId>org.apdplat</groupId><artifactId>word</artifactId><version>1.3</version></dependency>       spring boot的常见依赖在这里我就不列举了可以见文章基于maven的spring boot 项目porm文件配置(含定时器,数据抓取,分词器依赖配置)       先构建一个PageUtil类用于封装分页排序方法。package com.frank.demo.util;import java.text.ParseException;import java.util.ArrayList;import java.util.Arrays;import java.util.List;public class PageUtil {// 分页方法public static <T> List<T> splitList(List<T> list, int pageSize, int curPage) {List<T> subList = new ArrayList<T>();int listSize = list.size();int star = pageSize * curPage;int end = pageSize * (curPage + 1);if (end > listSize) {end = listSize;}if (star >= listSize) {return new ArrayList<T>();}for (int i = star; i < end; i++) {subList.add(list.get(i));}return subList;}// 排序(搜索内容按照相似度高低排序)private static void comparator(List<EtlSearchCompanyResponseDto> data) {Collections.sort(data, new Comparator<EtlSearchCompanyResponseDto>() {@Overridepublicint compare(EtlSearchCompanyResponseDto o1, EtlSearchCompanyResponseDto o2) {int cp = 0;if (o1.getMatching() > o2.getMatching()) {cp = -1;} else if (o1.getMatching() < o2.g4000etMatching()) {cp = 1;}return cp;}});}}现在构建一个SearchService请看下面代码,
package com.frank.demo.service;

// JDK utilities
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

// JPA criteria queries (Spring Boot with Hibernate)
import javax.persistence.criteria.CriteriaBuilder;
import javax.persistence.criteria.CriteriaQuery;
import javax.persistence.criteria.Predicate;
import javax.persistence.criteria.Root;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Sort;
import org.springframework.data.domain.Sort.Direction;
import org.springframework.data.jpa.domain.Specification;
import org.springframework.stereotype.Service;

// Word segmenter
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;

// DAOs, entities, DTOs and utilities used below; the article focuses on the methods,
// so these supporting classes are not listed.
import com.frank.demo.dao.EtlDataT1004Dao;
import com.frank.demo.dao.EtlDataT1009Dao;
import com.frank.demo.dao.EtlDataT1022Dao;
import com.frank.demo.dto.EtlCreatDueDiligenceRequestDto;
import com.frank.demo.dto.EtlSearchCompanyResponseDto;
import com.frank.demo.entity.EtlDataT1004;
import com.frank.demo.entity.EtlDataT1009;
import com.frank.demo.entity.EtlDataT1022;
import com.frank.demo.util.api.ApiResponse;
import com.frank.demo.util.dto.v1.PageRequestDto;
import com.frank.demo.util.PageUtil;

/**
 * Company-name search over three tables using word segmentation.
 *
 * <p>The requested company name is split into segments; each table is queried for rows in
 * which any searched column contains any segment; every hit is scored by the number of
 * segments it contains; all hits are then sorted and paged in memory. In-memory paging is
 * acceptable while the data set is modest; for millions of rows a search engine is the
 * better choice.
 */
@Service
public class SearchService {

    @Autowired
    EtlDataT1004Dao etlDataT1004Dao;
    @Autowired
    EtlDataT1009Dao etlDataT1009Dao;
    @Autowired
    EtlDataT1022Dao etlDataT1022Dao;

    /**
     * Searches the three data sources for companies whose name resembles the requested one
     * and returns one page of the scored results.
     *
     * @param request carries the company name to search for
     * @param page    paging request; {@code getPage()} is treated as one-based
     * @return a map holding the status message, the requested page of results and a
     *         {@code pageCount} entry
     */
    public Map<String, Object> searchCompany(EtlCreatDueDiligenceRequestDto request, PageRequestDto page) {
        Map<String, Object> response = new LinkedHashMap<String, Object>();
        response.put(ApiResponse.KEY_MESSAGE, ApiResponse.MESSAGE_OK);

        // Segments are kept in a local variable: a Spring @Service is a singleton by default,
        // so storing them in an instance field would let concurrent requests overwrite each
        // other's search terms.
        final List<Word> words = WordSegmenter.segWithStopWords(request.getCompanyName());

        // ArrayList rather than LinkedList: the result is sorted and then accessed by index.
        List<EtlSearchCompanyResponseDto> data = new ArrayList<>();

        // The data comes from three places (Shanghai Stock Exchange, Shenzhen Stock Exchange
        // and the SME equity transfer system). NOTE(review): which table belongs to which
        // exchange is inferred from the original local names (sh/sz/gz) - confirm.
        collectFromT1004(words, data);
        collectFromT1009(words, data);
        collectFromT1022(words, data);

        // Sort, then page in memory. NOTE(review): searchCompanyComparator is expected to
        // order by matching score, highest first - verify against PageUtil.
        PageUtil.searchCompanyComparator(data);

        int pageSize = page.getSize();
        // Requests for page 0 or below are served the first page instead of failing on a
        // negative index.
        int pageIndex = Math.max(page.getPage() - 1, 0);
        List<EtlSearchCompanyResponseDto> pages = pageSize > 0
                ? PageUtil.splitList(data, pageSize, pageIndex)
                : new ArrayList<EtlSearchCompanyResponseDto>();
        response.put(ApiResponse.KEY_DATA, pages);

        // Ceiling division; a non-positive page size yields zero pages rather than an
        // ArithmeticException.
        int pageCount = 0;
        if (pageSize > 0) {
            pageCount = data.size() / pageSize;
            if (data.size() % pageSize != 0) {
                pageCount++;
            }
        }
        Map<String, Object> pageMap = new LinkedHashMap<>();
        pageMap.put("pageCount", pageCount);
        response.put(ApiResponse.KEY_PAGE, pageMap);
        return response;
    }

    /** Queries table T1004 on column f8 and appends the scored hits (source type "1"). */
    private void collectFromT1004(List<Word> words, List<EtlSearchCompanyResponseDto> data) {
        List<EtlDataT1004> rows = etlDataT1004Dao.findAll(
                SearchService.<EtlDataT1004>anyWordLike(words, "f8"), new Sort(Direction.ASC, "f8"));
        for (EtlDataT1004 t1004 : rows) {
            String f8 = t1004.getF8();
            // f8 holds two "/"-separated values; either part may be missing, in which case
            // an empty string is used instead of failing on the array index.
            String[] parts = (f8 == null) ? new String[0] : f8.split("/");
            String first = parts.length > 0 ? parts[0] : "";
            String second = parts.length > 1 ? parts[1] : "";
            EtlSearchCompanyResponseDto dto =
                    new EtlSearchCompanyResponseDto(first, second, t1004.getF1(), "1", t1004.getF9());
            dto.setCompanyLegal(t1004.getF11());
            dto.setMatching(countMatchingWords(words, f8));
            data.add(dto);
        }
    }

    /** Queries table T1009 on columns f3 and f4 and appends the scored hits (source type "2"). */
    private void collectFromT1009(List<Word> words, List<EtlSearchCompanyResponseDto> data) {
        List<EtlDataT1009> rows = etlDataT1009Dao.findAll(
                SearchService.<EtlDataT1009>anyWordLike(words, "f3", "f4"), new Sort(Direction.ASC, "f3"));
        for (EtlDataT1009 t1009 : rows) {
            EtlSearchCompanyResponseDto dto = new EtlSearchCompanyResponseDto(
                    t1009.getF3(), t1009.getF4(), t1009.getF1(), "2", t1009.getF5());
            dto.setMatching(countMatchingWords(words, t1009.getF3(), t1009.getF4()));
            data.add(dto);
        }
    }

    /** Queries table T1022 on columns f11 and f12 and appends the scored hits (source type "3"). */
    private void collectFromT1022(List<Word> words, List<EtlSearchCompanyResponseDto> data) {
        List<EtlDataT1022> rows = etlDataT1022Dao.findAll(
                SearchService.<EtlDataT1022>anyWordLike(words, "f11", "f12"), new Sort(Direction.ASC, "f11"));
        for (EtlDataT1022 t1022 : rows) {
            EtlSearchCompanyResponseDto dto = new EtlSearchCompanyResponseDto(
                    t1022.getF11(), t1022.getF12(), t1022.getF1(), "3", t1022.getF14());
            dto.setCompanyLegal(t1022.getF15());
            dto.setMatching(countMatchingWords(words, t1022.getF11(), t1022.getF12()));
            data.add(dto);
        }
    }

    /**
     * Builds a specification that matches rows in which any of {@code fields} contains any
     * of {@code words} (an OR over {@code LIKE '%word%'} predicates).
     */
    private static <T> Specification<T> anyWordLike(final List<Word> words, final String... fields) {
        return new Specification<T>() {
            @Override
            public Predicate toPredicate(Root<T> root, CriteriaQuery<?> query, CriteriaBuilder cb) {
                List<Predicate> predicates = new ArrayList<>();
                for (Word word : words) {
                    String pattern = "%" + word.getText() + "%";
                    for (String field : fields) {
                        predicates.add(cb.like(root.get(field).as(String.class), pattern));
                    }
                }
                return cb.or(predicates.toArray(new Predicate[predicates.size()]));
            }
        };
    }

    /**
     * Counts how many of {@code words} occur in at least one of {@code fields}. A word is
     * counted once even if several fields contain it; {@code null} fields are skipped,
     * because a row can match the query on one column while another column is null.
     */
    private static int countMatchingWords(List<Word> words, String... fields) {
        int matches = 0;
        for (Word word : words) {
            String text = word.getText();
            for (String field : fields) {
                if (field != null && field.contains(text)) {
                    matches++;
                    break;
                }
            }
        }
        return matches;
    }
}
使用word分词器的朋友请注意:word分词器初次调用时会加载词库,所以建议大家在项目启动时先主动调用一下分词器的接口(预热),这样在实际使用分词时就不必等待很长时间。词库正常加载后,本例经测试在10万级别的数据量下返回时间在1s以内。有疑问的朋友可以在评论中留言,看到会第一时间回复!喜欢的朋友可以关注我的个人微信公众号,会同步更新相应技术,二维码见下图。萌萌技术
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐