您的位置:首页 > 其它

solr入门之pinyin4j的源码改写初尝试

2016-04-06 19:09 489 查看
目标:

pinyin4j中收录了很多的词,但是也有一些词语是未被收录的,目前想做到的效果是

能将未收录的词语收录进去,而且不需要重新启动服务

===================================================

源码观看

// Entry point called from the tests: look up the pinyin readings of character c using the given format
String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);

/**
 * Returns the pinyin readings of {@code ch}, formatted according to {@code outputFormat}.
 * Pure delegate to {@code getFormattedHanyuPinyinStringArray}.
 *
 * @param ch the character to look up
 * @param outputFormat the formatting rules applied to every reading
 * @return whatever {@code getFormattedHanyuPinyinStringArray} returns for the same arguments
 * @throws BadHanyuPinyinOutputFormatCombination propagated from the formatting step
 */
public static String[] toHanyuPinyinStringArray(char ch, HanyuPinyinOutputFormat outputFormat)
        throws BadHanyuPinyinOutputFormatCombination {
    final String[] formatted = getFormattedHanyuPinyinStringArray(ch, outputFormat);
    return formatted;
}

/**
 * Looks up the raw readings of {@code ch} and formats each one in place.
 *
 * @param ch the character to look up
 * @param outputFormat the formatting rules handed to {@code PinyinFormatter.formatHanyuPinyin}
 * @return the formatted readings, or {@code ARR_EMPTY} when the lookup yields {@code null}
 * @throws BadHanyuPinyinOutputFormatCombination propagated from the formatter
 */
private static String[] getFormattedHanyuPinyinStringArray(char ch,
        HanyuPinyinOutputFormat outputFormat) throws BadHanyuPinyinOutputFormatCombination {
    final String[] readings = getUnformattedHanyuPinyinStringArray(ch);

    // No entry for this character: hand back the shared empty array.
    if (readings == null) {
        return ARR_EMPTY;
    }

    // Overwrite every raw reading with its formatted form; the same array is returned.
    int index = 0;
    while (index < readings.length) {
        readings[index] = PinyinFormatter.formatHanyuPinyin(readings[index], outputFormat);
        index++;
    }
    return readings;
}

/**
 * Fetches the raw (unformatted) readings of {@code ch} from the shared resource.
 * Obtaining the singleton here is the important step: it is what gets the
 * dictionary resource initialized.
 */
private static String[] getUnformattedHanyuPinyinStringArray(char ch) {
    final ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();
    return resource.getHanyuPinyinStringArray(ch);
}

/**
 * Singleton accessor.
 *
 * @return the single instance held by {@code ChineseToPinyinResourceHolder}
 */
static ChineseToPinyinResource getInstance() {
    final ChineseToPinyinResource shared = ChineseToPinyinResourceHolder.theInstance;
    return shared;
}

/**
 * Singleton implementation helper: owns the one {@code ChineseToPinyinResource}
 * instance, which is created when this holder class is initialized.
 */
private static class ChineseToPinyinResourceHolder {
    static final ChineseToPinyinResource theInstance;

    static {
        theInstance = new ChineseToPinyinResource();
    }
}

/**
 * Initialization: the private constructor loads the dictionaries as soon as
 * the instance is created.
 */
private ChineseToPinyinResource() {
    super();
    this.initializeResource();
}

/**
 * Initializes the table of &lt;Unicode, HanyuPinyin&gt; pairs.
 *
 * <p>A fresh {@code Trie} is installed and then filled in three steps: the
 * single-character dictionary, the bundled multi-character dictionary, and the
 * user-supplied multi-pinyin extension. I/O failures are printed and otherwise
 * ignored, so the table may be left partially filled.
 */
private void initializeResource() {
    final String singleCharDictionary = "/pinyindb/unicode_to_hanyu_pinyin.txt";
    final String multiCharDictionary = "/pinyindb/multi_pinyin.txt";

    try {
        setUnicodeToHanyuPinyinTable(new Trie());

        // Single-character entries. The blog author's extension simply runs
        // this same load step once more with a custom file.
        getUnicodeToHanyuPinyinTable().load(
                ResourceHelper.getResourceInputStream(singleCharDictionary));

        // Multi-character (phrase) entries.
        getUnicodeToHanyuPinyinTable().loadMultiPinyin(
                ResourceHelper.getResourceInputStream(multiCharDictionary));

        // User-defined multi-pinyin extension entries.
        getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();
    } catch (IOException ex) {
        // FileNotFoundException is an IOException, so this single handler
        // covers both cases.
        ex.printStackTrace();
    }
}

/**
 * Lookup-tree node for pinyin dictionary entries; most of the loading logic lives here.
 *
 * <p>Each node maps a string key to a child node. A child node carries the pinyin stored
 * for the key that ends on it and, for multi-character phrases, a "next" node that
 * indexes the following character of the phrase.
 */
public class Trie {

    /**
     * Charset used to decode every dictionary stream. Decoding explicitly avoids
     * depending on the platform default charset, which breaks phrase dictionaries
     * containing Chinese characters on platforms whose default is not UTF-8.
     */
    private static final String DICTIONARY_CHARSET = "UTF-8";

    /** Children of this node, keyed by the dictionary key (upper-case hex for phrase entries). */
    private Hashtable<String, Trie> values = new Hashtable<String, Trie>();

    /** Pinyin stored on this node; {@code null} until one is assigned. */
    private String pinyin;

    /** Node used to match the next character of a phrase; {@code null} if there is none. */
    private Trie nextTire;

    public String getPinyin() {
        return pinyin;
    }

    public void setPinyin(String pinyin) {
        this.pinyin = pinyin;
    }

    public Trie getNextTire() {
        return nextTire;
    }

    public void setNextTire(Trie nextTire) {
        this.nextTire = nextTire;
    }

    /**
     * Loads single-key pinyin entries.
     *
     * <p>Every line must consist of exactly two space-separated fields, key and pinyin;
     * other lines are skipped. A later entry with the same key replaces the earlier one.
     * The stream is always closed before this method returns.
     *
     * @param inStream pinyin dictionary stream, decoded as UTF-8
     * @throws IOException if reading or closing the stream fails
     */
    public synchronized void load(InputStream inStream) throws IOException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(inStream, DICTIONARY_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] keyAndValue = line.split(" ");
                if (keyAndValue.length != 2) {
                    continue;
                }
                Trie entry = new Trie();
                entry.pinyin = keyAndValue[1];
                put(keyAndValue[0], entry);
            }
        } finally {
            close(reader, inStream);
        }
    }

    /**
     * Loads the multi-character (polyphone phrase) dictionary.
     *
     * <p>Every line must consist of exactly two space-separated fields: the phrase and
     * its pinyin. The phrase is walked one {@code char} at a time; each char is looked
     * up (and created if missing) under its upper-case hex code, and the pinyin is
     * stored on the node of the last char. The stream is always closed before this
     * method returns.
     *
     * @param inStream phrase dictionary stream, decoded as UTF-8
     * @throws IOException if reading or closing the stream fails
     */
    public synchronized void loadMultiPinyin(InputStream inStream) throws IOException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(inStream, DICTIONARY_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] keyAndValue = line.split(" ");
                if (keyAndValue.length != 2) {
                    continue;
                }
                insertPhrase(keyAndValue[0].toCharArray(), keyAndValue[1]);
            }
        } finally {
            close(reader, inStream);
        }
    }

    /**
     * Loads the user-defined extension dictionary named by
     * {@code MultiPinyinConfig.multiPinyinPath}; does nothing when the path is
     * {@code null} or the file does not exist.
     *
     * @throws IOException if reading the file fails
     */
    public void loadMultiPinyinExtend() throws IOException {
        String path = MultiPinyinConfig.multiPinyinPath;
        if (path != null) {
            File userMultiPinyinFile = new File(path);
            if (userMultiPinyinFile.exists()) {
                // loadMultiPinyin closes the stream it is given.
                loadMultiPinyin(new FileInputStream(userMultiPinyinFile));
            }
        }
    }

    public Trie get(String hexString) {
        return values.get(hexString);
    }

    public void put(String s, Trie trie) {
        values.put(s, trie);
    }

    /** Inserts one phrase below this node and stores {@code value} on its last character. */
    private void insertPhrase(char[] keys, String value) {
        Trie level = this;
        for (int i = 0; i < keys.length; i++) {
            String hexString = Integer.toHexString(keys[i]).toUpperCase();

            Trie node = level.get(hexString);
            if (node == null) {
                // Unknown character at this level: register an empty node for it.
                node = new Trie();
                level.put(hexString, node);
            }

            if (i == keys.length - 1) {
                // Last character of the phrase carries the phrase pinyin.
                node.pinyin = value;
                return;
            }

            // Descend to the level that indexes the next character, creating it on demand.
            Trie next = node.getNextTire();
            if (next == null) {
                next = new Trie();
                node.setNextTire(next);
            }
            level = next;
        }
    }

    /** Closes the reader, or the raw stream when the reader could not be created. */
    private static void close(BufferedReader reader, InputStream inStream) throws IOException {
        if (reader != null) {
            reader.close();
        } else if (inStream != null) {
            inStream.close();
        }
    }
}


准备工作--测试自定义词典文件的读写与加载方式

package com.git.pinyin;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;

/**
 * Scratch class for experimenting with the custom pinyin extension file: reads
 * "hexCode pinyin" lines from it and appends new lines to it. The idea is that
 * the file can be soft-reloaded, e.g. by re-reading it periodically.
 *
 * @author songqinghu
 */
public class ReadAndWriteTest {

    /** Classpath location of the custom extension dictionary. */
    private static final String EXT_RESOURCE = "/pinyindb/gome_hanyu_pinyin_ext.txt";

    // Intended in-memory copy of the file contents; nothing populates it yet.
    private final static Map<String,String> dict = new HashMap<String,String>();

    public static void main(String[] args) throws IOException {
        // Usage examples kept from the original experiment:
        //   readText(EXT_RESOURCE);
        //   Map<String, String> map = new HashMap<String, String>();
        //   map.put("我", "wo");
        //   writeText(map);
        // Note from the author: 龦 (9FA6, cháng) is not in the dictionary and is
        // the character used for testing later on.
        unicodeTohanzi("3007");
    }

    /** Prints the character whose code is the given hexadecimal string. */
    private static void unicodeTohanzi(String unicode){
        int code = Integer.parseInt(unicode, 16);
        System.out.println((char)code);
    }

    /**
     * Appends one line per map entry to the extension file, in the form
     * {@code HEX (pinyin)}, where HEX is the upper-case hex code of the first
     * character of the key and the value is the pinyin list (xxx,xxx,xxx).
     * Entries whose key is {@code null} or empty are skipped.
     *
     * @param content words mapped to their comma-separated pinyin
     * @throws IOException if the file is missing from the classpath or cannot be written
     */
    public static void writeText(Map<String,String> content) throws IOException{
        java.net.URL resource = ReadAndWriteTest.class.getResource(EXT_RESOURCE);
        if (resource == null) {
            throw new FileNotFoundException("Resource not found on classpath: " + EXT_RESOURCE);
        }
        String path = resource.getPath();
        System.out.println(path);

        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path, true)));
        try {
            for (Entry<String, String> entry : content.entrySet()) {
                String word = entry.getKey();
                if (word == null || word.length() == 0) {
                    continue;
                }
                // Only the first character is encoded. Upper case matches the key
                // format PYWriterUtils writes into the same file.
                String key = Integer.toHexString(word.charAt(0)).toUpperCase();
                writer.write(key + " (" + entry.getValue() + ")");
                writer.newLine();
            }
            writer.flush();
        } finally {
            // Close even when a write fails so the file handle is not leaked.
            writer.close();
        }
        System.out.println("====");
    }

    /**
     * Reads the given classpath resource line by line and prints the code, the
     * pinyin and the decoded character of every well-formed line. Lines that do
     * not have exactly two space-separated fields, or whose first field is not
     * a hexadecimal number, are skipped.
     *
     * @param path classpath location, e.g. "/pinyindb/gome_hanyu_pinyin_ext.txt"
     */
    private static synchronized void readText(String path){
        if (!StringUtils.isNotBlank(path)) {
            return;
        }
        InputStream stream = ReadAndWriteTest.class.getResourceAsStream(path);
        if (stream == null) {
            // getResourceAsStream returns null for a missing resource.
            System.err.println("Resource not found on classpath: " + path);
            return;
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] values = line.split(" ");
                if (values.length != 2) {
                    continue;
                }
                String unicode = values[0];
                int code;
                try {
                    code = Integer.parseInt(unicode, 16);
                } catch (NumberFormatException malformed) {
                    continue; // not a hex code: ignore this line
                }
                char ch = (char) code;
                String pinyin = values[1];
                System.out.println("编码后的字符: " + unicode + "  对应的拼音:"+ pinyin);
                System.out.println(ch);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}


3.简单的向源码中添加类

从git上下载后,改写的源码地址

链接:http://pan.baidu.com/s/1i5zvYfz 密码:zodu

package com.gome.mx.plus.pinyin.ext;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Utility that converts Chinese text into its full pinyin spelling(s).
 *
 * @author songqinghu
 */
public class PYReadUtils {

    /**
     * Returns every full-pinyin spelling of {@code words}.
     *
     * <p>Characters with a code of 128 or below are ignored, and so are characters
     * for which pinyin4j reports no reading. Because a character may be a
     * polyphone, the readings of all remaining characters are combined in order,
     * so several spellings may be returned. Duplicates are removed and the order
     * follows the order in which pinyin4j lists the readings.
     *
     * @param words the text to convert; may be {@code null}
     * @return the distinct spellings (lower case, without tone), or {@code null}
     *         when no character of the input has a known reading
     * @throws BadHanyuPinyinOutputFormatCombination propagated from pinyin4j
     */
    public static String[] getFullPY(String words) throws BadHanyuPinyinOutputFormatCombination{
        if (words == null || words.length() == 0) {
            return null;
        }

        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        ArrayList<String> spellings = null;
        for (char c : words.toCharArray()) {
            if (c <= 128) {
                continue; // not a Chinese character: filtered out
            }
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(c, format);
            if (readings == null || readings.length == 0) {
                continue; // character unknown to the dictionary
            }
            ArrayList<String> distinct = distinct(readings);
            spellings = (spellings == null) ? distinct : crossJoin(spellings, distinct);
        }

        if (spellings == null || spellings.isEmpty()) {
            return null;
        }
        return spellings.toArray(new String[spellings.size()]);
    }

    /** Returns the readings without duplicates, keeping first-seen order. */
    private static ArrayList<String> distinct(String[] readings){
        ArrayList<String> unique = new ArrayList<String>(readings.length);
        for (String reading : readings) {
            if (!unique.contains(reading)) {
                unique.add(reading);
            }
        }
        return unique;
    }

    /** Appends every suffix to every prefix, dropping duplicate results. */
    private static ArrayList<String> crossJoin(ArrayList<String> prefixes, ArrayList<String> suffixes){
        ArrayList<String> joined = new ArrayList<String>(prefixes.size() * suffixes.size());
        for (String prefix : prefixes) {
            for (String suffix : suffixes) {
                String candidate = prefix + suffix;
                if (!joined.contains(candidate)) {
                    joined.add(candidate);
                }
            }
        }
        return joined;
    }

}


package com.gome.mx.plus.pinyin.ext;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import net.sourceforge.pinyin4j.ChineseToPinyinResource;
import net.sourceforge.pinyin4j.multipinyin.Trie;
/**
 * Writes a Chinese character and its pinyin into a configurable extension file
 * and reloads that file into the running pinyin4j table, so no restart is needed.
 * The caller chooses whether to append to or overwrite the file, and whether the
 * readings already known for the character are merged into the written entry.
 *
 * @author songqinghu
 */
public class PYWriterUtils {

    /** Classpath location of the extension file. */
    private static String path  = "/pinyindb/gome_hanyu_pinyin_ext.txt";

    /**
     * Sets the classpath location of the extension file.
     *
     * @param path new classpath location
     */
    public static void setPath(String path){
        PYWriterUtils.path = path;
    }

    /**
     * Default write mode: append to the file and merge the already known readings.
     *
     * @param word   Chinese character (only the first character is used)
     * @param pinyin pinyin without tone
     * @param voice  tone number
     * @return {@code true} if an entry was written
     * @throws Exception if the file is missing or cannot be written
     */
    public static boolean dufaultWriter(String word,String pinyin,Integer voice) throws Exception{
        return writerControler(word, pinyin, voice, true, true);
    }

    /**
     * Writes one entry of the form {@code HEX (pinyin,...)} for the first character
     * of {@code word}. This handles a single character only; there is no batch mode.
     *
     * <p>The file is opened only after the input has been validated, so an invalid
     * call never truncates it.
     *
     * @param word       Chinese character (only the first character is used)
     * @param pinyin     pinyin without tone
     * @param voice      tone number appended to {@code pinyin}; ignored when {@code null}
     * @param additional {@code true} to append, {@code false} to overwrite the file
     * @param merge      whether readings already in the table are placed before the new one
     * @return {@code true} if an entry was written; {@code false} when {@code word} is
     *         {@code null}, empty or does not start with a character above code 128,
     *         or when {@code pinyin} is {@code null}
     * @throws Exception if the file is missing from the classpath or cannot be written
     */
    public static boolean writerControler(String word,String pinyin,Integer voice,
            boolean additional ,boolean merge) throws Exception{
        if (word == null || word.length() == 0 || pinyin == null) {
            return false;
        }
        char c = word.charAt(0);
        if (c <= 128) {
            return false; // not a Chinese character
        }

        // Append the tone number, e.g. "biao" + 1.
        if (voice != null) {
            pinyin = pinyin + voice;
        }

        String unicode = Integer.toHexString(c).toUpperCase();
        if (merge) {
            String existing = existingReadings(unicode);
            if (existing != null) {
                pinyin = existing + Field.COMMA + pinyin;
            }
        }
        String line = unicode + Field.SPACE + addSymbol(pinyin);

        java.net.URL resource = PYWriterUtils.class.getResource(path);
        if (resource == null) {
            throw new IOException("Pinyin extension file not found on classpath: " + path);
        }
        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(resource.getPath(), additional)));
        try {
            writer.write(line);
            writer.newLine();
            writer.flush();
        } finally {
            // Close even when the write fails so the file handle is not leaked.
            writer.close();
        }
        System.out.println(line);
        return true;
    }

    /**
     * Reloads the extension file into the live pinyin4j table. Call this after
     * the file has been changed.
     *
     * @return {@code true} once the file has been loaded
     * @throws IOException if the file is missing from the classpath or cannot be read
     */
    public static boolean reloadText() throws IOException{
        InputStream is = PYWriterUtils.class.getResourceAsStream(path);
        if (is == null) {
            throw new IOException("Pinyin extension file not found on classpath: " + path);
        }
        ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().load(is);
        return true;
    }

    /**
     * Returns the readings currently stored for {@code unicode} with the
     * surrounding parentheses removed, or {@code null} when the table has no
     * (non-empty) entry for it.
     */
    private static String existingReadings(String unicode){
        Trie node = ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().get(unicode);
        if (node == null || node.getPinyin() == null) {
            return null; // brand-new character: nothing to merge
        }
        String stored = node.getPinyin().trim();
        if (stored.length() >= 2
                && stored.startsWith(Field.LEFT_BRACKET)
                && stored.endsWith(Field.RIGHT_BRACKET)) {
            stored = stored.substring(1, stored.length() - 1);
        }
        return stored.length() == 0 ? null : stored;
    }

    /** Wraps the pinyin list in parentheses. */
    private static String addSymbol(String pinyin){
        return Field.LEFT_BRACKET+pinyin+Field.RIGHT_BRACKET;
    }

    /** Separators used in dictionary lines. */
    static class Field {
        static final String LEFT_BRACKET = "(";

        static final String RIGHT_BRACKET = ")";

        static final String COMMA = ",";

        static final String SPACE = " ";
    }
}


package com.gome.mx.plus.pinyin.ext;

/**
 * Tone numbers (1 to 4) that can be attached to a pinyin syllable.
 */
public enum Voice {

    One(1),
    Two(2),
    Three(3),
    Four(4);

    /** Numeric form of this tone. */
    private final Integer number;

    Voice(Integer number) {
        this.number = number;
    }

    /**
     * @return the numeric form of this tone
     */
    public Integer getValue() {
        return this.number;
    }
}


测试类--第一次写入后不重新加载,读取不到;调用 reloadText 重新加载后即可读取到

说明:默认自定义的文件地址为:path = "/pinyindb/gome_hanyu_pinyin_ext.txt" 即和pinyin4j的字典在相同的目录下

package com.gome.mx.plus.pinyin.ext;

import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Manual check of the write / reload cycle: writes an entry to the extension
 * file, looks the character up, reloads the file and looks it up again. Per the
 * author's note, the first lookup is expected to miss and the second to hit.
 */
public class MyTest {

    public static void main(String[] args) throws BadHanyuPinyinOutputFormatCombination {
        final String target = "骉";
        try {
            // Write the new character into the extension file.
            PYWriterUtils.writerControler(target, "test", Voice.Two.getValue(), true, true);

            // Lookup before the file has been reloaded.
            report(PYReadUtils.getFullPY(target));

            // Reload the extension file, then look the character up again.
            PYWriterUtils.reloadText();
            report(PYReadUtils.getFullPY(target));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Prints whether a lookup found anything, followed by each spelling found. */
    private static void report(String[] spellings) {
        if (spellings == null) {
            System.out.println("没有查到");
            return;
        }
        System.out.println("查到");
        for (String spelling : spellings) {
            System.out.println(spelling);
        }
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: