您的位置:首页 > 编程语言 > Java开发

java解析搜狗词库scel文件到txt

2016-11-03 11:49 295 查看


SougouScelReader 读取词库文件类
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Reads a Sogou lexicon ({@code .scel}) file into a {@link SougouScelMdel}.
 *
 * <p>Layout as parsed by this class (all strings are UTF-16LE, all integers
 * little-endian):
 * <ul>
 *   <li>offset 0x0: 4 magic bytes {@code 40 15 00 00}, then a flag byte
 *       followed by {@code 43 53 01};</li>
 *   <li>offsets 0x130 / 0x338 / 0x540 / 0xd40: zero-terminated name, type,
 *       description and sample strings;</li>
 *   <li>offset 0x1540: 4 bytes {@code 9D 01 00 00}, then the pinyin table
 *       (index, byte length, text), read up to and including "zuo";</li>
 *   <li>offset 0x2628 (flag 0x44) or 0x26C4 (flag 0x45): the word table,
 *       read until end of stream.</li>
 * </ul>
 *
 * <p>Instances reuse an internal buffer ({@link #output}) and are therefore
 * not safe for concurrent use.
 *
 * @author dengjh
 * @create 2016-11-03 9:39
 */
public class SougouScelReader {

    /** Name of the charset used to decode every string in the file. */
    protected static String encoding = "UTF-16LE";

    /** Scratch buffer reused by {@link #readString} between calls. */
    protected ByteArrayOutputStream output = new ByteArrayOutputStream();

    /**
     * Parses the given lexicon file.
     *
     * @param file the {@code .scel} file to read
     * @return the parsed lexicon
     * @throws IOException if the file cannot be read or is not a valid lexicon
     */
    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    /**
     * Parses the lexicon found at the given URL.
     *
     * @param url location of the {@code .scel} data
     * @return the parsed lexicon
     * @throws IOException if the data cannot be read or is not a valid lexicon
     */
    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    /**
     * Skips forward to absolute offset {@code pos} and reads a zero-terminated
     * UTF-16LE string.
     *
     * @param input stream positioned at absolute offset {@code reads[0]}
     * @param pos   absolute offset at which the string starts
     * @param reads one-element array holding the current absolute offset; updated
     *              to the offset just past the string terminator
     * @return the decoded string, without the terminator
     * @throws EOFException if the stream ends before the terminator is found
     * @throws IOException  if {@code pos} lies before the current offset or reading fails
     */
    protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
        skipFully(input, pos - reads[0]);
        int read = pos;
        output.reset();
        while (true) {
            int c1 = input.read();
            int c2 = input.read();
            // Without this check an unterminated string would loop forever, because
            // read() keeps returning -1 and never matches the 0/0 terminator.
            if ((c1 | c2) < 0) {
                throw new EOFException("Unterminated string at offset 0x" + Integer.toHexString(pos));
            }
            read += 2;
            if (c1 == 0 && c2 == 0) {
                break;
            }
            output.write(c1);
            output.write(c2);
        }
        reads[0] = read;
        return new String(output.toByteArray(), encoding);
    }

    /**
     * Parses a lexicon from the given stream. The stream is always closed before
     * this method returns, whether parsing succeeds or not.
     *
     * @param in stream positioned at the first byte of the {@code .scel} data
     * @return the parsed lexicon
     * @throws IOException      if the data is truncated, malformed, or cannot be read
     * @throws RuntimeException if the header flag byte is neither 0x44 nor 0x45
     */
    public SougouScelMdel read(InputStream in) throws IOException {
        SougouScelMdel model = new SougouScelMdel();
        // Buffer the stream: the parser reads one or two bytes at a time.
        DataInputStream input = new DataInputStream(new BufferedInputStream(in));
        try {
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            if (!(bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0)) {
                throw new IOException("Not a scel file: bad magic number");
            }
            input.readFully(bytes);
            int flag1 = bytes[0];
            if (!(bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01)) {
                throw new IOException("Not a scel file: bad header signature");
            }

            // reads[0] tracks the absolute offset; 8 header bytes consumed so far.
            int[] reads = new int[]{8};
            model.setName(readString(input, 0x130, reads));
            model.setType(readString(input, 0x338, reads));
            model.setDescription(readString(input, 0x540, reads));
            model.setSample(readString(input, 0xd40, reads));

            skipFully(input, 0x1540 - reads[0]);
            int read = 0x1540;
            input.readFully(bytes);
            read += 4;
            if (!(bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0)) {
                throw new IOException("Corrupt scel file: bad pinyin table header");
            }

            bytes = new byte[128];
            Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
            while (true) {
                int mark = readUnsignedShort(input);
                // The length is a full 16-bit value; read both bytes rather than
                // dropping the high byte, and grow the buffer when needed.
                int size = readUnsignedShort(input);
                if ((mark | size) < 0) {
                    throw new EOFException("Truncated pinyin table");
                }
                read += 4;
                if (size == 0 || (size & 1) != 0) {
                    throw new IOException("Corrupt pinyin table: invalid entry length " + size);
                }
                if (size > bytes.length) {
                    bytes = new byte[size];
                }
                input.readFully(bytes, 0, size);
                read += size;
                String py = new String(bytes, 0, size, encoding);
                pyMap.put(mark, py);
                // "zuo" is treated as the last entry of the pinyin table.
                if ("zuo".equals(py)) {
                    break;
                }
            }

            // The word table offset depends on the flag byte from the header.
            if (flag1 == 0x44) {
                skipFully(input, 0x2628 - read);
            } else if (flag1 == 0x45) {
                skipFully(input, 0x26C4 - read);
            } else {
                throw new RuntimeException("出现意外,联系作者");
            }

            model.setWordMap(readWordTable(input, pyMap));
            return model;
        } finally {
            in.close();
        }
    }

    /**
     * Reads word-table entries until end of stream. Each entry is: number of
     * words, byte length of the pinyin index list, the pinyin indices, then for
     * every word its byte length, its text and 12 trailing bytes.
     *
     * @param input stream positioned at the first word-table entry
     * @param pyMap pinyin index to pinyin text
     * @return words grouped by their apostrophe-joined pinyin, in file order
     * @throws IOException if an entry is truncated or malformed
     */
    private Map<String, List<String>> readWordTable(DataInputStream input, Map<Integer, String> pyMap)
            throws IOException {
        Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
        StringBuilder pinyin = new StringBuilder();
        byte[] bytes = new byte[128];
        while (true) {
            int size = readUnsignedShort(input);
            if (size < 0) {
                break; // end of stream on an entry boundary: normal termination
            }
            int count = readUnsignedShort(input);
            if (count < 0) {
                throw new EOFException("Truncated word table entry");
            }
            if ((count & 1) != 0) {
                throw new IOException("Corrupt word table: odd pinyin index length " + count);
            }
            pinyin.setLength(0);
            for (int i = 0, len = count / 2; i < len; i++) {
                int key = readUnsignedShort(input);
                if (key < 0) {
                    throw new EOFException("Truncated pinyin index list");
                }
                if (i > 0) {
                    pinyin.append('\'');
                }
                // An index missing from the table is appended as "null", as before.
                pinyin.append(pyMap.get(key));
            }
            String py = pinyin.toString();
            List<String> list = wordMap.get(py);
            if (list == null) {
                list = new ArrayList<String>();
                wordMap.put(py, list);
            }
            for (int i = 0; i < size; i++) {
                int wordLength = readUnsignedShort(input);
                if (wordLength < 0) {
                    throw new EOFException("Truncated word");
                }
                if (wordLength > bytes.length) {
                    bytes = new byte[wordLength];
                }
                input.readFully(bytes, 0, wordLength);
                list.add(new String(bytes, 0, wordLength, encoding));
                // The next 12 bytes may be word frequency or similar information.
                // Best-effort skip: a short trailer on the final word is tolerated.
                skipAvailable(input, 12);
            }
        }
        return wordMap;
    }

    /**
     * Reads a little-endian unsigned 16-bit value.
     *
     * @param in stream to read from
     * @return the value in the range 0..65535, or {@link Integer#MIN_VALUE} if the
     *         stream ended before two bytes could be read
     * @throws IOException if reading fails
     */
    protected final int readUnsignedShort(InputStream in) throws IOException {
        int low = in.read();
        int high = in.read();
        if ((low | high) < 0) {
            return Integer.MIN_VALUE;
        }
        return (high << 8) + low;
    }

    /** Skips up to {@code n} bytes, stopping early only at end of stream; returns the number skipped. */
    private static long skipAvailable(InputStream in, long n) throws IOException {
        long remaining = n;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() < 0) {
                break; // skip() may legitimately return 0; only read() proves EOF
            } else {
                remaining--;
            }
        }
        return n - remaining;
    }

    /** Skips exactly {@code n} bytes, failing if {@code n} is negative or the stream ends first. */
    private static void skipFully(InputStream in, long n) throws IOException {
        if (n < 0) {
            throw new IOException("Corrupt scel file: section overruns the next one by " + (-n) + " bytes");
        }
        if (skipAvailable(in, n) != n) {
            throw new EOFException("Unexpected end of scel data");
        }
    }
}


SougouScelMdel.java 
import java.util.List;
import java.util.Map;

/**
 * Plain data holder for the contents of one Sogou lexicon file: four
 * descriptive strings and the words grouped under a string key.
 *
 * @author dengjh
 * @create 2016-11-03 9:40
 */
public class SougouScelMdel {

    private String name;
    private String type;
    private String description;
    private String sample;

    /** Words grouped by key; {@code null} until {@link #setWordMap} is called. */
    private Map<String, List<String>> wordMap;

    /** @return the lexicon name, or {@code null} if not set */
    public String getName() {
        return this.name;
    }

    /** @param name the lexicon name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the lexicon type, or {@code null} if not set */
    public String getType() {
        return this.type;
    }

    /** @param type the lexicon type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the lexicon description, or {@code null} if not set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the lexicon description */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text, or {@code null} if not set */
    public String getSample() {
        return this.sample;
    }

    /** @param sample the sample text */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the stored word map itself (not a copy), or {@code null} if not set */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    /**
     * Stores the given map by reference. Package-private.
     *
     * @param wordMap words grouped by key
     */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }

}


ParseSogo.java 解析词库文件类

 
/**
* 解析搜狗词库文件
*
* @author dengjh
* @create 2016-11-03 9:44
**/
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
public class ParseSogo {

public static void main(String[] args)throws Exception {

sogou("D:\\scel\\goods.scel","D:\\scel\\goods.txt",true);
}

/**
* 读取scel的词库文件
* 生成txt格式的文件
* @param inputPath 输入路径
* @param outputPath 输出路径
* @param isAppend  是否拼接追加词库内容 true 代表追加,false代表重建
*
* **/
private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{
File file=new File(inputPath);
if(!isAppend){
if(Files.exists(Paths.get(outputPath),LinkOption.values())){
System.out.println("存储此文件已经删除");
Files.deleteIfExists(Paths.get(outputPath));

}
}
RandomAccessFile raf=new RandomAccessFile(outputPath, "rw");

int count=0;
SougouScelMdel model = new SougouScelReader().read(file);
Map<String,List<String>> words = model.getWordMap(); //词<拼音,词>
Set<Entry<String,List&l
4000
t;String>>> set = words.entrySet();
Iterator<Entry<String,List<String>>> iter = set.iterator();
while(iter.hasNext()){
Entry<String,List<String>> entry = iter.next();
List<String> list = entry.getValue();
int size = list.size();
for(int i = 0; i < size; i++){
String word = list.get(i);

//System.out.println(word);
raf.seek(raf.getFilePointer());
raf.write((word+"\n").getBytes());//写入txt文件
count++;

}
}
raf.close();
System.out.println("生成txt成功!,总计写入: "+count+" 条数据!");
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  搜狗词库scel