您的位置:首页 > 其它

将 C4.5 的 .data 和 .names 文件转换为 ARFF 数据

2013-08-21 16:21 148 查看
package cn.ac.ict.ics.utils;

import lombok.Cleanup;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * Converts a C4.5-style data set (a {@code .names} header file plus a {@code .data} file)
 * into a Weka ARFF file.
 *
 * <p>Conventions implemented by this class:
 * <ul>
 *   <li>Header lines starting with {@code |} are comments and are skipped.</li>
 *   <li>Every other header line has the form {@code name: values}; the last header line is
 *       taken to be the class attribute and is always declared as {@code {-1,0,1}}.</li>
 *   <li>The last field of every data line is an integer label that is replaced by its sign
 *       ({@code -1}, {@code 0} or {@code 1}).</li>
 * </ul>
 *
 * <p>Originally created by qibaoyuan on 13-8-21.
 */
public class ArffConverter {

    /** Character set used for the {@code .data} input and the {@code .arff} output. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Zero-based index of the last physical line that
     * {@link #readVectorOfStringsFromFile(String)} looks at; later lines are ignored.
     */
    private static final int MAX_LINE_INDEX = 90;

    /**
     * Reads {@code file + ".names"} and {@code file + ".data"} and writes
     * {@code file + ".arff"}. Progress and the generated header are echoed to standard output.
     *
     * @param file common path prefix of the three files, without extension
     * @throws IOException if the header yields no lines, or if the data file cannot be read or
     *         the ARFF file cannot be written
     * @throws java.util.NoSuchElementException if a header line is not of the form
     *         {@code name: values}
     * @throws NumberFormatException if the last field of a data line is not an integer
     * @throws Exception declared for backward compatibility with existing callers
     */
    public void ConvertCommonFile2Arff(String file) throws Exception {
        String header = file + ".names";
        String data = file + ".data";
        String arff = file + ".arff";

        // Fail before touching the output file when there is nothing to convert.
        String[] lines = readArrayOfStringsFromFile(header);
        if (lines == null) {
            throw new IOException("No attribute definitions could be read from " + header);
        }

        // The reader is opened first so that a missing data file does not leave a
        // half-written .arff behind. Both resources are closed on every exit path.
        try (BufferedReader dataBr = Files.newBufferedReader(Paths.get(data), UTF8);
             BufferedWriter bw = Files.newBufferedWriter(Paths.get(arff), UTF8)) {

            String relation = "@relation '" + header + "_" + data + "'\n";
            System.out.println(relation);
            bw.write(relation + "\r\n");

            int numberOfAttributes = writeAttributes(lines, bw);

            String dataMarker = "\n\n" + "@data\n";
            System.out.println(dataMarker);
            bw.write(dataMarker + "\r\n");

            writeData(dataBr, bw, numberOfAttributes);
        }
    }

    /**
     * Writes one {@code @attribute} declaration per non-comment header line.
     *
     * @return the number of attributes declared (including the class attribute)
     */
    private static int writeAttributes(String[] lines, BufferedWriter bw) throws IOException {
        int numberOfAttributes = 0;
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i].trim().replace(" ", "");
            // skip comments and lines that are empty once spaces are removed
            if (line.startsWith("|") || line.isEmpty()) {
                continue;
            }

            StringTokenizer stringTokenizer = new StringTokenizer(line, ":");
            int tokens = stringTokenizer.countTokens();
            if (tokens != 2) {
                System.err.println("Error parsing line:\n" + line);
            }
            if (tokens < 2) {
                // Same exception type the tokenizer would raise, but with context.
                throw new java.util.NoSuchElementException(
                        "Expected 'name: values' in header line: " + line);
            }

            String attributeName = stringTokenizer.nextToken();
            String values = stringTokenizer.nextToken();

            String declaration;
            if (i == lines.length - 1) {
                // the last header line is the class attribute; its values are fixed
                declaration = toWekaFormat(attributeName, new String[]{"-1", "0", "1"});
            } else if (takeDot(values).endsWith("continuous")) {
                declaration = "@attribute " + attributeName + " numeric";
            } else {
                // nominal values
                declaration = toWekaFormat(attributeName, nominalValues(values));
            }
            System.out.println(declaration);
            bw.write(declaration + "\r\n");
            numberOfAttributes++;
        }
        return numberOfAttributes;
    }

    /**
     * Copies the data lines to the ARFF file, replacing the trailing integer label by its sign.
     * Blank lines and lines starting with {@code |} are skipped.
     */
    private static void writeData(BufferedReader dataBr, BufferedWriter bw, int numberOfAttributes)
            throws IOException {
        int counter = 0;
        String line;
        while ((line = dataBr.readLine()) != null) {
            String trimmed = line.trim();
            if (trimmed.isEmpty() || trimmed.startsWith("|")) {
                continue;
            }
            int lastComma = trimmed.lastIndexOf(',');
            // Tolerate whitespace around the label and an optional terminating period.
            int label = Integer.parseInt(takeDot(trimmed.substring(lastComma + 1)));
            // Everything up to and including the last comma (empty when the line is only a label).
            String prefix = trimmed.substring(0, lastComma + 1);
            String normalized = prefix + Integer.signum(label);

            if (counter++ % 5000 == 0) {
                System.out.println("processed:" + counter);
            }
            bw.write(formatDataLine(normalized.replace(" ", ""), numberOfAttributes) + "\r\n");
        }
    }

    /**
     * Re-joins the comma-separated fields of {@code line} with each field trimmed, and warns on
     * standard error when the field count differs from {@code nnumberOfAttributes}.
     * Empty fields are dropped because {@link StringTokenizer} collapses adjacent delimiters.
     */
    private static String formatDataLine(String line, int nnumberOfAttributes) {
        StringTokenizer stringTokenizer = new StringTokenizer(line, ",");
        int n = stringTokenizer.countTokens();
        if (n != nnumberOfAttributes) {
            System.err.println("# attributes should be " + nnumberOfAttributes +
                    " but it's " + n + " in line " + line);
        }
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                joined.append(',');
            }
            joined.append(stringTokenizer.nextToken().trim());
        }
        return joined.toString();
    }

    /**
     * Trims {@code last} and removes a single trailing period if one is present.
     * A value without a trailing period is returned intact (apart from trimming).
     */
    private static String takeDot(String last) {
        String trimmed = last.trim();
        if (trimmed.endsWith(".")) {
            return trimmed.substring(0, trimmed.length() - 1);
        }
        return trimmed;
    }

    /**
     * Builds a nominal declaration of the form {@code @attribute name {v1,v2,...}}.
     * Values are trimmed; the last value additionally loses a trailing period, if any.
     */
    private static String toWekaFormat(String attributeName, String[] nominalValues) {
        StringBuilder out = new StringBuilder("@attribute ").append(attributeName).append(" {");
        int lastIndex = nominalValues.length - 1;
        for (int i = 0; i <= lastIndex; i++) {
            if (i > 0) {
                out.append(',');
            }
            out.append(i == lastIndex ? takeDot(nominalValues[i]) : nominalValues[i].trim());
        }
        return out.append('}').toString();
    }

    /**
     * Splits a comma-separated value list into its tokens. A list with fewer than two values
     * is reported on standard error but still returned.
     */
    private static String[] nominalValues(String line) {
        StringTokenizer stringTokenizer = new StringTokenizer(line, ",");
        int n = stringTokenizer.countTokens();
        if (n < 2) {
            System.err.println("Problem parsing line:\n" + line);
        }
        String[] out = new String[n];
        for (int i = 0; i < n; i++) {
            out[i] = stringTokenizer.nextToken();
        }
        return out;
    }

    /**
     * Reads the non-blank lines of a file into an array.
     *
     * @param fileName path of the file to read
     * @return the lines, or {@code null} when no line could be read (missing file, unreadable
     *         file, {@code null} name, or a file with only blank lines)
     * @see #readVectorOfStringsFromFile(String)
     */
    public static String[] readArrayOfStringsFromFile(String fileName) {
        Vector<String> v = readVectorOfStringsFromFile(fileName);
        if (v.isEmpty()) {
            // kept as null for backward compatibility with existing callers
            return null;
        }
        return v.toArray(new String[v.size()]);
    }

    /**
     * Reads the non-blank lines of a file, looking at no more than the first
     * {@code MAX_LINE_INDEX + 1} physical lines. I/O problems are reported on standard error
     * and result in whatever was read so far (possibly nothing) being returned.
     *
     * @param filename path of the file to read; {@code null} yields an empty vector
     * @return the non-blank lines in file order, never {@code null}
     */
    public static Vector<String> readVectorOfStringsFromFile(String filename) {
        Vector<String> vectorOfStrings = new Vector<>();

        if (filename == null) {
            System.err.println("Passed a string that is null !");
            return vectorOfStrings;
        }

        // try-with-resources closes the reader even when readLine fails
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filename))) {
            String s;
            int i = 0;
            while ((s = bufferedReader.readLine()) != null) {
                if (i++ > MAX_LINE_INDEX) {
                    System.err.println("Stopped reading " + filename + " after "
                            + (MAX_LINE_INDEX + 1) + " lines; remaining lines are ignored");
                    break;
                }
                if (s.trim().equals("")) {
                    System.err.println("Skipped blank line");
                } else {
                    vectorOfStrings.addElement(s);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Problem reading file " + filename);
        }
        return vectorOfStrings;
    }
}

测试用例

package cn.ac.ict.ics.utils;

import org.junit.Test;

/**
 * Test for {@link ArffConverter} that builds a tiny data set in a temporary directory
 * instead of relying on files that exist only on one developer's machine.
 *
 * <p>Originally created by qibaoyuan on 13-8-21.
 */
public class ArffConverterTest {

    /**
     * Converts a three-attribute fixture and checks the generated ARFF header and data rows.
     * Labels 5, -3 and 0 must be written as 1, -1 and 0 respectively.
     */
    @Test
    public void testConvertCommonFile2Arff() throws Exception {
        java.nio.file.Path dir = java.nio.file.Files.createTempDirectory("arff-converter-test");
        java.nio.file.Path names = dir.resolve("toy.names");
        java.nio.file.Path data = dir.resolve("toy.data");
        java.nio.file.Path arff = dir.resolve("toy.arff");
        try {
            java.nio.file.Files.write(names,
                    "age: continuous.\ncolor: red, green, blue.\nlabel: -1, 0, 1.\n".getBytes("UTF-8"));
            java.nio.file.Files.write(data,
                    "23,red,5\n41,blue,-3\n7,green,0\n".getBytes("UTF-8"));

            ArffConverter arffConverter = new ArffConverter();
            arffConverter.ConvertCommonFile2Arff(dir.resolve("toy").toString());

            String out = new String(java.nio.file.Files.readAllBytes(arff), "UTF-8");
            expectContains(out, "@attribute age numeric");
            expectContains(out, "@attribute color {red,green,blue}");
            expectContains(out, "@attribute label {-1,0,1}");
            expectContains(out, "@data");
            expectContains(out, "23,red,1");
            expectContains(out, "41,blue,-1");
            expectContains(out, "7,green,0");
        } finally {
            // best-effort cleanup of the fixture
            java.nio.file.Files.deleteIfExists(arff);
            java.nio.file.Files.deleteIfExists(data);
            java.nio.file.Files.deleteIfExists(names);
            java.nio.file.Files.deleteIfExists(dir);
        }
    }

    /** Fails the test with a descriptive message when {@code needle} is missing. */
    private static void expectContains(String haystack, String needle) {
        if (!haystack.contains(needle)) {
            throw new AssertionError("Expected output to contain <" + needle + "> but was:\n" + haystack);
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐