将 C4.5 的 .data 和 .names 文件转换为 ARFF 格式数据
2013-08-21 16:21
148 查看
package cn.ac.ict.ics.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * Converts a C4.5-style data set (a {@code .names} header file plus a {@code .data} file)
 * into a single Weka ARFF file.
 *
 * <p>The header is expected to contain one {@code name: values.} declaration per line, with
 * {@code |} starting a comment line. The last declaration is treated as the class attribute
 * and is always emitted with the nominal values {@code {-1,0,1}}; the last column of every
 * data row is reduced to its sign accordingly.
 *
 * <p>All readers and writers are managed with try-with-resources, so the class depends on
 * the JDK only.
 *
 * Created by qibaoyuan on 13-8-21.
 */
public class ArffConverter {

    /** Encoding used for the header, the data file and the generated ARFF file. */
    private static final Charset UTF_8 = StandardCharsets.UTF_8;

    /**
     * Values of the class attribute. The trailing dot on the last entry mimics the C4.5
     * terminator that {@link #toWekaFormat(String, String[])} strips.
     */
    private static final String[] CLASS_VALUES = {"-1", "0", "1."};

    /**
     * Highest zero-based line index read by {@link #readVectorOfStringsFromFile(String)};
     * i.e. at most {@code MAX_LINE_INDEX + 1} lines of a file are consumed.
     * NOTE(review): this cap silently limited headers in the original code; confirm whether
     * it is still wanted.
     */
    private static final int MAX_LINE_INDEX = 90;

    /** A progress message is printed every this many data rows. */
    private static final int PROGRESS_INTERVAL = 5000;

    /**
     * Reads {@code file + ".names"} and {@code file + ".data"} and writes {@code file + ".arff"}.
     * The attribute declarations and progress messages are also echoed to standard output.
     *
     * @param file path prefix shared by the three files (no extension)
     * @throws IllegalArgumentException if {@code file} is null or a header line has no
     *                                  {@code name:values} structure
     * @throws IOException              if the header contains no declarations or a file
     *                                  cannot be read or written
     * @throws NumberFormatException    if the last column of a data row is not an integer
     * @throws Exception                declared for backward compatibility
     */
    public void ConvertCommonFile2Arff(String file) throws Exception {
        if (file == null) {
            throw new IllegalArgumentException("file must not be null");
        }
        String header = file + ".names";
        String data = file + ".data";
        String arff = file + ".arff";

        List<String> declarations = readDeclarations(header);
        if (declarations.isEmpty()) {
            throw new IOException("No attribute declarations found in " + header);
        }

        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(arff), UTF_8)) {
            System.out.println("@relation '" + header + "_" + data + "'\n");
            bw.write("@relation '" + header + "_" + data + "'\n" + "\r\n");

            int nnumberOfAttributes = writeAttributes(bw, declarations);

            System.out.println("\n\n" + "@data\n");
            bw.write("\n\n" + "@data\n" + "\r\n");

            writeData(bw, data, nnumberOfAttributes);
        }
    }

    /**
     * Loads the header file and keeps only real declarations: every line is trimmed, spaces
     * are removed, and comment ({@code |}) or empty lines are dropped.
     */
    private static List<String> readDeclarations(String header) {
        List<String> declarations = new ArrayList<String>();
        String[] lines = readArrayOfStringsFromFile(header);
        if (lines == null) {
            return declarations;
        }
        for (String raw : lines) {
            String line = raw.trim().replace(" ", "");
            if (line.startsWith("|") || line.isEmpty()) {
                continue;
            }
            declarations.add(line);
        }
        return declarations;
    }

    /**
     * Writes one {@code @attribute} line per declaration and returns how many were written.
     * The last declaration is the class attribute; it is found among the filtered
     * declarations so that trailing comment lines in the header do not hide it.
     */
    private static int writeAttributes(BufferedWriter bw, List<String> declarations)
            throws IOException {
        int lastIndex = declarations.size() - 1;
        int written = 0;
        for (int i = 0; i <= lastIndex; i++) {
            String line = declarations.get(i);
            StringTokenizer tokenizer = new StringTokenizer(line, ":");
            int tokenCount = tokenizer.countTokens();
            if (tokenCount != 2) {
                System.err.println("Error parsing line:\n" + line);
            }
            if (tokenCount < 2) {
                // Without both a name and a value list no attribute can be produced.
                throw new IllegalArgumentException(
                        "Header line is not of the form 'name: values': " + line);
            }
            String attributeName = tokenizer.nextToken();
            String values = tokenizer.nextToken();

            String attribute;
            if (i == lastIndex) {
                attribute = toWekaFormat(attributeName, CLASS_VALUES);
            } else if (values.endsWith("continuous.")) {
                attribute = "@attribute " + attributeName + " numeric";
            } else {
                attribute = toWekaFormat(attributeName, nominalValues(values));
            }
            System.out.println(attribute);
            bw.write(attribute + "\r\n");
            written++;
        }
        return written;
    }

    /**
     * Copies the data rows to the ARFF file, skipping comment and blank lines and reducing
     * the class label of every row to {@code -1}, {@code 0} or {@code 1}.
     */
    private static void writeData(BufferedWriter bw, String data, int nnumberOfAttributes)
            throws IOException {
        try (BufferedReader dataBr = Files.newBufferedReader(Paths.get(data), UTF_8)) {
            String line;
            int counter = 0;
            while ((line = dataBr.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.isEmpty() || trimmed.startsWith("|")) {
                    continue;
                }
                String normalized = normalizeLabel(line);
                if (counter++ % PROGRESS_INTERVAL == 0) {
                    System.out.println("processed:" + counter);
                }
                bw.write(formatDataLine(normalized.replace(" ", ""), nnumberOfAttributes)
                        + "\r\n");
            }
        }
    }

    /**
     * Replaces the last comma-separated field of a data row by the sign of its integer
     * value. Surrounding whitespace and a single trailing period on the label (as found in
     * some C4.5 test files) are tolerated.
     *
     * @throws NumberFormatException if the label is not an integer
     */
    private static String normalizeLabel(String line) {
        int comma = line.lastIndexOf(',');
        String rawLabel = takeDot(line.substring(comma + 1));
        int sign;
        try {
            sign = Integer.signum(Integer.parseInt(rawLabel));
        } catch (NumberFormatException e) {
            NumberFormatException withContext = new NumberFormatException(
                    "Class label is not an integer in data line: " + line);
            withContext.initCause(e);
            throw withContext;
        }
        // A row without any comma consists of the label only.
        return comma < 0 ? String.valueOf(sign) : line.substring(0, comma) + "," + sign;
    }

    /**
     * Re-joins the comma-separated fields of a data row with every field trimmed, warning
     * on standard error when the field count differs from the attribute count.
     */
    private static String formatDataLine(String line, int nnumberOfAttributes) {
        StringTokenizer tokenizer = new StringTokenizer(line, ",");
        int n = tokenizer.countTokens();
        if (n != nnumberOfAttributes) {
            System.err.println("# attributes should be " + nnumberOfAttributes
                    + " but it's " + n + " in line " + line);
        }
        StringBuilder joined = new StringBuilder(line.length());
        boolean first = true;
        while (tokenizer.hasMoreTokens()) {
            if (!first) {
                joined.append(',');
            }
            joined.append(tokenizer.nextToken().trim());
            first = false;
        }
        return joined.toString();
    }

    /** Trims the value and removes one trailing period, if there is one. */
    private static String takeDot(String last) {
        String trimmed = last.trim();
        return trimmed.endsWith(".") ? trimmed.substring(0, trimmed.length() - 1) : trimmed;
    }

    /**
     * Builds a nominal {@code @attribute name {v1,v2,...}} declaration. Values are trimmed
     * and the C4.5 terminating period of the last value is removed.
     */
    private static String toWekaFormat(String attributeName, String[] nominalValues) {
        StringBuilder out = new StringBuilder("@attribute ").append(attributeName).append(" {");
        int lastIndex = nominalValues.length - 1;
        for (int i = 0; i < lastIndex; i++) {
            out.append(nominalValues[i].trim()).append(',');
        }
        if (lastIndex >= 0) {
            out.append(takeDot(nominalValues[lastIndex]));
        }
        return out.append('}').toString();
    }

    /** Splits a comma-separated value list; warns when fewer than two values are present. */
    private static String[] nominalValues(String line) {
        StringTokenizer tokenizer = new StringTokenizer(line, ",");
        int n = tokenizer.countTokens();
        if (n < 2) {
            System.err.println("Problem parsing line:\n" + line);
        }
        String[] out = new String[n];
        for (int i = 0; i < n; i++) {
            out[i] = tokenizer.nextToken();
        }
        return out;
    }

    /**
     * Reads the non-blank lines of a file into an array.
     *
     * @param fileName file to read
     * @return the lines, or {@code null} when no line could be read (kept for backward
     *         compatibility with existing callers)
     * @see #readVectorOfStringsFromFile(String)
     */
    public static String[] readArrayOfStringsFromFile(String fileName) {
        Vector<String> lines = readVectorOfStringsFromFile(fileName);
        if (lines.isEmpty()) {
            return null;
        }
        return lines.toArray(new String[lines.size()]);
    }

    /**
     * Reads the non-blank lines of a UTF-8 text file. At most the first
     * {@code MAX_LINE_INDEX + 1} lines of the file (blank ones included) are looked at;
     * a warning is printed when the rest is ignored. I/O problems are reported on standard
     * error and whatever was read so far is returned.
     *
     * @param filename file to read; {@code null} yields an empty result
     * @return the lines read, never {@code null}
     */
    public static Vector<String> readVectorOfStringsFromFile(String filename) {
        Vector<String> vectorOfStrings = new Vector<String>();
        if (filename == null) {
            System.err.println("Passed a string that is null !");
            return vectorOfStrings;
        }
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), UTF_8))) {
            String s;
            int lineIndex = 0;
            while ((s = bufferedReader.readLine()) != null) {
                if (lineIndex++ > MAX_LINE_INDEX) {
                    System.err.println("Stopped reading " + filename + " after "
                            + (MAX_LINE_INDEX + 1) + " lines; remaining lines are ignored");
                    break;
                }
                if (s.trim().isEmpty()) {
                    System.err.println("Skipped blank line");
                } else {
                    vectorOfStrings.addElement(s);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Problem reading file " + filename);
        }
        return vectorOfStrings;
    }
}
测试用例
package cn.ac.ict.ics.utils;

import org.junit.Test;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.assertTrue;

/**
 * Created by qibaoyuan on 13-8-21.
 */
public class ArffConverterTest {

    /**
     * Converts a tiny data set created in a temporary directory, so the test does not
     * depend on files that exist on one developer machine only, and checks the attribute
     * declarations and data rows of the generated ARFF file.
     */
    @Test
    public void testConvertCommonFile2Arff() throws Exception {
        Path dir = Files.createTempDirectory("arff-converter-test");
        Path names = dir.resolve("sample.names");
        Path data = dir.resolve("sample.data");
        Path arff = dir.resolve("sample.arff");
        try {
            Files.write(names, Arrays.asList(
                    "| sample header",
                    "age: continuous.",
                    "color: red, green, blue.",
                    "class: -1, 0, 1."), StandardCharsets.UTF_8);
            Files.write(data, Arrays.asList(
                    "25,red,1",
                    "30, green,-3",
                    "41,blue,0"), StandardCharsets.UTF_8);

            ArffConverter arffConverter = new ArffConverter();
            arffConverter.ConvertCommonFile2Arff(dir.resolve("sample").toString());

            List<String> lines = Files.readAllLines(arff, StandardCharsets.UTF_8);
            assertTrue(lines.get(0).startsWith("@relation "));
            assertTrue(lines.contains("@attribute age numeric"));
            assertTrue(lines.contains("@attribute color {red,green,blue}"));
            assertTrue(lines.contains("@attribute class {-1,0,1}"));
            assertTrue(lines.contains("@data"));
            assertTrue(lines.contains("25,red,1"));
            // Class labels are reduced to their sign: -3 becomes -1.
            assertTrue(lines.contains("30,green,-1"));
            assertTrue(lines.contains("41,blue,0"));
            assertTrue(lines.indexOf("@attribute class {-1,0,1}") < lines.indexOf("@data"));
            assertTrue(lines.indexOf("@data") < lines.indexOf("25,red,1"));
        } finally {
            Files.deleteIfExists(arff);
            Files.deleteIfExists(data);
            Files.deleteIfExists(names);
            Files.deleteIfExists(dir);
        }
    }
}
相关文章推荐
- 微信小程序wx.uploadFile(上传文件)PHP服务器获取formData的数据
- caffe中将jpg数据转化为lmdb格式的文件
- C# DataGridView手动添加数据,导出txt文件并自动对齐
- 解决办法:异地冷恢复时 如果发现v$datafile里的有些用户用的数据文件没有备份
- 将DataGridView数据,导出EXCEL文件
- 在Java中使用weka:将实例转化为ARFF文件
- 通过试验探索Access 2000/XP 数据库的最佳 NTFS 权限设置Microsoft Jet 数据库引擎打不开文件'D:\wwwroot\test\data\'。 它已经被别的用户以独占方式打开,或没有查看数据的权限。
- 根据文件data.txt和模板文件template.tmpl,实现将数据文件和模板文件的合并,并保存在输出文件
- 如何rename datafile name中存在乱码的数据文件
- 利用随机函数产生100个三位整数,将这些整数写入到数据文件data1.dat中
- Android 读取清单文件<meta-data>元素的数据
- 处理本地JSON文件,将data数据转换成NSDictionary
- mysql 通过data文件下来恢复数据
- 获取真机data/data/目录下应用数据文件
- Oracle使用dba_data_files查看表空间大小及数据文件位置
- Outlook 2007: Accounts and data files 帐户和数据文件 - RSS Feeds
- 用WebClient.UploadData方法上载文件数据的方法
- C#把DataSet内数据转化为Excel和Word文件的通用类
- 将数据控件(如GridView)的内容转化成Excel格式文件
- 如何缩小或者扩大数据文件 How to Resize a Datafile (Doc ID 1029252.6)