您的位置：首页 > 其它

mahout将数据转化成序列化文件、稀疏向量

2014-05-05 14:21 267 查看

对于文本信息的向量化，Mahout 已经提供了工具类，它基于 Lucene 对文本信息进行分析，然后创建文本向量。Mahout 提供下面两个命令来将文本转成向量形式（转化成向量后即可用于聚类）：

1.mahout seqdirectory：将文本文件转成SequenceFile文件，SequenceFile文件是一种以二进制存储的key-value键值对文件，对应的源文件是org.apache.mahout.text.SequenceFilesFromDirectory.java

2.mahout seq2sparse：将SequenceFile转成向量文件，对应的源文件是org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles.java

我是将mahout源码导入到eclipse中，对以上的两个源文件分别进行运行（运行时必须配置参数，有输入、输出、字符编码）转化的，生成的向量文件目录结构是：

df-count 目录：保存着文本的频率信息

tf-vectors 目录：保存着以 TF 作为权值的文本向量

tfidf-vectors 目录：保存着以 TFIDF 作为权值的文本向量

tokenized-documents 目录：保存着分词过后的文本信息

wordcount 目录：保存着全局的词汇出现的次数

dictionary.file-0 文件：保存着这些文本的词汇表

frequency.file-0 文件：保存着词汇表对应的频率信息。

查看转化结果：

mahout seqdumper：将SequenceFile文件转成文本形式，对应的源文件是org.apache.mahout.utils.SequenceFileDumper.java

mahout vectordump：将向量文件转成可读的文本形式，对应的源文件是org.apache.mahout.utils.vectors.VectorDumper.java

mahout clusterdump：分析最后聚类的输出结果，对应的源文件是org.apache.mahout.utils.clustering.ClusterDumper.java。具体每种命令如何使用及参数如何选择，在命令后面加 -h 或 --help 可以查看。

下面是我在项目中用到的一些源码

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.utils.io.ChunkedWriter;

import com.google.common.io.Closeables;

public class WriteToSequenceFileForBayesian  extends AbstractJob{//使用聚类的文件，所以要传入一个num，比如2000，表示以2000为单位，训练集的划分规范。
public static void main(String args[]) throws Exception{
ToolRunner.run(new WriteToSequenceFileForBayesian(), args);
}
@Override
public int run(String[] arg0) throws Exception {
String inputPath=arg0[0];//
String outputpoints=arg0[1];//
int k = Integer.parseInt(arg0[2]);//
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path inPath = new Path(inputPath );
FSDataInputStream dis = fs.open(inPath);
LineReader in = new LineReader(dis,conf);
ChunkedWriter writer = new ChunkedWriter(conf, 64, new Path(outputpoints));
Text line = new Text();
//按行读取
long recNum = 0;
StringBuilder ss=new StringBuilder();
while(in.readLine(line) > 0){
String aline=line.toString();
String[] strs=aline.split(" ");
if (recNum-2&&Double.parseDouble(strs[0])<4) {
ss.append("one_first");
}else if (Double.parseDouble(strs[0])<-2) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>4) {
ss.append("high_first");
}
ss.append(",");
//处理第2个数
if (Double.parseDouble(strs[1])>-3&&Double.parseDouble(strs[1])<3) {
ss.append("zero_second");
}else if (Double.parseDouble(strs[1])<-3) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>3) {
ss.append("high_second");
}
ss.append(",");
//处理第3个数
if (Double.parseDouble(strs[2])>-2&&Double.parseDouble(strs[2])<4) {
ss.append("one_third");
}else if (Double.parseDouble(strs[2])<-2) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>4) {
ss.append("high_third");
}
ss.append(",");
//处理第4个数
if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<5) {
ss.append("two_fourth");
}else if (Double.parseDouble(strs[3])<-1) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>5) {
ss.append("high_fourth");
}
ss.append(",");
//处理第5个数
if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<4) {
ss.append("one_fifth");
}else if (Double.parseDouble(strs[4])<-2) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>4) {
ss.append("high_fifth");
}
writer.write("first", ss.toString());
}else if (recNum1.5&&Double.parseDouble(strs[0])<2.5) {
ss.append("two_first");
}else if (Double.parseDouble(strs[0])<1.5) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>2.5) {
ss.append("high_first");
}
ss.append(",");
//处理第2个数
if (Double.parseDouble(strs[1])>0.5&&Double.parseDouble(strs[1])<1.5) {
ss.append("one_second");
}else if (Double.parseDouble(strs[1])<0.5) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>1.5) {
ss.append("high_second");
}
ss.append(",");
//处理第3个数
if (Double.parseDouble(strs[2])>-0.5&&Double.parseDouble(strs[2])<0.5) {
ss.append("zero_third");
}else if (Double.parseDouble(strs[2])<-0.5) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>0.5) {
ss.append("high_third");
}
ss.append(",");
//处理第4个数
if (Double.parseDouble(strs[3])>0.5&&Double.parseDouble(strs[3])<1.5) {
ss.append("one_fourth");
}else if (Double.parseDouble(strs[3])<0.5) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>1.5) {
ss.append("high_fourth");
}
ss.append(",");
//处理第5个数
if (Double.parseDouble(strs[4])>0.5&&Double.parseDouble(strs[4])<1.5) {
ss.append("one_fifth");
}else if (Double.parseDouble(strs[4])<0.5) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>1.5) {
ss.append("high_fifth");
}
writer.write("second", ss.toString());
}else if (recNum0.9&&Double.parseDouble(strs[0])<1.1) {
ss.append("one_first");
}else if (Double.parseDouble(strs[0])<0.9) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>1.1) {
ss.append("high_first");
}
ss.append(",");
//处理第2个数
if (Double.parseDouble(strs[1])>0.9&&Double.parseDouble(strs[1])<1.1) {
ss.append("one_second");
}else if (Double.parseDouble(strs[1])<0.9) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>1.1) {
ss.append("high_second");
}
ss.append(",");
//处理第3个数
if (Double.parseDouble(strs[2])>1.9&&Double.parseDouble(strs[2])<2.1) {
ss.append("two_third");
}else if (Double.parseDouble(strs[2])<1.9) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>2.1) {
ss.append("high_third");
}
ss.append(",");
//处理第4个数
if (Double.parseDouble(strs[3])>-0.1&&Double.parseDouble(strs[3])<0.1) {
ss.append("zero_fourth");
}else if (Double.parseDouble(strs[3])<-0.1) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>0.1) {
ss.append("high_fourth");
}
ss.append(",");
//处理第5个数
if (Double.parseDouble(strs[4])>0.9&&Double.parseDouble(strs[4])<1.1) {
ss.append("one_fifth");
}else if (Double.parseDouble(strs[4])<0.9) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>1.1) {
ss.append("high_fifth");
}
writer.write("third", ss.toString());
}else if (recNum-1&&Double.parseDouble(strs[0])<3) {
ss.append("one_first");
}else if (Double.parseDouble(strs[0])<-1) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>3) {
ss.append("high_first");
}
ss.append(",");
//处理第2个数
if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<4) {
ss.append("two_second");
}else if (Double.parseDouble(strs[1])<0) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>4) {
ss.append("high_second");
}
ss.append(",");
//处理第3个数
if (Double.parseDouble(strs[2])>-1&&Double.parseDouble(strs[2])<3) {
ss.append("one_third");
}else if (Double.parseDouble(strs[2])<-1) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>3) {
ss.append("high_third");
}
ss.append(",");
//处理第4个数
if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<3) {
ss.append("one_fourth");
}else if (Double.parseDouble(strs[3])<-1) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>3) {
ss.append("high_fourth");
}
ss.append(",");
//处理第5个数
if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<2) {
ss.append("zero_fifth");
}else if (Double.parseDouble(strs[4])<-2) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>2) {
ss.append("high_fifth");
}
writer.write("fourth", ss.toString());
}else if (recNum-1&&Double.parseDouble(strs[0])<1) {
ss.append("zero_first");
}else if (Double.parseDouble(strs[0])<-1) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>1) {
ss.append("high_first");
}
ss.append(",");
//处理第2个数
if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<2) {
ss.append("one_second");
}else if (Double.parseDouble(strs[1])<0) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>2) {
ss.append("high_second");
}
ss.append(",");
//处理第3个数
if (Double.parseDouble(strs[2])>0&&Double.parseDouble(strs[2])<2) {
ss.append("one_third");
}else if (Double.parseDouble(strs[2])<0) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>2) {
ss.append("high_third");
}
ss.append(",");
//处理第4个数
if (Double.parseDouble(strs[3])>0&&Double.parseDouble(strs[3])<2) {
ss.append("one_fourth");
}else if (Double.parseDouble(strs[3])<0) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>2) {
ss.append("high_fourth");
}
ss.append(",");
//处理第5个数
if (Double.parseDouble(strs[4])>1&&Double.parseDouble(strs[4])<3) {
ss.append("two_fifth");
}else if (Double.parseDouble(strs[4])<1) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>3) {
ss.append("high_fifth");
}
writer.write("fifth", ss.toString());
}
}
Closeables.close(writer, false);
dis.close();
in.close();
return 0;
}
}

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.mahout.clustering.kmeans.Kluster;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.text.SequenceFilesFromDirectory;

public class WriteToSequenceFile {
public static void main(String args[]) throws Exception {
String inputPath=args[0];//文本数据文件输入目录
String outputpoints=args[1];//sequenceFile中的point数据输出目录
String outputclusters=args[2];//sequenceFile中的cluster数据输出目录
int k = Integer.parseInt(args[3]);//k个中心
List vectors = new ArrayList();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path inPath = new Path(inputPath );
FSDataInputStream dis = fs.open(inPath);
LineReader in = new LineReader(dis,conf);
Text line = new Text();
SequenceFile.Writer pointwriter = new SequenceFile.Writer(fs, conf, new Path(outputpoints), LongWritable.class, VectorWritable.class);
//按行读取
long recNum = 0;
VectorWritable vecWrite = new VectorWritable();
while(in.readLine(line) > 0){
String aline=line.toString();
String[] strs=aline.split(" ");
double[] fr = new double[5];
for (int i = 0; i < strs.length; i++) {
fr[i]=Double.parseDouble(strs[i]);
}
Vector vec = new RandomAccessSparseVector(fr.length);
vec.assign(fr);
vecWrite.set(vec);
pointwriter.append(new LongWritable(recNum++), vecWrite);
if (vectors.size()

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： mahout seqdirectory seq2sparse

相关文章推荐

新的分享

章节导航