您的位置:首页 > 编程语言

hadoop之求和和自定义排序编程

2017-03-26 21:05 309 查看
排序:MR默认是按key2(即map输出的key)进行排序的。如果想自定义排序规则,被排序的对象要实现WritableComparable接口,在compareTo方法中实现排序规则,然后将这个对象当做k2,即可完成排序。

注:

1.key1 value1 是map的输入;key2 value2是reduce的输入。

需求分析:

数据:

zhangsan@163.com	6000	0	2014-02-20
lisi@163.com	2000	0	2014-02-20
lisi@163.com	0	100	2014-02-20
zhangsan@163.com	3000	0	2014-02-20
wangwu@126.com	9000	0	2014-02-20
wangwu@126.com	0	200	2014-02-20

需求:统计每个账号的总收入和总支出,然后按总收入排序,收入越多排在前面;如果收入相等,就按支出排序,支出越小排在前面。

代码示例:

infoBean实体类:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Income/expense summary for one account.
 *
 * <p>The bean is used as a MapReduce value in the summing step and as a MapReduce key
 * in the sorting step, so it implements {@link WritableComparable}: {@link #write} and
 * {@link #readFields} define the wire format, and {@link #compareTo} defines the sort
 * order of the keys.
 *
 * <p>Ordering: higher income first; for equal income, lower expenses first; remaining
 * ties are broken by account so that {@code compareTo} returns 0 only for beans that
 * agree on account, income and expenses. {@link #equals} and {@link #hashCode} are
 * consistent with that ordering.
 */
public class InfoBean implements WritableComparable<InfoBean> {

    private String account;  // account identifier
    private double income;   // income
    private double expenses; // expenses
    private double surplus;  // balance: income - expenses when populated through set(...)

    /**
     * Populates the bean in one call and derives the surplus.
     *
     * @param account  account identifier
     * @param income   income amount
     * @param expenses expenses amount
     */
    public void set(String account, double income, double expenses) {
        this.account = account;
        this.income = income;
        this.expenses = expenses;
        this.surplus = income - expenses;
    }

    /**
     * Serializes the fields in the order account, income, expenses, surplus.
     * The account must be non-null, because {@code writeUTF} rejects null.
     *
     * @param out sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(account);
        out.writeDouble(income);
        out.writeDouble(expenses);
        out.writeDouble(surplus);
    }

    /**
     * Restores the fields in exactly the order used by {@link #write}.
     *
     * @param in source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.account = in.readUTF();
        this.income = in.readDouble();
        this.expenses = in.readDouble();
        this.surplus = in.readDouble();
    }

    /**
     * Sort rule: income descending, then expenses ascending, then account ascending.
     *
     * <p>{@code Double.compare} is used instead of {@code ==} / {@code >} so that the
     * ordering is total (NaN and signed zeros included) and the method can return 0
     * for equal beans, as the {@code Comparable} contract requires.
     *
     * @param o the bean to compare with
     * @return negative, zero or positive when this bean sorts before, with or after {@code o}
     */
    @Override
    public int compareTo(InfoBean o) {
        // Arguments are swapped on purpose: larger income must sort first.
        int byIncome = Double.compare(o.income, this.income);
        if (byIncome != 0) {
            return byIncome;
        }
        int byExpenses = Double.compare(this.expenses, o.expenses);
        if (byExpenses != 0) {
            return byExpenses;
        }
        // Tie-break on account so different accounts with identical amounts
        // never compare as equal.
        return compareAccounts(this.account, o.account);
    }

    /** Null-safe string comparison; a null account sorts before any non-null one. */
    private static int compareAccounts(String left, String right) {
        if (left == null) {
            return right == null ? 0 : -1;
        }
        if (right == null) {
            return 1;
        }
        return left.compareTo(right);
    }

    /** Two beans are equal when {@link #compareTo} considers them equal. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof InfoBean)) {
            return false;
        }
        return compareTo((InfoBean) obj) == 0;
    }

    /** Hash over account, income and expenses: the same fields {@link #equals} looks at. */
    @Override
    public int hashCode() {
        int result = account == null ? 0 : account.hashCode();
        result = 31 * result + Double.hashCode(income);
        result = 31 * result + Double.hashCode(expenses);
        return result;
    }

    /** Text form written to job output: income, expenses and surplus separated by tabs. */
    @Override
    public String toString() {
        return income + "\t" + expenses + "\t" + surplus;
    }

    public String getAccount() {
        return account;
    }

    public void setAccount(String account) {
        this.account = account;
    }

    public double getIncome() {
        return income;
    }

    public void setIncome(double income) {
        this.income = income;
    }

    public double getExpenses() {
        return expenses;
    }

    public void setExpenses(double expenses) {
        this.expenses = expenses;
    }

    public double getSurplus() {
        return surplus;
    }

    public void setSurplus(double surplus) {
        this.surplus = surplus;
    }

}

求和的核心类:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Summing job: reads tab-separated lines of the form
 * {@code account <TAB> income <TAB> expenses [<TAB> ...]} and writes one line per
 * account holding the summed income, summed expenses and resulting surplus.
 *
 * <p>Usage: {@code SumStep <input path> <output path>}.
 */
public class SumStep {

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure so
     * that calling scripts can detect a failed job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be set up or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: SumStep <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(SumStep.class);

        job.setMapperClass(SumMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(InfoBean.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result instead of always exiting with status 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Parses one input line and emits (account, bean with that line's income/expenses). */
    public static class SumMapper extends Mapper<LongWritable, Text, Text, InfoBean> {

        // Reused across map() calls to avoid allocating per record.
        private InfoBean bean = new InfoBean();
        private Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 3) {
                // Blank or truncated line: count it and move on rather than failing
                // the task with an ArrayIndexOutOfBoundsException.
                context.getCounter("SumStep", "MALFORMED_LINES").increment(1);
                return;
            }

            String account = fields[0];
            double income = Double.parseDouble(fields[1]);
            double expenses = Double.parseDouble(fields[2]);

            k.set(account);
            bean.set(account, income, expenses);
            context.write(k, bean);
        }
    }

    /** Adds up income and expenses of all beans for one account and emits the totals. */
    public static class SumReducer extends Reducer<Text, InfoBean, Text, InfoBean> {

        // Reused output value; the account is carried by the output key instead.
        private InfoBean total = new InfoBean();

        @Override
        protected void reduce(Text key, Iterable<InfoBean> v2s, Context context)
                throws IOException, InterruptedException {
            double incomeSum = 0;
            double expensesSum = 0;
            for (InfoBean item : v2s) {
                incomeSum += item.getIncome();
                expensesSum += item.getExpenses();
            }
            total.set("", incomeSum, expensesSum);
            context.write(key, total);
        }

    }
}

排序的核心类(将求和的mapreduce的输出结果文件作为排序的mapreduce的输入文件):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Sorting job: reads the output of the summing job (tab-separated
 * {@code account <TAB> income <TAB> expenses [<TAB> ...]}), uses {@link InfoBean} as the
 * map output key so that the shuffle orders records by {@code InfoBean.compareTo},
 * and writes {@code account <TAB> bean} in that order.
 *
 * <p>Usage: {@code SortStep <input path> <output path>}.
 */
public class SortStep {

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure so
     * that calling scripts can detect a failed job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be set up or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: SortStep <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(SortStep.class);

        job.setMapperClass(SortMapper.class);
        job.setMapOutputKeyClass(InfoBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(SortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(InfoBean.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result instead of always exiting with status 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Turns each input line into an InfoBean key; the value carries no information.
     * MapReduce sorts by the map output key (key2), which is what performs the sort.
     */
    public static class SortMapper extends Mapper<LongWritable, Text, InfoBean, NullWritable> {

        // Reused across map() calls to avoid allocating per record.
        private InfoBean k = new InfoBean();

        @Override
        protected void map(
                LongWritable key,
                Text value,
                Mapper<LongWritable, Text, InfoBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 3) {
                // Blank or truncated line: count it and move on rather than failing
                // the task with an ArrayIndexOutOfBoundsException.
                context.getCounter("SortStep", "MALFORMED_LINES").increment(1);
                return;
            }
            k.set(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]));

            context.write(k, NullWritable.get());
        }

    }

    /** Receives the beans already sorted by key and writes (account, bean) for each. */
    public static class SortReducer extends Reducer<InfoBean, NullWritable, Text, InfoBean> {

        private Text k = new Text();

        @Override
        protected void reduce(InfoBean key, Iterable<NullWritable> values,
                Reducer<InfoBean, NullWritable, Text, InfoBean>.Context context)
                throws IOException, InterruptedException {
            // Emit once per value, not once per group, so records whose keys compare
            // equal are not silently collapsed into a single output line.
            // NOTE(review): relies on Hadoop updating the key object as the values
            // are iterated; confirm against the Reducer documentation.
            for (NullWritable ignored : values) {
                k.set(key.getAccount());
                context.write(k, key);
            }
        }

    }

}


小结:

1.排序是在map执行之后,reduce执行之前。

2.map在读取文件时,会忽略以下划线开头的文件和文件夹;如果给map设置的输入路径是文件夹,那么它会读取此文件夹下除了以下划线开头的之外的所有文件和文件夹。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: