
Exporting MSSQL data to HDFS with MapReduce

2015-12-05 23:46
Today I wanted to use some real data to test the correctness of my article《基于信息熵的无字典分词算法》(a dictionary-free word-segmentation algorithm based on information entropy), so I wrote a MapReduce program that reads data out of a MSSQL Server 2008 database for analysis. When the program was deployed to the Hadoop machine, it failed with a SQLException complaining about LIMIT syntax.



  Strange: my SQL statement contains no LIMIT, so where did this LIMIT come from? I went through the source of DBInputFormat:





protected RecordReader<LongWritable, T> createDBRecordReader(DBInputSplit split,
    Configuration conf) throws IOException {

  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // use database product name to determine appropriate record reader.
    if (dbProductName.startsWith("ORACLE")) {
      // use Oracle-specific db reader.
      return new OracleDBRecordReader<T>(split, inputClass,
          conf, createConnection(), getDBConf(), conditions, fieldNames,
          tableName);
    } else if (dbProductName.startsWith("MYSQL")) {
      // use MySQL-specific db reader.
      return new MySQLDBRecordReader<T>(split, inputClass,
          conf, createConnection(), getDBConf(), conditions, fieldNames,
          tableName);
    } else {
      // Generic reader.
      return new DBRecordReader<T>(split, inputClass,
          conf, createConnection(), getDBConf(), conditions, fieldNames,
          tableName);
    }
  } catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}


And the getSelectQuery() method in the DBRecordReader source:





protected String getSelectQuery() {
  StringBuilder query = new StringBuilder();

  // Default codepath for MySQL, HSQLDB, etc. Relies on LIMIT/OFFSET for splits.
  if (dbConf.getInputQuery() == null) {
    query.append("SELECT ");

    for (int i = 0; i < fieldNames.length; i++) {
      query.append(fieldNames[i]);
      if (i != fieldNames.length - 1) {
        query.append(", ");
      }
    }

    query.append(" FROM ").append(tableName);
    query.append(" AS ").append(tableName); // in hsqldb this is necessary
    if (conditions != null && conditions.length() > 0) {
      query.append(" WHERE (").append(conditions).append(")");
    }

    String orderBy = dbConf.getInputOrderBy();
    if (orderBy != null && orderBy.length() > 0) {
      query.append(" ORDER BY ").append(orderBy);
    }
  } else {
    // PREBUILT QUERY
    query.append(dbConf.getInputQuery());
  }

  try {
    query.append(" LIMIT ").append(split.getLength());   // <-- the problem: MySQL-style paging, appended unconditionally
    query.append(" OFFSET ").append(split.getStart());
  } catch (IOException ex) {
    // Ignore, will not throw.
  }

  return query.toString();
}


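So with the generic reader, the statement actually sent to SQL Server is just the input query with a MySQL-style suffix pasted onto it. For a hypothetical split of 100 rows starting at row 0, the query from my job would come out as:

select id,source from tablename where id<1000 LIMIT 100 OFFSET 0

T-SQL, certainly on SQL Server 2008, has no LIMIT/OFFSET clause of this form, hence the SQLException.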
Finally, the cause.

  It turns out Hadoop only ships database-specific record readers for MySQL (MySQLDBRecordReader) and Oracle (OracleDBRecordReader); every other database falls through to the generic DBRecordReader and its LIMIT/OFFSET paging.

With the cause found, I implemented a DBRecordReader for MSSQL Server, modeled on OracleDBRecordReader. The code follows.

  The MSSQLDBInputFormat code:





package org.apache.hadoop.mapreduce.lib.db;

import java.io.IOException;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;

/**
 * @author summer
 * MICROSOFT SQL SERVER
 */
public class MSSQLDBInputFormat<T extends DBWritable> extends DBInputFormat<T> {

    public static void setInput(Job job,
            Class<? extends DBWritable> inputClass,
            String inputQuery, String inputCountQuery, String rowId) {
        job.setInputFormatClass(MSSQLDBInputFormat.class);
        DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
        dbConf.setInputClass(inputClass);
        dbConf.setInputQuery(inputQuery);
        dbConf.setInputCountQuery(inputCountQuery);
        dbConf.setInputFieldNames(new String[]{rowId});
    }

    @Override
    protected RecordReader<LongWritable, T> createDBRecordReader(
            org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split,
            Configuration conf) throws IOException {

        @SuppressWarnings("unchecked")
        Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
        try {
            return new MSSQLDBRecordReader<T>(split, inputClass,
                    conf, createConnection(), getDBConf(), conditions, fieldNames,
                    tableName);
        } catch (SQLException ex) {
            throw new IOException(ex.getMessage());
        }
    }
}


  The MSSQLDBRecordReader code:





package org.apache.hadoop.mapreduce.lib.db;

import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;

/**
 * @author summer
 */
public class MSSQLDBRecordReader<T extends DBWritable> extends DBRecordReader<T> {

    public MSSQLDBRecordReader(DBInputFormat.DBInputSplit split,
            Class<T> inputClass, Configuration conf, Connection conn, DBConfiguration dbConfig,
            String cond, String[] fields, String table) throws SQLException {
        super(split, inputClass, conf, conn, dbConfig, cond, fields, table);
    }

    @Override
    protected String getSelectQuery() {
        StringBuilder query = new StringBuilder();
        DBConfiguration dbConf = getDBConf();
        String conditions = getConditions();
        String tableName = getTableName();
        String[] fieldNames = getFieldNames();

        // SQL Server-specific codepath: emulate LIMIT/OFFSET with TOP ... NOT IN (TOP ...).
        if (dbConf.getInputQuery() == null) {
            query.append("SELECT ");

            for (int i = 0; i < fieldNames.length; i++) {
                query.append(fieldNames[i]);
                if (i != fieldNames.length - 1) {
                    query.append(", ");
                }
            }

            query.append(" FROM ").append(tableName);
            if (conditions != null && conditions.length() > 0)
                query.append(" WHERE ").append(conditions);
            String orderBy = dbConf.getInputOrderBy();
            if (orderBy != null && orderBy.length() > 0) {
                query.append(" ORDER BY ").append(orderBy);
            }
        } else {
            // PREBUILT QUERY
            query.append(dbConf.getInputQuery());
        }

        try {
            DBInputFormat.DBInputSplit split = getSplit();
            if (split.getLength() > 0) {
                String querystring = query.toString();
                String id = fieldNames[0];
                query = new StringBuilder();
                // take getLength() rows that are not among the first getStart() rows,
                // i.e. the rows already covered by earlier splits; this matches the
                // generic reader's LIMIT getLength() OFFSET getStart() semantics.
                query.append("SELECT TOP ").append(split.getLength()).append(" * FROM ( ");
                query.append(querystring);
                query.append(" ) a WHERE ").append(id).append(" NOT IN (SELECT TOP ").append(split.getStart());
                query.append(" ").append(id).append(" FROM (");
                query.append(querystring);
                query.append(" ) b");
                query.append(" )");
                System.out.println("----------------------MICROSOFT SQL SERVER QUERY STRING---------------------------");
                System.out.println(query.toString());
                System.out.println("----------------------MICROSOFT SQL SERVER QUERY STRING---------------------------");
            }
        } catch (IOException ex) {
            // ignore, will not throw.
        }

        return query.toString();
    }
}


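To make the rewrite concrete: with the input query from the job below (select id,source from tablename where id<1000, row id column id) and a hypothetical split with start=100 and length=100, the reader prints a paging statement shaped like this (line breaks added for readability):

SELECT TOP 100 * FROM (
  select id,source from tablename where id<1000
) a WHERE id NOT IN (
  SELECT TOP 100 id FROM (
    select id,source from tablename where id<1000
  ) b
)

That is: take the first 100 rows of the inner query that are not among the 100 rows already covered by earlier splits. Note that without an ORDER BY on the row id inside both derived tables, SQL Server is free to return rows in any order, so the splits are only guaranteed to be disjoint when the underlying query has a stable ordering.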
The MapReduce job code:





package com.nltk.sns.mapreduce;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.MSSQLDBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.nltk.utils.ETLUtils;

/**
 * @author summer
 */
public class LawDataEtl {

    public static class CaseETLMapper extends
            Mapper<LongWritable, LawCaseRecord, LongWritable, Text> {

        static final int step = 6;

        LongWritable key = new LongWritable(1);
        Text value = new Text();

        @Override
        protected void map(
                LongWritable key,
                LawCaseRecord lawCaseRecord,
                Mapper<LongWritable, LawCaseRecord, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {

            System.out.println("-----------------------------" + lawCaseRecord + "------------------------------");

            // note: the method parameter shadows the LongWritable field above,
            // so key.set(...) mutates the input key object, which is then emitted.
            key.set(lawCaseRecord.id);
            String source = ETLUtils.format(lawCaseRecord.source);
            List<LawCaseWord> words = ETLUtils.split(lawCaseRecord.id, source, step);
            for (LawCaseWord w : words) {
                value.set(w.toString());
                context.write(key, value);
            }
        }
    }

    static final String driverClass = "com.microsoft.sqlserver.jdbc.SQLServerDriver";
    static final String dbUrl = "jdbc:sqlserver://192.168.0.1:1433;DatabaseName=XXX";
    static final String uid = "XXX";
    static final String pwd = "XXX";
    static final String inputQuery = "select id,source from tablename where id<1000";
    static final String inputCountQuery = "select count(1) from tablename where id<1000";
    static final String jarClassPath = "/user/lib/sqljdbc4.jar";
    static final String outputPath = "hdfs://ubuntu:9000/user/test";
    static final String rowId = "id";

    public static Job configureJob(Configuration conf) throws Exception {

        String jobName = "etlcase";
        Job job = Job.getInstance(conf, jobName);

        job.addFileToClassPath(new Path(jarClassPath));
        MSSQLDBInputFormat.setInput(job, LawCaseRecord.class, inputQuery, inputCountQuery, rowId);
        job.setJarByClass(LawDataEtl.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(CaseETLMapper.class);

        return job;
    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(outputPath), true);

        DBConfiguration.configureDB(conf, driverClass, dbUrl, uid, pwd);
        conf.set(MRJobConfig.NUM_MAPS, String.valueOf(10));
        Job job = configureJob(conf);
        System.out.println("------------------------------------------------");
        System.out.println(conf.get(DBConfiguration.DRIVER_CLASS_PROPERTY));
        System.out.println(conf.get(DBConfiguration.URL_PROPERTY));
        System.out.println(conf.get(DBConfiguration.USERNAME_PROPERTY));
        System.out.println(conf.get(DBConfiguration.PASSWORD_PROPERTY));
        System.out.println("------------------------------------------------");
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}


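The post does not include LawCaseRecord (or LawCaseWord and the 3-argument ETLUtils helpers the mapper calls), so the following is only a minimal, hypothetical sketch of what LawCaseRecord must roughly look like for the job above to work: DBInputFormat requires its value class to implement DBWritable, the mapper reads its id and source fields, and Hadoop instantiates it reflectively, so it needs a no-arg constructor.

package com.nltk.sns.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

public class LawCaseRecord implements Writable, DBWritable {

    long id;          // primary key; also the split column passed as rowId
    String source;    // raw text to be segmented

    public LawCaseRecord() {
        // Hadoop creates instances reflectively
    }

    @Override
    public void readFields(ResultSet resultSet) throws SQLException {
        // column order follows the input query: select id,source from ...
        id = resultSet.getLong(1);
        source = resultSet.getString(2);
    }

    @Override
    public void write(PreparedStatement statement) throws SQLException {
        statement.setLong(1, id);
        statement.setString(2, source);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readLong();
        source = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(id);
        out.writeUTF(source);
    }

    @Override
    public String toString() {
        return id + "\t" + source;
    }
}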
The helper class (ETLUtils) code:





package com.nltk.sns;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;

/**
 * @author summer
 */
public class ETLUtils {

    public final static String NULL_CHAR = "";
    public final static String PUNCTUATION_REGEX = "[(\\pP)&&[^\\|\\{\\}\\#]]+";
    public final static String WHITESPACE_REGEX = "[\\p{Space}]+";

    public static String format(String s) {
        return s.replaceAll(PUNCTUATION_REGEX, NULL_CHAR).replaceAll(WHITESPACE_REGEX, NULL_CHAR);
    }

    public static List<String> split(String s, int stepN) {
        List<String> splits = new ArrayList<String>();
        if (StringUtils.isEmpty(s) || stepN < 1)
            return splits;
        int len = s.length();
        if (len <= stepN)
            splits.add(s);
        else {
            for (int j = 1; j <= stepN; j++)
                for (int i = 0; i <= len - j; i++) {
                    String key = StringUtils.mid(s, i, j);
                    if (StringUtils.isEmpty(key))
                        continue;
                    splits.add(key);
                }
        }
        return splits;
    }

    public static void main(String[] args) {
        String s = "谢婷婷等与姜波等";
        int stepN = 2;
        List<String> splits = split(s, stepN);
        System.out.println(splits);
    }
}


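For reference, split enumerates every substring of length 1 through stepN in order, so the main method above should print the eight single characters of the sample string followed by its seven adjacent bigrams: [谢, 婷, 婷, 等, 与, 姜, 波, 等, 谢婷, 婷婷, 婷等, 等与, 与姜, 姜波, 波等].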
It ran successfully.



This is a rough implementation, written mainly to meet my own needs; feel free to adapt it to yours.

  In fact, the stock DBRecordReader is not implemented very well. Reading the DBRecordReader, MySQLDBRecordReader, and OracleDBRecordReader sources side by side, DBRecordReader and MySQLDBRecordReader are far too tightly coupled: the MySQL-specific LIMIT/OFFSET paging lives in the base class. Generally speaking, even for a database with no dedicated implementation, DBRecordReader should run without throwing; it could simply fall back to a single split and a single map task, as in the sketch below.
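A minimal sketch of that fallback (hypothetical code, not from the post): a dialect-agnostic reader that builds the same SELECT as the base class but never appends the LIMIT/OFFSET suffix. It is only correct when the job runs a single split (for example with mapreduce.job.maps set to 1), since every split would otherwise re-read the whole result set.

package org.apache.hadoop.mapreduce.lib.db;

import java.sql.Connection;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;

/**
 * Fallback reader for databases without a dedicated DBRecordReader:
 * runs the query unpaged, so it must be used with a single split.
 */
public class GenericDBRecordReader<T extends DBWritable> extends DBRecordReader<T> {

    public GenericDBRecordReader(DBInputFormat.DBInputSplit split,
            Class<T> inputClass, Configuration conf, Connection conn,
            DBConfiguration dbConfig, String cond, String[] fields, String table)
            throws SQLException {
        super(split, inputClass, conf, conn, dbConfig, cond, fields, table);
    }

    @Override
    protected String getSelectQuery() {
        DBConfiguration dbConf = getDBConf();
        StringBuilder query = new StringBuilder();

        if (dbConf.getInputQuery() == null) {
            String[] fieldNames = getFieldNames();
            query.append("SELECT ");
            for (int i = 0; i < fieldNames.length; i++) {
                query.append(fieldNames[i]);
                if (i != fieldNames.length - 1) {
                    query.append(", ");
                }
            }
            query.append(" FROM ").append(getTableName());
            String conditions = getConditions();
            if (conditions != null && conditions.length() > 0) {
                query.append(" WHERE (").append(conditions).append(")");
            }
            String orderBy = dbConf.getInputOrderBy();
            if (orderBy != null && orderBy.length() > 0) {
                query.append(" ORDER BY ").append(orderBy);
            }
        } else {
            query.append(dbConf.getInputQuery());
        }

        // No LIMIT/OFFSET appended: the single split reads the entire result set,
        // which any ANSI SQL database can execute.
        return query.toString();
    }
}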