目录
- 1. 数据文本
- 2. pom.xml中依赖配置
- 3. 工具类Util
- 4. 导入数据ImportData
- 5. 对HBase表进行WordCount
- 6. 配置Job
- 7. 结果
- 参考
1. 数据文本
1_song1_2016-1-11 song1 singer1 man slow pc
2_song2_2016-1-11 song2 singer2 woman slow ios
3_song3_2016-1-11 song3 singer3 man quick andriod
4_song4_2016-1-11 song4 singer4 woman slow ios
5_song5_2016-1-11 song5 singer5 man quick pc
6_song6_2016-1-11 song6 singer6 woman quick ios
7_song7_2016-1-11 song7 singer7 man quick andriod
8_song8_2016-1-11 song8 singer8 woman slow pc
9_song9_2016-1-11 song9 singer9 woman slow ios
10_song4_2016-1-11 song4 singer4 woman slow ios
11_song6_2016-1-11 song6 singer6 woman quick ios
12_song6_2016-1-11 song6 singer6 woman quick ios
13_song3_2016-1-11 song3 singer3 man quick andriod
14_song2_2016-1-11 song2 singer2 woman slow ios
2. pom.xml中依赖配置
<!-- Dependencies for the HBase + MapReduce example: JUnit (test scope), Hadoop 3.3.6, HBase 2.5.10, log4j 1.2.17. -->
<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>

    <!-- Hadoop -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.6</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>3.3.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>3.3.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-auth</artifactId>
        <version>3.3.6</version>
    </dependency>

    <!-- HBase -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>2.5.10</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>2.5.10</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-mapreduce</artifactId>
        <version>2.5.10</version>
    </dependency>

    <!-- Logging -->
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
    </dependency>
</dependencies>
3. 工具类Util
import java.io.IOException;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Static helpers for the HBase administration and inspection steps used by the example jobs:
 * opening a connection, creating and deleting tables, and printing a table's contents.
 *
 * <p>Every helper that takes a {@link Connection} uses that connection and leaves it open;
 * closing it is the caller's responsibility.
 */
public class Util {

    /** Largest number of column families {@link #create} accepts. */
    private static final int MAX_COLUMN_FAMILIES = 3;

    /**
     * Opens a new HBase connection from the configuration returned by
     * {@link HBaseConfiguration#create()}.
     *
     * @return a new connection; the caller must close it
     * @throws IOException if the connection cannot be created
     */
    public static Connection getConnection() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        return ConnectionFactory.createConnection(conf);
    }

    /**
     * Creates {@code tableName} with the given column families unless it already exists.
     *
     * <p>Prints a message and returns without creating anything when {@code families} is empty,
     * has more than {@value #MAX_COLUMN_FAMILIES} entries, or the table already exists.
     *
     * @param conn      open connection used to obtain the {@link Admin}
     * @param tableName name of the table to create
     * @param families  column family names
     * @throws IOException if an admin operation fails
     */
    public static void create(Connection conn, String tableName, String[] families) throws IOException {
        if (families.length == 0) {
            System.out.println("please provide at least one column family.");
            return;
        }
        if (families.length > MAX_COLUMN_FAMILIES) {
            System.out.println("please reduce the number of column families.");
            return;
        }

        TableName name = TableName.valueOf(tableName);
        // try-with-resources closes the Admin on every path, including the early return below.
        try (Admin admin = conn.getAdmin()) {
            if (admin.tableExists(name)) {
                System.out.println("table exists!");
                return;
            }

            TableDescriptorBuilder tableDescBuilder = TableDescriptorBuilder.newBuilder(name);
            for (String family : families) {
                ColumnFamilyDescriptor columnFamily = ColumnFamilyDescriptorBuilder.of(family);
                tableDescBuilder.setColumnFamily(columnFamily);
            }
            admin.createTable(tableDescBuilder.build());
            System.out.println("create table success!");
        }
    }

    /**
     * Disables and deletes {@code tableName} if it exists; does nothing otherwise.
     *
     * @param conn      open connection used to obtain the {@link Admin}
     * @param tableName name of the table to delete
     * @throws IOException if an admin operation fails
     */
    public static void delete(Connection conn, String tableName) throws IOException {
        TableName name = TableName.valueOf(tableName);
        // Use the caller's connection: opening a second one here would never be closed.
        try (Admin admin = conn.getAdmin()) {
            if (!admin.tableExists(name)) {
                return;
            }
            // A table left disabled by an interrupted earlier run must not be disabled again.
            if (admin.isTableEnabled(name)) {
                admin.disableTable(name);
            }
            admin.deleteTable(name);
        }
    }

    /**
     * Prints every cell of {@code tableName} to standard output, one line per cell, after a
     * {@code "scan: "} header line.
     *
     * @param conn      open connection used to obtain the {@link Table}
     * @param tableName name of the table to scan
     * @throws IOException if the table cannot be opened or the scan fails
     */
    public static void scan(Connection conn, String tableName) throws IOException {
        try (Table table = conn.getTable(TableName.valueOf(tableName));
             ResultScanner scanner = table.getScanner(new Scan())) {
            System.out.println("scan: ");
            // Explicit next() loop keeps scan failures surfacing as IOException.
            for (Result res = scanner.next(); res != null; res = scanner.next()) {
                for (Cell cell : res.listCells()) {
                    String row = Bytes.toString(CellUtil.cloneRow(cell));
                    String columnFamily = Bytes.toString(CellUtil.cloneFamily(cell));
                    String column = Bytes.toString(CellUtil.cloneQualifier(cell));
                    String data = Bytes.toString(CellUtil.cloneValue(cell));
                    System.out.println(String.format("row: %s, family: %s, column: %s, data: %s",
                            row, columnFamily, column, data));
                }
            }
        }
    }
}
4. 导入数据ImportData
import java.io.IOException;import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;public class ImportData {public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {;@Overrideprotected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {context.write(value, NullWritable.get());}}public static class MyReducer extends TableReducer<Text, NullWritable, Text> {@Overrideprotected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {String[] columns = {"name", "singer", "gender", "ryghme", "terminal"};String[] splitStr = key.toString().split("\\s+");Put put = new Put(Bytes.toBytes(splitStr[0]));for (int i = 1; i < splitStr.length; i++) {put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(columns[i - 1]), Bytes.toBytes(splitStr[i]));}context.write(key, put);}}
}
5. 对HBase表进行WordCount
当HBase作为数据来源时，自定义Mapper需要继承TableMapper，其实质是通过TableInputFormat读取数据。同时，在配置Job时需要调用TableMapReduceUtil中的静态方法initTableMapperJob，用来指定作为输入数据来源的HBase表名和自定义的Mapper类。
import java.io.IOException;
import java.util.List;import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;public class WordCount {public static class MyMapper extends TableMapper<Text, IntWritable> {@Overrideprotected void map(ImmutableBytesWritable key, Result value, Context context)throws IOException, InterruptedException {List<Cell> cells = value.listCells();for (Cell cell : cells) {context.write(new Text(Bytes.toString(CellUtil.cloneValue(cell))), new IntWritable(1));} }}public static class MyReducer extends TableReducer<Text, IntWritable, Text> {@Overrideprotected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {int count = 0;for (IntWritable val : values) {count += val.get();}Put put = new Put(Bytes.toBytes(key.toString()));put.addColumn(Bytes.toBytes("details"), Bytes.toBytes("rank"), Bytes.toBytes(Integer.toString(count)));context.write(key, put);}}
}
6. 配置Job
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.log4j.Logger;public class App {private Logger logger1 = Logger.getLogger(App.class);public static void main(String[] args) throws Exception {String file = "file:///root/CodeProject/mapreduce-hbase/play_records.txt";Connection conn = Util.getConnection();Util.delete(conn, "music");Util.delete(conn, "namelist");Util.create(conn, "music", new String[] { "info" });Util.create(conn, "namelist", new String[] { "details" });Configuration conf = HBaseConfiguration.create();Job job = Job.getInstance(conf, "import-data");job.setJarByClass(App.class);job.setMapperClass(ImportData.MyMapper.class);job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(NullWritable.class);job.setNumReduceTasks(2);TableMapReduceUtil.initTableReducerJob("music", ImportData.MyReducer.class, job);FileInputFormat.addInputPath(job, new Path(file));int res1 = job.waitForCompletion(true) ? 0 : 1;if (res1 == 0) {Job countJob = Job.getInstance(conf, "word-count");countJob.setJarByClass(App.class);Scan scan = new Scan();scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"));TableMapReduceUtil.initTableMapperJob(TableName.valueOf("music"), scan, WordCount.MyMapper.class, Text.class, IntWritable.class, countJob);TableMapReduceUtil.initTableReducerJob("namelist", WordCount.MyReducer.class, countJob);int res2 = countJob.waitForCompletion(true) ? 0 : 1;if (res2 == 0) {Util.scan(conn, "namelist");}System.exit(res2);}conn.close();System.exit(res1);}
}
7. 结果
参考
吴章勇、杨强 著，《大数据Hadoop 3.X分布式处理实战》