Reading and Writing HBase with Scala (Spark)

Published on 2019-07-01

Most of the examples found online are fairly outdated, so this records the version currently in use for reading and writing HBase from Scala (Spark). Scala 2.11.8; Spark 2.1.0. It has only been verified on a local cluster and is provided for reference.

```scala
package test

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession

object TestHBase {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("LinkStart").master("local").getOrCreate()
    val sc = spark.sparkContext

    val conf = HBaseConfiguration.create()
    // ZooKeeper quorum of the HBase cluster (comma-separated host list) and client port (default 2181)
    conf.set("hbase.zookeeper.quorum", "cluster IPs, comma-separated")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.master", "master:port")
    // Table to read from
    conf.set(TableInputFormat.INPUT_TABLE, "test2019:bulletin")

    // Read the table as an RDD of (row key, Result)
    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    hBaseRDD.cache()
    val count = hBaseRDD.count()
    println("HBase RDD Count:" + count)

    // Iterate over the rows and print two columns of the "docs" family
    hBaseRDD.foreach { case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val oldData = Bytes.toString(result.getValue("docs".getBytes, "insert_time".getBytes))
      val newData = Bytes.toString(result.getValue("docs".getBytes, "latest".getBytes))
      println("Row key:" + key + " OLD:" + oldData + " NEW:" + newData)
    }

    // Write to another table via TableOutputFormat
    val tablename = "test2019:bull"
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Result])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    // Build two rows of test data
    val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26", "4,Guanhua,M,27"))
    val rdd = indataRDD.map(_.split(',')).map { arr =>
      val put = new Put(Bytes.toBytes(arr(0)))                                                // row key
      put.addColumn(Bytes.toBytes("docs"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))      // docs:name
      put.addColumn(Bytes.toBytes("docs"), Bytes.toBytes("gender"), Bytes.toBytes(arr(2)))    // docs:gender
      put.addColumn(Bytes.toBytes("docs"), Bytes.toBytes("age"), Bytes.toBytes(arr(3).toInt)) // docs:age
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
```
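If you also want to query the rows just read with Spark SQL, the (ImmutableBytesWritable, Result) RDD can be mapped to plain tuples and turned into a DataFrame. Below is a minimal sketch that could be dropped into the same main method after hBaseRDD is created; the DataFrame column names and the temp view name "bulletin" are made up for illustration and are not part of the original example.

```scala
// Sketch only: convert the (ImmutableBytesWritable, Result) RDD into a DataFrame.
// Column names ("row_key", "insert_time", "latest") are illustrative placeholders.
import spark.implicits._

val docsDF = hBaseRDD.map { case (_, result) =>
  val key        = Bytes.toString(result.getRow)
  val insertTime = Bytes.toString(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("insert_time")))
  val latest     = Bytes.toString(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("latest")))
  (key, insertTime, latest)
}.toDF("row_key", "insert_time", "latest")

docsDF.createOrReplaceTempView("bulletin")
spark.sql("SELECT row_key, latest FROM bulletin LIMIT 10").show()
```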
The Maven configuration file (pom.xml) is as follows:

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.haizhi.data</groupId>
    <artifactId>DataScala</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <spark.version>2.1.0</spark.version>
        <scala.version>2.11</scala.version>
        <hadoop.version>2.5.0</hadoop.version>
        <hbase.version>1.2.0</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.mongodb.scala</groupId>
            <artifactId>mongo-scala-driver_2.11</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase</artifactId>
            <version>${hbase.version}</version>
            <type>pom</type>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.19</version>
                <configuration>
                    <skip>true</skip>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
```
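With these dependencies in place, the rows written by saveAsNewAPIHadoopDataset can also be checked directly through the plain hbase-client API instead of another Spark job. The sketch below is illustrative only: the object name VerifyWrite is a placeholder and the connection settings simply mirror the example above. It fetches row "3" from test2019:bull and decodes docs:name and docs:age (age was written as a 4-byte int, so it is read back with Bytes.toInt).

```scala
package test

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object VerifyWrite {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    // Same placeholder connection settings as the Spark example above
    conf.set("hbase.zookeeper.quorum", "cluster IPs, comma-separated")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val connection = ConnectionFactory.createConnection(conf)
    try {
      val table = connection.getTable(TableName.valueOf("test2019:bull"))
      // Fetch one of the rows written above (row key "3") and decode docs:name and docs:age
      val result = table.get(new Get(Bytes.toBytes("3")))
      val name = Bytes.toString(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("name")))
      val age  = Bytes.toInt(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("age")))
      println(s"name=$name age=$age")
      table.close()
    } finally {
      connection.close()
    }
  }
}
```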