Skip to content

Commit

Permalink
Spark bulk load tool.
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark Hale committed Oct 13, 2023
1 parent c2e85fb commit 0f30b80
Show file tree
Hide file tree
Showing 9 changed files with 417 additions and 176 deletions.
11 changes: 11 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
<maven.compiler.target>11</maven.compiler.target>
<rdf4j.version>4.3.5</rdf4j.version>
<hbase.version>2.5.5</hbase.version>
<hbase-connectors.version>1.0.0</hbase-connectors.version>
<hadoop.version>3.3.6</hadoop.version>
<jackson.version>2.13.4</jackson.version>
<jersey.version>2.25.1</jersey.version>
Expand Down Expand Up @@ -218,6 +219,11 @@
</build>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
Expand All @@ -227,6 +233,11 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.21</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
Expand Down
24 changes: 24 additions & 0 deletions sdk/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,30 @@
<goal>run</goal>
</goals>
</execution>
<!-- Fetches the hbase-connectors binary distribution and unpacks its HBase jars
     into ${project.build.directory}/hbase-connectors-libs during process-classes.
     NOTE(review): the enclosing plugin is not visible in this hunk; the Ant-style
     <target> and the "run" goal suggest maven-antrun-plugin - confirm. -->
<execution>
<id>hbase-connectors</id>
<phase>process-classes</phase>
<configuration>
<target>
<!-- Download the release tarball. The release tag in the URL is URL-encoded:
     %2F is "/" and %2B is "+", i.e. rel/<hbase.version>+<hadoop.version>.
     skipexisting="true" avoids re-downloading when the destination file is already present. -->
<get
src="https://github.com/pulquero/hbase-connectors/releases/download/rel%2F${hbase.version}%2B${hadoop.version}/hbase-connectors-${hbase-connectors.version}-bin.tar.gz"
dest="${project.build.directory}/hbase-connectors.tar.gz"
skipexisting="true" />
<!-- Extract only the lib/hbase-*-<hbase.version>.jar entries from the tarball;
     the flatten mapper drops the archive's directory structure so the jars land
     directly in the destination directory. -->
<untar src="${project.build.directory}/hbase-connectors.tar.gz"
dest="${project.build.directory}/hbase-connectors-libs"
compression="gzip">
<patternset>
<include
name="hbase-connectors-${hbase-connectors.version}/lib/hbase-*-${hbase.version}.jar" />
</patternset>
<mapper type="flatten" />
</untar>
</target>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
Expand Down
7 changes: 7 additions & 0 deletions sdk/src/main/assembly/sdk-assembly.xml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@
<include>hbase-zookeeper-${hbase.version}.jar</include>
</includes>
</fileSet>
<!-- Packages the files matching hbase-spark-* from the build directory's
     hbase-connectors-libs folder into the assembly's lib directory. -->
<fileSet>
<directory>${project.build.directory}/hbase-connectors-libs</directory>
<outputDirectory>lib</outputDirectory>
<includes>
<include>hbase-spark-*</include>
</includes>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
Expand Down
5 changes: 5 additions & 0 deletions sdk/src/main/scripts/halyardspark
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Launches a Halyard Spark tool through spark-submit on YARN (client deploy mode).
#
# Usage: halyardspark <tool> [tool arguments...]
#   $1       tool name, passed as the first application argument to HalyardSparkMain
#   ${@:2}   remaining arguments, passed through after the -conf options
halyard_home="$(dirname "${0}")"

# Resolve the application jar from the lib directory. Without nullglob an
# unmatched pattern stays literal, so the -f test below catches a missing jar.
halyard_tools_jars=("$halyard_home"/lib/halyard-tools-*.jar)
halyard_tools_jar="${halyard_tools_jars[0]}"
if [ ! -f "$halyard_tools_jar" ]; then
    echo "halyardspark: cannot find halyard-tools jar in $halyard_home/lib" >&2
    exit 1
fi

# spark-submit --jars expects a comma-separated list of jar files, not a directory.
halyard_lib_jars=("$halyard_home"/lib/*.jar)
halyard_lib="$(IFS=,; echo "${halyard_lib_jars[*]}")"

# The application jar must directly precede the application arguments; the
# original referenced an undefined variable here, which dropped the jar entirely.
spark-submit --class com.msd.gin.halyard.tools.HalyardSparkMain --master yarn --deploy-mode client --jars "$halyard_lib" "$halyard_tools_jar" "$1" -conf /etc/hbase/conf/hbase-site.xml -conf "$halyard_home/halyard-defaults.xml" "${@:2}"
275 changes: 137 additions & 138 deletions tools/pom.xml
Original file line number Diff line number Diff line change
@@ -1,146 +1,145 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>halyard-tools</artifactId>
<packaging>jar</packaging>
<parent>
<groupId>io.github.pulquero.halyard</groupId>
<artifactId>halyard</artifactId>
<version>4.9-SNAPSHOT</version>
</parent>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-strategy</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-sail</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-rio</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>javax.activation</groupId>
<artifactId>javax.activation-api</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-console</artifactId>
<version>${rdf4j.version}</version>
<exclusions>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>halyard-tools</artifactId>
<packaging>jar</packaging>
<parent>
<groupId>io.github.pulquero.halyard</groupId>
<artifactId>halyard</artifactId>
<version>4.9-SNAPSHOT</version>
</parent>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-strategy</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-sail</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-rio</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>javax.activation</groupId>
<artifactId>javax.activation-api</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-client</artifactId>
<version>${rdf4j.version}</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-mapreduce</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hbase.connectors.spark</groupId>
<artifactId>hbase-spark</artifactId>
<version>${hbase-connectors.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>${elasticsearch.version}</version>
<exclusions>
<exclusion>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-spin</artifactId>
<groupId>*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-mapreduce</artifactId>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>${elasticsearch.version}</version>
<exclusions>
<exclusion>
<groupId>*</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-common</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
</exclusions>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>halyard-common</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit4.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
<classifier>tests</classifier>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>test</scope>
<classifier>tests</classifier>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-hs</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-server-resourcemanager</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-server-tests</artifactId>
<scope>test</scope>
<classifier>tests</classifier>
</dependency>
<dependency>
<groupId>org.apache.derby</groupId>
<artifactId>derby</artifactId>
<version>10.12.1.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
<classifier>tests</classifier>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>test</scope>
<classifier>tests</classifier>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-hs</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-server-resourcemanager</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-server-tests</artifactId>
<scope>test</scope>
<classifier>tests</classifier>
</dependency>
<dependency>
<groupId>org.apache.derby</groupId>
<artifactId>derby</artifactId>
<version>10.12.1.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -269,10 +269,10 @@ protected static final void bulkLoad(Configuration conf, TableName tableName, Pa
// reqd if HFiles need splitting (code from HFileOutputFormat2)
byte[] tableAndFamily = HalyardTableUtils.getTableNameSuffixedWithFamily(tableName.toBytes());
Map<byte[], String> bloomTypeMap = createFamilyConfValueMap(conf, "hbase.hfileoutputformat.families.bloomtype");
Map<byte[], String> bloomParamMap = createFamilyConfValueMap(conf, "hbase.hfileoutputformat.families.bloomparam");
String bloomType = bloomTypeMap.get(tableAndFamily);
String bloomParam = bloomParamMap.get(tableAndFamily);
if (BloomType.ROWPREFIX_FIXED_LENGTH.toString().equals(bloomType)) {
Map<byte[], String> bloomParamMap = createFamilyConfValueMap(conf, "hbase.hfileoutputformat.families.bloomparam");
String bloomParam = bloomParamMap.get(tableAndFamily);
conf.set(BloomFilterUtil.PREFIX_LENGTH_KEY, bloomParam);
}

Expand Down
Loading

0 comments on commit 0f30b80

Please sign in to comment.