From 8feeb95ca7f6cb0ae8709cb262027269b1b6f255 Mon Sep 17 00:00:00 2001
From: Pahulpreet Singh
Date: Wed, 15 Jan 2025 18:42:17 +0000
Subject: [PATCH 1/4] add script and readme (first draft) for hive lineage

Signed-off-by: Pahulpreet Singh
---
 hive-lineage/README.md       | 36 +++++++++++++++++++++++++
 hive-lineage/hive-lineage.sh | 52 ++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 hive-lineage/README.md
 create mode 100644 hive-lineage/hive-lineage.sh

diff --git a/hive-lineage/README.md b/hive-lineage/README.md
new file mode 100644
index 000000000..b7ddf4547
--- /dev/null
+++ b/hive-lineage/README.md
@@ -0,0 +1,36 @@
+# Hive Lineage Initialization Action
+
+## Using the initialization action
+
+**:warning: NOTICE:** See
+[best practices](/README.md#how-initialization-actions-are-used) of using
+initialization actions in production.
+
+You can use this initialization action to create a new Dataproc cluster with Lineage enabled for Hive jobs.
+Note that this feature is in preview for now.
+
+```shell
+REGION=
+CLUSTER_NAME=
+gcloud dataproc clusters create ${CLUSTER_NAME} \
+    --region ${REGION} \
+    --scopes cloud-platform \
+    --initialization-actions gs://dataproc-hive-lineage-prototype/v3/initialization-actions/enable_lineage.sh
+```
+
+(TODO: Update the gs bucket with the regionalized `goog-dataproc-initialization-actions-`)
+
+If you want to run Hive jobs involving bigquery tables, hive-bigquery connector needs to be installed as well.
+(TODO: Link to connectors init action)
+
+```shell
+REGION=
+CLUSTER_NAME=
+gcloud dataproc clusters create ${CLUSTER_NAME} \
+    --region ${REGION} \
+    --scopes cloud-platform \
+    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh,gs://dataproc-hive-lineage-prototype/v3/initialization-actions/enable_lineage.sh \
+    --metadata hive-bigquery-connector-version=2.0.3
+```
+
+(TODO: Once multiple versions are supported, update instructions accordingly)
\ No newline at end of file
diff --git a/hive-lineage/hive-lineage.sh b/hive-lineage/hive-lineage.sh
new file mode 100644
index 000000000..c022b42f1
--- /dev/null
+++ b/hive-lineage/hive-lineage.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# This initialization script installs the required
+# jars and sets the hive conf to enable lineage.
+#
+# To use this script, add the following using the
+# --initialization-actions flag during cluster creation.
+# gs://dataproc-hive-lineage-prototype/v3/initialization-actions/enable_lineage.sh
+
+set -euxo pipefail
+
+export HIVE_HOME="/usr/lib/hive"
+export HIVE_CONF_DIR="/etc/hive/conf"
+export HIVE_CONF_FILE="$HIVE_CONF_DIR/hive-site.xml"
+export HIVE_LIB_DIR="/usr/lib/hive/lib"
+export INSTALLATION_SOURCE="gs://dataproc-hive-lineage-prototype/v3/jars" # TODO: Update the gcs bucket once finalised
+export HIVE_OL_HOOK="io.openlineage.hive.hooks.HiveOpenLineageHook"
+
+
+function set_hive_lineage_conf() {
+  declare -A properties=(
+    ["hive.exec.post.hooks"]="$HIVE_OL_HOOK"
+    ["hive.exec.exec.hooks"]="$HIVE_OL_HOOK"
+    ["hive.openlineage.transport.type"]="gcplineage"
+    ["hive.conf.validation"]="false" # to allow custom properties, like hive.openlineage.namespace
+  )
+  echo "Setting hive conf to enable lineage"
+  for key in "${!properties[@]}"; do
+    bdconfig set_property \
+      --configuration_file="$HIVE_CONF_FILE" \
+      --name "$key" \
+      --value "${properties[$key]}"
+  done
+}
+
+function install_jars() {
+  # TODO: Allow customisation of the jar version
+  echo "Installing openlineage-hive hook"
+  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-shaded.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
+}
+
+function restart_hive_server2_master() {
+  ROLE=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role)
+  if [[ "${ROLE}" == 'Master' ]]; then
+    echo "Restarting hive-server2"
+    sudo systemctl restart hive-server2.service
+  fi
+}
+
+install_jars
+set_hive_lineage_conf
+restart_hive_server2_master

From c05a7460275106c77973ec5496c5f3ee9223ea0d Mon Sep 17 00:00:00 2001
From: Pahulpreet Singh
Date: Tue, 28 Jan 2025 02:26:42 +0000
Subject: [PATCH 2/4] address TODOs and add license header

Signed-off-by: Pahulpreet Singh
---
 hive-lineage/README.md       | 12 ++++--------
 hive-lineage/hive-lineage.sh | 24 ++++++++++++++++--------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/hive-lineage/README.md b/hive-lineage/README.md
index b7ddf4547..14f94c590 100644
--- a/hive-lineage/README.md
+++ b/hive-lineage/README.md
@@ -15,13 +15,11 @@ CLUSTER_NAME=
 gcloud dataproc clusters create ${CLUSTER_NAME} \
     --region ${REGION} \
     --scopes cloud-platform \
-    --initialization-actions gs://dataproc-hive-lineage-prototype/v3/initialization-actions/enable_lineage.sh
+    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh
 ```
 
-(TODO: Update the gs bucket with the regionalized `goog-dataproc-initialization-actions-`)
-
 If you want to run Hive jobs involving bigquery tables, hive-bigquery connector needs to be installed as well.
-(TODO: Link to connectors init action)
+See [connectors](../connectors/README.md) init action.
 
 ```shell
 REGION=
@@ -29,8 +27,6 @@ CLUSTER_NAME=
 gcloud dataproc clusters create ${CLUSTER_NAME} \
     --region ${REGION} \
     --scopes cloud-platform \
-    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh,gs://dataproc-hive-lineage-prototype/v3/initialization-actions/enable_lineage.sh \
+    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh,gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh \
     --metadata hive-bigquery-connector-version=2.0.3
-```
-
-(TODO: Once multiple versions are supported, update instructions accordingly)
\ No newline at end of file
+```
\ No newline at end of file
diff --git a/hive-lineage/hive-lineage.sh b/hive-lineage/hive-lineage.sh
index c022b42f1..6b99c2a8c 100644
--- a/hive-lineage/hive-lineage.sh
+++ b/hive-lineage/hive-lineage.sh
@@ -1,11 +1,19 @@
 #!/bin/bash
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 # This initialization script installs the required
 # jars and sets the hive conf to enable lineage.
-#
-# To use this script, add the following using the
-# --initialization-actions flag during cluster creation.
-# gs://dataproc-hive-lineage-prototype/v3/initialization-actions/enable_lineage.sh
 
 set -euxo pipefail
 
@@ -13,14 +21,15 @@ export HIVE_HOME="/usr/lib/hive"
 export HIVE_CONF_DIR="/etc/hive/conf"
 export HIVE_CONF_FILE="$HIVE_CONF_DIR/hive-site.xml"
 export HIVE_LIB_DIR="/usr/lib/hive/lib"
-export INSTALLATION_SOURCE="gs://dataproc-hive-lineage-prototype/v3/jars" # TODO: Update the gcs bucket once finalised
+export INSTALLATION_SOURCE="gs://hadoop-lib/hive-lineage"
+export HIVE_OL_HOOK_VERSION="1.0.0-preview"
 export HIVE_OL_HOOK="io.openlineage.hive.hooks.HiveOpenLineageHook"
 
 
 function set_hive_lineage_conf() {
   declare -A properties=(
     ["hive.exec.post.hooks"]="$HIVE_OL_HOOK"
-    ["hive.exec.exec.hooks"]="$HIVE_OL_HOOK"
+    ["hive.exec.failure.hooks"]="$HIVE_OL_HOOK"
     ["hive.openlineage.transport.type"]="gcplineage"
     ["hive.conf.validation"]="false" # to allow custom properties, like hive.openlineage.namespace
   )
@@ -34,9 +43,8 @@ function set_hive_lineage_conf() {
 }
 
 function install_jars() {
-  # TODO: Allow customisation of the jar version
   echo "Installing openlineage-hive hook"
-  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-shaded.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
+  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-$HIVE_HIVE_OL_HOOK_VERSION.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
 }
 
 function restart_hive_server2_master() {

From 3bb025e178684e2601216e131e8549eb1e74251d Mon Sep 17 00:00:00 2001
From: Pahulpreet Singh
Date: Thu, 30 Jan 2025 11:44:43 +0000
Subject: [PATCH 3/4] hive-lineage: Add basic test for hive job

Signed-off-by: Pahulpreet Singh
---
 BUILD                             | 23 ++++++++++++++++++++---
 hive-lineage/__init__.py          |  1 +
 hive-lineage/hivetest.hive        | 23 +++++++++++++++++++++
 hive-lineage/test_hive_lineage.py | 22 ++++++++++++++++++++
 4 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 hive-lineage/__init__.py
 create mode 100644 hive-lineage/hivetest.hive
 create mode 100644 hive-lineage/test_hive_lineage.py

diff --git a/BUILD b/BUILD
index dfb599b4d..2bc555ce8 100644
--- a/BUILD
+++ b/BUILD
@@ -6,8 +6,8 @@ test_suite(
         ":test_cloud_sql_proxy",
         ":test_dr_elephant",
         ":test_hive_hcatalog",
-        ":test_starburst_presto",
         ":test_spark_rapids",
+        ":test_starburst_presto",
         "//alluxio:test_alluxio",
         "//atlas:test_atlas",
         "//bigtable:test_bigtable",
@@ -60,7 +60,10 @@ py_test(
     name = "test_cloud_sql_proxy",
     size = "enormous",
     srcs = ["cloud-sql-proxy/test_cloud_sql_proxy.py"],
-    data = ["cloud-sql-proxy/cloud-sql-proxy.sh", "cloud-sql-proxy/hivetest.hive"],
+    data = [
+        "cloud-sql-proxy/cloud-sql-proxy.sh",
+        "cloud-sql-proxy/hivetest.hive",
+    ],
     local = True,
     shard_count = 4,
     deps = [
@@ -114,10 +117,10 @@ py_test(
     size = "enormous",
     srcs = ["spark-rapids/test_spark_rapids.py"],
     data = [
+        "spark-rapids/mig.sh",
         "spark-rapids/spark-rapids.sh",
         "spark-rapids/verify_xgboost_spark_rapids.scala",
         "spark-rapids/verify_xgboost_spark_rapids_sql.scala",
-        "spark-rapids/mig.sh",
     ],
     local = True,
     shard_count = 3,
@@ -132,3 +135,17 @@ py_library(
     testonly = True,
     srcs = ["cloud-sql-proxy/pyspark_metastore_test.py"],
 )
+
+py_test(
+    name = "test_hive_lineage",
+    size = "small",
+    srcs = ["hive-lineage/test_hive_lineage.py"],
+    data = [
+        "hive-lineage/hive-lineage.sh",
+        "hive-lineage/hivetest.hive",
+    ],
+    local = True,
+    deps = [
+        "//integration_tests:dataproc_test_case",
+    ],
+)
diff --git a/hive-lineage/__init__.py b/hive-lineage/__init__.py
new file mode 100644
index 000000000..709aae858
--- /dev/null
+++ b/hive-lineage/__init__.py
@@ -0,0 +1 @@
+# required for integration tests
\ No newline at end of file
diff --git a/hive-lineage/hivetest.hive b/hive-lineage/hivetest.hive
new file mode 100644
index 000000000..e9981994d
--- /dev/null
+++ b/hive-lineage/hivetest.hive
@@ -0,0 +1,23 @@
+DROP TABLE IF EXISTS validate_hive_tbl;
+DROP TABLE IF EXISTS grouped_tbl;
+
+CREATE EXTERNAL TABLE validate_hive_tbl (
+    shell_user STRING,
+    dummy STRING,
+    uid INT,
+    gid INT,
+    name STRING,
+    home STRING,
+    shell STRING
+)
+ROW FORMAT DELIMITED
+    FIELDS TERMINATED BY ':';
+
+CREATE TABLE grouped_tbl
+  ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+AS SELECT shell, COUNT(*) shell_count
+   FROM validate_hive_tbl
+   GROUP BY shell
+   ORDER BY shell_count DESC, shell DESC;
+
+SELECT * from grouped_tbl;
\ No newline at end of file
diff --git a/hive-lineage/test_hive_lineage.py b/hive-lineage/test_hive_lineage.py
new file mode 100644
index 000000000..db329a466
--- /dev/null
+++ b/hive-lineage/test_hive_lineage.py
@@ -0,0 +1,22 @@
+from absl.testing import absltest
+
+from integration_tests.dataproc_test_case import DataprocTestCase
+
+class HiveLineageTestCase(DataprocTestCase):
+    INIT_ACTIONS = ["hive-lineage/hive-lineage.sh"]
+    TEST_SCRIPT_FILE = "hive-lineage/hivetest.hive"
+
+    def __submit_hive_job(self, cluster_name):
+        self.assert_dataproc_job(
+            cluster_name, 'hive', '--file={}/{}'.format(self.INIT_ACTIONS_REPO,
+                                                        self.TEST_SCRIPT_FILE))
+    def verify_cluster(self, name):
+        self.__submit_hive_job(name)
+
+    def test_hive_job_success(self, configuration):
+        self.createCluster(configuration, self.INIT_ACTIONS, scopes='cloud-platform')
+        self.verify_cluster(self.getClusterName())
+
+
+if __name__ == "__main__":
+    absltest.main()
\ No newline at end of file

From 3d250a27eff1269d92a2703570810cb3ecbc9c5b Mon Sep 17 00:00:00 2001
From: Pahulpreet Singh
Date: Thu, 30 Jan 2025 11:45:34 +0000
Subject: [PATCH 4/4] hive-lineage: address comments

Signed-off-by: Pahulpreet Singh
---
 BUILD                             |  4 +++-
 hive-lineage/hive-lineage.sh      | 22 +++++++++++++---------
 hive-lineage/test_hive_lineage.py | 10 +++++++++-
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/BUILD b/BUILD
index 2bc555ce8..3dd5093ab 100644
--- a/BUILD
+++ b/BUILD
@@ -138,14 +138,16 @@ py_library(
 
 py_test(
     name = "test_hive_lineage",
-    size = "small",
+    size = "enormous",
     srcs = ["hive-lineage/test_hive_lineage.py"],
     data = [
         "hive-lineage/hive-lineage.sh",
         "hive-lineage/hivetest.hive",
     ],
     local = True,
+    shard_count = 3,
     deps = [
         "//integration_tests:dataproc_test_case",
+        "@io_abseil_py//absl/testing:parameterized",
     ],
 )
diff --git a/hive-lineage/hive-lineage.sh b/hive-lineage/hive-lineage.sh
index 6b99c2a8c..586d60a3d 100644
--- a/hive-lineage/hive-lineage.sh
+++ b/hive-lineage/hive-lineage.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# Copyright 2025 Google LLC and contributors
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -17,14 +19,15 @@
 
 set -euxo pipefail
 
-export HIVE_HOME="/usr/lib/hive"
-export HIVE_CONF_DIR="/etc/hive/conf"
-export HIVE_CONF_FILE="$HIVE_CONF_DIR/hive-site.xml"
-export HIVE_LIB_DIR="/usr/lib/hive/lib"
-export INSTALLATION_SOURCE="gs://hadoop-lib/hive-lineage"
-export HIVE_OL_HOOK_VERSION="1.0.0-preview"
-export HIVE_OL_HOOK="io.openlineage.hive.hooks.HiveOpenLineageHook"
-
+function prepare_env() {
+  export HIVE_HOME="/usr/lib/hive"
+  export HIVE_CONF_DIR="/etc/hive/conf"
+  export HIVE_CONF_FILE="$HIVE_CONF_DIR/hive-site.xml"
+  export HIVE_LIB_DIR="$HIVE_HOME/lib"
+  export INSTALLATION_SOURCE="gs://hadoop-lib/hive-lineage"
+  export HIVE_OL_HOOK_VERSION="1.0.0-preview"
+  export HIVE_OL_HOOK="io.openlineage.hive.hooks.HiveOpenLineageHook"
+}
 
 function set_hive_lineage_conf() {
   declare -A properties=(
@@ -44,7 +47,7 @@ function set_hive_lineage_conf() {
 
 function install_jars() {
   echo "Installing openlineage-hive hook"
-  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-$HIVE_HIVE_OL_HOOK_VERSION.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
+  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-$HIVE_OL_HOOK_VERSION.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
 }
 
 function restart_hive_server2_master() {
@@ -55,6 +58,7 @@ function restart_hive_server2_master() {
   fi
 }
 
+prepare_env
 install_jars
 set_hive_lineage_conf
 restart_hive_server2_master
diff --git a/hive-lineage/test_hive_lineage.py b/hive-lineage/test_hive_lineage.py
index db329a466..166c700c0 100644
--- a/hive-lineage/test_hive_lineage.py
+++ b/hive-lineage/test_hive_lineage.py
@@ -1,8 +1,10 @@
 from absl.testing import absltest
+from absl.testing import parameterized
 
 from integration_tests.dataproc_test_case import DataprocTestCase
 
 class HiveLineageTestCase(DataprocTestCase):
+    COMPONENT = "hive-lineage"
     INIT_ACTIONS = ["hive-lineage/hive-lineage.sh"]
     TEST_SCRIPT_FILE = "hive-lineage/hivetest.hive"
 
@@ -13,8 +15,14 @@ def __submit_hive_job(self, cluster_name):
     def verify_cluster(self, name):
         self.__submit_hive_job(name)
 
+    @parameterized.parameters(
+        'STANDARD',
+        'HA',
+    )
     def test_hive_job_success(self, configuration):
-        self.createCluster(configuration, self.INIT_ACTIONS, scopes='cloud-platform')
+        self.createCluster(configuration,
+                           self.INIT_ACTIONS,
+                           scopes='cloud-platform')
         self.verify_cluster(self.getClusterName())
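
Editor's note, not part of the patch series: a minimal manual-verification sketch for a cluster created with this init action. The region, zone, and cluster names below are placeholders; the jar path and property names are the ones written by `install_jars` and `set_hive_lineage_conf` above, and the inline query is only a stand-in for the bundled `hivetest.hive` script.

```shell
# Placeholders -- substitute real values for your project.
REGION=us-central1
ZONE=us-central1-a
CLUSTER_NAME=hive-lineage-test

# Confirm the hook jar and the hive-site.xml properties landed on the master node.
gcloud compute ssh "${CLUSTER_NAME}-m" --zone "${ZONE}" --command='
  ls -l /usr/lib/hive/lib/hive-openlineage-hook.jar &&
  grep -E -A1 "hive.exec.post.hooks|hive.openlineage.transport.type" /etc/hive/conf/hive-site.xml'

# Submit a query that writes a table; the OpenLineage hook should emit a lineage
# event for the CREATE TABLE ... AS SELECT once the job completes.
gcloud dataproc jobs submit hive \
  --cluster "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --execute "CREATE TABLE IF NOT EXISTS lineage_smoke_test AS SELECT 1 AS id"
```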
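For context on what `set_hive_lineage_conf` amounts to: the same four Hive properties could in principle be supplied as Dataproc cluster properties at creation time, as sketched below with placeholder names (the `hive:` prefix targets `hive-site.xml`). This is not a substitute for the init action, which is still needed to copy the OpenLineage hook jar into the Hive lib directory and restart HiveServer2 on the master.

```shell
# Sketch only: equivalent cluster-level Hive properties (placeholders throughout).
# The init action remains responsible for installing hive-openlineage-hook.jar.
REGION=us-central1
CLUSTER_NAME=hive-lineage-test
HOOK=io.openlineage.hive.hooks.HiveOpenLineageHook

gcloud dataproc clusters create "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --scopes cloud-platform \
  --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh" \
  --properties "hive:hive.exec.post.hooks=${HOOK},hive:hive.exec.failure.hooks=${HOOK},hive:hive.openlineage.transport.type=gcplineage,hive:hive.conf.validation=false"
```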