compose-controller-spark-sql-single.yaml
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This docker-compose configuration is for bringing up a pipeline controller
# along with a single-process Spark environment with a JDBC endpoint.
# Environment variables:
#
# PIPELINE_CONFIG: The directory that contains pipeline configurations, namely
# application.yaml and flink-conf.yaml files.
#
# DWH_ROOT: The directory where Parquet files are written. This is shared
# between all containers; the pipeline writes to it and the Spark container
# reads from it.
#
# Note that if local paths are used, they should start with `./` or `../`. Also,
# the mounted files should be readable by the containers, e.g., world-readable.
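#
# As an illustration (the paths below are placeholders for your own setup, not
# files shipped with this repo), both variables can be set inline when bringing
# the services up:
#
#   PIPELINE_CONFIG=./config DWH_ROOT=./dwh \
#     docker-compose -f compose-controller-spark-sql-single.yaml up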
#
# NOTES ON SPARK:
# This is a very simple single-process Spark configuration for running SQL
# queries against the Parquet files generated by the pipeline. It exposes an
# endpoint on port 10001 which can be used for JDBC connections from any SQL
# client.
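#
# As a client-side sketch (the hostname and database name below are assumptions
# for a local setup): the thriftserver speaks the HiveServer2 protocol, so a
# JDBC URL like
#   jdbc:hive2://localhost:10001/default
# works with tools such as beeline, e.g.:
#   beeline -u jdbc:hive2://localhost:10001/default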
#
# For a more complete configuration that shows the different pieces needed for
# a cluster environment, please see `compose-controller-spark-sql.yaml`.
# NOTES ON METASTORE:
# This configuration uses the default embedded Derby database as Metastore for
# the thriftserver. Example config lines are provided (but commented out) that
# show how to use an external DB instead.
# OTHER CONFIGS:
# If you want to change Spark default configs, you can mount your config files
# to /opt/bitnami/spark/conf/
# https://spark.apache.org/docs/latest/configuration.html
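#
# For example (a sketch only; `spark-defaults.conf` here is a hypothetical local
# file, not part of this repo), an extra volume entry under the `spark` service
# such as
#   - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
# would override the Spark defaults listed in the page linked above.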
version: '2'

services:
  pipeline-controller:
    # To force a build, use the `--build` option of `docker-compose up`.
    build:
      context: ..
    container_name: pipeline-controller
    volumes:
      - ${PIPELINE_CONFIG}:/app/config:ro
      - ${DWH_ROOT}:/dwh
    ports:
      - '8090:8080'

  spark:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-thriftserver
    command:
      - sbin/start-thriftserver.sh
    environment:
      - HIVE_SERVER2_THRIFT_PORT=10000
    ports:
      - '10001:10000'
      - '4041:4040'
    volumes:
      - ${DWH_ROOT}:/dwh
      # NON-EMBEDDED METASTORE CONFIG:
      # If you want to persist the Metastore data, e.g., table and view
      # definitions, you can use an external database by adjusting hive-site.xml:
      #- ./hive-site_example.xml:/opt/bitnami/spark/conf/hive-site.xml
      # Note that to use an external DB, you also need to provide its driver jar:
      #- ./postgresql-42.6.0.jar:/opt/bitnami/spark/jars/postgresql-42.6.0.jar
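      #
      # A minimal sketch of the kind of connection properties such a
      # hive-site.xml typically defines for a PostgreSQL Metastore (shown as
      # key/value pairs for brevity; the actual file is XML, and all values
      # here are placeholders):
      #   javax.jdo.option.ConnectionURL=jdbc:postgresql://my-db-host:5432/metastore
      #   javax.jdo.option.ConnectionDriverName=org.postgresql.Driver
      #   javax.jdo.option.ConnectionUserName=<user>
      #   javax.jdo.option.ConnectionPassword=<password>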