# `pipeline.yaml` is the main configuration file for an MLflow Pipeline.
# Required pipeline parameters should be defined in this file with either concrete values or
# variables such as {{ INGEST_DATA_LOCATION }}.
# Variables must be dereferenced in a profile YAML file, located under `profiles/`.
# See `profiles/local.yaml` for example usage. One may switch among profiles quickly by
# providing a profile name such as `local` in the Pipeline object constructor:
# `p = Pipeline(profile="local")`
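#
# A minimal driver sketch, assuming the `mlflow.pipelines` entry point that the comment
# above refers to (verify the module and `run()` signature against your installed
# MLflow version):
#   from mlflow.pipelines import Pipeline
#
#   p = Pipeline(profile="local")
#   p.run()           # run the full pipeline DAG
#   p.run("train")    # or run a single step (and its dependencies)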
#
# NOTE: YAML does not support tabs for indentation. Please use spaces and ensure that all YAML
# files are properly formatted.
template: "regression/v1"
# Specifies the dataset to use for model development
data:
# Dataset locations on the local filesystem are supported, as well as HTTP(S) URLs and
# any other remote locations resolvable by MLflow, such as those listed in
# https://mlflow.org/docs/latest/tracking.html#artifact-stores
location: {{INGEST_DATA_LOCATION}}
# Beyond `parquet` datasets, the `spark_sql` and `delta` formats are also natively supported for
# use with Spark
format: {{INGEST_DATA_FORMAT|default('parquet')}}
# Datasets with other formats, including `csv`, can be used by implementing and
# specifying a `custom_loader_method`
custom_loader_method: steps.ingest.load_file_as_dataframe
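# A minimal sketch of what `load_file_as_dataframe` in `steps/ingest.py` could look
# like; the (file_path, file_format) signature is an assumption based on the MLflow
# Pipelines regression template:
#   import pandas as pd
#
#   def load_file_as_dataframe(file_path: str, file_format: str) -> pd.DataFrame:
#       # Parse one file of the configured custom format into a pandas DataFrame
#       if file_format == "csv":
#           return pd.read_csv(file_path)
#       raise NotImplementedError(f"Unsupported file format: {file_format}")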
# If the `spark_sql` `format` is specified and the table location is path-like, use the
# following SQL command for Spark to read the table:
sql: SELECT * FROM delta.`{{INGEST_DATA_LOCATION}}`
# If the table location is instead table-like (a catalog table name), use a SQL command
# such as the following to read a subset of columns from the table:
# sql: SELECT col1, col2 FROM {{INGEST_DATA_LOCATION}}
# If the `delta` `format` is specified, you can also configure the Delta table `version` to read
# or the `timestamp` at which to read data
# version: 2
# timestamp: 2022-06-01T00:00:00.000Z
# Specifies the dataset to use for batch scoring. All parameters serve the same function
# as in `data`
data_scoring:
location: {{INGEST_SCORING_DATA_LOCATION}}
format: {{INGEST_SCORING_DATA_FORMAT|default('parquet')}}
custom_loader_method: steps.ingest.load_file_as_dataframe
sql: SELECT * FROM delta.`{{INGEST_SCORING_DATA_LOCATION}}`
# Specifies the name of the column containing targets / labels for model training and evaluation
target_col: "fare_amount"
steps:
split:
# Train/validation/test split ratios
split_ratios: {{SPLIT_RATIOS|default([0.75, 0.125, 0.125])}}
# Specifies the method to use to perform additional cleaning on split datasets
# Note that arbitrary transformations should go into the transform step
post_split_filter_method: steps.split.create_dataset_filter
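# A minimal sketch of `create_dataset_filter` in `steps/split.py`; it is assumed to
# take a pandas DataFrame and return a boolean Series selecting the rows to keep:
#   import pandas as pd
#
#   def create_dataset_filter(dataset: pd.DataFrame) -> pd.Series:
#       # Keep rows with a positive fare and no missing values
#       return (dataset["fare_amount"] > 0) & dataset.notna().all(axis=1)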
transform:
# Specifies the method that defines the data transformations to apply during model inference
transformer_method: steps.transform.transformer_fn
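# A minimal sketch of `transformer_fn` in `steps/transform.py`; it is assumed to return
# an *unfitted* object exposing `fit()` and `transform()`, e.g. a scikit-learn pipeline:
#   from sklearn.pipeline import Pipeline
#   from sklearn.preprocessing import StandardScaler
#
#   def transformer_fn():
#       # MLflow fits the returned transformer on the training split
#       return Pipeline(steps=[("scaler", StandardScaler())])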
train:
using: estimator_spec
# Specifies the method that defines the estimator type and parameters to use for model training
estimator_method: steps.train.estimator_fn
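# A minimal sketch of `estimator_fn` in `steps/train.py`; it is assumed to return an
# unfitted, scikit-learn-compatible estimator exposing `fit()` and `predict()`:
#   from sklearn.linear_model import SGDRegressor
#
#   def estimator_fn():
#       # MLflow trains the returned estimator on the transformed training data
#       return SGDRegressor(random_state=42)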
evaluate:
# Sets performance thresholds that a trained model must meet in order to be eligible for
# registration to the MLflow Model Registry
validation_criteria:
- metric: root_mean_squared_error
threshold: 10
- metric: mean_absolute_error
threshold: 50
- metric: weighted_mean_squared_error
threshold: 20
register:
# Specifies the name of the Registered Model to use when registering a trained model to
# the MLflow Model Registry
model_name: "taxi_fare_regressor"
# Indicates whether a model that fails to meet performance thresholds should still
# be registered to the MLflow Model Registry
allow_non_validated_model: false
predict:
# Specifies the model URI to use for batch scoring. If not set, the latest model
# registered by the training DAG is used
# model_uri: "models/model.pkl"
# Specify the output path of the scored data from predict
output_location: {{SCORED_OUTPUT_DATA_LOCATION}}
# Specify the output format of the scored data from predict
output_format: {{SCORED_OUTPUT_DATA_FORMAT|default('parquet')}}
metrics:
# Defines custom performance metrics to compute during model training and evaluation
custom:
- name: weighted_mean_squared_error
# Specifies the name of the function in `steps/custom_metrics.py` to use to compute the metric
function: weighted_mean_squared_error
greater_is_better: False
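# A minimal sketch of `weighted_mean_squared_error` in `steps/custom_metrics.py`;
# the (eval_df, builtin_metrics) signature and the `prediction`/`target` column names
# are assumptions based on the regression template:
#   from sklearn.metrics import mean_squared_error
#
#   def weighted_mean_squared_error(eval_df, builtin_metrics):
#       # Weight each row inversely to its predicted value
#       return mean_squared_error(
#           eval_df["prediction"],
#           eval_df["target"],
#           sample_weight=1 / eval_df["prediction"].values,
#       )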
# Sets the primary metric to use to evaluate model performance. This primary metric is used
# to sort MLflow Runs corresponding to the pipeline in the MLflow Tracking UI
primary: "root_mean_squared_error"