# `pipeline.yaml` is the main configuration file for an MLflow Pipeline.
# Required pipeline parameters should be defined in this file with either concrete values or
# variables such as {{ INGEST_DATA_LOCATION }}.
# Variables must be dereferenced in a profile YAML file, located under `profiles/`.
# See `profiles/local.yaml` for example usage. One may switch among profiles quickly by
# providing a profile name such as `local` in the Pipeline object constructor:
# `p = Pipeline(profile="local")`
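#
# A minimal driver sketch, assuming the `mlflow.pipelines` entry point that the comment
# above refers to (verify the module and `run()` signature against your installed
# MLflow version):
#   from mlflow.pipelines import Pipeline
#
#   p = Pipeline(profile="local")
#   p.run()           # run the full pipeline DAG
#   p.run("train")    # or run a single step (and its dependencies)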
#
# NOTE: YAML does not support tabs for indentation. Please use spaces and ensure that all YAML
# files are properly formatted.
template: "regression/v1"
# Specifies the dataset to use for model development
data:
# Dataset locations on the local filesystem are supported, as well as HTTP(S) URLs and
# any other remote locations resolvable by MLflow, such as those listed in
# https://mlflow.org/docs/latest/tracking.html#artifact-stores
location: {{INGEST_DATA_LOCATION}}
# Beyond `parquet` datasets, the `spark_sql` and `delta` formats are also natively supported for
# use with Spark
format: {{INGEST_DATA_FORMAT|default('parquet')}}
# Datasets with other formats, including `csv`, can be used by implementing and
# specifying a `custom_loader_method`
custom_loader_method: steps.ingest.load_file_as_dataframe
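# A minimal sketch of what `load_file_as_dataframe` in `steps/ingest.py` could look
# like; the (file_path, file_format) signature is an assumption based on the MLflow
# Pipelines regression template:
#   import pandas as pd
#
#   def load_file_as_dataframe(file_path: str, file_format: str) -> pd.DataFrame:
#       # Parse one file of the configured custom format into a pandas DataFrame
#       if file_format == "csv":
#           return pd.read_csv(file_path)
#       raise NotImplementedError(f"Unsupported file format: {file_format}")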
# If the `spark_sql` `format` is specified and the table location is path-like, use the
# following SQL command for Spark to read the table:
sql: SELECT * FROM delta.`{{INGEST_DATA_LOCATION}}`
# If the table location is instead table-like (a catalog table name), use a SQL command
# such as the following to read a subset of columns from the table:
# sql: SELECT col1, col2 FROM {{INGEST_DATA_LOCATION}}
# If the `delta` `format` is specified, you can also configure the Delta table `version` to read
# or the `timestamp` at which to read data
# version: 2
# timestamp: 2022-06-01T00:00:00.000Z
# Specifies the dataset to use for batch scoring. All parameters serve the same function
# as in `data`
data_scoring:
location: {{INGEST_SCORING_DATA_LOCATION}}
format: {{INGEST_SCORING_DATA_FORMAT|default('parquet')}}
custom_loader_method: steps.ingest.load_file_as_dataframe
sql: SELECT * FROM delta.`{{INGEST_SCORING_DATA_LOCATION}}`
# Specifies the name of the column containing targets / labels for model training and evaluation
target_col: "fare_amount"
steps:
split:
# Train/validation/test split ratios
split_ratios: {{SPLIT_RATIOS|default([0.75, 0.125, 0.125])}}
# Specifies the method to use to perform additional cleaning on split datasets
# Note that arbitrary transformations should go into the transform step
post_split_filter_method: steps.split.create_dataset_filter
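# A minimal sketch of `create_dataset_filter` in `steps/split.py`; it is assumed to
# take a pandas DataFrame and return a boolean Series selecting the rows to keep:
#   import pandas as pd
#
#   def create_dataset_filter(dataset: pd.DataFrame) -> pd.Series:
#       # Keep rows with a positive fare and no missing values
#       return (dataset["fare_amount"] > 0) & dataset.notna().all(axis=1)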
transform:
# Specifies the method that defines the data transformations to apply during model inference
transformer_method: steps.transform.transformer_fn
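# A minimal sketch of `transformer_fn` in `steps/transform.py`; it is assumed to return
# an *unfitted* object exposing `fit()` and `transform()`, e.g. a scikit-learn pipeline:
#   from sklearn.pipeline import Pipeline
#   from sklearn.preprocessing import StandardScaler
#
#   def transformer_fn():
#       # MLflow fits the returned transformer on the training split
#       return Pipeline(steps=[("scaler", StandardScaler())])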
train:
using: estimator_spec
# Specifies the method that defines the estimator type and parameters to use for model training
estimator_method: steps.train.estimator_fn
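# A minimal sketch of `estimator_fn` in `steps/train.py`; it is assumed to return an
# unfitted, scikit-learn-compatible estimator exposing `fit()` and `predict()`:
#   from sklearn.linear_model import SGDRegressor
#
#   def estimator_fn():
#       # MLflow trains the returned estimator on the transformed training data
#       return SGDRegressor(random_state=42)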
evaluate:
# Sets performance thresholds that a trained model must meet in order to be eligible for
# registration to the MLflow Model Registry
validation_criteria:
- metric: root_mean_squared_error
threshold: 10
- metric: mean_absolute_error
threshold: 50
- metric: weighted_mean_squared_error
threshold: 20
register:
# Specifies the name of the Registered Model to use when registering a trained model to
# the MLflow Model Registry
model_name: "taxi_fare_regressor"
# Indicates whether a model that fails to meet performance thresholds should still
# be registered to the MLflow Model Registry
allow_non_validated_model: false
predict:
# Specifies the model URI to use for batch scoring. If not set, the latest model
# registered by the training DAG is used
# model_uri: "models/model.pkl"
# Specify the output path of the scored data from predict
output_location: {{SCORED_OUTPUT_DATA_LOCATION}}
# Specify the output format of the scored data from predict
output_format: {{SCORED_OUTPUT_DATA_FORMAT|default('parquet')}}
metrics:
# Defines custom performance metrics to compute during model training and evaluation
custom:
- name: weighted_mean_squared_error
# Specifies the name of the function in `steps/custom_metrics.py` to use to compute the metric
function: weighted_mean_squared_error
greater_is_better: False
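# A minimal sketch of `weighted_mean_squared_error` in `steps/custom_metrics.py`;
# the (eval_df, builtin_metrics) signature and the `prediction`/`target` column names
# are assumptions based on the regression template:
#   from sklearn.metrics import mean_squared_error
#
#   def weighted_mean_squared_error(eval_df, builtin_metrics):
#       # Weight each row inversely to its predicted value
#       return mean_squared_error(
#           eval_df["prediction"],
#           eval_df["target"],
#           sample_weight=1 / eval_df["prediction"].values,
#       )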
# Sets the primary metric to use to evaluate model performance. This primary metric is used
# to sort MLflow Runs corresponding to the pipeline in the MLflow Tracking UI
primary: "root_mean_squared_error"