diff --git a/README.md b/README.md index 00713db..e2a4e2e 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,28 @@ The `build_activity` macro is a convenience function that will take the data fro All other columns will be added automatically and aliased as configured in the project to the final `select` statement that is generated by the `build_activity` macro.

+## **Generating docs and basic tests** +Since each activity table has a similar schema, it is possible to generate the documentation and add basic tests automatically. +The `generate_activity_yml` macro takes a list of activity names as input and prints out the yaml documentation for these. You can use the output as a basis for your documentation and test setup. + +### **Usage** +In your shell use `dbt run-operation` to run the macro. Replace the activities list with your specific activity names as needed. +``` +dbt run-operation generate_activity_yml --args '{activities: [customer__visited_page, customer__bought_something]}' +``` +Copy the output to a yaml file. E.g. `activities_models.yml` + +### **Input Expectations** +The macro requires a single argument: + +#### **`activities`** +_Description_: +A list of activity names (strings). Each name in this list will generate a separate model entry in the YAML file. + +### **Output** +For each activity the macro will generate a model entry in yaml format containing columns, data types, the keys of the feature_json object as well as basic tests for uniqueness and the existence of null values. +

+ # **Streams** Each Activity Schema should have exactly 1 stream model. The model should be the name of the stream that is registered in the `streams` variable in `dbt_project.yml`. diff --git a/macros/activity_schema/activity/generate_activity_yml.sql b/macros/activity_schema/activity/generate_activity_yml.sql new file mode 100644 index 0000000..e4e5f40 --- /dev/null +++ b/macros/activity_schema/activity/generate_activity_yml.sql @@ -0,0 +1,134 @@
+{#
+    Return the config of the activity model named `model`.
+
+    Args:
+        model: model name, as it would be passed to ref().
+    Returns:
+        The `config` of the first graph node whose resource_type is "model"
+        and whose name equals `model`, or an empty dict if no node matches.
+#}
+{% macro get_activity_config(model) -%}
+    {# Resolve the ref so an unknown model name surfaces dbt's own error. #}
+    {% do ref(model) %}
+    {# Match on the model name, not the relation identifier: the identifier
+       is the alias when one is configured, whereas node.name is not. #}
+    {% for node in graph.nodes.values()
+        | selectattr("resource_type", "equalto", "model")
+        | selectattr("name", "equalto", model) %}
+        {% do return(node.config) %}
+    {% endfor %}
+    {% do return({}) %}
+{%- endmacro %}
+
+{#
+    Build the list of documented columns for one activity model.
+
+    Args:
+        activity: name of the activity model.
+    Returns:
+        A list of dicts with `name`, `description`, `data_type` and, for some
+        columns, `tests`. Columns that are not configured are left out and
+        names are replaced by their configured aliases.
+#}
+{% macro get_column_descriptions(activity) %}
+    {% set stream = get_activity_config(activity).stream %}
+    {% set schema_columns = dbt_aql.schema_columns() %}
+    {% set customer_column = dbt_aql.customer_column(stream) %}
+    {% set anonymous_customer_column = dbt_aql.anonymous_customer_column(stream) %}
+
+    {% set columns = [
+        {'name': 'activity_id', 'description': 'Unique identifier for the activity.', 'data_type': type_string(), 'tests': ['unique', 'not_null']},
+        {'name': 'customer_id', 'description': 'Identifier for the entity.', 'data_type': type_string()},
+        {'name': 'anonymous_customer_id', 'description': 'Anonymous identifier for the entity.', 'data_type': type_string(), 'tests': ['not_null']},
+        {'name': 'activity', 'description': 'Type of activity performed.', 'data_type': type_string(), 'tests': ['not_null']},
+        {'name': 'ts', 'description': 'Timestamp of when the activity occurred.', 'data_type': type_timestamp(), 'tests': ['not_null']},
+        {'name': 'revenue_impact', 'description': 'Revenue impact of the activity, if applicable.', 'data_type': type_int()},
+        {'name': 'link', 'description': 'Link associated with the activity, if applicable.', 'data_type': type_string()},
+        {'name': 'feature_json', 'description': 'JSON containing additional feature data related to the activity. Contains the following items:', 'data_type': dbt_aql.type_json()},
+        {'name': 'activity_occurrence', 'description': 'Number of times the activity occurred.', 'data_type': type_int()},
+        {'name': 'activity_repeated_at', 'description': 'Timestamp of when the activity was repeated, if applicable.', 'data_type': type_timestamp()}
+    ] %}
+
+    {# Remove unused columns. #}
+    {%- if anonymous_customer_column is none -%}
+        {%- set columns = columns | rejectattr("name", "equalto", "anonymous_customer_id") | list -%}
+    {%- endif -%}
+
+    {%- if schema_columns.link is not defined -%}
+        {%- set columns = columns | rejectattr("name", "equalto", "link") | list -%}
+    {%- endif -%}
+
+    {%- if schema_columns.revenue_impact is not defined -%}
+        {%- set columns = columns | rejectattr("name", "equalto", "revenue_impact") | list -%}
+    {%- endif -%}
+
+    {# Rename columns: project-level aliases first, then the per-stream
+       customer aliases. NOTE(review): assumes customer_column() and
+       anonymous_customer_column() return the column name used in the
+       activity model for this stream - confirm against dbt_aql. #}
+    {% for column in columns %}
+        {% set canonical_name = column.name %}
+        {% if canonical_name in schema_columns %}
+            {% do column.update({'name': schema_columns[canonical_name]}) %}
+        {% endif %}
+        {% if canonical_name == 'customer_id' and customer_column %}
+            {% do column.update({'name': customer_column}) %}
+        {% elif canonical_name == 'anonymous_customer_id' and anonymous_customer_column %}
+            {% do column.update({'name': anonymous_customer_column}) %}
+        {% endif %}
+    {% endfor %}
+
+    {% do return(columns) %}
+{% endmacro %}
+
+{#
+    Print (and return) schema YAML with docs and basic tests for activity models.
+
+    Args:
+        activities: list of activity model names.
+    Returns:
+        The generated YAML as one string; only printed and returned when `execute` is true.
+#}
+{% macro generate_activity_yml(activities) %}
+    {# The JSON column may be aliased, so compare against its aliased name. #}
+    {% set schema_columns = dbt_aql.schema_columns() %}
+    {% set feature_json_column = schema_columns['feature_json'] if 'feature_json' in schema_columns else 'feature_json' %}
+
+    {% set yaml_output = ['version: 2', 'models:'] %}
+
+    {% for activity in activities %}
+        {% set columns = get_column_descriptions(activity) %}
+        {# Missing data_types config means there are no feature keys to list. #}
+        {% set data_types = get_activity_config(activity).data_types or {} %}
+
+        {% do yaml_output.append('  - name: ' ~ activity) %}
+        {% do yaml_output.append('    columns:') %}
+
+        {% for column in columns %}
+            {% do yaml_output.append('      - name: ' ~ column['name']) %}
+            {% if column['name'] == feature_json_column %}
+                {# Folded scalar; keys are indented deeper so each stays on its own line. #}
+                {% do yaml_output.append('        description: > ') %}
+                {% do yaml_output.append('          ' ~ column['description']) %}
+                {% for key, data_type in data_types.items() %}
+                    {% do yaml_output.append('            - ' ~ key ~ ': ' ~ data_type) %}
+                {% endfor %}
+            {% else %}
+                {% do yaml_output.append('        description: "' ~ column['description'] ~ '"') %}
+            {% endif %}
+            {% do yaml_output.append('        data_type: ' ~ column['data_type']) %}
+            {% if column['tests'] %}
+                {% do yaml_output.append('        tests:') %}
+                {% for test in column['tests'] %}
+                    {% do yaml_output.append('          - ' ~ test) %}
+                {% endfor %}
+            {% endif %}
+        {% endfor %}
+    {% endfor %}
+
+    {% if execute %}
+        {% set joined_yaml = yaml_output | join('\n') %}
+        {{ print(joined_yaml) }}
+        {% do return(joined_yaml) %}
+    {% endif %}
+{% endmacro %}