diff --git a/plugins/extractors/bigquery/README.md b/plugins/extractors/bigquery/README.md
index 64c88aa8..07d29d94 100644
--- a/plugins/extractors/bigquery/README.md
+++ b/plugins/extractors/bigquery/README.md
@@ -8,6 +8,7 @@ source:
   config:
     project_id: google-project-id
     table_pattern: gofood.fact_
+    max_preview_rows: 3
     exclude:
       datasets:
         - dataset_a
@@ -49,7 +50,7 @@ source:
 | `table_pattern` | `string` | `gofood.fact_` | Regex pattern to filter which bigquery table to scan (whitelist) | *optional* |
 | `max_page_size` | `int` | `100` | max page size hint used for fetching datasets/tables/rows from bigquery | *optional* |
 | `include_column_profile` | `bool` | `true` | true if you want to profile the column value such min, max, med, avg, top, and freq | *optional* |
-| `max_preview_rows` | `int` | `30` | max number of preview rows to fetch, `0` will skip preview fetching. Default to `30`. | *optional* |
+| `max_preview_rows` | `int` | `30` | max number of preview rows to fetch, `0` will skip preview fetching, `-1` will omit the `preview_rows` key from the asset data. Default to `30`. | *optional* |
 | `mix_values` | `bool` | `false` | true if you want to mix the column values with the preview rows. Default to `false`. | *optional* |
 | `collect_table_usage` | `boolean` | `false` | toggle feature to collect table usage, `true` will enable collecting table usage. Default to `false`. | *optional* |
 | `usage_period_in_day` | `int` | `7` | collecting log from `(now - usage_period_in_day)` until `now`. only matter if `collect_table_usage` is true. Default to `7`. | *optional* |
@@ -60,7 +61,8 @@ source:
 - Leaving `service_account_json` and `service_account_base64` blank will default to [Google's default authentication][google-default-auth]. It is recommended if Meteor instance runs inside the same Google Cloud environment as the BigQuery project.
-- Service account needs to have `bigquery.privateLogsViewer` role to be able to collect bigquery audit logs
+- Service account needs to have `bigquery.privateLogsViewer` role to be able to collect bigquery audit logs.
+- Setting `max_preview_rows` to `-1` will omit the `preview_rows` key from the asset data.
 
 ## Outputs
 
diff --git a/plugins/extractors/bigquery/bigquery.go b/plugins/extractors/bigquery/bigquery.go
index 15820703..fd3e269c 100644
--- a/plugins/extractors/bigquery/bigquery.go
+++ b/plugins/extractors/bigquery/bigquery.go
@@ -41,20 +41,21 @@ var summary string
 type Config struct {
     ProjectID string `mapstructure:"project_id" validate:"required"`
     // ServiceAccountBase64 takes precedence over ServiceAccountJSON field
-    ServiceAccountBase64 string   `mapstructure:"service_account_base64"`
-    ServiceAccountJSON   string   `mapstructure:"service_account_json"`
-    MaxPageSize          int      `mapstructure:"max_page_size"`
-    DatasetPageSize      int      `mapstructure:"dataset_page_size"`
-    TablePageSize        int      `mapstructure:"table_page_size"`
-    TablePattern         string   `mapstructure:"table_pattern"`
-    Exclude              Exclude  `mapstructure:"exclude"`
-    IncludeColumnProfile bool     `mapstructure:"include_column_profile"`
-    MaxPreviewRows       int      `mapstructure:"max_preview_rows" default:"30"`
-    MixValues            bool     `mapstructure:"mix_values" default:"false"`
-    IsCollectTableUsage  bool     `mapstructure:"collect_table_usage" default:"false"`
-    UsagePeriodInDay     int64    `mapstructure:"usage_period_in_day" default:"7"`
-    UsageProjectIDs      []string `mapstructure:"usage_project_ids"`
-    BuildViewLineage     bool     `mapstructure:"build_view_lineage" default:"false"`
+    ServiceAccountBase64 string   `mapstructure:"service_account_base64"`
+    ServiceAccountJSON   string   `mapstructure:"service_account_json"`
+    MaxPageSize          int      `mapstructure:"max_page_size"`
+    DatasetPageSize      int      `mapstructure:"dataset_page_size"`
+    TablePageSize        int      `mapstructure:"table_page_size"`
+    TablePattern         string   `mapstructure:"table_pattern"`
+    Exclude              Exclude  `mapstructure:"exclude"`
+    IncludeColumnProfile bool     `mapstructure:"include_column_profile"`
+    // MaxPreviewRows can also be set to -1 to omit the preview_rows key from the asset data
+    MaxPreviewRows       int      `mapstructure:"max_preview_rows" default:"30"`
+    MixValues            bool     `mapstructure:"mix_values" default:"false"`
+    IsCollectTableUsage  bool     `mapstructure:"collect_table_usage" default:"false"`
+    UsagePeriodInDay     int64    `mapstructure:"usage_period_in_day" default:"7"`
+    UsageProjectIDs      []string `mapstructure:"usage_project_ids"`
+    BuildViewLineage     bool     `mapstructure:"build_view_lineage" default:"false"`
 }
 
 type Exclude struct {
@@ -444,15 +445,21 @@ func (e *Extractor) buildAsset(ctx context.Context, t *bigquery.Table, md *bigqu
         }
     }
 
-    table, err := anypb.New(&v1beta2.Table{
-        Columns:       e.buildColumns(ctx, md.Schema, md),
-        PreviewFields: previewFields,
-        PreviewRows:   previewRows,
-        Profile:       tableProfile,
-        Attributes:    utils.TryParseMapToProto(attributesData),
-        CreateTime:    timestamppb.New(md.CreationTime),
-        UpdateTime:    timestamppb.New(md.LastModifiedTime),
-    })
+    tableData := &v1beta2.Table{
+        Columns:    e.buildColumns(ctx, md.Schema, md),
+        Profile:    tableProfile,
+        Attributes: utils.TryParseMapToProto(attributesData),
+        CreateTime: timestamppb.New(md.CreationTime),
+        UpdateTime: timestamppb.New(md.LastModifiedTime),
+    }
+
+    maxPreviewRows := e.config.MaxPreviewRows
+    if maxPreviewRows != -1 {
+        tableData.PreviewFields = previewFields
+        tableData.PreviewRows = previewRows
+    }
+
+    table, err := anypb.New(tableData)
     if err != nil {
         e.logger.Warn("error creating Any struct", "error", err)
     }
@@ -515,7 +522,7 @@ func (e *Extractor) buildColumn(ctx context.Context, field *bigquery.FieldSchema
 
 func (e *Extractor) buildPreview(ctx context.Context, t *bigquery.Table, md *bigquery.TableMetadata) (fields []string, rows *structpb.ListValue, err error) {
     maxPreviewRows := e.config.MaxPreviewRows
-    if maxPreviewRows == 0 {
+    if maxPreviewRows <= 0 {
         return nil, nil, nil
     }
 
diff --git a/plugins/sinks/compass/README.md b/plugins/sinks/compass/README.md
index 99e13b46..35dc666c 100644
--- a/plugins/sinks/compass/README.md
+++ b/plugins/sinks/compass/README.md
@@ -15,8 +15,11 @@ sinks:
     labels:
       myCustom: $properties.attributes.myCustomField
       sampleLabel: $properties.labels.sampleLabelField
+    remove_unset_fields_in_data: false
 ```
 
+### *Notes*
+- Setting `remove_unset_fields_in_data` to `true` will omit fields from the final data that are not set in the source data. Defaults to `false`.
 ## Contributing
 
 Refer to the [contribution guidelines](../../../docs/docs/contribute/guide.md#adding-a-new-sink) for information on contributing to this module.
diff --git a/plugins/sinks/compass/sink.go b/plugins/sinks/compass/sink.go
index 50d9bca3..218a1302 100644
--- a/plugins/sinks/compass/sink.go
+++ b/plugins/sinks/compass/sink.go
@@ -31,6 +31,8 @@ type Config struct {
     Host    string            `mapstructure:"host" validate:"required"`
     Headers map[string]string `mapstructure:"headers"`
     Labels  map[string]string `mapstructure:"labels"`
+    // RemoveUnsetFieldsInData, when true, omits fields that are unset in the initial data from the final sink data.
+    RemoveUnsetFieldsInData bool `mapstructure:"remove_unset_fields_in_data"`
 }
 
 var info = plugins.Info{
@@ -193,7 +195,7 @@ func (s *Sink) buildCompassData(anyData *anypb.Any) (map[string]interface{}, err
     data, err := protojson.MarshalOptions{
         UseProtoNames:   true,
-        EmitUnpopulated: true,
+        EmitUnpopulated: !s.config.RemoveUnsetFieldsInData,
     }.Marshal(anyData)
     if err != nil {
         return nil, fmt.Errorf("marshaling asset data: %w", err)
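
For reviewers unfamiliar with `protojson`, the sketch below shows the effect of the `EmitUnpopulated` flag that `remove_unset_fields_in_data` now toggles in `buildCompassData`. It is not part of the patch, and `errdetails.ErrorInfo` is only a stand-in proto3 message chosen for illustration; the sink applies the same marshal options to the asset's `anypb.Any` payload.

```go
// Minimal sketch, not part of this patch: demonstrates how protojson's
// EmitUnpopulated flag (the knob behind remove_unset_fields_in_data)
// changes the marshaled JSON. errdetails.ErrorInfo is a stand-in proto3
// message; Domain and Metadata are deliberately left unset.
package main

import (
	"fmt"

	"google.golang.org/genproto/googleapis/rpc/errdetails"
	"google.golang.org/protobuf/encoding/protojson"
)

func main() {
	msg := &errdetails.ErrorInfo{Reason: "SAMPLE"}

	for _, removeUnset := range []bool{false, true} {
		out, err := protojson.MarshalOptions{
			UseProtoNames:   true,
			EmitUnpopulated: !removeUnset, // mirrors buildCompassData in sink.go
		}.Marshal(msg)
		if err != nil {
			panic(err)
		}
		fmt.Printf("remove_unset_fields_in_data=%v -> %s\n", removeUnset, out)
	}
	// Output is roughly (protojson may vary whitespace):
	// remove_unset_fields_in_data=false -> {"reason":"SAMPLE", "domain":"", "metadata":{}}
	// remove_unset_fields_in_data=true  -> {"reason":"SAMPLE"}
}
```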