processors.py
import json
import logging
from datetime import datetime
from decimal import Decimal
from typing import Dict

import apache_beam as beam
import pandas as pd
import pyarrow as pa
from apache_beam.io.gcp.internal.clients import bigquery
from bigquery_schema_generator.generate_schema import SchemaGenerator
class PubSubProcessor(beam.DoFn):
    """Parses a JSON Pub/Sub message and emits the file path it references."""

    def process(self, element):
        message = json.loads(element)
        logging.info(message)
        yield message["path"]
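# A minimal usage sketch (hypothetical wiring; assumes each message body is
# JSON with a "path" key, e.g. {"path": "gs://bucket/file.ndjson"}):
#
#   paths = (
#       pipeline
#       | beam.io.ReadFromPubSub(subscription=subscription)
#       | beam.Map(lambda payload: payload.decode("utf-8"))
#       | beam.ParDo(PubSubProcessor())
#   )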
class NdJsonProcessor(beam.DoFn):
    """Splits a newline-delimited JSON batch into individual records and
    stamps each one with an ingestion timestamp."""

    def process(self, batch):
        for item in batch.split("\n"):
            if not item.strip():
                continue  # skip blank lines, e.g. a trailing newline
            record = json.loads(item)
            record["dataflow_ingested_at"] = datetime.utcnow()
            logging.info(record)
            yield record
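# Example: the batch '{"id": 1}\n{"id": 2}\n' yields two records,
# {"id": 1, "dataflow_ingested_at": <utc datetime>} and
# {"id": 2, "dataflow_ingested_at": <utc datetime>}.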
class CsvProcessorFn(beam.DoFn):
    """Converts a CSV row (a named tuple) into a dict and stamps it with an
    ingestion timestamp."""

    def process(self, element):
        record = element._asdict()
        record["dataflow_ingested_at"] = datetime.utcnow()
        yield record
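# Intended for elements that expose _asdict(), such as the named-tuple rows
# produced by beam.io.ReadFromCsv. A sketch (hypothetical input path):
#
#   records = (
#       pipeline
#       | beam.io.ReadFromCsv("gs://bucket/input/*.csv")
#       | beam.ParDo(CsvProcessorFn())
#   )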
class ParquetFn(beam.DoFn):
    """Writes a batch of records to a single Parquet file under a Hive-style
    year/month/day/hour partition layout and emits the file path."""

    def __init__(self, input_path: str, output_path: str = None, partition_timestamp=None):
        self.input_path = input_path
        self.partition_timestamp = partition_timestamp
        self.output_path = output_path

    def process(self, batch):
        df = pd.DataFrame(data=batch)
        if self.output_path:
            prefix = self.output_path
        else:
            prefix = self.input_path.split("*.csv")[0]
        partition_timestamp_max = df["dataflow_ingested_at"].max()
        year = f"year={partition_timestamp_max.year}"
        month = f"month={partition_timestamp_max.month}"
        day = f"day={partition_timestamp_max.day}"
        hour = f"hour={partition_timestamp_max.hour}"
        file_path = (
            f"{prefix}test/{year}/{month}/{day}/{hour}/"
            f"test{df['dataflow_ingested_at'].min()}-{partition_timestamp_max}.parquet"
        )
        logging.info(file_path)
        df.to_parquet(path=file_path)
        # process() must yield rather than return: a returned string would be
        # iterated character by character by the runner.
        yield file_path
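# Example output path (hypothetical prefix and timestamps):
#   gs://bucket/out/test/year=2024/month=5/day=14/hour=9/test<min>-<max>.parquet
# Writing to gs:// via pandas requires gcsfs to be installed on the workers.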
class ParquetWriter(beam.PTransform):
    """Writes a PCollection of records to Parquet files.

    WriteToParquet requires a pyarrow schema; ``data_type_mapping`` maps
    BigQuery column types to pyarrow types so callers can build one.
    """

    def __init__(self, file_path_prefix, schema=None):
        self.data_type_mapping = {
            'STRING': pa.string(),
            'BYTES': pa.string(),
            'INTEGER': pa.int64(),
            'NUMERIC': pa.decimal128(18, 2),
            'FLOAT': pa.float64(),
            'BOOLEAN': pa.bool_(),
            'TIMESTAMP': pa.timestamp(unit='s'),
            'DATE': pa.date64(),
            'DATETIME': pa.timestamp(unit='s'),
            # ARRAY and STRUCT need concrete element/field types, e.g.
            # pa.list_(pa.string()) or pa.struct([('name', pa.string())]);
            # pa.list_() and pa.struct() with no arguments raise a TypeError.
        }
        self.file_path_prefix = file_path_prefix
        self.schema = schema

    def expand(self, pcoll):
        return pcoll | "Write Parquet" >> beam.io.WriteToParquet(
            file_path_prefix=self.file_path_prefix, schema=self.schema
        )
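# A sketch of building the schema WriteToParquet needs from
# data_type_mapping (field names and types are hypothetical):
#
#   writer = ParquetWriter("gs://bucket/out/part")
#   schema = pa.schema([
#       ("id", writer.data_type_mapping["INTEGER"]),
#       ("name", writer.data_type_mapping["STRING"]),
#       ("dataflow_ingested_at", writer.data_type_mapping["TIMESTAMP"]),
#   ])
#   pcoll | ParquetWriter("gs://bucket/out/part", schema=schema)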
class BigQueryWriter(beam.PTransform):
    """Writes a PCollection to a day-partitioned BigQuery table."""

    def __init__(self, table_spec, method, side_input=None, partition_field=None, schema_side_input=None):
        self.table_spec = table_spec
        self.partition_field = partition_field if partition_field else "dataflow_ingested_at"
        self.schema_side_input = schema_side_input
        self.side_input = side_input
        self.additional_bq_parameters = {
            "timePartitioning": {"type": "DAY", "field": self.partition_field}
        }
        match method:
            case "batch_load":
                self.method = beam.io.WriteToBigQuery.Method.FILE_LOADS
            case "storage_api":
                self.method = beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API
            case _:
                raise ValueError(f"Unsupported write method: {method!r}")

    def expand(self, pcoll):
        logging.info(self.side_input)
        # return pcoll | "Write to BQ" >> beam.io.WriteToBigQuery(
        #     table=self.table_spec,
        #     schema="SCHEMA_AUTODETECT",
        #     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        #     create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        #     method=self.method,
        #     temp_file_format="NEWLINE_DELIMITED_JSON",
        #     additional_bq_parameters=self.additional_bq_parameters,
        # )
        return pcoll
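# A minimal usage sketch (table spec is hypothetical; note that expand()
# currently passes the collection through without writing):
#
#   pcoll | BigQueryWriter(
#       table_spec="project:dataset.table",
#       method="batch_load",
#   )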
class AddSchemaFn(beam.DoFn):
    """Deduces a BigQuery schema for each element with
    bigquery-schema-generator and passes the element through unchanged."""

    def start_bundle(self):
        self.schema_generator = SchemaGenerator()
        self.table_schema = bigquery.TableSchema()

    @staticmethod
    def default_json_serializer(obj):
        if isinstance(obj, datetime):
            return obj.strftime("%Y-%m-%dT%H:%M:%SZ")
        elif isinstance(obj, Decimal):
            return float(obj)
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    def process(self, element: Dict[str, str]):
        schema_map, error = self.schema_generator.deduce_schema(
            [json.dumps(element, default=self.default_json_serializer)]
        )
        schema = self.schema_generator.flatten_schema(schema_map)
        # Alternative: build a bigquery.TableSchema instead of a string:
        # for field in schema:
        #     schema_field = bigquery.TableFieldSchema()
        #     schema_field.mode = field['mode']
        #     schema_field.name = field['name']
        #     schema_field.type = field['type']
        #     self.table_schema.fields.append(schema_field)
        schema_fields = [f"{field['name']}:{field['type']}" for field in schema]
        # e.g. "id:INTEGER,name:STRING"; computed but not yet attached to the output
        schema_str = ",".join(schema_fields)
        yield element
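# End-to-end sketch tying the processors together (paths, table, and
# pipeline options are hypothetical):
#
#   with beam.Pipeline(options=pipeline_options) as p:
#       (
#           p
#           | beam.io.ReadFromText("gs://bucket/input/*.ndjson")
#           | beam.ParDo(NdJsonProcessor())
#           | beam.ParDo(AddSchemaFn())
#           | BigQueryWriter("project:dataset.table", method="batch_load")
#       )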