-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexample_simple.py
69 lines (47 loc) · 1.85 KB
/
example_simple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import sys
import time
import pandas as pd
import psutil
from syndiffix import Synthesizer
# Utility function for loading a CSV file.
def load_csv(path: str) -> pd.DataFrame:
    """Load a CSV file and convert ISO 8601 text columns to datetimes.

    Only empty fields are read as missing values (``keep_default_na=False``
    with ``na_values=[""]``), so literal strings such as ``"NA"`` or
    ``"null"`` are kept as ordinary text.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded frame. Every text column whose values all parse as
        ISO 8601 is replaced by a datetime column; other columns are
        left exactly as read.
    """
    df = pd.read_csv(path, keep_default_na=False, na_values=[""], low_memory=False)

    # Text columns are `object` dtype on older pandas and a dedicated string
    # dtype on newer releases; accept both so datetime inference keeps working.
    text_columns = [
        col
        for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]

    # Try to infer datetime columns.
    for col in text_columns:
        try:
            df[col] = pd.to_datetime(df[col], format="ISO8601")
        except ValueError:
            # Best effort: the column is not (entirely) ISO 8601, keep it as
            # text. pandas' ParserError is a ValueError subclass, so this
            # single clause covers it as well.
            continue
    return df
# Simple usage example of the SynDiffix library.
# This script assumes each row belongs to a different protected entity.
# All columns in the input file are processed.

if len(sys.argv) != 3:
    print(f"Usage: py {sys.argv[0]} <input.csv> <output.csv>")
    # Use sys.exit rather than the interactive `exit()` helper (which the
    # `site` module may not provide), and report the usage error with a
    # non-zero status so calling shells can detect it.
    sys.exit(1)

input_file = sys.argv[1]
output_file = sys.argv[2]

print(f"Loading data from `{input_file}`...")

input_data = load_csv(input_file)

# Show what was loaded so the user can check the inferred column types.
print(f"Loaded {len(input_data)} rows. Columns:")
for i, (column, dtype) in enumerate(zip(input_data.columns, input_data.dtypes)):
    print(f"{i}: {column} ({dtype})")

# Baselines for the runtime / memory report below; taken after loading, so
# the figures cover fitting and sampling only.
start_time = time.time()
process = psutil.Process(os.getpid())
start_memory_usage = process.memory_info().rss

print("\nFitting the synthesizer over the data...")

synthesizer = Synthesizer(input_data)

print("Column clusters:")
print("Initial=", synthesizer.clusters.initial_cluster)
for cluster in synthesizer.clusters.derived_clusters:
    print("Derived=", cluster)

print("\nSampling rows from the synthesizer...")

output_data = synthesizer.sample()

# Elapsed wall-clock time in whole seconds, and growth of the process's
# resident set size since the baseline, in units of 1024**2 bytes.
run_time = round(time.time() - start_time)
memory_usage = (process.memory_info().rss - start_memory_usage) // (1024**2)
print(f"Runtime: {run_time} seconds. Memory usage: {memory_usage} MB.")

print(f"\nWriting sampled rows to `{output_file}`...")

output_data.to_csv(output_file, index=False)

print("Done!")