forked from steckhelena/dash-lab
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnormalize_datasets.py
96 lines (75 loc) · 3.05 KB
/
normalize_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pathlib
from typing import List, TypedDict
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
class Limits(TypedDict):
    """Bandwidth limits that apply for one interval of a normalized trace."""

    # Upload rate for the interval (kbps, per the field name).
    upload_kbps: float
    # Download rate for the interval (kbps, per the field name).
    download_kbps: float
    # Number of seconds these rates stay in effect before the next entry.
    change_interval_seconds: int
class NormalizedDataset(TypedDict):
    """One normalized bandwidth trace, as produced from a single CSV file."""

    # Identifier built from the file path (path components joined with "-").
    name: str
    # Ordered rate limits, one entry per distinct timestamp of the trace.
    data: List[Limits]
    # Sum of every entry's ``change_interval_seconds``.
    total_duration: int
    # Last component of the source path (the CSV file name).
    dataset: str
# Silence pandas' chained-assignment (SettingWithCopy) warnings for the whole
# process; the normalization code assigns into frames derived from the CSV data.
pd.set_option("mode.chained_assignment", None)
# Columns read from every trace CSV; any other column is ignored.
_SOURCE_COLUMNS = ["Timestamp", "DL_bitrate", "UL_bitrate", "State"]
_BITRATE_COLUMNS = ["DL_bitrate", "UL_bitrate"]
_TIMESTAMP_FORMAT = "%Y.%m.%d_%H.%M.%S"
_CSV_SUFFIX = ".csv"
# Smallest rate that is emitted (0.001 kbps == 1 bps); htb does not work
# properly with lower values.
_MIN_BITRATE = 0.001


def _normalized_name(parts) -> str:
    """Join the path components after the first with "-" and drop ".csv"."""
    # Fall back to the whole path when nothing follows the first component,
    # so a bare file name does not collapse to an empty string.
    name = "-".join(parts[1:] or parts)
    # Remove the suffix only: str.strip(".csv") would strip any of the
    # characters ".", "c", "s", "v" from BOTH ends of the name.
    if name.endswith(_CSV_SUFFIX):
        name = name[: -len(_CSV_SUFFIX)]
    return name


def _normalize_dataset(filename) -> NormalizedDataset:
    """Load one trace CSV and convert it into a ``NormalizedDataset``."""
    csv_data: DataFrame = pd.read_csv(filename)  # type: ignore

    # Keep only the interesting columns. ``astype`` returns a NEW frame, so
    # its result must be kept; it also detaches ``data`` from ``csv_data``.
    data: DataFrame = csv_data[_SOURCE_COLUMNS].astype(
        {column: "float" for column in _BITRATE_COLUMNS}
    )

    # Idle rows and (near-)zero samples carry no usable rate: blank them out
    # and linearly interpolate between the surrounding valid samples.
    idle = data["State"] == "I"
    for column in _BITRATE_COLUMNS:
        rates = data[column]
        data[column] = rates.mask(idle | (rates < _MIN_BITRATE)).interpolate()
    # Leading gaps cannot be interpolated and are dropped.
    data = data.dropna()

    # Collapse repeated timestamps by averaging. Selecting the bitrate columns
    # explicitly drops the textual State column without relying on pandas
    # silently discarding non-numeric columns (pandas >= 2.0 raises instead).
    data = data.groupby("Timestamp")[_BITRATE_COLUMNS].mean().reset_index()

    # Replace each timestamp by the number of seconds until the next one;
    # the last row has no successor and is given one second.
    timestamps = pd.to_datetime(data["Timestamp"], format=_TIMESTAMP_FORMAT)
    intervals = (timestamps.shift(-1) - timestamps).fillna(pd.Timedelta(seconds=1))
    # total_seconds() keeps the days part, which ``.dt.seconds`` would drop.
    data["Timestamp"] = intervals.dt.total_seconds().astype(int)

    # Never emit a rate below the minimum supported by htb.
    for column in _BITRATE_COLUMNS:
        data[column] = data[column].clip(lower=_MIN_BITRATE)

    # Plain int (not a numpy integer) so the value matches the TypedDict.
    total_duration = int(data["Timestamp"].sum())

    # Rename data columns to the standardized ``Limits`` keys.
    data = data.rename(
        columns={
            "Timestamp": "change_interval_seconds",
            "DL_bitrate": "download_kbps",
            "UL_bitrate": "upload_kbps",
        }
    )

    parts = pathlib.Path(filename).parts
    return {
        "name": _normalized_name(parts),
        "data": data.to_dict("records"),
        "total_duration": total_duration,
        "dataset": parts[-1],
    }


def get_normalized_datasets(datasets) -> List[NormalizedDataset]:
    """Normalize bandwidth-trace CSV files into lists of rate limits.

    Every file must contain the columns ``Timestamp`` (formatted as
    ``%Y.%m.%d_%H.%M.%S``), ``DL_bitrate``, ``UL_bitrate`` and ``State``.
    Rows whose ``State`` is ``"I"`` and samples below 0.001 are treated as
    missing and linearly interpolated; rows sharing a timestamp are averaged.

    Args:
        datasets: Iterable of CSV file paths.

    Returns:
        One entry per input file, in input order. ``name`` is the path
        without its first component, joined with "-" and without the
        ".csv" suffix; ``dataset`` is the file name; ``data`` holds one
        record per distinct timestamp with the keys
        ``change_interval_seconds``, ``download_kbps`` and ``upload_kbps``;
        ``total_duration`` is the sum of all intervals in seconds.
    """
    return [_normalize_dataset(filename) for filename in datasets]