-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcost.py
169 lines (149 loc) · 5.61 KB
/
cost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import math
import os
import re
import codecs
import matplotlib.pyplot as plt
# Price (USD) of the Google Speech API per 15-second increment of audio;
# multiplied by the number of increments in speech_recognition_price().
COST_GOOGLE_SPEECH_API_15_SECONDS = 0.006
# Monthly speech limit in seconds (60,000,000 s = 1,000,000 minutes).
# NOTE(review): not referenced by any code visible in this file.
GOOGLE_MONTHLY_LIMIT_SECONDS = 60000000
# Price (USD) of the Google Translate API per character,
# i.e. 20 USD per one million characters.
COST_GOOGLE_TRANSLATE_API_PER_CHAR = 20.0 / 1000000.0
# Path of the English subtitle directory.
# NOTE(review): not referenced by any code visible in this file.
videos_english = "subtitles/english"
def get_length(hours, minutes):
    """Convert a duration given as hours and minutes into seconds.

    Args:
        hours: Number of hours.
        minutes: Number of additional minutes.

    Returns:
        The total duration in seconds.
    """
    total_minutes = hours * 60 + minutes
    return total_minutes * 60
def speech_recognition_price(duration_seconds):
    """Return the speech-recognition price for audio of the given length.

    The duration is rounded up to a whole number of 15-second increments and
    each increment is charged at COST_GOOGLE_SPEECH_API_15_SECONDS.

    Args:
        duration_seconds: Length of the audio in seconds.

    Returns:
        The price for transcribing the audio.
    """
    # up to 1,000,000 minutes, i.e. 16,666 hours, 694 days of speech per month
    # (this limit is not checked here)
    billed_increments = math.ceil(duration_seconds / 15.0)  # round up
    return billed_increments * COST_GOOGLE_SPEECH_API_15_SECONDS
def translation_price(num_characters):
    """Return the translation price for the given number of characters.

    Args:
        num_characters: Number of characters to translate.

    Returns:
        ``num_characters`` charged at COST_GOOGLE_TRANSLATE_API_PER_CHAR.
    """
    # limit of a billion characters per month (not checked here)
    price_per_char = COST_GOOGLE_TRANSLATE_API_PER_CHAR
    return price_per_char * num_characters
def calculate_price_video(video_seconds, subtitle_number_chars):
    """Return the combined price of transcribing and translating one video.

    Args:
        video_seconds: Length of the video in seconds.
        subtitle_number_chars: Number of subtitle characters to translate.

    Returns:
        The speech-recognition price plus the translation price.
    """
    transcription_cost = speech_recognition_price(video_seconds)
    translation_cost = translation_price(subtitle_number_chars)
    return transcription_cost + translation_cost
def calculate_num_chars(path):
    """Count the subtitle text characters of an .srt file and read its duration.

    The file is split into sections separated by blank lines. Only sections
    with at least three lines are kept; their first two lines (index and
    time range) are skipped and the lengths of the remaining text lines are
    summed. The end timestamp of the last kept section is used as the
    duration of the movie.

    Args:
        path: Path of the subtitle file. It is decoded as UTF-8 with
            undecodable bytes replaced.

    Returns:
        A ``(num_chars, (hours, minutes))`` tuple. Only the hour and minute
        fields of the final end timestamp are returned.

    Raises:
        ValueError: If the file contains no usable subtitle section, or the
            time line of the last section is not of the form
            ``start --> end`` with integer hour and minute fields.
    """
    with codecs.open(path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()
    # for srt files coming from Windows
    content = content.replace("\r", "").replace("\t", "")

    sections = []
    current = []
    for line in content.split("\n"):
        if line == "":
            sections.append(current)
            current = []
        else:
            current.append(line)
    # Flush the trailing section: a file that does not end with a blank line
    # would otherwise lose its last subtitle and report a wrong duration.
    sections.append(current)

    # A usable section has an index line, a time line and at least one text line.
    sections = [section for section in sections if len(section) > 2]
    if not sections:
        raise ValueError("no subtitle sections found in {}".format(path))

    # grab last subtitle time and use it as duration of movie
    last_section_time = sections[-1][1]
    m = re.match(r"(.*) --> (?P<end_time>.*)", last_section_time)
    if m is None:
        raise ValueError(
            "malformed time line {!r} in {}".format(last_section_time, path)
        )
    end_fields = m.group("end_time").split(":")
    if len(end_fields) < 2:
        raise ValueError(
            "malformed end time {!r} in {}".format(m.group("end_time"), path)
        )
    time = (int(end_fields[0]), int(end_fields[1]))

    # Skip the index and time lines of every section; count only the text.
    value = sum(len(sentence) for section in sections for sentence in section[2:])
    print("{} characters".format(value))
    return value, time
def to_hour_from_seconds(seconds):
    """Convert a duration in seconds into (fractional) hours.

    Args:
        seconds: Duration in seconds.

    Returns:
        The duration in hours as a float.
    """
    seconds_per_hour = 3600.0
    return seconds / seconds_per_hour
def run_analysis(subtitle_dir):
    """Price every subtitle file found below the per-language directories.

    Walks ``subtitle_dir``; every directory below it is treated as belonging
    to the language named by its first path component relative to
    ``subtitle_dir``. Each file is parsed with calculate_num_chars() and
    priced with calculate_price_video(). Progress is printed along the way.

    Args:
        subtitle_dir: Directory containing one sub-directory per language,
            each holding .srt files.

    Returns:
        A dict mapping language name to a list of
        ``(seconds, price, price_per_hour, file_name, num_chars)`` tuples
        sorted by file name.
    """
    results_per_language = {}
    # expects a directory with directories (languages), each with .srt files
    for root, _dirs, files in os.walk(subtitle_dir):
        if root == subtitle_dir:
            continue
        print(root)
        # Take the language from the path relative to subtitle_dir so that
        # multi-component roots (e.g. "./subtitles") and non-"/" path
        # separators still yield the language directory name.
        language = os.path.relpath(root, subtitle_dir).split(os.sep)[0]
        print("**********************")
        print(language)
        results = []
        for filename in files:
            print("")
            path = os.path.join(root, filename)
            chars, time = calculate_num_chars(path)
            seconds = get_length(time[0], time[1])
            if seconds == 0:
                # The duration is built from hours and minutes only, so it
                # can be zero; a per-hour price cannot be computed then.
                print("{}: duration is zero, skipping".format(filename))
                continue
            price = calculate_price_video(seconds, chars)
            r = (seconds, price, price / to_hour_from_seconds(seconds), filename, chars)
            results.append(r)
            print("{}, time: {} with price: USD {} in {}".format(filename.split(".")[0], seconds, price, language))
            print("Price per user per hour: {}".format(r[2]))
        print("***********************")
        # save results
        results_per_language[language] = sorted(results, key=lambda x: x[3])
    return results_per_language
def get_avg(res):
    """Return the mean of the third field (index 2) of the given records.

    Args:
        res: Sequence of result tuples; index 2 holds the value averaged
            (the price per hour in the tuples built by run_analysis).

    Returns:
        The sum of the index-2 values divided by the number of records.
    """
    per_hour_prices = [record[2] for record in res]
    total = 0
    for price in per_hour_prices:
        total += price
    return total / len(res)
def get_avg_price_overall(rs):
    """Return the mean of the index-2 values over all records of all languages.

    Args:
        rs: Dict mapping a language to its list of result tuples; index 2
            of each tuple holds the value averaged.

    Returns:
        The sum of all index-2 values divided by the total record count.
    """
    all_prices = [record[2] for key in rs for record in rs[key]]
    total = 0
    for price in all_prices:
        total += price
    return total / len(all_prices)
def avg_price_per_language(results):
    """Return the average computed by get_avg() for every language.

    Args:
        results: Dict mapping a language to its list of result tuples.

    Returns:
        A dict mapping each language to ``get_avg`` of its records.
    """
    averages = {}
    for lang in results:
        averages[lang] = get_avg(results[lang])
    return averages
def get_avg_number_chars_per_hour_total(rspl):
    """Compute the number of characters per hour for every file of every language.

    Args:
        rspl: Dict mapping a language to its list of result tuples, where
            index 0 is the duration in seconds, index 3 the file name and
            index 4 the character count.

    Returns:
        A dict mapping each language to a list of
        ``(file_name, chars_per_hour)`` tuples, in the input order.
    """
    per_language = {}
    for lang, records in rspl.items():
        per_language[lang] = [
            (record[3], record[4] / to_hour_from_seconds(record[0]))
            for record in records
        ]
    return per_language
def plot_price_per_hour(rs):
    """Plot the price per user per hour of every subtitle file, one line per language.

    Also prints the overall average price and draws it as a dashed
    horizontal line.

    Args:
        rs: Dict mapping a language to its list of result tuples; index 2
            of each tuple is the value plotted.
    """
    avg_price_overall = get_avg_price_overall(rs)
    print(avg_price_overall)
    # plot of price per hour for each subtitle file per language
    for language, records in rs.items():
        prices = [record[2] for record in records]
        # Files are labelled by position ("SRT 1", "SRT 2", ...), not by name.
        srts = ["SRT " + str(i) for i in range(1, len(prices) + 1)]
        plt.plot(srts, prices, label=language, marker="o")
    plt.axhline(y=avg_price_overall, linestyle='dashed', color="black", label="Avg price")
    plt.legend(loc="upper left")
    plt.ylabel("USD per user per hour")
    plt.xlabel("Subtitle files")
    plt.title("Average price per user per hour in different languages")
    plt.show()
def plot_characters_per_hour(cs_per_hour):
    """Plot the characters per hour of every subtitle file, one line per language.

    Also prints the average over all files and draws it as a dashed
    horizontal line.

    Args:
        cs_per_hour: Dict mapping a language to a list of
            ``(file_name, chars_per_hour)`` tuples.
    """
    values = [[pair[1] for pair in pairs] for pairs in cs_per_hour.values()]
    # Average over the actual number of data points rather than a fixed
    # count, so the line is correct for any number of subtitle files.
    num_files = sum(len(v) for v in values)
    total = sum(sum(v) for v in values)
    avg = total / num_files if num_files else 0.0
    print(avg)
    for language, v in zip(cs_per_hour, values):
        # Files are labelled by position ("SRT 1", "SRT 2", ...), not by name.
        srts = ["SRT " + str(i) for i in range(1, len(v) + 1)]
        plt.plot(srts, v, label=language, marker="o")
    plt.axhline(y=avg, linestyle='dashed', color="black", label="Avg number of characters")
    plt.legend(loc="upper left")
    plt.ylabel("# characters per hour")
    plt.xlabel("Subtitle files")
    plt.title("Number of characters per hour in subtitles in different languages")
    plt.show()
# Script body (runs whenever this module is executed or imported):
# analyse every language directory below "subtitles".
rs = run_analysis("subtitles")
# Characters per hour for each file, grouped by language.
cs_per_hour = get_avg_number_chars_per_hour_total(rs)
plot_characters_per_hour(cs_per_hour)
# The price plot is currently disabled; uncomment to show it as well.
#plot_price_per_hour(rs)