-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze.rb
executable file
·164 lines (141 loc) · 5.34 KB
/
analyze.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env ruby
## This analyzes a form using filter and distribution approach
# Basic usage:
# {"input": {"col1":["value1", "value2"], "col2":["valueA"]}, "multiplier":"colCount", "output":["col3", "col4"]}
# which means
# for rows with col1 having value1 AND value2, AND col2 having valueA, get the distribution of all values in col3, col4
# output will something like {"col3":[{"value":"valueX", "count":1, "pct":10}, {"value":"valueY", "count":9, "pct":90}], "col4":[...]}
# Assumptions on table
# 1. it's tsv
# 2. first row is for column names (e.g. "col1", "col2")
# 3. each value cell can be a single value string, or comma separated multiple values
require 'yaml'
class TableAnalyzer
def self.analyze(table, command)
# table is a tsv string, command is the dict with input and output
# init output arrays
result = Hash.new
command["output"].each {|col| result[col] = Array.new}
# puts result
data_row_count = table.split("\n").count - 1
rows_used = 0
command["input"]={} unless command["input"]
input_keys_to_watch = command["input"].keys
col_num_to_filter_map = Hash.new
col_num_to_result_map = Hash.new
col_val_result_dict = Hash.new # like {"3(col index)":{"valueX": the_array_in_results}}
filter_cols = []
result_cols = []
multiplier_col = nil
table.strip! # handle the case when there's an empty line at the beginning
table.split("\n").each_with_index do |line, idx|
ok_to_use_row = true
if idx==0
line.split("\t").each_with_index do |col, idx2|
col.strip!
if input_keys_to_watch.count(col)>0
# puts "input_keys_to_watch=#{input_keys_to_watch} which has #{col} for col #{idx2}"
col_num_to_filter_map[idx2] = command["input"][col] # array
filter_cols << idx2
end
if command["output"].count(col)>0
col_num_to_result_map[idx2] = result[col] # array
col_val_result_dict[idx2] = {}
result_cols << idx2
end
if command["multiplier"] and command["multiplier"].count(col) > 0
multiplier_col = idx2
end
end
# puts col_num_to_filter_map
# puts "result cols: #{result_cols}, map: #{col_num_to_result_map}"
# puts "filter cols: #{filter_cols}"
else
columns = line.split("\t")
filter_cols.each do |idx2|
values_str = columns[idx2]
unless values_str
ok_to_use_row = false
break
end
values = columns[idx2].split(",").map{|v| v.strip}
filters = col_num_to_filter_map[idx2]
# puts "values - filters = #{values} - #{filters}"
if (values - filters).count != (values.count - filters.count)
# filters has something values doesn't have
# puts "no match at row #{idx} for col #{idx2}"
ok_to_use_row = false
break # quit col loop
end
end
unless ok_to_use_row # quit row loop
# puts "row #{idx} doesn't qualify"
next
end
rows_used += multiplier_col ? columns[multiplier_col].to_f : 1.0
result_cols.each do |idx2|
# puts "columns: #{columns}, idx:#{idx2}"
values_str = columns[idx2]
break unless values_str
break if values_str.length < 1
values = columns[idx2].split(",").map{|v| v.strip}
result_array = col_num_to_result_map[idx2]
found = false
values.each do |value|
if col_val_result_dict[idx2][value]
col_val_result_dict[idx2][value]["count"] += multiplier_col ? columns[multiplier_col].to_f : 1.0
else
col_val_result_dict[idx2][value] = {"value"=>value, "count"=> multiplier_col ? columns[multiplier_col].to_f : 1.0}
result_array << col_val_result_dict[idx2][value]
end
end # of value loop
end # of col loop
end # of row
# puts "result is #{result} after row #{idx}"
end # of all rows
# add percentage
command["output"].each do |col|
result[col].each do |item|
# puts "count: #{item["count"]}, rows:#{rows_used}, result:#{item["count"].to_f/rows_used.to_f}"
item["pct"] = (item["count"].to_f * 100.0 / rows_used.to_f).round(2)
end
# sort output
result[col].sort!{|a, b| b["count"] <=> a["count"]}
end
# puts "result is #{result}"
return result
end
end
if __FILE__ == $0
if ARGV.count < 3
puts "Usage:\nanalyze filename [-i column1=value1 [-i column2=value2]] -o column3,column4,column5\n\n"
exit 1
end
filename = ARGV[0]
args = ARGV[1..-1].join(' ')
inputs = args.scan(/-i ([^-]*)/).map{|a| a[0]}
outputs = args.scan(/-o ([^-]*)/).map{|a| a[0]}
# puts inputs.inspect
# puts outputs.inspect
if outputs.count<1
puts "Need at least one output column\n\n"
exit 1
end
cmd = {"output"=>[], "input"=>{}}
# handle output
outputs.each do |output|
cmd["output"] += output.split(",").map{|c| c.strip}
end
# handle input
inputs.each do |input|
array = input.split("=")
if array.count==2
key = array[0].strip
values = array[1].strip
cmd["input"][key] = values.split(",").map{|v| v.strip}
end
end
# puts cmd
result = TableAnalyzer.analyze(File.read(filename), cmd)
puts result.to_yaml
end