analysis.py
# %%
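# Analysis of the experiment logs: per-variant accuracy, 10-vs-50 significance tests,
# inspection of the proposed rules, per-sample difficulty, and token statistics.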
import json
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import pandas as pd
from scipy.stats import fisher_exact
# %%
def get_data(fname):
    """Load one result row per JSON line, keeping only rows that have a "sample_ix" field."""
    data = []
    with open(fname, 'r') as f:
        for line in f:
            row_data = json.loads(line)
            if "sample_ix" in row_data:
                data.append(row_data)
    return data
# %%
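# Each variant pairs a letter set ("ab" or "cdef") with a 10 or 50 setting; results are read
# from one JSONL log per variant, stored under a directory named after the model.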
model_name = "gpt-4-0613"
variants = ['ab_10', 'ab_50', 'cdef_10', 'cdef_50']
raw_data = [get_data(f"{model_name}/{variant}.log") for variant in variants]
# %%
# 1. Main plot
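# Grouped bar chart: for each variant, the fraction of samples where the model produced the
# correct label vs. the correct rule.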
def true_ratio(data, key):
    """Fraction of rows in data where the value under key is truthy."""
    return len([row for row in data if row[key]]) / len(data)
values_A = [true_ratio(data, "correct_label") for data in raw_data]
values_B = [true_ratio(data, "correct_rule") for data in raw_data]
x = np.arange(len(variants)) # the label locations
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, values_A, width, label='Get label')
rects2 = ax.bar(x + width/2, values_B, width, label='Get rule')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_xlabel('Variants')
ax.set_ylabel('Fraction correct')
ax.set_title(f'Fraction of correct responses by variant and task - {model_name}')
ax.set_xticks(x)
ax.set_xticklabels(variants)
ax.legend()
ax.bar_label(rects1, padding=3)
ax.bar_label(rects2, padding=3)
fig.tight_layout()
plt.show()
# %%
# 2. Is the difference between 10 and 50 statistically significant?
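# Two-proportion z-tests on the success rates read off the plot above, assuming 100 samples per variant.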
sample_size = 100
print("ab", sm.stats.proportions_ztest([0.68 * sample_size, 0.88 * sample_size], [sample_size, sample_size]))
print("cdef", sm.stats.proportions_ztest([0.87 * sample_size, 0.97 * sample_size], [sample_size, sample_size]))
print("ab - rule", sm.stats.proportions_ztest([0.16 * sample_size, 0.02 * sample_size], [sample_size, sample_size]))
print("cdef - rule", sm.stats.proportions_ztest([0.16 * sample_size, 0 * sample_size], [sample_size, sample_size]))
# %%
# 3. Analyze rules
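# For each variant: count the distinct proposed rules (prefixed with whether the rule was judged
# correct), count how many of the proposed rules mention 'a' or 'b', then print each rule with its frequency.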
for variant, data in zip(variants, raw_data):
    rules = [str(x["correct_rule"]) + " " + x["rule"] for x in data]
    rules.sort()
    print("VARIANT", variant, len(Counter(rules)))
    print("RULES WITH 'a' or 'b'", len([rule for rule in rules if "'a'" in rule or "'b'" in rule]))
    for rule, cnt in Counter(rules).items():
        print(cnt, rule)
# %%
# 4. Are there easier/harder samples for both experiments?
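# Cross-tabulate correct_label vs. correct_rule per sample and run Fisher's exact test:
# a low p-value indicates the two outcomes are not independent across samples.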
for variant in ("ab_10", "cdef_10"):
    variant_data = raw_data[variants.index(variant)]
    data = []
    for sample_data in variant_data:
        data.append((sample_data["correct_label"], sample_data["correct_rule"]))
    df = pd.DataFrame(data, columns=['correct_label', 'correct_rule'])
    # Create a frequency table (contingency table)
    frequency_table = pd.crosstab(df['correct_label'], df['correct_rule'])
    print(frequency_table)
    # Assuming frequency_table is a 2x2 DataFrame
    _, p_value = fisher_exact(frequency_table)
    print("P-value:", p_value)
# %%
# 5. Compare number of different tokens per task
# (this doesn't make much sense)
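# Tokenize the generated strings and report the average token count plus the number of
# distinct first and last tokens per letter set.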
import tiktoken
from experiment import starts_a_ends_b_strings
encoder = tiktoken.encoding_for_model(model_name)
for letters in ("ab", "cdef"):
    strings = starts_a_ends_b_strings(letters, 8)
    tokens = [encoder.encode(string) for string in strings]
    print(letters)
    print("AVERAGE TOKENS", sum(len(x) for x in tokens) / len(tokens))
    print("UNIQUE FIRST TOKENS", len(set(x[0] for x in tokens)))
    print("UNIQUE LAST TOKENS", len(set(x[-1] for x in tokens)))
# %%