-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathal_dump_analysis.py
151 lines (112 loc) · 4.11 KB
/
al_dump_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Earlier experiment, kept disabled:
# from neurometry import viz
# viz.scatter()
# %%
from datetime import datetime

import pandas as pd
from neurometry import cave, viz

# Load the AL dump; later cells read its "root_id", "cell_class" and
# "cell_type" columns.
al_bodies = pd.read_csv("data/AL_dump_Oct_24_2022.csv")
# %%
# Disabled: donut chart of the cell_class distribution, written to a
# timestamped PNG under figures/.
# viz.donut(al_bodies["cell_class"].value_counts().index.tolist(),
# al_bodies["cell_class"].value_counts().values.tolist(), "Cell Class").write_image("figures/cell_class_distribution_flyaldump_"+datetime.now().strftime("%m_%d_%Y_%H:%M:%S")+".png")
# %%
# Combined label "<cell_class> <cell_type>", joined with a single space.
_cell_class = al_bodies["cell_class"]
_cell_type = al_bodies["cell_type"]
al_bodies["class+type"] = _cell_class + " " + _cell_type
# %%
# Client object handed to every cave.* query below.
client = cave.set_up()
# %%
# Not used by later cells in this file; presumably kept for interactive
# inspection -- TODO confirm.
meta = cave.get_cellinfo_meta(client)
# %%
# Distinct root ids present in the AL dump, then their cell-identity records.
al_ids = list(al_bodies["root_id"].unique())
annotations = cave.cellidentity_query(client, al_ids)
# %%
# %%
# Number of annotation rows per (pt_root_id, tag) pair, most frequent first,
# with the group keys turned back into ordinary columns.
annotations_grouped = (
    annotations.groupby(["pt_root_id", "tag"])["pt_root_id"]
    .agg(["count"])
    .sort_values(by="count", ascending=False)
    .reset_index()
)
# %%
# Full contents of the 'neuron_information_v2' table.
ann_dump = client.materialize.query_table('neuron_information_v2')
# %%
# How many rows carry each distinct tag, most frequent first, with "tag"
# restored as an ordinary column.
tags_grouped = (
    ann_dump.groupby(["tag"])["id"]
    .agg(["count"])
    .sort_values(by="count", ascending=False)
    .reset_index()
)
# %%
# Left merge: every al_bodies row is kept, and rows whose root_id has no
# counted annotation get missing values in the annotation columns.
annotations_merged = al_bodies.merge(
    annotations_grouped,
    how="left",
    left_on="root_id",
    right_on="pt_root_id",
)
# %%
# Disabled: export of the merged table to a timestamped CSV under outputs/.
# annotations_merged.to_csv("outputs/annotations_merged_al_bodied_flywire_" + datetime.now().strftime("%m_%d_%Y_%H:%M:%S")+".csv", index=False)
# %%
import re

# Matches every character that is neither an ASCII letter nor an apostrophe.
_NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z']")


def separation(row):
    """Split the ``"tag"`` entry of *row* into lowercase word tokens.

    The tag is lowercased, every character other than an ASCII letter or an
    apostrophe is replaced by a space, and the result is split on whitespace.
    Digits and punctuation therefore act as separators and never appear in a
    token.

    Args:
        row: Any mapping-like object supporting ``row["tag"]`` (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        A list of tokens. The list is empty when the tag holds no letters or
        is not a string at all (e.g. a missing value), so rows without a
        usable tag contribute no words instead of raising.
    """
    text = row["tag"]
    if not isinstance(text, str):
        # Missing tags arrive as None/NaN, which have no .lower().
        return []
    return _NON_LETTER_PATTERN.sub(" ", text.lower()).split()
# %%
ann_dump
# %%
# Token list for every annotation row, produced by separation().
ann_dump["tag_processed"] = ann_dump.apply(separation, axis=1)
# %%
from collections import Counter
from itertools import chain

# Occurrences of every token across all rows. Counter keeps first-seen order,
# and the result is converted back to a plain dict.
bag_of_words = dict(Counter(chain.from_iterable(ann_dump["tag_processed"])))
# %%
# One row per token: the token lands in the "index" column created by
# reset_index(), its count in "frequency".
bag_of_words_df = pd.DataFrame.from_dict(bag_of_words, orient='index', columns=["frequency"]).reset_index()
# %%
bag_of_words_df_sorted = bag_of_words_df.sort_values(by="frequency", ascending=False)
# %%
bag_of_words_df_alphabetical = bag_of_words_df.sort_values(by="index")
# %%
# Disabled: export of the alphabetical token table to a timestamped CSV.
# bag_of_words_df_alphabetical.to_csv("outputs/annotations_tags_" + datetime.now().strftime("%m_%d_%Y_%H:%M:%S")+".csv", index=False)
# %%
# Rows whose token list contains the token 'mbp'.
ann_dump[ann_dump["tag_processed"].apply(lambda tokens: 'mbp' in tokens)]
# %%
annotation_tags = pd.read_csv("data/flywire_annotation_tags.csv")
# %%
# Keep only rows whose "important?" flag equals 1. reset_index() renumbers the
# rows; the extra column it inserts is dropped by the column selection.
# NOTE(review): "index" is presumably the word column already stored in the
# CSV -- confirm; if the CSV lacked it, reset_index() would supply integer
# row labels under that name instead.
important_tags = annotation_tags[annotation_tags["important?"] == 1].reset_index()[
    ["index", "frequency", "important?"]
]
# %%
keywords = list(important_tags["index"])
# %%
# Words excluded from the keyword list. A filter is used instead of
# list.remove() so that a word absent from the CSV does not raise ValueError
# and a duplicated word is removed everywhere, not just at its first position.
remove_list = ["alln", "alpn", "antennal", "sensory"]
_excluded_keywords = set(remove_list)
keywords = [word for word in keywords if word not in _excluded_keywords]
# %%
len(keywords)
# %%
# Draw 200 random rows from al_bodies (no seed is set, so every run differs).
# NOTE(review): the variable name is misspelled ("sampke"); it is kept because
# other cells may refer to it.
sampke_al_bodies = al_bodies.sample(n=200)
# %%
# Save the sample under outputs/ with a month_day_year_time suffix.
_al_sample_stamp = datetime.now().strftime("%m_%d_%Y_%H:%M:%S")
sampke_al_bodies.to_csv(
    "outputs/al_bodies_200_neurons_" + _al_sample_stamp + ".csv",
    index=False,
)
# %%
import random
from collections import defaultdict

# Index the annotation rows once: keyword -> pt_root_id of every row whose
# token list contains that keyword (one entry per matching row, so drawing
# from a list is the same as drawing a matching row uniformly). This replaces
# a full scan of ann_dump on every loop iteration.
_keyword_set = set(keywords)
_root_ids_by_keyword = defaultdict(list)
for _row_root_id, _row_tokens in zip(
    ann_dump["pt_root_id"].values, ann_dump["tag_processed"].values
):
    for _keyword in _keyword_set.intersection(_row_tokens):
        _root_ids_by_keyword[_keyword].append(_row_root_id)

# Keywords that match no row cannot yield a neuron; drawing one used to make
# the row sampling raise, so they are left out of the draw.
_usable_keywords = [word for word in keywords if word in _root_ids_by_keyword]

# Fail loudly instead of looping forever when 200 distinct root ids cannot be
# reached through the keywords at all.
_reachable_root_ids = {
    rid for ids in _root_ids_by_keyword.values() for rid in ids
}
if len(_reachable_root_ids) < 200:
    raise ValueError(
        "Only %d distinct pt_root_id values match the keywords; "
        "200 are required." % len(_reachable_root_ids)
    )

# Repeatedly pick a random keyword, then a random row carrying it, until 200
# distinct root ids have been collected.
chosen = set()
while len(chosen) < 200:
    the_word = random.choice(_usable_keywords)
    root_id = random.choice(_root_ids_by_keyword[the_word])
    chosen.add(root_id)
# %%
# Every annotation row belonging to one of the chosen root ids.
sample_ann_dump = ann_dump[ann_dump["pt_root_id"].isin(chosen)]
# %%
sample_ann_dump.to_csv("outputs/sample_ann_200_neurons_" + datetime.now().strftime("%m_%d_%Y_%H:%M:%S")+".csv", index=False)
# %%
from collections import Counter
from itertools import chain

# Occurrences of every token across the sampled annotation rows. Counter
# keeps first-seen order, and the result is converted back to a plain dict.
s_bag_of_words = dict(
    Counter(chain.from_iterable(sample_ann_dump["tag_processed"]))
)
# %%
# One row per token: the token lands in the "index" column created by
# reset_index(), its count in "frequency".
s_bag_of_words_df = pd.DataFrame.from_dict(s_bag_of_words, orient='index', columns=["frequency"]).reset_index()
# %%
s_bag_of_words_df_sorted = s_bag_of_words_df.sort_values(by="frequency", ascending=False)
# %%
s_bag_of_words_df_alphabetical = s_bag_of_words_df.sort_values(by="index")
# %%
s_bag_of_words_df_alphabetical.to_csv("outputs/sample_ann_200_neurons_annotations_tags_" + datetime.now().strftime("%m_%d_%Y_%H:%M:%S")+".csv", index=False)
# %%