# time_cell_interaction_lib.py
import utils
import new_phenotyping_lib
save_image_ext = 'jpg'
# save_image_ext = 'png'
class TIMECellInteraction:
'''
Instantiation of this class mainly loads Consolidated_data.txt into a Pandas dataframe (or reads in a simulated one in the case of simulated data) and performs some preprocessing on it
It will create a pickle file (initial_data.pkl) of the read-in and preprocessed data, unless the file already exists, in which case this step is skipped
'''
def __init__(self, dataset_obj, project_dir, allow_compound_species, nslices=1, thickness_new=4, n_neighs=6, radius_instead_of_knn=True, simulate_data=False, refine_plotting_map_using_mapping_dict=False, flatten=True, use_analytical_significance=True, decimate_top_species=False, **kwargs): # incorporating new dataset object
# Import relevant module
import os
# Extract attributes, some not actually needed, for compatibility with the original way the SIP library worked before there was a dataset object
coord_units_in_microns, min_coord_spacing, input_data_filename, _, mapping_dict, phenotype_identification_tsv_file = dataset_obj.extract_original_parameters()
# Set local variables
thickness = thickness_new / coord_units_in_microns # we want "thickness" to be in the same units as the coordinates in the input file. Essentially, we're just making sure the units match from the get-go
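# Hedged example of this conversion, using the half-micron units mentioned in the calculate_metrics() docstring: if thickness_new = 4 microns and coord_units_in_microns = 0.5, then thickness = 4 / 0.5 = 8 coordinate units (i.e., dr = 8)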
dataset_name = 'slices_{}x{}'.format(nslices, thickness_new)
pickle_dir = os.path.join('.', 'output', 'checkpoints')
if not os.path.exists(pickle_dir):
os.makedirs(pickle_dir)
webpage_dir = os.path.join('.', 'output', 'images')
if not os.path.exists(webpage_dir):
os.makedirs(webpage_dir)
# Set attributes
self.dataset_name = dataset_name
self.project_dir = project_dir
self.input_data_filename = input_data_filename # if this is a dataframe then that's because dataset_formats was called with an already existing dataframe, not a string path
self.webpage_dir = webpage_dir
self.mapping_dict = mapping_dict
self.nslices = nslices
self.thickness = thickness
self.n_neighs = n_neighs
self.radius_instead_of_knn = radius_instead_of_knn
self.coord_units_in_microns = coord_units_in_microns
self.min_coord_spacing = min_coord_spacing
self.thickness_new = thickness_new
self.use_analytical_significance = use_analytical_significance
# The next block isn't technically needed, but setting these attributes here helps with linting
# These are set in this method but not saved in the traditional way (instead, using make_pickle_dict())
self.pickle_dir = pickle_dir # directory for storing the processed data, i.e., pickle files
self.unique_species = []
self.doubling_type = None
self.unique_slides = []
self.is_real_data = None
self.compound_species_allowed = None
self.plotting_map = []
self.num_colors = None
# These are set in other functions in this class but not saved in the traditional way (instead, using make_pickle_dict())
self.data_by_slide = []
self.dr = None
self.k_max = None
self.min_nvalid_centers = None
# Assign local variables that aren't the same as those inputted in order to save them later using make_pickle_dict()
is_real_data = not simulate_data
compound_species_allowed = allow_compound_species
# Constant
pickle_file = 'initial_data.pkl'
# If the pickle file doesn't exist...
if not os.path.exists(os.path.join(pickle_dir, pickle_file)):
# If requesting simulated data...
if simulate_data:
# ...generate simulated data
midpoints = kwargs['midpoints']
max_real_area = kwargs['max_real_area']
mult = kwargs['mult']
self.doubling_type = kwargs['doubling_type']
self.data = get_simulated_data(kwargs['doubling_type'], midpoints, max_real_area, min_coord_spacing, mult)
else:
# ...otherwise, read in the data from the CSV file
doubling_type = None
self.data = dataset_obj.data
# Phenotype as desired
if phenotype_identification_tsv_file is not None:
pheno_method = 'Custom'
else:
if allow_compound_species:
pheno_method = 'Species'
else:
pheno_method = 'Marker'
self.data, self.phenotypes, species_int_to_pheno_name = new_phenotyping_lib.apply_phenotyping(self.data, pheno_method, phenotype_identification_tsv_file, species_int_colname='Species int')
#### ADD IN species_int_to_pheno_name TO THE FUNCTION BELOW!!!!
# Get the plotting map, number of colors, unique species, and the unique slides
plotting_map, num_colors, unique_species, unique_slides, self.data = get_dataframe_info(self.data, self.phenotypes, species_int_to_pheno_name=species_int_to_pheno_name)
# Save the case/slide/ROI "table of contents", including the ROI widths and heights
self.df_data_by_roi = generate_case_slide_roi_contents(self.data)
# Print and save the recommended radius as 10% of the minimum average ROI size
self.recommended_radius_ = calculate_recommended_radius(self.df_data_by_roi)
# Reduce the dataset here in order to test that the significance methodology is independent of non-involved species
if decimate_top_species:
print('WARNING: Decimating top species by 90%!!')
top_species = plotting_map[0][0]
data_sample = self.data[self.data['Species int'] == top_species].sample(frac=0.9, random_state=42) # didn't yet test addition of ", random_state=42"
self.data = self.data.drop(data_sample.index).reset_index()
plotting_map[0][2] = plotting_map[0][2] - len(data_sample)
# Set some properties that haven't already been defined
self.num_colors = num_colors
self.plotting_map = plotting_map
self.unique_species = unique_species
# Add the rest of the ROI-specific data to the dataframe formerly called "contents"
if flatten:
df_data_by_roi = self.flatten_roi_plotting_data()
# Save the data to a pickle file
data = self.data
self.make_pickle_dict(['pickle_dir', 'is_real_data', 'compound_species_allowed', 'doubling_type', 'data', 'phenotypes', 'plotting_map', 'num_colors', 'unique_species', 'unique_slides', 'df_data_by_roi'], locals(), pickle_file)
else:
# Load the data from the pickle file if it already exists
self.load_pickle_dict(pickle_file, pickle_dir=pickle_dir)
# However, overwrite the pickle and webpage directories as we should be able to load these same pickle files on different systems
self.pickle_dir = pickle_dir
self.webpage_dir = webpage_dir
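# A minimal usage sketch (hedged; 'dataset_obj' stands for any dataset object exposing extract_original_parameters() and a .data dataframe, as assumed by __init__() above):
#
#     slices = TIMECellInteraction(dataset_obj, project_dir='.', allow_compound_species=False, nslices=1, thickness_new=4)
#     slices.calculate_metrics(nworkers=4)
#     slices.plot_rois(nworkers=4)
#
# The first call creates output/checkpoints/initial_data.pkl (or loads it if it already exists).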
def calculate_metrics(self, nworkers=1, do_logging=True, use_multiprocessing=True, delete_intermediate_pkl_files=True, keep_unnecessary_calculations=False):
'''
Calculate the P values (and Z scores) from the coordinates of the species in every ROI in every slide.
The assumed distributions are Poisson for the densities and binomial for the PMFs (see summary_of_formulas.lyx for the formulas and the physical notebook notes on 12-16-20 for the derivations).
As seen in the Lyx/PDF file, the precise random variable that is Poisson-distributed (S) is the total number of neighbors of a particular species over all n samples (i.e., over all n centers of a particular species). This is converted to "density" by dividing by n, but note that the result is not technically Poisson-distributed, since the Poisson distribution applies only to discrete random variables. The corresponding "average number of neighbors around the n centers" is what is typically referred to as "density," which is more technically thought of as the average number of neighbors DIVIDED BY an area. Note that <S> = n * lambda, where lambda is basically the number of neighbors (of a certain species) in the ROI, divided by the ROI area, times the area of the slice under consideration. Note that S runs from 0, 1, 2, ..., infinity.
The precise random variable that is binomial-distributed (Z_j) is the number of centers (of a particular species) having j neighbors (of a particular species). This (divided by n) is the probability mass function (PMF) of the random variable Y, which is the number of neighbors (of a particular species) around a center (of a particular species). (Note that Y is Poisson-distributed with mean lambda and S = Sum[Y_i], where i goes from 1, 2, ..., n, and Y_i is the number of neighbors around center i.) Note that <Z_j> = n * lambda^j*exp(-lambda)/j!. Note that Z_j runs from 0, 1, 2, ..., n. n is, as above, the total number of centers of a particular species.
Assuming these distributions, the null hypotheses are that means are as defined above. The "left" or "less" alternative hypotheses are that the means are "less" than (i.e., to the "left" of) these stated values. The "right" or "greater" alternative hypotheses are that the means are "greater" than (i.e., to the "right" of) these stated values. The P values are then the probabilities of observing values of S or Z_j as extreme as (in the directions of the alternative hypotheses) the observed instances of S or Z_j under the assumptions of the null hypotheses.
Notes:
* Took ~47 (later: 65) minutes on laptop
* Units here should be in the same units provided in Consolidated_data.txt (which were originally half-microns, i.e., for dr=8, each slice is 4 microns thick)
* nworkers should probably be the number of CPUs allocated by SLURM less 1 just to be safe
'''
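# Hedged illustration of the "left"/"right" P values described in the docstring above (this is not the exact code path, which lives in calculate_metrics_for_roi(); scipy.stats is assumed to be available, and n, lam, and s_obs are hypothetical values for the number of centers, lambda, and the observed S):
#
#     from scipy.stats import poisson
#     mean_S = n * lam                            # null-hypothesis mean, <S> = n * lambda
#     pval_left = poisson.cdf(s_obs, mean_S)      # P(S <= observed S), the "less" alternative
#     pval_right = poisson.sf(s_obs - 1, mean_S)  # P(S >= observed S), the "greater" alternative
#
# The binomial case for Z_j is analogous (scipy.stats.binom with n trials and success probability lambda**j * exp(-lambda) / j!).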
# Import relevant libraries
import os
import subprocess
# Set variables already defined as attributes
unique_slides = self.unique_slides
pickle_dir = self.pickle_dir
plotting_map = self.plotting_map
nslices = self.nslices
thickness = self.thickness
n_neighs = self.n_neighs
radius_instead_of_knn = self.radius_instead_of_knn
min_coord_spacing = self.min_coord_spacing
df_data_by_roi = self.df_data_by_roi
use_analytical_significance = self.use_analytical_significance
# Set some attributes from the method parameters
self.k_max = nslices
self.dr = thickness
# Constants
pickle_file = 'calculated_metrics.pkl'
# If the pickle file doesn't already exist...
if not os.path.exists(os.path.join(pickle_dir, pickle_file)):
# Print what we're doing
print('Calculating metrics...')
# Experiment-wide variables
all_species_list = [x[0] for x in plotting_map]
nall_species = len(all_species_list)
nrois = len(df_data_by_roi)
log_file = 'calculated_metrics.log'
# ---- Calculate the metrics for all the ROIs that haven't already been calculated, saving the results in individual pickle files
# Determine the ROIs whose metrics need to be calculated, i.e., those whose corresponding pickle files are not present on the filesystem
retval = subprocess.run(['ls {}/calculated_metrics-roi_index_*.pkl'.format(pickle_dir)], shell=True, capture_output=True)
roi_ids_not_present = set(range(nrois)) - set([int(x.split('.pkl')[0].split('_')[-1]) for x in retval.stdout.decode().split('\n')[:-1]])
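# e.g., an existing file named (hypothetically) 'calculated_metrics-roi_index_000123.pkl' parses to ROI index 123 via the split-on-'.pkl'/split-on-'_' logic above, so that ROI is excluded from roi_ids_not_present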
# Generate a list of tuple arguments each of which is inputted into calculate_metrics_for_roi() to be run by a single worker
constant_tuple = (pickle_dir, nslices, thickness, n_neighs, radius_instead_of_knn, min_coord_spacing, all_species_list, nall_species, do_logging, use_analytical_significance, df_data_by_roi, keep_unnecessary_calculations, nworkers)
# list_of_tuple_arguments = [constant_tuple + (x,) for x in range(nrois)] # doing it this lazy way potentially messes up the multiprocessing module, causing too many unnecessary-to-be-calculated ROIs to be sent into the Pool, causing only a single worker to actually be used
list_of_tuple_arguments = [constant_tuple + (x,) for x in roi_ids_not_present]
# Since Squidpy commandeers multiprocessing, completely disable it if Squidpy has been requested
if use_analytical_significance:
# Farm out the metrics calculations to the worker CPUs. This ensures that a pickle file gets created for each ROI
utils.execute_data_parallelism_potentially(function=calculate_metrics_for_roi, list_of_tuple_arguments=list_of_tuple_arguments, nworkers=(0 if not use_multiprocessing else nworkers), task_description='calculation of ROI metrics (Poisson)')
else: # Squidpy is being requested
print('Running {} function calls using 1 worker WITHOUT the multiprocessing module because Squidpy is being employed, which commandeers threads'.format(len(list_of_tuple_arguments)))
utils.execute_data_parallelism_potentially(function=calculate_metrics_for_roi, list_of_tuple_arguments=list_of_tuple_arguments, nworkers=0, task_description='calculation of ROI metrics (permutation test)')
# ---- Load the resulting individual pickle files into a new, single pickle file called calculated_metrics.pkl
# For each slide...
data_by_slide = []
roi_pickle_files = [] # store filenames to delete later
roi_log_files = []
for uslide in unique_slides:
print('Reading slide ' + uslide + '...')
# Get the unique ROIs in the current slide
unique_rois = df_data_by_roi[df_data_by_roi['unique_slide'] == uslide]['unique_roi'].unique() # note the .unique() is likely unneeded
# For each ROI in the slide...
data_by_roi = []
for uroi in unique_rois:
print(' Reading ROI ' + uroi + '...')
# Load the appropriate pickle file
roi_index = df_data_by_roi.loc[df_data_by_roi['unique_roi'] == uroi, :].index[0]
roi_pickle_file = 'calculated_metrics-roi_index_{:06}.pkl'.format(roi_index)
roi_pickle_files.append(roi_pickle_file)
roi_log_files.append('calculated_metrics-roi_index_{:06}.log'.format(roi_index))
roi_data_item = load_pickle(pickle_dir, roi_pickle_file)
# Save the loaded data
data_by_roi.append(roi_data_item)
data_by_slide.append([uslide, unique_rois, data_by_roi]) # save the current slide data and the inputted parameters
# Create the single pickle file saving all the data
make_pickle(data_by_slide, pickle_dir, pickle_file)
# Concatenate all metrics calculation log files (one per ROI) into a single log file
logs_dir = os.path.join('.', 'output', 'logs')
if not os.path.exists(logs_dir):
os.makedirs(logs_dir)
with open(os.path.join(logs_dir, log_file), 'w') as outfile:
for roi_log_file in roi_log_files:
with open(os.path.join(pickle_dir, roi_log_file)) as infile:
outfile.write(infile.read())
# If the overall pickle file was successfully created, delete all intermediate pickle files for the ROIs
if delete_intermediate_pkl_files:
if os.path.exists(os.path.join(pickle_dir, pickle_file)):
for roi_pickle_file in roi_pickle_files:
os.remove(os.path.join(pickle_dir, roi_pickle_file))
# If the overall log file was successfully created, delete all intermediate log files for the ROIs
if os.path.exists(os.path.join(logs_dir, log_file)):
for roi_log_file in roi_log_files:
os.remove(os.path.join(pickle_dir, roi_log_file))
# If the pickle file already exists, load it
else:
data_by_slide = load_pickle(pickle_dir, pickle_file)
# Save the calculated data as a property of the class object
self.metrics = data_by_slide
# Create a density P value Pandas dataframe from the just-calculated metrics
self.flatten_density_pvals()
def save_figs_and_corresp_data(self, plot_real_data=True, pval_figsize=(8, 12), log_pval_range=(-40, 0), calculate_empty_bin_pvals=True, max_nbins=None, roi_figsize=(6, 4), marker_size_step=0.80, pval_dpi=150, alpha=1, roi_dpi=200, square=True, yticklabels=2, pickle_dir=None, save_individual_pval_plots=True, edgecolors='k', default_marker_size_fac=1, yaxis_dir=1, nworkers=1):
'''
NOTE: This is likely no longer used!
Create and save all the figures and the data corresponding to the figures (i.e., the left and right P values for both the densities and PMFs), i.e., the actual data that is plotted. This is a version of save_figs_and_corresp_data() that uses an arbitrary number of CPUs.
This way, I can see exactly what the read-in data actually look like and therefore trust them more.
* alpha is the transparency for the circles in the ROI plots (0 is fully transparent, 1 is fully opaque)
* marker_size_step=0.80 means the radius should be 80% larger for cells plotted behind other cells
'''
# Antonio's fix to enable plot generation in SLURM's batch mode
import matplotlib
matplotlib.use('Agg')
# Import relevant libraries
import os
import pandas as pd
import subprocess
import multiprocessing as mp
# Set a default value for this; note the input parameter to this call can be something like max_nbins=np.max([slices1.max_nbins_over_exp, slices2.max_nbins_over_exp])
if max_nbins is None:
max_nbins = self.max_nbins_over_exp
self.max_nbins_used = max_nbins
# Set variables already defined as attributes
plotting_map = self.plotting_map
metrics = self.metrics
num_colors = self.num_colors
if pickle_dir is None:
webpage_dir = self.webpage_dir
pickle_dir = self.pickle_dir
else:
webpage_dir = pickle_dir
mapping_dict = self.mapping_dict
metrics = self.metrics
contents = self.contents
coord_units_in_microns = self.coord_units_in_microns
# Define the directory holding all the images for the webpage and the filename of the file holding all the corresponding figure data
webpage_dir = os.path.join(webpage_dir, ('real' if plot_real_data else 'simulated'))
pickle_file = 'figure_data-{}.pkl'.format(('real' if plot_real_data else 'simulated'))
# If the pickle file doesn't already exist...
pickle_pathname = os.path.join(pickle_dir, pickle_file)
if not os.path.exists(pickle_pathname):
# Experiment-wide variables
all_species_list = [x[0] for x in plotting_map]
nall_species = len(all_species_list)
if mapping_dict is not None: # species_names = [x[1] for x in plotting_map]
species_names = [get_descriptive_cell_label(x[1], mapping_dict)[0] for x in plotting_map]
else:
species_names = [phenotypes_to_string(x[1]) for x in plotting_map]
# Extract the correct number of colors from the default color palette
ielem = 0
colors = []
for elem in matplotlib.rcParams['axes.prop_cycle']():
color = elem['color']
colors.append(color)
ielem = ielem + 1
if ielem == num_colors:
break
default_marker_size = matplotlib.rcParams['lines.markersize'] * default_marker_size_fac
# Since at this point we're probably saving some images, ensure their parent directory exists
os.makedirs(webpage_dir, exist_ok=True)
# Define the experiment-wide metadata to save and initialize the filedata
metadata = {
'webpage_dir': webpage_dir, 'pickle_pathname': pickle_pathname, 'num_slides': len(metrics), 'plot_real_data': plot_real_data, 'all_species_ids': all_species_list, 'all_species_names': species_names, 'nall_species': nall_species,
'roi_dpi': roi_dpi, 'roi_figsize': roi_figsize,
'pval_dpi': pval_dpi, 'pval_figsize': pval_figsize, 'log_pval_range': log_pval_range, 'calculate_empty_bin_pvals': calculate_empty_bin_pvals, 'max_nbins': max_nbins
}
# This block simply flattens out all the metrics data to the level of the ROIs, instead of the slide-then-ROI hierarchy
# Note that the ROI order is the same as in contents, which is out of order due to the decompounding process
roi_index_holder = []
roi_name_holder = []
roi_data_holder = []
slide_name_holder = []
num_rois_in_slide_holder = []
for slide_data in metrics: # for each slide of metrics data...
num_rois_in_slide = len(slide_data[1]) # get the number of ROIs in the current slide
for roi_name, roi_data in zip(slide_data[1], slide_data[2]): # for each ROI of metrics data...
roi_index = contents[contents['unique_roi'] == roi_name].index[0] # use the "contents" dataframe to determine the contents index of the current ROI
slide_name = contents.loc[roi_index, 'unique_slide'] # get the corresponding slide name (not currently used)
roi_index_holder.append(roi_index)
roi_name_holder.append(roi_name)
roi_data_holder.append(roi_data)
slide_name_holder.append(slide_name)
num_rois_in_slide_holder.append(num_rois_in_slide)
df = pd.DataFrame({'roi_name': roi_name_holder, 'roi_data': roi_data_holder, 'slide_name': slide_name_holder, 'num_rois_in_slide': num_rois_in_slide_holder}, index=roi_index_holder).sort_index() # create a Pandas dataframe holding all of the flattened data in the same order as in the contents dataframe
# Determine the ROIs whose figures need to be calculated and corresponding data saved, based only on the data, not the figures, i.e., those whose corresponding pickle files are not present on the filesystem
retval = subprocess.run(['ls {}/figure_data-{}-roi_index_*.pkl'.format(pickle_dir, ('real' if plot_real_data else 'simulated'))], shell=True, capture_output=True)
roi_ids_not_present = set(range(len(contents))) - set([int(x.split('.pkl')[0].split('_')[-1]) for x in retval.stdout.decode().split('\n')[:-1]])
# Generate a list of tuple arguments each of which is inputted into save_figs_and_corresp_data_for_roi() to be run by a single worker
constant_tuple = (pickle_dir, plotting_map, roi_figsize, pval_figsize, colors, marker_size_step, default_marker_size, roi_dpi, mapping_dict, coord_units_in_microns, alpha, edgecolors, yaxis_dir, webpage_dir, plot_real_data, nall_species, species_names, log_pval_range, calculate_empty_bin_pvals, max_nbins, square, yticklabels, all_species_list, save_individual_pval_plots, pval_dpi, df)
list_of_tuple_arguments = [constant_tuple + (x,) for x in roi_ids_not_present]
# Farm out the figure creation and corresponding data saving to the worker CPUs. This ensures that a pickle file gets created for each ROI
print('Running {} function calls using {} workers'.format(len(list_of_tuple_arguments), nworkers))
with mp.get_context('spawn').Pool(nworkers) as pool:
pool.map(save_figs_and_corresp_data_for_roi, list_of_tuple_arguments)
# Lists to contain data to save and intermediate files to delete
filedata = [] # this is the main data we want to save
roi_pickle_files = [] # these intermediate files should later be deleted
# Read in all valid center-neighbor pairs for each ROI in each slide; this is in the same hierarchical slide-ROI order, as opposed to the order in contents. Order shouldn't matter, but still
for islide, slide_data in enumerate(metrics): # for each slide of metrics data...
print('Reading slide {} of {}...'.format(islide + 1, len(metrics)))
for roi_name in slide_data[1]: # for each ROI of metrics data...
# Determine and print the current ROI index and pickle file
roi_index = contents[contents['unique_roi'] == roi_name].index[0] # use the "contents" dataframe to determine the contents index of the current ROI
roi_pickle_file = 'figure_data-{}-roi_index_{:06}.pkl'.format(('real' if plot_real_data else 'simulated'), roi_index)
print(' Reading data for ROI {} from file {}...'.format(roi_name, roi_pickle_file))
# Save the ROI pickle filename to later delete
roi_pickle_files.append(roi_pickle_file)
# Load the ROI data
filedata_for_roi = load_pickle(pickle_dir, roi_pickle_file)
# For each center-neighbor species pair in the current ROI, append the data to an overall list filedata
for pair_data in filedata_for_roi:
filedata.append(pair_data)
# Load the individual pickle files into a new, single pickle file called figure_data-{real,simulated}.pkl
make_pickle((metadata, filedata), pickle_dir, pickle_file)
# If the overall pickle file was successfully created, delete all intermediate pickle files for the ROIs
if os.path.exists(os.path.join(pickle_dir, pickle_file)):
for roi_pickle_file in roi_pickle_files:
os.remove(os.path.join(pickle_dir, roi_pickle_file))
# If the overall pickle file already exists...
else:
# Read in the figure data from disk
(metadata, filedata) = load_pickle(pickle_dir, pickle_file)
# Save the calculated data as a property of the class object
self.metadata = metadata
self.filedata = filedata
def plot_rois(self, plot_real_data=True, roi_figsize=(6, 4), marker_size_step=0.80, alpha=0.5, roi_dpi=150, edgecolors='k', default_marker_size_fac=1, yaxis_dir=-1, nworkers=1, use_multiprocessing=True):
'''
Create and save a scatter plot of each ROI (cell coordinates colored by species), farming the per-ROI plotting out to worker CPUs where possible.
* alpha is the transparency for the circles in the ROI plots (0 is fully transparent, 1 is fully opaque)
* marker_size_step=0.80 means the radius should be 80% larger for cells plotted behind other cells
'''
# # Antonio's fix to enable plot generation in SLURM's batch mode
# import matplotlib
# matplotlib.use('Agg')
# Import relevant libraries
import os
import subprocess
import matplotlib
# Set variables already defined as attributes
plotting_map = self.plotting_map
num_colors = self.num_colors
webpage_dir = self.webpage_dir
mapping_dict = self.mapping_dict
df_data_by_roi = self.df_data_by_roi
coord_units_in_microns = self.coord_units_in_microns
# Define the directory holding all the images for the webpage and the filename of the file holding all the corresponding figure data
savedir = os.path.join(webpage_dir, 'roi_plots')
if not os.path.exists(savedir):
os.makedirs(savedir)
# Extract the correct number of colors from the default color palette
ielem = 0
colors = []
for elem in matplotlib.rcParams['axes.prop_cycle']():
color = elem['color']
colors.append(color)
ielem = ielem + 1
if ielem == num_colors:
break
default_marker_size = matplotlib.rcParams['lines.markersize'] * default_marker_size_fac
# Determine the ROIs whose figures need to be plotted based on those whose corresponding image files are not present on the filesystem
retval = subprocess.run(['ls {}/roi_plot_*.{}'.format(savedir, save_image_ext)], shell=True, capture_output=True)
roi_ids_not_present = set(range(len(df_data_by_roi))) - set([int(x.split(f'.{save_image_ext}')[0].split('_')[-1]) for x in retval.stdout.decode().split('\n')[:-1]])
# Generate a list of tuple arguments each of which is inputted into plot_single_roi() to be run by a single worker
constant_tuple = (plotting_map, roi_figsize, colors, marker_size_step, default_marker_size, roi_dpi, mapping_dict, coord_units_in_microns, alpha, edgecolors, yaxis_dir, savedir, df_data_by_roi)
list_of_tuple_arguments = [constant_tuple + (x,) for x in roi_ids_not_present]
# Farm out the figure creation to the worker CPUs. This ensures that an image file gets (or already is) created for each ROI
if 'roi_plot_fig_pathname' not in df_data_by_roi:
df_data_by_roi['roi_plot_fig_pathname'] = ''
utils.execute_data_parallelism_potentially(function=plot_single_roi, list_of_tuple_arguments=list_of_tuple_arguments, nworkers=(0 if not use_multiprocessing else nworkers), task_description='plotting of the ROIs')
def calculate_max_nbins(self):
'''
Calculate the maximum number of possible bins (of numbers of neighbors) over the entire experiment, for both the real and simulated data.
'''
# Set variables already defined as attributes
plotting_map = self.plotting_map
metrics = self.metrics
# Experiment-wide variables
nall_species = len(plotting_map)
# Initialize the variable of interest
max_nbins_over_exp = 0
# For each slide...
for slide_data in metrics:
# For each ROI in the slide...
for roi_data in slide_data[2]: # get the metrics data for the current ROI of the current slide
# Determine which dataset to plot, each of which is (nall_species, nall_species, nslices)
# The elements of the dataset ndarray are either a tuple (if both the center and neighbor species are in the ROI) or None
for data_to_plot in roi_data[1:3]:
# Determine whether both the center and neighbor species are in the dataset
center_and_neighbor_species_in_dataset = ((data_to_plot != None).sum(axis=2)) > 0 # (nall_species, nall_species)... since we want to do element-wise comparisons here, don't listen to linting when it says the right way to do the comparison is "data_to_plot is not None"
# For all combinations of centers and neighbors...
for icenter_spec in range(nall_species):
for ineighbor_spec in range(nall_species):
# If data exist for at least one of the slices for the current center/neighbor combination...
if center_and_neighbor_species_in_dataset[icenter_spec, ineighbor_spec]:
# Update the running maximum number of bins using the data for the current center/neighbor pair
max_nbins_over_exp = get_max_nbins_for_center_neighbor_pair(data_to_plot[icenter_spec, ineighbor_spec, :], max_nbins_over_exp)
# Set the maximum number of bins as a property of the object itself and print this result
self.max_nbins_over_exp = max_nbins_over_exp
print('Calculated maximum number of bins over the entire experiment: {}'.format(max_nbins_over_exp))
return(max_nbins_over_exp)
def load_pickle_class(self, pickle_file, pickle_dir=None):
'''
Load some data from a pickle file ("class" just refers to this function being part of the TIMECellInteraction class)
'''
if pickle_dir is None:
pickle_dir = self.pickle_dir
return(load_pickle(pickle_dir, pickle_file))
def load_pickle_dict(self, pickle_file, pickle_dir=None):
'''
Load a bunch of values to the self object from a pickle file by way of a dictionary
'''
dict2load = self.load_pickle_class(pickle_file, pickle_dir=pickle_dir)
for key in dict2load:
val = dict2load[key]
setattr(self, key, val)
def make_pickle_dict(self, vars2save, local_dict, pickle_file):
'''
Make a pickle file of a dictionary of data
'''
dict2save = {}
for key in vars2save:
if key in local_dict:
val = local_dict[key]
setattr(self, key, val)
else:
val = getattr(self, key)
dict2save.update({key: val})
make_pickle(dict2save, self.pickle_dir, pickle_file)
def preprocess_dataframe(self, allow_compound_species):
'''
Preprocess the initial Pandas dataframe from Consolidated_data.txt (or a simulated one for simulated data) by creating another column (Species int) specifying a unique integer identifying the cell type
If requested, remove compound species, and return the list of single-protein "phenotypes" contained in the data
'''
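# Hedged example of the encoding built below, assuming two hypothetical columns 'Phenotype CD8' and 'Phenotype FoxP3': a cell marked 'CD8+' / 'FoxP3-' gets 'Species string' = '10' and 'Species int' = int('10', base=2) = 2; a cell marked '-' in every phenotype column is dropped.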
# Import relevant module
import numpy as np
# Due to my pre-Pandas-learned days and thus implementing functionality below suboptimally, we should ensure the dataset has a sorted index
self.data = self.data.reset_index(drop=True)
# Preprocess the pandas dataframe in various ways
data_phenotypes = self.data.filter(regex='^Phenotype ') # get just the "Phenotype " columns
# data_phenotypes = data_phenotypes.reset_index(drop=True) # this is bad if the input data is not already sorted and should be commented out! However, note that was likely already here because of the non-optimal way we drop trivial objects below using the drop() method. Thus, we just need to ensure that the index is reset at the beginning of this function, as we do above
phenotype_cols = list(data_phenotypes.columns) # get a list of those column names
phenotypes = np.array([x.replace('Phenotype ', '') for x in phenotype_cols]) # extract just the phenotypes from that list
n_phenotypes = len(phenotypes) # get the number of possible phenotypes in the datafile
self.data['Species string'] = data_phenotypes.map(lambda x: '1' if (str(x)[-1] == '+') else '0').apply(lambda x: ''.join(list(x)), axis=1) # add a column to the original data that tells us the unique "binary" species string of the species corresponding to that row/cell
check_roi_order(self.data, 'prior to dropping invalid objects')
self.data = self.data.drop(np.nonzero((self.data['Species string'] == ('0' * n_phenotypes)).to_numpy())[0]) # delete rows that all have '...-' as the phenotype or are blank. As mentioned above, this is a non-optimal way of performing this; a better way is likely something like:
# df_phenotypes = df.filter(regex='^Phenotype ')
# num_phenotypes = df_phenotypes.shape[1]
# df_pared = df[df_phenotypes.apply(lambda x: ''.join(x), axis='columns') != '-' * num_phenotypes]
check_roi_order(self.data, 'after dropping invalid objects')
self.data = self.data.reset_index(drop=True) # reset the indices
self.data['Species int'] = self.data['Species string'].apply(lambda x: int(x, base=2)) # add an INTEGER species column
# Remove compound species if requested
if not allow_compound_species:
self.remove_compound_species()
self.remove_compound_species() # call again as a sanity check; it should now be a no-op
return(phenotypes)
def remove_compound_species(self):
'''
For each compound species ('Species int' not just a plain power of two), add each individual phenotype to the end of the dataframe individually and then delete the original compound entry
'''
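# Hedged example of the decomposition performed below: a compound 'Species int' of 6 (binary '110') is not a power of two, so its row is replaced by two new rows with 'Species int' 2 and 4 (after reversing the bit string, the nonzero bit positions are [1, 2], and 2**[1, 2] = [2, 4]).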
# Import relevant modules
import numpy as np
import pandas as pd
# Get the species IDs
x = np.array(self.data['Species int'])
print('Data size:', len(self.data))
# Determine which are not powers of 2, i.e., are compound species
powers = np.log2(x)
compound_loc = np.nonzero(powers != np.round(powers))[0]
ncompound = len(compound_loc)
print(' Compound species found:', ncompound)
check_roi_order(self.data, 'prior to decompounding')
# If compound species exist...
if ncompound > 0:
print(' Removing compound species from the dataframe...')
# Get a list of tuples each corresponding to compound species, the first element of which is the row of the compound species, and the second of which is the species IDs of the pure phenotypes that make up the compound species
compound_entries = [(cl, 2**np.nonzero([int(y) for y in bin(x[cl])[2:]][-1::-1])[0]) for cl in compound_loc]
# For each compound species...
data_to_add = []
for index, subspecies in compound_entries:
# For each pure phenotype making up the compound species...
for subspec in subspecies:
# The .copy() must be done here, inside this inner loop, rather than outside it. If the Series were copied once per compound entry (outside this loop), every item appended to data_to_add would reference the same object, so each assignment to 'Species int' below would overwrite the previous one; all appended rows would then carry the last subspecies ID, effectively adding the SAME species at multiple (usually 2) spatial points and creating the even-spacing problem.
series = self.data.iloc[index].copy()
# Note the only field I'm updating here is "Species int" and NOT the phenotype columns nor the "Species string" column. "Species int" is the only one that matters. So don't get confused when I print the dataframe and see apparently incorrect other columns... I am copying their values!
series['Species int'] = subspec # set the species ID of the series data to that of the current phenotype
data_to_add.append(series) # add the data to a running list
# Add all the data in the list to the dataframe
# self.data = self.data.append(data_to_add, ignore_index=True)
self.data = pd.concat([self.data, pd.DataFrame(data_to_add)], ignore_index=True) # needed because Pandas 2.0 deprecates .append()
print(' Added rows:', len(data_to_add))
print(' Data size:', len(self.data))
# Delete the original compound species entries
self.data = self.data.drop(compound_loc)
self.data = self.data.reset_index(drop=True)
print(' Deleted rows:', len(compound_loc))
print(' Data size:', len(self.data))
check_roi_order(self.data, 'post decompounding')
def average_over_rois(self, plot_real_data=True, log_pval_range=(-200, 0), figsize=(14, 4), dpi=150, img_file_suffix='', regular_pval_figsize=(8, 12), square=True, yticklabels=2, pval_dpi=150, plot_summary_dens_pvals=True, plot_main_pvals=True, write_csv_files=True, write_pickle_datafile=True, start_slide=None, num_slides=None):
'''
Perform a weighted geometric mean of the density and PMF P values over the valid ROIs, saving the corresponding figures as PNG/JPG files and data as CSV files.
Old:
Read all the data corresponding to the P value plots into a Pandas dataframe, using this structure to write an ndarray holding the left and right density (not PMF) P values (for a single-slice dataset) for every patient (slide) and center/neighbor combination in the entire experiment, performing a weighted average of the P values over all the ROIs for each patient.
The resulting array can then be read back in to other functions (such as create_summary_plots(), below) to plot these results in heatmaps or to write them to CSV files.
From def create_summary_plots():
Read in the summary ndarray from create_summary_array(), above, to plot and save heatmaps of all these data.
Call like, e.g., create_summary_plots(dataset_name, project_dir, plot_real_data, summary_arr, unique_slides, metadata['all_species_names']).
From def create_summary_csv_files():
Write the left and right summary P values to a CSV file.
Call like:
tci.create_summary_csv_files(dataset_name, project_dir, plot_real_data, summary_arr, unique_slides, metadata['all_species_names'])
'''
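# Hedged sketch of the "weighted geometric mean" used below: the averaging is done in log space, i.e. a weighted arithmetic mean of log10(P) with the number of valid centers in each ROI as the weight, which is equivalent to a weighted geometric mean of the P values themselves. The values below are hypothetical:
#
#     import numpy as np
#     log_pvals = np.log10(np.array([1e-3, 1e-5, 1e-4]))  # per-ROI P values (hypothetical)
#     weights = np.array([10, 40, 25])                     # numbers of valid centers (hypothetical)
#     log_pval_avg = (weights * log_pvals).sum() / weights.sum()
#     pval_avg = 10**log_pval_avg                          # the weighted geometric mean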
# Import relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Define variables already defined as attributes
filedata = self.filedata
nslices = self.nslices
max_nbins = self.max_nbins_used
pickle_dir = self.pickle_dir
webpage_dir = self.webpage_dir
all_species_names = self.all_species_names
nall_species = self.nall_species
all_species_ids = self.all_species_ids
# Create the Pandas dataframe from the list of dictionaries that were outputted by save_figs_and_corresp_data()
df = pd.DataFrame(filedata) # build the dataframe directly from the list of dictionaries, since DataFrame.append() was removed in Pandas 2.0
# Add a slide column to the dataframe and get a list of the unique slides while maintaining order
df['slide_name'] = df['roi_name'].apply(lambda x: x.split('_')[0])
unique_slides = df['slide_name'].unique()
nunique_slides = len(unique_slides)
# Determine which unique slides to plot in the main plotting functionality below
if start_slide is None:
start_slide = 0
num_slides = nunique_slides
stop_slide = start_slide + num_slides
unique_slides_to_plot = unique_slides[start_slide:stop_slide]
# Define the directory holding all the images of the averaged data for the webpage and the filename of the file holding all the corresponding data
webpage_dir = os.path.join(webpage_dir, ('real' if plot_real_data else 'simulated'))
pickle_file = 'averaged_data-{}.pkl'.format(('real' if plot_real_data else 'simulated'))
# If the pickle file doesn't already exist...
if not os.path.exists(os.path.join(pickle_dir, pickle_file)):
# Initialize the arrays of interest
nmatches_holder = np.zeros((nunique_slides, nall_species, nall_species, nslices))
log_dens_pvals_avg = np.zeros((nunique_slides, nall_species, nall_species, 2, nslices))
log_pmf_pvals_avg = np.zeros((nunique_slides, nall_species, nall_species, max_nbins, 2, nslices))
# For every unique slide...
for islide, slide in enumerate(unique_slides[:]):
# For every possible center species...
for icenter_spec, center_spec in enumerate(all_species_ids[:]):
# For every possible neighbor species...
for ineighbor_spec, neighbor_spec in enumerate(all_species_ids[:]):
# Get a temporary dataframe containing the current slide/center/neighbor combination (this will usually be five rows, one per ROI)
df_tmp = df[(((df['slide_name'] == slide) & (df['center_species_id'] == center_spec) & (df['neighbor_species_id'] == neighbor_spec)))]
nrows = len(df_tmp)
# For every slice...
for islice in range(nslices):
# Initialize and populate the three temporary arrays
nvalid_centers_holder = np.zeros((nrows,))
log_dens_pvals = np.zeros((nrows, 2))
log_pmf_pvals = np.zeros((nrows, max_nbins, 2))
for irow, (nvalid_centers_per_slice, left_log_dens_pvals, right_log_dens_pvals, left_log_pmf_pvals, right_log_pmf_pvals) in enumerate(zip(df_tmp['nvalid_centers_per_slice'], df_tmp['left_log_dens_pvals'], df_tmp['right_log_dens_pvals'], df_tmp['left_log_pmf_pvals'], df_tmp['right_log_pmf_pvals'])):
nvalid_centers_holder[irow] = nvalid_centers_per_slice[islice]
log_dens_pvals[irow, :] = np.array([left_log_dens_pvals[0, islice], right_log_dens_pvals[0, islice]]) # (2,)
log_pmf_pvals[irow, :, :] = np.c_[left_log_pmf_pvals[:, islice], right_log_pmf_pvals[:, islice]] # (max_nbins,2)
# Determine the rows in the temporary dataframe that have at least 1 valid center
matches = nvalid_centers_holder >= 1 # (nrows,)
nmatches = matches.sum()
nmatches_holder[islide, icenter_spec, ineighbor_spec, islice] = nmatches
# Perform the weighted averaging over the ROIs
if nmatches >= 1:
nvalid_centers_tmp = nvalid_centers_holder[matches][:, np.newaxis] # (nmatches, 1)
log_dens_pvals_avg[islide, icenter_spec, ineighbor_spec, :, islice] = (nvalid_centers_tmp * log_dens_pvals[matches, :]).sum(axis=0) / nvalid_centers_tmp.sum(axis=0) # (2,)
nvalid_centers_tmp = nvalid_centers_tmp[:, :, np.newaxis] # (nmatches, 1, 1)
log_pmf_pvals_avg[islide, icenter_spec, ineighbor_spec, :, :, islice] = (nvalid_centers_tmp * log_pmf_pvals[matches, :, :]).sum(axis=0) / nvalid_centers_tmp.sum(axis=0) # (max_nbins, 2)
else:
log_dens_pvals_avg[islide, icenter_spec, ineighbor_spec, :, islice] = None
log_pmf_pvals_avg[islide, icenter_spec, ineighbor_spec, :, :, islice] = None
# Set the log of the P value range for the color plotting
vmin = log_pval_range[0]
vmax = log_pval_range[1]
# Set the (negative) infinite values to the darkest color (or else they won't be plotted, as inf values are not plotted)
log_dens_pvals_avg[np.isneginf(log_dens_pvals_avg)] = vmin
log_pmf_pvals_avg[np.isneginf(log_pmf_pvals_avg)] = vmin
# Initialize the figure and axes
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=figsize)
if plot_summary_dens_pvals:
# Plot the average density P values for every slice
for islice in range(nslices):
# For every unique slide...
for islide in range(nunique_slides):
# Determine the filename of the figure
filename = os.path.join(webpage_dir, 'average_density_pvals-{}-{}-slice_{:02d}_of_{:02d}{}.{}'.format(('real' if plot_real_data else 'simulated'), unique_slides[islide], islice + 1, nslices, img_file_suffix, save_image_ext))
# Reset the figure/axes
fig.clf()
ax = fig.subplots(nrows=1, ncols=2)
# Plot the log of the left/less P values
sns.heatmap(log_dens_pvals_avg[islide, :, :, 0, islice], vmin=vmin, vmax=vmax, linewidths=.5, ax=ax[0], cbar=True, xticklabels=all_species_names, yticklabels=all_species_names, square=True)
ax[0].set_title('log10(\"less\" density pvals)')
ax[0].set_xlabel('Neighbor species')
ax[0].set_ylabel('Center species')
# Plot the log of the right/greater P values
sns.heatmap(log_dens_pvals_avg[islide, :, :, 1, islice], vmin=vmin, vmax=vmax, linewidths=.5, ax=ax[1], cbar=True, xticklabels=all_species_names, yticklabels=all_species_names, square=True)
ax[1].set_title('log10(\"greater\" density pvals)')
ax[1].set_xlabel('Neighbor species')
ax[1].set_ylabel('Center species')
# Set the figure title to the slide title and ensure the facecolor is white
fig.suptitle('Average density P values - {} - {} data - slice {} of {}'.format(unique_slides[islide], ('real' if plot_real_data else 'simulated'), islice + 1, nslices))
fig.patch.set_facecolor('white')
# Save the figure to disk
fig.savefig(filename, dpi=dpi, bbox_inches='tight')
# Initialize the P value figure
fig = plt.subplots(nrows=2, ncols=2, figsize=regular_pval_figsize)[0]
if plot_main_pvals:
# For every slide, center, and neighbor species...
# for islide, slide_name in enumerate(unique_slides):
for islide2, slide_name in enumerate(unique_slides_to_plot):
islide = islide2 + start_slide
for icenter, center_name in enumerate(all_species_names):
for ineighbor, neighbor_name in enumerate(all_species_names):
# Initialize the current figure by clearing it and all its axes
fig.clf()
ax = fig.subplots(nrows=2, ncols=2)
# Create the four-axis figure, where the top row has the left and right density P values and the bottom row has the left and right PMF P values
# Plot the log10 of the left density P values
sns.heatmap(log_dens_pvals_avg[islide, icenter, ineighbor, 0, :][np.newaxis, :], vmin=vmin, vmax=vmax, linewidths=.5, ax=ax[0, 0], cbar=True, yticklabels=True, square=True)
ax[0, 0].set_title('log10(\"less\" density pvals)')
ax[0, 0].set_xlabel('Slice')
# Plot the log10 of the right density P values
sns.heatmap(log_dens_pvals_avg[islide, icenter, ineighbor, 1, :][np.newaxis, :], vmin=vmin, vmax=vmax, linewidths=.5, ax=ax[0, 1], cbar=True, yticklabels=True, square=True)
ax[0, 1].set_title('log10(\"greater\" density pvals)')
ax[0, 1].set_xlabel('Slice')
# Plot the log10 of the left PMF P values
sns.heatmap(log_pmf_pvals_avg[islide, icenter, ineighbor, :, 0, :], vmin=vmin, vmax=vmax, linewidths=.5, ax=ax[1, 0], cbar=True, yticklabels=yticklabels, square=square)
ax[1, 0].set_title('log10(\"less\" PMF pvals)')
ax[1, 0].set_xlabel('Slice')
ax[1, 0].set_ylabel('Number of neighbors')
# Plot the log10 of the right PMF P values
sns.heatmap(log_pmf_pvals_avg[islide, icenter, ineighbor, :, 1, :], vmin=vmin, vmax=vmax, linewidths=.5, ax=ax[1, 1], cbar=True, yticklabels=yticklabels, square=square)
ax[1, 1].set_title('log10(\"greater\" PMF pvals)')
ax[1, 1].set_xlabel('Slice')
ax[1, 1].set_ylabel('Number of neighbors')
# Place a descriptive title on the figure
figure_title = 'Averaged-over-ROI, {} data\nSlide: {}\ncenter={}, neighbor={}'.format(('real' if plot_real_data else 'simulated'), slide_name, center_name, neighbor_name)
fig.suptitle(figure_title)
# Save the figure
pvals_fig_filename = 'averaged_pvals_{}_center-{}_neighbor-{}.{}'.format(slide_name, all_species_ids[icenter], all_species_ids[ineighbor], save_image_ext)
pvals_fig_dirname = os.path.join(webpage_dir, slide_name)
pvals_fig_pathname = os.path.join(pvals_fig_dirname, pvals_fig_filename)
os.makedirs(pvals_fig_dirname, exist_ok=True)
fig.savefig(pvals_fig_pathname, dpi=pval_dpi, bbox_inches='tight')
# Determine the filename of the CSV files we intend to write
density_csv_pathname = os.path.join(pickle_dir, 'average_density_pvals-{}.csv'.format(('real' if plot_real_data else 'simulated')))
pmf_csv_pathname = os.path.join(pickle_dir, 'average_pmf_pvals-{}.csv'.format(('real' if plot_real_data else 'simulated')))
# Parse the slide names into the individual patients and drug treatments
patient = [int(x.split('-')[0][:-1]) for x in unique_slides]
pre_or_post = [x.split('-')[0][-1:] for x in unique_slides]
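# e.g., a hypothetical slide name '12B-3' yields patient = 12 and pre_or_post = 'B' (everything before the first '-' minus its last character, and that last character, respectively)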
upatient = []
upre_or_post = []
for curr_patient, curr_pre_or_post in zip(patient, pre_or_post):
if curr_patient not in upatient:
upatient.append(curr_patient)
if curr_pre_or_post not in upre_or_post:
upre_or_post.append(curr_pre_or_post)
# Reshape the average holder for the main arrays to the individual patients and drug treatment, filling in "blanks" if necessary
impossible_val = 444.4
log_dens_pvals_avg2 = np.ones((len(upatient), len(upre_or_post), nall_species, nall_species, 2, nslices)) * impossible_val
log_pmf_pvals_avg2 = np.ones((len(upatient), len(upre_or_post), nall_species, nall_species, max_nbins, 2, nslices)) * impossible_val
for iunique_slide, (curr_patient, curr_pre_or_post) in enumerate(zip(patient, pre_or_post)):
upatient_idx = upatient.index(curr_patient)
upre_or_post_idx = upre_or_post.index(curr_pre_or_post)
log_dens_pvals_avg2[upatient_idx, upre_or_post_idx, :, :, :, :] = log_dens_pvals_avg[iunique_slide, :, :, :, :]
log_pmf_pvals_avg2[upatient_idx, upre_or_post_idx, :, :, :, :, :] = log_pmf_pvals_avg[iunique_slide, :, :, :, :, :]
log_dens_pvals_avg = log_dens_pvals_avg2
log_pmf_pvals_avg = log_pmf_pvals_avg2
# Create the Pandas indexes
index_density = pd.MultiIndex.from_product([upre_or_post, all_species_names, all_species_names, ['left', 'right'], np.arange(nslices) + 1], names=['drug_status', 'center_species', 'neighbor_species', 'pval_type', 'slice_number'])
index_pmf = pd.MultiIndex.from_product([upre_or_post, all_species_names, all_species_names, np.arange(max_nbins), ['left', 'right'], np.arange(nslices) + 1], names=['drug_status', 'center_species', 'neighbor_species', 'poss_num_neighbors', 'pval_type', 'slice_number'])
# Create the Pandas dataframes (for the density and PMF P values) from the indexes
df_log_dens_pvals_avg = pd.DataFrame(data=log_dens_pvals_avg.reshape((len(upatient), -1)), index=upatient, columns=index_density).rename_axis('subject')
df_log_pmf_pvals_avg = pd.DataFrame(data=log_pmf_pvals_avg.reshape((len(upatient), -1)), index=upatient, columns=index_pmf).rename_axis('subject')
# Write the Pandas dataframes to disk
if write_csv_files:
df_log_dens_pvals_avg.to_csv(density_csv_pathname)
df_log_pmf_pvals_avg.to_csv(pmf_csv_pathname)
# Save the averaged data to disk
if write_pickle_datafile:
make_pickle((nmatches_holder, log_dens_pvals_avg, log_pmf_pvals_avg, df_log_dens_pvals_avg, df_log_pmf_pvals_avg), pickle_dir, pickle_file)
# Close the figure
plt.close(fig)
else:
# Read in the averaged data from disk
(nmatches_holder, log_dens_pvals_avg, log_pmf_pvals_avg, df_log_dens_pvals_avg, df_log_pmf_pvals_avg) = load_pickle(pickle_dir, pickle_file)
return(nmatches_holder, log_dens_pvals_avg, log_pmf_pvals_avg, unique_slides, df_log_dens_pvals_avg, df_log_pmf_pvals_avg)
# def average_over_rois2(self, plot_real_data=True, log_pval_range=(-200, 0), figsize=(14, 4), dpi=150, img_file_suffix='', regular_pval_figsize=(8, 12), square=True, yticklabels=2, pval_dpi=150, plot_summary_dens_pvals=True, plot_main_pvals=True, write_csv_files=True, write_pickle_datafile=True, start_slide=None, num_slides=None, min_num_valid_centers=1):
def average_over_rois2(self, plot_real_data=True, log_pval_range=(-200, 0), figsize=(14, 4), dpi=150, img_file_suffix='', plot_summary_dens_pvals=True, write_pickle_datafile=True, start_slide=None, min_num_valid_centers=1, weight_rois_by_num_valid_centers=False):
'''
Perform a weighted geometric mean of the density and PMF P values over the valid ROIs, saving the corresponding figures as PNG/JPG files and data as CSV files.
Old:
Read all the data corresponding to the P value plots into a Pandas dataframe, using this structure to write an ndarray holding the left and right density (not PMF) P values (for a single-slice dataset) for every patient (slide) and center/neighbor combination in the entire experiment, performing a weighted average of the P values over all the ROIs for each patient.
The resulting array can then be read back in to other functions (such as create_summary_plots(), below) to plot these results in heatmaps or to write them to CSV files.
From def create_summary_plots():
Read in the summary ndarray from create_summary_array(), above, to plot and save heatmaps of all these data.
Call like, e.g., create_summary_plots(dataset_name, project_dir, plot_real_data, summary_arr, unique_slides, metadata['all_species_names']).
From def create_summary_csv_files():
Write the left and right summary P values to a CSV file.
Call like:
tci.create_summary_csv_files(dataset_name, project_dir, plot_real_data, summary_arr, unique_slides, metadata['all_species_names'])
'''
# Import relevant libraries
# import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Define variables already defined as attributes
filedata = self.df_density_pvals
nslices = self.nslices
# max_nbins = self.max_nbins_used
pickle_dir = self.pickle_dir
webpage_dir = self.webpage_dir
all_species_names = self.all_species_names
nall_species = self.nall_species
all_species_ids = self.all_species_ids
# Create the Pandas dataframe from the list of dictionaries that were outputted by save_figs_and_corresp_data()
# df = pd.DataFrame()
# df = df.append(filedata)
df = filedata
# Add a slide column to the dataframe and get a list of the unique slides while maintaining order
df['slide_name'] = df['roi_name'].apply(lambda x: x.split('_')[0])
unique_slides = df['slide_name'].unique()
nunique_slides = len(unique_slides)
# Determine which unique slides to plot in the main plotting functionality below
if start_slide is None:
start_slide = 0
# num_slides = nunique_slides
# stop_slide = start_slide + num_slides
# unique_slides_to_plot = unique_slides[start_slide:stop_slide]
# Define the directory holding all the images of the averaged data for the webpage and the filename of the file holding all the corresponding data
webpage_dir = os.path.join(webpage_dir, ('real' if plot_real_data else 'simulated'))
pickle_file = 'averaged_data-{}.pkl'.format(('real' if plot_real_data else 'simulated'))
# If the pickle file doesn't already exist...
if not os.path.exists(os.path.join(pickle_dir, pickle_file)):
# Initialize the arrays of interest
nmatches_holder = np.zeros((nunique_slides, nall_species, nall_species, nslices))
log_dens_pvals_avg = np.zeros((nunique_slides, nall_species, nall_species, 2, nslices))
# log_pmf_pvals_avg = np.zeros((nunique_slides, nall_species, nall_species, max_nbins, 2, nslices))
# Set the log of the P value range for the color plotting
vmin = log_pval_range[0]
vmax = log_pval_range[1]
# For every unique slide...
for islide, slide in enumerate(unique_slides[:]):
print('Averaging slide {}'.format(slide))
# For every possible center species...
for icenter_spec, center_spec in enumerate(all_species_ids[:]):
# For every possible neighbor species...
for ineighbor_spec, neighbor_spec in enumerate(all_species_ids[:]):
# Get a temporary dataframe containing the current slide/center/neighbor combination (this will usually be five rows, one per ROI)
df_tmp = df[(((df['slide_name'] == slide) & (df['center_species_id'] == center_spec) & (df['neighbor_species_id'] == neighbor_spec)))]