#Time_Series.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.width', 1000)
from datetime import datetime
from datetime import date
import re
import time
def TS_All_Proteins(meta_path,
variant_events_directory,
cluster_directory,
start_date,
end_date,
where_ref_path,
time_series_directory,
heatmap_table=True,
n_seq_by_continent=True,
variant_combinations=True,
continent="get_all",
exclude_singletons=True,
cmap="YlGnBu",
**cmap_kwargs):
"""
Performs time series analysis on all proteins in the directory specified for MSA_Reader output.
Arguments
----------
meta_path: path to the metadata for the group of sequences.
variant_events_directory: path to the directory containing the outputs from the MSA_Reader program. This directory contains one subdirectory for each protein and is called "Variant_Events" by default.
cluster_directory: path to the directory containing cluster information files generated during the sequence dereplication step.
start_date: the start date of the analysis. Must be in ISO format, and must be a Sunday to avoid unexpected output.
end_date: the end date of the analysis. Must be in ISO format, and must be a Saturday to avoid unexpected output.
where_ref_path: path to the where_reference.tsv file generated by the script "Find_Reference_Clusters.R".
time_series_directory: desired directory for output. Each dataset will be created as a subdirectory within this directory.
subdirectory names
"Metadata_with_Variants/": metadata is stored in this directory with the added columns "cluster name", "variant combinations", and "number of variants".
"Time_Series_Frequency": time series frequency tables are created in this directory.
"Time_Series_Percent/": time series percentage tables are created in this directory.
optional subdirectories
"Time_Series_Heatmap_Tables/": heatmap tables are created and stored in this directory. May skip creation by setting "heatmap_table" to False.
"Total_Sequences_by_Continent/": data giving total sequences analyzed by collection date and continent is stored in this directory. May skip creation by setting "n_seq_by_continent" to False.
"Variant_Combinations/": data giving the combinations of variants observed in each cluster with their frequency is stored in this directory. May skip creation by setting "variant_combinations" to False.
heatmap_table: (default=True) if False, heatmap tables will not be created. If True, heatmap tables will be created and stored in the "Time_Series_Heatmap_Tables/" subdirectory.
n_seq_by_continent: (default=True) if False, tables for number of sequences by continent and collection date will not be created. If True, tables will be created and stored in the "Total_Sequences_by_Continent/" subdirectory.
variant_combinations: (default=True) if False, tables giving variant combinations with frequency data will not be created. If True, tables will be created and stored in the "Variant_Combinations/" subdirectory.
continent: (default="get_all") if "get_all", time series data for each protein will be calculated separately for all continents and globally. May also be a list of continents, or the string "Global" to calculate only the worldwide analysis.
Accepted list entries for continent: "Africa","Europe","Oceania","North America","South America","Asia","Global" (case sensitive). All other entries will return an error.
exclude_singletons: (default=True) if True, remove clusters of size 1 from the metadata analysis. If False, include all clusters.
cmap: (default="YlGnBu") for the heatmap tables, a string defining the matplotlib color map to use. For possible colormaps, see https://matplotlib.org/tutorials/colors/colormaps.html
cmap_kwargs: optional kwargs to be passed to the heatmap table.
supported kwargs
low: compress lower end of color range by given value
high: compress higher end of color range by given value
vmin: value corresponding to the low end of the colormap range. Values lower than vmin will be colored according to the lower extreme of the colormap.
vmax: value corresponding to the high end of the colormap range. Values higher than vmax will be colored according to the upper extreme of the colormap.
text_color_threshold (default=0.408): threshold for text color. 0 makes all text dark colored, 1 makes all text light colored.
"""
#Define directories within the time series data and create them if they do not already exist
#Metadata and variants
metadata_variants_directory=time_series_directory+"Metadata_with_Variants/"
if os.path.isdir(metadata_variants_directory)==False:
os.mkdir(metadata_variants_directory)
#Frequency tables
time_series_frequency_directory=time_series_directory+"Time_Series_Frequency/"
if os.path.isdir(time_series_frequency_directory)==False:
os.mkdir(time_series_frequency_directory)
#Percentage tables
time_series_percent_directory=time_series_directory+"Time_Series_Percent/"
if os.path.isdir(time_series_percent_directory)==False:
os.mkdir(time_series_percent_directory)
#Heatmap tables (optional but included by default)
if heatmap_table==True:
time_series_htmp_table_directory=time_series_directory+"Time_Series_Heatmap_Tables/"
if os.path.isdir(time_series_htmp_table_directory)==False:
os.mkdir(time_series_htmp_table_directory)
#Number of Sequences by Continent (optional but included by default)
if n_seq_by_continent==True:
n_seq_by_continent_directory=time_series_directory+"Total_Sequences_by_Continent/"
if os.path.isdir(n_seq_by_continent_directory)==False:
os.mkdir(n_seq_by_continent_directory)
#Variant Combinations dataset (optional but included by default)
if variant_combinations==True:
top_combinations_directory=time_series_directory+"Variant_Combinations/"
if os.path.isdir(top_combinations_directory)==False:
os.mkdir(top_combinations_directory)
#If variant_combinations is True, create two directories for frequency and percentage data of combinations, respectively
if variant_combinations==True:
#Variant Combinations TS: Frequency
combos_TS_freq_directory=time_series_directory+"Variant_Combinations_TS_Frequency/"
if os.path.isdir(combos_TS_freq_directory)==False:
os.mkdir(combos_TS_freq_directory)
#Variant Combinations TS: Percentage
combos_TS_percentage_directory=time_series_directory+"Variant_Combinations_TS_Percentage/"
if os.path.isdir(combos_TS_percentage_directory)==False:
os.mkdir(combos_TS_percentage_directory)
#Load reference cluster data
ref_names=find_ref_cluster_name(where_ref_path)
#As long as default file formats are preserved, the MSA reader produces one directory for each protein analyzed.
for protein in os.listdir(variant_events_directory):
#Input files
variant_events_path=variant_events_directory+"{0}/{0}_variants_raw.tsv".format(protein)
#Find Path of Cluster information file for the protein in the directory for cluster information
for file in os.listdir(cluster_directory):
#Match the pattern "<protein>_" which appears at the beginning of cluster information files by default
if re.match(protein+"_",file):
cluster_path=cluster_directory+file
#Reference cluster ID for protein
ref_cluster=ref_names[protein]
#Run Time Series Computation
print("Protein:",protein)
time_series_pipeline(meta_path,
variant_events_path,
cluster_path,
protein,
start_date,
end_date,
ref_cluster,
time_series_directory,
heatmap_table,
n_seq_by_continent,
variant_combinations,
continent,
exclude_singletons,
cmap,
**cmap_kwargs)
print("-"*80)
def time_series_pipeline(meta_path,
variant_events_path,
cluster_path,
protein,
start_date,
end_date,
ref_cluster,
time_series_directory,
heatmap_table=True,
n_seq_by_continent=True,
variant_combinations=True,
continent="Global",
exclude_singletons=True,
cmap="YlGnBu",
**cmap_kwargs):
"""
Creates time series matrices detailing the frequency of variation over time for the defined protein.
May optionally subset data by continent or country.
Arguments
----------
meta_path: path to the metadata file for the sequences being analyzed.
variant_events_path: path to the variant events dataset generated for the protein by the MSA reader. By default, this file ends with "variants_raw.tsv".
cluster_path: path to the cluster information file generated by USEARCH for the protein.
protein: name of the protein being analyzed (string)
start_date: string in ISO date format representing the start date of analysis. Must be a Sunday to avoid unexpected results in date range creation.
end_date: string in ISO date format specifying the end date. Must be a Saturday to avoid unexpected results in date range creation.
ref_cluster: string specifying the cluster ID of the reference cluster, in the format it appears in the cluster information file (e.g. Uniq2).
time_series_directory: root directory for output. Files are written into the following subdirectories within it:
"Metadata_with_Variants/": metadata with variants, written as <protein>_Metadata_with_Variants.tsv.
"Time_Series_Frequency/": frequency table of mutations by week for each continent, written as <protein>_Frequency_<continent>.csv.
"Time_Series_Percent/": percentage table of mutations by week for each continent, written as <protein>_Percent_<continent>.csv.
"Time_Series_Heatmap_Tables/" (used if heatmap_table=True): Excel spreadsheet with the percentage table colored according to values, written as <protein>_Heatmap_Table_<continent>.xlsx.
"Total_Sequences_by_Continent/" (used if n_seq_by_continent=True): table giving the number of sequences by week for each continent, written as <protein>_Total_Seq_by_Continent.csv.
"Variant_Combinations/" (used if variant_combinations=True): table giving combinations of variants for each cluster in decreasing order of cluster size, written as <protein>_Variant_Combinations.tsv.
continent: A string or a list specifying desired subsets based on continent. If "get_all", all continents and a worldwide analysis will be computed. If "Global", no subsetting will be performed (worldwide data only). If a list of continents is passed, time series analysis will be completed just for the continents in the list. Continent names should be capitalized, and "Oceania" should be entered instead of "Australia".
exclude_singletons: if True, clusters with size 1 will be excluded from analysis (default=True).
cmap: matplotlib color map to use for heatmap table (default='YlGnBu'). For possible color maps see https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html
cmap_kwargs: kwargs passed to the Pandas.Styler.Background_Gradient() function during creation of the heatmap table.
-supported kwargs-
low: compress lower end of color range by given value
high: compress higher end of color range by given value
vmin: value corresponding to the low end of the colormap range. Values lower than vmin will be colored according to the lower extreme of the colormap.
vmax: value corresponding to the high end of the colormap range. Values higher than vmax will be colored according to the upper extreme of the colormap.
text_color_threshold (default=0.408): threshold for text color. 0 makes all text dark colored, 1 makes all text light colored.
Returns
----------
None. All output written to file at directories specified.
"""
time_start=time.time()
#Define directory paths (all directory paths will eventually be defined here using the time_series_directory)
#Metadata and variants
metadata_variants_directory=time_series_directory+"Metadata_with_Variants/"
#Frequency tables
time_series_frequency_directory=time_series_directory+"Time_Series_Frequency/"
#Percentage tables
time_series_percent_directory=time_series_directory+"Time_Series_Percent/"
#Heatmap tables (optional but included by default)
if heatmap_table==True:
time_series_htmp_table_directory=time_series_directory+"Time_Series_Heatmap_Tables/"
#Number of Sequences by Continent (optional but included by default)
if n_seq_by_continent==True:
n_seq_by_continent_directory=time_series_directory+"Total_Sequences_by_Continent/"
#Variant Combinations dataset (optional but included by default)
if variant_combinations==True:
top_combinations_directory=time_series_directory+"Variant_Combinations/"
#Variant Combinations TS: Frequency
combos_TS_freq_directory=time_series_directory+"Variant_Combinations_TS_Frequency/"
#Variant Combinations TS: Percentage
combos_TS_percentage_directory=time_series_directory+"Variant_Combinations_TS_Percentage/"
#Return error for incorrect continent entries
if type(continent)==list:
#If a list is passed to continent
for element in continent:
if element not in ["Africa","Europe","Oceania","North America","South America","Asia","Global"]:
raise ValueError('Invalid entry for continent. Please use any of the following entries exactly as they appear:\n"Africa","Europe","Oceania","North America","South America","Asia","Global"\nAlternately, the strings "get_all" and "Global" may be passed.')
else:
if continent not in ["get_all","Global"]:
raise ValueError('Invalid entry for continent. Please enter "get_all", "Global", or a list of desired continents.')
#Create output file paths within entered directories
#Required files
metadata_variants_out=metadata_variants_directory+protein+"_Metadata_with_Variants.tsv"
#Optional files
if n_seq_by_continent==True:
n_seq_by_continent_out=n_seq_by_continent_directory+protein+"_Total_Seq_by_Continent.csv"
else:
n_seq_by_continent_out=None
if variant_combinations==True:
top_combinations_out=top_combinations_directory+protein+"_Variant_Combinations.tsv"
else:
top_combinations_out=None
if variant_combinations==True:
combo_TS_freq_out=combos_TS_freq_directory+protein+"_combo_TS_freq.csv"
combo_TS_percent_out=combos_TS_percentage_directory+protein+"_TS_Combo_Percentage.csv"
#Step 1: generate metadata-cluster pairs
print("Step 1: Link Metadata to Cluster IDs")
metadata_with_clusters=prepare_metadata(cluster_path,meta_path,protein,exclude_singletons)
print()
#Step 2: generate variant list for each cluster
print("Step 2: Link Variants to Cluster IDs")
#UPDATE 2021-03-17: include the reference cluster in the variant list_by_cluster dataset
variants_each_cluster,variant_events,codes=variant_list_by_cluster(variant_events_path,protein,ref_cluster)
print()
#Step 2a: if desired, generate list of top mutations for the protein
if top_combinations_out:
print("Step 2a: Generate Top Mutation Combinations for",protein)
top_combinations=find_top_combinations(variants_each_cluster,cluster_path,protein)
#For output, the lists of variant combinations need to be converted to comma-separated strings to improve readability.
top_combinations["Variants"]=top_combinations["Variants"].apply(lambda x:variant_list_to_string(x))
top_combinations.to_csv(top_combinations_out,sep="\t",chunksize=1000)
print("Variant Combinations for {} written to {}".format(protein,top_combinations_out))
print()
#Step 3: map variant list to metadata and Clean metadata
print("Step 3: Prepare Metadata with Variants Table")
metadata_with_variants=link_meta_to_variants(metadata_with_clusters,variants_each_cluster,ref_cluster,protein)
print("Cleaning Metadata")
standard_date=clean_metadata(metadata_with_variants,variant_events,metadata_variants_out)
print()
#Step 3a: If specified, output a breakdown of sequences by continent over time
if n_seq_by_continent_out:
print("Step 3a: Prepare Table of Total Sequences by Continent Over Time")
total_seq_by_continent(metadata_variants_out,protein,start_date,end_date,n_seq_by_continent_out)
print()
#Subset data for all unique continents and complete steps 4-6 for each.
print("Step 4: Computing Time Series for Each Continent")
#Generate date range for time series creation
start_dates,end_dates=date_range_generator_weekly(start_date,end_date)
print("Start date entered:",start_dates[0].strftime('%Y-%m-%d'),end=". ")
print("End date entered:",end_dates[-1].strftime('%Y-%m-%d'),end=".\n")
print("Analysis covers {} weeks.".format(len(start_dates)))
#If time series for variant combinations is specified, store the list of unique variant combinations
if variant_combinations==True:
var_combos=define_variant_combos(top_combinations_out)
combo_meta=form_combo_metadata(standard_date)
#Set up list of continents to iterate through, adding an option for global
#If get_all is specified for continent, all continents and a global analysis will be computed.
if continent=="get_all":
continent_list=list(standard_date['region'].unique())
continent_list.append("Global")
#The string 'Global' may alternately be entered to print only the global analysis.
elif continent=="Global":
continent_list=["Global"]
#A list of continents may also be passed to the function; each entry will be run in the for loop below.
else:
continent_list=continent
#Progress meter
progress=1
#Progress meter not used if only one continent is entered
if len(continent_list)==1:
print("Generating time series information for {} ({})".format(protein,continent_list[0]))
for continent in continent_list:
#Progress displayed if multiple continents are entered
if len(continent_list)>1:
#Exact wording of progress meter depends on whether analysis is worldwide or for a specific continent
if continent!="Global":
print("Time series info for {} ({}/{})".format(continent,progress,len(continent_list)))
else:
print("Global time series information ({}/{})".format(progress,len(continent_list)))
#Subset the data for the continent for all iterations except for global data
if continent=="Global":
cont_subset=standard_date.copy()
else:
cont_subset=standard_date[standard_date['region']==continent]
#If variant combinations is specified, create a copy of the subset with the variant column as a string.
if variant_combinations==True:
combo_subset=form_combo_metadata(cont_subset)
#Generate filenames for continent-specific outputs
time_series_freq_out=time_series_frequency_directory+protein+"_Frequency_"+continent+".csv"
time_series_percent_out=time_series_percent_directory+protein+"_Percent_"+continent+".csv"
#Heatmap table generation is optional
if heatmap_table==True:
time_series_htmp_table_out=time_series_htmp_table_directory+protein+"_Heatmap_Table_"+continent+".xlsx"
else:
time_series_htmp_table_out=None
#Step 4: Generate time series of mutation events
time_series_cont=time_series_generation(cont_subset,codes,protein,start_dates,end_dates,time_series_freq_out)
#Step 5: Create a percentage table
global_percentage=percentage_table(time_series_cont,time_series_percent_out)
#Step 5a: If time series for variant combinations is specified, compute frequency and percentage TS for the current continent
if variant_combinations==True:
combo_TS_freq=time_series_combo(combo_subset,var_combos,protein,start_dates,end_dates,combo_TS_freq_out)
#Step 5b: Combo Time series percentage table
percentage_table(combo_TS_freq,combo_TS_percent_out)
#Step 6: Create heatmap table from percentage data
if heatmap_table==True:
global_percentage_HT=create_heatmap_table(global_percentage,time_series_htmp_table_out,protein,cmap=cmap,**cmap_kwargs)
#Progress meter: increase by one
progress+=1
time_end=time.time()
print("\nTime Series for {} complete.".format(protein))
print("Frequency data written to",time_series_frequency_directory)
print("Percentage data written to",time_series_percent_directory)
if heatmap_table==True:
print("Heatmap tables written to",time_series_htmp_table_directory)
print("Total time elapsed: {}".format(time_end-time_start))
def extract_id(entry, field_index=3):
"""
When applied to the "Input_ID" column of the USEARCH cluster information dataset, this function will yield the GISAID accession ID.
The "Input_ID" column contains the FASTA header for each sequence, with metadata split by "|" characters.
Arguments:
field_index: As of January 2021, the accession ID is in the fourth field (index=3). If this changes in future metadata, the field index may be specified using this argument.
"""
#Split on the vertical line character and return the field with the accession ID (fourth field by default)
return entry.split("|")[field_index]
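#Example (hypothetical FASTA header; the field layout is assumed from the docstring above):
#extract_id("hCoV-19/country/ID/2020|Spike|2020-03-05|EPI_ISL_000000") returns "EPI_ISL_000000".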
def count_mutation_list(entry):
"""
When applied to the variant list column of the top combinations dataset, count_mutation_list will return the number of variants for each variant list.
"""
return len(entry)
def count_AA_changes(var_list,events):
"""
Given a list of variants, count_AA_Changes() will return the total number of amino acid changes present in the list.
arguments
----------
var_list: a list of variant codes, in the format generated by the MSA_reader() functions.
events: Variant events dataset generated by the MSA_reader() program. This can be loaded from the file <protein name>_variants_raw.tsv in the output folder for the MSA_reader functions.
outputs
----------
aa_change: integer giving the total number of amino acid changes detected in the list.
"""
#Counter for number of amino acid changes
aa_change=0
#Loop over the variant codes, incrementing the counter according to the nature of each mutation
for variant in var_list:
#Get info for variant code: subset event table, then take the first value
#relevant data is the same for all instances of the code in the variant events dataset
code_info=events[events["Code"]==variant].groupby("Code").first()
#Retrieve mutation type and determine if it is a single-residue or a multi-residue event
#Substitutions: all substitutions are single-residue events
if code_info["Type"].values=="sub":
aa_change+=1
#Deletions: if an MSA end position is given, it is a multi-residue deletion. If this value contains a "-", it is a single-residue deletion.
elif code_info["Type"].values=="del":
if code_info["AA_End(MSA)"].values=="-":
aa_change+=1
elif code_info["AA_End(MSA)"].values!="-":
#Determine length of multi-residue deletion: length is equal to the number of non-blank residues in the ref residues column
length=0
for res in str(code_info["Ref Residue(s)"].values[0]): #Entries are stored as one-element lists by default: must convert element to strinng
if res!="-":
length+=1
aa_change+=length
#Insertions: the length of an insertion is determined by the number of non-blank residues in the "Var Residue(s)" column (residues inserted)
elif code_info["Type"].values=="ins":
length=0
for res in str(code_info["Var Residue(s)"].values[0]): #Entries are stored as one-element lists by default: must convert element to strinng
if res!="-":
length+=1
aa_change+=length
#Indels: the length of an indel is defined by the sum of the residues inserted and the residues deleted.
elif code_info["Type"].values=="delins":
length=0
#Count number of insertions (Var residues)
for res in str(code_info["Var Residue(s)"].values[0]): #Entries are stored as one-element lists by default: must convert element to strinng
if res!="-":
length+=1
#Count number of deletions (Ref residues)
for res in str(code_info["Ref Residue(s)"].values[0]):
if res!="-":
length+=1
#Add the number of amino acid changes for this indel to the total
aa_change+=length
#Extensions: the length of an extension is defined by the number of amino acids added to either end
elif code_info["Type"].values=="ext":
#A single-residue extension will have a blank for AA_End.
if code_info["AA_End(MSA)"].values=="-":
aa_change+=1
#Multi-residue extension: count the number of non-blank residues added
elif code_info["AA_End(MSA)"].values!="-":
length=0
#Entries are stored as one-element lists by default: must convert element to string
for res in str(code_info["Var Residue(s)"].values[0]):
if res!="-":
length+=1
#Add amino acid changes for extension to total
aa_change+=length
return aa_change
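#Worked example (hypothetical variant codes and a minimal events table; the real values come from
#the MSA_Reader output, and only the column names below are taken from the code above):
#events_demo=pd.DataFrame({"Code":["sub_demo","del_demo"],
#                          "Type":["sub","del"],
#                          "AA_End(MSA)":["-","70"],
#                          "Ref Residue(s)":["D","HV"],
#                          "Var Residue(s)":["G","--"]})
#count_AA_changes(["sub_demo","del_demo"],events_demo) returns 3 (one substitution plus a two-residue deletion).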
def prepare_metadata(cluster_path,meta_path,protein,exclude_singletons=True):
#Load Cluster Information file and name columns according to the format defined by USEARCH
#https://drive5.com/usearch/manual/cmd_fastx_uniques.html
print("Loading cluster information for {} at {}".format(protein,cluster_path))
cluster_info=pd.read_csv(cluster_path,sep="\t",header=None)
n_entries=len(cluster_info)
print("Cluster entries read:",n_entries)
#Rename cluster name column to "Cluster_Name_<protein>" (specifies on the merged metadata that the cluster assignments are protein-specific)
cluster_column="Cluster_Name_"+protein
cluster_info.columns=["Input_ID",cluster_column,"Cluster_num","Member_num","Cluster_Size","Target_Seq"]
print("Loading metadata at {}".format(meta_path))
#Load Metadata File
metadata=pd.read_csv(meta_path,sep="\t",dtype="object")
print("Metadata entries read:",len(metadata))
#If exclude_singletons is True, filter the cluster information file to remove singleton clusters
if exclude_singletons==True:
cluster_info=cluster_info[cluster_info["Cluster_Size"]>=2]
n_no_singletons=len(cluster_info)
print("Excluding {} singletons from cluster info. Entries remaining: {}".format(n_entries-n_no_singletons,n_no_singletons))
#The common column between the metadata and the cluster information file is the GISAID accession ID.
#Extract GISAID_EPI_ISL IDs from cluster information file, and create a new column named "gisaid_epi_isl" (must match metadata)
cluster_info["gisaid_epi_isl"]=cluster_info["Input_ID"].apply(lambda x:extract_id(x))
#Select for cluster name and the new column
clustmap=cluster_info[["gisaid_epi_isl",cluster_column]]
#Perform an inner join (merge function in pandas) between cluster information and metadata
metadata_with_clusters=metadata.merge(clustmap,how="inner",on="gisaid_epi_isl")
print("Number of sequences with metadata and cluster information:",len(metadata_with_clusters))
#Return metadata-cluster pairs for use in time series pipeline
return metadata_with_clusters
def variant_list_by_cluster(variant_events_path, protein, ref_cluster):
#Form filename for variant events dataset using protein name and load dataset
print("Loading Variant Data File at",variant_events_path)
variant_events=pd.read_csv(variant_events_path,sep="\t")
#Store all unique variant codes observed
codes=list(variant_events['Code'].unique())
#Store all unique cluster IDs
clusters=list(variant_events['Cluster_ID'].unique())
print("Number of unique mutations: {}.".format(len(codes)))
print("Number of clusters: {}.".format(len(clusters)))
#Store variants associated with each cluster
#Create an empty list for storing the variants associated with each cluster, which will be
#zipped with the list of unique clusters being used as the iterable to create a new dataframe.
list_of_variant_lists=[]
#Form for loop using clusters variable from earlier
for i in range(len(clusters)):
#Progress meter: update every fifth cluster
if (i+1)%5==0:
print("Store variants for cluster IDs: {} of {}. {:.1%} complete.".format(i+1,len(clusters),(i+1)/len(clusters)),sep="",end="\r")
#Subset database for ith cluster id
sub=variant_events[variant_events['Cluster_ID']==clusters[i]]
#Extract the 'Code' column from the subset, which will form an array with the variant code at each row.
#Convert the array to a list so it can be indexed
sub_codes=list(sub["Code"])
#For each row in the subset, add the variant code to the list of mutations observed in cluster i
variant_list=[] #Create empty list
for j in range(len(sub)):
#For each row j, store the variant code at position j
var_code=sub_codes[j]
#Add the variant code to the list
variant_list.append(var_code)
#Add the variant list for cluster i to the list
list_of_variant_lists.append(variant_list)
print("\rStore variants for cluster IDs: {} of {}. {:.1%} complete.".format(len(clusters),len(clusters),1.00),sep="")
#The list of lists of variants for each cluster will be zipped to the list of cluster ID's, and a dataframe will be created
data=list(zip(clusters,list_of_variant_lists))
#Add reference cluster information to the list of variants by cluster
#Tuple adds the ref cluster ID and its corresponding empty list of variants to the data
data.append((ref_cluster,[]))
#Ref cluster is added to the end. Next, sort the data in increasing order by cluster number
#Lambda function pulls out the first element in the tuple, then takes the text after "Uniq" which is the cluster number.
data=sorted(data,key=lambda x:int(x[0].split("Uniq")[1]))
#Set name of column storing cluster info so it can be mapped to the corresponding cluster in the metadata.
cluster_colname="Cluster_Name_"+protein
#Create Pandas df
variants_each_cluster=pd.DataFrame(data,columns=[cluster_colname,'Variants'])
#Add number of amino acid changes in each cluster to the new dataframe
print("Adding number of variants column to variants by cluster dataset.")
tic=time.time()
variants_each_cluster["Number_of_Variants"]=variants_each_cluster["Variants"].apply(lambda x:count_AA_changes(x,variant_events))
toc=time.time()
print("Complete. Time elapsed:",toc-tic,"seconds.")
#Return the created dataset, along with the variant events table and the list of unique codes, which are used in subsequent steps.
return variants_each_cluster,variant_events,codes
def find_top_combinations(variants_each_cluster,cluster_path,protein):
#Create dataset mapping cluster name to cluster size
#Load cluster information file from USEARCH and add column names
cluster_info=pd.read_csv(cluster_path,sep="\t",header=None)
cluster_info.columns=["Input_ID","Cluster_ID","Cluster_num","Member_num","Cluster_Size","Target_Seq"]
#Remove singletons and return one row for each cluster (one row for each sequence is given by default)
cluster_info=cluster_info[cluster_info["Cluster_Size"]>=2].groupby("Cluster_num").first()
#Select cluster ID and cluster size columns
size_map=cluster_info[["Cluster_ID","Cluster_Size"]]
#Add Cluster Size to variant list dataset
variants_each_cluster=variants_each_cluster.rename(columns={'Cluster_Name_{}'.format(protein): 'Cluster_ID'})
top_combinations=variants_each_cluster.merge(size_map,how="inner",on="Cluster_ID")
top_combinations=top_combinations.fillna(value=" ")
#Add Column for number of mutations in cluster
#top_combinations["Number_of_Variants"]=top_combinations["Variants"].apply(lambda x:count_AA_changes(x,variant_events))
return top_combinations
def find_ref_cluster_name(where_ref_path):
"""
Taking a 'where_reference.tsv' produced by the R script in 'reference_finder_script.Rmd', find_ref_cluster_name
will map each protein to the **name** (not index) of its reference cluster.
where_ref_path: Path to the 'where_reference.tsv' file.
"""
#Create an empty dictionary for storing the names of the reference clusters
names=dict()
#Open the where_reference.tsv file
with open(where_ref_path,"r") as file:
#Create a list with one element for each line in the input file
lines=file.readlines()
#Remove newline characters in input
for i in range(len(lines)):
lines[i]=lines[i].strip()
#For each protein, extract protein name, and the name of the cluster with the ref. genome location in the corresponding protein
for line in lines:
#Extract the protein name (first tab-separated field)
protein=line.split(sep="\t")[0]
#Extract the cluster name corresponding to the location of the ref. genome in the protein (second field)
cluster=line.split(sep="\t")[1]
#Store the cluster name in the dictionary 'names' with the protein name as the key
names[protein]=cluster
return names
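#Example of the expected input format (assumed from the parsing above): each line of
#where_reference.tsv is tab-separated as "<protein>\t<cluster name>", so a hypothetical line
#"Spike\tUniq1" would yield the dictionary entry {"Spike": "Uniq1"}.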
def link_meta_to_variants(metadata_with_clusters,variants_each_cluster,ref_cluster,protein):
#Define name of cluster column in the metadata-cluster dataframe
cluster_colname="Cluster_Name_"+protein
#Join the variant list dataframe to the metadata dataframe, using cluster_colname from earlier
metadata_with_variants=pd.merge(left=metadata_with_clusters,right=variants_each_cluster,on=cluster_colname,how='left')
#Check for any clusters without variants outside of the reference cluster (should be zero entries)
meta_outside_reference=len(metadata_with_variants[(metadata_with_variants['Variants'].isna()) & (metadata_with_variants[cluster_colname]!=ref_cluster)])
if meta_outside_reference!=0:
print("Warning: Clusters with zero variants detected outside of the reference cluster. \nPlease check the -tabbedout output from USEARCH for inconsistencies with the .FASTA output file.")
return metadata_with_variants
def ISO_date_filter(metadata_with_variants):
"""
Previous investigation of GISAID 'Next Metadata' shows that only dates in the format YYYY-mm-dd (ISO format) have information on the day the sequence was collected. Having the day of sequence collection is required for this analysis, which bins time into weeks.
ISO_date_filter uses a regular expression to keep only collection dates that match ISO format, excluding all entries not in this format.
"""
print("Filtering metadata with variants for entries with a collection day specified.")
pattern="\d{4}-\d{2}-\d{2}"
#Re.search() returns None if there is no match, and an re.match object if there is a match.
#pd.notna() will return True for the rows that match the expression.
standard_date=metadata_with_variants[metadata_with_variants.date.apply(lambda x: pd.notna(re.search(pattern,x)))]
#Report the number of non-standard dates removed
n_removed=len(metadata_with_variants)-len(standard_date)
print("Excluded {} entries with no defined collection day. Entries remaining: {}".format(n_removed,len(standard_date)))
return standard_date
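#Example: an entry with date "2020-03" (no collection day) is excluded, while "2020-03-15" is kept.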
def clean_metadata(metadata_with_variants,variant_events,metadata_variants_out=None):
"""
clean_metadata takes the metadata linked to variants and processes it into a format that can be used to generate a time series matrix.
Entries will be filtered for collection dates with a defined day of collection, the collection date column will be converted to a datetime object, and NaN values for the variants will be filled with an empty list.
Arguments
metadata_variants_out: may specify a path here for writing out the cleaned metadata with variants dataset.
"""
#Filter entries for defined day of collection
standard_date=ISO_date_filter(metadata_with_variants)
#Convert date sequenced ('date' column) from string to datetime object
standard_date=standard_date.copy()
standard_date['date']=pd.to_datetime(standard_date['date'],format="%Y-%m-%d")
#If the Variants cell is empty, fill it with an empty list. Otherwise, leave it as is.
fill_nans=lambda x:[] if x is np.nan else x
standard_date['Variants']=standard_date['Variants'].apply(fill_nans)
#The number of variants column must also be filled for sequences with no variants (zeros are added instead of an empty list)
standard_date["Number_of_Variants"]=standard_date["Number_of_Variants"].fillna(0)
#Write cleaned file to csv if specified
if metadata_variants_out:
print("Writing cleaned metadata with variants to {}.".format(metadata_variants_out))
#Create a copy of the standard_date table and change the variants column to a comma-separated string for output
meta_var_table=standard_date.copy()
meta_var_table["Variants"]=meta_var_table["Variants"].apply(lambda x:variant_list_to_string(x))
time_1=time.time()
print("Creation of CSV file")
meta_var_table.to_csv(metadata_variants_out,sep="\t",chunksize=1000)
time_2=time.time()
print("Complete. Time elapsed:",time_2-time_1,"seconds.")
return standard_date
def date_range_generator_weekly(start_date,end_date):
"""
date_range_generator will create date "bins" on weekly intervals given a start date and an end date.
The function creates i bins, with i being the number of weeks covered between start_date and end_date. The bins are stored in separate lists: start_dates, which contains the starting day of bin i (always a sunday) and end_dates, which contains the ending day of bin i (always a saturday).
Inputs: must be in ISO format (YYYY-mm-dd) for the function to work.
start_date: should be a Sunday. Other days will be accepted, but will generate unexpected results (intervals will begin at the nearest Sunday to the date entered).
end_date: should be a Saturday. As with start_date, other days will be accepted but will generate unexpected results.
"""
from datetime import date
from datetime import timedelta
#Print a warning if start_date is not a sunday and/or end_date is not a saturday
#datetime.weekday() will return day of week as a number. Sunday==6, Saturday==5.
if date.fromisoformat(start_date).weekday()!=6:
print("Warning: start date specified is not a Sunday. Time interval creation may yield unexpected results.")
if date.fromisoformat(end_date).weekday()!=5:
print("Warning: end date specified is not a Saturday. Time interval creation may yield unexpected results.")
#Creation of start_dates and end_dates lists
#Convert to datetime object to compute on entered dates
start_date=date.fromisoformat(start_date)
end_date=date.fromisoformat(end_date)
#Start_dates
#The start of the first bin is the start date passed to the function
#The start of the last bin is 6 days before the end_date (end of the last bin)
six_days=timedelta(days=6)
start_final=end_date-six_days
#Create start_dates list
start_dates=pd.date_range(start=start_date,end=start_final,freq="W-SUN")
#End_dates
#The end of the first bin is the start date entered plus 6 days
end_initial=start_date+six_days
#The end of the last bin is the end date passed to the function
#Create end_dates list
end_dates=pd.date_range(start=end_initial,end=end_date,freq="W-SAT")
return start_dates,end_dates
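#Example (a two-week range starting on a Sunday and ending on a Saturday):
#start_dates,end_dates=date_range_generator_weekly("2021-01-03","2021-01-16")
#start_dates -> ['2021-01-03','2021-01-10'] and end_dates -> ['2021-01-09','2021-01-16'].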
def variant_list_to_string(var_list):
"""
Changes cluster variant combinations into a format readable to external scripts.
"""
delim=","
return(delim.join(var_list))
def to_1D(series):
"""
Code from https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
Collapses a series of lists into a one-dimensional series, allowing for counts on the values
"""
return pd.Series([x for _list in series for x in _list],dtype='object')
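#Example: to_1D(pd.Series([["A","B"],["B"]])) yields pd.Series(["A","B","B"]);
#calling .value_counts() on the result then gives B:2, A:1, which is how weekly variant
#frequencies are tallied in time_series_generation() below.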
def time_series_generation(standard_date,codes,protein,start_dates,end_dates,time_series_freq_out=None,verbose=False):
#The list 'data' below will store each column of the time series analysis.
#Each element in the list is a list of the frequencies of each variant for the corresponding week.
data=[]
#Create list for storing column names for the time series DataFrame
column_names=[]
#Create list of rownames for dataframe
zero_column="Zero_Mutations_in_"+protein
row_names=['Total_Genomes',zero_column,*codes]
#Iterate through each week in the start-end dates lists
for w in range(len(start_dates)):
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(w+1,len(start_dates),(w+1)/len(start_dates)),end="")
#Form the dataset for week w.
week_w=standard_date[(standard_date['date']>=start_dates[w]) & (standard_date['date']<=end_dates[w])]
#Store number of sequences in week w with zero variants relative to reference
no_mutation_count=len(week_w[week_w["Number_of_Variants"]==0])
#Create list of variant frequencies for week w
#code_freq will store the frequency of every mutation in the list of mutation codes.
code_freq=[]
#Before iteration, prepare series mapping mutation codes to the freq. of appearance in the sequences.
week_w_variant_freq=to_1D(week_w["Variants"]).value_counts()
#Iteratively check all unique mutation codes against the series.
for j in range(len(codes)):
#If the code is present in the series, record the freq. in the code freq. list.
if codes[j] in week_w_variant_freq:
code_count=week_w_variant_freq[codes[j]]
code_freq.append(code_count)
else:
code_freq.append(0)
#Formation of column of frequency counts for week w
#Add the count of no mutations to the beginning of the code_freq list
code_freq.insert(0,no_mutation_count)
#Add total number of sequences to the beginning of the code_freq list (Before the count of no mutations)
code_freq.insert(0,len(week_w))
#The code_freq list for week w will be appended to the data list
data.append(code_freq)
#Create the column name for week w and append to column_names
name="Week{} ({}-{})".format(w+1,start_dates[w].strftime('%m/%d/%Y'),end_dates[w].strftime('%m/%d/%Y'))
column_names.append(name)
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(len(start_dates),len(start_dates),1.000),end="")
#line below erases progress meter once complete
print(" "*100+"\r",end="")
#Create Pandas DataFrame from the data
#Form a np.array first, then transpose (each inner list holds one week's counts and should become a column, but np.array treats each list as a row)
data=np.array(data,dtype="int64").transpose()
time_series_global=pd.DataFrame(data=data,index=row_names,columns=column_names)
#Write DataFrame to CSV
if verbose==True:
print("Writing Time Series Frequency Dataframe to {}.".format(time_series_freq_out))
time_series_global.to_csv(time_series_freq_out,sep=",",chunksize=1000)
return time_series_global
def percentage_table(time_series_global,time_series_percent_out,verbose=False):
def normalize_to_percentage(df):
"""
This function is used only in the creation of the percentage table. Each frequency value is divided by the number of sequences analyzed to yield the percentage of sequences containing each variant.
"""
#For every column in the time series data, divide all values in the column by the first value (total sequences analyzed)
for i in range(0,df.shape[1]):
#It is possible to have division by zero if zero genomes were included in a given week. This case must be caught to avoid undefined values.
if df.iloc[0,i]==0:
#If total genomes==0, all cells in the column will be set to zero.
for j in range(df.shape[0]):
df.iloc[j,i]=0
else:
df.iloc[:,i]=df.iloc[:,i]/df.iloc[0,i]
#Remove total genomes row when finished
df=df.iloc[1:,:]
return df
global_percentage=normalize_to_percentage(time_series_global)
global_percentage.to_csv(time_series_percent_out,sep=",",chunksize=1000)
if verbose==True:
print("Time series percentage dataframe written to {}.".format(time_series_percent_out))
return global_percentage
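#Worked example: for a week with Total_Genomes=200 and a variant counted 50 times, the
#percentage table holds 0.25 for that variant (values are fractions; they are rendered as
#percentages when the heatmap table is formatted below), and the Total_Genomes row is dropped.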
def create_heatmap_table(global_percentage,time_series_HT_out,protein,cmap,verbose=False,**cmap_kwargs):
"""
Arguments
----------
global_percentage: dataset with percentage values for unique variants.
time_series_HT_out: path for writing out heatmap table.
cmap: a matplotlib colormap (see https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html for options).
cmap_kwargs: supported kwargs
low: compress lower end of color range by given value
high: compress higher end of color range by given value
vmin: value corresponding to the low end of the colormap range. Values lower than vmin will be colored according to the lower extreme of the colormap.
vmax: value corresponding to the high end of the colormap range. Values higher than vmax will be colored according to the upper extreme of the colormap.
text_color_threshold (default=0.408): threshold for text color. 0 makes all text dark colored, 1 makes all text light colored.
returns
----------
global_percentage_HT: a pandas Styler object containing the percentage table with formatted values and background coloring.
"""
#Show percentage values with 5 decimal places and color cells according to their values using the selected colormap
global_percentage_HT=global_percentage.style.format("{0:.5%}").background_gradient(cmap,**cmap_kwargs)
global_percentage_HT.to_excel(time_series_HT_out,sheet_name="{} - Global".format(protein))
if verbose==True:
print("Heatmap table written to {}.".format(time_series_HT_out))
return global_percentage_HT
def total_seq_by_continent(meta_clean_path,protein,start_date,end_date,output_path=None):
"""
Returns a table giving the number of sequences in the analysis by week and by continent.
Arguments
----------
meta_clean_path: path of the cleaned metadata with variants file.
protein: name of protein (string).
start_date: the beginning date for weekly intervals. Must be in ISO format, and should be a Sunday to avoid unexpected interval creation.
end_date: the end date for weekly intervals. Must be in ISO format, and should be a Saturday to avoid unexpected interval creation.
output_path: path for writing dataframe; if None, output will not be written (default=None)
Returns
----------
seq_continent: a Pandas DataFrame, giving the number of sequences analyzed by week, both worldwide and broken down by continent.
"""
#Generate date range
start_dates,end_dates=date_range_generator_weekly(start_date,end_date)
#Load metadata
print("Loading metadata at",meta_clean_path)
metadata_clean=pd.read_csv(meta_clean_path,sep="\t",parse_dates=['date'],infer_datetime_format=True,dtype="object")
#Generate column names from dates entered
column_names=[]
for w in range(0,len(start_dates)):
#Create the column name for week w and append to column_names
name="Week{} ({}-{})".format(w+1,start_dates[w].strftime('%m/%d/%Y'),end_dates[w].strftime('%m/%d/%Y'))
column_names.append(name)
#Row names: one for each unique continent in the 'region' column of the metadata, plus one for global totals
row_names=list(metadata_clean.region.unique())
row_names.append("Worldwide")
#Create list for storing data
data=[]
#Progress meter: use a counter that increments by one for each iteration in the region for loop
progress=0
total_items=len(row_names)
#For each unique region (continent) name, perform the following:
for continent in row_names:
print("Computing Sequence Counts by Continent: {} of {}.\r".format(progress,total_items),sep="",end="")
#Begin by subsetting metadata with variants by region
if continent!="Worldwide":
region_subset=metadata_clean[metadata_clean["region"]==continent]
#Must reset "region_subset" to metadata_clean when "worldwide" is entered (otherwise the subset will be for the last continent)
elif continent=="Worldwide":
region_subset=metadata_clean
#Generate list for storing the weekly number of genomes for the continent
n_genomes_total=[]
#For each week w in the date range, subset the dataset for that week and record the number of entries found.
for w in range(0,len(start_dates)):
region_subset_w=region_subset[(region_subset['date']>=start_dates[w]) & (region_subset['date']<=end_dates[w])]
n_genomes_w=len(region_subset_w)
#Add number of genomes for week w to the list
n_genomes_total.append(n_genomes_w)
data.append(n_genomes_total)
#Progress meter: increment by one
progress+=1
print("Computing Sequence Counts by Continent: {} of {}.".format(total_items,total_items),sep="",end="\n")
#Create dataframe from total
seq_continent=pd.DataFrame(data,index=row_names,columns=column_names)
#Write dataframe to file
if output_path:
seq_continent.to_csv(output_path,chunksize=1000)
print("Output written to",output_path,sep=" ")
return seq_continent
def define_variant_combos(variant_combinations_path):
"""
Reads the variant combinations file, which contains all unique combinations in the dataset. Returns the unique variant combinations as a list.
"""
#Load variant combinations dataset, extract "Variants" column, and store result as a list
var_combos=list(pd.read_csv(variant_combinations_path,sep="\t")["Variants"])
return var_combos
def form_combo_metadata(meta_clean):
"""
Takes the cleaned metadata used for time series computations of individual variants and converts the lists in the Variants column to strings so combinations can be compared.
"""
combo_meta=meta_clean.copy()
combo_meta["Variants"]=combo_meta["Variants"].apply(lambda x:variant_list_to_string(x))
return combo_meta
def time_series_combo(combo_meta,var_combos,protein,start_dates,end_dates,combo_TS_freq_out=None,verbose=False):
#The list 'data' below will store each column of the time series analysis.
#Each element in the list is a list of the frequencies of each variant combination for the corresponding week.
data=[]
#Create list for storing column names for the time series DataFrame
column_names=[]
#Create list of rownames for dataframe
row_names=['Total_Genomes',"Zero_Variants",*var_combos]
#Iterate through each week in the start-end dates lists
for w in range(len(start_dates)):
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(w+1,len(start_dates),(w+1)/len(start_dates)),end="")
#Subset dataset for week w.
week_w=combo_meta[(combo_meta['date']>=start_dates[w]) & (combo_meta['date']<=end_dates[w])]
#Store number of sequences in week w with zero variants relative to reference
no_mutation_count=len(week_w[week_w["Number_of_Variants"]==0])
#Create list for storing frequencies of each variant combo during week w
combo_freq=[]
#Use value_counts() to create a series with the frequency of each variant combo in the subset for week w
week_w_combo_freq=week_w["Variants"].value_counts()
#Iteratively check all unique variant combinations against the series.
for j in range(len(var_combos)):
#If the combination is present in the series, record the freq. in the combo freq. list.
if var_combos[j] in week_w_combo_freq:
combo_count=week_w_combo_freq[var_combos[j]]
combo_freq.append(combo_count)
else:
combo_freq.append(0)
#Formation of column of frequency counts for week w
#Add the count of no mutations to the beginning of the combo_freq list
combo_freq.insert(0,no_mutation_count)
#Add total number of sequences to the beginning of the combo_freq list (before the count of no mutations)
combo_freq.insert(0,len(week_w))
#The combo_freq list for week w will be appended to the data list
data.append(combo_freq)
#Create the column name for week w and append to column_names
name="Week{} ({}-{})".format(w+1,start_dates[w].strftime('%m/%d/%Y'),end_dates[w].strftime('%m/%d/%Y'))
column_names.append(name)
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(len(start_dates),len(start_dates),1.000),end="")
#line below erases progress meter once complete
print(" "*100+"\r",end="")
#Create Pandas DataFrame from the data
#Form a np.array first, then transpose (each list represents a column, and default is for each list to represent a row)
data=np.array(data,dtype="int64").transpose()
combo_TS_freq=pd.DataFrame(data=data,index=row_names,columns=column_names)
#Write DataFrame to CSV
if verbose==True:
print("Writing Time Series Frequency Dataframe to {}.".format(combo_TS_freq_out))
combo_TS_freq.to_csv(combo_TS_freq_out,sep=",",chunksize=1000)
return combo_TS_freq