#Time_Series.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.width', 1000)
from datetime import datetime
from datetime import date
import re
import time
def TS_All_Proteins(meta_path,
variant_events_directory,
cluster_directory,
start_date,
end_date,
where_ref_path,
time_series_directory,
heatmap_table=True,
n_seq_by_continent=True,
variant_combinations=True,
continent="get_all",
exclude_singletons=True,
cmap="YlGnBu",
**cmap_kwargs):
"""
Performs time series analysis on all proteins in the directory specified for MSA_Reader output.
Arguments
----------
meta_path: path to the metadata for the group of sequences.
variant_events_directory: path to the directory containing the outputs from the MSA_Reader program. This directory contains one subdirectory for each protein and is called "Variant_Events" by default.
cluster_directory: path to the directory containing cluster information files generated during the sequence dereplication step.
start_date: the start date of the analysis. Must be in ISO format, and must be a Sunday to avoid unexpected output.
end_date: the end date of the analysis. Must be in ISO format, and must be a Saturday to avoid unexpected output.
where_ref_path: path to the where_reference.tsv file generated by the script "Find_Reference_Clusters.R".
time_series_directory: desired directory for output. Each dataset will be created as a subdirectory within this directory.
subdirectory names
"Metadata_with_Variants/": metadata is stored in this directory with the added columns "cluster name", "variant combinations", and "number of variants".
"Time_Series_Frequency": time series frequency tables are created in this directory.
"Time_Series_Percent/": time series percentage tables are created in this directory.
optional subdirectories
"Time_Series_Heatmap_Tables/": heatmap tables are created and stored in this directory. May skip creation by setting "heatmap_table" to False.
"Total_Sequences_by_Continent/": data giving total sequences analyzed by collection date and continent is stored in this directory. May skip creation by setting "n_seq_by_continent" to False.
"Variant_Combinations/": data giving the combinations of variants observed in each cluster with their frequency is stored in this directory. May skip creation by setting "variant_combinations" to False.
heatmap_table: (default=True) if False, heatmap tables will not be created. If True, heatmap tables will be created and stored in the "Time_Series_Heatmap_Tables/" subdirectory.
n_seq_by_continent: (default=True) if False, tables for number of sequences by continent and collection date will not be created. If True, tables will be created and stored in the "Total_Sequences_by_Continent/" subdirectory.
variant_combinations: (default=True) if False, tables giving variant combinations with frequency data will not be created. If True, tables will be created and stored in the "Variant_Combinations/" subdirectory.
continent: (default="get_all") if "get_all", time series data for each protein will be calculated separately for all continents and globally. May also be a list of continents, or the string "Global" to calculate only the worldwide analysis.
Accepted list entries for continent: "Africa","Europe","Oceania","North America","South America","Asia","Global" (case sensitive). All other entries will return an error.
exclude_singletons: (default=True) if True, remove clusters of size 1 from the metadata analysis. If False, include all clusters.
cmap: (default="YlGnBu") for the heatmap tables, a string defining the matplotlib color map to use. For possible colormaps, see https://matplotlib.org/tutorials/colors/colormaps.html
cmap_kwargs: optional kwargs to be passed to the heatmap table.
supported kwargs
low: compress lower end of color range by given value
high: compress higher end of color range by given value
vmin: value corresponding to the low end of the colormap range. Values lower than vmin will be colored according to the lower extreme of the colormap.
vmax: value corresponding to the high end of the colormap range. Values higher than vmax will be colored according to the upper extreme of the colormap.
text_color_threshold (default=0.408): threshold for text color. 0 makes all text dark colored, 1 makes all text light colored.
"""
#Define directories within the time series data and create them if they do not already exist
#Metadata and variants
metadata_variants_directory=time_series_directory+"Metadata_with_Variants/"
if os.path.isdir(metadata_variants_directory)==False:
os.mkdir(metadata_variants_directory)
#Frequency tables
time_series_frequency_directory=time_series_directory+"Time_Series_Frequency/"
if os.path.isdir(time_series_frequency_directory)==False:
os.mkdir(time_series_frequency_directory)
#Percentage tables
time_series_percent_directory=time_series_directory+"Time_Series_Percent/"
if os.path.isdir(time_series_percent_directory)==False:
os.mkdir(time_series_percent_directory)
#Heatmap tables (optional but included by default)
if heatmap_table==True:
time_series_htmp_table_directory=time_series_directory+"Time_Series_Heatmap_Tables/"
if os.path.isdir(time_series_htmp_table_directory)==False:
os.mkdir(time_series_htmp_table_directory)
#Number of Sequences by Continent (optional but included by default)
if n_seq_by_continent==True:
n_seq_by_continent_directory=time_series_directory+"Total_Sequences_by_Continent/"
if os.path.isdir(n_seq_by_continent_directory)==False:
os.mkdir(n_seq_by_continent_directory)
#Variant Combinations dataset (optional but included by default)
if variant_combinations==True:
top_combinations_directory=time_series_directory+"Variant_Combinations/"
if os.path.isdir(top_combinations_directory)==False:
os.mkdir(top_combinations_directory)
#If variant_combinations is True, create two directories for frequency and percentage data of combinations, respectively
if variant_combinations==True:
#Variant Combinations TS: Frequency
combos_TS_freq_directory=time_series_directory+"Variant_Combinations_TS_Frequency/"
if os.path.isdir(combos_TS_freq_directory)==False:
os.mkdir(combos_TS_freq_directory)
#Variant Combinations TS: Percentage
combos_TS_percentage_directory=time_series_directory+"Variant_Combinations_TS_Percentage/"
if os.path.isdir(combos_TS_percentage_directory)==False:
os.mkdir(combos_TS_percentage_directory)
#Load reference cluster data
ref_names=find_ref_cluster_name(where_ref_path)
#As long as default file formats are preserved, the MSA reader produces one directory for each protein analyzed.
for protein in os.listdir(variant_events_directory):
#Input files
variant_events_path=variant_events_directory+"{0}/{0}_variants_raw.tsv".format(protein)
#Find Path of Cluster information file for the protein in the directory for cluster information
for file in os.listdir(cluster_directory):
#Match the pattern "<protein>_" which appears at the beginning of cluster information files by default
if re.match(protein+"_",file):
cluster_path=cluster_directory+file
#Reference cluster ID for protein
ref_cluster=ref_names[protein]
#Run Time Series Computation
print("Protein:",protein)
time_series_pipeline(meta_path,
variant_events_path,
cluster_path,
protein,
start_date,
end_date,
ref_cluster,
time_series_directory,
heatmap_table,
n_seq_by_continent,
variant_combinations,
continent,
exclude_singletons,
cmap,
**cmap_kwargs)
print("-"*80)
def time_series_pipeline(meta_path,
variant_events_path,
cluster_path,
protein,
start_date,
end_date,
ref_cluster,
time_series_directory,
heatmap_table=True,
n_seq_by_continent=True,
variant_combinations=True,
continent="Global",
exclude_singletons=True,
cmap="YlGnBu",
**cmap_kwargs):
"""
Creates time series matrices detailing the frequency of variation over time for the defined protein.
May optionally subset data by continent or country.
Arguments
----------
meta_path: path to the metadata file for the sequences being analyzed.
variant_events_path: path to the variant events dataset generated for the protein by the MSA reader. By default, this file ends with "variants_raw.tsv".
cluster_path: path to the cluster information file generated by USEARCH for the protein.
protein: name of the protein being analyzed (string)
start_date: string in ISO date format representing the start date of analysis. Must be a Sunday to avoid unexpected results in date range creation.
end_date: string in ISO date format specifying the end date. Must be a Saturday to avoid unexpected results in date range creation.
ref_cluster: string specifying the cluster ID of the reference cluster, in the format it appears in the cluster information file (e.g. Uniq2).
time_series_directory: root directory for output. Files are written into the following subdirectories within it:
"Metadata_with_Variants/": metadata with variants, written as <protein>_Metadata_with_Variants.tsv.
"Time_Series_Frequency/": frequency table of mutations by week for each continent, written as <protein>_Frequency_<continent>.csv.
"Time_Series_Percent/": percentage table of mutations by week for each continent, written as <protein>_Percent_<continent>.csv.
"Time_Series_Heatmap_Tables/" (used if heatmap_table=True): Excel spreadsheet with the percentage table colored according to values, written as <protein>_Heatmap_Table_<continent>.xlsx.
"Total_Sequences_by_Continent/" (used if n_seq_by_continent=True): table giving the number of sequences by week for each continent, written as <protein>_Total_Seq_by_Continent.csv.
"Variant_Combinations/" (used if variant_combinations=True): table giving combinations of variants for each cluster in decreasing order of cluster size, written as <protein>_Variant_Combinations.tsv.
continent: A string or a list specifying desired subsets based on continent. If "get_all", all continents and a worldwide analysis will be computed. If "Global", no subsetting will be performed (worldwide data only). If a list of continents is passed, time series analysis will be completed just for the continents in the list. Continent names should be capitalized, and "Oceania" should be entered instead of "Australia".
exclude_singletons: if True, clusters with size 1 will be excluded from analysis (default=True).
cmap: matplotlib color map to use for heatmap table (default='YlGnBu'). For possible color maps see https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html
cmap_kwargs: kwargs passed to the Pandas.Styler.Background_Gradient() function during creation of the heatmap table.
-supported kwargs-
low: compress lower end of color range by given value
high: compress higher end of color range by given value
vmin: value corresponding to the low end of the colormap range. Values lower than vmin will be colored according to the lower extreme of the colormap.
vmax: value corresponding to the high end of the colormap range. Values higher than vmax will be colored according to the upper extreme of the colormap.
text_color_threshold (default=0.408): threshold for text color. 0 makes all text dark colored, 1 makes all text light colored.
Returns
----------
None. All output written to file at directories specified.
"""
time_start=time.time()
#Define directory paths (all directory paths will eventually be defined here using the time_series_directory)
#Metadata and variants
metadata_variants_directory=time_series_directory+"Metadata_with_Variants/"
#Frequency tables
time_series_frequency_directory=time_series_directory+"Time_Series_Frequency/"
#Percentage tables
time_series_percent_directory=time_series_directory+"Time_Series_Percent/"
#Heatmap tables (optional but included by default)
if heatmap_table==True:
time_series_htmp_table_directory=time_series_directory+"Time_Series_Heatmap_Tables/"
#Number of Sequences by Continent (optional but included by default)
if n_seq_by_continent==True:
n_seq_by_continent_directory=time_series_directory+"Total_Sequences_by_Continent/"
#Variant Combinations dataset (optional but included by default)
if variant_combinations==True:
top_combinations_directory=time_series_directory+"Variant_Combinations/"
#Variant Combinations TS: Frequency
combos_TS_freq_directory=time_series_directory+"Variant_Combinations_TS_Frequency/"
#Variant Combinations TS: Percentage
combos_TS_percentage_directory=time_series_directory+"Variant_Combinations_TS_Percentage/"
#Return error for incorrect continent entries
if type(continent)==list:
#If a list is passed to continent
for element in continent:
if element not in ["Africa","Europe","Oceania","North America","South America","Asia","Global"]:
raise ValueError('Invalid entry for continent. Please use any of the following entries exactly as they appear:\n"Africa","Europe","Oceania","North America","South America","Asia","Global"\nAlternately, the strings "get_all" and "Global" may be passed.')
else:
if continent not in ["get_all","Global"]:
raise ValueError('Invalid entry for continent. Please enter "get_all", "Global", or a list of desired continents.')
#Create output file paths within entered directories
#Required files
metadata_variants_out=metadata_variants_directory+protein+"_Metadata_with_Variants.tsv"
#Optional files
if n_seq_by_continent==True:
n_seq_by_continent_out=n_seq_by_continent_directory+protein+"_Total_Seq_by_Continent.csv"
else:
n_seq_by_continent_out=None
if variant_combinations==True:
top_combinations_out=top_combinations_directory+protein+"_Variant_Combinations.tsv"
else:
top_combinations_out=None
if variant_combinations==True:
combo_TS_freq_out=combos_TS_freq_directory+protein+"_combo_TS_freq.csv"
combo_TS_percent_out=combos_TS_percentage_directory+protein+"_TS_Combo_Percentage.csv"
#Step 1: generate metadata-cluster pairs
print("Step 1: Link Metadata to Cluster IDs")
metadata_with_clusters=prepare_metadata(cluster_path,meta_path,protein,exclude_singletons)
print()
#Step 2: generate variant list for each cluster
print("Step 2: Link Variants to Cluster IDs")
#UPDATE 2021-03-17: include the reference cluster in the variant list_by_cluster dataset
variants_each_cluster,variant_events,codes=variant_list_by_cluster(variant_events_path,protein,ref_cluster)
print()
#Step 2a: if desired, generate list of top mutations for the protein
if top_combinations_out:
print("Step 2a: Generate Top Mutation Combinations for",protein)
top_combinations=find_top_combinations(variants_each_cluster,cluster_path,protein)
#For output, the lists of variant combinations need to be converted to comma-separated strings to improve readability.
top_combinations["Variants"]=top_combinations["Variants"].apply(lambda x:variant_list_to_string(x))
top_combinations.to_csv(top_combinations_out,sep="\t",chunksize=1000)
print("Variant Combinations for {} written to {}".format(protein,top_combinations_out))
print()
#Step 3: map variant list to metadata and Clean metadata
print("Step 3: Prepare Metadata with Variants Table")
metadata_with_variants=link_meta_to_variants(metadata_with_clusters,variants_each_cluster,ref_cluster,protein)
print("Cleaning Metadata")
standard_date=clean_metadata(metadata_with_variants,variant_events,metadata_variants_out)
print()
#Step 3a: If specified, output a breakdown of sequences by continent over time
if n_seq_by_continent_out:
print("Step 3a: Prepare Table of Total Sequences by Continent Over Time")
total_seq_by_continent(metadata_variants_out,protein,start_date,end_date,n_seq_by_continent_out)
print()
#Subset data for all unique continents and complete steps 4-6 for each.
print("Step 4: Computing Time Series for Each Continent")
#Generate date range for time series creation
start_dates,end_dates=date_range_generator_weekly(start_date,end_date)
print("Start date entered:",start_dates[0].strftime('%Y-%m-%d'),end=". ")
print("End date entered:",end_dates[-1].strftime('%Y-%m-%d'),end=".\n")
print("Analysis covers {} weeks.".format(len(start_dates)))
#If time series for variant combinations is specified, store the list of unique variant combinations
if variant_combinations==True:
var_combos=define_variant_combos(top_combinations_out)
combo_meta=form_combo_metadata(standard_date)
#Set up list of continents to iterate through, adding an option for global
#If get_all is specified for continent, all continents and a global analysis will be computed.
if continent=="get_all":
continent_list=list(standard_date['region'].unique())
continent_list.append("Global")
#The string 'Global' may alternately be entered to print only the global analysis.
elif continent=="Global":
continent_list=["Global"]
#A list of continents may also be passed to the function; each entry will be run in the for loop below.
else:
continent_list=continent
#Progress meter
progress=1
#Progress meter not used if only one continent is entered
if len(continent_list)==1:
print("Generating time series information for {} ({})".format(protein,continent_list[0]))
for continent in continent_list:
#Progress displayed if multiple continents are entered
if len(continent_list)>1:
#Exact wording of progress meter depends on whether analysis is worldwide or for a specific continent
if continent!="Global":
print("Time series info for {} ({}/{})".format(continent,progress,len(continent_list)))
else:
print("Global time series information ({}/{})".format(progress,len(continent_list)))
#Subset the data for the continent for all iterations except for global data
if continent=="Global":
cont_subset=standard_date.copy()
else:
cont_subset=standard_date[standard_date['region']==continent]
#If variant combinations is specified, create a copy of the subset with the variant column as a string.
if variant_combinations==True:
combo_subset=form_combo_metadata(cont_subset)
#Generate filenames for continent-specific outputs
time_series_freq_out=time_series_frequency_directory+protein+"_Frequency_"+continent+".csv"
time_series_percent_out=time_series_percent_directory+protein+"_Percent_"+continent+".csv"
#Heatmap table generation is optional
if heatmap_table==True:
time_series_htmp_table_out=time_series_htmp_table_directory+protein+"_Heatmap_Table_"+continent+".xlsx"
else:
time_series_htmp_table_out=None
#Step 4: Generate time series of mutation events
time_series_cont=time_series_generation(cont_subset,codes,protein,start_dates,end_dates,time_series_freq_out)
#Step 5: Create a percentage table
global_percentage=percentage_table(time_series_cont,time_series_percent_out)
#Step 5a: If time series for variant combinations is specified, compute frequency and percentage TS for the current continent
if variant_combinations==True:
combo_TS_freq=time_series_combo(combo_subset,var_combos,protein,start_dates,end_dates,combo_TS_freq_out)
#Step 5b: Combo Time series percentage table
percentage_table(combo_TS_freq,combo_TS_percent_out)
#Step 6: Create heatmap table from percentage data
if heatmap_table==True:
global_percentage_HT=create_heatmap_table(global_percentage,time_series_htmp_table_out,protein,cmap=cmap,**cmap_kwargs)
#Progress meter: increase by one
progress+=1
time_end=time.time()
print("\nTime Series for {} complete.".format(protein))
print("Frequency data written to",time_series_frequency_directory)
print("Percentage data written to",time_series_percent_directory)
if heatmap_table==True:
print("Heatmap tables written to",time_series_htmp_table_directory)
print("Total time elapsed: {}".format(time_end-time_start))
def extract_id(entry, field_index=3):
"""
When applied to the "Input_ID" column of the USEARCH cluster information dataset, this function will yield the GISAID accession ID.
The "Input_ID" column contains the FASTA header for each sequence, with metadata split by "|" characters.
Arguments:
field_index: As of January 2021, the accession ID is in the fourth field (index=3). If this changes in future metadata, the field index may be specified using this argument.
"""
#Split on the vertical line character and return the field with the accession ID (fourth field by default)
return entry.split("|")[field_index]
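#Example (hypothetical FASTA header; the field layout is assumed from the docstring above):
#extract_id("hCoV-19/country/ID/2020|Spike|2020-03-05|EPI_ISL_000000") returns "EPI_ISL_000000".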
def count_mutation_list(entry):
"""
When applied to the variant list column of the top combinations dataset, count_mutation_list will return the number of variants for each variant list.
"""
return len(entry)
def count_AA_changes(var_list,events):
"""
Given a list of variants, count_AA_Changes() will return the total number of amino acid changes present in the list.
arguments
----------
var_list: a list of variant codes, in the format generated by the MSA_reader() functions.
events: Variant events dataset generated by the MSA_reader() program. This can be loaded from the file <protein name>_variants_raw.tsv in the output folder for the MSA_reader functions.
outputs
----------
aa_change: integer giving the total number of amino acid changes detected in the list.
"""
#Counter for number of amino acid changes
aa_change=0
#Loop over the variant codes, incrementing the counter according to the nature of each mutation
for variant in var_list:
#Get info for variant code: subset event table, then take the first value
#relevant data is the same for all instances of the code in the variant events dataset
code_info=events[events["Code"]==variant].groupby("Code").first()
#Retrieve mutation type and determine if it is a single-residue or a multi-residue event
#Substitutions: all substitutions are single-residue events
if code_info["Type"].values=="sub":
aa_change+=1
#Deletions: if an MSA end position is given, it is a multi-residue deletion. If this value contains a "-", it is a single-residue deletion.
elif code_info["Type"].values=="del":
if code_info["AA_End(MSA)"].values=="-":
aa_change+=1
elif code_info["AA_End(MSA)"].values!="-":
#Determine length of multi-residue deletion: length is equal to the number of non-blank residues in the ref residues column
length=0
for res in str(code_info["Ref Residue(s)"].values[0]): #Entries are stored as one-element lists by default: must convert element to strinng
if res!="-":
length+=1
aa_change+=length
#Insertions: the length of an insertion is determined by the number of non-blank residues in the "Var Residue(s)" column (residues inserted)
elif code_info["Type"].values=="ins":
length=0
for res in str(code_info["Var Residue(s)"].values[0]): #Entries are stored as one-element lists by default: must convert element to strinng
if res!="-":
length+=1
aa_change+=length
#Indels: the length of an indel is defined by the sum of the residues inserted and the residues deleted.
elif code_info["Type"].values=="delins":
length=0
#Count number of insertions (Var residues)
for res in str(code_info["Var Residue(s)"].values[0]): #Entries are stored as one-element lists by default: must convert element to strinng
if res!="-":
length+=1
#Count number of deletions (Ref residues)
for res in str(code_info["Ref Residue(s)"].values[0]):
if res!="-":
length+=1
#Add the number of amino acid changes for this indel to the total
aa_change+=length
#Extensions: the length of an extension is defined by the number of amino acids added to either end
elif code_info["Type"].values=="ext":
#A single-residue extension will have a blank for AA_End.
if code_info["AA_End(MSA)"].values=="-":
aa_change+=1
#Multi-residue extension: count the number of non-blank residues added
elif code_info["AA_End(MSA)"].values!="-":
length=0
#Entries are stored as one-element lists by default: must convert element to string
for res in str(code_info["Var Residue(s)"].values[0]):
if res!="-":
length+=1
#Add amino acid changes for extension to total
aa_change+=length
return aa_change
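#Worked example (hypothetical variant codes and a minimal events table; the real values come from
#the MSA_Reader output, and only the column names below are taken from the code above):
#events_demo=pd.DataFrame({"Code":["sub_demo","del_demo"],
#                          "Type":["sub","del"],
#                          "AA_End(MSA)":["-","70"],
#                          "Ref Residue(s)":["D","HV"],
#                          "Var Residue(s)":["G","--"]})
#count_AA_changes(["sub_demo","del_demo"],events_demo) returns 3 (one substitution plus a two-residue deletion).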
def prepare_metadata(cluster_path,meta_path,protein,exclude_singletons=True):
#Load Cluster Information file and name columns according to the format defined by USEARCH
#https://drive5.com/usearch/manual/cmd_fastx_uniques.html
print("Loading cluster information for {} at {}".format(protein,cluster_path))
cluster_info=pd.read_csv(cluster_path,sep="\t",header=None)
n_entries=len(cluster_info)
print("Cluster entries read:",n_entries)
#Rename cluster name column to "Cluster_Name_<protein>" (specifies on the merged metadata that the cluster assignments are protein-specific)
cluster_column="Cluster_Name_"+protein
cluster_info.columns=["Input_ID",cluster_column,"Cluster_num","Member_num","Cluster_Size","Target_Seq"]
print("Loading metadata at {}".format(meta_path))
#Load Metadata File
metadata=pd.read_csv(meta_path,sep="\t",dtype="object")
print("Metadata entries read:",len(metadata))
#If exclude_singletons is True, filter the cluster information file to remove singleton clusters
if exclude_singletons==True:
cluster_info=cluster_info[cluster_info["Cluster_Size"]>=2]
n_no_singletons=len(cluster_info)
print("Excluding {} singletons from cluster info. Entries remaining: {}".format(n_entries-n_no_singletons,n_no_singletons))
#The common column between the metadata and the cluster information file is the GISAID accession ID.
#Extract GISAID_EPI_ISL IDs from cluster information file, and create a new column named "gisaid_epi_isl" (must match metadata)
cluster_info["gisaid_epi_isl"]=cluster_info["Input_ID"].apply(lambda x:extract_id(x))
#Select for cluster name and the new column
clustmap=cluster_info[["gisaid_epi_isl",cluster_column]]
#Perform an inner join (merge function in pandas) between cluster information and metadata
metadata_with_clusters=metadata.merge(clustmap,how="inner",on="gisaid_epi_isl")
print("Number of sequences with metadata and cluster information:",len(metadata_with_clusters))
#Return metadata-cluster pairs for use in time series pipeline
return metadata_with_clusters
def variant_list_by_cluster(variant_events_path, protein, ref_cluster):
#Form filename for variant events dataset using protein name and load dataset
print("Loading Variant Data File at",variant_events_path)
variant_events=pd.read_csv(variant_events_path,sep="\t")
#Store all unique variant codes observed
codes=list(variant_events['Code'].unique())
#Store all unique cluster IDs
clusters=list(variant_events['Cluster_ID'].unique())
print("Number of unique mutations: {}.".format(len(codes)))
print("Number of clusters: {}.".format(len(clusters)))
#Store variants associated with each cluster
#Create an empty list for storing the variants associated with each cluster, which will be
#zipped with the list of unique clusters being used as the iterable to create a new dataframe.
list_of_variant_lists=[]
#Form for loop using clusters variable from earlier
for i in range(len(clusters)):
#Progress meter: update every fifth cluster
if (i+1)%5==0:
print("Store variants for cluster IDs: {} of {}. {:.1%} complete.".format(i+1,len(clusters),(i+1)/len(clusters)),sep="",end="\r")
#Subset database for ith cluster id
sub=variant_events[variant_events['Cluster_ID']==clusters[i]]
#Extract the 'Code' column from the subset, which will form an array with the variant code at each row.
#Convert the array to a list so it can be indexed
sub_codes=list(sub["Code"])
#For each row in the subset, add the variant code to the list of mutations observed in cluster i
variant_list=[] #Create empty list
for j in range(len(sub)):
#For each row j, store the variant code at position j
var_code=sub_codes[j]
#Add the variant code to the list
variant_list.append(var_code)
#Add the variant list for cluster i to the list
list_of_variant_lists.append(variant_list)
print("\rStore variants for cluster IDs: {} of {}. {:.1%} complete.".format(len(clusters),len(clusters),1.00),sep="")
#The list of lists of variants for each cluster will be zipped to the list of cluster ID's, and a dataframe will be created
data=list(zip(clusters,list_of_variant_lists))
#Add reference cluster information to the list of variants by cluster
#Tuple adds the ref cluster ID and its corresponding empty list of variants to the data
data.append((ref_cluster,[]))
#Ref cluster is added to the end. Next, sort the data in increasing order by cluster number
#Lambda function pulls out the first element in the tuple, then takes the text after "Uniq" which is the cluster number.
data=sorted(data,key=lambda x:int(x[0].split("Uniq")[1]))
#Set name of column storing cluster info so it can be mapped to the corresponding cluster in the metadata.
cluster_colname="Cluster_Name_"+protein
#Create Pandas df
variants_each_cluster=pd.DataFrame(data,columns=[cluster_colname,'Variants'])
#Add number of amino acid changes in each cluster to the new dataframe
print("Adding number of variants column to variants by cluster dataset.")
tic=time.time()
variants_each_cluster["Number_of_Variants"]=variants_each_cluster["Variants"].apply(lambda x:count_AA_changes(x,variant_events))
toc=time.time()
print("Complete. Time elapsed:",toc-tic,"seconds.")
#Return the created dataset, along with the variant events table and the list of unique codes, which are used in subsequent steps.
return variants_each_cluster,variant_events,codes
def find_top_combinations(variants_each_cluster,cluster_path,protein):
#Create dataset mapping cluster name to cluster size
#Load cluster information file from USEARCH and add column names
cluster_info=pd.read_csv(cluster_path,sep="\t",header=None)
cluster_info.columns=["Input_ID","Cluster_ID","Cluster_num","Member_num","Cluster_Size","Target_Seq"]
#Remove singletons and return one row for each cluster (one row for each sequence is given by default)
cluster_info=cluster_info[cluster_info["Cluster_Size"]>=2].groupby("Cluster_num").first()
#Select cluster ID and cluster size columns
size_map=cluster_info[["Cluster_ID","Cluster_Size"]]
#Add Cluster Size to variant list dataset
variants_each_cluster=variants_each_cluster.rename(columns={'Cluster_Name_{}'.format(protein): 'Cluster_ID'})
top_combinations=variants_each_cluster.merge(size_map,how="inner",on="Cluster_ID")
top_combinations=top_combinations.fillna(value=" ")
#Add Column for number of mutations in cluster
#top_combinations["Number_of_Variants"]=top_combinations["Variants"].apply(lambda x:count_AA_changes(x,variant_events))
return top_combinations
def find_ref_cluster_name(where_ref_path):
"""
Taking a 'where_reference.tsv' produced by the R script in 'reference_finder_script.Rmd', find_ref_cluster_name
will map each protein to the **name** (not index) of its reference cluster.
where_ref_path: Path to the 'where_reference.tsv' file.
"""
#Create an empty dictionary for storing the names of the reference clusters
names=dict()
#Open the where_reference.tsv file
with open(where_ref_path,"r") as file:
#Create a list with one element for each line in the input file
lines=file.readlines()
#Remove newline characters in input
for i in range(len(lines)):
lines[i]=lines[i].strip()
#For each protein, extract protein name, and the name of the cluster with the ref. genome location in the corresponding protein
for line in lines:
#Extract the protein name (first tab-separated field)
protein=line.split(sep="\t")[0]
#Extract the cluster name corresponding to the location of the ref. genome in the protein (second field)
cluster=line.split(sep="\t")[1]
#Store the cluster name in the dictionary 'names' with the protein name as the key
names[protein]=cluster
return names
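#Example of the expected input format (assumed from the parsing above): each line of
#where_reference.tsv is tab-separated as "<protein>\t<cluster name>", so a hypothetical line
#"Spike\tUniq1" would yield the dictionary entry {"Spike": "Uniq1"}.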
def link_meta_to_variants(metadata_with_clusters,variants_each_cluster,ref_cluster,protein):
#Define name of cluster column in the metadata-cluster dataframe
cluster_colname="Cluster_Name_"+protein
#Join the variant list dataframe to the metadata dataframe, using cluster_colname from earlier
metadata_with_variants=pd.merge(left=metadata_with_clusters,right=variants_each_cluster,on=cluster_colname,how='left')
#Check for any clusters without variants outside of the reference cluster (should be zero entries)
meta_outside_reference=len(metadata_with_variants[(metadata_with_variants['Variants'].isna()) & (metadata_with_variants[cluster_colname]!=ref_cluster)])
if meta_outside_reference!=0:
print("Warning: Clusters with zero variants detected outside of the reference cluster. \nPlease check the -tabbedout output from USEARCH for inconsistencies with the .FASTA output file.")
return metadata_with_variants
def ISO_date_filter(metadata_with_variants):
"""
Previous investigation of GISAID 'Next Metadata' shows that only dates in the format YYYY-mm-dd (ISO format) have information on the day the sequence was collected. Having the day of sequence collection is required for this analysis, which bins time into weeks.
ISO_date_filter uses a regular expression to keep only collection dates that match ISO format, excluding all entries not in this format.
"""
print("Filtering metadata with variants for entries with a collection day specified.")
pattern="\d{4}-\d{2}-\d{2}"
#Re.search() returns None if there is no match, and an re.match object if there is a match.
#pd.notna() will return True for the rows that match the expression.
standard_date=metadata_with_variants[metadata_with_variants.date.apply(lambda x: pd.notna(re.search(pattern,x)))]
#Report the number of non-standard dates removed
n_removed=len(metadata_with_variants)-len(standard_date)
print("Excluded {} entries with no defined collection day. Entries remaining: {}".format(n_removed,len(standard_date)))
return standard_date
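#Example: an entry with date "2020-03" (no collection day) is excluded, while "2020-03-15" is kept.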
def clean_metadata(metadata_with_variants,variant_events,metadata_variants_out=None):
"""
clean_metadata takes the metadata linked to variants and processes it into a format that can be used to generate a time series matrix.
Entries will be filtered for collection dates with a defined day of collection, the collection date column will be converted to a datetime object, and NaN values for the variants will be filled with an empty list.
Arguments
metadata_variants_out: may specify a path here for writing out the cleaned metadata with variants dataset.
"""
#Filter entries for defined day of collection
standard_date=ISO_date_filter(metadata_with_variants)
#Convert date sequenced ('date' column) from string to datetime object
standard_date=standard_date.copy()
standard_date['date']=pd.to_datetime(standard_date['date'],format="%Y-%m-%d")
#If the Variants cell is empty, fill it with an empty list. Otherwise, leave it as is.
fill_nans=lambda x:[] if x is np.nan else x
standard_date['Variants']=standard_date['Variants'].apply(fill_nans)
#The number of variants column must also be filled for sequences with no variants (zeros are added instead of an empty list)
standard_date["Number_of_Variants"]=standard_date["Number_of_Variants"].fillna(0)
#Write cleaned file to csv if specified
if metadata_variants_out:
print("Writing cleaned metadata with variants to {}.".format(metadata_variants_out))
#Create a copy of the standard_date table and change the variants column to a comma-separated string for output
meta_var_table=standard_date.copy()
meta_var_table["Variants"]=meta_var_table["Variants"].apply(lambda x:variant_list_to_string(x))
time_1=time.time()
print("Creation of CSV file")
meta_var_table.to_csv(metadata_variants_out,sep="\t",chunksize=1000)
time_2=time.time()
print("Complete. Time elapsed:",time_2-time_1,"seconds.")
return standard_date
def date_range_generator_weekly(start_date,end_date):
"""
date_range_generator will create date "bins" on weekly intervals given a start date and an end date.
The function creates i bins, with i being the number of weeks covered between start_date and end_date. The bins are stored in separate lists: start_dates, which contains the starting day of bin i (always a sunday) and end_dates, which contains the ending day of bin i (always a saturday).
Inputs: must be in ISO format (YYYY-mm-dd) for the function to work.
start_date: should be a Sunday. Other days will be accepted, but will generate unexpected results (intervals will begin at the nearest Sunday to the date entered).
end_date: should be a Saturday. As with start_date, other days will be accepted but will generate unexpected results.
"""
from datetime import date
from datetime import timedelta
#Print a warning if start_date is not a sunday and/or end_date is not a saturday
#datetime.weekday() will return day of week as a number. Sunday==6, Saturday==5.
if date.fromisoformat(start_date).weekday()!=6:
print("Warning: start date specified is not a Sunday. Time interval creation may yield unexpected results.")
if date.fromisoformat(end_date).weekday()!=5:
print("Warning: end date specified is not a Saturday. Time interval creation may yield unexpected results.")
#Creation of start_dates and end_dates lists
#Convert to datetime object to compute on entered dates
start_date=date.fromisoformat(start_date)
end_date=date.fromisoformat(end_date)
#Start_dates
#The start of the first bin is the start date passed to the function
#The start of the last bin is 6 days before the end_date (end of the last bin)
six_days=timedelta(days=6)
start_final=end_date-six_days
#Create start_dates list
start_dates=pd.date_range(start=start_date,end=start_final,freq="W-SUN")
#End_dates
#The end of the first bin is the start date entered plus 6 days
end_initial=start_date+six_days
#The end of the last bin is the end date passed to the function
#Create end_dates list
end_dates=pd.date_range(start=end_initial,end=end_date,freq="W-SAT")
return start_dates,end_dates
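#Example (a two-week range starting on a Sunday and ending on a Saturday):
#start_dates,end_dates=date_range_generator_weekly("2021-01-03","2021-01-16")
#start_dates -> ['2021-01-03','2021-01-10'] and end_dates -> ['2021-01-09','2021-01-16'].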
def variant_list_to_string(var_list):
"""
Changes cluster variant combinations into a format readable to external scripts.
"""
delim=","
return(delim.join(var_list))
def to_1D(series):
"""
Code from https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
Collapses a series of lists into a one-dimensional series, allowing for counts on the values
"""
return pd.Series([x for _list in series for x in _list],dtype='object')
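#Example: to_1D(pd.Series([["A","B"],["B"]])) yields pd.Series(["A","B","B"]);
#calling .value_counts() on the result then gives B:2, A:1, which is how weekly variant
#frequencies are tallied in time_series_generation() below.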
def time_series_generation(standard_date,codes,protein,start_dates,end_dates,time_series_freq_out=None,verbose=False):
#The list 'data' below will store each column of the time series analysis.
#Each element in the list is a list of the frequencies of each variant for the corresponding week.
data=[]
#Create list for storing column names for the time series DataFrame
column_names=[]
#Create list of rownames for dataframe
zero_column="Zero_Mutations_in_"+protein
row_names=['Total_Genomes',zero_column,*codes]
#Iterate through each week in the start-end dates lists
for w in range(len(start_dates)):
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(w+1,len(start_dates),(w+1)/len(start_dates)),end="")
#Form the dataset for week w.
week_w=standard_date[(standard_date['date']>=start_dates[w]) & (standard_date['date']<=end_dates[w])]
#Store number of sequences in week w with zero variants relative to reference
no_mutation_count=len(week_w[week_w["Number_of_Variants"]==0])
#Create list of variant frequencies for week w
#code_freq will store the frequency of every mutation in the list of mutation codes.
code_freq=[]
#Before iteration, prepare series mapping mutation codes to the freq. of appearance in the sequences.
week_w_variant_freq=to_1D(week_w["Variants"]).value_counts()
#Iteratively check all unique mutation codes against the series.
for j in range(len(codes)):
#If the code is present in the series, record the freq. in the code freq. list.
if codes[j] in week_w_variant_freq:
code_count=week_w_variant_freq[codes[j]]
code_freq.append(code_count)
else:
code_freq.append(0)
#Formation of column of frequency counts for week w
#Add the count of no mutations to the beginning of the code_freq list
code_freq.insert(0,no_mutation_count)
#Add total number of sequences to the beginning of the code_freq list (Before the count of no mutations)
code_freq.insert(0,len(week_w))
#The code_freq list for week w will be appended to the data list
data.append(code_freq)
#Create the column name for week w and append to column_names
name="Week{} ({}-{})".format(w+1,start_dates[w].strftime('%m/%d/%Y'),end_dates[w].strftime('%m/%d/%Y'))
column_names.append(name)
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(len(start_dates),len(start_dates),1.000),end="")
#line below erases progress meter once complete
print(" "*100+"\r",end="")
#Create Pandas DataFrame from the data
#Form a np.array first, then transpose (each inner list holds one week's counts and should become a column, but np.array treats each list as a row)
data=np.array(data,dtype="int64").transpose()
time_series_global=pd.DataFrame(data=data,index=row_names,columns=column_names)
#Write DataFrame to CSV
if verbose==True:
print("Writing Time Series Frequency Dataframe to {}.".format(time_series_freq_out))
time_series_global.to_csv(time_series_freq_out,sep=",",chunksize=1000)
return time_series_global
def percentage_table(time_series_global,time_series_percent_out,verbose=False):
def normalize_to_percentage(df):
"""
This function is used only in the creation of the percentage table. Each frequency value is divided by the number of sequences analyzed to yield the percentage of sequences containing each variant.
"""
#For every column in the time series data, divide all values in the column by the first value (total sequences analyzed)
for i in range(0,df.shape[1]):
#It is possible to have division by zero if zero genomes were included in a given week. This case must be caught to avoid undefined values.
if df.iloc[0,i]==0:
#If total genomes==0, all cells in the column will be set to zero.
for j in range(df.shape[0]):
df.iloc[j,i]=0
else:
df.iloc[:,i]=df.iloc[:,i]/df.iloc[0,i]
#Remove total genomes row when finished
df=df.iloc[1:,:]
return df
global_percentage=normalize_to_percentage(time_series_global)
global_percentage.to_csv(time_series_percent_out,sep=",",chunksize=1000)
if verbose==True:
print("Time series percentage dataframe written to {}.".format(time_series_percent_out))
return global_percentage
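#Worked example: for a week with Total_Genomes=200 and a variant counted 50 times, the
#percentage table holds 0.25 for that variant (values are fractions; they are rendered as
#percentages when the heatmap table is formatted below), and the Total_Genomes row is dropped.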
def create_heatmap_table(global_percentage,time_series_HT_out,protein,cmap,verbose=False,**cmap_kwargs):
"""
Arguments
----------
global_percentage: dataset with percentage values for unique variants.
time_series_HT_out: path for writing out heatmap table.
cmap: a matplotlib colormap (see https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html for options).
cmap_kwargs: supported kwargs
low: compress lower end of color range by given value
high: compress higher end of color range by given value
vmin: value corresponding to the low end of the colormap range. Values lower than vmin will be colored according to the lower extreme of the colormap.
vmax: value corresponding to the high end of the colormap range. Values higher than vmax will be colored according to the upper extreme of the colormap.
text_color_threshold (default=0.408): threshold for text color. 0 makes all text dark colored, 1 makes all text light colored.
returns
----------
global_percentage_HT: a pandas Styler object containing the percentage table with formatted values and background coloring.
"""
#Show percentage values with 5 decimal places and color cells according to their values using the selected colormap
global_percentage_HT=global_percentage.style.format("{0:.5%}").background_gradient(cmap,**cmap_kwargs)
global_percentage_HT.to_excel(time_series_HT_out,sheet_name="{} - Global".format(protein))
if verbose==True:
print("Heatmap table written to {}.".format(time_series_HT_out))
return global_percentage_HT
def total_seq_by_continent(meta_clean_path,protein,start_date,end_date,output_path=None):
"""
Returns a table giving the number of sequences in the analysis by week and by continent.
Arguments
----------
meta_clean_path: path of the cleaned metadata with variants file.
protein: name of protein (string).
start_date: the beginning date for weekly intervals. Must be in ISO format, and should be a Sunday to avoid unexpected interval creation.
end_date: the end date for weekly intervals. Must be in ISO format, and should be a Saturday to avoid unexpected interval creation.
output_path: path for writing dataframe; if None, output will not be written (default=None)
Returns
----------
seq_continent: a Pandas DataFrame, giving the number of sequences analyzed by week, both worldwide and broken down by continent.
"""
#Generate date range
start_dates,end_dates=date_range_generator_weekly(start_date,end_date)
#Load metadata
print("Loading metadata at",meta_clean_path)
metadata_clean=pd.read_csv(meta_clean_path,sep="\t",parse_dates=['date'],infer_datetime_format=True,dtype="object")
#Generate column names from dates entered
column_names=[]
for w in range(0,len(start_dates)):
#Create the column name for week w and append to column_names
name="Week{} ({}-{})".format(w+1,start_dates[w].strftime('%m/%d/%Y'),end_dates[w].strftime('%m/%d/%Y'))
column_names.append(name)
#Row names: one for each unique continent in the 'region' column of the metadata, plus one for global totals
row_names=list(metadata_clean.region.unique())
row_names.append("Worldwide")
#Create list for storing data
data=[]
#Progress meter: use a counter that increments by one for each iteration in the region for loop
progress=0
total_items=len(row_names)
#For each unique region (continent) name, perform the following:
for continent in row_names:
print("Computing Sequence Counts by Continent: {} of {}.\r".format(progress,total_items),sep="",end="")
#Begin by subsetting metadata with variants by region
if continent!="Worldwide":
region_subset=metadata_clean[metadata_clean["region"]==continent]
#Must reset "region_subset" to metadata_clean when "worldwide" is entered (otherwise the subset will be for the last continent)
elif continent=="Worldwide":
region_subset=metadata_clean
#Generate list for storing the weekly number of genomes for the continent
n_genomes_total=[]
#For each week w in the date range, subset the dataset for that week and record the number of entries found.
for w in range(0,len(start_dates)):
region_subset_w=region_subset[(region_subset['date']>=start_dates[w]) & (region_subset['date']<=end_dates[w])]
n_genomes_w=len(region_subset_w)
#Add number of genomes for week w to the list
n_genomes_total.append(n_genomes_w)
data.append(n_genomes_total)
#Progress meter: increment by one
progress+=1
print("Computing Sequence Counts by Continent: {} of {}.".format(total_items,total_items),sep="",end="\n")
#Create dataframe from total
seq_continent=pd.DataFrame(data,index=row_names,columns=column_names)
#Write dataframe to file
if output_path:
seq_continent.to_csv(output_path,chunksize=1000)
print("Output written to",output_path,sep=" ")
return seq_continent
def define_variant_combos(variant_combinations_path):
"""
Reads the variant combinations file, which contains all unique combinations in the dataset. Returns the unique variant combinations as a list.
"""
#Load variant combinations dataset, extract "Variants" column, and store result as a list
var_combos=list(pd.read_csv(variant_combinations_path,sep="\t")["Variants"])
return var_combos
def form_combo_metadata(meta_clean):
"""
Takes the cleaned metadata used for time series computations of individual variants and converts the lists in the Variants column to strings so combinations can be compared.
"""
combo_meta=meta_clean.copy()
combo_meta["Variants"]=combo_meta["Variants"].apply(lambda x:variant_list_to_string(x))
return combo_meta
def time_series_combo(combo_meta,var_combos,protein,start_dates,end_dates,combo_TS_freq_out=None,verbose=False):
#The list 'data' below will store each column of the time series analysis.
#Each element in the list is a list of the frequencies of each variant combination for the corresponding week.
data=[]
#Create list for storing column names for the time series DataFrame
column_names=[]
#Create list of rownames for dataframe
row_names=['Total_Genomes',"Zero_Variants",*var_combos]
#Iterate through each week in the start-end dates lists
for w in range(len(start_dates)):
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(w+1,len(start_dates),(w+1)/len(start_dates)),end="")
#Subset dataset for week w.
week_w=combo_meta[(combo_meta['date']>=start_dates[w]) & (combo_meta['date']<=end_dates[w])]
#Store number of sequences in week w with zero variants relative to reference
no_mutation_count=len(week_w[week_w["Number_of_Variants"]==0])
#Create list for storing frequencies of each variant combo during week w
combo_freq=[]
#Use value_counts() to create a series with the frequency of each variant combo in the subset for week w
week_w_combo_freq=week_w["Variants"].value_counts()
#Iteratively check all unique variant combinations against the series.
for j in range(len(var_combos)):
#If the combination is present in the series, record the freq. in the combo freq. list.
if var_combos[j] in week_w_combo_freq:
combo_count=week_w_combo_freq[var_combos[j]]
combo_freq.append(combo_count)
else:
combo_freq.append(0)
#Formation of column of frequency counts for week w
#Add the count of no mutations to the beginning of the combo_freq list
combo_freq.insert(0,no_mutation_count)
#Add total number of sequences to the beginning of the combo_freq list (before the count of no mutations)
combo_freq.insert(0,len(week_w))
#The combo_freq list for week w will be appended to the data list
data.append(combo_freq)
#Create the column name for week w and append to column_names
name="Week{} ({}-{})".format(w+1,start_dates[w].strftime('%m/%d/%Y'),end_dates[w].strftime('%m/%d/%Y'))
column_names.append(name)
print("Computing Time Series of Variants for Week {} out of {}. {:.1%} complete\r".format(len(start_dates),len(start_dates),1.000),end="")
#line below erases progress meter once complete
print(" "*100+"\r",end="")
#Create Pandas DataFrame from the data
#Form a np.array first, then transpose (each list represents a column, and default is for each list to represent a row)
data=np.array(data,dtype="int64").transpose()
combo_TS_freq=pd.DataFrame(data=data,index=row_names,columns=column_names)
#Write DataFrame to CSV
if verbose==True:
print("Writing Time Series Frequency Dataframe to {}.".format(combo_TS_freq_out))
combo_TS_freq.to_csv(combo_TS_freq_out,sep=",",chunksize=1000)
return combo_TS_freq