Merge pull request #50 from anthonyboos559/correlations
Census Data Filtering and Pearson Correlation Snakemake Rule
Showing 11 changed files with 578 additions and 18 deletions.
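The Snakemake rule named in the PR title is one of the 11 changed files but is not rendered below. As a hedged sketch only, a rule wiring the new filtering script into a workflow might look roughly like the following; the rule name, paths, species list, and script filename are assumptions, not taken from the diff.

# Hypothetical Snakemake rule; names and paths are illustrative assumptions.
rule filter_census_data:
    params:
        data_dir="data/census_chunks",
        train_dir="data/train_chunks",
        species="human mouse",
    output:
        directory("data/filtered"),
    shell:
        "python census_data_filtering.py "
        "--directory {params.data_dir} "
        "--train_data {params.train_dir} "
        "--save_directory {output} "
        "--species {params.species} "
        "--seed 42"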
@@ -0,0 +1,121 @@
import os
import re
import glob
import argparse as ap
import multiprocessing as mp
import numpy as np
import pandas as pd
import scipy.sparse as sp
import data_filtering_functions as ff
from collections import defaultdict
from data_processing_functions import extract_file_number


def main(
    directory: str,
    species: list[str],
    train_dir: str,
    save_dir: str,
    seed: int,
    skip_metadata: bool,
):
    if not skip_metadata:
        np.random.seed(seed)
        dataframes = {}
        for specie in species:
            # Process file paths for full and training data
            train_files = tuple(glob.glob(os.path.join(train_dir, f"{specie}*.pkl")))
            train_files = ff.filter_and_sort_train_files(train_files)

            # Get somaIDs of training samples
            train_ids = ff.get_train_data_ids(train_files)

            # Prepare main df that is to be filtered
            metadata_files = glob.glob(os.path.join(directory, f"{specie}*.pkl"))
            metadata_files.sort(key=extract_file_number)
            meta_dataframe = ff.load_and_merge_metadata(tuple(metadata_files))

            # Filter out training samples from the full data
            dataframes[specie] = ff.filter_train_ids(meta_dataframe, train_ids)

        # Process groups and subset size if needed
        grouped_data = ff.filter_into_groups(dataframes)
        valid_groups = ff.validate_and_sample_groups(grouped_data, species[0])

        # Save filtered metadata
        ff.save_grouped_data(valid_groups, dataframes, save_dir)

    # Load data and slice by metadata index
    for specie in species:
        data_files = glob.glob(os.path.join(directory, f"{specie}*.npz"))
        data_files.sort(key=extract_file_number)
        metadata_files = glob.glob(os.path.join(save_dir, f"{specie}*.pkl"))
        metadata_files.sort(key=extract_file_number)
        filtered_data = defaultdict(list)
        for chunk_n, data_file in enumerate(data_files, start=1):
            current_chunk = sp.load_npz(data_file)
            for gid, metadata_file in enumerate(metadata_files, start=1):
                current_df = pd.read_pickle(metadata_file)
                idxes = current_df[current_df["chunk_source"] == chunk_n]["data_index"]
                sliced_chunk = current_chunk[idxes, :]
                filtered_data[gid].append(sliced_chunk)

        # Save filtered data
        for gid, data in filtered_data.items():
            chunk_data = sp.vstack(data)
            sp.save_npz(os.path.join(save_dir, f"{specie}_filtered_{gid}.npz"), chunk_data)


if __name__ == "__main__":
    parser = ap.ArgumentParser()
    parser.add_argument(
        "--directory",
        type=str,
        required=True,
        help="Directory to load data from."
    )
    parser.add_argument(
        "--species",
        type=str,
        nargs="+",
        required=True,
        help="Species to load data for."
    )
    parser.add_argument(
        "--train_data",
        type=str,
        required=False,
        default=None,
        help="Directory where the training data is stored. Defaults to '--directory'"
    )
    parser.add_argument(
        "--save_directory",
        type=str,
        required=False,
        default=None,
        help="Directory to save filtered data in. Defaults to '--directory'"
    )
    parser.add_argument(
        "--seed",
        type=int,
        required=False,
        default=42,
        help="Seed for the random module."
    )
    parser.add_argument(
        "--skip_metadata",
        action="store_true",
        help="Whether to skip the metadata filtering and just slice the count data."
    )

    args = parser.parse_args()

    train_dir = args.directory if args.train_data is None else args.train_data
    save_dir = args.directory if args.save_directory is None else args.save_directory

    main(
        directory=args.directory,
        species=args.species,
        train_dir=train_dir,
        save_dir=save_dir,
        seed=args.seed,
        skip_metadata=args.skip_metadata,
    )
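A brief usage sketch of what this script writes out (the save directory, species, and group id below are assumptions): each group is saved as a metadata pickle plus a matching sparse count matrix under the {specie}_filtered_{gid} naming pattern, and the pair can be loaded back together like so.

import os
import pandas as pd
import scipy.sparse as sp

# Hypothetical locations; save directory, species, and group id are assumptions.
save_dir = "data/filtered"
specie, gid = "human", 1

# Each group is a metadata pickle plus a sparse count matrix with the same row
# count; both are written in chunk order, so their rows line up.
meta = pd.read_pickle(os.path.join(save_dir, f"{specie}_filtered_{gid}.pkl"))
counts = sp.load_npz(os.path.join(save_dir, f"{specie}_filtered_{gid}.npz"))
assert counts.shape[0] == len(meta)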
@@ -0,0 +1,99 @@
import os
import re
import csv
import numpy as np
import pandas as pd
import pandas.core.groupby.generic as gb
from collections import defaultdict
from data_processing_functions import extract_file_number


TRAIN_DATA_FILES = set(range(1, 14))
GROUPING_COLUMNS = ["sex", "tissue", "cell_type", "assay"]
MIN_SAMPLE_SIZE = 100
MAX_SAMPLE_SIZE = 10000
SPECIES_SAMPLE_SIZE = {"human": 60530, "mouse": 52437}


def get_train_data_ids(files: tuple[str]) -> set[int]:

    combined_df = pd.DataFrame()

    for file in files:
        df = pd.read_pickle(file)
        combined_df = pd.concat([combined_df, df], ignore_index=True)

    return set(combined_df["soma_joinid"])


def filter_train_ids(df: pd.DataFrame, ids: set[int]) -> pd.DataFrame:
    filtered_df = df[~(df["soma_joinid"].isin(ids))]
    filtered_df = filtered_df.reset_index(drop=True)
    return filtered_df


def filter_and_sort_train_files(unfiltered_files: tuple[str]) -> tuple[str]:

    filtered_files = [
        file for file in unfiltered_files
        if (match := re.search(r'(\d+)\.pkl$', file)) and int(match.group(1)) in TRAIN_DATA_FILES
    ]
    filtered_files.sort(key=extract_file_number)
    return tuple(filtered_files)


def load_and_merge_metadata(files: tuple[str]) -> pd.DataFrame:

    merged_df = pd.DataFrame()
    for file in files:
        df = pd.read_pickle(file)
        df["chunk_source"] = extract_file_number(file)
        df = df.reset_index(drop=False)
        df.rename(columns={"index": "data_index"}, inplace=True)
        merged_df = pd.concat([merged_df, df], ignore_index=True)

    return merged_df


def filter_into_groups(dfs: dict[str, pd.DataFrame]):

    grouped = {}
    for specie, data in dfs.items():
        grouped[specie] = data.groupby(GROUPING_COLUMNS)

    return grouped


def validate_and_sample_groups(data_groups: dict[str, gb.DataFrameGroupBy], primary_species: str = None):

    valid_groups = defaultdict(dict)
    if primary_species is not None:
        main_df = data_groups.pop(primary_species)
    else:
        primary_species, main_df = data_groups.popitem()

    for gid, idxes in main_df.groups.items():
        if len(idxes) < MIN_SAMPLE_SIZE:
            continue
        elif all(
            gid in group.groups.keys() and len(group.groups[gid]) >= MIN_SAMPLE_SIZE
            for group in data_groups.values()
        ):
            sample_size = min(
                [len(idxes), MAX_SAMPLE_SIZE] + [len(group.groups[gid]) for group in data_groups.values()]
            )

            valid_groups[gid][primary_species] = np.random.choice(idxes, sample_size, replace=False)
            for specie, group in data_groups.items():
                valid_groups[gid][specie] = np.random.choice(group.groups[gid], sample_size, replace=False)

    return valid_groups

def save_grouped_data(groups: dict[tuple[str], dict[str, np.ndarray]], dfs: dict[str, pd.DataFrame], save_dir: str):

    with open(os.path.join(save_dir, "group_references.csv"), "w") as file:
        writer = csv.writer(file)
        writer.writerow(["group_id", "num_samples"] + GROUPING_COLUMNS)
        for i, gid in enumerate(groups.keys(), start=1):
            for specie, idx in groups[gid].items():
                # Copy so the group columns are added to an independent frame, not a view
                df = dfs[specie].iloc[idx].copy()
                df["group_id"] = i
                df["num_samples"] = len(idx)
                df = df.sort_values("chunk_source")
                df.to_pickle(os.path.join(save_dir, f"{specie}_filtered_{i}.pkl"))
            writer.writerow([i, len(idx)] + list(gid))

    # Re-read after the CSV is closed so the pickled copy reflects the complete file
    references = pd.read_csv(os.path.join(save_dir, "group_references.csv"))
    references.to_pickle(os.path.join(save_dir, "group_references.pkl"))
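The Pearson-correlation rule itself is not among the files shown in this diff. The following is only a hedged sketch of how per-group pseudo-bulk profiles built from the filtered outputs could be correlated; the paths, group ids, and the pseudo-bulk approach are assumptions, and a cross-species comparison would additionally require an ortholog mapping to align gene axes.

import os
import numpy as np
import scipy.sparse as sp
from scipy.stats import pearsonr

# Hypothetical path; the actual correlation rule is not part of the shown files.
save_dir = "data/filtered"

def group_mean_profile(specie: str, gid: int) -> np.ndarray:
    # Mean expression over cells in one filtered group (a pseudo-bulk profile)
    counts = sp.load_npz(os.path.join(save_dir, f"{specie}_filtered_{gid}.npz"))
    return np.asarray(counts.mean(axis=0)).ravel()

# Correlate two groups within one species so the gene axes already agree;
# comparing across species would first need an ortholog mapping.
r, p_value = pearsonr(group_mean_profile("human", 1), group_mean_profile("human", 2))
print(f"Pearson r = {r:.3f} (p = {p_value:.2e})")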