diff --git a/.vscode/settings.json b/.vscode/settings.json index 5657220a..c209ce1c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,7 @@ { - "editor.formatOnSave": true, + "[python]": { + "editor.formatOnSave": true + }, "python.testing.pytestEnabled": true, "python.testing.unittestEnabled": false, "python.testing.nosetestsEnabled": false, diff --git a/nltools/data/adjacency.py b/nltools/data/adjacency.py index 3456719e..dd0bad2c 100644 --- a/nltools/data/adjacency.py +++ b/nltools/data/adjacency.py @@ -692,13 +692,17 @@ def similarity( ignore_diagonal=False, **kwargs, ): - """Calculate similarity between two Adjacency matrices. - Default is to use spearman correlation and permutation test. + """ + Calculate similarity between two Adjacency matrices. Default is to use spearman + correlation and permutation test. + Args: - data: Adjacency data, or 1-d array same size as self.data + data (Adjacency or array): Adjacency data, or 1-d array same size as self.data perm_type: (str) '1d','2d', or None metric: (str) 'spearman','pearson','kendall' - ignore_diagonal: (bool) only applies to 'directed' Adjacency types using perm_type=None or perm_type='1d' + ignore_diagonal: (bool) only applies to 'directed' Adjacency types using + perm_type=None or perm_type='1d' + """ data1 = self.copy() if not isinstance(data, Adjacency): @@ -1015,11 +1019,6 @@ def bootstrap( ): """Bootstrap an Adjacency method. - Example Useage: - b = dat.bootstrap('mean', n_samples=5000) - b = dat.bootstrap('predict', n_samples=5000, algorithm='ridge') - b = dat.bootstrap('predict', n_samples=5000, save_weights=True) - Args: function: (str) method to apply to data for each bootstrap n_samples: (int) number of samples to bootstrap with replacement @@ -1027,7 +1026,14 @@ def bootstrap( (useful for aggregating many bootstraps on a cluster) n_jobs: (int) The number of CPUs to use to do the computation. 
-1 means all CPUs.Returns: - output: summarized studentized bootstrap output + + Returns: + summarized studentized bootstrap output + + Examples: + >>> b = dat.bootstrap('mean', n_samples=5000) + >>> b = dat.bootstrap('predict', n_samples=5000, algorithm='ridge') + >>> b = dat.bootstrap('predict', n_samples=5000, save_weights=True) """ diff --git a/nltools/data/brain_data.py b/nltools/data/brain_data.py index cacb411c..446975d0 100644 --- a/nltools/data/brain_data.py +++ b/nltools/data/brain_data.py @@ -13,7 +13,8 @@ __license__ = "MIT" from nilearn.signal import clean -from scipy.stats import ttest_1samp, pearsonr +from scipy.stats import ttest_1samp, pearsonr, spearmanr +from scipy.spatial.distance import cdist from scipy.stats import t as t_dist from scipy.signal import detrend from scipy.interpolate import pchip @@ -33,7 +34,7 @@ from joblib import Parallel, delayed from nltools.mask import expand_mask from nltools.analysis import Roc -from nilearn.input_data import NiftiMasker +from nilearn.maskers import NiftiMasker from nilearn.plotting import plot_stat_map from nilearn.image import smooth_img, resample_to_img from nilearn.masking import intersect_masks @@ -71,6 +72,7 @@ from nilearn.decoding import SearchLight import deepdish as dd from pathlib import Path +import warnings # Optional dependencies @@ -90,15 +92,12 @@ class Brain_Data(object): Y: Pandas DataFrame of training labels X: Pandas DataFrame Design Matrix for running univariate models mask: binary nifiti file to mask brain data - output_file: Name to write out to nifti file **kwargs: Additional keyword arguments to pass to the prediction algorithm """ - def __init__( - self, data=None, Y=None, X=None, mask=None, output_file=None, **kwargs - ): + def __init__(self, data=None, Y=None, X=None, mask=None, **kwargs): if mask is not None: if not isinstance(mask, nib.Nifti1Image): if isinstance(mask, str) or isinstance(mask, Path): @@ -112,8 +111,10 @@ def __init__( ) self.mask = mask else: + # Load default mask self.mask = nib.load(resolve_mni_path(MNI_Template)["mask"]) - self.nifti_masker = NiftiMasker(mask_img=self.mask) + # Learn transformation on mask + self.nifti_masker = NiftiMasker(mask_img=self.mask, **kwargs) if data is not None: if isinstance(data, str) or isinstance(data, Path): @@ -150,25 +151,36 @@ def __init__( for e in f["Y_index"] ], ) - self.mask = nib.Nifti1Image( - f["mask_data"], - affine=f["mask_affine"], - file_map={ - "image": nib.FileHolder(filename=f["mask_file_name"]) - }, - ) - nifti_masker = NiftiMasker(self.mask) - self.nifti_masker = nifti_masker.fit(self.mask) - self.file_name = f["file_name"] + if mask is None: + # User didn't request a mask so try to load it from the h5 file, + # i.e. overwrite the default mask loaded above + self.mask = nib.Nifti1Image( + f["mask_data"], + affine=f["mask_affine"], + file_map={ + "image": nib.FileHolder(filename=f["mask_file_name"]) + }, + ) + nifti_masker = NiftiMasker(self.mask) + self.nifti_masker = nifti_masker.fit(self.mask) + else: + # Mask is already set above so use the default or user requested + # mask rather than the one in h5 (if it exists) + if "mask_data" in f: + warnings.warn( + "Existing mask found in HDF5 file but is being ignored because you passed a value for mask. Set mask=None to use existing mask in the HDF5 file" + ) + # We're done initializing from the h5 file so just return no need to + # run any of the additional checks and code below. We should really + # refactor this entire init... 
return - else: data = nib.load(data) self.data = self.nifti_masker.fit_transform(data) elif isinstance(data, list): if isinstance(data[0], Brain_Data): tmp = concatenate(data) - for item in ["data", "Y", "X", "mask", "nifti_masker", "file_name"]: + for item in ["data", "Y", "X", "mask", "nifti_masker"]: setattr(self, item, getattr(tmp, item)) else: if all(isinstance(x, data[0].__class__) for x in data): @@ -222,17 +234,14 @@ def __init__( else: self.X = pd.DataFrame() - self.file_name = output_file if output_file is not None else [] - def __repr__(self): - return "%s.%s(data=%s, Y=%s, X=%s, mask=%s, output_file=%s)" % ( + return "%s.%s(data=%s, Y=%s, X=%s, mask=%s)" % ( self.__class__.__module__, self.__class__.__name__, self.shape(), len(self.Y), self.X.shape, os.path.basename(self.mask.get_filename()), - self.file_name, ) def __getitem__(self, index): @@ -479,7 +488,7 @@ def to_nifti(self): return self.nifti_masker.inverse_transform(self.data) - def write(self, file_name=None, **kwargs): + def write(self, file_name, **kwargs): """Write out Brain_Data object to Nifti or HDF5 File. Args: @@ -504,7 +513,6 @@ def write(self, file_name=None, **kwargs): "mask_affine": self.mask.affine, "mask_data": self.mask.get_fdata(), "mask_file_name": self.mask.get_filename(), - "file_name": self.file_name, }, compression=kwargs.get("compression", "blosc"), ) @@ -544,7 +552,7 @@ def plot( threshold_lower=None, figsize=(15, 2), axes=None, - **kwargs + **kwargs, ): """Create a quick plot of self.data. Will plot each image separately @@ -589,7 +597,7 @@ def plot( colorbar=colorbar, draw_cross=draw_cross, axes=axes, - **kwargs + **kwargs, ) else: if axes is not None: @@ -608,7 +616,7 @@ def plot( colorbar=colorbar, draw_cross=draw_cross, axes=a[i], - **kwargs + **kwargs, ) return elif view in ["glass", "mni", "full"]: @@ -618,7 +626,7 @@ def plot( how=view, thr_upper=threshold_upper, thr_lower=threshold_lower, - **kwargs + **kwargs, ) else: raise ValueError( @@ -905,6 +913,17 @@ def similarity(self, image, method="correlation"): """ + supported_metrics = [ + "correlation", + "pearson", + "rank_correlation", + "spearman", + "dot_product", + "cosine", + ] + if method not in supported_metrics: + raise ValueError(f"method must be one of {supported_metrics}") + image = check_brain_data(image) # Check to make sure masks are the same for each dataset and if not @@ -925,59 +944,20 @@ def similarity(self, image, method="correlation"): data2 = self.data image2 = image.data - def vector2array(data): - if len(data.shape) == 1: - return data.reshape(-1, 1).T - else: - return data - - def flatten_array(data): - if np.any(np.array(data.shape) == 1): - data = data.flatten() - if len(data) == 1 & data.shape[0] == 1: - data = data[0] - return data - - # Calculate pattern expression if method == "dot_product": - if len(image2.shape) > 1: - if image2.shape[0] > 1: - pexp = [] - for i in range(image2.shape[0]): - pexp.append(np.dot(data2, image2[i, :])) - pexp = np.array(pexp) - else: - pexp = np.dot(data2, image2) - else: - pexp = np.dot(data2, image2) - elif method == "correlation": - if len(image2.shape) > 1: - if image2.shape[0] > 1: - pexp = [] - for i in range(image2.shape[0]): - pexp.append(pearson(image2[i, :], data2)) - pexp = np.array(pexp) - else: - pexp = pearson(image2, data2) - else: - pexp = pearson(image2, data2) + func = lambda x, y: np.dot(x, y) + elif method in ["pearson", "correlation"]: + func = lambda x, y: pearsonr(x, y)[0] + elif method in ["spearman", "rank_correlation"]: + func = lambda x, y: spearmanr(x, 
y)[0]
         elif method == "cosine":
-            image2 = vector2array(image2)
-            data2 = vector2array(data2)
-            if image2.shape[1] > 1:
-                pexp = []
-                for i in range(image2.shape[0]):
-                    pexp.append(
-                        cosine_similarity(
-                            image2[i, :].reshape(-1, 1).T, data2
-                        ).flatten()
-                    )
-                pexp = np.array(pexp)
-            else:
-                pexp = cosine_similarity(image2, data2).flatten()
-        else:
-            raise ValueError("Method must be one of: correlation, dot_product, cosine")
-        return flatten_array(pexp)
+            func = method
+
+        out = cdist(np.atleast_2d(data2), np.atleast_2d(image2), func).squeeze()
+        # cdist metric argument returns distances by default (unless we specify a
+        # custom function like above) so flip it to similarity
+        out = 1 - out if method == "cosine" else out
+        return out

     def distance(self, metric="euclidean", **kwargs):
         """Calculate distance between images within a Brain_Data() instance.
@@ -1335,7 +1315,7 @@ def predict_multi(
         scoring=None,
         n_jobs=1,
         verbose=0,
-        **kwargs
+        **kwargs,
     ):
         """Perform multi-region prediction. This can be a searchlight analysis or multi-roi analysis if provided a Brain_Data instance with labeled non-overlapping rois.
@@ -1661,7 +1641,7 @@ def upload_neurovault(
         collection_id=None,
         img_type=None,
         img_modality=None,
-        **kwargs
+        **kwargs,
     ):
         """Upload Data to Neurovault. Will add any columns in self.X to image metadata. Index will be used as image name.
@@ -1726,7 +1706,7 @@ def add_image_to_collection(
                 name=img_name,
                 modality=img_modality,
                 map_type=img_type,
-                **kwargs
+                **kwargs,
             )

         if len(self.shape()) == 1:
@@ -1786,7 +1766,7 @@ def filter(self, sampling_freq=None, high_pass=None, low_pass=None, **kwargs):
             standardize=standardize,
             high_pass=high_pass,
             low_pass=low_pass,
-            **kwargs
+            **kwargs,
         )

         return out
@@ -1955,23 +1935,25 @@ def bootstrap(
         n_jobs=-1,
         random_state=None,
         *args,
-        **kwargs
+        **kwargs,
     ):
-        """Bootstrap a Brain_Data method.
-
-        Example Useage:
-        b = dat.bootstrap('mean', n_samples=5000)
-        b = dat.bootstrap('predict', n_samples=5000, algorithm='ridge')
-        b = dat.bootstrap('predict', n_samples=5000, save_weights=True)
+        """Bootstrap a `Brain_Data` method.

         Args:
             function: (str) method to apply to data for each bootstrap
             n_samples: (int) number of samples to bootstrap with replacement
-            save_weights: (bool) Save each bootstrap iteration
-                        (useful for aggregating many bootstraps on a cluster)
-            n_jobs: (int) The number of CPUs to use to do the computation.
-                        -1 means all CPUs.Returns:
-            output: summarized studentized bootstrap output
+            save_weights: (bool) Save each bootstrap iteration (useful for aggregating
+                many bootstraps on a cluster)
+            n_jobs: (int) The number of CPUs to use to do the computation. -1 means all
+                CPUs.
+
+        Returns:
+            output: summarized studentized bootstrap output
+
+        Examples:
+            >>> b = dat.bootstrap('mean', n_samples=5000)
+            >>> b = dat.bootstrap('predict', n_samples=5000, algorithm='ridge')
+            >>> b = dat.bootstrap('predict', n_samples=5000, save_weights=True)

         """
@@ -2036,16 +2018,6 @@ def align(self, target, method="procrustes", axis=0, *args, **kwargs):

         See nltools.stats.align for aligning multiple Brain_Data instances

-        Examples:
-            Hyperalign using procrustes transform:
-            out = data.align(target, method='procrustes')
-
-            Align using shared response model:
-            out = data.align(target, method='probabilistic_srm', n_features=None)
-
-            Project aligned data into original data:
-            original_data = np.dot(out['transformed'].data,out['transformation_matrix'].T)
-
         Args:
             target: (Brain_Data) object to align to.
             method: (str) alignment method to use
@@ -2056,6 +2028,13 @@
             out: (dict) a dictionary containing transformed object, transformation matrix, and the shared response matrix

+        Examples:
+            - Hyperalign using procrustes transform:
+            >>> out = data.align(target, method='procrustes')
+            - Align using shared response model:
+            >>> out = data.align(target, method='probabilistic_srm', n_features=None)
+            - Project aligned data into original data:
+            >>> original_data = np.dot(out['transformed'].data,out['transformation_matrix'].T)
         """

         if method not in ["probabilistic_srm", "deterministic_srm", "procrustes"]:
@@ -2156,19 +2135,20 @@ def find_spikes(self, global_spike_cutoff=3, diff_spike_cutoff=3):
         )

     def temporal_resample(self, sampling_freq=None, target=None, target_type="hz"):
-        """Resample Brain_Data timeseries to a new target frequency or number of samples
+        """
+        Resample Brain_Data timeseries to a new target frequency or number of samples
         using Piecewise Cubic Hermite Interpolating Polynomial (PCHIP) interpolation.
         This function can up- or down-sample data.

         Note: this function can use quite a bit of RAM.

-        Args:
-            sampling_freq: (float) sampling frequency of data in hertz
-            target: (float) upsampling target
-            target_type: (str) type of target can be [samples,seconds,hz]
+        Args:
+            sampling_freq: (float) sampling frequency of data in hertz
+            target: (float) upsampling target
+            target_type: (str) type of target can be [samples,seconds,hz]

-        Returns:
-            upsampled Brain_Data instance
+        Returns:
+            upsampled Brain_Data instance

         """

         out = self.copy()
diff --git a/nltools/data/design_matrix.py b/nltools/data/design_matrix.py
index 6a186ec2..036b2d46 100644
--- a/nltools/data/design_matrix.py
+++ b/nltools/data/design_matrix.py
@@ -464,11 +464,16 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):

     def vif(self, exclude_polys=True):
         """
-        Compute variance inflation factor amongst columns of design matrix,ignoring polynomial terms. Much faster that statsmodels and more reliable too. Uses the same method as Matlab and R (diagonal elements of the inverted correlation matrix).
+        Compute variance inflation factor amongst columns of design matrix, ignoring
+        polynomial terms. Much faster than statsmodels and more reliable too. Uses the
+        same method as Matlab and R (diagonal elements of the inverted correlation
+        matrix).
+
+        Args:
+            exclude_polys (bool): whether to skip checking of polynomial terms (i.e. intercept, trends, basis functions); default True

         Returns:
             vifs (list): list with length == number of columns - intercept
-            exclude_polys (bool): whether to skip checking of polynomial terms (i.e intercept, trends, basis functions); default True

         """

         if self.shape[1] <= 1:
@@ -560,9 +565,8 @@ def convolve(self, conv_func="hrf", columns=None):
         return out

     def downsample(self, target, **kwargs):
-        """Downsample columns of design matrix. Relies on
-        nltools.stats.downsample, but ensures that returned object is a
-        design matrix.
+        """
+        Downsample columns of design matrix. Relies on nltools.stats.downsample, but ensures that returned object is a design matrix.

         Args:
             target(float): desired frequency in hz
@@ -588,9 +592,8 @@ def upsample(self, target, **kwargs):
         return newMat

     def upsample(self, target, **kwargs):
-        """Upsample columns of design matrix. Relies on
-        nltools.stats.upsample, but ensures that returned object is a
-        design matrix.
+        """
+        Upsample columns of design matrix. Relies on nltools.stats.upsample, but ensures that returned object is a design matrix.

         Args:
             target(float): desired frequence in hz
@@ -616,9 +619,8 @@ def zscore(self, columns=[]):
         return newMat

     def zscore(self, columns=[]):
-        """Z-score specific columns of design matrix. Relies on
-        nltools.stats.downsample, but ensures that returned object is a
-        design matrix.
+        """
+        Z-score specific columns of design matrix. Relies on nltools.stats.downsample, but ensures that returned object is a design matrix.

         Args:
             columns (list): columns to z-score; defaults to all columns
@@ -774,6 +776,11 @@ def clean(self, fill_na=0, exclude_polys=False, thresh=0.95, verbose=True):
         """

         # Temporarily turn off warnings for correlations
+        if any(self.columns.duplicated()):
+            raise ValueError(
+                "Duplicate column names detected. Using .clean() with duplicate columns is not supported as it can produce unexpected results"
+            )
+
         old_settings = np.seterr(all="ignore")
         if fill_na is not None:
             out = self.fillna(fill_na)
@@ -801,6 +808,10 @@ def clean(self, fill_na=0, exclude_polys=False, thresh=0.95, verbose=True):
                 out = out.drop(remove, axis=1)
                 out.polys = [elem for elem in out.polys if elem not in remove]
                 out = out._sort_cols()
+                if out.shape[1] >= self.shape[1]:
+                    raise ValueError(
+                        f"Removing {len(remove)} cols has somehow made the design matrix LARGER or failed to change its shape. Previously had {self.shape[1]} columns and now has {out.shape[1]} columns. This usually happens if you had duplicate column names prior to calling clean!"
+                    )
             else:
                 print("Dropping columns not needed...skipping")
         np.seterr(**old_settings)
diff --git a/nltools/file_reader.py b/nltools/file_reader.py
index 8913b7de..79dde817 100644
--- a/nltools/file_reader.py
+++ b/nltools/file_reader.py
@@ -28,7 +28,7 @@ def onsets_to_dm(
     **kwargs,
 ):
     """
-    This function can assist in reading in one or several in a 2-3 column onsets files, specified in seconds and converting it to a Design Matrix organized as samples X Stimulus Classes. sampling_freq should be specified in hertz; for TRs use hertz = 1/TR. Onsets files **must** be organized with columns in one of the following 4 formats:
+    This function can assist in reading in one or several 2-3 column onsets files, specified in seconds, and converting them to a Design Matrix organized as samples X Stimulus Classes. sampling_freq should be specified in hertz; for TRs use hertz = 1/TR. Onsets files **must** be organized with columns in one of the following 4 formats:

     1) 'Stim, Onset'
     2) 'Onset, Stim'
@@ -38,22 +38,23 @@
     No other file organizations are currently supported. *Note:* Stimulus offsets (onset + duration) that fall into an adjacent TR include that full TR. E.g. offset of 10.16s with TR = 2 has an offset of TR 5, which spans 10-12s, rather than an offset of TR 4, which spans 8-10s.

     Args:
-        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
-        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
-        run_length (int): number of TRs in the run these onsets came from
-        sort (bool, optional): whether to sort the columns of the resulting
-            design matrix alphabetically; defaults to
-            False
-        addpoly (int, optional: what order polynomial terms to add as new columns (e.g. 0 for intercept, 1 for linear trend and intercept, etc); defaults to None
-        header (str,optional): None if missing header, otherwise pandas
-            header keyword; defaults to 'infer'
-        keep_separate (bool): whether to seperate polynomial columns if reading a list of files and using the addpoly option; defaults to True
-        unique_cols (list, optional): additional columns to keep seperate across files (e.g. spikes); defaults to []
-        fill_na (str/int/float, optional): what value fill NaNs in with if reading in a list of files; defaults to None
-        kwargs: additional inputs to pandas.read_csv
-
-    Returns:
-        Design_Matrix class
-
+        F (str/Path/pd.DataFrame): filepath or pandas dataframe
+        sampling_freq (float): sampling frequency in hertz, i.e. 1 / TR
+        run_length (int): run length in number of TRs
+        header (str/None, optional): whether there's an additional header row in the
+            supplied file/dataframe. See `pd.read_csv` for more details. Defaults to `"infer"`.
+        sort (bool, optional): whether to sort dataframe columns alphabetically. Defaults to False.
+        keep_separate (bool, optional): if a list of files or dataframes is supplied,
+            whether to create separate polynomial columns per file. Defaults to `True`.
+        add_poly (bool/int, optional): whether to add Nth order polynomials to design
+            matrix. Defaults to None.
+        unique_cols (list/None, optional): if a list of files or dataframes is supplied,
+            what additional columns to keep separate per file (e.g. spikes). Defaults to None.
+        fill_na (Any, optional): what to replace NaNs with. Defaults to None (no filling).
+
+
+    Returns:
+        nltools.data.Design_Matrix: design matrix organized as TRs x Stims
     """

     if not isinstance(F, list):
@@ -128,18 +129,19 @@ def conditional_round(x, TR):
                 X.loc[row["Onset"] : row["Offset"], row["Stim"]] = 1
             else:
                 X.loc[row["Onset"], row["Stim"]] = 1
+    # DISABLED because this isn't quite accurate for stimuli of different durations
     # Run a check
-    if "Offset" in df.columns:
-        onsets = X.sum().values
-        stim_counts = data.Stim.value_counts(sort=False)[X.columns]
-        durations = data.groupby("Stim").Duration.mean().values
-        for i, (o, c, d) in enumerate(zip(onsets, stim_counts, durations)):
-            if c * (d / TR) <= o <= c * ((d / TR) + 1):
-                pass
-            else:
-                warnings.warn(
-                    f"Computed onsets for {data.Stim.unique()[i]} are inconsistent with expected values. Please manually verify the outputted Design_Matrix!"
-                )
+    # if "Offset" in df.columns:
+    #     onsets = X.sum().values
+    #     stim_counts = data.Stim.value_counts(sort=False)[X.columns]
+    #     durations = data.groupby("Stim").Duration.mean().values
+    #     for i, (o, c, d) in enumerate(zip(onsets, stim_counts, durations)):
+    #         if c * (d / TR) <= o <= c * ((d / TR) + 1):
+    #             pass
+    #         else:
+    #             warnings.warn(
+    #                 f"Computed onsets for {data.Stim.unique()[i]} are inconsistent ({o}) with expected values ({c * (d / TR)} to {c * ((d / TR) + 1)}). Please manually verify the outputted Design_Matrix!"
+    #             )

     if sort:
         X = X.reindex(sorted(X.columns), axis=1)
diff --git a/nltools/mask.py b/nltools/mask.py
index c8c12d51..976e4e3f 100644
--- a/nltools/mask.py
+++ b/nltools/mask.py
@@ -155,9 +155,7 @@ def collapse_mask(mask, auto_label=True, custom_mask=None):
             m_list.append(mask[x].to_nifti())
         intersect = intersect_masks(m_list, threshold=1, connected=False)
         intersect = Brain_Data(
-            nib.Nifti1Image(
-                np.abs(intersect.get_data() - 1), intersect.get_affine()
-            ),
+            nib.Nifti1Image(np.abs(intersect.get_data() - 1), intersect.affine),
             mask=custom_mask,
         )
diff --git a/nltools/plotting.py b/nltools/plotting.py
index 7cfc34f1..0543d1a4 100644
--- a/nltools/plotting.py
+++ b/nltools/plotting.py
@@ -146,12 +146,12 @@ def plot_t_brain(
     """
     Takes a brain data object and computes a 1 sample t-test across it's first axis. If a list is provided will compute difference between brain data objects in list (i.e. paired samples t-test).

     Args:
-        objIn:(list/Brain_Data) if list will compute difference map first
-        how: (list) whether to plot a glass brain 'glass', 3 view-multi-slice mni 'mni', or both 'full'
-        thr: (str) what method to use for multiple comparisons correction unc, fdr, or tfce
-        alpha: (float) p-value threshold
-        nperm: (int) number of permutations for tcfe; default 1000
-        cut_coords: (list) x,y,z coords to plot brain slice
+        objIn (list/Brain_Data): if list will compute difference map first
+        how (list): whether to plot a glass brain 'glass', 3 view-multi-slice mni 'mni', or both 'full'
+        thr (str): what method to use for multiple comparisons correction unc, fdr, or tfce
+        alpha (float): p-value threshold
+        nperm (int): number of permutations for tfce; default 1000
+        cut_coords (list): x,y,z coords to plot brain slice
         kwargs: optionals args to nilearn plot functions (e.g. vmax)

     """
@@ -250,11 +250,12 @@ def plot_t_brain(

 def plot_brain(objIn, how="full", thr_upper=None, thr_lower=None, save=False, **kwargs):
     """
     More complete brain plotting of a Brain_Data instance
+
     Args:
-        obj: (Brain_Data) object to plot
-        how: (str) whether to plot a glass brain 'glass', 3 view-multi-slice mni 'mni', or both 'full'
-        thr_upper: (str/float) thresholding of image. Can be string for percentage, or float for data units (see Brain_Data.threshold()
-        thr_lower: (str/float) thresholding of image. Can be string for percentage, or float for data units (see Brain_Data.threshold()
+        obj (Brain_Data): object to plot
+        how (str): whether to plot a glass brain 'glass', 3 view-multi-slice mni 'mni', or both 'full'
+        thr_upper (str/float): thresholding of image. Can be string for percentage, or float for data units (see Brain_Data.threshold())
+        thr_lower (str/float): thresholding of image. Can be string for percentage, or float for data units (see Brain_Data.threshold())
         save (str): if a string file name or path is provided plots will be saved into this directory appended with the orientation they belong to
         kwargs: optionals args to nilearn plot functions (e.g.
vmax) diff --git a/nltools/simulator.py b/nltools/simulator.py index 8aa48efa..fc39e535 100755 --- a/nltools/simulator.py +++ b/nltools/simulator.py @@ -16,7 +16,7 @@ import nibabel as nib import pandas as pd import matplotlib.pyplot as plt -from nilearn.input_data import NiftiMasker +from nilearn.maskers import NiftiMasker from scipy.stats import multivariate_normal, binom, ttest_1samp from nltools.data import Brain_Data from nltools.stats import fdr, one_sample_permutation diff --git a/nltools/stats.py b/nltools/stats.py index 3bbebd5c..1d20f9bc 100644 --- a/nltools/stats.py +++ b/nltools/stats.py @@ -225,7 +225,7 @@ def multi_threshold(t_map, p_map, thresh): if not isinstance(thresh, list): raise ValueError("Make sure thresh is a list of p-values") - affine = t_map.to_nifti().get_affine() + affine = t_map.to_nifti().affine pos_out = np.zeros(t_map.to_nifti().shape) neg_out = deepcopy(pos_out) for thr in thresh: @@ -812,9 +812,9 @@ def transform_pairwise(X, y): there are the same number of -1 as +1 Reference: "Large Margin Rank Boundaries for Ordinal Regression", - R. Herbrich, T. Graepel, K. Obermayer. - Authors: Fabian Pedregosa - Alexandre Gramfort + R. Herbrich, T. Graepel, K. Obermayer. Authors: Fabian Pedregosa + Alexandre Gramfort + Args: X: (np.array), shape (n_samples, n_features) The data @@ -996,12 +996,34 @@ def regress(X, Y, mode="ols", stats="full", **kwargs): Does NOT add an intercept automatically to the X matrix before fitting like some other software packages. This is left up to the user. This function can compute regression in 3 ways: - 1) Standard OLS - 2) OLS with robust sandwich estimators for standard errors. 3 robust types of estimators exist: - 1) 'hc0' - classic huber-white estimator robust to heteroscedasticity (default) - 2) 'hc3' - a variant on huber-white estimator slightly more conservative when sample sizes are small - 3) 'hac' - an estimator robust to both heteroscedasticity and auto-correlation; auto-correlation lag can be controlled with the 'nlags' keyword argument; default is 1 - 3) ARMA (auto-regressive moving-average) model (experimental). This model is fit through statsmodels.tsa.arima_model.ARMA, so more information about options can be found there. Any settings can be passed in as kwargs. By default fits a (1,1) model with starting lags of 2. This mode is **computationally intensive** and can take quite a while if Y has many columns. If Y is a 2d array joblib.Parallel is used for faster fitting by parallelizing fits across columns of Y. Parallelization can be controlled by passing in kwargs. Defaults to multi-threading using 10 separate threads, as threads don't require large arrays to be duplicated in memory. Defaults are also set to enable memory-mapping for very large arrays if backend='multiprocessing' to prevent crashes and hangs. Various levels of progress can be monitored using the 'disp' (statsmodels) and 'verbose' (joblib) keyword arguments with integer values > 0. + + 1. Standard OLS + 2. OLS with robust sandwich estimators for standard errors. 3 robust types of + estimators exist: + + - 'hc0' - classic huber-white estimator robust to heteroscedasticity (default) + - 'hc3' - a variant on huber-white estimator slightly more conservative when sample sizes are small + - 'hac' - an estimator robust to both heteroscedasticity and auto-correlation; + auto-correlation lag can be controlled with the `nlags` keyword argument; default + is 1 + + 3. ARMA (auto-regressive moving-average) model (experimental). 
This model is fit through statsmodels.tsa.arima_model.ARMA, so more information about options can be found there. Any settings can be passed in as kwargs. By default fits a (1,1) model with starting lags of 2. This mode is **computationally intensive** and can take quite a while if Y has many columns. If Y is a 2d array joblib.Parallel is used for faster fitting by parallelizing fits across columns of Y. Parallelization can be controlled by passing in kwargs. Defaults to multi-threading using 10 separate threads, as threads don't require large arrays to be duplicated in memory. Defaults are also set to enable memory-mapping for very large arrays if backend='multiprocessing' to prevent crashes and hangs. Various levels of progress can be monitored using the 'disp' (statsmodels) and 'verbose' (joblib) keyword arguments with integer values > 0. + + Args: + X (ndarray): design matrix; assumes intercept is included + Y (ndarray): dependent variable array; if 2d, a model is fit to each column of Y separately + mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma' + robust_estimator (str,optional): kind of robust estimator to use if mode = 'robust'; default 'hc0' + nlags (int,optional): auto-correlation lag correction if mode = 'robust' and robust_estimator = 'hac'; default 1 + order (tuple,optional): auto-regressive and moving-average orders for mode = 'arma'; default (1,1) + kwargs (dict): additional keyword arguments to statsmodels.tsa.arima_model.ARMA and joblib.Parallel + + Returns: + b: coefficients + t: t-statistics (coef/sterr) + p : p-values + df: degrees of freedom + res: residuals Examples: Standard OLS @@ -1024,22 +1046,6 @@ def regress(X, Y, mode="ols", stats="full", **kwargs): >>> results = regress(X,Y,mode='arma',order=(2,3),backend='multiprocessing',n_jobs=8) - Args: - X (ndarray): design matrix; assumes intercept is included - Y (ndarray): dependent variable array; if 2d, a model is fit to each column of Y separately - mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma' - robust_estimator (str,optional): kind of robust estimator to use if mode = 'robust'; default 'hc0' - nlags (int,optional): auto-correlation lag correction if mode = 'robust' and robust_estimator = 'hac'; default 1 - order (tuple,optional): auto-regressive and moving-average orders for mode = 'arma'; default (1,1) - kwargs (dict): additional keyword arguments to statsmodels.tsa.arima_model.ARMA and joblib.Parallel - - Returns: - b: coefficients - t: t-statistics (coef/sterr) - p : p-values - df: degrees of freedom - res: residuals - """ if not isinstance(mode, str): @@ -1192,15 +1198,6 @@ def align(data, method="deterministic_srm", n_features=None, axis=0, *args, **kw original data using Tranformation matrix. Inputs must be a list of Brain_Data instances or numpy arrays (observations by features). 
- Examples: - Hyperalign using procrustes transform: - out = align(data, method='procrustes') - - Align using shared response model: - out = align(data, method='probabilistic_srm', n_features=None) - - Project aligned data into original data: - original_data = [np.dot(t.data,tm.T) for t,tm in zip(out['transformed'], out['transformation_matrix'])] Args: data: (list) A list of Brain_Data objects @@ -1215,6 +1212,13 @@ def align(data, method="deterministic_srm", n_features=None, axis=0, *args, **kw matrices, a list of transformation matrices, the shared response matrix, and the intersubject correlation of the shared resposnes + Examples: + - Hyperalign using procrustes transform: + >>> out = align(data, method='procrustes') + - Align using shared response model: + >>> out = align(data, method='probabilistic_srm', n_features=None) + - Project aligned data into original data: + >>> original_data = [np.dot(t.data,tm.T) for t,tm in zip(out['transformed'], out['transformation_matrix'])] """ from nltools.data import Brain_Data, Adjacency diff --git a/nltools/tests/test_brain_data.py b/nltools/tests/test_brain_data.py index eccf67a3..89dd40b4 100644 --- a/nltools/tests/test_brain_data.py +++ b/nltools/tests/test_brain_data.py @@ -9,7 +9,7 @@ from nltools.mask import create_sphere, roi_to_brain from pathlib import Path -# from nltools.prefs import MNI_Template +from nltools.prefs import MNI_Template shape_3d = (91, 109, 91) @@ -52,7 +52,7 @@ def test_load(tmpdir): # Test i/o for hdf5 dat.write(os.path.join(str(tmpdir.join("test_write.h5")))) b = Brain_Data(os.path.join(tmpdir.join("test_write.h5"))) - for k in ["X", "Y", "mask", "nifti_masker", "file_name", "data"]: + for k in ["X", "Y", "mask", "nifti_masker", "data"]: if k == "data": assert np.allclose(b.__dict__[k], dat.__dict__[k]) elif k in ["X", "Y"]: @@ -68,6 +68,14 @@ def test_load(tmpdir): ) else: assert b.__dict__[k] == dat.__dict__[k] + # Test situation where we present a user warning when they're trying to load an .h5 + # file that includes a mask AND they pass in value for the mask argument. 
In this + # case the mask argument takes precedence so we warn the user + with pytest.warns(UserWarning): + bb = Brain_Data( + os.path.join(tmpdir.join("test_write.h5")), mask=MNI_Template["mask"] + ) + assert bb.mask.get_filename() == MNI_Template["mask"] def test_shape(sim_brain_data): @@ -506,6 +514,17 @@ def test_similarity(sim_brain_data): assert len(r) == shape_2d[0] r = sim_brain_data.similarity(stats["weight_map"], method="cosine") assert len(r) == shape_2d[0] + r = sim_brain_data.similarity(stats["weight_map"], method="spearman") + assert len(r) == shape_2d[0] + r = sim_brain_data.similarity(stats["weight_map"], method="pearson") + assert len(r) == shape_2d[0] + r = sim_brain_data.similarity(stats["weight_map"], method="correlation") + assert len(r) == shape_2d[0] + r = sim_brain_data.similarity(stats["weight_map"], method="rank_correlation") + assert len(r) == shape_2d[0] + r = stats["weight_map"].similarity(sim_brain_data) + assert len(r) == shape_2d[0] + r = sim_brain_data.similarity(sim_brain_data, method="correlation") assert r.shape == (sim_brain_data.shape()[0], sim_brain_data.shape()[0]) r = sim_brain_data.similarity(sim_brain_data, method="dot_product") diff --git a/nltools/tests/test_design_matrix.py b/nltools/tests/test_design_matrix.py index ce6cd7f2..5c65dcb9 100644 --- a/nltools/tests/test_design_matrix.py +++ b/nltools/tests/test_design_matrix.py @@ -1,6 +1,8 @@ import numpy as np +import pandas as pd from nltools.data import Design_Matrix from nltools.external.hrf import glover_hrf +import pytest def test_add_poly(sim_design_matrix): @@ -127,3 +129,16 @@ def test_append(sim_design_matrix): run = run.add_poly(2) all_runs = all_runs.append(run, unique_cols=["stim*", "cond*"]) assert all_runs.shape == (44, 28) + + +def test_clean(sim_design_matrix): + + # Drop correlated column + corr_cols = sim_design_matrix.assign(new_col=lambda df: df.iloc[:, 0]) + out = corr_cols.clean(verbose=True) + assert out.shape[1] < corr_cols.shape[1] + + # Raise an error if try to clean with an input matrix that has duplicate column names + dup_cols = pd.concat([sim_design_matrix, sim_design_matrix], axis=1) + with pytest.raises(ValueError): + _ = dup_cols.clean() diff --git a/nltools/version.py b/nltools/version.py index 3f24b49b..a510e061 100644 --- a/nltools/version.py +++ b/nltools/version.py @@ -1,4 +1,4 @@ """Specifies current version of nltools to be used by setup.py and __init__.py """ -__version__ = "0.4.5" +__version__ = "0.4.6"
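Usage sketch (not part of the patch itself): a minimal example of the new Brain_Data.similarity metric names exercised by the tests in this changeset. The Simulator-based setup mirrors the fixture pattern used elsewhere in the nltools test suite; the variable names and simulation parameters below are illustrative assumptions, not values taken from the diff.

    from nltools.simulator import Simulator

    # Simulate a small Brain_Data object (labels, noise sigma, and reps are
    # arbitrary example values, chosen only for illustration)
    sim = Simulator()
    dat = sim.create_data([0, 1], 1, reps=3, output_dir=None)
    weight_map = dat.mean()

    # The refactored similarity() accepts these metric aliases and routes them
    # through scipy's pearsonr/spearmanr/cdist
    r_pearson = dat.similarity(weight_map, method="pearson")  # alias of "correlation"
    r_spearman = dat.similarity(weight_map, method="rank_correlation")  # alias of "spearman"
    r_cosine = dat.similarity(weight_map, method="cosine")

    # Unsupported metrics now fail fast with a ValueError
    try:
        dat.similarity(weight_map, method="euclidean")
    except ValueError as e:
        print(e)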