From 42a96039feafdf4fc2ede1960b6be5972774eed4 Mon Sep 17 00:00:00 2001 From: Giovanni De Crescenzo Date: Mon, 20 Jan 2025 18:09:42 +0100 Subject: [PATCH] additional mods for bias calculation --- .../multiclosure_inconsistent_output.py | 24 ++- .../src/validphys/closuretest/multiclosure.py | 185 ++++++++++++------ 2 files changed, 147 insertions(+), 62 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index afaf5e5458..2b56f388fb 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -41,27 +41,35 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea """ records = [] for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): - biases, biases_mean_cov, n_comp, n_comp_m = pc_bias_var_dataset + biases, biases_mean_cov, n_comp, n_comp_m, evr_single, evr_avg = pc_bias_var_dataset dic = {} dic["dataset"] = str(ds.name) for i,(e,e_m) in enumerate(zip(biases, biases_mean_cov)): dic[f"bias_{i}"] = (round(e/n_comp, 3), round(e_m/n_comp_m, 3)) dic["bias_mean"] = round(np.mean(biases/n_comp), 3) dic["bias_std"] = round(np.std(biases/n_comp), 3) + dic["bias_mean_sqrt"] = round(np.sqrt(np.mean(biases/n_comp)), 3) dic["bias_mean_single_cov"] = round(np.mean(biases_mean_cov/n_comp_m), 3) dic["bias_std_single_cov"] = round(np.std(biases_mean_cov/n_comp_m), 3) + dic["bias_mean_single_cov_sqrt"] = round(np.sqrt(np.mean(biases_mean_cov/n_comp_m)), 3) dic["n_comp"] = n_comp dic["n_comp_single_cov"] = n_comp_m + dic["evr_single"] = evr_single + dic["evr_avg"] = evr_avg records.append(dic) column_names = ["dataset"] for i in range(len(biases)): column_names.append(f"bias_{i}") column_names.append("bias_mean") column_names.append("bias_std") + column_names.append("bias_mean_sqrt") column_names.append("bias_mean_single_cov") column_names.append("bias_std_single_cov") + column_names.append("bias_mean_single_cov_sqrt") column_names.append("n_comp") column_names.append("n_comp_single_cov") + column_names.append("evr_single") + column_names.append("evr_avg") df = pd.DataFrame.from_records( records, index="dataset", @@ -93,28 +101,38 @@ def table_bias_variance_data(principal_components_bias_variance_data): # First let's do the total records = [] - biases, biases_mean_cov, n_comp, n_comp_m = principal_components_bias_variance_data + biases, biases_mean_cov, n_comp, n_comp_m, evr_single, evr_avg = principal_components_bias_variance_data dic = {} dic["dataset"] = "Total" for i,(e,e_m) in enumerate(zip(biases, biases_mean_cov)): dic[f"bias_{i}"] = (round(e/n_comp, 3), round(e_m/n_comp_m, 3)) dic["bias_mean"] = round(np.mean(biases/n_comp), 3) dic["bias_std"] = round(np.std(biases/n_comp), 3) + dic["bias_mean_sqrt"] = round(np.sqrt(np.mean(biases/n_comp)), 3) dic["bias_mean_single_cov"] = round(np.mean(biases_mean_cov/n_comp_m), 3) dic["bias_std_single_cov"] = round(np.std(biases_mean_cov/n_comp_m), 3) + dic["bias_mean_single_cov_sqrt"] = round(np.sqrt(np.mean(biases_mean_cov/n_comp_m)), 3) dic["n_comp"] = n_comp dic["n_comp_single_cov"] = n_comp_m + dic["evr_single"] = evr_single + dic["evr_avg"] = evr_avg column_names = ["dataset"] for i in range(len(biases)): column_names.append(f"bias_{i}") column_names.append("bias_mean") column_names.append("bias_std") + column_names.append("bias_mean_sqrt") column_names.append("bias_mean_single_cov") column_names.append("bias_std_single_cov") + column_names.append("bias_mean_single_cov_sqrt") column_names.append("n_comp") column_names.append("n_comp_single_cov") + column_names.append("evr_single") + column_names.append("evr_avg") + records.append(dic) df = pd.DataFrame.from_records( - dic, + records, + index="dataset", columns=column_names, ) diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index 90731169b8..df74cde153 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -181,11 +181,14 @@ class PCAInternalMulticlosureLoader: mean_covmat_pca: np.array sqrt_mean_covmat_pca: np.array + expl_variance_ratio_single: np.float32 + expl_variance_ratio_mean: np.float32 + @check_multifit_replicas def internal_multiclosure_dataset_loader_pca( internal_multiclosure_dataset_loader, - explained_variance_ratio=0.99, + cond_num=10000, _internal_max_reps=None, _internal_min_reps=20, ): @@ -228,7 +231,8 @@ def internal_multiclosure_dataset_loader_pca( # diagonalize the mean covariance matrix and only keep the principal components # that explain the required variance - + expl_variance_ratio_single = 0.99 + expl_variance_ratio_mean = 0.99 if _covmat_mean.shape == (): @@ -243,51 +247,70 @@ def internal_multiclosure_dataset_loader_pca( pc_basis_mean=1, n_comp_mean=1, mean_covmat_pca=_covmat_mean, - sqrt_mean_covmat_pca = np.sqrt(_covmat_mean) + sqrt_mean_covmat_pca = np.sqrt(_covmat_mean), + expl_variance_ratio_single = expl_variance_ratio_single, + expl_variance_ratio_mean = expl_variance_ratio_mean )) - eighvals_list = [] eigvecs_list = [] eighvals_norm_list = [] + + for _cov in _covmats: eighvals, eigvecs, eighvals_norm = eigendecomposition(_cov) eighvals_list.append(eighvals) eigvecs_list.append(eigvecs) eighvals_norm_list.append(eighvals_norm) - max_comp_value = [] + - for elem in eighvals_norm_list: - n_comp = 1 - for _ in range(elem.shape[0]): - if np.sum(elem[:n_comp]) >= explained_variance_ratio: - break - n_comp += 1 + eighvals_mean, eigvecs_mean, eighvals_norm_mean = eigendecomposition(_covmat_mean) + + while True: + max_comp_value = [] + for elem in eighvals_norm_list: + n_comp = 1 + for _ in range(elem.shape[0]): + if np.sum(elem[:n_comp])/np.sum(elem) >= expl_variance_ratio_single: + break + n_comp += 1 + + max_comp_value.append(n_comp) - max_comp_value.append(n_comp) - n_comp = np.min(max_comp_value) + n_comp = np.min(max_comp_value) - eighvals_mean, eigvecs_mean, eighvals_norm_mean = eigendecomposition(_covmat_mean) - n_comp_mean = 1 - for _ in range(eighvals_norm_mean.shape[0]): - if np.sum(eighvals_norm_mean[:n_comp_mean]) >= explained_variance_ratio: + pc_basis = [] + covmat_pca = [] + for j,elem in enumerate(eighvals_list): + pc_basis.append(eigvecs_list[j][:, :n_comp]) + covmat_pca.append(pc_basis[-1].T @ _covmats[j] @ pc_basis[-1]) + + pc_basis = np.asarray(pc_basis) + covmat_pca_ = np.asarray(covmat_pca) + if np.all(np.linalg.cond(covmat_pca_)= expl_variance_ratio_mean: + break + n_comp_mean += 1 - - pc_basis_mean = eigvecs_mean[:, :n_comp_mean] - covmat_pca_mean = pc_basis_mean.T @ _covmat_mean @ pc_basis_mean - pc_basis = np.asarray(pc_basis) - covmat_pca_ = np.asarray(covmat_pca) + pc_basis_mean = eigvecs_mean[:, :n_comp_mean] + covmat_pca_mean = pc_basis_mean.T @ _covmat_mean @ pc_basis_mean + + if np.linalg.cond(covmat_pca_mean) < cond_num: + break + + print('avg not broken') + expl_variance_ratio_mean -= 0.03 if n_comp == 1: @@ -304,6 +327,8 @@ def internal_multiclosure_dataset_loader_pca( n_comp_mean=1, mean_covmat_pca = covmat_pca_mean, sqrt_mean_covmat_pca=np.sqrt(covmat_pca_mean), + expl_variance_ratio_single = expl_variance_ratio_single, + expl_variance_ratio_mean = expl_variance_ratio_mean )) else: @@ -321,6 +346,8 @@ def internal_multiclosure_dataset_loader_pca( n_comp_mean = n_comp_mean, mean_covmat_pca = covmat_pca_mean, sqrt_mean_covmat_pca = covmats.sqrt_covmat(covmat_pca_mean), + expl_variance_ratio_single = expl_variance_ratio_single, + expl_variance_ratio_mean = expl_variance_ratio_mean )) @@ -329,7 +356,7 @@ def internal_multiclosure_dataset_loader_pca( @check_multifit_replicas def internal_multiclosure_data_loader_pca( internal_multiclosure_data_loader, - explained_variance_ratio=0.99, + cond_num=10000, _internal_max_reps=None, _internal_min_reps=20, ): @@ -370,7 +397,9 @@ def internal_multiclosure_data_loader_pca( # diagonalize the mean covariance matrix and only keep the principal components # that explain the required variance - + expl_variance_ratio_single = 0.99 + expl_variance_ratio_mean = 0.99 + if _covmat_mean.shape == (): @@ -385,7 +414,9 @@ def internal_multiclosure_data_loader_pca( pc_basis_mean=1, n_comp_mean=1, mean_covmat_pca=_covmat_mean, - sqrt_mean_covmat_pca = np.sqrt(_covmat_mean) + sqrt_mean_covmat_pca = np.sqrt(_covmat_mean), + expl_variance_ratio_single = expl_variance_ratio_single, + expl_variance_ratio_mean = expl_variance_ratio_mean )) @@ -393,47 +424,79 @@ def internal_multiclosure_data_loader_pca( eigvecs_list = [] eighvals_norm_list = [] + + _corrmats = [] for _cov in _covmats: D = np.sqrt(np.diag(_cov)) _corrmat = _cov/np.outer(D,D) + _corrmats.append(_corrmat) eighvals, eigvecs, eighvals_norm = eigendecomposition(_corrmat) eighvals_list.append(eighvals) eigvecs_list.append(eigvecs) eighvals_norm_list.append(eighvals_norm) - - max_comp_value = [] - for elem in eighvals_norm_list: - n_comp = 1 - for _ in range(elem.shape[0]): - if np.sum(elem[:n_comp]) >= explained_variance_ratio: - break - n_comp += 1 - - max_comp_value.append(n_comp) - n_comp = np.max(max_comp_value) D_mean = np.sqrt(np.diag(_covmat_mean)) _corrmat_mean = _covmat_mean/np.outer(D_mean,D_mean) eighvals_mean, eigvecs_mean, eighvals_norm_mean = eigendecomposition(_corrmat_mean) - n_comp_mean = 1 - for _ in range(eighvals_norm_mean.shape[0]): - if np.sum(eighvals_norm_mean[:n_comp_mean]) >= explained_variance_ratio: + + while True: + + max_comp_value = [] + + for elem in eighvals_norm_list: + n_comp = 1 + for _ in range(elem.shape[0]): + if np.sum(elem[:n_comp])/np.sum(elem) >= expl_variance_ratio_single: + break + n_comp += 1 + + max_comp_value.append(n_comp) + n_comp = np.max(max_comp_value) + + + + pc_basis = [] + covmat_pca = [] + corrmat_pca = [] + for j,elem in enumerate(eighvals_list): + pc_basis.append(eigvecs_list[j][:, :n_comp]) + covmat_pca.append(pc_basis[-1].T @ _covmats[j] @ pc_basis[-1]) + corrmat_pca.append(pc_basis[-1].T @ _corrmats[j] @ pc_basis[-1]) + + + pc_basis = np.asarray(pc_basis) + covmat_pca_ = np.asarray(covmat_pca) + corrmat_pca_ = np.asarray(corrmat_pca) + + if np.all(np.linalg.cond(corrmat_pca_)= expl_variance_ratio_mean: + break + n_comp_mean += 1 - - pc_basis_mean = eigvecs_mean[:, :n_comp_mean] - covmat_pca_mean = pc_basis_mean.T @ _covmat_mean @ pc_basis_mean + pc_basis_mean = eigvecs_mean[:, :n_comp_mean] + covmat_pca_mean = pc_basis_mean.T @ _covmat_mean @ pc_basis_mean + corrmat_pca_mean = pc_basis_mean.T @ _corrmat_mean @ pc_basis_mean - pc_basis = np.asarray(pc_basis) - covmat_pca_ = np.asarray(covmat_pca) + print(f'data mean corrmat cond value: {np.linalg.cond(corrmat_pca_mean)}') + if np.linalg.cond(corrmat_pca_mean)