From 1bdb2ceba5770c2b7cd21f3bc811cb544ccdcdaa Mon Sep 17 00:00:00 2001 From: Abigail E Williams Date: Wed, 22 May 2024 14:59:55 +0100 Subject: [PATCH 1/3] Added functions to calculate mean and CI for forest cover --- main.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index d244b74..21187ed 100755 --- a/main.py +++ b/main.py @@ -5,6 +5,8 @@ import pandas as pd from geojson import Feature, FeatureCollection, Point, dumps +from statistics import mean, stdev +from math import sqrt def is_not_matchless(path: str) -> bool: return not path.endswith("_matchless.parquet") @@ -12,11 +14,58 @@ def is_not_matchless(path: str) -> bool: def round_float(arg): return round(float(arg), 6) +# Calculate proportion of forested pixels in one set of pairs +def calculate_proportions(matches_df, years): + proportions = [] + for year in years: + treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'treatment')].shape[0] + control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'control')].shape[0] + total_treat = matches_df[matches_df['treatment'] == 'treatment'].shape[0] + total_control = matches_df[matches_df['treatment'] == 'control'].shape[0] + + treat_prop = treat_forest / total_treat + control_prop = control_forest / total_control + + proportions.append({ + 'year': year, + 'treatment': treat_prop, + 'control': control_prop + }) + + return proportions + +# Calculate mean and SE forested proportions across all sets of pairs +def calculate_statistics(proportions_list, years): + treatment_means = {year: [] for year in years} + control_means = {year: [] for year in years} + + for proportions in proportions_list: + for prop in proportions: + year = prop['year'] + treatment_means[year].append(prop['treatment']) + control_means[year].append(prop['control']) + + mean_proportions = {} + se_proportions = {} + + for year in years: + treat_mean = mean(treatment_means[year]) * 100 + control_mean = mean(control_means[year]) * 100 + treat_se = (stdev(treatment_means[year]) / sqrt(len(treatment_means[year]))) * 100 + control_se = (stdev(control_means[year]) / sqrt(len(control_means[year]))) * 100 + + mean_proportions[year] = {'treatment': treat_mean, 'control': control_mean} + se_proportions[year] = {'treatment': treat_se, 'control': control_se} + + return mean_proportions, se_proportions + def run (root_matches_directory, partials_dir): all_pairs = glob.glob("*_pairs", root_dir=root_matches_directory) id = 0 + all_proportions = [] + for idx, pair in enumerate(all_pairs): matches_directory = os.path.join(root_matches_directory, pair) @@ -48,6 +97,9 @@ def run (root_matches_directory, partials_dir): points = [] + proportions = calculate_proportions(matches_df, years) + all_proportions.append(proportions) + for _, row in matches_df.iterrows(): treat_props = { "slope": row["k_slope"], @@ -106,7 +158,18 @@ def run (root_matches_directory, partials_dir): with open(out_path, "w", encoding="utf-8") as output_file: output_file.write(dumps(points_gc)) - + + # Calculate statistics + mean_proportions, se_proportions = calculate_statistics(all_proportions, years) + + print("Mean Proportions (%):") + for year, values in mean_proportions.items(): + print(f"{year}: Treatment = {values['treatment']:.2f}%, Control = {values['control']:.2f}%") + + print("Standard Errors (%):") + for year, values in se_proportions.items(): + print(f"{year}: Treatment SE = {values['treatment']:.2f}%, Control SE = {values['control']:.2f}%") + def main(): try: From 2026a0fa777db1051e0843c7fd0696910df4ca8b Mon Sep 17 00:00:00 2001 From: Abigail E Williams Date: Thu, 23 May 2024 11:33:12 +0100 Subject: [PATCH 2/3] Corrected indexing and file formatting --- main.py | 82 +++++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/main.py b/main.py index 21187ed..b83fd01 100755 --- a/main.py +++ b/main.py @@ -14,14 +14,14 @@ def is_not_matchless(path: str) -> bool: def round_float(arg): return round(float(arg), 6) -# Calculate proportion of forested pixels in one set of pairs +# Function for calculating proportion of forested pixels in each set of pairs def calculate_proportions(matches_df, years): proportions = [] for year in years: - treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'treatment')].shape[0] - control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'control')].shape[0] - total_treat = matches_df[matches_df['treatment'] == 'treatment'].shape[0] - total_control = matches_df[matches_df['treatment'] == 'control'].shape[0] + treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0] + control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0] + total_treat = matches_df.shape[0] + total_control = matches_df.shape[0] treat_prop = treat_forest / total_treat control_prop = control_forest / total_control @@ -34,38 +34,11 @@ def calculate_proportions(matches_df, years): return proportions -# Calculate mean and SE forested proportions across all sets of pairs -def calculate_statistics(proportions_list, years): - treatment_means = {year: [] for year in years} - control_means = {year: [] for year in years} - - for proportions in proportions_list: - for prop in proportions: - year = prop['year'] - treatment_means[year].append(prop['treatment']) - control_means[year].append(prop['control']) - - mean_proportions = {} - se_proportions = {} - - for year in years: - treat_mean = mean(treatment_means[year]) * 100 - control_mean = mean(control_means[year]) * 100 - treat_se = (stdev(treatment_means[year]) / sqrt(len(treatment_means[year]))) * 100 - control_se = (stdev(control_means[year]) / sqrt(len(control_means[year]))) * 100 - - mean_proportions[year] = {'treatment': treat_mean, 'control': control_mean} - se_proportions[year] = {'treatment': treat_se, 'control': control_se} - - return mean_proportions, se_proportions - def run (root_matches_directory, partials_dir): all_pairs = glob.glob("*_pairs", root_dir=root_matches_directory) id = 0 - all_proportions = [] - for idx, pair in enumerate(all_pairs): matches_directory = os.path.join(root_matches_directory, pair) @@ -90,15 +63,17 @@ def run (root_matches_directory, partials_dir): matches = glob.glob("*.parquet", root_dir=matches_directory) matches = [x for x in matches if is_not_matchless(x)] + all_proportions = [] + for pair_idx, pairs in enumerate(matches): matches_df = pd.read_parquet(os.path.join(matches_directory, pairs)) - years = [col.replace("k_luc_", "") for col in matches_df.columns if "k_luc_" in col] - points = [] - + # Calculate % forest cover proportions = calculate_proportions(matches_df, years) - all_proportions.append(proportions) + all_proportions.extend(proportions) + + points = [] for _, row in matches_df.iterrows(): treat_props = { @@ -159,16 +134,31 @@ def run (root_matches_directory, partials_dir): with open(out_path, "w", encoding="utf-8") as output_file: output_file.write(dumps(points_gc)) - # Calculate statistics - mean_proportions, se_proportions = calculate_statistics(all_proportions, years) - - print("Mean Proportions (%):") - for year, values in mean_proportions.items(): - print(f"{year}: Treatment = {values['treatment']:.2f}%, Control = {values['control']:.2f}%") - - print("Standard Errors (%):") - for year, values in se_proportions.items(): - print(f"{year}: Treatment SE = {values['treatment']:.2f}%, Control SE = {values['control']:.2f}%") + # Set up df for stats + prop_df = pd.DataFrame(all_proportions) + prop_df_grouped = prop_df.groupby('year') + + results = [] + + # Calculate mean and 95% confidence intervals + for year, group in prop_df_grouped: + for treatment in ['treatment', 'control']: + data = group[treatment] + mean_prop = data.mean() + std_dev = data.std() + count = data.count() + ci_95 = 1.96 * (std_dev / sqrt(count)) + + results.append({ + 'year': year, + 'treatment': treatment, + 'mean': mean_prop * 100, + 'ci_lower': (mean_prop - ci_95) * 100, + 'ci_upper': (mean_prop + ci_95) * 100 + }) + + results_df = pd.DataFrame(results) + results_df.to_json(os.path.join(output_directory, "forest-cover-stats.json"), orient="records") def main(): From d94b636057ce3f817f982d2f44e150a7be4656f8 Mon Sep 17 00:00:00 2001 From: Abigail E Williams Date: Fri, 24 May 2024 10:07:52 +0100 Subject: [PATCH 3/3] Fixed typo in calculate_proportions --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b83fd01..7152cff 100755 --- a/main.py +++ b/main.py @@ -19,7 +19,7 @@ def calculate_proportions(matches_df, years): proportions = [] for year in years: treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0] - control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0] + control_forest = matches_df[(matches_df[f's_luc_{year}'] == 1)].shape[0] total_treat = matches_df.shape[0] total_control = matches_df.shape[0]