From 1bdb2ceba5770c2b7cd21f3bc811cb544ccdcdaa Mon Sep 17 00:00:00 2001
From: Abigail E Williams <abbyevewilliams@gmail.com>
Date: Wed, 22 May 2024 14:59:55 +0100
Subject: [PATCH 1/3] Added functions to calculate mean and CI for forest cover

---
 main.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index d244b74..21187ed 100755
--- a/main.py
+++ b/main.py
@@ -5,6 +5,8 @@
 
 import pandas as pd
 from geojson import Feature, FeatureCollection, Point, dumps
+from statistics import mean, stdev
+from math import sqrt
 
 def is_not_matchless(path: str) -> bool:
     return not path.endswith("_matchless.parquet")
@@ -12,11 +14,58 @@ def is_not_matchless(path: str) -> bool:
 def round_float(arg):
     return round(float(arg), 6)
 
+# Calculate proportion of forested pixels in one set of pairs
+def calculate_proportions(matches_df, years):
+    proportions = []
+    for year in years:
+        treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'treatment')].shape[0]
+        control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'control')].shape[0]
+        total_treat = matches_df[matches_df['treatment'] == 'treatment'].shape[0]
+        total_control = matches_df[matches_df['treatment'] == 'control'].shape[0]
+
+        treat_prop = treat_forest / total_treat
+        control_prop = control_forest / total_control
+
+        proportions.append({
+            'year': year,
+            'treatment': treat_prop,
+            'control': control_prop
+        })
+
+    return proportions
+
+# Calculate mean and SE forested proportions across all sets of pairs
+def calculate_statistics(proportions_list, years):
+    treatment_means = {year: [] for year in years}
+    control_means = {year: [] for year in years}
+
+    for proportions in proportions_list:
+        for prop in proportions:
+            year = prop['year']
+            treatment_means[year].append(prop['treatment'])
+            control_means[year].append(prop['control'])
+
+    mean_proportions = {}
+    se_proportions = {}
+
+    for year in years:
+        treat_mean = mean(treatment_means[year]) * 100
+        control_mean = mean(control_means[year]) * 100
+        treat_se = (stdev(treatment_means[year]) / sqrt(len(treatment_means[year]))) * 100
+        control_se = (stdev(control_means[year]) / sqrt(len(control_means[year]))) * 100
+
+        mean_proportions[year] = {'treatment': treat_mean, 'control': control_mean}
+        se_proportions[year] = {'treatment': treat_se, 'control': control_se}
+
+    return mean_proportions, se_proportions
+
 def run (root_matches_directory, partials_dir):
   all_pairs = glob.glob("*_pairs", root_dir=root_matches_directory)
 
   id = 0
 
+  all_proportions = []
+
   for idx, pair in enumerate(all_pairs):
       matches_directory = os.path.join(root_matches_directory, pair)
 
@@ -48,6 +97,9 @@ def run (root_matches_directory, partials_dir):
 
           points = []
 
+          proportions = calculate_proportions(matches_df, years)
+          all_proportions.append(proportions)
+
           for _, row in matches_df.iterrows():
               treat_props = { 
                   "slope": row["k_slope"],
@@ -106,7 +158,18 @@ def run (root_matches_directory, partials_dir):
 
           with open(out_path, "w", encoding="utf-8") as output_file:
               output_file.write(dumps(points_gc))
-              
+
+        # Calculate statistics
+        mean_proportions, se_proportions = calculate_statistics(all_proportions, years)
+
+      print("Mean Proportions (%):")
+      for year, values in mean_proportions.items():
+        print(f"{year}: Treatment = {values['treatment']:.2f}%, Control = {values['control']:.2f}%")
+
+      print("Standard Errors (%):")
+      for year, values in se_proportions.items():
+        print(f"{year}: Treatment SE = {values['treatment']:.2f}%, Control SE = {values['control']:.2f}%")
+
 
 def main():
     try:

From 2026a0fa777db1051e0843c7fd0696910df4ca8b Mon Sep 17 00:00:00 2001
From: Abigail E Williams <abbyevewilliams@gmail.com>
Date: Thu, 23 May 2024 11:33:12 +0100
Subject: [PATCH 2/3] Corrected indexing and file formatting

---
 main.py | 82 +++++++++++++++++++++++++--------------------------------
 1 file changed, 36 insertions(+), 46 deletions(-)

diff --git a/main.py b/main.py
index 21187ed..b83fd01 100755
--- a/main.py
+++ b/main.py
@@ -14,14 +14,14 @@ def is_not_matchless(path: str) -> bool:
 def round_float(arg):
     return round(float(arg), 6)
 
-# Calculate proportion of forested pixels in one set of pairs
+# Function for calculating proportion of forested pixels in each set of pairs
 def calculate_proportions(matches_df, years):
     proportions = []
     for year in years:
-        treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'treatment')].shape[0]
-        control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1) & (matches_df['treatment'] == 'control')].shape[0]
-        total_treat = matches_df[matches_df['treatment'] == 'treatment'].shape[0]
-        total_control = matches_df[matches_df['treatment'] == 'control'].shape[0]
+        treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0]
+        control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0]
+        total_treat = matches_df.shape[0]
+        total_control = matches_df.shape[0]
 
         treat_prop = treat_forest / total_treat
         control_prop = control_forest / total_control
@@ -34,38 +34,11 @@ def calculate_proportions(matches_df, years):
 
     return proportions
 
-# Calculate mean and SE forested proportions across all sets of pairs
-def calculate_statistics(proportions_list, years):
-    treatment_means = {year: [] for year in years}
-    control_means = {year: [] for year in years}
-
-    for proportions in proportions_list:
-        for prop in proportions:
-            year = prop['year']
-            treatment_means[year].append(prop['treatment'])
-            control_means[year].append(prop['control'])
-
-    mean_proportions = {}
-    se_proportions = {}
-
-    for year in years:
-        treat_mean = mean(treatment_means[year]) * 100
-        control_mean = mean(control_means[year]) * 100
-        treat_se = (stdev(treatment_means[year]) / sqrt(len(treatment_means[year]))) * 100
-        control_se = (stdev(control_means[year]) / sqrt(len(control_means[year]))) * 100
-
-        mean_proportions[year] = {'treatment': treat_mean, 'control': control_mean}
-        se_proportions[year] = {'treatment': treat_se, 'control': control_se}
-
-    return mean_proportions, se_proportions
-
 def run (root_matches_directory, partials_dir):
   all_pairs = glob.glob("*_pairs", root_dir=root_matches_directory)
 
   id = 0
 
-  all_proportions = []
-
   for idx, pair in enumerate(all_pairs):
       matches_directory = os.path.join(root_matches_directory, pair)
 
@@ -90,15 +63,17 @@ def run (root_matches_directory, partials_dir):
       matches = glob.glob("*.parquet", root_dir=matches_directory)
       matches = [x for x in matches if is_not_matchless(x)]
 
+      all_proportions = []
+
       for pair_idx, pairs in enumerate(matches):
           matches_df = pd.read_parquet(os.path.join(matches_directory, pairs))
-
           years = [col.replace("k_luc_", "") for col in matches_df.columns if "k_luc_" in col]
 
-          points = []
-
+          # Calculate % forest cover
           proportions = calculate_proportions(matches_df, years)
-          all_proportions.append(proportions)
+          all_proportions.extend(proportions)
+
+          points = []
 
           for _, row in matches_df.iterrows():
               treat_props = { 
@@ -159,16 +134,31 @@ def run (root_matches_directory, partials_dir):
           with open(out_path, "w", encoding="utf-8") as output_file:
               output_file.write(dumps(points_gc))
 
-        # Calculate statistics
-        mean_proportions, se_proportions = calculate_statistics(all_proportions, years)
-
-      print("Mean Proportions (%):")
-      for year, values in mean_proportions.items():
-        print(f"{year}: Treatment = {values['treatment']:.2f}%, Control = {values['control']:.2f}%")
-
-      print("Standard Errors (%):")
-      for year, values in se_proportions.items():
-        print(f"{year}: Treatment SE = {values['treatment']:.2f}%, Control SE = {values['control']:.2f}%")
+     # Set up df for stats
+      prop_df = pd.DataFrame(all_proportions)
+      prop_df_grouped = prop_df.groupby('year')
+
+      results = []
+
+     # Calculate mean and 95% confidence intervals
+      for year, group in prop_df_grouped:
+        for treatment in ['treatment', 'control']:
+            data = group[treatment]
+            mean_prop = data.mean()
+            std_dev = data.std()
+            count = data.count()
+            ci_95 = 1.96 * (std_dev / sqrt(count))
+
+            results.append({
+                'year': year,
+                'treatment': treatment,
+                'mean': mean_prop * 100,
+                'ci_lower': (mean_prop - ci_95) * 100,
+                'ci_upper': (mean_prop + ci_95) * 100
+            })
+
+      results_df = pd.DataFrame(results)
+      results_df.to_json(os.path.join(output_directory, "forest-cover-stats.json"), orient="records") 
 
 
 def main():

From d94b636057ce3f817f982d2f44e150a7be4656f8 Mon Sep 17 00:00:00 2001
From: Abigail E Williams <abbyevewilliams@gmail.com>
Date: Fri, 24 May 2024 10:07:52 +0100
Subject: [PATCH 3/3] Fixed typo in calculate_proportions

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index b83fd01..7152cff 100755
--- a/main.py
+++ b/main.py
@@ -19,7 +19,7 @@ def calculate_proportions(matches_df, years):
     proportions = []
     for year in years:
         treat_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0]
-        control_forest = matches_df[(matches_df[f'k_luc_{year}'] == 1)].shape[0]
+        control_forest = matches_df[(matches_df[f's_luc_{year}'] == 1)].shape[0]
         total_treat = matches_df.shape[0]
         total_control = matches_df.shape[0]