From 09161bb9df99af8a2679c99a70072e00ddad8006 Mon Sep 17 00:00:00 2001 From: Favour Date: Wed, 14 Aug 2024 16:04:18 +0100 Subject: [PATCH] cleaned up files --- .../feature_extractor.cpython-310.pyc | Bin 8083 -> 0 bytes .../model_predictor.cpython-310.pyc | Bin 4204 -> 0 bytes src/prep/prep_becd.py | 138 -------- src/prep/prep_carbenmats.py | 224 ------------- src/prep/prep_clf.py | 136 -------- src/prep/scrapers/building_desc_scraper.py | 74 ----- src/prep/scrapers/fcbs_data_scraper.bas | 310 ------------------ 7 files changed, 882 deletions(-) delete mode 100644 src/__pycache__/feature_extractor.cpython-310.pyc delete mode 100644 src/__pycache__/model_predictor.cpython-310.pyc delete mode 100644 src/prep/prep_becd.py delete mode 100644 src/prep/prep_carbenmats.py delete mode 100644 src/prep/prep_clf.py delete mode 100644 src/prep/scrapers/building_desc_scraper.py delete mode 100644 src/prep/scrapers/fcbs_data_scraper.bas diff --git a/src/__pycache__/feature_extractor.cpython-310.pyc b/src/__pycache__/feature_extractor.cpython-310.pyc deleted file mode 100644 index 6c640bc1e067bfa9a0015f027e755a23170cc21b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8083 zcma)B+ix7#d7s*^4N-KzA`` z&V9}~^L@YX_k9zVN|u7(&vO5|`7f_2${(rl^iN0OZCufRAYlqqeZ@}is;$bhW^3}U z+q%3Pwju8sJ0tH|JBPdGo1MI!S81&7Tb+Vkka@!|c1m`sQ?|<}tIiaz3a%>4u=pf0;{rV+!xt7Hp@;tQtgv$j?JUy z6kA}6xSw`sH&u3$okH6gcAA|*-C0?8mc4+w7g&wGh`Jh6*OmH9Ut`b8YF$nAy^8^E zxohEW5WBptB{SgX{J59J#AWw&$N=62pe%{oS7VVPCC>;{ci$lb=C+iY}L zlI=P?!pg8n=0#4^cl#B`_m^&afp_cn(ycd_9$c(jiPZ@FZle=2*H1E0*JO=@jx{qCLj8~5(4t*n2g@F|Rv z$>Wb(Z*6>l)kPaOoCo19Us`bj&-FJ}+-L`LZoJRKZMPLi8y^JT19bD^{f+C_?$EV# zGl;zQR=g4M*2bpm#JkwD`w(kv#Ua1Y-A`t*!`O3tukSXv8)5pE8_m#pmywuyMKv`O zS4B0R@G7e6W`E%w9!62^CcVtS@oQJO>(tJ7F4p^VS9d+1c|p7OzRSIiiwe)r z$F)~oKinJE_N(vsPM?;t=EPoDJOAB!dsF`Y{c=miMn*rG}0XmSJFNMCrJZ_h$S7SHm(F9RaIyT$0yP1}|Io9O|w5=?XvTCV@%Foh_fRqhnK&ad018$b}_Yw7CI}#_2-JSv|*J zrHORJ|RP3j^+#L?XVLpuI%b!C>Q#CYG zE2~*4El>mvN2KvL(DZ++K@F|26_6OA9H}@c*b1gUQiQr9k4>tAvHDo89h-xskdApB zS467R1{B=42C7iEq5hC4Na42n_gasXUA;v8ZDaM$2dlPzW#zrPW*glwYRL0|mD4V~ z&R<8DXZLC?`(6~c!cO8YS0w1(~d4I`(X0_ zz?p1%z8u^`Nv6{A<5Pe3vBUQ9{G?h#MJ$sd{w2y^lj5}q5=XydqPL1#=$UdvVy6ln zC-Pkg63zz@X~eL}!Vubm0)fkrCJOyP*35&#%u{Vv=n%;4FSSJ+a+|ENzRC%WzorCQ z2D9pWq>}-2ACfXs0zA@vHyBEqU5Fh{x+E2{u!pYUUL%49OES{F+Sagx#~#}G2!*&o z*&L*mGS41NbrVxG0@r~7OlP&RnA8BOCgiTH&H;jPL|0IMOk(J`#)|n{6zFsSypTAV z)5vtloX!luu`csaJE$vab~Jv5k-5n(LpE6s<@3Ve=b4GR{IU{QsI^dg5w#atA*~(u zd|PO;?QJ=Oh8b|4!~X9w3w?)m_2O!hT?a)7dEZ*!Z7!vGK8^Lt!+$*gUI+=O7B5rRgOOut=< z5wLI4t~;+5;siP0;PcYRr5P!lN6xo7Kx*I9;IqL*1#_IOS9 z$UzidlC$QHRK6y25UuEv%)!9%wOBjUnaT2P)brKn=7m=QfxtWv40`g0EEiItVMSTD zDeFo@L83jBFxM~SGYXVD_Kd-?8EE$5$a;Cz&b$w56ZNO*R=ehOCz8{jL$OP|?5gW@ zCiS+#oz1wP@6w1CM)XTaWydCU?3dQiK9v~S2&ZW~e(%Ilr z1e-p10e06IiS5rYe{>azsg^YZaDu;1pi;pFX;ZZ)bd_ZtxHj%a!Tb}_eEGjMux@IB z3=;Qy57V>=az?*X$Pks1*78kAV5|UmRlsaU@`GFqStOtqkfwtSh?6}2?_ z#0fDas>@36N08KEE}junc-B-riyAUWQ({WGgvU^w3^Lh3$pZ)_B@f7?JRr6H(uUO4 zp{eJ8fli4IIDE#oaALdvjO)K>&S|Cp^3eMl$;XI=e~YbueCm3q88Q#Be9hs_FsPm1 zX&P{3e?D8R|0~?83qKKxW2EO6g)g-hwgM3VmklaIKS!bTd*qQ+;k5Nom4xuvfbX zve_6_*~I|di40QgxluAR;gb_W^d=T#TP;9cv`dnjEE(3qxU}AOuEqDW3v(2EF(v-qY>CVA&1_AQ7eK|7uAX`oqJW&EHG^F zY;_v7Mf5fRPueFY%piK}mJaaoFx4?;NYQ{iF4C0f=MsF#?7{uii0KR&H_Q@21eA_E zHUsLLfztwjA~pY9>6hcoVfHhy^g#~Lkc&-`BWO^SdrEJ?P~!YPaE<=dO8v;)?k()W zgaT6F%nVEblAIT?k~sjwzyA}!NB(p2`HB*ruA*!W3UTRE1pzDyjhUiAYdTQ*Z$u_8 zi#*FMD9k+2;wj|w^GaNyxehd9s)M2^N*AjHu3Hr_ih#p1|3Z`xr0Wwa?^bU`7?|bcz)IWsKgH$vM4Fc@8Dq8)l=}kStFQoD|G>)Q z9Kh;u{)jw~KL~?W78OjjU?1`;z{qjb${v0q8;~4{^ z2fKe?0%KeGN9h(#q!Ydh!6I)^@{GG@EtCH&U7$Zw^DQL(rI8>%2NzGdLgR7#BF22{ z>gcJ`eftHiE>E!i8E82j--i&S9wM$ofA=d@a)aSY6K!V?HQ%QN$@QiIndP<{0E_sg z|D<_o``RZbmHw;$pVf|reoab}Iyy7~|7#@u?8iUdU>EKua3%T6s7MU3W4CWPwdRm@ zONw4!NHkzZj!p|CB(ZM3vwCOk`n49)@StFVws<()g^0h6`Rp=S6T+5h6y;%@WYbo=m_}YG zwt`yKlX8@p<7u<0pg9ddEQ6sDY*r8lokOIC_?rP}ENj!M@rm{)vq%7oo=+^Js7nCV_#;f% zB6c?Vk<(7@v;<5B!1Mrw6m2|2V2dI*2kPg5&jSU}rKYfX|5t#;oG{=KoDw;vLg`QU zsv;*x7zD9^S`*%c`6U4Hclsbta0(u$#F_0Jz{nC7K&K7xgG(t2#McM_ztF+*ES5Vg zh~kk7Zm5Cn6-5a!P0;owfUX4Im|G-Zo|2_9aux8kYFq|Ts)K3dX8@OFfE;nQa{$>{ zu(=8#63h$hniF%9nau$7=f%93250+B%!4P&nbA6neWMi+TNSf|ncgPYTn60jOEAZ* zfVTp@Jt1ajw{2+@R3%V*Kj>9IRX$ZWHTgvY`=YaxCT2nIXaOr-#Q4Qc3eyx;`E49} zKVO4DORtvqsH|-?`ClOe<-3fVgcD8yj^{9i-r)?ch(cVF3RVU{l+@c>R6YZ~5V|ZX zJ2F7@6hY)W7%{>@!j-Qyy99YBkp_)|3fY!a*;Jjzpd)GH6-Y=#(c`S5X$z|MO;(#g zHh{#b)@|%4ZxCHlpz)Y}ar_zebXV@@0x>+7Y=-#IICDwZXKY zV%oSQIoG&EuwcMd)1auCI7e|fKn;Kdcj8zISWjMLW9uL@FylPbw+!e&dGamHv%Co2EvsaQMNte}e%) zCJ9h4rbkEG%1=;2Za*jL&EG+iWUv%?a$3+$3BW~^VP7RBz@db`k6@V>29ZPq`^2|> ztBmAl%2ccA696Y!l)-=b)h|_D63O(n$QSeq6(veg=r~1=8!FcM@|bzmmwQ}~4bRkH%d@2+&EGPwAuVa+>&S+5 z@O7mtoA@?mOK#!Yl5M$-@0RSyTllsm|Cn`m{)rRVen4&gY@y@geTp~x5oauCLw;rG z*37MIe$8jqoLxY+Yh%Xsnq6>dU{q1k$i&E8tXUXYi;<0yoi_?gHZG0dZOCQU7DOF?hje^?hDQt{XWGD9^(d&dxNL9!DHUwY2M&D+u*Tp@U(95{C<_E za0<6*7OjB=-MZ8Lmca8o^`*#?BXey^mBn41+xD9VF2i$fEqb_ukVF>3#EONfsR)=eG=FBIuV;0 z1-oL`>>2yz$34W{%qq-+pD}e%@WOn|X7&-AA=a?|*@A<4Gy_*r;c{E>b;Mnm^nMvJSjgD5c3)Q^o0jkQyM51pJ9tS~Mqfxv|e zj-FLmLqpxpP026$Z}^2-5X>6>YtW=t=|?JrPY}qinD&3*FtuIxL=3bg{GQJU0#7ZB z6mp`p<0?N&(D|wy4sbKSGXkNCkcCs|Gp5hqiKC+gtBZLNWg;An^YdNt^f(MoL^PPQ z);AF&KM#%tJWdN+-zVGcTH1DH$EQvEHN9u+{mRxaD4V>eRI0kRdIUSFZEDCLOIM56 zmxHpUxkHKmrep0aFI4&M+hENIr(4YyZx{wQfS87D8H(I3u+Y>`105P`CwhWT1Y*=v>jpbh&N%SsA>!d~)Ko3_9U#su`;A7B@`Tk7ffx z!TKTl5fH3^z$NQhGd|Nf4M4z<&yY(lEm)CX7#C)3%jqAq_#cExsQf$>z6jFMq1O6R ztgq0%HQvAKly!Ad{?=E0wJOyuta}y-xuLbPIhQo^_mlJ35PnYF5b{C?s4 z6f;z5qXy@nL3;&9=D#~L0iu(JKj)=0KL|ke0Qyo7 zsrfE7_tBJQ;wL~mfxfr1DV+KiO`F(Z!)WRoSojs*j3BS=QvU1kwxJ!K*4B(h&c67o z^>YjsWt>162d9UDs8IWMVZwm-DFadfJZ5KHy^+JXh4Dv*VolGXXa32i%y`;4wJz8N zmy{pv>A&=ify|Ml&{(OZ{38GpGFDXMq0W@iRq>BYqz1gSm= lower_bound) & (becd_df['Total Embodied Carbon PER m2'] <= upper_bound)] - -""" -11. Save Cleaned DataFrame to CSV for inspection. -""" -# Save dataframe to CSV for modeling -becd_df_PATH = os.path.join(export_dir, 'inspect/cleaned_becd.csv') -becd_df.to_csv(becd_df_PATH, index=False) -becd_df.info() - -""" -12. Label encode categorical data for ML use. -""" -# Label encode categorical columns -label_encoder = LabelEncoder() -categorical_columns = becd_df.select_dtypes(include=['object']).columns - -for col in categorical_columns: - becd_df[col] = label_encoder.fit_transform(becd_df[col]) - - -""" -13. Save dataframe to CSV for modeling. -""" -becd_df_PATH = os.path.join(export_dir, 'encoded/encoded_becd.csv') -becd_df.to_csv(becd_df_PATH, index=False) -becd_df.info() \ No newline at end of file diff --git a/src/prep/prep_carbenmats.py b/src/prep/prep_carbenmats.py deleted file mode 100644 index 3f60f77..0000000 --- a/src/prep/prep_carbenmats.py +++ /dev/null @@ -1,224 +0,0 @@ -import pandas as pd -import os -import numpy as np -from sklearn.impute import IterativeImputer, SimpleImputer -from sklearn.preprocessing import LabelEncoder - -# Define the base directory and data paths -current_dir = os.path.dirname(os.path.abspath(__file__)) -data_dir = os.path.join(current_dir, "../../data/raw") -export_dir = os.path.join(current_dir, "../../data/processed") - -os.makedirs(export_dir, exist_ok=True) - -CARB_EN_MATS_PATH = os.path.join(data_dir, "model/CarbEnMats_dataset.xlsx") - -carbenmats_df = pd.read_excel(CARB_EN_MATS_PATH) - -""" -2. Clean the Dataset -""" - -# Define a list of values to be replaced with NaN -na_values = ["n/a", "N/a", "N/A", "No data", "no data"] -# Replace these values with NaN -carbenmats_df.replace(na_values, np.nan, inplace=True) - - -# Define a function to calculate the median from an interval -def median_from_interval(interval): - if pd.isna(interval): - return np.nan - try: - start, end = map(float, interval.split("-")) - return (start + end) / 2 - except: - return np.nan - - -# Fill missing values -carbenmats_df["bldg_area_gfa"] = carbenmats_df["bldg_area_gfa"].fillna( - carbenmats_df["bldg_area_interval"].apply(median_from_interval) -) # Fill missing areas with median from interval -carbenmats_df["bldg_floors_ag"] = carbenmats_df["bldg_floors_ag"].fillna( - carbenmats_df["bldg_floors_ag_interval"].apply(median_from_interval) -) # Fill missing storeys with median from interval -carbenmats_df["bldg_users_total"] = carbenmats_df["bldg_users_total"].fillna( - np.nan -) # Fill missing with nan -carbenmats_df["bldg_floors_bg"] = carbenmats_df["bldg_floors_bg"].fillna( - np.nan -) # Fill missing with nan - -""" -3. Fill and impute missing values. -""" -# Define categorical and numerical columns -categorical_cols = [ - "bldg_project_type", - "bldg_use_type", - "bldg_use_subtype", - "site_region_world", - "site_country", - "site_region_local", - "bldg_struct_type", - "bldg_roof_type", -] -numerical_cols = [ - "bldg_area_gfa", - "bldg_users_total", - "bldg_floors_ag", - "bldg_floors_bg", -] - -# Fill missing values in categorical columns with 'missing' -imp_categorical = SimpleImputer(strategy="constant", fill_value="missing") -carbenmats_df[categorical_cols] = imp_categorical.fit_transform( - carbenmats_df[categorical_cols] -) - -# Convert categorical columns to numerical values using LabelEncoder -encoders = {col: LabelEncoder() for col in categorical_cols} -for col in categorical_cols: - carbenmats_df[col] = encoders[col].fit_transform(carbenmats_df[col].astype(str)) - -# Use Iterative Imputer for numerical columns -imp_numerical = IterativeImputer(max_iter=10, random_state=0) -carbenmats_df[numerical_cols] = imp_numerical.fit_transform( - carbenmats_df[numerical_cols] -) - -# Convert categorical columns back to their original data types -for col in categorical_cols: - carbenmats_df[col] = encoders[col].inverse_transform(carbenmats_df[col].astype(int)) - -""" -4. Select and Rename relevant columns -""" -# Calculate total carbon -ghg_columns = [ - "GHG_A123_m2a", - "GHG_A45_m2a", - "GHG_B1234_m2a", - "GHG_B5_m2a", - "GHG_B67_m2a", - "GHG_C12_m2a", - "GHG_C34_m2a", - "GHG_D_m2a", -] -carbenmats_df["Total_Embodied_Carbon"] = carbenmats_df[ghg_columns].sum(axis=1) - -# Calculate kg CO2e per square meter - need to multiply by reference study period, as values are stored as per m2 per year -carbenmats_df["Total Embodied Carbon PER m2"] = ( - carbenmats_df["Total_Embodied_Carbon"] * carbenmats_df["lca_RSP"] -) - -carbenmats_df = carbenmats_df[ - [ - "bldg_project_type", - "bldg_use_type", - "bldg_use_subtype", - "site_region_world", - "site_country", - "site_region_local", - "bldg_users_total", - "bldg_floors_ag", - "bldg_floors_bg", - "bldg_struct_type", - "bldg_roof_type", - "Total Embodied Carbon PER m2", - ] -] - -# Rename columns for better inspection -carbenmats_df.rename( - columns={ - "bldg_project_type": "Building Project Type", - "bldg_use_type": "Building Use Type", - "bldg_use_subtype": "Building Use Subtype", - "site_region_world": "Continent", - "site_country": "Country", - "site_region_local": "City", - "bldg_users_total": "Total Users", - "bldg_floors_ag": "Floors Above Ground", - "bldg_floors_bg": "Floors Below Ground", - "bldg_struct_type": "Structure Type", - "bldg_roof_type": "Roof Type", - }, - inplace=True, -) - -updated_categorical_cols = [ - "Building Project Type", - "Building Use Type", - "Building Use Subtype", - "Continent", - "Country", - "City", - "Structure Type", - "Roof Type", -] - -""" -5. Drop rows with any remaining NaN values, and drop rows with "0" embodied carbon. -""" -carbenmats_df = carbenmats_df.dropna() -carbenmats_df = carbenmats_df[carbenmats_df["Total Embodied Carbon PER m2"] != 0] - -""" -6. Remove Outliers -""" -# Calculate Q1, Q3, and IQR for Total_Embodied_Carbon -Q1 = carbenmats_df["Total Embodied Carbon PER m2"].quantile(0.25) -Q3 = carbenmats_df["Total Embodied Carbon PER m2"].quantile(0.75) -IQR = Q3 - Q1 - -# Define the lower and upper bounds for outliers -lower_bound = Q1 - 1.5 * IQR -upper_bound = Q3 + 1.5 * IQR - -# Remove outliers -carbenmats_df = carbenmats_df[ - (carbenmats_df["Total Embodied Carbon PER m2"] >= lower_bound) - & (carbenmats_df["Total Embodied Carbon PER m2"] <= upper_bound) -] - -""" -7. Drop categorical columns with more than 40% 'missing' values -""" -threshold = 0.40 -categorical_cols_to_drop = [ - col - for col in updated_categorical_cols - if (carbenmats_df[col] == "missing").mean() > threshold -] -carbenmats_df.drop(columns=categorical_cols_to_drop, inplace=True) - -# Update the list of categorical columns after dropping -updated_categorical_cols = [ - col for col in updated_categorical_cols if col not in categorical_cols_to_drop -] - -""" -8. Save Cleaned DataFrame to CSV for inspection. -""" -# Save the cleaned dataframe to a CSV file -carbenmats_df_PATH = os.path.join(export_dir, "inspect/cleaned_carbenmats.csv") -carbenmats_df.to_csv(carbenmats_df_PATH, index=False) -carbenmats_df.info() - -""" -9. Label encode categorical data for ML use. -""" -# Label encode categorical columns -label_encoders = {} -for col in updated_categorical_cols: - label_encoders[col] = LabelEncoder() - carbenmats_df[col] = label_encoders[col].fit_transform(carbenmats_df[col]) - -""" -10. Save dataframe to CSV for modeling. -""" -carbenmats_df_PATH = os.path.join(export_dir, "encoded/encoded_carbenmats.csv") -carbenmats_df.to_csv(carbenmats_df_PATH, index=False) -carbenmats_df.info() diff --git a/src/prep/prep_clf.py b/src/prep/prep_clf.py deleted file mode 100644 index 4967e93..0000000 --- a/src/prep/prep_clf.py +++ /dev/null @@ -1,136 +0,0 @@ -import pandas as pd -import os -import numpy as np -from sklearn.preprocessing import LabelEncoder - -# Define the base directory and data paths -current_dir = os.path.dirname(os.path.abspath(__file__)) -data_dir = os.path.join(current_dir, "../../data/raw") -export_dir = os.path.join(current_dir, "../../data/processed") - -os.makedirs(export_dir, exist_ok=True) - -CLF_EMBODIED_CARBON_PATH = os.path.join( - data_dir, "model/CLF Embodied Carbon_Cleaned.csv" -) - -clf_df = pd.read_csv(CLF_EMBODIED_CARBON_PATH) - -""" -2. Clean the Datasets -""" -#### CLF -# Convert relevant columns to numeric, forcing errors to NaN -cols_to_convert = [ - "Minimum Building Area in Square Meters", - "Maximum Building Area in Square Meters", - "Minimum Building Storeys", - "Maximum Building Storeys", -] - -for col in cols_to_convert: - clf_df[col] = pd.to_numeric(clf_df[col], errors="coerce") - -# Replace np.inf and NaN with the maximum observed value in the respective columns -for col in cols_to_convert: - clf_df[col].replace(np.inf, np.nan, inplace=True) - max_value = clf_df[col].max(skipna=True) - clf_df[col].fillna(max_value, inplace=True) - -# Calculate average area and storeys -clf_df["Average Building Area in Square Meters"] = ( - clf_df["Minimum Building Area in Square Meters"] - + clf_df["Maximum Building Area in Square Meters"] -) / 2 -clf_df["Average Building Storeys"] = ( - clf_df["Minimum Building Storeys"] + clf_df["Maximum Building Storeys"] -) / 2 - -# Drop the minimum and maximum columns if they are no longer needed -clf_df = clf_df.drop( - columns=[ - "Minimum Building Area in Square Meters", - "Maximum Building Area in Square Meters", - "Minimum Building Storeys", - "Maximum Building Storeys", - ] -) - -# Select relevant columns -clf_df = clf_df[ - [ - "Building Type", - "Building Use", - "Building Location Region", - "Building New or Renovation", - "Average Building Storeys", - "Embodied Carbon Life Cycle Assessment Area Per Square Meter", - ] -] - -clf_df = clf_df.dropna() - -clf_df = clf_df.rename( - columns={ - "Building Type": "Building Use Type", - "Building Use": "Building Use Subtype", - "Building Location Region": "Continent", - "Building New or Renovation": "Building Project Type", - "Average Building Storeys": "Floors Above Ground", - "Embodied Carbon Life Cycle Assessment Area Per Square Meter": "Total Embodied Carbon PER m2", - } -) - -# Replace all values in "Building_Use_Type" that are not equal to "Residential" with "Non-residential" -clf_df["Building Use Type"] = clf_df["Building Use Type"].apply( - lambda x: "Residential" if x == "Residential" else "Non-residential" -) - -""" -3. Remove Outliers -""" -# Calculate Q1, Q3, and IQR for Total_Embodied_Carbon -Q1 = clf_df["Total Embodied Carbon PER m2"].quantile(0.25) -Q3 = clf_df["Total Embodied Carbon PER m2"].quantile(0.75) -IQR = Q3 - Q1 - -# Define the lower and upper bounds for outliers -lower_bound = Q1 - 1.5 * IQR -upper_bound = Q3 + 1.5 * IQR - -# Remove outliers -clf_df = clf_df[ - (clf_df["Total Embodied Carbon PER m2"] >= lower_bound) - & (clf_df["Total Embodied Carbon PER m2"] <= upper_bound) -] - -""" -4. Save dataframe to CSV for inspection. -""" -clf_df_PATH = os.path.join(export_dir, "inspect/cleaned_clf.csv") -clf_df.to_csv(clf_df_PATH, index=False) -clf_df.info() - -""" -5. Label encode categorical data for ML use. -""" -# Define categorical columns -categorical_cols = [ - "Building Use Type", - "Building Use Subtype", - "Continent", - "Building Project Type", -] - -# Label encode categorical columns -label_encoders = {} -for col in categorical_cols: - label_encoders[col] = LabelEncoder() - clf_df[col] = label_encoders[col].fit_transform(clf_df[col]) - -""" -6. Save dataframe to CSV for modeling. -""" -clf_df_PATH = os.path.join(export_dir, "encoded/encoded_clf.csv") -clf_df.to_csv(clf_df_PATH, index=False) -clf_df.info() diff --git a/src/prep/scrapers/building_desc_scraper.py b/src/prep/scrapers/building_desc_scraper.py deleted file mode 100644 index ed8e56f..0000000 --- a/src/prep/scrapers/building_desc_scraper.py +++ /dev/null @@ -1,74 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import pandas as pd -import os - -# Function to scrape article links from a page -def get_article_links(page_url): - response = requests.get(page_url) - soup = BeautifulSoup(response.text, 'html.parser') - article_links = [] - - for article in soup.find_all('article'): - link = article.find('a', href=True) - if link and "eventsguide" not in link['href'] and "dezeenjobs" not in link['href']: - article_links.append(link['href']) - - return article_links - -# Function to scrape the title and main text from an article -def scrape_article_details(article_url): - response = requests.get(article_url) - soup = BeautifulSoup(response.text, 'html.parser') - - # Extract title - title_tag = soup.find('a', href=article_url) - title = title_tag.text if title_tag else 'No title found' - - # Extract article text - article_text = '' - article_section = soup.find('article') - if article_section: - for paragraph in article_section.find_all('p'): - article_text += paragraph.text + ' ' - - return title, article_text.strip() - -# Main function to create the training set -def create_training_set(main_page_url, num_articles): - page = 1 - articles = [] - article_count = 0 - - while article_count < num_articles: - page_url = f"{main_page_url}page/{page}/" - article_links = get_article_links(page_url) - - for link in article_links: - if article_count >= num_articles: - break - title, text = scrape_article_details(link) - articles.append({'title': title, 'article': text}) - article_count += 1 - - page += 1 - - return articles - -# URL of the Dezeen architecture page -main_page_url = 'https://www.dezeen.com/architecture/' - -# Number of articles to scrape -num_articles = 15000 # Adjust as needed - -# Create the training set -training_set = create_training_set(main_page_url, num_articles) - -# Save the training set to a CSV file -current_dir = os.path.dirname(os.path.abspath(__file__)) -save_path = os.path.join(current_dir, '../../../data/raw/misc') - -df = pd.DataFrame(training_set) -df.to_csv(os.path.join(save_path, 'building_descriptions.csv'), index=False) - -print('Training set created and saved to ../../../data/raw/misc/building_descriptions.csv') diff --git a/src/prep/scrapers/fcbs_data_scraper.bas b/src/prep/scrapers/fcbs_data_scraper.bas deleted file mode 100644 index 2e5df28..0000000 --- a/src/prep/scrapers/fcbs_data_scraper.bas +++ /dev/null @@ -1,310 +0,0 @@ - -Sub AutomateBuildingVariants() - - Application.Calculation = xlCalculationAutomatic - - Dim wb As Workbook - Dim inputSheetProject As Worksheet, inputSheetEmbodied As Worksheet - Dim outputSheet As Worksheet - Dim sheetName As String - Dim sheetIndex As Integer - Dim sector As Variant, subSector As Variant, gia As Double - Dim perimeter As Double, footprint As Double, width As Double, height As Double - Dim storeysAbove As Integer, storeysBelow As Integer, glazingRatio As Double - Dim rowCounter As Long, sheetCounter As Integer - Dim buildingElements As Variant, materialOptions As Object - Dim currentMaterials As Variant - - Dim iterationCount As Long - Dim userLimit As Long - userLimit = InputBox("Enter how many data points you want.", "Set Limit", 1000) ' Default 1000 values - - ' Setup workbook and sheets - Set wb = ThisWorkbook - Set inputSheetProject = wb.Sheets("0. INPUT Project Details") - Set inputSheetEmbodied = wb.Sheets("2. INPUT Embodied Carbon") - - ' Create unique sheet name - sheetIndex = 1 - sheetName = "Results " & sheetIndex - While SheetExists(sheetName, wb) - sheetIndex = sheetIndex + 1 - sheetName = "Results " & sheetIndex - Wend - - ' Add new sheet with unique name - Set outputSheet = wb.Sheets.Add(After:=wb.Sheets(wb.Sheets.Count)) - outputSheet.Name = sheetName - - ' Initialize building elements and their corresponding material options - Set materialOptions = CreateObject("Scripting.Dictionary") - buildingElements = Array("Piles", "Pile caps", "Capping beams", "Raft", "Basement walls", "Lowest floor slab", _ - "Ground insulation", "Core structure", "Columns", "Beams", "Secondary beams", "Floor slab", _ - "Joisted floors", "Roof", "Roof insulation", "Roof finishes", "Facade", "Wall insulation", _ - "Glazing", "Window frames", "Partitions", "Ceilings", "Floors", "Services") - - ' Initialize building elements and their corresponding material options - Set materialOptions = CreateObject("Scripting.Dictionary") - - ' Add material options for each building element - materialOptions.Add "Piles", Array("RC 32/40 (50kg/m3 reinforcement)", "Steel", "") - materialOptions.Add "Pile caps", Array("RC 32/40 (200kg/m3 reinforcement)", "") - materialOptions.Add "Capping beams", Array("RC 32/40 (200kg/m3 reinforcement)", "Foamglass (domestic only)", "") - materialOptions.Add "Raft", Array("RC 32/40 (150kg/m3 reinforcement)", "") - materialOptions.Add "Basement walls", Array("RC 32/40 (125kg/m3 reinforcement)", "") - materialOptions.Add "Lowest floor slab", Array("RC 32/40 (150kg/m3 reinforcement)", "Beam and Block", "") - materialOptions.Add "Ground insulation", Array("EPS", "XPS", "Glass mineral wool", "") - materialOptions.Add "Core structure", Array("CLT", "Precast RC 32/40 (100kg/m3 reinforcement)", "RC 32/40 (100kg/m3 reinforcement)", "") - materialOptions.Add "Columns", Array("Glulam", "Iron (existing buildings)", "Precast RC 32/40 (300kg/m3 reinforcement)", "RC 32/40 (300kg/m3 reinforcement)", "Steel", "") - materialOptions.Add "Beams", Array("Glulam", "Iron (existing buildings)", "Precast RC 32/40 (250kg/m3 reinforcement)", "RC 32/40 (250kg/m3 reinforcement)", "Steel", "") - materialOptions.Add "Secondary beams", Array("Glulam", "Iron (existing buildings)", "Precast RC 32/40 (250kg/m3 reinforcement)", "RC 32/40 (250kg/m3 reinforcement)", "Steel", "") - materialOptions.Add "Floor slab", Array("CLT", "Precast RC 32/40 (100kg/m3 reinforcement)", "RC 32/40 (100kg/m3 reinforcement)", "Steel Concrete Composite", "") - materialOptions.Add "Joisted floors", Array("JJI Engineered Joists + OSB topper", "Timber Joists + OSB topper (Domestic)", "Timber Joists + OSB topper (Office)", "") - materialOptions.Add "Roof", Array("CLT", "Metal Deck", "Precast RC 32/40 (100kg/m3 reinforcement)", "RC 32/40 (100kg/m3 reinforcement)", "Steel Concrete Composite", "Timber Cassette", "Timber Pitch Roof", "") - materialOptions.Add "Roof insulation", Array("Cellulose, loose fill", "EPS", "Expanded Perlite", "Expanded Vermiculite", "Glass mineral wool", "PIR", "Rockwool", "Sheeps wool", "Vacuum Insulation", "Woodfibre", "XPS", "") - materialOptions.Add "Roof finishes", Array("Aluminium", "Asphalt (Mastic)", "Asphalt (Polymer modified)", "Bitumous Sheet", "Ceramic tile", "Fibre cement tile", "Green Roof", "Roofing membrane (PVC)", "Slate tile", "Zinc Standing Seam", "") - materialOptions.Add "Facade", Array("Blockwork with Brick", "Blockwork with render", "Blockwork with Timber", "Curtain Walling", "Load Bearing Precast Concrete Panel", "Load Bearing Precast Concrete with Brick Slips", "Party Wall Blockwork", "Party Wall Brick", "Party Wall Timber Cassette", "SFS with Aluminium Cladding", "SFS with Brick", "SFS with Ceramic Tiles", "SFS with Granite", "SFS with Limestone", "SFS with Zinc Cladding", "Solid Brick, single leaf", "Timber Cassette Panel with brick", "Timber Cassette Panel with Cement Render", "Timber Cassette Panel with Larch Cladding", "Timber Cassette Panel with Lime Render", "Timber SIPs with Brick", "") - materialOptions.Add "Wall insulation", Array("Cellulose, loose fill", "EPS", "Expanded Perlite", "Expanded Vermiculite", "Glass mineral wool", "PIR", "Rockwool", "Sheeps wool", "Vacuum Insulation", "Woodfibre", "XPS", "") - materialOptions.Add "Glazing", Array("Triple Glazing", "Double Glazing", "Single Glazing", "") - materialOptions.Add "Window frames", Array("Al/Timber Composite", "Aluminium", "Steel (single glazed)", "Solid softwood timber frame", "uPVC", "") - materialOptions.Add "Partitions", Array("CLT", "Plasterboard + Steel Studs", "Plasterboard + Timber Studs", "Plywood + Timber Studs", "Blockwork", "") - materialOptions.Add "Ceilings", Array("Exposed Soffit", "Plasterboard", "Steel grid system", "Steel tile", "Steel tile with 18mm acoustic pad", "Suspended plasterboard", "") - materialOptions.Add "Floors", Array("70mm screed", "Carpet", "Earthenware tile", "Raised floor", "Solid timber floorboards", "Stoneware tile", "Terrazzo", "Vinyl", "") - materialOptions.Add "Services", Array("Low", "Medium", "High", "") - - - - ' Initialize sector options and sub-sectors - Dim sectorOptions As Variant - Dim allSubSectors As Object - Set allSubSectors = CreateObject("Scripting.Dictionary") - sectorOptions = Array("Housing", "Office") - allSubSectors.Add "Housing", Array("Flat/maisonette", "Single family house", "Multi-family (< 6 storeys)", _ - "Multi-family (6 - 15 storeys)", "Multi-family (> 15 storeys)") - allSubSectors.Add "Office", Array("Office") - - ' Prepare header for the first Results Sheet - Call PrepareResultsSheetHeader(outputSheet, buildingElements) - - ' Counter initialization - rowCounter = 2 - sheetCounter = 1 - startRow = 29 - - ' Random selection process - Do While iterationCount < userLimit - ' Constraint dims - Dim hasPiles As Boolean - Dim hasCappingbeams As Boolean - Dim hasPilecaps As Boolean - Dim hasFloorSlab As Boolean - - hasPiles = False - hasCappingbeams = False - hasPilecaps = False - hasFloorSlab = False - - For Each sector In sectorOptions - For Each subSector In allSubSectors(sector) - gia = Int((20000 + 1) * Rnd) - perimeter = Int((5000 - 100 + 1) * Rnd + 100) - footprint = Int((10000 - 100 + 1) * Rnd + 100) - width = Int((200 - 10 + 1) * Rnd + 10) - height = Int((6 - 2.3 + 1) * Rnd * 10) / 10 + 2.3 ' Maintain decimal accuracy - storeysAbove = Int((60 - 1 + 1) * Rnd + 1) - storeysBelow = Int((5 - 0 + 1) * Rnd) - glazingRatio = Int((80 - 10 + 1) * Rnd + 10) - - ' Initialize current materials array - ReDim currentMaterials(UBound(buildingElements)) - - ' Recursive call to process all material combinations - ProcessMaterials 0, buildingElements, materialOptions, currentMaterials, _ - outputSheet, rowCounter, sector, subSector, gia, _ - perimeter, footprint, width, height, storeysAbove, _ - storeysBelow, glazingRatio, startRow, hasPiles, hasCappingbeams, hasPilecaps, _ - hasFloorSlab - - ' Increment the iteration counter - iterationCount = iterationCount + 1 - - If iterationCount >= userLimit Then - Exit Do - End If - - ' Scroll to the current row after 60 - If rowCounter Mod 60 = 1 Then - Application.Goto outputSheet.Cells(rowCounter - 1, 1), True - End If - - Next subSector - If iterationCount >= userLimit Then - Exit Do - End If - Next sector - Loop - MsgBox "Automation complete!" -End Sub -' Recursive function to handle all material combinations -Sub ProcessMaterials(ByVal elementIndex As Integer, ByRef buildingElements As Variant, ByRef materialOptions As Object, _ - ByRef currentMaterials As Variant, ByRef outputSheet As Worksheet, ByRef rowCounter As Long, _ - ByVal sector As Variant, ByVal subSector As Variant, ByVal gia As Double, _ - ByVal perimeter As Double, ByVal footprint As Double, ByVal width As Double, _ - ByVal height As Double, ByVal storeysAbove As Integer, ByVal storeysBelow As Integer, _ - ByVal glazingRatio As Double, ByVal startRow As Integer, ByVal hasPiles As Boolean, _ - ByVal hasCappingbeams As Boolean, ByVal hasPilecaps As Boolean, ByVal hasFloorSlab As Boolean) - - If elementIndex > UBound(buildingElements) Then - ' All elements have materials assigned, output the results - RecordResults currentMaterials, outputSheet, rowCounter, sector, subSector, gia, _ - perimeter, footprint, width, height, storeysAbove, storeysBelow, glazingRatio - rowCounter = rowCounter + 1 - Exit Sub - End If - - Dim element As String - element = buildingElements(elementIndex) - Dim materials As Variant - materials = materialOptions(element) - - Randomize ' Initialize random number generator - Dim i As Integer - i = Int((UBound(materials) - LBound(materials) + 1) * Rnd + LBound(materials)) - - ' Skip logic if realistic building conditions aren't met - Select Case element - Case "Raft" - If hasCappingbeams Or hasPilecaps Then - ProcessMaterials elementIndex + 1, buildingElements, materialOptions, currentMaterials, _ - outputSheet, rowCounter, sector, subSector, gia, perimeter, footprint, width, _ - height, storeysAbove, storeysBelow, glazingRatio, startRow, hasPiles, hasCappingbeams, _ - hasPilecaps, hasFloorSlab - Exit Sub - End If - - Case "Pile caps", "Capping beams" - If Not hasPiles Then - ProcessMaterials elementIndex + 1, buildingElements, materialOptions, currentMaterials, _ - outputSheet, rowCounter, sector, subSector, gia, perimeter, footprint, width, _ - height, storeysAbove, storeysBelow, glazingRatio, startRow, hasPiles, hasCappingbeams, _ - hasPilecaps, hasFloorSlab - Exit Sub - End If - - Case "Basement walls" - If storeysBelow = 0 Then - ProcessMaterials elementIndex + 1, buildingElements, materialOptions, currentMaterials, _ - outputSheet, rowCounter, sector, subSector, gia, perimeter, footprint, width, _ - height, storeysAbove, storeysBelow, glazingRatio, startRow, hasPiles, hasCappingbeams, _ - hasPilecaps, hasFloorSlab - Exit Sub - End If - - Case "Joisted floors" - If hasFloorSlab Then - ProcessMaterials elementIndex + 1, buildingElements, materialOptions, currentMaterials, _ - outputSheet, rowCounter, sector, subSector, gia, perimeter, footprint, width, _ - height, storeysAbove, storeysBelow, glazingRatio, startRow, hasPiles, hasCappingbeams, _ - hasPilecaps, hasFloorSlab - Exit Sub - End If - End Select - - ' Assign and log the selected material - currentMaterials(elementIndex) = materials(i) - - ' Update material use flags - If materials(i) <> "" Then - Select Case element - Case "Piles" - hasPiles = True - - Case "Pile caps" - hasPilecaps = True - - Case "Capping beams" - hasCappingbeams = True - - Case "Floor slab" - hasFloorSlab = True - End Select - End If - - - ' Set the material for the current building element in the input sheet - ThisWorkbook.Sheets("2. INPUT Embodied Carbon").Cells(startRow + elementIndex, 3).Value = materials(i) - - ' Recursively process the next element with the next index - ProcessMaterials elementIndex + 1, buildingElements, materialOptions, currentMaterials, _ - outputSheet, rowCounter, sector, subSector, gia, perimeter, footprint, width, _ - height, storeysAbove, storeysBelow, glazingRatio, startRow, hasPiles, hasCappingbeams, _ - hasPilecaps, hasFloorSlab -End Sub -' Function to record results -Sub RecordResults(ByRef currentMaterials As Variant, ByRef outputSheet As Worksheet, ByVal rowCounter As Long, _ - ByVal sector As Variant, ByVal subSector As Variant, ByVal gia As Double, _ - ByVal perimeter As Double, ByVal footprint As Double, ByVal width As Double, _ - ByVal height As Double, ByVal storeysAbove As Integer, ByVal storeysBelow As Integer, _ - ByVal glazingRatio As Double) - - Dim wb As Workbook - Set wb = ThisWorkbook - embodiedCarbon = wb.Sheets("5. OUTPUT Machine").Cells(19, 2).Value - - outputSheet.Cells(rowCounter, 1).Value = sector - outputSheet.Cells(rowCounter, 2).Value = subSector - outputSheet.Cells(rowCounter, 3).Value = gia - outputSheet.Cells(rowCounter, 4).Value = perimeter - outputSheet.Cells(rowCounter, 5).Value = footprint - outputSheet.Cells(rowCounter, 6).Value = width - outputSheet.Cells(rowCounter, 7).Value = height - outputSheet.Cells(rowCounter, 8).Value = storeysAbove - outputSheet.Cells(rowCounter, 9).Value = storeysBelow - outputSheet.Cells(rowCounter, 10).Value = glazingRatio - - ' Output materials - Dim colIdx As Integer - colIdx = 11 ' Start from column 11 for material options - Dim idx As Integer - For idx = LBound(currentMaterials) To UBound(currentMaterials) - outputSheet.Cells(rowCounter, colIdx).Value = currentMaterials(idx) - colIdx = colIdx + 1 - Next idx - - outputSheet.Cells(rowCounter, colIdx).Value = embodiedCarbon -End Sub - -Function SheetExists(sheetName As String, wb As Workbook) As Boolean - Dim tmpSheet As Worksheet - On Error Resume Next - Set tmpSheet = wb.Sheets(sheetName) - On Error GoTo 0 - SheetExists = Not tmpSheet Is Nothing -End Function - -Private Sub PrepareResultsSheetHeader(sheet As Worksheet, buildingElements As Variant) - Dim col As Integer - sheet.Cells(1, 1).Value = "Sector" - sheet.Cells(1, 2).Value = "Sub-Sector" - sheet.Cells(1, 3).Value = "GIA (m2)" - sheet.Cells(1, 4).Value = "Building Perimeter" - sheet.Cells(1, 5).Value = "Building Footprint" - sheet.Cells(1, 6).Value = "Building Width" - sheet.Cells(1, 7).Value = "Floor-to-Floor Height" - sheet.Cells(1, 8).Value = "No. Storeys Ground & Above" - sheet.Cells(1, 9).Value = "No. Storeys Below Ground" - sheet.Cells(1, 10).Value = "Glazing Ratio" - - col = 11 ' Start from column 11 for material options - Dim i As Integer - For Each element In buildingElements - Debug.Print element ' Add this line to print the element in Immediate Window - sheet.Cells(1, col).Value = element & " Material" - col = col + 1 - Next element - - sheet.Cells(1, col).Value = "Embodied Carbon (kgCO2e/m2)" -End Sub - - -