From 9d1ee36a2b6bc2ba3a5c2a06a586946b9abb5695 Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 18:51:33 -0500
Subject: [PATCH 1/9] remove scorecard with site/source change

---
 pypeds/datasets.py | 66 +++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 1e335d5..4f117ef 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -53,52 +53,52 @@ def wiche():
     return(wiche_df)
 
 
-def scorecard():
-    """
-    Returns a dataframe of the most recent college scorecard dataset.
+# def scorecard():
+#     """
+#     Returns a dataframe of the most recent college scorecard dataset.
 
-    The Scorecard dataset, not the full dataset.  For the full, use the scorecard_full method.
-    """
+#     The Scorecard dataset, not the full dataset.  For the full, use the scorecard_full method.
+#     """
 
-    url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Scorecard-Elements.csv"
-    x = pd.read_csv(url)
-    return(x)
+#     url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Scorecard-Elements.csv"
+#     x = pd.read_csv(url)
+#     return(x)
 
 
-def scorecard_full():
-    """
-    Returns a dataframe of the most recent FULL college scorecard dataset.
+# def scorecard_full():
+#     """
+#     Returns a dataframe of the most recent FULL college scorecard dataset.
 
-    This will take ~ 10 seconds using free online resources, but also asks for a full data download at present.
-    Curently, caching is not used but should be.
-    """
+#     This will take ~ 10 seconds using free online resources, but also asks for a full data download at present.
+#     Currently, caching is not used but should be.
+#     """
 
-    url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-All-Data-Elements.csv"
-    x = pd.read_csv(url)
-    return(x)
+#     url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-All-Data-Elements.csv"
+#     x = pd.read_csv(url)
+#     return(x)
 
 
-def scorecard_nslds():
-    """
-    Returns a dataframe of the most recent cohort for the NSLDS dataset.
+# def scorecard_nslds():
+#     """
+#     Returns a dataframe of the most recent cohort for the NSLDS dataset.
 
-    This will take ~ 10 seconds using free online resources, but also asks for a full data download at present.
-    Curently, caching is not used but should be.
-    """
+#     This will take ~ 10 seconds using free online resources, but also asks for a full data download at present.
+#     Currently, caching is not used but should be.
+#     """
 
-    url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-NSLDS-Elements.csv"
-    x = pd.read_csv(url)
-    return(x)
+#     url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-NSLDS-Elements.csv"
+#     x = pd.read_csv(url)
+#     return(x)
 
 
-def scorecard_earnings():
-    """
-    Returns a dataframe of the most recent cohort for post school earnings.
-    """
+# def scorecard_earnings():
+#     """
+#     Returns a dataframe of the most recent cohort for post school earnings.
+#     """
 
-    url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Treasury-Elements.csv"
-    x = pd.read_csv(url)
-    return(x)
+#     url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Treasury-Elements.csv"
+#     x = pd.read_csv(url)
+#     return(x)
 
 
 def crosswalk():

From 76623a4ffca3bab6812add9ad1823dbb3c521957 Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 19:34:08 -0500
Subject: [PATCH 2/9] remove old scorecard and add tableau export code

---
 pypeds/datasets.py | 110 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 2 deletions(-)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 4f117ef..839c724 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -1,5 +1,10 @@
 import pickle
 import pandas as pd
+import datetime
+import zipfile
+import requests
+import os
+import pantab
 
 def comp_graph1():
     """
@@ -41,8 +46,6 @@ def comp_graph3():
     return(edges)
 
 
-
-
 def wiche():
     """
     Returns a dataframe with the most recent WICHE projections in long format.
@@ -53,6 +56,109 @@ def wiche():
     return(wiche_df)
 
 
+def scorecard_merged(fname="scorecard", expath="./"):
+    """
+    Parse College Scorecard data and return a dict of dataframes and also save out the hyper files for each.
+    """
+    _today = datetime.datetime.today().strftime('%Y%m%d')
+    sc_datasets = dict()
+    URL = "https://ed-public-download.app.cloud.gov/downloads/CollegeScorecard_Raw_Data_02072022.zip"
+    path = "/tmp/" + str(_today) + URL + "/"  # hacky way to make unique path to extract date and survey
+    file = fname + ".zip"
+    # get and save the file
+    if not os.path.exists(path + file):
+        # get the data
+    os.mkdir(path)
+    try:
+        results = requests.get(url)
+    except:
+        pass
+    with open(path + file, 'wb') as f:
+        f.write(results.content)
+    # extract the files to the path
+    file = zipfile.ZipFile(path + file)
+    file.extractall(path=path)
+    ############################# get the data dictionary
+    DD_URL = "https://data.ed.gov/dataset/9dc70e6b-8426-4d71-b9d5-70ce6094a3f4/resource/658b5b83-ac9f-4e41-913e-9ba9411d7967/download/collegescorecarddatadictionary_02072022.xlsx"
+    dd = pd.read_excel(DD_URL, sheet_name="Institution_Data_Dictionary")
+    ############################# merged file
+    # read in the merged file
+    FNAME = "MERGED2019_20_PP.csv"
+    fpath = path + FNAME
+    df = pd.read_csv(FNAME, low_memory=False)
+    # keep just the valid columns
+    COLS = ['VARIABLE NAME', 'API data type']
+    dd_vals = dd.loc[:, COLS]
+    # keep all valid values
+    dd_vals = dd_vals.dropna()
+    # cleanup column names
+    dd_vals.columns = dd_vals.columns.str.lower().str.replace(' ', '_')
+    # flag the columns that will be changed to floats (just to be safe)
+    ROWS = dd_vals.api_data_type.isin(['float','integer','long'])
+    dd_nums = dd_vals.loc[ROWS,:]
+    NUM_COLS = dd_nums['variable_name'].to_list()
+    NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)]
+    # .astype on df gave me fits, and this was suprisingly faster
+    for COL in NUM_COLS:
+        try:
+            df[COL] = df[COL].astype('float64')
+            # print(f"changed {COL}")
+        except:
+            pass
+    merged = df.copy()
+    # write the file and append to a dictionary to store the dataframes
+    EXPORT = expath + "merged.hyper"
+    pantab.frame_to_hyper(merged, EXPORT, table="merged")
+    sc_datasets['merged'] = merged
+    ############################# cohort all
+    # most recent - all
+    FNAME = "Most-Recent-Cohorts-All-Data-Elements.csv"
+    fpath = path + FNAME
+    df = pd.read_csv(FNAME, low_memory=False)
+    # use same as above
+    NUM_COLS = dd_nums['variable_name'].to_list()
+    NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)]
+    for COL in NUM_COLS:
+        try:
+            df[COL] = df[COL].astype('float64')
+        except:
+            pass
+    merged = df.copy()
+    # write the file and append to a dictionary to store the dataframes
+    EXPORT = expath + "mostrecent-all.hyper"
+    pantab.frame_to_hyper(merged, EXPORT, table="mrall")
+    sc_datasets['recent_all'] = merged
+    ############################# cohort field of study
+    # most recent - all
+    FNAME = "Most-Recent-Cohorts-Field-of-Study.csv"
+    fpath = path + FNAME
+    df = pd.read_csv(FNAME, low_memory=False)
+    # use same as above
+    NUM_COLS = dd_nums['variable_name'].to_list()
+    NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)]
+    for COL in NUM_COLS:
+        try:
+            df[COL] = df[COL].astype('float64')
+        except:
+            pass
+    merged = df.copy()
+    # write the file and append to a dictionary to store the dataframes
+    EXPORT = expath + "mostrecent-fieldstudy.hyper"
+    pantab.frame_to_hyper(merged, EXPORT, table="mrfieldstudy")
+    sc_datasets['recent_field_study'] = merged
+    
+    return sc_datasets
+
+
+
+
+
+
+
+
+
+
+
 # def scorecard():
 #     """
 #     Returns a dataframe of the most recent college scorecard dataset.

From 09b6ff24caeed72592316ee570c56a289c4a0528 Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 19:41:49 -0500
Subject: [PATCH 3/9] version bump

---
 pypeds/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypeds/__init__.py b/pypeds/__init__.py
index 063c537..5ee46d9 100644
--- a/pypeds/__init__.py
+++ b/pypeds/__init__.py
@@ -1,7 +1,7 @@
 """A python package to help facilitate the collection and analysis of education-related datasets. """
 
 # change to calendar versioning of sorts, last digit is a version within the mmddv
-__version__ = '2022.02121'
+__version__ = '2022.03081'
 
 
 from pypeds.ipeds import *

From f1fa79270f0337f63f01e9bb66cf6a1a4764f3af Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 19:44:26 -0500
Subject: [PATCH 4/9] indent error

---
 pypeds/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 839c724..9787901 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -68,7 +68,7 @@ def scorecard_merged(fname="scorecard", expath="./"):
     # get and save the file
     if not os.path.exists(path + file):
         # get the data
-    os.mkdir(path)
+        os.mkdir(path)
     try:
         results = requests.get(url)
     except:

From 45124619174c97da5a341f6ead750f4442d63cec Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 19:52:35 -0500
Subject: [PATCH 5/9] path bugs

---
 pypeds/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 9787901..6a02423 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -70,7 +70,7 @@ def scorecard_merged(fname="scorecard", expath="./"):
         # get the data
         os.mkdir(path)
     try:
-        results = requests.get(url)
+        results = requests.get(URL)
     except:
         pass
     with open(path + file, 'wb') as f:

From 7ecd1e6c618cf5be50c3bb9ffc9220a9dcb91652 Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 19:55:41 -0500
Subject: [PATCH 6/9] URL in the path bug

---
 pypeds/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 6a02423..9cc89d2 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -63,7 +63,7 @@ def scorecard_merged(fname="scorecard", expath="./"):
     _today = datetime.datetime.today().strftime('%Y%m%d')
     sc_datasets = dict()
     URL = "https://ed-public-download.app.cloud.gov/downloads/CollegeScorecard_Raw_Data_02072022.zip"
-    path = "/tmp/" + str(_today) + URL + "/"  # hacky way to make unique path to extract date and survey
+    path = "/tmp/" + str(_today) + "/"  # hacky way to make unique path to extract date and survey
     file = fname + ".zip"
     # get and save the file
     if not os.path.exists(path + file):

From f982d992f07e5b908d4028da2dae3fbe864ca274 Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 20:03:28 -0500
Subject: [PATCH 7/9] build extracted path error

---
 pypeds/datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 9cc89d2..07e5e19 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -85,7 +85,7 @@ def scorecard_merged(fname="scorecard", expath="./"):
     # read in the merged file
     FNAME = "MERGED2019_20_PP.csv"
     fpath = path + FNAME
-    df = pd.read_csv(FNAME, low_memory=False)
+    df = pd.read_csv(fpath, low_memory=False)
     # keep just the valid columns
     COLS = ['VARIABLE NAME', 'API data type']
     dd_vals = dd.loc[:, COLS]
@@ -114,7 +114,7 @@ def scorecard_merged(fname="scorecard", expath="./"):
     # most recent - all
     FNAME = "Most-Recent-Cohorts-All-Data-Elements.csv"
     fpath = path + FNAME
-    df = pd.read_csv(FNAME, low_memory=False)
+    df = pd.read_csv(fpath, low_memory=False)
     # use same as above
     NUM_COLS = dd_nums['variable_name'].to_list()
     NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)]
@@ -132,7 +132,7 @@ def scorecard_merged(fname="scorecard", expath="./"):
     # most recent - all
     FNAME = "Most-Recent-Cohorts-Field-of-Study.csv"
     fpath = path + FNAME
-    df = pd.read_csv(FNAME, low_memory=False)
+    df = pd.read_csv(fpath, low_memory=False)
     # use same as above
     NUM_COLS = dd_nums['variable_name'].to_list()
     NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)]

From 8aa370b804dca0d02414e6a9709d18c2f125aa5a Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 20:16:32 -0500
Subject: [PATCH 8/9] cheat and add print statements

---
 pypeds/datasets.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 07e5e19..62df91c 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -65,28 +65,36 @@ def scorecard_merged(fname="scorecard", expath="./"):
     URL = "https://ed-public-download.app.cloud.gov/downloads/CollegeScorecard_Raw_Data_02072022.zip"
     path = "/tmp/" + str(_today) + "/"  # hacky way to make unique path to extract date and survey
     file = fname + ".zip"
+    print(path, file)
     # get and save the file
     if not os.path.exists(path + file):
         # get the data
         os.mkdir(path)
     try:
+        print(f"requesting {URL}")
         results = requests.get(URL)
     except:
         pass
     with open(path + file, 'wb') as f:
+        print(f"writing file from {URL}")
         f.write(results.content)
     # extract the files to the path
     file = zipfile.ZipFile(path + file)
+    print(f"extracting file at {path + file}")
     file.extractall(path=path)
+    print("files extracted, getting data dictionary")
     ############################# get the data dictionary
     DD_URL = "https://data.ed.gov/dataset/9dc70e6b-8426-4d71-b9d5-70ce6094a3f4/resource/658b5b83-ac9f-4e41-913e-9ba9411d7967/download/collegescorecarddatadictionary_02072022.xlsx"
     dd = pd.read_excel(DD_URL, sheet_name="Institution_Data_Dictionary")
+    print(f"data dictionary imported")
     ############################# merged file
     # read in the merged file
     FNAME = "MERGED2019_20_PP.csv"
     fpath = path + FNAME
+    print(f"reading in merged file at {fpath}")
     df = pd.read_csv(fpath, low_memory=False)
     # keep just the valid columns
+    print("columns for numeric datatypes")
     COLS = ['VARIABLE NAME', 'API data type']
     dd_vals = dd.loc[:, COLS]
     # keep all valid values
@@ -98,6 +106,7 @@ def scorecard_merged(fname="scorecard", expath="./"):
     dd_nums = dd_vals.loc[ROWS,:]
     NUM_COLS = dd_nums['variable_name'].to_list()
     NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)]
+    print("for numeric columns, changing the datatype to numeric in the dataframe")
     # .astype on df gave me fits, and this was suprisingly faster
     for COL in NUM_COLS:
         try:
@@ -105,15 +114,18 @@ def scorecard_merged(fname="scorecard", expath="./"):
             # print(f"changed {COL}")
         except:
             pass
+    print("writing the merged file")
     merged = df.copy()
     # write the file and append to a dictionary to store the dataframes
     EXPORT = expath + "merged.hyper"
     pantab.frame_to_hyper(merged, EXPORT, table="merged")
     sc_datasets['merged'] = merged
+    print("merged file logged.  Moving onto next")
     ############################# cohort all
     # most recent - all
     FNAME = "Most-Recent-Cohorts-All-Data-Elements.csv"
     fpath = path + FNAME
+    print(f"getting next file {fpath}")
     df = pd.read_csv(fpath, low_memory=False)
     # use same as above
     NUM_COLS = dd_nums['variable_name'].to_list()
@@ -123,15 +135,18 @@ def scorecard_merged(fname="scorecard", expath="./"):
             df[COL] = df[COL].astype('float64')
         except:
             pass
+    print(f"writing the most recent files")
     merged = df.copy()
     # write the file and append to a dictionary to store the dataframes
     EXPORT = expath + "mostrecent-all.hyper"
     pantab.frame_to_hyper(merged, EXPORT, table="mrall")
     sc_datasets['recent_all'] = merged
+    print("files saved, moving onto next.")
     ############################# cohort field of study
     # most recent - all
     FNAME = "Most-Recent-Cohorts-Field-of-Study.csv"
     fpath = path + FNAME
+    print(f"reading in file at {fpath}")
     df = pd.read_csv(fpath, low_memory=False)
     # use same as above
     NUM_COLS = dd_nums['variable_name'].to_list()
@@ -141,12 +156,14 @@ def scorecard_merged(fname="scorecard", expath="./"):
             df[COL] = df[COL].astype('float64')
         except:
             pass
+    print("writing the files...")
     merged = df.copy()
     # write the file and append to a dictionary to store the dataframes
     EXPORT = expath + "mostrecent-fieldstudy.hyper"
     pantab.frame_to_hyper(merged, EXPORT, table="mrfieldstudy")
     sc_datasets['recent_field_study'] = merged
     
+    print("files written and exporting a dict of DataFrames")
     return sc_datasets
 
 

From 61c33de90a0007678cea7ce37e933195f068d61e Mon Sep 17 00:00:00 2001
From: Brock Tibert <btibert3@gmail.com>
Date: Tue, 8 Mar 2022 20:19:28 -0500
Subject: [PATCH 9/9] first of many print statement errors

---
 pypeds/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypeds/datasets.py b/pypeds/datasets.py
index 62df91c..eed94b2 100644
--- a/pypeds/datasets.py
+++ b/pypeds/datasets.py
@@ -79,8 +79,8 @@ def scorecard_merged(fname="scorecard", expath="./"):
         print(f"writing file from {URL}")
         f.write(results.content)
     # extract the files to the path
-    file = zipfile.ZipFile(path + file)
     print(f"extracting file at {path + file}")
+    file = zipfile.ZipFile(path + file)
     file.extractall(path=path)
     print("files extracted, getting data dictionary")
     ############################# get the data dictionary