From 9d1ee36a2b6bc2ba3a5c2a06a586946b9abb5695 Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 18:51:33 -0500 Subject: [PATCH 1/9] remove scorecard with site/source change --- pypeds/datasets.py | 66 +++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 1e335d5..4f117ef 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -53,52 +53,52 @@ def wiche(): return(wiche_df) -def scorecard(): - """ - Returns a dataframe of the most recent college scorecard dataset. +# def scorecard(): +# """ +# Returns a dataframe of the most recent college scorecard dataset. - The Scorecard dataset, not the full dataset. For the full, use the scorecard_full method. - """ +# The Scorecard dataset, not the full dataset. For the full, use the scorecard_full method. +# """ - url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Scorecard-Elements.csv" - x = pd.read_csv(url) - return(x) +# url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Scorecard-Elements.csv" +# x = pd.read_csv(url) +# return(x) -def scorecard_full(): - """ - Returns a dataframe of the most recent FULL college scorecard dataset. +# def scorecard_full(): +# """ +# Returns a dataframe of the most recent FULL college scorecard dataset. - This will take ~ 10 seconds using free online resources, but also asks for a full data download at present. - Curently, caching is not used but should be. - """ +# This will take ~ 10 seconds using free online resources, but also asks for a full data download at present. +# Curently, caching is not used but should be. +# """ - url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-All-Data-Elements.csv" - x = pd.read_csv(url) - return(x) +# url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-All-Data-Elements.csv" +# x = pd.read_csv(url) +# return(x) -def scorecard_nslds(): - """ - Returns a dataframe of the most recent cohort for the NSLDS dataset. +# def scorecard_nslds(): +# """ +# Returns a dataframe of the most recent cohort for the NSLDS dataset. - This will take ~ 10 seconds using free online resources, but also asks for a full data download at present. - Curently, caching is not used but should be. - """ +# This will take ~ 10 seconds using free online resources, but also asks for a full data download at present. +# Curently, caching is not used but should be. +# """ - url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-NSLDS-Elements.csv" - x = pd.read_csv(url) - return(x) +# url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-NSLDS-Elements.csv" +# x = pd.read_csv(url) +# return(x) -def scorecard_earnings(): - """ - Returns a dataframe of the most recent cohort for post school earnings. - """ +# def scorecard_earnings(): +# """ +# Returns a dataframe of the most recent cohort for post school earnings. +# """ - url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Treasury-Elements.csv" - x = pd.read_csv(url) - return(x) +# url = "https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Treasury-Elements.csv" +# x = pd.read_csv(url) +# return(x) def crosswalk(): From 76623a4ffca3bab6812add9ad1823dbb3c521957 Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 19:34:08 -0500 Subject: [PATCH 2/9] remove old scorecard and add tableau export code --- pypeds/datasets.py | 110 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 4f117ef..839c724 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -1,5 +1,10 @@ import pickle import pandas as pd +import datetime +import zipfile +import requests +import os +import pantab def comp_graph1(): """ @@ -41,8 +46,6 @@ def comp_graph3(): return(edges) - - def wiche(): """ Returns a dataframe with the most recent WICHE projections in long format. @@ -53,6 +56,109 @@ def wiche(): return(wiche_df) +def scorecard_merged(fname="scorecard", expath="./"): + """ + Parse College Scorecard data and return a dict of dataframes and also save out the hyper files for each. + """ + _today = datetime.datetime.today().strftime('%Y%m%d') + sc_datasets = dict() + URL = "https://ed-public-download.app.cloud.gov/downloads/CollegeScorecard_Raw_Data_02072022.zip" + path = "/tmp/" + str(_today) + URL + "/" # hacky way to make unique path to extract date and survey + file = fname + ".zip" + # get and save the file + if not os.path.exists(path + file): + # get the data + os.mkdir(path) + try: + results = requests.get(url) + except: + pass + with open(path + file, 'wb') as f: + f.write(results.content) + # extract the files to the path + file = zipfile.ZipFile(path + file) + file.extractall(path=path) + ############################# get the data dictionary + DD_URL = "https://data.ed.gov/dataset/9dc70e6b-8426-4d71-b9d5-70ce6094a3f4/resource/658b5b83-ac9f-4e41-913e-9ba9411d7967/download/collegescorecarddatadictionary_02072022.xlsx" + dd = pd.read_excel(DD_URL, sheet_name="Institution_Data_Dictionary") + ############################# merged file + # read in the merged file + FNAME = "MERGED2019_20_PP.csv" + fpath = path + FNAME + df = pd.read_csv(FNAME, low_memory=False) + # keep just the valid columns + COLS = ['VARIABLE NAME', 'API data type'] + dd_vals = dd.loc[:, COLS] + # keep all valid values + dd_vals = dd_vals.dropna() + # cleanup column names + dd_vals.columns = dd_vals.columns.str.lower().str.replace(' ', '_') + # flag the columns that will be changed to floats (just to be safe) + ROWS = dd_vals.api_data_type.isin(['float','integer','long']) + dd_nums = dd_vals.loc[ROWS,:] + NUM_COLS = dd_nums['variable_name'].to_list() + NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)] + # .astype on df gave me fits, and this was suprisingly faster + for COL in NUM_COLS: + try: + df[COL] = df[COL].astype('float64') + # print(f"changed {COL}") + except: + pass + merged = df.copy() + # write the file and append to a dictionary to store the dataframes + EXPORT = expath + "merged.hyper" + pantab.frame_to_hyper(merged, EXPORT, table="merged") + sc_datasets['merged'] = merged + ############################# cohort all + # most recent - all + FNAME = "Most-Recent-Cohorts-All-Data-Elements.csv" + fpath = path + FNAME + df = pd.read_csv(FNAME, low_memory=False) + # use same as above + NUM_COLS = dd_nums['variable_name'].to_list() + NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)] + for COL in NUM_COLS: + try: + df[COL] = df[COL].astype('float64') + except: + pass + merged = df.copy() + # write the file and append to a dictionary to store the dataframes + EXPORT = expath + "mostrecent-all.hyper" + pantab.frame_to_hyper(merged, EXPORT, table="mrall") + sc_datasets['recent_all'] = merged + ############################# cohort field of study + # most recent - all + FNAME = "Most-Recent-Cohorts-Field-of-Study.csv" + fpath = path + FNAME + df = pd.read_csv(FNAME, low_memory=False) + # use same as above + NUM_COLS = dd_nums['variable_name'].to_list() + NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)] + for COL in NUM_COLS: + try: + df[COL] = df[COL].astype('float64') + except: + pass + merged = df.copy() + # write the file and append to a dictionary to store the dataframes + EXPORT = expath + "mostrecent-fieldstudy.hyper" + pantab.frame_to_hyper(merged, EXPORT, table="mrfieldstudy") + sc_datasets['recent_field_study'] = merged + + return sc_datasets + + + + + + + + + + + # def scorecard(): # """ # Returns a dataframe of the most recent college scorecard dataset. From 09b6ff24caeed72592316ee570c56a289c4a0528 Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 19:41:49 -0500 Subject: [PATCH 3/9] version bump --- pypeds/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypeds/__init__.py b/pypeds/__init__.py index 063c537..5ee46d9 100644 --- a/pypeds/__init__.py +++ b/pypeds/__init__.py @@ -1,7 +1,7 @@ """A python package to help facilitate the collection and analysis of education-related datasets. """ # change to calendar versioning of sorts, last digit is a version within the mmddv -__version__ = '2022.02121' +__version__ = '2022.03081' from pypeds.ipeds import * From f1fa79270f0337f63f01e9bb66cf6a1a4764f3af Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 19:44:26 -0500 Subject: [PATCH 4/9] indent error --- pypeds/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 839c724..9787901 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -68,7 +68,7 @@ def scorecard_merged(fname="scorecard", expath="./"): # get and save the file if not os.path.exists(path + file): # get the data - os.mkdir(path) + os.mkdir(path) try: results = requests.get(url) except: From 45124619174c97da5a341f6ead750f4442d63cec Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 19:52:35 -0500 Subject: [PATCH 5/9] path bugs --- pypeds/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 9787901..6a02423 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -70,7 +70,7 @@ def scorecard_merged(fname="scorecard", expath="./"): # get the data os.mkdir(path) try: - results = requests.get(url) + results = requests.get(URL) except: pass with open(path + file, 'wb') as f: From 7ecd1e6c618cf5be50c3bb9ffc9220a9dcb91652 Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 19:55:41 -0500 Subject: [PATCH 6/9] URL in the path bug --- pypeds/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 6a02423..9cc89d2 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -63,7 +63,7 @@ def scorecard_merged(fname="scorecard", expath="./"): _today = datetime.datetime.today().strftime('%Y%m%d') sc_datasets = dict() URL = "https://ed-public-download.app.cloud.gov/downloads/CollegeScorecard_Raw_Data_02072022.zip" - path = "/tmp/" + str(_today) + URL + "/" # hacky way to make unique path to extract date and survey + path = "/tmp/" + str(_today) + "/" # hacky way to make unique path to extract date and survey file = fname + ".zip" # get and save the file if not os.path.exists(path + file): From f982d992f07e5b908d4028da2dae3fbe864ca274 Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 20:03:28 -0500 Subject: [PATCH 7/9] build extracted path error --- pypeds/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 9cc89d2..07e5e19 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -85,7 +85,7 @@ def scorecard_merged(fname="scorecard", expath="./"): # read in the merged file FNAME = "MERGED2019_20_PP.csv" fpath = path + FNAME - df = pd.read_csv(FNAME, low_memory=False) + df = pd.read_csv(fpath, low_memory=False) # keep just the valid columns COLS = ['VARIABLE NAME', 'API data type'] dd_vals = dd.loc[:, COLS] @@ -114,7 +114,7 @@ def scorecard_merged(fname="scorecard", expath="./"): # most recent - all FNAME = "Most-Recent-Cohorts-All-Data-Elements.csv" fpath = path + FNAME - df = pd.read_csv(FNAME, low_memory=False) + df = pd.read_csv(fpath, low_memory=False) # use same as above NUM_COLS = dd_nums['variable_name'].to_list() NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)] @@ -132,7 +132,7 @@ def scorecard_merged(fname="scorecard", expath="./"): # most recent - all FNAME = "Most-Recent-Cohorts-Field-of-Study.csv" fpath = path + FNAME - df = pd.read_csv(FNAME, low_memory=False) + df = pd.read_csv(fpath, low_memory=False) # use same as above NUM_COLS = dd_nums['variable_name'].to_list() NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)] From 8aa370b804dca0d02414e6a9709d18c2f125aa5a Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 20:16:32 -0500 Subject: [PATCH 8/9] cheat and add print statements --- pypeds/datasets.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 07e5e19..62df91c 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -65,28 +65,36 @@ def scorecard_merged(fname="scorecard", expath="./"): URL = "https://ed-public-download.app.cloud.gov/downloads/CollegeScorecard_Raw_Data_02072022.zip" path = "/tmp/" + str(_today) + "/" # hacky way to make unique path to extract date and survey file = fname + ".zip" + print(path, file) # get and save the file if not os.path.exists(path + file): # get the data os.mkdir(path) try: + print(f"requesting {URL}") results = requests.get(URL) except: pass with open(path + file, 'wb') as f: + print(f"writing file from {URL}") f.write(results.content) # extract the files to the path file = zipfile.ZipFile(path + file) + print(f"extracting file at {path + file}") file.extractall(path=path) + print("files extracted, getting data dictionary") ############################# get the data dictionary DD_URL = "https://data.ed.gov/dataset/9dc70e6b-8426-4d71-b9d5-70ce6094a3f4/resource/658b5b83-ac9f-4e41-913e-9ba9411d7967/download/collegescorecarddatadictionary_02072022.xlsx" dd = pd.read_excel(DD_URL, sheet_name="Institution_Data_Dictionary") + print(f"data dictionary imported") ############################# merged file # read in the merged file FNAME = "MERGED2019_20_PP.csv" fpath = path + FNAME + print(f"reading in merged file at {fpath}") df = pd.read_csv(fpath, low_memory=False) # keep just the valid columns + print("columns for numeric datatypes") COLS = ['VARIABLE NAME', 'API data type'] dd_vals = dd.loc[:, COLS] # keep all valid values @@ -98,6 +106,7 @@ def scorecard_merged(fname="scorecard", expath="./"): dd_nums = dd_vals.loc[ROWS,:] NUM_COLS = dd_nums['variable_name'].to_list() NUM_COLS = [COL for COL in NUM_COLS if COL in list(df.columns)] + print("for numeric columns, changing the datatype to numeric in the dataframe") # .astype on df gave me fits, and this was suprisingly faster for COL in NUM_COLS: try: @@ -105,15 +114,18 @@ def scorecard_merged(fname="scorecard", expath="./"): # print(f"changed {COL}") except: pass + print("writing the merged file") merged = df.copy() # write the file and append to a dictionary to store the dataframes EXPORT = expath + "merged.hyper" pantab.frame_to_hyper(merged, EXPORT, table="merged") sc_datasets['merged'] = merged + print("merged file logged. Moving onto next") ############################# cohort all # most recent - all FNAME = "Most-Recent-Cohorts-All-Data-Elements.csv" fpath = path + FNAME + print(f"getting next file {fpath}") df = pd.read_csv(fpath, low_memory=False) # use same as above NUM_COLS = dd_nums['variable_name'].to_list() @@ -123,15 +135,18 @@ def scorecard_merged(fname="scorecard", expath="./"): df[COL] = df[COL].astype('float64') except: pass + print(f"writing the most recent files") merged = df.copy() # write the file and append to a dictionary to store the dataframes EXPORT = expath + "mostrecent-all.hyper" pantab.frame_to_hyper(merged, EXPORT, table="mrall") sc_datasets['recent_all'] = merged + print("files saved, moving onto next.") ############################# cohort field of study # most recent - all FNAME = "Most-Recent-Cohorts-Field-of-Study.csv" fpath = path + FNAME + print(f"reading in file at {fpath}") df = pd.read_csv(fpath, low_memory=False) # use same as above NUM_COLS = dd_nums['variable_name'].to_list() @@ -141,12 +156,14 @@ def scorecard_merged(fname="scorecard", expath="./"): df[COL] = df[COL].astype('float64') except: pass + print("writing the files...") merged = df.copy() # write the file and append to a dictionary to store the dataframes EXPORT = expath + "mostrecent-fieldstudy.hyper" pantab.frame_to_hyper(merged, EXPORT, table="mrfieldstudy") sc_datasets['recent_field_study'] = merged + print("files written and exporting a dict of DataFrames") return sc_datasets From 61c33de90a0007678cea7ce37e933195f068d61e Mon Sep 17 00:00:00 2001 From: Brock Tibert Date: Tue, 8 Mar 2022 20:19:28 -0500 Subject: [PATCH 9/9] first of many print statement errors --- pypeds/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypeds/datasets.py b/pypeds/datasets.py index 62df91c..eed94b2 100644 --- a/pypeds/datasets.py +++ b/pypeds/datasets.py @@ -79,8 +79,8 @@ def scorecard_merged(fname="scorecard", expath="./"): print(f"writing file from {URL}") f.write(results.content) # extract the files to the path - file = zipfile.ZipFile(path + file) print(f"extracting file at {path + file}") + file = zipfile.ZipFile(path + file) file.extractall(path=path) print("files extracted, getting data dictionary") ############################# get the data dictionary