diff --git a/docs/statcast_batter.md b/docs/statcast_batter.md index 957768db..b5507ddf 100644 --- a/docs/statcast_batter.md +++ b/docs/statcast_batter.md @@ -103,4 +103,25 @@ data = statcast_batter_pitch_arsenal(2019) # get data for batters with a minimum of 100 plate appearances in 2019 data = statcast_batter_pitch_arsenal(2019, 100) -``` \ No newline at end of file + +``` +# Statcast Batter Bat Tracking +`statcast_batter_bat_tracking(year, minSwings="q")` + +This function retrieves a player's bat tracking data in a given year. + +## Arguments: +`year:` The year for which you wish to retrieve the bat tracking data. Format: YYYY. + +`minSwings:` The minimum number of competitive swings for each player. If a player falls below this threshold, they will be excluded from the results. If no value is specified, the default number of competitive swings is qualified. + +## Examples of Valid Queries +```python +from pybaseball import statcast_batter_bat_tracking + +# Get data for all qualified batters in 2024 +data = statcast_batter_bat_tracking(2024) + +# Get data for batters with a minimum of 250 swings in 2024 +data = statcast_batter_bat_tracking(2024, 250) +``` diff --git a/docs/statcast_pitcher.md b/docs/statcast_pitcher.md index dd338889..7b1242b7 100644 --- a/docs/statcast_pitcher.md +++ b/docs/statcast_pitcher.md @@ -202,3 +202,43 @@ data = statcast_pitcher_spin_dir_comp(2020, pitch_a="Sinker", pitch_b="Slider", # get data for sinker / slider combos in 2020 using pitch codes and from the batter's POV data = statcast_pitcher_spin_dir_comp(2020, pitch_a="SIFT", pitch_b="SL", pitcher_pov=False) ``` +# Statcast Pitcher Bat Tracking +`statcast_pitcher_bat_tracking(year, minSwings="q")` + +This function retrieves bat tracking data against pitchers. + +## Arguments +`year:` The year for which you wish to retrieve bat tracking data. Format: YYYY + +`minSwings:` The minimum number of swings batters have taken against a pitcher. 
If a pitcher falls below the threshold, they will be excluded from the results. The default value is qualified. + +## Examples of Valid Queries +```python +from pybaseball import statcast_pitcher_bat_tracking + +# Get data for all qualified pitchers in 2024 +data = statcast_pitcher_bat_tracking(2024) + +# Get data for all pitchers with a minimum of 250 swings against in 2024 +data = statcast_pitcher_bat_tracking(2024, 250) +``` +# Statcast Pitcher Arm Angle +`statcast_pitcher_arm_angle(year, minP="q")` + +This function retrieves pitcher arm angle data. + +## Arguments +`year:` The year for which you wish to retrieve arm angle data. Format: YYYY + +`minP:` The minimum number of pitches thrown. If a player falls below this threshold, they will be excluded from the results. The default value if no argument is passed is qualified. + +## Examples of Valid Queries +```python +from pybaseball import statcast_pitcher_arm_angle + +# Get data for all qualified pitchers in 2024 +data = statcast_pitcher_arm_angle(2024) + +# Get data for all pitchers with a minimum of 100 pitches thrown in 2024 +data = statcast_pitcher_arm_angle(2024, 100) +``` diff --git a/pybaseball/datasources/bref.py b/pybaseball/datasources/bref.py index 799ba5af..e8266332 100644 --- a/pybaseball/datasources/bref.py +++ b/pybaseball/datasources/bref.py @@ -32,4 +32,3 @@ def get(self, url: str, **kwargs: Any) -> requests.Response: self.last_request = datetime.datetime.now() return self.session.get(url, **kwargs) - \ No newline at end of file diff --git a/pybaseball/statcast_batter.py b/pybaseball/statcast_batter.py index a6cdb24c..11b569bc 100644 --- a/pybaseball/statcast_batter.py +++ b/pybaseball/statcast_batter.py @@ -5,7 +5,7 @@ import requests from . 
import cache -from .utils import sanitize_input, split_request, sanitize_statcast_columns +from .utils import sanitize_input, split_request, sanitize_statcast_columns, get_season def statcast_batter(start_dt: Optional[str] = None, end_dt: Optional[str] = None, player_id: Optional[int] = None) -> pd.DataFrame: @@ -25,7 +25,9 @@ def statcast_batter(start_dt: Optional[str] = None, end_dt: Optional[str] = None assert end_dt assert player_id - url = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={}&game_date_lt={}&batters_lookup%5B%5D={}&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&' + season = get_season(start_dt, end_dt) #Needs season for query to work + + url = f'https://baseballsavant.mlb.com/statcast_search/csv?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea={season}&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={start_dt}&game_date_lt={end_dt}&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&batters_lookup%5B%5D={player_id}&hfFlag=&metric_1=&group_by=name&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&type=details&all=true&minors=false' df = split_request(start_dt, end_dt, player_id, url) return df @@ -56,7 +58,7 @@ def statcast_batter_expected_stats(year: int, minPA: Union[int, str] = "q") -> p minPA: The minimum number of plate appearances for each player. If a player falls below this threshold, they will be excluded from the results. If no value is specified, only qualified batters will be returned. 
""" - url = f"https://baseballsavant.mlb.com/leaderboard/expected_statistics?type=batter&year={year}&position=&team=&min={minPA}&csv=true" + url = f"https://baseballsavant.mlb.com/leaderboard/expected_statistics?type=batter&year={year}&position=&team=&filterType=pa&min={minPA}&csv=true" res = requests.get(url, timeout=None).content data = pd.read_csv(io.StringIO(res.decode('utf-8'))) data = sanitize_statcast_columns(data) @@ -98,8 +100,8 @@ def statcast_batter_bat_tracking(year: int, minSwings: Union[int,str] = "q" ) -> Retrieves a player's bat tracking data for a given year. ARGUMENTS - year: The year for which you which to retrieve the bat tracking data. Format: YYYY. - minSwings: The minimum number of competitive swings for wach player. If a player falls below this threshold, + year: The year for which you wish to retrieve the bat tracking data. Format: YYYY. + minSwings: The minimum number of competitive swings for each player. If a player falls below this threshold, they will be excluded from the results. If no value is specified, the default number of competitive swings is qualified. """ diff --git a/pybaseball/statcast_pitcher.py b/pybaseball/statcast_pitcher.py index 040fe49d..eb1287fd 100644 --- a/pybaseball/statcast_pitcher.py +++ b/pybaseball/statcast_pitcher.py @@ -58,7 +58,7 @@ def statcast_pitcher_expected_stats(year: int, minPA: Union[int, str] = "q") -> minPA: The minimum number of plate appearances against for each player. If a player falls below this threshold, they will be excluded from the results. If no value is specified, only qualified pitchers will be returned. 
""" - url = f"https://baseballsavant.mlb.com/leaderboard/expected_statistics?type=pitcher&year={year}&position=&team=&min={minPA}&csv=true" + url = f"https://baseballsavant.mlb.com/leaderboard/expected_statistics?type=pitcher&year={year}&position=&team=&filterType=pa&min={minPA}&csv=true" res = requests.get(url, timeout=None).content data = pd.read_csv(io.StringIO(res.decode('utf-8'))) data = sanitize_statcast_columns(data) @@ -216,3 +216,19 @@ def statcast_pitcher_bat_tracking(year: int, minSwings: Union[int,str] = "q") -> data = pd.read_csv(io.StringIO(res.decode('utf-8'))) data = sanitize_statcast_columns(data) return data +@cache.df_cache() +def statcast_pitcher_arm_angle(year: int, minP: Union[int,str] = "q") -> pd.DataFrame: + """ + Retreives pitcher arm angle data. + + ARGUMENTS + year: The year for which you wish to retrive arm angle data. Format: YYYY + minP: The minimum number of pitches thrown. If a player falls below this threshold, they will be excluded from + the results. The default value if no argument is passed is qualified. 
+ """ + url = f"https://baseballsavant.mlb.com/leaderboard/pitcher-arm-angles?season={year}&team=&pitchHand=&min={minP}&sort=ascending&csv=true" + res = requests.get(url, timeout=None).content + data = pd.read_csv(io.StringIO(res.decode('utf-8'))) + data = sanitize_statcast_columns(data) + return data + diff --git a/pybaseball/utils.py b/pybaseball/utils.py index df24ea9d..37a81c9c 100644 --- a/pybaseball/utils.py +++ b/pybaseball/utils.py @@ -103,19 +103,19 @@ def get_first_season(team: str, include_equivalents: bool = True) -> Optional[in return oldest STATCAST_VALID_DATES = { - 2008: (date(2008, 3, 25), date(2008, 10, 27)), - 2009: (date(2009, 4, 5), date(2009, 11, 4)), - 2010: (date(2010, 4, 4), date(2010, 11, 1)), - 2011: (date(2011, 3, 31), date(2011, 10, 28)), - 2012: (date(2012, 3, 28), date(2012, 10, 28)), - 2013: (date(2013, 3, 31), date(2013, 10, 30)), - 2014: (date(2014, 3, 22), date(2014, 10, 29)), - 2015: (date(2015, 4, 5), date(2015, 11, 1)), - 2016: (date(2016, 4, 3), date(2016, 11, 2)), - 2017: (date(2017, 4, 2), date(2017, 11, 1)), - 2018: (date(2018, 3, 29), date(2018, 10, 28)), - 2019: (date(2019, 3, 20), date(2019, 10, 30)), - 2020: (date(2020, 7, 23), date(2020, 10, 27)) + 2008: (date(2008, 3, 25), date(2008, 10, 27)), + 2009: (date(2009, 4, 5), date(2009, 11, 4)), + 2010: (date(2010, 4, 4), date(2010, 11, 1)), + 2011: (date(2011, 3, 31), date(2011, 10, 28)), + 2012: (date(2012, 3, 28), date(2012, 10, 28)), + 2013: (date(2013, 3, 31), date(2013, 10, 30)), + 2014: (date(2014, 3, 22), date(2014, 10, 29)), + 2015: (date(2015, 4, 5), date(2015, 11, 1)), + 2016: (date(2016, 4, 3), date(2016, 11, 2)), + 2017: (date(2017, 4, 2), date(2017, 11, 1)), + 2018: (date(2018, 3, 29), date(2018, 10, 28)), + 2019: (date(2019, 3, 20), date(2019, 10, 30)), + 2020: (date(2020, 7, 23), date(2020, 10, 27)) } pitch_codes = ["FF", "CU", "CH", "FC", "EP", "FO", "KN", "KC", "SC", "SI", "SL", "FS", "FT", "ST", "SV", "SIFT", "CUKC", "ALL"] # note: all doesn't work in 
words, we'll have some special handling @@ -137,251 +137,264 @@ def get_first_season(team: str, include_equivalents: bool = True) -> Optional[in def validate_datestring(date_text: Optional[str]) -> date: - try: - assert date_text - return datetime.strptime(date_text, DATE_FORMAT).date() - except (AssertionError, ValueError) as ex: - raise ValueError("Incorrect data format, should be YYYY-MM-DD") from ex + try: + assert date_text + return datetime.strptime(date_text, DATE_FORMAT).date() + except (AssertionError, ValueError) as ex: + raise ValueError("Incorrect data format, should be YYYY-MM-DD") from ex @functools.lru_cache() def most_recent_season() -> int: - ''' - Find the most recent season. + ''' + Find the most recent season. - Will be either this year (if the season has started or just ended) - or last year (if the season has not yet started). - ''' + Will be either this year (if the season has started or just ended) + or last year (if the season has not yet started). + ''' - # Get the past year of season dates - recent_season_dates = date_range( - (datetime.today() - timedelta(weeks=52)).date(), # From one year ago - datetime.today().date(), # To today - verbose=False, - ) + # Get the past year of season dates + recent_season_dates = date_range( + (datetime.today() - timedelta(weeks=52)).date(), # From one year ago + datetime.today().date(), # To today + verbose=False, + ) - # Grab the last entry as the most recent game date, the year of which is the most recent season - return list(recent_season_dates)[-1][0].year + # Grab the last entry as the most recent game date, the year of which is the most recent season + return list(recent_season_dates)[-1][0].year def date_range(start: date, stop: date, step: int = 1, verbose: bool = True) -> Iterator[Tuple[date, date]]: - ''' - Iterate over dates. Skip the offseason dates. Returns a pair of dates for beginning and end of each segment. - Range is inclusive of the stop date. 
- If verbose is enabled, it will print a message if it skips offseason dates. - ''' - - low = start - - while low <= stop: - if (low.month, low.day) < (3, 15): - low = low.replace(month=3, day=15) - if verbose: - print('Skipping offseason dates') - elif (low.month, low.day) > (11, 15): - low = low.replace(month=3, day=15, year=low.year + 1) - if verbose: - print('Skipping offseason dates') - - if low > stop: - return - high = min(low + timedelta(step - 1), stop) - yield low, high - low += timedelta(days=step) + ''' + Iterate over dates. Skip the offseason dates. Returns a pair of dates for beginning and end of each segment. + Range is inclusive of the stop date. + If verbose is enabled, it will print a message if it skips offseason dates. + ''' + + low = start + + while low <= stop: + if (low.month, low.day) < (3, 15): + low = low.replace(month=3, day=15) + if verbose: + print('Skipping offseason dates') + elif (low.month, low.day) > (11, 15): + low = low.replace(month=3, day=15, year=low.year + 1) + if verbose: + print('Skipping offseason dates') + + if low > stop: + return + high = min(low + timedelta(step - 1), stop) + yield low, high + low += timedelta(days=step) def statcast_date_range(start: date, stop: date, step: int, verbose: bool = True) -> Iterator[Tuple[date, date]]: - ''' - Iterate over dates. Skip the offseason dates. Returns a pair of dates for beginning and end of each segment. - Range is inclusive of the stop date. - If verbose is enabled, it will print a message if it skips offseason dates. - This version is Statcast specific, relying on skipping predefined dates from STATCAST_VALID_DATES. 
- ''' - low = start - - while low <= stop: - date_span = low.replace(month=3, day=15), low.replace(month=11, day=15) - season_start, season_end = STATCAST_VALID_DATES.get(low.year, date_span) - if low < season_start: - low = season_start - if verbose: - print('Skipping offseason dates') - elif low > season_end: - low, _ = STATCAST_VALID_DATES.get(low.year + 1, (date(month=3, day=15, year=low.year + 1), None)) - if verbose: - print('Skipping offseason dates') - - if low > stop: - return - high = min(low + timedelta(step - 1), stop) - yield low, high - low += timedelta(days=step) + ''' + Iterate over dates. Skip the offseason dates. Returns a pair of dates for beginning and end of each segment. + Range is inclusive of the stop date. + If verbose is enabled, it will print a message if it skips offseason dates. + This version is Statcast specific, relying on skipping predefined dates from STATCAST_VALID_DATES. + ''' + low = start + + while low <= stop: + date_span = low.replace(month=3, day=15), low.replace(month=11, day=15) + season_start, season_end = STATCAST_VALID_DATES.get(low.year, date_span) + if low < season_start: + low = season_start + if verbose: + print('Skipping offseason dates') + elif low > season_end: + low, _ = STATCAST_VALID_DATES.get(low.year + 1, (date(month=3, day=15, year=low.year + 1), None)) + if verbose: + print('Skipping offseason dates') + + if low > stop: + return + high = min(low + timedelta(step - 1), stop) + yield low, high + low += timedelta(days=step) def sanitize_statcast_columns(df: pd.DataFrame) -> pd.DataFrame: - ''' - Creates uniform structure in Statcast column names - Removes leading whitespace in column names - ''' - df.columns = df.columns.str.strip() - return df + ''' + Creates uniform structure in Statcast column names + Removes leading whitespace in column names + ''' + df.columns = df.columns.str.strip() + return df def sanitize_date_range(start_dt: Optional[str], end_dt: Optional[str]) -> Tuple[date, date]: - # If no dates 
are supplied, assume they want yesterday's data - # send a warning in case they wanted to specify - if start_dt is None and end_dt is None: - today = date.today() - start_dt = str(today - timedelta(1)) - end_dt = str(today) + # If no dates are supplied, assume they want yesterday's data + # send a warning in case they wanted to specify + if start_dt is None and end_dt is None: + today = date.today() + start_dt = str(today - timedelta(1)) + end_dt = str(today) - print('start_dt', start_dt) - print('end_dt', end_dt) + print('start_dt', start_dt) + print('end_dt', end_dt) - print("Warning: no date range supplied, assuming yesterday's date.") + print("Warning: no date range supplied, assuming yesterday's date.") - # If only one date is supplied, assume they only want that day's stats - # query in this case is from date 1 to date 1 - if start_dt is None: - start_dt = end_dt - if end_dt is None: - end_dt = start_dt + # If only one date is supplied, assume they only want that day's stats + # query in this case is from date 1 to date 1 + if start_dt is None: + start_dt = end_dt + if end_dt is None: + end_dt = start_dt - start_dt_date = validate_datestring(start_dt) - end_dt_date = validate_datestring(end_dt) + start_dt_date = validate_datestring(start_dt) + end_dt_date = validate_datestring(end_dt) - # If end date occurs before start date, swap them - if end_dt_date < start_dt_date: - start_dt_date, end_dt_date = end_dt_date, start_dt_date + # If end date occurs before start date, swap them + if end_dt_date < start_dt_date: + start_dt_date, end_dt_date = end_dt_date, start_dt_date - # Now that both dates are not None, make sure they are valid date strings - return start_dt_date, end_dt_date + # Now that both dates are not None, make sure they are valid date strings + return start_dt_date, end_dt_date def sanitize_input(start_dt: Optional[str], end_dt: Optional[str], player_id: Optional[int]) -> Tuple[str, str, str]: - # error if no player ID provided - if player_id is 
None: - raise ValueError( - "Player ID is required. If you need to find a player's id, try " - "pybaseball.playerid_lookup(last_name, first_name) and use their key_mlbam. " - "If you want statcast data for all players, try the statcast() function." - ) - # this id should be a string to place inside a url - player_id_str = str(player_id) - start_dt_date, end_dt_date = sanitize_date_range(start_dt, end_dt) - return str(start_dt_date), str(end_dt_date), player_id_str + # error if no player ID provided + if player_id is None: + raise ValueError( + "Player ID is required. If you need to find a player's id, try " + "pybaseball.playerid_lookup(last_name, first_name) and use their key_mlbam. " + "If you want statcast data for all players, try the statcast() function." + ) + # this id should be a string to place inside a url + player_id_str = str(player_id) + start_dt_date, end_dt_date = sanitize_date_range(start_dt, end_dt) + return str(start_dt_date), str(end_dt_date), player_id_str @cache.df_cache() def split_request(start_dt: str, end_dt: str, player_id: int, url: str) -> pd.DataFrame: - """ - Splits Statcast queries to avoid request timeouts - """ - current_dt = datetime.strptime(start_dt, '%Y-%m-%d') - end_dt_datetime = datetime.strptime(end_dt, '%Y-%m-%d') - results = [] # list to hold data as it is returned - player_id_str = str(player_id) - print('Gathering Player Data') - # break query into multiple requests - while current_dt <= end_dt_datetime: - remaining = end_dt_datetime - current_dt - # increment date ranges by at most 60 days - delta = min(remaining, timedelta(days=2190)) - next_dt = current_dt + delta - start_str = current_dt.strftime('%Y-%m-%d') - end_str = next_dt.strftime('%Y-%m-%d') - # retrieve data - data = requests.get(url.format(start_str, end_str, player_id_str)) - df = pd.read_csv(io.StringIO(data.text)) - # add data to list and increment current dates - results.append(df) - current_dt = next_dt + timedelta(days=1) - return pd.concat(results) + 
""" + Splits Statcast queries to avoid request timeouts + """ + current_dt = datetime.strptime(start_dt, '%Y-%m-%d') + end_dt_datetime = datetime.strptime(end_dt, '%Y-%m-%d') + results = [] # list to hold data as it is returned + player_id_str = str(player_id) + print('Gathering Player Data') + # break query into multiple requests + while current_dt <= end_dt_datetime: + remaining = end_dt_datetime - current_dt + # increment date ranges by at most 60 days + delta = min(remaining, timedelta(days=2190)) + next_dt = current_dt + delta + start_str = current_dt.strftime('%Y-%m-%d') + end_str = next_dt.strftime('%Y-%m-%d') + # retrieve data + data = requests.get(url.format(start_str, end_str, player_id_str)) + df = pd.read_csv(io.StringIO(data.text)) + # add data to list and increment current dates + results.append(df) + current_dt = next_dt + timedelta(days=1) + return pd.concat(results) def get_zip_file(url: str) -> zipfile.ZipFile: - """ - Get zip file from provided URL - """ - with requests.get(url, stream=True) as file_stream: - zip_file = zipfile.ZipFile(io.BytesIO(file_stream.content)) - return zip_file + """ + Get zip file from provided URL + """ + with requests.get(url, stream=True) as file_stream: + zip_file = zipfile.ZipFile(io.BytesIO(file_stream.content)) + return zip_file def get_text_file(url: str) -> str: - """ - Get raw github file from provided URL - """ + """ + Get raw github file from provided URL + """ - with requests.get(url, stream=True) as file_stream: - text = file_stream.text + with requests.get(url, stream=True) as file_stream: + text = file_stream.text - return text + return text def flag_imputed_data(statcast_df: pd.DataFrame) -> pd.DataFrame: - """Function to flag possibly imputed data as a result of no-nulls approach (see: https://tht.fangraphs.com/43416-2/) - For derivation of values see pybaseball/EXAMPLES/imputed_derivation.ipynb - Note that this imputation only occured with TrackMan, not present in Hawk-Eye data (beyond 2020) - Args: - 
statcast_df (pd.DataFrame): Dataframe loaded via statcast.py, statcast_batter.py, or statcast_pitcher.py - Returns: - pd.DataFrame: Copy of original dataframe with "possible_imputation" flag - """ - - ParameterSet = namedtuple('ParameterSet', ["ev", "angle", "bb_type"]) - impute_combinations = [] - - # pop-ups - impute_combinations.append(ParameterSet(ev=80.0, angle=69.0, bb_type="popup")) - - # Flyout - impute_combinations.append(ParameterSet(ev=89.2, angle=39.0, bb_type="fly_ball")) - impute_combinations.append(ParameterSet(ev=102.8, angle=30.0, bb_type="fly_ball")) - - # Line Drive - impute_combinations.append(ParameterSet(ev=90.4, angle=15.0, bb_type="line_drive")) - impute_combinations.append(ParameterSet(ev=91.1, angle=18.0, bb_type="line_drive")) - - # Ground balls - impute_combinations.append(ParameterSet(ev=82.9, angle=-21.0, bb_type="ground_ball")) - impute_combinations.append(ParameterSet(ev=90.3, angle=-17.0, bb_type="ground_ball")) - - df_imputations = pd.DataFrame(data=impute_combinations) - df_imputations["possible_imputation"] = True - df_return = statcast_df.merge(df_imputations, how="left", - left_on=["launch_speed", "launch_angle", "bb_type"], - right_on=["ev", "angle", "bb_type"]) - # Change NaNs to false for boolean consistency - df_return["possible_imputation"] = df_return["possible_imputation"].fillna(False) - df_return = df_return.drop(["ev", "angle"], axis=1) - return df_return + """Function to flag possibly imputed data as a result of no-nulls approach (see: https://tht.fangraphs.com/43416-2/) + For derivation of values see pybaseball/EXAMPLES/imputed_derivation.ipynb + Note that this imputation only occured with TrackMan, not present in Hawk-Eye data (beyond 2020) + Args: + statcast_df (pd.DataFrame): Dataframe loaded via statcast.py, statcast_batter.py, or statcast_pitcher.py + Returns: + pd.DataFrame: Copy of original dataframe with "possible_imputation" flag + """ + + ParameterSet = namedtuple('ParameterSet', ["ev", "angle", 
"bb_type"]) + impute_combinations = [] + + # pop-ups + impute_combinations.append(ParameterSet(ev=80.0, angle=69.0, bb_type="popup")) + + # Flyout + impute_combinations.append(ParameterSet(ev=89.2, angle=39.0, bb_type="fly_ball")) + impute_combinations.append(ParameterSet(ev=102.8, angle=30.0, bb_type="fly_ball")) + + # Line Drive + impute_combinations.append(ParameterSet(ev=90.4, angle=15.0, bb_type="line_drive")) + impute_combinations.append(ParameterSet(ev=91.1, angle=18.0, bb_type="line_drive")) + + # Ground balls + impute_combinations.append(ParameterSet(ev=82.9, angle=-21.0, bb_type="ground_ball")) + impute_combinations.append(ParameterSet(ev=90.3, angle=-17.0, bb_type="ground_ball")) + + df_imputations = pd.DataFrame(data=impute_combinations) + df_imputations["possible_imputation"] = True + df_return = statcast_df.merge(df_imputations, how="left", + left_on=["launch_speed", "launch_angle", "bb_type"], + right_on=["ev", "angle", "bb_type"]) + # Change NaNs to false for boolean consistency + df_return["possible_imputation"] = df_return["possible_imputation"].fillna(False) + df_return = df_return.drop(["ev", "angle"], axis=1) + return df_return def norm_pitch_code(pitch: str, to_word: bool = False) -> str: - normed = pitch_name_to_code_map.get(pitch.upper()) - normed = pitch_code_to_name_map.get(normed) if to_word and normed else normed - if normed is None: - if pitch.lower() == 'all': - raise ValueError("'All' is not a valid pitch in this particular context!") - raise ValueError(f'{pitch} is not a valid pitch!') - return normed + normed = pitch_name_to_code_map.get(pitch.upper()) + normed = pitch_code_to_name_map.get(normed) if to_word and normed else normed + if normed is None: + if pitch.lower() == 'all': + raise ValueError("'All' is not a valid pitch in this particular context!") + raise ValueError(f'{pitch} is not a valid pitch!') + return normed def norm_positions(pos: Union[int, str], to_word: bool = False, to_number: bool = True) -> str: - pos_str = 
str(pos) - normed: Optional[str] = None - if pos_str in pos_code_to_numbers_map.values(): - to_number = False - normed = pos_str - else: - normed = pos_name_to_code_map.get(pos_str.upper()) - normed = pos_code_to_name_map.get(normed) if to_word and normed else normed - if to_number: - if normed not in ["IF", "OF"]: - normed = pos_code_to_numbers_map.get(normed) if normed else normed - if pos_str.lower() == "all": - normed = "" - if normed is None: - raise ValueError(f'{pos} is not a valid position!') - # lower() ok due to positional numbers being cast as strings when created - return normed.lower() - + pos_str = str(pos) + normed: Optional[str] = None + if pos_str in pos_code_to_numbers_map.values(): + to_number = False + normed = pos_str + else: + normed = pos_name_to_code_map.get(pos_str.upper()) + normed = pos_code_to_name_map.get(normed) if to_word and normed else normed + if to_number: + if normed not in ["IF", "OF"]: + normed = pos_code_to_numbers_map.get(normed) if normed else normed + if pos_str.lower() == "all": + normed = "" + if normed is None: + raise ValueError(f'{pos} is not a valid position!') + # lower() ok due to positional numbers being cast as strings when created + return normed.lower() + +def get_season(start_dt: str, end_dt: str) -> str: + start_date = start_dt.split("-") + start_year = int(start_date[0]) + end_date = end_dt.split("-") + end_year = int(end_date[0]) + + if end_year == start_year: + return str(start_year) + "%7C" + else: + seasons = str(end_year) + "%7C" + for i in range(end_year - start_year): + seasons = seasons + str(end_year - (i+1)) + "%7C" + return seasons \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index df15d73e..38b15062 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from pybaseball import cache -CURRENT_SC_COLUMNS = 92 +CURRENT_SC_COLUMNS = 94 _DataFrameComparer = Callable[[pd.DataFrame, pd.DataFrame], bool] diff --git 
a/tests/integration/pybaseball/test_statcast_batter.py b/tests/integration/pybaseball/test_statcast_batter.py index f5be9c60..f9ca6e44 100644 --- a/tests/integration/pybaseball/test_statcast_batter.py +++ b/tests/integration/pybaseball/test_statcast_batter.py @@ -33,6 +33,15 @@ def test_statcast_batter() -> None: assert len(result.columns) == CURRENT_SC_COLUMNS assert len(result) > 0 +def test_statcast_batter_multiple_seasons() -> None: + result: pd.DataFrame = statcast_batter('2023-04-01', '2024-10-01', 656941) + + assert result is not None + assert not result.empty + + assert len(result.columns) == CURRENT_SC_COLUMNS + assert len(result) > 0 + def test_statcast_batter_expected_stats() -> None: min_pa = 250 result: pd.DataFrame = statcast_batter_expected_stats(2019, min_pa) @@ -40,7 +49,7 @@ def test_statcast_batter_expected_stats() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 15 + assert len(result.columns) == 14 assert len(result) > 0 assert len(result[result['pa'] < min_pa]) == 0 @@ -50,7 +59,7 @@ def test_statcast_batter_percentile_ranks() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 17 + assert len(result.columns) == 23 assert len(result) > 0 def test_statcast_batter_pitch_arsenal() -> None: @@ -60,7 +69,7 @@ def test_statcast_batter_pitch_arsenal() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 21 + assert len(result.columns) == 20 assert len(result) > 0 assert len(result[result['pa'] < min_pa]) == 0 def test_statcast_batter_bat_tracking() -> None: diff --git a/tests/integration/pybaseball/test_statcast_fielding.py b/tests/integration/pybaseball/test_statcast_fielding.py index 3041d3ff..eb559573 100644 --- a/tests/integration/pybaseball/test_statcast_fielding.py +++ b/tests/integration/pybaseball/test_statcast_fielding.py @@ -30,7 +30,7 @@ def test_statcast_outs_above_average_view() -> None: assert result is not None assert 
not result.empty - assert len(result.columns) == 17 + assert len(result.columns) == 16 assert len(result) > 0 def test_statcast_outfield_directional_oaa() -> None: @@ -40,7 +40,7 @@ def test_statcast_outfield_directional_oaa() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 13 + assert len(result.columns) == 12 assert len(result) > 0 assert len(result.loc[result.attempts < min_opp]) == 0 @@ -51,7 +51,7 @@ def test_statcast_outfield_catch_prob() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 19 + assert len(result.columns) == 18 assert len(result) > 0 def test_statcast_outfielder_jump() -> None: @@ -61,7 +61,7 @@ def test_statcast_outfielder_jump() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 13 + assert len(result.columns) == 12 assert len(result) > 0 assert len(result.loc[result.n < min_att]) == 0 diff --git a/tests/integration/pybaseball/test_statcast_pitcher.py b/tests/integration/pybaseball/test_statcast_pitcher.py index e8a06721..a11c5159 100644 --- a/tests/integration/pybaseball/test_statcast_pitcher.py +++ b/tests/integration/pybaseball/test_statcast_pitcher.py @@ -12,7 +12,8 @@ statcast_pitcher_pitch_arsenal, statcast_pitcher_pitch_movement, statcast_pitcher_spin_dir_comp, - statcast_pitcher_bat_tracking + statcast_pitcher_bat_tracking, + statcast_pitcher_arm_angle ) @@ -43,7 +44,7 @@ def test_statcast_pitchers_expected_stats() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 18 + assert len(result.columns) == 17 assert len(result) > 0 assert len(result[result['pa'] < min_pa]) == 0 @@ -54,7 +55,7 @@ def test_statcast_pitcher_pitch_arsenal() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 11 + assert len(result.columns) == 12 assert len(result) > 0 def test_statcast_pitcher_arsenal_stats() -> None: @@ -64,7 +65,7 @@ def 
test_statcast_pitcher_arsenal_stats() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 21 + assert len(result.columns) == 20 assert len(result) > 0 assert len(result[result['pa'] < min_pa]) == 0 @@ -86,7 +87,7 @@ def test_statcast_pitcher_active_spin() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 10 + assert len(result.columns) == 11 assert len(result) > 0 def test_statcast_pitcher_percentile_ranks() -> None: @@ -95,7 +96,7 @@ def test_statcast_pitcher_percentile_ranks() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 19 + assert len(result.columns) == 22 assert len(result) > 0 def test_statcast_pitcher_spin_dir_comp() -> None: @@ -104,7 +105,7 @@ def test_statcast_pitcher_spin_dir_comp() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 30 + assert len(result.columns) == 29 assert len(result) > 100 def test_statcast_pitcher_bat_tracking() -> None: result: pd.DataFrame = statcast_pitcher_bat_tracking(2024) @@ -114,3 +115,11 @@ def test_statcast_pitcher_bat_tracking() -> None: assert len(result.columns) == 18 assert len(result) > 0 +def test_statcast_pitcher_arm_angle() -> None: + result: pd.DataFrame = statcast_pitcher_arm_angle(2024) + + assert result is not None + assert not result.empty + + assert len(result.columns) == 11 + assert len(result) > 0 diff --git a/tests/integration/pybaseball/test_statcast_running.py b/tests/integration/pybaseball/test_statcast_running.py index bb8fdcef..57a83f38 100644 --- a/tests/integration/pybaseball/test_statcast_running.py +++ b/tests/integration/pybaseball/test_statcast_running.py @@ -12,7 +12,7 @@ def test_statcast_sprint_speed() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 11 + assert len(result.columns) == 10 assert len(result) > 0 assert len(result.loc[result.competitive_runs < min_opp]) == 0 @@ -24,5 +24,5 
@@ def test_statcast_running_splits() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 27 + assert len(result.columns) == 26 assert len(result) > 0 diff --git a/tests/pybaseball/test_utils.py b/tests/pybaseball/test_utils.py index e7b44862..8613d11e 100644 --- a/tests/pybaseball/test_utils.py +++ b/tests/pybaseball/test_utils.py @@ -51,4 +51,4 @@ def test_sanitize_date_range_start_dt_gt_end_dt() -> None: assert start_dt_date < end_dt_date assert str(start_dt_date) == end_dt - assert str(end_dt_date) == start_dt + assert str(end_dt_date) == start_dt \ No newline at end of file