From e9e6df20e6ea38afb9642e759497a6ba0ea1d9d5 Mon Sep 17 00:00:00 2001
From: J-Doc <695362+ijdoc@users.noreply.github.com>
Date: Wed, 9 Oct 2024 23:35:25 -0400
Subject: [PATCH] feat: implement retrain and eval workflow (fix #9)

---
 ...drift_retrain.yml => check_data_drift.yml} |   4 +-
 .github/workflows/train_and_eval.yml          |  38 +++
 .gitignore                                    |   1 +
 drift/check_drift.py                          |  19 +-
 evaluate.py                                   | 136 ++++++++
 simple_model.py                               |  50 +++
 train.py                                      | 304 +++++++++---------
 transformer_model.py                          |  33 --
 utils.py                                      |  27 ++
 9 files changed, 418 insertions(+), 194 deletions(-)
 rename .github/workflows/{data_drift_retrain.yml => check_data_drift.yml} (92%)
 create mode 100644 .github/workflows/train_and_eval.yml
 create mode 100644 evaluate.py
 create mode 100644 simple_model.py
 delete mode 100644 transformer_model.py
 create mode 100644 utils.py

diff --git a/.github/workflows/data_drift_retrain.yml b/.github/workflows/check_data_drift.yml
similarity index 92%
rename from .github/workflows/data_drift_retrain.yml
rename to .github/workflows/check_data_drift.yml
index 6934b45..d988e25 100644
--- a/.github/workflows/data_drift_retrain.yml
+++ b/.github/workflows/check_data_drift.yml
@@ -16,8 +16,8 @@ jobs:
     permissions:
       contents: read
       issues: write # Grant write access to issues
-    outputs: # Define outputs for downstream jobs
-      drift_detected: ${{ steps.drift_check.outputs.drift_detected }}
+    # outputs: # Define outputs for downstream jobs
+    #   drift_detected: ${{ steps.drift_check.outputs.drift_detected }}
     steps:
       - name: ⏬ Checkout repository
        uses: actions/checkout@v4
diff --git a/.github/workflows/train_and_eval.yml b/.github/workflows/train_and_eval.yml
new file mode 100644
index 0000000..895fd16
--- /dev/null
+++ b/.github/workflows/train_and_eval.yml
@@ -0,0 +1,38 @@
+# Retrains the model on the promoted dataset and evaluates
+# the candidate against the current production model
+name: Performance Evaluation
+
+on:
+  repository_dispatch: # Allow triggering from a POST request
+    types: ["Train On Promoted Dataset"]
+  push:
+    branches: ijdoc/issue9
+
+jobs:
+  train:
+    runs-on: self-hosted
+    steps:
+      - name: ⏬ Checkout repository
+        uses: actions/checkout@v4
+
+      - name: ⚙️ Train!
+        env:
+          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+        run: |
+          python train.py
+
+  eval_check:
+    needs: train
+    runs-on: self-hosted
+    steps:
+      - name: ⏬ Checkout repository
+        uses: actions/checkout@v4
+
+      - name: ⚙️ Run Evaluation
+        env:
+          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+        id: eval_check
+        run: |
+          output=$(python evaluate.py)
+          echo "$output" >> $GITHUB_STEP_SUMMARY
+
diff --git a/.gitignore b/.gitignore
index 79a6106..a0d62a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 **/artifacts
 **/checkpoints
 **/report.txt
+**/*.pth
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/drift/check_drift.py b/drift/check_drift.py
index 3c474a8..5bba684 100644
--- a/drift/check_drift.py
+++ b/drift/check_drift.py
@@ -10,12 +10,13 @@
 ) as run:
 
     # Grab the latest training and production dataframes
-    train_artifact = run.use_artifact("jdoc-org/wandb-registry-dataset/training:latest")
-    run.config["train_data"] = train_artifact.source_name
+    registered_training_dataset = "jdoc-org/wandb-registry-dataset/training:latest"
+    train_artifact = run.use_artifact(registered_training_dataset)
+    run.config["train_data"] = train_artifact.name
     train_data = train_artifact.get("training_data").get_dataframe()
 
     prod_artifact = run.use_artifact("production_data:latest")
-    run.config["prod_data"] = prod_artifact.source_name
+    run.config["prod_data"] = prod_artifact.name
     prod_data = prod_artifact.get("production_data").get_dataframe()
 
     feature_list = ["active_power", "temp", "humidity", "pressure"]
@@ -72,19 +73,19 @@
         artifact.description = prod_artifact.description
         artifact = run.log_artifact(artifact).wait()
         # Open a github issue asking for manual review
-        issue_title = f"Data drift detected on {train_artifact.source_name}"
+        issue_title = f"Data drift detected on {train_artifact.name}"
         issue_body = (
             f"Data drift has been detected when comparing the registered training dataset with recent production data.\n\n"
-            f"Please review the [candidate artifact](https://wandb.ai/{run.entity}/{run.project}/artifacts/{artifact.type}/{artifact.source_name}) "
+            f"Please review the [candidate artifact](https://wandb.ai/{run.entity}/{run.project}/artifacts/{artifact.type}/{artifact.name}) "
             f"and the [drift report]({report_url}) to determine if the registered training data should be updated.\n\n"
             f"To approve the new candidate after review, link it to [the training Dataset Registry](https://wandb.ai/registry/dataset?selectionPath=jdoc-org%2Fwandb-registry-dataset%2Ftraining&view=versions) at "
-            f"(`jdoc-org/wandb-registry-dataset/training`), otherwise close this issue."
+            f"(`{registered_training_dataset}`), otherwise close this issue."
         )
         issue_url = open_github_issue(issue_title, issue_body, labels=["drift", "data"])
         print(
             f"Production batch `{prod_artifact.source_name}` has been logged "
-            f"as candidate to replace training data `{artifact.source_name}`. "
-            f"An [issue]({issue_url}) was created for manual review:\n"
+            f"as candidate `{artifact.name}` to replace training data. "
+            f"An [issue]({issue_url}) was also created for manual review:\n"
        )
         print(f"- [Data Drift Issue]({issue_url})")
     else:
@@ -93,7 +94,7 @@
     print(f"- [W&B Run]({run.url})")
     print(f"- [Full data drift report]({report_url})")
 
-    # Optionally the drift detection result in a parseable format.
+    # Optionally print the drift detection result in a parseable format.
     # Helpful if you want to use this result in a CI/CD pipeline
     # to automatically update the data and/or retrain your model.
 #    print(f"DRIFT_DETECTED={drift_detected}")
diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 0000000..3cbff3f
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,136 @@
+import torch
+import torch.nn.functional as F
+import pandas as pd
+from simple_model import load_model
+import wandb
+import os
+from utils import plot_predictions_vs_actuals, prep_time_series_data
+import numpy as np
+
+# Initialize W&B for evaluation job
+wandb.init(
+    project="wandb-webinar-cicd-2024",
+    job_type="evaluate",
+)
+
+# Load the production model
+prod_model_name = "jdoc-org/wandb-registry-model/production:latest"
+prod_artifact = wandb.use_artifact(prod_model_name)
+wandb.config["prod_model"] = prod_artifact.name
+prod_model_path = os.path.join(prod_artifact.download(), "best_model.pth")
+
+# Load the rival model
+rival_artifact = wandb.use_artifact("trained_model:latest")
+wandb.config["rival_model"] = rival_artifact.name
+rival_model_path = os.path.join(rival_artifact.download(), "best_model.pth")
+
+# Load the checkpoints
+model_checkpoint = torch.load(prod_model_path, map_location=torch.device("cpu"))
+rival_checkpoint = torch.load(rival_model_path, map_location=torch.device("cpu"))
+
+# Load the metrics for comparison
+prod_metrics = model_checkpoint["metrics"]
+
+# Load rival scalers and metrics
+scaler_X = rival_checkpoint["scaler_X"]
+scaler_y = rival_checkpoint["scaler_y"]
+config = rival_checkpoint["config"]
+metrics = rival_checkpoint["metrics"]
+
+# Instantiate the model and load its state dictionary
+model = load_model(
+    config["input_size"] * config["n_time_steps"],
+    config["hidden_size"],
+    config["output_size"],
+)
+model.load_state_dict(rival_checkpoint["model_state_dict"])
+model.eval()  # Set the model to evaluation mode
+
+# Load the latest production data artifact from W&B
+artifact = wandb.use_artifact("production_data:latest")
+df_test = artifact.get("production_data").get_dataframe()
+
+# Prepare data (assumes the first column is the target value)
+X_test = df_test.iloc[:, :].values  # All columns as input
+y_test = df_test.iloc[:, 0].values.reshape(-1, 1)  # First column as target
+
+# Scale the test data with the scalers fitted during training (from the rival checkpoint)
+X_test_scaled = scaler_X.transform(X_test)
+y_test_scaled = scaler_y.transform(y_test)
+
+# Create time series data using n_time_steps
+n_time_steps = config["n_time_steps"]
+X_time_series, y_time_series = prep_time_series_data(
+    X_test_scaled, y_test_scaled, config["n_time_steps"]
+)
+
+# Convert time series data to tensors
+X_test_tensor = torch.tensor(X_time_series, dtype=torch.float32)
+y_test_tensor = torch.tensor(y_time_series, dtype=torch.float32)
+
+# Create a DataLoader for the test data
+test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
+test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config["batch_size"])
+
+# Evaluation loop
+mse_loss = 0.0
+mae_loss = 0.0
+ss_res = 0.0
+ss_tot = 0.0
+all_predictions = []
+all_actuals = []
+
+with torch.no_grad():
+    for batch_X, batch_y in test_loader:
+        outputs = model(batch_X)
+
+        # Store predictions and actuals for plotting
+        all_predictions.append(outputs.numpy())
+        all_actuals.append(batch_y.numpy())
+
+        # Calculate MSE
+        mse_loss += F.mse_loss(outputs, batch_y).item()
+
+        # Calculate MAE
+        mae_loss += F.l1_loss(outputs, batch_y).item()
+
+        # Calculate R²
+        ss_res += torch.sum((batch_y - outputs) ** 2).item()
+        ss_tot += torch.sum((batch_y - torch.mean(batch_y)) ** 2).item()
+
+# Average the losses over the test dataset
+mse_loss /= len(test_loader)
+mae_loss /= len(test_loader)
+r2_score = 1 - (ss_res / ss_tot)
+
+# Log evaluation metrics to W&B
+eval_table = wandb.Table(columns=["Metric", "Production", "Candidate"])
+eval_table.add_data("MSE", prod_metrics["val_loss"], mse_loss)
+eval_table.add_data("MAE", prod_metrics["val_mae"], mae_loss)
+eval_table.add_data("R²", prod_metrics["val_r2"], r2_score)
+wandb.log({"performance_metrics": eval_table})
+
+# Convert predictions and actuals to numpy arrays for plotting
+all_predictions = scaler_y.inverse_transform(np.vstack(all_predictions))
+all_actuals = scaler_y.inverse_transform(np.vstack(all_actuals))
+
+# Generate and log predictions vs actuals plot
+plt = plot_predictions_vs_actuals(all_actuals, all_predictions)
+wandb.log({"predictions_vs_actuals": wandb.Image(plt)})
+
+if prod_metrics["val_r2"] > r2_score:
+    print("> Candidate model did not perform as well as the production model\n\n")
+else:
+    print("> [!NOTE]")
+    print("> The candidate model performed better than the production model\n\n")
+
+    # Link the rival model to the production model registry
+    rival_artifact.link("jdoc-org/wandb-registry-model/production")
+    print(
+        "The candidate model has been promoted to the [production model registry](https://wandb.ai/registry/model?selectionPath=jdoc-org%2Fwandb-registry-model%2Fproduction&view=versions)!"
+    )
+
+print(f"- [W&B Run]({wandb.run.url})")
+
+# Finish W&B run
+wandb.finish()
diff --git a/simple_model.py b/simple_model.py
new file mode 100644
index 0000000..3ee7c33
--- /dev/null
+++ b/simple_model.py
@@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+
+
+class SimpleDNN(nn.Module):
+    def __init__(self, input_size, hidden_size, output_size, dropout_prob=0.2):
+        super(SimpleDNN, self).__init__()
+        self.half_hidden_size = round(hidden_size / 2)
+
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.bn1 = nn.BatchNorm1d(hidden_size)
+
+        self.fc2 = nn.Linear(hidden_size, self.half_hidden_size)
+        self.bn2 = nn.BatchNorm1d(self.half_hidden_size)
+        self.fc3 = nn.Linear(self.half_hidden_size, hidden_size)
+        self.bn3 = nn.BatchNorm1d(hidden_size)
+
+        self.fc4 = nn.Linear(hidden_size, output_size)  # Output layer
+
+        # Activation function
+        self.relu = nn.ReLU()
+
+        # Dropout layer
+        self.dropout = nn.Dropout(dropout_prob)
+
+    def forward(self, x):
+        # First hidden layer
+        out = self.fc1(x)
+        out = self.bn1(out)  # Apply batch normalization
+        out = self.relu(out)
+        out = self.dropout(out)  # Apply dropout
+
+        # Second hidden layer
+        out = self.fc2(out)
+        out = self.bn2(out)  # Apply batch normalization
+        out = self.relu(out)
+
+        # Third hidden layer
+        out = self.fc3(out)
+        out = self.bn3(out)  # Apply batch normalization
+        out = self.relu(out)
+
+        # Output layer (no activation here, typically applied outside for regression/classification)
+        out = self.fc4(out)
+        return out
+
+
+def load_model(input_size, hidden_size, output_size, dropout_prob=0.2):
+    model = SimpleDNN(input_size, hidden_size, output_size, dropout_prob)
+    return model
diff --git a/train.py b/train.py
index cf2bda1..ef6608a 100644
--- a/train.py
+++ b/train.py
@@ -1,158 +1,162 @@
-# train.py
 import torch
-import torch.nn as nn
 import torch.optim as optim
-from transformer_model import TimeSeriesTransformer
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, TensorDataset
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+import pandas as pd
+from simple_model import load_model
+from utils import prep_time_series_data
 import wandb
-import os
-import numpy as np
-with wandb.init(
+config = {
+    "input_size": 4,
+    "hidden_size": 28,
+    "output_size": 1,
+    "num_epochs": 10000,
+    "batch_size": 64,
+    "learning_rate": 0.000003,
+    "validation_split": 0.2,
+    "n_time_steps": 24,
+}
+
+wandb.init(
     project="wandb-webinar-cicd-2024",
     job_type="train",
-    config={
-        "model_dim": 64,  # Transformer model dimension
-        "num_heads": 8,  # Number of attention heads
-        "num_layers": 3,  # Number of encoder layers
-        "dropout_prob": 0.1,  # Dropout probability
-        "learning_rate": 0.00005,  # Learning rate
-        "epochs": 100,  # Number of training epochs
-        "src_len": 30,  # Number of past time steps to use (history)
-        "tgt_len": 7,  # Number of future time steps to predict
-        "batch_size": 32,  # Batch size
-    },
-) as run:
-
-    # Hyperparameters
-    model_dim = run.config.model_dim
-    num_heads = run.config.num_heads
-    num_layers = run.config.num_layers
-    dropout_prob = run.config.dropout_prob
-    learning_rate = run.config.learning_rate
-    epochs = run.config.epochs
-    src_len = run.config.src_len
-    tgt_len = run.config.tgt_len
-    batch_size = run.config.batch_size
-
-    # Grab the training dataset from the registry
-    artifact = run.use_artifact("jdoc-org/wandb-registry-dataset/training:latest'")
-    run.config["train_data"] = artifact.source_name
-    data = artifact.get("training_data").get_dataframe()
-    input_columns = ["temp", "humidity", "pressure", "active_power"]
-    target_column = "active_power"
-    input_dim = len(input_columns)
-    num_days = data.shape[0]  # Total number of days
-    print(f"Number of days: {num_days}")
-
-    # **Normalization Step**
-
-    # Compute mean and std for input features and target variable
-    input_mean = data[input_columns].mean()
-    input_std = data[input_columns].std()
-
-    target_mean = data[target_column].mean()
-    target_std = data[target_column].std()
-
-    # Normalize input features
-    data[input_columns] = (data[input_columns] - input_mean) / input_std
-
-    # Normalize target variable
-    data[target_column] = (data[target_column] - target_mean) / target_std
-
-    # Instantiate the model with separate input dimensions
-    model = TimeSeriesTransformer(
-        src_input_dim=input_dim,
-        tgt_input_dim=1,
-        d_model=model_dim,
-        nhead=num_heads,
-        num_layers=num_layers,
-        dropout=dropout_prob,
+    config=config,
+)
+
+artifact = wandb.use_artifact("jdoc-org/wandb-registry-dataset/training:latest")
+df = artifact.get("training_data").get_dataframe()
+
+# Prepare data (assumes the first column is the target value)
+X = df.iloc[:, :].values  # All columns as input
+y = df.iloc[:, 0].values.reshape(-1, 1)  # First column as target
+
+# Normalize the data using StandardScaler
+scaler_X = StandardScaler()
+scaler_y = StandardScaler()
+
+X_scaled = scaler_X.fit_transform(X)
+y_scaled = scaler_y.fit_transform(y)
+
+# Create time series data using n_time_steps
+n_time_steps = config["n_time_steps"]
+X_time_series, y_time_series = prep_time_series_data(X_scaled, y_scaled, n_time_steps)
+
+# Convert time series data to tensors
+X_tensor = torch.tensor(X_time_series, dtype=torch.float32)
+y_tensor = torch.tensor(y_time_series, dtype=torch.float32)
+
+# Split data into training and validation sets
+X_train, X_val, y_train, y_val = train_test_split(
+    X_tensor, y_tensor, test_size=config["validation_split"], random_state=42
+)
+
+# Create DataLoaders for mini-batch training and validation
+train_dataset = TensorDataset(X_train, y_train)
+train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
+
+val_dataset = TensorDataset(X_val, y_val)
+val_loader = DataLoader(val_dataset, batch_size=config["batch_size"])
+
+# Instantiate the model
+model = load_model(
+    input_size=config["input_size"] * config["n_time_steps"],
+    hidden_size=config["hidden_size"],
+    output_size=config["output_size"],
+)
+
+# Loss and optimizer
+criterion = torch.nn.MSELoss()
+optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
+
+# Training loop with mini-batch training
+best_val_loss = float("inf")  # Initialize a variable to track the best validation loss
+for epoch in range(config["num_epochs"]):
+    model.train()
+    running_loss = 0.0
+
+    # Loop over mini-batches
+    for batch_X, batch_y in train_loader:
+        optimizer.zero_grad()
+        outputs = model(batch_X)
+        loss = criterion(outputs, batch_y)
+        loss.backward()
+        optimizer.step()
+        running_loss += loss.item()
+
+    avg_train_loss = running_loss / len(train_loader)
+
+    # Validation loop
+    model.eval()
+    val_loss = 0.0
+    val_mae_loss = 0.0
+    val_r2_score = 0.0
+    with torch.no_grad():
+        ss_res = 0.0
+        ss_tot = 0.0
+        for batch_X, batch_y in val_loader:
+            val_outputs = model(batch_X)
+            loss = criterion(val_outputs, batch_y)
+            val_loss += loss.item()
+
+            # Calculate MAE
+            val_mae_loss += F.l1_loss(val_outputs, batch_y).item()
+
+            # Calculate R²
+            ss_res += torch.sum((batch_y - val_outputs) ** 2).item()
+            ss_tot += torch.sum((batch_y - torch.mean(batch_y)) ** 2).item()
+
+    avg_val_loss = val_loss / len(
+        val_loader
+    )  # Average validation loss for the epoch
+    avg_val_mae = val_mae_loss / len(val_loader)  # Average validation MAE
+    val_r2_score = 1 - (ss_res / ss_tot)  # Validation R²
+
+    wandb.log(
+        {
+            "train_loss": avg_train_loss,
+            "val_loss": avg_val_loss,
+            "val_mae": avg_val_mae,
+            "val_r2": val_r2_score,
+        }
     )
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-    print(f"Training on {device}")
-
-    # Loss function and optimizer
-    criterion = nn.MSELoss()  # Mean Squared Error loss for regression
-    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
-
-    # Prepare source and target data for training
-    num_samples = num_days - src_len - tgt_len + 1
-
-    # Assume `data` is your dataset with multiple features
-    input_data = data[input_columns].values  # (num_days, input_dim)
-    target_data = data[target_column].values  # (num_days,)
-
-    # Prepare source and target tensors without explicit loops
-    # For src_data
-    src_data_np = np.array([input_data[i : i + src_len] for i in range(num_samples)])
-    src_data = torch.from_numpy(
-        src_data_np
-    ).float()  # Shape: (num_samples, src_len, input_dim)
-
-    # For tgt_data
-    tgt_data_np = np.array(
-        [target_data[i + src_len : i + src_len + tgt_len] for i in range(num_samples)]
-    )
-    tgt_data = (
-        torch.from_numpy(tgt_data_np).unsqueeze(-1).float()
-    )  # Shape: (num_samples, tgt_len, 1)
-
-    # Training loop
-    for epoch in range(epochs):
-        model.train()  # Set the model to training mode
-
-        for i in range(0, num_samples, batch_size):
-            # Get batch data
-            src_batch = src_data[i : i + batch_size]  # (batch_size, src_len, input_dim)
-            tgt_batch = tgt_data[i : i + batch_size]  # (batch_size, tgt_len, 1)
-            src_batch = src_batch.to(device)
-            tgt_batch = tgt_batch.to(device)
-
-            # Clear the gradients
-            optimizer.zero_grad()
-
-            # Prepare the target input (shifted by one time step)
-            tgt_input = tgt_batch[
-                :, :-1, :
-            ]  # Remove the last time step from the target
-
-            # Forward pass: predict future power consumption
-            output = model(
-                src_batch, tgt_input
-            )  # Pass the source and the shifted target
-
-            # Compute loss between predicted and actual values
-            loss = criterion(
-                output, tgt_batch[:, 1:, :]
-            )  # Compare with the actual target
-
-            # Backward pass and optimization
-            loss.backward()
-            optimizer.step()
-
-        # Print the loss for this epoch
-        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
-        wandb.log({"loss": loss.item()})
-
-    # Create a directory to save the model
-    model_dir = "checkpoints"
-    os.makedirs(model_dir, exist_ok=True)
-
-    # Save model state and normalization parameters
-    checkpoint = {
-        "model_state_dict": model.state_dict(),
-        "optimizer_state_dict": optimizer.state_dict(),
-        "input_mean": input_mean.to_dict(),
-        "input_std": input_std.to_dict(),
-        "target_mean": target_mean.item(),
-        "target_std": target_std.item(),
-    }
-    checkpoint_path = os.path.join(model_dir, "model_checkpoint.pth")
-    torch.save(checkpoint, checkpoint_path)
-
-    # Create a wandb Artifact
-    artifact = wandb.Artifact("trained_model", type="model")
-    artifact.add_file(checkpoint_path)
-    run.log_artifact(artifact)
+    # Print metrics every 100 epochs
+    if (epoch + 1) % 100 == 0:
+        print(
+            f'Epoch [{epoch+1}/{config["num_epochs"]}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val MAE: {avg_val_mae:.4f}, Val R²: {val_r2_score:.4f}'
+        )
+
+    # Save the best model based on validation loss
+    if avg_val_loss < best_val_loss:
+        best_val_loss = avg_val_loss
+        # Update best/summary metrics
+        metrics = {
+            "train_loss": avg_train_loss,
+            "val_loss": avg_val_loss,
+            "val_mae": avg_val_mae,
+            "val_r2": val_r2_score,
+        }
+        torch.save(
+            {
+                "model_state_dict": model.state_dict(),
+                "scaler_X": scaler_X,
+                "scaler_y": scaler_y,
+                "metrics": metrics,
+                "config": config,
+            },
+            "best_model.pth",
+        )
+
+        wandb.summary.update(metrics)
+
+        # print(f"Best model saved at epoch {epoch+1}")
+
+# Save model as W&B artifact
+artifact = wandb.Artifact("trained_model", type="model")
+artifact.add_file("best_model.pth")
+wandb.log_artifact(artifact)
+wandb.finish()
diff --git a/transformer_model.py b/transformer_model.py
deleted file mode 100644
index 5570ea9..0000000
--- a/transformer_model.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# transformer_model.py
-
-import torch.nn as nn
-
-
-class TimeSeriesTransformer(nn.Module):
-    def __init__(
-        self, src_input_dim, tgt_input_dim, d_model, nhead, num_layers, dropout
-    ):
-        super(TimeSeriesTransformer, self).__init__()
-        self.src_input_linear = nn.Linear(src_input_dim, d_model)
-        self.tgt_input_linear = nn.Linear(tgt_input_dim, d_model)
-        self.transformer = nn.Transformer(
-            d_model=d_model,
-            nhead=nhead,
-            num_encoder_layers=num_layers,
-            num_decoder_layers=num_layers,
-            dropout=dropout,
-            batch_first=True,  # Set batch_first to True
-        )
-        self.output_linear = nn.Linear(
-            d_model, 1
-        )  # Assuming we're predicting one output feature
-
-    def forward(self, src, tgt):
-        src = self.src_input_linear(src)
-        tgt = self.tgt_input_linear(tgt)
-
-        # No need to permute dimensions since batch_first=True
-        output = self.transformer(src, tgt)
-
-        output = self.output_linear(output)
-        return output
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..45d8760
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,27 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+# Function to plot predictions vs. actuals
+def plot_predictions_vs_actuals(y_true, y_pred):
+    plt.figure(figsize=(10, 6))
+    plt.scatter(y_true, y_pred, alpha=0.3)
+    plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], "r--")
+    plt.xlabel("Actual Values")
+    plt.ylabel("Predicted Values")
+    plt.title("Predictions vs Actuals")
+    return plt
+
+
+def prep_time_series_data(X, y, n_time_steps):
+    X_time_series = []
+    y_time_series = []
+    for i in range(len(X) - n_time_steps):
+        X_time_series.append(
+            X[i : i + n_time_steps].flatten()
+        )  # Flatten N time steps into one vector
+        y_time_series.append(
+            y[i + n_time_steps]
+        )  # Target is the next time step after N steps
+
+    return np.array(X_time_series), np.array(y_time_series)
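
A quick sanity check of the windowing helper that both train.py and evaluate.py depend on: with n_time_steps = 24 and the 4 feature columns used above, each window returned by prep_time_series_data should flatten to a 96-feature vector, which is exactly the input_size * n_time_steps value passed to load_model. A minimal sketch, assuming utils.py from this patch is importable and using synthetic data in place of the W&B artifacts:

import numpy as np

from utils import prep_time_series_data

# Synthetic stand-in for the dataset: 100 rows, 4 features
# (active_power, temp, humidity, pressure); first column is the target
X = np.random.rand(100, 4)
y = X[:, 0].reshape(-1, 1)

X_ts, y_ts = prep_time_series_data(X, y, n_time_steps=24)

# 100 - 24 = 76 windows, each flattened to 24 * 4 = 96 features
assert X_ts.shape == (76, 96)
assert y_ts.shape == (76, 1)
print(X_ts.shape, y_ts.shape)

If these shapes drift, the likely cause is a changed feature list in the registered dataset rather than the model itself, since SimpleDNN's first layer is sized from the same config values.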