From d8319448696fa4d54c4d5fe0acc0608bc04f0df0 Mon Sep 17 00:00:00 2001
From: "cameron.johnson"
Date: Thu, 19 Oct 2023 18:49:14 -0400
Subject: [PATCH] Ongoing non-HMM step tracking code addition

---
 .gitignore                                    |   1 +
 .../global_step_prediction/__init__.py        |   0
 .../predict_global_step.py                    | 339 ++++++++++++++++++
 .../predict_global_step_randForest.py         | 272 ++++++++++++++
 ...teps_cofig-recipe-coffee-shortstrings.yaml | 210 +++++++++++
 5 files changed, 822 insertions(+)
 create mode 100644 angel_system/global_step_prediction/__init__.py
 create mode 100644 angel_system/global_step_prediction/predict_global_step.py
 create mode 100644 angel_system/global_step_prediction/predict_global_step_randForest.py
 create mode 100644 config/tasks/task_steps_cofig-recipe-coffee-shortstrings.yaml

diff --git a/.gitignore b/.gitignore
index f80e0d1f1..7aa136cba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 /.container_xauth
 /model_files
 /ros_bags
+/outputs
 
 ### Python template
 # Byte-compiled / optimized / DLL files
diff --git a/angel_system/global_step_prediction/__init__.py b/angel_system/global_step_prediction/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/angel_system/global_step_prediction/predict_global_step.py b/angel_system/global_step_prediction/predict_global_step.py
new file mode 100644
index 000000000..8221204d1
--- /dev/null
+++ b/angel_system/global_step_prediction/predict_global_step.py
@@ -0,0 +1,339 @@
+import yaml
+import pandas as pd
+import seaborn as sns
+import numpy as np
+import kwcoco
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+import scipy.ndimage as ndi
+
+
+def sanitize_str(str_: str):
+    """
+    Convert string to lowercase and remove trailing whitespace and period.
+
+    :param str_: Input text
+
+    :return: ``str_`` converted to lowercase and stripped of trailing
+        whitespace and period.
+    :rtype: str
+    """
+    return str_.lower().strip(" .")
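The docstrings below assume per-frame "activity_conf" and "activity_gt" fields on each kwcoco image. A minimal sketch of those lookups, using the same prediction file this script loads later (torch and numpy are interchangeable here):

```python
import numpy as np
import kwcoco

coco = kwcoco.CocoDataset("model_files/val_activity_preds_epoch40.mscoco.json")

# frames x classes confidence matrix, and per-frame ground-truth activity id
activity_confs = np.asarray(coco.images().lookup("activity_conf"))
activity_gt = np.asarray(coco.images().lookup("activity_gt"))

# Confidence assigned to the true class at each frame
true_conf = activity_confs[np.arange(len(activity_gt)), activity_gt]
print(activity_confs.shape, true_conf.mean())
```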
+
+
+def plot_positive_GT_conf_distributions(activity_confs, activity_gt):
+    """
+    For each activity, plot the distribution of confidences when ground
+    truth indicates that activity is happening.
+
+    i.e.: for activity x, for frames in which ground truth = x, plot
+    the distribution of confidences.
+
+    Inputs:
+        activity_confs: frames x class-wise-confidences. Given a kwcoco
+            dataset called "coco":
+            ```
+            activity_confs = torch.asarray(coco.images().lookup("activity_conf"))
+            ```
+            (49K x 25 for coffee val set.)
+        activity_gt: frames x ground truth activity_id. Given a kwcoco
+            dataset called "coco":
+            ```
+            activity_gt = torch.asarray(coco.images().lookup("activity_gt"))
+            ```
+    """
+    sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
+
+    # Get data together
+    true_confs = [float(activity_confs[i, truth_ind])
+                  for i, truth_ind in enumerate(activity_gt)]
+    data = {"true_conf": true_confs, "gt": activity_gt}
+    df = pd.DataFrame(data)
+
+    false_confs = np.array([[a for i, a in enumerate(act_conf) if i != gt]
+                            for act_conf, gt in zip(activity_confs, activity_gt)]).flatten()
+    false_gt = np.array([[gt for i, a in enumerate(act_conf) if i != gt]
+                         for act_conf, gt in zip(activity_confs, activity_gt)]).flatten()
+    data_opposite = {"true_conf": false_confs, "gt": false_gt}
+    df_opposite = pd.DataFrame(data_opposite)
+
+    def plot(df):
+        # Initialize the FacetGrid object
+        pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
+        g = sns.FacetGrid(df, row="gt", hue="gt", aspect=15, height=.5, palette=pal)
+
+        # Draw the densities in a few steps
+        g.map(sns.kdeplot, "true_conf",
+              bw_adjust=.5, clip_on=False,
+              fill=True, alpha=1, linewidth=1.5)
+        g.map(sns.kdeplot, "true_conf", clip_on=False, color="w", lw=2, bw_adjust=.5)
+
+        # passing color=None to refline() uses the hue mapping
+        g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)
+
+        # Define and use a simple function to label the plot in axes coordinates
+        def label(x, color, label):
+            ax = plt.gca()
+            ax.text(0, .2, label, fontweight="bold", color=color,
+                    ha="left", va="center", transform=ax.transAxes)
+        g.map(label, "true_conf")
+
+        # Set the subplots to overlap
+        g.figure.subplots_adjust(hspace=-.25)
+
+        # Remove axes details that don't play well with overlap
+        g.set_titles("")
+        g.set(yticks=[], ylabel="")
+        g.despine(bottom=True, left=True)
+
+    # Draw the true-positive distributions, then save
+    plot(df)
+    plt.savefig("./outputs/plot_positive_GT_conf_distributions.png")
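A smoke test for the plotting helper above, with hypothetical synthetic confidences peaked on the true class (assumes an `./outputs` directory exists):

```python
import numpy as np

rng = np.random.default_rng(0)
gt = rng.integers(0, 5, size=1000)           # 5 synthetic activity classes
confs = rng.uniform(0, 0.4, size=(1000, 5))  # background noise
confs[np.arange(1000), gt] += 0.5            # true class gets higher confidence

plot_positive_GT_conf_distributions(confs, gt)
# -> writes ./outputs/plot_positive_GT_conf_distributions.png
```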
+
+
+def bilateralFtr1D(y, sSpatial=5, sIntensity=1):
+    '''
+    The equation of the bilateral filter is
+
+                (        dx ^ 2       )       (         dI ^ 2         )
+        F = exp (- -------------------) * exp (- ----------------------)
+                (  sigma_spatial ^ 2  )       (  sigma_Intensity ^ 2   )
+
+    This is a Gaussian filter!
+    dx - The 'geometric' distance between the 'center pixel' and the pixel
+         to sample
+    dI - The difference between the intensity of the 'center pixel' and
+         the pixel to sample
+    sigma_spatial and sigma_Intensity are constants. Higher values mean
+    that we 'tolerate more' higher values of the distances dx and dI.
+
+    Dependencies: numpy, scipy.ndimage.gaussian_filter1d
+
+    calc gaussian kernel size as: filterSize = (2 * radius) + 1;
+    radius = floor(2 * sigma_spatial)
+    y - input data
+    '''
+
+    # Gaussian filter and parameters
+    radius = np.floor(2 * sSpatial)
+    filterSize = (2 * radius) + 1
+    ftrArray = np.zeros(int(filterSize))
+    ftrArray[int(radius)] = 1
+
+    # Compute the Gaussian filter part of the Bilateral filter
+    gauss = ndi.gaussian_filter1d(ftrArray, sSpatial)
+
+    # 1d data dimensions
+    width = y.size
+
+    # 1d resulting data
+    ret = np.zeros(width)
+
+    for i in range(width):
+
+        # To prevent accessing values outside of the array:
+        # The left part of the lookup area, clamped to the boundary
+        xmin = max(i - radius, 1)
+        # How many columns were outside the image, on the left?
+        dxmin = xmin - (i - radius)
+
+        # The right part of the lookup area, clamped to the boundary
+        xmax = min(i + radius, width)
+        # How many columns were outside the image, on the right?
+        dxmax = (i + radius) - xmax
+
+        # The actual range of the array we will look at
+        area = y[int(xmin):int(xmax)]
+
+        # The center position
+        center = y[i]
+
+        # The left expression in the bilateral filter equation
+        # We take only the relevant parts of the matrix of the
+        # Gaussian weights - we use dxmin, dxmax, dymin, dymax to
+        # ignore the parts that are outside the image
+        expS = gauss[int(1 + dxmin):int(filterSize - dxmax)]
+
+        # The right expression in the bilateral filter equation
+        dy = y[int(xmin):int(xmax)] - y[i]
+        dIsquare = dy * dy
+        expI = np.exp(-dIsquare / (sIntensity * sIntensity))
+
+        # The bilateral filter (weights matrix)
+        F = expI * expS
+
+        # Normalized bilateral filter
+        Fnormalized = F / sum(F)
+
+        # Multiply the area by the filter
+        tempY = y[int(xmin):int(xmax)] * Fnormalized
+
+        # The resulting pixel is the sum of all the pixels in
+        # the area, according to the weights of the filter
+        ret[i] = sum(tempY)
+
+    return ret
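An illustrative check of what bilateralFtr1D buys over a plain Gaussian blur on this kind of signal: a synthetic noisy step. The bilateral result should smooth within each plateau while keeping the step edge sharper.

```python
import numpy as np
import scipy.ndimage as ndi

rng = np.random.default_rng(0)
y = np.concatenate([np.zeros(100), np.ones(100)]) + rng.normal(0, 0.1, 200)

smoothed_bilateral = bilateralFtr1D(y, sSpatial=5, sIntensity=1)
smoothed_gauss = ndi.gaussian_filter1d(y, sigma=5)

# Jump across the boundary should survive better in the bilateral result.
print(smoothed_bilateral[105] - smoothed_bilateral[95],
      smoothed_gauss[105] - smoothed_gauss[95])
```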
+
+
+def get_average_TP_activations(coco):
+    # For each activity, given the Ground Truth-specified
+    # frame subset where that activity is happening, get the
+    # average activation of that class.
+
+    all_activity_ids = np.unique(np.asarray(coco.images().lookup('activity_gt')))
+    all_vid_ids = np.unique(np.asarray(coco.images().lookup('video_id')))
+
+    avg_probs = np.zeros(max(all_activity_ids) + 1)
+
+    for activity_id in all_activity_ids:
+        image_ids = [img['id'] for img in
+                     coco.videos(video_ids=all_vid_ids).images[0].objs
+                     if img['activity_gt'] == activity_id]
+        sub_dset = coco.subset(gids=image_ids, copy=True)
+        probs_for_true_inds = np.asarray(
+            sub_dset.images().lookup("activity_conf"))[:, activity_id]
+        avg_probs[activity_id] = np.mean(probs_for_true_inds)
+
+    return avg_probs
+
+
+config_fn = "config/tasks/task_steps_cofig-recipe-coffee-shortstrings.yaml"
+with open(config_fn, "r") as stream:
+    config = yaml.safe_load(stream)
+labels = [sanitize_str(l["description"]) for l in config["steps"]]
+steps = config['steps']
+if steps[0]['id'] == 1:
+    # Prepend an explicit background step; id 0 is reserved for it.
+    config['steps'].insert(0, {'id': 0,
+                               'activity_id': 0,
+                               'description': 'background',
+                               'median_duration_seconds': 0.5,
+                               'mean_conf': 0.5,
+                               'std_conf': 0.2,
+                               })
+
+coco_val = kwcoco.CocoDataset("model_files/val_activity_preds_epoch40.mscoco.json")
+coco_test = kwcoco.CocoDataset("model_files/test_activity_preds.mscoco.json")
+
+image_ids = coco_test.index.vidid_to_gids[3]
+video_dset = coco_test.subset(gids=image_ids, copy=True)
+
+# "Training": for each activity class, see what the average "true positive"
+# activation was.
+avg_probs = get_average_TP_activations(coco_test)
+print(f"average_probs = {avg_probs}")
+
+all_vid_ids = np.unique(np.asarray(coco_val.images().lookup('video_id')))
+
+for vid_id in all_vid_ids:
+    print(f"vid_id {vid_id}")
+
+    image_ids = coco_test.index.vidid_to_gids[vid_id]
+    video_dset = coco_test.subset(gids=image_ids, copy=True)
+
+    # All N activity confs x each video frame
+    activity_confs = video_dset.images().lookup("activity_conf")
+
+    next_step = 1
+    step_predictions = []
+    num_frames_activated = num_skip2_frames_activated = 0
+
+    # Predicted step: advance once the next step's confidence has been above
+    # threshold for threshold_frame_count consecutive frames (8 at first,
+    # 16 after frame 15).
+    threshold_frame_count = 8
+    for i, activity_conf in enumerate(activity_confs):
+
+        # Check if we're done: if so, append last step & continue
+        if next_step == len(steps):
+            step_predictions.append(next_step - 1)
+            continue
+        # Next step
+        next_activity_id = steps[next_step]['activity_id']
+        next_next_activity_id = steps[min(len(steps) - 1, next_step + 1)][
+            'activity_id']
+
+        next_activity_conf = activity_conf[next_activity_id]
+        next_next_activity_conf = activity_conf[next_next_activity_id]
+
+        avg_prob_next_activity = avg_probs[next_activity_id]
+        avg_prob_next_next_activity = avg_probs[next_next_activity_id]
+
+        if i > 15:
+            threshold_frame_count = 16
+
+        if next_activity_conf > 0.8 * avg_prob_next_activity:
+            num_frames_activated += 1
+        else:
+            num_frames_activated = 0
+
+        # Track the step after next too, so a skipped step can be detected.
+        if next_next_activity_conf > 0.8 * avg_prob_next_next_activity:
+            num_skip2_frames_activated += 1
+        else:
+            num_skip2_frames_activated = 0
+
+        if num_frames_activated >= threshold_frame_count:
+            next_step += 1
+            num_frames_activated = 0
+            num_skip2_frames_activated = 0
+        elif num_skip2_frames_activated >= threshold_frame_count:
+            next_step = min(next_step + 2, len(steps))
+            num_frames_activated = 0
+            num_skip2_frames_activated = 0
+            print("hit a skip-step!!")
+
+        step_predictions.append(next_step - 1)
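The advance rule in the loop above reduces to a small state machine. A distilled sketch (hypothetical helper; the two-step "skip" branch is omitted for brevity, and names mirror the loop above):

```python
def predict_steps(activity_confs, step_activity_ids, avg_probs,
                  threshold_frame_count=8, ratio=0.8):
    """Advance a step pointer once the next step's activity confidence has
    exceeded ratio * its average true-positive activation for
    threshold_frame_count consecutive frames."""
    next_step = 1
    streak = 0
    preds = []
    for conf in activity_confs:
        if next_step < len(step_activity_ids):
            act_id = step_activity_ids[next_step]
            streak = streak + 1 if conf[act_id] > ratio * avg_probs[act_id] else 0
            if streak >= threshold_frame_count:
                next_step += 1
                streak = 0
        preds.append(next_step - 1)
    return preds
```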
fig.savefig(f"./outputs/plot_pred_vs_gt_vid{vid_id}.png") + + if False: + plot_positive_GT_conf_distributions(activity_confs, activity_gt) + + + diff --git a/angel_system/global_step_prediction/predict_global_step_randForest.py b/angel_system/global_step_prediction/predict_global_step_randForest.py new file mode 100644 index 000000000..89d7d883a --- /dev/null +++ b/angel_system/global_step_prediction/predict_global_step_randForest.py @@ -0,0 +1,272 @@ +import yaml +import os +import seaborn as sn +import numpy as np +import kwcoco +import matplotlib.pyplot as plt +import sklearn +import sklearn.ensemble +from sklearn.metrics import confusion_matrix +import scipy.ndimage as ndi +import torch + +def sanitize_str(str_: str): + """ + Convert string to lowercase and emove trailing whitespace and period. + + :param str_: Input text + + :return: ``str_`` converted to lowercase and stripped of trailing whitespace and period. + :rtype: str + """ + return str_.lower().strip(" .") + +def bilateralFtr1D(y, sSpatial = 5, sIntensity = 1): + ''' + The equation of the bilateral filter is + + ( dx ^ 2 ) ( dI ^2 ) + F = exp (- ----------------- ) * exp (- ------------------- ) + ( sigma_spatial ^ 2 ) ( sigma_Intensity ^ 2 ) + ~~~~~~~~~~~~~~~~~~~~~~~~~~ + This is a guassian filter! + dx - The 'geometric' distance between the 'center pixel' and the pixel + to sample + dI - The difference between the intensity of the 'center pixel' and + the pixel to sample + sigma_spatial and sigma_Intesity are constants. Higher values mean + that we 'tolerate more' higher value of the distances dx and dI. + + Dependencies: numpy, scipy.ndimage.gaussian_filter1d + + calc gaussian kernel size as: filterSize = (2 * radius) + 1; radius = floor (2 * sigma_spatial) + y - input data + ''' + + # gaussian filter and parameters + radius = np.floor (2 * sSpatial) + filterSize = ((2 * radius) + 1) + ftrArray = np.zeros(int(filterSize)) + ftrArray[int(radius)] = 1 + + # Compute the Gaussian filter part of the Bilateral filter + gauss = ndi.gaussian_filter1d(ftrArray, sSpatial) + + # 1d data dimensions + width = y.size + + # 1d resulting data + ret = np.zeros (width) + + for i in range(width): + + ## To prevent accessing values outside of the array + # The left part of the lookup area, clamped to the boundary + xmin = max(i - radius, 1); + # How many columns were outside the image, on the left? + dxmin = xmin - (i - radius); + + # The right part of the lookup area, clamped to the boundary + xmax = min(i + radius, width); + # How many columns were outside the image, on the right? 
diff --git a/angel_system/global_step_prediction/predict_global_step_randForest.py b/angel_system/global_step_prediction/predict_global_step_randForest.py
new file mode 100644
index 000000000..89d7d883a
--- /dev/null
+++ b/angel_system/global_step_prediction/predict_global_step_randForest.py
@@ -0,0 +1,272 @@
+import yaml
+import seaborn as sns
+import numpy as np
+import kwcoco
+import matplotlib.pyplot as plt
+import sklearn
+import sklearn.ensemble
+from sklearn.metrics import confusion_matrix
+import scipy.ndimage as ndi
+import torch
+
+
+def sanitize_str(str_: str):
+    """
+    Convert string to lowercase and remove trailing whitespace and period.
+
+    :param str_: Input text
+
+    :return: ``str_`` converted to lowercase and stripped of trailing
+        whitespace and period.
+    :rtype: str
+    """
+    return str_.lower().strip(" .")
+
+
+def bilateralFtr1D(y, sSpatial=5, sIntensity=1):
+    '''
+    The equation of the bilateral filter is
+
+                (        dx ^ 2       )       (         dI ^ 2         )
+        F = exp (- -------------------) * exp (- ----------------------)
+                (  sigma_spatial ^ 2  )       (  sigma_Intensity ^ 2   )
+
+    This is a Gaussian filter!
+    dx - The 'geometric' distance between the 'center pixel' and the pixel
+         to sample
+    dI - The difference between the intensity of the 'center pixel' and
+         the pixel to sample
+    sigma_spatial and sigma_Intensity are constants. Higher values mean
+    that we 'tolerate more' higher values of the distances dx and dI.
+
+    Dependencies: numpy, scipy.ndimage.gaussian_filter1d
+
+    calc gaussian kernel size as: filterSize = (2 * radius) + 1;
+    radius = floor(2 * sigma_spatial)
+    y - input data
+    '''
+
+    # Gaussian filter and parameters
+    radius = np.floor(2 * sSpatial)
+    filterSize = (2 * radius) + 1
+    ftrArray = np.zeros(int(filterSize))
+    ftrArray[int(radius)] = 1
+
+    # Compute the Gaussian filter part of the Bilateral filter
+    gauss = ndi.gaussian_filter1d(ftrArray, sSpatial)
+
+    # 1d data dimensions
+    width = y.size
+
+    # 1d resulting data
+    ret = np.zeros(width)
+
+    for i in range(width):
+
+        # To prevent accessing values outside of the array:
+        # The left part of the lookup area, clamped to the boundary
+        xmin = max(i - radius, 1)
+        # How many columns were outside the image, on the left?
+        dxmin = xmin - (i - radius)
+
+        # The right part of the lookup area, clamped to the boundary
+        xmax = min(i + radius, width)
+        # How many columns were outside the image, on the right?
+        dxmax = (i + radius) - xmax
+
+        # The actual range of the array we will look at
+        area = y[int(xmin):int(xmax)]
+
+        # The center position
+        center = y[i]
+
+        # The left expression in the bilateral filter equation
+        # We take only the relevant parts of the matrix of the
+        # Gaussian weights - we use dxmin, dxmax, dymin, dymax to
+        # ignore the parts that are outside the image
+        expS = gauss[int(1 + dxmin):int(filterSize - dxmax)]
+
+        # The right expression in the bilateral filter equation
+        dy = y[int(xmin):int(xmax)] - y[i]
+        dIsquare = dy * dy
+        expI = np.exp(-dIsquare / (sIntensity * sIntensity))
+
+        # The bilateral filter (weights matrix)
+        F = expI * expS
+
+        # Normalized bilateral filter
+        Fnormalized = F / sum(F)
+
+        # Multiply the area by the filter
+        tempY = y[int(xmin):int(xmax)] * Fnormalized
+
+        # The resulting pixel is the sum of all the pixels in
+        # the area, according to the weights of the filter
+        ret[i] = sum(tempY)
+
+    return ret
+
+
+def get_average_TP_activations(coco, clf):
+    # For each activity, given the Ground Truth-specified
+    # frame subset where that activity is happening, get the
+    # average activation of that class.
+
+    all_activity_ids = np.unique(np.asarray(coco.images().lookup('activity_gt')))
+    all_vid_ids = np.unique(np.asarray(coco.images().lookup('video_id')))
+
+    activity_confs = torch.asarray(coco.images().lookup("activity_conf"))
+    new_probs = clf.predict_proba(activity_confs)
+    # Activity id 17 does not occur in this recipe (see the task config), so
+    # the classifier has no column for it; re-insert a zero column so column
+    # indices line up with activity ids again.
+    new_probs_all_classes = np.zeros((new_probs.shape[0], new_probs.shape[1] + 1))
+    new_probs_all_classes[:, 0:17] = new_probs[:, 0:17]
+    new_probs_all_classes[:, 18:] = new_probs[:, 17:]
+
+    avg_probs = np.zeros(max(all_activity_ids) + 1)
+
+    for activity_id in all_activity_ids:
+        image_ids = [img['id'] for img in
+                     coco.videos(video_ids=all_vid_ids).images[0].objs
+                     if img['activity_gt'] == activity_id]
+        probs_for_true_inds = np.asarray(new_probs_all_classes)[image_ids][:, activity_id]
+        avg_probs[activity_id] = np.mean(probs_for_true_inds)
+
+    return avg_probs
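The hardcoded 17/18 splice above can be generalized. A sketch assuming only that `clf` is a fitted scikit-learn classifier: `predict_proba` columns follow `clf.classes_`, so any missing activity ids can be re-inserted programmatically:

```python
import numpy as np

def probs_aligned_to_ids(clf, X, n_activity_ids):
    probs = clf.predict_proba(X)  # columns ordered by clf.classes_
    aligned = np.zeros((probs.shape[0], n_activity_ids))
    for col, class_id in enumerate(clf.classes_):
        aligned[:, int(class_id)] = probs[:, col]
    return aligned  # classes absent from training keep probability 0
```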
+
+
+def train_random_forest(coco):
+    activity_confs = torch.asarray(coco.images().lookup("activity_conf"))
+    activity_preds = torch.asarray(coco.images().lookup("activity_pred"))
+    activity_gt = torch.asarray(coco.images().lookup("activity_gt"))
+    n_classes = len(activity_confs[0])
+    clf = sklearn.ensemble.RandomForestClassifier(
+        n_estimators=100, max_depth=2, random_state=0)  # , class_weight="balanced")
+    # training
+    clf.fit(activity_confs, activity_gt)
+
+    # Sanity check: print out training dataset performance
+    y_hat = clf.predict(activity_confs)
+
+    TP = np.sum(activity_gt.numpy() == y_hat)
+    n = y_hat.shape[0]
+    print(f'{TP}/{n} Train RF Accuracy {100*TP/n:0.2f}%')
+
+    TP = np.sum(activity_gt.numpy() == activity_preds.numpy())
+    n = y_hat.shape[0]
+    print(f'{TP}/{n} TCN Accuracy {100*TP/n:0.2f}%')
+
+    return clf
+
+
+config_fn = "config/tasks/task_steps_cofig-recipe-coffee-shortstrings.yaml"
+with open(config_fn, "r") as stream:
+    config = yaml.safe_load(stream)
+labels = [sanitize_str(l["description"]) for l in config["steps"]]
+steps = config['steps']
+if steps[0]['id'] == 1:
+    # Prepend an explicit background step; id 0 is reserved for it.
+    config['steps'].insert(0, {'id': 0,
+                               'activity_id': 0,
+                               'description': 'background',
+                               'median_duration_seconds': 0.5,
+                               'mean_conf': 0.5,
+                               'std_conf': 0.2,
+                               })
+
+coco_val = kwcoco.CocoDataset("model_files/val_activity_preds_epoch40.mscoco.json")
+coco_test = kwcoco.CocoDataset("model_files/test_activity_preds.mscoco.json")
+
+image_ids = coco_test.index.vidid_to_gids[3]
+video_dset = coco_test.subset(gids=image_ids, copy=True)
+
+# "Training": for each activity class, see what the average "true positive"
+# activation was.
+clf = train_random_forest(coco_test)
+avg_probs = get_average_TP_activations(coco_test, clf)
+print(f"average_probs = {avg_probs}")
+
+all_vid_ids = np.unique(np.asarray(coco_val.images().lookup('video_id')))
+
+for vid_id in all_vid_ids:
+    print(f"vid_id {vid_id}")
+
+    image_ids = coco_test.index.vidid_to_gids[vid_id]
+    video_dset = coco_test.subset(gids=image_ids, copy=True)
+
+    # All N activity confs x each video frame
+    activity_confs = video_dset.images().lookup("activity_conf")
+    new_probs = clf.predict_proba(activity_confs)
+    # Re-insert the zero column for the unused activity id 17 (see above).
+    new_probs_all_classes = np.zeros((new_probs.shape[0], new_probs.shape[1] + 1))
+    new_probs_all_classes[:, 0:17] = new_probs[:, 0:17]
+    new_probs_all_classes[:, 18:] = new_probs[:, 17:]
+
+    next_step = 1
+    step_predictions = []
+    num_frames_activated = 0
+
+    # Predicted step: advance once confidence has been above threshold for
+    # 8 consecutive frames.
+    for activity_conf in new_probs_all_classes:
+        # Next step
+        next_activity_id = steps[next_step]['activity_id']
+
+        next_activity_conf = activity_conf[next_activity_id]
+
+        avg_prob_next_activity = avg_probs[next_activity_id]
+
+        if next_activity_conf > 0.8 * avg_prob_next_activity:
+            num_frames_activated += 1
+        else:
+            num_frames_activated = 0
+
+        if num_frames_activated >= 8:
+            if next_step < 23:  # cap at the final step id
+                next_step += 1
+            num_frames_activated = 0
+
+        step_predictions.append(next_step - 1)
+
+    # Ground truth step:
+    activity_gts = video_dset.images().lookup("activity_gt")
+    step_gts = []
+    step_gts_no_background = []
+    current_step = 0
+    for activity_gt in activity_gts:
+        # convert activity id to step id
+        step_id = next(int(item['id']) for item in steps
+                       if item['activity_id'] == activity_gt)
+        step_gts.append(step_id)
+
+        # A version of GT that never jumps back to 0
+        if step_id > 0:
+            current_step = step_id
+        step_gts_no_background.append(current_step)
+
+    # Plot confusion matrix
+    fig, ax = plt.subplots(figsize=(100, 100))
+    cm = confusion_matrix(step_gts_no_background, step_predictions)
+    sns.heatmap(cm, annot=True, fmt="g", ax=ax)
+    sns.set(font_scale=4)
+    ax.set(
+        title="Confusion Matrix",
+        xlabel="Predicted Label",
+        ylabel="True Label",)
+    fig.savefig(f"./outputs/plot_confusion_mat_vid{vid_id}.png")
+
+    # Plot gt vs predicted class across all vid frames
+    fig = plt.figure()
+    sns.set(font_scale=1)
+    step_gts = [float(i) for i in step_gts]
+    plt.plot(step_gts, label='gt')
+    plt.plot(step_predictions, label='estimated')
+    plt.plot(10 * np.asarray(activity_confs)[:, 17] - 5, label='act_preds[17]')
+    plt.plot(10 * np.asarray(activity_confs)[:, 18] - 5, label='act_preds[18]')
+    plt.plot(10 * np.asarray(activity_confs)[:, 19] - 5, label='act_preds[19]')
+
+    plt.plot(bilateralFtr1D(10 * np.asarray(activity_confs)[:, 17]) - 10,
+             label='act_preds_bilateral[17]')
+    plt.plot(bilateralFtr1D(10 * np.asarray(activity_confs)[:, 18]) - 10,
+             label='act_preds_bilateral[18]')
+    plt.plot(bilateralFtr1D(10 * np.asarray(activity_confs)[:, 19]) - 10,
+             label='act_preds_bilateral[19]')
+    plt.legend()
+    fig.savefig(f"./outputs/plot_pred_vs_gt_vid{vid_id}.png")
diff --git a/config/tasks/task_steps_cofig-recipe-coffee-shortstrings.yaml b/config/tasks/task_steps_cofig-recipe-coffee-shortstrings.yaml
new file mode 100644
index 000000000..1ac538b34
--- /dev/null
+++ b/config/tasks/task_steps_cofig-recipe-coffee-shortstrings.yaml
@@ -0,0 +1,210 @@
+# Schema version.
+version: "1.0"
+
+# Reference to the activity classification labels configuration that we will
+# reference into.
+activity_labels: "./config/activity_labels/recipe_coffee.yaml"
+
+# Reference to the file defining the mean and standard deviation of the
+# activity classifications to be used by the HMM. For N activities, both the
+# mean and standard deviation should be N x N matrices such that when activity
+# i is actually occurring, the classifier will emit confidence
+# mean[i, j] +/- std[i, j] for activity j.
+activity_mean_and_std_file: "./model_files/recipe_coffee_shortstrings_mean_std.npy"
+
+# Task title for display purposes.
+title: "Pour-over coffee"
+
+# Layout of the steps that define this task.
+steps:
+  # Item format:
+  # - id: Identifying integer for the step.
+  # - activity_id: The ID of an activity classification associated with this
+  #   step. This must reference an ID within the `activity_labels`
+  #   configuration file referenced above.
+  # - description: Human semantic description of this step.
+  # - median_duration_seconds: Median expected time this task will
+  #   consume in seconds.
+  # - mean_conf: mean value of classifier confidence for true examples.
+  # - std_conf: standard deviation of confidence for both true and false
+  #   examples.
+  - id: 1   # Must start at 1, 0 is reserved for background.
+    activity_id: 1
+    description: >-
+      measure-12oz-water
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 2
+    activity_id: 2
+    description: >-
+      pour-water-kettle
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 3
+    activity_id: 24
+    description: >-
+      turn-on-kettle
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 4
+    activity_id: 3
+    description: >-
+      place-dipper-on-mug
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 5
+    activity_id: 4
+    description: >-
+      filter-fold-half
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 6
+    activity_id: 5
+    description: >-
+      filter-fold-quarter
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 7
+    activity_id: 6
+    description: >-
+      place-filter
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 8
+    activity_id: 7
+    description: >-
+      spread-filter
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 9
+    activity_id: 8
+    description: >-
+      scale-turn-on
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 10
+    activity_id: 9
+    description: >-
+      place-bowl-on-scale
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 11
+    activity_id: 10
+    description: >-
+      zero-scale
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 12
+    activity_id: 11
+    description: >-
+      measure-coffee-beans
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 13
+    activity_id: 12
+    description: >-
+      pour-coffee-grinder
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 14
+    activity_id: 13
+    description: >-
+      grind-beans
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 15
+    activity_id: 14
+    description: >-
+      pour-beans-filter
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 16
+    activity_id: 15
+    description: >-
+      thermometer-turn-on
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 17
+    activity_id: 16
+    description: >-
+      thermometer-in-water
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 18
+    activity_id: 18  # activity_id 17 is unused by this recipe
+    description: >-
+      pour-water-grounds-wet
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 19
+    activity_id: 19
+    description: >-
+      pour-water-grounds-circular
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 20
+    activity_id: 20
+    description: >-
+      water-drain
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 21
+    activity_id: 21
+    description: >-
+      remove-dripper
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 22
+    activity_id: 22
+    description: >-
+      remove-grounds
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+  - id: 23
+    activity_id: 23
+    description: >-
+      discard-grounds
+    median_duration_seconds: 5
+    mean_conf: 0.5
+    std_conf: 0.2
+
+# Hidden Markov model configuration parameters.
+hmm:
+  # Time (seconds) between time steps of HMM. Sets the temporal precision of
+  # the HMM analysis at the expense of processing costs.
+  dt: 0.5
+
+  # Constrain whether HMM sequence can skip steps or jump backwards. When both
+  # values are set to 0, forward progress without skipping steps is enforced.
+  num_steps_can_jump_fwd: 1
+  num_steps_can_jump_bck: 0
+
+  # Default classifier mean confidence to use if not explicitly provided for a
+  # step.
+  default_mean_conf: 0.5
+
+  # Default classifier standard deviation of confidence to use if not
+  # explicitly provided for a step.
+  default_std_conf: 0.2
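A sanity check one might run against this config (hypothetical helper; note that activity_id 17 is deliberately absent from the recipe, which is what the zero-column splice in predict_global_step_randForest.py compensates for):

```python
import yaml

with open("config/tasks/task_steps_cofig-recipe-coffee-shortstrings.yaml") as f:
    cfg = yaml.safe_load(f)

# Step ids must be contiguous from 1 (0 is injected at runtime as background).
ids = [s["id"] for s in cfg["steps"]]
assert ids == list(range(1, len(ids) + 1)), "step ids must be 1..N with no gaps"

# Each step must map to a distinct activity id.
activity_ids = [s["activity_id"] for s in cfg["steps"]]
assert len(set(activity_ids)) == len(activity_ids), "duplicate activity_id"
print(f"{len(ids)} steps OK; activity ids used: {sorted(activity_ids)}")
```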