Issue 36 long term reading variance algorithm (#46)
* Fix: fixed separated test cases

* Feat: added 1/2 responses testcase

* Feat: test cases and tests added

* feat: built longterm variance detection

* Fix: removed pasted tests

* Fix: added no anomaly testcase

* Fix: edited noanom testcase

* Fix: testcases

* Fix: removed query source from testcase

* feat: added make target and fn resource

* Fix: added third response to test cases

* Fix: changed data size check to account for both previous years

* Fix: removed underscore from testcases folder

* Fix: testcase file reference

* passing: fixed test cases

* fix: removed extraneous print statement

* fix: correct StuckTwoDay target

* added python_lib to pipfile

* fix: low_delta_t anomaly message

* missing setup.py

* .gitignore egg-info

* added longTermVariance algorithm to cron handler

---------

Co-authored-by: logo0303 <[email protected]>
stevemandl and logo0303 authored Oct 23, 2024
1 parent 0ae6354 commit a9dc7d6
Showing 19 changed files with 18,097 additions and 634 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -5,4 +5,5 @@ coverage
 .build
 *.pyc
 .pytest_cache
-.aws-sam
\ No newline at end of file
+.aws-sam
+*.egg-info/
4 changes: 2 additions & 2 deletions py_src/Makefile
@@ -1,6 +1,6 @@
 SHELL := /bin/bash
-build-SparseData build-LowDeltaT StuckTwoDay:
+build-SparseData build-LowDeltaT build-StuckTwoDay build-LongTermVariance:
 	cp -r ./* $(ARTIFACTS_DIR)
 	pip install --upgrade pip
 	pip install pipenv
-	pipenv run pip install -r <(pipenv requirements) -t $(ARTIFACTS_DIR)
\ No newline at end of file
+	pipenv run pip install -r <(pipenv requirements) -t $(ARTIFACTS_DIR)
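(These `build-<Function>` targets presumably follow the AWS SAM makefile build convention: `sam build` invokes the target named after each function resource and expects its artifacts in `$(ARTIFACTS_DIR)`, so adding `build-LongTermVariance` is what makes the new lambda buildable. The matching template change from the "added make target and fn resource" commit is among the files not rendered here.)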
1 change: 1 addition & 0 deletions py_src/Pipfile
@@ -11,6 +11,7 @@ numpy = "*"
 pandas = "*"
 matplotlib = "*"
 tk = "*"
+pythonlib = {file = "python_lib"}
 
 [dev-packages]
 pytest = {version=">=6.2"}
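(The new `pythonlib` entry is presumably a local path dependency on the repo's shared `python_lib` package — hence the `setup.py` and `*.egg-info` commits above — so that pipenv vendors it into the bundle and `handler.py` can import `python_lib.utils`.)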
1,356 changes: 730 additions & 626 deletions py_src/Pipfile.lock

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions py_src/long_term_variance/handler.py
@@ -0,0 +1,92 @@
"""
long_term_variance/handler.py
"""

from datetime import timedelta
import numpy as np
import pandas as pd
from requests.exceptions import ConnectionError as RequestConnectionError, HTTPError
from python_lib.utils import parse_event, fetch_trends, build_index, build_df, AnomalyError

# the window size we are analyzing. The assumption is that consumption is
# normally similar across this period and for the same period one year prior
RECENT_DAYS = 7
# the minimum acceptable length of the datapoints array
MIN_DATAPOINTS_LENGTH = int(RECENT_DAYS * 24 * 0.6)
# if the magnitude of the z score is greater than this, it's anomalous
Z_SCORE_THRESHOLD = 3.0
ANOMALY_PORTION = 0.5
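# Worked example (assuming hourly datapoints, which is not stated in this diff):
# RECENT_DAYS = 7 gives MIN_DATAPOINTS_LENGTH = int(7 * 24 * 0.6) = 100, i.e. at
# least 60% of the 168 expected readings per window; with 8 three-hour bins,
# ANOMALY_PORTION = 0.5 means at least 5 bins must breach Z_SCORE_THRESHOLD.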

def run(event, _context):
    """
    handler.run - the lambda function entry point
    """
    # start out with a blank response
    response = {"statusCode": 200, "body": ""}
    # parse event and ensure timeStamp and pointName are present
    params = parse_event(event)
    print(f"long_term_variance run with params {params}")

    now = params.get("timeStamp")
    year_ago = now - timedelta(365)
    two_years_ago = now - timedelta(365 * 2)
    start_time = now - timedelta(RECENT_DAYS)
    one_year_st = year_ago - timedelta(RECENT_DAYS)
    two_year_st = two_years_ago - timedelta(RECENT_DAYS)
    point_name = params.get("pointName")
    try:
        # fetch the recent window, plus the same window one and two years prior
        df = build_df(fetch_trends(
            point=point_name, start_time=start_time, end_time=now))
        year1 = fetch_trends(
            point=point_name, start_time=one_year_st, end_time=year_ago)
        year2 = fetch_trends(
            point=point_name, start_time=two_year_st, end_time=two_years_ago)
        a = np.array(year1[0]["datapoints"] + year2[0]["datapoints"])
        if (a.size < MIN_DATAPOINTS_LENGTH * 2) or (df.size < MIN_DATAPOINTS_LENGTH):
            # not enough data to determine an anomaly
            return response
        t_df = pd.DataFrame.from_records(a, columns=("previous", "ts"), index="ts")
        t_df.index = pd.to_datetime(t_df.index, unit="ms")
        df = df.merge(t_df, how="outer", on="ts").interpolate()
        # add a column separating the time of day into 8 three-hour bins
        df["hour"] = df.index.hour
        hourbins = np.array(range(0, 24, 3))
        df["hourbin"] = np.digitize(df["hour"], bins=hourbins)
        # compute the mean and std of the prior years, and a z score for the recent data
        result = df.groupby(["hourbin"], as_index=False).agg(
            {"previous": ["mean", "std"], point_name: ["mean"]})
        result.columns = ["bin", "prev_mean", "prev_std", "curr_mean"]
        result["z"] = (result["curr_mean"] - result["prev_mean"]) / result["prev_std"]
        anomaly_count = result.loc[lambda x: np.abs(x["z"]) > Z_SCORE_THRESHOLD]["z"].size
        # report an anomaly if the z score magnitude breaches the threshold in
        # more than ANOMALY_PORTION of the bins
        if anomaly_count > (hourbins.size * ANOMALY_PORTION):
            raise AnomalyError(
                f"z score over {Z_SCORE_THRESHOLD} for more than {ANOMALY_PORTION:.0%} "
                f"of bins over the past {RECENT_DAYS} days relative to the past 2 years")
    except RequestConnectionError as err:
        response[
            "body"
        ] = f"""{point_name} ConnectionError:
{err.response.text} {start_time:%Y-%m-%d %H:%M} to {now:%Y-%m-%d %H:%M}"""
    except AnomalyError as err:
        response[
            "body"
        ] = f"""{point_name} Anomaly Detected: {err} {start_time:%Y-%m-%d %H:%M} to {now:%Y-%m-%d %H:%M}"""
    except HTTPError as err:
        response[
            "body"
        ] = f"""{point_name} {err.response.text}
for the period {start_time:%Y-%m-%d %H:%M} to {now:%Y-%m-%d %H:%M}"""
        if err.response.status_code == 400:
            try:  # have to try decoding the json body
                if err.response.json()["error"] == "No data":
                    response[
                        "body"
                    ] = f"""{point_name} has no data
for the period {start_time:%Y-%m-%d %H:%M} to {now:%Y-%m-%d %H:%M}"""
            except ValueError:
                pass
    return response
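As orientation for the handler above, a minimal local invocation sketch — not part of the commit; it assumes `python_lib` is importable and the trends service is reachable, and borrows its event shape and point name from the spec below:

from long_term_variance.handler import run

event = {
    "body": {
        "pointName": "BartonHall.CW.FP/TONS",  # point name taken from the spec's test cases
        "timeStamp": "2023-07-30",
    }
}
result = run(event, None)
print(result["statusCode"], result["body"])  # body is "" when no anomaly is detected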
163 changes: 163 additions & 0 deletions py_src/long_term_variance/longterm_spec.py
@@ -0,0 +1,163 @@
from long_term_variance.handler import run
from requests.exceptions import HTTPError
from requests.models import Response
import json


anom_3 = open('long_term_variance/test_cases/one_month_oldest.json')
"""
one_month_oldest.json is a file containing the swagger JSON response from the following query
{
"range": {
"from": "2019-8-19",
"to": "2019-9-18"
},
"targets": [
{
"target": "CarpenterHall.CW.FP/TONS"
}
]
}
"""
anom_2 = open('long_term_variance/test_cases/one_month_old.json')
"""
one_month_old.json is a file containing the swagger JSON response from the following query
{
"range": {
"from": "2020-8-19",
"to": "2020-9-18"
},
"targets": [
{
"target": "CarpenterHall.CW.FP/TONS"
}
]
}
"""

anom_1 = open('long_term_variance/test_cases/one_month_new.json')
"""
one_month_new.json is a file containing the swagger JSON response from the following query
{
"range": {
"from": "2021-8-19",
"to": "2021-9-18"
},
"targets": [
{
"target": "CarpenterHall.CW.FP/TONS"
}
]
}
"""
noanom_3 = open('long_term_variance/test_cases/noanom_oldest.json')
"""
noanom_oldest.json is a file containing the swagger JSON response from the following query
{
"range": {
"from": "2021-07-01",
"to": "2021-07-30"
},
"targets": [
{
"target": "BartonHall.CW.FP/TONS"
}
]
}
"""
noanom_2 = open('long_term_variance/test_cases/noanom_old.json')
"""
noanom_old.json is a file containing the swagger JSON response from the following query
{
"range": {
"from": "2022-07-01",
"to": "2022-07-30"
},
"targets": [
{
"target": "BartonHall.CW.FP/TONS"
}
]
}
"""
noanom_1 = open('long_term_variance/test_cases/noanom_new.json')
"""
noanom_new.json is a file containing the swagger JSON response from the following query
{
"range": {
"from": "2023-07-01",
"to": "2023-07-30"
},
"targets": [
{
"target": "BartonHall.CW.FP/TONS"
}
]
}
"""

data_noanom_1 = json.load(noanom_1)
data_noanom_2 = json.load(noanom_2)
data_noanom_3 = json.load(noanom_3)
data_anom_1 = json.load(anom_1)
data_anom_2 = json.load(anom_2)
data_anom_3 = json.load(anom_3)
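
# note: each side_effect list below must follow the handler's fetch order:
# the recent window first, then one year prior, then two years prior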


def test_onetime(mocker):
    event = {
        "body": {
            "pointName": "BartonHall.CW.FP/TONS",
            "timeStamp": "2023-07-30",
        }
    }
    # data1 stands in for the previous year and is empty,
    # simulating a point with insufficient history
    data1 = [{"datapoints": []}]
    mocker.patch(
        "long_term_variance.handler.fetch_trends",
        side_effect=[data_noanom_1, data1, data_noanom_3]
    )
    result = run(event, None)
    assert "statusCode" in result
    # if not enough data is present to be conclusive,
    # we expect the algorithm to return no anomaly
    assert "" == result.get("body")

def test_anom(mocker):
    event = {
        "body": {
            "pointName": "CarpenterHall.CW.FP/TONS",
            "timeStamp": "2021-9-18",
        }
    }
    # data_anom_1 is the recent data, data_anom_2 the previous year,
    # and data_anom_3 the data from two years prior
    mocker.patch(
        "long_term_variance.handler.fetch_trends",
        side_effect=[data_anom_1, data_anom_2, data_anom_3]
    )
    result = run(event, None)
    assert "statusCode" in result
    assert "Anomaly Detected" in result.get("body")

def test_noanom(mocker):
    event = {
        "body": {
            "pointName": "BartonHall.CW.FP/TONS",
            "timeStamp": "2023-07-30",
        }
    }
    # data_noanom_1 is the recent data, data_noanom_2 the previous year,
    # and data_noanom_3 the data from two years prior
    mocker.patch(
        "long_term_variance.handler.fetch_trends",
        side_effect=[data_noanom_1, data_noanom_2, data_noanom_3]
    )
    result = run(event, None)
    assert "statusCode" in result
    assert "" == result.get("body")
