Merge branch 'main' into main

U-Alberta · Mar 14, 2022 · 050aa30 · 050aa30
2 parents 88f75c6 + fb8c530
commit 050aa30
Show file tree

Hide file tree

Showing 11 changed files with 699 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -126,6 +126,7 @@ crate/blobs
 crate/repos
 
 # test
+sample.jsonl
 example_test.py
 example_data/*
 !example_data/dp/mlruns/multi*.pkl

diff --git a/data-programming/MLproject b/data-programming/MLproject
@@ -27,3 +27,29 @@ entry_points:
         --verbose {verbose}
         {train_data}
         "
+    # Trains the framing label model by running label/framing.py.
+    # trld and encoder are forwarded to the --trld / --encoder options that
+    # framing.py adds to the shared argument parser.
+    framing:
+        parameters:
+            train_data: path
+            task: {type: string, default: multiclass}
+            dev_data: {type: int, default: 0}
+            n_epochs: {type: int, default: 1000}
+            optimizer: {type: string, default: sgd}
+            prec_init: {type: float, default: 0.7}
+            seed: {type: int, default: 0}
+            parallel: {type: int, default: 0}
+            device: {type: string, default: cpu}
+            trld: {type: float, default: 0.5}
+            encoder: {type: string, default: roberta}
+        command: "python ./label/framing.py
+        --task {task}
+        --dev_data {dev_data}
+        --n_epochs {n_epochs}
+        --optimizer {optimizer}
+        --prec_init {prec_init}
+        --seed {seed}
+        --parallel {parallel}
+        --device {device}
+        --trld {trld}
+        --encoder {encoder}
+        {train_data}
+        "
diff --git a/data-programming/label/framing.py b/data-programming/label/framing.py
@@ -0,0 +1,24 @@
+import os
+
+from label import run, parser
+from label.lfs import FramingLabels
+from label.lfs.framing import get_lfs
+
+# Passed as the first argument to run.start() in main().
+REGISTERED_MODEL_NAME = 'FramingLabelModel'
+# Passed to run.start(); the keys follow the 'txt_clean_<encoder>' naming that
+# label/lfs/framing.py builds from the --encoder option ('roberta' or 'use').
+# NOTE(review): the meaning of the None values is defined by run.start -- confirm there.
+LF_FEATURES = {
+    'txt_clean_roberta': None,
+    'txt_clean_use': None,
+    }
+# Passed to run.start(); resolves to /annotations/framing/gold_df.pkl.
+DEV_ANNOTATIONS_PATH = os.path.join('/annotations', 'framing', 'gold_df.pkl')
+
+
+def main():
+    """
+    Entry point of the framing task.
+
+    Adds the framing-specific options to the parser imported from label, parses the command
+    line and hands the result to run.start together with this module's constants, the
+    get_lfs factory and the FramingLabels enum.
+    """
+    # Options that only the framing labeling functions read.
+    parser.add_argument('--trld', type=float, default=0.5, help='cosine similarity threshold')
+    parser.add_argument('--encoder', type=str, default='roberta', choices=('roberta', 'use'),
+                        help='which encoder embeddings to use')
+    args = parser.parse_args()
+    run.start(REGISTERED_MODEL_NAME, LF_FEATURES, DEV_ANNOTATIONS_PATH, get_lfs, FramingLabels, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data-programming/label/lfs/__init__.py b/data-programming/label/lfs/__init__.py
@@ -25,3 +25,16 @@ class ExampleLabels(Enum):
     bird = 2
     horse = 3
     snake = 4
+
+
+class FramingLabels(Enum):
+    """
+    Frame classes of the framing task.
+
+    The member names are the same as the choice values in label-studio/config/framing_config.xml
+    and the 'framing' entry of CLASSIFICATION_TASKS in label-studio/ls/__init__.py. The integer
+    value of a member is what a framing labeling function returns when it votes for that frame.
+    """
+    settled_science = 0
+    uncertain_science = 1
+    political_or_ideological_struggle = 2
+    disaster = 3
+    opportunity = 4
+    economic = 5
+    morality_and_ethics = 6
+    role_of_science = 7
+    security = 8
+    health = 9
diff --git a/data-programming/label/lfs/framing.py b/data-programming/label/lfs/framing.py
@@ -0,0 +1,45 @@
+import logging
+from snorkel.labeling import LabelingFunction
+import pandas as pd
+from label.lfs import FramingLabels, ABSTAIN
+from label import DATABASE_IP
+from scipy.spatial.distance import cdist
+
+# Loads every row of the frame_elements table. get_lfs() reads the element_id column and the
+# column named after the selected encoder from each row.
+FRAME_ELEMENT_QUERY = """
+SELECT * FROM frame_elements;
+"""
+# Minimum cosine similarity at which a labeling function votes instead of abstaining.
+# get_lfs() overwrites this default with the --trld command line value.
+TRLD = 0.5
+
+
+def get_lfs(parsed_args) -> [LabelingFunction]:
+    """
+    This function creates and returns a list of all lfs in this module.
+
+    The frame_elements table is read once from DATABASE_IP and one LabelingFunction is built for
+    every (label, frame element) pair, iterating the FramingLabels members in the outer loop and
+    the table rows in the inner loop.
+
+    Side effect: the module-level TRLD threshold is set to parsed_args.trld.
+
+    :param parsed_args: parsed command line arguments; .trld is the similarity threshold and
+        .encoder names both the frame_elements column holding the element embedding and the
+        'txt_clean_<encoder>' feature the labeling functions compare against
+    :return: A list of LabelingFunctions defined in this module
+    """
+    global TRLD
+    TRLD = parsed_args.trld
+    frame_elements_df = pd.read_sql(FRAME_ELEMENT_QUERY, DATABASE_IP)
+    # The feature name is the same for every labeling function, so build it once.
+    feature = 'txt_clean_{}'.format(parsed_args.encoder)
+    # NOTE(review): every element is paired with every label, and all LFs of one element share
+    # the name element_id. Snorkel appliers reject duplicate LF names, and an element presumably
+    # belongs to a single frame -- confirm whether the rows should be filtered per label.
+    lfs = [make_element_lf(row.element_id, getattr(row, parsed_args.encoder), feature, label)
+           for label in FramingLabels
+           for row in frame_elements_df.itertuples(index=False)]
+    logging.info("LFs have been gathered.")
+    return lfs
+
+
+def frame_element_similarity(x, element, encoder, label) -> int:
+    """
+    Vote for label when the data point is similar enough to a frame element.
+
+    :param x: data point; x[encoder] is handed to cdist as the second collection of vectors
+    :param element: embedding vector of the frame element
+    :param encoder: name of the feature of x that holds the vectors to compare against
+    :param label: FramingLabels member to vote for
+    :return: label.value if the best cosine similarity is at least TRLD, otherwise ABSTAIN
+    """
+    # cdist takes two collections of vectors, so the single element vector is wrapped in a list;
+    # row 0 of the result holds the cosine distance from the element to each vector in x[encoder].
+    element_distances = cdist([element], x[encoder], 'cosine')[0]
+    # Cosine similarity is 1 - cosine distance, so the smallest distance gives the best similarity.
+    best_similarity = 1 - min(element_distances)
+    if best_similarity >= TRLD:
+        return label.value
+    return ABSTAIN
+
+
+def make_element_lf(element_id: str, element, encoder: str, label) -> LabelingFunction:
+    """
+    Wrap frame_element_similarity in a LabelingFunction bound to one frame element and one label.
+
+    :param element_id: used as the name of the labeling function
+    :param element: embedding vector of the frame element
+    :param encoder: name of the data point feature to compare the element against
+    :param label: FramingLabels member the labeling function votes for
+    :return: the configured LabelingFunction
+    """
+    # These are supplied to frame_element_similarity as keyword arguments next to the data point.
+    lf_resources = {'element': element, 'encoder': encoder, 'label': label}
+    return LabelingFunction(name=element_id, f=frame_element_similarity, resources=lf_resources)
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -215,6 +215,7 @@ services:
       - DATABASE_IP=${DATABASE_IP}
       - MULTICLASS_EXAMPLE_MODEL_PATH=${MULTICLASS_EXAMPLE_MODEL_PATH}
       - MULTILABEL_EXAMPLE_MODEL_PATH=${MULTILABEL_EXAMPLE_MODEL_PATH}
+      - CLIMATE_FRAMES_MODEL_PATH=${CLIMATE_FRAMES_MODEL_PATH}
 
 networks:
   ls_network:

diff --git a/label-studio/config/framing_config.xml b/label-studio/config/framing_config.xml
@@ -0,0 +1,16 @@
+<View>
+  <!-- Shows the "txt" field of the task; the single-choice "frame" control annotates this text. -->
+  <Text name="text" value="$txt"/>
+  <Choices name="frame" toName="text" choice="single">
+    <Choice value="settled_science"/>
+    <Choice value="uncertain_science"/>
+    <Choice value="political_or_ideological_struggle"/>
+    <Choice value="disaster"/>
+    <Choice value="opportunity"/>
+    <Choice value="economic"/>
+    <Choice value="morality_and_ethics"/>
+    <Choice value="role_of_science"/>
+    <Choice value="security"/>
+    <Choice value="health"/>
+  </Choices>
+</View>
+
diff --git a/label-studio/ls/__init__.py b/label-studio/ls/__init__.py
@@ -6,7 +6,9 @@
 
 LABEL_PREFIX = 'worker_'
 CLASSIFICATION_TASKS = {
-    'example': ('cat', 'dog', 'bird', 'horse', 'snake')
+    'example': ('cat', 'dog', 'bird', 'horse', 'snake'),
+    # Same names, in the same order, as the choices in label-studio/config/framing_config.xml.
+    'framing': ("settled_science", "uncertain_science", "political_or_ideological_struggle", "disaster", "opportunity",
+                "economic", "morality_and_ethics", "role_of_science", "security", "health")
 }
 
 for task in CLASSIFICATION_TASKS:

diff --git a/modelling/app/main.py b/modelling/app/main.py
@@ -54,7 +54,33 @@ class ExampleModelResponse(BaseModel):
     prob_horse: List[float]
     prob_snake: List[float]
 
-# Try to load each model from the location given in the .env file. Don't interfere with the startup if one can't load,
+
+class FramingModelResponse(BaseModel):
+    """
+    Response body of the /predict_climate_frames endpoint.
+
+    Each field is filled from one column of the prediction result, so all lists are parallel:
+    position i of every list describes the same data point.
+    """
+    table: List[str]
+    # A list, not a set: a set would drop duplicate ids and lose the row order, so the ids would
+    # no longer line up with the other columns.
+    id: List[str]
+    # One integer column per frame.
+    # NOTE(review): presumably a 0/1 indicator of whether the frame was predicted -- confirm
+    # against the model's output.
+    settled_science: List[int]
+    uncertain_science: List[int]
+    political_or_ideological_struggle: List[int]
+    disaster: List[int]
+    opportunity: List[int]
+    economic: List[int]
+    morality_and_ethics: List[int]
+    role_of_science: List[int]
+    security: List[int]
+    health: List[int]
+    # One float column per frame, named 'prob_' + frame.
+    # NOTE(review): presumably the predicted probability of the frame -- confirm against the
+    # model's output.
+    prob_settled_science: List[float]
+    prob_uncertain_science: List[float]
+    prob_political_or_ideological_struggle: List[float]
+    prob_disaster: List[float]
+    prob_opportunity: List[float]
+    prob_economic: List[float]
+    prob_morality_and_ethics: List[float]
+    prob_role_of_science: List[float]
+    prob_security: List[float]
+    prob_health: List[float]
+
+
+# Try to load each model from the location given in the .env.dev file. Don't interfere with the startup if one can't load,
 # just wait and see if the client tries to get a prediction or do something with the model that failed to load.
 try:
     with open(os.environ['MULTICLASS_EXAMPLE_MODEL_PATH'], 'rb') as infile:
@@ -68,10 +94,17 @@ class ExampleModelResponse(BaseModel):
 except FileNotFoundError as err:
     print("Could not load MultilabelExampleModel: {}".format(err))
     multilabel_example_model = None
+# NOTE: pickle.load runs code stored in the file, so CLIMATE_FRAMES_MODEL_PATH must only ever
+# point at a trusted model file.
+try:
+    with open(os.environ['CLIMATE_FRAMES_MODEL_PATH'], 'rb') as infile:
+        climate_frames_model = pickle.load(infile)
+except (KeyError, OSError) as err:
+    # KeyError: the variable is not set at all. OSError: the path is missing, unreadable or not a
+    # file. In both cases startup continues and the model is reported as not loaded.
+    print("Could not load ClimateFramesModel: {}".format(err))
+    climate_frames_model = None
 
+# Display name of each model mapped to whether it was loaded at startup; predict_climate_frames
+# checks its entry before using the model.
 loaded_models_dict = {
     'Multiclass Example Model': multiclass_example_model is not None,
-    'Multilabel Example Model': multilabel_example_model is not None
+    'Multilabel Example Model': multilabel_example_model is not None,
+    'Climate Frames Model': climate_frames_model is not None
 }
 
 
@@ -159,3 +192,50 @@ def predict_multilabel_example(data_point_item: DataPointItem):
     }
 
     return response_dict
+
+
+@app.post("/predict_climate_frames", response_model=FramingModelResponse)
+def predict_climate_frames(data_point_item: DataPointItem):
+    """
+    Takes a list of data point ids from a table and returns the climate frames model's
+    predictions for them.
+
+    For every frame the response holds a column of integers named after the frame and a
+    column of floats named 'prob_<frame>', next to the 'table' and 'id' columns.
+
+    :param data_point_item: request body naming the data points to predict
+    :return: dict of parallel lists with the fields of FramingModelResponse
+    :raises ModelException: with code "load" if the model was not loaded at startup, with
+        code "predict" if the model raises while predicting
+    """
+    # Explicit check instead of assert: assert statements are removed when Python runs with -O.
+    if not loaded_models_dict.get('Climate Frames Model'):
+        raise ModelException(name="ClimateFramesModel", code="load")
+    json_data_point_item = jsonable_encoder(data_point_item)
+    id_json = json.dumps(json_data_point_item)
+    id_df = pd.read_json(id_json, dtype={'id': str, 'table': str})
+
+    try:
+        result_df = climate_frames_model.predict(id_df)
+    except Exception as error:
+        raise ModelException(name="ClimateFramesModel", code='predict', error=error) from error
+
+    frames = (
+        'settled_science',
+        'uncertain_science',
+        'political_or_ideological_struggle',
+        'disaster',
+        'opportunity',
+        'economic',
+        'morality_and_ethics',
+        'role_of_science',
+        'security',
+        'health',
+    )
+    columns = ['table', 'id']
+    columns.extend(frames)
+    columns.extend('prob_{}'.format(frame) for frame in frames)
+    # Every column is converted the same way, so no single entry can be read differently
+    # from the others (role_of_science used to be called instead of converted with tolist()).
+    response_dict = {column: result_df[column].tolist() for column in columns}
+
+    return response_dict