Update recogniser #201

Merged
merged 6 commits into from Mar 23, 2023
144 changes: 144 additions & 0 deletions examples/load_use_ner_model.ipynb
@@ -0,0 +1,144 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading and using a NER model\n",
"\n",
"This notebook shows how to load an existing named entity recognition (NER) model from the HuggingFace hub.\n",
"\n",
"We start by importing some libraries, and the `recogniser` script from the `geoparser` folder:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"sys.path.insert(0, os.path.abspath(os.path.pardir))\n",
"from geoparser import recogniser"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a `myner` object of the `Recogniser` class.\n",
"\n",
"We only need to pass the path to the model in `model` and set `load_from_hum` to True, as follows:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myner = recogniser.Recogniser(\n",
" model=\"dslim/bert-base-NER\", # Path to the HuggingFace model\n",
" load_from_hub=True, # Set this to True\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Print the Recogniser (see that most fields are empty, because they are not needed):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(myner)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"If we try to train the model, nothing happens, because we're loading an existing model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myner.train()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, to use the model you want to use, you'll need to load it into a NER pipeline:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myner.model, myner.pipe = myner.create_pipeline()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"And, finally, use the newly trained model to predict the named entities in a sentence."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"A remarkable case of rattening has just occurred in the building trade at Sheffield.\"\n",
"\n",
"predictions = myner.ner_predict(sentence)\n",
"print(predictions) # Note that, if you've trained the model in the test mode, the model will probably not identify \"Sheffield\" as a location."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "resolution-cNmUJBkC-py3.9",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
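Note: under the hood, `load_from_hub=True` boils down to building a standard HuggingFace token-classification pipeline. Below is a minimal sketch of the equivalent direct usage, assuming the `transformers` library is installed; the `Recogniser` class wraps this with extra bookkeeping:

```python
# Minimal sketch, not the repository's implementation: loading
# "dslim/bert-base-NER" from the HuggingFace hub directly.
from transformers import pipeline

ner_pipe = pipeline("ner", model="dslim/bert-base-NER")

sentence = (
    "A remarkable case of rattening has just occurred "
    "in the building trade at Sheffield."
)
# Each prediction is a dict with keys such as 'entity', 'word' and 'score'.
print(ner_pipe(sentence))
```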
40 changes: 27 additions & 13 deletions examples/train_use_ner_model.ipynb
@@ -7,7 +7,7 @@
"source": [
"# Training and using a NER model\n",
"\n",
"This notebook shows how to train a new toponym recognition (NER) model.\n",
"This notebook shows how to train a new toponym recognition (NER) model. You can see how to use a model from the HuggingFace hub in the `load_use_ner_model.ipynb` notebook.\n",
"\n",
"We start by importing some libraries, and the `recogniser` script from the `geoparser` folder:"
]
@@ -36,9 +36,7 @@
"> ```json\n",
"> {\"id\":\"3896239_29\",\"ner_tags\":[\"O\",\"B-STREET\",\"I-STREET\",\"O\",\"O\",\"O\",\"B-BUILDING\",\"I-BUILDING\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\"],\"tokens\":[\",\",\"Old\",\"Millgate\",\",\",\"to\",\"the\",\"Collegiate\",\"Church\",\",\",\"where\",\"they\",\"arrived\",\"a\",\"little\",\"after\",\"ten\",\"oclock\",\".\"]}\n",
"> {\"id\":\"8262498_11\",\"ner_tags\":[\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\",\"B-LOC\",\"O\",\"B-LOC\",\"O\",\"O\",\"O\",\"O\",\"O\",\"O\"],\"tokens\":[\"On\",\"the\",\"\\u2018\",\"JSth\",\"November\",\"the\",\"ship\",\"Santo\",\"Christo\",\",\",\"from\",\"Monteveido\",\"to\",\"Cadiz\",\",\",\"with\",\"hides\",\"and\",\"copper\",\".\"]}\n",
"> ```\n",
">\n",
"> **Note:** If your tagset is different from the one we use (`[\"O\", \"B-LOC\", \"I-LOC\", \"B-STREET\", \"I-STREET\", \"B-BUILDING\", \"I-BUILDING\", \"B-OTHER\", \"I-OTHER\", \"B-FICTION\", \"I-FICTION\"]`), you will need to manually modify the mapping dictionaries `map_tag_label()` and `encode_dict()` in the script [utils/ner.py](https://github.com/Living-with-machines/toponym-resolution/blob/main/utils/ner.py) accordingly. In the future, we plan to improve this bit so it's automatically done, but for the time being you will need to do this manually."
"> ```"
]
},
{
@@ -48,30 +46,46 @@
"outputs": [],
"source": [
"myner = recogniser.Recogniser(\n",
" model_name=\"your_chosen_model_name\", # The name of your NER model (note: we may append suffixes to it).\n",
" model=None, # We'll store the NER model here, leave this empty.\n",
" model=\"blb_lwm-ner-fine\",\n",
" train_dataset=\"../experiments/outputs/data/lwm/ner_fine_train.json\", # Path to the json file containing the training set (see note above).\n",
" test_dataset=\"../experiments/outputs/data/lwm/ner_fine_dev.json\", # Path to the json file containing the test set (see note above).\n",
" pipe=None, # We'll store the NER pipeline here, leave this empty.\n",
" base_model=\"khosseini/bert_1760_1900\", # Base model to fine-tune for NER. The value can be: either \n",
" # your local path to a model or the huggingface path.\n",
" # In this case, we use the huggingface path:\n",
" # https://huggingface.co/khosseini/bert_1760_1900). You can\n",
" # chose any other model from the HuggingFace hub, as long as it's\n",
" # trained on the \"Fill-Mask\" objective (filter by task).\n",
" train_dataset=\"../experiments/outputs/data/lwm/ner_df_train.json\", # Path to the json file containing the training set (see note above).\n",
" test_dataset=\"../experiments/outputs/data/lwm/ner_df_dev.json\", # Path to the json file containing the test set (see note above).\n",
" output_model_path=\"../resources/models/\", # Path where the NER model will be stored.\n",
" model_path=\"../resources/models/\", # Path where the NER model will be stored.\n",
" training_args={\n",
" \"learning_rate\": 5e-5,\n",
" \"batch_size\": 16,\n",
" \"num_train_epochs\": 4,\n",
" \"weight_decay\": 0.01,\n",
" }, # Training arguments: you can change them.\n",
" overwrite_training=True, # Set to True if you want to overwrite an existing model with the same name.\n",
" do_test=True, # Set to True if you want to perform the training on test mode (the string \"_test\" will be appended to your model name).\n",
" training_tagset=\"fine\", # Options are: \"coarse\" (will cluster all tags under LOC) or \"fine\" (will keep the different tags). See note above.\n",
" overwrite_training=False, # Set to True if you want to overwrite an existing model with the same name.\n",
" do_test=False, # Set to True if you want to perform the training on test mode (the string \"_test\" will be appended to your model name).\n",
" load_from_hub=False, # Whether the model should be loaded from the HuggingFace hub\n",
")"
]
},
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Print the Recogniser:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(myner)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -122,7 +136,7 @@
"sentence = \"A remarkable case of rattening has just occurred in the building trade at Sheffield.\"\n",
"\n",
"predictions = myner.ner_predict(sentence)\n",
"print(predictions) # Note that, if you've trained the model in the test mode, the model will probably not identify \"Sheffield\" as a location."
"print([pred for pred in predictions if pred[\"entity\"] != \"O\"]) # Note that, if you've trained the model in the test mode, the model will probably not identify \"Sheffield\" as a location."
]
}
],
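Note: the training and test sets passed to the `Recogniser` above are JSON-lines files with `id`, `ner_tags`, and `tokens` fields (see the example records in the diff). A quick sketch for inspecting them, assuming the files have been generated by `experiments/prepare_data.py` and that pandas is available:

```python
# Sketch: read the JSON-lines NER training data (one record per line).
import pandas as pd

train_df = pd.read_json(
    "../experiments/outputs/data/lwm/ner_fine_train.json",
    orient="records",
    lines=True,
)
print(train_df.columns.tolist())     # expected: ['id', 'ner_tags', 'tokens']
print(train_df.iloc[0]["tokens"])    # tokens of the first sentence
print(train_df.iloc[0]["ner_tags"])  # the aligned BIO tags
```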
22 changes: 20 additions & 2 deletions experiments/prepare_data.py
@@ -79,10 +79,28 @@
lwm_train_ner, lwm_dev_ner = train_test_split(
    lwm_df, test_size=0.2, random_state=RANDOM_SEED
)

+ # Store LwM with fine-grained tags:
+ lwm_train_ner.to_json(
+     output_path_lwm + "ner_fine_train.json", orient="records", lines=True
+ )
+ lwm_dev_ner.to_json(output_path_lwm + "ner_fine_dev.json", orient="records", lines=True)
+
+ # Convert fine-grained tags to coarse:
+ lwm_train_ner["ner_tags"] = lwm_train_ner["ner_tags"].apply(
+     lambda x: preprocess_data.fine_to_coarse(x)
+ )
+ lwm_dev_ner["ner_tags"] = lwm_dev_ner["ner_tags"].apply(
+     lambda x: preprocess_data.fine_to_coarse(x)
+ )
+
+ # Store LwM with coarse-grained tags:
lwm_train_ner.to_json(
-     output_path_lwm + "ner_df_train.json", orient="records", lines=True
+     output_path_lwm + "ner_coarse_train.json", orient="records", lines=True
)
+ lwm_dev_ner.to_json(
+     output_path_lwm + "ner_coarse_dev.json", orient="records", lines=True
+ )
- lwm_dev_ner.to_json(output_path_lwm + "ner_df_dev.json", orient="records", lines=True)

# Process data for the resolution experiments:
lwm_train_df = preprocess_data.process_lwm_for_linking(topres_path_train, gazetteer_ids)
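Note: `preprocess_data.fine_to_coarse` itself is not part of this diff. Based on the tagsets described in the notebooks ("coarse" clusters all entity types under LOC), it plausibly behaves like the following hypothetical sketch:

```python
# Hypothetical sketch of a fine-to-coarse tag conversion (the actual
# implementation lives in the repository's preprocess_data module):
# fine-grained entity tags such as B-STREET or I-BUILDING are clustered
# under LOC, while "O" tags are kept unchanged.
def fine_to_coarse(ner_tags):
    return [tag if tag == "O" else tag[:2] + "LOC" for tag in ner_tags]

print(fine_to_coarse(["O", "B-STREET", "I-STREET", "O", "B-BUILDING"]))
# -> ['O', 'B-LOC', 'I-LOC', 'O', 'B-LOC']
```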