Merge pull request #270 from Living-with-machines/dev_rw

Restructure t_res

thobson88 authored Oct 31, 2024
2 parents 713cab0 + d060a16 commit cff9425
Showing 114 changed files with 5,426 additions and 934 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
@@ -50,3 +50,7 @@ jobs:
         shell: bash
         run: |
           python -m poetry install
+      - name: Test with pytest
+        run: |
+          poetry run pytest
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -49,7 +49,7 @@ jobs:
       - name: Install dependencies
         shell: bash
         run: |
-          python -m poetry install --extras docs
+          python -m poetry install --with docs
       - name: Build documentation
         run: |
24 changes: 15 additions & 9 deletions .gitignore
@@ -129,18 +129,24 @@ dmypy.json
 .pyre/


-outputs/
-resources/
+/experiments/outputs/
+/resources/
 poetry.lock
 .vscode/*
-evaluation/results/*
-evaluation/CLEF-HIPE-2020-scorer/
-experiments/tmp_*
+/evaluation/results/*
+/evaluation/HIPE-scorer/
+/experiments/tmp_*
 preprocessing/toponymmatching/experiments/
-experiments/REL/
-evaluation/results_table.pkl
-experiments/explore_data.ipynb
-experiments/examine_res.py
+/experiments/REL/
+/evaluation/results_table.pkl
+/experiments/explore_data.ipynb
+/experiments/examine_res.py
+
+/tests/sample_files/experiments/outputs/
+/tests/sample_files/resources/deezymatch/candidate_vectors/
+/tests/sample_files/resources/deezymatch/combined/
+/tests/sample_files/resources/deezymatch/data/
+/tests/sample_files/resources/deezymatch/ranking/

 # Docs
 _build
11 changes: 6 additions & 5 deletions README.md
@@ -28,6 +28,9 @@ T-Res relies on several resources in the following directory structure:

 ```
 T-Res/
+├── t-res/
+│   ├── geoparser/
+│   └── utils/
 ├── app/
 ├── evaluation/
 ├── examples/
@@ -38,11 +41,10 @@ T-Res/
 │   ├── linking_df_split.tsv [*?]
 │   ├── ner_fine_dev.json [*+?]
 │   └── ner_fine_train.json [*+?]
-├── geoparser/
 ├── resources/
 │   ├── deezymatch/
 │   │   └── data/
-│   │       └── w2v_ocr_pairs.txt [*+?]
+│   │       └── w2v_ocr_pairs.txt [?]
 │   ├── models/
 │   ├── news_datasets/
 │   ├── rel_db/
@@ -53,8 +55,7 @@
 │   ├── mentions_to_wikidata.json [*]
 │   ├── wikidta_gazetteer.csv [*]
 │   └── wikidata_to_mentions_normalized.json [*]
-├── tests/
-└── utils/
+└── tests/
 ```

 These resources are described in detail in the documentation. A question mark (`?`) is used to indicate resources which are only required for some approaches (for example, the `rel_db/embeddings_database.db` file is only required by the REL-based disambiguation approaches). Note that an asterisk (`*`) next to the resource means that the path can be changed when instantiating the T-Res objects, and a plus sign (`+`) if the name of the file can be changed in the instantiation.
@@ -68,7 +69,7 @@ This is an example on how to use the default T-Res pipeline:

 ```python
 from geoparser import pipeline

-geoparser = pipeline.Pipeline()
+geoparser = pipeline.Pipeline(resources_path="./resources")

 output = geoparser.run_text("She was on a visit at Chippenham.")
 ```
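The updated README example instantiates the pipeline against a local `./resources` directory. A minimal sketch of inspecting the output of `run_text` follows, assuming each element of the returned list is a dict with fields such as `mention` and `prediction`; the key names are an assumption, not shown in this diff:

```python
from geoparser import pipeline

# Instantiate against a local resources directory, as in the updated README.
geoparser = pipeline.Pipeline(resources_path="./resources")

output = geoparser.run_text("She was on a visit at Chippenham.")

for toponym in output:
    # Key names ("mention", "prediction") are an assumption, not confirmed
    # by this commit; inspect one element to see the actual schema.
    print(toponym.get("mention"), "->", toponym.get("prediction"))
```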
11 changes: 1 addition & 10 deletions app/app_template.py
@@ -8,18 +8,9 @@
 from fastapi import FastAPI, Request
 from pydantic import BaseModel

-if "toponym-resolution" in __file__:
-    root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-else:
-    root_path = os.path.dirname(os.path.abspath(__file__))
-experiments_path = Path(root_path, "experiments")
-sys.path.insert(0, str(root_path))
-sys.path.insert(0, str(experiments_path))
-os.chdir(experiments_path)
-
 from config import CONFIG as pipeline_config

-from geoparser import pipeline
+from t_res.geoparser import pipeline

 geoparser = pipeline.Pipeline(**pipeline_config)
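This template now imports the installed `t_res` package directly instead of patching `sys.path`, and builds the pipeline once from the active config. Below is a minimal sketch of the kind of endpoint such an app typically exposes; the `/resolve` route, request model, and response shape are hypothetical illustrations, not part of this commit:

```python
from fastapi import FastAPI
from pydantic import BaseModel

from t_res.geoparser import pipeline

app = FastAPI()
# resources_path is confirmed by the README change in this commit;
# everything below is a hypothetical endpoint for illustration only.
geoparser = pipeline.Pipeline(resources_path="./resources")


class ResolveRequest(BaseModel):
    text: str


@app.post("/resolve")  # hypothetical route, not taken from this commit
def resolve(request: ResolveRequest):
    # Run toponym recognition, candidate ranking and entity linking.
    return {"toponyms": geoparser.run_text(request.text)}
```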
20 changes: 9 additions & 11 deletions app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py
@@ -1,29 +1,26 @@
 import os
 import sys
 import sqlite3
 from pathlib import Path

-# sys.path.insert(0, os.path.abspath(os.path.pardir))
-from geoparser import pipeline, ranking, linking
+from t_res.geoparser import linking, pipeline, ranking

 # --------------------------------------
 # Instantiate the ranker:
 myranker = ranking.Ranker(
     method="deezymatch",
-    resources_path="../resources/wikidata/",
+    resources_path="./resources/",
     strvar_parameters={
         # Parameters to create the string pair dataset:
         "ocr_threshold": 60,
         "top_threshold": 85,
         "min_len": 5,
         "max_len": 15,
-        "w2v_ocr_path": str(Path("../resources/models/w2v/").resolve()),
+        "w2v_ocr_path": str(Path("./resources/models/w2v/").resolve()),
         "w2v_ocr_model": "w2v_*_news",
         "overwrite_dataset": False,
     },
     deezy_parameters={
         # Paths and filenames of DeezyMatch models and data:
-        "dm_path": str(Path("../resources/deezymatch/").resolve()),
+        "dm_path": str(Path("./resources/deezymatch/").resolve()),
         "dm_cands": "wkdtalts",
         "dm_model": "w2v_ocr",
         "dm_output": "deezymatch_on_the_fly",
@@ -38,15 +35,16 @@
     },
 )

-with sqlite3.connect("../resources/rel_db/embeddings_database.db") as conn:
+with sqlite3.connect("./resources/rel_db/embeddings_database.db") as conn:
     cursor = conn.cursor()
     mylinker = linking.Linker(
         method="reldisamb",
-        resources_path="../resources/",
+        resources_path="./resources/",
+        experiments_path="./experiments/",
         linking_resources=dict(),
         rel_params={
-            "model_path": "../resources/models/disambiguation/",
-            "data_path": "outputs/data/lwm/",
+            "model_path": "./resources/models/disambiguation/",
+            "data_path": "./experiments/outputs/data/lwm/",
             "training_split": "originalsplit",
             "db_embeddings": cursor,
             "with_publication": True,
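The app entry points import this module and splat its `CONFIG` dict into the constructor via `pipeline.Pipeline(**pipeline_config)`. Here is a sketch of how the truncated tail of this config might bundle the objects defined above, assuming `Pipeline` accepts keyword arguments named `myranker` and `mylinker`; the actual `CONFIG` assignment is not shown in this diff:

```python
# Hypothetical tail of the config module: bundle the configured objects
# under the keyword names Pipeline expects. The names "myranker" and
# "mylinker" are an assumption, not shown in this truncated diff.
CONFIG = {
    "myranker": myranker,
    "mylinker": mylinker,
}

# app_template.py then consumes it as:
#   geoparser = pipeline.Pipeline(**pipeline_config)
```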
16 changes: 3 additions & 13 deletions app/run_local_app.py
@@ -1,33 +1,23 @@
+import importlib
 import os
 import sys
 import time
 from pathlib import Path
-from typing import Union, Optional, List
+from typing import List, Optional, Union

 import uvicorn
 from fastapi import FastAPI, Request
 from pydantic import BaseModel

-if "toponym-resolution" in __file__:
-    root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-else:
-    root_path = os.path.dirname(os.path.abspath(__file__))
-experiments_path = Path(root_path, "experiments")
-sys.path.insert(0, str(root_path))
-sys.path.insert(0, str(experiments_path))
-os.chdir(experiments_path)
+from t_res.geoparser import pipeline

 os.environ["APP_CONFIG_NAME"] = "t-res_deezy_reldisamb-wpubl-wmtops"
-import importlib

 config_mod = importlib.import_module(
     ".t-res_deezy_reldisamb-wpubl-wmtops", "app.configs"
 )
 pipeline_config = config_mod.CONFIG

-
-from geoparser import pipeline
-
 geoparser = pipeline.Pipeline(**pipeline_config)
3 changes: 2 additions & 1 deletion app/template.Dockerfile
@@ -4,10 +4,11 @@ ARG APP_NAME
 WORKDIR /app

 COPY pyproject.toml /app/pyproject.toml
+COPY t_res /app/t_res

 RUN pip3 install poetry
 RUN poetry config virtualenvs.create false
-RUN poetry install --no-dev
+RUN poetry install

 ENV APP_CONFIG_NAME=${APP_NAME}
 COPY app/app_template.py /app/app.py
6 changes: 3 additions & 3 deletions docs/source/experiments/index.rst
@@ -6,7 +6,7 @@ Follow these steps to reproduce the experiments in our paper.
 1. Obtain the external resources
 --------------------------------

-Follow the instructions in the ":doc:`resources`" page in the documentation
+Follow the instructions in the ":doc:`/getting-started/resources`" page in the documentation
 to obtain the resources required for running the experiments.

 2. Preparing the data
@@ -17,7 +17,7 @@ run the following command from the ``./experiments/`` folder:

 .. code-block:: bash

-   $ python ./prepare_data.py
+   $ python ./prepare_data.py -p ../resources

 This script takes care of downloading the LwM and HIPE datasets and format them
 as needed in the experiments.
@@ -30,7 +30,7 @@

 .. code-block:: bash

-   $ python ./toponym_resolution.py
+   $ python ./toponym_resolution.py -p ../resources

 This script does runs for all different scenarios reported in the experiments in
 the paper.
10 changes: 6 additions & 4 deletions docs/source/getting-started/complete-tour.rst
@@ -47,7 +47,9 @@ To instantiate the default T-Res pipeline, do:

    from geoparser import pipeline

-   geoparser = pipeline.Pipeline()
+   geoparser = pipeline.Pipeline(resources_path="../resources/")
+
+.. note:: You should update the resources path argument to reflect your set up.

 You can also instantiate a pipeline using a customised Recogniser, Ranker and
 Linker. To see the different options, refer to the sections on instantiating
@@ -603,7 +605,7 @@ and ``levenshtein`` respectively), instantiate it as follows, changing the

    myranker = ranking.Ranker(
        method="perfectmatch", # or "partialmatch" or "levenshtein"
-       resources_path="resources/wikidata/",
+       resources_path="resources/",
    )

 Note that ``resources_path`` should contain the path to the directory
@@ -668,7 +670,7 @@ The Ranker can then be instantiated as follows:

    myranker = ranking.Ranker(
        # Generic Ranker parameters:
        method="deezymatch",
-       resources_path="resources/wikidata/",
+       resources_path="resources/",
        # Parameters to create the string pair dataset:
        strvar_parameters=dict(),
        # Parameters to train, load and use a DeezyMatch model:
@@ -757,7 +759,7 @@ The Ranker can then be instantiated as follows:

    myranker = ranking.Ranker(
        # Generic Ranker parameters:
        method="deezymatch",
-       resources_path="resources/wikidata/",
+       resources_path="resources/",
        # Parameters to create the string pair dataset:
        strvar_parameters={
            "ocr_threshold": 60,
9 changes: 8 additions & 1 deletion docs/source/getting-started/installation.rst
@@ -113,12 +113,19 @@ To add a package:

    $ poetry add [package name]

-To run the Python tests:
+To run the Python unit tests:

 .. code-block:: bash

    $ poetry run pytest

+To run unit and integration tests, some of which depend on the `T-Res resources <../getting-started/resources.html>`_:
+
+.. code-block:: bash
+
+   $ poetry run pytest tests --no-skip
+
 If you want to use Jupyter notebook, run it as usual, and then select the
 created kernel in "Kernel" > "Change kernel".
7 changes: 4 additions & 3 deletions docs/source/getting-started/resources.rst
@@ -561,6 +561,9 @@ for the mentioned resources that are required in order to run the pipeline.
 ::

     T-Res/
+    ├── t-res/
+    │   ├── geoparser/
+    │   └── utils/
     ├── app/
     ├── evaluation/
     ├── examples/
@@ -571,7 +574,6 @@ for the mentioned resources that are required in order to run the pipeline.
     │   ├── linking_df_split.tsv [*?]
     │   ├── ner_fine_dev.json [*+?]
     │   └── ner_fine_train.json [*+?]
-    ├── geoparser/
     ├── resources/
     │   ├── deezymatch/
     │   │   └── data/
@@ -586,8 +588,7 @@
     │   ├── mentions_to_wikidata.json [*]
     │   ├── wikidta_gazetteer.csv [*]
     │   └── wikidata_to_mentions_normalized.json [*]
-    ├── tests/
-    └── utils/
+    └── tests/

 A question mark (``?``) is used to indicate resources which are only required
 for some approaches (for example, the ``rel_db/embeddings_database.db`` file
6 changes: 3 additions & 3 deletions docs/source/reference/geoparser/linker.rst
@@ -1,8 +1,8 @@
-``geoparser.linking.Linker``
+``t_res.geoparser.linking.Linker``
 ============================

-.. autoclass:: geoparser.linking.Linker
+.. autoclass:: t_res.geoparser.linking.Linker
    :members:
    :undoc-members:

-.. autoattribute:: geoparser.linking.RANDOM_SEED
+.. autoattribute:: t_res.geoparser.linking.RANDOM_SEED
4 changes: 2 additions & 2 deletions docs/source/reference/geoparser/pipeline.rst
@@ -1,6 +1,6 @@
-``geoparser.pipeline.Pipeline``
+``t_res.geoparser.pipeline.Pipeline``
 ===============================

-.. autoclass:: geoparser.pipeline.Pipeline
+.. autoclass:: t_res.geoparser.pipeline.Pipeline
    :members:
    :undoc-members:
4 changes: 2 additions & 2 deletions docs/source/reference/geoparser/ranker.rst
@@ -1,6 +1,6 @@
-``geoparser.ranking. Ranker``
+``t_res.geoparser.ranking. Ranker``
 =============================

-.. autoclass:: geoparser.ranking.Ranker
+.. autoclass:: t_res.geoparser.ranking.Ranker
    :members:
    :undoc-members:
4 changes: 2 additions & 2 deletions docs/source/reference/geoparser/recogniser.rst
@@ -1,6 +1,6 @@
-``geoparser.recogniser.Recogniser``
+``t_res.geoparser.recogniser.Recogniser``
 ===================================

-.. autoclass:: geoparser.recogniser.Recogniser
+.. autoclass:: t_res.geoparser.recogniser.Recogniser
    :members:
    :undoc-members:
10 changes: 5 additions & 5 deletions docs/source/reference/utils/deezy_processing.rst
@@ -1,10 +1,10 @@
-``utils.deezy_processing`` module
+``t_res.utils.deezy_processing`` module
 =================================

-.. autofunction:: utils.deezy_processing.obtain_matches
+.. autofunction:: t_res.utils.deezy_processing.obtain_matches

-.. autofunction:: utils.deezy_processing.create_training_set
+.. autofunction:: t_res.utils.deezy_processing.create_training_set

-.. autofunction:: utils.deezy_processing.train_deezy_model
+.. autofunction:: t_res.utils.deezy_processing.train_deezy_model

-.. autofunction:: utils.deezy_processing.generate_candidates
+.. autofunction:: t_res.utils.deezy_processing.generate_candidates