Adds data and README.md #5
base: main
`.gitignore`:

```diff
@@ -127,3 +127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+.DS_Store
```
`README.md`:

```diff
@@ -1,12 +1,21 @@
-# IAI Project Template
+# CLEF 2024 CheckThat! Task 1 IAI participation
 
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
-This repository serves as a template for software projects.
-
-# Testing and GitHub actions
-
-Using `pre-commit` hooks, `flake8`, `black`, `mypy`, `docformatter`, and `pytest` are run locally on every commit. For more details on how to use `pre-commit` hooks see [here](https://github.com/iai-group/guidelines/tree/main/python#install-pre-commit-hooks).
-
-Similarly, GitHub actions are used to run `flake8`, `black`, and `pytest` on every push and pull request. The `pytest` results are sent to [CodeCov](https://about.codecov.io/) using their API to get test coverage analysis. Details on GitHub actions are [here](https://github.com/iai-group/guidelines/blob/main/github/Actions.md).
+This repo contains the code and data for the IAI participation in CLEF 2024 CheckThat! Task 1.
+
+## Folders
+* checkthat - Python module for claim detection.
+* data
+  * task1 - Contains the data for Task 1, which is also uploaded to the Hugging Face Hub.
+    * The dataset is currently gated/private; make sure you have run `huggingface-cli login`.
+    * Usage:
+```
+from datasets import load_dataset
+# English data containing political debates.
+dataset_en = load_dataset("iai-group/clef2024_checkthat_task1_en")
+# Spanish data containing tweets.
+dataset_es = load_dataset("iai-group/clef2024_checkthat_task1_es")
+# Dutch data containing tweets.
+dataset_nl = load_dataset("iai-group/clef2024_checkthat_task1_nl")
+# Arabic data containing tweets.
+dataset_ar = load_dataset("iai-group/clef2024_checkthat_task1_ar")
+```
```
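The listed datasets are gated, so `load_dataset` only works from an authenticated Hugging Face session. A minimal sketch of the full flow, assuming `huggingface-cli login` has already been run (the `token=True` argument simply reuses the stored token):

```python
# Minimal sketch: load one of the gated datasets with stored credentials.
# Assumes `huggingface-cli login` has been run beforehand.
from datasets import load_dataset

dataset_en = load_dataset(
    "iai-group/clef2024_checkthat_task1_en",
    token=True,  # reuse the token saved by `huggingface-cli login`
)
print(dataset_en)  # DatasetDict listing the splits and their sizes
```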
New file:

```diff
@@ -0,0 +1 @@
+"""Code to train and evaluate the models for CheckThat tasks."""
```
New file:

```diff
@@ -0,0 +1 @@
+"""Code to train and evaluate the models for CheckThat task 1."""
```
This file was deleted.
`data/README.md`:

```diff
@@ -1,3 +1,9 @@
 # Data
 
-This folder should contain the description of data that was used in the project, including datasets, queries, ground truth, run files, etc. Files under 10MB can be stored on GitHub, larger files should be stored on a server (e.g., gustav1). This README should provide a comprehensive overview of all the data that is used and where it originates from (e.g., part of an official test collection, generated using code in this repo or a third-party tool, etc.).
+The Task 1 data is under the `task1` folder, which also contains the code to load and prepare the HuggingFace data. The data is already uploaded to the Hugging Face Hub:
+* English data: `iai-group/clef2024_checkthat_task1_en`
+* Spanish data: `iai-group/clef2024_checkthat_task1_es`
+* Dutch data: `iai-group/clef2024_checkthat_task1_nl`
+* Arabic data: `iai-group/clef2024_checkthat_task1_ar`
+
+This data is provided by the [CLEF 2024 CheckThat! organizers](https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/tree/main/task1?ref_type=heads).
```
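For a quick sanity check that a Hub copy loads as described, one can look at a split's label distribution. A sketch, with the dataset name taken from the list above and the `class_label` column name taken from the loading scripts in this PR:

```python
# Sketch: inspect the label distribution of the Spanish training split.
from collections import Counter

from datasets import load_dataset

ds = load_dataset("iai-group/clef2024_checkthat_task1_es")
print(ds)  # available splits
print(Counter(ds["train"]["class_label"]))  # label counts in the training split
```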
Three files with large diffs are not rendered by default.
New file (`@@ -0,0 +1,99 @@`), the loading script for the English data:

```python
"""Multilang Dataset loading script."""

from datasets import (
    DatasetInfo,
    BuilderConfig,
    Version,
    GeneratorBasedBuilder,
    DownloadManager,
)
from datasets import SplitGenerator, Split, Features, Value

from typing import Generator, Tuple, Union

import os

_DESCRIPTION = """
This dataset includes English data for CLEF 2024 CheckThat! Lab task1.
"""

# Raw string so the BibTeX escape sequences survive verbatim.
_CITATION = r"""
@inproceedings{barron2024clef,
  title={The CLEF-2024 CheckThat! Lab: Check-Worthiness, Subjectivity, Persuasion, Roles, Authorities, and Adversarial Robustness},
  author={Barr{\'o}n-Cede{\~n}o, Alberto and Alam, Firoj and Chakraborty, Tanmoy and Elsayed, Tamer and Nakov, Preslav and Przyby{\l}a, Piotr and Stru{\ss}, Julia Maria and Haouari, Fatima and Hasanain, Maram and Ruggeri, Federico and others},
  booktitle={European Conference on Information Retrieval},
  pages={449--458},
  year={2024},
  organization={Springer}
}
"""  # noqa: E501

_LICENSE = "Your dataset's license here."


class CLEF24EnData(GeneratorBasedBuilder):
    """A multilingual text dataset."""

    BUILDER_CONFIGS = [
        BuilderConfig(
            name="multilang_dataset",
            version=Version("1.0.0"),
            description="Multilingual dataset for text classification.",
        ),
    ]

    DEFAULT_CONFIG_NAME = "multilang_dataset"  # Default configuration name.

    def _info(self):
        """Construct the DatasetInfo object."""
        return DatasetInfo(
            description=_DESCRIPTION,
            features=Features(
                {
                    "Sentence_id": Value("string"),
                    "Text": Value("string"),
                    "class_label": Value("string"),
                }
            ),
            supervised_keys=("Text", "class_label"),
            homepage="https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/tree/main/task1",  # noqa: E501
            citation=_CITATION,
            license=_LICENSE,
        )

    def _split_generators(
        self, dl_manager: DownloadManager
    ) -> list[SplitGenerator]:
        """Returns SplitGenerators."""
        # Assumes the train/dev/test TSV files sit in the working directory.
        data_dir = os.path.abspath(".")
        splits = {
            "train": Split.TRAIN,
            "dev": Split.VALIDATION,
            "test": Split.TEST,
        }

        return [
            SplitGenerator(
                name=splits[split],
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"{split}.tsv"),
                    "split": splits[split],
                },
            )
            for split in splits.keys()
        ]

    def _generate_examples(
        self, filepath: Union[str, os.PathLike], split: str
    ) -> Generator[Tuple[str, dict], None, None]:
        """Yields examples."""
        with open(filepath, encoding="utf-8") as f:
            for id_, row in enumerate(f):
                if id_ == 0:  # Skip the header row.
                    continue
                cols = row.strip().split("\t")
                # Keys must match the Features declared in _info().
                yield f"{split}_{id_}", {
                    "Sentence_id": cols[0],
                    "Text": cols[1],
                    "class_label": cols[2],
                }
```

Review comments on this file:

> On the `datasets` imports: Should the functions be sorted alphabetically?

> On the class docstring: Here and further: this is not a multilingual dataset; the class can only contain English data.

> On `SplitGenerator`: "Split generator" is a bit of a misleading name, as it suggests that we are the ones generating the splits.
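To try the script above locally, `datasets` can build the dataset straight from the script path. A minimal sketch, where the filename `clef24_en_data.py` is a placeholder (the actual filename is not shown in this diff) and `train.tsv`, `dev.tsv`, and `test.tsv` are expected in the current working directory:

```python
# Sketch: build the dataset from the local loading script above.
# "clef24_en_data.py" is a placeholder name for the script file.
from datasets import load_dataset

dataset = load_dataset(
    "clef24_en_data.py",
    trust_remote_code=True,  # newer `datasets` versions require this for scripts
)
print(dataset["validation"][0])  # one example: Sentence_id, Text, class_label
```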
New file (`@@ -0,0 +1,101 @@`), the loading script for the Arabic/Dutch/Spanish Twitter data:

```python
"""Multilang Dataset loading script."""

from datasets import (
    DatasetInfo,
    BuilderConfig,
    Version,
    GeneratorBasedBuilder,
    DownloadManager,
)
from datasets import SplitGenerator, Split, Features, Value
from typing import Generator, Tuple, Union

import os

_DESCRIPTION = """
This dataset includes Arabic/Dutch/Spanish Twitter data for CLEF 2024 CheckThat! Lab task1.
"""  # noqa: E501

# Raw string so the BibTeX escape sequences survive verbatim.
_CITATION = r"""
@inproceedings{barron2024clef,
  title={The CLEF-2024 CheckThat! Lab: Check-Worthiness, Subjectivity, Persuasion, Roles, Authorities, and Adversarial Robustness},
  author={Barr{\'o}n-Cede{\~n}o, Alberto and Alam, Firoj and Chakraborty, Tanmoy and Elsayed, Tamer and Nakov, Preslav and Przyby{\l}a, Piotr and Stru{\ss}, Julia Maria and Haouari, Fatima and Hasanain, Maram and Ruggeri, Federico and others},
  booktitle={European Conference on Information Retrieval},
  pages={449--458},
  year={2024},
  organization={Springer}
}
"""  # noqa: E501

_LICENSE = "Your dataset's license here."


class CLEF24TwitterData(GeneratorBasedBuilder):
    """A multilingual Twitter dataset for claim detection."""

    BUILDER_CONFIGS = [
        BuilderConfig(
            name="clef24_tweet_data",
            version=Version("1.0.0"),
            description="Multilingual dataset for text classification.",
        ),
    ]

    DEFAULT_CONFIG_NAME = "clef24_tweet_data"  # Default configuration name.

    def _info(self):
        """Construct the DatasetInfo object."""
        return DatasetInfo(
            description=_DESCRIPTION,
            features=Features(
                {
                    "tweet_id": Value("string"),
                    "tweet_url": Value("string"),
                    "tweet_text": Value("string"),
                    "class_label": Value("string"),
                }
            ),
            supervised_keys=("tweet_text", "class_label"),
            homepage="https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/tree/main/task1",  # noqa: E501
            citation=_CITATION,
            license=_LICENSE,
        )

    def _split_generators(
        self, dl_manager: DownloadManager
    ) -> list[SplitGenerator]:
        """Returns SplitGenerators."""
        # Assumes the train/dev/test TSV files sit in the working directory.
        data_dir = os.path.abspath(".")
        splits = {
            "train": Split.TRAIN,
            "dev": Split.VALIDATION,
            "test": Split.TEST,
        }

        return [
            SplitGenerator(
                name=splits[split],
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"{split}.tsv"),
                    "split": splits[split],
                },
            )
            for split in splits.keys()
        ]

    def _generate_examples(
        self, filepath: Union[str, os.PathLike], split: str
    ) -> Generator[Tuple[str, dict], None, None]:
        """Yields examples."""
        with open(filepath, encoding="utf-8") as f:
            for id_, row in enumerate(f):
                if id_ == 0:  # Skip the header row.
                    continue
                cols = row.strip().split("\t")
                yield f"{split}_{id_}", {
                    "tweet_id": cols[0],
                    "tweet_url": cols[1],
                    "tweet_text": cols[2],
                    "class_label": cols[3],
                }
```

Review comment on this file:

> On `_split_generators`: Again, the name sounds misleading to me.
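Because `_generate_examples` above never touches `self`, its TSV parsing can be smoke-tested in isolation. A sketch, where the tweet row and file contents are made up for illustration:

```python
# Smoke-test sketch for the generator above: write a tiny TSV with the
# expected four columns and check that a data row parses as intended.
import csv
import os
import tempfile

rows = [
    ["tweet_id", "tweet_url", "tweet_text", "class_label"],  # header, skipped
    ["42", "https://example.com/42", "Some check-worthy claim.", "Yes"],
]
with tempfile.NamedTemporaryFile(
    "w", suffix=".tsv", delete=False, newline=""
) as f:
    csv.writer(f, delimiter="\t").writerows(rows)
    path = f.name

# _generate_examples does not use `self`, so it can be called unbound.
examples = list(CLEF24TwitterData._generate_examples(None, path, "train"))
assert examples[0] == (
    "train_1",
    {
        "tweet_id": "42",
        "tweet_url": "https://example.com/42",
        "tweet_text": "Some check-worthy claim.",
        "class_label": "Yes",
    },
)
os.remove(path)
```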
Review comment on the pull request:

> It might be helpful to add a one-line explanation of the task and also a link to CheckThat!.