From be97c03a22a4d911384febc55dde26e0a23852c2 Mon Sep 17 00:00:00 2001
From: Nicolas Oudard <nicolas@oudard.org>
Date: Thu, 2 Jan 2025 18:58:09 +0100
Subject: [PATCH] add tests

---
 .../tasks/transform/transform_column.py       |   1 -
 .../business_logic/test_propose_services.py   |   4 -
 .../test_source_data_normalize.py             | 266 +++++++++++++++---
 3 files changed, 228 insertions(+), 43 deletions(-)

diff --git a/dags/sources/tasks/transform/transform_column.py b/dags/sources/tasks/transform/transform_column.py
index 8d4f1cc5f..fd51afedf 100644
--- a/dags/sources/tasks/transform/transform_column.py
+++ b/dags/sources/tasks/transform/transform_column.py
@@ -206,7 +206,6 @@ def clean_souscategorie_codes_sinoe(
         for v in sscat_list
         if v.strip().lower() not in ("", "nan", "np", "none")
     ]
-    print(sscat_list)
     sscat_list = [
         dechet_mapping[v]
         for v in sscat_list
diff --git a/dags_unit_tests/sources/tasks/business_logic/test_propose_services.py b/dags_unit_tests/sources/tasks/business_logic/test_propose_services.py
index 99891f15c..ea9a1f62c 100644
--- a/dags_unit_tests/sources/tasks/business_logic/test_propose_services.py
+++ b/dags_unit_tests/sources/tasks/business_logic/test_propose_services.py
@@ -1,5 +1,3 @@
-# FIXME : ces tests sont à revoir, par très à propos depuis la refacto
-
 import pandas as pd
 import pytest
 from sources.tasks.business_logic.propose_services import propose_services
@@ -133,8 +131,6 @@ def test_create_proposition_services_services(
             actions_id_by_code=actions_id_by_code,
         )
 
-        print(result["df"])
-        print(expected_df)
         assert result["df"].equals(expected_df)
         assert result["metadata"] == expected_metadata
 
diff --git a/dags_unit_tests/sources/tasks/business_logic/test_source_data_normalize.py b/dags_unit_tests/sources/tasks/business_logic/test_source_data_normalize.py
index 233e71716..6922886da 100755
--- a/dags_unit_tests/sources/tasks/business_logic/test_source_data_normalize.py
+++ b/dags_unit_tests/sources/tasks/business_logic/test_source_data_normalize.py
@@ -5,6 +5,7 @@
 from sources.config.airflow_params import TRANSFORMATION_MAPPING
 from sources.tasks.airflow_logic.config_management import DAGConfig
 from sources.tasks.business_logic.source_data_normalize import (
+    _remove_undesired_lines,
     df_normalize_pharmacie,
     df_normalize_sinoe,
     source_data_normalize,
@@ -90,23 +91,33 @@ def test_drop_annee_column(
         assert "ANNEE" not in df.columns
 
 
+NORMALIZATION_RULES = [
+    {  # rename only: "origin" -> "destination", no transformation
+        "origin": "col_to_rename",
+        "destination": "col_renamed",
+    },
+    {  # single column run through the "test_fct" transformation
+        "origin": "nom origin",
+        "transformation": "test_fct",
+        "destination": "nom destination",
+    },
+    {"column": "string_col", "value": "value of col"},  # constant string column
+    {"column": "list_col", "value": ["col1", "col2"]},  # constant list column
+    {  # same transformation, origin/destination given as lists
+        "origin": ["nom"],
+        "transformation": "test_fct",
+        "destination": ["nom"],
+    },
+    {"keep": "identifiant_unique"},  # column kept unchanged
+    {"remove": "col_to_remove"},  # column dropped from the result
+]
+
+
 class TestSourceDataNormalize:
-    # FIXME : Add tests to check all kind of transformation is applied
-    def test_normalization_rules_is_called(self):
+
+    def test_source_data_normalize_normalization_rules_are_called(self):
         dag_config_kwargs = {
-            "normalization_rules": [
-                {
-                    "origin": "nom origin",
-                    "transformation": "test_fct",
-                    "destination": "nom destination",
-                },
-                {
-                    "origin": "nom",
-                    "transformation": "test_fct",
-                    "destination": "nom",
-                },
-                {"keep": "identifiant_unique"},
-            ],
+            "normalization_rules": NORMALIZATION_RULES,
             "product_mapping": {},
             "endpoint": "http://example.com/api",
         }
@@ -115,6 +126,8 @@ def test_normalization_rules_is_called(self):
             df_acteur_from_source=pd.DataFrame(
                 {
                     "identifiant_unique": ["id"],
+                    "col_to_remove": ["fake remove"],
+                    "col_to_rename": ["fake rename"],
                     "nom origin": ["nom origin 1"],
                     "nom": ["nom"],
                 }
@@ -122,12 +135,53 @@ def test_normalization_rules_is_called(self):
             dag_config=DAGConfig.model_validate(dag_config_kwargs),
             dag_id="dag_id",
         )
+
+        assert "col_to_rename" not in df.columns
+        assert "col_renamed" in df.columns
+        assert df["col_renamed"].iloc[0] == "fake rename"
+
         assert "nom destination" in df.columns
         assert df["nom destination"].iloc[0] == "success"
 
+        assert "string_col" in df.columns
+        assert df["string_col"].iloc[0] == "value of col"
+
+        assert "list_col" in df.columns
+        assert df["list_col"].iloc[0] == ["col1", "col2"]
+
         assert "nom" in df.columns
         assert df["nom"].iloc[0] == "success"
 
+        assert "identifiant_unique" in df.columns
+        assert df["identifiant_unique"].iloc[0] == "id"
+
+        assert "col_to_remove" not in df.columns
+
+    def test_source_data_normalize_unhandles_column_raise(self):
+        dag_config_kwargs = {
+            "normalization_rules": NORMALIZATION_RULES,
+            "product_mapping": {},
+            "endpoint": "http://example.com/api",
+        }
+
+        with pytest.raises(ValueError) as erreur:  # extra column must be rejected
+            source_data_normalize(
+                df_acteur_from_source=pd.DataFrame(
+                    {
+                        "identifiant_unique": ["id"],
+                        "col_to_remove": ["fake remove"],
+                        "col_to_rename": ["fake rename"],
+                        "nom origin": ["nom origin 1"],
+                        "nom": ["nom"],
+                        "col_make_it_raise": ["fake"],  # not covered by any rule
+                    }
+                ),
+                dag_config=DAGConfig.model_validate(dag_config_kwargs),
+                dag_id="dag_id",
+            )
+        assert "Le dataframe n'a pas les colonnes attendues" in str(erreur.value)
+        assert "col_make_it_raise" in str(erreur.value)  # offending column is named
+
     # @pytest.mark.parametrize(
     #     "statut, statut_expected",
     #     [
@@ -223,27 +277,163 @@ def _enrich_from_ban_api(row):
         )
 
 
-class TestRemoveUndesired_lines:
-    # FIXME : Add tests
-    pass
-
-    # # "service_a_domicile"
-    # def test_service_a_domicile(
-    #     self,
-    #     df_empty_acteurs_from_db,
-    # ):
-
-    #     result = propose_acteur_changes(
-    #         df=pd.DataFrame(
-    #             {
-    #                 "identifiant_unique": ["1", "2"],
-    #                 "service_a_domicile": ["Oui exclusivement", "Non"],
-    #             }
-    #         ),
-    #         df_acteurs=df_empty_acteurs_from_db,
-    #     )
-    #     result_df = result["df"]
+class TestRemoveUndesiredLines:
+    @pytest.mark.parametrize(
+        "df, expected_df",
+        [
+            # Case: rows served exclusively at home are removed
+            (
+                pd.DataFrame(
+                    {
+                        "identifiant_unique": ["id1", "id2", "id3"],
+                        "service_a_domicile": [
+                            "non",
+                            "oui exclusivement",
+                            "service à domicile uniquement",
+                        ],
+                        "public_accueilli": [
+                            "Particuliers",
+                            "Particuliers",
+                            "Particuliers",
+                        ],
+                        "souscategorie_codes": [["code1"], ["code2"], ["code3"]],
+                    }
+                ),
+                pd.DataFrame(
+                    {
+                        "identifiant_unique": ["id1"],
+                        "service_a_domicile": ["non"],
+                        "public_accueilli": ["Particuliers"],
+                        "souscategorie_codes": [["code1"]],
+                    }
+                ),
+            ),
+            # Case: rows open to professionals only are removed
+            (
+                pd.DataFrame(
+                    {
+                        "identifiant_unique": ["id1", "id2", "id3", "id4"],
+                        "service_a_domicile": ["non", "non", "non", "oui"],
+                        "public_accueilli": [
+                            "Particuliers",
+                            "Particuliers et professionnels",
+                            "Professionnels",
+                            "Aucun",
+                        ],
+                        "souscategorie_codes": [
+                            ["code1"],
+                            ["code2"],
+                            ["code3"],
+                            ["code4"],
+                        ],
+                    }
+                ),
+                pd.DataFrame(
+                    {
+                        "identifiant_unique": ["id1", "id2", "id4"],
+                        "service_a_domicile": ["non", "non", "oui"],
+                        "public_accueilli": [
+                            "Particuliers",
+                            "Particuliers et professionnels",
+                            "Aucun",
+                        ],
+                        "souscategorie_codes": [["code1"], ["code2"], ["code4"]],
+                    }
+                ),
+            ),
+            # Case: rows with no accepted products (empty codes) are removed
+            (
+                pd.DataFrame(
+                    {
+                        "identifiant_unique": ["id1", "id2", "id3"],
+                        "service_a_domicile": ["non", "non", "non"],
+                        "public_accueilli": [
+                            "Particuliers",
+                            "Particuliers",
+                            "Particuliers",
+                        ],
+                        "souscategorie_codes": [["code1"], [], ["code3"]],
+                    }
+                ),
+                pd.DataFrame(
+                    {
+                        "identifiant_unique": ["id1", "id3"],
+                        "service_a_domicile": ["non", "non"],
+                        "public_accueilli": ["Particuliers", "Particuliers"],
+                        "souscategorie_codes": [["code1"], ["code3"]],
+                    }
+                ),
+            ),
+        ],
+    )
+    def test_remove_undesired_lines_suppressions(self, df, expected_df, dag_config):
+        # dag_config is supplied by a pytest fixture, nothing is mocked here
+
+        result_df = _remove_undesired_lines(df, dag_config)
+        pd.testing.assert_frame_equal(
+            result_df.reset_index(drop=True), expected_df.reset_index(drop=True)
+        )
+
+    def test_merge_duplicated_acteurs(self, dag_config):
+        dag_config.merge_duplicated_acteurs = True  # rows sharing an id get merged
+        result = _remove_undesired_lines(
+            pd.DataFrame(
+                {
+                    "identifiant_unique": ["id1", "id1", "id2"],
+                    "service_a_domicile": ["non", "non", "non"],
+                    "public_accueilli": [
+                        "Particuliers",
+                        "Particuliers",
+                        "Particuliers",
+                    ],
+                    "souscategorie_codes": [["code1"], ["code2"], ["code3"]],
+                }
+            ),
+            dag_config,
+        )
+        result = result.sort_values("identifiant_unique")
+        result = result.reset_index(drop=True)
+
+        expected_df = pd.DataFrame(
+            {
+                "identifiant_unique": ["id1", "id2"],
+                "service_a_domicile": ["non", "non"],
+                "public_accueilli": ["Particuliers", "Particuliers"],
+                "souscategorie_codes": [["code1", "code2"], ["code3"]],
+            }
+        )
+        expected_df = expected_df.reset_index(drop=True)  # returns a new frame
+
+        pd.testing.assert_frame_equal(result, expected_df)
+
+    def test_ignore_duplicates(self, dag_config):
+        dag_config.ignore_duplicates = True  # a single row per id is kept
+        result = _remove_undesired_lines(
+            pd.DataFrame(
+                {
+                    "identifiant_unique": ["id1", "id1", "id2"],
+                    "service_a_domicile": ["non", "non", "non"],
+                    "public_accueilli": [
+                        "Particuliers",
+                        "Particuliers",
+                        "Particuliers",
+                    ],
+                    "souscategorie_codes": [["code1"], ["code2"], ["code3"]],
+                }
+            ),
+            dag_config,
+        )
+        result = result.sort_values("identifiant_unique")
+        result = result.reset_index(drop=True)
+
+        expected_df = pd.DataFrame(
+            {
+                "identifiant_unique": ["id1", "id2"],
+                "service_a_domicile": ["non", "non"],
+                "public_accueilli": ["Particuliers", "Particuliers"],
+                "souscategorie_codes": [["code1"], ["code3"]],
+            }
+        )
+        expected_df = expected_df.reset_index(drop=True)  # returns a new frame

-    #     assert len(result_df) == 1
-    #     assert result_df["service_a_domicile"].iloc[0] == "Non"
-    #     assert result_df["identifiant_unique"].iloc[0] == "2"
+        pd.testing.assert_frame_equal(result, expected_df)