Commit: Add the Pharmacies source (#1094)
Showing 23 changed files with 524 additions and 144 deletions.
@@ -0,0 +1,27 @@
from airflow.providers.postgres.hooks.postgres import PostgresHook
from sqlalchemy.engine import Engine


class PostgresConnectionManager:
    """
    Singleton class to manage the connection to the Postgres database.

    Uses the qfdmo_django_db connection by default; that connection is
    configured through the AIRFLOW_CONN_QFDMO_DJANGO_DB environment variable.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, postgres_conn_id="qfdmo_django_db"):
        if not hasattr(self, "initialized"):  # avoid re-initializing the singleton
            self.postgres_conn_id = postgres_conn_id
            self.engine = self._create_engine()
            self.initialized = True

    def _create_engine(self) -> Engine:
        pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        return pg_hook.get_sqlalchemy_engine()
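
A minimal usage sketch (not part of the commit, and assuming an Airflow environment where AIRFLOW_CONN_QFDMO_DJANGO_DB is set; the "some_other_conn" id is illustrative): every instantiation returns the same object, so the SQLAlchemy engine is created exactly once.

from sqlalchemy import text

manager_a = PostgresConnectionManager()
manager_b = PostgresConnectionManager(postgres_conn_id="some_other_conn")

assert manager_a is manager_b  # __new__ always hands back the first instance
assert manager_a.engine is manager_b.engine  # the engine is built only once

with manager_a.engine.connect() as connection:
    print(connection.execute(text("SELECT 1")).scalar())

One consequence of guarding __init__ with the initialized flag is that a postgres_conn_id passed on any later instantiation (like "some_other_conn" above) is silently ignored.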
@@ -0,0 +1,56 @@
from airflow import DAG
from sources.config.airflow_params import get_mapping_config
from sources.tasks.airflow_logic.operators import default_args, eo_task_chain

with DAG(
    dag_id="eo-pharmacies",
    dag_display_name="Source - PHARMACIES",
    default_args=default_args,
    description=("Téléchargement des pharmacies (Ordre National Des Pharmaciens)"),
    params={
        "column_transformations": [
            {
                "origin": "Raison sociale",
                "transformation": "strip_string",
                "destination": "nom",
            },
            {
                "origin": "Dénomination commerciale",
                "transformation": "strip_string",
                "destination": "nom_commercial",
            },
            {
                "origin": "Adresse",
                "transformation": "strip_string",
                "destination": "adresse",
            },
            {
                "origin": "Code postal",
                "transformation": "strip_string",
                "destination": "code_postal",
            },
            {
                "origin": "Commune",
                "transformation": "strip_string",
                "destination": "ville",
            },
        ],
        "column_mapping": {
            "Numéro d'établissement": "identifiant_externe",
            "Téléphone": "telephone",
        },
        "endpoint": "https://www.ordre.pharmacien.fr/download/annuaire_csv.zip",
        "columns_to_add_by_default": {
            "statut": "ACTIF",
            "uniquement_sur_rdv": "non",
            "public_accueilli": "Particuliers",
            "produitsdechets_acceptes": "Médicaments & DASRI",
            "acteur_type_id": "pharmacie",
            "point_de_collecte_ou_de_reprise_des_dechets": True,
        },
        "source_code": "ordredespharmaciens",
        "product_mapping": get_mapping_config(),
    },
    schedule=None,
) as dag:
    eo_task_chain(dag)
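
The DAG only declares configuration; the transformations are applied downstream by the shared eo_task_chain tasks. As a rough illustration of what a column_transformations entry of this shape implies, here is a self-contained sketch; the apply_column_transformations helper and the local TRANSFORMATIONS registry are assumptions made for the example, not the project's actual code.

import pandas as pd

# Illustrative registry: the project resolves "strip_string" elsewhere; this
# local dict stands in for it so the sketch is self-contained.
TRANSFORMATIONS = {
    "strip_string": lambda value: value.strip() if isinstance(value, str) else value,
}


def apply_column_transformations(df: pd.DataFrame, specs: list) -> pd.DataFrame:
    """Rename each origin column to its destination, applying the named transformation."""
    for spec in specs:
        transform = TRANSFORMATIONS[spec["transformation"]]
        df[spec["destination"]] = df[spec["origin"]].map(transform)
        df = df.drop(columns=[spec["origin"]])
    return df


df = pd.DataFrame({"Raison sociale": ["  Pharmacie du Parc  "], "Commune": [" Paris "]})
df = apply_column_transformations(
    df,
    [
        {"origin": "Raison sociale", "transformation": "strip_string", "destination": "nom"},
        {"origin": "Commune", "transformation": "strip_string", "destination": "ville"},
    ],
)
print(df.to_dict("records"))  # [{'nom': 'Pharmacie du Parc', 'ville': 'Paris'}]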
@@ -1,24 +1,104 @@
import logging
import tempfile
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
-from utils import api_utils
+import requests
from utils import logging_utils as log

logger = logging.getLogger(__name__)


-def source_data_download(api_url: str) -> pd.DataFrame:
+def source_data_download(endpoint: str) -> pd.DataFrame:
    """Download the source data without modifying it."""
    logger.info("Téléchargement données de l'API : début...")
    # TODO: change the approach: rather than loading everything into memory
    # and passing dataframes around through XCom, we should stream the data
    # straight into the database and delegate as much of the processing as
    # possible to the DB
-    data = api_utils.fetch_data_from_url(api_url)
+    data = fetch_data_from_endpoint(endpoint)
    logger.info("Téléchargement données de l'API : ✅ succès.")
    df = pd.DataFrame(data).replace({pd.NA: None, np.nan: None})
    if df.empty:
        raise ValueError("Aucune donnée reçue de l'API")
    log.preview("df retournée par la tâche", df)
    return df


def fetch_data_from_endpoint(endpoint):
    if "pointsapport.ademe.fr" in endpoint or "data.ademe.fr" in endpoint:
        return fetch_dataset_from_point_apport(endpoint)
    elif "artisanat.fr" in endpoint:
        return fetch_dataset_from_artisanat(endpoint)
    elif "ordre.pharmacien.fr" in endpoint:
        return fetch_dataset_from_pharmacies(endpoint)
    # The whole point of our API integrations is to retrieve data: if we
    # cannot retrieve any, something is wrong and we must fail explicitly,
    # as early as possible.
    raise NotImplementedError(f"Pas de fonction de récupération pour l'url {endpoint}")


def fetch_dataset_from_point_apport(url):
    all_data = []
    # Follow the paginated API until there is no "next" URL left.
    while url:
        logger.info(f"Récupération de données pour {url}")
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        data = response.json()
        logger.info("Nombre de lignes récupérées: " + str(len(data["results"])))
        all_data.extend(data["results"])
        url = data.get("next", None)
    logger.info("Plus d'URL à parcourir")
    logger.info("Nombre total de lignes récupérées: " + str(len(all_data)))
    return all_data


def fetch_dataset_from_artisanat(base_url):
    all_data = []
    offset = 0
    # A first one-record request is only used to read the total record count.
    total_records = requests.get(
        base_url, params={"limit": 1, "offset": 0}, timeout=60
    ).json()["total_count"]
    records_per_request = 100
    params = {"limit": records_per_request, "offset": 0}
    while offset < total_records:
        params.update({"offset": offset})
        response = requests.get(base_url, params=params, timeout=60)
        response.raise_for_status()
        all_data.extend(response.json()["results"])
        offset += records_per_request

    return all_data


def fetch_dataset_from_pharmacies(endpoint):
    with tempfile.TemporaryDirectory() as temp_dir:
        zip_file = _download_file(endpoint, temp_dir)
        unzip_files = _extract_zip(zip_file, temp_dir)
        etablissements_file = [f for f in unzip_files if "etablissements" in f][0]
        df_etablissements = _read_csv(Path(temp_dir) / etablissements_file)
        return df_etablissements


def _download_file(url, dest_folder="."):
    local_filename = Path(dest_folder) / url.split("/")[-1]
    with requests.get(url, timeout=60) as r:
        r.raise_for_status()
        with open(local_filename, "wb") as f:
            f.write(r.content)
    return local_filename


def _extract_zip(zip_file, dest_folder="."):
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(dest_folder)
        return zip_ref.namelist()


def _read_csv(csv_file):
    df = pd.read_csv(csv_file, sep=";", encoding="utf-16-le", on_bad_lines="warn")
    return df
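
A self-contained sketch, using made-up data, that exercises _extract_zip and _read_csv without touching the network, mirroring what fetch_dataset_from_pharmacies does once the archive is downloaded. It also illustrates why _read_csv passes sep=";" and encoding="utf-16-le": the annuaire CSVs are semicolon-separated and UTF-16-LE encoded.

import tempfile
import zipfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    # Fabricate a tiny archive shaped like the annuaire download.
    csv_path = Path(tmp) / "etablissements.csv"
    csv_path.write_text(
        "Numéro d'établissement;Raison sociale\n1;PHARMACIE DU PARC\n",
        encoding="utf-16-le",
    )
    zip_path = Path(tmp) / "annuaire_csv.zip"
    with zipfile.ZipFile(zip_path, "w") as zf:
        zf.write(csv_path, arcname="etablissements.csv")

    names = _extract_zip(zip_path, tmp)
    csv_name = [n for n in names if "etablissements" in n][0]
    df = _read_csv(Path(tmp) / csv_name)
    print(df.to_dict("records"))  # one row: PHARMACIE DU PARC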