Commit
First version of the raw static IRVE aggregate (#4397)
Co-authored-by: Frédéric Menou <[email protected]>
thbar and ptitfred authored Jan 30, 2025
1 parent 51e89df commit 876d36e
Showing 7 changed files with 382 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ erl_crash.dump

# Data munging
cache-dir
data-tmp

# Since we are building assets from client,
# we ignore priv/static.
112 changes: 106 additions & 6 deletions apps/transport/lib/irve/data_frame.ex
@@ -2,24 +2,46 @@ defmodule Transport.IRVE.DataFrame do
@moduledoc """
Tooling supporting the parsing of an IRVE static file into `Explorer.DataFrame`
"""
require Explorer.DataFrame

@doc """
Helper function to convert TableSchema types into DataFrame ones.
There is no attempt to make this generic at this point; it focuses solely
on the static IRVE use case.
In strict mode (the default), the types are remapped as follows:
iex> Transport.IRVE.DataFrame.remap_schema_type(:geopoint)
:string
iex> Transport.IRVE.DataFrame.remap_schema_type(:number)
- {:u, 16}
+ {:f, 32}
iex> Transport.IRVE.DataFrame.remap_schema_type(:boolean)
:boolean
iex> Transport.IRVE.DataFrame.remap_schema_type(:literally_anything)
:literally_anything
In non-strict mode (used by the current prototype), we read some types as `:string`
in order to apply clean-up before casting to the actual target type manually:
iex> Transport.IRVE.DataFrame.remap_schema_type(:boolean, _strict = false)
:string
iex> Transport.IRVE.DataFrame.remap_schema_type(:literally_anything, _strict = false)
:literally_anything
"""
- def remap_schema_type(input_type) do
+ def remap_schema_type(input_type, strict \\ true)

def remap_schema_type(input_type, true) do
case input_type do
:geopoint -> :string
- :number -> {:u, 16}
+ :number -> {:f, 32}
type -> type
end
end

def remap_schema_type(input_type, false) do
case remap_schema_type(input_type, true) do
:boolean -> :string
type -> type
end
end
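Not part of the diff — a minimal sketch of what the two modes yield for a few TableSchema types, following directly from the clauses and doctests above:

```elixir
# Illustrative only; expected results are derived from remap_schema_type/2 as shown above.
[:geopoint, :number, :boolean]
|> Enum.map(fn type ->
  {type,
   Transport.IRVE.DataFrame.remap_schema_type(type),
   Transport.IRVE.DataFrame.remap_schema_type(type, false)}
end)
# => [
#      {:geopoint, :string, :string},
#      {:number, {:f, 32}, {:f, 32}},
#      {:boolean, :boolean, :string}
#    ]
```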
@@ -81,18 +103,96 @@ defmodule Transport.IRVE.DataFrame do
Congratulations for reading this far.
"""
- def dataframe_from_csv_body!(body, schema \\ Transport.IRVE.StaticIRVESchema.schema_content()) do
+ def dataframe_from_csv_body!(body, schema \\ Transport.IRVE.StaticIRVESchema.schema_content(), strict \\ true) do
dtypes =
schema
|> Map.fetch!("fields")
|> Enum.map(fn %{"name" => name, "type" => type} ->
{
String.to_atom(name),
String.to_atom(type)
- |> Transport.IRVE.DataFrame.remap_schema_type()
+ |> Transport.IRVE.DataFrame.remap_schema_type(strict)
}
end)

- Explorer.DataFrame.load_csv!(body, dtypes: dtypes)
+ # to be tested - do not call `load_csv!` as it will `inspect` the error
+ case Explorer.DataFrame.load_csv(body, dtypes: dtypes) do
+   {:ok, df} -> df
+   {:error, error} -> raise(error)
+ end
end
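Not part of the diff — a hedged usage sketch of `dataframe_from_csv_body!/3` with a deliberately reduced, hypothetical schema map (the field names, values, and expected dtypes below are illustrative; in real use the full static IRVE schema is passed by default):

```elixir
# Hedged sketch: a tiny hand-written schema so the dtype remapping can be seen in isolation.
schema = %{
  "fields" => [
    %{"name" => "nbre_pdc", "type" => "number"},
    %{"name" => "cable_t2_attachee", "type" => "boolean"}
  ]
}

body = """
nbre_pdc,cable_t2_attachee
2,TRUE
"""

# In non-strict mode the boolean column is read as :string, leaving variants such
# as "TRUE"/"False"/"0" to be normalised later (see preprocess_boolean/2 below).
df = Transport.IRVE.DataFrame.dataframe_from_csv_body!(body, schema, false)

Explorer.DataFrame.dtypes(df)
# => %{"nbre_pdc" => {:f, 32}, "cable_t2_attachee" => :string}
```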

@doc """
iex> Explorer.DataFrame.new([%{coordonneesXY: "[47.39,0.80]"}]) |> Transport.IRVE.DataFrame.preprocess_data()
#Explorer.DataFrame<
Polars[1 x 2]
x f64 [47.39]
y f64 [0.8]
>
We must also support cases where there are extra spaces.
iex> Explorer.DataFrame.new([%{coordonneesXY: "[43.958037, 4.764347]"}]) |> Transport.IRVE.DataFrame.preprocess_data()
#Explorer.DataFrame<
Polars[1 x 2]
x f64 [43.958037]
y f64 [4.764347]
>
But wait, there is more. Leading and trailing spaces can also occur.
iex> Explorer.DataFrame.new([%{coordonneesXY: " [6.128405 , 48.658737] "}]) |> Transport.IRVE.DataFrame.preprocess_data()
#Explorer.DataFrame<
Polars[1 x 2]
x f64 [6.128405]
y f64 [48.658737]
>
"""
def preprocess_data(df) do
df
|> Explorer.DataFrame.mutate(coordonneesXY: coordonneesXY |> strip("[] "))
|> Explorer.DataFrame.mutate_with(fn df ->
%{
coords: Explorer.Series.split_into(df[:coordonneesXY], ",", [:x, :y])
}
end)
|> Explorer.DataFrame.unnest(:coords)
# required or we'll get `nil` values
|> Explorer.DataFrame.mutate(x: x |> strip(" "), y: y |> strip(" "))
|> Explorer.DataFrame.mutate_with(fn df ->
[
x: Explorer.Series.cast(df[:x], {:f, 64}),
y: Explorer.Series.cast(df[:y], {:f, 64})
]
end)
|> Explorer.DataFrame.discard(:coordonneesXY)
end

# just what we've needed so far
@boolean_mappings %{
nil => nil,
"" => nil,
"0" => false,
"1" => true,
"TRUE" => true,
"FALSE" => false,
"false" => false,
"true" => true,
"False" => false,
"True" => true
}

# experimental, I think Explorer lacks a feature to allow this operation within Polars.
# For now, using `transform`, which is a costly operation comparatively
# https://hexdocs.pm/explorer/Explorer.DataFrame.html#transform/3
def preprocess_boolean(df, field_name) do
df
|> Explorer.DataFrame.transform([names: [field_name]], fn row ->
%{
(field_name <> "_remapped") => Map.fetch!(@boolean_mappings, row[field_name])
}
end)
|> Explorer.DataFrame.discard(field_name)
|> Explorer.DataFrame.rename(%{(field_name <> "_remapped") => field_name})
end
end
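Not part of the diff — a hedged usage sketch for `preprocess_boolean/2`, normalising the boolean variants listed in `@boolean_mappings` (column name and values are illustrative):

```elixir
# Illustrative only: a string-typed column remapped to an actual boolean series
# via the transform/3 approach described in the comments above.
df =
  Explorer.DataFrame.new([
    %{cable_t2_attachee: "TRUE"},
    %{cable_t2_attachee: "0"},
    %{cable_t2_attachee: ""}
  ])

df
|> Transport.IRVE.DataFrame.preprocess_boolean("cable_t2_attachee")
# the resulting "cable_t2_attachee" column should then hold [true, false, nil]
```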
5 changes: 4 additions & 1 deletion apps/transport/lib/irve/extractor.ex
@@ -16,7 +16,7 @@ defmodule Transport.IRVE.Extractor do
The code fetches datasets, then unpacks the resources belonging to each dataset.
"""
- def resources(pagination_options \\ []) do
+ def datagouv_resources(pagination_options \\ []) do
@static_irve_datagouv_url
|> Transport.IRVE.Fetcher.pages(pagination_options)
|> Task.async_stream(&process_data_gouv_page/1, on_timeout: :kill_task, max_concurrency: 10)
@@ -53,6 +53,8 @@ defmodule Transport.IRVE.Extractor do
x
|> Map.put(:dataset_id, fetch_in!(dataset, ["id"]))
|> Map.put(:dataset_title, fetch_in!(dataset, ["title"]))
# a dataset organisation can be nil (in which case an "owner" will be there)
|> Map.put(:dataset_organisation_id, get_in(dataset, ["organization", "id"]) || "???")
|> Map.put(:dataset_organisation_name, get_in(dataset, ["organization", "name"]) || "???")
|> Map.put(:dataset_organisation_url, get_in(dataset, ["organization", "page"]) || "???")
end)
@@ -69,6 +71,7 @@
resource_title: fetch_in!(resource, ["title"]),
dataset_id: fetch_in!(resource, [:dataset_id]),
dataset_title: fetch_in!(resource, [:dataset_title]),
dataset_organisation_id: fetch_in!(resource, [:dataset_organisation_id]),
dataset_organisation_name: fetch_in!(resource, [:dataset_organisation_name]),
dataset_organisation_url: fetch_in!(resource, [:dataset_organisation_url]),
valid: get_in(resource, ["extras", "validation-report:valid_resource"]),
2 changes: 1 addition & 1 deletion apps/transport/lib/jobs/analyze_irve_job.ex
@@ -31,7 +31,7 @@ defmodule Transport.Jobs.AnalyzeIRVEJob do
try do
Logger.info("IRVE: starting global analyse...")
send(job_pid, {:progress, 0})
- resources = Transport.IRVE.Extractor.resources() |> Enum.into([])
+ resources = Transport.IRVE.Extractor.datagouv_resources() |> Enum.into([])

count = resources |> length()
Logger.info("IRVE: processing #{count} resources...")
11 changes: 10 additions & 1 deletion apps/transport/test/transport/irve/irve_extractor_test.exs
@@ -17,13 +17,21 @@ defmodule Transport.IRVE.ExtractorTest do
}
end

@doc """
Build a typical data.gouv.fr API (list datasets) response.
If you need to verify or modify the payload, see examples at:
- https://www.data.gouv.fr/api/1/datasets/?page=1&page_size=20&schema=etalab%2Fschema-irve-statique
- https://doc.data.gouv.fr/api/reference/#/datasets/list_datasets
"""
def build_page_payload do
%{
"data" => [
%{
"id" => "the-dataset-id",
"title" => "the-dataset-title",
"organization" => %{
"id" => "the-org-id",
"name" => "the-org",
"page" => "http://the-org"
},
@@ -71,10 +79,11 @@
}
end)

- assert Transport.IRVE.Extractor.resources(page_size: 2) == [
+ assert Transport.IRVE.Extractor.datagouv_resources(page_size: 2) == [
%{
dataset_id: "the-dataset-id",
dataset_title: "the-dataset-title",
dataset_organisation_id: "the-org-id",
dataset_organisation_name: "the-org",
dataset_organisation_url: "http://the-org",
resource_id: "the-resource-id",