Commit
First version of the raw static IRVE aggregate (#4397)
Co-authored-by: Frédéric Menou <[email protected]>
thbar and ptitfred authored Jan 30, 2025
1 parent 51e89df commit 876d36e
Showing 7 changed files with 382 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ erl_crash.dump

# Data munging
cache-dir
data-tmp

# Since we are building assets from client,
# we ignore priv/static.
112 changes: 106 additions & 6 deletions apps/transport/lib/irve/data_frame.ex
@@ -2,24 +2,46 @@ defmodule Transport.IRVE.DataFrame do
@moduledoc """
Tooling supporting the parsing of an IRVE static file into `Explorer.DataFrame`
"""
require Explorer.DataFrame

@doc """
Helper function to convert TableSchema types into DataFrame ones.
There is no attempt to make this generic at this point; it focuses solely
on the static IRVE use case.
In strict mode (the default), the types are remapped as follows:
iex> Transport.IRVE.DataFrame.remap_schema_type(:geopoint)
:string
iex> Transport.IRVE.DataFrame.remap_schema_type(:number)
- {:u, 16}
+ {:f, 32}
iex> Transport.IRVE.DataFrame.remap_schema_type(:boolean)
:boolean
iex> Transport.IRVE.DataFrame.remap_schema_type(:literally_anything)
:literally_anything
In non-strict mode (used by the current prototype), we read some types as `:string`
in order to apply clean-up before casting to the actual target type manually:
iex> Transport.IRVE.DataFrame.remap_schema_type(:boolean, _strict = false)
:string
iex> Transport.IRVE.DataFrame.remap_schema_type(:literally_anything, _strict = false)
:literally_anything
"""
- def remap_schema_type(input_type) do
+ def remap_schema_type(input_type, strict \\ true)

def remap_schema_type(input_type, true) do
case input_type do
:geopoint -> :string
- :number -> {:u, 16}
+ :number -> {:f, 32}
type -> type
end
end

def remap_schema_type(input_type, false) do
case remap_schema_type(input_type, true) do
:boolean -> :string
type -> type
end
end
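Not part of the diff — a minimal sketch of what the two modes yield for a few TableSchema types, following directly from the clauses and doctests above:

```elixir
# Illustrative only; expected results are derived from remap_schema_type/2 as shown above.
[:geopoint, :number, :boolean]
|> Enum.map(fn type ->
  {type,
   Transport.IRVE.DataFrame.remap_schema_type(type),
   Transport.IRVE.DataFrame.remap_schema_type(type, false)}
end)
# => [
#      {:geopoint, :string, :string},
#      {:number, {:f, 32}, {:f, 32}},
#      {:boolean, :boolean, :string}
#    ]
```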
@@ -81,18 +103,96 @@ defmodule Transport.IRVE.DataFrame do
Congratulations for reading this far.
"""
- def dataframe_from_csv_body!(body, schema \\ Transport.IRVE.StaticIRVESchema.schema_content()) do
+ def dataframe_from_csv_body!(body, schema \\ Transport.IRVE.StaticIRVESchema.schema_content(), strict \\ true) do
dtypes =
schema
|> Map.fetch!("fields")
|> Enum.map(fn %{"name" => name, "type" => type} ->
{
String.to_atom(name),
String.to_atom(type)
- |> Transport.IRVE.DataFrame.remap_schema_type()
+ |> Transport.IRVE.DataFrame.remap_schema_type(strict)
}
end)

- Explorer.DataFrame.load_csv!(body, dtypes: dtypes)
+ # to be tested - do not call `load_csv!` as it will `inspect` the error
+ case Explorer.DataFrame.load_csv(body, dtypes: dtypes) do
+   {:ok, df} -> df
+   {:error, error} -> raise(error)
+ end
end
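Not part of the diff — a hedged usage sketch of `dataframe_from_csv_body!/3` with a deliberately reduced, hypothetical schema map (the field names, values, and expected dtypes below are illustrative; in real use the full static IRVE schema is passed by default):

```elixir
# Hedged sketch: a tiny hand-written schema so the dtype remapping can be seen in isolation.
schema = %{
  "fields" => [
    %{"name" => "nbre_pdc", "type" => "number"},
    %{"name" => "cable_t2_attachee", "type" => "boolean"}
  ]
}

body = """
nbre_pdc,cable_t2_attachee
2,TRUE
"""

# In non-strict mode the boolean column is read as :string, leaving variants such
# as "TRUE"/"False"/"0" to be normalised later (see preprocess_boolean/2 below).
df = Transport.IRVE.DataFrame.dataframe_from_csv_body!(body, schema, false)

Explorer.DataFrame.dtypes(df)
# => %{"nbre_pdc" => {:f, 32}, "cable_t2_attachee" => :string}
```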

@doc """
iex> Explorer.DataFrame.new([%{coordonneesXY: "[47.39,0.80]"}]) |> Transport.IRVE.DataFrame.preprocess_data()
#Explorer.DataFrame<
Polars[1 x 2]
x f64 [47.39]
y f64 [0.8]
>
We must also support cases where there are extra spaces.
iex> Explorer.DataFrame.new([%{coordonneesXY: "[43.958037, 4.764347]"}]) |> Transport.IRVE.DataFrame.preprocess_data()
#Explorer.DataFrame<
Polars[1 x 2]
x f64 [43.958037]
y f64 [4.764347]
>
But wait, there is more. Leading and trailing spaces can also occur.
iex> Explorer.DataFrame.new([%{coordonneesXY: " [6.128405 , 48.658737] "}]) |> Transport.IRVE.DataFrame.preprocess_data()
#Explorer.DataFrame<
Polars[1 x 2]
x f64 [6.128405]
y f64 [48.658737]
>
"""
def preprocess_data(df) do
df
|> Explorer.DataFrame.mutate(coordonneesXY: coordonneesXY |> strip("[] "))
|> Explorer.DataFrame.mutate_with(fn df ->
%{
coords: Explorer.Series.split_into(df[:coordonneesXY], ",", [:x, :y])
}
end)
|> Explorer.DataFrame.unnest(:coords)
# required or we'll get `nil` values
|> Explorer.DataFrame.mutate(x: x |> strip(" "), y: y |> strip(" "))
|> Explorer.DataFrame.mutate_with(fn df ->
[
x: Explorer.Series.cast(df[:x], {:f, 64}),
y: Explorer.Series.cast(df[:y], {:f, 64})
]
end)
|> Explorer.DataFrame.discard(:coordonneesXY)
end

# just what we've needed so far
@boolean_mappings %{
nil => nil,
"" => nil,
"0" => false,
"1" => true,
"TRUE" => true,
"FALSE" => false,
"false" => false,
"true" => true,
"False" => false,
"True" => true
}

# experimental, I think Explorer lacks a feature to allow this operation within Polars.
# For now, using `transform`, which is a costly operation comparatively
# https://hexdocs.pm/explorer/Explorer.DataFrame.html#transform/3
def preprocess_boolean(df, field_name) do
df
|> Explorer.DataFrame.transform([names: [field_name]], fn row ->
%{
(field_name <> "_remapped") => Map.fetch!(@boolean_mappings, row[field_name])
}
end)
|> Explorer.DataFrame.discard(field_name)
|> Explorer.DataFrame.rename(%{(field_name <> "_remapped") => field_name})
end
end
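Not part of the diff — a hedged usage sketch for `preprocess_boolean/2`, normalising the boolean variants listed in `@boolean_mappings` (column name and values are illustrative):

```elixir
# Illustrative only: a string-typed column remapped to an actual boolean series
# via the transform/3 approach described in the comments above.
df =
  Explorer.DataFrame.new([
    %{cable_t2_attachee: "TRUE"},
    %{cable_t2_attachee: "0"},
    %{cable_t2_attachee: ""}
  ])

df
|> Transport.IRVE.DataFrame.preprocess_boolean("cable_t2_attachee")
# the resulting "cable_t2_attachee" column should then hold [true, false, nil]
```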
5 changes: 4 additions & 1 deletion apps/transport/lib/irve/extractor.ex
@@ -16,7 +16,7 @@ defmodule Transport.IRVE.Extractor do
The code fetches datasets, then unpacks the resources belonging to each dataset.
"""
- def resources(pagination_options \\ []) do
+ def datagouv_resources(pagination_options \\ []) do
@static_irve_datagouv_url
|> Transport.IRVE.Fetcher.pages(pagination_options)
|> Task.async_stream(&process_data_gouv_page/1, on_timeout: :kill_task, max_concurrency: 10)
@@ -53,6 +53,8 @@ defmodule Transport.IRVE.Extractor do
x
|> Map.put(:dataset_id, fetch_in!(dataset, ["id"]))
|> Map.put(:dataset_title, fetch_in!(dataset, ["title"]))
# a dataset organisation can be nil (in which case an "owner" will be there)
|> Map.put(:dataset_organisation_id, get_in(dataset, ["organization", "id"]) || "???")
|> Map.put(:dataset_organisation_name, get_in(dataset, ["organization", "name"]) || "???")
|> Map.put(:dataset_organisation_url, get_in(dataset, ["organization", "page"]) || "???")
end)
@@ -69,6 +71,7 @@
resource_title: fetch_in!(resource, ["title"]),
dataset_id: fetch_in!(resource, [:dataset_id]),
dataset_title: fetch_in!(resource, [:dataset_title]),
dataset_organisation_id: fetch_in!(resource, [:dataset_organisation_id]),
dataset_organisation_name: fetch_in!(resource, [:dataset_organisation_name]),
dataset_organisation_url: fetch_in!(resource, [:dataset_organisation_url]),
valid: get_in(resource, ["extras", "validation-report:valid_resource"]),
2 changes: 1 addition & 1 deletion apps/transport/lib/jobs/analyze_irve_job.ex
@@ -31,7 +31,7 @@ defmodule Transport.Jobs.AnalyzeIRVEJob do
try do
Logger.info("IRVE: starting global analyse...")
send(job_pid, {:progress, 0})
- resources = Transport.IRVE.Extractor.resources() |> Enum.into([])
+ resources = Transport.IRVE.Extractor.datagouv_resources() |> Enum.into([])

count = resources |> length()
Logger.info("IRVE: processing #{count} resources...")
11 changes: 10 additions & 1 deletion apps/transport/test/transport/irve/irve_extractor_test.exs
@@ -17,13 +17,21 @@ defmodule Transport.IRVE.ExtractorTest do
}
end

@doc """
Build a typical data.gouv.fr API (list datasets) response.
If you need to verify or modify the payload, see examples at:
- https://www.data.gouv.fr/api/1/datasets/?page=1&page_size=20&schema=etalab%2Fschema-irve-statique
- https://doc.data.gouv.fr/api/reference/#/datasets/list_datasets
"""
def build_page_payload do
%{
"data" => [
%{
"id" => "the-dataset-id",
"title" => "the-dataset-title",
"organization" => %{
"id" => "the-org-id",
"name" => "the-org",
"page" => "http://the-org"
},
@@ -71,10 +79,11 @@
}
end)

- assert Transport.IRVE.Extractor.resources(page_size: 2) == [
+ assert Transport.IRVE.Extractor.datagouv_resources(page_size: 2) == [
%{
dataset_id: "the-dataset-id",
dataset_title: "the-dataset-title",
dataset_organisation_id: "the-org-id",
dataset_organisation_name: "the-org",
dataset_organisation_url: "http://the-org",
resource_id: "the-resource-id",