Skip to content

Commit

Permalink
`from_file` and indexation no longer return `None`
Browse files Browse the repository at this point in the history
  • Loading branch information
xoolive committed Oct 31, 2024
1 parent 0bee890 commit d9db024
Show file tree
Hide file tree
Showing 8 changed files with 119 additions and 137 deletions.
45 changes: 13 additions & 32 deletions src/traffic/core/flight.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
Optional,
Set,
Tuple,
Type,
TypedDict,
TypeVar,
Union,
cast,
overload,
Expand All @@ -35,6 +33,7 @@
from impunity import impunity
from pitot import geodesy as geo
from rich.console import Console, ConsoleOptions, RenderResult
from typing_extensions import Self

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -84,30 +83,15 @@ class Entry(TypedDict, total=False):
name: str


T = TypeVar("T", bound="Flight")
def _tz_interpolate(
    data: DatetimeTZBlock, *args: Any, **kwargs: Any
) -> DatetimeTZBlock:
    """Interpolate a tz-aware datetime block through an int64 view.

    Since pandas 1.3, ``Block.interpolate`` returns a list, and Windows
    needs an explicit "int64" target ("int" may resolve to "int32"),
    hence the coercion and the unpacking of the first element.
    """
    # NOTE(review): relies on pandas-internal Block API — confirm on
    # any pandas upgrade.
    as_int64 = data.coerce_to_target_dtype("int64")
    result, *_rest = as_int64.interpolate(*args, **kwargs)
    return result

if str(pd.__version__) < "1.3":

def _tz_interpolate(
data: DatetimeTZBlock, *args: Any, **kwargs: Any
) -> DatetimeTZBlock:
return data.astype(int).interpolate(*args, **kwargs).astype(data.dtype)

DatetimeTZBlock.interpolate = _tz_interpolate

else:
# - with version 1.3.0, interpolate returns a list
# - Windows require "int64" as "int" may be interpreted as "int32" and raise
# an error (was not raised before 1.3.0)

def _tz_interpolate(
data: DatetimeTZBlock, *args: Any, **kwargs: Any
) -> DatetimeTZBlock:
coerced = data.coerce_to_target_dtype("int64")
interpolated, *_ = coerced.interpolate(*args, **kwargs)
return interpolated

DatetimeTZBlock.interpolate = _tz_interpolate
DatetimeTZBlock.interpolate = _tz_interpolate


def _split(
Expand Down Expand Up @@ -813,7 +797,7 @@ def final(
segment = None
for segment in fun(self):
continue
return segment # type: ignore
return segment

# --- Iterators ---

Expand Down Expand Up @@ -1042,6 +1026,9 @@ def _get_unique(
if field not in self.data.columns:
return None
tmp = self.data[field].unique()
tmp = list(elt for elt in tmp if elt == elt)
if len(tmp) == 0:
return None
if len(tmp) == 1:
return tmp[0] # type: ignore
if warn:
Expand All @@ -1057,8 +1044,6 @@ def callsign(self) -> Union[str, Set[str], None]:
with a route for a commercial aircraft.
"""
callsign = self._get_unique("callsign")
if callsign != callsign:
raise ValueError("NaN appearing in callsign field")
return callsign

@property
Expand Down Expand Up @@ -3100,9 +3085,7 @@ def from_fr24(cls, filename: Union[Path, str]) -> Flight:
return FlightRadar24.from_file(filename)

@classmethod
def from_file(
cls: Type[T], filename: Union[Path, str], **kwargs: Any
) -> Optional[T]:
def from_file(cls, filename: Union[Path, str], **kwargs: Any) -> Self:
"""Read data from various formats.
This class method dispatches the loading of data in various format to
Expand All @@ -3126,8 +3109,6 @@ def from_file(
"""

tentative = super().from_file(filename, **kwargs)
if tentative is None:
return None

# Special treatment for flights to download from flightradar24
cols_fr24 = {
Expand Down
91 changes: 41 additions & 50 deletions src/traffic/core/mixins.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# ruff: noqa: E501
from __future__ import annotations

import gzip
import json
import logging
import re
from functools import lru_cache
Expand All @@ -14,14 +12,14 @@
ClassVar,
Mapping,
Sequence,
Type,
TypedDict,
TypeVar,
)

from py7zr import SevenZipFile
from rich.box import SIMPLE_HEAVY
from rich.console import Console, ConsoleOptions, RenderResult
from rich.table import Table
from typing_extensions import Self

import numpy as np
import pandas as pd
Expand All @@ -40,8 +38,8 @@
from matplotlib.artist import Artist


T = TypeVar("T", bound="DataFrameMixin")
G = TypeVar("G", bound="GeoDBMixin")
# T = TypeVar("T", bound="DataFrameMixin")
# G = TypeVar("G", bound="GeoDBMixin")


_log = logging.getLogger(__name__)
Expand Down Expand Up @@ -74,9 +72,7 @@ def __sizeof__(self) -> int:
return int(self.data.memory_usage().sum())

@classmethod
def from_file(
cls: Type[T], filename: str | Path, **kwargs: Any
) -> None | T:
def from_file(cls, filename: str | Path, **kwargs: Any) -> Self:
"""Read data from various formats.
This class method dispatches the loading of data in various format to
Expand All @@ -101,6 +97,16 @@ def from_file(
>>> t = Traffic.from_file(filename)
"""
path = Path(filename)

if path.suffix == (".7z"):
with SevenZipFile(path) as archive:
if (files := archive.readall()) is None:
raise FileNotFoundError(f"Empty archive {path}")
for name, io in files.items():
if name.endswith(".jsonl"):
return cls(pd.read_json(io, lines=True, **kwargs))
raise FileNotFoundError(f"Empty archive {path}")

if ".pkl" in path.suffixes or ".pickle" in path.suffixes:
return cls(pd.read_pickle(path, **kwargs))
if ".parquet" in path.suffixes:
Expand All @@ -109,29 +115,14 @@ def from_file(
return cls(pd.read_feather(path, **kwargs))
if ".json" in path.suffixes:
return cls(pd.read_json(path, **kwargs))
if path.suffix == ".jsonl":
df = pd.json_normalize(
json.loads(elt) for elt in path.read_text().split("\n")[:-1]
)
df = df.assign(
timestamp=pd.to_datetime(df.timestamp, unit="s", utc=True)
)
return cls(df)
if ".jsonl" in path.suffixes and ".gz" in path.suffixes:
with gzip.open(path) as fh:
df = pd.json_normalize(
json.loads(elt) for elt in fh.readlines()
)
df = df.assign(
timestamp=pd.to_datetime(df.timestamp, unit="s", utc=True)
)
return cls(df)

if ".jsonl" in path.suffixes:
return cls(pd.read_json(path, lines=True, **kwargs))
if ".csv" in path.suffixes:
return cls(pd.read_csv(path, **kwargs))
if ".h5" == path.suffixes[-1]: # coverage: ignore
return cls(pd.read_hdf(path, **kwargs))
return None

raise FileNotFoundError(path)

# --- Special methods ---

Expand Down Expand Up @@ -189,39 +180,39 @@ def __rich_console__(

# --- Redirected to pandas.DataFrame ---

def assign(self: T, *args: Any, **kwargs: Any) -> T:
def assign(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.assign` on the underlying
    DataFrame and wrap the result back into the same structure.
    """
    updated = self.data.assign(*args, **kwargs)
    return self.__class__(updated)

def convert_dtypes(self: T, *args: Any, **kwargs: Any) -> T:
def convert_dtypes(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.convert_dtypes` on the
    underlying DataFrame and wrap the result back into the same
    structure.
    """
    converted = self.data.convert_dtypes(*args, **kwargs)
    return self.__class__(converted)

def drop(self: T, *args: Any, **kwargs: Any) -> T:
def drop(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.drop` on the underlying
    DataFrame and wrap the result back into the same structure.
    """
    trimmed = self.data.drop(*args, **kwargs)
    return self.__class__(trimmed)

def drop_duplicates(self: T, *args: Any, **kwargs: Any) -> T:
def drop_duplicates(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.drop_duplicates` on the
    underlying DataFrame and wrap the result back into the same
    structure.
    """
    deduplicated = self.data.drop_duplicates(*args, **kwargs)
    return self.__class__(deduplicated)

def fillna(self: T, *args: Any, **kwargs: Any) -> T:
def fillna(self, *args: Any, **kwargs: Any) -> Self:
"""
Applies the Pandas :meth:`~pandas.DataFrame.fillna` method to the
underlying pandas DataFrame and get the result back in the same
Expand All @@ -238,15 +229,15 @@ def groupby(
"""
return self.data.groupby(*args, **kwargs)

def merge(self: T, *args: Any, **kwargs: Any) -> T:
def merge(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.merge` on the underlying
    DataFrame and wrap the result back into the same structure.
    """
    joined = self.data.merge(*args, **kwargs)
    return self.__class__(joined)

def query(self: T, query_str: str, *args: Any, **kwargs: Any) -> None | T:
def query(self, query_str: str, *args: Any, **kwargs: Any) -> None | Self:
"""
Applies the Pandas :meth:`~pandas.DataFrame.query` method to the
underlying pandas DataFrame and get the result back in the same
Expand All @@ -257,31 +248,31 @@ def query(self: T, query_str: str, *args: Any, **kwargs: Any) -> None | T:
return None
return self.__class__(df)

def rename(self: T, *args: Any, **kwargs: Any) -> T:
def rename(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.rename` on the underlying
    DataFrame and wrap the result back into the same structure.
    """
    relabelled = self.data.rename(*args, **kwargs)
    return self.__class__(relabelled)

def replace(self: T, *args: Any, **kwargs: Any) -> T:
def replace(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.replace` on the underlying
    DataFrame and wrap the result back into the same structure.
    """
    substituted = self.data.replace(*args, **kwargs)
    return self.__class__(substituted)

def reset_index(self: T, *args: Any, **kwargs: Any) -> T:
def reset_index(self, *args: Any, **kwargs: Any) -> Self:
    """Delegate to :meth:`pandas.DataFrame.reset_index` on the
    underlying DataFrame and wrap the result back into the same
    structure.
    """
    reindexed = self.data.reset_index(*args, **kwargs)
    return self.__class__(reindexed)

def sort_values(self: T, by: str | Sequence[str], **kwargs: Any) -> T:
def sort_values(self, by: str | Sequence[str], **kwargs: Any) -> Self:
"""
Applies the Pandas :meth:`~pandas.DataFrame.sort_values` method to the
underlying pandas DataFrame and get the result back in the same
Expand Down Expand Up @@ -516,7 +507,7 @@ class GeographyMixin(DataFrameMixin):

__slots__ = ()

def projection(self: T, proj: str = "lcc") -> pyproj.Proj:
def projection(self, proj: str = "lcc") -> pyproj.Proj:
return pyproj.Proj(
proj=proj,
ellps="WGS84",
Expand All @@ -527,8 +518,8 @@ def projection(self: T, proj: str = "lcc") -> pyproj.Proj:
)

def compute_xy(
self: T, projection: None | pyproj.Proj | "crs.Projection" = None
) -> T:
self, projection: None | pyproj.Proj | "crs.Projection" = None
) -> Self:
"""Enrich the structure with new x and y columns computed through a
projection of the latitude and longitude columns.
Expand All @@ -547,7 +538,7 @@ def compute_xy(
projection = pyproj.Proj(projection.proj4_init)

if projection is None:
projection = self.projection(proj="lcc") # type: ignore
projection = self.projection(proj="lcc")

transformer = pyproj.Transformer.from_proj(
pyproj.Proj("epsg:4326"), projection, always_xy=True
Expand All @@ -560,8 +551,8 @@ def compute_xy(
return self.__class__(self.data.assign(x=x, y=y))

def compute_latlon_from_xy(
self: T, projection: pyproj.Proj | crs.Projection
) -> T:
self, projection: pyproj.Proj | crs.Projection
) -> Self:
"""Enrich a DataFrame with new longitude and latitude columns computed
from x and y columns.
Expand Down Expand Up @@ -675,14 +666,14 @@ def geoencode(self, **kwargs: Any) -> "alt.Chart": # coverage: ignore
)

def interpolate_grib(
self: T, wind: "xarray.Dataset", features: list[str] = ["u", "v"]
) -> T:
self, wind: "xarray.Dataset", features: list[str] = ["u", "v"]
) -> Self:
from openap import aero
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

projection: pyproj.Proj = self.projection("lcc") # type: ignore
projection: pyproj.Proj = self.projection("lcc")
transformer = pyproj.Transformer.from_proj(
pyproj.Proj("epsg:4326"), projection, always_xy=True
)
Expand Down Expand Up @@ -778,10 +769,10 @@ class GeoDBMixin(DataFrameMixin):
__slots__ = ()

def extent(
self: G,
self,
extent: str | ShapelyMixin | tuple[float, float, float, float],
buffer: float = 0.5,
) -> None | G:
) -> None | Self:
"""
Selects the subset of data inside the given extent.
Expand Down
Loading

0 comments on commit d9db024

Please sign in to comment.