Python 3.11 support (#26)

* py311 support
* update py 3.10 version
openAGI · Mar 22, 2024 · 910e4d3 · 910e4d3
1 parent 1a0f6c8
commit 910e4d3
Show file tree

Hide file tree

Showing 27 changed files with 132 additions and 142 deletions.
diff --git a/.github/workflows/datum_py38.yml b/.github/workflows/datum_py310.yml
@@ -1,4 +1,4 @@
-name: datum_py38
+name: datum_py310
 
 on: [push]
 
@@ -9,10 +9,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v1
-    - name: Set up Python 3.8
+    - name: Set up Python 3.10
       uses: actions/setup-python@v1
       with:
-        python-version: 3.8
+        python-version: 3.10.13
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

diff --git a/.github/workflows/datum_py37.yml b/.github/workflows/datum_py311.yml
@@ -1,4 +1,4 @@
-name: datum_py37
+name: datum_py311
 
 on: [push]
 
@@ -9,10 +9,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v1
-    - name: Set up Python 3.7
+    - name: Set up Python 3.11
       uses: actions/setup-python@v1
       with:
-        python-version: 3.7
+        python-version: 3.11
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

diff --git a/configs/image_clf_configs.py b/configs/image_clf_configs.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/configs/image_det_configs.py b/configs/image_det_configs.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/configs/image_seg_configs.py b/configs/image_seg_configs.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/configs/text_json_configs.py b/configs/text_json_configs.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/datum/cache/bucket.py b/datum/cache/bucket.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 # Copyright 2020 The TensorFlow Datasets Authors.
 #
@@ -21,9 +20,9 @@
 import os
 import struct
 import uuid
-from typing import Generator, List, Optional, Tuple
+from collections.abc import Generator
+from typing import Optional
 
-import six
 import tensorflow.compat.v2 as tf
 
 from datum.utils.hashing import Hasher
@@ -51,7 +50,7 @@
 class DuplicatedKeysError(Exception):
 
   def __init__(self, item1: Optional[bytes] = None, item2: Optional[bytes] = None):
-    super(DuplicatedKeysError, self).__init__()
+    super().__init__()
     self.item1 = item1
     self.item2 = item2
 
@@ -75,7 +74,7 @@ def get_bucket_number(hkey: int, shards_number: int) -> int:
   return math.trunc((hkey * shards_number) >> HKEY_SIZE)
 
 
-class _Bucket(object):
+class _Bucket:
   """Holds (key, binary value) tuples to disk, fast.
 
   Bucket instances are designed to be used either:
@@ -142,7 +141,7 @@ def flush(self) -> None:
       self._fobj.flush()
       self._fobj.close()
 
-  def read_values(self) -> Generator[Tuple[int, bytes], None, None]:
+  def read_values(self) -> Generator[tuple[int, bytes], None, None]:
     """Yields (hkey, data) tuples stored in bucket."""
     self.flush()
     path = self._path
@@ -167,7 +166,7 @@ def del_file(self) -> None:
       tf.io.gfile.remove(self._path)
 
 
-class Shuffler(object):
+class Shuffler:
   """Stores data in temp buckets, restitute it shuffled."""
 
   def __init__(self, dirpath: str, hash_salt: str):
@@ -179,23 +178,23 @@ def __init__(self, dirpath: str, hash_salt: str):
     """
     grp_name = uuid.uuid4()
     self._hasher = Hasher(hash_salt)
-    self._buckets: List[_Bucket] = []
+    self._buckets: list[_Bucket] = []
     for i in range(BUCKETS_NUMBER):
       path = os.path.join(dirpath, 'bucket_%s_%03d.tmp' % (grp_name, i))
       self._buckets.append(_Bucket(path))
     self._read_only = False
     self._total_bytes = 0
     # To keep data in memory until enough data has been gathered.
     self._in_memory = True
-    self._mem_buffer: List[Tuple[int, bytes]] = []
+    self._mem_buffer: list[tuple[int, bytes]] = []
 
   @property
   def size(self) -> int:
     """Return total size in bytes of records (not keys)."""
     return self._total_bytes
 
   @property
-  def bucket_lengths(self) -> List[int]:
+  def bucket_lengths(self) -> list[int]:
     if self._in_memory:
       return [len(self._mem_buffer)]
     return [len(b) for b in self._buckets]
@@ -216,7 +215,7 @@ def add(self, key: int, data: bytes) -> None:
     """Add (key, data) to shuffler."""
     if self._read_only:
       raise AssertionError('add() cannot be called after __iter__.')
-    if not isinstance(data, six.binary_type):
+    if not isinstance(data, bytes):
       raise AssertionError('Only bytes (not %s) can be stored in Shuffler!' % (type(data)))
     hkey = self._hasher.hash_key(key)
     self._total_bytes += len(data)
@@ -237,13 +236,11 @@ def __iter__(self) -> Generator[bytes, None, None]:
       yield data
       previous_data = data
 
-  def _iter_mem(self) -> Generator[Tuple[int, bytes], None, None]:
-    for hkey, data in sorted(self._mem_buffer):
-      yield hkey, data
+  def _iter_mem(self) -> Generator[tuple[int, bytes], None, None]:
+    yield from sorted(self._mem_buffer)
 
-  def _iter_buckets(self) -> Generator[Tuple[int, bytes], None, None]:
+  def _iter_buckets(self) -> Generator[tuple[int, bytes], None, None]:
     for bucket in self._buckets:
       bucket_data = sorted(bucket.read_values())
       bucket.del_file()
-      for hkey, data in bucket_data:
-        yield hkey, data
+      yield from bucket_data
diff --git a/datum/configs/config_base.py b/datum/configs/config_base.py
@@ -19,7 +19,7 @@
 from typing import Any, Callable
 
 
-class ConfigBase(object):
+class ConfigBase:
   """Base class for representing a set of tf.data config.
 
   Attributes:
@@ -48,7 +48,7 @@ def __setattr__(self, name: str, value: Any) -> None:
     if hasattr(self, name):
       object.__setattr__(self, name, value)
     else:
-      raise AttributeError("Cannot set the property %s on %s." % (name, type(self).__name__))
+      raise AttributeError("Cannot set the property {} on {}.".format(name, type(self).__name__))
 
   def merge(self, configs: ConfigBase) -> ConfigBase:
     return merge_configs(self, configs)
@@ -89,6 +89,7 @@ def set_fn(config: ConfigBase, value: Any) -> None:
 
 def merge_configs(*configs_list: ConfigBase) -> ConfigBase:
   """Merges the given configs, returning the result as a new configs object.
+
   The input arguments are expected to have a matching type that derives from
   `ConfigBase` (and thus each represent a set of configs). The method outputs
   an object of the same type created by merging the sets of configs represented
@@ -112,7 +113,7 @@ def merge_configs(*configs_list: ConfigBase) -> ConfigBase:
 
   for configs in configs_list:
     if not isinstance(configs, result_type):
-      raise TypeError("Incompatible configs type: %r vs %r" % (type(configs), result_type))
+      raise TypeError("Incompatible configs type: {!r} vs {!r}".format(type(configs), result_type))
 
   if not isinstance(configs_list[0], ConfigBase):
     raise TypeError("The inputs should inherit from `ConfigBase`")

diff --git a/datum/configs/tfr_configs.py b/datum/configs/tfr_configs.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from collections import Callable
+from collections.abc import Callable
 
 import tensorflow as tf
 

diff --git a/datum/encoder/encoder.py b/datum/encoder/encoder.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import abc
-from typing import Any, Dict
+from typing import Any
 
 import tensorflow as tf
 
@@ -126,7 +126,7 @@ def encode(self, inputs: ValueType) -> ValueType:
     return inputs
 
 
-def datum_name_to_encoder(datum: Dict, problem_type: str) -> Dict[str, Encoder]:
+def datum_name_to_encoder(datum: dict, problem_type: str) -> dict[str, Encoder]:
   """Automatically identify encoder based on data values and problem type.
 
   Args:

diff --git a/datum/encoder/tokenizer.py b/datum/encoder/tokenizer.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,10 +19,11 @@
 from datum.utils.common_utils import add_metaclass
 
 # This set contains all letter and number characters.
-_ALPHANUMERIC_CHAR_SET = set(
-    chr(i) for i in range(sys.maxunicode)
-    if (unicodedata.category(chr(i)).startswith("L") or unicodedata.category(chr(i)).startswith("N")
-        ))
+_ALPHANUMERIC_CHAR_SET = {
+    chr(i)
+    for i in range(sys.maxunicode)
+    if (unicodedata.category(chr(i)).startswith("L") or unicodedata.category(chr(i)).startswith("N"))
+}
 
 
 @add_metaclass(abc.ABCMeta)
@@ -57,7 +57,7 @@ def encode(self, text):
     for pos in range(1, len(text)):
       if is_alnum[pos] != is_alnum[pos - 1]:
         token = text[token_start:pos]
-        if token != u" " or token_start == 0:
+        if token != " " or token_start == 0:
           ret.append(token)
         token_start = pos
     final_token = text[token_start:]
@@ -76,6 +76,6 @@ def decode(self, tokens):
     ret = []
     for i, token in enumerate(tokens):
       if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
-        ret.append(u" ")
+        ret.append(" ")
       ret.append(token)
     return "".join(ret)
diff --git a/datum/generator/image.py b/datum/generator/image.py
@@ -16,7 +16,7 @@
 import os
 import xml.etree.ElementTree
 from ast import literal_eval
-from typing import Any, Dict, List, Tuple, no_type_check
+from typing import Any, no_type_check
 
 import tensorflow as tf
 
@@ -88,7 +88,7 @@ def generate_datum(self, **kwargs: Any) -> GeneratorReturnType:
     sub_dir = kwargs.get('image_dir', split)
     csv_path = kwargs.get('csv_path', split + '.csv')
     data_path = os.path.join(self.path, sub_dir)
-    data: List[Dict] = []
+    data: list[dict] = []
     with tf.io.gfile.GFile(os.path.join(self.path, csv_path)) as csv_f:
       reader = csv.DictReader(csv_f)
       for row in reader:
@@ -100,8 +100,7 @@ def generate_datum(self, **kwargs: Any) -> GeneratorReturnType:
             feature_value = os.path.join(data_path, feature_value + extension)
             feature_dict['image'] = feature_value
         data.append(feature_dict)
-    for idx, datum in enumerate(data):
-      yield idx, datum
+    yield from enumerate(data)
 
 
 class DetDatumGenerator(DatumGenerator):
@@ -171,7 +170,7 @@ def generate_datum(self, **kwargs: Any) -> GeneratorReturnType:
         yield image_id, example
 
   def _generate_example(self, data_path: str, image_dir: str, annon_dir: str, image_id: str,
-                        extension: str, load_annotations: bool) -> Dict:
+                        extension: str, load_annotations: bool) -> dict:
     """Generate a single example of the dataset.
 
     Args:
@@ -214,32 +213,32 @@ def _generate_example(self, data_path: str, image_dir: str, annon_dir: str, imag
     }
 
   @no_type_check
-  def _get_example_objects(self, annon_filepath: str) -> Tuple:
+  def _get_example_objects(self, annon_filepath: str) -> tuple:
     """Function to get all the objects from the annotation XML file."""
     with tf.io.gfile.GFile(annon_filepath, "r") as f:
       root = xml.etree.ElementTree.parse(f).getroot()
       size = root.find("size")
       width = float(size.find("width").text)
       height = float(size.find("height").text)
 
-      xmin: List[float] = []
-      xmax: List[float] = []
-      ymin: List[float] = []
-      ymax: List[float] = []
-      area: List[float] = []
-      label: List[int] = []
-      pose: List[str] = []
-      is_truncated: List[bool] = []
-      is_difficult: List[bool] = []
+      xmin: list[float] = []
+      xmax: list[float] = []
+      ymin: list[float] = []
+      ymax: list[float] = []
+      area: list[float] = []
+      label: list[int] = []
+      pose: list[str] = []
+      is_truncated: list[bool] = []
+      is_difficult: list[bool] = []
       for obj in root.findall("object"):
         class_id = obj.find("name").text.lower()
         if isinstance(class_id, str):
           label.append(self.gen_config.class_map[class_id])
         else:
           label.append(class_id)
         pose.append(obj.find("pose").text.lower())
-        is_truncated.append((obj.find("truncated").text == "1"))
-        is_difficult.append((obj.find("difficult").text == "1"))
+        is_truncated.append(obj.find("truncated").text == "1")
+        is_difficult.append(obj.find("difficult").text == "1")
         bndbox = obj.find("bndbox")
         xmax.append(float(bndbox.find("xmax").text) / width)
         xmin.append(float(bndbox.find("xmin").text) / width)

diff --git a/datum/generator/text.py b/datum/generator/text.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2020 The OpenAGI Datum Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

diff --git a/datum/reader/dataset.py b/datum/reader/dataset.py
@@ -14,7 +14,7 @@
 
 import json
 import os
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Optional
 
 import tensorflow as tf
 from absl import logging
@@ -60,8 +60,8 @@ def _read(self,
             deterministic: bool = False,
             use_tf_padding: bool = False,
             use_datum_padding: bool = False,
-            pre_batching_callback: Optional[Callable[[Dict], Dict]] = None,
-            post_batching_callback: Optional[Callable[[Dict], Dict]] = None) -> DatasetType:
+            pre_batching_callback: Optional[Callable[[dict], dict]] = None,
+            post_batching_callback: Optional[Callable[[dict], dict]] = None) -> DatasetType:
     """Read and process data from tfrecord files.
 
     Args:
@@ -142,9 +142,9 @@ def _read(self,
     return dataset
 
   @memoized_property
-  def padded_shapes(self) -> Dict[str, List]:
+  def padded_shapes(self) -> dict[str, list]:
     """Returns padded shapes from dataset metadata."""
-    with open(os.path.join(self._path, 'datum_to_type_and_shape_mapping.json'), 'r') as json_f:
+    with open(os.path.join(self._path, 'datum_to_type_and_shape_mapping.json')) as json_f:
       mapping = json.load(json_f)
     padded_shapes = {}
     for key, value in mapping.items():