diff --git a/grain/BUILD b/grain/BUILD
index 611b61d3..1c9dd42f 100644
--- a/grain/BUILD
+++ b/grain/BUILD
@@ -51,6 +51,7 @@ py_library(
         ":python_lazy_dataset",  # build_cleaner: keep
         "//grain/_src/core:transforms",  # build_cleaner: keep
         "//grain/_src/python/experimental/example_packing:packing",  # build_cleaner: keep
+        "//grain/_src/python/experimental/proto_parsers:encode",  # build_cleaner: keep
         "//grain/_src/python/experimental/proto_parsers:fast_proto_parser",  # build_cleaner: keep
         "//grain/_src/python/experimental/shared_memory:np_array_in_shared_memory",  # build_cleaner: keep
     ],
diff --git a/grain/_src/python/experimental/proto_parsers/BUILD b/grain/_src/python/experimental/proto_parsers/BUILD
deleted file mode 100644
index 3efe78ee..00000000
--- a/grain/_src/python/experimental/proto_parsers/BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-# Fast proto parsers (serialized proto -> numpy arrays).
-
-load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
-
-licenses(["notice"])
-
-package(default_visibility = ["//visibility:public"])
-
-pybind_extension(
-    name = "fast_proto_parser",
-    srcs = ["fast_proto_parser.cc"],
-    deps = [
-        "//perftools/profiles/proto:profile_cc_proto",
-        "//third_party/absl/strings:string_view",
-        "//third_party/protobuf:protobuf_lite",
-        "//third_party/tensorflow/core/example:example_protos_cc",
-    ],
-)
-
-py3_binary(
-    name = "usage",
-    srcs = ["usage.py"],
-    deps = [
-        ":fast_proto_parser",
-        "//third_party/py/absl:app",
-        "//third_party/py/memory_profiler",
-        "//third_party/tensorflow/core:protos_all_py_pb2",
-    ],
-)
-
-py_test(
-    name = "fast_proto_parser_test",
-    srcs = ["fast_proto_parser_test.py"],
-    srcs_version = "PY3",
-    deps = [
-        ":fast_proto_parser",
-        "//third_party/tensorflow/core:protos_all_py_pb2",
-    ],
-)
diff --git a/grain/_src/python/experimental/proto_parsers/fast_proto_parser.cc b/grain/_src/python/experimental/proto_parsers/fast_proto_parser.cc
deleted file mode 100644
index 1660ca7a..00000000
--- a/grain/_src/python/experimental/proto_parsers/fast_proto_parser.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2023 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "perftools/profiles/proto/profile.proto.h"
-#include "third_party/absl/strings/string_view.h"
-#include "third_party/protobuf/map.h"
-#include "third_party/protobuf/repeated_field.h"
-#include "third_party/protobuf/repeated_ptr_field.h"
-#include "third_party/pybind11/include/pybind11/cast.h"
-#include "third_party/pybind11/include/pybind11/detail/common.h"
-#include "third_party/pybind11/include/pybind11/gil.h"
-#include "third_party/pybind11/include/pybind11/numpy.h"
-#include "third_party/pybind11/include/pybind11/pybind11.h"
-#include "third_party/pybind11/include/pybind11/pytypes.h"
-#include "third_party/pybind11/include/pybind11/stl.h"  // IWYU pragma: keep (necessary for py::cast from std::vector)
-#include "third_party/tensorflow/core/example/example.proto.h"
-#include "third_party/tensorflow/core/example/feature.proto.h"
-
-namespace py = ::pybind11;
-
-namespace {
-
-// Converts a repeated proto field into a np.ndarray.
-template <typename T, std::enable_if_t<std::is_same_v<T, int64_t> ||
-                                           std::is_same_v<T, float>,
-                                       int> = 0>
-py::array_t<T> ConvertToNpArray(proto2::RepeatedField<T> input) {
-  auto values = std::make_unique<proto2::RepeatedField<T>>(std::move(input));
-  int size = values->size();
-  typename proto2::RepeatedField<T>::const_pointer data_ptr = values->data();
-  py::capsule values_capsule(
-      /*value=*/values.get(), /*name=*/nullptr,
-      /*destructor=*/[](void* p) {
-        delete reinterpret_cast<proto2::RepeatedField<T>*>(p);
-      });
-  // The data will be freed once `values_capsule` is destroyed.
-  values.release();
-  py::array_t<T> np_array(size, data_ptr, values_capsule);
-  return np_array;
-}
-
-py::array ConvertToNpArray(proto2::RepeatedPtrField<std::string>& input) {
-  // TODO(jolesiak): Consider optimizing.
-  std::vector<std::string> elements;
-  elements.reserve(input.size());
-  for (std::string& element : input) {
-    elements.emplace_back(std::move(element));
-  }
-  return py::array(py::cast(std::move(elements)));
-}
-
-}  // namespace
-
-PYBIND11_MODULE(fast_proto_parser, m) {
-  m.doc() = "Fast example.proto parsing";
-  m.def(
-      "parse_tf_example",
-      [](absl::string_view serialized_proto) -> py::dict {
-        tensorflow::Example example;
-        {
-          py::gil_scoped_release release;
-          example.ParseFromString(serialized_proto);
-        }
-        proto2::Map<std::string, tensorflow::Feature>& features =
-            *(example.mutable_features()->mutable_feature());
-        py::dict result;
-        for (auto& [feature_name, feature] : features) {
-          py::array np_array;
-          if (feature.has_int64_list()) {
-            np_array = ConvertToNpArray(
-                std::move(*feature.mutable_int64_list()->mutable_value()));
-          } else if (feature.has_float_list()) {
-            np_array = ConvertToNpArray(
-                std::move(*feature.mutable_float_list()->mutable_value()));
-          } else if (feature.has_bytes_list()) {
-            np_array = ConvertToNpArray(
-                *feature.mutable_bytes_list()->mutable_value());
-          } else {
-            throw py::value_error("Unexpected feature type");
-          }
-          result[py::str(feature_name)] = np_array;
-        }
-        return result;
-      },
-      py::return_value_policy::take_ownership);
-}
diff --git a/grain/_src/python/experimental/proto_parsers/fast_proto_parser_test.py b/grain/_src/python/experimental/proto_parsers/fast_proto_parser_test.py
deleted file mode 100644
index c67dca07..00000000
--- a/grain/_src/python/experimental/proto_parsers/fast_proto_parser_test.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2023 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from grain._src.python.experimental.proto_parsers import fast_proto_parser
-import numpy as np
-from absl.testing import absltest
-from tensorflow.core.example import example_pb2
-
-
-class FastProtoParserTest(absltest.TestCase):
-
-  def test_parse_example_containing_int64(self):
-    expected_array = np.arange(1, 1_000_000, dtype=np.int64)
-    e = example_pb2.Example()
-    e.features.feature["foo"].int64_list.value.extend(expected_array)
-
-    e_parsed = fast_proto_parser.parse_tf_example(e.SerializeToString())
-
-    self.assertEqual(list(e_parsed.keys()), ["foo"])
-    self.assertTrue(
-        np.array_equal(e_parsed["foo"], expected_array),
-        f"{e_parsed['foo']=}, {expected_array=}",
-    )
-
-  def test_parse_example_containing_float(self):
-    expected_array = np.arange(0.1, 100_000.0, 0.1, dtype=np.float32)
-    e = example_pb2.Example()
-    e.features.feature["bar"].float_list.value.extend(expected_array)
-
-    e_parsed = fast_proto_parser.parse_tf_example(e.SerializeToString())
-
-    self.assertEqual(list(e_parsed.keys()), ["bar"])
-    self.assertTrue(
-        np.array_equal(e_parsed["bar"], expected_array),
-        f"{e_parsed['bar']=}, {expected_array=}",
-    )
-
-  def test_parse_example_containing_bytes(self):
-    expected_array = np.array([b"abc", b"cde123", b"bazzzzzz"] * 100_000)
-    e = example_pb2.Example()
-    e.features.feature["baz"].bytes_list.value.extend(expected_array)
-
-    e_parsed = fast_proto_parser.parse_tf_example(e.SerializeToString())
-
-    self.assertEqual(list(e_parsed.keys()), ["baz"])
-    self.assertTrue(
-        np.array_equal(e_parsed["baz"], expected_array),
-        f"{e_parsed['baz']=}, {expected_array=}",
-    )
-
-  def test_parse_example_containing_all_types(self):
-    expected_output = {
-        "foo": np.arange(1, 1_000_000.0, dtype=np.int64),
-        "bar": np.arange(0.1, 100_000.0, 0.1, dtype=np.float32),
-        "baz": np.array([b"abc", b"cde123", b"bazzzzzz"] * 100_000),
-    }
-    e = example_pb2.Example()
-    e.features.feature["foo"].int64_list.value.extend(expected_output["foo"])
-    e.features.feature["bar"].float_list.value.extend(expected_output["bar"])
-    e.features.feature["baz"].bytes_list.value.extend(expected_output["baz"])
-
-    e_parsed = fast_proto_parser.parse_tf_example(e.SerializeToString())
-
-    self.assertEqual(e_parsed.keys(), expected_output.keys())
-    for feature_name in e_parsed.keys():
-      self.assertTrue(
-          np.array_equal(e_parsed[feature_name], expected_output[feature_name]),
-          f"{e_parsed[feature_name]=}, {expected_output[feature_name]=}",
-      )
-
-  def test_parse_example_containing_empty_list(self):
-    e = example_pb2.Example()
-    e.features.feature["foo"].int64_list.value[:] = []
-
-    e_parsed = fast_proto_parser.parse_tf_example(e.SerializeToString())
-
-    self.assertEqual(list(e_parsed.keys()), ["foo"])
-    self.assertEmpty(e_parsed["foo"])
-
-
-if __name__ == "__main__":
-  absltest.main()
diff --git a/grain/_src/python/experimental/proto_parsers/usage.py b/grain/_src/python/experimental/proto_parsers/usage.py
deleted file mode 100644
index 19ad88a4..00000000
--- a/grain/_src/python/experimental/proto_parsers/usage.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2023 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Usage example of fast_proto_parser."""
-from collections.abc import Sequence
-
-from absl import app
-from grain._src.python.experimental.proto_parsers import fast_proto_parser
-import memory_profiler
-import numpy as np
-
-from tensorflow.core.example import example_pb2
-
-
-@memory_profiler.profile()
-def test_parsing_int():
-  e = example_pb2.Example()
-  # 40 * 10^6 * 8 B = 320 MB
-  e.features.feature["foo"].int64_list.value.extend([1, 2, 3, 4] * 10_000_000)
-  e_serialized = e.SerializeToString()
-  e_parsed: dict[str, np.ndarray] = fast_proto_parser.parse_tf_example(
-      e_serialized
-  )
-  print(e_parsed)
-
-
-@memory_profiler.profile()
-def test_parsing_bytes():
-  """Test parsing bytes."""
-  e = example_pb2.Example()
-  e.features.feature["foo"].bytes_list.value.extend(
-      [b"foo", b"bar", b"baz"] * 10_000_000
-  )
-  # Adding one long string increases the memory footprint due to
-  # padding. We may consider strings with variable size (numpy arrays with
-  # dtype=object).
-  e.features.feature["foo"].bytes_list.value.append(b"fooeiatnrstieanrtoirsent")
-  e_serialized = e.SerializeToString()
-  e_parsed: dict[str, np.ndarray] = fast_proto_parser.parse_tf_example(
-      e_serialized
-  )
-  print(e_parsed)
-
-
-def main(argv: Sequence[str]) -> None:
-  if len(argv) > 1:
-    raise app.UsageError("Too many command-line arguments.")
-  test_parsing_int()
-
-
-if __name__ == "__main__":
-  app.run(main)