diff --git a/data/README.md b/data/README.md index b8534fe..3b6cae7 100644 --- a/data/README.md +++ b/data/README.md @@ -45,6 +45,8 @@ | plain-dict-uncompressed-checksum.parquet | uncompressed and dictionary-encoded INT32 and STRING columns in format v1 with a matching CRC | | rle-dict-uncompressed-corrupt-checksum.parquet | uncompressed and dictionary-encoded INT32 and STRING columns in format v2 with a mismatching CRC | | large_string_map.brotli.parquet | MAP(STRING, INT32) with a string column chunk of more than 2GB. See [note](#large-string-map) below | +| float16_nonzeros_and_nans.parquet | Float16 (logical type) column with NaNs and nonzero finite min/max values | +| float16_zeros_and_nans.parquet | Float16 (logical type) column with NaNs and zeros as min/max values. See [note](#float16-files) below | TODO: Document what each file is in the table above. @@ -94,7 +96,7 @@ The schema for the `datapage_v1-*-checksum.parquet` test files is: message m { required int32 a; required int32 b; -} +} ``` The detailed structure for these files is as follows: @@ -182,7 +184,7 @@ metadata = pq.read_metadata("nan_in_stats.parquet") metadata.row_group(0).column(0) # # file_offset: 88 -# file_path: +# file_path: # type: DOUBLE # num_values: 2 # path_in_schema: x @@ -223,3 +225,97 @@ pq.write_table(tab, "test.parquet", compression='BROTLI') It is meant to exercise reading of structured data where each value is smaller than 2GB but the combined uncompressed column chunk size is greater than 2GB. + +## Float16 Files + +The files `float16_zeros_and_nans.parquet` and `float16_nonzeros_and_nans.parquet` +are meant to exercise a variety of test cases regarding `Float16` columns (which +are represented as 2-byte `FixedLenByteArray`s), including: +* Basic binary representations of standard values, +/- zeros, and NaN +* Comparisons between finite values +* Exclusion of NaNs from statistics min/max +* Normalizing min/max values when only zeros are present (i.e. 
`min` is always -0 and `max` is always +0) + +The aforementioned files were generated with: + +```python +import pyarrow as pa +import pyarrow.parquet as pq +import numpy as np + +t1 = pa.Table.from_arrays( + [pa.array([None, + np.float16(0.0), + np.float16(np.NaN)], type=pa.float16())], + names="x") +t2 = pa.Table.from_arrays( + [pa.array([None, + np.float16(1.0), + np.float16(-2.0), + np.float16(np.NaN), + np.float16(0.0), + np.float16(-1.0), + np.float16(-0.0), + np.float16(2.0)], + type=pa.float16())], + names="x") + +pq.write_table(t1, "float16_zeros_and_nans.parquet") +pq.write_table(t2, "float16_nonzeros_and_nans.parquet") + +m1 = pq.read_metadata("float16_zeros_and_nans.parquet") +m2 = pq.read_metadata("float16_nonzeros_and_nans.parquet") + +print(m1.row_group(0).column(0)) +print(m2.row_group(0).column(0)) +# +# file_offset: 72 +# file_path: +# physical_type: FIXED_LEN_BYTE_ARRAY +# num_values: 3 +# path_in_schema: x +# is_stats_set: True +# statistics: +# +# has_min_max: True +# min: b'\x00\x80' +# max: b'\x00\x00' +# null_count: 1 +# distinct_count: None +# num_values: 2 +# physical_type: FIXED_LEN_BYTE_ARRAY +# logical_type: Float16 +# converted_type (legacy): NONE +# compression: SNAPPY +# encodings: ('PLAIN', 'RLE', 'RLE_DICTIONARY') +# has_dictionary_page: True +# dictionary_page_offset: 4 +# data_page_offset: 24 +# total_compressed_size: 68 +# total_uncompressed_size: 64 +# +# file_offset: 84 +# file_path: +# physical_type: FIXED_LEN_BYTE_ARRAY +# num_values: 8 +# path_in_schema: x +# is_stats_set: True +# statistics: +# +# has_min_max: True +# min: b'\x00\xc0' +# max: b'\x00@' +# null_count: 1 +# distinct_count: None +# num_values: 7 +# physical_type: FIXED_LEN_BYTE_ARRAY +# logical_type: Float16 +# converted_type (legacy): NONE +# compression: SNAPPY +# encodings: ('PLAIN', 'RLE', 'RLE_DICTIONARY') +# has_dictionary_page: True +# dictionary_page_offset: 4 +# data_page_offset: 34 +# total_compressed_size: 80 +# total_uncompressed_size: 76 +``` diff 
--git a/data/float16_nonzeros_and_nans.parquet b/data/float16_nonzeros_and_nans.parquet new file mode 100644 index 0000000..eecebde Binary files /dev/null and b/data/float16_nonzeros_and_nans.parquet differ diff --git a/data/float16_zeros_and_nans.parquet b/data/float16_zeros_and_nans.parquet new file mode 100644 index 0000000..61ea6ce Binary files /dev/null and b/data/float16_zeros_and_nans.parquet differ