From dfa96057cbbca299d7c5a9c513836f2a24a7077a Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Thu, 5 Sep 2024 16:57:40 -0700 Subject: [PATCH] Copy over the implementation from FSSpec.jl --- Project.toml | 29 +++- README.md | 33 ++++ src/Kerchunk.jl | 17 +- src/materialize.jl | 16 ++ src/readbytes.jl | 17 ++ src/referencestore.jl | 286 ++++++++++++++++++++++++++++++++++ test/CondaPkg.toml | 10 ++ test/its_live_catalog.jl | 10 ++ test/its_live_catalog.json | 1 + test/python_local_kerchunk.jl | 62 ++++++++ test/runtests.jl | 11 +- 11 files changed, 488 insertions(+), 4 deletions(-) create mode 100644 src/materialize.jl create mode 100644 src/readbytes.jl create mode 100644 src/referencestore.jl create mode 100644 test/CondaPkg.toml create mode 100644 test/its_live_catalog.jl create mode 100644 test/its_live_catalog.json create mode 100644 test/python_local_kerchunk.jl diff --git a/Project.toml b/Project.toml index 3fc08bd..b299b4f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,38 @@ name = "Kerchunk" uuid = "12c09fd5-fe6a-4e79-8f42-b31f49215243" authors = ["Anshul Singhvi and contributors"] -version = "0.1.0-DEV" +version = "0.1.0" + +[deps] +AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" +Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +Mustache = "ffc61752-8dc7-55ee-8c37-f3e9cdd09e70" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +Zarr = "0a941bbe-ad1d-11e8-39d9-ab76183a1d99" [compat] +AWSS3 = "0.10, 0.11" +FilePathsBase = "0.9" +HTTP = "1.10" +JSON3 = "1" +Mustache = "1" +URIs = "1.5" +Zarr = "0.9" julia = "1.10" [extras] +CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab" +PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab" +Rasters = "a3a2b9e3-a471-40c9-b274-f788e487c689" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +YAXArrays = "c21b50f5-aa40-41ea-b809-c0f5e47bfa5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["CondaPkg", "PythonCall", "Dates", "JSON3", "NCDatasets", "Rasters", "YAXArrays", "Test"] diff --git a/README.md b/README.md index 566a114..699f542 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,36 @@ [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaIO.github.io/Kerchunk.jl/stable/) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://JuliaIO.github.io/Kerchunk.jl/dev/) [![Build Status](https://github.com/JuliaIO/Kerchunk.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/JuliaIO/Kerchunk.jl/actions/workflows/CI.yml?query=branch%3Amain) + +Kerchunk.jl is a Julia package that enables loading Kerchunk reference catalogs as Zarr arrays via a storage backend. + +## Installation + +```julia +] +add Kerchunk +``` + +## Quick start + +```julia +using Kerchunk, Zarr + +za = Zarr.zopen(Kerchunk.ReferenceStore("path/to/kerchunk/catalog.json")) +# and treat it like any other Zarr array! +# You can even wrap it in YAXArrays.jl to get DimensionalData.jl accessors: +using YAXArrays +YAXArrays.open_dataset(za) +``` + +## Background + +[`kerchunk`] is a Python package that generates the reference catalogs. + +## Alternatives and related packages + +- You can always use Python's `xarray` directly via PythonCall.jl +- [FSSpec.jl](https://github.com/asinghvi17/FSSpec.jl) is an alternative storage backends for Zarr.jl that wraps the same [`fsspec`](https://github.com/fsspec/filesystem_spec) that `xarray` uses under the hood. + +This package is of course built on top of [Zarr.jl](https://github.com/JuliaIO/Zarr.jl), which is a pure-Julia Zarr array library. +[YAXArrays.jl](https://github.com/JuliaDataCubes/YAXArrays.jl) is a Julia package that can wrap Zarr arrays in a DimensionalData-compatible interface. \ No newline at end of file diff --git a/src/Kerchunk.jl b/src/Kerchunk.jl index 4ee64d1..9b3ca74 100644 --- a/src/Kerchunk.jl +++ b/src/Kerchunk.jl @@ -1,5 +1,20 @@ module Kerchunk -# Write your package code here. +using JSON3, Base64 # for decoding +using URIs, Mustache # to resolve paths +using FilePathsBase, AWSS3 # to access files +using Zarr # this is where the magic happens + + +# Utility functions +include("readbytes.jl") + +# Reference store implementation +include("referencestore.jl") + +# Materializing a reference store +include("materialize.jl") + +export ReferenceStore end diff --git a/src/materialize.jl b/src/materialize.jl new file mode 100644 index 0000000..dc37989 --- /dev/null +++ b/src/materialize.jl @@ -0,0 +1,16 @@ +# This file is meant to materialize a Zarr directory from a Kerchunk catalog. + +""" + materialize(path, store::ReferenceStore) + +Materialize a Zarr directory from a Kerchunk catalog. This actually downloads and writes the files to the given path, and you can open that with any Zarr reader. +""" +function materialize(path::Union{String, FilePathsBase.AbstractPath}, store::ReferenceStore) + mkpath(path) + for key in keys(store.mapper) + println("Writing $key") + mkpath(splitdir(joinpath(path, string(key)))[1]) + write(joinpath(path, string(key)), _get_file_bytes(store, store.mapper[key])) + end + return path +end \ No newline at end of file diff --git a/src/readbytes.jl b/src/readbytes.jl new file mode 100644 index 0000000..1ef7e39 --- /dev/null +++ b/src/readbytes.jl @@ -0,0 +1,17 @@ +""" + readbytes(path, start::Integer, stop::Integer)::Vector{UInt8} + +Read bytes from a file at a given range. +""" +function readbytes(path, start::Integer, stop::Integer) + @assert start < stop "In `readbytes`, start ($(start)) must be less than stop ($(stop))." + open(path) do f + seek(f, start) + return read(f, stop + 1 - start) + end +end + +function readbytes(path::Zarr.AWSS3.S3Path, start::Integer, stop::Integer) + @assert start < stop "In `readbytes`, start ($(start)) must be less than stop ($(stop))." + return read(path; byte_range = (start+1):stop) +end \ No newline at end of file diff --git a/src/referencestore.jl b/src/referencestore.jl new file mode 100644 index 0000000..f13e45e --- /dev/null +++ b/src/referencestore.jl @@ -0,0 +1,286 @@ +#= + +# ReferenceStore + +This is a first implementation of a key-value reference store that can store files as: +- base64 encoded UInt8 (byte) file contents +- references to other stores (`[filepath, start_byte_index, end_byte_index]`) + +Currently, this only works for local files. In future it will work on HTTP and S3 stores as well. + +Future optimizations include: +- Lazy directory caching so that subdirs and subkeys are fast +- Parallel read strategy for concurrent reads +- Simple templating via Mustache or similar (kerchunk does not natively generate full Jinja templates, but could be extended to do so) + +Things not in the immediate future are: +- Complex `jinja` template support + +## Notes on templating + +Mustache.jl performs URI character escaping on `{{template}}` values, which is apparently not done in Python. +So we have to unescape them, except it doesn't percent encode, so we actually have to change the template and +indicate that the no html encoding by modifying the `_type` field of each token. Horrifying, I know. + +## Notes on file access + +Files can be: +- base64 encoded string (in memory file) +- reference to a full file (file path in a single element vector) +- reference to a subrange of a file (`file path`, `start index`, `number of bytes to read` in a three element vector) + +Files can also be generated, so we have to parse that and then actually materialize the store, at least for now. + +## The JSON schema +```json +{ + "version": (required, must be equal to) 1, + "templates": (optional, zero or more arbitrary keys) { + "template_name": jinja-str + }, + "gen": (optional, zero or more items) [ + "key": (required) jinja-str, + "url": (required) jinja-str, + "offset": (optional, required with "length") jinja-str, + "length": (optional, required with "offset") jinja-str, + "dimensions": (required, one or more arbitrary keys) { + "variable_name": (required) + {"start": (optional) int, "stop": (required) int, "step": (optional) int} + OR + [int, ...] + } + ], + "refs": (optional, zero or more arbitrary keys) { + "key_name": (required) str OR [url(jinja-str)] OR [url(jinja-str), offset(int), length(int)] + } +} +``` +=# + +struct ReferenceStore{MapperType <: AbstractDict, HasTemplates} <: Zarr.AbstractStore + mapper::MapperType + zmetadata::Dict{String, Any} + templates::Dict{String, String} +end + +function ReferenceStore(filename::Union{String, FilePathsBase.AbstractPath}) + parsed = JSON3.read(read(filename)) + return ReferenceStore(parsed) +end + +function ReferenceStore(parsed::AbstractDict{<: Union{String, Symbol}, <: Any}) + @assert haskey(parsed, "version") "ReferenceStore requires a version field, did not find one. if you have a Kerchunk v0 then you have a problem!" + @assert parsed["version"] == 1 "ReferenceStore only supports Kerchunk version 1, found $version" + @assert !haskey(parsed, "gen") "ReferenceStore does not support generated paths, please file an issue on Github if you need these!" + + has_templates = haskey(parsed, "templates") + templates = if has_templates + td = Dict{String, String}() + for (k, v) in parsed["templates"] + td[string(k)] = string(v) + end + td + else + Dict{String, String}() + end + + zmetadata = if haskey(parsed, ".zmetadata") + td = Dict{String, Any}() + for (k, v) in parsed[".zmetadata"] + td[string(k)] = v + end + td + else + Dict{String, Any}() + end + + refs = parsed["refs"] + + return ReferenceStore{typeof(refs), has_templates}(refs, zmetadata, templates) +end + +function Base.show(io::IO, ::MIME"text/plain", store::ReferenceStore) + println(io, "ReferenceStore with $(length(store.mapper)) references") +end + +function Base.show(io::IO, store::ReferenceStore) + println(io, "ReferenceStore with $(length(store.mapper)) references") +end + +function Base.getindex(store::ReferenceStore, key::String) + return store.mapper[key] +end + +function Base.setindex!(store::ReferenceStore, value, key::String) + error("ReferenceStore is read-only for now") + #store.mapper[key] = value +end + + +function Base.keys(store::ReferenceStore) + return keys(store.mapper) +end + +function Base.values(store::ReferenceStore) + return values(store.mapper) +end + +# Implement the Zarr interface + +function Zarr.subdirs(store::ReferenceStore, key) + path = rstrip(key, '/') + l_path = length(path) + sub_sub_keys = filter(keys(store)) do k + startswith(string(k), isempty(key) ? "" : key * "/") && # path is a child of the key + '/' in string(k)[l_path+1:end] # path has children + end + sub_dirs = unique!([rsplit(string(sub_sub_key), "/", limit=2)[1] for sub_sub_key in sub_sub_keys]) + return sub_dirs +end + +function Zarr.subkeys(store::ReferenceStore, key::String) + path = rstrip(key, '/') + l_path = length(path) + return filter(keys(store)) do k + startswith(string(k), isempty(key) ? "" : key * "/") && # path is a child of the key + '/' ∉ string(k)[l_path+2:end] # path has no children + end .|> string +end + +function Zarr.storagesize(store::ReferenceStore, key::String) + spec = store[key] + if spec isa String + return length(string) + elseif spec isa JSON3.Array + if length(spec) == 1 + return filesize(resolve_uri(store, only(spec))) + elseif length(spec) == 3 + return spec[3] - spec[2] # since we know the byte range, we can return the length directly + else + error("Invalid path spec $spec \n expected 1 or 3 elements, got $(length(spec))") + end + else + error("Invalid path spec $spec \n expected a string or array, got $(typeof(spec))") + end +end + + +function Zarr.read_items!(store::ReferenceStore, c::AbstractChannel, p, i) + cinds = [Zarr.citostring(ii) for ii in i] + ckeys = ["$p/$cind" for cind in cinds] + for (idx, ii) in enumerate(i) + put!(c, (ii => _get_file_bytes(store, store[ckeys[idx]]))) + end +end + +function Zarr.isinitialized(store::ReferenceStore, p::String) + return haskey(store.mapper, p) +end +function Zarr.isinitialized(store::ReferenceStore, p::String, i::Int) + return haskey(store.mapper, "$p/$i") +end + +Zarr.is_zarray(store::ReferenceStore, p::String) = ((normpath(p) in ("/", ".")) ? ".zarray" : normpath("$p/.zarray")) in keys(store) +Zarr.is_zgroup(store::ReferenceStore, p::String) = ((normpath(p) in ("/", ".")) ? ".zgroup" : normpath("$p/.zgroup")) in keys(store) + +Zarr.getattrs(store::ReferenceStore, p::String) = if haskey(store.mapper, normpath(p) in ("/", ".") ? ".zattrs" : "$p/.zattrs") + Zarr.JSON.parse(String(_get_file_bytes(store, store[normpath(p) in ("/", ".") ? ".zattrs" : "$p/.zattrs"]))) +else + Dict{String, Any}() +end + +Zarr.store_read_strategy(::ReferenceStore) = Zarr.SequentialRead() +Zarr.read_items!(s::ReferenceStore, c::AbstractChannel, ::Zarr.SequentialRead, p, i) = Zarr.read_items!(s, c, p, i) + + +# End of Zarr interface implementation + +# Begin file access implementation + +""" + _get_file_bytes(store::ReferenceStore, reference) + +By hook or by crook, this function will return the bytes for the given reference. +The reference could be a base64 encoded binary string, a path to a file, or a subrange of a file. +""" +function _get_file_bytes end + +function _get_file_bytes(store::ReferenceStore, bytes::String) + # single file + if startswith(bytes, "base64:") # base64 encoded binary + # TODO: make this more efficient by reinterpret + view + return base64decode(bytes[7:end]) + else # JSON file + return Vector{UInt8}(bytes) + end +end + +function _get_file_bytes(store::ReferenceStore, spec::JSON3.Array) + if Base.length(spec) == 1 + # path to file, read the whole thing + file = only(spec) + return read(resolve_uri(store, file)) + elseif Base.length(spec) == 3 + # subpath to file + filename, offset, length = spec + uri = resolve_uri(store, filename) + return readbytes(uri, offset #= mimic Python behaviour =#, offset + length) + else + error("Invalid path spec $spec \n expected 1 or 3 elements, got $(length(spec))") + end +end + + +""" + resolve_uri(store::ReferenceStore, source::String) + +This function resolves a string which may or may not have templating to a URI. +""" +function resolve_uri(store::ReferenceStore{<: Any, HasTemplates}, source::String) where {HasTemplates} + resolved = if HasTemplates + apply_templates(store, source) + else + source + end + # Parse the resolved string as a URI + uri = URIs.URI(resolved) + + # If the URI's scheme is empty, we're resolving a local file path. + # Note that we always use PosixPaths here, because the Zarr spec mandates that + # all path separators be forward slashes. Kerchunk also mandates this. + if isempty(uri.scheme) + if isabspath(source) + return FilePathsBase.PosixPath(source) + elseif ispath(source) + return FilePathsBase.PosixPath(joinpath(pwd(), source)) + else + error("Invalid path, presumed local but not resolvable as absolute or relative path: $source") + end + end + # Otherwise, we check the protocol and create the appropriate path type. + if uri.scheme == "file" + return FilePathsBase.SystemPath(uri.path) + elseif uri.scheme == "s3" + return Zarr.AWSS3.S3Path(uri.uri) + end # TODO: add more protocols, like HTTP, Google Cloud, Azure, etc. +end + + +""" + apply_templates(store::ReferenceStore, source::String) + +This function applies the templates stored in `store` to the source string, and returns the resolved string. + +It uses Mustache.jl under the hood, but all `{{template}}` values are set to **not** URI-encode characters. +""" +function apply_templates(store::ReferenceStore, source::String) + tokens = Mustache.parse(source) + # Adjust tokens so that `{{var}}` becomes `{{{var}}}`, the latter of which + # is rendered without URI escaping. + for token in tokens.tokens + if token._type == "name" + token._type = "{" + end + end + return Mustache.render(tokens, store.templates) +end \ No newline at end of file diff --git a/test/CondaPkg.toml b/test/CondaPkg.toml new file mode 100644 index 0000000..1f84655 --- /dev/null +++ b/test/CondaPkg.toml @@ -0,0 +1,10 @@ +[deps] +netcdf4 = "" +h5py = "" +xarray = "" +zarr = "" +certifi = "" +s3fs = "" +pyperf = "" +kerchunk = "" +pydap = "" \ No newline at end of file diff --git a/test/its_live_catalog.jl b/test/its_live_catalog.jl new file mode 100644 index 0000000..b6f2750 --- /dev/null +++ b/test/its_live_catalog.jl @@ -0,0 +1,10 @@ +using JSON3, Kerchunk, Zarr, YAXArrays +using Test + +@testset "ITS_LIVE catalog" begin + catalog_json = JSON3.read(open(joinpath(dirname(dirname(pathof(Kerchunk))), "test", "its_live_catalog.json"))) + arbitrary_choice_dictionary = catalog_json[first(keys(catalog_json))] + st = Kerchunk.ReferenceStore(arbitrary_choice_dictionary) + za = Zarr.zopen(st) + @test_nowarn za["vx"][1, 1] # test that reading works +end diff --git a/test/its_live_catalog.json b/test/its_live_catalog.json new file mode 100644 index 0000000..904fe09 --- /dev/null +++ b/test/its_live_catalog.json @@ -0,0 +1 @@ +{"S1B_IW_SLC__1SSH_20170820T205622_20170820T205648_007028_00C61F_3B97_X_S1B_IW_SLC__1SSH_20170901T205622_20170901T205648_007203_00CB2E_5B22_G0120V02_P080.nc":{"version":1,"refs":{".zgroup":"{\"zarr_format\": 2}","M11\/.zarray":"{\"chunks\":[896,2005],\"compressor\":null,\"dtype\":\" FSSpec._recursive_pyconvert + + st = FSSpec.FSStore("reference://"; fo = "test.json") + st2 = FSSpec.FSStore("reference://"; fo = py_kerchunk_catalog) + + #= + # explore why fsspec might be causing problems + fs, = fsspec.core.url_to_fs("s3://its-live-data/datacubes/v2/N00E020/ITS_LIVE_vel_EPSG32735_G0120_X750000_Y10050000.zarr") + fs2, = fsspec.core.url_to_fs("reference://"; fo = py_kerchunk_catalog) + st.mapper.dirfs.ls("/") + =# + + # ds = xr.open_dataset("reference://", engine="zarr", backend_kwargs={"consolidated": False, "storage_options": {"fo" : h5chunks.translate()}}) + + ds = Zarr.zopen(st; consolidated = false) + + ya = YAXArrays.open_dataset(ds) + + @test all(map(==, ya["unnamed"] |> collect, ras |> collect)) # if not, this goes to YAXArrays + + ds = Zarr.zopen(st2; consolidated = false) + + ya = YAXArrays.open_dataset(ds) + + @test all(map(==, ya["unnamed"] |> collect, ras |> collect)) # if not, this goes to YAXArrays +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index d05714e..4aef620 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,15 @@ +using CondaPkg, PythonCall + using Kerchunk + +# Raster creation and analysis packages +using Rasters, NCDatasets # to save a raster +using YAXArrays # to open a raster +using + using Test @testset "Kerchunk.jl" begin - # Write your tests here. + include("python_local_kerchunk.jl") + include("its_live.jl") end