Byte stores for chunked arrays

byte_store() is a key-to-bytes substrate. It deliberately stops below array semantics: metadata parsing, chunk layout, compression, and type conversion live in ordinary R code or in a package layered above Ropendal.

This vignette builds a tiny uncompressed, Zarr-like integer matrix reader to show where those boundaries sit.

library(Ropendal)

# tempfile() only generates a unique path; dir.create() makes the directory.
root <- tempfile("ropendal-zarr-")
dir.create(root)
# Open the "fs" backend with the scratch directory as its root.
fs <- opendal("fs", root = root)
# Byte-store handle for "array.zarr". The listing printed later shows keys such
# as "c/0/0" without this component, so it presumably acts as a key prefix
# (TODO confirm against the byte_store() documentation).
store <- byte_store(fs, "array.zarr")

A small explicit format layer

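The store only knows keys and bytes. The toy format below stores one plain-text metadata object under the key zarr.json (key=value lines, not actual JSON) and chunks of up to 2 x 2 little-endian 32-bit integers under c/<row>/<col>, where <row> and <col> are zero-based chunk indices.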

# Serialize `x` (coerced to integer) as consecutive 4-byte little-endian words.
# Returns a raw vector of length 4 * length(x).
encode_i32 <- function(x) {
  ints <- as.integer(x)
  writeBin(ints, raw(), size = 4L, endian = "little")
}
# Decode up to `n` 4-byte little-endian integers from the raw vector `x`.
# readBin() returns fewer than `n` values when `x` is too short.
decode_i32 <- function(x, n) {
  readBin(con = x, what = "integer", n = n, size = 4L, endian = "little")
}
# Build the store key for the chunk at one-based chunk position (i, j).
# Keys use zero-based indices: chunk (1, 1) lives under "c/0/0".
chunk_key <- function(i, j) {
  row_index <- i - 1L
  col_index <- j - 1L
  sprintf("c/%d/%d", row_index, col_index)
}

# Write the integer matrix `x` to `store` in the toy chunked format.
#
# One text metadata object is written under "zarr.json", followed by one object
# per chunk under the key produced by chunk_key(). Chunks on the trailing edge
# are written at their actual (smaller) size rather than padded.
#
# Arguments:
#   store      Store handle passed through to store_write().
#   x          Matrix whose values are encoded with encode_i32().
#   chunk_dim  Chunk extent as c(rows, cols); both must be >= 1.
#
# Returns `store`, invisibly.
write_array <- function(store, x, chunk_dim = c(2L, 2L)) {
  chunk_rows <- chunk_dim[[1L]]
  chunk_cols <- chunk_dim[[2L]]
  # Fail before anything is written rather than half-way through the loops.
  stopifnot(chunk_rows >= 1L, chunk_cols >= 1L)

  n_row <- nrow(x)
  n_col <- ncol(x)

  meta <- paste0(
    "dim=", n_row, ",", n_col, "\n",
    "chunk_dim=", chunk_rows, ",", chunk_cols, "\n",
    "type=int32\n"
  )
  store_write(store, "zarr.json", charToRaw(meta))

  # Iterate over one-based chunk indices. seq_len() yields an empty sequence
  # for a zero-extent dimension, whereas seq(1L, 0L, by = k) raises
  # "wrong sign in 'by' argument" after the metadata has already been written.
  for (ci in seq_len(ceiling(n_row / chunk_rows))) {
    rows <- seq.int((ci - 1L) * chunk_rows + 1L, min(ci * chunk_rows, n_row))
    for (cj in seq_len(ceiling(n_col / chunk_cols))) {
      cols <- seq.int((cj - 1L) * chunk_cols + 1L, min(cj * chunk_cols, n_col))
      # as.vector() flattens the sub-matrix in column-major order.
      store_write(store, chunk_key(ci, cj), encode_i32(as.vector(x[rows, cols])))
    }
  }
  invisible(store)
}

Reading is symmetric: fetch bytes, explicitly materialize them as a raw vector where needed (as.raw() in parse_meta(), mode = "raw" in read_array()), and parse above the store layer.

# Parse the text metadata object into a list.
#
# `bytes` is converted with as.raw() and read as newline-separated "key=value"
# records; empty lines are skipped. Returns list(dim, chunk_dim, type), where
# dim and chunk_dim are integer vectors parsed from comma-separated values and
# type is the raw string value.
parse_meta <- function(bytes) {
  text <- rawToChar(as.raw(bytes))
  records <- strsplit(text, "\n", fixed = TRUE)[[1L]]
  records <- records[nzchar(records)]

  fields <- strsplit(records, "=", fixed = TRUE)
  values <- lapply(fields, `[[`, 2L)
  names(values) <- vapply(fields, `[[`, character(1), 1L)

  # "4,4" -> c(4L, 4L)
  as_int_vec <- function(s) as.integer(strsplit(s, ",", fixed = TRUE)[[1L]])

  list(
    dim = as_int_vec(values$dim),
    chunk_dim = as_int_vec(values$chunk_dim),
    type = values$type
  )
}

# Read back an integer matrix written in the toy chunked format.
#
# Parses the "zarr.json" metadata object with parse_meta(), then fetches each
# chunk under its chunk_key() and places the decoded values into the result.
#
# Arguments:
#   store  Store handle passed through to store_read().
#
# Returns an integer matrix with the dimensions recorded in the metadata.
# Stops if the chunk extents are not >= 1 or if a chunk decodes to fewer
# values than its position in the array requires.
read_array <- function(store) {
  meta <- parse_meta(store_read(store, "zarr.json"))
  n_row <- meta$dim[[1L]]
  n_col <- meta$dim[[2L]]
  chunk_rows <- meta$chunk_dim[[1L]]
  chunk_cols <- meta$chunk_dim[[2L]]
  stopifnot(chunk_rows >= 1L, chunk_cols >= 1L)

  out <- matrix(NA_integer_, nrow = n_row, ncol = n_col)

  # Iterate over one-based chunk indices. seq_len() yields an empty sequence
  # for a zero-extent dimension, whereas seq(1L, 0L, by = k) raises
  # "wrong sign in 'by' argument".
  for (ci in seq_len(ceiling(n_row / chunk_rows))) {
    rows <- seq.int((ci - 1L) * chunk_rows + 1L, min(ci * chunk_rows, n_row))
    for (cj in seq_len(ceiling(n_col / chunk_cols))) {
      cols <- seq.int((cj - 1L) * chunk_cols + 1L, min(cj * chunk_cols, n_col))
      key <- chunk_key(ci, cj)
      n_values <- length(rows) * length(cols)

      chunk_bytes <- store_read(store, key, mode = "raw")
      values <- decode_i32(chunk_bytes, n_values)
      # A short chunk would otherwise be recycled silently by the assignment
      # below and produce wrong data instead of an error.
      if (length(values) != n_values) {
        stop(
          sprintf(
            "chunk '%s' decoded to %d values, expected %d",
            key, length(values), n_values
          ),
          call. = FALSE
        )
      }
      out[rows, cols] <- matrix(values, nrow = length(rows))
    }
  }
  out
}

# 4 x 4 integer matrix holding 1..16, filled column by column.
x <- matrix(seq_len(16L), nrow = 4L)
# With the default 2 x 2 chunks this writes the metadata object plus four
# chunk objects (c/0/0, c/0/1, c/1/0, c/1/1).
write_array(store, x)
# List every entry under the store; on this backend the result also includes
# directory entries ("c/", "c/0/", "c/1/").
store_list(store, recursive = TRUE)
#> [[1]]
#> [[1]]$path
#> [1] "c/0/0"
#> 
#> [[1]]$type
#> [1] "file"
#> 
#> [[1]]$size
#> [1] 16
#> 
#> [[1]]$etag
#> NULL
#> 
#> [[1]]$last_modified
#> [1] "2026-06-12T11:05:13.056713112Z"
#> 
#> [[1]]$version
#> NULL
#> 
#> [[1]]$content_type
#> NULL
#> 
#> [[1]]$content_encoding
#> NULL
#> 
#> 
#> [[2]]
#> [[2]]$path
#> [1] "c/0/1"
#> 
#> [[2]]$type
#> [1] "file"
#> 
#> [[2]]$size
#> [1] 16
#> 
#> [[2]]$etag
#> NULL
#> 
#> [[2]]$last_modified
#> [1] "2026-06-12T11:05:13.059713192Z"
#> 
#> [[2]]$version
#> NULL
#> 
#> [[2]]$content_type
#> NULL
#> 
#> [[2]]$content_encoding
#> NULL
#> 
#> 
#> [[3]]
#> [[3]]$path
#> [1] "c/0/"
#> 
#> [[3]]$type
#> [1] "dir"
#> 
#> [[3]]$size
#> [1] 4096
#> 
#> [[3]]$etag
#> NULL
#> 
#> [[3]]$last_modified
#> [1] "2026-06-12T11:05:13.059713192Z"
#> 
#> [[3]]$version
#> NULL
#> 
#> [[3]]$content_type
#> NULL
#> 
#> [[3]]$content_encoding
#> NULL
#> 
#> 
#> [[4]]
#> [[4]]$path
#> [1] "c/1/0"
#> 
#> [[4]]$type
#> [1] "file"
#> 
#> [[4]]$size
#> [1] 16
#> 
#> [[4]]$etag
#> NULL
#> 
#> [[4]]$last_modified
#> [1] "2026-06-12T11:05:13.060713219Z"
#> 
#> [[4]]$version
#> NULL
#> 
#> [[4]]$content_type
#> NULL
#> 
#> [[4]]$content_encoding
#> NULL
#> 
#> 
#> [[5]]
#> [[5]]$path
#> [1] "c/1/1"
#> 
#> [[5]]$type
#> [1] "file"
#> 
#> [[5]]$size
#> [1] 16
#> 
#> [[5]]$etag
#> NULL
#> 
#> [[5]]$last_modified
#> [1] "2026-06-12T11:05:13.062343071Z"
#> 
#> [[5]]$version
#> NULL
#> 
#> [[5]]$content_type
#> NULL
#> 
#> [[5]]$content_encoding
#> NULL
#> 
#> 
#> [[6]]
#> [[6]]$path
#> [1] "c/1/"
#> 
#> [[6]]$type
#> [1] "dir"
#> 
#> [[6]]$size
#> [1] 4096
#> 
#> [[6]]$etag
#> NULL
#> 
#> [[6]]$last_modified
#> [1] "2026-06-12T11:05:13.062343071Z"
#> 
#> [[6]]$version
#> NULL
#> 
#> [[6]]$content_type
#> NULL
#> 
#> [[6]]$content_encoding
#> NULL
#> 
#> 
#> [[7]]
#> [[7]]$path
#> [1] "c/"
#> 
#> [[7]]$type
#> [1] "dir"
#> 
#> [[7]]$size
#> [1] 4096
#> 
#> [[7]]$etag
#> NULL
#> 
#> [[7]]$last_modified
#> [1] "2026-06-12T11:05:13.060713219Z"
#> 
#> [[7]]$version
#> NULL
#> 
#> [[7]]$content_type
#> NULL
#> 
#> [[7]]$content_encoding
#> NULL
#> 
#> 
#> [[8]]
#> [[8]]$path
#> [1] "zarr.json"
#> 
#> [[8]]$type
#> [1] "file"
#> 
#> [[8]]$size
#> [1] 33
#> 
#> [[8]]$etag
#> NULL
#> 
#> [[8]]$last_modified
#> [1] "2026-06-12T11:05:13.055713086Z"
#> 
#> [[8]]$version
#> NULL
#> 
#> [[8]]$content_type
#> NULL
#> 
#> [[8]]$content_encoding
#> NULL
# Round trip: reassemble the matrix from the metadata object and the chunks.
read_array(store)
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    5    9   13
#> [2,]    2    6   10   14
#> [3,]    3    7   11   15
#> [4,]    4    8   12   16

Add an explicit cache

A cache is another store wrapper. Complete chunk-key reads can be cached; partial range reads still go to the parent store.

# Wrap the store in a cache located at a fresh temporary path.
# NOTE(review): validate = "last_modified_size" presumably compares the
# parent's last-modified time and size before serving a cached entry --
# confirm against the store_cache() documentation.
cached <- store_cache(store, tempfile("ropendal-zarr-cache-"), validate = "last_modified_size")
# The cached wrapper is passed to read_array() exactly like the plain store.
read_array(cached)
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    5    9   13
#> [2,]    2    6   10   14
#> [3,]    3    7   11   15
#> [4,]    4    8   12   16

# Mutating the parent changes validation metadata, so the cached read refreshes.
# The replacement chunk is again four integers (16 bytes), the same size as the
# original, so presumably the changed last_modified value is what invalidates
# the cached entry here -- TODO confirm.
store_replace(store, "c/0/0", encode_i32(rep(99L, 4L)))
#> [1] TRUE
read_array(cached)
#>      [,1] [,2] [,3] [,4]
#> [1,]   99   99    9   13
#> [2,]   99   99   10   14
#> [3,]    3    7   11   15
#> [4,]    4    8   12   16

The example is intentionally small. Real Zarr support would add metadata schema handling, chunk codecs, fill values, dimension names, and array slicing above this byte-store layer rather than inside it.

- A small explicit format layer
- Add an explicit cache