Indexed range readers

Many scientific formats are a large byte object plus a small index. Ropendal’s range request objects keep that pattern in the byte layer: build offsets and sizes in R, then submit the same fs_read() or fs_read_bytes_aio() call you would use for any other object.

This vignette uses a tiny VCF-like text file. It is not a VCF parser; the parsing is deliberately just a few explicit lines layered on top of the byte reads.

library(Ropendal)

# Create a fresh temporary directory and use it as the root of the operator.
root <- tempfile(pattern = "ropendal-vcf-")
dir.create(root)
# "fs" is the scheme passed to opendal(); `root` is the directory made above.
fs <- opendal("fs", root = root)

Write an object and an index

length(charToRaw(x)) gives the size of a string in bytes; for ASCII fixtures such as this one, that is also its character count. Real index formats such as FAI, Tabix, BAI, or custom tile indexes would usually be read from their own files instead of being constructed in memory.

# Two header lines: the file-format declaration and the column names.
header <- c(
  "##fileformat=VCFv4.3\n",
  "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
)
# Four tab-separated records, each terminated by a newline.
records <- c(
  "chr1\t10\t.\tA\tG\t50\tPASS\t.\n",
  "chr1\t20\t.\tC\tT\t51\tPASS\t.\n",
  "chr2\t15\t.\tG\tA\t52\tPASS\t.\n",
  "chr1\t30\t.\tT\tC\t53\tPASS\t.\n"
)

# Join header and records into one string, then write its bytes to "toy.vcf".
vcf_text <- paste0(c(header, records), collapse = "")
payload <- charToRaw(vcf_text)
fs_write(fs, "toy.vcf", payload)
#> [1] TRUE

# Byte size of each record. USE.NAMES = FALSE matters: vapply() over a
# character vector would otherwise name every element with the record text,
# and data.frame() would turn those names into (shifted, misleading) row names.
record_sizes <- vapply(
  records,
  function(x) length(charToRaw(x)),
  integer(1),
  USE.NAMES = FALSE
)
# Records start right after the header, so the header's byte size is the
# offset of the first record.
header_size <- length(charToRaw(paste0(header, collapse = "")))
index <- data.frame(
  chrom = c("chr1", "chr1", "chr2", "chr1"),
  pos = c(10L, 20L, 15L, 30L),
  # Each record begins where the previous one ended: header size plus the
  # running total of all preceding record sizes.
  offset = header_size + cumsum(c(0L, head(record_sizes, -1L))),
  size = record_sizes
)
index
#>   chrom pos offset size
#> 1  chr1  10     60   24
#> 2  chr1  20     84   24
#> 3  chr2  15    108   24
#> 4  chr1  30    132   24

Read selected records by range

byte_ranges() packages path, offset, size, and optional row identifiers without adding another read verb.

# Select index rows on chr1 at position 20 or later.
keep <- index$chrom == "chr1" & index$pos >= 20L
hits <- index[keep, ]
# Label each range as "<chrom>:<pos>"; the labels come back as element names.
ids <- paste0(hits$chrom, ":", hits$pos)
req <- byte_ranges("toy.vcf", hits$offset, size = hits$size, id = ids)

# The request object is passed to the ordinary fs_read() call.
raw_records <- fs_read(fs, req)
names(raw_records)
#> [1] "chr1:20" "chr1:30"
vapply(raw_records, rawToChar, character(1))
#>                            chr1:20                            chr1:30 
#> "chr1\t20\t.\tC\tT\t51\tPASS\t.\n" "chr1\t30\t.\tT\tC\t53\tPASS\t.\n"

Parsing remains explicit and format-specific.

# Parse raw VCF-like records into a data frame.
#
# `records` is a list (or other vapply-able collection) of raw vectors, one
# per record line. Each is converted to text, stripped of surrounding
# whitespace, and split on tabs. Fields 1, 2, 4 and 5 become the columns
# chrom, pos (integer), ref and alt. Names on `records` are carried through
# and end up as the row names of the result.
parse_vcf_records <- function(records) {
  lines <- vapply(records, rawToChar, character(1))
  fields <- strsplit(trimws(lines), "\t", fixed = TRUE)

  # Pull the i-th tab-separated field out of every record.
  field_at <- function(i) {
    vapply(fields, function(parts) parts[[i]], character(1))
  }

  data.frame(
    chrom = field_at(1L),
    pos = as.integer(field_at(2L)),
    ref = field_at(4L),
    alt = field_at(5L)
  )
}
# Parse the records that were read by range earlier; the request ids show up
# as row names.
parse_vcf_records(raw_records)
#>         chrom pos ref alt
#> chr1:20  chr1  20   C   T
#> chr1:30  chr1  30   T   C

Async and byte handles

The same request object works with fs_read_bytes_aio(). Collection returns OpendalBytes handles; raw-vector materialization is still explicit.

# Submit the same range request through the async reader, then collect it.
aio <- fs_read_bytes_aio(fs, req)
bytes <- collect_aio(aio)
# Materialize each collected element as a raw vector before decoding to text.
vapply(bytes, function(handle) rawToChar(as.raw(handle)), character(1))
#>                            chr1:20                            chr1:30 
#> "chr1\t20\t.\tC\tT\t51\tPASS\t.\n" "chr1\t30\t.\tT\tC\t53\tPASS\t.\n"

Range readers built this way can later move hot parsing paths to C or Rust while keeping the same byte ownership boundary: background I/O resolves bytes, and the caller decides when and how to materialize or parse them.