End-to-end Walkthrough

Builds a float32 dataset on a LocalStore, partitions it, reopens the collection from disk, queries with a filter, and asserts bit-exact equality.

Run with:

python examples/ex_walkthrough.py
from pathlib import Path
import shutil
import tempfile

import numpy

import zcollection as zc

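Initialization

Set up a temporary directory for the collection. The collection lives in a sub-directory of a freshly created temporary directory; remove it with shutil.rmtree(target.parent) once the walkthrough is finished.

target = Path(tempfile.mkdtemp(prefix="zc-walkthrough-")) / "collection"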

Build a schema

Declare dimensions and variables with their dtypes and chunk sizes.

# Assemble the schema one declaration at a time: dimensions first, then the
# variables laid out on them.
schema_builder = zc.Schema()

# ``time`` is declared with a chunk size only; ``x_ac`` has a fixed size of
# 240 and is stored as a single chunk along that dimension.
schema_builder = schema_builder.with_dimension("time", chunks=4096)
schema_builder = schema_builder.with_dimension("x_ac", size=240, chunks=240)

# Two 1-D int64 variables along ``time``.
schema_builder = schema_builder.with_variable(
    "time", dtype="int64", dimensions=("time",)
)
schema_builder = schema_builder.with_variable(
    "partition", dtype="int64", dimensions=("time",)
)

# One 2-D float32 variable whose fill value is NaN.
schema_builder = schema_builder.with_variable(
    "ssh",
    dtype="float32",
    dimensions=("time", "x_ac"),
    fill_value=numpy.float32("nan"),
)

schema = schema_builder.build()

Build a sample dataset

Create a Dataset of 100,000 synthetic rows split evenly across 4 partitions (25,000 rows each).

# Synthetic payload: N_PARTITIONS contiguous runs of ROWS_PER_PARTITION rows.
N_PARTITIONS = 4
ROWS_PER_PARTITION = 25_000
n = N_PARTITIONS * ROWS_PER_PARTITION

# Seeded generator so the random field is identical from run to run.
rng = numpy.random.default_rng(42)

# ``time`` is simply the row index 0 .. n-1.
time = numpy.arange(n, dtype=numpy.int64)

# Partition id of each row: 0 repeated ROWS_PER_PARTITION times, then 1, ...
partition = numpy.arange(N_PARTITIONS, dtype=numpy.int64).repeat(
    ROWS_PER_PARTITION
)

# Gaussian noise: one row of 240 float32 samples per time step.
ssh = rng.standard_normal(size=(n, 240), dtype=numpy.float32)

# Pair each variable name with its array, in the order the schema declares
# them, then wrap every array together with its schema entry.
arrays = {"time": time, "partition": partition, "ssh": ssh}
ds = zc.Dataset(
    schema=schema,
    variables={
        name: zc.Variable(schema.variables[name], values)
        for name, values in arrays.items()
    },
)

# Show the dataset summary and the raw size of ``ssh`` in decimal megabytes.
print(f"dataset: {ds}  ({ds['ssh'].to_numpy().nbytes / 1e6:.1f} MB ssh)")
dataset: <zcollection.data.dataset.Dataset '/'> Size: 93.08 MB
  Dimensions: (time: 100000, x_ac: 240)
Data variables:
    time      (time)                   int64       781.25 kB  numpy.ndarray<size=781.25 kB>
    partition (time)                   int64       781.25 kB  numpy.ndarray<size=781.25 kB>
    ssh       (time, x_ac)             float32      91.55 MB  numpy.ndarray<size=91.55 MB>  (96.0 MB ssh)

Create the collection

create_collection() writes the schema to disk and returns a writable Collection. The Sequence partitioner splits rows by the partition variable.

# Location of the collection, expressed as a ``file://`` URL.
store_url = f"file://{target}"

# Partition on the ``partition`` variable, along the ``time`` dimension.
partitioner = zc.partitioning.Sequence(("partition",), dimension="time")

collection = zc.create_collection(
    store_url,
    schema=schema,
    axis="time",
    partitioning=partitioner,
)

Insert data

Rows are automatically routed to the correct partition on disk.

# Insert the whole dataset in one call; ``written`` holds the keys of the
# partitions that were written.
written = collection.insert(ds)
n_written = len(written)
print(f"wrote {n_written} partitions: {written}")
wrote 4 partitions: ['partition=0', 'partition=1', 'partition=2', 'partition=3']

Reopen and query

Reopen read-only and load the full dataset back; assert bit-exact equality.

# Open the same on-disk collection again, this time in read-only mode.
reopened = zc.open_collection(f"file://{target}", mode="r")
print(f"reopened: axis={reopened.axis} parts={list(reopened.partitions())}")

# A query without filters loads everything back; each checked variable must
# compare equal, element for element, to what was inserted.
full = reopened.query()
for name in ("time", "ssh"):
    assert numpy.array_equal(full[name].to_numpy(), ds[name].to_numpy())
print("bit-exact round-trip: OK")
reopened: axis=time parts=['partition=0', 'partition=1', 'partition=2', 'partition=3']
bit-exact round-trip: OK

Filter pushdown

Filters are evaluated against partition keys; only matching partitions are read from disk.

# Restrict the query to a single partition key.
sub = reopened.query(filters="partition == 2")

# Every returned row must carry partition id 2, and there must be exactly one
# partition's worth of rows.
observed = sub["partition"].to_numpy().tolist()
expected = [2] * ROWS_PER_PARTITION
assert observed == expected
print("filter pushdown: OK")
filter pushdown: OK

Gallery generated by Sphinx-Gallery