Source code for zcollection.collection

# Copyright (c) 2023 CNES
#
# All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.
"""
Collection of Zarr groups
=========================
"""
from __future__ import annotations

from typing import Any, Iterable, Iterator, Literal, NoReturn, Sequence
import datetime
import functools
import io
import json
import logging
import os
import types
import warnings

import dask.bag.core
import dask.distributed
import dask.utils
import fsspec
import xarray

from .. import (
    dask_utils,
    dataset,
    fs_utils,
    merging,
    meta,
    partitioning,
    storage,
    sync,
)
from .abc import PartitionFilter, ReadOnlyCollection
from .callable_objects import UpdateCallable, WrappedPartitionCallable
from .detail import (
    _insert,
    _try_infer_callable,
    _wrap_update_func,
    _wrap_update_func_with_overlap,
)

#: Module logger.
_LOGGER: logging.Logger = logging.getLogger(__name__)


def _write_immutable_dataset(
    zds: dataset.Dataset,
    axis: str,
    path: str,
    fs: fsspec.AbstractFileSystem,
    synchronizer: sync.Sync,
) -> None:
    """Write the immutable dataset.

    Args:
        zds: The dataset to write.
        axis: The partitioning axis.
        path: The path to the immutable dataset.
        fs: The file system on which the dataset is stored.
        synchronizer: The synchronizer to use when writing the dataset.
    """
    immutable_dataset = zds.select_variables_by_dims((axis, ), predicate=False)
    assert len(immutable_dataset.variables) != 0, (
        'The dataset to insert does not contain any variable '
        'that is not split.')
    _LOGGER.info('Creating the immutable dataset: %s', path)
    storage.write_zarr_group(immutable_dataset,
                             path,
                             fs,
                             synchronizer,
                             distributed=False)


def _infer_callable(
    collection: Collection,
    func: UpdateCallable,
    filters: PartitionFilter | None,
    delayed: bool,
    selected_variables: Iterable[str] | None,
    *args,
    **kwargs,
) -> tuple[str, ...]:
    """Infer the variables updated by the user-provided callable.

    The callable is applied to a single partition selected by the filters;
    the keys of the returned dictionary give the names of the updated
    variables.

    Args:
        collection: The collection to update.
        func: The callable to apply.
        filters: The predicate used to select the partition used for the
            inference.
        delayed: Whether to load data in a dask array or not.
        selected_variables: The variables to load from the partition. If
            None, all variables are loaded.
        *args: The positional arguments to pass to the callable.
        **kwargs: The keyword arguments to pass to the callable.

    Returns:
        The names of the updated variables, or an empty tuple if the
        collection contains no partition.

    Raises:
        ValueError: If the callable returns variables that are not part of
            the collection.
    """
    try:
        one_partition: str = next(collection.partitions(filters=filters))
    except StopIteration:
        return ()

    with collection.synchronizer:
        zds: dataset.Dataset = storage.open_zarr_group(
            one_partition,
            collection.fs,
            delayed=delayed,
            selected_variables=selected_variables)
    func_result: dict[str, Any]
    func_result = _try_infer_callable(func, zds,
                                      collection.partition_properties.dim,
                                      *args, **kwargs)
    unknown_variables: set[str] = set(func_result) - set(
        collection.metadata.variables.keys())
    if len(unknown_variables):
        raise ValueError(f'Unknown variables: {unknown_variables}')
    return tuple(func_result)


def _check_partition(
    partition: str,
    fs: fsspec.AbstractFileSystem,
    partitioning: partitioning.Partitioning,
) -> tuple[str, bool]:
    """Check if a given partition is a valid Zarr group.

    Args:
        partition: The partition to check.
        fs: The file system to use.
        partitioning: The partitioning strategy.

    Returns:
        A tuple containing the partition and a boolean indicating whether it is
        a valid Zarr group.
    """
    try:
        partitioning.parse(partition)
    except ValueError:
        return partition, False
    return partition, storage.check_zarr_group(partition, fs)


class Collection(ReadOnlyCollection):
    """This class manages a collection of files in Zarr format stored in a
    set of subdirectories. These subdirectories split the data, by cycles or
    dates for example, in order to optimize access and updates, deletion or
    addition of new data.

    Args:
        axis: The axis of the collection. This is the dimension along which
            the data is partitioned.
        ds: The dataset containing the collection. This dataset is used to
            create the metadata of the collection, which is used to validate
            datasets that are inserted in the collection.
        partition_handler: The partitioning strategy for the collection.
            This is an instance of a subclass of
            :py:class:`zarr_partitioning.PartitionHandler`.
        partition_base_dir: The base directory for the collection. This is
            the directory where the subdirectories containing the partitioned
            data are stored.
        mode: The mode of the collection. This can be either 'r' (read-only)
            or 'w' (write). In read-only mode, the collection can only be
            read and no data can be inserted or modified. In write mode, the
            collection can be read and modified.
        filesystem: The filesystem to use for the collection. This is an
            instance of a subclass of :py:class:`fsspec.AbstractFileSystem`.
        synchronizer: The synchronizer to use for the collection. This is an
            instance of a subclass of
            :py:class:`zarr_synchronizer.Synchronizer`.

    Raises:
        ValueError: If the axis does not exist in the dataset, if the
            partition key is not defined in the dataset or if the access
            mode is not supported.

    Notes:
        Normally, this class is not instantiated directly but through the
        :py:meth:`create_collection <zcollection.create_collection>` and
        :py:meth:`open_collection <zcollection.open_collection>` methods of
        this library.
    """

    def __init__(
        self,
        axis: str,
        ds: meta.Dataset,
        partition_handler: partitioning.Partitioning,
        partition_base_dir: str,
        *,
        mode: Literal['r', 'w'] | None = None,
        filesystem: fsspec.AbstractFileSystem | str | None = None,
        synchronizer: sync.Sync | None = None,
    ) -> None:
        super().__init__(axis=axis,
                         ds=ds,
                         partition_handler=partition_handler,
                         partition_base_dir=partition_base_dir,
                         mode=mode,
                         filesystem=filesystem,
                         synchronizer=synchronizer)

        if self.mode == 'r':
            # pylint: disable=method-hidden
            # These methods are overloaded when the collection is opened in
            # read-only mode.
            self._read_only_mode()
            # pylint: enable=method-hidden
        else:
            self._write_config(skip_if_exists=True)

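    # Usage sketch (illustrative, hypothetical path): as noted in the class
    # docstring, collections are normally created or opened through the
    # top-level helpers rather than by instantiating this class directly:
    #
    #     import zcollection
    #     collection = zcollection.open_collection(
    #         "/tmp/my_collection", mode="w")
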
    def __str__(self) -> str:
        return (f'<{self.__class__.__name__} '
                f'filesystem={self.fs.__class__.__name__!r}, '
                f'partition_base_dir={self.partition_properties.dir!r}, '
                f'mode={self.mode!r}>')

    @staticmethod
    def _unsupported_operation(*args, **kwargs) -> NoReturn:
        """Raise an exception if the operation is not supported."""
        raise io.UnsupportedOperation('not writable')

    def _read_only_mode(self) -> None:
        """Set the unsupported methods to raise an exception when the
        collection is opened in read-only mode."""
        # Set each unsupported method to raise an exception.
        for item in [
                'add_variable',
                'drop_partitions',
                'drop_variable',
                'insert',
                'update',
        ]:
            assert hasattr(self, item), f'{item} is not a known method.'
            setattr(self, item,
                    types.MethodType(Collection._unsupported_operation, self))

    def _write_config(self, skip_if_exists: bool = False) -> None:
        """Write the configuration file."""
        base_dir: str = self.partition_properties.dir
        config: str = self._config(base_dir)
        exists: bool = self.fs.exists(config)
        if exists and skip_if_exists:
            return

        _LOGGER.info(
            "Updating collection's configuration: %s"
            if exists else 'Creating the collection: %s', config)

        self.fs.makedirs(base_dir, exist_ok=True)

        params = {
            'axis': self.axis,
            'dataset': self.metadata.get_config(),
            'partitioning': self.partitioning.get_config(),
        }

        with self.fs.open(config, mode='w') as stream:
            json.dump(params, stream, indent=4)  # type: ignore[arg-type]

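    # The configuration written above is a small JSON document whose keys
    # mirror the ``params`` dictionary (values shown here are illustrative
    # only):
    #
    #     {
    #         "axis": "time",
    #         "dataset": {...},       # self.metadata.get_config()
    #         "partitioning": {...}   # self.partitioning.get_config()
    #     }
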
    def is_readonly(self) -> bool:
        """Return True if the collection is read-only."""
        return self.mode == 'r'

    @classmethod
    def from_config(
        cls,
        path: str,
        *,
        mode: Literal['r', 'w'] | None = None,
        filesystem: fsspec.AbstractFileSystem | str | None = None,
        synchronizer: sync.Sync | None = None,
    ) -> Collection:
        """Open a Collection described by a configuration file.

        Args:
            path: The path to the configuration file.
            mode: The mode of the collection. This can be either 'r'
                (read-only) or 'w' (write).
            filesystem: The filesystem to use for the collection. This is an
                instance of a subclass of
                :py:class:`fsspec.AbstractFileSystem`.
            synchronizer: The synchronizer to use for the collection. This is
                an instance of a subclass of
                :py:class:`zarr_synchronizer.Synchronizer`.

        Returns:
            The collection.

        Raises:
            ValueError: If the provided directory does not contain a valid
                collection configuration file.
        """
        _LOGGER.info('Opening collection: %r', path)
        fs: fsspec.AbstractFileSystem = fs_utils.get_fs(filesystem)
        config: str = cls._config(path)
        if not fs.exists(config):
            raise ValueError(f'zarr collection not found at path {path!r}')
        with fs.open(config) as stream:
            data: dict[str, Any] = json.load(stream)
        return Collection(
            data['axis'],
            meta.Dataset.from_config(data['dataset']),
            partitioning.get_codecs(data['partitioning']),
            path,
            mode=mode or 'r',
            filesystem=fs,
            synchronizer=synchronizer,
        )

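    # Usage sketch (illustrative, hypothetical path): ``from_config`` reads
    # the configuration file written by ``_write_config`` and rebuilds the
    # metadata and partitioning objects before returning the collection:
    #
    #     collection = Collection.from_config("/tmp/my_collection", mode="r")
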
    # pylint: disable=method-hidden
    def insert(
        self,
        ds: xarray.Dataset | dataset.Dataset,
        *,
        merge_callable: merging.MergeCallable | None = None,
        npartitions: int | None = None,
        validate: bool = False,
        **kwargs,
    ) -> Iterable[str]:
        """Insert a dataset into the collection.

        Args:
            ds: The dataset to insert. It can be either an xarray.Dataset or
                a dataset.Dataset object.
            merge_callable: A function used to merge the data already stored
                in a partition with the new partitioned data. If None, the
                new partitioned data overwrites the existing partitioned
                data.
            npartitions: The maximum number of partitions to process in
                parallel. By default, partitions are processed one by one.

                .. note::

                    When inserting partitions, Dask parallelizes the writing
                    of each partition across its workers. Additionally, the
                    writing of variables within a partition is parallelized
                    on the worker responsible for inserting that partition,
                    using multiple threads. If you're using a single Dask
                    worker, partition insertion will happen sequentially and
                    changing this parameter will have no effect.
            validate: Whether to validate dataset metadata before insertion
                or not.
            kwargs: Additional keyword arguments passed to the merge
                callable.

        Returns:
            A list of the inserted partitions.

        Raises:
            ValueError: If the dataset does not match the definition of the
                collection.

        Warns:
            UserWarning: If two different partitions use the same file
                (chunk), the library that handles the storage of chunked
                arrays (HDF5, NetCDF, Zarr, etc.) must be compatible with
                concurrent access.

        Notes:
            Each worker will process a set of independent partitions.
            However, be careful: two different partitions can use the same
            file (chunk), therefore the library that handles the storage of
            chunked arrays (HDF5, NetCDF, Zarr, etc.) must be compatible
            with concurrent access.
        """
        # pylint: disable=method-hidden
        if isinstance(ds, xarray.Dataset):
            ds = dataset.Dataset.from_xarray(ds)

        _LOGGER.info('Inserting a %s dataset into the collection',
                     dask.utils.format_bytes(ds.nbytes))

        missing_variables: tuple[str, ...] = self.metadata.missing_variables(
            ds.metadata())
        for item in missing_variables:
            variable: meta.Variable = self.metadata.variables[item]
            ds.add_variable(variable)

        if validate and ds.metadata() != self.metadata:
            raise ValueError(
                "Provided dataset's metadata do not match the collection's ones"
            )

        ds = ds.set_for_insertion(self.metadata)

        client: dask.distributed.Client = dask_utils.get_client()

        # If the dataset contains variables that should not be partitioned.
        if self._immutable is not None:
            # On the first call, we store the immutable variables in a
            # directory located at the root of the collection.
            if not self.fs.exists(self._immutable):
                _write_immutable_dataset(ds, self.axis, self._immutable,
                                         self.fs, self.synchronizer)
            # Remove the variables that should not be partitioned.
            ds = ds.select_variables_by_dims((self.axis, ))

        # Process the partitions to insert or update in batches to avoid
        # memory issues.
        partitions = tuple(
            self.partitioning.split_dataset(ds,
                                            self.partition_properties.dim))
        if npartitions is not None:
            if npartitions < 1:
                raise ValueError('The number of partitions must be positive')
            npartitions = len(partitions) // npartitions + 1

        scattered_ds: Any = client.scatter(ds)
        for sequence in dask_utils.split_sequence(partitions, npartitions):
            futures: list[dask.distributed.Future] = [
                dask_utils.simple_delayed('insert', _insert)(
                    args=partition,  # type: ignore[arg-type]
                    axis=self.axis,
                    zds=scattered_ds,
                    fs=self.fs,
                    merge_callable=merge_callable,
                    partitioning_properties=self.partition_properties,
                    **kwargs,
                ) for partition in sequence
            ]
            storage.execute_transaction(client, self.synchronizer, futures)

        return (fs_utils.join_path(*((self.partition_properties.dir, ) +
                                     item)) for item, _ in partitions)

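    # Usage sketch (illustrative): assuming ``collection`` is opened in write
    # mode and ``ds`` is an ``xarray.Dataset`` matching the collection
    # metadata, ``insert`` yields the paths of the written partitions:
    #
    #     inserted = list(collection.insert(ds, npartitions=2))
    #     for path in inserted:
    #         print(path)
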
    # pylint: disable=method-hidden
    def drop_partitions(
        self,
        *,
        filters: PartitionFilter = None,
        timedelta: datetime.timedelta | None = None,
    ) -> Iterable[str]:  # pylint: disable=method-hidden
        """Drop the selected partitions.

        Args:
            filters: The predicate used to filter the partitions to drop. To
                get more information on the predicate, see the documentation
                of the :meth:`partitions` method.
            timedelta: Select the partitions created before the specified
                time delta relative to the current time.

        Returns:
            A list of the dropped partitions.

        Example:
            >>> collection.drop_partitions(filters="year == 2019")
            >>> collection.drop_partitions(
            ...     timedelta=datetime.timedelta(days=30))
        """
        now: datetime.datetime = datetime.datetime.now()
        client: dask.distributed.Client = dask_utils.get_client()
        folders = list(self.partitions(filters=filters, lock=True))

        # No partition selected, nothing to do.
        if not folders:
            return folders

        def is_created_before(path: str, now: datetime.datetime,
                              timedelta: datetime.timedelta) -> bool:
            """Return whether the partition was created before the given
            time delta."""
            created: datetime.datetime = self.fs.created(path)
            if created.tzinfo is not None:
                created = created.replace(tzinfo=None)
            return now - created > timedelta

        if timedelta is not None:
            folders = list(
                filter(
                    lambda folder: is_created_before(  # type: ignore[arg-type]
                        folder, now, timedelta),
                    folders))

        storage.execute_transaction(
            client, self.synchronizer,
            client.map(self.fs.rm, folders, recursive=True))

        def invalidate_cache(path) -> None:
            """Invalidate the cache."""
            _LOGGER.info('Dropped partition: %s', path)
            self.fs.invalidate_cache(path)

        tuple(map(invalidate_cache, folders))
        return folders

    # pylint: disable=method-hidden
    def update(
        self,
        func: UpdateCallable,
        /,
        *args,
        delayed: bool = True,
        depth: int = 0,
        filters: PartitionFilter | None = None,
        npartitions: int | None = None,
        selected_variables: list[str] | None = None,
        trim: bool = True,
        variables: Sequence[str] | None = None,
        **kwargs,
    ) -> None:  # pylint: disable=method-hidden
        """Update the selected partitions.

        Args:
            func: The function to apply on each partition.
            *args: The positional arguments to pass to the function.
            delayed: Whether to load data in a dask array or not.
            depth: The depth of the overlap between the partitions. Default
                is 0 (no overlap). If depth is greater than 0, the function
                is applied on the partition and its neighbors selected by the
                depth. If ``func`` accepts a ``partition_info`` keyword
                argument, it is passed a tuple containing the name of the
                partitioned dimension and the slice that selects the current
                partition in the dataset.
            filters: The expression used to filter the partitions to update.
            npartitions: The number of partitions to update in parallel. By
                default, it is equal to the number of Dask workers available
                when calling this method.
            selected_variables: A list of variables to load from the
                collection. If None, all variables are loaded.
            trim: Whether to trim ``depth`` items from each partition after
                calling ``func``. Set it to ``False`` if your function does
                this for you.
            variables: The list of variables updated by the function. If
                None, the variables are inferred by calling the function on
                the first partition. In this case, it is important to ensure
                that the function can be called twice on the same partition
                without side effects. Default is None.
            **kwargs: The keyword arguments to pass to the function.

        Raises:
            TypeError: If ``func`` is not callable.
            ValueError: If the variables to update are not in the collection.

        Example:
            >>> import dask.array
            >>> import zcollection
            >>> def ones(ds):
            ...     return dict(var2=ds.variables["var1"].values * 0 + 1)
            >>> collection = zcollection.open_collection(
            ...     "my_collection", mode="w")
            >>> collection.update(ones)
        """
        if not callable(func):
            raise TypeError('func must be a callable')

        variables = variables or _infer_callable(
            self, func, filters, delayed, selected_variables, *args, **kwargs)

        if not variables:
            warnings.warn('You are trying to update an empty collection.',
                          category=RuntimeWarning,
                          stacklevel=2)
            return

        # If depth is not 0, the variables updated must be in the selected
        # variables.
        if depth != 0 and selected_variables is not None:
            selected_variables += list(
                set(variables) - set(selected_variables))

        _LOGGER.info('Updating the (%s) variables in the collection',
                     ', '.join(repr(item) for item in variables))

        selected_partitions = tuple(
            self.partitions(filters=filters, lock=True))

        local_func: WrappedPartitionCallable = _wrap_update_func(
            *args,
            delayed=delayed,
            func=func,
            fs=self.fs,
            immutable=self._immutable,
            selected_variables=selected_variables,
            **kwargs) if depth == 0 else _wrap_update_func_with_overlap(
                *args,
                delayed=delayed,
                depth=depth,
                dim=self.partition_properties.dim,
                func=func,
                fs=self.fs,
                immutable=self._immutable,
                selected_partitions=selected_partitions,
                selected_variables=selected_variables,
                trim=trim,
                **kwargs)

        client: dask.distributed.Client = dask_utils.get_client()
        batches: Iterator[Sequence[str]] = dask_utils.split_sequence(
            selected_partitions, npartitions
            or dask_utils.dask_workers(client, cores_only=True))
        storage.execute_transaction(
            client, self.synchronizer,
            client.map(local_func, tuple(batches), key=func.__name__))

    def drop_variable(
        self,
        variable: str,
    ) -> None:
        """Delete the variable from the collection.

        Args:
            variable: The variable to delete.

        Raises:
            ValueError: If the variable doesn't exist in the collection or is
                used by the partitioning.

        Example:
            >>> import zcollection
            >>> collection = zcollection.open_collection(
            ...     "my_collection", mode="w")
            >>> collection.drop_variable("my_variable")
        """
        _LOGGER.info('Dropping the %r variable from the collection', variable)
        if variable in self.partitioning.variables:
            raise ValueError(
                f'The variable {variable!r} is part of the partitioning.')
        if variable not in self.metadata.variables:
            raise ValueError(
                f'The variable {variable!r} does not exist in the collection.')
        if self._immutable:
            zds: dataset.Dataset = storage.open_zarr_group(
                self._immutable, self.fs)
            if variable in zds.variables:
                raise ValueError(
                    f'The variable {variable!r} is part of the immutable '
                    'dataset.')

        client: dask.distributed.Client = dask_utils.get_client()
        bag: dask.bag.core.Bag = self._bag_from_partitions(lock=True)
        awaitables: list[
            dask.distributed.Future] = dask.distributed.futures_of(
                bag.map(storage.del_zarr_array, variable, self.fs).persist())
        storage.execute_transaction(client, self.synchronizer, awaitables)

        del self.metadata.variables[variable]
        self._write_config()

    def add_variable(
        self,
        variable: meta.Variable | dataset.Variable,
    ) -> None:
        """Add a variable to the collection.

        Args:
            variable: The variable to add.

        Raises:
            ValueError: If the variable is already part of the collection, if
                it doesn't use the partitioning dimension, or if it uses a
                dimension that is not part of the dataset.

        Example:
            >>> import zcollection
            >>> collection = zcollection.open_collection(
            ...     "my_collection", mode="w")
            >>> new_variable = meta.Variable(
            ...     name="my_variable",
            ...     dtype=numpy.dtype("int16"),
            ...     dimensions=("num_lines", "num_pixels"),
            ...     fill_value=32267,
            ...     attrs=(dataset.Attribute(name="my_attr", value=0), ),
            ... )
            >>> collection.add_variable(new_variable)
        """
        variable = dataset.get_variable_metadata(variable)
        _LOGGER.info('Adding the %r variable to the collection',
                     variable.name)
        if self.partition_properties.dim not in variable.dimensions:
            raise ValueError(
                'The new variable must use the partitioning axis.')
        self.metadata.add_variable(variable)
        self._write_config()

        # Remove the attributes from the variable. They will be added back
        # from the collection metadata.
        variable = variable.set_for_insertion()

        client: dask.distributed.Client = dask_utils.get_client()
        template: meta.Variable = self.metadata.search_same_dimensions_as(
            variable)
        chunks: dict[str, int] = {
            dim.name: dim.value
            for dim in self.metadata.chunks
        }
        try:
            bag: dask.bag.core.Bag = self._bag_from_partitions(lock=True)
            futures: list[
                dask.distributed.Future] = dask.distributed.futures_of(
                    bag.map(storage.add_zarr_array,
                            variable,
                            template.name,
                            self.fs,
                            chunks=chunks).persist())
            storage.execute_transaction(client, self.synchronizer, futures)
        except Exception:
            # Roll back the metadata change if the variable could not be
            # written to the existing partitions.
            self.drop_variable(variable.name)
            raise

    def copy(
        self,
        target: str,
        *,
        filters: PartitionFilter | None = None,
        filesystem: fsspec.AbstractFileSystem | None = None,
        mode: Literal['r', 'w'] = 'w',
        npartitions: int | None = None,
        synchronizer: sync.Sync | None = None,
    ) -> Collection:
        """Copy the collection to a new location.

        Args:
            target: The target location.
            filters: The predicate used to filter the partitions to copy.
            filesystem: The file system to use. If None, the file system of
                the collection is used.
            mode: The mode used to open the copied collection. Default is
                'w'.
            npartitions: The number of partitions to copy in parallel.
                Default is the number of cores.
            synchronizer: The synchronizer used to synchronize the copied
                collection. Default is None.

        Returns:
            The new collection.

        Example:
            >>> import zcollection
            >>> collection = zcollection.open_collection(
            ...     "my_collection", mode="r")
            >>> collection.copy(target="my_new_collection")
        """
        _LOGGER.info('Copying the collection to %r', target)
        if filesystem is None:
            filesystem = fs_utils.get_fs(target)
        client: dask.distributed.Client = dask_utils.get_client()
        npartitions = npartitions or dask_utils.dask_workers(client,
                                                             cores_only=True)

        # Sequence of (source, target) pairs to copy, split in npartitions.
        args = tuple(
            dask_utils.split_sequence(
                [(item,
                  fs_utils.join_path(
                      target,
                      os.path.relpath(item, self.partition_properties.dir)))
                 for item in self.partitions(filters=filters)], npartitions))

        # Copy the selected partitions.
        partial = functools.partial(fs_utils.copy_tree,
                                    fs_source=self.fs,
                                    fs_target=filesystem)

        def worker_task(args: Sequence[tuple[str, str]]) -> None:
            """Function called on each worker to copy the partitions."""
            tuple(map(lambda arg: partial(*arg), args))

        client.gather(client.map(worker_task, args))

        # Then copy the remaining files in the root directory (config,
        # metadata, etc.).
        fs_utils.copy_files([
            item['name']
            for item in self.fs.listdir(self.partition_properties.dir,
                                        detail=True)
            if item['type'] == 'file'
        ], target, self.fs, filesystem)

        return Collection.from_config(target,
                                      mode=mode,
                                      filesystem=filesystem,
                                      synchronizer=synchronizer)

    def validate_partitions(self,
                            filters: PartitionFilter | None = None,
                            fix: bool = False) -> list[str]:
        """Validate partitions in the collection by checking that they exist
        and are readable. If ``fix`` is True, invalid partitions are removed
        from the collection.

        Args:
            filters: The predicate used to filter the partitions to validate.
                By default, all partitions are validated.
            fix: Whether to fix invalid partitions by removing them from the
                collection.

        Returns:
            A list of invalid partitions.
        """
        partitions = tuple(self.partitions(filters=filters))
        if not partitions:
            return []
        client: dask.distributed.Client = dask_utils.get_client()
        futures: list[dask.distributed.Future] = client.map(
            _check_partition,
            partitions,
            fs=self.fs,
            partitioning=self.partitioning)
        invalid_partitions: list[str] = []
        for item in dask.distributed.as_completed(futures):
            partition, valid = item.result()  # type: ignore
            if not valid:
                warnings.warn(f'Invalid partition: {partition}',
                              category=RuntimeWarning)
                invalid_partitions.append(partition)
        if fix and invalid_partitions:
            for item in invalid_partitions:
                _LOGGER.info('Removing invalid partition: %s', item)
                self.fs.rm(item, recursive=True)
        return invalid_partitions
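
    # Usage sketch (illustrative): invalid partitions are reported with a
    # RuntimeWarning and, when ``fix=True``, removed from the storage:
    #
    #     invalid = collection.validate_partitions(fix=True)
    #     if invalid:
    #         print("Removed invalid partitions:", invalid)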