Source code for hydromt_fiat.data.fetch
"""Data for examples and testing of HydroMT-FIAT."""
import json
import logging
from pathlib import Path
from typing import Any
import pooch
import requests
from pooch.processors import ExtractorProcessor
__all__ = ["fetch_data"]
logger = logging.getLogger(__name__)
PROCESSORS = {
"tar.gz": pooch.Untar,
"zip": pooch.Unzip,
}
REMOTE_REGISTRY = "https://raw.githubusercontent.com/Deltares/hydromt_fiat/refs/heads/main/src/hydromt_fiat/data/registry.json"


def _fetch_registry(
    local_registry: bool = True,
) -> dict[str, Any]:
    """Fetch the registry."""
    # Get the data either from the local repo or the remote repo
    if local_registry:
        with open(Path(__file__).parent / "registry.json", "r") as f:
            data = f.read()
    else:
        r = requests.get(REMOTE_REGISTRY, timeout=5)
        data = r.text
    # Load the JSON data
    database = json.loads(data)
    return database
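
# For reference: fetch_data() below reads the returned mapping via
# database["url"] and database["data"], so the registry JSON is assumed to
# have roughly the shape sketched here. The names and checksum are made up
# for illustration only; the real contents live in registry.json.
#
#   {
#       "url": "https://example.com/downloads",
#       "data": {
#           "custom-data.tar.gz": "sha256:<checksum>"
#       }
#   }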


def _unpack_processor(
    suffix: str,
    extract_dir: Path | str = "./",
) -> ExtractorProcessor | None:
    """Select the right processor for unpacking, or None for an unknown suffix."""
    if suffix not in PROCESSORS:
        return None
    processor = PROCESSORS[suffix](members=None, extract_dir=extract_dir)
    return processor
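
# Illustrative calls, assuming only the PROCESSORS mapping above; the paths
# are hypothetical and merely show which processor each suffix resolves to:
#
#   _unpack_processor("zip", extract_dir="out/")     # -> pooch.Unzip(...)
#   _unpack_processor("tar.gz", extract_dir="out/")  # -> pooch.Untar(...)
#   _unpack_processor("7z", extract_dir="out/")      # -> None (unsupported)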


def fetch_data(
    data: str,
    local_registry: bool = True,
    sub_dir: bool = True,
    output_dir: Path | str | None = None,
) -> Path:
    """Fetch a dataset listed in the registry.

    Parameters
    ----------
    data : str
        Name of the dataset to fetch.
    local_registry : bool, optional
        If True, the registry is taken from the current library location.
        Otherwise, it is taken from the remote 'main' branch on GitHub.
        By default True.
    sub_dir : bool
        Whether to place the fetched data in a subdirectory of the same name,
        i.e. if the (tarred) dataset is named 'custom-data', a directory named
        'custom-data' is created in which the data are placed. By default True.
    output_dir : Path | str | None
        The output directory in which to store the data.
        If None, the data are stored in ~/.cache/hydromt_fiat/<data>.

    Returns
    -------
    Path
        The directory in which the data are stored.
    """
    # Open the registry.
    # Update the base URL and registry with new versions of the data;
    # use the create_artifact.py script to create the build-data/test-data archives.
    database = _fetch_registry(local_registry=local_registry)
    base_url: str = database["url"]
    registry: dict[str, str] = database["data"]

    # Set the cache directory, used at the very least for the downloaded archive
    cache_dir = Path("~", ".cache", "hydromt_fiat").expanduser()
    cache_dir.mkdir(parents=True, exist_ok=True)
    if output_dir is None:
        output_dir = cache_dir
    output_dir = Path(output_dir)
    if not output_dir.is_absolute():
        output_dir = Path(Path.cwd(), output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Quick check whether the data can be found
    choices_raw = list(registry.keys())
    choices = [item.split(".", 1)[0] for item in choices_raw]
    if data not in choices:
        raise ValueError(f"Unknown dataset {data!r}; choose one of: {choices}")
    idx = choices.index(data)

    # Set up Pooch
    retriever = pooch.create(
        path=cache_dir,  # store the archive in the cache
        base_url=base_url,
        registry=registry,
    )

    # Select the way of unpacking it
    suffix = choices_raw[idx].split(".", 1)[1]
    extract_dir = output_dir
    if sub_dir:
        extract_dir = Path(extract_dir, data)
    processor = _unpack_processor(suffix, extract_dir=extract_dir)

    # Retrieve the data
    retriever.fetch(choices_raw[idx], processor=processor)
    return extract_dir
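

# Minimal usage sketch (not part of the module): the dataset name
# "custom-data" and the output path are hypothetical and only illustrate the
# call signature; the name must match an entry in registry.json.
#
#   from hydromt_fiat.data.fetch import fetch_data
#
#   data_dir = fetch_data("custom-data", sub_dir=True, output_dir="./tmp")
#   # -> ./tmp/custom-data, containing the unpacked files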