# Source code for swvo.io.base

# SPDX-FileCopyrightText: 2025 GFZ Helmholtz Centre for Geosciences
#
# SPDX-License-Identifier: Apache-2.0

"""
Base class for all IO modules.
"""

import logging
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional

import pandas as pd

# Module-level logger named after this module; BaseIO.__init__ uses it to
# report the data directory it resolved.
logger = logging.getLogger(__name__)


[docs] class BaseIO(ABC): """Abstract base class for all IO classes. This base class defines the common interface for external data I/O operations, including initialization, reading, and downloading/processing data. Subclasses can implement flexible signatures for `read()` and `download_and_process()` methods to accommodate different data sources and requirements. Parameters ---------- data_dir : Path | None Data directory for storing downloaded/processed data. If not provided, it will be read from the environment variable defined by the subclass's `ENV_VAR_NAME`. Raises ------ ValueError Raises `ValueError` if necessary environment variable is not set and `data_dir` is not provided. """ ENV_VAR_NAME: str = "" # Must be set by subclasses LABEL: str = "" # Must be set by subclasses def __init__(self, data_dir: Optional[Path] = None, prefer_env_var: bool = False) -> None: """Initialize the BaseIO class. Parameters ---------- data_dir : Path | None Data directory for storing data. If not provided, it will be read from the environment variable defined by `ENV_VAR_NAME`. prefer_env_var : bool, optional If True, the environment variable takes precedence over the passed `data_dir` argument. If False (default), the passed `data_dir` is used if provided, otherwise the environment variable is used. Raises ------ ValueError If `data_dir` is None and `ENV_VAR_NAME` is not set in environment, or if `prefer_env_var` is True and `ENV_VAR_NAME` is not set. """ if prefer_env_var and self.ENV_VAR_NAME in os.environ: data_dir = Path(os.environ[self.ENV_VAR_NAME]) elif data_dir is None: if not self.ENV_VAR_NAME or self.ENV_VAR_NAME not in os.environ: raise ValueError(f"Necessary environment variable {self.ENV_VAR_NAME} not set!") data_dir = Path(os.environ[self.ENV_VAR_NAME]) self.data_dir: Path = Path(data_dir) self.data_dir.mkdir(parents=True, exist_ok=True) logger.info(f"{self.__class__.__name__} data directory: {self.data_dir}")
[docs] @abstractmethod def read(self, *args, **kwargs) -> pd.DataFrame | list[pd.DataFrame]: """Read data. Subclasses should implement this method with their specific signature. Common parameters include: - start_time: datetime Start time of the data to read. Must be timezone-aware. - end_time: datetime End time of the data to read. Must be timezone-aware. - download: bool, optional Download data on the go if not available locally. - Additional parameters specific to each data source. Returns ------- pd.DataFrame or list[pd.DataFrame] Data for the specified parameters. """ pass
[docs] @abstractmethod def download_and_process(self, *args, **kwargs) -> None: """Download and process data. Subclasses should implement this method with their specific signature. Common parameters include: - start_time: datetime Start time of the data to download. Must be timezone-aware. - end_time: datetime End time of the data to download. Must be timezone-aware. - target_date: datetime Target date for data (for single-day sources). - request_time: datetime Request time for data (for streaming sources). - reprocess_files: bool, optional If True, re-download and re-process existing files. - Additional parameters specific to each data source. Returns ------- None """ pass