Source code for pyhealth.datasets.base_signal_dataset

from typing import Dict, Optional, Tuple, Union, Callable
import os
import logging

from abc import ABC
import pandas as pd
from pandarallel import pandarallel

from pyhealth.datasets.utils import hash_str, MODULE_CACHE_PATH
from pyhealth.datasets.sample_dataset import SampleSignalDataset
from pyhealth.utils import load_pickle, save_pickle

logger = logging.getLogger(__name__)

INFO_MSG = """
dataset.patients:
    - key: patient id
    - value: recoding file paths
"""


[docs]class BaseSignalDataset(ABC):
    """Abstract base Signal dataset class.

    This abstract class defines a uniform interface for all EEG datasets
    (e.g., SleepEDF, SHHS).

    Each specific dataset will be a subclass of this abstract class, which can then
    be converted to samples dataset for different tasks by calling `self.set_task()`.

    Args:
        dataset_name: name of the dataset.
        root: root directory of the raw data (should contain many csv files).
        dev: whether to enable dev mode (only use a small subset of the data).
            Default is False.
        refresh_cache: whether to refresh the cache; if true, the dataset will
            be processed from scratch and the cache will be updated. Default is False.
    """

    def __init__(
        self,
        root: str,
        dataset_name: Optional[str] = None,
        dev: bool = False,
        refresh_cache: bool = False,
        **kwargs: Optional[Dict],
    ):

        # base attributes
        self.dataset_name = (
            self.__class__.__name__ if dataset_name is None else dataset_name
        )
        self.root = root
        self.dev = dev
        self.refresh_cache = refresh_cache

        # hash filename for cache
        args_to_hash = [self.dataset_name, root] + ["dev" if dev else "prod"]
        filename = hash_str("+".join([str(arg) for arg in args_to_hash]))
        self.filepath = os.path.join(MODULE_CACHE_PATH, filename)

        # for task-specific attributes
        self.kwargs = kwargs

        self.patients = self.process_EEG_data()

    def __str__(self):
        """Prints some information of the dataset."""
        return f"Base dataset {self.dataset_name}"

[docs]    def stat(self) -> str:
        """Returns some statistics of the base dataset."""
        lines = list()
        lines.append("")
        lines.append(f"Statistics of base dataset (dev={self.dev}):")
        lines.append(f"\t- Dataset: {self.dataset_name}")
        lines.append(f"\t- Number of patients: {len(self.patients)}")
        num_records = [len(p) for p in self.patients.values()]
        lines.append(f"\t- Number of recodings: {sum(num_records)}")
        lines.append("")
        print("\n".join(lines))
        return "\n".join(lines)

[docs]    @staticmethod
    def info():
        """Prints the output format."""
        print(INFO_MSG)

[docs]    def set_task(
        self,
        task_fn: Callable,
        task_name: Optional[str] = None,
    ) -> SampleSignalDataset:
        """Processes the base dataset to generate the task-specific sample dataset.

        This function should be called by the user after the base dataset is
        initialized. It will iterate through all patients in the base dataset
        and call `task_fn` which should be implemented by the specific task.

        Args:
            task_fn: a function that takes a single patient and returns a
                list of samples (each sample is a dict with patient_id, visit_id,
                and other task-specific attributes as key). The samples will be
                concatenated to form the sample dataset.
            task_name: the name of the task. If None, the name of the task
                function will be used.

        Returns:
            sample_dataset: the task-specific sample (Base) dataset.

        Note:
            In `task_fn`, a patient may be converted to multiple samples, e.g.,
                a patient with three visits may be converted to three samples
                ([visit 1], [visit 1, visit 2], [visit 1, visit 2, visit 3]).
                Patients can also be excluded from the task dataset by returning
                an empty list.
        """
        if task_name is None:
            task_name = task_fn.__name__

        # check if cache exists or refresh_cache is True
        if os.path.exists(self.filepath) and (not self.refresh_cache):
            """
            It obtains the signal samples (with path only) to ```self.filepath.pkl``` file
            """
            # load from cache
            logger.debug(
                f"Loaded {self.dataset_name} base dataset from {self.filepath}"
            )
            samples = load_pickle(self.filepath + ".pkl")
        else:
            """
            It stores the actual data and label to ```self.filepath/``` folder
            It also stores the signal samples (with path only) to ```self.filepath.pkl``` file
            """
            # load from raw data
            logger.debug(f"Processing {self.dataset_name} base dataset...")

            pandarallel.initialize(progress_bar=False)

            # transform dict to pandas dataframe
            if not os.path.exists(self.filepath):
                os.makedirs(self.filepath)
            patients = pd.DataFrame(self.patients.items(), columns=["pid", "records"])
            patients.records = patients.records.parallel_apply(lambda x: task_fn(x))

            samples = []
            for _, records in patients.values:
                samples.extend(records)

            # save to cache
            logger.debug(f"Saved {self.dataset_name} base dataset to {self.filepath}")
            save_pickle(samples, self.filepath + ".pkl")

        sample_dataset = SampleSignalDataset(
            samples,
            dataset_name=self.dataset_name,
            task_name=task_name,
        )
        return sample_dataset