Source code for pyhealth.datasets.isruc

import subprocess
import shutil
import os
from typing import List
from urllib.request import urlretrieve

from pyhealth.datasets import BaseSignalDataset

DEV_NUM_PAT = 5
FULL_NUM_PAT = 100


def _download_file(online_filepath, local_filepath, refresh_cache=False):
    if (not os.path.exists(local_filepath)) or refresh_cache:
        urlretrieve(online_filepath, local_filepath)
    return local_filepath


def _unrar_function(rar_path, dst_path):
    if os.name == "nt":
        try:
            import patoolib
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Please install patool to download ISRUC data. \
            You might need to have 7z/rar/unrar installed as well."
            )

        patoolib.extract_archive(rar_path, outdir=dst_path)
    else:
        # Linux, we use 7zzs, which can be downloaded from https://www.7-zip.org/download.html
        path_7z = shutil.which("7zzs")
        assert (
            path_7z is not None
        ), "Please download 7z for linux, reference: https://www.7-zip.org/download.html"
        subprocess.call([path_7z, "x", rar_path, f"-o{dst_path}"])


def _download_ISRUC_group1(data_dir: str, dev: bool, exclude_subjects: List[int]):
    """Download all group 1 data for ISRUC.

    Args:
        data_dir (str):
            path to download the data.
        exclude_subjects (List[int]):
            List of subjects to exclude.
    Returns:
        raw_dir: directory the dataset is extracted to (in data_dir).
    """
    rar_dir = os.path.join(data_dir, "rar_files")
    raw_dir = os.path.join(data_dir, "raw")
    for _ in [rar_dir, raw_dir]:
        if not os.path.isdir(_):
            os.makedirs(_)
    exclude_subjects = set(exclude_subjects)

    NUM_PAT = DEV_NUM_PAT if dev else FULL_NUM_PAT
    print(f"Downloading ISRUC Group 1 data, the first {NUM_PAT} patients")

    for subject_id in range(1, NUM_PAT + 1):
        if subject_id in exclude_subjects:
            continue
        if os.path.isfile(os.path.join(raw_dir, f"{subject_id}/{subject_id}.edf")):
            continue
        rar_url = f"http://dataset.isr.uc.pt/ISRUC_Sleep/subgroupI/{subject_id}.rar"
        rar_dst = os.path.join(rar_dir, f"{subject_id}.rar")
        _download_file(rar_url, rar_dst)
        _unrar_function(rar_dst, raw_dir)
        os.rename(
            os.path.join(raw_dir, f"{subject_id}/{subject_id}.rec"),
            os.path.join(raw_dir, f"{subject_id}/{subject_id}.edf"),
        )
    return raw_dir


[docs]class ISRUCDataset(BaseSignalDataset):
    """Base EEG dataset for ISRUC Group I.

    Dataset is available at https://sleeptight.isr.uc.pt/

        - The EEG signals are sampled at 200 Hz.
        - There are 100 subjects in the orignal dataset.
        - Each subject's data is about a night's sleep.


    Args:
        dataset_name: name of the dataset.
            Default is 'ISRUCDataset'.
        root: root directory of the raw data.
            We expect `root/raw` to contain all extracted files (.txt, .rec, ...)
            You can also download the data to a new directory by using download=True.
        dev: whether to enable dev mode (only use a small subset of the data).
            Default is False.
        refresh_cache: Whether to refresh the cache; if true, the dataset will
            be processed from scratch and the cache will be updated. Default is False.
        download: Whether to download the data automatically.
            Default is False.


    Examples:
        >>> from pyhealth.datasets import ISRUCDataset
        >>> dataset = ISRUCDataset(
        ...         root="/srv/local/data/data/ISRUC-I",
        ...         download=True,
        ...     )
        >>> dataset.stat()
        >>> dataset.info()
    """

    _EXCLUDE_SUBJECTS = [8]  # This subject has missing channels

[docs]    def process_EEG_data(self):
        # download the data or check if the data exists
        if ("download" in self.kwargs) and self.kwargs["download"]:
            _download_ISRUC_group1(
                self.root, self.dev, exclude_subjects=self._EXCLUDE_SUBJECTS
            )
        else:
            assert os.path.exists(
                os.path.join(self.root, "raw")
            ), "raw data {root}/raw does not exist, please \
                download the data by enabling 'download=True' first."

        raw_dir = os.path.join(self.root, "raw")
        subject_ids = os.listdir(raw_dir)
        if self.dev:
            subject_ids = subject_ids[:DEV_NUM_PAT]
        subjects = {
            subject_id: [
                {
                    "load_from_path": raw_dir,
                    "signal_file": f"{subject_id}/{subject_id}.edf",
                    "label1_file": f"{subject_id}/{subject_id}_1.txt",
                    "label2_file": f"{subject_id}/{subject_id}_2.txt",
                    "save_to_path": self.filepath,
                    "subject_id": subject_id,
                }
            ]
            for subject_id in subject_ids
        }
        return subjects


if __name__ == "__main__":
    dataset = ISRUCDataset(
        root="/srv/local/data/trash/",
        dev=True,
        refresh_cache=True,
        download=True,
    )

    dataset.stat()
    dataset.info()
    print(list(dataset.patients.items())[0])