# Source code for pyhealth.datasets.sleepedf

import logging
import os
import re
from typing import Optional

import pandas as pd

from pyhealth.datasets import BaseDataset
from pyhealth.tasks.sleep_staging_v2 import SleepStagingSleepEDF

logger = logging.getLogger(__name__)


class SleepEDFDataset(BaseDataset):
    """Base EEG dataset for SleepEDF.

    Dataset is available at https://www.physionet.org/content/sleep-edfx/1.0.0/

    For the Sleep Cassette Study portion:
        - The 153 SC* files (SC = Sleep Cassette) were obtained in a 1987-1991
          study of age effects on sleep in healthy Caucasians aged 25-101,
          without any sleep-related medication [2]. Two PSGs of about 20 hours
          each were recorded during two subsequent day-night periods at the
          subjects' homes. Subjects continued their normal activities but wore
          a modified Walkman-like cassette-tape recorder described in chapter
          VI.4 (page 92) of Bob's 1987 thesis.
        - Files are named in the form SC4ssNEO-PSG.edf where ss is the subject
          number, and N is the night. The first nights of subjects 36 and 52,
          and the second night of subject 13, were lost due to a failing
          cassette or laserdisk.
        - The EOG and EEG signals were each sampled at 100 Hz. The
          submental-EMG signal was electronically highpass filtered, rectified
          and low-pass filtered after which the resulting EMG envelope
          expressed in uV rms (root-mean-square) was sampled at 1 Hz.
          Oro-nasal airflow, rectal body temperature and the event marker were
          also sampled at 1 Hz.
        - Subjects and recordings are further described in the file headers
          and the descriptive spreadsheet SC-subjects.xls.

    For the Sleep Telemetry portion:
        - The 44 ST* files (ST = Sleep Telemetry) were obtained in a 1994 study
          of temazepam effects on sleep in 22 Caucasian males and females
          without other medication. Subjects had mild difficulty falling
          asleep but were otherwise healthy. The PSGs of about 9 hours were
          recorded in the hospital during two nights, one of which was after
          temazepam intake, and the other of which was after placebo intake.
          Subjects wore a miniature telemetry system with very good signal
          quality.
        - Files are named in the form ST7ssNJ0-PSG.edf where ss is the subject
          number, and N is the night.
        - EOG, EMG and EEG signals were sampled at 100 Hz, and the event marker
          at 1 Hz. The physical marker dimension ID+M-E relates to the fact
          that pressing the marker (M) button generated two-second deflections
          from a baseline value that either identifies the telemetry unit
          (ID = 1 or 2 if positive) or marks an error (E) in the telemetry
          link if negative. Subjects and recordings are further described in
          the file headers and the descriptive spreadsheet ST-subjects.xls.

    Args:
        root: str, root directory of the raw data. It must contain the subject
            spreadsheet (``SC-subjects.xls`` or ``ST-subjects.xls``) and the
            matching recording directory (``sleep-cassette/`` or
            ``sleep-telemetry/``). On first use a metadata file named
            ``sleepedf-<subset>-pyhealth.csv`` is written into this directory
            and reused afterwards.
        dataset_name: Optional[str], name of the dataset. Falls back to
            ``"sleepedf"`` when None or empty.
        config_path: Optional[str], path to the config file. When None, the
            ``configs/sleepedf.yaml`` file located next to this module is used.
        subset: Optional[str], subset of the SleepEDF dataset, either
            "cassette" or "telemetry" (case-insensitive). None is treated as
            "cassette". Default is "cassette".

    Raises:
        ValueError: If ``subset`` is neither "cassette" nor "telemetry".

    Attributes:
        subset: str, the normalized (lower-case) subset name.

        The attributes below are not assigned in this class; they are
        presumably provided by ``BaseDataset`` (verify against the base class).

        task: Optional[str], name of the task (e.g., "sleep staging").
            Default is None.
        samples: Optional[List[Dict]], a list of samples, each sample is a dict with
            patient_id, record_id, and other task-specific attributes as key.
            Default is None.
        patient_to_index: Optional[Dict[str, List[int]]], a dict mapping patient_id to
            a list of sample indices. Default is None.
        visit_to_index: Optional[Dict[str, List[int]]], a dict mapping visit_id to a
            list of sample indices. Default is None.

    Examples:
        >>> from pyhealth.datasets import SleepEDFDataset
        >>> dataset = SleepEDFDataset(
        ...         root="/srv/local/data/SLEEPEDF/sleep-edf-database-expanded-1.0.0/",
        ...     )
        >>> dataset.stat()
        >>> dataset.info()
    """

    # Recording file names look like ``SC4ssNEO-PSG.edf`` or
    # ``ST7ssNJ0-Hypnogram.edf``: a three-character prefix, a two-digit subject
    # number, a one-digit night number, then a free-form tail ending in the
    # file kind. Anything else (e.g. macOS ``._*`` resource forks) is ignored.
    _RECORDING_FILE_PATTERN = re.compile(r".{3}(\d{2})(\d).*-(PSG|Hypnogram)\.edf")

    # Which metadata column receives the path for each kind of recording file.
    _FILE_KIND_TO_COLUMN = {"PSG": "signal_file", "Hypnogram": "label_file"}

    def __init__(
        self,
        root: str,
        dataset_name: Optional[str] = None,
        config_path: Optional[str] = None,
        subset: Optional[str] = "cassette",
    ) -> None:
        subset = (subset or "cassette").lower()
        if subset not in {"cassette", "telemetry"}:
            raise ValueError(
                f"Unsupported subset '{subset}'. Expected 'cassette' or 'telemetry'."
            )
        if config_path is None:
            logger.info("No config path provided, using default config")
            config_path = os.path.join(
                os.path.dirname(__file__), "configs", "sleepedf.yaml"
            )

        # The processed metadata CSV is cached in ``root``; build it only once.
        metadata_path = os.path.join(root, f"sleepedf-{subset}-pyhealth.csv")
        if not os.path.exists(metadata_path):
            if subset == "cassette":
                self.prepare_metadata_cassette(root)
            else:
                self.prepare_metadata_telemetry(root)

        self.subset = subset
        super().__init__(
            root=root,
            tables=[subset],
            dataset_name=dataset_name or "sleepedf",
            config_path=config_path,
        )

    def prepare_metadata_cassette(self, root: str) -> None:
        """Prepare metadata for the SleepEDF cassette subset.

        Reads ``SC-subjects.xls`` from ``root``, attaches the paths of the
        matching PSG and hypnogram files found in ``root/sleep-cassette`` and
        writes the result to ``root/sleepedf-cassette-pyhealth.csv``.

        Args:
            root: Root directory containing the dataset files.

        Raises:
            FileNotFoundError: If the spreadsheet or the ``sleep-cassette``
                directory does not exist.
        """
        cassette = pd.read_excel(os.path.join(root, "SC-subjects.xls"))
        cassette = cassette.rename(
            columns={"sex (F=1)": "sex", "LightsOff": "lights_off"}
        )

        self._attach_recording_files(cassette, os.path.join(root, "sleep-cassette"))

        cassette.to_csv(
            os.path.join(root, "sleepedf-cassette-pyhealth.csv"), index=False
        )

    def prepare_metadata_telemetry(self, root: str) -> None:
        """Prepare metadata for the SleepEDF telemetry subset.

        Reads ``ST-subjects.xls`` (which has a two-row header) from ``root``,
        reshapes it to one row per subject and condition (placebo/temazepam),
        attaches the paths of the matching PSG and hypnogram files found in
        ``root/sleep-telemetry`` and writes the result to
        ``root/sleepedf-telemetry-pyhealth.csv``. A missing recording
        directory only produces a warning; file columns then stay empty.

        Args:
            root: Root directory containing the dataset files.

        Raises:
            KeyError: If the spreadsheet does not yield the expected
                ``subject`` and ``sex`` columns after header flattening.
        """
        telemetry_raw = pd.read_excel(
            os.path.join(root, "ST-subjects.xls"), header=[0, 1]
        )
        telemetry_raw.columns = self._flatten_multilevel_columns(telemetry_raw.columns)
        logger.debug("Flattened telemetry columns: %s", list(telemetry_raw.columns))

        telemetry = telemetry_raw.rename(
            columns={
                "subject_age_sex_nr": "subject",
                "subject_age_sex_age": "age",
                "subject_age_sex_m1_f2": "sex",
                "placebo_night_night_nr": "placebo_night",
                "placebo_night_lights_off": "placebo_lights_off",
                "temazepam_night_night_nr": "temazepam_night",
                "temazepam_night_lights_off": "temazepam_lights_off",
            }
        ).copy()

        # Fail early with the actual header names instead of an opaque KeyError
        # further down when the spreadsheet layout differs from what we expect.
        missing = [name for name in ("subject", "sex") if name not in telemetry.columns]
        if missing:
            raise KeyError(
                f"Expected column(s) {missing} not found in ST-subjects.xls; "
                f"available columns: {list(telemetry.columns)}"
            )

        # Decode the 1/2 sex code; keep any other value unchanged.
        telemetry["sex"] = (
            telemetry["sex"].map({1: "M", 2: "F"}).fillna(telemetry["sex"])
        )

        # Each spreadsheet row holds both nights of a subject; emit one record
        # per (subject, condition) pair that has a night number.
        records = []
        for _, row in telemetry.iterrows():
            subject_val = row.get("subject")
            if pd.isna(subject_val):
                continue
            base = {
                "subject": int(subject_val),
                "age": row.get("age"),
                "sex": row.get("sex"),
            }
            for condition in ("placebo", "temazepam"):
                night_val = row.get(f"{condition}_night")
                if pd.isna(night_val):
                    continue
                records.append(
                    {
                        **base,
                        "condition": condition,
                        "night": int(night_val),
                        "lights_off": row.get(f"{condition}_lights_off"),
                        "signal_file": None,
                        "label_file": None,
                    }
                )

        telemetry_records = pd.DataFrame(records)

        telemetry_dir = os.path.join(root, "sleep-telemetry")
        if os.path.isdir(telemetry_dir):
            self._attach_recording_files(telemetry_records, telemetry_dir)
        else:
            logger.warning("Telemetry directory '%s' not found.", telemetry_dir)

        telemetry_records = telemetry_records.sort_values(
            ["subject", "night", "condition"]
        ).reset_index(drop=True)
        telemetry_records.to_csv(
            os.path.join(root, "sleepedf-telemetry-pyhealth.csv"), index=False
        )

    @classmethod
    def _attach_recording_files(cls, table: pd.DataFrame, data_dir: str) -> None:
        """Fill ``signal_file``/``label_file`` in ``table`` from ``data_dir``.

        Every ``*-PSG.edf`` / ``*-Hypnogram.edf`` file whose name encodes a
        subject and night is matched against the ``subject`` and ``night``
        columns of ``table``; matching rows receive the file's full path.
        ``table`` is modified in place.

        Args:
            table: Metadata frame with integer ``subject`` and ``night`` columns.
            data_dir: Directory holding the EDF recordings.
        """
        # Create both columns up front so the output schema does not depend on
        # which files happen to be present or on directory listing order.
        for column in cls._FILE_KIND_TO_COLUMN.values():
            if column not in table.columns:
                table[column] = None

        for filename in sorted(os.listdir(data_dir)):
            match = cls._RECORDING_FILE_PATTERN.fullmatch(filename)
            if match is None:
                if filename.endswith(("-PSG.edf", "-Hypnogram.edf")):
                    logger.warning(
                        "Skipping file with unexpected name '%s' in '%s'.",
                        filename,
                        data_dir,
                    )
                continue
            subject_id = int(match.group(1))
            night = int(match.group(2))
            column = cls._FILE_KIND_TO_COLUMN[match.group(3)]
            mask = (table["subject"] == subject_id) & (table["night"] == night)
            table.loc[mask, column] = os.path.join(data_dir, filename)

    @staticmethod
    def _flatten_multilevel_columns(columns: pd.Index) -> list[str]:
        """Normalize column labels into flat snake_case names.

        Each level of a MultiIndex label is lower-cased, stripped, has runs of
        non-word characters replaced by a single underscore, and the non-empty
        levels are joined with ``_``. Plain (non-tuple) labels are treated as
        a single level.

        Args:
            columns: Column index, typically a two-level MultiIndex.

        Returns:
            One flattened name per input column, in the same order.
        """

        def normalize(value: object) -> str:
            if value is None:
                return ""
            cleaned = re.sub(r"\W+", "_", str(value).strip().lower())
            # ``_`` is itself a word character, so adjacent underscores can
            # survive the first substitution and must be collapsed separately.
            return re.sub(r"_+", "_", cleaned).strip("_")

        flattened: list[str] = []
        for col in columns:
            # A flat Index yields scalars; iterating a str would split it into
            # characters, so wrap anything that is not already a tuple.
            levels = col if isinstance(col, tuple) else (col,)
            parts = [part for part in map(normalize, levels) if part]
            flattened.append("_".join(parts))
        return flattened

    @property
    def default_task(self) -> SleepStagingSleepEDF:
        """Returns the default task for this dataset.

        Returns:
            SleepStagingSleepEDF: A newly created default task instance.
        """
        return SleepStagingSleepEDF()