Source code for pyhealth.datasets.sleepedf

import logging
import os
import re
from typing import Optional

import pandas as pd

from pyhealth.datasets import BaseDataset
from pyhealth.tasks.sleep_staging_v2 import SleepStagingSleepEDF

logger = logging.getLogger(__name__)


class SleepEDFDataset(BaseDataset):
    """Base EEG dataset for SleepEDF.

    Dataset is available at https://www.physionet.org/content/sleep-edfx/1.0.0/

    For the Sleep Cassette Study portion:
        - The 153 SC* files (SC = Sleep Cassette) were obtained in a 1987-1991
          study of age effects on sleep in healthy Caucasians aged 25-101,
          without any sleep-related medication. Two PSGs of about 20 hours
          each were recorded during two subsequent day-night periods at the
          subjects' homes. Subjects continued their normal activities but wore
          a modified Walkman-like cassette-tape recorder described in chapter
          VI.4 (page 92) of Bob's 1987 thesis.
        - Files are named in the form SC4ssNEO-PSG.edf where ss is the subject
          number, and N is the night. The first nights of subjects 36 and 52,
          and the second night of subject 13, were lost due to a failing
          cassette or laserdisk.
        - The EOG and EEG signals were each sampled at 100 Hz. The
          submental-EMG signal was electronically highpass filtered, rectified
          and low-pass filtered after which the resulting EMG envelope
          expressed in uV rms (root-mean-square) was sampled at 1 Hz.
          Oro-nasal airflow, rectal body temperature and the event marker were
          also sampled at 1 Hz.
        - Subjects and recordings are further described in the file headers
          and the descriptive spreadsheet SC-subjects.xls.

    For the Sleep Telemetry portion:
        - The 44 ST* files (ST = Sleep Telemetry) were obtained in a 1994 study
          of temazepam effects on sleep in 22 Caucasian males and females
          without other medication. Subjects had mild difficulty falling
          asleep but were otherwise healthy. The PSGs of about 9 hours were
          recorded in the hospital during two nights, one of which was after
          temazepam intake, and the other of which was after placebo intake.
          Subjects wore a miniature telemetry system with very good signal
          quality.
        - Files are named in the form ST7ssNJ0-PSG.edf where ss is the subject
          number, and N is the night.
        - EOG, EMG and EEG signals were sampled at 100 Hz, and the event
          marker at 1 Hz. The physical marker dimension ID+M-E relates to the
          fact that pressing the marker (M) button generated two-second
          deflections from a baseline value that either identifies the
          telemetry unit (ID = 1 or 2 if positive) or marks an error (E) in
          the telemetry link if negative.
        - Subjects and recordings are further described in the file headers
          and the descriptive spreadsheet ST-subjects.xls.

    On first use, a metadata CSV named ``sleepedf-<subset>-pyhealth.csv`` is
    generated inside ``root``; it is reused on later runs if it already
    exists.

    Args:
        root: Root directory of the raw data. It must contain the subject
            spreadsheet of the chosen subset (``SC-subjects.xls`` or
            ``ST-subjects.xls``) and the matching recording folder
            (``sleep-cassette`` or ``sleep-telemetry``).
        dataset_name: Name of the dataset. Falls back to ``"sleepedf"`` when
            None or empty.
        config_path: Path to the config file. When None, the bundled
            ``configs/sleepedf.yaml`` next to this module is used.
        subset: Subset of the SleepEDF dataset, either ``"cassette"`` or
            ``"telemetry"`` (case-insensitive). None or empty falls back to
            ``"cassette"``.

    Raises:
        ValueError: If ``subset`` is neither ``"cassette"`` nor
            ``"telemetry"``.

    Attributes:
        subset: The normalized (lower-case) subset name.

        The attributes below come from the original documentation and are not
        set in this class; they are presumably provided by ``BaseDataset`` or
        the task pipeline (verify against the base class):

        task: Optional[str], name of the task (e.g., "sleep staging").
        samples: Optional[List[Dict]], a list of samples, each sample is a
            dict with patient_id, record_id, and other task-specific
            attributes as key.
        patient_to_index: Optional[Dict[str, List[int]]], a dict mapping
            patient_id to a list of sample indices.
        visit_to_index: Optional[Dict[str, List[int]]], a dict mapping
            visit_id to a list of sample indices.

    Examples:
        >>> from pyhealth.datasets import SleepEDFDataset
        >>> dataset = SleepEDFDataset(
        ...     root="/srv/local/data/SLEEPEDF/sleep-edf-database-expanded-1.0.0/",
        ... )
        >>> dataset.stat()
        >>> dataset.info()
    """

    # File-name suffix -> metadata column that stores the file's path.
    _RECORDING_COLUMNS = {
        "-PSG.edf": "signal_file",
        "-Hypnogram.edf": "label_file",
    }
    # Three leading characters (e.g. "SC4"/"ST7"), two-digit subject, one-digit
    # night: the same positions the file names are documented to use.
    _RECORDING_ID = re.compile(r".{3}(\d{2})(\d)")

    def __init__(
        self,
        root: str,
        dataset_name: Optional[str] = None,
        config_path: Optional[str] = None,
        subset: Optional[str] = "cassette",
    ) -> None:
        subset = (subset or "cassette").lower()
        if subset not in {"cassette", "telemetry"}:
            raise ValueError(
                f"Unsupported subset '{subset}'. Expected 'cassette' or 'telemetry'."
            )

        if config_path is None:
            logger.info("No config path provided, using default config")
            config_path = os.path.join(
                os.path.dirname(__file__), "configs", "sleepedf.yaml"
            )

        # Build the metadata CSV only once; later runs reuse the cached file.
        metadata_path = os.path.join(root, f"sleepedf-{subset}-pyhealth.csv")
        if not os.path.exists(metadata_path):
            if subset == "cassette":
                self.prepare_metadata_cassette(root)
            else:
                self.prepare_metadata_telemetry(root)

        self.subset = subset
        super().__init__(
            root=root,
            tables=[subset],
            dataset_name=dataset_name or "sleepedf",
            config_path=config_path,
        )

    @classmethod
    def _link_recording_files(cls, records: pd.DataFrame, data_dir: str) -> None:
        """Fill ``signal_file``/``label_file`` in ``records`` from ``data_dir``.

        Every ``*-PSG.edf`` / ``*-Hypnogram.edf`` file in ``data_dir`` is
        matched to the rows of ``records`` that share its subject and night
        (parsed from the file name), and its full path is stored in the
        corresponding column. ``records`` is modified in place.

        Args:
            records: Frame with integer-comparable ``subject`` and ``night``
                columns.
            data_dir: Directory holding the EDF recordings.

        Raises:
            OSError: If ``data_dir`` cannot be listed (e.g. it does not exist).
        """
        # Always create both columns so the CSV schema does not depend on
        # which files happen to be present or on directory listing order.
        for column in cls._RECORDING_COLUMNS.values():
            if column not in records.columns:
                records[column] = None

        # Sorted for a deterministic result regardless of filesystem order.
        for filename in sorted(os.listdir(data_dir)):
            column = next(
                (
                    col
                    for suffix, col in cls._RECORDING_COLUMNS.items()
                    if filename.endswith(suffix)
                ),
                None,
            )
            if column is None:
                continue

            parsed = cls._RECORDING_ID.match(filename)
            if parsed is None:
                # Unexpected name: skip it instead of failing the whole build.
                logger.warning(
                    "Skipping '%s': cannot parse subject/night from file name.",
                    filename,
                )
                continue

            subject_id, night = int(parsed.group(1)), int(parsed.group(2))
            mask = (records["subject"] == subject_id) & (records["night"] == night)
            records.loc[mask, column] = os.path.join(data_dir, filename)

    def prepare_metadata_cassette(self, root: str) -> None:
        """Prepare metadata for the SleepEDF cassette subset.

        Reads ``<root>/SC-subjects.xls``, attaches the paths of the matching
        recordings found in ``<root>/sleep-cassette`` and writes the result to
        ``<root>/sleepedf-cassette-pyhealth.csv``.

        Args:
            root: Root directory containing the dataset files.

        Raises:
            OSError: If the spreadsheet or the ``sleep-cassette`` directory is
                missing.
        """
        sleep_edf_cassette = pd.read_excel(os.path.join(root, "SC-subjects.xls"))
        sleep_edf_cassette = sleep_edf_cassette.rename(
            columns={"sex (F=1)": "sex", "LightsOff": "lights_off"}
        )

        self._link_recording_files(
            sleep_edf_cassette, os.path.join(root, "sleep-cassette")
        )

        sleep_edf_cassette.to_csv(
            os.path.join(root, "sleepedf-cassette-pyhealth.csv"), index=False
        )

    def prepare_metadata_telemetry(self, root: str) -> None:
        """Prepare metadata for the SleepEDF telemetry subset.

        Reads ``<root>/ST-subjects.xls`` (two header rows), expands each
        subject row into one record per recorded condition (placebo /
        temazepam), attaches the paths of the matching recordings found in
        ``<root>/sleep-telemetry`` and writes the result to
        ``<root>/sleepedf-telemetry-pyhealth.csv``.

        A missing ``sleep-telemetry`` directory is only logged as a warning;
        the CSV is then written without file paths.

        Args:
            root: Root directory containing the dataset files.
        """
        telemetry_path = os.path.join(root, "ST-subjects.xls")
        telemetry_raw = pd.read_excel(telemetry_path, header=[0, 1])
        telemetry_raw.columns = self._flatten_multilevel_columns(telemetry_raw.columns)
        logger.debug("Telemetry metadata columns: %s", list(telemetry_raw.columns))

        telemetry = telemetry_raw.rename(
            columns={
                "subject_age_sex_nr": "subject",
                "subject_age_sex_age": "age",
                "subject_age_sex_m1_f2": "sex",
                "placebo_night_night_nr": "placebo_night",
                "placebo_night_lights_off": "placebo_lights_off",
                "temazepam_night_night_nr": "temazepam_night",
                "temazepam_night_lights_off": "temazepam_lights_off",
            }
        )
        # Translate the numeric sex code; unknown codes are kept unchanged.
        telemetry["sex"] = (
            telemetry["sex"].map({1: "M", 2: "F"}).fillna(telemetry["sex"])
        )

        records = []
        for _, row in telemetry.iterrows():
            subject_val = row.get("subject")
            if pd.isna(subject_val):
                continue
            base = {
                "subject": int(subject_val),
                "age": row.get("age"),
                "sex": row.get("sex"),
            }
            # One output record per condition that has a night number.
            for condition in ("placebo", "temazepam"):
                night_val = row.get(f"{condition}_night")
                if pd.isna(night_val):
                    continue
                records.append(
                    {
                        **base,
                        "condition": condition,
                        "night": int(night_val),
                        "lights_off": row.get(f"{condition}_lights_off"),
                        "signal_file": None,
                        "label_file": None,
                    }
                )

        # Explicit columns keep the frame usable even when no record was built.
        telemetry_records = pd.DataFrame(
            records,
            columns=[
                "subject",
                "age",
                "sex",
                "condition",
                "night",
                "lights_off",
                "signal_file",
                "label_file",
            ],
        )

        telemetry_dir = os.path.join(root, "sleep-telemetry")
        if os.path.isdir(telemetry_dir):
            self._link_recording_files(telemetry_records, telemetry_dir)
        else:
            logger.warning("Telemetry directory '%s' not found.", telemetry_dir)

        telemetry_records.sort_values(["subject", "night", "condition"], inplace=True)
        telemetry_records.reset_index(drop=True, inplace=True)
        telemetry_records.to_csv(
            os.path.join(root, "sleepedf-telemetry-pyhealth.csv"), index=False
        )

    @staticmethod
    def _flatten_multilevel_columns(columns: pd.Index) -> list[str]:
        """Normalize a MultiIndex column into flat snake_case names.

        Each level value is stripped, lower-cased, and every run of
        non-word characters is collapsed to a single underscore; empty parts
        are dropped and the remaining parts are joined with ``_``.

        Args:
            columns: Column index; entries are tuples for a MultiIndex. A
                non-tuple entry is treated as a single-level label.

        Returns:
            One flattened name per input column, in the same order.
        """

        def normalize(value: object) -> str:
            if value is None:
                return ""
            cleaned = str(value).strip().lower()
            cleaned = re.sub(r"[^\w]+", "_", cleaned)
            return re.sub(r"_+", "_", cleaned).strip("_")

        flattened: list[str] = []
        for col in columns:
            # A plain label must not be iterated character by character.
            levels = col if isinstance(col, tuple) else (col,)
            parts = [name for name in map(normalize, levels) if name]
            flattened.append("_".join(parts))
        return flattened

    @property
    def default_task(self) -> SleepStagingSleepEDF:
        """Returns the default task for this dataset.

        Returns:
            SleepStagingSleepEDF: The default task instance.
        """
        return SleepStagingSleepEDF()