# Source code for pyhealth.datasets.dreamt

import logging
import os
from pathlib import Path
from typing import  Optional, Union

import pandas as pd
from pyhealth.datasets import BaseDataset

logger = logging.getLogger(__name__)

class DREAMTDataset(BaseDataset):
    """
    Base Dataset for Real-time sleep stage EstimAtion using Multisensor wearable Technology (DREAMT)

    Dataset accepts current versions of DREAMT (1.0.0, 1.0.1, 2.0.0, 2.1.0), available at:
    https://physionet.org/content/dreamt/

    DREAMT includes wrist-based wearable and polysomnography (PSG) sleep data from 100 participants
    recruited from the Duke University Health System (DUHS) Sleep Disorder Lab. This includes
    wearable signals, PSG signals, sleep labels, and clinical data related to sleep health and disorders.

    Citations:
    ---------
    When using this dataset, please cite:

    Wang, K., Yang, J., Shetty, A., & Dunn, J. (2025). DREAMT: Dataset for Real-time sleep stage EstimAtion
    using Multisensor wearable Technology (version 2.1.0). PhysioNet. RRID:SCR_007345.
    https://doi.org/10.13026/7r9r-7r24

    Will Ke Wang, Jiamu Yang, Leeor Hershkovich, Hayoung Jeong, Bill Chen, Karnika Singh, Ali R Roghanizad,
    Md Mobashir Hasan Shandhi, Andrew R Spector, Jessilyn Dunn. (2024). Proceedings of the fifth
    Conference on Health, Inference, and Learning, PMLR 248:380-396.

    Goldberger, A., Amaral, L., Glass, L., Hausdorff, J., Ivanov, P. C., Mark, R., ... & Stanley, H. E. (2000).
    PhysioBank, PhysioToolkit, and PhysioNet: Components of a new research resource for complex
    physiologic signals. Circulation [Online]. 101 (23), pp. e215–e220. RRID:SCR_007345.

    Note:
    ---------
    Dataset follows file and folder structure of dataset version, looks for participant_info.csv and data folders,
    so root path should be version downloaded, example: root = ".../dreamt/1.0.0/" or ".../dreamt/2.0.0/"

    On first use a "dreamt-metadata.csv" file is written into the root directory
    (see ``prepare_metadata``), so the root directory must be writable at that point.

    Args:
        root: root directory containing the dataset files
        dataset_name: optional name of dataset, defaults to "dreamt_sleep"
        config_path: optional configuration file, defaults to "dreamt.yaml"

    Attributes:
        root: root directory containing the dataset files
        dataset_name: name of dataset
        config_path: path to configuration file

    Examples:
        >>> from pyhealth.datasets import DREAMTDataset
        >>> dataset = DREAMTDataset(root = "/path/to/dreamt/data/version")
        >>> dataset.stats()
        >>>
        >>> # Get all patient ids
        >>> unique_patients = dataset.unique_patient_ids
        >>> print(f"There are {len(unique_patients)} patients")
        >>>
        >>> # Get single patient data
        >>> patient = dataset.get_patient("S002")
        >>> print(f"Patient has {len(patient.data_source)} event")
        >>>
        >>> # Get event
        >>> event = patient.get_events(event_type="dreamt_sleep")
        >>>
        >>> # Get Apnea-Hypopnea Index (AHI)
        >>> ahi = event[0].ahi
        >>> print(f"AHI is {ahi}")
        >>>
        >>> # Get 64Hz sleep file path
        >>> file_path = event[0].file_64hz
        >>> print(f"64Hz sleep file path: {file_path}")
    """

    def __init__(
            self,
            root: str,
            dataset_name: Optional[str] = None,
            config_path: Optional[str] = None,
    ) -> None:
        """
        Initializes the dataset, generating the metadata csv file first if it is missing

        Args:
            root: root directory containing the dataset files
            dataset_name: optional name of dataset, defaults to "dreamt_sleep"
            config_path: optional configuration file, defaults to the "dreamt.yaml"
                file in the "configs" folder next to this module
        """
        if config_path is None:
            logger.info("No config provided, using default config")
            config_path = Path(__file__).parent / "configs" / "dreamt.yaml"

        metadata_file = Path(root) / "dreamt-metadata.csv"

        # The metadata file is created before BaseDataset is initialized
        # NOTE(review): presumably the "dreamt_sleep" table in the config reads
        # this file - confirm against dreamt.yaml
        if not metadata_file.exists():
            logger.info("%s does not exist", metadata_file)
            self.prepare_metadata(root)

        default_tables = ["dreamt_sleep"]

        super().__init__(
            root=root,
            tables=default_tables,
            dataset_name=dataset_name or "dreamt_sleep",
            config_path=config_path,
        )

    def get_patient_file(self, patient_id: str, root: str, file_path: str) -> Optional[Path]:
        """
        Returns file path of the 64Hz or 100Hz data for a patient, or None if no file found

        Args:
            patient_id: patient identifier
            root: root directory containing the dataset files
            file_path: name of the folder under root that holds the 64Hz files
                ("data_64Hz" or "data") or the 100Hz files ("data_100Hz")

        Returns:
            file: path to file location or None if no file found

        Raises:
            ValueError: if file_path is not one of the recognized data folders
        """
        # The file name suffix depends on which data folder is requested
        if file_path in ("data_64Hz", "data"):
            file_name = f"{patient_id}_whole_df.csv"
        elif file_path == "data_100Hz":
            file_name = f"{patient_id}_PSG_df.csv"
        else:
            # Previously fell through with no path assigned and failed with an
            # UnboundLocalError; report the actual problem instead
            raise ValueError(
                f"Unrecognized data folder {file_path!r}, "
                "expected one of 'data_64Hz', 'data', 'data_100Hz'"
            )

        file = Path(root) / file_path / file_name

        if not file.exists():
            logger.info("%s not found", file)
            return None

        return file

    def prepare_metadata(self, root: str) -> None:
        """
        Prepares metadata csv file for the DREAMT dataset by performing the following:
        1. Obtain clinical data from participant_info.csv file
        2. Process file paths based on patients found in clinical data
        3. Organize all data into a single DataFrame
        4. Save the processed DataFrame to a CSV file

        The result is written to "dreamt-metadata.csv" inside the root directory.

        Args:
            root: root directory containing the dataset files
        """
        root_dir = Path(root)
        output_path = root_dir / "dreamt-metadata.csv"

        # Obtain patient clinical data
        participant_info = pd.read_csv(root_dir / "participant_info.csv")

        # Determine folder structure: use "data_64Hz" for the 64Hz files when that
        # folder exists under root, otherwise fall back to "data"
        file_path_64hz = "data_64Hz" if (root_dir / "data_64Hz").is_dir() else "data"
        file_path_100hz = "data_100Hz"

        # Determine paths for 64Hz and 100Hz files for each patient
        participant_info['file_64hz'] = participant_info['SID'].apply(
            lambda sid: self.get_patient_file(sid, root, file_path_64hz)
        )
        participant_info['file_100hz'] = participant_info['SID'].apply(
            lambda sid: self.get_patient_file(sid, root, file_path_100hz)
        )

        # Remove "%" from mean SaO2 recording. Only a trailing "%" is dropped, so
        # values without one are left intact, and a column that pandas already
        # parsed as numeric (no "%" present) is left untouched.
        mean_sao2 = participant_info['Mean_SaO2']
        if not pd.api.types.is_numeric_dtype(mean_sao2):
            participant_info['Mean_SaO2'] = mean_sao2.str.strip().str.rstrip('%')

        # Format columns to align with BaseDataset
        participant_info = participant_info.rename(columns={
            'SID': 'patient_id',
            'AGE': 'age',
            'GENDER': 'gender',
            'BMI': 'bmi',
            'OAHI': 'oahi',
            'AHI': 'ahi',
            'Mean_SaO2': 'mean_sao2',
            'Arousal Index': 'arousal_index',
            "MEDICAL_HISTORY": 'medical_history',
            "Sleep_Disorders": 'sleep_disorders',
        })

        # Create csv
        participant_info.to_csv(output_path, index=False)