Source code for pyhealth.datasets.shhs

import os

import numpy as np

from pyhealth.datasets import BaseSignalDataset


[docs]class SHHSDataset(BaseSignalDataset): """Base EEG dataset for Sleep Heart Health Study (SHHS) Dataset is available at https://sleepdata.org/datasets/shhs The Sleep Heart Health Study (SHHS) is a multi-center cohort study implemented by the National Heart Lung & Blood Institute to determine the cardiovascular and other consequences of sleep-disordered breathing. It tests whether sleep-related breathing is associated with an increased risk of coronary heart disease, stroke, all cause mortality, and hypertension. In all, 6,441 men and women aged 40 years and older were enrolled between November 1, 1995 and January 31, 1998 to take part in SHHS Visit 1. During exam cycle 3 (January 2001- June 2003), a second polysomnogram (SHHS Visit 2) was obtained in 3,295 of the participants. CVD Outcomes data were monitored and adjudicated by parent cohorts between baseline and 2011. More than 130 manuscripts have been published investigating predictors and outcomes of sleep disorders. Args: dataset_name: name of the dataset. root: root directory of the raw data (should contain many csv files). dev: whether to enable dev mode (only use a small subset of the data). Default is False. refresh_cache: whether to refresh the cache; if true, the dataset will be processed from scratch and the cache will be updated. Default is False. Attributes: task: Optional[str], name of the task (e.g., "sleep staging"). Default is None. samples: Optional[List[Dict]], a list of samples, each sample is a dict with patient_id, record_id, and other task-specific attributes as key. Default is None. patient_to_index: Optional[Dict[str, List[int]]], a dict mapping patient_id to a list of sample indices. Default is None. visit_to_index: Optional[Dict[str, List[int]]], a dict mapping visit_id to a list of sample indices. Default is None. Examples: >>> from pyhealth.datasets import SHHSDataset >>> dataset = SHHSDataset( ... root="/srv/local/data/SHHS/", ... ) >>> dataset.stat() >>> dataset.info() """
[docs] def parse_patient_id(self, file_name): """ Args: file_name: the file name of the shhs datasets. e.g., shhs1-200001.edf Returns: patient_id: the patient id of the shhs datasets. e.g., 200001 """ return file_name.split("-")[1].split(".")[0]
[docs] def process_EEG_data(self): # get shhs1 shhs1 = [] if os.path.exists(os.path.join(self.root, "edfs/shhs1")): print("shhs1 exists and load shhs1") shhs1 = os.listdir(os.path.join(self.root, "edfs/shhs1")) else: print("shhs1 does not exist") # get shhs2 shhs2 = [] if os.path.exists(os.path.join(self.root, "edfs/shhs2")): print("shhs2 exists and load shhs2") shhs2 = os.listdir(os.path.join(self.root, "edfs/shhs2")) else: print("shhs2 does not exist") # get all patient ids patient_ids = np.unique([self.parse_patient_id(file) for file in shhs1 + shhs2]) if self.dev: patient_ids = patient_ids[:5] # get patient to record maps # - key: pid: # - value: [{"load_from_path": None, "file": None, "save_to_path": None}, ...] patients = {pid: [] for pid in patient_ids} # parse shhs1 for file in shhs1: pid = self.parse_patient_id(file) if pid in patient_ids: patients[pid].append( { "load_from_path": self.root, "signal_file": os.path.join("edfs/shhs1", file), "label_file": os.path.join("annotations-events-profusion/shhs1", f"shhs1-{pid}-profusion.xml"), "save_to_path": os.path.join(self.filepath), } ) # parse shhs2 for file in shhs2: pid = self.parse_patient_id(file) if pid in patient_ids: patients[pid].append( { "load_from_path": self.root, "signal_file": os.path.join("edfs/shhs2", file), "label_file": os.path.join("annotations-events-profusion/label", f"shhs2-{pid}-profusion.xml"), "save_to_path": os.path.join(self.filepath), } ) return patients
if __name__ == "__main__": dataset = SHHSDataset( root="/srv/local/data/SHHS/polysomnography", dev=True, refresh_cache=True, ) dataset.stat() dataset.info() print(list(dataset.patients.items())[0])