# Source code for pyhealth.datasets.tuev

import os

import numpy as np

from pyhealth.datasets import BaseSignalDataset


class TUEVDataset(BaseSignalDataset):
    """Base EEG dataset for the TUH EEG Events Corpus.

    Dataset is available at https://isip.piconepress.com/projects/tuh_eeg/html/downloads.shtml

    This corpus is a subset of TUEG that contains annotations of EEG segments as
    one of six classes: (1) spike and sharp wave (SPSW), (2) generalized periodic
    epileptiform discharges (GPED), (3) periodic lateralized epileptiform
    discharges (PLED), (4) eye movement (EYEM), (5) artifact (ARTF) and
    (6) background (BCKG).

    Files are named in the form of bckg_032_a_.edf in the eval partition:
        bckg: this file contains background annotations.
        032: a reference to the eval index.
        a_.edf: EEG files are split into a series of files starting with a_.edf,
            a_1.edf, ... These represent pruned EEGs, so the original EEG is
            split into these segments, and uninteresting parts of the original
            recording were deleted.
    or in the form of 00002275_00000001.edf in the train partition:
        00002275: a reference to the train index.
        00000001: indicating that this is the first file associated with this
            patient.

    Args:
        dataset_name: name of the dataset.
        root: root directory of the raw data. It must contain a ``train`` and an
            ``eval`` sub-directory, each holding one folder per patient.
        dev: whether to enable dev mode (only use a small subset of the data).
            Default is False.
        refresh_cache: whether to refresh the cache; if true, the dataset will
            be processed from scratch and the cache will be updated. Default is False.

    Attributes:
        task: Optional[str], name of the task (e.g., "EEG_events").
            Default is None.
        samples: Optional[List[Dict]], a list of samples, each sample is a dict with
            patient_id, record_id, and other task-specific attributes as key.
            Default is None.
        patient_to_index: Optional[Dict[str, List[int]]], a dict mapping patient_id to
            a list of sample indices. Default is None.
        visit_to_index: Optional[Dict[str, List[int]]], a dict mapping visit_id to a
            list of sample indices. Default is None.

    Examples:
        >>> from pyhealth.datasets import TUEVDataset
        >>> dataset = TUEVDataset(
        ...         root="/srv/local/data/TUH/tuh_eeg_events/v2.0.0/edf/",
        ...     )
        >>> dataset.stat()
        >>> dataset.info()
    """

    def process_EEG_data(self):
        """Scan the raw data folder and build the patient-to-records map.

        Every sub-directory of ``<root>/train`` and ``<root>/eval`` is treated as
        one patient. Patient ids are the folder name prefixed with ``"0_"`` for
        the train partition and ``"1_"`` for the eval partition, so identical
        folder names in the two partitions do not collide.

        In dev mode only the first 20 patient ids (in sorted order) are kept.

        Returns:
            A dict mapping each patient id to a list of record dicts with the
            keys ``load_from_path``, ``patient_id``, ``visit_id``,
            ``signal_file``, ``label_file`` and ``save_to_path``.

        Raises:
            FileNotFoundError: if ``<root>/train`` or ``<root>/eval`` is missing.
            IndexError: if a train file name contains no underscore.
        """
        edf_suffix = ".edf"
        split_by_flag = {"0": "train", "1": "eval"}

        # get all file names, keyed by "<partition flag>_<folder name>"
        all_files = {}
        for flag, split in split_by_flag.items():
            split_dir = os.path.join(self.root, split)
            for folder in os.listdir(split_dir):
                folder_path = os.path.join(split_dir, folder)
                # Only directories hold recordings; this skips stray files such
                # as ".DS_Store" instead of failing when trying to list them.
                if not os.path.isdir(folder_path):
                    continue
                # os.listdir order is arbitrary; sort for reproducible output.
                all_files["{}_{}".format(flag, folder)] = sorted(
                    name
                    for name in os.listdir(folder_path)
                    if name.endswith(edf_suffix)
                )

        # get all patient ids; sorted so that the dev subset is the same on
        # every run
        patient_ids = sorted(all_files)

        if self.dev:
            patient_ids = patient_ids[:20]

        # get patient to record maps
        #    - key: pid
        #    - value: [{"load_from_path": ..., "patient_id": ..., "visit_id": ...,
        #               "signal_file": ..., "label_file": ..., "save_to_path": ...}, ...]
        patients = {}
        for pid in patient_ids:
            # Split only on the first underscore so that folder names which
            # themselves contain "_" are recovered intact.
            flag, _, folder = pid.partition("_")
            split = split_by_flag[flag]
            load_from_path = os.path.join(self.root, split, folder)

            records = []
            for visit in all_files[pid]:
                # Drop the ".edf" suffix. str.strip(".edf") must not be used
                # here: it removes any of the characters ".", "e", "d", "f"
                # from both ends (e.g. "eyem_..." would lose its leading "e").
                stem = visit[: -len(edf_suffix)]
                if split == "train":
                    # train files look like "<patient>_<session>.edf"
                    visit_id = stem.split("_")[1]
                else:
                    visit_id = stem

                records.append({
                    "load_from_path": load_from_path,
                    "patient_id": pid,
                    "visit_id": visit_id,
                    "signal_file": visit,
                    # the same file name is stored as signal and label file
                    "label_file": visit,
                    "save_to_path": self.filepath,
                })
            patients[pid] = records

        return patients


if __name__ == "__main__":
    # Manual smoke test: build the dataset in dev mode from scratch, print its
    # statistics and description, then show the first (patient id, records) pair.
    demo_options = {
        "root": "/srv/local/data/TUH/tuh_eeg_events/v2.0.0/edf",
        "dev": True,
        "refresh_cache": True,
    }
    dataset = TUEVDataset(**demo_options)
    dataset.stat()
    dataset.info()
    patient_entries = list(dataset.patients.items())
    print(patient_entries[0])