Source code for pyhealth.datasets.mimic3

import logging
import warnings
from pathlib import Path
from typing import List, Optional

import narwhals as pl

from .base_dataset import BaseDataset

logger = logging.getLogger(__name__)


[docs]class MIMIC3Dataset(BaseDataset): """ A dataset class for handling MIMIC-III data. This class is responsible for loading and managing the MIMIC-III dataset, which includes tables such as patients, admissions, and icustays. Attributes: root (str): The root directory where the dataset is stored. tables (List[str]): A list of tables to be included in the dataset. dataset_name (Optional[str]): The name of the dataset. config_path (Optional[str]): The path to the configuration file. Examples: >>> from pyhealth.datasets import MIMIC3Dataset >>> # Load MIMIC-III dataset with clinical tables >>> dataset = MIMIC3Dataset( ... root="/path/to/mimic-iii/1.4", ... tables=["diagnoses_icd", "procedures_icd", "labevents"], ... ) >>> dataset.stats() """ def __init__( self, root: str, tables: List[str], dataset_name: Optional[str] = None, config_path: Optional[str] = None, **kwargs, ) -> None: """ Initializes the MIMIC4Dataset with the specified parameters. Args: root (str): The root directory where the dataset is stored. tables (List[str]): A list of additional tables to include. dataset_name (Optional[str]): The name of the dataset. Defaults to "mimic3". config_path (Optional[str]): The path to the configuration file. If not provided, a default config is used. """ if config_path is None: logger.info("No config path provided, using default config") config_path = Path(__file__).parent / "configs" / "mimic3.yaml" default_tables = ["patients", "admissions", "icustays"] tables = default_tables + tables if "prescriptions" in tables: warnings.warn( "Events from prescriptions table only have date timestamp (no specific time). " "This may affect temporal ordering of events.", UserWarning, ) super().__init__( root=root, tables=tables, dataset_name=dataset_name or "mimic3", config_path=config_path, **kwargs, ) return
[docs] def preprocess_noteevents(self, df: pl.LazyFrame) -> pl.LazyFrame: """ Table-specific preprocess function which will be called by BaseDataset.load_table(). Preprocesses the noteevents table by ensuring that the charttime column is populated. If charttime is null, it uses chartdate with a default time of 00:00:00. See: https://mimic.mit.edu/docs/iii/tables/noteevents/#chartdate-charttime-storetime. Args: df (pl.LazyFrame): The input dataframe containing noteevents data. Returns: pl.LazyFrame: The processed dataframe with updated charttime values. """ df = df.with_columns( pl.when(pl.col("charttime").is_null()) .then(pl.col("chartdate") + pl.lit(" 00:00:00")) .otherwise(pl.col("charttime")) .alias("charttime") ) return df