Source code for pyhealth.datasets.medical_transcriptions

import logging
from pathlib import Path
from typing import Optional

from ..tasks import MedicalTranscriptionsClassification
from .base_dataset import BaseDataset

logger = logging.getLogger(__name__)


[docs]class MedicalTranscriptionsDataset(BaseDataset): """Medical transcription data scraped from mtsamples.com. Dataset is available at: https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions Args: root: Root directory of the raw data. dataset_name: Name of the dataset. Defaults to "medical_transcriptions". config_path: Path to the configuration file. If None, uses default config. Attributes: root: Root directory of the raw data (should contain many csv files). dataset_name: Name of the dataset. config_path: Path to the configuration file. Examples: >>> from pyhealth.datasets import MedicalTranscriptionsDataset >>> dataset = MedicalTranscriptionsDataset( ... root="path/to/medical_transcriptions", ... ) >>> dataset.stats() >>> samples = dataset.set_task() >>> print(samples[0]) """ def __init__( self, root: str, dataset_name: Optional[str] = None, config_path: Optional[str] = None, cache_dir=None, num_workers: int = 1, dev: bool = False, ) -> None: if config_path is None: logger.info("No config path provided, using default config") config_path = ( Path(__file__).parent / "configs" / "medical_transcriptions.yaml" ) default_tables = ["mtsamples"] super().__init__( root=root, tables=default_tables, dataset_name=dataset_name or "medical_transcriptions", config_path=config_path, cache_dir=cache_dir, num_workers=num_workers, dev=dev, ) return @property def default_task(self) -> MedicalTranscriptionsClassification: """Returns the default task for this dataset.""" return MedicalTranscriptionsClassification()