Source code for pyhealth.tasks.deid_ner

"""
PyHealth task for NER-based de-identification of clinical text.

Converts PhysioNet De-Identification dataset records into token-level
BIO-tagged NER samples for PHI detection.

Dataset link:
    https://physionet.org/content/deidentifiedmedicaltext/1.0/

Task paper: (please cite if you use this task)
    Johnson, Alistair E.W., et al. "Deidentification of free-text medical
    records using pre-trained bidirectional transformers." Proceedings of
    the ACM Conference on Health, Inference, and Learning (CHIL), 2020.

Paper link:
    https://doi.org/10.1145/3368555.3384455

Author:
    Matt McKenna (mtm16@illinois.edu)
"""

from typing import Dict, List, Optional, Type, Union

from pyhealth.data import Event, Patient
from pyhealth.processors.text_processor import TextProcessor
from pyhealth.tasks import BaseTask


[docs]class DeIDNERTask(BaseTask):
    """Token-level NER task for clinical text de-identification.

    Each sample contains a list of tokens and their BIO labels over
    7 PHI categories: AGE, CONTACT, DATE, ID, LOCATION, NAME,
    PROFESSION.

    Supports optional overlapping windowing (paper Section 3.3) to
    handle notes longer than BERT's 512 token limit.

    Args:
        window_size: If set, split notes into overlapping windows of
            this many tokens. Default None (no windowing).
        window_overlap: Number of tokens shared between consecutive
            windows. Default 0.

    Attributes:
        task_name (str): The name of the task.
        input_schema (Dict[str, Union[str, Type]]): The schema for the task input.
        output_schema (Dict[str, Union[str, Type]]): The schema for the task output.

    Examples:
        >>> from pyhealth.datasets import PhysioNetDeIDDataset
        >>> from pyhealth.tasks import DeIDNERTask
        >>> dataset = PhysioNetDeIDDataset(root="/path/to/data")
        >>> task = DeIDNERTask()
        >>> samples = dataset.set_task(task)
        >>> task_windowed = DeIDNERTask(window_size=100, window_overlap=60)
        >>> samples = dataset.set_task(task_windowed)
    """

    task_name: str = "DeIDNER"
    input_schema: Dict[str, Union[str, Type]] = {"text": TextProcessor}
    output_schema: Dict[str, Union[str, Type]] = {"labels": TextProcessor}

    def __init__(
        self,
        window_size: Optional[int] = None,
        window_overlap: int = 0,
    ):
        self.window_size = window_size
        self.window_overlap = window_overlap

    def __call__(self, patient: Patient) -> List[Dict]:
        """Generate NER samples from a patient's clinical notes.

        Args:
            patient: A Patient object with physionet_deid events.

        Returns:
            List of dicts, each with 'text' (str) and
            'labels' (str) keys. Both are space-joined strings.
        """
        events: List[Event] = patient.get_events(
            event_type="physionet_deid"
        )

        samples = []
        for event in events:
            note_id = event["note_id"]
            words = event["text"].split(" ")
            labels = event["labels"].split(" ")

            if self.window_size is None:
                # No windowing: one sample per note.
                samples.append({
                    "patient_id": patient.patient_id,
                    "note_id": note_id,
                    "token_start": "0",
                    "text": event["text"],
                    "labels": event["labels"],
                })
            else:
                # Overlapping windows (paper Section 3.3).
                step = self.window_size - self.window_overlap
                idx = 0
                while idx < len(words):
                    end = min(idx + self.window_size, len(words))
                    samples.append({
                        "patient_id": patient.patient_id,
                        "note_id": note_id,
                        "token_start": str(idx),
                        "text": " ".join(words[idx:end]),
                        "labels": " ".join(labels[idx:end]),
                    })
                    idx += step

        return samples