# Source code for pyhealth.processors.tuple_time_text_processor

from typing import Any, List, Tuple, Optional, Union
import torch
import logging
from .base_processor import FeatureProcessor, ModalityType, TemporalFeatureProcessor
from . import register_processor

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

@register_processor("tuple_time_text")
class TupleTimeTextProcessor(TemporalFeatureProcessor):
    """Processes (text, time_diff) tuples for multimodal temporal fusion.

    Converts paired text and temporal data into a format suitable for models
    that need to distinguish between different modality types automatically.

    The processor has two modes, selected at construction time:

    * **Raw mode** (``tokenizer_model=None``): texts are passed through
      unchanged and only the time differences are converted to a tensor.
    * **Tokenized mode** (``tokenizer_model`` given): texts are tokenized with
      a HuggingFace ``AutoTokenizer`` and returned as integer tensors.

    ``schema()`` and ``dim()`` describe the output of ``process()`` for the
    active mode and always have one entry per returned element.
    """

    def __init__(
        self,
        type_tag: str = "note",
        tokenizer_model: Optional[str] = None,
        max_length: int = 128,
        padding: bool = True,
        truncation: bool = True,
    ):
        """Initialize the processor.

        Args:
            type_tag: Modality identifier for automatic routing. Default: "note"
            tokenizer_model: Name or path of the HuggingFace tokenizer to use.
                If None, texts are returned as raw strings. Default: None
            max_length: Maximum sequence length for tokenization. Default: 128
            padding: Whether to pad sequences to max_length. Default: True
            truncation: Whether to truncate sequences to max_length. Default: True

        Raises:
            ImportError: If ``tokenizer_model`` is given but ``transformers``
                (or a dependency needed while loading the tokenizer) cannot
                be imported.
            ValueError: If the tokenizer cannot be loaded for any other reason.
        """
        super().__init__()
        self.type_tag = type_tag
        self.tokenizer_model = tokenizer_model
        self.max_length = max_length
        self.padding = padding
        self.truncation = truncation

        self.tokenizer = None
        if self.tokenizer_model is not None:
            # Import lazily so `transformers` is only required in tokenized
            # mode. The try body is kept to the import alone so that the
            # "please install transformers" hint is only shown when the
            # package itself is missing.
            try:
                from transformers import AutoTokenizer
            except ImportError as e:
                raise ImportError(
                    "The 'transformers' library is required when 'tokenizer_model' is provided. "
                    "Please install it via `pip install transformers`."
                ) from e

            # Suppress tokenizer warnings. Note that this changes the level of
            # a shared logger, so it affects the whole process.
            logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

            try:
                self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_model)
            except ImportError:
                # A missing optional dependency of the tokenizer: propagate
                # unchanged so the caller sees the accurate message while the
                # exception type stays ImportError.
                raise
            except Exception as e:
                raise ValueError(
                    f"Failed to load tokenizer '{self.tokenizer_model}': {e}"
                ) from e

    def process(
        self, value: Tuple[List[str], List[float]]
    ) -> Union[
        Tuple[List[str], torch.Tensor, str],
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, str],
    ]:
        """Process a tuple of texts and time differences.

        Args:
            value: Tuple containing:
                - List[str]: Text entries (clinical notes, observations, etc.)
                - List[float]: Time differences corresponding to each text entry

        Returns:
            If tokenizer_model is None:
                Tuple containing:
                    - List[str]: Original text entries (unmodified)
                    - torch.Tensor: 1D float tensor of time differences [shape: (N,)]
                    - str: Type tag for modality routing

            If tokenizer_model is provided:
                Tuple containing:
                    - torch.Tensor: input_ids [shape: (N, max_length) with the
                      default padding/truncation settings]
                    - torch.Tensor: attention_mask [same shape as input_ids]
                    - torch.Tensor: token_type_ids [same shape as input_ids];
                      all zeros if the tokenizer does not produce them
                    - torch.Tensor: 1D float tensor of time differences [shape: (N,)]
                    - str: Type tag
        """
        texts, time_diffs = value
        time_tensor = torch.tensor(time_diffs, dtype=torch.float32)

        # Raw mode: hand the strings back untouched.
        if self.tokenizer is None:
            return texts, time_tensor, self.type_tag

        # Tokenize all N texts in one call.
        # NOTE(review): with padding disabled the tokenizer is still asked for
        # PyTorch tensors, which presumably requires every text to tokenize to
        # the same length - confirm against the HuggingFace tokenizer docs.
        encoded = self.tokenizer(
            texts,
            padding="max_length" if self.padding else False,
            truncation=self.truncation,
            max_length=self.max_length,
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Not every tokenizer emits token_type_ids. Fall back to zeros so the
        # returned tuple always matches schema().
        if "token_type_ids" in encoded:
            token_type_ids = encoded["token_type_ids"]
        else:
            token_type_ids = torch.zeros_like(input_ids)

        return input_ids, attention_mask, token_type_ids, time_tensor, self.type_tag

    def size(self) -> Optional[int]:
        """Return the tokenizer vocabulary size, or None if no tokenizer is loaded."""
        if self.tokenizer is not None:
            return self.tokenizer.vocab_size
        return None

    def is_token(self) -> bool:
        """Returns True if the processor outputs discrete tokens (when tokenizer is used)."""
        return self.tokenizer is not None

    def schema(self) -> tuple[str, ...]:
        """Returns the schema of the processed feature.

        The names line up one-to-one with the elements returned by
        ``process()`` in the active mode.
        """
        if self.tokenizer is not None:
            # "value" corresponds to input_ids, "mask" to attention_mask
            return ("value", "mask", "token_type_ids", "time", "type_tag")
        return ("text", "time", "type_tag")

    def dim(self) -> tuple[int, ...]:
        """Number of tensor dimensions for each element of ``schema()``.

        ``process()`` handles one sample whose value holds N texts, so the
        token tensors are 2-D (N, seq_len) and the time tensor is 1-D (N,).
        Non-tensor elements (raw text list, type tag) are reported as 0.
        """
        if self.tokenizer is not None:
            # value, mask, token_type_ids, time, type_tag - one entry per
            # schema() name, mirroring the raw-mode branch below.
            return (2, 2, 2, 1, 0)
        return (0, 1, 0)  # text list has 0 tensor dims, time tensor has 1 dim

    def modality(self) -> ModalityType:
        """Clinical text -> TEXT modality."""
        return ModalityType.TEXT

    def value_dim(self) -> int:
        """Tokenizer vocabulary size (used with transformer encoder).

        Returns 0 if no tokenizer is loaded.
        """
        return self.tokenizer.vocab_size if self.tokenizer is not None else 0

    def process_temporal(self, value) -> dict:
        """Return dict output for UnifiedMultimodalEmbeddingModel.

        Requires ``tokenizer_model`` to be set (raw strings are not
        litdata-serialisable and cannot be embedded without tokenisation).

        Args:
            value: Same ``(texts, time_diffs)`` tuple accepted by ``process()``.

        Returns:
            {"value": LongTensor (N, L), "mask": LongTensor (N, L), "time": FloatTensor (N,)}

        Raises:
            ValueError: If processor was created without a tokenizer.
        """
        if self.tokenizer is None:
            raise ValueError(
                "TupleTimeTextProcessor.process_temporal() requires a tokenizer. "
                "Pass tokenizer_model='...' when creating the processor."
            )
        # process() returns (input_ids, mask, type_ids, time, tag); the
        # token_type_ids and type tag are not part of the dict output.
        result = self.process(value)
        return {
            "value": result[0],  # input_ids  (N, L)
            "mask":  result[1],  # attention_mask (N, L)
            "time":  result[3],  # time tensor (N,)
        }

    def __repr__(self):
        """Short description showing the type tag and, if set, the tokenizer name."""
        if self.tokenizer_model:
            return f"TupleTimeTextProcessor(type_tag='{self.type_tag}', tokenizer='{self.tokenizer_model}')"
        return f"TupleTimeTextProcessor(type_tag='{self.type_tag}')"