sunlabuiuc · will-pang · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026 · Feb 12, 2026
diff --git a/examples/foundation_ehr/multimodal_task.py b/examples/foundation_ehr/multimodal_task.py
@@ -0,0 +1,39 @@
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+import os
+
+# PyHealth Packages
+from pyhealth.datasets import MIMIC4Dataset
+from pyhealth.tasks.ehr_foundational_model_mimic4 import EHRFoundationalModelMIMIC4
+from pyhealth.tasks.base_task import BaseTask
+
+# Load MIMIC4 Files
+# Defaults to a local checkout; export PYHEALTH_REPO_ROOT to run elsewhere (e.g. on the cluster)
+# (see: https://github.com/sunlabuiuc/PyHealth/blob/master/examples/mortality_prediction/multimodal_mimic4_minimal.py)
+
+PYHEALTH_REPO_ROOT = os.environ.get("PYHEALTH_REPO_ROOT", "/Users/wpang/Desktop/PyHealth")
+
+EHR_ROOT = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/physionet.org/files/mimiciv/2.2")
+NOTE_ROOT = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/physionet.org/files/mimic-iv-note/2.2")
+CXR_ROOT = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/physionet.org/files/mimic-cxr-jpg/2.0.0")  # not used yet
+CACHE_DIR = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/wp/pyhealth_cache")
+
+if __name__ == "__main__":
+    # Build the dataset from the EHR and clinical-note tables
+    dataset = MIMIC4Dataset(
+        ehr_root=EHR_ROOT,
+        note_root=NOTE_ROOT,
+        ehr_tables=["diagnoses_icd", "procedures_icd", "prescriptions", "labevents"],
+        note_tables=["discharge", "radiology"],
+        cache_dir=CACHE_DIR,
+        num_workers=8,
+        dev=True,
+    )
+
+    # Apply multimodal task
+    task = EHRFoundationalModelMIMIC4()
+    samples = dataset.set_task(task, cache_dir=os.path.join(CACHE_DIR, "task"), num_workers=8)
+
+    # Get and print sample
+    sample = samples[0]
+    print(sample)
diff --git a/pyhealth/processors/tuple_time_text_processor.py b/pyhealth/processors/tuple_time_text_processor.py
@@ -8,99 +8,102 @@
     Input:  Tuple[List[str], List[float]]
             - List[str]: Clinical text entries (e.g., discharge notes, progress notes)
             - List[float]: Time differences between entries (in any time unit)
-    
-    Output: Tuple[List[str], torch.Tensor, str]
-            - List[str]: Same text entries (unmodified)
-            - torch.Tensor: 1D float tensor of time differences
+
+    Output: Tuple[torch.Tensor, torch.Tensor, str]
+            - torch.Tensor: Text token IDs from the tokenizer [shape: (N, max_seq_len), N = number of texts]
+            - torch.Tensor: 1D float tensor of time differences [shape: (N,)]
             - str: Type tag for automatic modality routing (default: "note")
 
 Use Case:
     This processor enables automatic modality bucketing in multimodal pipelines.
     The type_tag allows downstream models to automatically route different feature
     types to appropriate encoders without hardcoding feature names:
-    
+
     - type_tag="note" routes to text encoder
     - type_tag="image" routes to vision encoder
     - type_tag="ehr" routes to EHR encoder
-    
+
     This design eliminates the need to manually map task schema feature_keys to
     specific model components.
 
 Example:
     >>> from pyhealth.processors import TupleTimeTextProcessor
-    >>> processor = TupleTimeTextProcessor(type_tag="note")
-    >>> 
+    >>> processor = TupleTimeTextProcessor(type_tag="note", tokenizer_name="dmis-lab/biobert-base-cased-v1.1")
+    >>>
     >>> # Clinical notes with time differences
     >>> texts = [
     ...     "Patient admitted with chest pain.",
     ...     "Follow-up: symptoms improved.",
     ...     "Discharge: stable condition."
     ... ]
     >>> time_diffs = [0.0, 2.5, 5.0]  # hours since admission
-    >>> 
+    >>>
     >>> result = processor.process((texts, time_diffs))
-    >>> texts_out, time_tensor, tag = result
-    >>> print(f"Texts: {texts_out}")
+    >>> token_ids, time_tensor, tag = result
+    >>> print(f"Text Token IDs shape: {token_ids.shape}")
     >>> print(f"Time tensor: {time_tensor}")
     >>> print(f"Type tag: {tag}")
-    
+
 Args:
     type_tag (str): Modality identifier for automatic routing in multimodal
         models. Common values: "note", "image", "ehr", "signal".
         Default: "note"
+    tokenizer_name (str): HuggingFace model name for the tokenizer.
+        Default: "dmis-lab/biobert-base-cased-v1.1"
 """
 
-from typing import Any, List, Tuple
+from typing import Any, Dict, List, Tuple
 import torch
+from transformers import AutoTokenizer
 from .base_processor import FeatureProcessor
 from . import register_processor
 
 
 @register_processor("tuple_time_text")
 class TupleTimeTextProcessor(FeatureProcessor):
     """Processes (text, time_diff) tuples for multimodal temporal fusion.
-    
-    Converts paired text and temporal data into a format suitable for models
-    that need to distinguish between different modality types automatically.
+
+    Tokenizes text entries using a HuggingFace tokenizer and converts
+    temporal data into tensors for downstream model consumption.
     """
-    
-    def __init__(self, type_tag: str = "note"):
+
+    def __init__(self, type_tag: str = "note", tokenizer_name: str = "dmis-lab/biobert-base-cased-v1.1") -> None:
         """Initialize the processor.
-        
+
         Args:
             type_tag: Modality identifier for automatic routing. Default: "note"
+            tokenizer_name: HuggingFace model name for the tokenizer.
+                Default: "dmis-lab/biobert-base-cased-v1.1"
         """
         super().__init__()
         self.type_tag = type_tag
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 
-    def process(self, value: Tuple[List[str], List[float]]) -> Tuple[List[str], torch.Tensor, str]:
+    def process(self, value: Tuple[List[str], List[float]]) -> Tuple[torch.Tensor, torch.Tensor, str]:
         """Process a tuple of texts and time differences.
-
+
+        Tokenizes the text entries with the HuggingFace tokenizer (padded and
+        truncated into one batch) and converts time differences to a float tensor.
+
         Args:
             value: Tuple containing:
                 - List[str]: Text entries (clinical notes, observations, etc.)
                 - List[float]: Time differences corresponding to each text entry
-        
+
         Returns:
             Tuple containing:
-                - List[str]: Original text entries (unmodified)
+                - torch.Tensor: Text token IDs [shape: (N, L); N texts, L padded token length]
                 - torch.Tensor: 1D float tensor of time differences [shape: (N,)]
                 - str: Type tag for modality routing
-
-        Example:
-            >>> processor = TupleTimeTextProcessor(type_tag="clinical_note")
-            >>> texts = ["Note 1", "Note 2"]
-            >>> times = [0.0, 24.0]  # hours
-            >>> result = processor.process((texts, times))
-            >>> print(result[1])  # tensor([0., 24.])
         """
         texts, time_diffs = value
+        text_token_ids = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")["input_ids"]
         time_tensor = torch.tensor(time_diffs, dtype=torch.float32)
-        return texts, time_tensor, self.type_tag
-    
+        return text_token_ids, time_tensor, self.type_tag
+
     def size(self):
-        """Return the size of the processor vocabulary (not applicable for this processor)."""
-        return None
-    
+        """Return the vocabulary size of the tokenizer."""
+        return self.tokenizer.vocab_size
+
     def __repr__(self):
-        return f"TupleTimeTextProcessor(type_tag='{self.type_tag}')"
+        return f"TupleTimeTextProcessor(type_tag='{self.type_tag}', tokenizer='{self.tokenizer.name_or_path}')"
diff --git a/pyhealth/tasks/ehr_foundational_model_mimic4.py b/pyhealth/tasks/ehr_foundational_model_mimic4.py
@@ -0,0 +1,135 @@
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Union, Tuple
+
+from pyhealth.tasks.base_task import BaseTask
+
+class EHRFoundationalModelMIMIC4(BaseTask):
+    """Build one clinical-note sample per MIMIC-IV patient for mortality prediction.
+
+    Discharge and radiology notes are gathered from every admission that comes
+    before the first admission flagged with ``hospital_expire_flag``; the flagged
+    admission itself only sets the ``mortality`` label and contributes no notes.
+    Each note list is paired with its offsets, in hours, from the admission time.
+    """
+
+    task_name: str = "EHRFoundationalModelMIMIC4"
+    # Placeholders emitted when an admission has no usable note of a given type.
+    TOKEN_REPRESENTING_MISSING_TEXT = "<missing>"
+    TOKEN_REPRESENTING_MISSING_FLOAT = float("nan")
+
+    def __init__(self):
+        """Initialize the EHR Foundational Model task."""
+        self.input_schema: Dict[str, Union[str, Tuple[str, Dict]]] = {
+            "discharge_note_times": (
+                "tuple_time_text",
+                {
+                    "tokenizer_name": "bert-base-uncased",
+                    "type_tag": "note",
+                },
+            ),
+            "radiology_note_times": (
+                "tuple_time_text",
+                {
+                    "tokenizer_name": "bert-base-uncased",
+                    "type_tag": "note",
+                },
+            ),
+        }
+        self.output_schema: Dict[str, str] = {"mortality": "binary"}
+
+    def _clean_text(self, text: Optional[str]) -> Optional[str]:
+        """Return text if non-empty, otherwise None."""
+        return text if text else None
+
+    def _collect_notes(
+        self,
+        patient: Any,
+        event_type: str,
+        admission: Any,
+        texts: List[str],
+        times: List[float],
+    ) -> None:
+        """Append one admission's notes of ``event_type`` to ``texts`` and ``times``.
+
+        Args:
+            patient: Patient whose events are queried via ``get_events``.
+            event_type: Note table to read ("discharge" or "radiology").
+            admission: Admission event providing ``hadm_id`` and ``timestamp``.
+            texts: Running list of note texts; extended in place.
+            times: Running list of offsets in hours from the admission
+                timestamp; extended in place, parallel to ``texts``.
+        """
+        admission_time = admission.timestamp
+        notes = patient.get_events(
+            event_type=event_type, filters=[("hadm_id", "==", admission.hadm_id)]
+        )
+        count_before = len(texts)
+        for note in notes:
+            try:
+                note_text = self._clean_text(note.text)
+                if not note_text:
+                    continue
+                hours = (note.timestamp - admission_time).total_seconds() / 3600.0
+            except AttributeError:
+                # Best effort: skip a malformed note lacking .text or .timestamp.
+                continue
+            texts.append(note_text)
+            times.append(hours)
+        # Emit the placeholders whenever nothing usable was appended -- not only
+        # when the event list is empty -- so an admission whose notes were all
+        # blank or malformed still contributes one entry to both lists.
+        if len(texts) == count_before:
+            texts.append(self.TOKEN_REPRESENTING_MISSING_TEXT)
+            times.append(self.TOKEN_REPRESENTING_MISSING_FLOAT)
+
+    def __call__(self, patient: Any) -> List[Dict[str, Any]]:
+        """Create the sample for ``patient``.
+
+        Args:
+            patient: Object exposing ``patient_id`` and ``get_events``.
+
+        Returns:
+            A single-element list holding the sample dict, or an empty list when
+            the patient has no "patients" event, no admissions, or no admission
+            before the first one flagged with an in-hospital death.
+        """
+        # A demographics record must exist; its contents are not used yet.
+        if not patient.get_events(event_type="patients"):
+            return []
+
+        admissions = patient.get_events(event_type="admissions")
+        if not admissions:
+            return []
+
+        # Keep admissions up to, but not including, the first one carrying the
+        # death flag. That admission only sets the label; its own data is
+        # excluded from the sample.
+        admissions_to_process = []
+        mortality_label = 0
+        for admission in admissions:
+            if admission.hospital_expire_flag in [1, "1"]:
+                mortality_label = 1
+                break
+            admissions_to_process.append(admission)
+
+        if not admissions_to_process:
+            return []
+
+        # Notes and hour offsets aggregated across all kept admissions.
+        discharge_texts: List[str] = []
+        discharge_times: List[float] = []
+        radiology_texts: List[str] = []
+        radiology_times: List[float] = []
+
+        for admission in admissions_to_process:
+            self._collect_notes(patient, "discharge", admission, discharge_texts, discharge_times)
+            self._collect_notes(patient, "radiology", admission, radiology_texts, radiology_times)
+
+        return [
+            {
+                "patient_id": patient.patient_id,
+                "discharge_note_times": (discharge_texts, discharge_times),
+                "radiology_note_times": (radiology_texts, radiology_times),
+                "mortality": mortality_label,
+            }
+        ]