From 7888402e0f05cb35dea9b0b5140b8721de99c78a Mon Sep 17 00:00:00 2001 From: Anurag Ray Chowdhury Date: Wed, 22 Apr 2026 17:40:06 -0400 Subject: [PATCH 1/2] Add MedLingo dataset + jargon expansion task --- docs/api/datasets.rst | 1 + docs/api/tasks.rst | 1 + pyhealth/datasets/__init__.py | 1 + pyhealth/tasks/__init__.py | 1 + 4 files changed, 4 insertions(+) diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst index 8d9a59d21..df06f3831 100644 --- a/docs/api/datasets.rst +++ b/docs/api/datasets.rst @@ -225,6 +225,7 @@ Available Datasets datasets/pyhealth.datasets.MIMIC3Dataset datasets/pyhealth.datasets.MIMIC4Dataset datasets/pyhealth.datasets.MedicalTranscriptionsDataset + datasets/pyhealth.datasets.medlingo datasets/pyhealth.datasets.CardiologyDataset datasets/pyhealth.datasets.eICUDataset datasets/pyhealth.datasets.ISRUCDataset diff --git a/docs/api/tasks.rst b/docs/api/tasks.rst index 23a4e06e5..d84448e34 100644 --- a/docs/api/tasks.rst +++ b/docs/api/tasks.rst @@ -214,6 +214,7 @@ Available Tasks Drug Recommendation Length of Stay Prediction Medical Transcriptions Classification + MedLingo Jargon Expansion Mortality Prediction (Next Visit) Mortality Prediction (StageNet MIMIC-IV) Patient Linkage (MIMIC-III) diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py index 50b1b3887..bc3dd5ce7 100644 --- a/pyhealth/datasets/__init__.py +++ b/pyhealth/datasets/__init__.py @@ -57,6 +57,7 @@ def __init__(self, *args, **kwargs): from .eicu import eICUDataset from .isruc import ISRUCDataset from .medical_transcriptions import MedicalTranscriptionsDataset +from .medlingo import MedLingoDataset from .mimic3 import MIMIC3Dataset from .mimic4 import MIMIC4CXRDataset, MIMIC4Dataset, MIMIC4EHRDataset, MIMIC4NoteDataset from .mimicextract import MIMICExtractDataset diff --git a/pyhealth/tasks/__init__.py b/pyhealth/tasks/__init__.py index a32618f9c..72c02edde 100644 --- a/pyhealth/tasks/__init__.py +++ b/pyhealth/tasks/__init__.py @@ -32,6 +32,7 
@@ from .length_of_stay_stagenet_mimic4 import LengthOfStayStageNetMIMIC4 from .medical_coding import MIMIC3ICD9Coding from .medical_transcriptions_classification import MedicalTranscriptionsClassification +from .medlingo_jargon_expansion import MedLingoJargonExpansionTask from .mortality_prediction import ( MortalityPredictionEICU, MortalityPredictionEICU2, From 27e656e03f7effe57f6310cc74a84c1c2aa6cf26 Mon Sep 17 00:00:00 2001 From: Anurag Ray Chowdhury Date: Wed, 22 Apr 2026 17:41:54 -0400 Subject: [PATCH 2/2] Add MedLingo dataset + jargon expansion task --- .../datasets/pyhealth.datasets.medlingo.rst | 7 ++ ...health.tasks.medlingo_jargon_expansion.rst | 7 ++ ...ingo_jargon_expansion_transformersmodel.py | 119 ++++++++++++++++++ pyhealth/datasets/configs/medlingo.yaml | 11 ++ pyhealth/datasets/medlingo.py | 82 ++++++++++++ pyhealth/tasks/medlingo_jargon_expansion.py | 108 ++++++++++++++++ tests/test_medlingo_dataset.py | 102 +++++++++++++++ tests/test_medlingo_jargon_expansion_task.py | 110 ++++++++++++++++ 8 files changed, 546 insertions(+) create mode 100644 docs/api/datasets/pyhealth.datasets.medlingo.rst create mode 100644 docs/api/tasks/pyhealth.tasks.medlingo_jargon_expansion.rst create mode 100644 examples/medlingo_medlingo_jargon_expansion_transformersmodel.py create mode 100644 pyhealth/datasets/configs/medlingo.yaml create mode 100644 pyhealth/datasets/medlingo.py create mode 100644 pyhealth/tasks/medlingo_jargon_expansion.py create mode 100644 tests/test_medlingo_dataset.py create mode 100644 tests/test_medlingo_jargon_expansion_task.py diff --git a/docs/api/datasets/pyhealth.datasets.medlingo.rst b/docs/api/datasets/pyhealth.datasets.medlingo.rst new file mode 100644 index 000000000..b5c5294a0 --- /dev/null +++ b/docs/api/datasets/pyhealth.datasets.medlingo.rst @@ -0,0 +1,7 @@ +pyhealth.datasets.MedLingoDataset +=================================== + +.. 
autoclass:: pyhealth.datasets.medlingo.MedLingoDataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/tasks/pyhealth.tasks.medlingo_jargon_expansion.rst b/docs/api/tasks/pyhealth.tasks.medlingo_jargon_expansion.rst new file mode 100644 index 000000000..8c871c08a --- /dev/null +++ b/docs/api/tasks/pyhealth.tasks.medlingo_jargon_expansion.rst @@ -0,0 +1,7 @@ +pyhealth.tasks.medlingo_jargon_expansion +======================================== + +.. autoclass:: pyhealth.tasks.medlingo_jargon_expansion.MedLingoJargonExpansionTask + :members: + :undoc-members: + :show-inheritance: diff --git a/examples/medlingo_medlingo_jargon_expansion_transformersmodel.py b/examples/medlingo_medlingo_jargon_expansion_transformersmodel.py new file mode 100644 index 000000000..2e8234c12 --- /dev/null +++ b/examples/medlingo_medlingo_jargon_expansion_transformersmodel.py @@ -0,0 +1,119 @@ +""" +MedLingo jargon expansion with :class:`~pyhealth.models.TransformersModel`. + +**Paper:** Jia, Sontag & Agrawal — *Diagnosing our datasets* (CHIL 2025), +https://arxiv.org/abs/2505.15024. Public CSV: ``questions.csv`` (columns +``word1``, ``word2``, ``question``, ``answer``) from the MedLingo export in +Flora-jia-jfr/diagnosing_our_datasets — place that file under the directory you +pass as ``root`` below. + +**Ablation (two task configs):** + +- ``MedLingoJargonExpansionTask(shot_mode="one_shot")`` — ``prompt`` is the + released ``question`` string (matches the distributed MedLingo item). +- ``MedLingoJargonExpansionTask(shot_mode="zero_shot")`` — ``prompt`` is rebuilt + from ``word1`` and ``word2`` only; the CSV ``question`` field is not used, so + any one-shot / ICL demo in that column is stripped by construction. + +**Limitation vs the paper:** this PyHealth task uses **multiclass classification +on the string ``answer``** (via ``TransformersModel`` + Hugging Face encoders). 
+The paper evaluates **open-ended** generations with an LLM judge; this script +does not reproduce that protocol. + +**Smoke run (no Hugging Face download):** by default this script only builds the +dataset, runs ``set_task`` for both shot modes, and prints sample counts. To +also run one forward pass with a **tiny** BERT (small one-time download unless +cached), set environment variable ``PYHEALTH_MEDLINGO_RUN_MODEL=1``:: + + PYHEALTH_MEDLINGO_RUN_MODEL=1 python examples/medlingo_medlingo_jargon_expansion_transformersmodel.py + +Optional: ``PYHEALTH_MEDLINGO_MODEL=<model_name>`` overrides the tiny default +(``hf-internal-testing/tiny-random-bert``). + +Run from the repository root after ``pip install -e .``, or set +``PYTHONPATH`` to the repo root so ``import pyhealth`` resolves. +""" + +from __future__ import annotations + +import logging +import os +import tempfile +from pathlib import Path + +import pandas as pd + +logging.basicConfig(level=logging.WARNING) +for _name in ("pyhealth", "pyhealth.datasets", "pyhealth.datasets.base_dataset"): + logging.getLogger(_name).setLevel(logging.WARNING) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def _write_synthetic_questions_csv(path: Path) -> None: + """Tiny stand-in for ``datasets/MedLingo/questions.csv`` (no secrets).""" + rows = [ + { + "word1": "MI", + "word2": "STEMI", + "question": "ICL_STUB What is MI vs STEMI in one sentence?", + "answer": "types of heart attack", + }, + { + "word1": "HTN", + "word2": "BP", + "question": "ICL_STUB Define HTN.", + "answer": "high blood pressure", + }, + ] + pd.DataFrame(rows).to_csv(path, index=False) + + +def main() -> None: + from pyhealth.datasets import MedLingoDataset, get_dataloader + from pyhealth.tasks import MedLingoJargonExpansionTask + + tmp = Path(tempfile.mkdtemp(prefix="pyhealth_medlingo_")) + root = tmp / "root" + root.mkdir() + cache = tmp / "cache" + _write_synthetic_questions_csv(root / "questions.csv") + + base = MedLingoDataset(root=str(root), 
cache_dir=cache, num_workers=1) + logger.info("Patients: %s", len(base.unique_patient_ids)) + + for shot in ("one_shot", "zero_shot"): + task = MedLingoJargonExpansionTask(shot_mode=shot) + samples = base.set_task(task=task, num_workers=1) + logger.info("shot_mode=%s -> %s samples", shot, len(samples)) + if len(samples): + s0 = samples[0] + logger.info("First keys: %s", sorted(s0.keys())) + + if os.environ.get("PYHEALTH_MEDLINGO_RUN_MODEL") != "1": + logger.info( + "Skipping TransformersModel forward (no download). " + "Set PYHEALTH_MEDLINGO_RUN_MODEL=1 to run a tiny HF model on one batch." + ) + return + + from pyhealth.models import TransformersModel + + model_name = os.environ.get( + "PYHEALTH_MEDLINGO_MODEL", "hf-internal-testing/tiny-random-bert" + ) + task = MedLingoJargonExpansionTask(shot_mode="one_shot") + samples = base.set_task(task=task, num_workers=1) + loader = get_dataloader(samples, batch_size=2, shuffle=False) + model = TransformersModel(dataset=samples, model_name=model_name) + model.eval() + batch = next(iter(loader)) + import torch + + with torch.no_grad(): + out = model(**batch) + logger.info("Forward ok; loss=%s", out.get("loss")) + + +if __name__ == "__main__": + main() diff --git a/pyhealth/datasets/configs/medlingo.yaml b/pyhealth/datasets/configs/medlingo.yaml new file mode 100644 index 000000000..f1d22d0eb --- /dev/null +++ b/pyhealth/datasets/configs/medlingo.yaml @@ -0,0 +1,11 @@ +version: "1.0" +tables: + questions: + file_path: "questions.csv" + patient_id: null + timestamp: null + attributes: + - "word1" + - "word2" + - "question" + - "answer" diff --git a/pyhealth/datasets/medlingo.py b/pyhealth/datasets/medlingo.py new file mode 100644 index 000000000..9256d7144 --- /dev/null +++ b/pyhealth/datasets/medlingo.py @@ -0,0 +1,82 @@ +import logging +from pathlib import Path +from typing import Any + +import narwhals as nw + +from ..tasks.medlingo_jargon_expansion import MedLingoJargonExpansionTask +from .base_dataset import BaseDataset 
+ +logger = logging.getLogger(__name__) + +# Expected public export from Flora-jia-jfr/diagnosing_our_datasets: +# datasets/MedLingo/questions.csv with columns word1, word2, question, answer. +_REQUIRED_QUESTION_COLUMNS = frozenset({"word1", "word2", "question", "answer"}) + + +class MedLingoDataset(BaseDataset): + """MedLingo jargon QA rows from the *Diagnosing our datasets* line of work. + + Public MedLingo data (e.g. ``questions.csv``) is released with the paper + *Diagnosing our datasets* (Jia, Sontag & Agrawal, CHIL 2025, + https://arxiv.org/abs/2505.15024). Place ``questions.csv`` under ``root`` + (same layout as ``datasets/MedLingo/questions.csv`` in the paper's data + repo). Each CSV row becomes one synthetic patient with a single + ``questions`` event; attributes are ``word1``, ``word2``, ``question``, + and ``answer`` (column names are matched case-insensitively after load). + + Args: + root: Directory containing ``questions.csv``. + dataset_name: Optional override for the dataset name. + config_path: YAML config path; defaults to ``configs/medlingo.yaml``. + cache_dir: Optional cache root (see :class:`BaseDataset`). + num_workers: Workers for task/sample transforms. + dev: If True, limits to the first 1000 patients (see ``BaseDataset``). + + Note: + :meth:`default_task` uses ``MedLingoJargonExpansionTask(shot_mode= + \"one_shot\")`` so ``set_task()`` matches the released CSV prompts. + Pass ``MedLingoJargonExpansionTask(shot_mode=\"zero_shot\")`` for the + ablation that rebuilds the prompt from ``word1``/``word2`` only. 
+ """ + + def __init__( + self, + root: str, + dataset_name: str | None = None, + config_path: str | Path | None = None, + cache_dir=None, + num_workers: int = 1, + dev: bool = False, + ) -> None: + if config_path is None: + logger.info("No config path provided, using default MedLingo config") + config_path = Path(__file__).parent / "configs" / "medlingo.yaml" + default_tables = ["questions"] + super().__init__( + root=root, + tables=default_tables, + dataset_name=dataset_name or "medlingo", + config_path=str(config_path), + cache_dir=cache_dir, + num_workers=num_workers, + dev=dev, + ) + + @property + def default_task(self) -> MedLingoJargonExpansionTask: + """Default MedLingo task using the released one-shot ``question`` text.""" + return MedLingoJargonExpansionTask(shot_mode="one_shot") + + def preprocess_questions(self, df: Any) -> Any: + """Ensure required MedLingo columns exist after lowercasing names.""" + lf = nw.from_native(df) + names = set(lf.columns) + missing = _REQUIRED_QUESTION_COLUMNS - names + if missing: + raise ValueError( + "questions.csv is missing required column(s): " + f"{sorted(missing)}. Expected columns: " + f"{sorted(_REQUIRED_QUESTION_COLUMNS)} (case-insensitive)." + ) + return lf diff --git a/pyhealth/tasks/medlingo_jargon_expansion.py b/pyhealth/tasks/medlingo_jargon_expansion.py new file mode 100644 index 000000000..ab67bd207 --- /dev/null +++ b/pyhealth/tasks/medlingo_jargon_expansion.py @@ -0,0 +1,108 @@ +"""MedLingo jargon expansion task (plain-language answer from a prompt). + +Tied to *Diagnosing our datasets* (Jia, Sontag & Agrawal, CHIL 2025; +https://arxiv.org/abs/2505.15024). This task is a **multiclass shortcut** over +the string ``answer`` column; it does not reproduce the paper's open-ended +generation plus LLM-as-judge setup. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Literal, Optional, Tuple + +from ..data import Event, Patient +from .base_task import BaseTask + +ShotMode = Literal["zero_shot", "one_shot"] + + +def _as_str(value: Any) -> Optional[str]: + """Return a clean string or None if the value is unusable.""" + if value is None: + return None + text = str(value).strip() + if not text or text.lower() == "nan": + return None + return text + + +class MedLingoJargonExpansionTask(BaseTask): + """Map each MedLingo row to a text prompt and a plain-language ``answer``. + + Ablation (``shot_mode``), aligned with the course rubric: + + - **one_shot**: Use the ``question`` field verbatim as ``prompt``. This + matches the **released** MedLingo item (including any in-context demo + baked into that string). + - **zero_shot**: Do **not** use ``question``. Rebuild a minimal instruction + from ``word1`` and ``word2`` only so the model never sees the released + one-shot prompt (ICL demonstration stripped by construction). + + Attributes: + task_name: Includes ``shot_mode`` so caches differ per configuration. + shot_mode: Either ``\"zero_shot\"`` or ``\"one_shot\"``. + input_schema: Single ``\"text\"`` field ``prompt`` for encoder models. + output_schema: ``answer`` as ``\"multiclass\"`` over distinct strings. 
+ """ + + input_schema: Dict[str, str] = {"prompt": "text"} + output_schema: Dict[str, str] = {"answer": "multiclass"} + + def __init__( + self, + shot_mode: ShotMode = "one_shot", + code_mapping: Optional[Dict[str, Tuple[str, str]]] = None, + ) -> None: + if shot_mode not in ("zero_shot", "one_shot"): + raise ValueError( + f"shot_mode must be 'zero_shot' or 'one_shot', got {shot_mode!r}" + ) + super().__init__(code_mapping=code_mapping) + self.shot_mode: ShotMode = shot_mode + self.task_name = f"MedLingoJargonExpansionTask/{shot_mode}" + + def _build_prompt(self, event: Event) -> Optional[str]: + """Build model input text for the current ``shot_mode``.""" + word1 = _as_str(event.word1) + word2 = _as_str(event.word2) + question = _as_str(event.question) + + if self.shot_mode == "one_shot": + # Released conditioning: full CSV ``question`` (demo + query as + # distributed). + return question + + # zero_shot: ignore ``question`` entirely; ICL is not present by design. + if word1 is None or word2 is None: + return None + return ( + "In plain language, define the medical jargon that connects " + f'"{word1}" and "{word2}". Respond with the plain-language ' + "definition only." + ) + + def __call__(self, patient: Patient) -> List[Dict[str, Any]]: + """Emit one sample per patient when fields are valid. + + Args: + patient: Synthetic patient with a single ``questions`` event. + + Returns: + A one-element list with ``id``, ``prompt``, and ``answer``, or + empty if required fields are missing. 
+ """ + events = patient.get_events(event_type="questions") + if len(events) != 1: + return [] + event = events[0] + answer = _as_str(event.answer) + prompt = self._build_prompt(event) + if prompt is None or answer is None: + return [] + return [ + { + "id": patient.patient_id, + "prompt": prompt, + "answer": answer, + } + ] diff --git a/tests/test_medlingo_dataset.py b/tests/test_medlingo_dataset.py new file mode 100644 index 000000000..c5e9d5d3c --- /dev/null +++ b/tests/test_medlingo_dataset.py @@ -0,0 +1,102 @@ +"""Synthetic tests for :class:`~pyhealth.datasets.MedLingoDataset` (no real MedLingo).""" + +from __future__ import annotations + +import pandas as pd +import pytest + +from pyhealth.datasets import MedLingoDataset +from pyhealth.tasks import MedLingoJargonExpansionTask + +THREE_ROWS = [ + { + "word1": "MI", + "word2": "STEMI", + "question": "Q0?", + "answer": "heart attack", + }, + { + "word1": "HTN", + "word2": "BP", + "question": "Q1?", + "answer": "high blood pressure", + }, + { + "word1": "DM", + "word2": "A1c", + "question": "Q2?", + "answer": "diabetes", + }, +] + + +def _write_questions_csv(path, rows: list[dict]) -> None: + pd.DataFrame(rows).to_csv(path, index=False) + + +@pytest.fixture(scope="module") +def medlingo_three_patients(tmp_path_factory): + """One parquet build shared by load + default-task tests.""" + base = tmp_path_factory.mktemp("medlingo_mod") + root = base / "data" + root.mkdir() + cache = base / "cache" + cache.mkdir() + _write_questions_csv(root / "questions.csv", THREE_ROWS) + return MedLingoDataset(root=str(root), cache_dir=str(cache), num_workers=1) + + +def test_medlingo_default_task_raw_sample(medlingo_three_patients): + """Raw task output (no ``set_task`` / litdata).""" + ds = medlingo_three_patients + assert isinstance(ds.default_task, MedLingoJargonExpansionTask) + assert ds.default_task.shot_mode == "one_shot" + raw = ds.default_task(ds.get_patient("0")) + assert len(raw) == 1 + assert raw[0]["prompt"] == "Q0?" 
+ assert raw[0]["answer"] == "heart attack" + + +def test_medlingo_loads_rows_as_patients(medlingo_three_patients): + ds = medlingo_three_patients + assert len(ds.unique_patient_ids) == 3 + p0 = ds.get_patient("0") + evs = p0.get_events(event_type="questions") + assert len(evs) == 1 + assert evs[0].word1 == "MI" + assert evs[0].answer == "heart attack" + + +def test_medlingo_missing_column_raises(tmp_path): + root = tmp_path / "data" + root.mkdir() + _write_questions_csv( + root / "questions.csv", + [{"word1": "a", "word2": "b", "question": "q"}], + ) + ds = MedLingoDataset(root=str(root), cache_dir=tmp_path / "c", num_workers=1) + with pytest.raises(ValueError, match="missing required column"): + _ = ds.unique_patient_ids + + +def test_medlingo_z_case_insensitive_columns(tmp_path): + """Runs after module-scoped tests (name) so a second CSV build is isolated.""" + root = tmp_path / "data" + root.mkdir() + cache = tmp_path / "cache" + _write_questions_csv( + root / "questions.csv", + [ + { + "Word1": "a", + "WORD2": "b", + "Question": "Q?", + "ANSWER": "ans", + }, + ], + ) + ds = MedLingoDataset(root=str(root), cache_dir=cache, num_workers=1) + p = ds.get_patient("0") + ev = p.get_events(event_type="questions")[0] + assert ev.word1 == "a" + assert ev.answer == "ans" diff --git a/tests/test_medlingo_jargon_expansion_task.py b/tests/test_medlingo_jargon_expansion_task.py new file mode 100644 index 000000000..3cc202d17 --- /dev/null +++ b/tests/test_medlingo_jargon_expansion_task.py @@ -0,0 +1,110 @@ +"""Tests for :class:`~pyhealth.tasks.MedLingoJargonExpansionTask` (no network).""" + +from __future__ import annotations + +from datetime import datetime + +import polars as pl +import pytest + +from pyhealth.data import Patient +from pyhealth.tasks import MedLingoJargonExpansionTask + + +def _patient_from_row( + pid: str, + word1: str, + word2: str, + question: str, + answer: str, +) -> Patient: + df = pl.DataFrame( + { + "patient_id": [pid], + "event_type": 
["questions"], + "timestamp": [datetime(2020, 1, 1)], + "questions/word1": [word1], + "questions/word2": [word2], + "questions/question": [question], + "questions/answer": [answer], + } + ) + return Patient(pid, df) + + +def test_one_shot_uses_csv_question_verbatim(): + task = MedLingoJargonExpansionTask(shot_mode="one_shot") + q = "ICL_DEMO_ONLY_XYZ What is MI?" + p = _patient_from_row("0", "MI", "STEMI", q, "myocardial infarction") + out = task(p) + assert len(out) == 1 + assert out[0]["prompt"] == q + assert out[0]["answer"] == "myocardial infarction" + assert out[0]["id"] == "0" + + +def test_zero_shot_ignores_question_field(): + task = MedLingoJargonExpansionTask(shot_mode="zero_shot") + p = _patient_from_row( + "1", + "foo", + "bar", + "ICL_DEMO_ONLY_XYZ never use this in zero-shot", + "plain", + ) + out = task(p) + assert len(out) == 1 + assert "ICL_DEMO_ONLY_XYZ" not in out[0]["prompt"] + assert "foo" in out[0]["prompt"] and "bar" in out[0]["prompt"] + + +def test_zero_shot_and_one_shot_differ_on_same_row(): + p = _patient_from_row("2", "a", "b", "full released question", "lbl") + z = MedLingoJargonExpansionTask(shot_mode="zero_shot")(p)[0]["prompt"] + o = MedLingoJargonExpansionTask(shot_mode="one_shot")(p)[0]["prompt"] + assert z != o + assert o == "full released question" + + +def test_invalid_shot_mode(): + with pytest.raises(ValueError, match="shot_mode"): + MedLingoJargonExpansionTask(shot_mode="bad") + + +def test_empty_answer_drops_sample(): + task = MedLingoJargonExpansionTask(shot_mode="one_shot") + p = _patient_from_row("3", "a", "b", "q", "") + assert task(p) == [] + + +def test_zero_shot_requires_both_words(): + task = MedLingoJargonExpansionTask(shot_mode="zero_shot") + p = _patient_from_row("4", "", "b", "q", "ans") + assert task(p) == [] + + +def test_task_name_includes_shot_mode(): + assert ( + MedLingoJargonExpansionTask(shot_mode="zero_shot").task_name + == "MedLingoJargonExpansionTask/zero_shot" + ) + assert ( + 
MedLingoJargonExpansionTask(shot_mode="one_shot").task_name + == "MedLingoJargonExpansionTask/one_shot" + ) + + +def test_wrong_event_count_returns_empty(): + df = pl.DataFrame( + { + "patient_id": ["5", "5"], + "event_type": ["questions", "questions"], + "timestamp": [datetime(2020, 1, 1), datetime(2020, 1, 2)], + "questions/word1": ["a", "b"], + "questions/word2": ["c", "d"], + "questions/question": ["q1", "q2"], + "questions/answer": ["x", "y"], + } + ) + p = Patient("5", df) + assert MedLingoJargonExpansionTask(shot_mode="one_shot")(p) == []