From b0bbcabd226ecfb7f826b26fbeb5903470be6a6e Mon Sep 17 00:00:00 2001
From: SexyERIC0723 <haoyuwang144@gmail.com>
Date: Sun, 12 Apr 2026 01:53:48 +0100
Subject: [PATCH] test: add NOTEEVENTS demo data and enable multimodal
 mortality test

Add synthetic NOTEEVENTS.csv.gz to mimic3demo test resources and
remove the skip decorator from the multimodal mortality prediction
test. The noteevents preprocessor already exists in MIMIC3Dataset, so
no library code changes are required.

Changes to the multimodal test:
- Load a dedicated dataset with the noteevents table included
- Replace deprecated .samples attribute with len()/indexing
- Cast the mortality label with int() so the 0/1 check is tensor-safe
- Assert at least one sample has non-empty clinical notes

Closes #512

Assisted-by: Claude Code
---
 .../core/mimic3demo/NOTEEVENTS.csv.gz         | Bin 0 -> 912 bytes
 .../core/test_mimic3_mortality_prediction.py  |  70 ++++++++++--------
 2 files changed, 40 insertions(+), 30 deletions(-)
 create mode 100644 test-resources/core/mimic3demo/NOTEEVENTS.csv.gz
diff --git a/test-resources/core/mimic3demo/NOTEEVENTS.csv.gz b/test-resources/core/mimic3demo/NOTEEVENTS.csv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..3d87d0e7a775b3f085a48a80a017c77246aeaadd
GIT binary patch
literal 912
zcmV;B18@8viwFpJ?AmDp|4vU-MMYLcPE=DaV{>)@%$QAYoG=iE?~(E!ten}EZN5xy
zJ@%4DRn;CV0$$b?*pY2EO@4jH5H=I`P|`zX2_>U<37Lln2D8>Y{Zwt@#)n_B@u57{
z?GS!9$J+X~_HwSg8f4<Uv2v;!_@FcPIccS9ta|jy=%nf3@063)8k>0e+h1R?gi)GT
z3Cnmcc*0pucT~WJN<zggrLe6smWS+av`*=6`AzPv>8x~)+!^SN@ia?la$kETwI}d_
z4?8txI2h~ehyHm_zAIn%#Hmg@@}&GxX`+oCAnVnjyxWue8nU6*btl_t7t)N@h64_=
zRlo{#aO@7q_5S*bZ-HX;HeEwOmQ7Ii$7iP+RqMBkeDYrEHgrFc<wYrg?5jK}`2=m5
zleAiOM^EGlI-9zlq1Etc&1u5!m4<fvK>b7YJ)8&aR4XlMl?xg>Ngn%J>#(yr5Essn
z1fp~y;+$4*7(7MgrKUOi2Tk?xF!pc-wFJ)l5&C_qpXc*lJEx#4-N-%pW_oF-z3RrR
zlk%{S?E5_&G6)6`1?Pd71JJ#^C=LG&P_iv37jot-PC0oX5+Y$mRi!{gvvNmat5AvL
zyvsfy=jMwI9RrOw<WX7&cXVi|kqN0aq;U1`$>;Cn(2uHhpzbhEM+@T2=mFz!P|etT
z1PFKID8m5<qZLe#{mu1Nv4!W&(7P;7L*<!B@HAX#p<lL3DJt>pgR^L-oY(?|ZX}$=
zsi|6svWjNeYs>kLrKntMTC&Kd9HU!UR(6&wPCIN`D^yvy;Ak0;1dQesE-F-FJvfVT
z&N03P2sy%8q>j=BiSvq!bI+_m<)x!Ji+0knYzx(es5y&NR9G}H1`j4t(rN9J!V0v2
z4V73F&SIN&#TFK%2WOG$DHkM~!wPinnXw(i<x0;vi*nY9EkN%bowG<CRTC2Y%*(6@
z*BUFQ_c&Z)M>vc3-IZ>kLZWaMrzp-Aq$1DKHxABGc`0hnBAa!bZ(+I6GiQ<N32O$%
z;pr-PQG{hs#lz#1m#D;ga2DgN3n+gFLXL12siS;BD&eIfeeF5hu@aSQM@ts(tSh!q
ztwb$Zq@v0sn~`|I$|^iaVJ!-6=cs(Qa6nmXvrcSb!D|MTMXIM-kg`Hl>1)qoel1_h
mwI1>DuNeN@tV`(@pp_#L_4f>a00030{{sN9oa;gJ7ytkgFT!j9

literal 0
HcmV?d00001

diff --git a/tests/core/test_mimic3_mortality_prediction.py b/tests/core/test_mimic3_mortality_prediction.py
index d7105782b..0fd6fc169 100644
--- a/tests/core/test_mimic3_mortality_prediction.py
+++ b/tests/core/test_mimic3_mortality_prediction.py
@@ -131,7 +131,6 @@ def test_mortality_prediction_mimic3_set_task(self):
             traceback.print_exc()
             self.fail(f"Failed to use set_task with MortalityPredictionMIMIC3: {e}")
 
-    @unittest.skip("Skipping multimodal test - noteevents not included in test resources")
     def test_multimodal_mortality_prediction_mimic3_set_task(self):
         """Test MultimodalMortalityPredictionMIMIC3 task with set_task() method."""
         task = MultimodalMortalityPredictionMIMIC3()
@@ -144,44 +143,55 @@ def test_multimodal_mortality_prediction_mimic3_set_task(self):
         self.assertIn("clinical_notes", task.input_schema)
         self.assertIn("mortality", task.output_schema)
 
+        # Load dataset with noteevents for multimodal testing
+        multimodal_dataset = MIMIC3Dataset(
+            root=self.demo_dataset_path,
+            tables=["diagnoses_icd", "procedures_icd", "prescriptions", "noteevents"],
+        )
+
         # Test using set_task method
         try:
-            sample_dataset = self.dataset.set_task(task)
+            sample_dataset = multimodal_dataset.set_task(task)
             self.assertIsNotNone(sample_dataset, "set_task should return a dataset")
-            self.assertTrue(
-                hasattr(sample_dataset, "samples"), "Sample dataset should have samples"
-            )
 
             # Verify we got some samples
-            self.assertGreater(
-                len(sample_dataset.samples), 0, "Should generate at least one sample"
-            )
+            num_samples = len(sample_dataset)
+            self.assertGreater(num_samples, 0, "Should generate at least one sample")
 
             # Test sample structure
-            if len(sample_dataset.samples) > 0:
-                sample = sample_dataset.samples[0]
-                required_keys = [
-                    "hadm_id",
-                    "patient_id",
-                    "conditions",
-                    "procedures",
-                    "drugs",
-                    "clinical_notes",
-                    "mortality",
-                ]
-                for key in required_keys:
-                    self.assertIn(key, sample, f"Sample should contain key: {key}")
+            sample = sample_dataset[0]
+            required_keys = [
+                "hadm_id",
+                "patient_id",
+                "conditions",
+                "procedures",
+                "drugs",
+                "clinical_notes",
+                "mortality",
+            ]
+            for key in required_keys:
+                self.assertIn(key, sample, f"Sample should contain key: {key}")
+
+            # Verify data types
+            self.assertIsInstance(
+                sample["clinical_notes"], str, "clinical_notes should be a string"
+            )
+            self.assertIn(
+                int(sample["mortality"]), [0, 1], "Mortality label should be 0 or 1"
+            )
 
-                # Verify data types
-                self.assertIsInstance(
-                    sample["clinical_notes"], str, "clinical_notes should be a string"
-                )
-                self.assertIn(
-                    sample["mortality"], [0, 1], "Mortality label should be 0 or 1"
-                )
+            # Verify that at least one sample has non-empty clinical notes,
+            # proving that NOTEEVENTS data was actually loaded
+            has_notes = any(
+                len(sample_dataset[i]["clinical_notes"]) > 0
+                for i in range(num_samples)
+            )
+            self.assertTrue(
+                has_notes, "At least one sample should have non-empty clinical notes"
+            )
 
-                print(f"Generated {len(sample_dataset.samples)} multimodal samples")
-                print(f"Clinical notes length: {len(sample['clinical_notes'])}")
+            print(f"Generated {num_samples} multimodal samples")
+            print(f"Clinical notes length: {len(sample['clinical_notes'])}")
 
         except Exception as e:
             self.fail(