From b0bbcabd226ecfb7f826b26fbeb5903470be6a6e Mon Sep 17 00:00:00 2001 From: SexyERIC0723 Date: Sun, 12 Apr 2026 01:53:48 +0100 Subject: [PATCH] test: add NOTEEVENTS demo data and enable multimodal mortality test Add synthetic NOTEEVENTS.csv.gz to mimic3demo test resources and remove the skip decorator from the multimodal mortality prediction test. The noteevents preprocessor already exists in MIMIC3Dataset. Changes to the multimodal test: - Load a dedicated dataset with the noteevents table included - Replace deprecated .samples attribute with len()/indexing - Use int() on mortality label for tensor-safe comparison - Assert at least one sample has non-empty clinical notes Closes #512 Assisted-by: Claude Code --- .../core/mimic3demo/NOTEEVENTS.csv.gz | Bin 0 -> 912 bytes .../core/test_mimic3_mortality_prediction.py | 70 ++++++++++-------- 2 files changed, 40 insertions(+), 30 deletions(-) create mode 100644 test-resources/core/mimic3demo/NOTEEVENTS.csv.gz diff --git a/test-resources/core/mimic3demo/NOTEEVENTS.csv.gz b/test-resources/core/mimic3demo/NOTEEVENTS.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..3d87d0e7a775b3f085a48a80a017c77246aeaadd GIT binary patch literal 912 zcmV;B18@8viwFpJ?AmDp|4vU-MMYLcPE=DaV{>)@%$QAYoG=iE?~(E!ten}EZN5xy zJ@%4DRn;CV0$$b?*pY2EO@4jH5H=I`P|`zX2_>U<37Lln2D8>Y{Zwt@#)n_B@u57{ z?GS!9$J+X~_HwSg8f40e+h1R?gi)GT z3Cnmcc*0pucT~WJN8x~)+!^SN@ia?la$kETwI}d_ z4?8txI2h~ehyHm_zAIn%#Hmg@@}&GxX`+oCAnVnjyxWue8nU6*btl_t7t)N@h64_= zRlo{#aO@7q_5S*bZ-HX;HeEwOmQ7Ii$7iP+RqMBkeDYrEHgrFcb7YJ)8&aR4XlMl?xg>Ngn%J>#(yr5Essn z1fp~y;+$4*7(7MgrKUOi2Tk?xF!pc-wFJ)l5&C_qpXc*lJEx#4-N-%pW_oF-z3RrR zlk%{S?E5_&G6)6`1?Pd71JJ#^C=LG&P_iv37jot-PC0oX5+Y$mRi!{gvvNmat5AvL zyvsfy=jMwI9RrOw;Cn(2uHhpzbhEM+@T2=mFz!P|etT z1PFKID8m5pgR^L-oY(?|ZX}$= zsi|6svWjNeYs>kLrKntMTC&Kd9HU!UR(6&wPCIN`D^yvy;Ak0;1dQesE-F-FJvfVT z&N03P2sy%8q>j=BiSvq!bI+_m<)x!Ji+0knYzx(es5y&NR9G}H1`j4t(rN9J!V0v2 z4V73F&SIN&#TFK%2WOG$DHkM~!wPinnXw(ivc3-IZ>kLZWaMrzp-Aq$1DKHxABGc`0hnBAa!bZ(+I6GiQ1)qoel1_h mwI1>DuNeN@tV`(@pp_#L_4f>a00030{{sN9oa;gJ7ytkgFT!j9 literal 0 HcmV?d00001 diff --git a/tests/core/test_mimic3_mortality_prediction.py b/tests/core/test_mimic3_mortality_prediction.py index d7105782b..0fd6fc169 100644 --- a/tests/core/test_mimic3_mortality_prediction.py +++ b/tests/core/test_mimic3_mortality_prediction.py @@ -131,7 +131,6 @@ def test_mortality_prediction_mimic3_set_task(self): traceback.print_exc() self.fail(f"Failed to use set_task with MortalityPredictionMIMIC3: {e}") - @unittest.skip("Skipping multimodal test - noteevents not included in test resources") def test_multimodal_mortality_prediction_mimic3_set_task(self): """Test MultimodalMortalityPredictionMIMIC3 task with set_task() method.""" task = MultimodalMortalityPredictionMIMIC3() @@ -144,44 +143,55 @@ def test_multimodal_mortality_prediction_mimic3_set_task(self): self.assertIn("clinical_notes", task.input_schema) self.assertIn("mortality", task.output_schema) + # Load dataset with noteevents for multimodal testing + multimodal_dataset = MIMIC3Dataset( + root=self.demo_dataset_path, + tables=["diagnoses_icd", "procedures_icd", "prescriptions", "noteevents"], + ) + # Test using set_task method try: - sample_dataset = self.dataset.set_task(task) + sample_dataset = multimodal_dataset.set_task(task) self.assertIsNotNone(sample_dataset, "set_task should return a dataset") - self.assertTrue( - hasattr(sample_dataset, "samples"), "Sample dataset should have samples" - ) # Verify we got some samples - self.assertGreater( - len(sample_dataset.samples), 0, "Should generate at least one sample" - ) + num_samples = len(sample_dataset) + self.assertGreater(num_samples, 0, "Should generate at least one sample") # Test sample structure - if len(sample_dataset.samples) > 0: - sample = sample_dataset.samples[0] - required_keys = [ - "hadm_id", - "patient_id", - "conditions", - "procedures", - "drugs", - "clinical_notes", - "mortality", - ] - for key in required_keys: - self.assertIn(key, sample, f"Sample should contain key: {key}") + sample = sample_dataset[0] + required_keys = [ + "hadm_id", + "patient_id", + "conditions", + "procedures", + "drugs", + "clinical_notes", + "mortality", + ] + for key in required_keys: + self.assertIn(key, sample, f"Sample should contain key: {key}") + + # Verify data types + self.assertIsInstance( + sample["clinical_notes"], str, "clinical_notes should be a string" + ) + self.assertIn( + int(sample["mortality"]), [0, 1], "Mortality label should be 0 or 1" + ) - # Verify data types - self.assertIsInstance( - sample["clinical_notes"], str, "clinical_notes should be a string" - ) - self.assertIn( - sample["mortality"], [0, 1], "Mortality label should be 0 or 1" - ) + # Verify that at least one sample has non-empty clinical notes, + # proving that NOTEEVENTS data was actually loaded + has_notes = any( + len(sample_dataset[i]["clinical_notes"]) > 0 + for i in range(num_samples) + ) + self.assertTrue( + has_notes, "At least one sample should have non-empty clinical notes" + ) - print(f"Generated {len(sample_dataset.samples)} multimodal samples") - print(f"Clinical notes length: {len(sample['clinical_notes'])}") + print(f"Generated {num_samples} multimodal samples") + print(f"Clinical notes length: {len(sample['clinical_notes'])}") except Exception as e: self.fail(