diff --git a/docs/api/models.rst b/docs/api/models.rst index 7368dec94..d44aa829f 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -204,3 +204,4 @@ API Reference models/pyhealth.models.TextEmbedding models/pyhealth.models.BIOT models/pyhealth.models.unified_multimodal_embedding_docs + models/pyhealth.models.SimpleADCNN diff --git a/docs/api/models/pyhealth.models.SimpleADCNN.rst b/docs/api/models/pyhealth.models.SimpleADCNN.rst new file mode 100644 index 000000000..a64bcd335 --- /dev/null +++ b/docs/api/models/pyhealth.models.SimpleADCNN.rst @@ -0,0 +1,9 @@ +pyhealth.models.SimpleADCNN +=================================== + +Simple 3D CNN for Alzheimer's Classification of MRI images with Domain Knowledge. + +.. autoclass:: pyhealth.models.SimpleADCNN + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/examples/simple_ad_cnn_classification.py b/examples/simple_ad_cnn_classification.py new file mode 100644 index 000000000..e5ba551fa --- /dev/null +++ b/examples/simple_ad_cnn_classification.py @@ -0,0 +1,422 @@ +"""Ablation study for SimpleADCNN on synthetic 3D MRI data. + +Paper: Bruningk et al., "Back to the Basics with Inclusion of Clinical +Domain Knowledge - A Simple, Scalable and Effective Model of Alzheimer's +Disease Classification", ML4HC 2021. +https://proceedings.mlr.press/v149/bruningk21a.html + +The paper focused on the parts of the brain most affected by Alzheimer's (left hippocampus). +Additionally, the paper looked at brain topography, but found this to be less relevant. +By using prior knowledge of the problem and relatively simple architectures, +the paper was able to accurately classify Alzheimer's MRI images. +Below are some examples of structures the paper investigated. 
+ + - I-3D (inner brain, 120x144x120): ACC 0.79 +/- 0.05, AUC 0.88 + - P*-3D (best patch, 30x36x30): ACC 0.81 +/- 0.05, AUC 0.89 + - HC-3D (hippocampus, 33x45x48): ACC 0.84 +/- 0.07, AUC 0.91 + +The ablation tests the ability to create a wider variety of model structures +beyond those seen in the paper. The idea is to give greater customization +when potentially adapting the model to similar problems. + + 1. **Network depth** (2, 3, 4 conv blocks) — the paper uses a fixed depth + per region + 2. **Dropout rate** (0.0, 0.3, 0.5) — the paper mentions dropout but does + not report a sensitivity analysis. + 3. **Dense layer capacity** (64, 128, 256) — classifier head width was not + explored in the paper. + 4. **Learning rate** (1e-3, 5e-4, 1e-4) — standard Adam search values. + 5. **Input shape** — reproduces the paper's region-level ablation (HC, P, I) + to confirm the model handles all three configurations. + +Results +============================================================= +Captured from a single end-to-end run of this script. Metrics are on +synthetic random tensors and serve only to verify the ablation grid +executes and produces well-formed numbers. + + Config ACC AUC Params + ----------------------- ----- ----- --------- + HC-3D (~140k) 0.500 0.250 142,017 + P*-3D (~72k) 0.500 0.500 74,113 + I-3D (~270k) 0.500 0.625 294,657 + depth=2 0.500 0.188 18,753 + depth=4 0.500 0.562 1,043,905 + dropout=0.0 0.500 0.500 142,017 + dropout=0.5 0.500 0.500 142,017 + dense=64 0.500 0.312 133,697 + dense=256 0.500 0.312 158,657 + lr=5e-4 0.500 0.000 142,017 + lr=1e-4 0.500 0.000 142,017 + +The results above are on synthetic data, which is why an ACC of .5 is expected. +The results seems relatively reasonable with a small sample size and random data. +SimpleADCNN provides a variety of ways to create the "simple cnn" described in +the paper, with modifications available if desired. + +The ADNI dataset was not available for this project, so synthetic data is necessary. 
+As such, the results are in line with synthetic data and seem to train as expected. + +How to run +---------- + python examples/simple_ad_cnn_classification.py +""" + +import random +from typing import Any + +import torch +from torch.utils.data import DataLoader +from sklearn.metrics import accuracy_score, roc_auc_score + + +from pyhealth.datasets import create_sample_dataset +from pyhealth.datasets.utils import collate_fn_dict_with_padding +from pyhealth.models import SimpleADCNN + +# --------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------- + + +def generate_synthetic_dataset( + n_samples: int = 40, + volume_shape: tuple[int, int, int] = (33, 45, 48), + seed: int = 42, +): + """Generate a synthetic 3D MRI dataset with balanced binary labels. + + Args: + n_samples: Total number of samples (split 50/50 between classes). + volume_shape: Spatial dimensions (D, H, W) of each volume. + seed: Random seed for reproducibility. + + Returns: + A ``SampleDataset`` with input key ``"mri"`` and label key + ``"label"``. + """ + rng = torch.Generator().manual_seed(seed) + samples: list[dict[str, Any]] = [] + for i in range(n_samples): + vol = torch.randn(1, *volume_shape, generator=rng) + samples.append( + { + "patient_id": f"patient-{i}", + "visit_id": "visit-0", + "mri": vol.tolist(), + "label": i % 2, # balanced: 0, 1, 0, 1, ... + } + ) + return create_sample_dataset( + samples=samples, + input_schema={"mri": "tensor"}, + output_schema={"label": "binary"}, + dataset_name="synthetic_adni", + ) + + +def stratified_split(dataset, ratios=(0.6, 0.2, 0.2), seed=42): + """Split a dataset into train/val/test with class balance preserved. + + Ensures every split contains both classes so that AUC is always + computable, even on small synthetic datasets. + + Args: + dataset: A ``SampleDataset``. + ratios: Train / val / test proportions (must sum to 1). + seed: Random seed. 
+ + Returns: + Three ``torch.utils.data.Subset`` objects. + """ + from torch.utils.data import Subset + + # Separate indices by label + class_0, class_1 = [], [] + for idx in range(len(dataset)): + sample = dataset[idx] + lab = sample["label"] + if isinstance(lab, torch.Tensor): + lab = lab.item() + (class_0 if lab == 0 else class_1).append(idx) + + rng = random.Random(seed) + rng.shuffle(class_0) + rng.shuffle(class_1) + + def _split_list(lst): + n = len(lst) + n_train = max(1, int(n * ratios[0])) + n_val = max(1, int(n * ratios[1])) + return lst[:n_train], lst[n_train : n_train + n_val], lst[n_train + n_val :] + + train_0, val_0, test_0 = _split_list(class_0) + train_1, val_1, test_1 = _split_list(class_1) + + train_idx = train_0 + train_1 + val_idx = val_0 + val_1 + test_idx = test_0 + test_1 + + rng.shuffle(train_idx) + rng.shuffle(val_idx) + rng.shuffle(test_idx) + + return ( + Subset(dataset, train_idx), + Subset(dataset, val_idx), + Subset(dataset, test_idx), + ) + + +def train_and_evaluate(config: dict) -> dict[str, float]: + """Train an SimpleADCNN with the given config and return metrics. + + Args: + config: Dictionary with keys ``name``, ``volume_shape``, + ``conv_channels``, ``dropout``, ``dense_dim``, ``lr``, + ``epochs``. + + Returns: + Dictionary with ``acc``, ``auc``, and ``params`` on the test set. 
+ """ + # Seed all RNG sources for reproducibility across configs + seed = config.get("seed", 42) + random.seed(seed) + torch.manual_seed(seed) + + dataset = generate_synthetic_dataset( + n_samples=config.get("n_samples", 40), + volume_shape=config["volume_shape"], + seed=seed, + ) + + train_data, _, test_data = stratified_split(dataset, seed=seed) + + train_loader = DataLoader( + train_data, + batch_size=8, + shuffle=True, + collate_fn=collate_fn_dict_with_padding, + generator=torch.Generator().manual_seed(seed), + ) + test_loader = DataLoader( + test_data, + batch_size=8, + shuffle=False, + collate_fn=collate_fn_dict_with_padding, + ) + + model = SimpleADCNN( + dataset=dataset, + conv_channels=config["conv_channels"], + dropout=config["dropout"], + dense_dim=config["dense_dim"], + ) + n_params = sum(p.numel() for p in model.parameters()) + + optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"]) + epochs = config.get("epochs", 5) + + # --- Training --- + model.train() + for epoch in range(epochs): + for batch in train_loader: + optimizer.zero_grad() + ret = model(**batch) + ret["loss"].backward() + optimizer.step() + + # --- Evaluation --- + model.eval() + all_probs, all_true = [], [] + with torch.no_grad(): + for batch in test_loader: + ret = model(**batch) + all_probs.append(ret["y_prob"].cpu()) + all_true.append(ret["y_true"].cpu()) + + y_prob = torch.cat(all_probs).numpy().ravel() + y_true = torch.cat(all_true).numpy().ravel() + y_pred = (y_prob >= 0.5).astype(int) + + acc = accuracy_score(y_true, y_pred) + try: + auc = roc_auc_score(y_true, y_prob) + except ValueError: + auc = float("nan") + + return {"acc": acc, "auc": auc, "params": n_params} + + +# --------------------------------------------------------------- +# Ablation configurations +# --------------------------------------------------------------- + +# Each config varies one axis from the HC-3D baseline. +# The baseline mirrors the paper's hippocampus configuration. 
+ +CONFIGS = [ + # --- Paper region configs with approximate param-count matching --- + # Channel widths are chosen so parameter counts approximate the + # paper's reported values (which depend on architecture, not input + # shape, because global average pooling decouples spatial size from + # the classifier). + { + "name": "HC-3D (~140k)", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128), + "dropout": 0.4, + "dense_dim": 128, + "lr": 1e-3, + "epochs": 5, + }, + { + "name": "P*-3D (~72k)", + "volume_shape": (30, 36, 30), + "conv_channels": (16, 32, 64), + "dropout": 0.4, + "dense_dim": 64, + "lr": 1e-3, + "epochs": 5, + }, + { + "name": "I-3D (~270k)", + "volume_shape": (24, 28, 24), + "conv_channels": (32, 64, 128), + "dropout": 0.4, + "dense_dim": 128, + "lr": 1e-3, + "epochs": 5, + }, + # --- Depth ablation (novel, based on HC-3D baseline) --- + { + "name": "depth=2", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32), + "dropout": 0.4, + "dense_dim": 128, + "lr": 1e-3, + "epochs": 5, + }, + { + "name": "depth=4", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128, 256), + "dropout": 0.4, + "dense_dim": 128, + "lr": 1e-3, + "epochs": 5, + }, + # --- Dropout ablation (novel, HC-3D architecture) --- + { + "name": "dropout=0.0", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128), + "dropout": 0.0, + "dense_dim": 128, + "lr": 1e-3, + "epochs": 5, + }, + { + "name": "dropout=0.5", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128), + "dropout": 0.5, + "dense_dim": 128, + "lr": 1e-3, + "epochs": 5, + }, + # --- Dense dim ablation (novel, HC-3D architecture) --- + { + "name": "dense=64", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128), + "dropout": 0.4, + "dense_dim": 64, + "lr": 1e-3, + "epochs": 5, + }, + { + "name": "dense=256", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128), + "dropout": 0.4, + "dense_dim": 256, + "lr": 1e-3, + "epochs": 5, + }, + # --- Learning 
rate ablation (HC-3D architecture) --- + { + "name": "lr=5e-4", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128), + "dropout": 0.4, + "dense_dim": 128, + "lr": 5e-4, + "epochs": 5, + }, + { + "name": "lr=1e-4", + "volume_shape": (33, 45, 48), + "conv_channels": (16, 32, 128), + "dropout": 0.4, + "dense_dim": 128, + "lr": 1e-4, + "epochs": 5, + }, +] + + +# --------------------------------------------------------------- +# Main +# --------------------------------------------------------------- + + +def main(): + """Run the ablation grid and print a results table.""" + print("=" * 72) + print("SimpleADCNN Ablation Study — Synthetic 3D MRI Data") + print("Paper: Bruningk et al., ML4HC 2021") + print("=" * 72) + print() + + results = [] + for i, cfg in enumerate(CONFIGS): + name = cfg["name"] + print(f"[{i + 1}/{len(CONFIGS)}] Running: {name} ...", end=" ", flush=True) + metrics = train_and_evaluate(cfg) + results.append((name, metrics)) + print( + f"ACC={metrics['acc']:.3f} " + f"AUC={metrics['auc']:.3f} " + f"params={metrics['params']:,}" + ) + + # --- Results table --- + print() + print("-" * 72) + print(f"{'Configuration':<28} {'ACC':>8} {'AUC':>8} {'Params':>10}") + print("-" * 72) + for name, metrics in results: + print( + f"{name:<28} {metrics['acc']:>8.3f} " + f"{metrics['auc']:>8.3f} {metrics['params']:>10,}" + ) + print("-" * 72) + + # --- Paper reference values --- + print() + print("Paper reference (on real ADNI data, 5-fold CV, 3 runs):") + print(f" {'I-3D (inner brain)':<28} {'0.79':>8} {'0.88':>8} {'~270k':>10}") + print(f" {'P*-3D (best patch)':<28} {'0.81':>8} {'0.89':>8} {'~72k':>10}") + print(f" {'HC-3D (left hippocampus)':<28} {'0.84':>8} {'0.91':>8} {'~140k':>10}") + print() + print( + "Note: Metrics above are on synthetic random data and are NOT " + "expected to match the paper. They demonstrate that the model " + "trains correctly across all configurations." 
+ ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyhealth/models/__init__.py b/pyhealth/models/__init__.py index 5233b1726..420c14b3e 100644 --- a/pyhealth/models/__init__.py +++ b/pyhealth/models/__init__.py @@ -22,6 +22,7 @@ from .retain import MultimodalRETAIN, RETAIN, RETAINLayer from .rnn import MultimodalRNN, RNN, RNNLayer from .safedrug import SafeDrug, SafeDrugLayer +from .simple_ad_cnn import SimpleADCNN from .sparcnet import DenseBlock, DenseLayer, SparcNet, TransitionLayer from .stagenet import StageNet, StageNetLayer from .stagenet_mha import StageAttentionNet, StageNetAttentionLayer diff --git a/pyhealth/models/simple_ad_cnn.py b/pyhealth/models/simple_ad_cnn.py new file mode 100644 index 000000000..a685c1129 --- /dev/null +++ b/pyhealth/models/simple_ad_cnn.py @@ -0,0 +1,347 @@ +from abc import ABC +from typing import Callable, Any, Optional +import inspect + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..datasets import SampleDataset +from ..processors import PROCESSOR_REGISTRY +from pyhealth.models import BaseModel + + +class SimpleADCNN(BaseModel): + """Convolutional Neural Network for image classification. Primarily for Alzheimer's Classification. + + This SimpleADCNN is a basic implementation of part of the architecture seen in "Back to the basics + with inclusion of clinical domain knowledge - A simple, scalable, and effective model of Alzheimer's + Disease Classification". The MRI images are process through 3 layers with batch normalization, + dropout, and relu. At the end there are two dense layers and pooling. + + The model is intended to be used with "clinical domain knowledge"; for this paper, it means to + focus the network on analyzing relevant information (the hippocampus in the case of Alzheimer's + and the topography of the brain) in order to get accurate classification with a relatively + simple model. 
+ + + The paper gives a number of params for the network, for which the SimpleADCNN allows + these to be recreated with the conv_channels parameter. + + - **I-3D** (inner brain): 120x144x120, ~270k params, ACC 0.79 + - approximate with ``conv_channels=(32, 64, 128)`` (~295k) + - **P*-3D** (best patch): 30x36x30, ~72k params, ACC 0.81 + - approximate with ``conv_channels=(16, 32, 64), dense_dim=64`` + (~74k) + - **HC-3D** (hippocampus): 33x45x48, ~140k params, ACC 0.84 + - approximate with ``conv_channels=(16, 32, 128)`` (~142k) + + Paper: https://proceedings.mlr.press/v149/bruningk21a.html + + Args: + dataset (SampleDataset): The dataset to train the model. It is used to query certain + information such as the set of all tokens. + in_channels (int): The number of input channels for the first convolutional layer. + conv_channels (tuple[int, ...]): Channels for each layer. + kernel_size (int): Convolutional kernel size + dropout (float): dropout probability + dense_dim (int): dimension of the hidden dense layer of simple CNN. + + Examples: + from pyhealth.datasets import create_sample_dataset + import torch + samples = [ + { + "patient_id": "p0", + "visit_id": "v0", + "mri": torch.randn(1, 33, 45, 48).tolist(), + "label": 1, + }, + { + "patient_id": "p1", + "visit_id": "v0", + "mri": torch.randn(1, 33, 45, 48).tolist(), + "label": 0, + }, + ] + dataset = create_sample_dataset( + samples=samples, + input_schema={"mri": "tensor"}, + output_schema={"label": "binary"}, + dataset_name="demo", + ) + model = SimpleADCNN(dataset=dataset) + """ + + def __init__( + self, + dataset: SampleDataset, + in_channels: int = 1, + conv_channels: tuple[int, ...] = (16, 32, 64), + kernel_size: int = 3, + dropout:float = 0.4, + dense_dim: int = 128 + ): + """ + Initializes the SimpleADCNN. + + Args: + dataset (SampleDataset): The dataset to train the model. 
+ """ + super(SimpleADCNN, self).__init__(dataset = dataset) + + if len(self.feature_keys) != 1: + raise ValueError( + f"SimpleADCNN expects exactly one input feature (the 3D MRI " + f"volume), got {len(self.feature_keys)}: {self.feature_keys}" + ) + if len(self.label_keys) != 1: + raise ValueError( + f"SimpleADCNN expects exactly one label key, " + f"got {len(self.label_keys)}: {self.label_keys}" + ) + self.label_key = self.label_keys[0] + + if in_channels < 1: + raise ValueError(f"must have at least one input channel, instead got {in_channels}") + if not conv_channels: + raise ValueError("must have at least one output channel") + if any(ch < 1 for ch in conv_channels): + raise ValueError( + f"conv_channels must be positive, got " + f"{conv_channels}" + ) + if kernel_size < 1 or kernel_size % 2 == 0: + raise ValueError( + f"kernel_size must be a positive odd integer, got {kernel_size}" + ) + if not 0.0 <= dropout <= 1.0: + raise ValueError(f"dropout must be valid probability, got {dropout}") + if dense_dim < 1: + raise ValueError(f"dense_dim must be at least 1, got {dense_dim}") + + self.in_channels = in_channels + self.conv_channels = conv_channels + self.kernel_size = kernel_size + self.dropout = dropout + self.dense_dim = dense_dim + + padding = kernel_size // 2 + layers: list[nn.Module] = [] + ch_in = in_channels + for ch_out in conv_channels: + layers.extend( + [ + nn.Conv3d(ch_in, ch_out, kernel_size, padding=padding), + nn.BatchNorm3d(ch_out), + nn.ReLU(), + nn.Dropout3d(dropout), + ] + ) + ch_in = ch_out + layers.append(nn.AdaptiveAvgPool3d(1)) + layers.append(nn.Flatten()) + + # sequential instead of blocks like "cnn.py" + self.features = nn.Sequential(*layers) + + output_size = self.get_output_size() + self.classifier = nn.Sequential( + nn.Linear(conv_channels[-1], dense_dim), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(dense_dim, output_size), + ) + + self._init_weights() + + def _init_weights(self) -> None: + """Apply He-uniform initialization to all 
layers. + + Conv3d and Linear layers get ``kaiming_uniform_`` on their + weights (matching the paper's "He uniform initialisation"). + BatchNorm3d layers get weight=1, bias=0. + """ + for module in self.modules(): + if isinstance(module, (nn.Conv3d, nn.Linear)): + nn.init.kaiming_uniform_(module.weight, nonlinearity="relu") + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.BatchNorm3d): + nn.init.ones_(module.weight) + nn.init.zeros_(module.bias) + + def forward(self, + **kwargs: torch.Tensor | tuple[torch.Tensor, ] + ) -> dict[str, torch.Tensor]: + """Forward pass of the model. + + Args: + **kwargs: A variable number of keyword arguments representing input features. + Each keyword argument is a tensor or a tuple of tensors of shape (batch_size, ). + + Returns: + A dictionary with the following keys: + logit: a tensor of predicted logits. + y_prob: a tensor of predicted probabilities. + loss [optional]: a scalar tensor representing the final loss, if self.label_keys in kwargs. + y_true [optional]: a tensor representing the true labels, if self.label_keys in kwargs. + """ + feature_key = self.feature_keys[0] + x = kwargs[feature_key] + if isinstance(x, (tuple, list)): + x = x[0] + x = x.to(self.device).float() + + # (B, D, H, W) or (B, C, D, H, W). + if x.dim() not in (4, 5): + raise ValueError( + f"Expected MRI tensor with 4 or 5 dimensions, got shape " + f"{tuple(x.shape)}" + ) + if x.dim() == 4: + if self.in_channels != 1: + raise ValueError( + "Input is missing an explicit channel dimension. " + f"Automatic unsqueeze is only supported when " + f"in_channels=1, got in_channels={self.in_channels}." 
+ ) + x = x.unsqueeze(1) + elif x.shape[1] != self.in_channels: + raise ValueError( + f"Expected MRI tensor with {self.in_channels} channels, got " + f"{x.shape[1]}" + ) + + x = self.features(x) + logits = self.classifier(x) + + y_true = kwargs[self.label_key].to(self.device) + loss = self.get_loss_function()(logits, y_true) + y_prob = self.prepare_y_prob(logits) + + return { + "loss": loss, + "y_prob": y_prob, + "y_true": y_true, + "logit": logits, + } + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + def _resolve_mode(self, schema_entry: Any) -> str: + """Resolve a mode string from an output_schema entry. + + Supports: + - direct string ("binary", ) + - processor class + - processor instance + Returns the registered processor name if found. + """ + if isinstance(schema_entry, str): + return schema_entry.lower() + + # Get class reference + cls = schema_entry if inspect.isclass(schema_entry) else schema_entry.__class__ + for name, registered_cls in PROCESSOR_REGISTRY.items(): + if cls is registered_cls or issubclass( + cls, registered_cls + ): # allow subclassing + return name.lower() + raise ValueError( + f"Cannot resolve mode from output_schema entry {schema_entry}. Use a supported string" + ) + + @property + def device(self) -> torch.device: + """ + Gets the device of the model. + + Returns: + torch.device: The device on which the model is located. + """ + return self._dummy_param.device + + def get_output_size(self) -> int: + """ + Gets the default output size using the label tokenizer and `self.mode`. + + If the mode is "binary", the output size is 1. If the mode is "multiclass" + or "multilabel", the output size is the number of classes or labels. + + Returns: + int: The output size of the model. 
+ """ + assert ( + len(self.label_keys) == 1 + ), "Only one label key is supported if get_output_size is called" + output_size = self.dataset.output_processors[self.label_keys[0]].size() + return output_size + + def get_loss_function(self) -> Callable: + """ + Gets the default loss function using `self.mode`. + + The default loss functions are: + - binary: `F.binary_cross_entropy_with_logits` + - multiclass: `F.cross_entropy` + - multilabel: `F.binary_cross_entropy_with_logits` + - regression: `F.mse_loss` + + Returns: + Callable: The default loss function. + """ + assert ( + len(self.label_keys) == 1 + ), "Only one label key is supported if get_loss_function is called" + label_key = self.label_keys[0] + mode = self._resolve_mode(self.dataset.output_schema[label_key]) + if mode == "binary": + return F.binary_cross_entropy_with_logits + elif mode == "multiclass": + return F.cross_entropy + elif mode == "multilabel": + return F.binary_cross_entropy_with_logits + elif mode == "regression": + return F.mse_loss + else: + raise ValueError(f"Invalid mode: {mode}") + + def prepare_y_prob(self, logits: torch.Tensor) -> torch.Tensor: + """ + Prepares the predicted probabilities for model evaluation. + + This function converts the predicted logits to predicted probabilities + depending on the mode. The default formats are: + - binary: a tensor of shape (batch_size, 1) with values in [0, 1], + which is obtained with `torch.sigmoid()` + - multiclass: a tensor of shape (batch_size, num_classes) with + values in [0, 1] and sum to 1, which is obtained with + `torch.softmax()` + - multilabel: a tensor of shape (batch_size, num_labels) with values + in [0, 1], which is obtained with `torch.sigmoid()` + - regression: a tensor of shape (batch_size, 1) with raw logits + + Args: + logits (torch.Tensor): The predicted logit tensor. + + Returns: + torch.Tensor: The predicted probability tensor. 
+ """ + assert ( + len(self.label_keys) == 1 + ), "Only one label key is supported if get_loss_function is called" + label_key = self.label_keys[0] + mode = self._resolve_mode(self.dataset.output_schema[label_key]) + if mode in ["binary"]: + y_prob = torch.sigmoid(logits) + elif mode in ["multiclass"]: + y_prob = F.softmax(logits, dim=-1) + elif mode in ["multilabel"]: + y_prob = torch.sigmoid(logits) + elif mode in ["regression"]: + y_prob = logits + else: + raise NotImplementedError + return y_prob diff --git a/tests/core/test_ad_cnn.py b/tests/core/test_ad_cnn.py new file mode 100644 index 000000000..802d5b768 --- /dev/null +++ b/tests/core/test_ad_cnn.py @@ -0,0 +1,287 @@ +import math +import unittest + +import torch + +from pyhealth.datasets import create_sample_dataset, get_dataloader +from pyhealth.models import SimpleADCNN, BaseModel + + +class TestSimpleADCNN(unittest.TestCase): + """Test cases for the SimpleADCNN model.""" + + def setUp(self): + """Set up a tiny synthetic dataset and model.""" + self.samples = [ + { + "patient_id": "patient-0", + "visit_id": "visit-0", + "mri": torch.randn(1, 4, 6, 8).tolist(), + "label": 1, + }, + { + "patient_id": "patient-1", + "visit_id": "visit-0", + "mri": torch.randn(1, 4, 6, 8).tolist(), + "label": 0, + }, + ] + + self.input_schema = {"mri": "tensor"} + self.output_schema = {"label": "binary"} + + self.dataset = create_sample_dataset( + samples=self.samples, + input_schema=self.input_schema, + output_schema=self.output_schema, + dataset_name="test", + ) + + self.model = SimpleADCNN( + dataset=self.dataset, + conv_channels=(4, 8), + dense_dim=8, + ) + + def test_model_initialization(self): + """Test that SimpleADCNN initializes correctly and inherits BaseModel.""" + self.assertIsInstance(self.model, SimpleADCNN) + self.assertIsInstance(self.model, BaseModel) + self.assertEqual(self.model.conv_channels, (4, 8)) + self.assertEqual(self.model.dense_dim, 8) + self.assertEqual(self.model.dropout, 0.4) + 
self.assertEqual(self.model.in_channels, 1) + self.assertEqual(self.model.kernel_size, 3) + self.assertEqual(len(self.model.feature_keys), 1) + self.assertIn("mri", self.model.feature_keys) + self.assertEqual(self.model.label_key, "label") + + def test_model_forward(self): + """Test that the forward pass returns correct keys and shapes.""" + train_loader = get_dataloader(self.dataset, batch_size=2, shuffle=False) + data_batch = next(iter(train_loader)) + + with torch.no_grad(): + ret = self.model(**data_batch) + + self.assertIn("loss", ret) + self.assertIn("y_prob", ret) + self.assertIn("y_true", ret) + self.assertIn("logit", ret) + + self.assertEqual(ret["y_prob"].shape[0], 2) + self.assertEqual(ret["y_true"].shape[0], 2) + self.assertEqual(ret["logit"].shape[0], 2) + self.assertEqual(ret["loss"].dim(), 0) + + def test_model_backward(self): + """Test that gradients flow through all trainable parameters.""" + train_loader = get_dataloader(self.dataset, batch_size=2, shuffle=False) + data_batch = next(iter(train_loader)) + + ret = self.model(**data_batch) + ret["loss"].backward() + + has_gradient = any( + param.requires_grad and param.grad is not None + for param in self.model.parameters() + ) + self.assertTrue( + has_gradient, "No parameters have gradients after backward pass" + ) + + def test_custom_hyperparameters(self): + """Test SimpleADCNN with non-default hyperparameters.""" + model = SimpleADCNN( + dataset=self.dataset, + conv_channels=(8, 16, 32), + dropout=0.2, + dense_dim=16, + kernel_size=5, + ) + + self.assertEqual(model.conv_channels, (8, 16, 32)) + self.assertEqual(model.dropout, 0.2) + self.assertEqual(model.dense_dim, 16) + self.assertEqual(model.kernel_size, 5) + + train_loader = get_dataloader(self.dataset, batch_size=2, shuffle=False) + data_batch = next(iter(train_loader)) + + with torch.no_grad(): + ret = model(**data_batch) + + self.assertIn("loss", ret) + self.assertIn("y_prob", ret) + + def test_different_input_shapes(self): + """Test that 
the model handles different volume dimensions.""" + samples = [ + { + "patient_id": "patient-0", + "visit_id": "visit-0", + "mri": torch.randn(1, 6, 8, 10).tolist(), + "label": 1, + }, + { + "patient_id": "patient-1", + "visit_id": "visit-0", + "mri": torch.randn(1, 6, 8, 10).tolist(), + "label": 0, + }, + ] + + dataset = create_sample_dataset( + samples=samples, + input_schema={"mri": "tensor"}, + output_schema={"label": "binary"}, + dataset_name="test_shape", + ) + + model = SimpleADCNN(dataset=dataset, conv_channels=(4, 8), dense_dim=8) + + train_loader = get_dataloader(dataset, batch_size=2, shuffle=False) + data_batch = next(iter(train_loader)) + + with torch.no_grad(): + ret = model(**data_batch) + + self.assertIn("loss", ret) + self.assertEqual(ret["y_prob"].shape[0], 2) + self.assertEqual(ret["logit"].shape[0], 2) + self.assertEqual(ret["loss"].dim(), 0) + + def test_rejects_multiple_features(self): + """Test that SimpleADCNN raises on datasets with multiple input features.""" + samples = [ + { + "patient_id": "patient-0", + "visit_id": "visit-0", + "mri": torch.randn(1, 4, 6, 8).tolist(), + "extra": torch.randn(1, 4, 6, 8).tolist(), + "label": 1, + }, + { + "patient_id": "patient-1", + "visit_id": "visit-0", + "mri": torch.randn(1, 4, 6, 8).tolist(), + "extra": torch.randn(1, 4, 6, 8).tolist(), + "label": 0, + }, + ] + + dataset = create_sample_dataset( + samples=samples, + input_schema={"mri": "tensor", "extra": "tensor"}, + output_schema={"label": "binary"}, + dataset_name="test_multi", + ) + + with self.assertRaises(ValueError): + SimpleADCNN(dataset=dataset, conv_channels=(4, 8), dense_dim=8) + + def test_rejects_invalid_hyperparameters(self): + """Test that invalid model hyperparameters raise clear errors.""" + invalid_cases = [ + {"conv_channels": ()}, + {"conv_channels": (4, 0, 8)}, + {"in_channels": 0}, + {"kernel_size": 2}, + {"dropout": -0.1}, + {"dropout": 1.1}, + {"dense_dim": 0}, + ] + + for kwargs in invalid_cases: + with 
self.subTest(kwargs=kwargs): + with self.assertRaises(ValueError): + SimpleADCNN(dataset=self.dataset, **kwargs) + + def test_multi_channel_input(self): + """Test that in_channels > 1 works with matching input data.""" + samples = [ + { + "patient_id": "patient-0", + "visit_id": "visit-0", + "mri": torch.randn(3, 4, 6, 8).tolist(), + "label": 1, + }, + { + "patient_id": "patient-1", + "visit_id": "visit-0", + "mri": torch.randn(3, 4, 6, 8).tolist(), + "label": 0, + }, + ] + + dataset = create_sample_dataset( + samples=samples, + input_schema={"mri": "tensor"}, + output_schema={"label": "binary"}, + dataset_name="test_multichan", + ) + + model = SimpleADCNN( + dataset=dataset, + in_channels=3, + conv_channels=(4, 8), + dense_dim=8, + ) + + train_loader = get_dataloader(dataset, batch_size=2, shuffle=False) + data_batch = next(iter(train_loader)) + + with torch.no_grad(): + ret = model(**data_batch) + + self.assertIn("loss", ret) + self.assertEqual(ret["y_prob"].shape[0], 2) + + def test_channel_mismatch_rejection(self): + """Test that 4D input raises when in_channels != 1.""" + model = SimpleADCNN( + dataset=self.dataset, + in_channels=3, + conv_channels=(4, 8), + dense_dim=8, + ) + + train_loader = get_dataloader(self.dataset, batch_size=2, shuffle=False) + data_batch = next(iter(train_loader)) + + # self.dataset has (1, 4, 6, 8) volumes — 4D after batch dim + # Model has in_channels=3, so the unsqueeze guard should reject + with self.assertRaises(ValueError): + model(**data_batch) + + def test_forward_rejects_invalid_input_shape(self): + """Test that invalid MRI tensor shapes raise clear errors.""" + with self.assertRaises(ValueError): + self.model( + mri=torch.randn(2, 4, 6), + label=torch.tensor([[1.0], [0.0]]), + ) + + def test_he_initialization(self): + """Spot-check that conv weights follow He-uniform distribution.""" + for m in self.model.modules(): + if isinstance(m, torch.nn.Conv3d): + fan_in = ( + m.in_channels + * m.kernel_size[0] + * m.kernel_size[1] + 
* m.kernel_size[2] + ) + # He-uniform bound = sqrt(6 / fan_in) + expected_bound = math.sqrt(6.0 / fan_in) + weight_max = m.weight.data.abs().max().item() + # Weights should be within the He-uniform bound + # (with a small tolerance for floating point) + self.assertLessEqual(weight_max, expected_bound + 1e-6) + # Weights should not be all zeros + self.assertGreater(m.weight.data.abs().sum().item(), 0.0) + break # only check the first conv layer + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file