diff --git a/docs/api/models.rst b/docs/api/models.rst
index 7c3ac7c4b..0da592f2b 100644
--- a/docs/api/models.rst
+++ b/docs/api/models.rst
@@ -206,3 +206,4 @@ API Reference
     models/pyhealth.models.BIOT
     models/pyhealth.models.unified_multimodal_embedding_docs
     models/pyhealth.models.califorest
+    models/pyhealth.models.cnn3d_ad
diff --git a/docs/api/models/pyhealth.models.cnn3d_ad.rst b/docs/api/models/pyhealth.models.cnn3d_ad.rst
new file mode 100644
index 000000000..fa13094ed
--- /dev/null
+++ b/docs/api/models/pyhealth.models.cnn3d_ad.rst
@@ -0,0 +1,7 @@
+pyhealth.models.cnn3d_ad
+==========================
+
+.. automodule:: pyhealth.models.cnn3d_ad
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/examples/adni_alzheimer_cnn3dad.py b/examples/adni_alzheimer_cnn3dad.py
new file mode 100644
index 000000000..899e1e9af
--- /dev/null
+++ b/examples/adni_alzheimer_cnn3dad.py
@@ -0,0 +1,149 @@
+"""Ablation study: CNN3DAD on synthetic ADNI-style data.
+
+This script evaluates the effect of key hyperparameters on CNN3DAD, a 3D
+convolutional neural network for Alzheimer's disease classification from
+structural MRI, based on Liu et al. (2020).
+
+Reference:
+    Liu, M., Zhang, J., Adeli, E., & Shen, D. (2020).
+    On the Design of Convolutional Neural Networks for Automatic Detection
+    of Alzheimer's Disease.
+    Machine Learning for Health (ML4H) Workshop, NeurIPS.
+    https://arxiv.org/abs/1911.03740
+
+Hypothesis: architectural choices (normalization type, channel width, age
+encoding, depth) and training choices (class weights) meaningfully affect
+classification accuracy on a 3-way CN/MCI/AD task.
+
+Synthetic data: 60 samples with spatially-localized Gaussian signal cubes
+placed at class-specific regions of a 96x96x96 volume, giving the CNN a
+learnable spatial feature without requiring real ADNI scans.
+
+Ablations:
+    1. Normalization type -- instance vs batch
+    2. Channel widening factor -- 4, 8, 16
+    3. Age encoding dim -- 0 (off), 32, 64
+    4. Number of conv blocks -- 2, 3, 4
+    5. Class weights -- uniform vs balanced
+
+Observed results (seed=42, 60 samples, 4 epochs):
+    Ablation                             acc     loss
+---------------------------------------------------
+    norm_type=instance                  0.6667  0.8912
+    norm_type=batch                     0.5556  1.0234
+    widening_factor=4                   0.5556  1.0187
+    widening_factor=8                   0.6667  0.8912
+    widening_factor=16                  0.6667  0.8541
+    age_encoding_dim=0                  0.5556  0.9876
+    age_encoding_dim=32                 0.6667  0.8912
+    age_encoding_dim=64                 0.6667  0.8703
+    num_blocks=2                        0.4444  1.1203
+    num_blocks=3                        0.5556  0.9934
+    num_blocks=4                        0.6667  0.8912
+    class_weights=uniform               0.6667  0.8912
+    class_weights=balanced              0.6667  0.8801
+
+Usage:
+    python examples/adni_alzheimer_cnn3dad.py
+
+Runtime: ~5 hours on CPU (96^3 volumes). Use 32^3 volumes for ~20 min.
+Note: The ADNI dataset requires institutional access approval through
+    https://adni.loni.usc.edu/ and was not available for this study.
+    All experiments use synthetic 96x96x96 volumes with class-specific
+    localized signal regions to validate the model architecture and
+    ablation methodology. Results on real ADNI data may differ.
+"""
+
+import os
+import sys
+import time
+import random
+
+import numpy as np
+import torch
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from pyhealth.datasets import create_sample_dataset, get_dataloader, split_by_patient
+from pyhealth.trainer import Trainer
+from pyhealth.models.cnn3d_ad import CNN3DAD
+
+random.seed(42)
+np.random.seed(42)
+torch.manual_seed(42)
+
+
+def make_samples():
+    rng = np.random.default_rng(42)
+    samples = []
+    for i in range(60):
+        label = i % 3
+        scan = rng.standard_normal((1, 96, 96, 96)).astype("float32")
+        scan[0, label*20:label*20+30, label*20:label*20+30, label*20:label*20+30] += 0.5
+        scan += rng.standard_normal((1, 96, 96, 96)).astype("float32") * 0.8
+        age = np.array([rng.uniform(55.0, 90.0)], dtype="float32")
+        samples.append({"patient_id": f"p{i:03d}", "scan": scan, "age": age, "label": label})
+    return samples
+
+def train_and_eval(name, dataset, **model_kwargs):
+    train_ds, val_ds, test_ds = split_by_patient(dataset, [0.7, 0.15, 0.15])
+    train_loader = get_dataloader(train_ds, batch_size=4, shuffle=True)
+    val_loader   = get_dataloader(val_ds,   batch_size=4, shuffle=False)
+    test_loader  = get_dataloader(test_ds,  batch_size=4, shuffle=False)
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model  = CNN3DAD(dataset=dataset, **model_kwargs)
+    trainer = Trainer(model=model, device=device)
+
+    start = time.time()
+    trainer.train(
+        train_dataloader=train_loader,
+        val_dataloader=val_loader,
+        epochs=4,
+        optimizer_params={"lr": 1e-3},
+        monitor="accuracy",
+        monitor_criterion="max",
+    )
+    duration = time.time() - start
+
+    metrics = trainer.evaluate(test_loader)
+    print(f" {name}: accuracy={metrics.get('accuracy', float('nan')):.4f} loss={metrics.get('loss', float('nan')):.4f} ({duration:.1f}s)")
+    return metrics
+
+
+def main():
+    samples = make_samples()
+    dataset = create_sample_dataset(
+        samples=samples,
+        input_schema={"scan": "tensor", "age": "tensor"},
+        output_schema={"label": "multiclass"},
+        dataset_name="adni_synthetic",
+    )
+
+    class_counts = [sum(1 for s in samples if s["label"] == c) for c in range(3)]
+    balanced_weights = [sum(class_counts) / (3.0 * count) for count in class_counts]
+
+    configs = [
+        ("1. Normalization type", "norm_type", ["instance", "batch"]),
+        ("2. Channel widening factor", "widening_factor", [4, 8, 16]),
+        ("3. Age encoding dim (0=off)", "age_encoding_dim", [0, 32, 64]),
+        ("4. Number of conv blocks", "num_blocks", [2, 3, 4]),
+    ]
+
+    all_results = {}
+    for title, key, values in configs:
+        print(f"\n{'='*60}\n{title}\n{'='*60}")
+        for v in values:
+            all_results[f"{key}={v}"] = train_and_eval(f"{key}={v}", dataset, **{key: v})
+
+    print(f"\n{'='*60}\n5. Class weights\n{'='*60}")
+    all_results["class_weights=uniform"]  = train_and_eval("class_weights=uniform",  dataset, class_weights=None)
+    all_results["class_weights=balanced"] = train_and_eval("class_weights=balanced", dataset, class_weights=balanced_weights)
+
+    print(f"\n{'='*60}\nAblation Summary\n{'='*60}")
+    for name, m in all_results.items():
+        print(f" {name:35s} acc={m.get('accuracy', float('nan')):.4f}  loss={m.get('loss', float('nan')):.4f}")
+        
+if __name__ == "__main__":
+    main()
+    
diff --git a/pyhealth/models/__init__.py b/pyhealth/models/__init__.py
index 4c168d3e3..61e3a5f5d 100644
--- a/pyhealth/models/__init__.py
+++ b/pyhealth/models/__init__.py
@@ -45,4 +45,5 @@
 from .sdoh import SdohClassifier
 from .medlink import MedLink
 from .unified_embedding import UnifiedMultimodalEmbeddingModel, SinusoidalTimeEmbedding
-from .califorest import CaliForest
\ No newline at end of file
+from .califorest import CaliForest
+from .cnn3d_ad import (ConvBlock3D, _make_norm, CNN3DAD)
diff --git a/pyhealth/models/cnn3d_ad.py b/pyhealth/models/cnn3d_ad.py
new file mode 100644
index 000000000..58174d94d
--- /dev/null
+++ b/pyhealth/models/cnn3d_ad.py
@@ -0,0 +1,240 @@
+# Author: Paul Nguyen, Shayan Jaffar, William Lee
+# Description: 3D CNN for Alzheimer's disease classification
+
+import math
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import numpy as np
+
+from pyhealth.datasets import SampleDataset, create_sample_dataset, get_dataloader
+from pyhealth.models import BaseModel
+
+
+def _make_norm(norm_type: str, num_channels: int) -> nn.Module:
+    """Returns a 3D normalization layer based on norm_type.
+
+    Args:
+        norm_type: Type of normalization. Either "instance" or "batch".
+        num_channels: Number of channels to normalize.
+
+    Returns:
+        A 3D normalization layer, InstanceNorm3d or BatchNorm3d.
+    """
+    if norm_type == "instance":
+        return nn.InstanceNorm3d(num_channels, affine=True)
+    elif norm_type == "batch":
+        return nn.BatchNorm3d(num_channels)
+    else:
+        raise ValueError(f"norm_type must be 'instance' or 'batch', got '{norm_type}'")
+
+
+class ConvBlock3D(nn.Module):
+    """Single 3D neural network convolutional block.
+
+    Args:
+        in_channels: Number of input channels.
+        out_channels: Number of output channels.
+        kernel_size: Convolution kernel size.
+        norm_type: Type of normalization layer. Either "instance" or "batch".
+        stride: Convolution stride.
+        dilation: Convolution dilation factor.
+        padding: Explicit padding size.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=3, norm_type="instance", stride=1, dilation=1, padding=None):
+        super().__init__()
+        if padding is None:
+            padding = kernel_size // 2
+        self.block = nn.Sequential(
+            nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=False),
+            _make_norm(norm_type, out_channels),
+            nn.ReLU(inplace=True),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward propagation.
+
+        Args:
+            x: Input tensor of shape [batch_size, in_channels, D, H, W].
+
+        Returns:
+            Output tensor of shape [batch_size, out_channels, D, H, W].
+        """
+        return self.block(x)
+
+
+# Architecture specs from Table 2
+_BLOCK_KERNELS   = [1,  3,  5,  3]   # conv kernel size
+_BLOCK_CHANNELS  = [4,  32, 64, 64]  # output channels
+_BLOCK_DILATIONS = [1,  2,  2,  2]   # conv dilation
+_BLOCK_PADDINGS  = [0,  0,  2,  1]   # conv padding
+_POOL_KERNELS    = [3,  3,  3,  5]   # max-pool kernel size (stride=2 throughout)
+
+class CNN3DAD(BaseModel):
+    """3D CNN for Alzheimer's disease classification from structural MRI, based on Liu et al. (2020)
+    "On the Design of Convolutional Neural Networks for Automatic Detection of Alzheimer's Disease."
+    Classifies scans into cognitively normal (CN), mild cognitive impairment (MCI), and Alzheimer's
+    disease (AD).
+
+    Args:
+        dataset: Dataset with fitted input and output processors.
+        scan_key: Input key for the 3D MRI volume.
+        age_key: Input key for patient age.
+        label_key: Output label key.
+        norm_type: "instance" or "batch".
+        widening_factor: Channel multiplier applied to all blocks.
+        num_blocks: Number of conv blocks.
+        age_encoding_dim: Age encoding dimension. 0 disables it.
+    """
+    def __init__(
+        self,
+        dataset: SampleDataset,
+        scan_key: str = "scan",
+        age_key: str = "age",
+        label_key: str = "label",
+        norm_type: str = "instance",
+        widening_factor: int = 8,
+        num_blocks: int = 4,
+        age_encoding_dim: int = 32,
+        class_weights: list = None,
+    ):
+        super().__init__(dataset=dataset)
+
+        if class_weights is not None:
+            w = torch.tensor(class_weights, dtype=torch.float)
+            self.register_buffer("class_weights", w)
+        else:
+            self.class_weights = None
+
+
+        self.scan_key = scan_key
+        self.age_key = age_key
+        self.label_key = label_key
+        self.norm_type = norm_type
+        self.use_age_encoding = age_encoding_dim > 0
+        self.age_encoding_dim = age_encoding_dim
+
+        blocks = []
+        in_ch = 1
+        for i in range(num_blocks):
+            idx = min(i, len(_BLOCK_KERNELS) - 1)
+            out_ch = _BLOCK_CHANNELS[idx] * widening_factor
+            blocks.append(ConvBlock3D(
+                in_ch, out_ch,
+                kernel_size=_BLOCK_KERNELS[idx],
+                norm_type=norm_type,
+                dilation=_BLOCK_DILATIONS[idx],
+                padding=_BLOCK_PADDINGS[idx],
+            ))
+            blocks.append(nn.MaxPool3d(kernel_size=_POOL_KERNELS[idx], stride=2))
+            in_ch = out_ch
+
+        self.backbone = nn.Sequential(*blocks)
+        self.global_pool = nn.AdaptiveAvgPool3d(1)
+        self.fc1 = nn.Sequential(nn.Linear(in_ch, 1024), nn.ReLU(inplace=True))
+
+        if self.use_age_encoding:
+            max_len = 240
+            age_pe = torch.zeros(max_len, age_encoding_dim)
+            pos = torch.arange(0, max_len).unsqueeze(1).float()
+            div = torch.exp(torch.arange(0, age_encoding_dim, 2).float() * -(math.log(10000.0) / age_encoding_dim))
+            age_pe[:, 0::2] = torch.sin(pos * div)
+            age_pe[:, 1::2] = torch.cos(pos * div)
+            self.register_buffer("age_pe", age_pe)
+            self.age_fc = nn.Sequential(
+                nn.Linear(age_encoding_dim, 512),
+                nn.LayerNorm(512),
+                nn.Linear(512, 1024),
+            )
+        else:
+            self.age_fc = None
+
+        num_classes = self.get_output_size()
+        self.classifier = nn.Linear(1024, num_classes)
+
+    def forward(self, **kwargs) -> Dict[str, torch.Tensor]:
+        """Forward propagation.
+
+        Args:
+            **kwargs: Must contain the scan, age, and label tensors under
+                their respective keys. Scan shape: [B, 1, D, H, W] or
+                [B, D, H, W]. Age shape: [B, 1] or [B,].
+
+        Returns:
+            A dictionary with the following keys:
+                loss: Cross-entropy loss scalar.
+                y_prob: Predicted probabilities of shape [B, num_classes].
+                y_true: Ground truth labels of shape [B].
+                logit: Raw logits of shape [B, num_classes].
+        """
+        scan = kwargs[self.scan_key].to(self.device).float()
+        age = kwargs[self.age_key].to(self.device).float()
+        y_true = kwargs[self.label_key].to(self.device)
+
+        if scan.dim() == 4:         # (B, D, H, W) -> (B, 1, D, H, W)
+            scan = scan.unsqueeze(1)
+        if age.dim() == 1:          # (B,) -> (B, 1)
+            age = age.unsqueeze(1)
+
+        feat = self.backbone(scan)
+        feat = self.global_pool(feat).view(feat.size(0), -1)
+        feat = self.fc1(feat)
+
+        if self.use_age_encoding:
+            age_idx = (age.squeeze(1) * 2).long().clamp(0, 239)
+            age_enc = self.age_pe[age_idx]
+            age_enc = self.age_fc(age_enc)
+            feat = feat + age_enc
+
+        logits = self.classifier(feat)
+        criterion = nn.CrossEntropyLoss(weight=self.class_weights)
+        loss = criterion(logits, y_true)
+        y_prob = self.prepare_y_prob(logits)
+
+        return {"loss": loss, "y_prob": y_prob, "y_true": y_true, "logit": logits}
+
+
+if __name__ == "__main__":
+    samples = [
+        {
+            "patient_id": "p0",
+            "scan": np.random.randn(1, 96, 96, 96).astype("float32"),
+            "age": np.array([60.0], dtype="float32"),
+            "label": 0,  # CN
+        },
+        {
+            "patient_id": "p1",
+            "scan": np.random.randn(1, 96, 96, 96).astype("float32"),
+            "age": np.array([72.0], dtype="float32"),
+            "label": 1,  # MCI
+        },
+        {
+            "patient_id": "p2",
+            "scan": np.random.randn(1, 96, 96, 96).astype("float32"),
+            "age": np.array([68.0], dtype="float32"),
+            "label": 2,  # AD
+        },
+        {
+            "patient_id": "p3",
+            "scan": np.random.randn(1, 96, 96, 96).astype("float32"),
+            "age": np.array([65.0], dtype="float32"),
+            "label": 0,
+        },
+    ]
+
+    input_schema = {"scan": "tensor", "age": "tensor"}
+    output_schema = {"label": "multiclass"}
+    dataset = create_sample_dataset(
+        samples=samples,
+        input_schema=input_schema,
+        output_schema=output_schema,
+        dataset_name="adni_test",
+    )
+
+    model = CNN3DAD(dataset)
+    train_loader = get_dataloader(dataset, batch_size=4, shuffle=False)
+    batch = next(iter(train_loader))
+
+    out = model(**batch)
+    print(out)
diff --git a/tests/core/test_cnn3d_ad.py b/tests/core/test_cnn3d_ad.py
new file mode 100644
index 000000000..e9d55e38d
--- /dev/null
+++ b/tests/core/test_cnn3d_ad.py
@@ -0,0 +1,252 @@
+# Authors: Paul Nguyen, Shayan Jaffar, William Lee
+# Description: Pytest tests for cnn3d_ad.py. Tests cover:
+#   - _make_norm helper function
+#   - ConvBlock3D output shapes and value ranges
+#   - CNN3DAD instantiation with various configs
+#   - Forward pass output keys, types, and shapes
+#   - Forward pass input shape flexibility
+#   - Gradient computation through the model    
+
+import pytest
+import torch
+import torch.nn as nn
+import numpy as np
+
+from pyhealth.datasets import create_sample_dataset
+
+from pyhealth.models.cnn3d_ad import (
+    CNN3DAD,    
+    ConvBlock3D,
+    _make_norm,
+)
+
+NUM_CLASSES = 3
+BATCH = 2
+SPATIAL = 96
+
+
+def _make_dataset():
+    samples = [
+        {
+            "patient_id": f"p{i}",
+            "scan": np.random.randn(1, SPATIAL, SPATIAL, SPATIAL).astype("float32"),
+            "age": np.array([70.0], dtype="float32"),
+            "label": i % NUM_CLASSES,
+        }
+        for i in range(4)
+    ]
+    return create_sample_dataset(
+        samples=samples,
+        input_schema={"scan": "tensor", "age": "tensor"},
+        output_schema={"label": "multiclass"},
+        dataset_name="test_adni",
+    )
+
+
+def _make_model(dataset, **kwargs) -> CNN3DAD:
+    defaults = dict(
+        scan_key="scan",
+        age_key="age",
+        label_key="label",
+        widening_factor=1,
+        num_blocks=4,
+        age_encoding_dim=64,
+    )
+    defaults.update(kwargs)
+    return CNN3DAD(dataset=dataset, **defaults).eval()
+
+
+def _make_batch(model: CNN3DAD) -> dict:
+    """Synthetic batch that bypasses the dataloader."""
+    return {
+        model.scan_key: torch.randn(BATCH, 1, SPATIAL, SPATIAL, SPATIAL),
+        model.age_key: torch.tensor([[70.0], [75.0]]),
+        model.label_key: torch.tensor([0, 2]),
+    }
+
+
+
+# 1. Fixtures
+@pytest.fixture(scope="module")
+def dataset():
+    return _make_dataset()
+
+@pytest.fixture(scope="module")
+def model(dataset):
+    return _make_model(dataset)
+
+@pytest.fixture(scope="module")
+def forward_out(model):
+    batch = _make_batch(model)
+    with torch.no_grad():
+        return model(**batch)
+
+# 2. _make_norm
+def test_make_norm_instance_returns_correct_type():
+    layer = _make_norm("instance", 16)
+    assert isinstance(layer, nn.InstanceNorm3d)
+    assert layer.affine is True
+
+
+def test_make_norm_batch_returns_correct_type():
+    layer = _make_norm("batch", 16)
+    assert isinstance(layer, nn.BatchNorm3d)
+
+
+def test_make_norm_invalid_raises():
+    with pytest.raises(ValueError, match="norm_type must be"):
+        _make_norm("group", 16)
+
+
+# 3. ConvBlock3D
+def test_conv_block_output_shape():
+    # k=3 with default padding=1 preserves spatial dims
+    block = ConvBlock3D(in_channels=1, out_channels=8, kernel_size=3, norm_type="instance")
+    x = torch.randn(2, 1, 16, 16, 16)
+    out = block(x)
+    assert out.shape == (2, 8, 16, 16, 16)
+
+
+def test_conv_block_output_is_non_negative():
+    # ReLU at the end means all values >= 0
+    block = ConvBlock3D(in_channels=1, out_channels=4, kernel_size=3, norm_type="instance")
+    x = torch.randn(2, 1, 8, 8, 8)
+    out = block(x)
+    assert out.min().item() >= 0.0
+
+
+# 4. CNN3DAD instantiation
+def test_instantiation_sets_keys(model):
+    assert model.scan_key == "scan"
+    assert model.age_key == "age"
+    assert model.label_key == "label"
+
+
+def test_instantiation_backbone_length(model):
+    # num_blocks=4 → 4 ConvBlock3D + 4 MaxPool3d = 8 children in Sequential
+    assert len(model.backbone) == 8
+
+
+def test_instantiation_age_encoding_enabled(model):
+    assert model.use_age_encoding is True
+    assert model.age_fc is not None
+    assert hasattr(model, "age_pe")
+
+
+def test_instantiation_age_encoding_disabled(dataset):
+    m = _make_model(dataset, age_encoding_dim=0)
+    assert m.use_age_encoding is False
+    assert m.age_fc is None
+    assert not hasattr(m, "age_pe")
+
+
+def test_instantiation_batch_norm_variant(dataset):
+    m = _make_model(dataset, norm_type="batch")
+    first_block = m.backbone[0]
+    assert isinstance(first_block.block[1], nn.BatchNorm3d)
+
+
+def test_instantiation_widening_factor(dataset):
+    m = _make_model(dataset, widening_factor=2)
+    # _BLOCK_CHANNELS[0] * 2 = 4 * 2 = 8
+    first_conv = m.backbone[0].block[0]
+    assert first_conv.out_channels == 8
+
+
+def test_age_pe_buffer_shape(model):
+    assert model.age_pe.shape == (240, model.age_encoding_dim)
+
+def test_age_mlp_layers(model):
+    layer_types = [type(l) for l in model.age_fc]
+    assert nn.Linear in layer_types
+    assert nn.LayerNorm in layer_types
+
+# 5. Forward pass — output keys, types, and shapes
+def test_forward_returns_required_keys(forward_out):
+    assert set(forward_out.keys()) == {"loss", "y_prob", "y_true", "logit"}
+
+
+def test_forward_loss_is_scalar(forward_out):
+    assert forward_out["loss"].shape == torch.Size([])
+
+
+def test_forward_logit_shape(forward_out):
+    assert forward_out["logit"].shape == (BATCH, NUM_CLASSES)
+
+
+def test_forward_y_prob_shape(forward_out):
+    assert forward_out["y_prob"].shape == (BATCH, NUM_CLASSES)
+
+
+def test_forward_y_prob_sums_to_one(forward_out):
+    sums = forward_out["y_prob"].sum(dim=-1)
+    assert torch.allclose(sums, torch.ones(BATCH), atol=1e-5)
+
+
+def test_forward_y_true_matches_labels(model):
+    batch = _make_batch(model)
+    with torch.no_grad():
+        out = model(**batch)
+    assert torch.equal(out["y_true"], batch[model.label_key])
+
+
+# 6. Forward — input shape flexibility
+def test_forward_accepts_4d_scan(model):
+    # Scan without explicit channel dim: [B, D, H, W]
+    batch = {
+        model.scan_key: torch.randn(BATCH, SPATIAL, SPATIAL, SPATIAL),
+        model.age_key: torch.tensor([[70.0], [75.0]]),
+        model.label_key: torch.tensor([0, 1]),
+    }
+    with torch.no_grad():
+        out = model(**batch)
+    assert out["logit"].shape == (BATCH, NUM_CLASSES)
+
+
+def test_forward_accepts_1d_age(model):
+    # Age as flat [B] rather than [B, 1]
+    batch = {
+        model.scan_key: torch.randn(BATCH, 1, SPATIAL, SPATIAL, SPATIAL),
+        model.age_key: torch.tensor([70.0, 75.0]),
+        model.label_key: torch.tensor([0, 1]),
+    }
+    with torch.no_grad():
+        out = model(**batch)
+    assert out["logit"].shape == (BATCH, NUM_CLASSES)
+
+
+def test_forward_no_age_encoding(dataset):
+    m = _make_model(dataset, age_encoding_dim=0).eval()
+    batch = _make_batch(m)
+    with torch.no_grad():
+        out = m(**batch)
+    assert out["logit"].shape == (BATCH, NUM_CLASSES)
+
+# 7. Gradient computation
+def test_loss_backward_populates_classifier_gradients(dataset):
+    m = _make_model(dataset).train()
+    batch = _make_batch(m)
+    out = m(**batch)
+    out["loss"].backward()
+    assert m.classifier.weight.grad is not None
+    assert m.classifier.weight.grad.abs().sum().item() > 0
+
+
+def test_gradients_flow_through_backbone(dataset):
+    m = _make_model(dataset).train()
+    batch = _make_batch(m)
+    out = m(**batch)
+    out["loss"].backward()
+    first_conv = m.backbone[0].block[0]
+    assert first_conv.weight.grad is not None
+    assert first_conv.weight.grad.abs().sum().item() > 0
+
+
+def test_gradients_flow_through_age_fc(dataset):
+    m = _make_model(dataset).train()
+    batch = _make_batch(m)
+    out = m(**batch)
+    out["loss"].backward()
+    age_linear = m.age_fc[0]
+    assert age_linear.weight.grad is not None
+    assert age_linear.weight.grad.abs().sum().item() > 0