Skip to content

Commit 8fe0ccd

Browse files
rwgk and cursoragent committed
pathfinder: add display-driver release compatibility checks
Query NVML for display-driver release metadata and use it to distinguish backward compatibility from NVIDIA's same-major minor-version compatibility. This lets guard rails follow published driver-branch thresholds instead of treating cuDriverGetVersion() as the whole driver story. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent b6f9e31 commit 8fe0ccd

5 files changed

Lines changed: 490 additions & 20 deletions

File tree

cuda_pathfinder/cuda/pathfinder/_compatibility_guard_rails.py

Lines changed: 173 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,19 @@
5151
)
5252
from cuda.pathfinder._utils.driver_info import (
5353
DriverCudaVersion,
54+
DriverReleaseVersion,
5455
QueryDriverCudaVersionError,
56+
QueryDriverReleaseVersionError,
5557
query_driver_cuda_version,
58+
query_driver_release_version,
5659
)
5760
from cuda.pathfinder._utils.toolkit_info import ReadCudaHeaderVersionError, read_cuda_header_version
5861

5962
ItemKind: TypeAlias = str
6063
PackagedWith: TypeAlias = str
6164
CtkVersionConstraintArg: TypeAlias = str | SpecifierSet | None
6265
PairwiseItemRelationKind: TypeAlias = str
66+
DriverCompatibilityKind: TypeAlias = str
6367

6468
_CTK_VERSION_RE = re.compile(r"^(?P<major>\d+)\.(?P<minor>\d+)")
6569
_CTK_VERSION_CONSTRAINT_ERROR = (
@@ -68,6 +72,13 @@
6872
)
6973
_PAIRWISE_ITEM_RELATION_NONE = "none"
7074
_PAIRWISE_ITEM_RELATION_EXACT_CTK_MATCH_REQUIRED = "exact-ctk-match-required"
75+
# Labels for the two ways a driver can be accepted for a given CTK version.
_DRIVER_COMPATIBILITY_BACKWARD = "backward-compatibility"
_DRIVER_COMPATIBILITY_MINOR_VERSION = "minor-version-compatibility"

# CTK major version -> minimum display-driver branch (the leading component of
# a release such as "535.54.03") for NVIDIA's published same-major
# minor-version compatibility. A CTK major that is absent from this table gets
# no minor-version compatibility: the driver's CUDA version must then be at
# least the CTK version.
_MIN_DRIVER_BRANCH_FOR_MINOR_VERSION_COMPATIBILITY_BY_CTK_MAJOR: dict[int, int] = {
    11: 450,
    12: 525,
    13: 580,
}
7182

7283

7384
@dataclass(frozen=True, slots=True)
@@ -76,6 +87,12 @@ class PairwiseItemRelation:
7687
reason: str | None = None
7788

7889

90+
@dataclass(frozen=True, slots=True)
91+
class DriverCompatibilityDecision:
92+
kind: DriverCompatibilityKind
93+
detail: str
94+
95+
7996
_STATIC_LIBS_PACKAGED_WITH: dict[str, PackagedWith] = {
8097
"cudadevrt": "ctk",
8198
}
@@ -446,6 +463,30 @@ def _ctk_constraint_failure_message(item: ResolvedItem, constraint: CtkVersionCo
446463
return f"{item.describe()} resolves to CTK {item.ctk_version}, which does not satisfy ctk_version{constraint}."
447464

448465

466+
def _driver_backward_compatibility_detail(driver_cuda_version: DriverCudaVersion, item: ResolvedItem) -> str:
    """Explain that the driver's reported CUDA version is at least ``item``'s CTK version.

    Returns a sentence fragment without a trailing period; the caller supplies
    the surrounding punctuation.
    """
    ctk_version = item.ctk_version
    assert ctk_version is not None
    reported_cuda = f"CUDA {driver_cuda_version.major}.{driver_cuda_version.minor}"
    return (
        "the driver satisfies backward compatibility because cuDriverGetVersion() reports "
        f"{reported_cuda}, which is not older than CTK {ctk_version}"
    )
472+
473+
474+
def _driver_minor_version_compatibility_detail(
    driver_cuda_version: DriverCudaVersion,
    driver_release_version: DriverReleaseVersion,
    item: ResolvedItem,
    *,
    required_branch: int,
) -> str:
    """Explain that the driver is accepted via same-major minor-version compatibility.

    Used when cuDriverGetVersion() reports a CUDA version older than the CTK,
    but the display-driver release meets ``required_branch``. Returns a
    sentence fragment without a trailing period.
    """
    ctk_version = item.ctk_version
    assert ctk_version is not None
    reported_cuda = f"CUDA {driver_cuda_version.major}.{driver_cuda_version.minor}"
    release_text = driver_release_version.text
    return (
        "the driver satisfies NVIDIA's same-major minor-version compatibility because "
        f"cuDriverGetVersion() reports older {reported_cuda}, "
        f"but display-driver release {release_text} meets the published CUDA "
        f"{ctk_version.major}.x minimum branch >= {required_branch}"
    )
488+
489+
449490
def _ctk_pair_mismatch_message(
450491
item1: ResolvedItem,
451492
item2: ResolvedItem,
@@ -471,8 +512,44 @@ def _driver_major_mismatch_message(driver_cuda_version: DriverCudaVersion, item:
471512
)
472513

473514

474-
def _compatible_pair_message(
515+
def _driver_cuda_version_too_old_message(driver_cuda_version: DriverCudaVersion, item: ResolvedItem) -> str:
    """Build the error text for a same-major driver that is too old for a pre-CUDA-11 CTK.

    Applies to CTK majors that have no minor-version-compatibility threshold,
    where the driver's CUDA version must be at least the CTK version.
    """
    ctk_version = item.ctk_version
    assert ctk_version is not None
    reported_cuda = f"CUDA {driver_cuda_version.major}.{driver_cuda_version.minor}"
    sentences = (
        f"cuDriverGetVersion() reports {reported_cuda}, ",
        f"but {item.describe()} requires CTK {ctk_version}. ",
        "NVIDIA's published minor-version compatibility starts with CUDA 11, so v1 requires ",
        "the driver CUDA version to be at least the CTK version for older CTK majors.",
    )
    return "".join(sentences)
523+
524+
525+
def _missing_driver_release_version_message(driver_cuda_version: DriverCudaVersion, item: ResolvedItem) -> str:
    """Build the text for the case where the display-driver release version is needed but absent.

    The driver's CUDA version is older than the CTK within the same major, so
    only the release version can decide whether minor-version compatibility
    applies.
    """
    ctk_version = item.ctk_version
    assert ctk_version is not None
    reported_cuda = f"CUDA {driver_cuda_version.major}.{driver_cuda_version.minor}"
    sentences = (
        f"cuDriverGetVersion() reports older {reported_cuda} for ",
        f"{item.describe()}, which requires CTK {ctk_version}. Determining whether NVIDIA's same-major ",
        "minor-version compatibility applies requires the display-driver release version (for example ",
        "'535.54.03' or branch '535').",
    )
    return "".join(sentences)
533+
534+
535+
def _driver_release_branch_too_old_message(
    driver_cuda_version: DriverCudaVersion,
    driver_release_version: DriverReleaseVersion,
    item: ResolvedItem,
    *,
    required_branch: int,
) -> str:
    """Build the error text for a display-driver branch below the published minimum.

    Used when the driver's CUDA version is older than the CTK within the same
    major and the release branch is lower than ``required_branch``.
    """
    ctk_version = item.ctk_version
    assert ctk_version is not None
    reported_cuda = f"CUDA {driver_cuda_version.major}.{driver_cuda_version.minor}"
    release = f"{driver_release_version.text} (branch {driver_release_version.branch})"
    threshold = f"CUDA {ctk_version.major}.x minimum branch >= {required_branch}"
    return (
        f"cuDriverGetVersion() reports older {reported_cuda}, "
        f"and display-driver release {release} "
        f"is below NVIDIA's published {threshold} "
        f"for {item.describe()}."
    )
549+
550+
551+
def _compatible_pair_message(
552+
driver_decision: DriverCompatibilityDecision,
476553
item1: ResolvedItem,
477554
item2: ResolvedItem,
478555
relation: PairwiseItemRelation,
@@ -484,13 +561,12 @@ def _compatible_pair_message(
484561
f"{item1.describe()} resolves to CTK {item1.ctk_version}, "
485562
f"{item2.describe()} resolves to CTK {item2.ctk_version}, "
486563
"and v1 does not require exact CTK lockstep for this pair. "
487-
f"Driver version {driver_cuda_version.encoded} satisfies the v1 driver guard rail."
564+
f"Separately, {driver_decision.detail}."
488565
)
489566
assert relation.reason is not None
490567
return (
491568
f"{item1.describe()} and {item2.describe()} both resolve to CTK {item1.ctk_version}. "
492-
f"{relation.reason[:1].upper() + relation.reason[1:]}, and driver version "
493-
f"{driver_cuda_version.encoded} satisfies the v1 driver guard rail."
569+
f"{relation.reason[:1].upper() + relation.reason[1:]}. Separately, {driver_decision.detail}."
494570
)
495571

496572

@@ -565,21 +641,71 @@ def _pairwise_policy_result(
565641
raise AssertionError(f"Unhandled pairwise item relation: {relation.kind!r}")
566642

567643

568-
def _driver_compatibility_result(
569-
driver_cuda_version: DriverCudaVersion, item: ResolvedItem
570-
) -> CompatibilityResult | None:
644+
def _driver_cuda_version_supports_ctk_by_backward_compatibility(
    driver_cuda_version: DriverCudaVersion,
    ctk_version: CtkVersion,
) -> bool:
    """Return True when the driver's (major, minor) CUDA version is not older than the CTK's."""
    # The major decides unless it ties; only then does the minor matter.
    if driver_cuda_version.major != ctk_version.major:
        return driver_cuda_version.major > ctk_version.major
    return driver_cuda_version.minor >= ctk_version.minor
649+
650+
651+
def _driver_compatibility_outcome(
    driver_cuda_version: DriverCudaVersion,
    item: ResolvedItem,
    *,
    driver_release_version: DriverReleaseVersion | None = None,
) -> DriverCompatibilityDecision | CompatibilityResult:
    """Decide whether the driver can run ``item``'s CTK version.

    Checks run in this order, and the first one that applies wins:

    1. Driver CUDA version >= CTK version: accepted (backward compatibility).
    2. Driver and CTK majors differ: incompatible.
    3. No minor-version threshold for the CTK major: incompatible.
    4. ``driver_release_version`` is None: insufficient metadata.
    5. Release branch >= threshold: accepted (minor-version compatibility).
    6. Otherwise: incompatible.

    Returns a ``DriverCompatibilityDecision`` when the driver is accepted, and
    a non-"compatible" ``CompatibilityResult`` otherwise.
    """
    ctk_version = item.ctk_version
    assert ctk_version is not None

    def _incompatible(message: str) -> CompatibilityResult:
        return CompatibilityResult(
            status="incompatible",
            message=message,
            error_type=DriverCtkCompatibilityError,
        )

    if _driver_cuda_version_supports_ctk_by_backward_compatibility(driver_cuda_version, ctk_version):
        backward_detail = _driver_backward_compatibility_detail(driver_cuda_version, item)
        return DriverCompatibilityDecision(kind=_DRIVER_COMPATIBILITY_BACKWARD, detail=backward_detail)

    # From here on the driver's CUDA version is older than the CTK version.
    if driver_cuda_version.major != ctk_version.major:
        return _incompatible(_driver_major_mismatch_message(driver_cuda_version, item))

    required_branch = _MIN_DRIVER_BRANCH_FOR_MINOR_VERSION_COMPATIBILITY_BY_CTK_MAJOR.get(ctk_version.major)
    if required_branch is None:
        return _incompatible(_driver_cuda_version_too_old_message(driver_cuda_version, item))

    if driver_release_version is None:
        return CompatibilityResult(
            status="insufficient_metadata",
            message=_missing_driver_release_version_message(driver_cuda_version, item),
        )

    if driver_release_version.branch >= required_branch:
        minor_detail = _driver_minor_version_compatibility_detail(
            driver_cuda_version,
            driver_release_version,
            item,
            required_branch=required_branch,
        )
        return DriverCompatibilityDecision(kind=_DRIVER_COMPATIBILITY_MINOR_VERSION, detail=minor_detail)

    return _incompatible(
        _driver_release_branch_too_old_message(
            driver_cuda_version,
            driver_release_version,
            item,
            required_branch=required_branch,
        )
    )
579701

580702

581703
def compatibility_check(
582-
driver_cuda_version: DriverCudaVersion, item1: ResolvedItem, item2: ResolvedItem
704+
driver_cuda_version: DriverCudaVersion,
705+
item1: ResolvedItem,
706+
item2: ResolvedItem,
707+
*,
708+
driver_release_version: DriverReleaseVersion | None = None,
583709
) -> CompatibilityResult:
584710
for item in (item1, item2):
585711
result = _supported_packaging_result(item)
@@ -594,13 +720,17 @@ def compatibility_check(
594720
if result is not None:
595721
return result
596722

597-
result = _driver_compatibility_result(driver_cuda_version, item1)
598-
if result is not None:
599-
return result
723+
driver_outcome = _driver_compatibility_outcome(
724+
driver_cuda_version,
725+
item1,
726+
driver_release_version=driver_release_version,
727+
)
728+
if isinstance(driver_outcome, CompatibilityResult):
729+
return driver_outcome
600730

601731
return CompatibilityResult(
602732
status="compatible",
603-
message=_compatible_pair_message(driver_cuda_version, item1, item2, relation),
733+
message=_compatible_pair_message(driver_outcome, item1, item2, relation),
604734
)
605735

606736

@@ -612,10 +742,13 @@ def __init__(
612742
*,
613743
ctk_version: CtkVersionConstraintArg = None,
614744
driver_cuda_version: DriverCudaVersion | None = None,
745+
driver_release_version: DriverReleaseVersion | None = None,
615746
) -> None:
616747
self._ctk_version_constraint = _coerce_ctk_version_constraint(ctk_version)
617748
self._configured_driver_cuda_version = driver_cuda_version
618749
self._driver_cuda_version = driver_cuda_version
750+
self._configured_driver_release_version = driver_release_version
751+
self._driver_release_version = driver_release_version
619752
self._resolved_items: list[ResolvedItem] = []
620753

621754
def _get_driver_cuda_version(self) -> DriverCudaVersion:
@@ -628,6 +761,16 @@ def _get_driver_cuda_version(self) -> DriverCudaVersion:
628761
) from exc
629762
return self._driver_cuda_version
630763

764+
def _get_driver_release_version(self) -> DriverReleaseVersion:
    """Return the display-driver release version, querying it on first use and caching it.

    Raises:
        CompatibilityInsufficientMetadataError: if the query fails; the
            original query error is chained as the cause.
    """
    cached = self._driver_release_version
    if cached is not None:
        return cached
    try:
        queried = query_driver_release_version()
    except QueryDriverReleaseVersionError as exc:
        raise CompatibilityInsufficientMetadataError(
            "Failed to query the display-driver release version needed for compatibility checks."
        ) from exc
    self._driver_release_version = queried
    return queried
773+
631774
def _enforce_supported_packaging(self, item: ResolvedItem) -> None:
632775
if item.packaged_with == "ctk":
633776
return
@@ -647,10 +790,20 @@ def _enforce_constraints(self, item: ResolvedItem) -> None:
647790
raise CompatibilityCheckError(_ctk_constraint_failure_message(item, self._ctk_version_constraint))
648791

649792
def _enforce_driver_compatibility(self, item: ResolvedItem) -> None:
    """Check ``item``'s CTK version against the installed driver.

    The display-driver release version is queried only when it can change the
    verdict: the driver's CUDA version is older than the CTK version, both
    share a major, and that major has a published minimum driver branch. On a
    major mismatch, or for a CTK major without a threshold, the verdict is
    "incompatible" whatever the release version is. Querying there would cost
    an unnecessary driver query and, if it failed, would replace the
    definitive incompatibility error with
    CompatibilityInsufficientMetadataError.

    A non-compatible verdict is handed to ``CompatibilityResult.require_compatible()``.
    """
    driver_cuda_version = self._get_driver_cuda_version()
    ctk_version = item.ctk_version
    assert ctk_version is not None
    # Mirrors the branches of _driver_compatibility_outcome that read the
    # release version; keep the two in sync.
    release_version_can_matter = (
        not _driver_cuda_version_supports_ctk_by_backward_compatibility(driver_cuda_version, ctk_version)
        and driver_cuda_version.major == ctk_version.major
        and ctk_version.major in _MIN_DRIVER_BRANCH_FOR_MINOR_VERSION_COMPATIBILITY_BY_CTK_MAJOR
    )
    driver_release_version = self._get_driver_release_version() if release_version_can_matter else None
    outcome = _driver_compatibility_outcome(
        driver_cuda_version,
        item,
        driver_release_version=driver_release_version,
    )
    if isinstance(outcome, CompatibilityResult):
        outcome.require_compatible()
654807

655808
def _enforce_pairwise_compatibility(self, prior_item: ResolvedItem, item: ResolvedItem) -> None:
656809
result = _pairwise_policy_result(prior_item, item)
@@ -664,6 +817,7 @@ def _remember(self, item: ResolvedItem) -> None:
664817

665818
def _reset_for_testing(self) -> None:
    """Forget resolved items and restore both driver versions to their configured values."""
    # Emptied in place, so the same list object stays bound to the attribute.
    self._resolved_items.clear()
    # Drops any lazily queried value; a configured value of None means the
    # next getter call queries again.
    self._driver_release_version = self._configured_driver_release_version
    self._driver_cuda_version = self._configured_driver_cuda_version
668822

669823
def _register_and_check(self, item: ResolvedItem) -> None:

0 commit comments

Comments
 (0)