From 6a9c7c03964f89d720db10fab404dae236b5a7df Mon Sep 17 00:00:00 2001 From: ziad hany Date: Sat, 21 Mar 2026 15:53:05 +0200 Subject: [PATCH] Create a pipeline to archive_urls Update the pipeline to use https://web.archive.org/web/ endpoint no complex logic Add a test Signed-off-by: ziad hany --- vulnerabilities/improvers/__init__.py | 2 + .../0120_advisoryreference_archive_url.py | 20 ++++++ vulnerabilities/models.py | 6 ++ .../pipelines/v2_improvers/archive_urls.py | 63 +++++++++++++++++++ .../v2_improvers/test_archive_urls.py | 37 +++++++++++ 5 files changed, 128 insertions(+) create mode 100644 vulnerabilities/migrations/0120_advisoryreference_archive_url.py create mode 100644 vulnerabilities/pipelines/v2_improvers/archive_urls.py create mode 100644 vulnerabilities/tests/pipelines/v2_improvers/test_archive_urls.py diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py index 11fa5126a..e84723f22 100644 --- a/vulnerabilities/improvers/__init__.py +++ b/vulnerabilities/improvers/__init__.py @@ -18,6 +18,7 @@ from vulnerabilities.pipelines import flag_ghost_packages from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline from vulnerabilities.pipelines import remove_duplicate_advisories +from vulnerabilities.pipelines.v2_improvers import archive_urls from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2 from vulnerabilities.pipelines.v2_improvers import ( @@ -71,6 +72,7 @@ unfurl_version_range_v2.UnfurlVersionRangePipeline, collect_ssvc_trees.CollectSSVCPipeline, relate_severities.RelateSeveritiesPipeline, + archive_urls.ArchiveImproverPipeline, group_advisories_for_packages.GroupAdvisoriesForPackages, ] ) diff --git a/vulnerabilities/migrations/0120_advisoryreference_archive_url.py b/vulnerabilities/migrations/0120_advisoryreference_archive_url.py new file mode 100644 index 000000000..6b8b3e989 
--- /dev/null
+++ b/vulnerabilities/migrations/0120_advisoryreference_archive_url.py
@@ -0,0 +1,26 @@
+# Generated by Django 5.2.11 on 2026-04-07 15:59
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the optional ``archive_url`` column to ``AdvisoryReference``."""
+
+    dependencies = [
+        ("vulnerabilities", "0119_remove_advisoryset_identifiers_and_more"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="advisoryreference",
+            name="archive_url",
+            # null=True permits NULL in the database; blank=True makes model and form
+            # validation accept the empty value as well, so the two stay consistent.
+            field=models.URLField(
+                blank=True,
+                help_text="URL to the backup vulnerability reference",
+                max_length=1024,
+                null=True,
+            ),
+        ),
+    ]
diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py
index 45d8acf55..135b90733 100644
--- a/vulnerabilities/models.py
+++ b/vulnerabilities/models.py
@@ -2675,6 +2675,14 @@ class AdvisoryReference(models.Model):
         help_text="URL to the vulnerability reference",
     )
 
+    # Optional: blank=True keeps model/form validation in step with the nullable column.
+    archive_url = models.URLField(
+        max_length=1024,
+        null=True,
+        blank=True,
+        help_text="URL to the backup vulnerability reference",
+    )
+
     ADVISORY = "advisory"
     EXPLOIT = "exploit"
     COMMIT = "commit"
diff --git a/vulnerabilities/pipelines/v2_improvers/archive_urls.py b/vulnerabilities/pipelines/v2_improvers/archive_urls.py
new file mode 100644
index 000000000..6337a18ca
--- /dev/null
+++ b/vulnerabilities/pipelines/v2_improvers/archive_urls.py
@@ -0,0 +1,63 @@
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import time
+
+import requests
+
+from vulnerabilities.models import AdvisoryReference
+from vulnerabilities.pipelines import VulnerableCodePipeline
+
+
+class ArchiveImproverPipeline(VulnerableCodePipeline):
+    """Record a Wayback Machine archive URL for AdvisoryReferences that have none yet."""
+
+    pipeline_id = "archive_improver_pipeline"
+    # Seconds: cap on a single HTTP request, and pause after every lookup attempt.
+    REQUEST_TIMEOUT = 30
+    REQUEST_DELAY = 30
+
+    @classmethod
+    def steps(cls):
+        return (cls.archive_urls,)
+
+    def archive_urls(self):
+        """Store an archive URL on each unprocessed reference, or flag it as NO_ARCHIVE."""
+        # NULL never equals "NO_ARCHIVE", so the NULL filter alone selects unprocessed rows.
+        pending = AdvisoryReference.objects.filter(archive_url__isnull=True).only("id", "url")
+        for advisory_ref in pending:
+            url = advisory_ref.url
+            if not url or not url.startswith(("http://", "https://")):
+                continue
+
+            # NOTE(review): any failed lookup, even a network error, is flagged and not retried.
+            archive_url = self.get_archival(url)
+            if archive_url:
+                self.log(f"Found Archived Reference URL: {archive_url}")
+            else:
+                self.log(f"URL unreachable or returned no archive url: {url}")
+            AdvisoryReference.objects.filter(id=advisory_ref.id).update(
+                archive_url=archive_url or "NO_ARCHIVE"
+            )
+
+    def get_archival(self, url):
+        """Return the final URL of an HTTP 200 reply from web.archive.org for url, else None."""
+        self.log(f"Searching for archive URL for this Reference URL: {url}")
+        lookup_url = f"https://web.archive.org/web/{url}"
+        try:
+            # Without a timeout, one stalled connection would block the pipeline forever.
+            response = requests.get(lookup_url, allow_redirects=True, timeout=self.REQUEST_TIMEOUT)
+            if response.status_code == 200:
+                return response.url
+            if response.status_code == 403:
+                self.log(f"Wayback Machine permission denied for '{url}'.")
+        except requests.RequestException as e:
+            self.log(f"Error checking existing archival: {e}")
+        finally:
+            time.sleep(self.REQUEST_DELAY)  # throttle every attempt, failed ones included
+        return None
diff --git a/vulnerabilities/tests/pipelines/v2_improvers/test_archive_urls.py b/vulnerabilities/tests/pipelines/v2_improvers/test_archive_urls.py
new file mode 100644
index 000000000..e7e38c5d7
--- /dev/null
+++ b/vulnerabilities/tests/pipelines/v2_improvers/test_archive_urls.py
@@ -0,0 +1,37 @@
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from vulnerabilities.models import AdvisoryReference
+from vulnerabilities.pipelines.v2_improvers.archive_urls import ArchiveImproverPipeline
+
+
+@pytest.mark.django_db
+def test_archive_urls_pipeline(monkeypatch):
+    """A 200 reply from the stubbed archive lookup is saved as the reference's archive_url."""
+    module = "vulnerabilities.pipelines.v2_improvers.archive_urls"
+    snapshot_url = "https://web.archive.org/web/20250519082420/https://example.com"
+    reference = AdvisoryReference.objects.create(url="https://example.com", archive_url=None)
+
+    # Stub out the HTTP call and the throttling sleep: no network access, no delay.
+    mock_get = MagicMock(return_value=MagicMock(status_code=200, url=snapshot_url))
+    mock_sleep = MagicMock()
+    monkeypatch.setattr(f"{module}.requests.get", mock_get)
+    monkeypatch.setattr(f"{module}.time.sleep", mock_sleep)
+
+    pipeline = ArchiveImproverPipeline()
+    pipeline.archive_urls()
+
+    # Exactly one lookup (and one pause) for the single pending reference.
+    mock_get.assert_called_once()
+    mock_sleep.assert_called_once()
+    reference.refresh_from_db()
+    assert reference.archive_url == snapshot_url