From 1c049a412bc378faee51dd3eac373b7b4a8899c0 Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Mon, 2 Mar 2026 21:37:30 +0200 Subject: [PATCH 1/2] Add manage command to resync preprint dois v1 --- .../commands/resync_preprint_dois_v1.py | 129 ++++++++++++ .../test_resync_preprint_dois_v1.py | 187 ++++++++++++++++++ 2 files changed, 316 insertions(+) create mode 100644 osf/management/commands/resync_preprint_dois_v1.py create mode 100644 tests/identifiers/test_resync_preprint_dois_v1.py diff --git a/osf/management/commands/resync_preprint_dois_v1.py b/osf/management/commands/resync_preprint_dois_v1.py new file mode 100644 index 00000000000..ff8e1fd5558 --- /dev/null +++ b/osf/management/commands/resync_preprint_dois_v1.py @@ -0,0 +1,129 @@ +import logging +import time + +from django.contrib.contenttypes.models import ContentType +from django.core.management.base import BaseCommand +from django.db.models import Q + +from osf.models import Preprint, Identifier +from osf.models.base import VersionedGuidMixin +from osf.management.commands.sync_doi_metadata import async_request_identifier_update + +logger = logging.getLogger(__name__) + +RATE_LIMIT_SLEEP = 60 * 5 + +def get_preprints_needing_v1_doi(provider_id=None): + content_type = ContentType.objects.get_for_model(Preprint) + + already_versioned_ids = Identifier.objects.filter( + content_type=content_type, + category='doi', + deleted__isnull=True, + value__contains=VersionedGuidMixin.GUID_VERSION_DELIMITER, + ).values_list('object_id', flat=True) + + public_query = Q(is_published=True, is_public=True, deleted__isnull=True) + withdrawn_query = Q(date_withdrawn__isnull=False, ever_public=True) + + qs = Preprint.objects.filter( + versioned_guids__version=1, + ).filter( + public_query | withdrawn_query + ).exclude( + id__in=already_versioned_ids + ).exclude( + tags__name='qatest', + tags__system=True, + ).select_related('provider').distinct() + + if provider_id: + qs = qs.filter(provider___id=provider_id) + + return qs + + +def resync_preprint_dois_v1(dry_run=True, batch_size=0, rate_limit=100, provider_id=None): + preprints_to_update = get_preprints_needing_v1_doi(provider_id=provider_id) + + total = preprints_to_update.count() + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'{total} preprints need v1 DOI resync' + + (f' (provider={provider_id})' if provider_id else '') + ) + + if batch_size: + preprints_iterable = preprints_to_update[:batch_size] + else: + preprints_iterable = preprints_to_update.iterator() + + queued = 0 + skipped = 0 + for record_number, preprint in enumerate(preprints_iterable, 1): + if not preprint.provider.doi_prefix: + logger.warning( + f'Skipping preprint {preprint._id}: ' + f'provider {preprint.provider._id} has no DOI prefix' + ) + skipped += 1 + continue + + if dry_run: + logger.info(f'[DRY RUN] Would resync DOI for preprint {preprint._id}') + queued += 1 + continue + + if rate_limit and not record_number % rate_limit: + logger.info(f'Rate limit reached at {record_number} preprints, sleeping {RATE_LIMIT_SLEEP}s') + time.sleep(RATE_LIMIT_SLEEP) + + async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id}) + logger.info(f'Queued DOI resync for preprint {preprint._id}') + queued += 1 + + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'Done: {queued} preprints queued, {skipped} skipped (no DOI prefix)' + ) + + +class Command(BaseCommand): + def add_arguments(self, parser): + super().add_arguments(parser) + parser.add_argument( + '--dry_run', + action='store_true', + dest='dry_run', + help='Log what would be done without submitting to Crossref.', + ) + parser.add_argument( + '--batch_size', + '-b', + type=int, + default=0, + help='Maximum number of preprints to process (0 = no limit).', + ) + parser.add_argument( + '--rate_limit', + '-r', + type=int, + default=100, + help='Sleep between Crossref submissions every N preprints.', + ) + parser.add_argument( + '--provider', + '-p', + type=str, + default=None, + dest='provider_id', + help='Restrict to a single provider _id (e.g. socarxiv).', + ) + + def handle(self, *args, **options): + resync_preprint_dois_v1( + dry_run=options['dry_run'], + batch_size=options['batch_size'], + rate_limit=options['rate_limit'], + provider_id=options['provider_id'], + ) diff --git a/tests/identifiers/test_resync_preprint_dois_v1.py b/tests/identifiers/test_resync_preprint_dois_v1.py new file mode 100644 index 00000000000..05c5996fb73 --- /dev/null +++ b/tests/identifiers/test_resync_preprint_dois_v1.py @@ -0,0 +1,187 @@ +import pytest +from unittest import mock +from django.utils import timezone + +from osf.models import Preprint +from osf_tests.factories import PreprintFactory, PreprintProviderFactory +from osf.management.commands.resync_preprint_dois_v1 import ( + get_preprints_needing_v1_doi, + resync_preprint_dois_v1, +) +from website import settings + +pytestmark = pytest.mark.django_db + + +@pytest.fixture() +def provider(): + p = PreprintProviderFactory() + p.doi_prefix = '10.31219' + p.save() + return p + + +@pytest.fixture() +def preprint(provider): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + return pp + + +@pytest.fixture() +def preprint_with_v1_doi(provider): + pp = PreprintFactory(provider=provider, is_published=True) + v1_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp._id) + pp.set_identifier_values(doi=v1_doi, save=True) + return pp + + +class TestGetPreprrintsNeedingV1Doi: + + def test_includes_public_preprint_without_versioned_doi(self, preprint): + qs = get_preprints_needing_v1_doi() + assert preprint in qs + + def test_excludes_preprint_with_versioned_doi(self, preprint_with_v1_doi): + qs = get_preprints_needing_v1_doi() + assert preprint_with_v1_doi not in qs + + def test_excludes_preprint_with_no_doi_if_private(self, provider): + private_preprint = PreprintFactory(provider=provider, is_published=False) + private_preprint.is_public = False + private_preprint.save() + qs = get_preprints_needing_v1_doi() + assert private_preprint not in qs + + def test_includes_withdrawn_preprint_with_ever_public(self, provider): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + pp.date_withdrawn = timezone.now() + pp.ever_public = True + pp.save() + qs = get_preprints_needing_v1_doi() + assert pp in qs + + def test_excludes_withdrawn_preprint_never_public(self, provider): + pp = PreprintFactory(provider=provider, is_published=False) + Preprint.objects.filter(pk=pp.pk).update(date_withdrawn=timezone.now()) + qs = get_preprints_needing_v1_doi() + assert pp not in qs + + def test_excludes_version_2_preprint(self, preprint): + from tests.utils import capture_notifications + with capture_notifications(): + v2 = PreprintFactory.create_version(preprint, is_published=True, set_doi=False) + old_doi = settings.DOI_FORMAT.format(prefix=preprint.provider.doi_prefix, guid=v2.get_guid()._id) + v2.set_identifier_values(doi=old_doi, save=True) + qs = get_preprints_needing_v1_doi() + assert v2 not in qs + + def test_excludes_qatest_tagged_preprint(self, preprint): + preprint.add_system_tag('qatest') + qs = get_preprints_needing_v1_doi() + assert preprint not in qs + + def test_excludes_deleted_preprint(self, preprint): + preprint.deleted = timezone.now() + preprint.save() + qs = get_preprints_needing_v1_doi() + assert preprint not in qs + + def test_provider_filter_limits_results(self, preprint, provider): + other_provider = PreprintProviderFactory() + other_provider.doi_prefix = '10.12345' + other_provider.save() + other_preprint = PreprintFactory(provider=other_provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=other_provider.doi_prefix, guid=other_preprint.get_guid()._id) + other_preprint.set_identifier_values(doi=old_doi, save=True) + + qs = get_preprints_needing_v1_doi(provider_id=provider._id) + assert preprint in qs + assert other_preprint not in qs + + def test_preprint_with_no_doi_identifier_is_included(self, provider): + pp = PreprintFactory(provider=provider, is_published=True, set_doi=False) + qs = get_preprints_needing_v1_doi() + assert pp in qs + + +class TestResyncPreprintDoisV1: + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_dry_run_does_not_queue_tasks(self, mock_task, preprint): + resync_preprint_dois_v1(dry_run=True) + mock_task.apply_async.assert_not_called() + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_live_run_queues_task_for_each_preprint(self, mock_task, preprint): + resync_preprint_dois_v1(dry_run=False, rate_limit=0) + mock_task.apply_async.assert_called_once_with(kwargs={'preprint_id': preprint._id}) + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_batch_size_limits_processed_count(self, mock_task, provider): + preprints = [] + for _ in range(5): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + preprints.append(pp) + + resync_preprint_dois_v1(dry_run=False, batch_size=2, rate_limit=0) + assert mock_task.apply_async.call_count == 2 + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_skips_provider_without_doi_prefix(self, mock_task, provider): + no_prefix_provider = PreprintProviderFactory() + no_prefix_provider.doi_prefix = '' + no_prefix_provider.save() + pp = PreprintFactory(provider=no_prefix_provider, is_published=True) + old_doi = '10.000/old-doi' + pp.set_identifier_values(doi=old_doi, save=True) + + resync_preprint_dois_v1(dry_run=False, rate_limit=0) + queued_ids = [ + call.kwargs['kwargs']['preprint_id'] + for call in mock_task.apply_async.call_args_list + ] + assert pp._id not in queued_ids + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_provider_filter_is_applied(self, mock_task, preprint, provider): + other_provider = PreprintProviderFactory() + other_provider.doi_prefix = '10.99999' + other_provider.save() + other_pp = PreprintFactory(provider=other_provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=other_provider.doi_prefix, guid=other_pp.get_guid()._id) + other_pp.set_identifier_values(doi=old_doi, save=True) + + resync_preprint_dois_v1(dry_run=False, rate_limit=0, provider_id=provider._id) + + queued_ids = [ + call.kwargs['kwargs']['preprint_id'] + for call in mock_task.apply_async.call_args_list + ] + assert preprint._id in queued_ids + assert other_pp._id not in queued_ids + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_already_versioned_doi_is_not_queued(self, mock_task, preprint_with_v1_doi): + resync_preprint_dois_v1(dry_run=False, rate_limit=0) + queued_ids = [ + call.kwargs['kwargs']['preprint_id'] + for call in mock_task.apply_async.call_args_list + ] + assert preprint_with_v1_doi._id not in queued_ids + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.time.sleep') + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_rate_limit_triggers_sleep(self, mock_task, mock_sleep, provider): + for _ in range(3): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + + resync_preprint_dois_v1(dry_run=False, rate_limit=2) + mock_sleep.assert_called_once() From b6164e0614fc0b3360226259f64d3d8a974f80ea Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Mon, 9 Mar 2026 13:15:56 +0200 Subject: [PATCH 2/2] Update command --- .../commands/resync_preprint_dois_v1.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/osf/management/commands/resync_preprint_dois_v1.py b/osf/management/commands/resync_preprint_dois_v1.py index ff8e1fd5558..1e6d5be50af 100644 --- a/osf/management/commands/resync_preprint_dois_v1.py +++ b/osf/management/commands/resync_preprint_dois_v1.py @@ -11,8 +11,11 @@ logger = logging.getLogger(__name__) +# 5-minute pause between rate-limit windows to avoid flooding the Crossref API +# with too many deposit requests in a short period. RATE_LIMIT_SLEEP = 60 * 5 + def get_preprints_needing_v1_doi(provider_id=None): content_type = ContentType.objects.get_for_model(Preprint) @@ -43,7 +46,7 @@ def get_preprints_needing_v1_doi(provider_id=None): return qs -def resync_preprint_dois_v1(dry_run=True, batch_size=0, rate_limit=100, provider_id=None): +def resync_preprint_dois_v1(dry_run=True, batch_size=500, rate_limit=100, provider_id=None): preprints_to_update = get_preprints_needing_v1_doi(provider_id=provider_id) total = preprints_to_update.count() @@ -60,6 +63,7 @@ def resync_preprint_dois_v1(dry_run=True, batch_size=0, rate_limit=100, provider queued = 0 skipped = 0 + errored = 0 for record_number, preprint in enumerate(preprints_iterable, 1): if not preprint.provider.doi_prefix: logger.warning( @@ -78,13 +82,17 @@ def resync_preprint_dois_v1(dry_run=True, batch_size=0, rate_limit=100, provider logger.info(f'Rate limit reached at {record_number} preprints, sleeping {RATE_LIMIT_SLEEP}s') time.sleep(RATE_LIMIT_SLEEP) - async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id}) - logger.info(f'Queued DOI resync for preprint {preprint._id}') - queued += 1 + try: + async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id}) + logger.info(f'Queued DOI resync for preprint {preprint._id}') + queued += 1 + except Exception: + logger.exception(f'Failed to queue DOI resync for preprint {preprint._id}') + errored += 1 logger.info( f'{"[DRY RUN] " if dry_run else ""}' - f'Done: {queued} preprints queued, {skipped} skipped (no DOI prefix)' + f'Done: {queued} preprints queued, {skipped} skipped (no DOI prefix), {errored} errored' ) @@ -101,8 +109,12 @@ def add_arguments(self, parser): '--batch_size', '-b', type=int, - default=0, - help='Maximum number of preprints to process (0 = no limit).', + default=500, + help=( + 'Maximum number of preprints to process per run (default: 500). ' + 'The command processes the first N eligible preprints and exits; ' + 're-run the command to continue with the next batch.' + ), ) parser.add_argument( '--rate_limit',