From 626b62ec1124cb27e8f5807aadd7bca5d4899dd0 Mon Sep 17 00:00:00 2001 From: Fitz Elliott Date: Thu, 26 Feb 2026 14:28:39 -0500 Subject: [PATCH 1/4] [ENG-10054] feature/ror-migration (#11610) * feat(osf): script to migrate Crossref Funder IDs to ROR IDs * feat(osf): Fix fot the script to migrate Crossref Funder IDs to ROR IDs * feat(osf): Update OSF metadata model code and tests for ROR funder identifier support * feat(osf): Add DataCite client tests for ROR funder identifier support * feat(osf): update migration script to remove unmapped crossref funders * add another stat to the migration script --------- Co-authored-by: Andriy Sheredko --- .../commands/migrate_funder_ids_to_ror.py | 349 +++++++++++++++ osf/metadata/schemas/datacite.json | 4 + .../datacite/datacite_tree_walker.py | 16 +- .../test_migrate_funder_ids_to_ror.py | 415 ++++++++++++++++++ .../expected_metadata_files/file_full.turtle | 8 +- .../preprint_full.turtle | 8 +- .../project_full.datacite.json | 11 +- .../project_full.datacite.xml | 6 +- .../project_full.turtle | 8 +- .../registration_full.turtle | 8 +- osf_tests/metadata/test_osf_gathering.py | 10 + .../metadata/test_serialized_metadata.py | 6 +- tests/identifiers/test_datacite.py | 147 ++++++- 13 files changed, 962 insertions(+), 34 deletions(-) create mode 100644 osf/management/commands/migrate_funder_ids_to_ror.py create mode 100644 osf_tests/management_commands/test_migrate_funder_ids_to_ror.py diff --git a/osf/management/commands/migrate_funder_ids_to_ror.py b/osf/management/commands/migrate_funder_ids_to_ror.py new file mode 100644 index 00000000000..64eb1721d6e --- /dev/null +++ b/osf/management/commands/migrate_funder_ids_to_ror.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python3 +""" +Management command to migrate Crossref Funder IDs to ROR IDs. + +This script reads a CSV mapping file and updates all GuidMetadataRecord entries +that have funding_info with Crossref Funder IDs, converting them to ROR IDs. 
+ +Usage: + # Dry run (recommended first) + python manage.py migrate_funder_ids_to_ror --csv-file /path/to/mapping.csv --dry-run + + # Actual migration + python manage.py migrate_funder_ids_to_ror --csv-file /path/to/mapping.csv + +CSV Format Expected (tab or comma separated): + Funder Name, ror ID, ROR name, Crossref DOI, Funder ID + Example: + National Science Foundation, https://ror.org/021nxhr62, National Science Foundation, http://dx.doi.org/10.13039/100000001, 100000001 +""" +import csv +import logging +import re + +from django.core.management.base import BaseCommand +from django.db import transaction + +from osf.models import GuidMetadataRecord + + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = 'Migrate Crossref Funder IDs to ROR IDs in GuidMetadataRecord.funding_info' + + def add_arguments(self, parser): + parser.add_argument( + '--csv-file', + type=str, + required=True, + help='Path to the CSV file containing the Crossref to ROR mapping.', + ) + parser.add_argument( + '--dry-run', + action='store_true', + dest='dry_run', + help='Run without making any changes to the database.', + ) + parser.add_argument( + '--batch-size', + type=int, + default=1000, + help='Number of records to process in each batch (default: 1000).', + ) + parser.add_argument( + '--update-funder-name', + action='store_true', + dest='update_funder_name', + help='Also update funder_name to the ROR name from the mapping.', + ) + parser.add_argument( + '--skip-reindex', + action='store_true', + dest='skip_reindex', + help='Skip triggering SHARE/DataCite re-indexing after migration. 
' + 'Use this if you plan to run recatalog_metadata separately.', + ) + + def handle(self, *args, **options): + csv_file = options['csv_file'] + dry_run = options['dry_run'] + batch_size = options['batch_size'] + update_funder_name = options['update_funder_name'] + reindex = not options['skip_reindex'] + + if dry_run: + self.stdout.write(self.style.WARNING('[DRY RUN] No changes will be made to the database.')) + + if not reindex: + self.stdout.write(self.style.WARNING('Re-indexing is disabled. Run recatalog_metadata after migration.')) + + # Load the mapping + mapping = self.load_mapping(csv_file) + if not mapping: + self.stdout.write(self.style.ERROR('No valid mappings found in CSV file.')) + return + + self.stdout.write(f'Loaded {len(mapping)} Crossref to ROR mappings.') + + # Find and update records + stats = self.migrate_records(mapping, dry_run, batch_size, update_funder_name, reindex) + + # Print summary + self.stdout.write('\n' + '=' * 60) + self.stdout.write(self.style.SUCCESS('Migration Summary:')) + self.stdout.write(f" Records scanned: {stats['scanned']}") + self.stdout.write(f" Records updated: {stats['updated']}") + self.stdout.write(f" Records re-indexed: {stats['reindexed']}") + self.stdout.write(f" Funders migrated: {stats['funders_migrated']}") + self.stdout.write(f" Unmapped funders removed: {stats['not_in_mapping']}") + self.stdout.write(f" Unique funders not in mapping: {len(stats['unmapped_ids'])}") + if stats['errors']: + self.stdout.write(self.style.ERROR(f" Errors: {stats['errors']}")) + + if stats['unmapped_ids']: + self.stdout.write('\nUnmapped Crossref Funder IDs (not in CSV):') + for funder_id in sorted(stats['unmapped_ids'])[:50]: # Show first 50 + self.stdout.write(f' - {funder_id}') + if len(stats['unmapped_ids']) > 50: + self.stdout.write(f' ... and {len(stats["unmapped_ids"]) - 50} more') + + def load_mapping(self, csv_file): + """Load the Crossref to ROR mapping from CSV file. 
+ + Returns a dict mapping various forms of Crossref ID to ROR info: + { + '100000001': {'ror_id': 'https://ror.org/021nxhr62', 'ror_name': 'National Science Foundation'}, + 'http://dx.doi.org/10.13039/100000001': {...}, + 'https://doi.org/10.13039/100000001': {...}, + ... + } + """ + mapping = {} + + try: + with open(csv_file, 'r', encoding='utf-8-sig') as f: + # Try to detect delimiter + sample = f.read(2048) + f.seek(0) + if '\t' in sample: + delimiter = '\t' + else: + delimiter = ',' + + reader = csv.DictReader(f, delimiter=delimiter) + + # Normalize column names (handle various formats) + for row in reader: + # Try to find the relevant columns + ror_id = None + ror_name = None + crossref_doi = None + funder_id = None + + for key, value in row.items(): + if not key: + continue + key_lower = key.lower().strip() + + if 'ror' in key_lower and 'id' in key_lower and 'ror_name' not in key_lower: + ror_id = value.strip() if value else None + elif 'ror' in key_lower and 'name' in key_lower: + ror_name = value.strip() if value else None + elif 'crossref' in key_lower and 'doi' in key_lower: + crossref_doi = value.strip() if value else None + elif key_lower == 'funder id' or key_lower == 'funder_id': + funder_id = value.strip() if value else None + + if not ror_id: + continue + + ror_info = { + 'ror_id': ror_id, + 'ror_name': ror_name, + } + + # Add mappings for various ID formats + if funder_id: + mapping[funder_id] = ror_info + # Also add with various DOI prefixes + mapping[f'http://dx.doi.org/10.13039/{funder_id}'] = ror_info + mapping[f'https://doi.org/10.13039/{funder_id}'] = ror_info + mapping[f'10.13039/{funder_id}'] = ror_info + + if crossref_doi: + mapping[crossref_doi] = ror_info + # Normalize the DOI URL + if crossref_doi.startswith('http://'): + mapping[crossref_doi.replace('http://', 'https://')] = ror_info + elif crossref_doi.startswith('https://'): + mapping[crossref_doi.replace('https://', 'http://')] = ror_info + + except FileNotFoundError: + 
self.stdout.write(self.style.ERROR(f'CSV file not found: {csv_file}')) + return None + except Exception as e: + self.stdout.write(self.style.ERROR(f'Error reading CSV file: {e}')) + return None + + return mapping + + def extract_funder_id(self, identifier): + """Extract the numeric funder ID from various identifier formats.""" + if not identifier: + return None + + # Already just a number + if re.match(r'^\d+$', identifier): + return identifier + + # Extract from DOI URL (e.g., http://dx.doi.org/10.13039/100000001) + match = re.search(r'10\.13039/(\d+)', identifier) + if match: + return match.group(1) + + return identifier + + def migrate_records(self, mapping, dry_run, batch_size, update_funder_name, reindex): + """Find and migrate all GuidMetadataRecord entries with Crossref Funder IDs.""" + stats = { + 'scanned': 0, + 'updated': 0, + 'reindexed': 0, + 'funders_migrated': 0, + 'not_in_mapping': 0, + 'errors': 0, + 'unmapped_ids': set(), + } + + # Query records that have non-empty funding_info + # We need to check if any funder has 'Crossref Funder ID' type + queryset = GuidMetadataRecord.objects.exclude(funding_info=[]).exclude(funding_info__isnull=True) + + total_count = queryset.count() + self.stdout.write(f'Found {total_count} records with funding_info to scan.') + + processed = 0 + for record in queryset.iterator(chunk_size=batch_size): + stats['scanned'] += 1 + processed += 1 + + if processed % 500 == 0: + self.stdout.write(f' Processed {processed}/{total_count} records...') + + try: + updated, funder_stats = self.migrate_record(record, mapping, dry_run, update_funder_name) + if updated: + stats['updated'] += 1 + if reindex and not dry_run: + try: + self.reindex_record(record) + stats['reindexed'] += 1 + except Exception as e: + logger.error(f'Error re-indexing record {record.guid._id}: {e}') + stats['funders_migrated'] += funder_stats['migrated'] + stats['not_in_mapping'] += funder_stats['not_found'] + 
stats['unmapped_ids'].update(funder_stats['unmapped_ids']) + except Exception as e: + stats['errors'] += 1 + logger.error(f'Error migrating record {record.guid._id}: {e}') + + return stats + + def migrate_record(self, record, mapping, dry_run, update_funder_name): + """Migrate a single GuidMetadataRecord's funding_info. + + Returns (was_updated, funder_stats) + """ + funder_stats = { + 'migrated': 0, + 'not_found': 0, + 'unmapped_ids': set(), + } + + if not record.funding_info: + return False, funder_stats + + updated_funding_info = [] + record_modified = False + + for funder in record.funding_info: + funder_type = funder.get('funder_identifier_type', '') + funder_identifier = funder.get('funder_identifier', '') + + # Only migrate Crossref Funder IDs (includes legacy 'Crossref Funder URI' type) + if funder_type not in ('Crossref Funder ID', 'Crossref Funder URI'): + updated_funding_info.append(funder) + continue + + # Try to find in mapping + ror_info = None + + # Try exact match first + if funder_identifier in mapping: + ror_info = mapping[funder_identifier] + else: + # Try to extract numeric ID and look up + numeric_id = self.extract_funder_id(funder_identifier) + if numeric_id and numeric_id in mapping: + ror_info = mapping[numeric_id] + + if ror_info: + # Create updated funder entry + updated_funder = funder.copy() + updated_funder['funder_identifier'] = ror_info['ror_id'] + updated_funder['funder_identifier_type'] = 'ROR' + + if update_funder_name and ror_info.get('ror_name'): + updated_funder['funder_name'] = ror_info['ror_name'] + + updated_funding_info.append(updated_funder) + record_modified = True + funder_stats['migrated'] += 1 + + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'Migrating funder in {record.guid._id}: ' + f'{funder_identifier} -> {ror_info["ror_id"]}' + ) + else: + # No mapping found, remove unmapped Crossref funder + record_modified = True + funder_stats['not_found'] += 1 + funder_stats['unmapped_ids'].add(funder_identifier) + + 
logger.warning( + f'{"[DRY RUN] " if dry_run else ""}' + f'Removing unmapped Crossref Funder ID: {funder_identifier} ' + f'from record {record.guid._id}' + ) + + # Warn about duplicate ROR IDs that would result from migration + if record_modified: + ror_identifiers = [ + f['funder_identifier'] + for f in updated_funding_info + if f.get('funder_identifier_type') == 'ROR' + ] + seen = set() + duplicates = {rid for rid in ror_identifiers if rid in seen or seen.add(rid)} + if duplicates: + logger.warning( + f'Record {record.guid._id} has duplicate ROR IDs after migration: {duplicates}' + ) + + if record_modified and not dry_run: + with transaction.atomic(): + record.funding_info = updated_funding_info + record.save(update_fields=['funding_info']) + + return record_modified, funder_stats + + def reindex_record(self, record): + """Trigger SHARE/ElasticSearch and DataCite re-indexing for the record's referent.""" + referent = record.guid.referent + if hasattr(referent, 'update_search'): + referent.update_search() + if hasattr(referent, 'request_identifier_update'): + referent.request_identifier_update('doi') diff --git a/osf/metadata/schemas/datacite.json b/osf/metadata/schemas/datacite.json index 650598b8ee4..555c9e56e9a 100644 --- a/osf/metadata/schemas/datacite.json +++ b/osf/metadata/schemas/datacite.json @@ -470,8 +470,12 @@ "ISNI", "GRID", "Crossref Funder ID", + "ROR", "Other" ] + }, + "schemeURI": { + "$ref": "#/definitions/uri" } }, "additionalProperties": false, diff --git a/osf/metadata/serializers/datacite/datacite_tree_walker.py b/osf/metadata/serializers/datacite/datacite_tree_walker.py index 5f0a283d450..950ac3b50dd 100644 --- a/osf/metadata/serializers/datacite/datacite_tree_walker.py +++ b/osf/metadata/serializers/datacite/datacite_tree_walker.py @@ -195,12 +195,12 @@ def _identifier_type_and_value(self, identifier: str): return ('URL', identifier) logger.warning('skipping non-IRI-shaped identifier "%s"', identifier) - def _funder_identifier_type(self, 
identifier: str): + def _funder_identifier_type_and_scheme(self, identifier: str): if identifier.startswith(DxDOI) or identifier.startswith(DOI): - return 'Crossref Funder ID' + return ('Crossref Funder ID', 'https://www.crossref.org/services/funder-registry/') if identifier.startswith(ROR): - return 'ROR' - return 'Other' + return ('ROR', str(ROR)) + return ('Other', '') def _get_name_type(self, agent_iri): if (agent_iri, RDF.type, FOAF.Person) in self.basket: @@ -312,13 +312,15 @@ def _funding_reference(self, fundrefs_el, funder, funding_award=None): _fundref_el = self.visit(fundrefs_el, 'fundingReference') self.visit(_fundref_el, 'funderName', text=next(self.basket[funder:FOAF.name], '')) _funder_identifier = next(self.basket[funder:DCTERMS.identifier], '') + _funder_id_type, _funder_scheme_uri = self._funder_identifier_type_and_scheme(_funder_identifier) + _funder_id_attrib = {'funderIdentifierType': _funder_id_type} + if _funder_scheme_uri: + _funder_id_attrib['schemeURI'] = _funder_scheme_uri self.visit( _fundref_el, 'funderIdentifier', text=_funder_identifier, - attrib={ - 'funderIdentifierType': self._funder_identifier_type(_funder_identifier), - }, + attrib=_funder_id_attrib, ) if funding_award is not None: self.visit( diff --git a/osf_tests/management_commands/test_migrate_funder_ids_to_ror.py b/osf_tests/management_commands/test_migrate_funder_ids_to_ror.py new file mode 100644 index 00000000000..a7bec602e5f --- /dev/null +++ b/osf_tests/management_commands/test_migrate_funder_ids_to_ror.py @@ -0,0 +1,415 @@ +import os +import pytest +import tempfile +from unittest import mock + +from django.core.management import call_command + +from osf.models import GuidMetadataRecord +from osf.management.commands.migrate_funder_ids_to_ror import Command +from osf_tests import factories + + +@pytest.mark.django_db +class TestMigrateFunderIdsToRor: + + @pytest.fixture + def user(self): + return factories.UserFactory() + + @pytest.fixture + def project(self, user): + 
return factories.ProjectFactory(creator=user) + + @pytest.fixture + def csv_mapping_file(self): + """Create a temporary CSV file with test mapping data.""" + content = """Funder Name\tror ID\tROR name\tCrossref DOI\tFunder ID +National Institutes of Health\thttps://ror.org/01cwqze88\tNational Institutes of Health\thttp://dx.doi.org/10.13039/100000002\t100000002 +National Science Foundation\thttps://ror.org/021nxhr62\tNational Science Foundation\thttp://dx.doi.org/10.13039/100000001\t100000001 +European Research Council\thttps://ror.org/0472cxd90\tEuropean Research Council\thttp://dx.doi.org/10.13039/501100000781\t501100000781 +""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write(content) + temp_path = f.name + + yield temp_path + + # Cleanup + os.unlink(temp_path) + + @pytest.fixture + def mock_reindex(self): + """Mock update_search and request_identifier_update to avoid actual re-indexing in tests.""" + with mock.patch('osf.models.node.AbstractNode.update_search') as mock_search, \ + mock.patch('osf.models.node.AbstractNode.request_identifier_update') as mock_doi: + yield mock_search, mock_doi + + @pytest.fixture + def record_with_crossref_funder(self, project): + """Create a GuidMetadataRecord with Crossref Funder ID.""" + record = GuidMetadataRecord.objects.for_guid(project._id) + record.funding_info = [{ + 'funder_name': 'National Institutes of Health', + 'funder_identifier': 'http://dx.doi.org/10.13039/100000002', + 'funder_identifier_type': 'Crossref Funder ID', + 'award_number': 'R01-GM-123456', + 'award_title': 'Test Grant', + }] + record.save() + return record + + @pytest.fixture + def record_with_multiple_funders(self, user): + """Create a GuidMetadataRecord with multiple funders (mix of Crossref and ROR).""" + project = factories.ProjectFactory(creator=user) + record = GuidMetadataRecord.objects.for_guid(project._id) + record.funding_info = [ + { + 'funder_name': 'NIH', + 'funder_identifier': 
'https://doi.org/10.13039/100000002', + 'funder_identifier_type': 'Crossref Funder ID', + 'award_number': 'R01-123', + }, + { + 'funder_name': 'Already ROR', + 'funder_identifier': 'https://ror.org/existing123', + 'funder_identifier_type': 'ROR', + }, + { + 'funder_name': 'NSF', + 'funder_identifier': '100000001', + 'funder_identifier_type': 'Crossref Funder ID', + }, + ] + record.save() + return record + + @pytest.fixture + def record_with_unmapped_funder(self, user): + """Create a GuidMetadataRecord with a funder not in the mapping.""" + project = factories.ProjectFactory(creator=user) + record = GuidMetadataRecord.objects.for_guid(project._id) + record.funding_info = [{ + 'funder_name': 'Unknown Funder', + 'funder_identifier': 'http://dx.doi.org/10.13039/999999999', + 'funder_identifier_type': 'Crossref Funder ID', + }] + record.save() + return record + + def test_migrate_single_crossref_funder(self, record_with_crossref_funder, csv_mapping_file): + """Test migrating a single Crossref Funder ID to ROR.""" + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + # Run migration (not dry run) + mapping = command.load_mapping(csv_mapping_file) + updated, stats = command.migrate_record( + record_with_crossref_funder, + mapping, + dry_run=False, + update_funder_name=False + ) + + assert updated is True + assert stats['migrated'] == 1 + assert stats['not_found'] == 0 + + record_with_crossref_funder.refresh_from_db() + funder = record_with_crossref_funder.funding_info[0] + + assert funder['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funder['funder_identifier_type'] == 'ROR' + # Original name preserved (update_funder_name=False) + assert funder['funder_name'] == 'National Institutes of Health' + # Other fields preserved + assert funder['award_number'] == 'R01-GM-123456' + assert funder['award_title'] == 'Test Grant' + + def test_migrate_with_funder_name_update(self, record_with_crossref_funder, 
csv_mapping_file): + """Test migrating with funder name update enabled.""" + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + command.migrate_record( + record_with_crossref_funder, + mapping, + dry_run=False, + update_funder_name=True + ) + + record_with_crossref_funder.refresh_from_db() + funder = record_with_crossref_funder.funding_info[0] + + assert funder['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funder['funder_identifier_type'] == 'ROR' + assert funder['funder_name'] == 'National Institutes of Health' + + def test_dry_run_does_not_modify(self, record_with_crossref_funder, csv_mapping_file): + """Test that dry run does not modify the database.""" + original_funding_info = record_with_crossref_funder.funding_info.copy() + + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + updated, stats = command.migrate_record( + record_with_crossref_funder, + mapping, + dry_run=True, + update_funder_name=False + ) + + assert updated is True # Would have updated + assert stats['migrated'] == 1 + + record_with_crossref_funder.refresh_from_db() + # Data should be unchanged + assert record_with_crossref_funder.funding_info == original_funding_info + + def test_migrate_multiple_funders(self, record_with_multiple_funders, csv_mapping_file): + """Test migrating record with multiple funders.""" + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + updated, stats = command.migrate_record( + record_with_multiple_funders, + mapping, + dry_run=False, + update_funder_name=False + ) + + assert updated is True + assert stats['migrated'] == 2 # NIH and NSF + assert stats['not_found'] == 0 + + record_with_multiple_funders.refresh_from_db() + funders = 
record_with_multiple_funders.funding_info + + # NIH should be migrated + assert funders[0]['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funders[0]['funder_identifier_type'] == 'ROR' + + # Already ROR should be unchanged + assert funders[1]['funder_identifier'] == 'https://ror.org/existing123' + assert funders[1]['funder_identifier_type'] == 'ROR' + + # NSF should be migrated + assert funders[2]['funder_identifier'] == 'https://ror.org/021nxhr62' + assert funders[2]['funder_identifier_type'] == 'ROR' + + def test_unmapped_funder_removed(self, record_with_unmapped_funder, csv_mapping_file): + """Test that funders not in mapping are removed.""" + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + updated, stats = command.migrate_record( + record_with_unmapped_funder, + mapping, + dry_run=False, + update_funder_name=False + ) + + assert updated is True + assert stats['migrated'] == 0 + assert stats['not_found'] == 1 + assert 'http://dx.doi.org/10.13039/999999999' in stats['unmapped_ids'] + + record_with_unmapped_funder.refresh_from_db() + assert record_with_unmapped_funder.funding_info == [] + + def test_load_mapping_various_id_formats(self, csv_mapping_file): + """Test that mapping handles various ID formats.""" + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + + # All these formats should map to the same ROR ID + assert mapping['100000002']['ror_id'] == 'https://ror.org/01cwqze88' + assert mapping['http://dx.doi.org/10.13039/100000002']['ror_id'] == 'https://ror.org/01cwqze88' + assert mapping['https://doi.org/10.13039/100000002']['ror_id'] == 'https://ror.org/01cwqze88' + assert mapping['10.13039/100000002']['ror_id'] == 'https://ror.org/01cwqze88' + + def test_extract_funder_id(self): + """Test extraction of numeric funder ID from various 
formats.""" + command = Command() + + assert command.extract_funder_id('100000002') == '100000002' + assert command.extract_funder_id('http://dx.doi.org/10.13039/100000002') == '100000002' + assert command.extract_funder_id('https://doi.org/10.13039/100000002') == '100000002' + assert command.extract_funder_id('10.13039/100000002') == '100000002' + + def test_empty_funding_info_skipped(self, project, csv_mapping_file): + """Test that records with empty funding_info are skipped.""" + record = GuidMetadataRecord.objects.for_guid(project._id) + record.funding_info = [] + record.save() + + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + updated, stats = command.migrate_record( + record, + mapping, + dry_run=False, + update_funder_name=False + ) + + assert updated is False + assert stats['migrated'] == 0 + + def test_ror_funder_not_modified(self, user, csv_mapping_file): + """Test that funders already using ROR are not modified.""" + project = factories.ProjectFactory(creator=user) + record = GuidMetadataRecord.objects.for_guid(project._id) + record.funding_info = [{ + 'funder_name': 'Already ROR', + 'funder_identifier': 'https://ror.org/01cwqze88', + 'funder_identifier_type': 'ROR', + }] + record.save() + + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + updated, stats = command.migrate_record( + record, + mapping, + dry_run=False, + update_funder_name=False + ) + + assert updated is False + assert stats['migrated'] == 0 + + record.refresh_from_db() + assert record.funding_info[0]['funder_identifier'] == 'https://ror.org/01cwqze88' + + def test_reindex_triggered_after_migration(self, record_with_crossref_funder, csv_mapping_file, mock_reindex): + """Test that SHARE/DataCite re-indexing is triggered for migrated records.""" + mock_update_search, 
mock_request_identifier_update = mock_reindex + + call_command( + 'migrate_funder_ids_to_ror', + '--csv-file', csv_mapping_file, + ) + + # Verify re-indexing was triggered + mock_update_search.assert_called() + mock_request_identifier_update.assert_called_with('doi') + + # Verify data was actually migrated + record_with_crossref_funder.refresh_from_db() + funder = record_with_crossref_funder.funding_info[0] + assert funder['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funder['funder_identifier_type'] == 'ROR' + + def test_reindex_not_triggered_on_dry_run(self, record_with_crossref_funder, csv_mapping_file, mock_reindex): + """Test that re-indexing is NOT triggered during dry run.""" + mock_update_search, mock_request_identifier_update = mock_reindex + + call_command( + 'migrate_funder_ids_to_ror', + '--csv-file', csv_mapping_file, + '--dry-run', + ) + + mock_update_search.assert_not_called() + mock_request_identifier_update.assert_not_called() + + def test_reindex_not_triggered_with_skip_flag(self, record_with_crossref_funder, csv_mapping_file, mock_reindex): + """Test that re-indexing is NOT triggered when --skip-reindex is used.""" + mock_update_search, mock_request_identifier_update = mock_reindex + + call_command( + 'migrate_funder_ids_to_ror', + '--csv-file', csv_mapping_file, + '--skip-reindex', + ) + + mock_update_search.assert_not_called() + mock_request_identifier_update.assert_not_called() + + # But data should still be migrated + record_with_crossref_funder.refresh_from_db() + funder = record_with_crossref_funder.funding_info[0] + assert funder['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funder['funder_identifier_type'] == 'ROR' + + def test_reindex_triggered_for_unmapped_records(self, record_with_unmapped_funder, csv_mapping_file, mock_reindex): + """Test that re-indexing IS triggered when unmapped funders are removed.""" + mock_update_search, mock_request_identifier_update = mock_reindex + + call_command( + 
'migrate_funder_ids_to_ror', + '--csv-file', csv_mapping_file, + ) + + mock_update_search.assert_called() + mock_request_identifier_update.assert_called_with('doi') + + def test_end_to_end_call_command(self, record_with_crossref_funder, record_with_multiple_funders, csv_mapping_file, mock_reindex): + """Test the full management command end-to-end via call_command.""" + mock_update_search, mock_request_identifier_update = mock_reindex + + call_command( + 'migrate_funder_ids_to_ror', + '--csv-file', csv_mapping_file, + ) + + # Record with single crossref funder should be migrated + record_with_crossref_funder.refresh_from_db() + funder = record_with_crossref_funder.funding_info[0] + assert funder['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funder['funder_identifier_type'] == 'ROR' + assert funder['award_number'] == 'R01-GM-123456' + + # Record with multiple funders should have Crossref ones migrated + record_with_multiple_funders.refresh_from_db() + funders = record_with_multiple_funders.funding_info + assert funders[0]['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funders[0]['funder_identifier_type'] == 'ROR' + assert funders[1]['funder_identifier'] == 'https://ror.org/existing123' + assert funders[1]['funder_identifier_type'] == 'ROR' + assert funders[2]['funder_identifier'] == 'https://ror.org/021nxhr62' + assert funders[2]['funder_identifier_type'] == 'ROR' + + # Re-indexing should have been triggered for both updated records + assert mock_update_search.call_count == 2 + assert mock_request_identifier_update.call_count == 2 + + def test_crossref_funder_uri_type_also_migrated(self, user, csv_mapping_file): + """Test that legacy 'Crossref Funder URI' type is also migrated (found in staging data).""" + project = factories.ProjectFactory(creator=user) + record = GuidMetadataRecord.objects.for_guid(project._id) + record.funding_info = [{ + 'funder_name': 'National Institutes of Health', + 'funder_identifier': 
'http://dx.doi.org/10.13039/100000002', + 'funder_identifier_type': 'Crossref Funder URI', + 'award_number': '', + 'award_title': '', + }] + record.save() + + command = Command() + command.stdout = type('MockStdout', (), {'write': lambda self, x: None})() + + mapping = command.load_mapping(csv_mapping_file) + updated, stats = command.migrate_record( + record, mapping, dry_run=False, update_funder_name=False + ) + + assert updated is True + assert stats['migrated'] == 1 + + record.refresh_from_db() + funder = record.funding_info[0] + assert funder['funder_identifier'] == 'https://ror.org/01cwqze88' + assert funder['funder_identifier_type'] == 'ROR' diff --git a/osf_tests/metadata/expected_metadata_files/file_full.turtle b/osf_tests/metadata/expected_metadata_files/file_full.turtle index 492adf41375..4f529ed037f 100644 --- a/osf_tests/metadata/expected_metadata_files/file_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/file_full.turtle @@ -29,8 +29,8 @@ dcterms:title "this is a project title!"@en ; dcterms:type ; owl:sameAs ; - osf:funder , - ; + osf:funder , + ; osf:hasFunding , . @@ -63,8 +63,8 @@ dcterms:identifier "https://doi.org/10.$$$$" ; foaf:name "Mx. Moneypockets" . - a dcterms:Agent ; - dcterms:identifier "https://doi.org/10.$" ; + a dcterms:Agent ; + dcterms:identifier "https://ror.org/0example" ; foaf:name "Caring Fan" . a dcterms:Agent, diff --git a/osf_tests/metadata/expected_metadata_files/preprint_full.turtle b/osf_tests/metadata/expected_metadata_files/preprint_full.turtle index 6b28e0dfa3e..9e3f96bc09c 100644 --- a/osf_tests/metadata/expected_metadata_files/preprint_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/preprint_full.turtle @@ -55,8 +55,8 @@ dcterms:title "this is a project title!"@en ; dcterms:type ; owl:sameAs ; - osf:funder , - ; + osf:funder , + ; osf:hasFunding , . @@ -121,8 +121,8 @@ dcterms:identifier "https://doi.org/10.$$$$" ; foaf:name "Mx. Moneypockets" . 
- a dcterms:Agent ; - dcterms:identifier "https://doi.org/10.$" ; + a dcterms:Agent ; + dcterms:identifier "https://ror.org/0example" ; foaf:name "Caring Fan" . a skos:Concept ; diff --git a/osf_tests/metadata/expected_metadata_files/project_full.datacite.json b/osf_tests/metadata/expected_metadata_files/project_full.datacite.json index 312e74b2388..3ff394455a1 100644 --- a/osf_tests/metadata/expected_metadata_files/project_full.datacite.json +++ b/osf_tests/metadata/expected_metadata_files/project_full.datacite.json @@ -56,7 +56,8 @@ "awardTitle": "because reasons", "funderIdentifier": { "funderIdentifier": "https://doi.org/10.$$$$", - "funderIdentifierType": "Crossref Funder ID" + "funderIdentifierType": "Crossref Funder ID", + "schemeURI": "https://www.crossref.org/services/funder-registry/" }, "funderName": "Mx. Moneypockets" }, @@ -68,14 +69,16 @@ "awardTitle": "because reasons!", "funderIdentifier": { "funderIdentifier": "https://doi.org/10.$$$$", - "funderIdentifierType": "Crossref Funder ID" + "funderIdentifierType": "Crossref Funder ID", + "schemeURI": "https://www.crossref.org/services/funder-registry/" }, "funderName": "Mx. Moneypockets" }, { "funderIdentifier": { - "funderIdentifier": "https://doi.org/10.$", - "funderIdentifierType": "Crossref Funder ID" + "funderIdentifier": "https://ror.org/0example", + "funderIdentifierType": "ROR", + "schemeURI": "https://ror.org/" }, "funderName": "Caring Fan" } diff --git a/osf_tests/metadata/expected_metadata_files/project_full.datacite.xml b/osf_tests/metadata/expected_metadata_files/project_full.datacite.xml index 524fbc33dd4..a161f7cad66 100644 --- a/osf_tests/metadata/expected_metadata_files/project_full.datacite.xml +++ b/osf_tests/metadata/expected_metadata_files/project_full.datacite.xml @@ -38,19 +38,19 @@ Mx. Moneypockets - https://doi.org/10.$$$$ + https://doi.org/10.$$$$ 10000000 because reasons Mx. Moneypockets - https://doi.org/10.$$$$ + https://doi.org/10.$$$$ 2000000 because reasons! 
Caring Fan - https://doi.org/10.$ + https://ror.org/0example diff --git a/osf_tests/metadata/expected_metadata_files/project_full.turtle b/osf_tests/metadata/expected_metadata_files/project_full.turtle index 0085e6164e3..5d3d78b761e 100644 --- a/osf_tests/metadata/expected_metadata_files/project_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/project_full.turtle @@ -27,8 +27,8 @@ owl:sameAs ; dcat:accessService ; osf:contains ; - osf:funder , - ; + osf:funder , + ; osf:hasFunding , ; osf:hostingInstitution ; @@ -116,8 +116,8 @@ dcterms:identifier "https://doi.org/10.$$$$" ; foaf:name "Mx. Moneypockets" . - a dcterms:Agent ; - dcterms:identifier "https://doi.org/10.$" ; + a dcterms:Agent ; + dcterms:identifier "https://ror.org/0example" ; foaf:name "Caring Fan" . a dcterms:Agent, diff --git a/osf_tests/metadata/expected_metadata_files/registration_full.turtle b/osf_tests/metadata/expected_metadata_files/registration_full.turtle index ab75ae5888c..32599e57ab9 100644 --- a/osf_tests/metadata/expected_metadata_files/registration_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/registration_full.turtle @@ -43,8 +43,8 @@ dcterms:title "this is a project title!"@en ; dcterms:type ; owl:sameAs ; - osf:funder , - ; + osf:funder , + ; osf:hasFunding , . @@ -98,8 +98,8 @@ dcterms:identifier "https://doi.org/10.$$$$" ; foaf:name "Mx. Moneypockets" . - a dcterms:Agent ; - dcterms:identifier "https://doi.org/10.$" ; + a dcterms:Agent ; + dcterms:identifier "https://ror.org/0example" ; foaf:name "Caring Fan" . 
a dcterms:Agent, diff --git a/osf_tests/metadata/test_osf_gathering.py b/osf_tests/metadata/test_osf_gathering.py index 424042253f7..f235488e557 100644 --- a/osf_tests/metadata/test_osf_gathering.py +++ b/osf_tests/metadata/test_osf_gathering.py @@ -610,10 +610,16 @@ def test_gather_funding(self): 'award_uri': 'https://nih.example/award', 'award_number': '27', }, + { + 'funder_name': 'NSF', + 'funder_identifier': 'https://ror.org/021nxhr62', + 'funder_identifier_type': 'ROR', + }, ] _bnode1 = rdflib.BNode() _award_uri = URIRef('https://nih.example/award') _funder_uri = URIRef('https://doi.org/10.fake/NIH') + _ror_funder_uri = URIRef('https://ror.org/021nxhr62') assert_triples(osf_gathering.gather_funding(self.projectfocus), { (self.projectfocus.iri, OSF.funder, _bnode1), (_bnode1, RDF.type, DCTERMS.Agent), @@ -628,6 +634,10 @@ def test_gather_funding(self): (_award_uri, DCTERMS.title, Literal('big fun')), (_award_uri, OSF.awardNumber, Literal('27')), (_award_uri, DCTERMS.contributor, _funder_uri), + (self.projectfocus.iri, OSF.funder, _ror_funder_uri), + (_ror_funder_uri, RDF.type, DCTERMS.Agent), + (_ror_funder_uri, DCTERMS.identifier, Literal(_ror_funder_uri)), + (_ror_funder_uri, FOAF.name, Literal('NSF')), }) # focus: registration assert_triples(osf_gathering.gather_funding(self.registrationfocus), set()) diff --git a/osf_tests/metadata/test_serialized_metadata.py b/osf_tests/metadata/test_serialized_metadata.py index 020f4d18574..5dc4029aaf4 100644 --- a/osf_tests/metadata/test_serialized_metadata.py +++ b/osf_tests/metadata/test_serialized_metadata.py @@ -352,10 +352,10 @@ def _setUp_full(self): 'award_number': '2000000', 'award_uri': 'https://moneypockets.example/millions-more', 'award_title': 'because reasons!', - }, { # no award info, just a funder: + }, { # no award info, just a funder with ROR identifier: 'funder_name': 'Caring Fan', - 'funder_identifier': 'https://doi.org/10.$', - 'funder_identifier_type': 'Crossref Funder ID', + 'funder_identifier': 
'https://ror.org/0example', + 'funder_identifier_type': 'ROR', 'award_number': '', 'award_uri': '', 'award_title': '', diff --git a/tests/identifiers/test_datacite.py b/tests/identifiers/test_datacite.py index ba432402a88..1a774ceea71 100644 --- a/tests/identifiers/test_datacite.py +++ b/tests/identifiers/test_datacite.py @@ -6,7 +6,7 @@ from django.utils import timezone from framework.auth import Auth -from osf.models import Outcome +from osf.models import GuidMetadataRecord, Outcome from osf.utils.outcomes import ArtifactTypes from osf_tests.factories import AuthUserFactory, IdentifierFactory, RegistrationFactory from tests.base import OsfTestCase @@ -300,6 +300,151 @@ def test_datacite_format_related_resources__ignores_inactive_resources(self, dat ] _assert_unordered_list_of_dicts_equal(metadata_dict['relatedIdentifiers'], expected_relationships) + def _set_funding_info(self, registration, funding_info): + metadata_record = GuidMetadataRecord.objects.for_guid(registration._id) + metadata_record.funding_info = funding_info + metadata_record.save() + + def test_datacite_funding_references_with_ror_identifier_xml(self, registration, datacite_client): + self._set_funding_info(registration, [ + { + 'funder_name': 'National Science Foundation', + 'funder_identifier': 'https://ror.org/021nxhr62', + 'funder_identifier_type': 'ROR', + }, + ]) + metadata_xml = datacite_client.build_metadata(registration) + parser = lxml.etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8') + root = lxml.etree.fromstring(metadata_xml, parser=parser) + ns = schema40.ns[None] + + funding_refs = root.find(f'{{{ns}}}fundingReferences') + refs = funding_refs.findall(f'{{{ns}}}fundingReference') + assert len(refs) == 1 + + funder_name = refs[0].find(f'{{{ns}}}funderName') + assert funder_name.text == 'National Science Foundation' + + funder_id = refs[0].find(f'{{{ns}}}funderIdentifier') + assert funder_id.text == 'https://ror.org/021nxhr62' + assert 
funder_id.attrib['funderIdentifierType'] == 'ROR' + assert funder_id.attrib['schemeURI'] == 'https://ror.org/' + + def test_datacite_funding_references_with_ror_identifier_json(self, registration, datacite_client): + self._set_funding_info(registration, [ + { + 'funder_name': 'National Science Foundation', + 'funder_identifier': 'https://ror.org/021nxhr62', + 'funder_identifier_type': 'ROR', + }, + ]) + metadata_dict = datacite_client.build_metadata(registration, as_xml=False) + + funding_refs = metadata_dict['fundingReferences'] + assert len(funding_refs) == 1 + assert str(funding_refs[0]['funderName']) == 'National Science Foundation' + assert funding_refs[0]['funderIdentifier']['funderIdentifier'] == 'https://ror.org/021nxhr62' + assert funding_refs[0]['funderIdentifier']['funderIdentifierType'] == 'ROR' + assert funding_refs[0]['funderIdentifier']['schemeURI'] == 'https://ror.org/' + + def test_datacite_funding_references_with_crossref_funder_id(self, registration, datacite_client): + self._set_funding_info(registration, [ + { + 'funder_name': 'Mx. 
Moneypockets', + 'funder_identifier': 'https://doi.org/10.13039/100000001', + 'funder_identifier_type': 'Crossref Funder ID', + 'award_number': '10000000', + 'award_uri': 'https://moneypockets.example/millions', + 'award_title': 'because reasons', + }, + ]) + metadata_xml = datacite_client.build_metadata(registration) + parser = lxml.etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8') + root = lxml.etree.fromstring(metadata_xml, parser=parser) + ns = schema40.ns[None] + + funding_refs = root.find(f'{{{ns}}}fundingReferences') + refs = funding_refs.findall(f'{{{ns}}}fundingReference') + assert len(refs) == 1 + + funder_id = refs[0].find(f'{{{ns}}}funderIdentifier') + assert funder_id.text == 'https://doi.org/10.13039/100000001' + assert funder_id.attrib['funderIdentifierType'] == 'Crossref Funder ID' + assert funder_id.attrib['schemeURI'] == 'https://www.crossref.org/services/funder-registry/' + + award_number = refs[0].find(f'{{{ns}}}awardNumber') + assert award_number.text == '10000000' + + def test_datacite_funding_references_mixed_ror_and_crossref(self, registration, datacite_client): + self._set_funding_info(registration, [ + { + 'funder_name': 'Mx. Moneypockets', + 'funder_identifier': 'https://doi.org/10.13039/100000001', + 'funder_identifier_type': 'Crossref Funder ID', + 'award_number': '10000000', + 'award_uri': 'https://moneypockets.example/millions', + 'award_title': 'because reasons', + }, + { + 'funder_name': 'National Science Foundation', + 'funder_identifier': 'https://ror.org/021nxhr62', + 'funder_identifier_type': 'ROR', + }, + ]) + metadata_dict = datacite_client.build_metadata(registration, as_xml=False) + funding_refs = metadata_dict['fundingReferences'] + assert len(funding_refs) == 2 + + # Build a lookup by funder name for order-independent assertions + refs_by_name = {str(ref['funderName']): ref for ref in funding_refs} + + crossref_ref = refs_by_name['Mx. 
Moneypockets'] + assert crossref_ref['funderIdentifier']['funderIdentifier'] == 'https://doi.org/10.13039/100000001' + assert crossref_ref['funderIdentifier']['funderIdentifierType'] == 'Crossref Funder ID' + assert crossref_ref['funderIdentifier']['schemeURI'] == 'https://www.crossref.org/services/funder-registry/' + assert crossref_ref['awardNumber']['awardNumber'] == '10000000' + + ror_ref = refs_by_name['National Science Foundation'] + assert ror_ref['funderIdentifier']['funderIdentifier'] == 'https://ror.org/021nxhr62' + assert ror_ref['funderIdentifier']['funderIdentifierType'] == 'ROR' + assert ror_ref['funderIdentifier']['schemeURI'] == 'https://ror.org/' + + def test_datacite_funding_references_ror_with_award_info(self, registration, datacite_client): + self._set_funding_info(registration, [ + { + 'funder_name': 'National Institutes of Health', + 'funder_identifier': 'https://ror.org/01cwqze88', + 'funder_identifier_type': 'ROR', + 'award_number': 'R01-GM123456', + 'award_uri': 'https://reporter.nih.gov/project-details/123456', + 'award_title': 'Studying important things', + }, + ]) + metadata_xml = datacite_client.build_metadata(registration) + parser = lxml.etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8') + root = lxml.etree.fromstring(metadata_xml, parser=parser) + ns = schema40.ns[None] + + funding_refs = root.find(f'{{{ns}}}fundingReferences') + refs = funding_refs.findall(f'{{{ns}}}fundingReference') + assert len(refs) == 1 + + funder_id = refs[0].find(f'{{{ns}}}funderIdentifier') + assert funder_id.text == 'https://ror.org/01cwqze88' + assert funder_id.attrib['funderIdentifierType'] == 'ROR' + assert funder_id.attrib['schemeURI'] == 'https://ror.org/' + + award_number = refs[0].find(f'{{{ns}}}awardNumber') + assert award_number.text == 'R01-GM123456' + + award_title = refs[0].find(f'{{{ns}}}awardTitle') + assert award_title.text == 'Studying important things' + + def test_datacite_funding_references_no_funding_info(self, registration, 
datacite_client): + # With no funding info set, fundingReferences should be empty + metadata_dict = datacite_client.build_metadata(registration, as_xml=False) + assert metadata_dict.get('fundingReferences', []) == [] + @pytest.mark.django_db class TestDataCiteViews(OsfTestCase): From 194fa0cbcb5dea30f0467657147745005d8374da Mon Sep 17 00:00:00 2001 From: Fitz Elliott Date: Thu, 26 Feb 2026 14:37:37 -0500 Subject: [PATCH 2/4] bump version & update changelog --- CHANGELOG | 5 +++++ package.json | 2 +- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index cfdf72374db..4603d7195e3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,11 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. +26.4.0 (2026-02-26) +=================== + +- Transition funder ids from CrossRef to ROR + 26.3.0 (2026-02-24) =================== diff --git a/package.json b/package.json index 530dbc98351..ae78410145e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "26.3.0", + "version": "26.4.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", diff --git a/pyproject.toml b/pyproject.toml index d9973dae373..d89ca9946e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "osf-io" -version = "24.05.2" +version = "26.4.0" description = "The code for [https://osf.io](https://osf.io)." 
 authors = ["Your Name "]
 license = "Apache License 2.0"

From 32fe9718efc4e35e139ab4ce2d2417388b33e88b Mon Sep 17 00:00:00 2001
From: Fitz Elliott
Date: Sat, 28 Feb 2026 21:41:10 -0500
Subject: [PATCH 3/4] add command to migrate ror funder names

---
 .../commands/migrate_funder_names_to_ror.py   | 307 ++++++++++++++++++
 1 file changed, 307 insertions(+)
 create mode 100644 osf/management/commands/migrate_funder_names_to_ror.py

diff --git a/osf/management/commands/migrate_funder_names_to_ror.py b/osf/management/commands/migrate_funder_names_to_ror.py
new file mode 100644
index 00000000000..5c8b1e513e6
--- /dev/null
+++ b/osf/management/commands/migrate_funder_names_to_ror.py
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+"""
+Management command to migrate ROR funders to use ROR funder names.
+
+This script reads a CSV mapping file and updates all GuidMetadataRecord entries
+that have funding_info with ROR funder IDs, updating their funder names to the ROR names.
+
+This has similar functionality to migrate_funder_ids_to_ror.py but is useful if
+someone that definitely doesn't have the github id felliott forgot to include
+name migrations when running the prior script. It's also useful for generally
+updating a bunch of ROR funder names.
+
+Usage:
+    # Dry run (recommended first)
+    python manage.py migrate_funder_names_to_ror --csv-file /path/to/mapping.csv --dry-run
+
+    # Actual migration
+    python manage.py migrate_funder_names_to_ror --csv-file /path/to/mapping.csv
+
+CSV Format Expected (tab or comma separated):
+    Funder Name, ror ID, ROR name, Crossref DOI, Funder ID
+    Example:
+    National Science Foundation, https://ror.org/021nxhr62, National Science Foundation, http://dx.doi.org/10.13039/100000001, 100000001
+
+Only the "ror id" and "ror name" columns are used. The others may be omitted.
+
+"""
+import csv
+import logging
+
+from django.core.management.base import BaseCommand
+from django.db import transaction
+
+from osf.models import GuidMetadataRecord
+
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    help = 'Update funder names to ROR names in GuidMetadataRecord.funding_info'
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--csv-file',
+            type=str,
+            required=True,
+            help='Path to the CSV file containing the Crossref to ROR mapping.',
+        )
+        parser.add_argument(
+            '--dry-run',
+            action='store_true',
+            dest='dry_run',
+            help='Run without making any changes to the database.',
+        )
+        parser.add_argument(
+            '--batch-size',
+            type=int,
+            default=1000,
+            help='Number of records to process in each batch (default: 1000).',
+        )
+        parser.add_argument(
+            '--skip-reindex',
+            action='store_true',
+            dest='skip_reindex',
+            help='Skip triggering SHARE/DataCite re-indexing after migration. '
+                 'Use this if you plan to run recatalog_metadata separately.',
+        )
+
+    def handle(self, *args, **options):
+        csv_file = options['csv_file']
+        dry_run = options['dry_run']
+        batch_size = options['batch_size']
+        reindex = not options['skip_reindex']
+
+        if dry_run:
+            self.stdout.write(self.style.WARNING('[DRY RUN] No changes will be made to the database.'))
+
+        if not reindex:
+            self.stdout.write(self.style.WARNING('Re-indexing is disabled. Run recatalog_metadata after migration.'))
+
+        # Load the mapping
+        mapping = self.load_mapping(csv_file)
+        if not mapping:
+            self.stdout.write(self.style.ERROR('No valid mappings found in CSV file.'))
+            return
+
+        self.stdout.write(f'Loaded {len(mapping)} ROR id to name mappings.')
+
+        # Find and update records
+        stats = self.migrate_records(mapping, dry_run, batch_size, reindex)
+
+        # Print summary
+        self.stdout.write('\n' + '=' * 60)
+        self.stdout.write(self.style.SUCCESS('Migration Summary:'))
+        self.stdout.write(f"  Records scanned: {stats['scanned']}")
+        self.stdout.write(f"  Records updated: {stats['updated']}")
+        self.stdout.write(f"  Records re-indexed: {stats['reindexed']}")
+        self.stdout.write(f"  Funder names updated: {stats['funders_migrated']}")
+        self.stdout.write(f"  Funders not found in mapping: {stats['not_in_mapping']}")
+        self.stdout.write(f"  Unique funders not in mapping: {len(stats['unmapped_ids'])}")
+        if stats['errors']:
+            self.stdout.write(self.style.ERROR(f"  Errors: {stats['errors']}"))
+
+        if stats['unmapped_ids']:
+            self.stdout.write('\nUnmapped ROR Funder IDs (not in CSV):')
+            for funder_id in sorted(stats['unmapped_ids'])[:50]:  # Show first 50
+                self.stdout.write(f'  - {funder_id}')
+            if len(stats['unmapped_ids']) > 50:
+                self.stdout.write(f'  ... and {len(stats["unmapped_ids"]) - 50} more')
+
+    def load_mapping(self, csv_file):
+        """Load the ROR ID to ROR info mapping from CSV file.
+
+        Returns a dict mapping ROR IDs to ROR info:
+        {
+            'https://ror.org/021nxhr62': {
+                'ror_id': 'https://ror.org/021nxhr62',
+                'ror_name': 'National Science Foundation'
+            },
+            ...
+ } + """ + mapping = {} + + try: + with open(csv_file, 'r', encoding='utf-8-sig') as f: + # Try to detect delimiter + sample = f.read(2048) + f.seek(0) + if '\t' in sample: + delimiter = '\t' + else: + delimiter = ',' + + reader = csv.DictReader(f, delimiter=delimiter) + + # Normalize column names (handle various formats) + for row in reader: + # Try to find the relevant columns + ror_id = None + ror_name = None + + for key, value in row.items(): + if not key: + continue + key_lower = key.lower().strip() + + if 'ror' in key_lower and 'id' in key_lower and 'ror_name' not in key_lower: + ror_id = value.strip() if value else None + elif 'ror' in key_lower and 'name' in key_lower: + ror_name = value.strip() if value else None + + if not ror_id: + continue + + ror_info = { + 'ror_id': ror_id, + 'ror_name': ror_name, + } + + # Add mappings for various ID formats + mapping[ror_id] = ror_info + + except FileNotFoundError: + self.stdout.write(self.style.ERROR(f'CSV file not found: {csv_file}')) + return None + except Exception as e: + self.stdout.write(self.style.ERROR(f'Error reading CSV file: {e}')) + return None + + return mapping + + def migrate_records(self, mapping, dry_run, batch_size, reindex): + """Find and migrate all GuidMetadataRecord entries with Crossref Funder IDs.""" + stats = { + 'scanned': 0, + 'updated': 0, + 'reindexed': 0, + 'funders_migrated': 0, + 'not_in_mapping': 0, + 'errors': 0, + 'unmapped_ids': set(), + } + + # Query records that have non-empty funding_info + # We need to check if any funder has 'Crossref Funder ID' type + queryset = GuidMetadataRecord.objects.exclude(funding_info=[]).exclude(funding_info__isnull=True) + + total_count = queryset.count() + self.stdout.write(f'Found {total_count} records with funding_info to scan.') + + processed = 0 + for record in queryset.iterator(chunk_size=batch_size): + stats['scanned'] += 1 + processed += 1 + + if processed % 500 == 0: + self.stdout.write(f' Processed {processed}/{total_count} records...') 
+ + try: + updated, funder_stats = self.migrate_record(record, mapping, dry_run) + if updated: + stats['updated'] += 1 + if reindex and not dry_run: + try: + self.reindex_record(record) + stats['reindexed'] += 1 + except Exception as e: + logger.error(f'Error re-indexing record {record.guid._id}: {e}') + stats['funders_migrated'] += funder_stats['migrated'] + stats['not_in_mapping'] += funder_stats['not_found'] + stats['unmapped_ids'].update(funder_stats['unmapped_ids']) + except Exception as e: + stats['errors'] += 1 + logger.error(f'Error migrating record {record.guid._id}: {e}') + + return stats + + def migrate_record(self, record, mapping, dry_run): + """Migrate a single GuidMetadataRecord's funding_info. + + Returns (was_updated, funder_stats) + """ + funder_stats = { + 'migrated': 0, + 'not_found': 0, + 'unmapped_ids': set(), + } + + if not record.funding_info: + return False, funder_stats + + updated_funding_info = [] + record_modified = False + + for funder in record.funding_info: + funder_type = funder.get('funder_identifier_type', '') + funder_identifier = funder.get('funder_identifier', '') + + # Only update ROR funder records + if funder_type != 'ROR': + updated_funding_info.append(funder) + continue + + # Try to find in mapping + ror_info = mapping.get(funder_identifier, None) + if ror_info is None: + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'Unrecognized ror id for {record.guid._id}: ' + f'{funder_identifier}' + ) + updated_funding_info.append(funder) + continue + + # Has name changed? 
+ if funder.get('funder_name') == ror_info['ror_name']: + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'ROR name unchanged for {record.guid._id}: ' + f'{funder_identifier} -> {funder.get("funder_name")}' + ) + updated_funding_info.append(funder) + continue + + # Create updated funder entry + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'Updating name for {record.guid._id}: ' + f'id {funder_identifier} from {funder["funder_name"]} to {ror_info["ror_name"]}' + ) + updated_funder = funder.copy() + updated_funder['funder_name'] = ror_info['ror_name'] + updated_funding_info.append(updated_funder) + record_modified = True + funder_stats['migrated'] += 1 + + # Warn about duplicate ROR IDs that would result from migration + # THIS SHOULDN'T HAPPEN + if record_modified: + ror_identifiers = [ + f['funder_identifier'] + for f in updated_funding_info + if f.get('funder_identifier_type') == 'ROR' + ] + seen = set() + duplicates = {rid for rid in ror_identifiers if rid in seen or seen.add(rid)} + if duplicates: + logger.warning( + f'Record {record.guid._id} has duplicate ROR IDs after migration: {duplicates}' + ) + + if record_modified and not dry_run: + with transaction.atomic(): + record.funding_info = updated_funding_info + record.save(update_fields=['funding_info']) + + return record_modified, funder_stats + + def reindex_record(self, record): + """Trigger SHARE/ElasticSearch and DataCite re-indexing for the record's referent.""" + referent = record.guid.referent + if hasattr(referent, 'update_search'): + referent.update_search() + if hasattr(referent, 'request_identifier_update'): + referent.request_identifier_update('doi') From d4d8c2dc4313ca9c170a31104697e9cbf9fb27ea Mon Sep 17 00:00:00 2001 From: Fitz Elliott Date: Sun, 1 Mar 2026 17:31:24 -0500 Subject: [PATCH 4/4] bump version & update changelog --- CHANGELOG | 5 +++++ package.json | 2 +- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 
4603d7195e3..4e59f9d6ca6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,11 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. +26.4.1 (2026-03-01) +=================== + +- Add script to update ROR funder names + 26.4.0 (2026-02-26) =================== diff --git a/package.json b/package.json index ae78410145e..2dfdc3bdb83 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "26.4.0", + "version": "26.4.1", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", diff --git a/pyproject.toml b/pyproject.toml index d89ca9946e5..489560c54a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "osf-io" -version = "26.4.0" +version = "26.4.1" description = "The code for [https://osf.io](https://osf.io)." authors = ["Your Name "] license = "Apache License 2.0"