From 6d026520a8090f587a1130b6785ca9b93f20a23e Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Thu, 5 Feb 2026 13:05:51 -0600
Subject: [PATCH] fix: Add config option to skip filepath checksum on insert

Adds a `filepath_checksum_size_limit_insert` config option that skips
checksum computation on insert for files larger than the specified
limit. This prevents transaction timeouts when inserting large files
with filepath attributes in three-part `make()` methods.

Config options:
- `filepath_checksum_size_limit`: skip checksum verification on fetch (existing)
- `filepath_checksum_size_limit_insert`: skip checksum computation on insert (new)

When the checksum is skipped on insert:
- a warning is logged
- `contents_hash` is stored as NULL
- verification against an existing tracking entry is bypassed

Fixes #1386

Co-Authored-By: Claude Opus 4.5
---
 datajoint/external.py | 22 +++++++++++++++++-----
 datajoint/settings.py |  5 ++++-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index b3de2ff5d..ef4ce0be4 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -276,13 +276,25 @@ def upload_filepath(self, local_filepath):
         uuid = uuid_from_buffer(
             init_string=relative_filepath
         )  # hash relative path, not contents
-        contents_hash = uuid_from_file(local_filepath)
+
+        # Check if checksum should be skipped based on file size limit
+        file_size = Path(local_filepath).stat().st_size
+        size_limit = config.get("filepath_checksum_size_limit_insert")
+        skip_checksum = size_limit is not None and file_size > size_limit
+
+        if skip_checksum:
+            contents_hash = None
+            logger.warning(
+                f"Skipping checksum for '{relative_filepath}' ({file_size} bytes > {size_limit} byte limit)"
+            )
+        else:
+            contents_hash = uuid_from_file(local_filepath)

         # check if the remote file already exists and verify that it matches
         check_hash = (self & {"hash": uuid}).fetch("contents_hash")
         if check_hash.size:
             # the tracking entry exists, check that it's the same file as before
-            if contents_hash != check_hash[0]:
+            if not skip_checksum and contents_hash != check_hash[0]:
                 raise DataJointError(
                     f"A different version of '{relative_filepath}' has already been placed."
                 )
@@ -291,15 +303,15 @@
             self._upload_file(
                 local_filepath,
                 self._make_external_filepath(relative_filepath),
-                metadata={"contents_hash": str(contents_hash)},
+                metadata={"contents_hash": str(contents_hash) if contents_hash else ""},
             )
             self.connection.query(
                 "INSERT INTO {tab} (hash, size, filepath, contents_hash) VALUES (%s, {size}, '{filepath}', %s)".format(
                     tab=self.full_table_name,
-                    size=Path(local_filepath).stat().st_size,
+                    size=file_size,
                     filepath=relative_filepath,
                 ),
-                args=(uuid.bytes, contents_hash.bytes),
+                args=(uuid.bytes, contents_hash.bytes if contents_hash else None),
             )
         return uuid

diff --git a/datajoint/settings.py b/datajoint/settings.py
index 30b206f99..c8add6017 100644
--- a/datajoint/settings.py
+++ b/datajoint/settings.py
@@ -49,8 +49,11 @@
         "database.use_tls": None,
         "enable_python_native_blobs": True,  # python-native/dj0 encoding support
         "add_hidden_timestamp": False,
-        # file size limit for when to disable checksums
+        # file size limits for when to disable checksums (in bytes)
+        # filepath_checksum_size_limit: skip checksum verification on fetch for large files
        "filepath_checksum_size_limit": None,
+        # filepath_checksum_size_limit_insert: skip checksum computation on insert for large files
+        "filepath_checksum_size_limit_insert": None,
     }
 )
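
A note on the gate's semantics, restated as a standalone sketch (the helper
name `should_skip_checksum` is illustrative, not part of the patch): a limit
of None, the default, never skips, and the comparison is strictly
greater-than, so a file whose size exactly equals the limit is still hashed.

    from pathlib import Path

    def should_skip_checksum(local_filepath, size_limit):
        """True when the insert-side checksum should be skipped.

        Mirrors the gate in upload_filepath(), where size_limit is the
        value of config["filepath_checksum_size_limit_insert"].
        """
        if size_limit is None:
            return False  # default: always compute the checksum
        return Path(local_filepath).stat().st_size > size_limit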
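
A minimal usage sketch for reviewers. Only `filepath_checksum_size_limit_insert`
itself comes from this patch; the store name, schema name, paths, and table
definition below are hypothetical, assumed for illustration.

    import datajoint as dj

    # Hypothetical filepath store; "raw", the paths, and the schema name
    # are illustrative, not part of this patch.
    dj.config["stores"] = {
        "raw": dict(protocol="file", location="/data/external", stage="/data/raw")
    }

    # New option from this patch: skip checksum computation on insert for
    # files larger than 1 GiB. The default (None) preserves current behavior.
    dj.config["filepath_checksum_size_limit_insert"] = 1024**3

    schema = dj.Schema("demo")

    @schema
    class Recording(dj.Manual):
        definition = """
        recording_id : int
        ---
        raw_file : filepath@raw
        """

    # For a file above the limit, upload_filepath() logs a warning and
    # stores contents_hash as NULL instead of hashing the file.
    Recording.insert1({"recording_id": 1, "raw_file": "/data/raw/session1.dat"})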