From d49265a52453615bec83a73fb87aac2f0325c2b3 Mon Sep 17 00:00:00 2001
From: Jan Buchar
Date: Thu, 23 Apr 2026 10:52:26 +0200
Subject: [PATCH] fix: Fix StorageInstanceManager cache eviction

---
Notes (ignored by `git am`; not part of the commit message):

What changes in remove_from_cache:
  * Before: for the by_id and by_name caches, the loop deleted the first
    additional key it encountered and then stopped, without checking which
    storage instance that entry held. In the alias branch it did the same
    for every alias key of the storage type.
  * After: each loop walks a snapshot (`list(....items())`) of the inner
    mapping and deletes only the entries whose cached value `is` the
    storage instance being removed. The snapshot is what makes deleting
    during the loop safe.

What the new test covers:
  * The same name (or the default, name=None) is opened once with a
    MemoryStorageClient and once with a FileSystemStorageClient, and the two
    results are asserted to be distinct objects.
  * One of them is dropped. Re-opening with the other client must return the
    original object, while re-opening with the dropped client must return a
    new object. Both drop orders are parametrized.

NOTE(review): the line breaks and indentation of this patch were restored
from a whitespace-collapsed copy. Indentation of context lines is inferred,
so run `git apply --check` before applying.

 .../storages/_storage_instance_manager.py | 18 ++++++-------
 .../storages/test_storage_instance_manager.py | 25 +++++++++++++++++++
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py
index 62b6c4720b..6e6c8c2359 100644
--- a/src/crawlee/storages/_storage_instance_manager.py
+++ b/src/crawlee/storages/_storage_instance_manager.py
@@ -46,20 +46,20 @@ def remove_from_cache(self, storage_instance: Storage) -> None:
         storage_type = type(storage_instance)
 
         # Remove from ID cache
-        for additional_key in self.by_id[storage_type][storage_instance.id]:
-            del self.by_id[storage_type][storage_instance.id][additional_key]
-            break
+        for additional_key, cached in list(self.by_id[storage_type][storage_instance.id].items()):
+            if cached is storage_instance:
+                del self.by_id[storage_type][storage_instance.id][additional_key]
 
         # Remove from name cache or alias cache. It can never be in both.
         if storage_instance.name is not None:
-            for additional_key in self.by_name[storage_type][storage_instance.name]:
-                del self.by_name[storage_type][storage_instance.name][additional_key]
-                break
+            for additional_key, cached in list(self.by_name[storage_type][storage_instance.name].items()):
+                if cached is storage_instance:
+                    del self.by_name[storage_type][storage_instance.name][additional_key]
         else:
             for alias_key in self.by_alias[storage_type]:
-                for additional_key in self.by_alias[storage_type][alias_key]:
-                    del self.by_alias[storage_type][alias_key][additional_key]
-                    break
+                for additional_key, cached in list(self.by_alias[storage_type][alias_key].items()):
+                    if cached is storage_instance:
+                        del self.by_alias[storage_type][alias_key][additional_key]
 
 
 ClientOpenerCoro = Coroutine[None, None, DatasetClient | KeyValueStoreClient | RequestQueueClient]
diff --git a/tests/unit/storages/test_storage_instance_manager.py b/tests/unit/storages/test_storage_instance_manager.py
index db7b75707e..d8fcc48437 100644
--- a/tests/unit/storages/test_storage_instance_manager.py
+++ b/tests/unit/storages/test_storage_instance_manager.py
@@ -189,3 +189,28 @@ async def open_dataset(name: str | None, alias: str | None) -> None:
 
     dataset = await Dataset.open(name=valid_kwargs.get('name'), alias=valid_kwargs.get('alias'))
     await dataset.drop()
+
+
+@pytest.mark.parametrize('name', ['my-storage', None], ids=['named', 'default'])
+@pytest.mark.parametrize('drop_first', [True, False], ids=['drop-first', 'drop-second'])
+async def test_drop_only_evicts_own_cache_entry(
+    tmp_path: Path, storage_type: type[Storage], name: str | None, *, drop_first: bool
+) -> None:
+    """Dropping a storage evicts only its own cache entry; siblings with the same name but a different client stay."""
+    config = Configuration(purge_on_start=True, storage_dir=str(tmp_path))
+    mem_client = MemoryStorageClient()
+    fs_client = FileSystemStorageClient()
+
+    # Two different client types produce different cache keys under the same name.
+    storage_mem = await storage_type.open(name=name, storage_client=mem_client, configuration=config)
+    storage_fs = await storage_type.open(name=name, storage_client=fs_client, configuration=config)
+    assert storage_mem is not storage_fs
+
+    if drop_first:
+        await storage_mem.drop()
+        assert await storage_type.open(name=name, storage_client=fs_client, configuration=config) is storage_fs
+        assert await storage_type.open(name=name, storage_client=mem_client, configuration=config) is not storage_mem
+    else:
+        await storage_fs.drop()
+        assert await storage_type.open(name=name, storage_client=mem_client, configuration=config) is storage_mem
+        assert await storage_type.open(name=name, storage_client=fs_client, configuration=config) is not storage_fs