From de88ffb05c81f9711bfed261c3f039d6d2f96272 Mon Sep 17 00:00:00 2001
From: Sebastiaan Huber
Date: Tue, 21 May 2024 14:19:29 +0200
Subject: [PATCH] Archive: Respect `filter_size` in query for existing nodes
 (#6404)

The `QueryParams` dataclass defines the `filter_size` attribute which is
used in all queries to limit the number of parameters used in a query.
This is necessary because without it, large archives would result in
queries with a lot of parameters which can cause exceptions in database
backends, such as SQLite, which define a limit of 1000 by default.

The `aiida.tools.archive._import_nodes` function was not respecting this
setting when determining the set of nodes from the archive that already
exist in the target storage. This would result in an exception when
trying to import a large archive into a storage using SQLite.

The problem is fixed by using the `batch_iter` utility to retrieve the
existing UUIDs in batches of size `filter_size`.
---
 src/aiida/tools/archive/imports.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/aiida/tools/archive/imports.py b/src/aiida/tools/archive/imports.py
index bdc47519f0..abff87917c 100644
--- a/src/aiida/tools/archive/imports.py
+++ b/src/aiida/tools/archive/imports.py
@@ -460,12 +460,17 @@ def _import_nodes(
     # get matching uuids from the backend
     backend_uuid_id: Dict[str, int] = {}
+    input_id_uuid_uuids = list(input_id_uuid.values())
+
     if input_id_uuid:
-        backend_uuid_id = dict(
-            orm.QueryBuilder(backend=backend_to)
-            .append(orm.Node, filters={'uuid': {'in': list(input_id_uuid.values())}}, project=['uuid', 'id'])
-            .all(batch_size=query_params.batch_size)
-        )
+        for _, batch in batch_iter(input_id_uuid_uuids, query_params.filter_size):
+            backend_uuid_id.update(
+                dict(
+                    orm.QueryBuilder(backend=backend_to)
+                    .append(orm.Node, filters={'uuid': {'in': batch}}, project=['uuid', 'id'])
+                    .all(batch_size=query_params.batch_size)
+                )
+            )
 
     new_nodes = len(input_id_uuid) - len(backend_uuid_id)
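
For context, the batching pattern in the hunk above can be illustrated with a
minimal, self-contained sketch. The `batch_iter` below is a hypothetical
stand-in that mimics only the interface visible in the diff (yielding
`(count, batch)` pairs); it is not AiiDA's actual utility, and the UUID list
and the filter size of 999 are made-up example values.

    from itertools import islice
    from typing import Iterable, Iterator, List, Tuple, TypeVar

    T = TypeVar('T')

    def batch_iter(iterable: Iterable[T], size: int) -> Iterator[Tuple[int, List[T]]]:
        """Yield (count, batch) pairs of at most ``size`` items each.

        Hypothetical stand-in mirroring the interface used in the diff,
        not AiiDA's actual implementation.
        """
        iterator = iter(iterable)
        while True:
            batch = list(islice(iterator, size))
            if not batch:
                return
            yield len(batch), batch

    # A filter like {'uuid': {'in': batch}} binds one query parameter per
    # list element, so splitting 2500 UUIDs into batches of 999 keeps every
    # individual query below a backend's parameter limit (the commit cites a
    # default limit of 1000 for SQLite).
    uuids = [f'uuid-{i}' for i in range(2500)]
    for count, batch in batch_iter(uuids, 999):
        print(count, batch[0], batch[-1])  # batches of 999, 999 and 502

The fix accumulates the per-batch query results with `backend_uuid_id.update(...)`,
so the final mapping is identical to what the single large query produced before,
just built up incrementally.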