From 26c4c2a8ed1101d474b7fe9b6ac2b387ec0aada2 Mon Sep 17 00:00:00 2001
From: Sebastiaan Huber
Date: Mon, 20 May 2024 16:22:13 +0200
Subject: [PATCH] Archive: Respect `filter_size` in query for existing nodes

The `QueryParams` dataclass defines the `filter_size` attribute, which is
used in all queries to limit the number of parameters in a single query.
Without this limit, large archives would produce queries with so many
parameters that some database backends raise an exception; SQLite, for
example, defines a limit of 1000 by default.

The `aiida.tools.archive._import_nodes` function was not respecting this
setting when determining the set of nodes from the archive that already
exist in the target storage. This would result in an exception when
trying to import a large archive into a storage backed by SQLite.

The problem is fixed by using the `batch_iter` utility to retrieve the
existing UUIDs in batches of size `filter_size`.
---
 src/aiida/tools/archive/imports.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/aiida/tools/archive/imports.py b/src/aiida/tools/archive/imports.py
index bdc47519f0..abff87917c 100644
--- a/src/aiida/tools/archive/imports.py
+++ b/src/aiida/tools/archive/imports.py
@@ -460,12 +460,17 @@ def _import_nodes(
 
     # get matching uuids from the backend
     backend_uuid_id: Dict[str, int] = {}
+    input_id_uuid_uuids = list(input_id_uuid.values())
+
     if input_id_uuid:
-        backend_uuid_id = dict(
-            orm.QueryBuilder(backend=backend_to)
-            .append(orm.Node, filters={'uuid': {'in': list(input_id_uuid.values())}}, project=['uuid', 'id'])
-            .all(batch_size=query_params.batch_size)
-        )
+        for _, batch in batch_iter(input_id_uuid_uuids, query_params.filter_size):
+            backend_uuid_id.update(
+                dict(
+                    orm.QueryBuilder(backend=backend_to)
+                    .append(orm.Node, filters={'uuid': {'in': batch}}, project=['uuid', 'id'])
+                    .all(batch_size=query_params.batch_size)
+                )
+            )
 
     new_nodes = len(input_id_uuid) - len(backend_uuid_id)
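
For illustration, below is a minimal, self-contained sketch of the batching
pattern the patch adopts. The `batch_iter` here is a simplified stand-in for
`aiida.tools.archive.common.batch_iter`, written to yield `(count, batch)`
tuples so it matches the `for _, batch in ...` unpacking in the diff, and
`query_existing` is a hypothetical stub replacing the real `QueryBuilder`
query against the target backend:

    from typing import Any, Dict, Generator, Iterable, List, Tuple


    def batch_iter(
        iterable: Iterable[Any], size: int
    ) -> Generator[Tuple[int, List[Any]], None, None]:
        """Yield ``(count, batch)`` tuples with at most ``size`` items per batch.

        Simplified stand-in for ``aiida.tools.archive.common.batch_iter``.
        """
        batch: List[Any] = []
        for item in iterable:
            batch.append(item)
            if len(batch) == size:
                yield len(batch), batch
                batch = []
        if batch:
            yield len(batch), batch


    def query_existing(batch: List[str]) -> List[Tuple[str, int]]:
        """Hypothetical stub for the ``QueryBuilder`` call: map UUIDs to ids."""
        return [(uuid, index) for index, uuid in enumerate(batch)]


    uuids = [f'uuid-{i}' for i in range(2500)]
    filter_size = 999  # keep each query's parameter count below the backend limit

    backend_uuid_id: Dict[str, int] = {}
    for _, batch in batch_iter(uuids, filter_size):
        # Each iteration issues one query whose ``IN`` clause holds at most
        # ``filter_size`` parameters, instead of one query with 2500 of them.
        backend_uuid_id.update(dict(query_existing(batch)))

    assert len(backend_uuid_id) == len(uuids)

Note that `batch_size` and `filter_size` play different roles in the diff:
`batch_size` controls how many result rows `QueryBuilder.all` fetches per
round trip, while `filter_size` caps how many parameters are embedded in a
single query; the patch batches on the latter.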