From de88ffb05c81f9711bfed261c3f039d6d2f96272 Mon Sep 17 00:00:00 2001
From: Sebastiaan Huber
Date: Tue, 21 May 2024 14:19:29 +0200
Subject: [PATCH] Archive: Respect `filter_size` in query for existing nodes
 (#6404)

The `QueryParams` dataclass defines the `filter_size` attribute which is
used in all queries to limit the number of parameters used in a query.
This is necessary because without it, large archives would result in
queries with a lot of parameters which can cause exceptions in database
backends, such as SQLite, which define a limit of 1000 by default.

The `aiida.tools.archive._import_nodes` function was not respecting this
setting when determining the set of nodes from the archive that already
exist in the target storage. This would result in an exception when
trying to import a large archive into a storage using SQLite.

The problem is fixed by using the `batch_iter` utility to retrieve the
existing UUIDs in batches of size `filter_size`.
---
 src/aiida/tools/archive/imports.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/aiida/tools/archive/imports.py b/src/aiida/tools/archive/imports.py
index bdc47519f0..abff87917c 100644
--- a/src/aiida/tools/archive/imports.py
+++ b/src/aiida/tools/archive/imports.py
@@ -460,12 +460,17 @@ def _import_nodes(
     # get matching uuids from the backend
     backend_uuid_id: Dict[str, int] = {}
+    input_id_uuid_uuids = list(input_id_uuid.values())
+
     if input_id_uuid:
-        backend_uuid_id = dict(
-            orm.QueryBuilder(backend=backend_to)
-            .append(orm.Node, filters={'uuid': {'in': list(input_id_uuid.values())}}, project=['uuid', 'id'])
-            .all(batch_size=query_params.batch_size)
-        )
+        for _, batch in batch_iter(input_id_uuid_uuids, query_params.filter_size):
+            backend_uuid_id.update(
+                dict(
+                    orm.QueryBuilder(backend=backend_to)
+                    .append(orm.Node, filters={'uuid': {'in': batch}}, project=['uuid', 'id'])
+                    .all(batch_size=query_params.batch_size)
+                )
+            )
 
     new_nodes = len(input_id_uuid) - len(backend_uuid_id)
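
For context, the batching pattern in the hunk above can be illustrated with a
minimal, self-contained sketch. The `batch_iter` below is a hypothetical
stand-in that mimics only the interface visible in the diff (yielding
`(count, batch)` pairs); it is not AiiDA's actual utility, and the UUID list
and the filter size of 999 are made-up example values.

    from itertools import islice
    from typing import Iterable, Iterator, List, Tuple, TypeVar

    T = TypeVar('T')

    def batch_iter(iterable: Iterable[T], size: int) -> Iterator[Tuple[int, List[T]]]:
        """Yield (count, batch) pairs of at most ``size`` items each.

        Hypothetical stand-in mirroring the interface used in the diff,
        not AiiDA's actual implementation.
        """
        iterator = iter(iterable)
        while True:
            batch = list(islice(iterator, size))
            if not batch:
                return
            yield len(batch), batch

    # A filter like {'uuid': {'in': batch}} binds one query parameter per
    # list element, so splitting 2500 UUIDs into batches of 999 keeps every
    # individual query below a backend's parameter limit (the commit cites a
    # default limit of 1000 for SQLite).
    uuids = [f'uuid-{i}' for i in range(2500)]
    for count, batch in batch_iter(uuids, 999):
        print(count, batch[0], batch[-1])  # batches of 999, 999 and 502

The fix accumulates the per-batch query results with `backend_uuid_id.update(...)`,
so the final mapping is identical to what the single large query produced before,
just built up incrementally.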