From 334dbd6e75b6dc631e8bed7025196f77dfb5e674 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Sat, 19 Oct 2024 16:52:52 +0200
Subject: [PATCH 1/7] tests: set root log level to DEBUG during tests

Ease diagnosing test failures.
---
 qubes/tests/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/qubes/tests/__init__.py b/qubes/tests/__init__.py
index c20d0c073..305733701 100644
--- a/qubes/tests/__init__.py
+++ b/qubes/tests/__init__.py
@@ -750,6 +750,7 @@ def setUp(self):
         ha_syslog.setFormatter(
             logging.Formatter('%(name)s[%(process)d]: %(message)s'))
         logging.root.addHandler(ha_syslog)
+        logging.root.setLevel("DEBUG")
 
         self.log.critical('starting')
 

From f42f5b6315088221ea5f563efd4cef9ecf818f7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Sun, 13 Oct 2024 15:11:12 +0200
Subject: [PATCH 2/7] DEBUG: log list of VMs when startup failed

Also log basic Xen info (including free memory).
---
 qubes/vm/qubesvm.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/qubes/vm/qubesvm.py b/qubes/vm/qubesvm.py
index dd76c4a00..c40100a31 100644
--- a/qubes/vm/qubesvm.py
+++ b/qubes/vm/qubesvm.py
@@ -1222,8 +1222,13 @@ async def start(self, start_guid=True, notify_function=None,
 
             self._update_libvirt_domain()
 
+            free_mem = subprocess.check_output(["xl", "info", "free_memory"]).decode()
+            self.log.debug("free mem before start: %s", free_mem)
+            subprocess.run(["xl", "list"])
             self.libvirt_domain.createWithFlags(
                 libvirt.VIR_DOMAIN_START_PAUSED)
+            free_mem = subprocess.check_output(["xl", "info", "free_memory"]).decode()
+            self.log.debug("free mem after start: %s", free_mem)
 
             # the above allocates xid, lets announce that
             self.fire_event('property-reset:xid', name='xid')
@@ -1239,6 +1244,10 @@ async def start(self, start_guid=True, notify_function=None,
                 exc = qubes.exc.QubesException(
                     'Failed to start an HVM qube with PCI devices assigned '
                     '- hardware does not support IOMMU/VT-d/AMD-Vi')
+            free_mem = subprocess.check_output(["xl", "info", "free_memory"]).decode()
+            self.log.debug("free mem after failed start: %s", free_mem)
+            subprocess.run(["xl", "list"])
+            subprocess.run(["xl", "info"])
             self.log.error('Start failed: %s', str(exc))
             await self.fire_event_async('domain-start-failed',
                 reason=str(exc))
@@ -1757,6 +1766,7 @@ def request_memory(self, mem_required=None):
             initial_memory = self.memory
             mem_required = int(initial_memory + stubdom_mem) * 1024 * 1024
+            self.log.debug(f"mem required: {mem_required}")
         qmemman_client = qubes.qmemman.client.QMemmanClient()
         try:
             mem_required_with_overhead = mem_required + MEM_OVERHEAD_BASE \
                 + self.vcpus * MEM_OVERHEAD_PER_VCPU
@@ -1767,6 +1777,7 @@ def request_memory(self, mem_required=None):
                 mem_required_with_overhead += self.maxmem * 4096
             got_memory = qmemman_client.request_memory(
                 mem_required_with_overhead)
+            self.log.debug(f"mem required with overhead: {mem_required_with_overhead}")
         except IOError as e:
             raise IOError('Failed to connect to qmemman: {!s}'.format(e))
 

From 7eea85e76b843dfa17e03a5f2fed0c7fc6b9be5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Sun, 20 Oct 2024 05:29:09 +0200
Subject: [PATCH 3/7] Update Xen's memory overhead formula

Experiments show that using memory hotplug or populate-on-demand makes
no difference in the memory required at startup. But PV vs PVH/HVM does
make a difference - PV doesn't need the extra per-MB overhead at all.

On top of that, experimentally find the correct factor. Do it by
starting a VM (paused) with different parameters and comparing
`xl info free_memory` before and after.

memory / maxmem: difference
   400 /  4000: 434
   600 /  4000: 634
   400 /   400: 405
   400 /   600: 407
   400 /  2000: 418
  2000 /  2000: 2018
   600 /   600: 607

All of the above are with 2 vcpus. Testing with other vcpu counts shows
that the 1.5MB-per-vcpu term is quite accurate.

As seen above, the initial memory doesn't affect the overhead - maxmem
is what counts. Applying linear regression to the data gives about
0.008MB of overhead per MB of maxmem, so round it to 8192 bytes
(2 pages). The base overhead of 4MB doesn't match exactly, but since
the calculated value is smaller, leave it at 4MB as a safety margin.

Fixes QubesOS/qubes-issues#9431
---
 qubes/vm/qubesvm.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/qubes/vm/qubesvm.py b/qubes/vm/qubesvm.py
index c40100a31..5001838e7 100644
--- a/qubes/vm/qubesvm.py
+++ b/qubes/vm/qubesvm.py
@@ -1771,10 +1771,11 @@ def request_memory(self, mem_required=None):
         try:
             mem_required_with_overhead = mem_required + MEM_OVERHEAD_BASE \
                 + self.vcpus * MEM_OVERHEAD_PER_VCPU
-            if self.use_memory_hotplug:
-                # extra overhead to account future hotplug memory
-                # 1 page per 1MB of RAM, see libxl__get_required_paging_memory()
-                mem_required_with_overhead += self.maxmem * 4096
+            maxmem = self.maxmem if self.maxmem else self.memory
+            if self.virt_mode != "pv":
+                # extra overhead to account (possibly future hotplug) memory
+                # 2 pages per 1MB of RAM, see libxl__get_required_paging_memory()
+                mem_required_with_overhead += maxmem * 8192
             got_memory = qmemman_client.request_memory(
                 mem_required_with_overhead)
             self.log.debug(f"mem required with overhead: {mem_required_with_overhead}")

From b6421d0f78293d401226c05c09cfcdbd3099b526 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Tue, 22 Oct 2024 13:41:46 +0200
Subject: [PATCH 4/7] qmemman: refresh domain list directly instead of
 scheduling

... for the next watcher loop iteration.

If two VMs are started in parallel, there may be no watcher loop
iteration between handling their requests. This means the memory
request for the second VM will operate on an outdated list of VMs and
may not account for some allocations (assuming memory is free while in
fact it's already allocated to another VM). If that happens, the second
VM may fail to start with an out-of-memory error.

This is a very similar problem to the one described in
QubesOS/qubes-issues#1389, but it affects the actual VM startup, not
its auxiliary processes.

Fixes QubesOS/qubes-issues#9431
---
 qubes/tools/qmemmand.py | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/qubes/tools/qmemmand.py b/qubes/tools/qmemmand.py
index df61f172f..c348231ac 100644
--- a/qubes/tools/qmemmand.py
+++ b/qubes/tools/qmemmand.py
@@ -20,6 +20,7 @@
 # pylint: disable=global-statement
 
 import configparser
+import functools
 import socketserver
 import logging
 import logging.handlers
@@ -41,15 +42,6 @@
 system_state = qubes.qmemman.systemstate.SystemState()
 global_lock = threading.Lock()
-# If XSWatcher will
-# handle meminfo event before @introduceDomain, it will use
-# incomplete domain list for that and may redistribute memory
-# allocated to some VM, but not yet used (see #1389).
-# To fix that, system_state should be updated (refresh domain
-# list) before processing other changes, every time some process requested
-# memory for a new VM, before releasing the lock. Then XS_Watcher will check
-# this flag before processing other event.
-force_refresh_domain_list = False
 
 
 def only_in_first_list(list1, list2):
     ret = []
@@ -153,10 +145,6 @@ def meminfo_changed(self, domain_id):
         with global_lock:
             self.log.debug('global_lock acquired')
             try:
-                global force_refresh_domain_list
-                if force_refresh_domain_list:
-                    self.domain_list_changed(refresh_only=True)
-                    force_refresh_domain_list = False
                 if domain_id not in self.watch_token_dict:
                     # domain just destroyed
                     return
@@ -184,6 +172,10 @@ class QMemmanReqHandler(socketserver.BaseRequestHandler):
     client.
     """
 
+    def __init__(self, watcher: XSWatcher, request, client_address, server):
+        self.watcher = watcher
+        super().__init__(request, client_address, server)
+
     def handle(self):
         self.log = logging.getLogger('qmemman.daemon.reqhandler')
 
@@ -196,8 +188,10 @@ def handle(self):
             if len(self.data) == 0:
                 self.log.info('client disconnected, resuming membalance')
                 if got_lock:
-                    global force_refresh_domain_list
-                    force_refresh_domain_list = True
+                    # got_lock = True means some VM may have been just created
+                    # ensure next watch or client request handler will use
+                    # updated domain list
+                    self.watcher.domain_list_changed(refresh_only=True)
                 return
 
             # XXX something is wrong here: return without release?
@@ -290,7 +284,11 @@ def main():
     # Initialize the connection to Xen and to XenStore
     system_state.init()
 
-    server = socketserver.UnixStreamServer(SOCK_PATH, QMemmanReqHandler)
+    watcher = XSWatcher()
+    server = socketserver.UnixStreamServer(
+        SOCK_PATH,
+        functools.partial(QMemmanReqHandler, watcher)
+    )
     os.umask(0o077)
 
     # notify systemd
@@ -305,4 +303,4 @@ def main():
     sock.close()
 
     threading.Thread(target=server.serve_forever).start()
-    XSWatcher().watch_loop()
+    watcher.watch_loop()

From 820e8c75aebd96b4f2922264b272cd6c2b0afa32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Wed, 23 Oct 2024 12:54:16 +0200
Subject: [PATCH 5/7] DEBUG: raise default qmemman log level

---
 etc/qmemman.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/qmemman.conf b/etc/qmemman.conf
index e86c158f5..0175ad97a 100644
--- a/etc/qmemman.conf
+++ b/etc/qmemman.conf
@@ -14,4 +14,4 @@ cache-margin-factor = 1.3
 
 # log-level - Warning 30, Info 20, Debug 10
 # Default: 30
-log-level = 30
+log-level = 20

From 4e3818eed1ec40ddd3e99c380cfcd438c246282a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Wed, 23 Oct 2024 23:53:36 +0200
Subject: [PATCH 6/7] DEBUG: more qmemman debugging

---
 qubes/tools/qmemmand.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/qubes/tools/qmemmand.py b/qubes/tools/qmemmand.py
index c348231ac..e3b70b5ce 100644
--- a/qubes/tools/qmemmand.py
+++ b/qubes/tools/qmemmand.py
@@ -86,11 +86,11 @@ def domain_list_changed(self, refresh_only=False):
 
         got_lock = False
         if not refresh_only:
-            self.log.debug('acquiring global_lock')
+            self.log.info('acquiring global_lock - list changed')
             # pylint: disable=consider-using-with
             global_lock.acquire()
             got_lock = True
-            self.log.debug('global_lock acquired')
+            self.log.info('global_lock acquired - list changed')
         try:
             curr = self.handle.ls('', '/local/domain')
             if curr is None:
@@ -125,7 +125,7 @@ def domain_list_changed(self, refresh_only=False):
         finally:
             if got_lock:
                 global_lock.release()
-                self.log.debug('global_lock released')
+                self.log.info('global_lock released - list changed')
 
         if not refresh_only:
             try:
@@ -141,9 +141,9 @@ def meminfo_changed(self, domain_id):
         if untrusted_meminfo_key is None or untrusted_meminfo_key == b'':
             return
 
-        self.log.debug('acquiring global_lock')
+        self.log.info('acquiring global_lock - meminfo changed')
         with global_lock:
-            self.log.debug('global_lock acquired')
+            self.log.info('global_lock acquired - meminfo changed')
             try:
                 if domain_id not in self.watch_token_dict:
                     # domain just destroyed
@@ -152,7 +152,7 @@ def meminfo_changed(self, domain_id):
                 system_state.refresh_meminfo(domain_id, untrusted_meminfo_key)
             except:  # pylint: disable=bare-except
                 self.log.exception('Updating meminfo for %s failed', domain_id)
-        self.log.debug('global_lock released')
+        self.log.info('global_lock released - meminfo changed')
 
     def watch_loop(self):
         self.log.debug('watch_loop()')
@@ -184,7 +184,7 @@ def handle(self):
         # self.request is the TCP socket connected to the client
         while True:
             self.data = self.request.recv(1024).strip()
-            self.log.debug('data=%r', self.data)
+            self.log.info('data=%r', self.data)
             if len(self.data) == 0:
                 self.log.info('client disconnected, resuming membalance')
                 if got_lock:
@@ -199,10 +199,10 @@ def handle(self):
                 self.log.warning('Second request over qmemman.sock?')
                 return
 
-            self.log.debug('acquiring global_lock')
+            self.log.info('acquiring global_lock - handle')
             # pylint: disable=consider-using-with
             global_lock.acquire()
-            self.log.debug('global_lock acquired')
+            self.log.info('global_lock acquired - handle')
             got_lock = True
 
             if (self.data.isdigit() and
@@ -218,7 +218,7 @@ def handle(self):
         finally:
             if got_lock:
                 global_lock.release()
-                self.log.debug('global_lock released')
+                self.log.info('global_lock released - handle')
 
 
 parser = qubes.tools.QubesArgumentParser(want_app=False)

From efb85e2e32a0d4e6eabbed35177a258db2d2790d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Thu, 24 Oct 2024 04:34:56 +0200
Subject: [PATCH 7/7] qmemman: always execute do_balance() under a lock

Any memory adjustment must be done while holding the lock, to avoid
interfering with client request handling. This is critical to prevent
memory just freed for a new VM from being re-allocated elsewhere.

The domain_list_changed() function failed to do that - the do_balance()
call was made after releasing the lock. It wasn't a problem for a long
time because of Python's global interpreter lock, but Python 3.13 is
finally starting to support proper parallel thread execution, and that
revealed this bug.

Fixes QubesOS/qubes-issues#9431
---
 qubes/tools/qmemmand.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/qubes/tools/qmemmand.py b/qubes/tools/qmemmand.py
index e3b70b5ce..2c901c791 100644
--- a/qubes/tools/qmemmand.py
+++ b/qubes/tools/qmemmand.py
@@ -120,6 +120,12 @@ def domain_list_changed(self, refresh_only=False):
                                     self.watch_token_dict[i])
                 self.watch_token_dict.pop(i)
                 system_state.del_domain(i)
+
+            if not refresh_only:
+                try:
+                    system_state.do_balance()
+                except:  # pylint: disable=bare-except
+                    self.log.exception('do_balance() failed')
         except:  # pylint: disable=bare-except
             self.log.exception('Updating domain list failed')
         finally:
@@ -127,13 +133,6 @@ def domain_list_changed(self, refresh_only=False):
             if got_lock:
                 global_lock.release()
                 self.log.info('global_lock released - list changed')
-        if not refresh_only:
-            try:
-                system_state.do_balance()
-            except:  # pylint: disable=bare-except
-                self.log.exception('do_balance() failed')
-
-
     def meminfo_changed(self, domain_id):
         self.log.debug('meminfo_changed(domain_id={!r})'.format(domain_id))
         untrusted_meminfo_key = self.handle.read(
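
As a cross-check of patch 3, the regression mentioned in its commit message can
be reproduced from the quoted measurements. The sketch below is illustrative
only; it assumes numpy is available, and the 4MB / 1.5MB figures correspond to
the MEM_OVERHEAD_BASE / MEM_OVERHEAD_PER_VCPU constants referenced in the
message.

# Sketch: fit the per-MB-of-maxmem overhead from the measurements quoted in
# patch 3's commit message (all taken with 2 vcpus). The triples are
# (memory MB, maxmem MB, observed `xl info free_memory` drop in MB).
import numpy as np

measurements = [
    (400, 4000, 434),
    (600, 4000, 634),
    (400, 400, 405),
    (400, 600, 407),
    (400, 2000, 418),
    (2000, 2000, 2018),
    (600, 600, 607),
]

maxmem = np.array([m for _, m, _ in measurements], dtype=float)
# overhead = observed drop minus the VM's initial memory, which
# request_memory() already accounts for separately
overhead = np.array([d - mem for mem, _, d in measurements], dtype=float)

slope_mb, base_mb = np.polyfit(maxmem, overhead, 1)
print(f"per MB of maxmem: {slope_mb:.4f} MB (~{slope_mb * 1024 * 1024:.0f} bytes)")
print(f"base:             {base_mb:.1f} MB")
# Prints roughly 0.008 MB per MB of maxmem (the patch uses 8192 bytes, i.e.
# 2 pages) and a base of about 2 MB, below the existing 4 MB
# MEM_OVERHEAD_BASE that the patch keeps as a safety margin.

Plugging the largest measured case into the new formula: a 400/4000 non-PV qube
with 2 vcpus needs 400MB + 4MB + 2 * 1.5MB + 4000 * 8192 bytes, roughly 438MB,
comfortably above the measured 434MB difference.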
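
A note on the functools.partial change in patch 4: socketserver constructs a
new handler object per connection by calling
RequestHandlerClass(request, client_address, server), so the shared XSWatcher
has to be bound in front of those three arguments. A minimal, self-contained
sketch of the same pattern; EchoHandler and the socket path are made-up names
for illustration, not part of the series.

# socketserver instantiates the handler class itself for every connection,
# so shared state is pre-bound with functools.partial ahead of the three
# arguments socketserver supplies.
import functools
import socketserver

class EchoHandler(socketserver.BaseRequestHandler):
    def __init__(self, shared_state, request, client_address, server):
        # shared_state is bound by functools.partial below
        self.shared_state = shared_state
        super().__init__(request, client_address, server)

    def handle(self):
        # count connections in the shared object, then echo the data back
        self.shared_state["connections"] += 1
        data = self.request.recv(1024)
        self.request.sendall(data)

if __name__ == "__main__":
    state = {"connections": 0}
    server = socketserver.UnixStreamServer(
        "/tmp/echo.sock",  # hypothetical path; remove it between runs
        functools.partial(EchoHandler, state),
    )
    server.serve_forever()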
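
The race patch 7 closes comes down to where do_balance() sits relative to
global_lock. A structural sketch of the before/after control flow, using
stand-in stubs rather than the real qmemman code:

# Only the position of do_balance() relative to the lock matters here;
# SystemState and the method bodies are stubs.
import threading

global_lock = threading.Lock()

class SystemState:
    def refresh_domain_list(self):
        ...  # stand-in for the xenstore-driven bookkeeping

    def do_balance(self):
        ...  # stand-in for redistributing memory between domains

system_state = SystemState()

def domain_list_changed_before(refresh_only=False):
    with global_lock:
        system_state.refresh_domain_list()
    # Lock released here: a request-handler thread can now acquire it, free
    # memory from other domains and earmark it for a new VM ...
    if not refresh_only:
        # ... and this unserialized balance run can hand that memory back
        # out, so the new VM later fails with an out-of-memory error.
        system_state.do_balance()

def domain_list_changed_after(refresh_only=False):
    with global_lock:
        system_state.refresh_domain_list()
        if not refresh_only:
            # balance while still holding the lock, as patch 7 does
            system_state.do_balance()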