DEBUG: log list of VMs when startup failed #624

Closed
wants to merge 7 commits into from
2 changes: 1 addition & 1 deletion etc/qmemman.conf
@@ -14,4 +14,4 @@ cache-margin-factor = 1.3

# log-level - Warning 30, Info 20, Debug 10
# Default: 30
log-level = 30
log-level = 20
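
The numeric values in the comment map directly onto Python's logging constants (WARNING = 30, INFO = 20, DEBUG = 10), so this change turns on info-level messages in qmemman. A minimal, self-contained sketch of how such a key/value pair can be parsed and applied with configparser — the [global] section exists only so configparser accepts the bare assignment, and the daemon's real config handling may differ:

import configparser
import logging

# The setting shown in the hunk above, wrapped in a dummy section header.
snippet = "[global]\nlog-level = 20\n"

cfg = configparser.ConfigParser()
cfg.read_string(snippet)
level = cfg.getint("global", "log-level", fallback=30)   # 30 = WARNING

logging.root.setLevel(level)   # 20 -> INFO, 10 -> DEBUG
print(logging.getLevelName(logging.root.getEffectiveLevel()))   # INFO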
1 change: 1 addition & 0 deletions qubes/tests/__init__.py
@@ -750,6 +750,7 @@ def setUp(self):
ha_syslog.setFormatter(
logging.Formatter('%(name)s[%(process)d]: %(message)s'))
logging.root.addHandler(ha_syslog)
logging.root.setLevel("DEBUG")

self.log.critical('starting')

65 changes: 31 additions & 34 deletions qubes/tools/qmemmand.py
@@ -20,6 +20,7 @@
# pylint: disable=global-statement

import configparser
import functools
import socketserver
import logging
import logging.handlers
@@ -41,15 +42,6 @@

system_state = qubes.qmemman.systemstate.SystemState()
global_lock = threading.Lock()
# If XSWatcher will
# handle meminfo event before @introduceDomain, it will use
# incomplete domain list for that and may redistribute memory
# allocated to some VM, but not yet used (see #1389).
# To fix that, system_state should be updated (refresh domain
# list) before processing other changes, every time some process requested
# memory for a new VM, before releasing the lock. Then XS_Watcher will check
# this flag before processing other event.
force_refresh_domain_list = False

def only_in_first_list(list1, list2):
ret = []
@@ -94,11 +86,11 @@ def domain_list_changed(self, refresh_only=False):

got_lock = False
if not refresh_only:
self.log.debug('acquiring global_lock')
self.log.info('acquiring global_lock - list changed')
# pylint: disable=consider-using-with
global_lock.acquire()
got_lock = True
self.log.debug('global_lock acquired')
self.log.info('global_lock acquired - list changed')
try:
curr = self.handle.ls('', '/local/domain')
if curr is None:
@@ -128,19 +120,18 @@ def domain_list_changed(self, refresh_only=False):
self.watch_token_dict[i])
self.watch_token_dict.pop(i)
system_state.del_domain(i)

if not refresh_only:
try:
system_state.do_balance()
except: # pylint: disable=bare-except
self.log.exception('do_balance() failed')
except: # pylint: disable=bare-except
self.log.exception('Updating domain list failed')
finally:
if got_lock:
global_lock.release()
self.log.debug('global_lock released')

if not refresh_only:
try:
system_state.do_balance()
except: # pylint: disable=bare-except
self.log.exception('do_balance() failed')

self.log.info('global_lock released - list changed')

def meminfo_changed(self, domain_id):
self.log.debug('meminfo_changed(domain_id={!r})'.format(domain_id))
@@ -149,22 +140,18 @@ def meminfo_changed(self, domain_id):
if untrusted_meminfo_key is None or untrusted_meminfo_key == b'':
return

self.log.debug('acquiring global_lock')
self.log.info('acquiring global_lock - meminfo changed')
with global_lock:
self.log.debug('global_lock acquired')
self.log.info('global_lock acquired - meminfo changed')
try:
global force_refresh_domain_list
if force_refresh_domain_list:
self.domain_list_changed(refresh_only=True)
force_refresh_domain_list = False
if domain_id not in self.watch_token_dict:
# domain just destroyed
return

system_state.refresh_meminfo(domain_id, untrusted_meminfo_key)
except: # pylint: disable=bare-except
self.log.exception('Updating meminfo for %s failed', domain_id)
self.log.debug('global_lock released')
self.log.info('global_lock released - meminfo changed')

def watch_loop(self):
self.log.debug('watch_loop()')
@@ -184,6 +171,10 @@ class QMemmanReqHandler(socketserver.BaseRequestHandler):
client.
"""

def __init__(self, watcher: XSWatcher, request, client_address, server):
self.watcher = watcher
super().__init__(request, client_address, server)

def handle(self):
self.log = logging.getLogger('qmemman.daemon.reqhandler')

@@ -192,23 +183,25 @@ def handle(self):
# self.request is the TCP socket connected to the client
while True:
self.data = self.request.recv(1024).strip()
self.log.debug('data=%r', self.data)
self.log.info('data=%r', self.data)
if len(self.data) == 0:
self.log.info('client disconnected, resuming membalance')
if got_lock:
global force_refresh_domain_list
force_refresh_domain_list = True
# got_lock = True means some VM may have just been created
# ensure next watch or client request handler will use
# updated domain list
self.watcher.domain_list_changed(refresh_only=True)
return

# XXX something is wrong here: return without release?
if got_lock:
self.log.warning('Second request over qmemman.sock?')
return

self.log.debug('acquiring global_lock')
self.log.info('acquiring global_lock - handle')
# pylint: disable=consider-using-with
global_lock.acquire()
self.log.debug('global_lock acquired')
self.log.info('global_lock acquired - handle')

got_lock = True
if (self.data.isdigit() and
@@ -224,7 +217,7 @@ def handle(self):
finally:
if got_lock:
global_lock.release()
self.log.debug('global_lock released')
self.log.info('global_lock released - handle')


parser = qubes.tools.QubesArgumentParser(want_app=False)
@@ -290,7 +283,11 @@ def main():
# Initialize the connection to Xen and to XenStore
system_state.init()

server = socketserver.UnixStreamServer(SOCK_PATH, QMemmanReqHandler)
watcher = XSWatcher()
server = socketserver.UnixStreamServer(
SOCK_PATH,
functools.partial(QMemmanReqHandler, watcher)
)
os.umask(0o077)

# notify systemd
@@ -305,4 +302,4 @@ def main():
sock.close()

threading.Thread(target=server.serve_forever).start()
XSWatcher().watch_loop()
watcher.watch_loop()
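
The handler rewiring above uses a standard socketserver idiom: the server instantiates its handler class itself, so an extra dependency (the shared XSWatcher) is bound in advance with functools.partial, and the resulting partial is still a callable taking (request, client_address, server), which is all socketserver asks of its handler factory. A stripped-down sketch of the same pattern with made-up names, independent of qmemman:

import functools
import socketserver

class Watcher:
    def refresh(self):
        print("refreshing domain list")

class Handler(socketserver.BaseRequestHandler):
    def __init__(self, watcher, request, client_address, server):
        self.watcher = watcher   # dependency injected via functools.partial
        super().__init__(request, client_address, server)

    def handle(self):
        data = self.request.recv(1024)
        if not data:
            # mirrors domain_list_changed(refresh_only=True) on client disconnect
            self.watcher.refresh()

if __name__ == "__main__":
    watcher = Watcher()
    server = socketserver.UnixStreamServer(
        "/tmp/example.sock", functools.partial(Handler, watcher))
    server.serve_forever()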
20 changes: 16 additions & 4 deletions qubes/vm/qubesvm.py
@@ -1222,8 +1222,13 @@ async def start(self, start_guid=True, notify_function=None,

self._update_libvirt_domain()

free_mem = subprocess.check_output(["xl", "info", "free_memory"]).decode()
self.log.debug("free mem before start: %s", free_mem)
subprocess.run(["xl", "list"])
self.libvirt_domain.createWithFlags(
libvirt.VIR_DOMAIN_START_PAUSED)
free_mem = subprocess.check_output(["xl", "info", "free_memory"]).decode()
self.log.debug("free mem after start: %s", free_mem)

# the above allocates xid, lets announce that
self.fire_event('property-reset:xid', name='xid')
@@ -1239,6 +1244,10 @@ exc = qubes.exc.QubesException(
exc = qubes.exc.QubesException(
'Failed to start an HVM qube with PCI devices assigned '
'- hardware does not support IOMMU/VT-d/AMD-Vi')
free_mem = subprocess.check_output(["xl", "info", "free_memory"]).decode()
self.log.debug("free mem after failed start: %s", free_mem)
subprocess.run(["xl", "list"])
subprocess.run(["xl", "info"])
self.log.error('Start failed: %s', str(exc))
await self.fire_event_async('domain-start-failed',
reason=str(exc))
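
The subprocess.run(["xl", "list"]) and subprocess.run(["xl", "info"]) calls added above let xl print straight to the daemon's stdout/stderr rather than through self.log. An alternative sketch (not what this patch does) that routes the same diagnostics through the logger and never lets a failed xl call break start():

import logging
import subprocess

def log_xl_state(log: logging.Logger) -> None:
    # Capture the domain list and free-memory report into the debug log
    # instead of writing them to stdout.
    for cmd in (["xl", "list"], ["xl", "info", "free_memory"]):
        try:
            out = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode()
        except (OSError, subprocess.CalledProcessError) as e:
            out = "failed: {!s}".format(e)
        log.debug("%s:\n%s", " ".join(cmd), out)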
@@ -1757,16 +1766,19 @@ def request_memory(self, mem_required=None):
initial_memory = self.memory
mem_required = int(initial_memory + stubdom_mem) * 1024 * 1024

self.log.debug(f"mem required: {mem_required}")
qmemman_client = qubes.qmemman.client.QMemmanClient()
try:
mem_required_with_overhead = mem_required + MEM_OVERHEAD_BASE \
+ self.vcpus * MEM_OVERHEAD_PER_VCPU
if self.use_memory_hotplug:
# extra overhead to account future hotplug memory
# 1 page per 1MB of RAM, see libxl__get_required_paging_memory()
mem_required_with_overhead += self.maxmem * 4096
maxmem = self.maxmem if self.maxmem else self.memory
if self.virt_mode != "pv":
# extra overhead to account (possibly future hotplug) memory
# 2 pages per 1MB of RAM, see libxl__get_required_paging_memory()
mem_required_with_overhead += maxmem * 8192
got_memory = qmemman_client.request_memory(
mem_required_with_overhead)
self.log.debug(f"mem required with overhead: {mem_required_with_overhead}")

except IOError as e:
raise IOError('Failed to connect to qmemman: {!s}'.format(e))
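
To make the overhead arithmetic in request_memory() concrete: maxmem is expressed in MiB, so maxmem * 8192 charges two 4096-byte pages of paging structures for every MiB the domain could ever grow to. A small worked example with illustrative constants — the real MEM_OVERHEAD_* values are defined elsewhere in qubesvm.py and may differ:

MEM_OVERHEAD_BASE = 3 * 1024 * 1024            # bytes, illustrative only
MEM_OVERHEAD_PER_VCPU = 3 * 1024 * 1024 // 2   # bytes, illustrative only

memory = 400        # MiB of initial memory
stubdom_mem = 0     # MiB, no stubdomain in this sketch
maxmem = 4000       # MiB
vcpus = 2
virt_mode = "pvh"

mem_required = int(memory + stubdom_mem) * 1024 * 1024
overhead = MEM_OVERHEAD_BASE + vcpus * MEM_OVERHEAD_PER_VCPU
if virt_mode != "pv":
    # 2 pages (2 * 4096 bytes) per MiB of maxmem, per the comment above
    overhead += maxmem * 8192

print(mem_required + overhead)   # 400 MiB + 6 MiB fixed + 31.25 MiB paging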