From 624ecec0ea3748c1ecc3a3a6495c6813d624eb4d Mon Sep 17 00:00:00 2001 From: Clay Dugo Date: Mon, 30 Oct 2023 16:11:57 -0400 Subject: [PATCH 1/4] Fixup compute_noop.py numpy example (#402) --- examples/compute_noop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/compute_noop.py b/examples/compute_noop.py index 9da92775..dd68e9fe 100644 --- a/examples/compute_noop.py +++ b/examples/compute_noop.py @@ -51,9 +51,9 @@ # import numpy as np # # numpy_data = np.frombuffer(data, np.int32) -# out = compute_with_buffers({0: numpy_data}, {1: numpy_data.nbytes}, compute_shader, n=n) +# out = compute_with_buffers({0: numpy_data}, {1: numpy_data.nbytes}, shader_source, n=n) # result = np.frombuffer(out[1], dtype=np.int32) -# print(result) +# print(result.tolist()) # %% The long version using the wgpu API From 183f0c3432daa34a0d4eb151037e373a8f166b62 Mon Sep 17 00:00:00 2001 From: Almar Klein Date: Wed, 1 Nov 2023 17:12:06 +0100 Subject: [PATCH 2/4] Track object usage (#399) * Track object usage wip * doh * better sys exit * Wrap up implementation of diagnostics * add tests * last bit of test coverage * create map to determine bpp for textures * Refactor * More refactoring * tweak * Remove old print_report * Add docs * Small refactor for codegen * small tweaks * Cleaner way to handle space between totals --- docs/utils.rst | 39 +++ tests/test_diagnostics.py | 382 ++++++++++++++++++++++ wgpu/__init__.py | 5 +- wgpu/_diagnostics.py | 521 +++++++++++++++++++++++++++++++ wgpu/backends/rs.py | 84 +---- wgpu/backends/rs_ffi.py | 10 +- wgpu/backends/rs_helpers.py | 129 ++++++++ wgpu/base.py | 56 +++- wgpu/resources/codegen_report.md | 10 +- 9 files changed, 1142 insertions(+), 94 deletions(-) create mode 100644 tests/test_diagnostics.py create mode 100644 wgpu/_diagnostics.py diff --git a/docs/utils.rst b/docs/utils.rst index 71a65fa1..6ed4557d 100644 --- a/docs/utils.rst +++ b/docs/utils.rst @@ -3,6 +3,45 @@ Utils The wgpu library provides a few 
utilities. Note that most functions below need to be explictly imported. +Logger +------ + +Errors, warnings, and info messages (including messages generated by +wgpu-native) are logged using Python's default logging mechanics. The +wgpu logger instance is in ``wgpu.logger``, but can also be obtained +via: + +.. code-block:: py + + import logging + logger = logging.getLogger("wgpu") + + +Diagnostics +----------- + +To print a full diagnostic report: + +.. code-block:: py + + wgpu.diagnostics.print_report() + +To inspect (for example) the total buffer usage: + +.. code-block:: py + + >>> counts = wgpu.diagnostics.object_counts.get_dict() + >>> print(counts["Buffer"]) + {'count': 3, 'resource_mem': 784} + + +.. autoclass:: wgpu._diagnostics.DiagnosticsRoot + :members: + + +.. autoclass:: wgpu._diagnostics.Diagnostics + :members: + Get default device ------------------ diff --git a/tests/test_diagnostics.py b/tests/test_diagnostics.py new file mode 100644 index 00000000..6cff00c2 --- /dev/null +++ b/tests/test_diagnostics.py @@ -0,0 +1,382 @@ +""" +This tests the diagnostics logic itself. It does not do a tests that *uses* the diagnostics. 
+""" + + +import wgpu +from wgpu import _diagnostics +from wgpu._diagnostics import ( + DiagnosticsRoot, + Diagnostics, + ObjectTracker, + dict_to_text, + int_repr, +) + +from testutils import run_tests, can_use_wgpu_lib +from pytest import mark + + +def dedent(text, n): + return "\n".join(line[n:] for line in text.split("\n")) + + +class CustomDiagnosticsRoot(DiagnosticsRoot): + def __enter__(self): + _diagnostics.diagnostics = self + return self + + def __exit__(self, *args): + _diagnostics.diagnostics = wgpu.diagnostics + + +class CustomDiagnostics(Diagnostics): + def __init__(self, name): + super().__init__(name) + self.tracker = ObjectTracker() + + def get_dict(self): + return {k: {"count": v} for k, v in self.tracker.counts.items()} + + +def test_diagnostics_meta(): + # Test that our custom class does what we expet it to do + assert isinstance(wgpu.diagnostics, DiagnosticsRoot) + assert wgpu.diagnostics is _diagnostics.diagnostics + + with CustomDiagnosticsRoot() as custom: + assert custom is _diagnostics.diagnostics + + assert wgpu.diagnostics is _diagnostics.diagnostics + + +def test_diagnostics_main(): + with CustomDiagnosticsRoot() as custom: + d1 = CustomDiagnostics("foo") + d2 = CustomDiagnostics("bar") + + assert "foo" in repr(custom) + assert "bar" in repr(custom) + assert "spam" not in repr(custom) + + assert "foo" in repr(d1) + assert "bar" in repr(d2) + + # Showing report for one topic + + d1.tracker.increase("FooBar") + + reference1 = """ + ██ foo: + + count + + FooBar: 1 + + ██ bar: + + No data + """ + + assert custom.get_report() == dedent(reference1, 12) + + # Showing report for both topics + + d1.tracker.increase("FooBar") + d2.tracker.increase("XYZ") + + reference2 = """ + ██ foo: + + count + + FooBar: 2 + + ██ bar: + + count + + XYZ: 1 + """ + + assert custom.get_report() == dedent(reference2, 12) + + # Showing report also for newly added topic + + d3 = CustomDiagnostics("spam") + assert "spam" in repr(custom) + + 
d3.tracker.increase("FooBar") + d3.tracker.increase("FooBar") + d3.tracker.increase("XYZ") + + reference3 = """ + ██ foo: + + count + + FooBar: 2 + + ██ bar: + + count + + XYZ: 1 + + ██ spam: + + count + + FooBar: 2 + XYZ: 1 + """ + + assert custom.get_report() == dedent(reference3, 12) + + # Can also show one + + reference4 = """ + ██ spam: + + count + + FooBar: 2 + XYZ: 1 + """ + + # Showing report also for newly added backend + assert d3.get_report() == dedent(reference4, 12) + + # The root dict is a dict that maps topics to the per-topic dicts. + # So it's a dict of dicts of dicts. + big_dict = custom.get_dict() + assert isinstance(big_dict, dict) + for key, val in big_dict.items(): + assert isinstance(val, dict) + for k, v in val.items(): + assert isinstance(v, dict) + + # These should not fail + d3.print_report() + custom.print_report() + + +def test_dict_to_text_simple(): + # Note the left justification + + d = {"foo": 123456, "bar": "hi", "spam": 4.12345678} + + reference = """ + foo: 123K + bar: hi + spam: 4.12346 + """ + assert dict_to_text(d) == dedent(reference[1:], 8) + + +def test_dict_to_text_table(): + # Note the right justification + + d = { + "foo": {"a": 1, "b": 2, "c": 3.1000000}, + "bar": {"a": 4, "b": 5, "c": 6.123456789123}, + } + + reference = """ + a b c + + foo: 1 2 3.1 + bar: 4 5 6.12346 + """ + assert dict_to_text(d) == dedent(reference[1:], 8) + + reference = """ + title b a + + foo: 2 1 + bar: 5 4 + """ + assert dict_to_text(d, ["title", "b", "a"]) == dedent(reference[1:], 8) + + +def test_dict_to_text_justification(): + # Strain the justification + + d = { + "foobarspameggs": {"aprettylongtitle": 1, "b": "cyan", "c": 3}, + "yo": {"aprettylongtitle": 4, "b": "blueberrycake", "c": 6}, + } + + reference = """ + title aprettylongtitle b c + + foobarspameggs: 1 cyan 3 + yo: 4 blueberrycake 6 + """ + + header = ["title", "aprettylongtitle", "b", "c"] + assert dict_to_text(d, header) == dedent(reference[1:], 8) + + +def 
test_dict_to_text_subdicts(): + # This covers the option to create sub-rows, covering one case, multiple cases, and zero cases. + + d = { + "foo": { + "a": 1, + "b": 2, + "c": {"opt1": {"d": 101, "e": 102}, "opt2": {"d": 103, "e": 104}}, + }, + "bar": {"a": 3, "b": 4, "c": {"opt2": {"d": 105, "e": 106}}}, + "spam": {"a": 5, "b": 6, "c": {}}, + "eggs": { + "a": 7, + "b": 8, + "c": { + "opt1": {"d": 111, "e": 112}, + "opt2": {"d": 113, "e": 114}, + "opt3": {"d": 115, "e": 116}, + }, + }, + } + + reference = """ + a b c d e + + foo: 1 2 opt1: 101 102 + opt2: 103 104 + bar: 3 4 opt2: 105 106 + spam: 5 6 + eggs: 7 8 opt1: 111 112 + opt2: 113 114 + opt3: 115 116 + """ + + assert dict_to_text(d) == dedent(reference[1:], 8) + + +def test_dict_to_text_mix(): + # This covers the option to create sub-rows, covering one case, multiple cases, and zero cases. + + d = { + "foo": { + "a": 1, + "b": 2, + "c": "simple", + "z": 42, + }, + "bar": {"b": 4, "c": {"opt2": {"d": 105, "e": 106}}, "a": 3}, + "spam": {"a": 5, "b": None, "c": {}}, + "eggs": { + "z": 41, + "a": 7, + "c": { + "opt1": {"d": 111, "e": 112}, + "opt2": {"d": 113, "e": 114}, + }, + }, + } + + reference = """ + a b z c d e + + foo: 1 2 42 simple + bar: 3 4 opt2: 105 106 + spam: 5 + eggs: 7 41 opt1: 111 112 + opt2: 113 114 + """ + + assert dict_to_text(d) == dedent(reference[1:], 8) + + +def test_object_tracker(): + tracker = ObjectTracker() + counts = tracker.counts + + tracker.increase("FooBar") + tracker.increase("FooBar") + tracker.increase("FooBar") + tracker.increase("SpamEggs") + tracker.increase("SpamEggs") + tracker.increase("SpamEggs") + + assert counts == {"FooBar": 3, "SpamEggs": 3} + + tracker.decrease("FooBar") + tracker.decrease("FooBar") + tracker.decrease("FooBar") + tracker.decrease("SpamEggs") + tracker.decrease("SpamEggs") + + assert counts == {"FooBar": 0, "SpamEggs": 1} + + tracker.increase("FooBar") + tracker.increase("SpamEggs") + + assert counts == {"FooBar": 1, "SpamEggs": 2} + + 
tracker.decrease("FooBar") + tracker.decrease("SpamEggs") + tracker.decrease("SpamEggs") + + assert counts == {"FooBar": 0, "SpamEggs": 0} + + +def test_int_repr(): + assert int_repr(0) == "0" + assert int_repr(7) == "7" + assert int_repr(912) == "912" + + assert int_repr(1_000) == "1.00K" + assert int_repr(1_234) == "1.23K" + assert int_repr(12_345) == "12.3K" + assert int_repr(123_456) == "123K" + + assert int_repr(1_000_000) == "1.00M" + assert int_repr(1_234_000) == "1.23M" + assert int_repr(12_345_000) == "12.3M" + assert int_repr(123_456_000) == "123M" + + assert int_repr(1_000_000_000) == "1.00G" + assert int_repr(1_234_000_000) == "1.23G" + assert int_repr(12_345_000_000) == "12.3G" + assert int_repr(123_456_000_000) == "123G" + + assert int_repr(-7) == "-7" + assert int_repr(-912) == "-912" + assert int_repr(-1000) == "-1.00K" + assert int_repr(-12_345) == "-12.3K" + assert int_repr(-123_456_000) == "-123M" + + +@mark.skipif(not can_use_wgpu_lib, reason="Needs wgpu lib") +def test_diagnostics_with_backends(): + # Just make sure that it runs without errors + + import wgpu.backends.rs + + text = wgpu.diagnostics.get_report() + + assert "Device" in text + assert "RenderPipeline" in text + assert "ShaderModule" in text + + +def test_texture_format_map_is_complete(): + # When texture formats are added, removed, or changed, we must update our + # map. This test makes sure we don't forget. 
+ + map_keys = set(_diagnostics.texture_format_to_bpp.keys()) + enum_keys = set(wgpu.TextureFormat) + + too_much = map_keys - enum_keys + missing = enum_keys - map_keys + + assert not too_much + assert not missing + assert map_keys == enum_keys # for good measure + + +if __name__ == "__main__": + run_tests(globals()) diff --git a/wgpu/__init__.py b/wgpu/__init__.py index 400f0d24..16d659a3 100644 --- a/wgpu/__init__.py +++ b/wgpu/__init__.py @@ -3,6 +3,7 @@ """ from ._coreutils import logger # noqa: F401,F403 +from ._diagnostics import diagnostics # noqa: F401,F403 from .flags import * # noqa: F401,F403 from .enums import * # noqa: F401,F403 from .base import * # noqa: F401,F403 @@ -36,10 +37,6 @@ def _register_backend(cls): globals()["request_adapter"] = gpu.request_adapter globals()["request_adapter_async"] = gpu.request_adapter_async globals()["wgsl_language_features"] = gpu.wgsl_language_features - if hasattr(gpu, "print_report"): - globals()["print_report"] = gpu.print_report - else: - globals()["print_report"] = _base_GPU.print_report _base_GPU = GPU # noqa: F405, N816 diff --git a/wgpu/_diagnostics.py b/wgpu/_diagnostics.py new file mode 100644 index 00000000..a19c1e15 --- /dev/null +++ b/wgpu/_diagnostics.py @@ -0,0 +1,521 @@ +""" +Logic related to providing diagnostic info on wgpu. +""" + +import os +import sys +import platform + + +class DiagnosticsRoot: + """Root object to access wgpu diagnostics (i.e. ``wgpu.diagnostics``). + + Per-topic diagnostics can be accessed as attributes on this object. + These include ``system``, ``native_info``, ``versions``, + ``object_counts``, and more. + """ + + def __init__(self): + self._diagnostics_instances = {} + + def __repr__(self): + topics = ", ".join(self._diagnostics_instances.keys()) + return f"" + + def _register_diagnostics(self, name, ob): + self._diagnostics_instances[name] = ob + setattr(self, name, ob) + + def get_dict(self): + """Get a dict that represents the full diagnostics info. 
+ + The keys are the diagnostic topics, and the values are dicts + of dicts. See e.g. ``wgpu.diagnostics.counts.get_dict()`` for + a topic-specific dict. + """ + result = {} + for name, ob in self._diagnostics_instances.items(): + result[name] = ob.get_dict() + return result + + def get_report(self): + """Get the full textual diagnostic report (as a str).""" + text = "" + for name, ob in self._diagnostics_instances.items(): + text += ob.get_report() + return text + + def print_report(self): + """Convenience method to print the full diagnostics report.""" + print(self.get_report(), end="") + + +class Diagnostics: + """Object that represents diagnostics on a specific topic. + + This is a base class that must be subclassed to provide diagnostics + on a certain topic. Instantiating the class registers it with the + root diagnostics object. + """ + + def __init__(self, name): + diagnostics._register_diagnostics(name, self) + self.name = name + self.object_counts = {} + + def __repr__(self): + return f"" + + def get_dict(self): + """Get the diagnostics for this topic, in the form of a Python dict. + + Subclasses must implement this method. The dict can be a simple + map of keys to values (str, int, float):: + + foo: 1 + bar: 2 + + If the values are dicts, the data has a table-like layout, with + the keys representing the table header:: + + count mem + + Adapter: 1 264 + Buffer: 4 704 + + Subdicts are also supported, which results in multi-row entries. + In the report, the keys of the subdicts have colons behind them:: + + count mem backend o v e el_size + + Adapter: 1 264 vulkan: 1 0 0 264 + d3d12: 1 0 0 220 + Buffer: 4 704 vulkan: 4 0 0 176 + d3d12: 0 0 0 154 + + """ + raise NotImplementedError() + + def get_subscript(self): + """Get informative text that helps interpret the report. + + Subclasses can implement this method. The text will show below the table + in the report. 
+ """ + return "" # Optional + + def get_report(self): + """Get the textual diagnostics report for this topic.""" + text = f"\n██ {self.name}:\n\n" + text += dict_to_text(self.get_dict()) + subscript = self.get_subscript() + if subscript: + text += "\n" + subscript.rstrip() + "\n" + return text + + def print_report(self): + """Print the diagnostics report for this topic.""" + print(self.get_report(), end="") + + +class ObjectTracker: + """Little object to help track object counts.""" + + def __init__(self): + self.counts = {} + self.amounts = {} + + def increase(self, name, amount=0): + """Bump the counter.""" + self.counts[name] = self.counts.get(name, 0) + 1 + if amount: + self.amounts[name] = self.amounts.get(name, 0) + amount + + def decrease(self, name, amount=0): + """Bump the counter back.""" + self.counts[name] -= 1 + if amount: + self.amounts[name] -= amount + + +def derive_header(dct): + """Derive a table-header from the given dict.""" + + if not isinstance(dct, dict): # no-cover + raise TypeError(f"Not a dict: {dct}") + + header = [] + sub_dicts = {} + + for key, val in dct.items(): + if not isinstance(val, dict): # no-cover + raise TypeError(f"Element not a dict: {val}") + for k, v in val.items(): + if k not in header: + header.append(k) + if isinstance(v, dict): + sub_dicts[k] = v + + for k, d in sub_dicts.items(): + while k in header: + header.remove(k) + header.append(k) + sub_header = derive_header(d) + for k in sub_header[1:]: + if k not in header: + header.append(k) + + # Add header item for first column, i.e. the key / row title + header.insert(0, "") + + return header + + +def dict_to_text(d, header=None): + """Convert a dict data structure to a textual table representation.""" + + if not d: + return "No data\n" + + # Copy the dict, with simple key-value dicts being transformed into table-like dicts. + # That wat the code in derive_header() and dict_to_table() can assume the table-like + # data structure, keeping it simpler. 
+ d2 = {} + for key, val in d.items(): + if not isinstance(val, dict): + val = {"": val} + d2[key] = val + d = d2 + + if not header: + header = derive_header(d) + + # We have a table-like-layout if any of the values in the header is non-empty + table_layout = any(header) + + # Get the table + rows = dict_to_table(d, header) + ncols = len(header) + + # Sanity check (guard assumptions about dict_to_table) + for row in rows: + assert len(row) == ncols, "dict_to_table failed" + for i in range(ncols): + assert isinstance(row[i], str), "dict_to_table failed" + + # Insert heading + if table_layout: + rows.insert(0, header.copy()) + rows.insert(1, [""] * ncols) + + # Determine what colons have values with a colon at the end + column_has_colon = [False for _ in range(ncols)] + for row in rows: + for i in range(ncols): + column_has_colon[i] |= row[i].endswith(":") + + # Align the values that don't have a colon at the end + for row in rows: + for i in range(ncols): + word = row[i] + if column_has_colon[i] and not word.endswith(":"): + row[i] = word + " " + + # Establish max lengths + max_lens = [0 for _ in range(ncols)] + for row in rows: + for i in range(ncols): + max_lens[i] = max(max_lens[i], len(row[i])) + + # Justify first column (always rjust) + for row in rows: + row[0] = row[0].rjust(max_lens[0]) + + # For the table layour we also rjust the other columns + if table_layout: + for row in rows: + for i in range(1, ncols): + row[i] = row[i].rjust(max_lens[i]) + + # Join into a consistent text + lines = [" ".join(row).rstrip() for row in rows] + text = "\n".join(lines) + return text.rstrip() + "\n" + + +def dict_to_table(d, header, header_offest=0): + """Convert a dict data structure to a table (a list of lists of strings). + The keys form the first entry of the row. Values that are dicts recurse. 
+ """ + + ncols = len(header) + rows = [] + + for row_title, values in d.items(): + if row_title == "total" and row_title == list(d.keys())[-1]: + rows.append([""] * ncols) + row = [row_title + ":" if row_title else ""] + rows.append(row) + for i in range(header_offest + 1, len(header)): + key = header[i] + val = values.get(key, None) + if val is None: + row.append("") + elif isinstance(val, str): + row.append(val) + elif isinstance(val, int): + row.append(int_repr(val)) + elif isinstance(val, float): + row.append(f"{val:.6g}") + elif isinstance(val, dict): + subrows = dict_to_table(val, header, i) + if len(subrows) == 0: + row += [""] * (ncols - i) + else: + row += subrows[0] + extrarows = [[""] * i + subrow for subrow in subrows[1:]] + rows.extend(extrarows) + break # header items are consumed by the sub + else: # no-cover + raise TypeError(f"Unexpected table value: {val}") + + return rows + + +def int_repr(val): + """Represent an integer using K and M suffixes.""" + prefix = "-" if val < 0 else "" + val = abs(val) + if val >= 1_000_000_000: # >= 1G + s = str(val / 1_000_000_000) + suffix = "G" + elif val >= 1_000_000: # >= 1M + s = str(val / 1_000_000) + suffix = "M" + elif val >= 1_000: # >= 1K + s = str(val / 1_000) + suffix = "K" + else: + s = str(val) + suffix = "" + if "." in s: + s1, _, s2 = s.partition(".") + n_decimals = max(0, 3 - len(s1)) + s = s1 + if n_decimals: + s2 += "000" + s = s1 + "." + s2[:n_decimals] + return prefix + s + suffix + + +# Map that we need to calculate texture resource consumption. +# We need to keep this up-to-date as formats change, we have a unit test for this. 
+# Also see https://wgpu.rs/doc/wgpu/enum.TextureFormat.html + +texture_format_to_bpp = { + # 8 bit + "r8unorm": 8, + "r8snorm": 8, + "r8uint": 8, + "r8sint": 8, + # 16 bit + "r16uint": 16, + "r16sint": 16, + "r16float": 16, + "rg8unorm": 16, + "rg8snorm": 16, + "rg8uint": 16, + "rg8sint": 16, + # 32 bit + "r32uint": 32, + "r32sint": 32, + "r32float": 32, + "rg16uint": 32, + "rg16sint": 32, + "rg16float": 32, + "rgba8unorm": 32, + "rgba8unorm-srgb": 32, + "rgba8snorm": 32, + "rgba8uint": 32, + "rgba8sint": 32, + "bgra8unorm": 32, + "bgra8unorm-srgb": 32, + # special fits + "rgb9e5ufloat": 32, # 3*9 + 5 + "rgb10a2uint": 32, # 3*10 + 2 + "rgb10a2unorm": 32, # 3*10 + 2 + "rg11b10ufloat": 32, # 2*11 + 10 + # 64 bit + "rg32uint": 64, + "rg32sint": 64, + "rg32float": 64, + "rgba16uint": 64, + "rgba16sint": 64, + "rgba16float": 64, + # 128 bit + "rgba32uint": 128, + "rgba32sint": 128, + "rgba32float": 128, + # depth and stencil + "stencil8": 8, + "depth16unorm": 16, + "depth24plus": 24, # "... at least 24 bit integer depth" ? 
+ "depth24plus-stencil8": 32, + "depth32float": 32, + "depth32float-stencil8": 40, + # Compressed + "bc1-rgba-unorm": 4, # 4x4 blocks, 8 bytes per block + "bc1-rgba-unorm-srgb": 4, + "bc2-rgba-unorm": 8, # 4x4 blocks, 16 bytes per block + "bc2-rgba-unorm-srgb": 8, + "bc3-rgba-unorm": 8, # 4x4 blocks, 16 bytes per block + "bc3-rgba-unorm-srgb": 8, + "bc4-r-unorm": 4, + "bc4-r-snorm": 4, + "bc5-rg-unorm": 8, + "bc5-rg-snorm": 8, + "bc6h-rgb-ufloat": 8, + "bc6h-rgb-float": 8, + "bc7-rgba-unorm": 8, + "bc7-rgba-unorm-srgb": 8, + "etc2-rgb8unorm": 4, + "etc2-rgb8unorm-srgb": 4, + "etc2-rgb8a1unorm": 4, + "etc2-rgb8a1unorm-srgb": 4, + "etc2-rgba8unorm": 8, + "etc2-rgba8unorm-srgb": 8, + "eac-r11unorm": 4, + "eac-r11snorm": 4, + "eac-rg11unorm": 8, + "eac-rg11snorm": 8, + # astc always uses 16 bytes (128 bits) per block + "astc-4x4-unorm": 8.0, + "astc-4x4-unorm-srgb": 8.0, + "astc-5x4-unorm": 6.4, + "astc-5x4-unorm-srgb": 6.4, + "astc-5x5-unorm": 5.12, + "astc-5x5-unorm-srgb": 5.12, + "astc-6x5-unorm": 4.267, + "astc-6x5-unorm-srgb": 4.267, + "astc-6x6-unorm": 3.556, + "astc-6x6-unorm-srgb": 3.556, + "astc-8x5-unorm": 3.2, + "astc-8x5-unorm-srgb": 3.2, + "astc-8x6-unorm": 2.667, + "astc-8x6-unorm-srgb": 2.667, + "astc-8x8-unorm": 2.0, + "astc-8x8-unorm-srgb": 2.0, + "astc-10x5-unorm": 2.56, + "astc-10x5-unorm-srgb": 2.56, + "astc-10x6-unorm": 2.133, + "astc-10x6-unorm-srgb": 2.133, + "astc-10x8-unorm": 1.6, + "astc-10x8-unorm-srgb": 1.6, + "astc-10x10-unorm": 1.28, + "astc-10x10-unorm-srgb": 1.28, + "astc-12x10-unorm": 1.067, + "astc-12x10-unorm-srgb": 1.067, + "astc-12x12-unorm": 0.8889, + "astc-12x12-unorm-srgb": 0.8889, +} + + +# %% global diagnostics object, and builtin diagnostics + + +# The global root object +diagnostics = DiagnosticsRoot() + + +class SystemDiagnostics(Diagnostics): + """Provides basic system info.""" + + def get_dict(self): + return { + "platform": platform.platform(), + # "platform_version": platform.version(), # can be quite long + 
"python_implementation": platform.python_implementation(), + "python": platform.python_version(), + } + + +class NativeDiagnostics(Diagnostics): + """Provides metadata about the wgpu-native backend.""" + + def get_dict(self): + # Get rs modules, or skip + try: + wgpu = sys.modules["wgpu"] + rs = wgpu.backends.rs + rs_ffi = wgpu.backends.rs_ffi + except (KeyError, AttributeError): # no-cover + return {} + + # Process lib path + lib_path = rs_ffi.lib_path + wgpu_path = os.path.dirname(wgpu.__file__) + if lib_path.startswith(wgpu_path): + lib_path = "." + os.path.sep + lib_path[len(wgpu_path) :].lstrip("/\\") + + return { + "expected_version": rs.__version__, + "lib_version": ".".join(str(i) for i in rs_ffi.get_lib_version()), + "lib_path": lib_path, + } + + +class VersionDiagnostics(Diagnostics): + """Provides version numbers from relevant libraries.""" + + def get_dict(self): + core_libs = ["wgpu", "cffi"] + qt_libs = ["PySide6", "PyQt6", "PySide2", "PyQt5"] + gui_libs = qt_libs + ["glfw", "jupyter_rfb", "wx"] + extra_libs = ["numpy", "pygfx", "pylinalg", "fastplotlib"] + + info = {} + + for libname in core_libs + gui_libs + extra_libs: + try: + ver = sys.modules[libname].__version__ + except (KeyError, AttributeError): + pass + else: + info[libname] = str(ver) + + return info + + +class ObjectCountDiagnostics(Diagnostics): + """Provides object counts and resource consumption, used in base.py.""" + + def __init__(self, name): + super().__init__(name) + self.tracker = ObjectTracker() + + def get_dict(self): + """Get diagnostics as a dict.""" + object_counts = self.tracker.counts + resource_mem = self.tracker.amounts + + # Collect counts + result = {} + for name in sorted(object_counts.keys()): + d = {"count": object_counts[name]} + if name in resource_mem: + d["resource_mem"] = resource_mem[name] + result[name[3:]] = d # drop the 'GPU' from the name + + # Add totals + totals = {} + for key in ("count", "resource_mem"): + totals[key] = sum(v.get(key, 0) for v in 
result.values()) + result["total"] = totals + + return result + + +SystemDiagnostics("system") +NativeDiagnostics("native_info") +VersionDiagnostics("versions") +ObjectCountDiagnostics("object_counts") diff --git a/wgpu/backends/rs.py b/wgpu/backends/rs.py index 45cd1bf2..d2abdbb4 100644 --- a/wgpu/backends/rs.py +++ b/wgpu/backends/rs.py @@ -350,83 +350,6 @@ async def request_adapter_async( force_fallback_adapter=force_fallback_adapter, ) # no-cover - def _generate_report(self): - """Get a dictionary with info about the internal status of WGPU. - The structure of the dict is not defined, for the moment. Use print_report(). - """ - - # H: surfaces: WGPUStorageReport, backendType: WGPUBackendType, vulkan: WGPUHubReport, metal: WGPUHubReport, dx12: WGPUHubReport, dx11: WGPUHubReport, gl: WGPUHubReport - struct = new_struct_p( - "WGPUGlobalReport *", - # not used: surfaces - # not used: backendType - # not used: vulkan - # not used: metal - # not used: dx12 - # not used: dx11 - # not used: gl - ) - - # H: void f(WGPUInstance instance, WGPUGlobalReport * report) - libf.wgpuGenerateReport(get_wgpu_instance(), struct) - - report = {} - - report["surfaces"] = { - "occupied": struct.surfaces.numOccupied, - "vacant": struct.surfaces.numVacant, - "error": struct.surfaces.numError, - "element_size": struct.surfaces.elementSize, - } - report["backend_type"] = struct.backendType # note: could make this a set - for backend in ("vulkan", "metal", "dx12", "dx11", "gl"): - c_hub_report = getattr(struct, backend) - report[backend] = {} - for key in dir(c_hub_report): - c_storage_report = getattr(c_hub_report, key) - storage_report = { - "occupied": c_storage_report.numOccupied, - "vacant": c_storage_report.numVacant, - "error": c_storage_report.numError, - "element_size": c_storage_report.elementSize, - } - # if any(x!=0 for x in storage_report.values()): - report[backend][key] = storage_report - - return report - - def print_report(self): - def print_line(topic, occupied, vacant, 
error, el_size): - print( - topic.rjust(20), - str(occupied).rjust(8), - str(vacant).rjust(8), - str(error).rjust(8), - str(el_size).rjust(8), - ) - - def print_storage_report(topic, d): - print_line(topic, d["occupied"], d["vacant"], d["error"], d["element_size"]) - - report = self._generate_report() - - print(f"{self.__class__.__module__}.WGPU report:") - print() - print_line("", "Occupied", "Vacant", "Error", "el-size") - print() - print_storage_report("surfaces", report["surfaces"]) - for backend in ("vulkan", "metal", "dx12", "dx11", "gl"): - backend_has_stuff = False - for hub_report in report[backend].values(): - report_has_stuff = any(x != 0 for x in hub_report.values()) - backend_has_stuff |= report_has_stuff - if backend_has_stuff: - print_line(f"--- {backend} ---", "", "", "", "") - for key, val in report[backend].items(): - print_storage_report(key, val) - else: - print_line(f"--- {backend} ---", "", "", "", "") - class GPUCanvasContext(base.GPUCanvasContext): def __init__(self, canvas): @@ -884,6 +807,11 @@ def create_texture( if not mip_level_count: mip_level_count = 1 # or lib.WGPU_MIP_LEVEL_COUNT_UNDEFINED ? 
+ mip_level_count = int(mip_level_count) + + if not sample_count: + sample_count = 1 + sample_count = int(sample_count) # H: nextInChain: WGPUChainedStruct *, label: char *, usage: WGPUTextureUsageFlags/int, dimension: WGPUTextureDimension, size: WGPUExtent3D, format: WGPUTextureFormat, mipLevelCount: int, sampleCount: int, viewFormatCount: int, viewFormats: WGPUTextureFormat * struct = new_struct_p( @@ -2769,7 +2697,7 @@ class GPUInternalError(base.GPUInternalError, GPUError): def _copy_docstrings(): - base_classes = GPUObjectBase, GPUCanvasContext + base_classes = GPUObjectBase, GPUCanvasContext, GPUAdapter for ob in globals().values(): if not (isinstance(ob, type) and issubclass(ob, base_classes)): continue diff --git a/wgpu/backends/rs_ffi.py b/wgpu/backends/rs_ffi.py index 60507bba..81999d01 100644 --- a/wgpu/backends/rs_ffi.py +++ b/wgpu/backends/rs_ffi.py @@ -134,10 +134,11 @@ def _maybe_get_pip_hint(): ffi = FFI() ffi.cdef(get_wgpu_header()) ffi.set_source("wgpu.h", None) -lib = ffi.dlopen(get_wgpu_lib_path()) +lib_path = get_wgpu_lib_path() # store path on this module so it can be checked +lib = ffi.dlopen(lib_path) -def check_expected_version(version_info): +def get_lib_version(): # Get lib version version_int = lib.wgpuGetVersion() if version_int < 65536: # no-cover - old version encoding with 3 ints @@ -149,6 +150,11 @@ def check_expected_version(version_info): # When the 0.7.0 tag was made, the version was not bumped. 
if version_info_lib == (0, 6, 0, 0): version_info_lib = (0, 7, 0) + return version_info_lib + + +def check_expected_version(version_info): + version_info_lib = get_lib_version() # Compare if version_info_lib != version_info: # no-cover logger.warning( diff --git a/wgpu/backends/rs_helpers.py b/wgpu/backends/rs_helpers.py index bc5ceee0..1d682b1b 100644 --- a/wgpu/backends/rs_helpers.py +++ b/wgpu/backends/rs_helpers.py @@ -6,6 +6,7 @@ import ctypes from .rs_ffi import ffi, lib +from .._diagnostics import Diagnostics from ..base import ( GPUError, GPUOutOfMemoryError, @@ -338,3 +339,131 @@ def proxy_func(*args): proxy_func.__name__ = name return proxy_func + + +def generate_report(): + """Get a report similar to the one produced by wgpuGenerateReport(), + but in the form of a Python dict. + """ + + # H: surfaces: WGPUStorageReport, backendType: WGPUBackendType, vulkan: WGPUHubReport, metal: WGPUHubReport, dx12: WGPUHubReport, dx11: WGPUHubReport, gl: WGPUHubReport + struct = ffi.new("WGPUGlobalReport *") + + # H: void f(WGPUInstance instance, WGPUGlobalReport * report) + lib.wgpuGenerateReport(get_wgpu_instance(), struct) + + report = {} + + report["surfaces"] = { + "occupied": struct.surfaces.numOccupied, + "vacant": struct.surfaces.numVacant, + "error": struct.surfaces.numError, + "element_size": struct.surfaces.elementSize, + } + + for backend in ("vulkan", "metal", "dx12", "dx11", "gl"): + c_hub_report = getattr(struct, backend) + report[backend] = {} + for key in dir(c_hub_report): + c_storage_report = getattr(c_hub_report, key) + storage_report = { + "occupied": c_storage_report.numOccupied, + "vacant": c_storage_report.numVacant, + "error": c_storage_report.numError, + "element_size": c_storage_report.elementSize, + } + # if any(x!=0 for x in storage_report.values()): + report[backend][key] = storage_report + + return report + + +class NativeDiagnostics(Diagnostics): + def get_subscript(self): + text = "" + text += " * The o, v, e are occupied, vacant and 
error, respecitively.\n" + text += " * Reported memory does not include buffer/texture data.\n" + return text + + def get_dict(self): + result = {} + native_report = generate_report() + + # Names in the root of the report (backend-less) + root_names = ["surfaces"] + + # Get per-backend names and a list of backends + names = list(native_report["vulkan"].keys()) + backends = [name for name in native_report.keys() if name not in root_names] + + # Get a mapping from native names to wgpu-py names + name_map = {"surfaces": "CanvasContext"} + for name in names: + if name not in name_map: + name_map[name] = name[0].upper() + name[1:-1] + + # Initialize the result dict (sorted) + for name in sorted(names + root_names): + report_name = name_map[name] + result[report_name] = {"count": 0, "mem": 0} + + # Establish what backends are active + active_backends = [] + for backend in backends: + total = 0 + for name in names: + d = native_report[backend][name] + total += d["occupied"] + d["vacant"] + d["error"] + if total > 0: + active_backends.append(backend) + + # Process names in the root + for name in root_names: + d = native_report[name] + subtotal_count = d["occupied"] + d["vacant"] + d["error"] + impl = { + "o": d["occupied"], + "v": d["vacant"], + "e": d["error"], + "el_size": d["element_size"], + } + # Store in report + report_name = name_map[name] + result[report_name]["count"] = subtotal_count + result[report_name]["mem"] = subtotal_count * d["element_size"] + result[report_name]["backend"] = {"": impl} + + # Iterate over backends + for name in names: + total_count = 0 + total_mem = 0 + implementations = {} + for backend in active_backends: + d = native_report[backend][name] + subtotal_count = d["occupied"] + d["vacant"] + d["error"] + subtotal_mem = subtotal_count * d["element_size"] + impl = { + "o": d["occupied"], + "v": d["vacant"], + "e": d["error"], + "el_size": d["element_size"], + } + total_count += subtotal_count + total_mem += subtotal_mem + 
implementations[backend] = impl + # Store in report + report_name = name_map[name] + result[report_name]["count"] = total_count + result[report_name]["mem"] = total_mem + result[report_name]["backend"] = implementations + + # Add totals + totals = {} + for key in ("count", "mem"): + totals[key] = sum(v.get(key, 0) for v in result.values()) + result["total"] = totals + + return result + + +diagnostics = NativeDiagnostics("rs_counts") diff --git a/wgpu/base.py b/wgpu/base.py index 4ad51471..9528c34a 100644 --- a/wgpu/base.py +++ b/wgpu/base.py @@ -13,6 +13,7 @@ from typing import List, Dict, Union from ._coreutils import ApiDiff +from ._diagnostics import diagnostics, texture_format_to_bpp from . import flags, enums, structs @@ -62,6 +63,13 @@ apidiff = ApiDiff() +# Obtain the object tracker. Note that we store a ref of +# the latter on all classes that refer to it. Otherwise, on a sys exit, +# the module attributes are None-ified, and the destructors would +# therefore fail and produce warnings. +object_tracker = diagnostics.object_counts.tracker + + class GPU: """The entrypoint to the wgpu API. @@ -109,11 +117,6 @@ def get_preferred_canvas_format(self): """ raise RuntimeError("Use canvas.get_preferred_format() instead.") - @apidiff.add("Usefull") - def print_report(self): - """Print a report about the interals of the backend.""" - print(f"{self.__class__.__module__}.GPU: No report available.") - # IDL: [SameObject] readonly attribute WGSLLanguageFeatures wgslLanguageFeatures; @property def wgsl_language_features(self): @@ -131,7 +134,10 @@ class GPUCanvasContext: Can be obtained via `gui.WgpuCanvasInterface.get_context()`. 
""" + _ot = object_tracker + def __init__(self, canvas): + self._ot.increase(self.__class__.__name__) self._canvas_ref = weakref.ref(canvas) def _get_canvas(self): @@ -206,6 +212,7 @@ def get_preferred_format(self, adapter): return "bgra8unorm-srgb" # seems to be a good default def __del__(self): + self._ot.decrease(self.__class__.__name__) self._destroy() def _destroy(self): @@ -257,7 +264,10 @@ class GPUAdapter: Once invalid, it never becomes valid again. """ + _ot = object_tracker + def __init__(self, internal, features, limits, adapter_info): + self._ot.increase(self.__class__.__name__) self._internal = internal assert isinstance(features, set) @@ -315,6 +325,7 @@ def _destroy(self): pass def __del__(self): + self._ot.decrease(self.__class__.__name__) self._destroy() # IDL: readonly attribute boolean isFallbackAdapter; @@ -341,7 +352,11 @@ class GPUObjectBase: the GPU; the device and all objects belonging to a device. """ + _ot = object_tracker + _nbytes = 0 + def __init__(self, label, internal, device): + self._ot.increase(self.__class__.__name__, self._nbytes) self._label = label self._internal = internal # The native/raw/real GPU object self._device = device @@ -361,6 +376,7 @@ def _destroy(self): pass def __del__(self): + self._ot.decrease(self.__class__.__name__, self._nbytes) self._destroy() # Public destroy() methods are implemented on classes as the WebGPU spec specifies. 
@@ -939,6 +955,7 @@ class GPUBuffer(GPUObjectBase): """ def __init__(self, label, internal, device, size, usage, map_state): + self._nbytes = size super().__init__(label, internal, device) self._size = size self._usage = usage @@ -1100,9 +1117,28 @@ class GPUTexture(GPUObjectBase): """ def __init__(self, label, internal, device, tex_info): + self._nbytes = self._estimate_nbytes(tex_info) super().__init__(label, internal, device) self._tex_info = tex_info + def _estimate_nbytes(self, tex_info): + format = tex_info["format"] + size = tex_info["size"] + sample_count = tex_info["sample_count"] or 1 + mip_level_count = tex_info["mip_level_count"] or 1 + + bpp = texture_format_to_bpp.get(format, 0) + npixels = size[0] * size[1] * size[2] + nbytes_at_mip_level = sample_count * npixels * bpp / 8 + + nbytes = 0 + for i in range(mip_level_count): + nbytes += nbytes_at_mip_level + nbytes_at_mip_level /= 2 + + # Return rounded to nearest integer + return int(nbytes + 0.5) + @apidiff.add("Too useful to not-have") @property def size(self): @@ -2015,3 +2051,13 @@ def count(self): # like GPUExternalTexture and GPUUncapturedErrorEvent, and more. 
apidiff.remove_hidden_methods(globals()) + + +def _seed_object_counts(): + for key, val in globals().items(): + if key.startswith("GPU") and not key.endswith(("Base", "Mixin")): + if hasattr(val, "_ot"): + object_tracker.counts[key] = 0 + + +_seed_object_counts() diff --git a/wgpu/resources/codegen_report.md b/wgpu/resources/codegen_report.md index 71f72a30..d15ac625 100644 --- a/wgpu/resources/codegen_report.md +++ b/wgpu/resources/codegen_report.md @@ -9,7 +9,7 @@ * Wrote 33 enums to enums.py * Wrote 59 structs to structs.py ### Patching API for base.py -* Diffs for GPU: add print_report, change get_preferred_canvas_format, change request_adapter, change request_adapter_async +* Diffs for GPU: change get_preferred_canvas_format, change request_adapter, change request_adapter_async * Diffs for GPUCanvasContext: add get_preferred_format, add present * Diffs for GPUDevice: add adapter, add create_buffer_with_data, hide import_external_texture, hide lost, hide onuncapturederror, hide pop_error_scope, hide push_error_scope * Diffs for GPUBuffer: add map_read, add map_write, add read_mapped, add write_mapped, hide get_mapped_range @@ -19,7 +19,7 @@ * Validated 37 classes, 113 methods, 43 properties ### Patching API for backends/rs.py * Diffs for GPUAdapter: add request_device_tracing -* Validated 37 classes, 101 methods, 0 properties +* Validated 37 classes, 99 methods, 0 properties ## Validating rs.py * Enum field TextureFormat.rgb10a2uint missing in wgpu.h * Enum PipelineErrorReason missing in wgpu.h @@ -28,6 +28,6 @@ * Enum CanvasAlphaMode missing in wgpu.h * Enum field DeviceLostReason.unknown missing in wgpu.h * Wrote 232 enum mappings and 47 struct-field mappings to rs_mappings.py -* Validated 88 C function calls -* Not using 115 C functions -* Validated 72 C structs +* Validated 87 C function calls +* Not using 116 C functions +* Validated 71 C structs From 54055fc653f453b81dece7b97d47805589555640 Mon Sep 17 00:00:00 2001 From: Almar Klein Date: Fri, 3 Nov 2023 
22:52:27 +0100 Subject: [PATCH 3/4] Remove asyncio from offscreen gui (#404) --- wgpu/gui/offscreen.py | 53 +++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/wgpu/gui/offscreen.py b/wgpu/gui/offscreen.py index 79296f2f..b5a1985d 100644 --- a/wgpu/gui/offscreen.py +++ b/wgpu/gui/offscreen.py @@ -1,4 +1,4 @@ -import asyncio +import time from ._offscreen import WgpuOffscreenCanvas from .base import WgpuAutoGui @@ -38,10 +38,11 @@ def is_closed(self): return self._closed def _request_draw(self): - call_later(0, self.draw) + # Deliberately a no-op, because people use .draw() instead. + pass def present(self, texture_view): - # This gets called at the end of a draw pass via GPUCanvasContextOffline + # This gets called at the end of a draw pass via _offscreen.GPUCanvasContext device = texture_view._device size = texture_view.size bytes_per_pixel = 4 @@ -74,31 +75,33 @@ def draw(self): WgpuCanvas = WgpuManualOffscreenCanvas -def call_later(delay, callback, *args): - loop = asyncio.get_event_loop_policy().get_event_loop() - # for the offscreen canvas, we prevent new frames and callbacks - # from being queued while the loop is running. this avoids - # callbacks from one visualization leaking into the next. - if loop.is_running(): - return - loop.call_later(delay, callback, *args) - +# If we consider the use-cases for using this offscreen canvas: +# +# * Using wgpu.gui.auto in test-mode: in this case run() should not hang, +# and call_later should not cause lingering refs. +# * Using the offscreen canvas directly, in a script: in this case you +# do not have/want an event system. +# * Using the offscreen canvas in an evented app. In that case you already +# have an app with a specific event-loop (it might be PySide6 or +# something else entirely). +# +# In summary, we provide a call_later() and run() that behave pretty +# well for the first case. 
-async def mainloop_iter(): - pass # no op +_pending_calls = [] -def run(): - """Handle all tasks scheduled with call_later and return.""" - loop = asyncio.get_event_loop_policy().get_event_loop() +def call_later(delay, callback, *args): + # Note that this module never calls call_later() itself; request_draw() is a no-op. + etime = time.time() + delay + _pending_calls.append((etime, callback, args)) - # If the loop is already running, this is likely an interactive session or something - if loop.is_running(): - return - # Run stub mainloop, so that all currently pending tasks are handled - loop.run_until_complete(mainloop_iter()) +def run(): + # Process pending calls + for etime, callback, args in _pending_calls.copy(): + if time.time() >= etime: + callback(*args) - # Cancel all remaining tasks (those that are scheduled later) - for t in asyncio.all_tasks(loop=loop): - t.cancel() + # Clear any leftover scheduled calls, to avoid lingering refs. + _pending_calls.clear() From 6ce9ac994206721d0b7265acb627530fd1272df1 Mon Sep 17 00:00:00 2001 From: Almar Klein Date: Sat, 4 Nov 2023 17:44:22 +0100 Subject: [PATCH 4/4] Add tests for object releasing (#403) * WIP mem tests * Rename GPUCanvasContextOffscreen * Seems good practice to poll the device after releasing an object * Progress on the memtests * Fix sorting in rs_counts diagnostics * more tests, also guis * Almost there ... 
* Implemented all * refactor a bit for shorter module * run the tests on ci * codegen * Remove the delayed releaser * Try enabling the releases that previously panicked * clean * Tried harder, but has no effect * add comment * Add another comment * And another --- .github/workflows/ci.yml | 5 +- tests_mem/test_mem.py | 559 +++++++++++++++++++++++++++++++ tests_mem/testutils.py | 177 ++++++++++ wgpu/backends/rs.py | 108 ++++-- wgpu/backends/rs_helpers.py | 26 +- wgpu/gui/_offscreen.py | 4 +- wgpu/gui/jupyter.py | 2 +- wgpu/resources/codegen_report.md | 6 +- 8 files changed, 819 insertions(+), 68 deletions(-) create mode 100644 tests_mem/test_mem.py create mode 100644 tests_mem/testutils.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a4205f59..c4319fd5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -198,9 +198,12 @@ jobs: pip install -U -r dev-requirements.txt python download-wgpu-native.py pip install -e . - - name: Test on repo + - name: Unit tests run: | pytest -v tests + - name: Memory tests + run: | + pytest -v tests_mem # The release builds are done for the platforms that we want to build wheels for. # We build wheels, test them, and then upload the wheel as an artifact. 
diff --git a/tests_mem/test_mem.py b/tests_mem/test_mem.py new file mode 100644 index 00000000..49c24b05 --- /dev/null +++ b/tests_mem/test_mem.py @@ -0,0 +1,559 @@ +import gc +import asyncio + +import wgpu.backends.rs + +import pytest +from testutils import can_use_glfw, can_use_wgpu_lib, can_use_pyside6 +from testutils import create_and_release, get_counts, ob_name_from_test_func + +if not can_use_wgpu_lib: + pytest.skip( + "Skipping tests that need a window or the wgpu lib", allow_module_level=True + ) + + +# Create the default device beforehand +DEVICE = wgpu.utils.get_default_device() + + +async def stub_event_loop(): + pass + + +def make_draw_func_for_canvas(canvas): + """Create a draw function for the given canvas, + so that we can really present something to a canvas being tested. + """ + ctx = canvas.get_context() + ctx.configure(device=DEVICE, format="bgra8unorm-srgb") + + def draw(): + ctx = canvas.get_context() + command_encoder = DEVICE.create_command_encoder() + current_texture_view = ctx.get_current_texture() + render_pass = command_encoder.begin_render_pass( + color_attachments=[ + { + "view": current_texture_view, + "resolve_target": None, + "clear_value": (1, 1, 1, 1), + "load_op": wgpu.LoadOp.clear, + "store_op": wgpu.StoreOp.store, + } + ], + ) + render_pass.end() + DEVICE.queue.submit([command_encoder.finish()]) + ctx.present() + + return draw + + +# %% Meta tests + + +def test_meta_all_objects_covered(): + """Test that we have a test_release test function for each known object.""" + + ref_obnames = set(key for key in get_counts().keys()) + func_obnames = set(ob_name_from_test_func(func) for func in RELEASE_TEST_FUNCS) + + missing = ref_obnames - func_obnames + extra = func_obnames - ref_obnames + assert not missing + assert not extra + + +def test_meta_all_functions_solid(): + """Test that all funcs starting with "test_release_" are decorated appropriately.""" + for func in RELEASE_TEST_FUNCS: + is_decorated = func.__code__.co_name == 
"core_test_func" + assert is_decorated, func.__name__ + " not decorated" + + +def test_meta_buffers_1(): + """Making sure that the test indeed fails, when holding onto the objects.""" + + lock = [] + + @create_and_release + def test_release_buffer(n): + yield {} + for i in range(n): + b = DEVICE.create_buffer(size=128, usage=wgpu.BufferUsage.COPY_DST) + lock.append(b) + yield b + + with pytest.raises(AssertionError): + test_release_buffer() + + +def test_meta_buffers_2(): + """Making sure that the test indeed fails, by disabling the release call.""" + + ori = wgpu.backends.rs.GPUBuffer._destroy + wgpu.backends.rs.GPUBuffer._destroy = lambda self: None + + try: + with pytest.raises(AssertionError): + test_release_buffer() + + finally: + wgpu.backends.rs.GPUBuffer._destroy = ori + + +# %% The actual tests + +# These tests need to do one thing: generate n objects of the correct type. + + +@create_and_release +def test_release_adapter(n): + yield {} + for i in range(n): + yield wgpu.request_adapter(canvas=None, power_preference="high-performance") + + +@create_and_release +def test_release_device(n): + pytest.skip("XFAIL") + # todo: XFAIL: Device object seem not to be cleaned up at wgpu-native. + + # Note: the WebGPU spec says: + # [request_device()] is a one-time action: if a device is returned successfully, the adapter becomes invalid. 
+ + yield { + "expected_counts_after_create": {"Device": (n, n), "Queue": (n, 0)}, + } + adapter = DEVICE.adapter + for i in range(n): + d = adapter.request_device() + # d.queue._destroy() + # d._queue = None + yield d + + +@create_and_release +def test_release_bind_group(n): + buffer1 = DEVICE.create_buffer(size=128, usage=wgpu.BufferUsage.STORAGE) + + binding_layouts = [ + { + "binding": 0, + "visibility": wgpu.ShaderStage.COMPUTE, + "buffer": { + "type": wgpu.BufferBindingType.read_only_storage, + }, + }, + ] + + bindings = [ + { + "binding": 0, + "resource": {"buffer": buffer1, "offset": 0, "size": buffer1.size}, + }, + ] + + bind_group_layout = DEVICE.create_bind_group_layout(entries=binding_layouts) + + yield {} + + for i in range(n): + yield DEVICE.create_bind_group(layout=bind_group_layout, entries=bindings) + + +@create_and_release +def test_release_bind_group_layout(n): + # Note: when we use the same binding layout descriptor, wgpu-native + # re-uses the BindGroupLayout object. On the other hand, it also + # does not seem to clean them up. Perhaps it just caches them? There + # are only so many possible combinations, and its just 152 bytes + # (on Metal) per object. + + # todo: do we want similar behavior for *our* BindGroupLayout object? + + yield { + "expected_counts_after_create": {"BindGroupLayout": (n, 1)}, + "expected_counts_after_release": {"BindGroupLayout": (0, 1)}, + } + + binding_layouts = [ + { + "binding": 0, + "visibility": wgpu.ShaderStage.COMPUTE, + "buffer": { + "type": wgpu.BufferBindingType.read_only_storage, + }, + }, + ] + + for i in range(n): + # binding_layouts[0]["binding"] = i # force unique objects + yield DEVICE.create_bind_group_layout(entries=binding_layouts) + + +@create_and_release +def test_release_buffer(n): + yield {} + for i in range(n): + yield DEVICE.create_buffer(size=128, usage=wgpu.BufferUsage.COPY_DST) + + +@create_and_release +def test_release_canvas_context_1(n): + # Test with offscreen canvases. 
A context is created, but not a wgpu-native surface. + + # Note: the offscreen canvas keeps the render-texture-view alive, since it + # is used to e.g. download the resulting image. That's why we also see + # Textures and TextureViews in the counts. + + from wgpu.gui.offscreen import WgpuCanvas + + yield { + "expected_counts_after_create": { + "CanvasContext": (n, 0), + "Texture": (n, n), + "TextureView": (n, n), + }, + } + + for i in range(n): + c = WgpuCanvas() + c.request_draw(make_draw_func_for_canvas(c)) + c.draw() + yield c.get_context() + + +@create_and_release +def test_release_canvas_context_2(n): + # Test with GLFW canvases. + + # Note: in a draw, the textureview is obtained (thus creating a + # Texture and a TextureView, but these are released in present(), + # so we don't see them in the counts. + + loop = asyncio.get_event_loop_policy().get_event_loop() + + if loop.is_running(): + pytest.skip("Cannot run this test when asyncio loop is running") + if not can_use_glfw: + pytest.skip("Need glfw for this test") + + from wgpu.gui.glfw import WgpuCanvas # noqa + + yield {} + + for i in range(n): + c = WgpuCanvas() + c.request_draw(make_draw_func_for_canvas(c)) + loop.run_until_complete(stub_event_loop()) + yield c.get_context() + + # Need some shakes to get all canvas refs gone + del c + loop.run_until_complete(stub_event_loop()) + gc.collect() + loop.run_until_complete(stub_event_loop()) + + +@create_and_release +def test_release_canvas_context_3(n): + # Test with PySide canvases. + + # Note: in a draw, the textureview is obtained (thus creating a + # Texture and a TextureView, but these are released in present(), + # so we don't see them in the counts. 
+ + if not can_use_pyside6: + pytest.skip("Need pyside6 for this test") + + import PySide6 # noqa + from wgpu.gui.qt import WgpuCanvas # noqa + + app = PySide6.QtWidgets.QApplication.instance() + if app is None: + app = PySide6.QtWidgets.QApplication([""]) + + yield {} + + for i in range(n): + c = WgpuCanvas() + c.request_draw(make_draw_func_for_canvas(c)) + app.processEvents() + yield c.get_context() + + # Need some shakes to get all canvas refs gone + del c + gc.collect() + app.processEvents() + + +@create_and_release +def test_release_command_buffer(n): + # Note: a command encoder can only be used once (it gets destroyed on finish()) + yield { + "expected_counts_after_create": { + "CommandEncoder": (n, 0), + "CommandBuffer": (n, n), + }, + } + + for i in range(n): + command_encoder = DEVICE.create_command_encoder() + yield command_encoder.finish() + + +@create_and_release +def test_release_command_encoder(n): + # Note: a CommandEncoder does not exist in wgpu-core, but we do + # observe its internal CommandBuffer. + yield { + "expected_counts_after_create": { + "CommandEncoder": (n, 0), + "CommandBuffer": (0, n), + }, + } + + for i in range(n): + yield DEVICE.create_command_encoder() + + +@create_and_release +def test_release_compute_pass_encoder(n): + # Note: ComputePassEncoder does not really exist in wgpu-core + # -> Check gpu.diagnostics.rs_counts.print_report(), nothing there that ends with "Encoder". 
+ command_encoder = DEVICE.create_command_encoder() + + yield { + "expected_counts_after_create": { + "ComputePassEncoder": (n, 0), + }, + } + + for i in range(n): + yield command_encoder.begin_compute_pass() + + +@create_and_release +def test_release_compute_pipeline(n): + code = """ + @compute + @workgroup_size(1) + fn main(@builtin(global_invocation_id) index: vec3) { + let i: u32 = index.x; + } + """ + shader = DEVICE.create_shader_module(code=code) + + binding_layouts = [] + pipeline_layout = DEVICE.create_pipeline_layout(bind_group_layouts=binding_layouts) + + yield {} + + for i in range(n): + yield DEVICE.create_compute_pipeline( + layout=pipeline_layout, + compute={"module": shader, "entry_point": "main"}, + ) + + +@create_and_release +def test_release_pipeline_layout(n): + yield {} + for i in range(n): + yield DEVICE.create_pipeline_layout(bind_group_layouts=[]) + + +@create_and_release +def test_release_query_set(n): + # todo: implement this when we do support them + pytest.skip("Query set not implemented") + + +@create_and_release +def test_release_queue(n): + pytest.skip("XFAIL") + # todo: XFAIL: the device and queue are kinda one, and the former won't release at wgpu-native. + yield {} + adapter = DEVICE.adapter + for i in range(n): + d = adapter.request_device() + q = d.queue + d._queue = None # detach + yield q + + +@create_and_release +def test_release_render_bundle(n): + # todo: implement this when we do support them + pytest.skip("Render bundle not implemented") + + +@create_and_release +def test_release_render_bundle_encoder(n): + pytest.skip("Render bundle not implemented") + + +@create_and_release +def test_release_render_pass_encoder(n): + # Note: RenderPassEncoder does not really exist in wgpu-core + # -> Check gpu.diagnostics.rs_counts.print_report(), nothing there that ends with "Encoder". 
+ command_encoder = DEVICE.create_command_encoder() + + yield { + "expected_counts_after_create": { + "RenderPassEncoder": (n, 0), + }, + } + + for i in range(n): + yield command_encoder.begin_render_pass(color_attachments=[]) + + +@create_and_release +def test_release_render_pipeline(n): + code = """ + struct VertexInput { + @builtin(vertex_index) vertex_index : u32, + }; + struct VertexOutput { + @location(0) color : vec4, + @builtin(position) pos: vec4, + }; + + @vertex + fn vs_main(in: VertexInput) -> VertexOutput { + var positions = array, 3>( + vec2(0.0, -0.5), + vec2(0.5, 0.5), + vec2(-0.5, 0.75), + ); + var colors = array, 3>( // srgb colors + vec3(1.0, 1.0, 0.0), + vec3(1.0, 0.0, 1.0), + vec3(0.0, 1.0, 1.0), + ); + let index = i32(in.vertex_index); + var out: VertexOutput; + out.pos = vec4(positions[index], 0.0, 1.0); + out.color = vec4(colors[index], 1.0); + return out; + } + + @fragment + fn fs_main(in: VertexOutput) -> @location(0) vec4 { + let physical_color = pow(in.color.rgb, vec3(2.2)); // gamma correct + return vec4(physical_color, in.color.a); + } + """ + shader = DEVICE.create_shader_module(code=code) + + binding_layouts = [] + pipeline_layout = DEVICE.create_pipeline_layout(bind_group_layouts=binding_layouts) + + yield {} + + for i in range(n): + yield DEVICE.create_render_pipeline( + layout=pipeline_layout, + vertex={ + "module": shader, + "entry_point": "vs_main", + "buffers": [], + }, + primitive={ + "topology": wgpu.PrimitiveTopology.triangle_list, + "front_face": wgpu.FrontFace.ccw, + "cull_mode": wgpu.CullMode.none, + }, + depth_stencil=None, + multisample=None, + fragment={ + "module": shader, + "entry_point": "fs_main", + "targets": [ + { + "format": "bgra8unorm-srgb", + "blend": { + "color": ( + wgpu.BlendFactor.one, + wgpu.BlendFactor.zero, + wgpu.BlendOperation.add, + ), + "alpha": ( + wgpu.BlendFactor.one, + wgpu.BlendFactor.zero, + wgpu.BlendOperation.add, + ), + }, + }, + ], + }, + ) + + +@create_and_release +def 
test_release_sampler(n): + yield {} + for i in range(n): + yield DEVICE.create_sampler() + + +@create_and_release +def test_release_shader_module(n): + yield {} + + code = """ + @fragment + fn fs_main() -> @location(0) vec4 { + return vec4(1.0, 0.0, 0.0, 1.0); + } + """ + + for i in range(n): + yield DEVICE.create_shader_module(code=code) + + +@create_and_release +def test_release_texture(n): + yield {} + for i in range(n): + yield DEVICE.create_texture( + size=(16, 16, 16), + usage=wgpu.TextureUsage.TEXTURE_BINDING, + format="rgba8unorm", + ) + + +@create_and_release +def test_release_texture_view(n): + texture = DEVICE.create_texture( + size=(16, 16, 16), usage=wgpu.TextureUsage.TEXTURE_BINDING, format="rgba8unorm" + ) + yield {} + for i in range(n): + yield texture.create_view() + + +# %% The end + + +ALL_TEST_FUNCS = [ + ob + for name, ob in list(globals().items()) + if name.startswith("test_") and callable(ob) +] +RELEASE_TEST_FUNCS = [ + func for func in ALL_TEST_FUNCS if func.__name__.startswith("test_release_") +] + + +if __name__ == "__main__": + for func in ALL_TEST_FUNCS: + print(func.__name__ + " ...") + try: + func() + except pytest.skip.Exception: + print(" skipped") + print("done") diff --git a/tests_mem/testutils.py b/tests_mem/testutils.py new file mode 100644 index 00000000..357eaead --- /dev/null +++ b/tests_mem/testutils.py @@ -0,0 +1,177 @@ +import gc +import os +import sys +import subprocess + +import wgpu + + +def _determine_can_use_wgpu_lib(): + # For some reason, since wgpu-native 5c304b5ea1b933574edb52d5de2d49ea04a053db + # the process' exit code is not zero, so we test more pragmatically. 
+ code = "import wgpu.utils; wgpu.utils.get_default_device(); print('ok')" + result = subprocess.run( + [ + sys.executable, + "-c", + code, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) + print("_determine_can_use_wgpu_lib() status code:", result.returncode) + return ( + result.stdout.strip().endswith("ok") + and "traceback" not in result.stderr.lower() + ) + + +def _determine_can_use_glfw(): + code = "import glfw;exit(0) if glfw.init() else exit(1)" + try: + subprocess.check_output([sys.executable, "-c", code]) + except Exception: + return False + else: + return True + + +def _determine_can_use_pyside6(): + code = "import PySide6.QtGui" + try: + subprocess.check_output([sys.executable, "-c", code]) + except Exception: + return False + else: + return True + + +can_use_wgpu_lib = _determine_can_use_wgpu_lib() +can_use_glfw = _determine_can_use_glfw() +can_use_pyside6 = _determine_can_use_pyside6() +is_ci = bool(os.getenv("CI", None)) + + +def get_counts(): + """Get a dict that maps object names to a 2-tuple representing + the counts in py and wgpu-native. + """ + counts_py = wgpu.diagnostics.object_counts.get_dict() + counts_native = wgpu.diagnostics.rs_counts.get_dict() + + all_keys = set(counts_py) | set(counts_native) + + default = {"count": -1} + + counts = {} + for key in sorted(all_keys): + counts[key] = ( + counts_py.get(key, default)["count"], + counts_native.get(key, default)["count"], + ) + counts.pop("total") + + return counts + + +def get_excess_counts(counts1, counts2): + """Compare two counts dicts, and return a new dict with the fields + that have increased counts. 
+ """ + more = {} + for name in counts1: + c1 = counts1[name][0] + c2 = counts2[name][0] + more_py = 0 + if c2 > c1: + more_py = c2 - c1 + c1 = counts1[name][1] + c2 = counts2[name][1] + more_native = 0 + if c2 > c1: + more_native = c2 - c1 + if more_py or more_native: + more[name] = more_py, more_native + return more + + +def ob_name_from_test_func(func): + """Translate test_release_bind_group() to "BindGroup".""" + func_name = func.__name__ + prefix = "test_release_" + assert func_name.startswith(prefix) + words = func_name[len(prefix) :].split("_") + if words[-1].isnumeric(): + words.pop(-1) + return "".join(word.capitalize() for word in words) + + +def create_and_release(create_objects_func): + """Decorator.""" + + def core_test_func(): + """The core function that does the testing.""" + + n = 32 + + generator = create_objects_func(n) + ob_name = ob_name_from_test_func(create_objects_func) + + # ----- Collect options + + options = { + "expected_counts_after_create": {ob_name: (32, 32)}, + "expected_counts_after_release": {}, + } + + func_options = next(generator) + assert isinstance(func_options, dict), "First yield must be an options dict" + options.update(func_options) + + # Measure baseline object counts + counts1 = get_counts() + + # ----- Create + + # Create objects + objects = list(generator) + + # Test the count + assert len(objects) == n + + # Test that all objects are of the same class. 
+ # (this for-loop is a bit weird, but its to avoid leaking refs to objects) + cls = objects[0].__class__ + assert all(isinstance(objects[i], cls) for i in range(len(objects))) + + # Test that class matches function name (should prevent a group of copy-paste errors) + assert ob_name == cls.__name__[3:] + + # Measure peak object counts + counts2 = get_counts() + more2 = get_excess_counts(counts1, counts2) + print(" more after create:", more2) + + # Make sure the actual object has increased + assert more2 # not empty + assert more2 == options["expected_counts_after_create"] + + # It's ok if other objects are created too ... + + # ----- Release + + # Delete objects + del objects + gc.collect() + + # Measure after-release object counts + counts3 = get_counts() + more3 = get_excess_counts(counts1, counts3) + print(" more after release:", more3) + + # Check! + assert more3 == options["expected_counts_after_release"] + + core_test_func.__name__ = create_objects_func.__name__ + return core_test_func diff --git a/wgpu/backends/rs.py b/wgpu/backends/rs.py index d2abdbb4..9dad0e71 100644 --- a/wgpu/backends/rs.py +++ b/wgpu/backends/rs.py @@ -35,7 +35,6 @@ get_memoryview_and_address, to_snake_case, to_camel_case, - DelayedReleaser, ErrorHandler, SafeLibCalls, ) @@ -173,7 +172,6 @@ def check_struct(struct_name, d): raise ValueError(f"Invalid keys in {struct_name}: {invalid_keys}") -delayed_releaser = DelayedReleaser() error_handler = ErrorHandler(logger) libf = SafeLibCalls(lib, error_handler) @@ -354,6 +352,7 @@ async def request_adapter_async( class GPUCanvasContext(base.GPUCanvasContext): def __init__(self, canvas): super().__init__(canvas) + self._device = None self._surface_size = (-1, -1) self._surface_id = None self._internal = None @@ -486,7 +485,7 @@ def get_preferred_format(self, adapter): return default def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, 
self._internal # H: void f(WGPUSwapChain swapChain) libf.wgpuSwapChainRelease(internal) @@ -545,9 +544,6 @@ def request_device_tracing( def _request_device( self, label, required_features, required_limits, default_queue, trace_path ): - # This is a good moment to release destroyed objects - delayed_releaser.release_all_pending() - # ---- Handle features assert isinstance(required_features, (tuple, list, set)) @@ -715,9 +711,10 @@ async def request_device_async( ) # no-cover def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal - delayed_releaser.release_soon("wgpuAdapterRelease", internal) + # H: void f(WGPUAdapter adapter) + libf.wgpuAdapterRelease(internal) class GPUDevice(base.GPUDevice, GPUObjectBase): @@ -739,6 +736,12 @@ def uncaptured_error_callback(c_type, c_message, userdata): self._internal, uncaptured_error_callback, ffi.NULL ) + def _poll(self): + # Internal function + if self._internal: + # H: bool f(WGPUDevice device, bool wait, WGPUWrappedSubmissionIndex const * wrappedSubmissionIndex) + libf.wgpuDevicePoll(self._internal, True, ffi.NULL) + def create_buffer( self, *, @@ -968,9 +971,17 @@ def create_bind_group_layout( # not used: nextInChain ) + # Note: wgpu-core re-uses BindGroupLayouts with the same (or similar + # enough) descriptor. You would think that this means that the id is + # the same when you call wgpuDeviceCreateBindGroupLayout with the same + # input, but it's not. So we cannot let wgpu-native/core decide when + # to re-use a BindGroupLayout. I don't feel confident checking here + # whether a BindGroupLayout can be re-used, so we simply don't. Higher + # level code can sometimes make this decision because it knows the app + # logic. 
+ # H: WGPUBindGroupLayout f(WGPUDevice device, WGPUBindGroupLayoutDescriptor const * descriptor) id = libf.wgpuDeviceCreateBindGroupLayout(self._internal, struct) - return GPUBindGroupLayout(label, id, self, entries) def create_bind_group( @@ -1439,14 +1450,21 @@ def create_render_bundle_encoder( stencil_read_only: bool = False, ): raise NotImplementedError() + # Note: also enable the corresponding memtest when implementing this! def create_query_set(self, *, label="", type: "enums.QueryType", count: int): raise NotImplementedError() + # Note: also enable the corresponding memtest when implementing this! def _destroy(self): - if self._internal is not None and lib is not None: + if self._queue is not None: + self._queue._destroy() + self._queue = None + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal - delayed_releaser.release_soon("wgpuDeviceRelease", internal) + # H: void f(WGPUDevice device) + libf.wgpuDeviceRelease(internal) + # wgpuDeviceDestroy(internal) is also an option class GPUBuffer(base.GPUBuffer, GPUObjectBase): @@ -1526,8 +1544,7 @@ def callback(status_, user_data_p): ) # Let it do some cycles - # H: bool f(WGPUDevice device, bool wait, WGPUWrappedSubmissionIndex const * wrappedSubmissionIndex) - libf.wgpuDevicePoll(self._device._internal, True, ffi.NULL) + self._device._poll() if status != 0: # no-cover raise RuntimeError(f"Could not map buffer ({status}).") @@ -1635,10 +1652,18 @@ def destroy(self): def _destroy(self): self._release_memoryviews() - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUBuffer buffer) libf.wgpuBufferRelease(internal) + self._device._poll() + # Note: from the memtests it looks like we need to poll the device + # after releasing an object for some objects (buffer, texture, + # texture view, sampler, pipeline layout, compute pipeline, and + # render pipeline). 
But not others. Would be nice to at some point + # have more clarity on this. In the mean time, we now poll the + # device quite a bit, so leaks by not polling the device after + # releasing something are highly unlikely. class GPUTexture(base.GPUTexture, GPUObjectBase): @@ -1692,31 +1717,34 @@ def destroy(self): self._destroy() # no-cover def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUTexture texture) libf.wgpuTextureRelease(internal) + self._device._poll() class GPUTextureView(base.GPUTextureView, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUTextureView textureView) libf.wgpuTextureViewRelease(internal) + self._device._poll() class GPUSampler(base.GPUSampler, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUSampler sampler) libf.wgpuSamplerRelease(internal) + self._device._poll() class GPUBindGroupLayout(base.GPUBindGroupLayout, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUBindGroupLayout bindGroupLayout) libf.wgpuBindGroupLayoutRelease(internal) @@ -1724,7 +1752,7 @@ def _destroy(self): class GPUBindGroup(base.GPUBindGroup, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUBindGroup bindGroup) libf.wgpuBindGroupRelease(internal) @@ -1732,10 +1760,11 @@ def _destroy(self): class 
GPUPipelineLayout(base.GPUPipelineLayout, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUPipelineLayout pipelineLayout) libf.wgpuPipelineLayoutRelease(internal) + self._device._poll() class GPUShaderModule(base.GPUShaderModule, GPUObjectBase): @@ -1761,8 +1790,7 @@ def get_compilation_info(self): # H: void f(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void * userdata) # libf.wgpuShaderModuleGetCompilationInfo(self._internal, callback, ffi.NULL) # - # H: bool f(WGPUDevice device, bool wait, WGPUWrappedSubmissionIndex const * wrappedSubmissionIndex) - # libf.wgpuDevicePoll(self._device._internal, True, ffi.NULL) + # self._device._poll() # # if info is None: # raise RuntimeError("Could not obtain shader compilation info.") @@ -1772,7 +1800,7 @@ def get_compilation_info(self): return [] def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUShaderModule shaderModule) libf.wgpuShaderModuleRelease(internal) @@ -1784,18 +1812,20 @@ class GPUPipelineBase(base.GPUPipelineBase): class GPUComputePipeline(base.GPUComputePipeline, GPUPipelineBase, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUComputePipeline computePipeline) libf.wgpuComputePipelineRelease(internal) + self._device._poll() class GPURenderPipeline(base.GPURenderPipeline, GPUPipelineBase, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPURenderPipeline renderPipeline) 
libf.wgpuRenderPipelineRelease(internal) + self._device._poll() class GPUCommandBuffer(base.GPUCommandBuffer, GPUObjectBase): @@ -1805,8 +1835,8 @@ def _destroy(self): # 'Cannot remove a vacant resource'. Got this info from the # wgpu chat. Also see # https://docs.rs/wgpu-core/latest/src/wgpu_core/device/mod.rs.html#4180-4194 - # That's why _internal is set to None in submit() - if self._internal is not None and lib is not None: + # --> That's why _internal is set to None in Queue.submit() + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUCommandBuffer commandBuffer) libf.wgpuCommandBufferRelease(internal) @@ -2308,9 +2338,9 @@ def resolve_query_set( raise NotImplementedError() def _destroy(self): - # Note that the natove object gets destroyed on finish. + # Note that the native object gets destroyed on finish. # Also see GPUCommandBuffer._destroy() - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUCommandEncoder commandEncoder) libf.wgpuCommandEncoderRelease(internal) @@ -2350,10 +2380,10 @@ def end(self): libf.wgpuComputePassEncoderEnd(self._internal) def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUComputePassEncoder computePassEncoder) - internal # panics: libf.wgpuComputePassEncoderRelease(internal) + libf.wgpuComputePassEncoderRelease(internal) class GPURenderPassEncoder( @@ -2413,10 +2443,10 @@ def end_occlusion_query(self): raise NotImplementedError() def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPURenderPassEncoder renderPassEncoder) - internal # panics: 
libf.wgpuRenderPassEncoderRelease(internal) + libf.wgpuRenderPassEncoderRelease(internal) class GPURenderBundleEncoder( @@ -2431,7 +2461,7 @@ def finish(self, *, label=""): raise NotImplementedError() def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPURenderBundleEncoder renderBundleEncoder) libf.wgpuRenderBundleEncoderRelease(internal) @@ -2639,10 +2669,16 @@ def read_texture(self, source, data_layout, size): def on_submitted_work_done(self): raise NotImplementedError() + def _destroy(self): + if self._internal is not None and libf is not None: + self._internal, internal = None, self._internal + # H: void f(WGPUQueue queue) + libf.wgpuQueueRelease(internal) + class GPURenderBundle(base.GPURenderBundle, GPUObjectBase): def _destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPURenderBundle renderBundle) libf.wgpuRenderBundleRelease(internal) @@ -2652,7 +2688,7 @@ class GPUQuerySet(base.GPUQuerySet, GPUObjectBase): pass def destroy(self): - if self._internal is not None and lib is not None: + if self._internal is not None and libf is not None: self._internal, internal = None, self._internal # H: void f(WGPUQuerySet querySet) libf.wgpuQuerySetRelease(internal) diff --git a/wgpu/backends/rs_helpers.py b/wgpu/backends/rs_helpers.py index 1d682b1b..43747ea8 100644 --- a/wgpu/backends/rs_helpers.py +++ b/wgpu/backends/rs_helpers.py @@ -222,29 +222,6 @@ def to_camel_case(name): return name2 -class DelayedReleaser: - """Helps release objects at a later time.""" - - # I found that when wgpuDeviceRelease() was called in Device._destroy, - # the tests would hang. I found that the release call was done around - # the time when another device was used (e.g. to create a buffer - # or shader module). 
For some reason, the delay in destruction (by - # Python's CG) causes a deadlock or something. We seem to be able - # to fix this by doing the actual release later - e.g. when the - # user creates a new device. Seems to be the same for the adapter. - def __init__(self): - self._things_to_release = [] - - def release_soon(self, fun, i): - self._things_to_release.append((fun, i)) - - def release_all_pending(self): - while self._things_to_release: - fun, i = self._things_to_release.pop(0) - release_func = getattr(lib, fun) - release_func(i) - - class ErrorHandler: """Object that logs errors, with the option to collect incoming errors elsewhere. @@ -403,8 +380,7 @@ def get_dict(self): name_map[name] = name[0].upper() + name[1:-1] # Initialize the result dict (sorted) - for name in sorted(names + root_names): - report_name = name_map[name] + for report_name in sorted(name_map[name] for name in names + root_names): result[report_name] = {"count": 0, "mem": 0} # Establish what backends are active diff --git a/wgpu/gui/_offscreen.py b/wgpu/gui/_offscreen.py index 894a9963..68e2adcd 100644 --- a/wgpu/gui/_offscreen.py +++ b/wgpu/gui/_offscreen.py @@ -23,7 +23,7 @@ def get_context(self, kind="gpupresent"): # the backend (e.g. rs), but here we use our own context. 
assert kind == "gpupresent" if self._canvas_context is None: - self._canvas_context = GPUCanvasContextOffline(self) + self._canvas_context = GPUCanvasContext(self) return self._canvas_context def present(self, texture_view): @@ -43,7 +43,7 @@ def get_preferred_format(self): return "rgba8unorm-srgb" -class GPUCanvasContextOffline(base.GPUCanvasContext): +class GPUCanvasContext(base.GPUCanvasContext): """Helper class for canvases that render to a texture.""" def __init__(self, canvas): diff --git a/wgpu/gui/jupyter.py b/wgpu/gui/jupyter.py index ffa99a95..5a84b844 100644 --- a/wgpu/gui/jupyter.py +++ b/wgpu/gui/jupyter.py @@ -91,7 +91,7 @@ def _request_draw(self): # Implementation needed for WgpuOffscreenCanvas def present(self, texture_view): - # This gets called at the end of a draw pass via GPUCanvasContextOffline + # This gets called at the end of a draw pass via _offscreen.GPUCanvasContext device = texture_view._device size = texture_view.size bytes_per_pixel = 4 diff --git a/wgpu/resources/codegen_report.md b/wgpu/resources/codegen_report.md index d15ac625..acf83431 100644 --- a/wgpu/resources/codegen_report.md +++ b/wgpu/resources/codegen_report.md @@ -19,7 +19,7 @@ * Validated 37 classes, 113 methods, 43 properties ### Patching API for backends/rs.py * Diffs for GPUAdapter: add request_device_tracing -* Validated 37 classes, 99 methods, 0 properties +* Validated 37 classes, 101 methods, 0 properties ## Validating rs.py * Enum field TextureFormat.rgb10a2uint missing in wgpu.h * Enum PipelineErrorReason missing in wgpu.h @@ -28,6 +28,6 @@ * Enum CanvasAlphaMode missing in wgpu.h * Enum field DeviceLostReason.unknown missing in wgpu.h * Wrote 232 enum mappings and 47 struct-field mappings to rs_mappings.py -* Validated 87 C function calls -* Not using 116 C functions +* Validated 89 C function calls +* Not using 113 C functions * Validated 71 C structs