Skip to content

Commit

Permalink
Add patches to fix nouveau issues preventing booting the installer or…
Browse files Browse the repository at this point in the history
… system
  • Loading branch information
fepitre committed Apr 16, 2020
1 parent 0c3113d commit afd5033
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 0 deletions.
141 changes: 141 additions & 0 deletions 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
From 7a7662fe09eb2ccd2eb93ce7261aa47c86111b4d Mon Sep 17 00:00:00 2001
From: Karol Herbst <[email protected]>
Date: Tue, 24 Mar 2020 21:29:23 +0100
Subject: [PATCH 1/2] drm/nouveau: workaround runpm fail by disabling PCI power
management on certain intel bridges

Fixes the infamous 'runtime PM' bug many users are facing on Laptops with
Nvidia Pascal GPUs by skipping said PCI power state changes on the GPU.

Depending on the used kernel there might be messages like those in demsg:

"nouveau 0000:01:00.0: Refused to change power state, currently in D3"
"nouveau 0000:01:00.0: can't change power state from D3cold to D0 (config
space inaccessible)"
followed by backtraces of kernel crashes or timeouts within nouveau.

It's still unkown why this issue exists, but this is a reliable workaround
and solves a very annoying issue for user having to choose between a
crashing kernel or higher power consumption of their Laptops.

Signed-off-by: Karol Herbst <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Cc: Lyude Paul <[email protected]>
Cc: Rafael J. Wysocki <[email protected]>
Cc: Mika Westerberg <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205623
Signed-off-by: Ben Skeggs <[email protected]>
---
drivers/gpu/drm/nouveau/nouveau_drm.c | 63 +++++++++++++++++++++++++++
drivers/gpu/drm/nouveau/nouveau_drv.h | 2 +
2 files changed, 65 insertions(+)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 6b1629c14dd7..ca4087f5a15b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev)
kfree(drm);
}

+/*
+ * On some Intel PCIe bridge controllers doing a
+ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear.
+ * Skipping the intermediate D3hot step seems to make it work again. This is
+ * probably caused by not meeting the expectation the involved AML code has
+ * when the GPU is put into D3hot state before invoking it.
+ *
+ * This leads to various manifestations of this issue:
+ * - AML code execution to power on the GPU hits an infinite loop (as the
+ * code waits on device memory to change).
+ * - kernel crashes, as all PCI reads return -1, which most code isn't able
+ * to handle well enough.
+ *
+ * In all cases dmesg will contain at least one line like this:
+ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3'
+ * followed by a lot of nouveau timeouts.
+ *
+ * In the \_SB.PCI0.PEG0.PG00._OFF code deeper down writes bit 0x80 to the not
+ * documented PCI config space register 0x248 of the Intel PCIe bridge
+ * controller (0x1901) in order to change the state of the PCIe link between
+ * the PCIe port and the GPU. There are alternative code paths using other
+ * registers, which seem to work fine (executed pre Windows 8):
+ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved')
+ * - 0xb0 bit 0x10 (link disable)
+ * Changing the conditions inside the firmware by poking into the relevant
+ * addresses does resolve the issue, but it seemed to be ACPI private memory
+ * and not any device accessible memory at all, so there is no portable way of
+ * changing the conditions.
+ * On a XPS 9560 that means bits [0,3] on \CPEX need to be cleared.
+ *
+ * The only systems where this behavior can be seen are hybrid graphics laptops
+ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether
+ * this issue only occurs in combination with listed Intel PCIe bridge
+ * controllers and the mentioned GPUs or other devices as well.
+ *
+ * documentation on the PCIe bridge controller can be found in the
+ * "7th Generation Intel® Processor Families for H Platforms Datasheet Volume 2"
+ * Section "12 PCI Express* Controller (x16) Registers"
+ */
+
+static void quirk_broken_nv_runpm(struct pci_dev *pdev)
+{
+ struct drm_device *dev = pci_get_drvdata(pdev);
+ struct nouveau_drm *drm = nouveau_drm(dev);
+ struct pci_dev *bridge = pci_upstream_bridge(pdev);
+
+ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL)
+ return;
+
+ switch (bridge->device) {
+ case 0x1901:
+ drm->old_pm_cap = pdev->pm_cap;
+ pdev->pm_cap = 0;
+ NV_INFO(drm, "Disabling PCI power management to avoid bug\n");
+ break;
+ }
+}
+
static int nouveau_drm_probe(struct pci_dev *pdev,
const struct pci_device_id *pent)
{
@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev,
if (ret)
goto fail_drm_dev_init;

+ quirk_broken_nv_runpm(pdev);
return 0;

fail_drm_dev_init:
@@ -734,7 +793,11 @@ static void
nouveau_drm_remove(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
+ struct nouveau_drm *drm = nouveau_drm(dev);

+ /* revert our workaround */
+ if (drm->old_pm_cap)
+ pdev->pm_cap = drm->old_pm_cap;
nouveau_drm_device_remove(dev);
pci_disable_device(pdev);
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index c2c332fbde97..2a6519737800 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -140,6 +140,8 @@ struct nouveau_drm {

struct list_head clients;

+ u8 old_pm_cap;
+
struct {
struct agp_bridge_data *bridge;
u32 base;
--
2.25.1

68 changes: 68 additions & 0 deletions 0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
From 37b556606d1217b4367e622d88cef11c65764386 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <[email protected]>
Date: Tue, 31 Mar 2020 16:08:44 +1000
Subject: [PATCH 2/2] drm/nouveau/gr/gp107,gp108: implement workaround for HW
hanging during init

Certain boards with GP107/GP108 chipsets hang (often, but randomly) for
unknown reasons during GR initialisation.

The first tell-tale symptom of this issue is:

nouveau 0000:01:00.0: bus: MMIO read of 00000000 FAULT at 409800 [ TIMEOUT ]

appearing in dmesg, likely followed by many other failures being logged.

Karol found this WAR for the issue a while back, but efforts to isolate
the root cause and proper fix have not yielded success so far. I've
modified the original patch to include a few more details, limit it to
GP107/GP108 by default, and added a config option to override this choice.

Signed-off-by: Ben Skeggs <[email protected]>
Reviewed-by: Karol Herbst <[email protected]>
---
.../gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 26 +++++++++++++++++++
1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
index dd8f85b8b3a7..f2f5636efac4 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
@@ -1981,8 +1981,34 @@ gf100_gr_init_(struct nvkm_gr *base)
{
struct gf100_gr *gr = gf100_gr(base);
struct nvkm_subdev *subdev = &base->engine.subdev;
+ struct nvkm_device *device = subdev->device;
+ bool reset = device->chipset == 0x137 || device->chipset == 0x138;
u32 ret;

+ /* On certain GP107/GP108 boards, we trigger a weird issue where
+ * GR will stop responding to PRI accesses after we've asked the
+ * SEC2 RTOS to boot the GR falcons. This happens with far more
+ * frequency when cold-booting a board (ie. returning from D3).
+ *
+ * The root cause for this is not known and has proven difficult
+ * to isolate, with many avenues being dead-ends.
+ *
+ * A workaround was discovered by Karol, whereby putting GR into
+ * reset for an extended period right before initialisation
+ * prevents the problem from occuring.
+ *
+ * XXX: As RM does not require any such workaround, this is more
+ * of a hack than a true fix.
+ */
+ reset = nvkm_boolopt(device->cfgopt, "NvGrResetWar", reset);
+ if (reset) {
+ nvkm_mask(device, 0x000200, 0x00001000, 0x00000000);
+ nvkm_rd32(device, 0x000200);
+ msleep(50);
+ nvkm_mask(device, 0x000200, 0x00001000, 0x00001000);
+ nvkm_rd32(device, 0x000200);
+ }
+
nvkm_pmu_pgob(gr->base.engine.subdev.device->pmu, false);

ret = nvkm_falcon_get(&gr->fecs.falcon, subdev);
--
2.25.1

6 changes: 6 additions & 0 deletions kernel.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ Patch13: 0014-xen-events-avoid-NULL-pointer-dereference-in-evtchn-from-irq.patch
Patch14: 0015-xen-pciback-fix-INTERRUPT_TYPE_-defines.patch
Patch15: 0016-gcc-common.h-params.h-has-been-dropped-in-GCC10.patch

# nouveau runpm and secboot fixes
# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/f5755e7069d4acbcce1a93692421f358241ead7b
Patch20: 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/41c6a13e8143af71928749ea9895d2ebc2fb4ffd
Patch21: 0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch

%description
Qubes Dom0 kernel.

Expand Down

0 comments on commit afd5033

Please sign in to comment.