From f1acf7c1ff5554bc0f7734cfef01845ece7e7e4f Mon Sep 17 00:00:00 2001
From: ishiy
Date: Thu, 11 Apr 2024 17:00:12 +0900
Subject: [PATCH] Apply userqpu patch to 1.20230405

---
Notes (text between "---" and the first "diff --git" is not applied):

What the patch does: once vc4 has registered itself with the firmware
driver, rpi_firmware_property_list() passes RPI_FIRMWARE_EXECUTE_QPU
requests to vc4_firmware_qpu_execute(), which submits the user QPU
programs through vc4_queue_submit() and waits for them to finish.
RPI_FIRMWARE_SET_ENABLE_QPU is answered locally and never reaches the
firmware.

Changes made while reviewing this copy:
 - vc4_exec_alloc() only allocates.  It used to take a V3D power
   reference as well, while both callers also call vc4_v3d_pm_get(),
   so each job (and each kfree(exec) failure path) leaked a reference.
 - vc4_queue_submit() keeps its "static int" return type; callers test
   the result as an errno, so the change to uint64_t is dropped.
 - vc4_v3d_bind() uninstalls the IRQ and leaves through
   err_put_runtime_pm when the firmware handle is unavailable, instead
   of returning with the IRQ installed and runtime PM held.
 - The arch_memremap_wb() mapping is released with memunmap() rather
   than iounmap().
 - The u32 job count is printed with %u.
 - V3D_SRQCS_QPURQERR_SHIFT was BIT(7), i.e. a flag, not a shift; it is
   now V3D_SRQCS_QPURQERR.
 - Hunk offsets and the diffstat are recomputed for the above.

Caveats about this copy:
 - Line breaks and indentation were reconstructed from a
   whitespace-collapsed copy; use "git apply --ignore-whitespace" if
   context does not match.
 - The <...> header names in the first vc4_v3d.c hunk were missing and
   are presumed from the hunk position; verify against the tree.
 - The author e-mail address is missing from the From: line.
 - The "index" blob ids of vc4_gem.c, vc4_regs.h and vc4_v3d.c are the
   original ones and no longer describe the post-image.

 arch/arm/boot/dts/bcm2835-rpi-common.dtsi  |   1 +
 arch/arm/mm/ioremap.c                      |   1 +
 drivers/firmware/raspberrypi.c             |  89 +++++++++++++++
 drivers/gpu/drm/vc4/vc4_drv.h              |  11 ++
 drivers/gpu/drm/vc4/vc4_gem.c              | 122 +++++++++++++++++++--
 drivers/gpu/drm/vc4/vc4_irq.c              |  43 +++++++-
 drivers/gpu/drm/vc4/vc4_regs.h             |  13 +++
 drivers/gpu/drm/vc4/vc4_v3d.c              |  30 ++++-
 include/soc/bcm2835/raspberrypi-firmware.h |   8 ++
 9 files changed, 301 insertions(+), 17 deletions(-)

diff --git a/arch/arm/boot/dts/bcm2835-rpi-common.dtsi b/arch/arm/boot/dts/bcm2835-rpi-common.dtsi
index 8a55b6cded592e..30a0f09be5a1b1 100644
--- a/arch/arm/boot/dts/bcm2835-rpi-common.dtsi
+++ b/arch/arm/boot/dts/bcm2835-rpi-common.dtsi
@@ -9,4 +9,5 @@
 
 &v3d {
 	power-domains = <&power RPI_POWER_DOMAIN_V3D>;
+	firmware = <&firmware>;
 };
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 2129070065c323..efb167f970f651 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -417,6 +417,7 @@ void *arch_memremap_wb(phys_addr_t phys_addr, size_t size)
 						   MT_MEMORY_RW,
 						   __builtin_return_address(0));
 }
+EXPORT_SYMBOL(arch_memremap_wb);
 
 void iounmap(volatile void __iomem *io_addr)
 {
diff --git a/drivers/firmware/raspberrypi.c b/drivers/firmware/raspberrypi.c
index 773ab66a506ad7..06f37641efabee 100644
--- a/drivers/firmware/raspberrypi.c
+++ b/drivers/firmware/raspberrypi.c
@@ -32,6 +32,13 @@ struct rpi_firmware {
 
 	struct kref consumers;
 	u32 get_throttled;
+
+	struct vc4_dev *vc4;
+	int (*vc4_qpu_execute)(struct vc4_dev *vc4,
+			       u32 num_qpu,
+			       u32 control,
+			       u32 noflush,
+			       u32 timeout);
 };
 
 static struct platform_device *g_pdev;
@@ -74,6 +81,64 @@ rpi_firmware_transaction(struct rpi_firmware *fw, u32 chan, u32 data)
 	return ret;
 }
 
+/**
+ * Peeks at the property request to see if it's something that we
+ * should pass off to vc4 instead.
+ */
+static int
+vc4_filter_property(struct rpi_firmware *fw, uint32_t *data, size_t tag_size)
+{
+	uint32_t tag = data[0];
+	int ret;
+
+	if (!fw->vc4)
+		return -ENOENT;
+
+	switch (tag) {
+	case RPI_FIRMWARE_EXECUTE_QPU: {
+		struct qpu_execute_packet {
+			u32 tag;
+			u32 bufsize;
+			u32 size;
+			u32 num_qpu;
+			u32 control;
+			u32 noflush;
+			u32 timeout_ms;
+		} *packet = (void *)data;
+
+		ret = fw->vc4_qpu_execute(fw->vc4,
+					  packet->num_qpu,
+					  packet->control,
+					  packet->noflush,
+					  packet->timeout_ms);
+		packet->size = 0x80000004;
+		packet->num_qpu = (ret != 0);
+
+		return 0;
+	}
+
+	case RPI_FIRMWARE_SET_ENABLE_QPU: {
+		struct qpu_enable_packet {
+			u32 tag;
+			u32 bufsize;
+			u32 size;
+			u32 enable;
+		} *packet = (void *)data;
+		/* If vc4 is present, userspace doesn't get to control
+		 * when the QPUs are off or on.  Just hand back the
+		 * return value indicating success.
+		 */
+		packet->size = 0x80000004;
+		packet->enable = 0;
+
+		return 0;
+	}
+
+	default:
+		return -ENOENT;
+	}
+}
+
 /**
  * rpi_firmware_property_list - Submit firmware property list
  * @fw:		Pointer to firmware structure from rpi_firmware_get().
@@ -96,6 +161,14 @@ int rpi_firmware_property_list(struct rpi_firmware *fw,
 	dma_addr_t bus_addr;
 	int ret;
 
+	/* NOTE: We're only handling filtering on the first property
+	 * here, and if it gets filtered then we skip the rest of
+	 * them.  This is enough for hello_fft.
+	 */
+	ret = vc4_filter_property(fw, data, tag_size);
+	if (ret != -ENOENT)
+		return ret;
+
 	/* Packets are processed a dword at a time. */
 	if (size & 3)
 		return -EINVAL;
@@ -516,6 +589,22 @@ struct rpi_firmware *devm_rpi_firmware_get(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_rpi_firmware_get);
 
+/**
+ * Called by the vc4 driver at its probe time, to request that QPU
+ * execution requests be redirected to it.
+ */
+void rpi_firmware_register_vc4(struct rpi_firmware *fw, struct vc4_dev *vc4,
+			       int (*qpu_execute)(struct vc4_dev *vc4,
+						  u32 num_qpu,
+						  u32 control,
+						  u32 noflush,
+						  u32 timeout))
+{
+	fw->vc4 = vc4;
+	fw->vc4_qpu_execute = qpu_execute;
+}
+EXPORT_SYMBOL_GPL(rpi_firmware_register_vc4);
+
 static struct platform_driver rpi_firmware_driver = {
 	.driver = {
 		.name = "raspberrypi-firmware",
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index c3057bda6adeb8..12164e4d2b951a 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -815,6 +815,12 @@ struct vc4_exec_info {
 	uint32_t uniforms_p;
 	uint32_t uniforms_size;
 
+	struct {
+		u32 code;
+		u32 uniforms;
+	} user_qpu_job[16];
+	u32 user_qpu_job_count;
+
 	/* Pointer to a performance monitor object if the user requested it,
 	 * NULL otherwise.
 	 */
@@ -1064,6 +1070,10 @@ int vc4_queue_seqno_cb(struct drm_device *dev,
 int vc4_gem_madvise_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *file_priv);
 
+int
+vc4_firmware_qpu_execute(struct vc4_dev *dev, u32 num_qpu,
+			 u32 control, u32 noflush, u32 timeout);
+
 /* vc4_hdmi.c */
 extern struct platform_driver vc4_hdmi_driver;
 
@@ -1079,6 +1089,7 @@ void vc4_irq_disable(struct drm_device *dev);
 int vc4_irq_install(struct drm_device *dev, int irq);
 void vc4_irq_uninstall(struct drm_device *dev);
 void vc4_irq_reset(struct drm_device *dev);
+void vc4_irq_finish_render_job(struct drm_device *dev);
 
 /* vc4_hvs.c */
 extern struct platform_driver vc4_hvs_driver;
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 628d40ff3aa1c0..1a51a604092884 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -342,6 +342,12 @@ vc4_hangcheck_elapsed(struct timer_list *t)
 		return;
 	}
 
+	/* Can't check for hangs during user QPU program execution. */
+	if (render_exec && render_exec->user_qpu_job_count > 0) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return;
+	}
+
 	ct0ca = V3D_READ(V3D_CTNCA(0));
 	ct1ca = V3D_READ(V3D_CTNCA(1));
 
@@ -518,6 +524,7 @@ vc4_submit_next_render_job(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct vc4_exec_info *exec = vc4_first_render_job(vc4);
+	int i;
 
 	if (!exec)
 		return;
@@ -533,8 +540,18 @@ vc4_submit_next_render_job(struct drm_device *dev)
 	 */
 	vc4_flush_texture_caches(dev);
 
-	trace_vc4_submit_cl(dev, true, exec->seqno, exec->ct1ca, exec->ct1ea);
-	submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
+	if (exec->user_qpu_job_count) {
+		/* XXX: Make sure we're idle. */
+		/* XXX: Set up VPM */
+		for (i = 0; i < exec->user_qpu_job_count; i++) {
+			V3D_WRITE(V3D_SRQUL, 1024);
+			V3D_WRITE(V3D_SRQUA, exec->user_qpu_job[i].uniforms);
+			V3D_WRITE(V3D_SRQPC, exec->user_qpu_job[i].code);
+		}
+	} else {
+		trace_vc4_submit_cl(dev, true, exec->seqno, exec->ct1ca, exec->ct1ea);
+		submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
+	}
 }
 
 void
@@ -1144,6 +1161,25 @@ vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
 	return ret;
 }
 
+/* Allocates a zeroed exec with its unref_list initialised.  Callers
+ * take the V3D power reference themselves via vc4_v3d_pm_get().
+ */
+static struct vc4_exec_info *
+vc4_exec_alloc(struct drm_device *dev)
+{
+	struct vc4_exec_info *exec;
+
+	exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
+	if (!exec) {
+		DRM_ERROR("malloc failure on exec struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	INIT_LIST_HEAD(&exec->unref_list);
+
+	return exec;
+}
+
 /**
  * vc4_submit_cl_ioctl() - Submits a job (frame) to the VC4.
  * @dev: DRM device
@@ -1194,11 +1230,9 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
-	if (!exec) {
-		DRM_ERROR("malloc failure on exec struct\n");
-		return -ENOMEM;
-	}
+	exec = vc4_exec_alloc(dev);
+	if (IS_ERR(exec))
+		return PTR_ERR(exec);
 	exec->dev = vc4;
 
 	ret = vc4_v3d_pm_get(vc4);
@@ -1208,7 +1242,6 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 	}
 
 	exec->args = args;
-	INIT_LIST_HEAD(&exec->unref_list);
 
 	ret = vc4_cl_lookup_bos(dev, file_priv, exec);
 	if (ret)
@@ -1304,6 +1337,79 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 	return ret;
 }
 
+/* Runs user QPU programs on behalf of the firmware mailbox interface.
+ * @control, with its top two bits masked off, is the address of
+ * @num_jobs (uniforms, code) pairs.  @noflush and @timeout are not
+ * used.  Returns 0 or a negative errno.
+ */
+int
+vc4_firmware_qpu_execute(struct vc4_dev *vc4, u32 num_jobs,
+			 u32 control, u32 noflush, u32 timeout)
+{
+	struct drm_device *dev = &vc4->base;
+	u32 control_paddr;
+	struct vc4_exec_info *exec;
+	struct control_args {
+		u32 uniforms;
+		u32 code;
+	} *control_args;
+	int ret, i;
+	uint64_t seqno;
+	struct ww_acquire_ctx acquire_ctx;
+
+	control_paddr = control & ~(BIT(31) | BIT(30));
+
+	if (num_jobs > ARRAY_SIZE(exec->user_qpu_job)) {
+		DRM_ERROR("V3D QPU execution request with too many jobs (%u)\n",
+			  num_jobs);
+		return -EINVAL;
+	}
+
+	exec = vc4_exec_alloc(dev);
+	if (IS_ERR(exec))
+		return PTR_ERR(exec);
+
+	ret = vc4_v3d_pm_get(vc4);
+	if (ret) {
+		kfree(exec);
+		return ret;
+	}
+
+	ret = vc4_lock_bo_reservations(dev, exec, &acquire_ctx);
+	if (ret) {
+		vc4_complete_exec(dev, exec);
+		return ret;
+	}
+
+	control_args = arch_memremap_wb(control_paddr, num_jobs * 2 * sizeof(u32));
+	if (!control_args) {
+		vc4_complete_exec(dev, exec);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < num_jobs; i++) {
+		exec->user_qpu_job[i].code = control_args[i].code;
+		exec->user_qpu_job[i].uniforms = control_args[i].uniforms;
+	}
+	memunmap(control_args);
+
+	exec->user_qpu_job_count = num_jobs;
+
+	ret = vc4_queue_submit(dev, exec, &acquire_ctx, NULL);
+	if (ret) {
+		vc4_complete_exec(dev, exec);
+		return ret;
+	}
+	seqno = vc4->emit_seqno;
+
+	/* The mailbox interface is synchronous, so wait for the job
+	 * we just made to complete.
+	 */
+	ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
+
+	return ret;
+}
+
 static void vc4_gem_destroy(struct drm_device *dev, void *unused);
 int vc4_gem_init(struct drm_device *dev)
 {
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index 1e6db0121ccd5f..ea5c750dc3c66b 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -154,7 +154,7 @@ vc4_cancel_bin_job(struct drm_device *dev)
 	vc4_submit_next_bin_job(dev);
 }
 
-static void
+void
 vc4_irq_finish_render_job(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
@@ -206,18 +206,28 @@ vc4_irq(int irq, void *arg)
 {
 	struct drm_device *dev = arg;
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
-	uint32_t intctl;
+	uint32_t intctl, dbqitc;
 	irqreturn_t status = IRQ_NONE;
+	uint32_t srqcs;
+	uint32_t qpurqcc;
+	uint32_t qpurqcm;
 
 	barrier();
 	intctl = V3D_READ(V3D_INTCTL);
+	dbqitc = V3D_READ(V3D_DBQITC);
+	srqcs = V3D_READ(V3D_SRQCS);
+	qpurqcc = VC4_GET_FIELD(srqcs, V3D_SRQCS_QPURQCC);
+	qpurqcm = VC4_GET_FIELD(srqcs, V3D_SRQCS_QPURQCM);
 
 	/* Acknowledge the interrupts we're handling here. The binner
 	 * last flush / render frame done interrupt will be cleared,
 	 * while OUTOMEM will stay high until the underlying cause is
 	 * cleared.
 	 */
-	V3D_WRITE(V3D_INTCTL, intctl);
+	if (intctl)
+		V3D_WRITE(V3D_INTCTL, intctl);
+	if (dbqitc)
+		V3D_WRITE(V3D_DBQITC, dbqitc);
 
 	if (intctl & V3D_INT_OUTOMEM) {
 		/* Disable OUTOMEM until the work is done. */
@@ -240,6 +250,26 @@ vc4_irq(int irq, void *arg)
 		status = IRQ_HANDLED;
 	}
 
+	if (dbqitc) {
+		/* The job isn't done until all programs that were
+		 * spawned have sent an interrupt.
+		 *
+		 * XXX: The shader emits an interrupt that will land
+		 * in DBQITC, and then does THREND a few cycles later.
+		 * Do we have a race between the interrupt reaching
+		 * ARM and when these queue counts get updated?
+		 */
+		if (qpurqcc == qpurqcm) {
+			V3D_WRITE(V3D_SRQCS,
+				  V3D_SRQCS_QPURQCC_CLEAR |
+				  V3D_SRQCS_QPURQCM_CLEAR);
+			spin_lock(&vc4->job_lock);
+			vc4_irq_finish_render_job(dev);
+			spin_unlock(&vc4->job_lock);
+			status = IRQ_HANDLED;
+		}
+	}
+
 	return status;
 }
 
@@ -258,6 +288,7 @@ vc4_irq_prepare(struct drm_device *dev)
 	 * for us.
 	 */
 	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+	V3D_WRITE(V3D_DBQITC, ~0);
 }
 
 void
@@ -275,6 +306,8 @@ vc4_irq_enable(struct drm_device *dev)
 	 * enabled as soon as we have a binner BO allocated.
 	 */
 	V3D_WRITE(V3D_INTENA, V3D_INT_FLDONE | V3D_INT_FRDONE);
+	V3D_WRITE(V3D_DBQITC, ~0);
+	V3D_WRITE(V3D_DBQITE, ~0);
 }
 
 void
@@ -290,9 +323,11 @@ vc4_irq_disable(struct drm_device *dev)
 
 	/* Disable sending interrupts for our driver's IRQs. */
 	V3D_WRITE(V3D_INTDIS, V3D_DRIVER_IRQS);
+	V3D_WRITE(V3D_DBQITE, 0);
 
 	/* Clear any pending interrupts we might have left. */
 	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+	V3D_WRITE(V3D_DBQITC, ~0);
 
 	/* Finish any interrupt handler still in flight. */
 	synchronize_irq(vc4->irq);
@@ -344,6 +379,7 @@ void vc4_irq_reset(struct drm_device *dev)
 
 	/* Acknowledge any stale IRQs. */
 	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+	V3D_WRITE(V3D_DBQITC, ~0);
 
 	/*
 	 * Turn all our interrupts on.  Binner out of memory is the
@@ -352,6 +388,7 @@ void vc4_irq_reset(struct drm_device *dev)
 	 * memory yet.
 	 */
 	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
+	V3D_WRITE(V3D_DBQITE, ~0);
 
 	spin_lock_irqsave(&vc4->job_lock, irqflags);
 	vc4_cancel_bin_job(dev);
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index 0981732904111e..15d0a42badec24 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -114,6 +114,16 @@
 #define V3D_SRQUA    0x00434
 #define V3D_SRQUL    0x00438
 #define V3D_SRQCS    0x0043c
+# define V3D_SRQCS_QPURQCC_MASK                        VC4_MASK(23, 16)
+# define V3D_SRQCS_QPURQCC_SHIFT                       16
+# define V3D_SRQCS_QPURQCC_CLEAR                       BIT(16)
+# define V3D_SRQCS_QPURQCM_MASK                        VC4_MASK(15, 8)
+# define V3D_SRQCS_QPURQCM_SHIFT                       8
+# define V3D_SRQCS_QPURQCM_CLEAR                       BIT(8)
+# define V3D_SRQCS_QPURQERR                            BIT(7)
+# define V3D_SRQCS_QPURQL_MASK                         VC4_MASK(5, 0)
+# define V3D_SRQCS_QPURQL_SHIFT                        0
+
 #define V3D_VPACNTL  0x00500
 #define V3D_VPMBASE  0x00504
 #define V3D_PCTRC    0x00670
@@ -121,6 +131,9 @@
 # define V3D_PCTRE_EN	BIT(31)
 #define V3D_PCTR(x)  (0x00680 + ((x) * 8))
 #define V3D_PCTRS(x) (0x00684 + ((x) * 8))
+#define V3D_DBCFG    0x00e00
+#define V3D_DBQITE   0x00e2c
+#define V3D_DBQITC   0x00e30
 #define V3D_DBGE     0x00f00
 #define V3D_FDBGO    0x00f04
 #define V3D_FDBGB    0x00f08
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 56abb0d6bc39b6..1ca58ab9189534 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -9,7 +9,7 @@
 #include <linux/component.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
-
+#include <soc/bcm2835/raspberrypi-firmware.h>
 #include "vc4_drv.h"
 #include "vc4_regs.h"
 
@@ -163,11 +163,8 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
-	/* Take all the memory that would have been reserved for user
-	 * QPU programs, since we don't have an interface for running
-	 * them, anyway.
-	 */
-	V3D_WRITE(V3D_VPMBASE, 0);
+	/* XXX: Fix the user QPU VPM share at 16 for now. */
+	V3D_WRITE(V3D_VPMBASE, 16);
 }
 
 int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
@@ -427,6 +424,7 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 	struct vc4_v3d *v3d = NULL;
+	struct device_node *firmware_node;
 	int ret;
 
 	v3d = devm_kzalloc(&pdev->dev, sizeof(*v3d), GFP_KERNEL);
@@ -483,6 +481,9 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		goto err_put_runtime_pm;
 	}
 
+	/* Enable QPU to host interrupt */
+	V3D_WRITE(V3D_DBCFG, 1);
+
 	/* Reset the binner overflow address/size at setup, to be sure
 	 * we don't reuse an old one.
 	 */
@@ -498,6 +499,18 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 	pm_runtime_use_autosuspend(dev);
 	pm_runtime_set_autosuspend_delay(dev, 40); /* a little over 2 frames. */
 
+	firmware_node = of_parse_phandle(dev->of_node, "firmware", 0);
+	vc4->firmware = rpi_firmware_get(firmware_node);
+	of_node_put(firmware_node);
+	if (!vc4->firmware) {
+		DRM_DEBUG("Failed to get Raspberry Pi firmware reference.\n");
+		vc4_irq_uninstall(drm);
+		ret = -EPROBE_DEFER;
+		goto err_put_runtime_pm;
+	}
+
+	rpi_firmware_register_vc4(vc4->firmware, vc4, vc4_firmware_qpu_execute);
+
 	return 0;
 
 err_put_runtime_pm:
@@ -512,6 +525,8 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 
+	rpi_firmware_register_vc4(vc4->firmware, NULL, NULL);
+
 	vc4_irq_uninstall(drm);
 
 	/* Disable the binner's overflow memory address, so the next
@@ -521,6 +536,9 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
 	V3D_WRITE(V3D_BPOA, 0);
 	V3D_WRITE(V3D_BPOS, 0);
 
+	/* Disable QPU to host interrupt */
+	V3D_WRITE(V3D_DBCFG, 0);
+
 	vc4->v3d = NULL;
 }
 
diff --git a/include/soc/bcm2835/raspberrypi-firmware.h b/include/soc/bcm2835/raspberrypi-firmware.h
index c453978e62f604..71e58f37479d62 100644
--- a/include/soc/bcm2835/raspberrypi-firmware.h
+++ b/include/soc/bcm2835/raspberrypi-firmware.h
@@ -249,4 +249,12 @@ static inline struct rpi_firmware *devm_rpi_firmware_get(struct device *dev,
 }
 #endif
 
+struct vc4_dev;
+void rpi_firmware_register_vc4(struct rpi_firmware *fw, struct vc4_dev *vc4,
+			       int (*qpu_execute)(struct vc4_dev *vc4,
+						  u32 num_qpu,
+						  u32 control,
+						  u32 noflush,
+						  u32 timeout));
+
 #endif /* __SOC_RASPBERRY_FIRMWARE_H__ */