diff --git a/recipes-kernel/linux/files/iot2050_defconfig_base b/recipes-kernel/linux/files/iot2050_defconfig_base index ec1358f6a..3ee745c7b 100644 --- a/recipes-kernel/linux/files/iot2050_defconfig_base +++ b/recipes-kernel/linux/files/iot2050_defconfig_base @@ -69,6 +69,7 @@ CONFIG_KSM=y CONFIG_MEMORY_FAILURE=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CMA=y +CONFIG_CMA_DEBUG=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -665,7 +666,7 @@ CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_DMA_RESTRICTED_POOL=y CONFIG_DMA_CMA=y -CONFIG_CMA_SIZE_MBYTES=24 +CONFIG_CMA_SIZE_MBYTES=128 CONFIG_PRINTK_TIME=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/recipes-kernel/linux/files/patches-6.1/0099-WIP-print-pvu-size.patch b/recipes-kernel/linux/files/patches-6.1/0099-WIP-print-pvu-size.patch new file mode 100644 index 000000000..ee3551196 --- /dev/null +++ b/recipes-kernel/linux/files/patches-6.1/0099-WIP-print-pvu-size.patch @@ -0,0 +1,423 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Li Hua Qian +Date: Wed, 9 Oct 2024 13:00:39 +0800 +Subject: [PATCH] WIP: print pvu size + +Signed-off-by: Li Hua Qian +--- + drivers/pci/controller/dwc/pci-keystone.c | 1 + + drivers/soc/ti/ti-pvu.c | 2 + + include/linux/dma-mapping.h | 27 +++++++++++- + kernel/dma/contiguous.c | 6 +++ + kernel/dma/direct.c | 27 ++++++++++++ + kernel/dma/mapping.c | 51 +++++++++++++++++++++-- + mm/cma.c | 8 ++-- + 7 files changed, 114 insertions(+), 8 deletions(-) + +diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c +index 06f634f600bd..53426126ab70 100644 +--- a/drivers/pci/controller/dwc/pci-keystone.c ++++ b/drivers/pci/controller/dwc/pci-keystone.c +@@ -1501,6 +1501,7 @@ static int ks_pcie_probe(struct platform_device *pdev) + ret = ks_init_restricted_dma(pdev); + if (ret < 0) + goto err_get_sync; ++ dev_err(dev, "Lee: [%s]-[%d] pdev_size = %zu\n", __func__, __LINE__, sizeof(*pdev)); + + switch (mode) { +
case DW_PCIE_RC_TYPE: +diff --git a/drivers/soc/ti/ti-pvu.c b/drivers/soc/ti/ti-pvu.c +index 5b71d503051f..8459268dd859 100644 +--- a/drivers/soc/ti/ti-pvu.c ++++ b/drivers/soc/ti/ti-pvu.c +@@ -270,6 +270,7 @@ static int pvu_create_region(struct ti_pvu *pvu, u64 addr, u64 size) + int psize; + int entry; + ++ dev_err(&pvu->pdev->dev, "Lee: [%s]-[%d] size = %llu\n", __func__, __LINE__, size); + while (size > 0) { + entry = pvu_get_free_entry(pvu); + if (entry < 0) { +@@ -306,6 +307,7 @@ static void pvu_remove_region(struct ti_pvu *pvu, u64 addr, u64 size) + u64 entry_addr; + u32 entry2; + ++ dev_err(&pvu->pdev->dev, "Lee: [%s]-[%d] size = %llu\n", __func__, __LINE__, size); + for (n = 0; n < pvu->num_entries; n++) { + entry_base = pvu->tlbif_base + n * 0x20; + entry2 = readl(entry_base + PVU_ENTRY2); +diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h +index 0ee20b764000..6e494ecfdfeb 100644 +--- a/include/linux/dma-mapping.h ++++ b/include/linux/dma-mapping.h +@@ -417,16 +417,39 @@ static inline void dma_sync_sgtable_for_device(struct device *dev, + #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) + #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) + ++static unsigned long dma_alloc_count = 0; ++static unsigned long dma_alloc_size = 0; ++ ++static inline bool is_pcie_device(struct device *dev) ++{ ++ return dev->bus && strcmp(dev->bus->name, "pci") == 0; ++} ++ + static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) + { +- return dma_alloc_attrs(dev, size, dma_handle, gfp, +- (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); ++ void *ret = dma_alloc_attrs(dev, size, dma_handle, gfp, ++ (gfp & __GFP_NOWARN) ? 
DMA_ATTR_NO_WARN : 0); ++ if (ret) { ++ dma_alloc_count++; ++ dma_alloc_size += size; ++ if (is_pcie_device(dev)) ++ dev_err(dev, "Lee: [%s] DMA memory allocated by PCIe device: %zu bytes, total allocations: %lu, total size: %lu bytes\n", __func__, size, dma_alloc_count, dma_alloc_size); ++ } ++ dev_err(dev, "[%s] All: %zu bytes, total allocations: %lu, total size: %lu bytes\n", ++ __func__, size, dma_alloc_count, dma_alloc_size); ++ return ret; + } + + static inline void dma_free_coherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) + { ++ dma_alloc_count--; ++ dma_alloc_size -= size; ++ if (is_pcie_device(dev)) { ++ dev_err(dev, "Lee: [%s] DMA memory freed by PCIe device: %zu bytes, total allocations: %lu, total size: %lu bytes\n", ++ __func__, size, dma_alloc_count, dma_alloc_size); ++ } + return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); + } + +diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c +index 6ea80ae42622..aa23d2d495e4 100644 +--- a/kernel/dma/contiguous.c ++++ b/kernel/dma/contiguous.c +@@ -307,11 +307,14 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) + int nid = dev_to_node(dev); + #endif + ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + /* CMA can be used only in the context which permits sleeping */ + if (!gfpflags_allow_blocking(gfp)) + return NULL; ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + if (dev->cma_area) + return cma_alloc_aligned(dev->cma_area, size, gfp); ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + if (size <= PAGE_SIZE) + return NULL; + +@@ -320,6 +323,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) + struct cma *cma = dma_contiguous_pernuma_area[nid]; + struct page *page; + ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + if (cma) { + page = cma_alloc_aligned(cma, size, gfp); + if (page) +@@ -327,9 
+331,11 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) + } + } + #endif ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + if (!dma_contiguous_default_area) + return NULL; + ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); + } + +diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c +index d4215739efc7..49bfce4d1d50 100644 +--- a/kernel/dma/direct.c ++++ b/kernel/dma/direct.c +@@ -123,39 +123,49 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, + + WARN_ON_ONCE(!PAGE_ALIGNED(size)); + ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + if (is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_swiotlb(dev, size); + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + page = dma_alloc_contiguous(dev, size, gfp); ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + if (page) { + if (!dma_coherent_ok(dev, page_to_phys(page), size) || + (!allow_highmem && PageHighMem(page))) { + dma_free_contiguous(dev, page, size); ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + page = NULL; + } + } ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + again: + if (!page) + page = alloc_pages_node(node, gfp, get_order(size)); ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_free_contiguous(dev, page, size); ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + page = NULL; + + if (IS_ENABLED(CONFIG_ZONE_DMA32) && + phys_limit < DMA_BIT_MASK(64) && + !(gfp & (GFP_DMA32 | GFP_DMA))) { + gfp |= GFP_DMA32; ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + goto again; + } 
+ + if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + goto again; + } + } + ++ dev_err(dev, "Lee: [%s]-[%d] size = %zu\n", __func__, __LINE__, size); + return page; + } + +@@ -216,15 +226,18 @@ void *dma_direct_alloc(struct device *dev, size_t size, + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; + ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); + ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (!dev_is_dma_coherent(dev)) { + /* + * Fallback to the arch handler if it exists. This should + * eventually go away. + */ ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && +@@ -236,6 +249,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, + * If there is a global pool, always allocate from it for + * non-coherent devices. 
+ */ ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL)) + return dma_alloc_from_global_coherent(dev, size, + dma_handle); +@@ -247,13 +261,17 @@ void *dma_direct_alloc(struct device *dev, size_t size, + */ + remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); + if (remap) { ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (dma_direct_use_pool(dev, gfp)) + return dma_direct_alloc_from_pool(dev, size, + dma_handle, gfp); ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + } else { ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) + return NULL; + set_uncached = true; ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + } + } + +@@ -264,6 +282,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, + if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); + if (!page) +@@ -274,6 +293,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, + * combination the cma= arguments and per-arch setup. These need to be + * remapped to return a kernel virtual address. 
+ */ ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (PageHighMem(page)) { + remap = true; + set_uncached = false; +@@ -291,15 +311,18 @@ void *dma_direct_alloc(struct device *dev, size_t size, + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, size, prot, + __builtin_return_address(0)); ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (!ret) + goto out_free_pages; + } else { + ret = page_address(page); ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (dma_set_decrypted(dev, ret, size)) + goto out_leak_pages; + } + + memset(ret, 0, size); ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + + if (set_uncached) { + arch_dma_prep_coherent(page, size); +@@ -307,17 +330,21 @@ void *dma_direct_alloc(struct device *dev, size_t size, + if (IS_ERR(ret)) + goto out_encrypt_pages; + } ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return ret; + + out_encrypt_pages: ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + if (dma_set_encrypted(dev, page_address(page), size)) + return NULL; + out_free_pages: + __dma_direct_free_pages(dev, page, size); ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + return NULL; + out_leak_pages: ++ dev_err(dev, "Lee: [%s]-[%d] attrs = %ld\n", __func__, __LINE__, attrs); + return NULL; + } + +diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c +index 33437d620644..03d9ca27ffe7 100644 +--- a/kernel/dma/mapping.c ++++ b/kernel/dma/mapping.c +@@ -224,6 +224,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + * dma_unmap_sg_attrs() should be used to unmap the buffer with the + * original sg and original nents (not the value returned by this funciton). 
+ */ ++static size_t total_size = 0; ++static size_t all_total_size = 0; + unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) + { +@@ -232,6 +234,32 @@ unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs); + if (ret < 0) + return 0; ++ ++ if (dev->bus && strcmp(dev->bus->name, "pci") == 0) { ++ dev_err(dev, "Lee: [%s]-[%d] pci dev_size = %zu, nentry = %d\n", __func__, __LINE__, sizeof(*dev), nents); ++ ++ for (int i = 0; i < nents; i++) { ++ size_t entry_size = sg[i].length; ++ total_size += entry_size; ++ if (i % 10 == 0) { ++ dev_err(dev, "Lee: Entry %d: size = %zu bytes\n", i, entry_size); ++ dev_err(dev, "Lee: Total mapping size = %zu bytes\n", total_size); ++ } ++ } ++ ++ } ++ ++ for (int i = 0; i < nents; i++) { ++ size_t entry_size = sg[i].length; ++ all_total_size += entry_size; ++ } ++ // if (all_total_size > 16200000) { ++ // static int j = 0; ++ // if (j%10 == 0) ++ // dev_err(dev, "Lee: All total mapping size = %zu bytes\n", all_total_size); ++ // j++; ++ // } ++ + return ret; + } + EXPORT_SYMBOL(dma_map_sg_attrs); +@@ -272,6 +300,9 @@ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + if (nents < 0) + return nents; + sgt->nents = nents; ++ if (dev->bus && strcmp(dev->bus->name, "pci") == 0) { ++ dev_err(dev, "Lee: [%s]-[%d] pci dev_size = %zu\n", __func__, __LINE__, sizeof(*dev)); ++ } + return 0; + } + EXPORT_SYMBOL_GPL(dma_map_sgtable); +@@ -498,20 +529,34 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + + WARN_ON_ONCE(!dev->coherent_dma_mask); + ++ if (dev->bus && strcmp(dev->bus->name, "pci") == 0) { ++ dev_err(dev, "Lee: [%s]-[%d] pci dev_size = %zu\n", __func__, __LINE__, sizeof(*dev)); ++ } + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) + return cpu_addr; + + /* let the implementation decide on the zone to allocate from: */ + flag &= 
~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); + +- if (dma_alloc_direct(dev, ops)) ++ if (dma_alloc_direct(dev, ops)) { + cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); +- else if (ops->alloc) ++ dev_err(dev, "Lee: [%s]-[%d] cpu_addr = %p\n", __func__, __LINE__, cpu_addr); ++ } ++ else if (ops->alloc){ + cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); +- else ++ dev_err(dev, "Lee: [%s]-[%d] cpu_addr = %p\n", __func__, __LINE__, cpu_addr); ++ } ++ else { ++ dev_err(dev, "Lee: [%s]-[%d] no DMA alloc method\n", __func__, __LINE__); + return NULL; ++ } + + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); ++ if (dev->bus && strcmp(dev->bus->name, "pci") == 0) { ++ dev_err(dev, "Lee: [%s]-[%d] pci dev_size = %zu\n", __func__, __LINE__, sizeof(*dev)); ++ dev_err(dev, "Lee: [%s]-[%d] cpu_addr = %p\n", __func__, __LINE__, cpu_addr); ++ } ++ dev_err(dev, "Lee: [%s]-[%d] cpu_addr = %p\n", __func__, __LINE__, cpu_addr); + return cpu_addr; + } + EXPORT_SYMBOL(dma_alloc_attrs); +diff --git a/mm/cma.c b/mm/cma.c +index 01e9d0b2d875..c52d5185cf72 100644 +--- a/mm/cma.c ++++ b/mm/cma.c +@@ -431,7 +431,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, + if (!cma || !cma->count || !cma->bitmap) + goto out; + +- pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, ++ pr_err("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) +@@ -444,6 +444,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + ++ pr_err("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, ++ count, align); + if (bitmap_count > bitmap_maxno) + goto out; + +@@ -478,7 +480,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, + if (ret != -EBUSY) + break; + +- pr_debug("%s(): memory range at %p is busy, retrying\n", ++ pr_err("%s(): memory range at %p is busy, 
retrying\n", + __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), +@@ -505,7 +507,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, + cma_debug_show_areas(cma); + } + +- pr_debug("%s(): returned %p\n", __func__, page); ++ pr_err("%s(): returned %p\n", __func__, page); + out: + if (page) { + count_vm_event(CMA_ALLOC_SUCCESS);