From ab8813ee44c326886719d81e525084a36ebaa9fd Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Fri, 8 Nov 2024 09:40:29 +0100 Subject: [PATCH] Add option to store LRU cache as mmapped arrays (#203) * add option to store LRU cache as mmapped arrays * Relax mmap compat * add mmap option to cache function --- Project.toml | 4 +++- src/cached.jl | 22 +++++++++++++++------- test/runtests.jl | 43 +++++++++++++++++++++++-------------------- 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/Project.toml b/Project.toml index 76df765..ec5a276 100644 --- a/Project.toml +++ b/Project.toml @@ -5,15 +5,17 @@ version = "0.4.6" [deps] LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637" +Mmap = "a63ad114-7e13-5084-954f-fe012c677804" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" [compat] Aqua = "0.8" LRUCache = "1" -julia = "1.9" +Mmap = "1" OffsetArrays = "1" Statistics = "1.9" Test = "1.9" +julia = "1.9" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/src/cached.jl b/src/cached.jl index 0acceac..7a2e61e 100644 --- a/src/cached.jl +++ b/src/cached.jl @@ -1,4 +1,4 @@ - +import Mmap # Force disk any abstractarray into a different chunking pattern. # This is useful in `zip` and other operations that can iterate # over multiple arrays with different patterns. @@ -6,19 +6,22 @@ """ CachedDiskArray <: AbstractDiskArray - CachedDiskArray(A::AbstractArray; maxsize=1000) + CachedDiskArray(A::AbstractArray; maxsize=1000, mmap=false) Wrap some disk array `A` with a caching mechanism that will keep chunks up to a total of `maxsize` megabytes, dropping -the least used chunks when `maxsize` is exceeded. +the least used chunks when `maxsize` is exceeded. If `mmap` is +set to `true`, cached chunks will not be kept in RAM but Mmapped +to temporary files. 
""" struct CachedDiskArray{T,N,A<:AbstractArray{T,N},C} <: ChunkTiledDiskArray{T,N} parent::A cache::C + mmap::Bool end -function CachedDiskArray(A::AbstractArray{T,N}; maxsize=1000) where {T,N} +function CachedDiskArray(A::AbstractArray{T,N}; maxsize=1000, mmap=false) where {T,N} by(x) = sizeof(x) ÷ 1_000_000 # In Megabytes - CachedDiskArray(A, LRU{ChunkIndex{N,OffsetChunks},OffsetArray{T,N,Array{T,N}}}(; by, maxsize)) + CachedDiskArray(A, LRU{ChunkIndex{N,OffsetChunks},OffsetArray{T,N,Array{T,N}}}(; by, maxsize),mmap) end Base.parent(A::CachedDiskArray) = A.parent @@ -32,6 +35,11 @@ function getchunk(A::CachedDiskArray, i::ChunkIndex) get!(A.cache, i) do inds = eachchunk(A)[i.I] chunk = parent(A)[inds...] + if A.mmap + mmappedarray = Mmap.mmap(tempname(),Array{eltype(chunk),ndims(chunk)},size(chunk),shared=false) + copyto!(mmappedarray, chunk) + chunk = mmappedarray + end wrapchunk(chunk, inds) end end @@ -40,11 +48,11 @@ Base.getindex(A::CachedDiskArray, i::ChunkIndex{N,OneBasedChunks}) where {N} = p """ - cache(A::AbstractArray; maxsize=1000) + cache(A::AbstractArray; maxsize=1000, mmap=false) Wrap internal disk arrays with `CacheDiskArray`. This function is intended to be extended by package that want to re-wrap the disk array afterwards, such as YAXArrays.jl or Rasters.jl. 
""" -cache(A::AbstractArray; maxsize=1000) = CachedDiskArray(A; maxsize) +cache(A::AbstractArray; maxsize=1000, mmap=false) = CachedDiskArray(A; maxsize, mmap) diff --git a/test/runtests.jl b/test/runtests.jl index 5c9a487..6185b88 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -909,26 +909,29 @@ end end @testset "Cached arrays" begin - M = (1:300) * (1:1200)' - A = cat(M, M, M, M; dims=3) - ch = ChunkedDiskArray(A, (128, 128, 2)) - ca = DiskArrays.CachedDiskArray(ch; maxsize=5) - # Read the original - @test sum(ca) == sum(ca) - length(ca.cache) - - ca = DiskArrays.cache(ch; maxsize=5) - @test sum(ca) == sum(ca) - - @test ca[:, :, 1] == A[:, :, 1] - @test ca[:, :, 2] == A[:, :, 2] - @test ca[:, :, 2] == A[:, :, 3] - @test ca[:, :, 2] == A[:, :, 4] - @test ca[:, 1, 1] == ch[:, 1, 1] - @test ca[:, 2, 1] == ch[:, 2, 1] - @test ca[:, 3, 1] == ch[:, 3, 1] - @test ca[:, 200, 1] == ch[:, 200, 1] - @test ca[200, :, 1] == ch[200, :, 1] + + for mm in (false, true) + M = (1:300) * (1:1200)' + A = cat(M, M, M, M; dims=3) + ch = ChunkedDiskArray(A, (128, 128, 2)) + ca = DiskArrays.CachedDiskArray(ch; maxsize=5, mmap=mm) + # Read the original + @test sum(ca) == sum(ca) + length(ca.cache) + + ca = DiskArrays.cache(ch; maxsize=5) + @test sum(ca) == sum(ca) + + @test ca[:, :, 1] == A[:, :, 1] + @test ca[:, :, 2] == A[:, :, 2] + @test ca[:, :, 2] == A[:, :, 3] + @test ca[:, :, 2] == A[:, :, 4] + @test ca[:, 1, 1] == ch[:, 1, 1] + @test ca[:, 2, 1] == ch[:, 2, 1] + @test ca[:, 3, 1] == ch[:, 3, 1] + @test ca[:, 200, 1] == ch[:, 200, 1] + @test ca[200, :, 1] == ch[200, :, 1] + end end @testset "Range subset identification" begin