Skip to content

Commit

Permalink
Add option to store LRU cache as mmapped arrays (#203)
Browse files Browse the repository at this point in the history
* add option to store LRU cache as mmapped arrays

* Relax mmap compat

* add mmap option to cache function
  • Loading branch information
meggart authored Nov 8, 2024
1 parent 2a48da7 commit ab8813e
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 28 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@ version = "0.4.6"

[deps]
LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"

[compat]
Aqua = "0.8"
LRUCache = "1"
julia = "1.9"
Mmap = "1"
OffsetArrays = "1"
Statistics = "1.9"
Test = "1.9"
julia = "1.9"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
Expand Down
22 changes: 15 additions & 7 deletions src/cached.jl
Original file line number Diff line number Diff line change
@@ -1,24 +1,27 @@

import Mmap
# Force any disk-backed AbstractArray into a different chunking pattern.
# This is useful in `zip` and other operations that can iterate
# over multiple arrays with different patterns.

"""
CachedDiskArray <: AbstractDiskArray
CachedDiskArray(A::AbstractArray; maxsize=1000)
CachedDiskArray(A::AbstractArray; maxsize=1000, mmap=false)
Wrap some disk array `A` with a caching mechanism that will
keep chunks up to a total of `maxsize` megabytes, dropping
the least used chunks when `maxsize` is exceeded.
the least used chunks when `maxsize` is exceeded. If `mmap` is
set to `true`, cached chunks will not be kept in RAM but Mmapped
to temporary files.
"""
struct CachedDiskArray{T,N,A<:AbstractArray{T,N},C} <: ChunkTiledDiskArray{T,N}
# The wrapped array that chunks are read from; this is what `Base.parent` returns.
parent::A
# Store of already-read chunks. The keyword constructor builds an `LRU`
# keyed by `ChunkIndex` whose values are `OffsetArray`-wrapped chunks.
cache::C
# When `true`, `getchunk` copies each freshly read chunk into an array that is
# memory-mapped to a `tempname()` file before the chunk is put into `cache`.
mmap::Bool
end
function CachedDiskArray(A::AbstractArray{T,N}; maxsize=1000) where {T,N}
function CachedDiskArray(A::AbstractArray{T,N}; maxsize=1000, mmap=false) where {T,N}
by(x) = sizeof(x) ÷ 1_000_000 # In Megabytes
CachedDiskArray(A, LRU{ChunkIndex{N,OffsetChunks},OffsetArray{T,N,Array{T,N}}}(; by, maxsize))
CachedDiskArray(A, LRU{ChunkIndex{N,OffsetChunks},OffsetArray{T,N,Array{T,N}}}(; by, maxsize),mmap)
end

Base.parent(A::CachedDiskArray) = A.parent
Expand All @@ -32,6 +35,11 @@ function getchunk(A::CachedDiskArray, i::ChunkIndex)
get!(A.cache, i) do
inds = eachchunk(A)[i.I]
chunk = parent(A)[inds...]
if A.mmap
mmappedarray = Mmap.mmap(tempname(),Array{eltype(chunk),ndims(chunk)},size(chunk),shared=false)
copyto!(mmappedarray, chunk)
chunk = mmappedarray
end
wrapchunk(chunk, inds)
end
end
Expand All @@ -40,11 +48,11 @@ Base.getindex(A::CachedDiskArray, i::ChunkIndex{N,OneBasedChunks}) where {N} = p


"""
cache(A::AbstractArray; maxsize=1000)
cache(A::AbstractArray; maxsize=1000, mmap=false)
Wrap internal disk arrays with `CachedDiskArray`.
This function is intended to be extended by packages that want to
re-wrap the disk array afterwards, such as YAXArrays.jl or Rasters.jl.
"""
cache(A::AbstractArray; maxsize=1000) = CachedDiskArray(A; maxsize)
cache(A::AbstractArray; maxsize=1000, mmap=false) = CachedDiskArray(A; maxsize, mmap)
43 changes: 23 additions & 20 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -909,26 +909,29 @@ end
end

@testset "Cached arrays" begin
M = (1:300) * (1:1200)'
A = cat(M, M, M, M; dims=3)
ch = ChunkedDiskArray(A, (128, 128, 2))
ca = DiskArrays.CachedDiskArray(ch; maxsize=5)
# Read the original
@test sum(ca) == sum(ca)
length(ca.cache)

ca = DiskArrays.cache(ch; maxsize=5)
@test sum(ca) == sum(ca)

@test ca[:, :, 1] == A[:, :, 1]
@test ca[:, :, 2] == A[:, :, 2]
@test ca[:, :, 2] == A[:, :, 3]
@test ca[:, :, 2] == A[:, :, 4]
@test ca[:, 1, 1] == ch[:, 1, 1]
@test ca[:, 2, 1] == ch[:, 2, 1]
@test ca[:, 3, 1] == ch[:, 3, 1]
@test ca[:, 200, 1] == ch[:, 200, 1]
@test ca[200, :, 1] == ch[200, :, 1]

for mm in (false, true)
    # Run the same checks with chunks cached as ordinary arrays (`mm == false`)
    # and with chunks cached in memory-mapped temporary files (`mm == true`).
    M = (1:300) * (1:1200)'
    A = cat(M, M, M, M; dims=3)
    ch = ChunkedDiskArray(A, (128, 128, 2))
    ca = DiskArrays.CachedDiskArray(ch; maxsize=5, mmap=mm)
    # Sum twice; the second pass may be answered from chunks cached by the first.
    @test sum(ca) == sum(ca)
    length(ca.cache)

    # Forward `mmap` so the `cache` entry point is exercised in both modes as
    # well, instead of always falling back to its `mmap=false` default.
    ca = DiskArrays.cache(ch; maxsize=5, mmap=mm)
    @test sum(ca) == sum(ca)

    # Compare every slice along the third dimension with the matching slice of
    # `A`, so that slices 3 and 4 of the cached array are actually read.
    @test ca[:, :, 1] == A[:, :, 1]
    @test ca[:, :, 2] == A[:, :, 2]
    @test ca[:, :, 3] == A[:, :, 3]
    @test ca[:, :, 4] == A[:, :, 4]
    @test ca[:, 1, 1] == ch[:, 1, 1]
    @test ca[:, 2, 1] == ch[:, 2, 1]
    @test ca[:, 3, 1] == ch[:, 3, 1]
    @test ca[:, 200, 1] == ch[:, 200, 1]
    @test ca[200, :, 1] == ch[200, :, 1]
end
end

@testset "Range subset identification" begin
Expand Down

0 comments on commit ab8813e

Please sign in to comment.