diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3a5ced4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,48 @@ +name: CI +on: + pull_request: + branches: + - master + push: + branches: + - master + tags: '*' +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.2' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'. + - '1' # Leave this line unchanged. '1' will automatically expand to the latest stable 1.x release of Julia. + - 'nightly' + os: + - ubuntu-latest + - windows-latest + - macos-latest + arch: + - x64 + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: actions/cache@v1 + env: + cache-name: cache-artifacts + with: + path: ~/.julia/artifacts + key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} + restore-keys: | + ${{ runner.os }}-test-${{ env.cache-name }}- + ${{ runner.os }}-test- + ${{ runner.os }}- + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v1 + with: + file: lcov.info \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 8f06e7a..0000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,22 +0,0 @@ -image: julia:1.1 # image comes from Docker hub - -before_script: - - julia --project=@. -e "import Pkg; Pkg.build()" - -default: - script: - - julia --project=@. 
-e "import Pkg; Pkg.test(; coverage = true)" - - julia --project=test/coverage -e 'import Pkg; Pkg.instantiate()' - - julia --project=test/coverage test/coverage/coverage-summary.jl - -pages: - stage: deploy - script: - - julia --project=docs -e 'using Pkg; Pkg.instantiate(); Pkg.develop(PackageSpec(path=pwd()))' - - julia --project=docs --color=yes docs/make.jl - - mv docs/build public # move to the directory picked up by Gitlab pages - artifacts: - paths: - - public - only: - - master diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index a62b5e3..0000000 --- a/.travis.yml +++ /dev/null @@ -1,34 +0,0 @@ -## Documentation: http://docs.travis-ci.com/user/languages/julia/ -language: julia -dist: xenial -os: - - linux - - osx -julia: - - 1 - - 1.2 - - nightly -notifications: - email: false -git: - depth: 99999999 - -## uncomment the following lines to allow failures on nightly julia -## (tests will run but not make your overall status red) -#matrix: -# allow_failures: -# - julia: nightly - -## uncomment following lines to deploy documentation -# jobs: -# include: -# - stage: Documentation -# julia: 1.0 -# os: linux -# script: -# - julia --project=docs -e 'using Pkg; Pkg.instantiate(); Pkg.develop(PackageSpec(path=pwd()))' -# - julia --project=docs --color=yes docs/make.jl -# after_success: skip -after_success: - - julia --project=test/coverage -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/coverage test/coverage/coverage.jl diff --git a/Project.toml b/Project.toml index 77e3c90..7c76df0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DiskArrays" uuid = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3" authors = ["Fabian Gans "] -version = "0.2.6" +version = "0.2.7" [compat] julia = "1.0" diff --git a/README.md b/README.md index 9dff273..a84b875 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ ![Lifecycle](https://img.shields.io/badge/lifecycle-retired-orange.svg) 
![Lifecycle](https://img.shields.io/badge/lifecycle-archived-red.svg) ![Lifecycle](https://img.shields.io/badge/lifecycle-dormant-blue.svg) --> -[![Build Status](https://travis-ci.com/meggart/DiskArrays.jl.svg?branch=master)](https://travis-ci.com/meggart/DiskArrays.jl) -[![codecov.io](http://codecov.io/github/meggart/DiskArrays.jl/coverage.svg?branch=master)](http://codecov.io/github/meggart/DiskArrays.jl?branch=master) +[![Build Status][ci-img]][ci-url] +[![codecov.io][codecov-img]][codecov-url] This package is an attempt to collect utilities for working with n-dimensional array-like data structures that do not have considerable overhead for single read operations. Most important @@ -162,3 +162,13 @@ There are situations where one wants to read every other value along a certain a In this case a backend can define `readblock!(a,aout,r::OrdinalRange...)` and the respective `writeblock` method which will overwrite the fallback behavior that would read the whol block of data and only return the desired range. + +## Arrays that do not implement eachchunk + +There are arrays that live on disk but which are not split into rectangular chunks, so that the `haschunks` trait returns `Unchunked()`. In order to still enable broadcasting and reductions for these arrays, a chunk size will be estimated in a way that a certain memory limit per chunk is not exceeded. This memory limit defaults to 100MB and can be modified by changing `DiskArrays.default_chunk_size[]`. Then a chunk size is computed based on the element size of the array. However, there are cases where the size of the element type is undefined, e.g. for Strings or variable-length vectors. In these cases one can overload the `DiskArrays.element_size` function for certain container types which returns an approximate element size (in bytes). Otherwise the size of an element will simply be assumed to equal the value stored in `DiskArrays.fallback_element_size` which defaults to 100 bytes. 
+ + +[ci-img]: https://github.com/meggart/DiskArrays.jl/workflows/CI/badge.svg +[ci-url]: https://github.com/meggart/DiskArrays.jl/actions?query=workflow%3ACI +[codecov-img]: http://codecov.io/github/meggart/DiskArrays.jl/coverage.svg?branch=master +[codecov-url]: http://codecov.io/github/meggart/DiskArrays.jl?branch=master \ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index a20d377..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,43 +0,0 @@ -environment: - matrix: - - julia_version: 1 - - julia_version: 1.2 - - julia_version: nightly - -platform: - - x86 # 32-bit - - x64 # 64-bit - -# # Uncomment the following lines to allow failures on nightly julia -# # (tests will run but not make your overall status red) -# matrix: -# allow_failures: -# - julia_version: nightly - -branches: - only: - - master - - /release-.*/ - -notifications: - - provider: Email - on_build_success: false - on_build_failure: false - on_build_status_changed: false - -install: - - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1")) - -build_script: - - echo "%JL_BUILD_SCRIPT%" - - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%" - -test_script: - - echo "%JL_TEST_SCRIPT%" - - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%" - -# # Uncomment to support code coverage upload.
Should only be enabled for packages -# # which would have coverage gaps without running on Windows -# on_success: -# - echo "%JL_CODECOV_SCRIPT%" -# - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%" diff --git a/src/chunks.jl b/src/chunks.jl index 9394784..5710875 100644 --- a/src/chunks.jl +++ b/src/chunks.jl @@ -47,8 +47,15 @@ function Base.iterate(g::GridChunks, state) end #Define the approx default maximum chunk size (in MB) +"The target chunk size for processing for unchunked arrays in MB, defaults to 100MB" const default_chunk_size = Ref(100) +""" +A fallback element size (in bytes) used to estimate chunk sizes for arrays whose elements have unknown +size, like strings. Defaults to 100 bytes. +""" +const fallback_element_size = Ref(100) + #Here we implement a fallback chunking for a DiskArray although this should normally #be over-ridden by the package that implements the interface @@ -62,7 +69,25 @@ struct Unchunked end function haschunks end haschunks(x) = Unchunked() -estimate_chunksize(a::AbstractArray) = estimate_chunksize(size(a), sizeof(eltype(a))) +""" + element_size(a::AbstractArray) + +Returns the approximate size of an element of `a` in bytes. Uses `sizeof` of the element type (or of its non-missing part) when that is +a bits type, otherwise warns and returns the value stored in `DiskArrays.fallback_element_size[]`. Methods can be added for +custom containers. +""" +function element_size(a::AbstractArray) + if isbitstype(eltype(a)) + return sizeof(eltype(a)) + elseif isbitstype(Base.nonmissingtype(eltype(a))) + return sizeof(Base.nonmissingtype(eltype(a))) + else + @warn "Can not determine size of element type. 
Using DiskArrays.fallback_element_size[] = $(fallback_element_size[]) bytes" + return fallback_element_size[] + end +end + +estimate_chunksize(a::AbstractArray) = estimate_chunksize(size(a), element_size(a)) function estimate_chunksize(s, si) ii = searchsortedfirst(cumprod(collect(s)),default_chunk_size[]*1e6/si) ntuple(length(s)) do idim diff --git a/test/runtests.jl b/test/runtests.jl index b99b769..6f66112 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -229,4 +229,34 @@ import Base.PermutedDimsArrays.invperm a_disk1 = permutedims(_DiskArray(rand(9,2,10), chunksize=(3,2,5)),p) test_broadcast(a_disk1) end + +@testset "Unchunked String arrays" begin + a = reshape(1:200000,200,1000) + b = string.(a) + c = collect(Union{Int,Missing},a) + + DiskArrays.default_chunk_size[] = 100 + DiskArrays.fallback_element_size[] = 100 + @test DiskArrays.estimate_chunksize(a) == (200,1000) + @test DiskArrays.eachchunk(a) == DiskArrays.GridChunks(a,(200,1000)) + @test DiskArrays.estimate_chunksize(b) == (200,1000) + @test DiskArrays.eachchunk(b) == DiskArrays.GridChunks(b,(200,1000)) + @test DiskArrays.estimate_chunksize(c) == (200,1000) + @test DiskArrays.eachchunk(c) == DiskArrays.GridChunks(c,(200,1000)) + DiskArrays.default_chunk_size[] = 1 + @test DiskArrays.estimate_chunksize(a) == (200,625) + @test DiskArrays.eachchunk(a) == DiskArrays.GridChunks(a,(200,625)) + @test DiskArrays.estimate_chunksize(b) == (200,50) + @test DiskArrays.eachchunk(b) == DiskArrays.GridChunks(b,(200,50)) + @test DiskArrays.estimate_chunksize(c) == (200,625) + @test DiskArrays.eachchunk(c) == DiskArrays.GridChunks(c,(200,625)) + DiskArrays.fallback_element_size[] = 1000 + @test DiskArrays.estimate_chunksize(a) == (200,625) + @test DiskArrays.eachchunk(a) == DiskArrays.GridChunks(a,(200,625)) + @test DiskArrays.estimate_chunksize(b) == (200,5) + @test DiskArrays.eachchunk(b) == DiskArrays.GridChunks(b,(200,5)) + @test DiskArrays.estimate_chunksize(c) == (200,625) + @test DiskArrays.eachchunk(c) 
== DiskArrays.GridChunks(c,(200,625)) +end + end