Skip to content

Commit

Permalink
Cpusummary Note (#133)
Browse files Browse the repository at this point in the history
* Adjust blocking behavior to cap cache use at L2e and L3e

* Precompile

* Bump version

* Don't use Aqua to test for ambiguities because of ForwardDiff

* More debug, add a check for first cache size being large enough

* Try to avoid Windows issue

* init before precomp

* add CPUSummary note

* take 2

* updates

* disable project_toml_formatting test

* Hopefully more reliable first_cache
  • Loading branch information
chriselrod authored Mar 9, 2022
1 parent 82a68e7 commit db713f3
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 31 deletions.
10 changes: 6 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
version = "0.3.12"
version = "0.3.13"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667"
Expand All @@ -15,14 +16,15 @@ ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "3.1.14"
ArrayInterface = "3.1.14, 5.0.1"
CPUSummary = "0.1.1 - 0.1.8, 0.1.14"
IfElse = "0.1"
LoopVectorization = "0.12.86"
ManualMemory = "0.1.1"
PolyesterWeave = "0.1.1"
Requires = "1"
Static = "0.2, 0.3, 0.4"
ThreadingUtilities = "0.4.6"
Static = "0.2, 0.3, 0.4, 0.6"
ThreadingUtilities = "0.5"
VectorizationBase = "0.21.15"
julia = "1.6"

Expand Down
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,16 @@
[ci-julia-nightly-img]: https://github.com/JuliaLinearAlgebra/Octavian.jl/workflows/CI%20(Julia%20nightly)/badge.svg "Continuous Integration (Julia nightly)"
[codecov-img]: https://codecov.io/gh/JuliaLinearAlgebra/Octavian.jl/branch/master/graph/badge.svg "Code Coverage"

Octavian.jl
is a multi-threaded BLAS-like library that provides pure Julia
To make sure that CPUSummary 0.1.11 and newer use `Hwloc`, you may want to run
```julia
julia> using CPUSummary

julia> CPUSummary.use_hwloc(true);
```
which will hopefully enable accurate hardware information. This is the default,
so it should typically be unnecessary.

Octavian.jl is a multi-threaded BLAS-like library that provides pure Julia
matrix multiplication on the CPU, built on top of
[LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl).

Expand Down
6 changes: 3 additions & 3 deletions src/Octavian.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ using Requires: @require
using VectorizationBase, ArrayInterface, LoopVectorization

using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_nsw, assume,
static_sizeof, StridedPointer, gesp, pause, pick_vector_width, has_feature,
cache_size, num_cores, cache_inclusive, cache_linesize
static_sizeof, StridedPointer, gesp, pause, pick_vector_width, has_feature
using CPUSummary: cache_size, num_cores, cache_inclusive, cache_linesize
using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
using ArrayInterface: size, strides, offsets, indices, axes, StrideIndex
using IfElse: ifelse
Expand All @@ -15,7 +15,7 @@ using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat
roundtostaticint, floortostaticint
using ManualMemory: MemoryBuffer, load, store!

using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait
using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait, SPIN

export StaticInt
export matmul!
Expand Down
29 changes: 14 additions & 15 deletions src/funcptrs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA
offset, K = load(p, Kd, offset)
offset, N = load(p, Nd, offset)
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
_atomic_store!(p, SPIN)
nothing
end
# `Val{false}` method of `_call_loopmul!`: drops the trailing `Val` flag and
# passes every other argument, in the same order, straight to `loopmul!`.
@inline function _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false})
    return loopmul!(C, A, B, α, β, M, K, N)
end
Expand Down Expand Up @@ -39,6 +40,7 @@ function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂})(
offset, id = load(p, ID, offset)
offset, total_ids = load(p, TT, offset)
sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
_atomic_store!(p, SPIN)
nothing
end

Expand All @@ -63,11 +65,17 @@ end
nothing
end

@inline function setup_syncmul!(
# Forwards to `launch` (imported from ThreadingUtilities) with `setup_matmul!`
# as the function argument and `tid` as the second argument, followed by the
# operands, scalars, sizes and a freshly constructed `Val{P}` flag.
@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P}
    pflag = Val{P}()
    return launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, pflag)
end

struct SyncMulLauncher{W₁, W₂, R₁, R₂} end
@inline function (::SyncMulLauncher{W₁, W₂, R₁, R₂})(
p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT
) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
fptr = cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}())
offset = store!(p, fptr, sizeof(UInt))
offset = store!(p, C, offset)
offset = store!(p, A, offset)
offset = store!(p, B, offset)
Expand All @@ -82,20 +90,11 @@ end
offset = store!(p, tt, offset)
nothing
end

# `Val{P}` method of `launch_thread_mul!`: calls `launch` with `setup_matmul!`,
# the `UInt32` id `tid`, and the remaining arguments passed through unchanged.
@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P}
launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}())
end
@inline function launch_thread_mul!(
C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt, ::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt,
::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
) where {W₁,W₂,R₁,R₂}
launch(tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, id, tt
Base.@_inline_meta
setup_syncmul!(
p, C, A, B, α, β, M, K, N, ap, bcp, id, tt,
StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
)
end
launch(SyncMulLauncher{W₁, W₂, R₁, R₂}(), tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt)
end


8 changes: 5 additions & 3 deletions src/global_constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,12 @@ R₁Default() = R₁Default(has_feature(Val(:x86_64_avx512f)))
# Zero-argument `R₂Default`: asks `has_feature` about `:x86_64_avx512f` and
# hands the answer to the one-argument `R₂Default` method.
function R₂Default()
    avx512f = has_feature(Val(:x86_64_avx512f))
    return R₂Default(avx512f)
end


# Architecture-based definition, chosen when this code is evaluated:
# `first_cache()` is `StaticInt{2}()` on x86_64 and i686, and
# `StaticInt{1}()` on every other architecture.
@static if Sys.ARCH === :x86_64 || Sys.ARCH === :i686
first_cache() = StaticInt{2}()
else
first_cache() = StaticInt{1}()
end

# `_first_cache` maps `StaticInt{1}` to `StaticInt{1}()` and any other
# `StaticInt` to `StaticInt{2}()`; the more specific method wins for 1.
_first_cache(::StaticInt{1}) = StaticInt{1}()
_first_cache(::StaticInt) = StaticInt{2}()
# `first_cache()` applies that mapping to `VectorizationBase.num_l2cache()`.
# NOTE(review): presumably a single L2 cache means L1 is used as the first
# cache — confirm against VectorizationBase/CPUSummary.
first_cache() = _first_cache(VectorizationBase.num_l2cache())
# `second_cache()` is `first_cache()` plus `One()`.
second_cache() = first_cache() + One()

# Adjust a first-cache size `fcs`: when `first_cache()` equals `StaticInt(2)`
# and `cache_inclusive(StaticInt(2))` holds, return `fcs - cache_size(One())`;
# otherwise return `fcs` unchanged. `ifelse` evaluates both candidates.
function _first_cache_size(fcs::StaticInt)
    second_level_inclusive = eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2))
    return ifelse(second_level_inclusive, fcs - cache_size(One()), fcs)
end
Expand Down
2 changes: 1 addition & 1 deletion test/aqua.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@testset "Aqua.jl" begin
# Aqua's own ambiguity check is switched off; ambiguities are instead checked
# directly with `Test.detect_ambiguities` at the end of this testset.
Aqua.test_all(Octavian, ambiguities=false)
# Same call with Aqua's `project_toml_formatting` check disabled as well.
Aqua.test_all(Octavian, ambiguities=false, project_toml_formatting=false)
@test isempty(Test.detect_ambiguities(Octavian))
end
6 changes: 3 additions & 3 deletions test/matmul_coverage.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Size lists for n, k and m. Only `n_values` includes 1.
# NOTE(review): presumably the matrix dimensions swept by the coverage tests —
# confirm against the code that consumes these lists.
n_values = [1, 10, 20, 50, 100, 150, 200]
k_values = [10, 20, 50, 100, 150, 200]
m_values = [10, 20, 50, 100, 150, 200]
# Same lists with 400 appended; being assigned later, these are the values in effect.
n_values = [1, 10, 20, 50, 100, 150, 200, 400]
k_values = [10, 20, 50, 100, 150, 200, 400]
m_values = [10, 20, 50, 100, 150, 200, 400]



Expand Down
6 changes: 6 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
import CPUSummary
# Increasing the number of threads must be done before importing Octavian.
# When Julia runs with more than one thread and the system reports more than
# one CPU thread, yet `CPUSummary.num_cores()` returns 1, redefine `num_cores`
# so that it returns a static 2.
if Threads.nthreads() > 1 && Sys.CPU_THREADS > 1 && CPUSummary.num_cores() == 1
CPUSummary.num_cores() = CPUSummary.static(2)
end

import Octavian

import Aqua
Expand Down

2 comments on commit db713f3

@chriselrod
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/56290

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.13 -m "<description of version>" db713f3310030041bfa55576a15fcbeb55585884
git push origin v0.3.13

Please sign in to comment.