diff --git a/README.md b/README.md
index ba151b5..16c9736 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,8 @@ Solve for...
 * Non-orthogonal unit cells
 * Inhomogeneous permittivity and/or permeability
 * Chern numbers of topological photonic crystals using [built-in Wilson loop methods](https://sp94.github.io/Peacock.jl/dev/how-tos/wilson_loops)
+* GPU support
+* TODO: CPU parallel support
 Focused on ease of use
diff --git a/benchmark/GPU_0.png b/benchmark/GPU_0.png
new file mode 100644
index 0000000..c598621
Binary files /dev/null and b/benchmark/GPU_0.png differ
diff --git a/benchmark/GPU_1.png b/benchmark/GPU_1.png
new file mode 100644
index 0000000..ef886f7
Binary files /dev/null and b/benchmark/GPU_1.png differ
diff --git a/benchmark/benchmark.md b/benchmark/benchmark.md
new file mode 100644
index 0000000..1d34edd
--- /dev/null
+++ b/benchmark/benchmark.md
@@ -0,0 +1,316 @@
+### Issue
+The computational cost becomes very large if we extend Peacock.jl to 3D or need a large cutoff. Parallel computing can significantly reduce the computing time. Can we parallelize the solver?
+
+### GPU parallel
+First, we can use [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) to run the computation on an Nvidia GPU. [CuSolver](https://docs.nvidia.com/cuda/cusolver/index.html) is CUDA's dense linear algebra library, which makes it easy to solve dense linear systems of the form `Ax = b`. We only need to convert a Julia `Array` to a `CuArray`; almost all functions of LinearAlgebra.jl can then run on the GPU:
+```julia
+# running on CPU
+a = rand(2, 2); b = rand(2, 2)
+c = a / b
+# running on GPU
+using CUDA
+a_d = CuArray(a); b_d = CuArray(b)
+c_d = a_d / b_d
+```
+Unfortunately, using CUDA to compute the eigenvalues and eigenvectors of a dense matrix may be inefficient ([discourse1](https://discourse.julialang.org/t/computing-eigenvalues-eigenvectors-using-gpu/12396), [discourse2](https://discourse.julialang.org/t/eigenvalues-for-lots-of-small-matrices-gpu-batched-vs-cpu-eigen/50792)). I also ran benchmarks:
+```
+julia> a = rand(100,100); b = rand(100,100);
+julia> @btime CUDA.CUSOLVER.syevjBatched!('V','U',CuArray(b)/CuArray(a));
+  36.093 ms (577 allocations: 11.58 KiB)
+
+julia> @btime eigen(a,b);
+  29.361 ms (21 allocations: 428.78 KiB)
+```
+
+It should be noted that the `CUDA.eigen` function of CUDA.jl only accepts a plain Julia `Array` and behaves just the same as `LinearAlgebra.eigen`. By the way, `syevjBatched!` only supports `Float32`/`Float64`.
+
+But we can still speed up Peacock.jl with CUDA.jl, since the GPU can solve `Ax = b` efficiently.
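+As a rough illustration of this hybrid strategy (a minimal sketch using stand-in random matrices, not Peacock.jl's actual operators), the dense solve `RHS \ LHS` can run on the GPU while the eigendecomposition runs on the CPU:
+```julia
+using CUDA, LinearAlgebra
+
+LHS = rand(ComplexF64, 225, 225)   # stand-in for the assembled left-hand side
+RHS = rand(ComplexF64, 225, 225)   # stand-in for the assembled right-hand side
+LHS_d, RHS_d = CuArray(LHS), CuArray(RHS)
+
+A_d = RHS_d \ LHS_d                # dense linear solve on the GPU
+vals, vecs = eigen(Array(A_d))     # copy back and diagonalise on the CPU
+```
+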
+I focus on the [square lattice](https://sp94.github.io/Peacock.jl/stable/tutorials/getting_started/) model from the tutorial:
+
+```julia
+using PyPlot
+using Peacock
+using LinearAlgebra
+using SparseArrays
+using BenchmarkTools
+using CUDA
+
+get_k = Peacock.get_k
+DiagonalMatrix = Peacock.DiagonalMatrix
+
+# Permittivity
+function epf(x,y)
+    # equation of a circle with radius 0.2a
+    if x^2+y^2 <= 0.2^2
+        # dielectric inside the circle
+        return 8.9
+    else
+        # air outside the circle
+        return 1
+    end
+end
+
+# Permeability is unity everywhere
+function muf(x,y)
+    return 1
+end
+
+a1 = [1, 0]  # first lattice vector
+a2 = [0, 1]  # second lattice vector
+d1 = 0.01  # resolution along first lattice vector
+d2 = 0.01  # resolution along second lattice vector
+geometry = Geometry(epf, muf, a1, a2, d1, d2)
+fourier_space_cutoff = 17  # greater than in the tutorial
+solver = Solver(geometry, fourier_space_cutoff)
+G = BrillouinZoneCoordinate(  0,   0, "Γ")
+X = BrillouinZoneCoordinate(1/2,   0, "X")
+M = BrillouinZoneCoordinate(1/2, 1/2, "M")
+ks = [G,X,M,G]
+ks = [typeof(x)==BrillouinZoneCoordinate ? get_k(x,solver.basis) : x for x in ks]
+
+polarisation = TE
+k = ks[1]
+basis = solver.basis
+epc, muc = solver.epc, solver.muc
+Kx = DiagonalMatrix(basis.kxs) + k[1]*I
+Ky = DiagonalMatrix(basis.kys) + k[2]*I
+Kx_d = CuArray(Kx); Ky_d = CuArray(Ky);
+epc_d = CuArray(epc); muc_d = CuArray(muc)
+```
+
+and run benchmarks like this:
+
+```
+julia> @btime LHS_d = Kx_d/epc_d*Kx_d + Ky_d/epc_d*Ky_d;
+  22.768 ms (20762 allocations: 337.38 KiB)
+
+julia> @btime LHS_d = Kx_d/epc_d*Kx_d + Ky_d/epc_d*Ky_d;
+  22.931 ms (18741 allocations: 303.83 KiB)
+
+julia> @btime LHS = Kx/epc*Kx + Ky/epc*Ky;
+  2.433 s (20 allocations: 6.96 MiB)
+
+julia> @btime LHS = Kx/epc*Kx + Ky/epc*Ky;
+  2.660 s (20 allocations: 6.96 MiB)
+
+julia> size(LHS)
+(225, 225)
+```
+
+The GPU-based computation is about 90 times faster than the CPU! The other benchmark tests `eigen`:
+
+```
+julia> @btime eigen(RHS \ LHS);
+  1.362 s (29 allocations: 3.55 MiB)
+
+julia> @btime eigen(LHS, RHS);
+  542.625 ms (19 allocations: 2.46 MiB)
+
+julia> @time eigen(Array(RHS_d \ LHS_d));
+  0.209265 seconds (17.91 k allocations: 3.053 MiB)
+```
+
+Although `eigen(LHS, RHS)` is more efficient than `eigen(RHS \ LHS)`, `eigen(Array(RHS_d \ LHS_d))` is still the fastest!
+
+Now we can modify the code of `solver.jl`.
+Only one function is added:
+
+```julia
+function solve(solver::Solver, k::AbstractVector{<:Real}, polarisation::Polarisation, GPU::Int; bands=:)
+    basis = solver.basis
+    if GPU == 1
+        epc, muc = CuArray(solver.epc), CuArray(solver.muc)
+        Kx = CuArray(DiagonalMatrix(basis.kxs) + k[1]*I)
+        Ky = CuArray(DiagonalMatrix(basis.kys) + k[2]*I)
+        if polarisation == TE
+            LHS = Kx/epc*Kx + Ky/epc*Ky
+            RHS = muc
+            label = L"H_z"
+        elseif polarisation == TM
+            LHS = Kx/muc*Kx + Ky/muc*Ky
+            RHS = epc
+            label = L"E_z"
+        end
+        freqs_squared, modes_data = eigen(Array(RHS \ LHS))
+    else
+        epc, muc = solver.epc, solver.muc
+        Kx = DiagonalMatrix(basis.kxs) + k[1]*I
+        Ky = DiagonalMatrix(basis.kys) + k[2]*I
+        if polarisation == TE
+            LHS = Kx/epc*Kx + Ky/epc*Ky
+            RHS = muc
+            label = L"H_z"
+        elseif polarisation == TM
+            LHS = Kx/muc*Kx + Ky/muc*Ky
+            RHS = epc
+            label = L"E_z"
+        end
+        freqs_squared, modes_data = try
+            eigen(LHS, RHS)
+        catch
+            eigen(RHS \ LHS)
+        end
+    end
+
+    freqs = sqrt.(freqs_squared)
+    # Sort by increasing frequency
+    idx = sortperm(freqs, by=real)
+    freqs = freqs[idx][bands]
+    modes_data = modes_data[:,idx][:,bands]
+    # Eigenmodes are weighted by the RHS of the generalised eigenvalue problem
+    weighting = RHS
+    modes = Mode[]
+    for i in 1:length(freqs)
+        mode = Mode(k, freqs[i], modes_data[:,i], weighting, basis, label)
+        push!(modes, mode)
+    end
+    return modes
+end
+```
+
+The last function of `band_diagrams.jl` is changed to:
+
+```julia
+function plot_band_diagram(solver::Solver, ks, polarisation::Polarisation, GPU::Int;
+        dk=nothing, labels=[], bands=1:10, frequency_scale=1, color="k", markersize=nothing)
+    # Convert BrillouinZoneCoordinate to labelled positions in k space
+    if labels == []
+        labels = [hasproperty(x,:label) ? x.label : "" for x in ks]
+    end
+    ks = [typeof(x)==BrillouinZoneCoordinate ? get_k(x,solver.basis) : x for x in ks]
+    # Wrap all the variables into a single function of k that returns frequencies
+    function my_solve(k)
+        modes = solve(solver, k, polarisation, GPU)
+        return [mode.frequency for mode in modes]
+    end
+    # Pass on to more general function
+    plot_band_diagram(my_solve, ks; dk=dk, labels=labels, bands=bands, frequency_scale=frequency_scale, color=color, markersize=markersize)
+end
+```
+
+Finally, we can run the last benchmark -- plotting a band diagram. The model is the same as before, except that `fourier_space_cutoff = 7`:
+
+```julia
+using PyPlot
+using Peacock
+using CUDA
+GPU = 1  # running on GPU
+
+function epf(x,y)
+    # equation of a circle with radius 0.2a
+    if x^2+y^2 <= 0.2^2
+        # dielectric inside the circle
+        return 8.9
+    else
+        # air outside the circle
+        return 1
+    end
+end
+
+# Permeability is unity everywhere
+function muf(x,y)
+    return 1
+end
+
+a1 = [1, 0]  # first lattice vector
+a2 = [0, 1]  # second lattice vector
+d1 = 0.01  # resolution along first lattice vector
+d2 = 0.01  # resolution along second lattice vector
+geometry = Geometry(epf, muf, a1, a2, d1, d2)
+fourier_space_cutoff = 7
+solver = Solver(geometry, fourier_space_cutoff)
+G = BrillouinZoneCoordinate(  0,   0, "Γ")
+X = BrillouinZoneCoordinate(1/2,   0, "X")
+M = BrillouinZoneCoordinate(1/2, 1/2, "M")
+ks = [G,X,M,G]
+```
+
+The results when running in the terminal:
+
+```
+julia> @time plot_band_diagram(solver, ks, TE, 1, color="red",
+           bands=1:4, dk=0.1, frequency_scale=1/2pi)
+  3.792055 seconds (510.63 k allocations: 124.970 MiB, 0.71% gc time, 0.14% compilation time)
+(0, 0.9109956614327619)
+
+julia> @time plot_band_diagram(solver, ks, TM, 1, color="red",
+           bands=1:4, dk=0.1, frequency_scale=1/2pi)
+  3.321212 seconds (571.62 k allocations: 125.828 MiB, 0.57% gc time)
+(0, 0.9109956614327619)
+
+julia> @time plot_band_diagram(solver, ks, TE, 0, color="red",
+           bands=1:4, dk=0.1, frequency_scale=1/2pi)
+ 29.343061 seconds (296.76 k allocations: 50.585 MiB, 0.03% gc time)
+(0, 0.9109956614327619)
+
+julia> @time plot_band_diagram(solver, ks, TM, 0, color="red",
+           bands=1:4, dk=0.1, frequency_scale=1/2pi)
+ 28.964005 seconds (294.21 k allocations: 50.534 MiB, 0.03% gc time)
+(0, 0.9109956614327619)
+```
+
+When running in the terminal, the diagram is drawn in real time, one k-point at a time, which may reduce efficiency. So I also ran these benchmarks in Jupyter:
+
+![](https://github.com/kabume/Peacock.jl/blob/master/benchmark/GPU_0.png)
+
+![](https://github.com/kabume/Peacock.jl/blob/master/benchmark/GPU_1.png)
+
+GPU-based computing is about 30 times faster than CPU-based computing.
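+
+As a possible refinement (not part of this patch), the GPU flag could be chosen automatically, falling back to the CPU when no usable device is present. A minimal sketch, where `solve_auto` is a hypothetical helper built on the `solve(solver, k, polarisation, GPU; bands=:)` method added above:
+
+```julia
+using CUDA
+using Peacock
+
+# Hypothetical convenience wrapper: use the GPU path only when CUDA is usable.
+function solve_auto(solver, k, polarisation; bands=:)
+    gpu = CUDA.functional() ? 1 : 0
+    return solve(solver, k, polarisation, gpu; bands=bands)
+end
+```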
+
+My `versioninfo()`:
+
+```
+Julia Version 1.6.0
+Commit f9720dc2eb (2021-03-24 12:55 UTC)
+Platform Info:
+  OS: Linux (x86_64-pc-linux-gnu)
+  CPU: Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
+  WORD_SIZE: 64
+  LIBM: libopenlibm
+  LLVM: libLLVM-11.0.1 (ORCJIT, skylake)
+```
+
+`CUDA.versioninfo()`:
+
+```
+CUDA toolkit 10.1.243, artifact installation
+CUDA driver 10.1.0
+NVIDIA driver 430.50.0
+
+Libraries:
+- CUBLAS: 10.2.1
+- CURAND: 10.1.1
+- CUFFT: 10.1.1
+- CUSOLVER: 10.2.0
+- CUSPARSE: 10.3.0
+- CUPTI: 12.0.0
+- NVML: 10.0.0+430.50
+- CUDNN: 8.0.4 (for CUDA 10.1.0)
+- CUTENSOR: 1.2.2 (for CUDA 10.1.0)
+
+Toolchain:
+- Julia: 1.6.0
+- LLVM: 11.0.1
+- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4
+- Device support: sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75
+
+1 device:
+  0: GeForce MX150 (sm_61, 957.250 MiB / 1.956 GiB available)
+```
+
+### CPU Parallel (TODO)
+
+We can also use multi-threading (`Threads.@threads`) or multi-processing (`using Distributed; @distributed`) to parallelize the `for` loop [here](https://github.com/sp94/Peacock.jl/blob/master/src/band_diagrams.jl#L56) (a threaded sketch of this loop is given at the end of this document):
+
+```julia
+for (x,k) in zip(xs,ks)
+    frequencies = my_solve(k)
+    xs_k = [x for _ in 1:length(frequencies[bands])]
+    ys = frequency_scale * frequencies[bands]
+    PyPlot.plot(xs_k, real(ys), ".", color=color, alpha=0.5, markersize=markersize)
+end
+```
+
diff --git a/benchmark/benchmark_GPU_Peacock.jl b/benchmark/benchmark_GPU_Peacock.jl
new file mode 100644
index 0000000..3cdf2d9
--- /dev/null
+++ b/benchmark/benchmark_GPU_Peacock.jl
@@ -0,0 +1,39 @@
+using PyPlot
+using Peacock
+using CUDA
+GPU = 1
+
+function epf(x,y)
+    # equation of a circle with radius 0.2a
+    if x^2+y^2 <= 0.2^2
+        # dielectric inside the circle
+        return 8.9
+    else
+        # air outside the circle
+        return 1
+    end
+end
+
+# Permeability is unity everywhere
+function muf(x,y)
+    return 1
+end
+
+a1 = [1, 0]  # first lattice vector
+a2 = [0, 1]  # second lattice vector
+d1 = 0.01  # resolution along first lattice vector
+d2 = 0.01  # resolution along second lattice vector
+geometry = Geometry(epf, muf, a1, a2, d1, d2)
+fourier_space_cutoff = 7
+solver = Solver(geometry, fourier_space_cutoff)
+G = BrillouinZoneCoordinate(  0,   0, "Γ")
+X = BrillouinZoneCoordinate(1/2,   0, "X")
+M = BrillouinZoneCoordinate(1/2, 1/2, "M")
+ks = [G,X,M,G]
+
+figure(figsize=(4,3))
+@time plot_band_diagram(solver, ks, TE, GPU, color="red",
+    bands=1:4, dk=0.1, frequency_scale=1/2pi)
+@time plot_band_diagram(solver, ks, TM, GPU, color="blue",
+    bands=1:4, dk=0.1, frequency_scale=1/2pi)
+ylim(0,0.8)
diff --git a/benchmark/test_GPU_Peacock.jl b/benchmark/test_GPU_Peacock.jl
new file mode 100644
index 0000000..7c8a9ae
--- /dev/null
+++ b/benchmark/test_GPU_Peacock.jl
@@ -0,0 +1,68 @@
+using PyPlot
+using Peacock
+using LinearAlgebra
+using SparseArrays
+using BenchmarkTools
+using CUDA
+
+get_k = Peacock.get_k
+DiagonalMatrix = Peacock.DiagonalMatrix
+
+# Permittivity
+function epf(x,y)
+    # equation of a circle with radius 0.2a
+    if x^2+y^2 <= 0.2^2
+        # dielectric inside the circle
+        return 8.9
+    else
+        # air outside the circle
+        return 1
+    end
+end
+
+# Permeability is unity everywhere
+function muf(x,y)
+    return 1
+end
+
+a1 = [1, 0]  # first lattice vector
+a2 = [0, 1]  # second lattice vector
+d1 = 0.01  # resolution along first lattice vector
+d2 = 0.01  # resolution along second lattice vector
+geometry = Geometry(epf, muf, a1, a2, d1, d2)
+fourier_space_cutoff = 17
+solver = Solver(geometry, fourier_space_cutoff)
+G 
= BrillouinZoneCoordinate( 0, 0, "Γ") +X = BrillouinZoneCoordinate(1/2, 0, "X") +M = BrillouinZoneCoordinate(1/2, 1/2, "M") +ks = [G,X,M,G] +ks = [typeof(x)==BrillouinZoneCoordinate ? get_k(x,solver.basis) : x for x in ks] + +polarisation = TE + +k = ks[1] +bands = 1:2 + +basis = solver.basis +epc, muc = solver.epc, solver.muc +Kx = DiagonalMatrix(basis.kxs) + k[1]*I +Ky = DiagonalMatrix(basis.kys) + k[2]*I +Kx_d = CuArray(Kx); Ky_d = CuArray(Ky); epc_d = CuArray(epc); muc_d = CuArray(muc); +if polarisation == TE + LHS = Kx/epc*Kx + Ky/epc*Ky + RHS = muc + LHS_d = CuArray(LHS) + RHS_d = CuArray(RHS) + LHS_dd = Kx_d/epc_d*Kx_d + Ky_d/epc_d*Ky_d + RHS_dd = muc_d + label = L"H_z" +elseif polarisation == TM + LHS = Kx/muc*Kx + Ky/muc*Ky + RHS = epc + label = L"E_z" +end +# Sometimes the generalised eigenvalue problem solver +# fails near Γ when the crystals are symmetric. +# In these cases, rewrite as a regular eigenvalue problem +@time freqs_squared, modes_data = eigen(RHS \ LHS); +@time freqs_squared, modes_data = eigen(Array(RHS_d \ LHS_d)); diff --git a/src/Peacock.jl b/src/Peacock.jl index cf719e2..f893d0c 100644 --- a/src/Peacock.jl +++ b/src/Peacock.jl @@ -2,6 +2,7 @@ module Peacock using PyPlot using LinearAlgebra, FFTW +using CUDA include("utils.jl") diff --git a/src/band_diagrams.jl b/src/band_diagrams.jl index 1368f00..7972789 100644 --- a/src/band_diagrams.jl +++ b/src/band_diagrams.jl @@ -58,7 +58,6 @@ function plot_band_diagram(my_solve::Function, ks; color="k", markersize=nothing, show_vlines=true) # Add labels xs = cumsum([0; norm.(diff(ks))]) - xs /= xs[end] xticks(xs, labels) if show_vlines for (x,label) in zip(xs,labels) @@ -70,7 +69,6 @@ function plot_band_diagram(my_solve::Function, ks; # Sample path ks, labels = sample_path(ks, labels=labels, dk=dk) xs = cumsum([0; norm.(diff(ks))]) - xs /= xs[end] # Solve and plot points for (x,k) in zip(xs,ks) frequencies = my_solve(k) @@ -85,14 +83,14 @@ end """ - plot_band_diagram(solver::Solver, ks, polarisation::Polarisation, ) + plot_band_diagram(solver::Solver, ks, polarisation::Polarisation, GPU::Int, ) Plot the bands generated by `solve(solver, k, polarisation)` along `ks`. Takes the same keyword arguments as [`plot_band_diagram(my_solve::Function, ks)`](@ref), but here `ks` can also include [`BrillouinZoneCoordinate`](@ref)s. """ -function plot_band_diagram(solver::Solver, ks, polarisation::Polarisation; +function plot_band_diagram(solver::Solver, ks, polarisation::Polarisation, GPU::Int; dk=nothing, labels=[], bands=1:10, frequency_scale=1, color="k", markersize=nothing) # Convert BrillouinZoneCoordinate to labelled positions in k space if labels == [] @@ -101,7 +99,7 @@ function plot_band_diagram(solver::Solver, ks, polarisation::Polarisation; ks = [typeof(x)==BrillouinZoneCoordinate ? get_k(x,solver.basis) : x for x in ks] # Wrap all the variables into a single function of k that returns frequencies function my_solve(k) - modes = solve(solver, k, polarisation) + modes = solve(solver, k, polarisation, GPU) return [mode.frequency for mode in modes] end # Pass on to more general function diff --git a/src/solver.jl b/src/solver.jl index 97d5a09..250f0e1 100644 --- a/src/solver.jl +++ b/src/solver.jl @@ -3,7 +3,6 @@ Transverse electric (TE) or transverse magnetic (TM) polarisation. 
""" @enum Polarisation TE TM - """ Solver(basis::PlaneWaveBasis, epc::Matrix{ComplexF64}, muc::Matrix{ComplexF64}) @@ -86,10 +85,68 @@ end """ - solve(solver::Solver, k::AbstractVector{<:Real}, polarisation::Polarisation; bands=:) + solve(solver::Solver, k::AbstractVector{<:Real}, polarisation::Polarisation, GPU::Int; bands=:) -Calculate the eigenmodes of a photonic crystal at position `k` in reciprocal space. +Calculate the eigenmodes of a photonic crystal at position `k` in reciprocal space. Add GPU support """ +function solve(solver::Solver, k::AbstractVector{<:Real}, polarisation::Polarisation, GPU::Int; bands=:) + # Get left and right hand sides of the generalised eigenvalue problem + basis = solver.basis + if GPU == 1 + epc, muc = CuArray(solver.epc), CuArray(solver.muc) + Kx = CuArray(DiagonalMatrix(basis.kxs) + k[1]*I) + Ky = CuArray(DiagonalMatrix(basis.kys) + k[2]*I) + if polarisation == TE + LHS = Kx/epc*Kx + Ky/epc*Ky + RHS = muc + label = L"H_z" + elseif polarisation == TM + LHS = Kx/muc*Kx + Ky/muc*Ky + RHS = epc + label = L"E_z" + end + # Sometimes the generalised eigenvalue problem solver + # fails near Γ when the crystals are symmetric. + # In these cases, rewrite as a regular eigenvalue problem + freqs_squared, modes_data = eigen(Array(RHS \ LHS)); + else + epc, muc = solver.epc, solver.muc + Kx = DiagonalMatrix(basis.kxs) + k[1]*I + Ky = DiagonalMatrix(basis.kys) + k[2]*I + if polarisation == TE + LHS = Kx/epc*Kx + Ky/epc*Ky + RHS = muc + label = L"H_z" + elseif polarisation == TM + LHS = Kx/muc*Kx + Ky/muc*Ky + RHS = epc + label = L"E_z" + end + # Sometimes the generalised eigenvalue problem solver + # fails near Γ when the crystals are symmetric. + # In these cases, rewrite as a regular eigenvalue problem + freqs_squared, modes_data = try + eigen(LHS, RHS) + catch + eigen(RHS \ LHS) + end + end + + freqs = sqrt.(freqs_squared) + # Sort by increasing frequency + idx = sortperm(freqs, by=real) + freqs = freqs[idx][bands] + modes_data = modes_data[:,idx][:,bands] + # Eigenmodes are weighted by the RHS of the generalised eigenvalue problem + weighting = RHS + modes = Mode[] + for i in 1:length(freqs) + mode = Mode(k, freqs[i], modes_data[:,i], weighting, basis, label) + push!(modes, mode) + end + return modes +end + function solve(solver::Solver, k::AbstractVector{<:Real}, polarisation::Polarisation; bands=:) # Get left and right hand sides of the generalised eigenvalue problem basis = solver.basis @@ -113,6 +170,7 @@ function solve(solver::Solver, k::AbstractVector{<:Real}, polarisation::Polarisa catch eigen(RHS \ LHS) end + freqs = sqrt.(freqs_squared) # Sort by increasing frequency idx = sortperm(freqs, by=real)
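
As a follow-up to the `### CPU Parallel (TODO)` section of `benchmark/benchmark.md` above, here is a minimal multi-threaded sketch of that plotting loop. It is only an illustration, not part of this patch: it assumes Julia was started with multiple threads (e.g. `julia -t 4`), and it collects the frequencies in parallel before plotting serially, since PyPlot (matplotlib) is generally not thread-safe.

```julia
using Base.Threads
using PyPlot

# Threaded sketch of the band-diagram loop: `my_solve`, `xs`, `ks`, `bands`,
# `frequency_scale`, `color`, and `markersize` are assumed to be defined as in
# `plot_band_diagram(my_solve::Function, ks; ...)`.
results = Vector{Any}(undef, length(ks))
@threads for i in 1:length(ks)
    results[i] = my_solve(ks[i])   # each k-point is independent, so solve in parallel
end
# Plot serially once all frequencies are available
for (x, frequencies) in zip(xs, results)
    ys = frequency_scale * frequencies[bands]
    xs_k = [x for _ in 1:length(ys)]
    PyPlot.plot(xs_k, real(ys), ".", color=color, alpha=0.5, markersize=markersize)
end
```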