diff --git a/Project.toml b/Project.toml
index 985523ca..239245ec 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "JutulDarcy"
 uuid = "82210473-ab04-4dce-b31b-11573c4f8e0a"
 authors = ["Olav Møyner <olav.moyner@gmail.com>"]
-version = "0.2.35"
+version = "0.2.36"
 
 [deps]
 AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
@@ -34,6 +34,8 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
 
 [weakdeps]
+AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
@@ -41,6 +43,8 @@ Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
 
 [extensions]
+JutulDarcyAMGXExt = "AMGX"
+JutulDarcyCUDAExt = "CUDA"
 JutulDarcyGLMakieExt = "GLMakie"
 JutulDarcyMakieExt = "Makie"
 JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"]
@@ -48,6 +52,8 @@ JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"]
 [compat]
 AlgebraicMultigrid = "0.5.1, 0.6.0"
 Artifacts = "1"
+AMGX = "0.2"
+CUDA = "5"
 DataStructures = "0.18.13"
 Dates = "1"
 DelimitedFiles = "1.6"
@@ -56,7 +62,7 @@ ForwardDiff = "0.10.30"
 GLMakie = "0.10.13"
 GeoEnergyIO = "1.1.12"
 HYPRE = "1.6.0, 1.7"
-Jutul = "0.2.40"
+Jutul = "0.2.42"
 Krylov = "0.9.1"
 LazyArtifacts = "1"
 LinearAlgebra = "1"
@@ -78,6 +84,8 @@ Tullio = "0.3.4"
 julia = "1.7"
 
 [extras]
+AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
@@ -87,4 +95,4 @@ TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
 TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
 
 [targets]
-test = ["Test", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
+test = ["Test", "CUDA", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
diff --git a/docs/make.jl b/docs/make.jl
index a024419d..49f3e800 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -137,6 +137,7 @@ function build_jutul_darcy_docs(build_format = nothing;
                     ],
             "Further reading" => [
                 "man/advanced/mpi.md",
+                "man/advanced/gpu.md",
                 "man/advanced/compiled.md",
                 "Jutul functions" => "ref/jutul.md",
                 "Bibliography" => "extras/refs.md"
diff --git a/docs/src/index.md b/docs/src/index.md
index 97f127d6..d803fe5c 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -33,8 +33,8 @@ features:
     link: /examples/intro_sensitivities
 
   - icon: 🏃
-    title: High performance 
-    details: Fast execution with support for MPI and thread parallelism
+    title: High performance on CPU & GPU
+    details: Fast execution with support for MPI, CUDA and thread parallelism
     link: /man/advanced/mpi
 ---
 ````
diff --git a/docs/src/man/advanced/gpu.md b/docs/src/man/advanced/gpu.md
new file mode 100644
index 00000000..6d1f3391
--- /dev/null
+++ b/docs/src/man/advanced/gpu.md
@@ -0,0 +1,57 @@
+# GPU support
+
+JutulDarcy includes experimental support for running linear solves on the GPU. For many simulations, the linear systems are the most compute-intensive part and a natural choice for acceleration. At the moment, the support is limited to CUDA GPUs through [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl). For the most efficient CPR preconditioner, [AMGX.jl](https://github.com/JuliaGPU/AMGX.jl) is required which is currently limited to Linux systems. Windows users may have luck by running Julia inside [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+
+## How to use
+
+If you have installed JutulDarcy, you should start by adding the CUDA and optionally the AMGX packages using the package manager:
+
+```julia
+using Pkg
+Pkg.add("CUDA") # Requires a CUDA-capable GPU
+Pkg.add("AMGX") # Requires CUDA + Linux
+```
+
+Once the packages have been added to the same environment as JutulDarcy, you can load them to enable GPU support. Let us grab the first ten steps of the EGG benchmark model:
+
+```julia
+using Jutul, JutulDarcy
+dpth = JutulDarcy.GeoEnergyIO.test_input_file_path("EGG", "EGG.DATA")
+case = setup_case_from_data_file(dpth)
+case = case[1:10]
+```
+
+### Running on CPU
+
+If we wanted to run this on CPU we would simply call `simulate_reservoir`:
+
+```julia
+result_cpu = simulate_reservoir(case);
+```
+
+### Running on GPU with block ILU(0)
+
+If we now load `CUDA` we can run the same simulation using the CUDA-accelerated linear solver. By itself, CUDA only supports the ILU(0) preconditioner. JutulDarcy will automatically pick this preconditioner when CUDA is requested without AMGX, but we write it explicitly here:
+
+```julia
+using CUDA
+result_ilu0_cuda = simulate_reservoir(case, linear_solver_backend = :cuda, precond = :ilu0);
+```
+
+### Running on GPU with CPR AMGX-ILU(0)
+
+Loading the AMGX package makes a pure GPU-based two-stage CPR available. Again, we are explicit in requesting CPR, but if both `CUDA` and `AMGX` are available and functional this is redundant:
+
+```julia
+using AMGX
+result_amgx_cuda = simulate_reservoir(case, linear_solver_backend = :cuda, precond = :cpr);
+```
+
+In short, load `AMGX` and `CUDA` and run `simulate_reservoir(case, linear_solver_backend = :cuda)` to get GPU results. The EGG model is quite small, so if you want to see significant performance increases, a larger case will be necessary. `AMGX` also contains a large number of options that can be configured for advanced users.
+
+## Technical details and limitations
+
+The GPU implementation relies on assembly on CPU and pinned memory to transfer onto the CPU. This means that the performance can be significantly improved by launching Julia with multiple threads to speed up the non-GPU parts of the code. AMGX is currently single-GPU only and does not work with MPI. Currently, only `Float64` is supported for CPR, but pure ILU(0) solves support `Float32` as well.
+
+!!! warning "Experimental status"
+    Multiple successive runs with different `AMGX` instances have resulted in crashes when old instances are garbage collected. This part of the code is still considered experimental, with contributions welcome if you are using it.
diff --git a/docs/src/man/advanced/mpi.md b/docs/src/man/advanced/mpi.md
index 7e92cd32..a3677aba 100644
--- a/docs/src/man/advanced/mpi.md
+++ b/docs/src/man/advanced/mpi.md
@@ -4,7 +4,7 @@ JutulDarcy can use threads by default, but advanced options can improve performa
 
 ## Overview of parallel support
 
-There are two main ways of exploiting multiple cores in Jutul/JutulDarcy: Threads are automatically used for assembly and can be used for parts of the linear solve. If you require the best performance, you have to go to MPI where the linear solvers can use a parallel [BoomerAMG preconditioner](https://hypre.readthedocs.io/en/latest/solvers-boomeramg.html) via [HYPRE.jl](https://github.com/fredrikekre/HYPRE.jl).
+There are two main ways of exploiting multiple cores in Jutul/JutulDarcy: Threads are automatically used for assembly and can be used for parts of the linear solve. If you require the best performance, you have to go to MPI where the linear solvers can use a parallel [BoomerAMG preconditioner](https://hypre.readthedocs.io/en/latest/solvers-boomeramg.html) via [HYPRE.jl](https://github.com/fredrikekre/HYPRE.jl). In addition, there is experimental GPU support described in [GPU support](@ref).
 
 ### MPI parallelization
 
@@ -25,7 +25,6 @@ Starting Julia with multiple threads (for example `julia --project. --threads=4`
 
 Threads are easy to use and can give a bit of benefit for large models.
 
-
 ### Mixed-mode parallelism
 
 You can mix the two approaches: Adding multiple threads to each MPI process can use threads to speed up assembly and property evaluations.
@@ -42,7 +41,6 @@ A few hints when you are looking at performance:
 
 Example: 200k cell model on laptop: 1 process 235 s -> 4 processes 145s
 
-
 ## Solving with MPI in practice
 
 There are a few adjustments needed before a script can be run in MPI.
diff --git a/examples/co2_sloped.jl b/examples/co2_sloped.jl
index d2d403b1..c61c8615 100644
--- a/examples/co2_sloped.jl
+++ b/examples/co2_sloped.jl
@@ -133,7 +133,7 @@ krog_t = so.^2
 krog = PhaseRelativePermeability(so, krog_t, label = :og)
 
 # Higher resolution for second table:
-sg = range(0, 1, 50)
+sg = range(0, 1, 50);
 
 # Evaluate Brooks-Corey to generate tables:
 tab_krg_drain = brooks_corey_relperm.(sg, n = 2, residual = 0.1)
@@ -251,7 +251,7 @@ wd, states, t = simulate_reservoir(state0, model, dt,
     parameters = parameters,
     forces = forces,
     max_timestep = 90day
-)
+);
 # ## Plot the CO2 mole fraction
 # We plot the overall CO2 mole fraction. We scale the color range to log10 to
 # account for the fact that the mole fraction in cells made up of only the
@@ -339,7 +339,8 @@ for cell in 1:nc
     x, y, z = centers[:, cell]
     is_inside[cell] = sqrt((x - 720.0)^2 + 20*(z-70.0)^2) < 75
 end
-plot_cell_data(mesh, is_inside)
+fig, ax, plt = plot_cell_data(mesh, is_inside)
+fig
 # ## Plot inventory in ellipsoid
 # Note that a small mobile dip can be seen when free CO2 passes through this region.
 inventory = co2_inventory(model, wd, states, t, cells = findall(is_inside))
diff --git a/examples/data_input_file.jl b/examples/data_input_file.jl
index 04d6088c..cd18867e 100644
--- a/examples/data_input_file.jl
+++ b/examples/data_input_file.jl
@@ -5,14 +5,19 @@
 # regular JutulDarcy simulation, allowing modification and use of the case in
 # differentiable workflows.
 #
-# We begin by loading the SPE9 dataset via the GeoEnergyIO package.
+# We begin by loading the SPE9 dataset via the GeoEnergyIO package. This package
+# includes a set of open datasets that can be used for testing and benchmarking.
+# The SPE9 dataset is a 3D model with a corner-point grid and a set of wells
+# produced by the Society of Petroleum Engineers. The specific version of the
+# file included here is taken from the [OPM
+# tests](https://github.com/OPM/opm-tests) repository.
 using JutulDarcy, GeoEnergyIO
-pth = GeoEnergyIO.test_input_file_path("SPE9", "SPE9.DATA")
+pth = GeoEnergyIO.test_input_file_path("SPE9", "SPE9.DATA");
 # ## Set up and run a simulation
 # If we do not need the case, we could also have done:
 # ws, states = simulate_data_file(pth)
 case = setup_case_from_data_file(pth)
-ws, states = simulate_reservoir(case)
+ws, states = simulate_reservoir(case);
 # ## Show the input data
 # The input data takes the form of a Dict:
 case.input_data
@@ -20,13 +25,22 @@ case.input_data
 # as a Dict.
 case.input_data["RUNSPEC"]
 # ## Plot the simulation model
-# These plot are interactive when run outside of the documentations.
+# These plot are normally interactive, but if you are reading the published
+# online documentation static screenshots will be inserted instead.
 using GLMakie
 plot_reservoir(case.model, states)
 # ## Plot the well responses
 # We can plot the well responses (rates and pressures) in an interactive viewer.
+# Multiple wells can be plotted simultaneously, with options to select which
+# units are to be used for plotting.
 plot_well_results(ws)
 # ## Plot the field responses
 # Similar to the wells, we can also plot field-wide measurables. We plot the
 # field gas production rate and the average pressure as the initial selection.
+# If you are running this case interactively you can select which measurables to
+# plot.
+#
+# We observe that the field pressure steadily decreases over time, as a result
+# of the gas production. The drop in pressure is not uniform, as during the
+# period where little gas is produced, the decrease in field pressure is slower.
 plot_reservoir_measurables(case, ws, states, left = :fgpr, right = :pres)
diff --git a/examples/five_spot_ensemble.jl b/examples/five_spot_ensemble.jl
index 632e530c..774a7502 100644
--- a/examples/five_spot_ensemble.jl
+++ b/examples/five_spot_ensemble.jl
@@ -4,8 +4,7 @@
 # injector in one corner and the producer in the opposing corner, with a
 # significant volume of fluids injected into the domain.
 using JutulDarcy, Jutul
-nx = 50
-#-
+nx = 50;
 # ## Setup
 # We define a function that, for a given porosity field, computes a solution
 # with an estimated permeability field. For assumptions and derivation of the
@@ -55,10 +54,9 @@ function simulate_qfs(porosity = 0.2)
     forces = setup_reservoir_forces(model, control = controls)
     return simulate_reservoir(state0, model, dt, parameters = parameters, forces = forces, info_level = -1)
 end
-#-
 # ## Simulate base case
 # This will give the solution with uniform porosity of 0.2.
-ws, states, report_time = simulate_qfs()
+ws, states, report_time = simulate_qfs();
 # ### Plot the solution of the base case
 # We observe a radial flow pattern initially, before coning occurs near the
 # producer well once the fluid has reached the opposite corner. The uniform
@@ -75,7 +73,6 @@ ax = Axis(fig[1, 2])
 h = contourf!(ax, get_sat(states[nt]))
 Colorbar(fig[1, end+1], h)
 fig
-#-
 # ## Create 10 realizations
 # We create a small set of realizations of the same model, with porosity that is
 # uniformly varying between 0.05 and 0.3. This is not especially sophisticated
@@ -89,11 +86,10 @@ wells = []
 report_step = nt
 for i = 1:N
     poro = 0.05 .+ 0.25*rand(Float64, (nx*nx))
-    ws, states, rt = simulate_qfs(poro)
-    push!(wells, ws)
-    push!(saturations, get_sat(states[report_step]))
+    ws_i, states_i, rt = simulate_qfs(poro)
+    push!(wells, ws_i)
+    push!(saturations, get_sat(states_i[report_step]))
 end
-#-
 # ### Plot the oil rate at the producer over the ensemble
 using Statistics
 fig = Figure()
@@ -106,7 +102,6 @@ end
 xlims!(ax, [mean(report_time), report_time[end]])
 ylims!(ax, 0, 0.0075)
 fig
-#-
 # ### Plot the average saturation over the ensemble
 avg = mean(saturations)
 fig = Figure()
@@ -114,7 +109,6 @@ h = nothing
 ax = Axis(fig[1, 1])
 h = contourf!(ax, avg)
 fig
-#-
 # ### Plot the isocontour lines over the ensemble
 fig = Figure()
 h = nothing
diff --git a/examples/model_coarsening.jl b/examples/model_coarsening.jl
index ebb951ac..6a8c4e5e 100644
--- a/examples/model_coarsening.jl
+++ b/examples/model_coarsening.jl
@@ -27,7 +27,7 @@ fine_case = setup_case_from_data_file(data_pth);
 fine_model = fine_case.model
 fine_reservoir = reservoir_domain(fine_model)
 fine_mesh = physical_representation(fine_reservoir)
-ws, states = simulate_reservoir(fine_case, info_level = 1);
+ws, states = simulate_reservoir(fine_case, info_level = -1);
 # ## Coarsen the model and plot partition
 # We coarsen the model using a partition size of 20x20x2 and the IJK method
 # where the underlying structure of the mesh is used to subdivide the blocks.
diff --git a/examples/optimize_simple_bl.jl b/examples/optimize_simple_bl.jl
index ea82952e..dd792a9e 100644
--- a/examples/optimize_simple_bl.jl
+++ b/examples/optimize_simple_bl.jl
@@ -1,12 +1,13 @@
-using Jutul
-using JutulDarcy
-using LinearAlgebra
-using GLMakie
 # # Example demonstrating optimzation of parameters against observations
 # We create a simple test problem: A 1D nonlinear displacement. The observations
 # are generated by solving the same problem with the true parameters. We then
 # match the parameters against the observations using a different starting guess
 # for the parameters, but otherwise the same physical description of the system.
+using Jutul
+using JutulDarcy
+using LinearAlgebra
+using GLMakie
+
 function setup_bl(;nc = 100, time = 1.0, nstep = 100, poro = 0.1, perm = 9.8692e-14)
     T = time
     tstep = repeat([T/nstep], nstep)
@@ -39,10 +40,10 @@ poro_ref = 0.1
 perm_ref = 9.8692e-14
 # ## Set up and simulate reference
 model_ref, state0_ref, parameters_ref, forces, tstep = setup_bl(nc = N, nstep = Nt, poro = poro_ref, perm = perm_ref)
-states_ref, = simulate(state0_ref, model_ref, tstep, parameters = parameters_ref, forces = forces, info_level = -1)
+states_ref, = simulate(state0_ref, model_ref, tstep, parameters = parameters_ref, forces = forces, info_level = -1);
 # ## Set up another case where the porosity is different
 model, state0, parameters, = setup_bl(nc = N, nstep = Nt, poro = 2*poro_ref, perm = 1.0*perm_ref)
-states, rep = simulate(state0, model, tstep, parameters = parameters, forces = forces, info_level = -1)
+states, rep = simulate(state0, model, tstep, parameters = parameters, forces = forces, info_level = -1);
 # ## Plot the results
 fig = Figure()
 ax = Axis(fig[1, 1], title = "Saturation")
diff --git a/examples/relperms.jl b/examples/relperms.jl
index 50aacc4b..6f5eed99 100644
--- a/examples/relperms.jl
+++ b/examples/relperms.jl
@@ -316,7 +316,6 @@ lines(simulate_bl(model, prm), axis = (title = "Parametric LET function simulati
 # ### Check out the parameters
 # The LET parameters are now numerical parameters in the reservoir:
 rmodel = reservoir_model(model)
-display(rmodel)
 
 # ## Conclusion
 # We have explored a few aspects of relative permeabilities in JutulDarcy. There
diff --git a/examples/two_phase_buckley_leverett.jl b/examples/two_phase_buckley_leverett.jl
index f085061c..d5b05ed7 100644
--- a/examples/two_phase_buckley_leverett.jl
+++ b/examples/two_phase_buckley_leverett.jl
@@ -52,7 +52,6 @@ end
 n, n_f = 100, 1000
 states, model, report = solve_bl(nc = n)
 print_stats(report)
-#-
 # ## Run refined version (1000 cells, 1000 steps)
 # Using a grid with 100 cells will not yield a fully converged solution. We can
 # increase the number of cells at the cost of increasing the runtime a bit. Note
@@ -61,7 +60,6 @@ print_stats(report)
 # use an iterative solver.
 states_refined, _, report_refined = solve_bl(nc = n_f);
 print_stats(report_refined)
-#-
 # ## Plot results
 # We plot the saturation front for the base case at different times together
 # with the final solution for the refined model. In this case, refining the grid
diff --git a/examples/two_phase_gravity_segregation.jl b/examples/two_phase_gravity_segregation.jl
index f1ad601f..652c31e7 100644
--- a/examples/two_phase_gravity_segregation.jl
+++ b/examples/two_phase_gravity_segregation.jl
@@ -7,14 +7,13 @@
 # ## Problem set up
 # We define a simple 1D gravity column with an approximate 10-1 ratio in density
 # between the two compressible phases and let it simulate until equilibrium is
-# reached.
+# reached. We begin by defining the reservoir domain itself.
 using JutulDarcy, Jutul
 nc = 100
 Darcy, bar, kg, meter, day = si_units(:darcy, :bar, :kilogram, :meter, :day)
 
 g = CartesianMesh((1, 1, nc), (1.0, 1.0, 10.0))
-domain = reservoir_domain(g, permeability = 1.0*Darcy)
-#-
+domain = reservoir_domain(g, permeability = 1.0*Darcy);
 # ## Fluid properties
 # Define two phases liquid and vapor with a 10-1 ratio reference densities and
 # set up the simulation model.
@@ -25,26 +24,28 @@ rhoVS = 100.0*kg/meter^3
 cl, cv = 1e-5/bar, 1e-4/bar
 L, V = LiquidPhase(), VaporPhase()
 sys = ImmiscibleSystem([L, V])
-model = SimulationModel(domain, sys)
-#-
+model = SimulationModel(domain, sys);
 # ### Definition for phase mass densities
 # Replace default density with a constant compressibility function that uses the
 # reference values at the initial pressure.
 density = ConstantCompressibilityDensities(sys, p0, [rhoLS, rhoVS], [cl, cv])
-set_secondary_variables!(model, PhaseMassDensities = density)
+set_secondary_variables!(model, PhaseMassDensities = density);
 # ### Set up initial state
 # Put heavy phase on top and light phase on bottom. Saturations have one value
 # per phase, per cell and consequently a per-cell instantiation will require a
-# two by number of cells matrix as input.
+# two by number of cells matrix as input. We also set up time-steps for the
+# simulation, using the provided conversion factor to convert days into seconds.
 nl = nc ÷ 2
 sL = vcat(ones(nl), zeros(nc - nl))'
 s0 = vcat(sL, 1 .- sL)
 state0 = setup_state(model, Pressure = p0, Saturations = s0)
-# Convert time-steps from days to seconds
-timesteps = repeat([0.02]*day, 150)
-#-
-## Perform simulation
-states, report = simulate(state0, model, timesteps, info_level = -1)
+timesteps = repeat([0.02]*day, 150);
+# ## Perform simulation
+# We simulate the system using the default linear solver and otherwise default
+# options. Using `simulate` with the default options means that no dynamic
+# timestepping will be used, and the simulation will report on the exact 150
+# steps defined above.
+states, report = simulate(state0, model, timesteps, info_level = -1);
 # ## Plot results
 # The 1D nature of the problem allows us to plot all timesteps simultaneously in
 # 2D. We see that the heavy fluid, colored blue, is initially at the top of the
diff --git a/examples/two_phase_unstable_gravity.jl b/examples/two_phase_unstable_gravity.jl
index be9704a3..e95c5f0d 100644
--- a/examples/two_phase_unstable_gravity.jl
+++ b/examples/two_phase_unstable_gravity.jl
@@ -17,7 +17,7 @@ nx = nz = 100;
 # ## Define the domain
 D = 10.0
 g = CartesianMesh((nx, 1, nz), (D, 1.0, D))
-domain = reservoir_domain(g)
+domain = reservoir_domain(g);
 # ## Set up model and properties
 Darcy, bar, kg, meter, day = si_units(:darcy, :bar, :kilogram, :meter, :day)
 p0 = 100*bar
diff --git a/examples/wells_intro.jl b/examples/wells_intro.jl
index 695b55b5..a8b17aea 100644
--- a/examples/wells_intro.jl
+++ b/examples/wells_intro.jl
@@ -11,7 +11,7 @@ using JutulDarcy, Jutul
 # `JutulDarcy` uses SI units internally. It is therefore convenient to define a
 # few constants at the start of the script to have more managable numbers later
 # on.
-Darcy, bar, kg, meter, day = si_units(:darcy, :bar, :kilogram, :meter, :day)
+Darcy, bar, kg, meter, day = si_units(:darcy, :bar, :kilogram, :meter, :day);
 
 # ## Defining a porous medium
 # We start by defining the static part of our simulation problem -- the porous medium itself.
@@ -44,10 +44,10 @@ g = CartesianMesh(dims, (2000.0, 1500.0, 50.0))
 # are also automatically added:
 nlayer = nx*ny # Cells in each layer
 K = vcat(
-    repeat([0.65], nlayer),
-    repeat([0.3], nlayer),
-    repeat([0.5], nlayer),
-    repeat([0.2], nlayer)
+    fill(0.65, nlayer),
+    fill(0.3, nlayer),
+    fill(0.5, nlayer),
+    fill(0.2, nlayer)
     )*Darcy
 
 domain = reservoir_domain(g, permeability = K, porosity = 0.2)
@@ -166,7 +166,7 @@ forces = setup_reservoir_forces(model, control = controls)
 # We are finally ready to simulate the model for the given initial state
 # `state0`, report steps `dt`, `parameters` and forces. As the model is small,
 # barring any compilation time, this should run in about 300 ms.
-result = simulate_reservoir(state0, model, dt, parameters = parameters, forces = forces)
+result = simulate_reservoir(state0, model, dt, parameters = parameters, forces = forces);
 # ### Unpacking the result
 # The result contains a lot of data. This can be unpacked to get the most
 # typical desired outputs: Well responses, reservoir states and the time they
@@ -184,29 +184,32 @@ using GLMakie
 # from liquid to gas as the front propagate through the domain and hits the
 # producer well.
 # Gas rates
-qg = wd[:Producer][:grat]
+qg = wd[:Producer][:grat];
 # Total rate
-qt = wd[:Producer][:rate]
+qt = wd[:Producer][:rate];
 # Compute liquid rate and plot
 ql = qt - qg
 x = t/day
 fig = Figure()
-ax = Axis(fig[1, 1], xlabel = "Time (days)",
-                     ylabel = "Rate (m³/day)",
-                     title = "Well production rates")
+ax = Axis(fig[1, 1],
+    xlabel = "Time (days)",
+    ylabel = "Rate (m³/day)",
+    title = "Well production rates"
+)
 lines!(ax, x, abs.(qg).*day, label = "Gas")
 lines!(ax, x, abs.(ql).*day, label = "Liquid")
 lines!(ax, x, abs.(qt).*day, label = "Total")
 axislegend(position = :rb)
 fig
-#-
 # ## Plot bottom hole pressure of the injector
 # The pressure builds during injection, until the gas breaks through to the
 # other well.
 bh = wd[:Injector][:bhp]
 fig = Figure()
-ax = Axis(fig[1, 1], xlabel = "Time (days)",
-                     ylabel = "Bottom hole pressure (bar)",
-                     title = "Injector bottom hole pressure")
+ax = Axis(fig[1, 1],
+    xlabel = "Time (days)",
+    ylabel = "Bottom hole pressure (bar)",
+    title = "Injector bottom hole pressure"
+)
 lines!(ax, x, bh./bar)
 fig
diff --git a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
new file mode 100644
index 00000000..b1df4954
--- /dev/null
+++ b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
@@ -0,0 +1,15 @@
+module JutulDarcyAMGXExt
+    using JutulDarcy, Jutul, AMGX, AMGX.CUDA, LinearAlgebra, SparseArrays
+    import JutulDarcy: AMGXPreconditioner
+    import Jutul: @tic
+
+    timeit_debug_enabled() = Jutul.timeit_debug_enabled()
+
+    include("cpr.jl")
+    include("precond.jl")
+
+    function __init__()
+        # TODO: Figure out a way to not always do this?
+        AMGX.initialize()
+    end
+end
diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
new file mode 100644
index 00000000..57510048
--- /dev/null
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -0,0 +1,115 @@
+function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv, cpr::JutulDarcy.CPRPreconditioner, recorder)
+    do_p_update = JutulDarcy.should_update_cpr(cpr, recorder, :amg)
+    already_setup = haskey(amgx.data, :storage)
+    if already_setup
+        if do_p_update || cpr.partial_update
+            pb = amgx.data[:buffer_p]
+            s = amgx.data[:storage]
+            A_gpu = s.matrix
+            @assert nnz(A) == nnz(A_gpu)
+            @tic "AMGX coefficients" AMGX.replace_coefficients!(A_gpu, A.At.nzval)
+        end
+    else
+        if Tv == Float64
+            amgx_mode = AMGX.dDDI
+        else
+            amgx_mode = AMGX.dFFI
+        end
+        n, m = size(A)
+        @assert n == m
+        s = AMGXStorage(amgx.settings, amgx_mode)
+        # RHS and solution vectors to right size just in case
+        AMGX.set_zero!(s.x, n)
+        AMGX.set_zero!(s.r, n)
+
+        row_ptr = Cint.(A.At.colptr .- 1)
+        colval = Cint.(A.At.rowval .- 1)
+        nzval = A.At.nzval
+        # TODO: Support for other types than Float64, should be done in setup of
+        # pressure system
+        AMGX.pin_memory(nzval)
+        AMGX.upload!(s.matrix,
+            row_ptr,
+            colval,
+            nzval
+        )
+        amgx.data[:nzval] = A.At.nzval
+        amgx.data[:storage] = s
+        amgx.data[:block_size] = 1
+        amgx.data[:n] = n
+        amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n)
+    end
+    if do_p_update
+        @tic "AMGX setup" if already_setup && amgx.resetup
+            AMGX.resetup!(s.solver, s.matrix)
+        else
+            AMGX.setup!(s.solver, s.matrix)
+        end
+    end
+    return amgx
+end
+
+mutable struct AMGXStorage{C, R, V, M, S}
+    config::C
+    resources::R
+    r::V
+    x::V
+    matrix::M
+    solver::S
+    function AMGXStorage(settings::AbstractDict, amgx_mode = AMGX.dDDI)
+        config = AMGX.Config(settings)
+        resources = AMGX.Resources(config)
+        r = AMGX.AMGXVector(resources, amgx_mode)
+        x = AMGX.AMGXVector(resources, amgx_mode)
+        matrix = AMGX.AMGXMatrix(resources, amgx_mode)
+        solver = AMGX.Solver(resources, amgx_mode, config)
+        function finalize_storage!(amgx_s::AMGXStorage)
+            close(amgx_s.solver)
+            close(amgx_s.x)
+            close(amgx_s.r)
+            close(amgx_s.matrix)
+            close(amgx_s.resources)
+            close(amgx_s.config)
+        end
+        s = new{
+            typeof(config),
+            typeof(resources),
+            typeof(r),
+            typeof(matrix),
+            typeof(solver)
+        }(config, resources, r, x, matrix, solver)
+        return finalizer(finalize_storage!, s)
+    end
+end
+
+function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
+    Jutul.apply!(r_p, amgx, r_p)
+end
+
+function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op, recorder)
+    # TODO: Rename this function
+    data = cpr.pressure_precond.data
+    is_first = !haskey(data, :w_p)
+    cpr_s = cpr.storage
+    if is_first
+        data[:w_p_cpu] = CUDA.pin(cpr_s.w_p)
+        Tv = eltype(r_cu)
+        n = length(r_cu)
+        bz = cpr_s.block_size
+        cpr.pressure_precond.data[:buffer_full] = similar(r_cu)
+        bz_w, n_w = size(cpr_s.w_p)
+        @assert n == bz*n_w
+        data[:w_p] = AMGX.CUDA.CuMatrix{Tv}(undef, bz_w, n_w)
+        data[:main_system] = J_bsr
+        data[:operator] = op
+    end
+    do_p_update = JutulDarcy.should_update_cpr(cpr, recorder, :amg)
+    if is_first || do_p_update || cpr.partial_update
+        # Put updated weights on GPU
+        w_p_cpu = data[:w_p_cpu]
+        w_p_gpu = data[:w_p]
+        @assert size(w_p_cpu) == size(w_p_gpu)
+        @tic "weights to gpu" copyto!(w_p_gpu, w_p_cpu)
+    end
+    return cpr
+end
diff --git a/ext/JutulDarcyAMGXExt/precond.jl b/ext/JutulDarcyAMGXExt/precond.jl
new file mode 100644
index 00000000..3bfac6da
--- /dev/null
+++ b/ext/JutulDarcyAMGXExt/precond.jl
@@ -0,0 +1,57 @@
+function JutulDarcy.gpu_update_preconditioner!(prec::JutulDarcy.AMGXPreconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
+    nzval = nonzeros(J_bsr)
+    bz = Jutul.block_size(lsys[1, 1])
+    already_setup = haskey(prec.data, :storage)
+    if already_setup
+        s = prec.data[:storage]
+        AMGX.replace_coefficients!(s.matrix, nzval)
+    else
+        Tv = eltype(J_bsr)
+        if Tv == Float64
+            amgx_mode = AMGX.dDDI
+        else
+            amgx_mode = AMGX.dFFI
+        end
+        n, m = size(J_bsr)
+        @assert n == m
+        N = n ÷ bz
+        config = AMGX.Config(prec.settings)
+        s = AMGXStorage(config, amgx_mode)
+
+        AMGX.set_zero!(s.x, N, block_dim = bz)
+        AMGX.set_zero!(s.r, N, block_dim = bz)
+
+        row_ptr = J_bsr.rowPtr
+        colval = J_bsr.colVal
+        AMGX.upload!(
+            s.matrix,
+            row_ptr,
+            colval,
+            nzval,
+            block_dims = (bz, bz)
+        )
+        prec.data[:storage] = s
+        prec.data[:n] = N
+        prec.data[:block_size] = bz
+    end
+    if already_setup && prec.resetup
+        AMGX.resetup!(s.solver, s.matrix)
+    else
+        AMGX.setup!(s.solver, s.matrix)
+    end
+    return prec
+end
+
+function Jutul.operator_nrows(prec::JutulDarcy.AMGXPreconditioner)
+    return prec.data[:n]*prec.data[:block_size]
+end
+
+function Jutul.apply!(x, amgx::JutulDarcy.AMGXPreconditioner, r)
+    s = amgx.data[:storage]
+    bz = amgx.data[:block_size]::Int
+    n = amgx.data[:n]::Int
+    AMGX.upload!(s.r, r, block_dim = bz)
+    AMGX.set_zero!(s.x, n, block_dim = bz)
+    AMGX.solve!(s.x, s.solver, s.r)
+    AMGX.download!(x, s.x)
+end
diff --git a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
new file mode 100644
index 00000000..5c467057
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
@@ -0,0 +1,11 @@
+module JutulDarcyCUDAExt
+    using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays
+    import Jutul: @tic
+
+    timeit_debug_enabled() = Jutul.timeit_debug_enabled()
+
+    include("ilu0.jl")
+    include("krylov.jl")
+    include("cuda_utils.jl")
+    include("cpr.jl")
+end
diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
new file mode 100644
index 00000000..23e05e69
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -0,0 +1,65 @@
+function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r)
+    @assert eltype(r_p) == eltype(w_p) == eltype(r)
+    @assert length(w_p) == length(r)
+    @assert size(w_p, 2) == length(r_p)
+    kernel = CUDA.@cuda launch = false reduce_residual_kernel!(r_p, w_p, r)
+    threads, blocks = kernel_helper(kernel, r_p)
+    CUDA.@sync begin
+        kernel(r_p, w_p, r; threads, blocks)
+    end
+    return r_p
+end
+
+function reduce_residual_kernel!(r_p, w_p, r_ps)
+    index = threadIdx().x
+    stride = blockDim().x
+    T = eltype(r_p)
+    ncomp = size(w_p, 1)
+    for cell in index:stride:length(r_p)
+        v = zero(T)
+        for comp in 1:ncomp
+            ix = (cell-1)*ncomp + comp
+            @inbounds wi = w_p[comp, cell]
+            @inbounds ri = r_ps[ix]
+            v += wi*ri
+        end
+        @inbounds r_p[cell] = v
+    end
+    return nothing
+end
+
+function JutulDarcy.gpu_increment_pressure!(x, dp)
+    n = length(x)
+    bz = div(n, length(dp))
+    @assert bz > 0
+    if false
+        kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
+        threads, blocks = kernel_helper(kernel, dp)
+        threads = blocks = 1
+
+        CUDA.@sync begin
+        kernel(x, dp, bz; threads, blocks)
+        end
+    else
+        x_view = view(x, 1:bz:(n+1-bz))
+        @. x_view += dp
+    end
+    return x
+end
+
+function gpu_increment_pressure_kernel!(x, dp, bz)
+    index = threadIdx().x
+    stride = blockDim().x
+    for cell in index:stride:length(dp)
+        x[(cell-1)*bz + 1] += dp[cell]
+    end
+    return nothing
+end
+
+function kernel_helper(kernel, V)
+    config = launch_configuration(kernel.fun)
+    threads = min(length(V), config.threads)
+    blocks = cld(length(V), threads)
+
+    return (threads, blocks)
+end
\ No newline at end of file
diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
new file mode 100644
index 00000000..672ac865
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -0,0 +1,131 @@
+function JutulDarcy.build_gpu_block_system(Ti, Tv, sz::Tuple{Int, Int}, blockDim::Int, rowptr, colval, nzval, r0)
+    ensure_minimum_block_values!(nzval, blockDim)
+    rowPtr = CuVector{Ti}(rowptr)
+    colVal = CuVector{Ti}(colval)
+    nzVal = CuVector{Tv}(nzval)
+    dims = blockDim.*sz
+    dir = 'C'
+    nnz = length(nzVal)÷(blockDim^2)
+    J_bsr = CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz)
+    r_cu = CuVector{Tv}(vec(r0))
+    return (J_bsr, r_cu)
+end
+
+function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval)
+    ensure_minimum_block_values!(nzval, blockDim)
+    copyto!(J.nzVal, nzval)
+end
+
+function ensure_minimum_block_values!(nzval, blockDim, ϵ = 1e-12)
+    if ϵ > 0.0
+        for (i, v) in enumerate(nzval)
+            if abs(v) < ϵ
+                if v > 0
+                    v = ϵ
+                else
+                    v = ϵ
+                end
+                @inbounds nzval[i] = v
+            end
+        end
+    end
+    return nzval
+end
+
+function JutulDarcy.update_gpu_block_residual!(r_cu, blockDim, r)
+    copyto!(r_cu, r)
+end
+
+function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::LinearizedSystem)
+    return nothing
+end
+
+function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSystem)
+    @assert size(lsys.subsystems) == (2, 2)
+    @assert length(lsys.schur_buffer) == 2
+    buf_full_cpu = lsys.schur_buffer[1]
+    buf_1_cpu, buf_2_cpu = lsys.schur_buffer[2]
+    # B_cpu = lsys[1, 1].jac # Already transferred, not needed
+    C_cpu = lsys[1, 2].jac
+    D_cpu = lsys[2, 1].jac
+    # E_cpu = lsys[2, 2].jac # Only factorization needed I think.
+
+    E_factor = only(lsys.factor.factor)
+    # E_L_cpu = E_factor.L
+    # E_U_cpu = E_factor.U
+
+    to_gpu(x) = copy_to_gpu(x, Tv, Ti)
+
+    D = to_gpu(D_cpu)
+    # E_L = to_gpu(E_L_cpu)
+    # E_U = to_gpu(E_U_cpu)
+    C = to_gpu(C_cpu)
+
+    buf = to_gpu(buf_1_cpu)
+    # buf_2 = to_gpu(buf_2_cpu)
+    # Operations and format:
+    # B C
+    # D E
+    # mul!(b_buf_2, D_i, x)
+    # ldiv!(b_buf_1, E_i, b_buf_2)
+    # mul!(res, C_i, b_buf_1, -α, true)
+
+    D_nzval = CUDA.pin(D_cpu.nzval)
+    C_nzval = CUDA.pin(C_cpu.nzval)
+    return Dict(
+        :C => C,
+        :D => D,
+        :buf => buf,
+        :buf_1_cpu => CUDA.pin(buf_1_cpu),
+        :buf_2_cpu => CUDA.pin(buf_2_cpu),
+        :E_factor => E_factor,
+        :D_nzval => D_nzval,
+        :C_nzval => C_nzval
+    )
+end
+
+function JutulDarcy.update_gpu_schur_system!(schur::Nothing, lsys)
+    return schur
+end
+
+function JutulDarcy.update_gpu_schur_system!(schur, lsys)
+    C_cpu = lsys[1, 2].jac
+    copyto!(schur[:C].nzVal, schur[:C_nzval])
+    D_cpu = lsys[2, 1].jac
+    copyto!(schur[:D].nzVal, schur[:D_nzval])
+    return schur
+end
+
+function copy_to_gpu(x, Ti, Tv)
+    error("Internal converter not set up for $(typeof(x))")
+end
+
+function copy_to_gpu(x::SparseMatrixCSC{Tvc, Tic}, Tv, Ti) where {Tvc, Tic}
+    if Tvc != Tv || Tic != Ti
+        x = SparseMatrixCSC{Tv, Ti}(x)
+    end
+    return CUDA.CUSPARSE.CuSparseMatrixCSC(x)
+end
+
+function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
+    return convert(CuVector{Tv}, x)
+end
+
+function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor)
+    @tic "schur apply" @sync begin 
+        # Working on GPU
+        @async mul!(y, J, x, α, β)
+        mul!(buf, D, x)
+        # Move over to CPU for this part
+        copyto!(buf1_cpu, buf)
+        ldiv!(buf2_cpu, E_factor, buf1_cpu)
+        # Back to GPU for the big array...
+        copyto!(buf, buf2_cpu)
+    end
+    mul!(y, C, buf, -α, true)
+    return y
+end
+
+function JutulDarcy.pin_cpu_memory(x)
+    return CUDA.pin(x)
+end
diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl
new file mode 100644
index 00000000..9bc1e925
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/ilu0.jl
@@ -0,0 +1,171 @@
+function ilu0_invert!(y, A::CUDA.CUSPARSE.CuSparseMatrixBSR, Tv, c_char, data)
+    if Tv == Float32
+        bname = CUDA.CUSPARSE.cusparseSbsrsv2_bufferSize
+        aname = CUDA.CUSPARSE.cusparseSbsrsv2_analysis
+        sname = CUDA.CUSPARSE.cusparseSbsrsv2_solve
+    else
+        bname = CUDA.CUSPARSE.cusparseDbsrsv2_bufferSize
+        aname = CUDA.CUSPARSE.cusparseDbsrsv2_analysis
+        sname = CUDA.CUSPARSE.cusparseDbsrsv2_solve
+    end
+    if c_char == 'U'
+        d_char = 'N'
+        kval = :ilu_upper
+    else
+        @assert c_char == 'L'
+        d_char = 'U'
+        kval = :ilu_lower
+    end
+    transa = 'N'
+    uplo = c_char
+    diag = d_char
+    alpha = 1.0
+    index = 'O'
+    X = y
+    m,n = size(A)
+    if m != n
+        throw(DimensionMismatch("A must be square, but has dimensions ($m,$n)!"))
+    end
+
+    mb = cld(m, A.blockDim)
+
+    is_analysed = haskey(data, kval)
+    # is_analysed = false
+    if is_analysed
+        info, desc = data[kval]
+    else
+        info = CUDA.CUSPARSE.bsrsv2Info_t[0]
+        desc = CUDA.CUSPARSE.CuMatrixDescriptor('G', uplo, diag, index)
+        CUDA.CUSPARSE.cusparseCreateBsrsv2Info(info)
+        data[kval] = (info, desc)
+    end
+
+    function bufferSize()
+        out = Ref{Cint}(1)
+        bname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb,
+            desc, nonzeros(A), A.rowPtr, A.colVal, A.blockDim,
+            info[1], out)
+        return out[]
+    end
+    CUDA.CUSPARSE.with_workspace(bufferSize) do buffer
+        if !is_analysed
+            aname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb,
+                    desc, nonzeros(A), A.rowPtr, A.colVal, A.blockDim,
+                    info[1], CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+            posit = Ref{Cint}(1)
+            CUDA.CUSPARSE.cusparseXbsrsv2_zeroPivot(CUDA.CUSPARSE.handle(), info[1], posit)
+            if posit[] >= 0
+                error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+            end
+        end
+        sname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb,
+                alpha, desc, nonzeros(A), A.rowPtr, A.colVal,
+                A.blockDim, info[1], X, X,
+                CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+    end
+    # Alternatively we could not cache setup and do the following:
+    # if c_char == 'U'
+    #     CUDA.CUSPARSE.sv2!('N', 'U', 'N', 1.0, A, y, 'O')
+    # else
+    #     @assert c_char == 'L'
+    #     CUDA.CUSPARSE.sv2!('N', 'L', 'U', 1.0, A, y, 'O')
+    # end
+    return y
+end
+
+function ilu0_gpu_update!(A, Tv, data)
+    if Tv == Float32
+        bname = CUDA.CUSPARSE.cusparseSbsrilu02_bufferSize
+        aname = CUDA.CUSPARSE.cusparseSbsrilu02_analysis
+        sname = CUDA.CUSPARSE.cusparseSbsrilu02
+    else
+        bname = CUDA.CUSPARSE.cusparseDbsrilu02_bufferSize
+        aname = CUDA.CUSPARSE.cusparseDbsrilu02_analysis
+        sname = CUDA.CUSPARSE.cusparseDbsrilu02
+    end
+    mb = div(size(A, 1), A.blockDim)
+    ilu0_is_analyzed = haskey(data, :ilu0)
+    ilu0_is_analyzed = false
+    if ilu0_is_analyzed
+        info, desc = data[:ilu0]
+    else
+        info = CUDA.CUSPARSE.ILU0InfoBSR()
+        desc = CUDA.CUSPARSE.CuMatrixDescriptor('G', 'U', 'N', 'O')
+        data[:ilu0] = (info, desc)
+    end
+
+    function bufferSize()
+        out = Ref{Cint}(1)
+        bname(CUDA.CUSPARSE.handle(), A.dir, mb, nnz(A), desc, nonzeros(A),
+               A.rowPtr, A.colVal, A.blockDim, info, out)
+        return out[]
+    end
+
+    policy = CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL
+    CUDA.CUSPARSE.with_workspace(bufferSize) do buffer
+        if !ilu0_is_analyzed
+            aname(
+                CUDA.CUSPARSE.handle(),
+                A.dir,
+                mb,
+                nnz(A),
+                desc,
+                nonzeros(A),
+                A.rowPtr,
+                A.colVal,
+                A.blockDim,
+                info,
+                policy,
+                buffer
+            )
+            posit = Ref{Cint}(1)
+            CUDA.CUSPARSE.cusparseXbsrilu02_zeroPivot(CUDA.CUSPARSE.handle(), info, posit)
+            if posit[] >= 0
+                error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+            end
+        end
+        sname(
+            CUDA.CUSPARSE.handle(),
+            A.dir,
+            mb,
+            SparseArrays.nnz(A),
+            desc,
+            nonzeros(A),
+            A.rowPtr,
+            A.colVal,
+            A.blockDim,
+            info,
+            policy,
+            buffer
+        )
+    end
+end
+
+
+function Jutul.update_preconditioner!(ilu::ILUZeroPreconditioner, J_bsr::CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}, b, context, executor) where {Tv, Ti}
+    if isnothing(ilu.factor)
+        data = Dict()
+        data[:ilu0_gpu] = copy(J_bsr)
+        ilu.dim = size(J_bsr)
+        ilu.factor = data
+    end
+    data = ilu.factor
+    ilu_gpu = data[:ilu0_gpu]
+    copyto!(ilu_gpu.nzVal, J_bsr.nzVal)
+    ilu0_gpu_update!(ilu_gpu, Tv, data)
+end
+
+function Jutul.apply!(y::CuVector{Tv}, f::ILUZeroPreconditioner, x::CuVector{Tv}) where Tv
+    copyto!(y, x)
+    # Equivialent but does not skip analyse
+    # sv2!('N', 'L', 'U', 1.0, P, y, 'O')
+    # sv2!('N', 'U', 'N', 1.0, P, y, 'O')
+    P = f.factor[:ilu0_gpu]
+    ilu0_invert!(y, P, Tv, 'L', f.factor)
+    ilu0_invert!(y, P, Tv, 'U', f.factor)
+    # CUDA.synchronize()
+end
+
+function JutulDarcy.gpu_update_preconditioner!(prec::ILUZeroPreconditioner, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
+    Jutul.update_preconditioner!(prec, J_bsr, r_cu, model.context, executor)
+end
diff --git a/ext/JutulDarcyCUDAExt/krylov.jl b/ext/JutulDarcyCUDAExt/krylov.jl
new file mode 100644
index 00000000..e69de29b
diff --git a/src/CO2Properties/props.jl b/src/CO2Properties/props.jl
index 21ebd36a..a7749077 100644
--- a/src/CO2Properties/props.jl
+++ b/src/CO2Properties/props.jl
@@ -37,6 +37,25 @@ function co2brine_phase_property_table(data::NamedTuple, name, T = missing)
     return co2brine_phase_property_table((data, ), name, T)
 end
 
+"""
+    co2brine_phase_property_table(tables, name::Symbol, T = missing)
+
+Generates a phase property table for CO₂-brine mixtures.
+
+# Arguments
+- `tables`: The data tables containing the phase properties.
+- `name`::Symbol
+    The name of the property to be extracted. Possible choices for `name` include:
+    - `:density`: The density of the CO2.
+    - `:viscosity`: The viscosity of the CO2.
+    - `:thermal_conductivity`: The thermal conductivity of the CO2.
+    - `:specific_heat`: The specific heat capacity of the CO2.
+- `:compressibility`: The compressibility factor of the CO2.
+- `T`: Optional temperature value. If not provided, defaults to `missing`.
+
+# Returns
+A table containing the phase properties for the specified CO₂-brine mixture.
+"""
 function co2brine_phase_property_table(tables, name::Symbol, T = missing)
     choices = (:density, :viscosity, :H, :E, :cv, :cp, :phase_conductivity)
     name in choices || throw(ArgumentError("Property name must be one of $choices, was $name"))
diff --git a/src/CO2Properties/setup.jl b/src/CO2Properties/setup.jl
index 30d96602..b1c73f14 100644
--- a/src/CO2Properties/setup.jl
+++ b/src/CO2Properties/setup.jl
@@ -6,6 +6,7 @@ function setup_reservoir_model_co2_brine(reservoir::DataDomain;
         co2_source = missing,
         co2_density = :nist,
         extra_out = true,
+        parameters = Dict{Symbol, Any}(),
         salt_names = String[],
         salt_mole_fractions = Float64[],
         kwarg...
@@ -40,12 +41,7 @@ function setup_reservoir_model_co2_brine(reservoir::DataDomain;
         error("Unknown physics argument for co2_physics: $co2_physics is not one of :kvalue or :immiscible.")
     end
 
-    out = setup_reservoir_model(reservoir, sys; thermal = thermal, extra_out = extra_out, kwarg...)
-    if extra_out
-        model = out[1]
-    else
-        model = out
-    end
+    model = setup_reservoir_model(reservoir, sys; thermal = thermal, extra_out = false, kwarg...)
 
     outvar = model[:Reservoir].output_variables
     push!(outvar, :Saturations)
@@ -81,6 +77,12 @@ function setup_reservoir_model_co2_brine(reservoir::DataDomain;
             end
         end
     end
+    if extra_out
+        parameters = setup_parameters(model, parameters)
+        out = (model, parameters)
+    else
+        out = model
+    end
     return out
 end
 
diff --git a/src/JutulDarcy.jl b/src/JutulDarcy.jl
index 18197fe6..de691131 100644
--- a/src/JutulDarcy.jl
+++ b/src/JutulDarcy.jl
@@ -187,7 +187,7 @@ module JutulDarcy
     include("formulations/formulations.jl")
     include("coarsening/coarsening.jl")
 
-    include("ext.jl")
+    include("ext/ext.jl")
     # Nonlinear domain decomposition solvers
     include("NLDD/NLDD.jl")
     # CO2-brine properties
diff --git a/src/NLDD/reservoir.jl b/src/NLDD/reservoir.jl
index 51ee7bd3..9d8e00c7 100644
--- a/src/NLDD/reservoir.jl
+++ b/src/NLDD/reservoir.jl
@@ -385,18 +385,6 @@ function reservoir_change_buffers(storage, model::MultiModel)
     return reservoir_change_buffers(storage[:Reservoir], model[:Reservoir], extra_cells = extra_cells)
 end
 
-function store_reservoir_change_buffer!(buf, sim, cfg)
-    model = sim.model
-    state = sim.storage.state
-    if model isa MultiModel
-        model = model[:Reservoir]
-        state = state.Reservoir
-    end
-
-    tols = get_nldd_solution_change_tolerances(cfg)
-    store_reservoir_change_buffer_inner!(buf, tols, model, state)
-end
-
 function get_nldd_solution_change_tolerances(cfg)
     function expand_tol(val::Nothing, avg::Nothing)
         return nothing
@@ -440,7 +428,7 @@ function get_nldd_solution_change_tolerances(cfg)
     return out
 end
 
-function check_if_subdomain_needs_solving(buf, sim, cfg, iteration)
+function check_if_subdomain_needs_solving(prev_state, state, sim, cfg, iteration)
     tol = get_nldd_solution_change_tolerances(cfg)
     model = sim.model
     has_wells = model isa MultiModel && length(model.models) > 1
@@ -453,14 +441,14 @@ function check_if_subdomain_needs_solving(buf, sim, cfg, iteration)
         if iteration == 1
             do_solve = !cfg[:solve_tol_first_newton]
         else
-            state = sim.storage.state
-            do_solve = check_inner(buf, model, state, tol)
+            cells = reservoir_model(sim.model).domain.global_map.cells
+            do_solve = check_inner(prev_state, model, state, tol, cells)
         end
     end
     return do_solve
 end
 
-function check_inner(buf, model, state, tol)
+function check_inner(prev_state, model, state, tol, cells)
     do_solve = false
     criteria = (
         (:Saturations, :abs),
@@ -473,7 +461,7 @@ function check_inner(buf, model, state, tol)
         (:PhaseMassDensities, :abs),
         )
     for (k, t) in criteria
-        do_solve = do_solve || check_subdomain_change_inner(buf, model, state, k, tol[k], t)
+        do_solve = do_solve || check_subdomain_change_inner(prev_state, cells, model, state, k, tol[k], t)
         if do_solve
             break
         end
@@ -481,58 +469,27 @@ function check_inner(buf, model, state, tol)
     return do_solve
 end
 
-function store_reservoir_change_buffer_inner!(buf, tols::Nothing, model, state)
-    # No tolerances, do nothing.
-end
-
-function store_reservoir_change_buffer_inner!(buf, tols, model, state)
-    function buf_transfer!(out::Vector, in::Vector, cells)
-        for i in eachindex(cells)
-            c = cells[i]
-            @inbounds out[i] = value(in[c])
-        end
-    end
-    function buf_transfer!(out::Matrix, in::Matrix, cells)
-        for i in eachindex(cells)
-            c = cells[i]
-            for j in axes(out, 1)
-                @inbounds out[j, i] = value(in[j, c])
-            end
-        end
-    end
-    cells = buf.Cells
-    for (k, v) in pairs(buf)
-        if k == :Cells
-            continue
-        end
-        if isnothing(tols[k])
-            continue
-        end
-        vals = state[k]
-        buf_transfer!(v, vals, cells)
-    end
-end
-
-function check_subdomain_change_inner(buf, model::MultiModel, state, f, tol, sum_t)
-    return check_subdomain_change_inner(buf, model[:Reservoir], state[:Reservoir], f, tol, sum_t)
+function check_subdomain_change_inner(prev_state, cells, model::MultiModel, state, f, tol, sum_t)
+    return check_subdomain_change_inner(prev_state, cells, model[:Reservoir], state[:Reservoir], f, tol, sum_t)
 end
 
-function check_subdomain_change_inner(buf, model::SimulationModel, state, f, tol::Nothing, sum_t)
+function check_subdomain_change_inner(prev_state, cells, model::SimulationModel, state, f, tol::Nothing, sum_t)
     return false
 end
 
-function check_subdomain_change_inner(buf, model::SimulationModel, state, f, tol, sum_t)
+function check_subdomain_change_inner(prev_state, cells, model::SimulationModel, state, f, tol, sum_t)
     if isnothing(tol)
         out = false # No tolerance - no need to check
     elseif haskey(state, f)
-        cells = buf.Cells
         state_val = state[f]
+        prev_state_val = prev_state[f]
         if state_val isa Matrix
             current = view(state_val, :, cells)
+            old = view(prev_state_val, :, cells)
         else
             current = view(state_val, cells)
+            old = view(prev_state_val, cells)
         end
-        old = buf[f]
         if sum_t == :relsum
             current::AbstractMatrix
             out = subdomain_delta_relsum(current, old, tol)
diff --git a/src/NLDD/simulator.jl b/src/NLDD/simulator.jl
index 2c4b2b4a..b0a5e9f4 100644
--- a/src/NLDD/simulator.jl
+++ b/src/NLDD/simulator.jl
@@ -14,6 +14,8 @@ function NLDDSimulator(case::JutulCase, partition = missing;
         cells_per_block = missing,
         mpi_sync_after_solve = true,
         no_blocks = missing,
+        partitioner_type = :metis,
+        partitioner_arg = (partitioner_conn_type = :unit,),
         specialize_submodels = false,
         kwarg...
     )
@@ -33,7 +35,8 @@ function NLDDSimulator(case::JutulCase, partition = missing;
             @assert ismissing(no_blocks)
             N = Int64(clamp(ceil(nc/cells_per_block), 1, nc))
         end
-        p = partition_from_N(model, parameters, N)
+        jutul_message("Partition", "Generating $N coarse blocks using $partitioner_type.")
+        p = JutulDarcy.partition_reservoir(case, N, partitioner_type; partitioner_arg..., wells_in_single_block = true)
         partition = reservoir_partition(model, p);
     elseif partition isa Vector{Int}
         # Convert it.
@@ -118,7 +121,12 @@ function NLDDSimulator(case::JutulCase, partition = missing;
     if reservoir_model(model) isa JutulDarcy.StandardBlackOilModel
         storage[:black_oil_primary_buffers] = map(s -> black_oil_primary_buffers(s.storage, s.model), subsim)
     end
-    storage[:local_changes] = map(s -> reservoir_change_buffers(s.storage, s.model), subsim)
+    # storage[:local_changes] = map(s -> reservoir_change_buffers(s.storage, s.model), subsim)
+    state_mirror = outer_sim.storage.state0
+    if haskey(state_mirror, :Reservoir)
+        state_mirror = state_mirror[:Reservoir]
+    end
+    storage[:state_mirror] = deepcopy(state_mirror)
     storage[:boundary_discretizations] = map((s) -> boundary_discretization(outer_sim.storage, outer_sim.model, s.model, s.storage), subsim)
     storage[:solve_log] = NLDDSolveLog()
     storage[:coarse_neighbors] = coarse_neighbors
@@ -151,8 +159,8 @@ function Jutul.perform_step!(
     )
     log = simulator.storage[:solve_log]
     strategy = config[:strategy]
-
     base_arg = (simulator, dt, forces, config, iteration)
+    s = simulator.simulator
     if executor isa Jutul.DefaultExecutor
         # This means we are not in MPI and we must call this part manually. This
         # contains the entire local stage of the algorithm.
@@ -243,26 +251,21 @@ function local_stage(simulator, dt, forces, config, iteration)
 
     do_solve = zeros(Bool, n)
     allow_early_termination = config[:subdomain_failure_cuts]
+    global_state_mirror = simulator.storage.state_mirror
+    global_state = sim_global.storage.state
 
     function pre(i)
         subsim = sub_sims[i]
-        change_buffer = simulator.storage.local_changes[i]
-        store_reservoir_change_buffer!(change_buffer, subsim, config)
         # Make sure that recorder is updated to match global context
         rec_global = sim_global.storage.recorder
         rec_local = subsim.storage.recorder
         Jutul.reset!(rec_local, rec_global)
-        # Update state0
-        # state0 should have the right secondary variables since it comes from a converged state.
-        update_subdomain_from_global(sim_global, subsim, i, current = false, transfer_secondary = true)
-        # We do not transfer the secondaries. This means that they are
-        # updated/recomputed locally. The current primary variables in the
-        # global state have been updated by the last global stage and the
-        # secondary variables are then "out of sync". The local solvers do the
-        # work of recomputing them.
-        update_subdomain_from_global(sim_global, subsim, i, current = true, transfer_secondary = iteration == 1)
-        should_solve = check_if_subdomain_needs_solving(change_buffer, subsim, config, iteration)
+        should_solve = check_if_subdomain_needs_solving(global_state_mirror, global_state, subsim, config, iteration)
         do_solve[i] = should_solve
+        if should_solve
+            update_subdomain_from_global(sim_global, subsim, i, current = false, transfer_secondary = true)
+            update_subdomain_from_global(sim_global, subsim, i, current = true, transfer_secondary = true)
+        end
     end
     function solve_and_transfer(i)
         should_solve = do_solve[i]
@@ -275,6 +278,10 @@ function local_stage(simulator, dt, forces, config, iteration)
             end
             # cfg[:always_update_secondary] = true
             ok, rep = solve_subdomain(sim, i, dt, forces, cfg)
+            if ok
+                # Still need to transfer over secondary variables
+                update_global_from_subdomain(sim_global, sim, i, secondary = true)
+            end
         else
             if il_i > 0
                 jutul_message("Subdomain: $i", "Skipping subdomain.")
@@ -284,14 +291,9 @@ function local_stage(simulator, dt, forces, config, iteration)
         end
         subreports[i] = rep
         solve_status[i] = get_solve_status(rep, ok)
-        # Still need to transfer over secondary variables
-        if !ok
-            if config[:info_level] > 0
-                Jutul.jutul_message("Failure $i", color = :red)
-            end
-            update_subdomain_from_global(sim_global, sim, i, current = true, transfer_secondary = false)
+        if !ok && config[:info_level] > 0
+            Jutul.jutul_message("Failure $i", color = :red)
         end
-        update_global_from_subdomain(sim_global, sim, i, secondary = true)
         return ok
     end
     # Now that we have the main functions set up we can do either kind of local solves
@@ -647,6 +649,8 @@ function global_stage(
     )
     # @warn "Starting global stage"
     m = config[:method]
+    t_secondary = report[:secondary_time]
+    report[:secondary_time] = 0.0
     g_forces = global_forces(forces)
     s = simulator.simulator
     function solve_fi(do_solve = solve)
@@ -674,11 +678,11 @@ function global_stage(
         end
     end
     if all_local_converged && (iteration > config[:min_nonlinear_iterations]) && config[:subdomain_tol_sufficient]
-        report[:secondary_time] = 0.0
-        report[:equations_time] = 0.0
-        report[:linear_system_time] = 0.0
+        # report[:secondary_time] = 0.0
+        # report[:equations_time] = 0.0
+        # report[:linear_system_time] = 0.0
         report[:converged] = true
-        report[:convergence_time] = 0.0
+        # report[:convergence_time] = 0.0
         Jutul.extra_debug_output!(report, s.storage, s.model, config, iteration, dt)
         out = (0.0, true, report)
         il = config[:info_level]
@@ -713,6 +717,7 @@ function global_stage(
             error("Method must be either :aspen or :nldd. Method was: :$m")
         end
     end
+    report[:secondary_time] += t_secondary
     check_locals_after = config[:debug_checks]
     if check_locals_after && out[2] == true
         if simulator.simulator.storage.LinearizedSystem isa MultiLinearizedSystem
@@ -895,6 +900,9 @@ function Jutul.perform_step_per_process_initial_update!(simulator::NLDDSimulator
     strategy = config[:strategy]
     e = NaN
     converged = false
+    s = simulator.simulator
+    t_secondary = @elapsed @tic "secondary variables" Jutul.update_secondary_variables!(s.storage, s.model)
+    report[:secondary_time] += t_secondary
 
     base_arg = (simulator, dt, forces, config, iteration)
     if iteration == 1
@@ -912,12 +920,34 @@ function Jutul.perform_step_per_process_initial_update!(simulator::NLDDSimulator
         failures = []
         status = [local_solve_skipped for i in eachindex(simulator.subdomain_simulators)]
     end
+    solve_tol = get_nldd_solution_change_tolerances(config)
+    if !isnothing(solve_tol)
+        update_state_mirror!(simulator.storage.state_mirror, s.storage.state, s.model, solve_tol)
+    end
+
     report[:subdomains] = subreports
     report[:time_subdomains] = t_sub
     report[:subdomain_failures] = failures
     report[:local_solves_active] = active
     report[:solve_order] = solve_order
     report[:solve_status] = status
-
     return report
 end
+
+function update_state_mirror!(state_mirror, state, m::SimulationModel, flds)
+    for (k, v) in pairs(flds)
+        if !isnothing(v) && haskey(state_mirror, k)
+            nldd_unsafe_replace!(state_mirror[k], state[k])
+        end
+    end
+end
+
+function nldd_unsafe_replace!(x, y)
+    for i in eachindex(x, y)
+        @inbounds x[i] = value(y[i])
+    end
+end
+
+function update_state_mirror!(state_mirror, state, m::MultiModel, flds)
+    update_state_mirror!(state_mirror, state[:Reservoir], m[:Reservoir], flds)
+end
diff --git a/src/NLDD/transfer.jl b/src/NLDD/transfer.jl
index 1123c905..c40265a9 100644
--- a/src/NLDD/transfer.jl
+++ b/src/NLDD/transfer.jl
@@ -1,10 +1,11 @@
 function state_pair(storage_g, storage_l, current = true)
     # storage_g = simulator.simulator.storage
     # storage_l = sim.storage
+    F(x) = (; x...)
     if current
-        return (storage_g.state, storage_l.state)
+        return (F(storage_g.state), F(storage_l.state))
     else
-        return (storage_g.state0, storage_l.state0)
+        return (F(storage_g.state0), F(storage_l.state0))
     end
 end
 
diff --git a/src/NLDD/utils.jl b/src/NLDD/utils.jl
index 51d75ae3..aecf8365 100644
--- a/src/NLDD/utils.jl
+++ b/src/NLDD/utils.jl
@@ -568,13 +568,6 @@ end
 function aspen_forces(forces, submodels)
     return (outer = forces, subdomains = map(x -> subforces(forces, x), submodels))
 end
-function partition_internal_metis(model, parameters, np, do_print = true)
-    if do_print
-        jutul_message("Partition", "Generating $np coarse blocks using Metis.")
-    end
-    N, T, groups = JutulDarcy.partitioner_input(model, parameters, conn = :unit)
-    return Jutul.partition_hypergraph(N, np, groups = groups)
-end
 
 function Jutul.progress_recorder(sim::NLDDSimulator)
     return sim.simulator.storage.recorder
diff --git a/src/coarsening/coarsening.jl b/src/coarsening/coarsening.jl
index 6857673f..670aa87a 100644
--- a/src/coarsening/coarsening.jl
+++ b/src/coarsening/coarsening.jl
@@ -227,6 +227,15 @@ function partition_reservoir(model::JutulModel, coarsedim::Union{Tuple, Int}, me
         p = Int64.(p)
     end
     p = Jutul.process_partition(mesh, p)
+    if wells_in_single_block
+        # Could have split up things that are actually connected by wells.
+        for group in partitioner_well_groups(model)
+            group_t = unique!(p[group])
+            for i in 2:length(group_t)
+                p[findall(isequal(group_t[i]), p)] .= group_t[1]
+            end
+        end
+    end
     p = Jutul.compress_partition(p)
     return p
 end
diff --git a/src/cpr.jl b/src/cpr.jl
index a6b91fea..4c008093 100644
--- a/src/cpr.jl
+++ b/src/cpr.jl
@@ -124,6 +124,8 @@ end
 function default_psolve(; max_levels = 10, max_coarse = 10, amgcl_type = :amg, type = default_amg_symbol(), kwarg...)
     if type == :hypre
         amg = BoomerAMGPreconditioner(; kwarg...)
+    elseif type == :amgx
+        amg = AMGXPreconditioner(; kwarg...)
     elseif type == :amgcl
         if length(kwarg) == 0
             # Some reasonable defaults for reservoir system
@@ -168,11 +170,13 @@ function default_psolve(; max_levels = 10, max_coarse = 10, amgcl_type = :amg, t
     end
 end
 
-function update_preconditioner!(cpr::CPRPreconditioner, lsys, model, storage, recorder, executor)
+function update_preconditioner!(cpr::CPRPreconditioner, lsys, model, storage, recorder, executor; update_system_precond = true)
     rmodel = reservoir_model(model, type = :flow)
     ctx = rmodel.context
     update_p = update_cpr_internals!(cpr, lsys, model, storage, recorder, executor)
-    @tic "s-precond" update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor)
+    if update_system_precond
+        @tic "s-precond" update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor)
+    end
     if update_p
         @tic "p-precond" update_preconditioner!(cpr.pressure_precond, cpr.storage.A_p, cpr.storage.r_p, ctx, executor)
     elseif should_update_cpr(cpr, recorder, :partial)
@@ -269,6 +273,10 @@ function update_pressure_system!(A_p::Jutul.StaticSparsityMatrixCSR, p_prec, A::
     ncomp = size(w_p, 1)
     N = Val(ncomp)
     is_adjoint = Val(Jutul.represented_as_adjoint(matrix_layout(ctx)))
+    update_rows_csr(nz, A_p, w_p, cols, nz_s, N, is_adjoint, n, tb)
+end
+
+function update_rows_csr(nz, A_p, w_p, cols, nz_s, N, is_adjoint, n, tb)
     @batch minbatch=tb for row in 1:n
         update_row_csr!(nz, A_p, w_p, cols, nz_s, row, N, is_adjoint)
     end
@@ -633,7 +641,7 @@ function should_update_cpr(cpr, rec, type = :amg)
             crit = it == 1
         elseif interval == :step
             n = outer_step
-            crit = it == 1
+            crit = it == 1 && ministep == 1
         else
             error("Bad parameter update_frequency: $interval")
         end
diff --git a/src/ext/ext.jl b/src/ext/ext.jl
new file mode 100644
index 00000000..aaa978d2
--- /dev/null
+++ b/src/ext/ext.jl
@@ -0,0 +1,4 @@
+include("ext_makie.jl")
+include("ext_partitionedarrays.jl")
+include("ext_cuda.jl")
+include("ext_amgx.jl")
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
new file mode 100644
index 00000000..caffae8b
--- /dev/null
+++ b/src/ext/ext_amgx.jl
@@ -0,0 +1,106 @@
+struct AMGXPreconditioner <: Jutul.JutulPreconditioner
+    settings::Dict{String, Any}
+    data::Dict{Symbol, Any}
+    resetup::Bool
+    function AMGXPreconditioner(settings::Dict{String, Any}; resetup = true)
+        data = Dict{Symbol, Any}()
+        function finalize_data!(data)
+            if haskey(data, :nzval)
+                AMGX.unpin_memory(data[:nzval])
+            end
+        end
+        new(settings, finalizer(finalize_data!, data), resetup)
+    end
+end
+
+"""
+    amgx = AMGXPreconditioner()
+    amgx = AMGXPreconditioner(selector = "SIZE_4", algorithm = "AGGREGATION")
+    amgx = AMGXPreconditioner(selector = "PMIS", algorithm = "CLASSICAL")
+
+AMGX preconditioner primarily used for algebraic multigrid implementation on
+CUDA-capable GPUs.
+"""
+function AMGXPreconditioner(;
+        resetup = true,
+        solver = "AMG",
+        algorithm = "CLASSICAL",
+        interpolator = "D2",
+        selector = "PMIS",
+        presweeps = 3,
+        postsweeps = presweeps,
+        strength_threshold = 0.5,
+        max_iters = 1,
+        kwarg...
+    )
+    settings = Dict{String, Any}()
+    for (k, v) in kwarg
+        settings["$k"] = v
+    end
+    # Needed to act as a preconditioner
+    settings["max_iters"] = max_iters
+    if solver == "AMG"
+        settings["solver"] = solver
+        settings["algorithm"] = algorithm
+        if algorithm == "CLASSICAL"
+            selector in ("PMIS", "HMIS") || throw(ArgumentError("Invalid selector $selector for CLASSICAL, must be PMIS or HMIS"))
+        else
+            selector in ("SIZE_2", "SIZE_4", "SIZE_8") || throw(ArgumentError("Invalid selector $selector for AGGREGATION, must be SIZE_X where X is 2, 4 or 8"))
+        end
+        settings["interpolator"] = interpolator
+        settings["selector"] = selector
+        settings["strength_threshold"] = strength_threshold
+        settings["presweeps"] = presweeps
+        settings["postsweeps"] = postsweeps
+    end
+    return AMGXPreconditioner(settings, resetup = resetup)
+end
+
+const AMGXCPR = CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}
+
+function update_preconditioner!(amg::AMGXPreconditioner, A::Jutul.StaticSparsityMatrixCSR, b, context::ParallelCSRContext, executor)
+    # Intentionally do nothing - a bit hackish
+end
+
+function JutulDarcy.gpu_update_preconditioner!(cpr::AMGXCPR, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
+    @tic "CPU cpr work" Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor, update_system_precond = false)
+    # Transfer pressure system to GPU
+    @tic "update system precond" JutulDarcy.gpu_update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
+    @tic "update pressure system" JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p, eltype(J_bsr), cpr, recorder)
+    # How to get the linear operator in here?
+    gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op, recorder)
+end
+
+function update_amgx_pressure_system!
+
+end
+
+function gpu_cpr_setup_buffers!
+
+end
+
+function gpu_amgx_solve!
+
+end
+
+function Jutul.apply!(x, cpr::AMGXCPR, r)
+    # Apply smoother
+    @tic "system precond apply" Jutul.apply!(x, cpr.system_precond, r)
+    # Correct the residual
+    A_ps = cpr.pressure_precond.data[:operator]
+    r_corrected = cpr.pressure_precond.data[:buffer_full]
+    @tic "residual correction" begin
+        copyto!(r_corrected, r)
+        JutulDarcy.correct_residual!(r_corrected, A_ps, x)
+    end
+    # Construct pressure residual
+    r_p = cpr.pressure_precond.data[:buffer_p]
+    w_p = cpr.pressure_precond.data[:w_p]
+    ncomp, ncell = size(w_p)
+    @tic "residual reduction" gpu_reduce_residual!(r_p, w_p, r_corrected)
+    # Apply pressure preconditioner
+    amgx = cpr.pressure_precond
+    @tic "AMGX apply" dp = gpu_amgx_solve!(amgx, r_p)
+    # Update increments for pressure
+    @tic "increment pressure" gpu_increment_pressure!(x, dp)
+end
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
new file mode 100644
index 00000000..f6c8ead2
--- /dev/null
+++ b/src/ext/ext_cuda.jl
@@ -0,0 +1,172 @@
+mutable struct CUDAReservoirKrylov{Tv, Ti} <: Jutul.AbstractKrylov
+    solver::Symbol
+    config::IterativeSolverConfig
+    preconditioner
+    data
+    storage
+end
+
+function CUDAReservoirKrylov(solver = :gmres;
+        preconditioner = ILUZeroPreconditioner(),
+        Float_t = Float64,
+        Int_t = Int32,
+        kwarg...
+    )
+    cfg = IterativeSolverConfig(; kwarg...)
+    return CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, preconditioner, Dict{Symbol, Any}(), nothing)
+end
+
+function Jutul.linear_solve!(lsys::Jutul.LSystem,
+        krylov::CUDAReservoirKrylov{Tv, Ti},
+        model,
+        storage = nothing,
+        dt = nothing,
+        recorder = ProgressRecorder(),
+        executor = default_executor();
+        dx = lsys.dx_buffer,
+        r = Jutul.vector_residual(lsys),
+        atol = Jutul.linear_solver_tolerance(krylov, :absolute),
+        rtol = Jutul.linear_solver_tolerance(krylov, :relative)
+    ) where {Tv, Ti}
+    cfg = krylov.config
+
+    atol = convert(Tv, atol)
+    rtol = convert(Tv, rtol)
+    t_prep0 = @elapsed @tic "prepare" Jutul.prepare_linear_solve!(lsys)
+
+    is_single_linear_system = lsys isa LinearizedSystem
+
+    L = lsys[1,1]
+    J = L.jac
+    J::Jutul.StaticSparsityMatrixCSR
+
+    bz = Jutul.block_size(L)
+    sz = size(J)
+    n, m = sz
+    @assert n == m
+    is_first = !haskey(krylov.data, :J)
+    t_setup = @elapsed @tic "initial_gpu" if is_first
+        csr_block_buffer = pin_cpu_memory(L.jac_buffer)
+        krylov.data[:csr_buffer] = csr_block_buffer
+        krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r)
+        krylov.data[:schur] = build_gpu_schur_system(Ti, Tv, bz, lsys)
+        krylov.data[:dx_cpu] = pin_cpu_memory(zeros(n*bz))
+    end
+    csr_block_buffer = krylov.data[:csr_buffer]
+    J_bsr = krylov.data[:J]
+    r_cu = krylov.data[:r]
+    schur = krylov.data[:schur]
+    dx_cpu = krylov.data[:dx_cpu]
+    operator = gpu_system_linear_operator(J_bsr, schur, Tv)
+
+    t_gpu_update = @elapsed @tic "to_gpu" begin
+        if !is_first
+            update_gpu_block_residual!(r_cu, bz, r)
+            update_gpu_block_system!(J_bsr, bz, csr_block_buffer)
+            update_gpu_schur_system!(schur, lsys)
+        end
+        @tic "precond" gpu_update_preconditioner!(krylov.preconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, operator)
+    end
+    t_prep = t_gpu_update + t_setup + t_prep0
+    max_it = krylov.config.max_iterations
+
+    prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv)
+    if cfg.precond_side == :right
+        preconditioner_arg = (N = prec_op, )
+    else
+        preconditioner_arg = (M = prec_op, )
+    end
+    solve_f, F = Jutul.krylov_jl_solve_function(krylov, operator, r_cu)
+    @tic "solve" solve_f(F, operator, r_cu;
+        itmax = max_it,
+        preconditioner_arg...,
+        verbose = 0,
+        ldiv = false,
+        history = true,
+        rtol = rtol,
+        atol = atol
+    )
+    dx_gpu, stats = (krylov.storage.x, krylov.storage.stats)
+
+    @tic "update dx" begin
+        copyto!(dx_cpu, dx_gpu)
+        Jutul.update_dx_from_vector!(lsys, dx_cpu, dx = dx)
+    end
+
+    res = stats.residuals
+    solved = stats.solved
+    n = length(res) - 1
+
+    t_prec = 0.0
+    if prec_op isa Jutul.PrecondWrapper
+        t_prec_op = prec_op.time
+        t_prec_count = prec_op.count
+    else
+        t_prec_op = 0.0
+        t_prec_count = 0
+    end
+    return Jutul.linear_solve_return(solved, n, stats, precond = t_prec_op, precond_count = t_prec_count, prepare = t_prec + t_prep)
+end
+
+function gpu_update_preconditioner!
+
+end
+
+function pin_cpu_memory
+
+end
+
+function Base.show(io::IO, krylov::CUDAReservoirKrylov)
+    rtol = Jutul.linear_solver_tolerance(krylov.config, :relative)
+    atol = Jutul.linear_solver_tolerance(krylov.config, :absolute)
+    print(io, "CUDAReservoirKrylov using $(krylov.solver) (ϵₐ=$atol, ϵ=$rtol) with prec = $(typeof(krylov.preconditioner))")
+end
+
+function gpu_system_linear_operator(J, schur::Nothing, Tv)
+    return Jutul.LinearOperators.LinearOperator(J)
+end
+
+function gpu_system_linear_operator(J, schur, Tv)
+    n, m = size(J)
+    C = schur[:C]
+    D = schur[:D]
+    E_factor = schur[:E_factor]
+    buf = schur[:buf]
+    buf_1 = schur[:buf_1_cpu]
+    buf_2 = schur[:buf_2_cpu]
+
+    mul!(x, y, α, β) = schur_mul_gpu!(x, y, α, β, J, C, D, buf, buf_1, buf_2, E_factor)
+    return Jutul.LinearOperators.LinearOperator(Tv, n, m, false, false, mul!)
+end
+
+function schur_mul_gpu!
+
+end
+
+function build_gpu_block_system
+
+end
+
+function update_gpu_block_system!
+
+end
+
+function build_gpu_schur_system
+
+end
+
+function update_gpu_block_residual!
+
+end
+
+function update_gpu_schur_system!
+
+end
+
+function gpu_reduce_residual!
+
+end
+
+function gpu_increment_pressure!
+
+end
diff --git a/src/ext.jl b/src/ext/ext_makie.jl
similarity index 93%
rename from src/ext.jl
rename to src/ext/ext_makie.jl
index f15689c2..d8070158 100644
--- a/src/ext.jl
+++ b/src/ext/ext_makie.jl
@@ -183,21 +183,6 @@ function plot_faults!(ax, mesh::UnstructuredMesh; kwarg...)
     ax
 end
 
-"""
-    simulate_reservoir_parray(case, mode = :mpi; kwarg...)
-
-Run simulation with parray. This function is primarily for testing.
-[`simulate_reservoir`](@ref) can do the same job by passing the correct mode.
-"""
-function simulate_reservoir_parray(case, mode = :mpi; kwarg...)
-    sim, cfg = setup_reservoir_simulator(case; mode = mode, kwarg...)
-    return simulate!(sim, case.dt, forces = case.forces, config = cfg)
-end
-
-function setup_reservoir_simulator_parray
-
-end
-
 """
     fig = JutulDarcy.plot_co2_inventory(t, inventory, plot_type = :stack)
 
diff --git a/src/ext/ext_partitionedarrays.jl b/src/ext/ext_partitionedarrays.jl
new file mode 100644
index 00000000..384acf4d
--- /dev/null
+++ b/src/ext/ext_partitionedarrays.jl
@@ -0,0 +1,14 @@
+"""
+    simulate_reservoir_parray(case, mode = :mpi; kwarg...)
+
+Run simulation with parray. This function is primarily for testing.
+[`simulate_reservoir`](@ref) can do the same job by passing the correct mode.
+"""
+function simulate_reservoir_parray(case, mode = :mpi; kwarg...)
+    sim, cfg = setup_reservoir_simulator(case; mode = mode, kwarg...)
+    return simulate!(sim, case.dt, forces = case.forces, config = cfg)
+end
+
+function setup_reservoir_simulator_parray
+
+end
diff --git a/src/flux.jl b/src/flux.jl
index b8fb843b..dfe12b01 100644
--- a/src/flux.jl
+++ b/src/flux.jl
@@ -59,16 +59,12 @@ end
     return q
 end
 
-function face_average_density(model, state, tpfa, phase)
+@inline function face_average_density(model, state, tpfa, phase)
     ρ = state.PhaseMassDensities
-    l = tpfa.left
-    r = tpfa.right
-    @inbounds ρ_c = ρ[phase, l]
-    @inbounds ρ_i = ρ[phase, r]
-    return 0.5*(ρ_i + ρ_c)
+    return phase_face_average(ρ, tpfa, phase)
 end
 
-function face_average_density(model::CompositionalModel, state, tpfa, phase)
+@inline function face_average_density(model::CompositionalModel, state, tpfa, phase)
     sys = flow_system(model.system)
     ρ = state.PhaseMassDensities
     l = tpfa.left
diff --git a/src/input_simulation/data_input.jl b/src/input_simulation/data_input.jl
index f9d2bfdd..efc180a1 100644
--- a/src/input_simulation/data_input.jl
+++ b/src/input_simulation/data_input.jl
@@ -845,49 +845,67 @@ function parse_reservoir(data_file; zcorn_depths = true, repair_zcorn = true, pr
         extra_data_arg[:pore_volume_multiplier] = multpv
     end
 
-    if haskey(grid, "NTG")
-        ntg = zeros(nc)
-        for (i, c) in enumerate(active_ix)
-            ntg[i] = grid["NTG"][c]
-        end
-        extra_data_arg[:net_to_gross] = ntg
-    end
 
-    for k in ("GRID", "EDIT")
+
+
+    tranmult = ones(nf)
+    tran_override = fill(NaN, nf)
+    ijk = map(i -> Jutul.cell_ijk(G, i), 1:nc)
+    for (secname, section) in pairs(data_file)
+        if haskey(section, "NTG")
+            ntg = zeros(nc)
+            for (i, c) in enumerate(active_ix)
+                ntg[i] = section["NTG"][c]
+            end
+            extra_data_arg[:net_to_gross] = ntg
+        end
         # TODO: This is not 100% robust if edit and grid interact.
-        if haskey(data_file, k)
-            if haskey(data_file[k], "PORV")
-                extra_data_arg[:pore_volume_override] = data_file[k]["PORV"][active_ix]
+        if haskey(section, "PORV")
+            extra_data_arg[:pore_volume_override] = section["PORV"][active_ix]
+        end
+        # Explicit trans given as cell values
+        for k in ["TRANX", "TRANY", "TRANZ"]
+            if haskey(section, k)
+                d = findfirst(isequal(k[end]), ('X', 'Y', 'Z'))
+                for (c, val) in enumerate(section[k][active_ix])
+                    ijk_c = ijk[c][d]
+                    if !isfinite(val) || val < 0.0
+                        continue
+                    end
+                    for face in G.faces.cells_to_faces[c]
+                        l, r = G.faces.neighbors[face]
+                        if l == c
+                            ijk_other = ijk[r][d]
+                        else
+                            @assert r == c
+                            ijk_other = ijk[l][d]
+                        end
+                        if ijk_other != ijk_c
+                            tran_override[face] = val
+                        end
+                    end
+                end
             end
         end
-    end
 
-    tranmult = ones(nf)
-    for k in ("GRID", "EDIT")
-        if haskey(data_file, k)
-            if haskey(data_file[k], "MULTFLT")
-                for (fault, vals) in data_file[k]["MULTFLT"]
-                    fault_faces = get_mesh_entity_tag(G, Faces(), :faults, Symbol(fault))
-                    tranmult[fault_faces] *= vals[1]
-                end
+        if haskey(section, "MULTFLT")
+            for (fault, vals) in section["MULTFLT"]
+                fault_faces = get_mesh_entity_tag(G, Faces(), :faults, Symbol(fault))
+                tranmult[fault_faces] *= vals[1]
             end
         end
-    end
-    mult_keys = ("MULTX", "MULTX-", "MULTY", "MULTY-", "MULTZ", "MULTZ-")
-    ijk = map(i -> Jutul.cell_ijk(G, i), 1:nc)
-    for k in mult_keys
-        if haskey(grid, k)
-            mult_on_active = grid[k][active_ix]
-            apply_mult_xyz!(tranmult, k, mult_on_active, G, ijk)
+        mult_keys = ("MULTX", "MULTX-", "MULTY", "MULTY-", "MULTZ", "MULTZ-")
+        for k in mult_keys
+            if haskey(section, k)
+                mult_on_active = section[k][active_ix]
+                apply_mult_xyz!(tranmult, k, mult_on_active, G, ijk)
+            end
         end
-    end
-    if haskey(data_file, "EDIT")
-        edit = data_file["EDIT"]
         for k in ["MULTRANX", "MULTRANY", "MULTRANZ"]
-            if haskey(edit, k)
+            if haskey(section, k)
                 fake_mult = ones(cartdims)
                 direction = k[end]
-                for (I_pair, J_pair, K_pair, val) in edit[k]
+                for (I_pair, J_pair, K_pair, val) in section[k]
                     @. fake_mult[
                         I_pair[1]:I_pair[2],
                         J_pair[1]:J_pair[2],
@@ -980,9 +998,6 @@ function parse_reservoir(data_file; zcorn_depths = true, repair_zcorn = true, pr
         end
         extra_data_arg[:temperature] = temperature
     end
-    if haskey(grid, "THCROCK")
-        extra_data_arg[:rock_thermal_conductivity] = grid["THCROCK"][active_ix]
-    end
     has(name) = haskey(data_file["RUNSPEC"], name) && data_file["RUNSPEC"][name]
 
     if has("THERMAL")
@@ -990,11 +1005,19 @@ function parse_reservoir(data_file; zcorn_depths = true, repair_zcorn = true, pr
         o = has("OIL")
         g = has("GAS")
         fluid_conductivity = zeros(w+o+g, nc)
-        pos = 1
-        for phase in ("WATER", "OIL", "GAS")
-            if has(phase)
-                fluid_conductivity[pos, :] .= grid["THC$phase"][active_ix]
-                pos += 1
+        if haskey(grid, "THCONR")
+            @assert !haskey(grid, "THCROCK") "THCONR and THCROCK should not be used together."
+            extra_data_arg[:rock_thermal_conductivity] = grid["THCONR"][active_ix]
+        else
+            pos = 1
+            for phase in ("WATER", "OIL", "GAS")
+                if has(phase)
+                    fluid_conductivity[pos, :] .= grid["THC$phase"][active_ix]
+                    pos += 1
+                end
+            end
+            if haskey(grid, "THCROCK")
+                extra_data_arg[:rock_thermal_conductivity] = grid["THCROCK"][active_ix]
             end
         end
         extra_data_arg[:fluid_thermal_conductivity] = fluid_conductivity
@@ -1042,16 +1065,27 @@ function parse_reservoir(data_file; zcorn_depths = true, repair_zcorn = true, pr
     if !all(isequal(1.0), tranmult)
         domain[:transmissibility_multiplier, Faces()] = tranmult
     end
+    if any(isfinite, tran_override)
+        domain[:transmissibility_override, Faces()] = tran_override
+    end
     if haskey(grid, "NNC")
         nnc = grid["NNC"]
         if length(nnc) > 0
             domain[:nnc, nothing] = nnc
         end
     end
+    edit = get(data_file, "EDIT", Dict())
     if haskey(grid, "DEPTH")
         for (i, c) in enumerate(active_ix)
             domain[:cell_centroids][3, i] = grid["DEPTH"][c]
         end
+    elseif haskey(edit, "DEPTH")
+        for (i, c) in enumerate(active_ix)
+            val = edit["DEPTH"][c]
+            if isfinite(val)
+                domain[:cell_centroids][3, i] = val
+            end
+        end
     elseif haskey(grid, "ZCORN") && zcorn_depths
         # Option to use ZCORN points to set depths
         z = get_zcorn_cell_depths(G, grid)
@@ -1649,8 +1683,9 @@ function keyword_to_control(sys, streams, kw, ::Val{:WCONPROD}; kwarg...)
     wrat = kw[5]
     grat = kw[6]
     lrat = kw[7]
+    resv = kw[8]
     bhp = kw[9]
-    return producer_control(sys, flag, ctrl, orat, wrat, grat, lrat, bhp; kwarg...)
+    return producer_control(sys, flag, ctrl, orat, wrat, grat, lrat, bhp; resv = resv, kwarg...)
 end
 
 function keyword_to_control(sys, streams, kw, ::Val{:WCONHIST}; kwarg...)
@@ -1690,7 +1725,7 @@ function producer_limits(; bhp = Inf, lrat = Inf, orat = Inf, wrat = Inf, grat =
     return NamedTuple(pairs(lims))
 end
 
-function producer_control(sys, flag, ctrl, orat, wrat, grat, lrat, bhp; is_hist = false, temperature = NaN, kwarg...)
+function producer_control(sys, flag, ctrl, orat, wrat, grat, lrat, bhp; is_hist = false, temperature = NaN, resv = 0.0, kwarg...)
     rho_s = reference_densities(sys)
     phases = get_phases(sys)
 
@@ -1717,12 +1752,14 @@ function producer_control(sys, flag, ctrl, orat, wrat, grat, lrat, bhp; is_hist
             t = BottomHolePressureTarget(self_val)
             is_rate = false
         elseif ctrl == "RESV"
-            self_val = -(wrat + orat + grat)
-            w = [wrat, orat, grat]
-            w = w./sum(w)
             if is_hist
+                self_val = -(wrat + orat + grat)
+                w = [wrat, orat, grat]
+                w = w./sum(w)
                 t = HistoricalReservoirVoidageTarget(self_val, w)
             else
+                self_val = resv
+                w = [1.0, 1.0, 1.0]
                 t = ReservoirVoidageTarget(self_val, w)
             end
         else
diff --git a/src/linsolve.jl b/src/linsolve.jl
index c79a0f45..64aa0695 100644
--- a/src/linsolve.jl
+++ b/src/linsolve.jl
@@ -20,6 +20,7 @@ Additional keywords are passed onto the linear solver constructor.
 """
 function reservoir_linsolve(model,
         precond = :cpr;
+        backend = :cpu,
         rtol = nothing,
         atol = nothing,
         v = 0,
@@ -38,12 +39,37 @@ function reservoir_linsolve(model,
         kwarg...
     )
     model = reservoir_model(model)
-    if solver == :lu
-        return LUSolver()
-    end
-    if !Jutul.is_cell_major(matrix_layout(model.context))
-        return nothing
+    is_equation_major = !Jutul.is_cell_major(matrix_layout(model.context))
+    backend in (:cpu, :cuda) || throw(ArgumentError("Backend $backend not supported, must be :cpu or :cuda."))
+    if backend == :cuda
+        # Check assumptions
+        !is_equation_major || throw(ArgumentError("Equation-major storage not supported for CUDA backend."))
+        solver != :lu || throw(ArgumentError("LU direct solver not supported for CUDA backend."))
+        has_cuda = !isnothing(Base.get_extension(JutulDarcy, :JutulDarcyCUDAExt))
+        has_cuda || throw(ArgumentError("CUDA backend not available. You must run \"using CUDA\" before using this function."))
+        has_amgx = !isnothing(Base.get_extension(JutulDarcy, :JutulDarcyAMGXExt))
+        # Make sure that options are compatible with CUDA backend
+        if precond == :cpr && !has_amgx
+            jutul_message("AMGX", "AMGX not available, disabling CPR and falling back to ILU(0) preconditioner.")
+            precond = :ilu0
+        else
+            amg_type = :amgx
+        end
+        if smoother_type != :ilu0
+            jutul_message("CUDA", "Smoother $smoother_type not supported for CUDA, falling back to ILU(0).")
+            smoother_type = :ilu0
+        end
+        krylov_constructor = CUDAReservoirKrylov
+    else
+        if solver == :lu
+            return LUSolver()
+        end
+        if is_equation_major
+            return nothing
+        end
+        krylov_constructor = GenericKrylov
     end
+
     default_tol = 0.01
     max_it = 200
     if precond == :cpr
@@ -97,7 +123,7 @@ function reservoir_linsolve(model,
             precond_side = :left
         end
     end
-    lsolve = GenericKrylov(
+    lsolve = krylov_constructor(
         solver;
         verbose = v,
         preconditioner = prec,
diff --git a/src/test_utils/setup_multimodel.jl b/src/test_utils/setup_multimodel.jl
index b1793224..482b942c 100644
--- a/src/test_utils/setup_multimodel.jl
+++ b/src/test_utils/setup_multimodel.jl
@@ -2,7 +2,6 @@ function simulate_mini_wellcase(::Val{:compositional_2ph_3c};
         dims = (3, 1, 1),
         setuparg = NamedTuple(),
         output_path = nothing,
-        default_linsolve = true,
         nstep = 12*5,
         total_time = 30.0*si_unit(:day)*nstep,
         kwarg...
@@ -52,7 +51,6 @@ function simulate_mini_wellcase(::Val{:compositional_2ph_3c};
     sim, config = setup_reservoir_simulator(
         model, state0, parameters;
         info_level = -1,
-        set_linear_solver = !default_linsolve,
         error_on_incomplete = true,
         output_path = output_path,
         setuparg...,
@@ -76,7 +74,6 @@ function simulate_mini_wellcase(::Val{:immiscible_2ph};
         permeability = 0.1*9.869232667160130e-13,
         nstep = 12*5,
         total_time = 30.0*si_unit(:day)*nstep,
-        default_linsolve = true,
         kwarg...)
     # Some useful constants
     day = 3600*24
@@ -125,7 +122,6 @@ function simulate_mini_wellcase(::Val{:immiscible_2ph};
     sim, config = setup_reservoir_simulator(
         model, state0, parameters;
         info_level = -1,
-        set_linear_solver = !default_linsolve,
         output_path = output_path,
         error_on_incomplete = true,
         setuparg...
@@ -145,7 +141,6 @@ function simulate_mini_wellcase(::Val{:bo_spe1};
         dims = (3, 1, 1),
         setuparg = NamedTuple(),
         output_path = nothing,
-        default_linsolve = true,
         nstep = 12*5,
         total_time = 30.0*si_unit(:day)*nstep,
         kwarg...
@@ -199,7 +194,6 @@ function simulate_mini_wellcase(::Val{:bo_spe1};
     sim, config = setup_reservoir_simulator(
         model, state0, parameters;
         info_level = -1,
-        set_linear_solver = !default_linsolve,
         output_path = output_path,
         error_on_incomplete = true,
         setuparg...
@@ -223,11 +217,11 @@ function precompile_darcy_multimodels(targets = missing; kwarg...)
     if ismissing(targets)
         targets = []
         # Block backend, CSC (CPR)
-        push!(targets, (true, :csc))
+        # push!(targets, (true, :csc))
         # Scalar blackend, CSC (direct solver)
         # push!(targets, (false, :csc))
         # Block backend, CSR (CPR)
-        # push!(targets, (true, :csr))
+        push!(targets, (true, :csr))
     end
     wellcases = [:bo_spe1, :immiscible_2ph, :compositional_2ph_3c]
     for (block_backend, backend) in targets
diff --git a/src/types.jl b/src/types.jl
index 41cb3585..12209d2b 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -417,10 +417,12 @@ function SimpleWell(
         name = :Well,
         explicit_dp = true,
         surface_conditions = default_surface_cond(),
-        reference_depth = 0,
+        reference_depth = 0.0,
         volume = 1000.0, # Regularization volume for well, not a real volume
         kwarg...
     )
+    reference_depth = convert(Float64, reference_depth)
+    volume = convert(Float64, volume)
     nr = length(reservoir_cells)
     WI, gdz = common_well_setup(nr; kwarg...)
     perf = (self = ones(Int64, nr), reservoir = vec(reservoir_cells), WI = WI, gdz = gdz)
diff --git a/src/utils.jl b/src/utils.jl
index 7998e888..9e777c71 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -221,7 +221,7 @@ impact simulation speed.
 - `split_wells=false`: Add a facility model for each well instead of one
   facility model that controls all wells. This must be set to `true` if you want
   to use MPI or nonlinear domain decomposition.
-- `backend=:csc`: Backend to use. Can be `:csc` for serial compressed sparse
+- `backend=:csr`: Backend to use. Can be `:csc` for serial compressed sparse
   column CSC matrix, `:csr` for parallel compressed sparse row matrix. `:csr` is
   a bit faster and is recommended when using MPI, HYPRE or multiple threads.
   `:csc` uses the default Julia format and is interoperable with other Julia
@@ -287,7 +287,7 @@ function setup_reservoir_model(reservoir::DataDomain, system::JutulSystem;
         context = DefaultContext(),
         reservoir_context = nothing,
         general_ad = false,
-        backend = :csc,
+        backend = :csr,
         thermal = false,
         extra_outputs = [:LiquidMassFractions, :VaporMassFractions, :Rs, :Rv, :Saturations],
         split_wells = false,
@@ -612,6 +612,7 @@ function setup_reservoir_simulator(case::JutulCase;
         method = :newton,
         precond = :cpr,
         linear_solver = :bicgstab,
+        linear_solver_backend = :cpu,
         max_timestep = si_unit(:year),
         max_dt = max_timestep,
         rtol = nothing,
@@ -630,18 +631,20 @@ function setup_reservoir_simulator(case::JutulCase;
         failure_cuts_timestep = true,
         max_timestep_cuts = 25,
         inc_tol_dz = Inf,
-        set_linear_solver = true,
         timesteps = :auto,
         relaxation = false,
         presolve_wells = false,
         parray_arg = Dict{Symbol, Any}(),
+        set_linear_solver = missing,
         linear_solver_arg = Dict{Symbol, Any}(),
         extra_timing_setup = false,
         nldd_partition = missing,
         nldd_arg = Dict{Symbol, Any}(),
         kwarg...
     )
-    set_linear_solver = set_linear_solver || linear_solver isa Symbol
+    if ismissing(set_linear_solver)
+        set_linear_solver = linear_solver isa Symbol || ismissing(linear_solver)
+    end
     # Handle old kwarg...
     max_timestep = min(max_dt, max_timestep)
     extra_kwarg = Dict{Symbol, Any}()
@@ -699,6 +702,8 @@ function setup_reservoir_simulator(case::JutulCase;
     if set_linear_solver
         if info_level < 1
             v = -1
+        elseif info_level > 4
+            v = 1
         else
             v = 0
         end
@@ -708,6 +713,7 @@ function setup_reservoir_simulator(case::JutulCase;
             extra_ls = NamedTuple()
         end
         extra_kwarg[:linear_solver] = reservoir_linsolve(case.model, precond;
+            backend = linear_solver_backend,
             rtol = rtol,
             extra_ls...,
             linear_solver_arg...,
@@ -1383,17 +1389,24 @@ function partitioner_input(model, parameters; conn = :trans)
             error("conn must be one of :trans, :unit or :logtrans, was $conn")
         end
     end
+    groups = partitioner_well_groups(model)
+    return (N, T, groups)
+end
+
+function partitioner_well_groups(model::MultiModel)
     groups = Vector{Vector{Int}}()
-    if model isa MultiModel
-        for (k, m) in pairs(model.models)
-            wg = physical_representation(m.domain)
-            if wg isa WellDomain
-                rcells = vec(Int.(wg.perforations.reservoir))
-                push!(groups, rcells)
-            end
+    for (k, m) in pairs(model.models)
+        wg = physical_representation(m.domain)
+        if wg isa WellDomain
+            rcells = vec(Int.(wg.perforations.reservoir))
+            push!(groups, rcells)
         end
     end
-    return (N, T, groups)
+    return groups
+end
+
+function partitioner_well_groups(model)
+    return Vector{Vector{Int}}()
 end
 
 function reservoir_partition(model::MultiModel, p)
@@ -1664,6 +1677,13 @@ function reservoir_transmissibility(d::DataDomain; version = :xyz)
         tm = d[:transmissibility_multiplier]
         @. T *= tm
     end
+    if haskey(d, :transmissibility_override, Faces())
+        for (f, v) in enumerate(d[:transmissibility_override])
+            if isfinite(v)
+                T[f] = v
+            end
+        end
+    end
     if haskey(d, :numerical_aquifers)
         aquifers = d[:numerical_aquifers]
         bnd_areas = d[:boundary_areas]
diff --git a/test/gpu.jl b/test/gpu.jl
index 19590eb8..75de7e17 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -1,69 +1,60 @@
-using Jutul, JutulDarcy
-using LinearAlgebra, Krylov
-using Printf
-using ForwardDiff
-using CUDA, Test
-
-function test_single_phase_gpu(casename = "pico"; float_type = Float32, pvfrac=0.05, tstep = [1.0])#[1.0, 2.0])
-    G, mrst_data = get_minimal_tpfa_grid_from_mrst(casename, extraout = true)
-    nc = number_of_cells(G)
-    # Parameters
+using Jutul, JutulDarcy, CUDA, Test
+
+function solve_bl_lsolve(; nx = 10, ny = 1, nstep = nx*ny, lsolve = missing, backend = :csr, step_limit = nothing, kwarg...)
+    time = 1.0
+    T = time
+    tstep = repeat([T/nstep], nstep)
+    mesh = CartesianMesh((nx, ny))
+    domain = reservoir_domain(mesh)
+    nc = number_of_cells(domain)
+    timesteps = tstep*3600*24
     bar = 1e5
-    p0 = 100*bar # 100 bar
-
-    # Single-phase liquid system (compressible pressure equation)
-    phase = LiquidPhase()
-    sys = SinglePhaseSystem(phase)
-    # Simulation model wraps grid and system together with context (which will be used for GPU etc)
-    ctx = SingleCUDAContext(float_type)
-    linsolve = GenericKrylov(Krylov.dqgmres, preconditioner = ILUZeroPreconditioner(), relative_tolerance = 0.01)
-    model = SimulationModel(G, sys, context = ctx)
-
-
-    s = model.secondary_variables
-
-    mu_c = transfer(ctx, repeat([1e-3], 1, nc))
-    mu = ConstantVariables(mu_c, Cells(), single_entity = false)
-    s[:PhaseViscosities] = mu
-
-    s_d = transfer(ctx, ones(1, nc))
-    sat = ConstantVariables(s_d, Cells(), single_entity = false)
-    s[:Saturations] = sat
-    s[:RelativePermeabilities] = sat
-    # System state
-    pv = physical_representation(model).pore_volumes
-    timesteps = tstep*3600*24 # 1 day, 2 days
-    tot_time = sum(timesteps)  
-    irate = pvfrac*sum(pv)/tot_time
-    src = [SourceTerm(1, irate), 
-           SourceTerm(nc, -irate)]
-    src = CuArray(src)
-    forces = setup_forces(model, sources = src)
-
-    # State is dict with pressure in each cell
-    init = Dict(:Pressure => p0)
-    state0 = setup_state(model, init)
-    # Model parameters
+    p0 = 100*bar
+    sys = ImmiscibleSystem((LiquidPhase(), VaporPhase()), reference_densities = [100.0, 100.0])
+    model = setup_reservoir_model(domain, sys, extra_out = false, backend = backend)
+    kr = BrooksCoreyRelativePermeabilities(sys, [2.0, 2.0], [0.2, 0.2])
+    rmodel = reservoir_model(model)
+    c = [1e-3, 1e-3]/bar
+    density = ConstantCompressibilityDensities(
+        p_ref = 100*bar,
+        density_ref = [100.0, 100.0],
+        compressibility = c
+    )
+    replace_variables!(rmodel, RelativePermeabilities = kr, PhaseMassDensities = density)
+    tot_time = sum(timesteps)
+    pv = pore_volume(domain)
+    irate = 500*sum(pv)/tot_time
+    src  = SourceTerm(nc, irate, fractional_flow = [0.75, 0.25])
+    bc = FlowBoundaryCondition(1, p0/2)
+    forces = setup_reservoir_forces(model, sources = src, bc = bc)
     parameters = setup_parameters(model)
-    parameters[:reference_densities] = CuArray(float_type.([1.0]))
-
-    # linsolve = nothing
-    sim = Simulator(model, state0 = state0, parameters = parameters)
-    info_level = 2
-    debug_level = 2
-    simulate(sim, timesteps, forces = forces, linear_solver = linsolve, 
-            debug_level = debug_level, info_level = info_level, output_states = false)
-    return true
+    state0 = setup_reservoir_state(model, Pressure = p0, Saturations = [0.25, 0.75])
+    if ismissing(lsolve)
+        lsolve = reservoir_linsolve(model)
+    end
+    if !isnothing(step_limit)
+        timesteps = timesteps[1:step_limit]
+    end
+    states, report = simulate(state0, model, timesteps; failure_cuts_timestep = false,
+        forces = forces, parameters = parameters, linear_solver = lsolve, kwarg...)
+    return states[end][:Reservoir][:Saturations][1, :]
 end
-if has_cuda_gpu()
-    CUDA.allowscalar(false)
-    casename = "pico"
-    @testset "GPU multiphase" begin
-        @testset "Basic flow - single precision" begin
-            @test test_single_phase_gpu(casename, float_type = Float32)
-        end
-        @testset "Basic flow - double precision" begin
-            @test test_single_phase_gpu(casename, float_type = Float64)
+
+if CUDA.functional()
+    do_solve(x) = solve_bl_lsolve(
+        lsolve = x,
+        info_level = -1
+    )
+    @testset "SimulationModel" begin
+        krylov_cpu = GenericKrylov(:bicgstab, preconditioner = ILUZeroPreconditioner())
+        s_cpu = do_solve(krylov_cpu)
+
+        for T in [Float32, Float64]
+            for solver in [:bicgstab, :gmres]
+                krylov_cu = JutulDarcy.CUDAReservoirKrylov(solver, Float_t = T)
+                s_cu = do_solve(krylov_cu)
+                @test s_cpu ≈ s_cu
+            end
         end
     end
 end
diff --git a/test/multimodel.jl b/test/multimodel.jl
index f989ab32..71474230 100644
--- a/test/multimodel.jl
+++ b/test/multimodel.jl
@@ -127,15 +127,12 @@ end
     for b in [false, true]
         for backend in [:csr, :csc]
             for gen_ad in [true, false]
-                for default_linsolve in [false, true]
-                    @testset "Block=$b, backend=$b, defaulted linsolve=$default_linsolve" begin
-                        arg = (general_ad = gen_ad, backend = backend, block_backend = b, default_linsolve = default_linsolve)
-                        test_compositional_with_wells(; arg...)
-                        # Disabled test until Jutul is updated.
-                        # test_compositional_with_wells(; fast_flash = true, arg...)
-                        test_immiscible_with_wells(; arg...)
-                        test_blackoil_with_wells(; arg...)
-                    end
+                @testset "Block=$b, backend=$b" begin
+                    arg = (general_ad = gen_ad, backend = backend, block_backend = b)
+                    test_compositional_with_wells(; arg...)
+                    test_compositional_with_wells(; fast_flash = true, arg...)
+                    test_immiscible_with_wells(; arg...)
+                    test_blackoil_with_wells(; arg...)
                 end
             end
         end
diff --git a/test/nldd.jl b/test/nldd.jl
index 0a775622..cd0bd008 100644
--- a/test/nldd.jl
+++ b/test/nldd.jl
@@ -44,22 +44,22 @@ end
         end
         ds_max = 0.1
         reports_dd = states = states_as = states_gs = []
-        case, mrst_data = setup_case_from_mrst(name,
+        case, = setup_case_from_mrst(name,
             split_wells = true,
             ds_max = 0.1,
             block_backend = b
         );
+        if steps != :full
+            case = case[steps]
+        end
         push!(case.model[:Reservoir].output_variables, :Saturations)
         sim_arg = (
-                    info_level = -1,
-                    steps = steps,
-                    block_backend = b,
-                    maxstep = Inf,
-                    timestepping = :iteration,
-                    error_on_incomplete = true,
-                    case = case, mrst_data = mrst_data
+            info_level = -1,
+            max_timestep = Inf,
+            timesteps = :iteration,
+            error_on_incomplete = true
         );
-        results_newton = JutulDarcy.NLDD.bench_dd(name, :fi; sim_arg...);
+        results_newton = simulate_reservoir(case, method = :newton; sim_arg...);
         its_newton = get_its(results_newton)
 
         function test_solver_result(result)
@@ -68,13 +68,13 @@ end
         @testset "NLDD on $name" begin
             for solve_tol_mobility in [nothing, 0.01, 0.05]
                 @testset "Gauss-seidel λ_t = $solve_tol_mobility" begin
-                    results_gs = JutulDarcy.NLDD.bench_dd(name, :nldd; sim_arg...,
+                    results_gs = simulate_reservoir(case, method = :nldd; sim_arg...,
                         gauss_seidel = true,
                         solve_tol_mobility = solve_tol_mobility);
                     test_solver_result(results_gs)
                 end
                 @testset "Jacobi λ_t = $solve_tol_mobility" begin
-                    results_dd = JutulDarcy.NLDD.bench_dd(name, :nldd; sim_arg...,
+                    results_dd = simulate_reservoir(case, method = :nldd; sim_arg...,
                         gauss_seidel = false,
                         solve_tol_mobility = solve_tol_mobility);
                     test_solver_result(results_dd)
@@ -82,7 +82,7 @@ end
             end
             if name != "spe9"
                 @testset "ASPEN" begin
-                    results_as = JutulDarcy.NLDD.bench_dd(name, :aspen;
+                    results_as = simulate_reservoir(case, method = :aspen;
                     sim_arg...);
                     test_solver_result(results_as)
                 end