diff --git a/Project.toml b/Project.toml
index 64a483fc..667922e4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "BoundaryValueDiffEq"
 uuid = "764a87c0-6b3e-53db-9096-fe964310641d"
-version = "5.9.1"
+version = "5.10.0"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -51,7 +51,7 @@ LinearSolve = "2.21"
 Logging = "1.10"
 NonlinearSolve = "3.8.1"
 ODEInterface = "0.5"
-OrdinaryDiffEq = "6.88.1"
+OrdinaryDiffEq = "6.89.0"
 PreallocationTools = "0.4.24"
 PrecompileTools = "1.2"
 Preferences = "1.4"
diff --git a/README.md b/README.md
index a2d99f1d..c4c8cce8 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ Precompilation can be controlled via `Preferences.jl`
   - `PrecompileMIRK` -- Precompile the MIRK2 - MIRK6 algorithms (default: `true`).
   - `PrecompileShooting` -- Precompile the single shooting algorithms (default: `true`).
   - `PrecompileMultipleShooting` -- Precompile the multiple shooting algorithms (default: `true`).
-  - `PrecompileMIRKNLLS` -- Precompile the MIRK2 - MIRK6 algorithms for under-determined and over-determined BVPs (default: `true`).
+  - `PrecompileMIRKNLLS` -- Precompile the MIRK2 - MIRK6 algorithms for under-determined and over-determined BVPs (default: `false`).
   - `PrecompileShootingNLLS` -- Precompile the single shooting algorithms for under-determined and over-determined BVPs (default: `true`).
   - `PrecompileMultipleShootingNLLS` -- Precompile the multiple shooting algorithms for under-determined and over-determined BVPs (default: `true` ).
 
diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index cb6055b1..6cbf6df9 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -95,179 +95,6 @@ end
             Threads.@spawn solve(prob, alg; dt = 0.2)
         end
     end
-
-    f1_nlls! = (du, u, p, t) -> begin
-        du[1] = u[2]
-        du[2] = -u[1]
-    end
-
-    f1_nlls = (u, p, t) -> [u[2], -u[1]]
-
-    bc1_nlls! = (resid, sol, p, t) -> begin
-        solₜ₁ = sol[:, 1]
-        solₜ₂ = sol[:, end]
-        resid[1] = solₜ₁[1]
-        resid[2] = solₜ₂[1] - 1
-        resid[3] = solₜ₂[2] + 1.729109
-        return nothing
-    end
-    bc1_nlls = (sol, p, t) -> [sol[:, 1][1], sol[:, end][1] - 1, sol[:, end][2] + 1.729109]
-
-    bc1_nlls_a! = (resid, ua, p) -> (resid[1] = ua[1])
-    bc1_nlls_b! = (resid, ub, p) -> (resid[1] = ub[1] - 1;
-    resid[2] = ub[2] + 1.729109)
-
-    bc1_nlls_a = (ua, p) -> [ua[1]]
-    bc1_nlls_b = (ub, p) -> [ub[1] - 1, ub[2] + 1.729109]
-
-    tspan = (0.0, 100.0)
-    u0 = [0.0, 1.0]
-    bcresid_prototype1 = Array{Float64}(undef, 3)
-    bcresid_prototype2 = (Array{Float64}(undef, 1), Array{Float64}(undef, 2))
-
-    probs = [
-        BVProblem(BVPFunction(f1_nlls!, bc1_nlls!; bcresid_prototype = bcresid_prototype1),
-            u0, tspan, nlls = Val(true)),
-        BVProblem(BVPFunction(f1_nlls, bc1_nlls; bcresid_prototype = bcresid_prototype1),
-            u0, tspan, nlls = Val(true)),
-        TwoPointBVProblem(f1_nlls!, (bc1_nlls_a!, bc1_nlls_b!), u0, tspan;
-            bcresid_prototype = bcresid_prototype2, nlls = Val(true)),
-        TwoPointBVProblem(f1_nlls, (bc1_nlls_a, bc1_nlls_b), u0, tspan;
-            bcresid_prototype = bcresid_prototype2, nlls = Val(true))]
-
-    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))
-
-    nlsolvers = [LevenbergMarquardt(; disable_geodesic = Val(true)), GaussNewton()]
-
-    algs = []
-
-    if Preferences.@load_preference("PrecompileMIRKNLLS", false)
-        for nlsolve in nlsolvers
-            append!(algs, [MIRK2(; jac_alg, nlsolve), MIRK6(; jac_alg, nlsolve)])
-        end
-    end
-
-    @compile_workload begin
-        @sync for prob in probs, alg in algs
-            Threads.@spawn solve(prob, alg; dt = 0.2, abstol = 1e-2)
-        end
-    end
-
-    function bc2!(residual, u, p, t)
-        residual[1] = u(0.0)[1] - 5
-        residual[2] = u(5.0)[1]
-    end
-    bc2 = (u, p, t) -> [u(0.0)[1] - 5, u(5.0)[1]]
-
-    tspan = (0.0, 5.0)
-    u0 = [5.0, -3.5]
-    bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1))
-
-    probs = [BVProblem(BVPFunction{true}(f1!, bc2!), u0, tspan; nlls = Val(false)),
-        BVProblem(BVPFunction{false}(f1, bc2), u0, tspan; nlls = Val(false)),
-        BVProblem(
-            BVPFunction{true}(
-                f1!, (bc1_a!, bc1_b!); bcresid_prototype, twopoint = Val(true)),
-            u0,
-            tspan;
-            nlls = Val(false)),
-        BVProblem(
-            BVPFunction{false}(f1, (bc1_a, bc1_b); bcresid_prototype, twopoint = Val(true)),
-            u0, tspan; nlls = Val(false))]
-
-    algs = []
-
-    if @load_preference("PrecompileShooting", true)
-        push!(algs,
-            Shooting(Tsit5(); nlsolve = NewtonRaphson(),
-                jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))))
-    end
-
-    if @load_preference("PrecompileMultipleShooting", true)
-        push!(algs,
-            MultipleShooting(10,
-                Tsit5();
-                nlsolve = NewtonRaphson(),
-                jac_alg = BVPJacobianAlgorithm(;
-                    bc_diffmode = AutoForwardDiff(; chunksize = 2),
-                    nonbc_diffmode = AutoSparse(AutoForwardDiff(; chunksize = 2)))))
-    end
-
-    @compile_workload begin
-        @sync for prob in probs, alg in algs
-            Threads.@spawn solve(prob, alg)
-        end
-    end
-
-    bc1_nlls! = (resid, sol, p, t) -> begin
-        solₜ₁ = sol(0.0)
-        solₜ₂ = sol(100.0)
-        resid[1] = solₜ₁[1]
-        resid[2] = solₜ₂[1] - 1
-        resid[3] = solₜ₂[2] + 1.729109
-        return nothing
-    end
-    bc1_nlls = (sol, p, t) -> [sol(0.0)[1], sol(100.0)[1] - 1, sol(1.0)[2] + 1.729109]
-
-    tspan = (0.0, 100.0)
-    u0 = [0.0, 1.0]
-    bcresid_prototype1 = Array{Float64}(undef, 3)
-    bcresid_prototype2 = (Array{Float64}(undef, 1), Array{Float64}(undef, 2))
-
-    probs = [
-        BVProblem(
-            BVPFunction{true}(f1_nlls!, bc1_nlls!; bcresid_prototype = bcresid_prototype1),
-            u0, tspan; nlls = Val(true)),
-        BVProblem(
-            BVPFunction{false}(f1_nlls, bc1_nlls; bcresid_prototype = bcresid_prototype1),
-            u0, tspan; nlls = Val(true)),
-        BVProblem(
-            BVPFunction{true}(f1_nlls!, (bc1_nlls_a!, bc1_nlls_b!);
-                bcresid_prototype = bcresid_prototype2, twopoint = Val(true)),
-            u0,
-            tspan;
-            nlls = Val(true)),
-        BVProblem(
-            BVPFunction{false}(f1_nlls, (bc1_nlls_a, bc1_nlls_b);
-                bcresid_prototype = bcresid_prototype2, twopoint = Val(true)),
-            u0,
-            tspan;
-            nlls = Val(true))]
-
-    algs = []
-
-    if @load_preference("PrecompileShootingNLLS", true)
-        append!(algs,
-            [
-                Shooting(
-                    Tsit5(); nlsolve = LevenbergMarquardt(; disable_geodesic = Val(true)),
-                    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))),
-                Shooting(Tsit5(); nlsolve = GaussNewton(),
-                    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2)))])
-    end
-
-    if @load_preference("PrecompileMultipleShootingNLLS", true)
-        append!(algs,
-            [
-                MultipleShooting(10,
-                    Tsit5();
-                    nlsolve = LevenbergMarquardt(; disable_geodesic = Val(true)),
-                    jac_alg = BVPJacobianAlgorithm(;
-                        bc_diffmode = AutoForwardDiff(; chunksize = 2),
-                        nonbc_diffmode = AutoSparse(AutoForwardDiff(; chunksize = 2)))),
-                MultipleShooting(10,
-                    Tsit5();
-                    nlsolve = GaussNewton(),
-                    jac_alg = BVPJacobianAlgorithm(;
-                        bc_diffmode = AutoForwardDiff(; chunksize = 2),
-                        nonbc_diffmode = AutoSparse(AutoForwardDiff(; chunksize = 2))))])
-    end
-
-    @compile_workload begin
-        @sync for prob in probs, alg in algs
-            Threads.@spawn solve(prob, alg; nlsolve_kwargs = (; abstol = 1e-2))
-        end
-    end
 end
 
 export Shooting, MultipleShooting
diff --git a/src/adaptivity.jl b/src/adaptivity.jl
index 89a58a3f..aabfe5f4 100644
--- a/src/adaptivity.jl
+++ b/src/adaptivity.jl
@@ -1,5 +1,7 @@
 """
-    interp_eval!(y::AbstractArray, cache::MIRKCache, t)
+    interp_eval!(y::AbstractArray, cache::MIRKCache, t, mesh, mesh_dt)
+    interp_eval!(y::AbstractArray, cache::FIRKCacheExpand, t, mesh, mesh_dt)
+    interp_eval!(y::AbstractArray, cache::FIRKCacheNested, t, mesh, mesh_dt)
 
 After we construct an interpolant, we use interp_eval to evaluate it.
 """
@@ -14,13 +16,6 @@ end
 
 @views function interp_eval!(
         y::AbstractArray, cache::FIRKCacheExpand{iip}, t, mesh, mesh_dt) where {iip}
-    i = findfirst(x -> x == y, cache.y₀.u)
-    interp_eval!(cache.y₀.u, i, cache::FIRKCacheExpand{iip}, t, mesh, mesh_dt)
-    return y
-end
-
-@views function interp_eval!(
-        y::AbstractArray, i::Int, cache::FIRKCacheExpand{iip}, t, mesh, mesh_dt) where {iip}
     j = interval(mesh, t)
     h = mesh_dt[j]
     lf = (length(cache.y₀) - 1) / (length(cache.y) - 1) # Cache length factor. We use a h corresponding to cache.y. Note that this assumes equidistributed mesh
@@ -29,12 +24,11 @@ end
     end
     τ = (t - mesh[j])
 
-    (; f, M, p, ITU) = cache
-    (; q_coeff, stage) = ITU
+    (; f, M, stage, p, ITU) = cache
+    (; q_coeff) = ITU
 
     K = __similar(cache.y[1].du, M, stage)
 
-    ctr_y0 = (i - 1) * (stage + 1) + 1
     ctr_y = (j - 1) * (stage + 1) + 1
 
     yᵢ = cache.y[ctr_y].du
@@ -130,6 +124,34 @@ function dS_interpolate!(dy::AbstractArray, t, S_coeffs)
     dy .= S_coeffs * ts
 end
 
+"""
+    s_constraints(M, h)
+
+Form the quartic interpolation constraint matrix, see bvp5c paper.
+"""
+function s_constraints(M, h)
+    t = vec(repeat([0.0, 1.0 * h, 0.5 * h, 0.0, 1.0 * h, 0.5 * h], 1, M))
+    A = zeros(6 * M, 6 * M)
+    for i in 1:6
+        row_start = (i - 1) * M + 1
+        for k in 0:(M - 1)
+            for j in 1:6
+                A[row_start + k, j + k * 6] = t[i + k * 6]^(j - 1)
+            end
+        end
+    end
+    for i in 4:6
+        row_start = (i - 1) * M + 1
+        for k in 0:(M - 1)
+            for j in 1:6
+                A[row_start + k, j + k * 6] = j == 1.0 ? 0.0 :
+                                              (j - 1) * t[i + k * 6]^(j - 2)
+            end
+        end
+    end
+    return A
+end
+
 """
     interval(mesh, t)
 
@@ -141,6 +163,8 @@ end
 
 """
     mesh_selector!(cache::MIRKCache)
+    mesh_selector!(cache::FIRKCacheExpand)
+    mesh_selector!(cache::FIRKCacheNested)
 
 Generate new mesh based on the defect.
 """
@@ -199,6 +223,8 @@ end
 
 """
     redistribute!(cache::MIRKCache, Nsub_star, ŝ, mesh, mesh_dt)
+    redistribute!(cache::FIRKCacheExpand, Nsub_star, ŝ, mesh, mesh_dt)
+    redistribute!(cache::FIRKCacheNested, Nsub_star, ŝ, mesh, mesh_dt)
 
 Generate a new mesh based on the `ŝ`.
 """
@@ -235,6 +261,8 @@ end
 """
     half_mesh!(mesh, mesh_dt)
     half_mesh!(cache::MIRKCache)
+    half_mesh!(cache::FIRKCacheExpand)
+    half_mesh!(cache::FIRKCacheNested)
 
 The input mesh has length of `n + 1`. Divide the original subinterval into two equal length
 subinterval. The `mesh` and `mesh_dt` are modified in place.
@@ -260,6 +288,8 @@ end
 
 """
     defect_estimate!(cache::MIRKCache)
+    defect_estimate!(cache::FIRKCacheExpand)
+    defect_estimate!(cache::FIRKCacheNested)
 
 defect_estimate use the discrete solution approximation Y, plus stages of
 the RK method in 'k_discrete', plus some new stages in 'k_interp' to construct
diff --git a/src/algorithms.jl b/src/algorithms.jl
index d1a6cc13..4b953ccb 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -224,7 +224,7 @@ for stage in (1, 2, 3, 5, 7)
         implicit FIRK step. Defaults to `false`. If set to `false`, the FIRK stages are 
         solved as a part of the global residual. The general recommendation is to choose 
         `true` for larger problems and `false` for smaller ones.
-        - `nest_tol`: The tolerance for the nested solver. Default is 0.0 which leads to 
+        - `nest_tol`: The tolerance for the nested solver. Default is nothing which leads to 
         `NonlinearSolve` automatically selecting the tolerance.
         - `defect_threshold`: Threshold for defect control.
         - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
@@ -277,11 +277,11 @@ for stage in (1, 2, 3, 5, 7)
             nlsolve::N = nothing
             jac_alg::J = BVPJacobianAlgorithm()
             nested_nlsolve::Bool = false
-            nest_tol::Number = 0.0
+            nest_tol::Union{Number, Nothing} = nothing
             defect_threshold::T = 0.1
             max_num_subintervals::Int = 3000
         end
-        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Number = 0.0, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
+        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Union{Number, Nothing} = nothing, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
             N, J, T}(
             nlsolve, jac_alg, nested, nest_tol, defect_threshold, max_num_subintervals)
     end
@@ -315,7 +315,7 @@ for stage in (2, 3, 4, 5)
         implicit FIRK step. Defaults to `false`. If set to `false`, the FIRK stages are 
         solved as a part of the global residual. The general recommendation is to choose 
         `true` for larger problems and `false` for smaller ones.
-        - `nest_tol`: The tolerance for the nested solver. Default is 0.0 which leads to 
+        - `nest_tol`: The tolerance for the nested solver. Default is nothing which leads to 
         `NonlinearSolve` automatically selecting the tolerance.
         - `defect_threshold`: Threshold for defect control.
         - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
@@ -369,11 +369,11 @@ for stage in (2, 3, 4, 5)
             nlsolve::N = nothing
             jac_alg::J = BVPJacobianAlgorithm()
             nested_nlsolve::Bool = false
-            nest_tol::Number = 0.0
+            nest_tol::Union{Number, Nothing} = nothing
             defect_threshold::T = 0.1
             max_num_subintervals::Int = 3000
         end
-        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Number = 0.0, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
+        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Union{Number, Nothing} = nothing, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
             N, J, T}(
             nlsolve, jac_alg, nested, nest_tol, defect_threshold, max_num_subintervals)
     end
@@ -407,7 +407,7 @@ for stage in (2, 3, 4, 5)
         implicit FIRK step. Defaults to `true`. If set to `false`, the FIRK stages are 
         solved as a part of the global residual. The general recommendation is to choose 
         `true` for larger problems and `false` for smaller ones.
-        - `nest_tol`: The tolerance for the nested solver. Default is 0.0 which leads to 
+        - `nest_tol`: The tolerance for the nested solver. Default is nothing which leads to 
         `NonlinearSolve` automatically selecting the tolerance.
         - `defect_threshold`: Threshold for defect control.
         - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
@@ -461,11 +461,11 @@ for stage in (2, 3, 4, 5)
             nlsolve::N = nothing
             jac_alg::J = BVPJacobianAlgorithm()
             nested_nlsolve::Bool = false
-            nest_tol::Number = 0.0
+            nest_tol::Union{Number, Nothing} = nothing
             defect_threshold::T = 0.1
             max_num_subintervals::Int = 3000
         end
-        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Number = 0.0, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
+        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Union{Number, Nothing} = nothing, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
             N, J, T}(
             nlsolve, jac_alg, nested, nest_tol, defect_threshold, max_num_subintervals)
     end
@@ -499,7 +499,7 @@ for stage in (2, 3, 4, 5)
         implicit FIRK step. Defaults to `true`. If set to `false`, the FIRK stages are 
         solved as a part of the global residual. The general recommendation is to choose 
         `true` for larger problems and `false` for smaller ones.
-        - `nest_tol`: The tolerance for the nested solver. Default is 0.0 which leads to 
+        - `nest_tol`: The tolerance for the nested solver. Default is nothing which leads to 
         `NonlinearSolve` automatically selecting the tolerance.
         - `defect_threshold`: Threshold for defect control.
         - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
@@ -553,11 +553,11 @@ for stage in (2, 3, 4, 5)
             nlsolve::N = nothing
             jac_alg::J = BVPJacobianAlgorithm()
             nested_nlsolve::Bool = false
-            nest_tol::Number = 0.0
+            nest_tol::Union{Number, Nothing} = nothing
             defect_threshold::T = 0.1
             max_num_subintervals::Int = 3000
         end
-        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Number = 0.0, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
+        $(alg)(nlsolve::N, jac_alg::J; nested = false, nest_tol::Union{Number, Nothing} = nothing, defect_threshold::T = 0.1, max_num_subintervals::Int = 3000) where {N, J, T} = $(alg){
             N, J, T}(
             nlsolve, jac_alg, nested, nest_tol, defect_threshold, max_num_subintervals)
     end
diff --git a/src/interpolation.jl b/src/interpolation.jl
index 6fe00495..876f9981 100644
--- a/src/interpolation.jl
+++ b/src/interpolation.jl
@@ -192,7 +192,7 @@ end
 
     for j in idx
         z = similar(cache.fᵢ₂_cache)
-        interp_eval!(z, j, id.cache, tvals[j], id.cache.mesh, id.cache.mesh_dt)
+        interp_eval!(z, id.cache, tvals[j], id.cache.mesh, id.cache.mesh_dt)
         vals[j] = z
     end
 end
@@ -200,7 +200,7 @@ end
 @inline function interpolation(tval::Number, id::FIRKExpandInterpolation, idxs,
         deriv::D, p, continuity::Symbol = :left) where {D}
     z = similar(id.cache.fᵢ₂_cache)
-    interp_eval!(z, 1, id.cache, tval, id.cache.mesh, id.cache.mesh_dt)
+    interp_eval!(z, id.cache, tval, id.cache.mesh, id.cache.mesh_dt)
     return idxs !== nothing ? z[idxs] : z
 end
 
@@ -210,44 +210,3 @@ end
     cache.mesh, u, cache)
 @inline __build_interpolation(cache::FIRKCacheNested, u::AbstractVector) = FIRKNestedInterpolation(
     cache.mesh, u, cache)
-
-"""
-    get_ymid(yᵢ, coeffs, K, h)
-
-Gets the interpolated middle value for a RK method, see bvp5c paper.
-"""
-function get_ymid(yᵢ, coeffs, K, h)
-    res = copy(yᵢ)
-    for i in axes(K, 2)
-        res .+= h .* K[:, i] .* coeffs[i]
-    end
-    return res
-end
-
-"""
-    s_constraints(M, h)
-
-Form the quartic interpolation constraint matrix, see bvp5c paper.
-"""
-function s_constraints(M, h)
-    t = vec(repeat([0.0, 1.0 * h, 0.5 * h, 0.0, 1.0 * h, 0.5 * h], 1, M))
-    A = zeros(6 * M, 6 * M)
-    for i in 1:6
-        row_start = (i - 1) * M + 1
-        if i <= 3
-            for k in 0:(M - 1)
-                for j in 1:6
-                    A[row_start + k, j + k * 6] = t[i + k * 6]^(j - 1)
-                end
-            end
-        else
-            for k in 0:(M - 1)
-                for j in 1:6
-                    A[row_start + k, j + k * 6] = j == 1.0 ? 0.0 :
-                                                  (j - 1) * t[i + k * 6]^(j - 2)
-                end
-            end
-        end
-    end
-    return A
-end
diff --git a/src/solve/firk.jl b/src/solve/firk.jl
index 3cb7851d..47e6d1a7 100644
--- a/src/solve/firk.jl
+++ b/src/solve/firk.jl
@@ -62,27 +62,19 @@ Base.eltype(::FIRKCacheExpand{iip, T}) where {iip, T} = T
 
 function extend_y(y, N::Int, stage::Int)
     y_extended = similar(y.u, (N - 1) * (stage + 1) + 1)
-    y_extended[1] = y.u[1]
-    let ctr1 = 2
-        for i in 2:N
-            for j in 1:(stage + 1)
-                y_extended[(ctr1)] = y.u[i]
-                ctr1 += 1
-            end
-        end
+    for (i, ctr) in enumerate(2:(stage + 1):((N - 1) * (stage + 1) + 1))
+        @views fill!(y_extended[ctr:(ctr + stage)], y.u[i + 1])
     end
+    y_extended[1] = y.u[1]
     return VectorOfArray(y_extended)
 end
 
-function shrink_y(y, N, M, stage)
+function shrink_y(y, N, stage)
     y_shrink = similar(y, N)
-    y_shrink[1] = y[1]
-    let ctr = stage + 2
-        for i in 2:N
-            y_shrink[i] = y[ctr]
-            ctr += (stage + 1)
-        end
+    for (i, ctr) in enumerate((stage + 2):(stage + 1):((N - 1) * (stage + 1) + 1))
+        y_shrink[i + 1] = y[ctr]
     end
+    y_shrink[1] = y[1]
     return y_shrink
 end
 
@@ -180,7 +172,7 @@ function init_nested(prob::BVProblem, alg::AbstractFIRK; dt = 0.0,
     K0 = repeat(avg_u0, 1, stage) # Somewhat arbitrary initialization of K
 
     nestprob_p = zeros(T, M + 2)
-    nest_tol = (alg.nest_tol > eps(T) ? alg.nest_tol : nothing)
+    nest_tol = alg.nest_tol
 
     if iip
         nestprob = NonlinearProblem(
@@ -310,15 +302,11 @@ function SciMLBase.solve!(cache::FIRKCacheExpand)
         end
     end
 
-    # sync y and y0 caches
-    for i in axes(cache.y₀, 1)
-        cache.y[i].du .= cache.y₀.u[i]
-    end
-
-    u = shrink_y([reshape(y, cache.in_size) for y in cache.y₀],
-        length(cache.mesh), cache.M, alg_stage(cache.alg))
+    u = shrink_y(
+        [reshape(y, cache.in_size) for y in cache.y₀], length(cache.mesh), cache.stage)
 
     interpolation = __build_interpolation(cache, u)
+
     odesol = DiffEqBase.build_solution(
         cache.prob, cache.alg, cache.mesh, u; interp = interpolation, retcode = info)
     return __build_solution(cache.prob, odesol, sol_nlprob)
@@ -377,13 +365,12 @@ function __construct_nlproblem(
         cache::FIRKCacheExpand{iip}, y, loss_bc::BC, loss_collocation::C,
         loss::LF, ::StandardBVProblem) where {iip, BC, C, LF}
     (; nlsolve, jac_alg) = cache.alg
+    (; stage) = cache
     N = length(cache.mesh)
-    TU, ITU = constructRK(cache.alg, eltype(y))
-    (; s) = TU
 
     resid_bc = cache.bcresid_prototype
     L = length(resid_bc)
-    resid_collocation = __similar(y, cache.M * (N - 1) * (TU.s + 1))
+    resid_collocation = __similar(y, cache.M * (N - 1) * (stage + 1))
 
     loss_bcₚ = (iip ? __Fix3 : Base.Fix2)(loss_bc, cache.p)
     loss_collocationₚ = (iip ? __Fix3 : Base.Fix2)(loss_collocation, cache.p)
@@ -403,10 +390,10 @@ function __construct_nlproblem(
                 sparse(colored_matrix.M), colored_matrix.row_colorvec,
                 colored_matrix.col_colorvec))
         else
-            block_size = cache.M * (s + 2)
+            block_size = cache.M * (stage + 2)
             J_full_band = BandedMatrix(
-                Ones{eltype(y)}(
-                    L + cache.M * (s + 1) * (N - 1), cache.M * (s + 1) * (N - 1) + cache.M),
+                Ones{eltype(y)}(L + cache.M * (stage + 1) * (N - 1),
+                    cache.M * (stage + 1) * (N - 1) + cache.M),
                 (block_size, block_size))
             __sparsity_detection_alg(__generate_sparse_jacobian_prototype(
                 cache, cache.problem_type, y, y, cache.M, N))
@@ -451,13 +438,12 @@ function __construct_nlproblem(
         cache::FIRKCacheExpand{iip}, y, loss_bc::BC, loss_collocation::C,
         loss::LF, ::TwoPointBVProblem) where {iip, BC, C, LF}
     (; nlsolve, jac_alg) = cache.alg
+    (; stage) = cache
     N = length(cache.mesh)
-    TU, ITU = constructRK(cache.alg, eltype(y))
-    (; s) = TU
 
     lossₚ = iip ? ((du, u) -> loss(du, u, cache.p)) : (u -> loss(u, cache.p))
 
-    resid_collocation = __similar(y, cache.M * (N - 1) * (TU.s + 1))
+    resid_collocation = __similar(y, cache.M * (N - 1) * (stage + 1))
 
     resid = vcat(
         @view(cache.bcresid_prototype[1:prod(cache.resid_size[1])]), resid_collocation,
@@ -465,10 +451,10 @@ function __construct_nlproblem(
     L = length(cache.bcresid_prototype)
 
     sd = if jac_alg.nonbc_diffmode isa AutoSparse
-        block_size = cache.M * (s + 2)
+        block_size = cache.M * (stage + 2)
         J_full_band = BandedMatrix(
-            Ones{eltype(y)}(
-                L + cache.M * (s + 1) * (N - 1), cache.M * (s + 1) * (N - 1) + cache.M),
+            Ones{eltype(y)}(L + cache.M * (stage + 1) * (N - 1),
+                cache.M * (stage + 1) * (N - 1) + cache.M),
             (block_size, block_size))
         __sparsity_detection_alg(__generate_sparse_jacobian_prototype(
             cache, cache.problem_type,
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index e84a001f..15291dc4 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -111,6 +111,8 @@ end
 
 """
     __expand_cache!(cache::MIRKCache)
+    __expand_cache!(cache::FIRKCacheNested)
+    __expand_cache!(cache::MIRKCacheExpand)
 
 After redistributing or halving the mesh, this function expands the required vectors to
 match the length of the new mesh.
diff --git a/src/sparse_jacobians.jl b/src/sparse_jacobians.jl
index 016b3790..7f8b0d88 100644
--- a/src/sparse_jacobians.jl
+++ b/src/sparse_jacobians.jl
@@ -75,23 +75,23 @@ end
 
 function __generate_sparse_jacobian_prototype(
         cache::FIRKCacheExpand, ::StandardBVProblem, ya, yb, M, N)
-    (; TU) = cache
-    (; s) = TU
+    (; stage) = cache
+
     # Get number of nonzeros
-    block_size = M * (s + 1) * M * (s + 2)
+    block_size = M * (stage + 1) * M * (stage + 2)
     l = (N - 1) * block_size
     # Initialize Is and Js
     Is = Vector{Int}(undef, l)
     Js = Vector{Int}(undef, l)
 
     # Fill Is and Js
-    row_size = M * (s + 1) * (N - 1)
+    row_size = M * (stage + 1) * (N - 1)
 
     idx = 1
     i_start = 0
     j_start = 0
-    i_step = M * (s + 1)
-    j_step = M * (s + 2)
+    i_step = M * (stage + 1)
+    j_step = M * (stage + 2)
     for k in 1:(N - 1)
         for i in 1:i_step
             for j in 1:j_step
@@ -109,11 +109,11 @@ function __generate_sparse_jacobian_prototype(
 
     col_colorvec = Vector{Int}(undef, size(J_c, 2))
     for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, (2 * M * (s + 1)) + M)
+        col_colorvec[i] = mod1(i, (2 * M * (stage + 1)) + M)
     end
     row_colorvec = Vector{Int}(undef, size(J_c, 1))
     for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, (2 * M * (s + 1)) + M)
+        row_colorvec[i] = mod1(i, (2 * M * (stage + 1)) + M)
     end
 
     return ColoredMatrix(J_c, row_colorvec, col_colorvec)
@@ -121,22 +121,22 @@ end
 
 function __generate_sparse_jacobian_prototype(
         cache::FIRKCacheExpand, ::TwoPointBVProblem, ya, yb, M, N)
-    (; TU) = cache
-    (; s) = TU
+    (; stage) = cache
+
     # Get number of nonzeros
-    block_size = M * (s + 1) * M * (s + 2)
-    l = (N - 1) * block_size + M * (s + 2) * (length(ya) + length(yb))
+    block_size = M * (stage + 1) * M * (stage + 2)
+    l = (N - 1) * block_size + M * (stage + 2) * (length(ya) + length(yb))
     # Initialize Is and Js
     Is = Vector{Int}(undef, l)
     Js = Vector{Int}(undef, l)
 
     # Fill Is and Js
-    row_size = M * (s + 1) * (N - 1)
+    row_size = M * (stage + 1) * (N - 1)
     idx = 1
     i_start = 0
     j_start = 0
-    i_step = M * (s + 1)
-    j_step = M * (s + 2)
+    i_step = M * (stage + 1)
+    j_step = M * (stage + 2)
 
     # Fill first rows
     for i in 1:length(ya)
diff --git a/test/firk/expanded/nlls_tests.jl b/test/firk/expanded/nlls_tests.jl
index bca1e822..5206d933 100644
--- a/test/firk/expanded/nlls_tests.jl
+++ b/test/firk/expanded/nlls_tests.jl
@@ -4,7 +4,7 @@ using BoundaryValueDiffEq, LinearAlgebra
 
 SOLVERS = [firk(; nlsolve)
            for firk in (RadauIIa5, LobattoIIIa4, LobattoIIIb4, LobattoIIIc4),
-nlsolve in (LevenbergMarquardt(), GaussNewton(), TrustRegion())]
+nlsolve in (LevenbergMarquardt(), GaussNewton())]#, TrustRegion())]
 
 SOLVERS_NAMES = ["$solver with $nlsolve"
                  for solver in ["RadauIIa5", "LobattoIIIa4", "LobattoIIIb4", "LobattoIIIc4"],