diff --git a/src/TransformUtils.jl b/src/TransformUtils.jl
index 5b9bc5e..c596b81 100644
--- a/src/TransformUtils.jl
+++ b/src/TransformUtils.jl
@@ -46,6 +46,7 @@ export
   ⊖,
   ⊕,
   \,
+  doinversetransform!,
 
   # type aliases
   FloatInt,
@@ -127,6 +128,7 @@ mutable struct Euler
     R::Float64
     P::Float64
     Y::Float64
+    # these will be inconsistent with R, P, Y -- not for public consumption...
     fastconvert::Quaternion
     Euler() = new()
     Euler(s::FloatInt) = new(0.0,0.0,0.0, Quaternion(0))
@@ -149,25 +151,42 @@ mutable struct SE3
   SE3(M::Array{Float64,2}) = new(SO3(M[1:3,1:3]), vec(M[1:3,4]) )
 end
 
+function fast_norm_TU(u)
+  # dest[1] = ...
+  n = length(u)
+  T = eltype(u)
+  s = zero(T)
+  @fastmath @inbounds @simd for i in 1:n
+      s += u[i]^2
+  end
+  @fastmath @inbounds return sqrt(s)
+end
+
 function normalize!(q::Quaternion, tol=1e-6)
-  @inbounds begin
-    mag2 = sum(q.v[1:3].^2) + q.s^2
-    if abs(mag2 - 1.0) > tol
-        mag = sqrt(mag2)
-        q.v[1:3] ./= mag
-        q.s /= mag # q.s /
-    end
+  # Quaternion(s, [x,y,z])
+  s = q.s^2
+  @fastmath @inbounds @simd for i in 1:3
+      s += q.v[i]^2
   end
+  @fastmath mag1 = 1.0/sqrt(s)
+  # @fastmath @inbounds for i in 1:3
+  #   q.v[i] *= mag1
+  # end
+  @fastmath @inbounds q.v .*= mag1
+  @fastmath q.s *= mag1
   nothing
 end
+
 function normalize(q::Quaternion, tol=0.00001)
   qq = deepcopy(q)
   normalize!(qq,tol)
   return qq
 end
 
+
+
 function normalize(v::Array{Float64,1})
-  return v / norm(v)
+  return v / fast_norm_TU(v)
 end
 
 
@@ -263,6 +282,15 @@ oplus(xi::SE3, xj::SE3) = xi*xj
 
 \(qi::Quaternion,qj::Quaternion) = q_conj(qi) * qj
 
+function doinversetransform!(dst::Vector{Float64}, aTb::SE3, src::Vector{Float64})::Void
+  At_mul_B!(dst, aTb.R.R, src)
+  @fastmath @inbounds for i in 1:3, j in 1:3
+    dst[i] -= aTb.R.R[j,i]*aTb.t[i]
+  end
+  nothing
+end
+# bR
+
 # comparison functions
 
 compare(a::SO3, b::SO3; tol::Float64=1e-14) = norm((a*transpose(b)).R-eye(3)) < tol
@@ -318,17 +346,18 @@ function convert!(R::SO3, q::Quaternion)
       q.s = -q.s
       q.v[1:3] = -q.v[1:3]
     end
-    nrm = sqrt(q.s^2+sum(q.v[1:3].^2))
-    if (nrm < 0.999)
-      println("q2C -- not a unit quaternion nrm = $(nrm)")
-      R = eye(3)
-    else
-      nrm = 1.0/nrm
-      q.s *= nrm
-      q.v[1:3] .*= nrm
-      # x = x*nrm
-      # y = y*nrm
-      # z = z*nrm
+    # nrm = sqrt(q.s^2+sum(q.v[1:3].^2))
+    # if (nrm < 0.999)
+    #   println("q2C -- not a unit quaternion nrm = $(nrm)")
+    #   R = eye(3)
+    # else
+      normalize!(q)
+      # nrm = 1.0/nrm
+      # q.s *= nrm
+      # q.v .*= nrm
+      # # x = x*nrm
+      # # y = y*nrm
+      # # z = z*nrm
       w2, x2, y2, z2 = q.s*q.s, q.v[1]*q.v[1], q.v[2]*q.v[2], q.v[3]*q.v[3]
       # y2 = y*y
       # z2 = z*z
@@ -350,7 +379,7 @@ function convert!(R::SO3, q::Quaternion)
       R.R[3,1] = xz-wy
       R.R[3,2] = yz+wx
       R.R[3,3] = w2-x2-y2+z2
-    end
+    # end
   end
   # return SO3(R)
   nothing
@@ -454,32 +483,25 @@ function convert(::Type{Euler}, R::SO3)
   convert(Euler, convert(Quaternion, R))
 end
 
-function convert!(q::Quaternion, E::Euler)
-  # Using fixed frame rotation scheme, as used in MIT libbot
-  # q = zeros(4)
-
-  halfroll = E.R/2.0;
-  halfpitch = E.P/2.0;
-  halfyaw = E.Y/2.0;
-
-  sin_r2 = sin(halfroll);
-  sin_p2 = sin(halfpitch);
-  sin_y2 = sin(halfyaw);
+function convert!(q::Quaternion, E::Euler)::Void
 
-  cos_r2 = cos(halfroll);
-  cos_p2 = cos(halfpitch);
-  cos_y2 = cos(halfyaw);
+  @fastmath halfroll = 0.5*E.R;
+  @fastmath halfpitch = 0.5*E.P;
+  @fastmath halfyaw = 0.5*E.Y;
+  @fastmath sin_r2 = sin(halfroll);
+  @fastmath sin_p2 = sin(halfpitch);
+  @fastmath sin_y2 = sin(halfyaw);
+  @fastmath cos_r2 = cos(halfroll);
+  @fastmath cos_p2 = cos(halfpitch);
+  @fastmath cos_y2 = cos(halfyaw);
 
-
-  @inbounds begin
-    q.s = cos_r2 * cos_p2 * cos_y2 + sin_r2 * sin_p2 * sin_y2;
-    q.v[1] = sin_r2 * cos_p2 * cos_y2 - cos_r2 * sin_p2 * sin_y2;
-    q.v[2] = cos_r2 * sin_p2 * cos_y2 + sin_r2 * cos_p2 * sin_y2;
-    q.v[3] = cos_r2 * cos_p2 * sin_y2 - sin_r2 * sin_p2 * cos_y2;
-  end
+  @fastmath q.s = cos_r2 * cos_p2 * cos_y2 + sin_r2 * sin_p2 * sin_y2;
+  @fastmath @inbounds q.v[1] = sin_r2 * cos_p2 * cos_y2 - cos_r2 * sin_p2 * sin_y2;
+  @fastmath @inbounds q.v[2] = cos_r2 * sin_p2 * cos_y2 + sin_r2 * cos_p2 * sin_y2;
+  @fastmath @inbounds q.v[3] = cos_r2 * cos_p2 * sin_y2 - sin_r2 * sin_p2 * cos_y2;
 
   # Enforce positive scalar quaternions following conversion from Euler angles
-  if (q.s<0.0)
+  @inbounds if (q.s<0.0)
     q.s = -q.s
     q.v[1:3] = -q.v[1:3];
   end
@@ -488,6 +510,41 @@ function convert!(q::Quaternion, E::Euler)
   # return Quaternion(q[1],q[2:4])
   nothing
 end
+  # mutable struct HalfAngles
+  #   halfang::Vector{Float64}
+  #   halfsin::Vector{Float64}
+  #   halfcos::Vector{Float64}
+  #   HalfAngles() = new()
+  #   HalfAngles(::Int) = new(zeros(3), zeros(3), zeros(3))
+  # end
+  # unsafe_reuseE2Q::Vector{HalfAngles}
+  # thr_reuse = E.unsafe_reuseE2Q[Threads.threadid()]
+  # thr_reuse.halfang[1] = 0.5*E.R;
+  # thr_reuse.halfang[2] = 0.5*E.P;
+  # thr_reuse.halfang[3] = 0.5*E.Y;
+  # thr_reuse.halfang[1] = 0.5*E.R;
+  # thr_reuse.halfang[2] = 0.5*E.P;
+  # thr_reuse.halfang[3] = 0.5*E.Y;
+  # thr_reuse.halfsin[1] = sin(thr_reuse.halfang[1]);
+  # thr_reuse.halfsin[2] = sin(thr_reuse.halfang[2]);
+  # thr_reuse.halfsin[3] = sin(thr_reuse.halfang[3]);
+  # thr_reuse.halfcos[1] = cos(thr_reuse.halfang[1]);
+  # thr_reuse.halfcos[2] = cos(thr_reuse.halfang[2]);
+  # thr_reuse.halfcos[3] = cos(thr_reuse.halfang[3]);
+  # @fastmath q.s = thr_reuse.halfcos[1] * thr_reuse.halfcos[2] * thr_reuse.halfcos[3] + thr_reuse.halfsin[1] * thr_reuse.halfsin[2] * thr_reuse.halfsin[3];
+  # @fastmath @inbounds q.v[1] = thr_reuse.halfsin[1] * thr_reuse.halfcos[2] * thr_reuse.halfcos[3] - thr_reuse.halfcos[1] * thr_reuse.halfsin[2] * thr_reuse.halfsin[3];
+  # @fastmath @inbounds q.v[2] = thr_reuse.halfcos[1] * thr_reuse.halfsin[2] * thr_reuse.halfcos[3] + thr_reuse.halfsin[1] * thr_reuse.halfcos[2] * thr_reuse.halfsin[3];
+  # @fastmath @inbounds q.v[3] = thr_reuse.halfcos[1] * thr_reuse.halfcos[2] * thr_reuse.halfsin[3] - thr_reuse.halfsin[1] * thr_reuse.halfsin[2] * thr_reuse.halfcos[3];
+  # @fastmath q.s = thr_reuse.halfcos[1] * thr_reuse.halfcos[2] * thr_reuse.halfcos[3]
+  # @fastmath @inbounds q.v[1] = thr_reuse.halfsin[1] * thr_reuse.halfcos[2] * thr_reuse.halfcos[3]
+  # @fastmath @inbounds q.v[2] = thr_reuse.halfcos[1] * thr_reuse.halfsin[2] * thr_reuse.halfcos[3]
+  # @fastmath @inbounds q.v[3] = thr_reuse.halfcos[1] * thr_reuse.halfcos[2] * thr_reuse.halfsin[3]
+  # @fastmath q.s += thr_reuse.halfsin[1] * thr_reuse.halfsin[2] * thr_reuse.halfsin[3];
+  # @fastmath @inbounds q.v[1] -= thr_reuse.halfcos[1] * thr_reuse.halfsin[2] * thr_reuse.halfsin[3];
+  # @fastmath @inbounds q.v[2] += thr_reuse.halfsin[1] * thr_reuse.halfcos[2] * thr_reuse.halfsin[3];
+  # @fastmath @inbounds q.v[3] -= thr_reuse.halfsin[1] * thr_reuse.halfsin[2] * thr_reuse.halfcos[3];
+
+
 function convert(::Type{Quaternion}, E::Euler)
   q = Quaternion(0)
   convert!(q, E)
diff --git a/test/runtests.jl b/test/runtests.jl
index dafcb4c..c74a1fa 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -119,6 +119,7 @@ end
   @test abs(ce.R - pi/4) < 1e-8
 end
 
+include("testEfficientSE3.jl")
 
 
 
diff --git a/test/testEfficientSE3.jl b/test/testEfficientSE3.jl
new file mode 100644
index 0000000..2d2310b
--- /dev/null
+++ b/test/testEfficientSE3.jl
@@ -0,0 +1,23 @@
+using TransformUtils
+using Base: Test
+
+
+@testset "test in-place and efficient inverse SE3..." begin
+
+dest = zeros(3)
+src = [1.0;0;0]
+wTb = SE3([1.0;0;0],SO3(eye(3)))
+
+doinversetransform!(dest, wTb, src)
+
+@test norm(dest[1:3]) < 1e-10
+
+
+# using BenchmarkTools
+# @btime for i in 1:1000 doinversetransform!(dest, wTb, src); end
+# Profile.clear()
+# @profiler for i in 1:1000000 doinversetransform!(dest, wTb, src); end
+
+
+
+end
diff --git a/test/testInPlaceConvert.jl b/test/testInPlaceConvert.jl
new file mode 100644
index 0000000..8c2f1b4
--- /dev/null
+++ b/test/testInPlaceConvert.jl
@@ -0,0 +1,115 @@
+# profile/test conversion utils
+
+using TransformUtils
+using Base.Test
+using BenchmarkTools, ProfileView
+
+R = SO3(0)
+E = Euler(0)
+
+RR = convert(SO3, so3(randn(3)))
+E = convert(Euler, RR)
+
+# pvt: 371.464 ms (5000000 allocations: 534.06 MiB)
+@btime for i in 1:10^6 convert!(R, E); end
+@profiler for i in 1:10^6 convert!(R, E); end
+
+
+
+
+
+
+T1 = SE3(zeros(3),R)
+T2 = SE3(0)
+
+# pvt: 520.225 ns (20 allocations: 1.39 KiB - norm()
+@btime T3 = T1 \ T2
+
+# pvt: 519.044 μs (20000 allocations: 1.36 MiB)
+@btime for i in 1:10^3 T3 = T1 \ T2; end
+using Profile
+Profile.clear()
+@profile for i in 1:10^3 T3 = T1 \ T2; end
+
+Juno.profiler()
+Juno.profiletree()
+
+
+
+
+@testset "From: quaternion" begin
+@test
+end
+
+
+
+function fast_norm_sonar(u::::Vector{Float64})
+  s = 0.0
+  @fastmath @inbounds @simd for i in 1:n
+      s += u[i]^2
+  end
+  @fastmath @inbounds return sqrt(s/n)
+end
+
+
+
+
+
+
+import Base: normalize!
+
+function normalize!(q::Quaternion, tol=1e-6)
+
+  s = q.s^2
+  @fastmath @inbounds @simd for i in 1:3
+      s += q.v[i]^2
+  end
+  @fastmath mag = sqrt(s)
+  @fastmath q.v ./= mag
+  @fastmath q.s /= mag
+  nothing
+end
+
+
+
+@code_warntype normalize!(q)
+
+@code_llvm normalize!(q)
+
+# code_native(normalize!, (Quaternion,))
+@code_native normalize!(q)
+
+
+@btime normalize!(q) # 17ns
+
+
+@btime for i in 1:1000 normalize!(q); end
+
+@time normalize!(q)
+
+
+
+
+
+
+q = convert(Quaternion, so3(randn(3)))
+E = convert(Euler, convert(SO3, so3(randn(3))))
+
+
+
+@btime convert!(q, E)
+
+#
+Profile.clear()
+@profiler convert!(q, E)
+
+
+Profile.clear()
+@profiler for i in 1:10^6 convert!(q, E); end
+
+
+
+
+@code_warntype convert!(q, E)
+
+# julia --track-allocation=user testInPlaceConvert.jl