diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..6a1d44a
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,40 @@
+steps:
+  - label: "GPU integration with julia v1.6"
+    plugins:
+      - JuliaCI/julia#v1:
+          # Omit the default "registries" directory, so it is not persisted from execution to execution
+          # Taken from https://github.com/JuliaLang/julia/blob/v1.7.2/.buildkite/pipelines/main/platforms/package_linux.yml#L11-L12
+          persist_depot_dirs: packages,artifacts,compiled
+          version: "1.6"  # quoted so YAML keeps it a string instead of a float
+      - JuliaCI/julia-test#v1: ~  # "~" is YAML null: no plugin options are given
+    agents:
+      queue: "juliagpu"
+      cuda: "*"  # quoted: a bare * would be parsed as a YAML alias
+    timeout_in_minutes: 60
+
+  - label: "GPU integration with julia v1"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"  # quoted so YAML keeps it a string instead of an integer
+      - JuliaCI/julia-test#v1: ~  # "~" is YAML null: no plugin options are given
+      - JuliaCI/julia-coverage#v1:
+          codecov: true  # presumably uploads coverage to Codecov; confirm against the plugin docs
+    agents:
+      queue: "juliagpu"
+      cuda: "*"  # quoted: a bare * would be parsed as a YAML alias
+    env:
+      JULIA_CUDA_USE_BINARYBUILDER: "true"  # quoted: passed as the string "true", not a YAML boolean
+    timeout_in_minutes: 60
+  # The nightly step below is currently disabled (commented out).
+  # - label: "GPU nightly"
+  #   plugins:
+  #     - JuliaCI/julia#v1:
+  #         version: "nightly"
+  #     - JuliaCI/julia-test#v1: ~
+  #   agents:
+  #     queue: "juliagpu"
+  #     cuda: "*"
+  #   timeout_in_minutes: 60
+
+env:  # pipeline-level env; NOTE(review): the value below looks like ciphertext (the part after ";" is base64 that starts with "Salted__"), not a plaintext token - confirm how the agent decrypts it
+  SECRET_CODECOV_TOKEN: "fAV/xwuaV0l5oaIYSAXRQIor8h7yHdlrpLUZFwNVnchn7rDk9UZoz0oORG9vlKLc1GK2HhaPRAy+fTkJ3GM/8Y0phHh3ANK8f5UsGm2DUTNsnf6u9izgnwnoRTcsWu+vSO0fyYrxBvBCoJwljL+yZbDFz3oE16DP7HPIzxfQagm+o/kMEszVuoUXhuLXXH0LxT6pXl214qjqs04HfMRmKIIiup48NB6fBLdhGlQz64MdMNHBfgDa/fafB7eNvn0X6pEOxysoy6bDQLUhKelOXgcDx1UsTo34Yiqr+QeJPAeKcO//PWurwQhPoUoHfLad2da9DN4uQk4YQLqAlcIuAA==;U2FsdGVkX1+mRXF2c9soCXT7DYymY3msM+vrpaifiTp8xA+gMpbQ0G63WY3tJ+6V/fJcVnxYoKZVXbjcg8fl4Q=="
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index f649c74..949f5e7 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -49,3 +49,29 @@ jobs:
       - uses: codecov/codecov-action@v2
         with:
           files: lcov.info
+
+  docs:  # installs the package into the docs environment, runs the doctests, then builds the docs
+    name: Documentation
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: '1.6'  # quoted so YAML keeps it a string instead of a float
+      - run: |
+          julia --project=docs -e '
+            using Pkg
+            Pkg.develop(PackageSpec(path=pwd()))
+            Pkg.instantiate()'
+      - run: |
+          julia --project=docs -e '
+            using OneHotArrays
+            # Doctests run in this step; docs/make.jl builds with doctest = false.
+            using Documenter
+            using Documenter: doctest
+            DocMeta.setdocmeta!(OneHotArrays, :DocTestSetup, :(using OneHotArrays); recursive=true)
+            doctest(OneHotArrays)'
+      - run: julia --project=docs docs/make.jl
+        env:  # handed to docs/make.jl; presumably used by its deploydocs call to authenticate - confirm
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 0000000..3a52a5d
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,5 @@
+[deps]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+# OneHotArrays is not listed here; the CI docs job adds it with Pkg.develop(PackageSpec(path=pwd())).
+[compat]
+Documenter = "0.27"
diff --git a/docs/make.jl b/docs/make.jl
new file mode 100644
index 0000000..79336a2
--- /dev/null
+++ b/docs/make.jl
@@ -0,0 +1,10 @@
+using Documenter, OneHotArrays
+# Register `using OneHotArrays` as the doctest setup code for the package (recursive = true).
+DocMeta.setdocmeta!(OneHotArrays, :DocTestSetup, :(using OneHotArrays); recursive = true)
+makedocs(sitename = "OneHotArrays", doctest = false,  # doctests are not run during this build
+         pages = ["Overview" => "index.md",
+                  "Reference" => "reference.md"])
+# Deploy the generated site; push_preview = true presumably also publishes previews for pull requests - confirm.
+deploydocs(repo = "github.com/FluxML/OneHotArrays.jl.git",
+           target = "build",
+           push_preview = true)
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 0000000..627220f
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,39 @@
+# OneHotArrays.jl
+
+[![CI](https://github.com/FluxML/OneHotArrays.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/FluxML/OneHotArrays.jl/actions/workflows/CI.yml)
+
+Memory-efficient one-hot array encodings (primarily for use in machine-learning contexts like Flux.jl).
+
+## Usage
+
+One-hot arrays are boolean arrays where only a single element in the first dimension is `true` (i.e. "hot"). OneHotArrays.jl stores such arrays efficiently by encoding an N-dimensional array of booleans as an (N - 1)-dimensional array of integers. For example, the one-hot vector below only uses a single `UInt32` for storage.
+
+```julia
+julia> β = onehot(:b, (:a, :b, :c))
+3-element OneHotVector(::UInt32) with eltype Bool:
+ ⋅
+ 1
+ ⋅
+```
+
+As seen above, the one-hot encoding can be useful for representing labeled data. The label `:b` is encoded into a 3-element vector where the "hot" element indicates the label from the set `(:a, :b, :c)`.
+
+We can also encode a batch of one-hot vectors or reverse the encoding.
+
+```julia
+julia> oh = onehotbatch("abracadabra", 'a':'e', 'e')
+5×11 OneHotMatrix(::Vector{UInt32}) with eltype Bool:
+ 1  ⋅  ⋅  1  ⋅  1  ⋅  1  ⋅  ⋅  1
+ ⋅  1  ⋅  ⋅  ⋅  ⋅  ⋅  ⋅  1  ⋅  ⋅
+ ⋅  ⋅  ⋅  ⋅  1  ⋅  ⋅  ⋅  ⋅  ⋅  ⋅
+ ⋅  ⋅  ⋅  ⋅  ⋅  ⋅  1  ⋅  ⋅  ⋅  ⋅
+ ⋅  ⋅  1  ⋅  ⋅  ⋅  ⋅  ⋅  ⋅  1  ⋅
+
+julia> onecold(β, (:a, :b, :c))
+:b
+
+julia> onecold([0.3, 0.2, 0.5], (:a, :b, :c))
+:c
+```
+
+In addition to functions for encoding and decoding data as one-hot, this package provides numerous "fast-paths" for linear algebraic operations with one-hot arrays. For example, multiplying a matrix by a one-hot vector triggers an indexing operation instead of a matrix multiplication.
diff --git a/docs/src/reference.md b/docs/src/reference.md
new file mode 100644
index 0000000..9423e5a
--- /dev/null
+++ b/docs/src/reference.md
@@ -0,0 +1,6 @@
+# Reference
+
+```@autodocs
+Modules = [OneHotArrays]
+Order   = [:function, :type]
+```
diff --git a/src/OneHotArrays.jl b/src/OneHotArrays.jl
index c14387e..ef6c886 100644
--- a/src/OneHotArrays.jl
+++ b/src/OneHotArrays.jl
@@ -7,8 +7,8 @@ using LinearAlgebra
 using MLUtils 
 using NNlib
 
-export onehot, onehotbatch, onecold, OneHotArray, 
-  OneHotVector, OneHotMatrix, OneHotLike
+export onehot, onehotbatch, onecold,
+       OneHotArray, OneHotVector, OneHotMatrix, OneHotLike
 
 include("array.jl")
 include("onehot.jl")
diff --git a/src/onehot.jl b/src/onehot.jl
index 4ff19e8..6231f3c 100644
--- a/src/onehot.jl
+++ b/src/onehot.jl
@@ -12,13 +12,13 @@ and [`onecold`](@ref) to reverse either of these, as well as to generalise `argm
 
 # Examples
 ```jldoctest
-julia> β = Flux.onehot(:b, (:a, :b, :c))
+julia> β = onehot(:b, (:a, :b, :c))
 3-element OneHotVector(::UInt32) with eltype Bool:
  ⋅
  1
  ⋅
 
-julia> αβγ = (Flux.onehot(0, 0:2), β, Flux.onehot(:z, [:a, :b, :c], :c))  # uses default
+julia> αβγ = (onehot(0, 0:2), β, onehot(:z, [:a, :b, :c], :c))  # uses default
 (Bool[1, 0, 0], Bool[0, 1, 0], Bool[0, 0, 1])
 
 julia> hcat(αβγ...)  # preserves sparsity
@@ -66,7 +66,7 @@ for `labels` will often speed up construction, certainly for less than 32 classe
 
 # Examples
 ```jldoctest
-julia> oh = Flux.onehotbatch("abracadabra", 'a':'e', 'e')
+julia> oh = onehotbatch("abracadabra", 'a':'e', 'e')
 5×11 OneHotMatrix(::Vector{UInt32}) with eltype Bool:
  1  ⋅  ⋅  1  ⋅  1  ⋅  1  ⋅  ⋅  1
  ⋅  1  ⋅  ⋅  ⋅  ⋅  ⋅  ⋅  1  ⋅  ⋅
@@ -112,17 +112,17 @@ the same operation as `argmax(y, dims=1)` but sometimes a different return type.
 
 # Examples
 ```jldoctest
-julia> Flux.onecold([false, true, false])
+julia> onecold([false, true, false])
 2
 
-julia> Flux.onecold([0.3, 0.2, 0.5], (:a, :b, :c))
+julia> onecold([0.3, 0.2, 0.5], (:a, :b, :c))
 :c
 
-julia> Flux.onecold([ 1  0  0  1  0  1  0  1  0  0  1
-                      0  1  0  0  0  0  0  0  1  0  0
-                      0  0  0  0  1  0  0  0  0  0  0
-                      0  0  0  0  0  0  1  0  0  0  0
-                      0  0  1  0  0  0  0  0  0  1  0 ], 'a':'e') |> String
+julia> onecold([ 1  0  0  1  0  1  0  1  0  0  1
+                 0  1  0  0  0  0  0  0  1  0  0
+                 0  0  0  0  1  0  0  0  0  0  0
+                 0  0  0  0  0  0  1  0  0  0  0
+                 0  0  1  0  0  0  0  0  0  1  0 ], 'a':'e') |> String
 "abeacadabea"
 ```
 """
diff --git a/test/gpu.jl b/test/gpu.jl
index 91fa2a8..13c208c 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -6,7 +6,7 @@
   cx = cu(x)
   @test cx isa CuArray
 
-  @test_broken onecold(cu([1.0, 2.0, 3.0])) == 3  # scalar indexing error?
+  @test_skip onecold(cu([1.0, 2.0, 3.0])) == 3  # passes with CuArray with Julia 1.6, but fails with JLArray
 
   x = onehotbatch([1, 2, 3], 1:3)
   cx = cu(x)