faster sprand, new randsubseq (fixes JuliaLang#6714 and JuliaLang#6708)

stevengj · May 3, 2014 · 79e7f20 · 79e7f20
1 parent 27ab7af
commit 79e7f20
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 43 deletions.
diff --git a/base/abstractarray.jl b/base/abstractarray.jl
@@ -1314,3 +1314,43 @@ push!(A, a, b, c...) = push!(push!(A, a, b), c...)
 unshift!(A) = A
 unshift!(A, a, b) = unshift!(unshift!(A, b), a)
 unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)
+
+# Fill S (resized as needed) with a random subsequence of A, where
+# each element of A is included in S with independent probability p.
+# (Note that this is different from the problem of finding a random
+#  size-m subset of A where m is fixed!)
+function randsubseq!(S::AbstractArray, A::AbstractArray, p::Real)
+    0 <= p <= 1 || throw(ArgumentError("probability $p not in [0,1]"))
+    n = length(A)
+    p == 1 && return copy!(resize!(S, n), A)
+    empty!(S)
+    p == 0 && return S
+    sizehint(S, int(p * length(A)))
+    if p > 0.15 # empirical threshold for trivial O(n) algorithm to be better
+        for i = 1:n
+            rand() <= p && push!(S, A[i])
+        end
+    else
+        # Skip through A, in order, from each element i to the next element i+s
+        # included in S. The probability that the next included element is 
+        # s==k (k > 0) is (1-p)^(k-1) * p, and hence the probability (CDF) that
+        # s is in {1,...,k} is 1-(1-p)^k = F(k).   Thus, we can draw the skip s
+        # from this probability distribution via the discrete inverse-transform
+        # method: s = iceil(F^{-1}(u)) where u = rand(), which is simply
+        # s = iceil(log(rand()) / log1p(-p)).
+        L = 1 / log1p(-p)
+        i = 0
+        while true
+            s = log(rand()) * L # note that rand() < 1, so s > 0
+            s >= n - i && return S # compare before iceil to avoid overflow
+            push!(S, A[i += iceil(s)])
+        end
+        # [This algorithm is similar in spirit to, but much simpler than,
+        #  the one by Vitter for a related problem in "Faster methods for
+        #  random sampling," Comm. ACM Magazine 7, 703-718 (1984).]
+    end
+    return S
+end
+
+randsubseq{T}(A::AbstractArray{T}, p::Real) = randsubseq!(T[], A, p)
+
diff --git a/base/exports.jl b/base/exports.jl
@@ -545,6 +545,8 @@ export
     promote_shape,
     randcycle,
     randperm,
+    randsubseq!,
+    randsubseq,
     range,
     reducedim,
     repmat,

diff --git a/base/sparse/sparsematrix.jl b/base/sparse/sparsematrix.jl
@@ -335,56 +335,45 @@ function findnz{Tv,Ti}(S::SparseMatrixCSC{Tv,Ti})
     return (I, J, V)
 end
 
-truebools(n::Integer) = ones(Bool, n)
-function sprand{T}(m::Integer, n::Integer, density::FloatingPoint, rng::Function,::Type{T}=eltype(rng(1)))
-    0 <= density <= 1 || throw(ArgumentError("density must be between 0 and 1"))
+function sprand{T}(m::Integer, n::Integer, density::FloatingPoint,
+                   rng::Function,::Type{T}=eltype(rng(1)))
+    0 <= density <= 1 || throw(ArgumentError("$density not in [0,1]"))
     N = n*m
     N == 0 && return spzeros(T,m,n)
     N == 1 && return rand() <= density ? sparse(rng(1)) : spzeros(T,1,1)
-    # if density < 0.5, we'll randomly generate the indices to set
-    #        otherwise, we'll randomly generate the indices to skip
-    K = (density > 0.5) ? N*(1-density) : N*density
-    # Use Newton's method to invert the birthday problem
-    l = log(1.0-1.0/N)
-    k = K
-    k = k + ((1-K/N)*exp(-k*l) - 1)/l
-    k = k + ((1-K/N)*exp(-k*l) - 1)/l # for K<N/2, 2 iterations suffice
-    ik = int(k)
-    ind = sort(rand(1:N, ik))
-    uind = Array(Int, 0)   # unique indices
-    sizehint(uind, int(N*density))
-    if density < 0.5
-        if ik == 0
-            return sparse(Int[],Int[],Array(T,0),m,n)
-        end
-        j = ind[1]
-        push!(uind, j)
-        uj = j
-        for i = 2:length(ind)
-            j = ind[i]
-            if j != uj
-                push!(uind, j)
-                uj = j
-            end
-        end
-    else
-        push!(ind, N+1) # sentinel
-        ii = 1
-        for i = 1:N
-            if i != ind[ii]
-                push!(uind, i)
-            else
-                while (i == ind[ii])
-                    ii += 1
-                end
-            end
+
+    I, J = Array(Int, 0), Array(Int, 0) # indices of nonzero elements
+    sizehint(I, int(N*density))
+    sizehint(J, int(N*density))
+
+    # density of nonzero columns:
+    L = log1p(-density)
+    coldensity = -expm1(m*L) # = 1 - (1-density)^m 
+
+    rows = Array(Int, 0)
+    for j in randsubseq(1:n, coldensity)
+        # To get the right statistics, we *must* have a nonempty column j
+        # even if p*m << 1.   To do this, we use an approach similar to
+        # the one in randsubseq to compute the expected first nonzero row k,
+        # except given that at least one is nonzero (via Bayes' rule).
+        k = iceil(log1p(-(1-rand())*coldensity) / L) # 1-rand > 0 => k > 0
+        k = min(m, k) # k <= m in exact arith, but be careful of roundoff
+        randsubseq!(rows, 1:m-k, density)
+        push!(rows, m-k+1)
+        append!(I, rows)
+        nrows = length(rows)
+        Jlen = length(J)
+        resize!(J, Jlen+nrows)
+        for i = Jlen+1:length(J)
+            J[i] = j
         end
     end
-    I, J = ind2sub((m,n), uind)
-    return sparse_IJ_sorted!(I, J, rng(length(uind)), m, n, +)  # it will never need to combine
+    return sparse_IJ_sorted!(I, J, rng(length(I)), m, n, +)  # it will never need to combine
 end
+
 sprand(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,rand,Float64)
 sprandn(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,randn,Float64)
+truebools(n::Integer) = ones(Bool, n)
 sprandbool(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,truebools,Bool)
 
 spones{T}(S::SparseMatrixCSC{T}) =

diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -3814,6 +3814,19 @@ Indexing, Assignment, and Concatenation
 
    Throw an error if the specified indexes are not in bounds for the given array.
 
+.. function:: randsubseq(A, p) -> Vector
+
+   Return a vector consisting of a random subsequence of the given array ``A``,
+   where each element of ``A`` is included (in order) with independent
+   probability ``p``.   (Complexity is linear in ``p*length(A)``, so this
+   function is efficient even if ``p`` is small and ``A`` is large.)
+
+.. function:: randsubseq!(S, A, p)
+
+   Like ``randsubseq``, but the results are stored in ``S`` (which is
+   resized as needed).
+
+
 Array functions
 ~~~~~~~~~~~~~~~
 

diff --git a/doc/stdlib/sparse.rst b/doc/stdlib/sparse.rst
@@ -57,7 +57,7 @@ Sparse matrices support much of the same set of operations as dense matrices. Th
 
 .. function:: sprand(m,n,density[,rng])
 
-   Create a random sparse matrix with the specified density. Nonzeros are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified.
+   Create a random sparse matrix, in which the probability of any element being nonzero is given by ``density``. Nonzeros are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified.
 
 .. function:: sprandn(m,n,density)