diff --git a/Python/rerf/cy_usporf.pyx b/Python/rerf/cy_usporf.pyx deleted file mode 100644 index 8c75f3e4..00000000 --- a/Python/rerf/cy_usporf.pyx +++ /dev/null @@ -1,237 +0,0 @@ -""" -Usporf's utilities functions -""" -import random as rn -import cython -import numpy as np - -from libc.stdlib cimport rand, RAND_MAX -from libc.math cimport log as ln # base e -from libc.math cimport M_PI -from sklearn.tree._utils cimport log # base 2 -cimport numpy as np - -np.import_array() - -DTYPE = np.int -ctypedef np.int_t DTYPE_t -FTYPE = np.float -ctypedef np.float_t FTYPE_t -cdef double INFINITY = np.inf -DEF CUTOFF = 17 -cdef double Euler_const = 0.5772156649 - - -def c_factor(int n): - """ - Average path length of unsuccesful search in a binary - search tree given n points, using `ln` in C - - Parameters - ---------- - n : int - Number of data points for the binary search tree - (BST) - Returns - ------- - float - Average path length of unsuccesful search in a BST - """ - return 2.0*(ln(n-1)+Euler_const) - (2.0*(n-1.)/(n*1.0)) - - -def projectA(int dim, int d, double Lambda=0.05): - """ - Create a sparse matrix `A` with shape(dim, d) - Lambda = non-zero element (-1 and 1) - Note: non-zero element must be at least one - - Parameters - ---------- - dim : int - Number of data raw features - d : int - Number of data projected features - Returns - ------- - A : ndarray of shape (dim, d) - projected matrix - """ - cdef int i, i_new - cdef non_zeros = 0 - cdef int A_size = dim * d - cdef double p # random probability - cdef double Lambda_2 = Lambda*0.5 - cdef A = np.zeros((A_size,), dtype=float) - - for i in range(A_size): - p = uniform() - if p < Lambda_2: - A[i] = 1 - non_zeros += 1 - elif (p >= Lambda_2) and (p < Lambda): - A[i] = -1 - non_zeros += 1 - if non_zeros == 0: # there is no non-zero - i_new = int(rand()/RAND_MAX*A_size) - if (rand()/RAND_MAX) < 0.5: - A[i_new] = 1 - else: - A[i_new] = -1 - return A.reshape(dim, d) - - -cdef double uniform(): - """ - random number uniformly from [0,1] - """ - cdef double x1 = rand() - return x1/RAND_MAX - - -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. -def FastBIC(np.ndarray[FTYPE_t, ndim=1] X_1d): - """ - USPORF algorithm 3. Calculate the splitPoint and - min Bayesian information criterion (BIC) score - from 1D float array - - Parameters - ---------- - X_1d : ndarray of shape (N,) - projected matrix - - Returns - ------- - split_Point : float - splitting position - minBIC : float - minimum BIC score - """ - cdef int N = X_1d.shape[0] - cdef np.ndarray[FTYPE_t, ndim=1] X_sorted = sort(X_1d) - cdef np.ndarray[FTYPE_t, ndim=1] X_sorted_1 - cdef np.ndarray[FTYPE_t, ndim=1] X_sorted_2 - cdef FTYPE_t split_Point = X_sorted[0] - cdef double minBIC = INFINITY - cdef double BIC_diff_var, BIC_same_var, var_comb - - cdef Py_ssize_t s - cdef int n_1, n_2 - cdef double pi_0 = 1.0 - cdef double pi_1, pi_2 - - # var = sum(X-mu)^2/N = X_sumsq/N - (X_sum/N)^2 - cdef double X_sum = np.sum(X_sorted) - cdef double X_sumsq = np.sum(X_sorted**2) - cdef double X_sum_1 = 0 - cdef double X_sumsq_1 = 0 - cdef double X_sum_2 = X_sum - cdef double X_sumsq_2 = X_sumsq - cdef double i_1 = 1 - cdef double i_2 = N-1 - cdef double var_1 = 0.0 - cdef double var_2 = 0.0 - - pi_0 = pi_0/N - for s in range(0, N-1): - n_1 = s - n_2 = N-s - - X_sum_1 += X_sorted[s] - X_sumsq_1 += X_sorted[s]**2 - X_sum_2 -= X_sorted[s] - X_sumsq_2 -= X_sorted[s]**2 - - var_1 = (X_sumsq_1)/(n_1+1) - (X_sum_1/(n_1+1))**2 - if (var_1 == 0): - continue - var_2 = (X_sumsq_2)/(n_2-1) - (X_sum_2/(n_2-1))**2 - if (var_2 == 0): - continue - - pi_1 = s*pi_0 - pi_2 = 1-pi_1 - var_comb = (pi_1*var_1 + pi_2*var_2) - BIC_diff_var = -2*(n_1 * ln(pi_1) - - n_1/2 * ln(2*M_PI*var_1) - + n_2 * ln(pi_2) - - n_2/2 * ln(2*M_PI*var_2)) - BIC_same_var = -2*(n_1 * ln(pi_1) - - n_1/2 * ln(2*M_PI*var_comb) - + n_2 * ln(pi_2) - - n_2/2 * ln(2*M_PI*var_comb)) - - if BIC_diff_var > BIC_same_var: - BIC_diff_var = BIC_same_var - - if BIC_diff_var < minBIC: - minBIC = BIC_diff_var - split_Point = (X_sorted[s-1] + X_sorted[s])/2 - return(split_Point, minBIC) - - -cdef np.ndarray[FTYPE_t, ndim=1] sort(np.ndarray[FTYPE_t, ndim=1] a): - """ - Parameters - ---------- - a : ndarray of shape (N,) - projected matrix - - Returns - ------- - ndarray of shape (N,) - projected matrix - """ - qsort( a.data, 0, a.shape[0]) - return a - -cdef void qsort(FTYPE_t * a, Py_ssize_t start, Py_ssize_t end): - if (end - start) < CUTOFF: - insertion_sort(a, start, end) - return - cdef Py_ssize_t boundary = partition(a, start, end) - qsort(a, start, boundary) - qsort(a, boundary+1, end) - -cdef Py_ssize_t partition(FTYPE_t * a, Py_ssize_t start, Py_ssize_t end): - assert end > start - cdef Py_ssize_t i = start, j = end-1 - cdef FTYPE_t pivot = a[j] - while True: - # assert all(x < pivot for x in a[start:i]) - # assert all(x >= pivot for x in a[j:end]) - - while a[i] < pivot: - i += 1 - while i < j and pivot <= a[j]: - j -= 1 - if i >= j: - break - assert a[j] < pivot <= a[i] - swap(a, i, j) - assert a[i] < pivot <= a[j] - assert i >= j and i < end - swap(a, i, end-1) - assert a[i] == pivot - # assert all(x < pivot for x in a[start:i]) - # assert all(x >= pivot for x in a[i:end]) - return i - -cdef inline void swap(FTYPE_t * a, Py_ssize_t i, Py_ssize_t j): - a[i], a[j] = a[j], a[i] - -cdef void insertion_sort(FTYPE_t * a, Py_ssize_t start, Py_ssize_t end): - cdef Py_ssize_t i, j - cdef FTYPE_t v - for i in range(start, end): - # invariant: [start:i) is sorted - v = a[i] - j = i-1 - while j >= start: - if a[j] <= v: - break - a[j+1] = a[j] - j -= 1 - a[j+1] = v diff --git a/Python/rerf/pycy_usporf.py b/Python/rerf/pycy_usporf.py index 9d5cba6f..e107f94b 100644 --- a/Python/rerf/pycy_usporf.py +++ b/Python/rerf/pycy_usporf.py @@ -1,19 +1,17 @@ -""" -Usporf structure in python -Goal: apply USPORF's fast-BIC split, project-A matrix - apply EIF path length -""" import numpy as np import random as rn import cy_usporf # Utilities functions - class UForest(object): - """ + """ UForest: USPORF forest Creates an iForest object. This object holds the data as well as the trained trees (iTree objects). + + Note: download `cy_usporf.pyx` and `setup.py` into your local + computer and compile them to generate and generate `.C` and `.so` + before using this module - Attributes + Parameters ---------- n_samples: int Size of the dataset. @@ -25,6 +23,14 @@ class UForest(object): Maximum depth a tree can have. d: int number of features in limit sparse matrix `A` + + References + ---------- + .. [#Meghana] Meghana et al (2019). + https://arxiv.org/abs/1907.02844 + + .. [#Fei] Fei Tony et al. (2009). + https://arxiv.org/abs/1506.03410 Methods ------- @@ -61,18 +67,20 @@ def fit(self, X, y=None): return self def compute_paths(self, X_in=None): - """ - compute_paths(X_in = None) + """ Compute paths(X_in = None) + Compute anomaly scores for all data points in a dataset X_in Parameters ---------- X_in : list of list of floats Data to be scored. iForest.Trees are used for computing the depth reached in each tree by each data point. + Returns ------- float - Anomaly score for a given data point. + Anomaly score for a given data point. The higher score, + the more anomaly. """ if X_in is None: X_in = self.X_ @@ -88,10 +96,11 @@ def compute_paths(self, X_in=None): class Node(object): - """ + """Elements stored in tree node A single node from each tree (each iTree object). Nodes containe information on hyperplanes used for data division, date to be passed to left and right nodes, whether they are external or internal nodes. + Attributes ---------- e: int @@ -126,8 +135,9 @@ def __init__(self, X, d, Aj, splitPoint, e, left, right, node_type=''): class iTree(object): # USPORF's algo 1 - """ + """iTree: a single USPORF tree A single tree in the forest that is build using a unique subsample. + Attributes ---------- X: list @@ -160,9 +170,9 @@ def __init__(self, X, e, max_depth, d, Lambda): self.root = self.make_tree(X, e) # Create a tree with root node. def make_tree(self, X, e): - """ - make_tree(X, e, l) + """make_tree(X, e, l) Builds the tree recursively from a given node. Returns a Node object. + Parameters ---------- X: list of list of floats @@ -203,10 +213,11 @@ def make_tree(self, X, e): class PathFactor(object): - """ + """ Path length finder Given a single tree (iTree objext) and a data point x = [x1,x2,...,xn], compute the legth of the path traversed by the point on the tree when - it reaches an external node. + it reaches an external node. See Fei Tony et al. (2009) [#Fei]_ for + further details Attributes ---------- @@ -232,13 +243,13 @@ def __init__(self, x, itree): self.path = self.find_path(itree.root) def find_path(self, T): - """ - find_path(T) + """find_path(T) Given a tree, find the path for a single data point based on the splitting criteria stored at each node. Parameters ---------- T : iTree object (iTree.root object) + Returns ------- int diff --git a/Python/rerf/setup.py b/Python/rerf/setup.py deleted file mode 100644 index 9b2119c2..00000000 --- a/Python/rerf/setup.py +++ /dev/null @@ -1,7 +0,0 @@ -from setuptools import setup -from Cython.Build import cythonize - -setup(ext_modules = cythonize("cy_usporf.pyx")) - -# For local Mac and Linux -# in terminal$ python setup.py build_ext --inplace diff --git a/docs/demos/URerF/3d_usporf_outlier_detection.ipynb b/docs/demos/URerF/3d_usporf_outlier_detection.ipynb index b8ba319b..7ea917ed 100644 --- a/docs/demos/URerF/3d_usporf_outlier_detection.ipynb +++ b/docs/demos/URerF/3d_usporf_outlier_detection.ipynb @@ -12,6 +12,9 @@ "- Open `SPORF/Python/rerf` and download the following\n", "files into your local computer\n", " - `pycy_usporf.py`\n", + "\n", + "- Open `SPORF/Python/src` and download the following\n", + "files into your local computer\n", " - `cy_usporf.pyx` \n", " - `setup.py`\n", "- Open terminal, `cd` to the files location and run\n",