Skip to content

Commit

Permalink
tuning basic visitors (#293)
Browse files Browse the repository at this point in the history
* add `isConverged` method to visitors
* add new benchmarks for shingled points
  • Loading branch information
sudiptoguha authored Feb 2, 2022
1 parent c76e6aa commit 2fedfea
Show file tree
Hide file tree
Showing 18 changed files with 330 additions and 180 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import com.amazon.randomcutforest.returntypes.DensityOutput;
import com.amazon.randomcutforest.returntypes.DiVector;
import com.amazon.randomcutforest.returntypes.Neighbor;
import com.amazon.randomcutforest.testutils.ShingledMultiDimDataWithKeys;
import com.amazon.randomcutforest.testutils.NormalMixtureTestData;
import org.github.jamm.MemoryMeter;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Fork;
Expand All @@ -42,6 +42,7 @@
public class RandomCutForestBenchmark {

public final static int DATA_SIZE = 50_000;
public final static int INITIAL_DATA_SIZE = 25_000;

@State(Scope.Benchmark)
public static class BenchmarkState {
Expand All @@ -66,17 +67,19 @@ public static class BenchmarkState {
@Setup(Level.Trial)
public void setUpData() {
int dimensions = baseDimensions * shingleSize;
int sampleSize = 256;
int dataSize = 100 * sampleSize;
data = ShingledMultiDimDataWithKeys.getMultiDimData(dataSize + shingleSize - 1, 50, 100, 5, 17,
baseDimensions).data;
NormalMixtureTestData gen = new NormalMixtureTestData();
data = gen.generateTestData(INITIAL_DATA_SIZE + DATA_SIZE, dimensions);
}

@Setup(Level.Invocation)
public void setUpForest() {
forest = RandomCutForest.builder().numberOfTrees(numberOfTrees).dimensions(baseDimensions * shingleSize)
.internalShinglingEnabled(true).shingleSize(shingleSize).parallelExecutionEnabled(parallel)
.boundingBoxCacheFraction(boundingBoxCacheFraction).randomSeed(99).build();

for (int i = 0; i < INITIAL_DATA_SIZE; i++) {
forest.update(data[i]);
}
}
}

Expand All @@ -88,7 +91,7 @@ public RandomCutForest updateOnly(BenchmarkState state) {
double[][] data = state.data;
forest = state.forest;

for (int i = 0; i < data.length; i++) {
for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
forest.update(data[i]);
}

Expand All @@ -103,7 +106,7 @@ public RandomCutForest scoreOnly(BenchmarkState state, Blackhole blackhole) {
double score = 0.0;
Random rnd = new Random(0);

for (int i = 0; i < data.length; i++) {
for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
score += forest.getAnomalyScore(data[i]);
if (rnd.nextDouble() < 0.01) {
forest.update(data[i]); // this should execute sparingly
Expand All @@ -121,7 +124,7 @@ public RandomCutForest scoreAndUpdate(BenchmarkState state, Blackhole blackhole)
forest = state.forest;
double score = 0.0;

for (int i = 0; i < data.length; i++) {
for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
score = forest.getAnomalyScore(data[i]);
forest.update(data[i]);
}
Expand All @@ -141,7 +144,7 @@ public RandomCutForest attributionAndUpdate(BenchmarkState state, Blackhole blac
forest = state.forest;
DiVector vector = new DiVector(forest.getDimensions());

for (int i = 0; i < data.length; i++) {
for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
vector = forest.getAnomalyAttribution(data[i]);
forest.update(data[i]);
}
Expand All @@ -157,7 +160,7 @@ public RandomCutForest basicDensityAndUpdate(BenchmarkState state, Blackhole bla
forest = state.forest;
DensityOutput output = new DensityOutput(forest.getDimensions(), forest.getSampleSize());

for (int i = 0; i < data.length; i++) {
for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
output = forest.getSimpleDensity(data[i]);
forest.update(data[i]);
}
Expand All @@ -173,7 +176,7 @@ public RandomCutForest basicNeighborAndUpdate(BenchmarkState state, Blackhole bl
forest = state.forest;
List<Neighbor> output = null;

for (int i = 0; i < data.length; i++) {
for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
output = forest.getNearNeighborsInSample(data[i]);
forest.update(data[i]);
}
Expand All @@ -189,7 +192,7 @@ public RandomCutForest basicExtrapolateAndUpdate(BenchmarkState state, Blackhole
forest = state.forest;
double[] output = null;

for (int i = 0; i < data.length; i++) {
for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
output = forest.extrapolate(1);
forest.update(data[i]);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.randomcutforest;

import com.amazon.randomcutforest.returntypes.DensityOutput;
import com.amazon.randomcutforest.returntypes.DiVector;
import com.amazon.randomcutforest.returntypes.Neighbor;
import com.amazon.randomcutforest.testutils.ShingledMultiDimDataWithKeys;
import org.github.jamm.MemoryMeter;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.OperationsPerInvocation;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.util.List;
import java.util.Random;

@Warmup(iterations = 2)
@Measurement(iterations = 5)
@Fork(value = 1)
@State(Scope.Thread)
public class RandomCutForestShingledBenchmark {

    // number of points scored/updated during a single measured invocation
    public final static int DATA_SIZE = 50_000;
    // number of points used to pre-fill the forest before measurement starts
    public final static int INITIAL_DATA_SIZE = 25_000;

    @State(Scope.Benchmark)
    public static class BenchmarkState {
        @Param({ "5" })
        int baseDimensions;

        @Param({ "8" })
        int shingleSize;

        @Param({ "30" })
        int numberOfTrees;

        @Param({ "1.0", "0.9", "0.8", "0.7", "0.6", "0.5", "0.4", "0.3", "0.2", "0.1", "0.0" })
        double boundingBoxCacheFraction;

        @Param({ "false", "true" })
        boolean parallel;

        double[][] data;
        RandomCutForest forest;

        /**
         * Generates the shingled test data once per trial. The array holds the
         * warm-up prefix (INITIAL_DATA_SIZE points) followed by the measured
         * range (DATA_SIZE points).
         */
        @Setup(Level.Trial)
        public void setUpData() {
            data = ShingledMultiDimDataWithKeys.getMultiDimData(DATA_SIZE + INITIAL_DATA_SIZE, 50, 100, 5, 17,
                    baseDimensions).data;
        }

        /**
         * Builds a fresh forest before every invocation and warms it with the
         * first INITIAL_DATA_SIZE points so each benchmark starts from the
         * same pre-filled state.
         */
        @Setup(Level.Invocation)
        public void setUpForest() {
            forest = RandomCutForest.builder().numberOfTrees(numberOfTrees).dimensions(baseDimensions * shingleSize)
                    .internalShinglingEnabled(true).shingleSize(shingleSize).parallelExecutionEnabled(parallel)
                    .boundingBoxCacheFraction(boundingBoxCacheFraction).randomSeed(99).build();

            // Fix: warm up with the prefix [0, INITIAL_DATA_SIZE) only. The previous
            // loop started at INITIAL_DATA_SIZE and consumed the measured range
            // [INITIAL_DATA_SIZE, data.length), so the benchmarks replayed points the
            // forest had already seen and the warm-up size was 2x the intended value.
            // This matches the sibling RandomCutForestBenchmark in this commit.
            for (int i = 0; i < INITIAL_DATA_SIZE; i++) {
                forest.update(data[i]);
            }
        }
    }

    private RandomCutForest forest;

    /**
     * Measures the cost of updating the forest with DATA_SIZE fresh points.
     */
    @Benchmark
    @OperationsPerInvocation(DATA_SIZE)
    public RandomCutForest updateOnly(BenchmarkState state) {
        double[][] data = state.data;
        forest = state.forest;

        for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
            forest.update(data[i]);
        }

        return forest;
    }

    /**
     * Measures scoring cost; updates are performed only ~1% of the time so the
     * forest does not drift far from its warmed-up state.
     */
    @Benchmark
    @OperationsPerInvocation(DATA_SIZE)
    public RandomCutForest scoreOnly(BenchmarkState state, Blackhole blackhole) {
        double[][] data = state.data;
        forest = state.forest;
        double score = 0.0;
        Random rnd = new Random(0);

        for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
            score += forest.getAnomalyScore(data[i]);
            if (rnd.nextDouble() < 0.01) {
                forest.update(data[i]); // this should execute sparingly
            }
        }

        blackhole.consume(score);
        return forest;
    }

    /**
     * Measures the combined score-then-update loop; also prints the deep memory
     * footprint of the forest for the sequential configuration.
     */
    @Benchmark
    @OperationsPerInvocation(DATA_SIZE)
    public RandomCutForest scoreAndUpdate(BenchmarkState state, Blackhole blackhole) {
        double[][] data = state.data;
        forest = state.forest;
        double score = 0.0;

        for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
            score = forest.getAnomalyScore(data[i]);
            forest.update(data[i]);
        }

        blackhole.consume(score);
        // MemoryMeter instrumentation is only reliable without parallel executors
        if (!forest.parallelExecutionEnabled) {
            MemoryMeter meter = new MemoryMeter();
            System.out.println(" forest size " + meter.measureDeep(forest));
        }
        return forest;
    }

    /**
     * Measures directional attribution followed by an update for each point.
     */
    @Benchmark
    @OperationsPerInvocation(DATA_SIZE)
    public RandomCutForest attributionAndUpdate(BenchmarkState state, Blackhole blackhole) {
        double[][] data = state.data;
        forest = state.forest;
        DiVector vector = new DiVector(forest.getDimensions());

        for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
            vector = forest.getAnomalyAttribution(data[i]);
            forest.update(data[i]);
        }

        blackhole.consume(vector);
        return forest;
    }

    /**
     * Measures simple density estimation followed by an update for each point.
     */
    @Benchmark
    @OperationsPerInvocation(DATA_SIZE)
    public RandomCutForest basicDensityAndUpdate(BenchmarkState state, Blackhole blackhole) {
        double[][] data = state.data;
        forest = state.forest;
        DensityOutput output = new DensityOutput(forest.getDimensions(), forest.getSampleSize());

        for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
            output = forest.getSimpleDensity(data[i]);
            forest.update(data[i]);
        }

        blackhole.consume(output);
        return forest;
    }

    /**
     * Measures near-neighbor queries followed by an update for each point.
     */
    @Benchmark
    @OperationsPerInvocation(DATA_SIZE)
    public RandomCutForest basicNeighborAndUpdate(BenchmarkState state, Blackhole blackhole) {
        double[][] data = state.data;
        forest = state.forest;
        List<Neighbor> output = null;

        for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
            output = forest.getNearNeighborsInSample(data[i]);
            forest.update(data[i]);
        }

        blackhole.consume(output);
        return forest;
    }

    /**
     * Measures one-step extrapolation followed by an update for each point.
     */
    @Benchmark
    @OperationsPerInvocation(DATA_SIZE)
    public RandomCutForest basicExtrapolateAndUpdate(BenchmarkState state, Blackhole blackhole) {
        double[][] data = state.data;
        forest = state.forest;
        double[] output = null;

        for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
            output = forest.extrapolate(1);
            forest.update(data[i]);
        }

        blackhole.consume(output);
        return forest;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -1578,29 +1578,4 @@ public DiVector getApproximateDynamicAttribution(float[] point, double precision
return traverseForest(transformToShingledPoint(point), visitorFactory, accumulator, finisher);
}

public double dynamicScore(float[] point) {
return dynamicScore(point, 0, CommonUtils::defaultScoreSeenFunction, CommonUtils::defaultScoreUnseenFunction,
CommonUtils::defaultDampFunction, CommonUtils::defaultScalarNormalizerFunction);
}

public double dynamicScore(float[] point, int ignoreMass, BiFunction<Double, Double, Double> scoreSeen,
BiFunction<Double, Double, Double> scoreUnseen, BiFunction<Double, Double, Double> damp,
BiFunction<Double, Double, Double> normalizer) {
double result = 0;
float[] changedPoint = transformToShingledPoint(point);

if (parallelExecutionEnabled) {
result = updateExecutor.getComponents().parallelStream()
.map(x -> ((SamplerPlusTree) x).scalarScore(changedPoint, ignoreMass, scoreSeen, scoreUnseen, damp,
normalizer))
.reduce(Double::sum).orElseThrow(() -> new IllegalStateException("trees returned an empty result"));
} else {
result = updateExecutor.getComponents().stream()
.map(x -> ((SamplerPlusTree) x).scalarScore(changedPoint, ignoreMass, scoreSeen, scoreUnseen, damp,
normalizer))
.reduce(Double::sum).orElseThrow(() -> new IllegalStateException("trees returned an empty result"));
}
return result / numberOfTrees;

}
}
12 changes: 12 additions & 0 deletions Java/core/src/main/java/com/amazon/randomcutforest/Visitor.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,16 @@ default void acceptLeaf(INodeView leafNode, final int depthOfNode) {
* @return the result value computed by the visitor.
*/
R getResult();

/**
 * This method short-circuits the evaluation of the Visitor at nodes on the traversal path. By default, the
 * accept (or acceptLeaf) method will be invoked for each Node in the traversal path. But the NodeView has to prepare
 * information to support that visitor invocation. Before invocation, the value of isConverged will be checked.
 * If it is true, some of that preparation can be skipped -- because the visitor would not be updated.
 * This method can be overridden to optimize visitors that do not need to visit every node on the root-to-leaf path
 * before returning a value.
 *
 * @return true if subsequent accept/acceptLeaf invocations can no longer change this visitor's result;
 *         false (the default) otherwise.
 **/
default boolean isConverged() {
return false;
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@

package com.amazon.randomcutforest.anomalydetection;

import java.util.Arrays;

import com.amazon.randomcutforest.CommonUtils;
import com.amazon.randomcutforest.Visitor;
import com.amazon.randomcutforest.returntypes.DiVector;
import com.amazon.randomcutforest.tree.IBoundingBoxView;
import com.amazon.randomcutforest.tree.INodeView;

import java.util.Arrays;

/**
* Attribution exposes the attribution of scores produced by ScalarScoreVisitor
* corresponding to different attributes. It allows a boolean
Expand All @@ -31,11 +31,11 @@
* duplicate points seen by the forest, so that the attribution does not change
if a sequence of duplicate points is seen. For non-duplicate points, if the
* boolean turned on, reduces effects of masking (when anomalous points are
* included in the forest (which will be true with a few samples or when the
* included in the forest -- which will be true with a few samples or when the
* samples are not refreshed appropriately). It is worth remembering that
* disallowing anomalous points from being included in the forest forest
* explicitly will render the algorithm incapable of adjusting to a new normal
* -- which is a strength of this algorithm.
* disallowing anomalous points from being included in the forest explicitly
* will render the algorithm incapable of adjusting to a new normal -- which is
* a strength of this algorithm.
**/
public abstract class AbstractAttributionVisitor implements Visitor<DiVector> {

Expand Down Expand Up @@ -265,4 +265,9 @@ protected void updateRangesForScoring(IBoundingBoxView smallBox, IBoundingBoxVie
}
}
}

/**
 * Short-circuits traversal preparation once {@code pointInsideBox} becomes true.
 * NOTE(review): based on the visible override only — presumably once the query point lies inside a
 * node's bounding box, further nodes on the path cannot change the attribution; confirm against the
 * scoring logic in updateRangesForScoring.
 */
@Override
public boolean isConverged() {
return pointInsideBox;
}
}
Loading

0 comments on commit 2fedfea

Please sign in to comment.