Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat graph investigate query time explosion #306

Merged
merged 9 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions deployments/kubehound/graph/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ COPY dsl/kubehound/pom.xml /home/app
RUN mvn -f /home/app/pom.xml clean install

# Now build our janusgraph wrapper container with KubeHound customizations
FROM janusgraph/janusgraph:1.0.0
FROM janusgraph/janusgraph:1.1.0
LABEL org.opencontainers.image.source="https://github.com/DataDog/kubehound/"

# Add our initialization script for the database schema to the startup directory
Expand Down Expand Up @@ -59,9 +59,22 @@ ENV gremlinserver.metrics.slf4jReporter.enabled=false
ENV gremlinserver.metrics.graphiteReporter.enabled=false
ENV gremlinserver.metrics.csvReporter.enabled=false

# Add safety net settings to prevent OOM and other issues
ENV janusgraph.query.force-index=true
Zenithar marked this conversation as resolved.
Show resolved Hide resolved
ENV janusgraph.cluster.max-partitions=512
ENV janusgraph.query.batch=true
ENV janusgraph.query.hard-max-limit=10000
ENV janusgraph.query.smart-limit=true

# Enable cache
ENV janusgraph.cache.db-cache=true
ENV janusgraph.cache.db-cache-clean-wait=20
ENV janusgraph.cache.db-cache-time=180000
Zenithar marked this conversation as resolved.
Show resolved Hide resolved
ENV janusgraph.cache.db-cache-size=0.5

# Performance tweaks based on: https://www.sailpoint.com/blog/souping-up-the-gremlin/
# gremlinPool will default to Runtime.availableProcessors()
ENV gremlinserver.gremlinPool=0
ENV gremlinserver.gremlinPool=8
Zenithar marked this conversation as resolved.
Show resolved Hide resolved
# threadPoolWorker should be 2x VCPU (TODO: can we set dynamically?)
ENV gremlinserver.threadPoolWorker=16

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public default GraphTraversal<S, Path> criticalPaths(int maxHops) {

return repeat(((KubeHoundTraversalDsl) __.outE())
.inV()
.simplePath()).until(
.simplePath().dedup()).until(
__.has("critical", true)
.or()
.loops()
Expand Down Expand Up @@ -126,7 +126,7 @@ public default GraphTraversal<S, Path> criticalPathsFilter(int maxHops, String..
return repeat(((KubeHoundTraversalDsl) __.outE())
.has("class", P.not(P.within(exclusions)))
.inV()
.simplePath()).until(
.simplePath().dedup()).until(
__.has("critical", true)
.or()
.loops()
Expand Down Expand Up @@ -167,7 +167,7 @@ public default <E2 extends Comparable> GraphTraversal<S, E2> minHopsToCritical(i
throw new IllegalArgumentException(String.format("maxHops must be <= %d", PATH_HOPS_MAX));

return repeat(((KubeHoundTraversalDsl) __.out())
.simplePath()).until(
.simplePath().dedup()).until(
__.has("critical", true)
.or()
.loops()
Expand Down Expand Up @@ -202,7 +202,7 @@ public default <K> GraphTraversal<S, Map<K, Long>> criticalPathsFreq(int maxHops
return repeat(
(KubeHoundTraversalDsl) __.outE()
.inV()
.simplePath())
.simplePath().dedup())
.emit()
.until(
__.has("critical", true)
Expand Down
27 changes: 26 additions & 1 deletion deployments/kubehound/graph/kubehound-db-init.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,16 @@ mgmt.buildIndex('byServiceEndpoint', Vertex.class).addKey(serviceEndpoint).build
mgmt.buildIndex('byServiceDns', Vertex.class).addKey(serviceDns).buildCompositeIndex();
mgmt.buildIndex('byExposure', Vertex.class).addKey(exposure).buildCompositeIndex();

// Create composite indices for the properties we want to search on
mgmt.buildIndex('byClusterAndRunIDComposite', Vertex.class).addKey(cluster).addKey(runID).buildCompositeIndex();
mgmt.buildIndex('byClassAndRunIDComposite', Vertex.class).addKey(cls).addKey(runID).buildCompositeIndex();
mgmt.buildIndex('byClassAndClusterComposite', Vertex.class).addKey(cls).addKey(cluster).buildCompositeIndex();
mgmt.buildIndex('byClassAndTypeComposite', Vertex.class).addKey(cls).addKey(type).buildCompositeIndex();
mgmt.buildIndex('byClassAndExposureComposite', Vertex.class).addKey(cls).addKey(exposure).buildCompositeIndex();
mgmt.buildIndex('byTypeAndNameComposite', Vertex.class).addKey(type).addKey(name).buildCompositeIndex();
mgmt.buildIndex('byImageAndRunIDComposite', Vertex.class).addKey(image).addKey(runID).buildCompositeIndex();
mgmt.buildIndex('byAppAndRunIDComposite', Vertex.class).addKey(app).addKey(runID).buildCompositeIndex();
mgmt.buildIndex('byNamespaceAndRunIDComposite', Vertex.class).addKey(namespace).addKey(runID).buildCompositeIndex();
Comment on lines +182 to +191
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, those will definitely be super useful.
Just by curiosity, do you know if it would have a significant impact on insert time? Or does it stay negligeable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For in-memory, I don't see any relevant/noticeable write performance improvements, except for the graph nuking, which is using the index now. It is more visible with concrete datastore and search index backends.


mgmt.commit();

Expand All @@ -194,9 +204,24 @@ ManagementSystem.awaitGraphIndexStatus(graph, 'byName').status(SchemaStatus.ENAB
ManagementSystem.awaitGraphIndexStatus(graph, 'byNamespace').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byType').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byCritical').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byPort').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byPortName').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byServiceEndpoint').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byServiceDns').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byExposure').status(SchemaStatus.ENABLED).call();

ManagementSystem.awaitGraphIndexStatus(graph, 'byClusterAndRunIDComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byClassAndRunIDComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byClassAndClusterComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byClassAndTypeComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byClassAndExposureComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byTypeAndNameComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byImageAndRunIDComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byAppAndRunIDComposite').status(SchemaStatus.ENABLED).call();
ManagementSystem.awaitGraphIndexStatus(graph, 'byNamespaceAndRunIDComposite').status(SchemaStatus.ENABLED).call();

System.out.println("[KUBEHOUND] graph schema and indexes ready");
mgmt.close();

// Close the open connection
:remote close
:remote close
66 changes: 33 additions & 33 deletions deployments/kubehound/ui/LowHangingFruit-ContainerEscape.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,11 @@
" .has(\"runID\", graph.variables().get('runID_yourid').get().trim())\n",
" .where(\n",
" repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .limit(1)\n",
" )\n",
" .dedup().by(\"image\")\n",
Expand Down Expand Up @@ -201,11 +201,11 @@
" .has(\"runID\", graph.variables().get('runID_yourid').get().trim())\n",
" .where(\n",
" repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .limit(1)\n",
" )\n",
" .dedup()\n",
Expand Down Expand Up @@ -313,11 +313,11 @@
" .has(\"runID\", graph.variables().get('runID_yourid').get().trim())\n",
" .has(\"app\",graph.variables().get('containerEscape_vulnApp_yourid').get().trim())\n",
" .repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .path().by(elementMap())\n",
" .limit(1000)"
]
Expand Down Expand Up @@ -360,11 +360,11 @@
" .has(\"runID\", graph.variables().get('runID_yourid').get().trim())\n",
" .has(\"app\",graph.variables().get('containerEscape_vulnApp_yourid').get().trim())\n",
" .repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .path()\n",
" .by(valueMap(\"app\", \"class\",\"critical\").with(WithOptions.tokens,WithOptions.labels))\n",
" .dedup()\n",
Expand Down Expand Up @@ -402,11 +402,11 @@
" .has(\"runID\", graph.variables().get('runID_yourid').get().trim())\n",
" .has(\"app\",graph.variables().get('containerEscape_vulnApp_yourid').get().trim())\n",
" .repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .path().by(label())\n",
" .dedup()\n",
" .limit(1000)"
Expand Down Expand Up @@ -438,11 +438,11 @@
" .or().has(\"namespace\", within(graph.variables().get('containerEscape_whiteListedNamespace_yourid').get()))\n",
" )\n",
" .repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .path().by(elementMap())\n",
" .limit(1000)"
]
Expand All @@ -469,11 +469,11 @@
" .or().has(\"namespace\", within(graph.variables().get('containerEscape_whiteListedNamespace_yourid').get()))\n",
" )\n",
" .repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .path()\n",
" .by(valueMap(\"app\", \"class\",\"critical\").with(WithOptions.tokens,WithOptions.labels))\n",
" .dedup()\n",
Expand Down Expand Up @@ -502,12 +502,12 @@
" .or().has(\"namespace\", within(graph.variables().get('containerEscape_whiteListedNamespace_yourid').get()))\n",
" )\n",
" .repeat(\n",
" outE().inV().simplePath() // Building the path from one vertex to another\n",
" outE().inV().simplePath().dedup() // Building the path from one vertex to another\n",
" ).until(\n",
" has(label, \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(label, \"Node\") // Keep only path ending with a critical asset\n",
" .path().by(label())\n",
" has(\"class\", \"Node\") // Stop when meeting a critical asset\n",
" .or().loops().is(10) // Stop after X iteration\n",
" ).has(\"class\", \"Node\") // Keep only path ending with a critical asset\n",
" .path().by(\"class\")\n",
" .dedup()\n",
" .limit(1000)"
]
Expand Down
99 changes: 56 additions & 43 deletions pkg/kubehound/storage/graphdb/janusgraph_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,50 +67,63 @@ func (jgp *JanusGraphProvider) Prepare(ctx context.Context) error {
return nil
}

g := gremlin.Traversal_().WithRemote(jgp.drc)
tx := g.Tx()
defer tx.Close()

for {
// Begin a new transaction.
gtx, err := tx.Begin()
if err != nil {
return err
}

// Retrieve the number of vertices in the graph.
page, err := gtx.V().Count().Next()
if err != nil {
return err
}

// Decode the number of vertices from the page.
count, err := page.GetInt()
if err != nil {
return err
}

// If there are no more vertices to delete, break the loop.
if count == 0 {
break
}

// Delete the vertices in the graph.
err = <-gtx.V().Limit(deleteBatchSize).Drop().Iterate()
if err != nil {
return err
}

// Commit the transaction.
if err := tx.Commit(); err != nil {
return err
}
// These vertex types are defined in the schema.
Zenithar marked this conversation as resolved.
Show resolved Hide resolved
vertexTypes := []string{
"Container",
"Identity",
"Node",
"Pod",
"PermissionSet",
"Volume",
"Endpoint",
}

// Check context for cancellation.
select {
case <-ctx.Done():
return ctx.Err()
default:
for _, vertexType := range vertexTypes {
g := gremlin.Traversal_().WithRemote(jgp.drc)
tx := g.Tx()
defer tx.Close()

for {
// Begin a new transaction.
gtx, err := tx.Begin()
if err != nil {
return err
}

// Retrieve the number of vertices in the graph.
page, err := gtx.V().Has("class", vertexType).Count().Next()
if err != nil {
return err
}

// Decode the number of vertices from the page.
count, err := page.GetInt()
if err != nil {
return err
}

// If there are no more vertices to delete, break the loop.
if count == 0 {
break
}

// Delete the vertices in the graph.
err = <-gtx.V().Has("class", vertexType).Limit(deleteBatchSize).Drop().Iterate()
if err != nil {
return err
}

// Commit the transaction.
if err := tx.Commit(); err != nil {
return err
}

// Check context for cancellation.
select {
case <-ctx.Done():
return ctx.Err()
default:
}
}
}

Expand Down
Loading