Graph Read Benchmarks #257

Merged
merged 9 commits on Jul 15, 2024
Neo4j working & started benchmarking SBT task
DavidBakerEffendi committed Jul 15, 2024

commit 2be5c74f870491ea4240b7b6cae0eaad70d116f5
41 changes: 39 additions & 2 deletions build.sbt
@@ -45,8 +45,8 @@ libraryDependencies ++= Seq(
"org.openjdk.jmh" % "jmh-generator-reflection" % Versions.jmh,
"org.openjdk.jmh" % "jmh-generator-asm" % Versions.jmh,
"org.slf4j" % "slf4j-api" % Versions.slf4j,
"org.apache.logging.log4j" % "log4j-core" % Versions.log4j % Test,
"org.apache.logging.log4j" % "log4j-slf4j-impl" % Versions.log4j % Test,
"org.apache.logging.log4j" % "log4j-core" % Versions.log4j % Test,
"org.apache.logging.log4j" % "log4j-slf4j-impl" % Versions.log4j % Test,
"org.scalatest" %% "scalatest" % Versions.scalatest % Test
)

@@ -64,3 +64,40 @@ developers := List(
Global / onChangedBuildSource := ReloadOnSourceChanges

publishMavenStyle := true

// Benchmark Tasks

lazy val datasetDir = taskKey[File]("Dataset directory")
datasetDir := baseDirectory.value / "workspace" / "defects4"
lazy val driversToBenchmark = taskKey[Seq[String]]("Drivers to benchmark")
driversToBenchmark := Seq("overflowdb")

lazy val defect4jDataset = taskKey[Seq[(String, String)]]("JARs for projects used in `defects4j`")
defect4jDataset :=
  Seq("Chart" -> "https://repo1.maven.org/maven2/org/jfree/jfreechart/1.5.5/jfreechart-1.5.5.jar")

lazy val benchmarkDownloadTask = taskKey[Unit]("Download `defects4j` candidates for benchmarking")
benchmarkDownloadTask := {
  defect4jDataset.value.foreach { case (name, url) =>
    DownloadHelper.ensureIsAvailable(url, datasetDir.value / s"$name.jar")
  }
}

lazy val benchmarkTask = taskKey[Unit]("Run JMH benchmarks against drivers")
benchmarkTask := {

  def benchmarkArgs(driver: String, project: String): String = {
    val projectDir  = (datasetDir.value / project).getAbsolutePath
    val resultsPath = baseDirectory.value / "results" / s"results-$driver-$project"
    val outputPath  = baseDirectory.value / "results" / s"output-$driver-$project"
    // `toTask` parses its argument as space-delimited program input, so the string must start with a space.
    s" com.github.plume.oss.Benchmark $driver $projectDir -o ${outputPath.getAbsolutePath} -r ${resultsPath.getAbsolutePath}"
  }

  driversToBenchmark.value.foreach { driver =>
    // Dataset entries are (name, url) pairs; the name is the project identifier under datasetDir.
    defect4jDataset.value.foreach { case (project, _) =>
      println(s"[INFO] Benchmarking $driver on $project")
      // This only constructs the `Jmh / runMain` task value; it is not evaluated here
      // (see the dynamic-task sketch below for one way to actually run it).
      (Jmh / runMain).toTask(benchmarkArgs(driver, project))
    }
  }

}
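
As a minimal sketch of how one of the `Jmh / runMain` values built above could actually be evaluated, the following dynamic task runs a single hard-coded driver/project pair. It is an assumption, not part of this commit; the task key name is hypothetical and the paths simply mirror benchmarkArgs.

lazy val benchmarkChartOverflowDb = taskKey[Unit]("Run the OverflowDB read benchmark on the Chart project")
benchmarkChartOverflowDb := Def.taskDyn {
  // Build the same argument string as benchmarkArgs above and return the
  // `Jmh / runMain` task so that sbt evaluates it as part of this task.
  val projectDir  = (datasetDir.value / "Chart").getAbsolutePath
  val outputPath  = (baseDirectory.value / "results" / "output-overflowdb-Chart").getAbsolutePath
  val resultsPath = (baseDirectory.value / "results" / "results-overflowdb-Chart").getAbsolutePath
  (Jmh / runMain).toTask(s" com.github.plume.oss.Benchmark overflowdb $projectDir -o $outputPath -r $resultsPath")
}.value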
48 changes: 48 additions & 0 deletions project/DownloadHelper.scala
@@ -0,0 +1,48 @@
import java.io.File
import java.net.URI
import java.nio.file.{Files, Path, Paths}

object DownloadHelper {
  val LocalStorageDir = Paths.get(".local/source-urls")

  /** Downloads the remote file from the given url if either
    *   - the localFile is not available,
    *   - or the url is different from the previously downloaded file,
    *   - or we don't have the original url from the previously downloaded file.
    * We store the information about the previously downloaded urls and the localFile in `.local`.
    */
  def ensureIsAvailable(url: String, localFile: File): Unit = {
    if (!localFile.exists() || Option(url) != previousUrlForLocalFile(localFile)) {
      val localPath = localFile.toPath
      Files.deleteIfExists(localPath)

      println(s"[INFO] downloading $url to $localFile")
      sbt.io.Using.urlInputStream(new URI(url).toURL) { inputStream =>
        sbt.IO.transfer(inputStream, localFile)
      }

      // persist url in local storage
      val storageFile = storageInfoFileFor(localFile)
      Files.createDirectories(storageFile.getParent)
      Files.writeString(storageFile, url)
    }
  }

  private def relativePathToProjectRoot(path: Path): String =
    Paths
      .get("")
      .toAbsolutePath
      .normalize()
      .relativize(path.toAbsolutePath)
      .toString

  private def previousUrlForLocalFile(localFile: File): Option[String] = {
    Option(storageInfoFileFor(localFile))
      .filter(Files.exists(_))
      .map(Files.readString)
      .filter(_.nonEmpty)
  }

  private def storageInfoFileFor(localFile: File): Path =
    LocalStorageDir.resolve(relativePathToProjectRoot(localFile.toPath))
}
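
A short usage sketch for the helper above. It is illustrative only: DownloadHelper lives under project/, so it runs on the build classpath where sbt.IO is available and is normally invoked from the build, as benchmarkDownloadTask does; the wrapper object and paths here are hypothetical.

object DownloadHelperUsage {
  def main(args: Array[String]): Unit = {
    val url = "https://repo1.maven.org/maven2/org/jfree/jfreechart/1.5.5/jfreechart-1.5.5.jar"
    val jar = new java.io.File("workspace/defects4/Chart.jar")
    // First call: no local file and no recorded URL, so the JAR is downloaded and the
    // URL is persisted under .local/source-urls/<relative path of the JAR>.
    DownloadHelper.ensureIsAvailable(url, jar)
    // Second call: the file exists and the recorded URL matches, so nothing is downloaded.
    DownloadHelper.ensureIsAvailable(url, jar)
  }
}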
9 changes: 7 additions & 2 deletions src/main/scala/com/github/plume/oss/Benchmark.scala
@@ -2,7 +2,12 @@ package com.github.plume.oss

import better.files.File
import com.github.plume.oss.Benchmark.BenchmarkType.*
import com.github.plume.oss.benchmarking.{OverflowDbReadBenchmark, TinkerGraphReadBenchmark}
import com.github.plume.oss.benchmarking.{
GraphWriteBenchmark,
Neo4jEmbedReadBenchmark,
OverflowDbReadBenchmark,
TinkerGraphReadBenchmark
}
import com.github.plume.oss.drivers.{IDriver, TinkerGraphDriver}
import org.cache2k.benchmark.jmh.ForcedGcMemoryProfiler
import org.openjdk.jmh.annotations.Mode
@@ -44,7 +49,7 @@ object Benchmark {
case _: Neo4jEmbeddedConfig =>
Option(
createOptionsBoilerPlate(config, READ)
.include(classOf[OverflowDbReadBenchmark].getSimpleName)
.include(classOf[Neo4jEmbedReadBenchmark].getSimpleName)
.build()
)
case x =>
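
For reference, a minimal sketch (assumed, not taken from this PR) of how such an Options value drives the standard JMH runner. `.include` takes a regular expression matched against fully-qualified benchmark names, so the simple class name selects every benchmark method declared in Neo4jEmbedReadBenchmark; the entry-point name is illustrative.

import org.openjdk.jmh.runner.Runner
import org.openjdk.jmh.runner.options.OptionsBuilder

@main def runNeo4jReadBenchmarks(): Unit = {
  val opts = new OptionsBuilder()
    .include(classOf[com.github.plume.oss.benchmarking.Neo4jEmbedReadBenchmark].getSimpleName)
    .build()
  new Runner(opts).run()
}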
src/main/scala/com/github/plume/oss/benchmarking/GraphReadBenchmark.scala
@@ -80,14 +80,8 @@ trait GraphReadBenchmark {
@Benchmark
def unindexedMethodFullName(bh: Blackhole): Unit

@Setup(Level.Iteration)
def clearDriver(params: BenchmarkParams): Unit = {
driver.clear()
}

@TearDown
def cleanupBenchmark(): Unit = {
driver.clear()
driver.close()
}

src/main/scala/com/github/plume/oss/benchmarking/Neo4jEmbedReadBenchmark.scala
@@ -1,17 +1,15 @@
package com.github.plume.oss.benchmarking

import com.github.plume.oss.benchmarking.GraphReadBenchmark
import com.github.plume.oss.drivers.{Neo4jEmbeddedDriver, TinkerGraphDriver}
import com.github.plume.oss.drivers.Neo4jEmbeddedDriver
import io.shiftleft.codepropertygraph.generated.EdgeTypes.AST
import io.shiftleft.codepropertygraph.generated.NodeTypes.{CALL, METHOD}
import io.shiftleft.codepropertygraph.generated.PropertyNames.{FULL_NAME, ORDER}
import org.apache.tinkerpop.gremlin.process.traversal.P
import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.{GraphTraversalSource, __}
import org.neo4j.graphdb.{GraphDatabaseService, Label}
import org.neo4j.graphdb.GraphDatabaseService
import org.openjdk.jmh.annotations.{Benchmark, Scope, Setup, State}
import org.openjdk.jmh.infra.{BenchmarkParams, Blackhole}
import overflowdb.traversal.*

import java.util
import scala.compiletime.uninitialized
import scala.jdk.CollectionConverters.*
import scala.util.{Random, Using}
@@ -33,9 +31,9 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
tx.execute(s"""
|MATCH (n)-[$AST]->()
|WHERE NOT (n)<-[$AST]-()
|RETURN n.id
|RETURN n.id AS ID
|""".stripMargin)
.map { result => result.get("n.id").asInstanceOf[Long] }
.map { result => result.get("ID").asInstanceOf[Long] }
.toArray
}
}
@@ -44,9 +42,9 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
Using.resource(g.beginTx) { tx =>
tx.execute(s"""
|MATCH (n)-[$AST]->()
|RETURN n.id
|RETURN n.id AS ID
|""".stripMargin)
.map { result => result.get("n.id").asInstanceOf[Long] }
.map { result => result.get("ID").asInstanceOf[Long] }
.toArray
}
}
@@ -56,9 +54,9 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
tx.execute(s"""
|MATCH (n)
|WHERE n.$ORDER IS NOT NULL
|RETURN n.id
|RETURN n.id AS ID
|""".stripMargin)
.map { result => result.get("n.id").asInstanceOf[Long] }
.map { result => result.get("ID").asInstanceOf[Long] }
.toArray
}
}
@@ -69,11 +67,10 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
.execute(s"""
|MATCH (n: $CALL)
|WHERE n.$ORDER IS NOT NULL
|RETURN n.id
|RETURN n.id AS ID
|""".stripMargin)
.map { result => result.get("n.id").asInstanceOf[Long] }
.map { result => result.get("ID").asInstanceOf[Long] }
.toList
println(res)
res.toArray
}
}
@@ -83,9 +80,9 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
tx.execute(s"""
|MATCH (n: $METHOD)
|WHERE n.$FULL_NAME IS NOT NULL
|RETURN n.$FULL_NAME
|RETURN n.$FULL_NAME as $FULL_NAME
|""".stripMargin)
.map { result => result.get(s"n.$FULL_NAME").asInstanceOf[String] }
.map { result => result.get(FULL_NAME).asInstanceOf[String] }
.toArray
}
fullNames = new Random(1234).shuffle(fullNames_).toArray
@@ -99,12 +96,16 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
var nnodes = nodeStart.length
while (stack.nonEmpty) {
val childrenIds = Using.resource(g.beginTx) { tx =>
tx.execute(s"""
tx.execute(
s"""
|MATCH (n)-[AST]->(m)
|WHERE n.id = ${stack.removeLast()}
|RETURN m.id
|""".stripMargin)
.map { result => result.get("m.id").asInstanceOf[Long] }
|WHERE n.id = $$nodeId
|RETURN m.id AS ID
|""".stripMargin,
new util.HashMap[String, Object](1) {
put("nodeId", stack.removeLast().asInstanceOf[Object])
}
).map { result => result.get("ID").asInstanceOf[Long] }
.toArray
}
stack.appendAll(childrenIds)
@@ -120,12 +121,16 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
for (node <- nodeStart) {
var nodeId = node
def getResult = Using.resource(g.beginTx) { tx =>
tx.execute(s"""
tx.execute(
s"""
|MATCH (n)<-[AST]-(m)
|WHERE n.id = $nodeId
|RETURN m.id
|""".stripMargin)
.map { result => result.get("m.id").asInstanceOf[Long] }
|WHERE n.id = $$nodeId
|RETURN m.id AS ID
|""".stripMargin,
new util.HashMap[String, Object](1) {
put("nodeId", nodeId.asInstanceOf[Object])
}
).map { result => result.get("ID").asInstanceOf[Long] }
.toArray
}
var result = getResult
@@ -145,12 +150,16 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
var sumOrder = 0
for (nodeId <- nodeStart) {
val orderArr = Using.resource(g.beginTx) { tx =>
tx.execute(s"""
tx.execute(
s"""
|MATCH (n)
|WHERE n.id = $nodeId
|RETURN n.$ORDER
|""".stripMargin)
.map { result => result.get(s"n.$ORDER").asInstanceOf[Int] }
|WHERE n.id = $$nodeId
|RETURN n.$ORDER AS $ORDER
|""".stripMargin,
new util.HashMap[String, Object](1) {
put("nodeId", nodeId.asInstanceOf[Object])
}
).map { result => result.get(ORDER).asInstanceOf[Int] }
.toArray
}
sumOrder += orderArr.head
@@ -162,12 +171,16 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
@Benchmark
override def callOrderTrav(blackhole: Blackhole): Int = {
val res = Using.resource(g.beginTx) { tx =>
tx.execute(s"""
tx.execute(
s"""
|MATCH (n: $CALL)
|WHERE n.$ORDER > 2 AND n.id IN [${nodeStart.mkString(",")}]
|RETURN COUNT(n)
|""".stripMargin)
.map(_.get("COUNT(n)").asInstanceOf[Int])
|WHERE n.$ORDER > 2 AND n.id IN $$nodeIds
|RETURN COUNT(n) AS SIZE
|""".stripMargin,
new util.HashMap[String, Object](1) {
put("nodeIds", nodeStart.toList.asJava.asInstanceOf[Object])
}
).map(_.get("SIZE").asInstanceOf[Int])
.next()
}
Option(blackhole).foreach(_.consume(res))
@@ -178,12 +191,16 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
override def callOrderExplicit(blackhole: Blackhole): Int = {
var res = 0
val nodes = Using.resource(g.beginTx) { tx =>
tx.execute(s"""
tx.execute(
s"""
|MATCH (n: $CALL)
|WHERE n.id IN [${nodeStart.mkString(",")}]
|RETURN n.$ORDER
|""".stripMargin)
.map(_.get(s"n.$ORDER").asInstanceOf[Int])
|WHERE n.id IN $$nodeIds
|RETURN n.$ORDER as $ORDER
|""".stripMargin,
new util.HashMap[String, Object](1) {
put("nodeIds", nodeStart.toList.asJava.asInstanceOf[Object])
}
).map(_.get(ORDER).asInstanceOf[Int])
.toArray
}
for (order <- nodes) {
@@ -198,12 +215,16 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
fullNames.foreach { fullName =>
Using
.resource(g.beginTx) { tx =>
tx.execute(s"""
tx.execute(
s"""
|MATCH (n: $METHOD)
|WHERE n.$FULL_NAME = $fullName
|RETURN n
|""".stripMargin)
.map(_.get(s"n"))
|WHERE n.$FULL_NAME = $$fullName
|RETURN n AS NODE
|""".stripMargin,
new util.HashMap[String, Object](1) {
put("fullName", fullName.asInstanceOf[Object])
}
).map(_.get("NODE"))
.toArray
}
.foreach(bh.consume)
@@ -215,12 +236,16 @@ class Neo4jEmbedReadBenchmark extends GraphReadBenchmark {
fullNames.foreach { fullName =>
Using
.resource(g.beginTx) { tx =>
tx.execute(s"""
tx.execute(
s"""
|MATCH (n)
|WHERE n.$FULL_NAME = $fullName and $METHOD IN labels(n)
|RETURN n
|""".stripMargin)
.map(_.get(s"n"))
|WHERE n.$FULL_NAME = $$fullName and $METHOD IN labels(n)
|RETURN n AS NODE
|""".stripMargin,
new util.HashMap[String, Object](1) {
put("fullName", fullName.asInstanceOf[Object])
}
).map(_.get("NODE"))
.toArray
}
.foreach(bh.consume)
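
The same query shape recurs throughout this benchmark: parameters are supplied via a java.util.Map rather than interpolated into the Cypher text, and each returned column is aliased so it can be read back by name. A condensed, self-contained sketch of that pattern follows; the method name is illustrative, while the label and property names match the constants used above.

import org.neo4j.graphdb.GraphDatabaseService
import scala.jdk.CollectionConverters.*
import scala.util.Using

def methodNodesByFullName(g: GraphDatabaseService, fullName: String): Array[AnyRef] =
  Using.resource(g.beginTx) { tx =>
    // Pass the lookup value as a query parameter instead of splicing it into the string.
    val params = new java.util.HashMap[String, Object](1)
    params.put("fullName", fullName)
    tx.execute(
      """
        |MATCH (n: METHOD)
        |WHERE n.FULL_NAME = $fullName
        |RETURN n AS NODE
        |""".stripMargin,
      params
    ).asScala
      .map(_.get("NODE")) // read the column back by its alias
      .toArray
  }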