Deduplicate strings in the parser (#137)

This makes ParseBenchmark about 5% slower, but because parsing accounts for only 10% of the total time in the main benchmark, the potential for an overall performance regression is severely limited. In the main benchmark we see an improvement in overall times, both single-threaded and multi-threaded with a shared parse cache.
databricks · Nov 30, 2021 · d18a07a · d18a07a
1 parent 541b00b
commit d18a07a
Showing 1 changed file with 8 additions and 1 deletion.
diff --git a/sjsonnet/src/sjsonnet/Parser.scala b/sjsonnet/src/sjsonnet/Parser.scala
@@ -49,6 +49,8 @@ class Parser(val currentFile: Path) {
 
   private val fileScope = new FileScope(currentFile)
 
+  private val strings = new mutable.HashMap[String, String]
+
   def Pos[_: P]: P[Position] = Index.map(offset => new Position(fileScope, offset))
 
   def id[_: P] = P(
@@ -251,7 +253,12 @@ class Parser(val currentFile: Path) {
     }
   )
 
-  def constructString(pos: Position, lines: Seq[String]) = Val.Str(pos, lines.mkString)
+  def constructString(pos: Position, lines: Seq[String]) = {
+    val joined = lines.mkString
+    // Reuse the instance already cached in `strings` for this content; cache `joined` on a miss.
+    Val.Str(pos, strings.getOrElseUpdate(joined, joined))
+  }
+
   // Any `expr` that isn't naively left-recursive
   def expr2[_: P]: P[Expr] = P(
     Pos.flatMapX{ pos =>