Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ruby] ANTLR Profiler Summary #4950

Merged
merged 3 commits into from
Sep 27, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
package io.joern.rubysrc2cpg.parser

import better.files.File
import better.files.File.OpenOptions
import org.antlr.v4.runtime.*
import org.antlr.v4.runtime.atn.{ATN, ATNConfigSet, ProfilingATNSimulator}
import org.antlr.v4.runtime.dfa.DFA
import org.slf4j.LoggerFactory

import java.io.FileWriter
import java.io.File.separator
import java.nio.file.{Files, Path, StandardOpenOption}
import java.util
import java.util.concurrent.atomic.{AtomicInteger, AtomicLong}
import scala.collection.concurrent.TrieMap
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.util.{Try, Using}
import flatgraph.help.Table
import flatgraph.help.Table.AvailableWidthProvider
import io.shiftleft.semanticcpg

/** Summarizes the result of parsing a file.
*/
Expand All @@ -22,20 +31,26 @@ case class ParserResult(program: Try[RubyParser.ProgramContext], errors: List[St
* the file path to the file to be parsed.
* @param withDebugging
* if set, will enable the ANTLR debugger, i.e, printing of the parse tree.
* @param withProfiler
* if set, will enable a profiler and dump logs for each parsed file alongside it.
* @param maybeParserProfiler
* the parser profiler used to capture profiling information.
*/
class AntlrParser(inputDir: File, filename: String, withDebugging: Boolean = false, withProfiler: Boolean = false) {

private val charStream = CharStreams.fromFileName(filename)
private val lexer = new RubyLexer(charStream)
class AntlrParser(
inputDir: File,
filename: String,
withDebugging: Boolean = false,
maybeParserProfiler: Option[ParserProfiler] = None
) {

private val tokenStream = new CommonTokenStream(RubyLexerPostProcessor(lexer))
val parser: RubyParser = new RubyParser(tokenStream)
val parser: RubyParser = {
val charStream = CharStreams.fromFileName(filename)
val lexer = new RubyLexer(charStream)
val tokenStream = new CommonTokenStream(RubyLexerPostProcessor(lexer))
new RubyParser(tokenStream)
}
private var profilerOpt: Option[ProfilingATNSimulator] = None

parser.setTrace(withDebugging)
if (withProfiler) {
if (maybeParserProfiler.isDefined) {
val profiler = new ProfilingATNSimulator(parser)
parser.setInterpreter(profiler)
parser.setProfile(true)
Expand Down Expand Up @@ -102,41 +117,20 @@ class AntlrParser(inputDir: File, filename: String, withDebugging: Boolean = fal
})

val program = Try {
val program = parser.program()

val parseStart = System.nanoTime()
val program = parser.program()
val parseTime = System.nanoTime() - parseStart
maybeParserProfiler.foreach(_.captureParseTime(filename, parseTime))
// If profiling is enabled, read metrics and write accompanying file
profilerOpt.foreach { profiler =>
val logFilename = filename.replaceAll("\\.[^.]+$", "") + ".log"
val atn = parser.getATN
Using.resource(FileWriter(logFilename)) { logFile =>
logFile.write("Profiling information for file: " + filename + "\n\n")

var totalTimeInPrediction = 0L
var totalLookaheadOps = 0L

profiler.getDecisionInfo.foreach { decision =>
val decisionNumber = decision.decision
val ruleIndex = atn.decisionToState.get(decisionNumber).ruleIndex
val ruleName = parser.getRuleNames()(ruleIndex)

logFile.write(s"Decision $decisionNumber ($ruleName):\n")
logFile.write(s" Invocations: ${decision.invocations}\n")
logFile.write(s" Time (ns): ${decision.timeInPrediction}\n")
logFile.write(s" SLL lookahead operations: ${decision.SLL_TotalLook}\n")
logFile.write(s" LL lookahead operations: ${decision.LL_TotalLook}\n")

totalTimeInPrediction += decision.timeInPrediction
totalLookaheadOps += decision.SLL_TotalLook + decision.LL_TotalLook
}
logFile.write(s"Total time in prediction: $totalTimeInPrediction ns\n")
logFile.write(s"Total lookahead operations: $totalLookaheadOps\n")
}
}
profilerOpt.foreach(profiler =>
maybeParserProfiler.foreach(_.captureProfilerLogs(parser, inputDir.pathAsString, filename, profiler))
)

program
}
ParserResult(program, errors.toList, warnings.toList)
}

}

/** A re-usable parser object that clears the ANTLR DFA-cache if it determines that the memory usage is becoming large.
Expand All @@ -156,10 +150,11 @@ class ResourceManagedParser(clearLimit: Double, debug: Boolean = false, profilin
private val runtime = Runtime.getRuntime
private var maybeDecisionToDFA: Option[Array[DFA]] = None
private var maybeAtn: Option[ATN] = None
private val profiler: Option[ParserProfiler] = if profiling then Option(ParserProfiler()) else None

def parse(inputFile: File, filename: String): Try[RubyParser.ProgramContext] = {
val inputDir = if inputFile.isDirectory then inputFile else inputFile.parent
val antlrParser = AntlrParser(inputDir, filename, debug, profiling)
val antlrParser = AntlrParser(inputDir, filename, debug, profiler)
val interp = antlrParser.parser.getInterpreter
// We need to grab a live instance in order to get the static variables as they are protected from static access
maybeDecisionToDFA = Option(interp.decisionToDFA)
Expand Down Expand Up @@ -188,3 +183,124 @@ class ResourceManagedParser(clearLimit: Double, debug: Boolean = false, profilin

override def close(): Unit = clearDFA()
}

class ParserProfiler {

private val logger = LoggerFactory.getLogger(getClass)
private val ruleCost = TrieMap.empty[String, RuleTimeCost]
private val fileCost = TrieMap.empty[String, Long]
private var projectRoot: Option[Path] = None

sys.addShutdownHook {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I wonder if a shutdown hook is really the best option here - feels a bit messy, since we're mixing concerns: the lifetime of the jvm vs. the completion of a scan.
Sure, that's identical in all current use cases, but are we certain that's going to stay like that forever?
I'd feel better if we collect the results somewhere, maybe even pass them properly, and explicitly print them at the end of the analysis.

Copy link
Collaborator Author

@DavidBakerEffendi DavidBakerEffendi Sep 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, my concern was getting metrics on projects that currently don't finish within a couple of minutes, e.g., GitLab. I do, however, understand the concern that shutdown hooks generally aren't for things like this.

Perhaps I could add some flag to detect an early exit, and use the shutdown hook to dump the summary via the hook only if this is the case? Right now the results for each file are written next to each file, so these can be recovered if the scan fails.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand. If a scan doesn't finish within a couple of minutes, how does a shutdown hook help with that?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we, for example, want to find out a summary of the most expensive rules so far (and not wait for hours), and then we use Ctrl+C to prematurely stop joern-parse, we can still get the results.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mpollmeier Just following up on this. If I remove the hook, it's not a big issue, as the conclusion of which rules are expensive are pretty universal across repos. Would that be preferrable?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok, understood. Given that for current use cases the hook is not a problem, we can just leave it there and add a comment saying it's optional and just for debug output and can be removed if need be. Or something like that...

dumpSummary()
}

/** An object to aggregate the performance cost of a rule.
* @param predictionTime
* the total time in prediction (ns).
* @param lookaheads
* total lookahead operations.
*/
private case class RuleTimeCost(predictionTime: Long, lookaheads: Long) {
def +(o: RuleTimeCost): RuleTimeCost =
this.copy(this.predictionTime + o.predictionTime, this.lookaheads + o.lookaheads)
}

def captureParseTime(filename: String, nanoTime: Long): Unit =
fileCost.put(filename, nanoTime)

def captureProfilerLogs(
parser: RubyParser,
inputDir: String,
filename: String,
profiler: ProfilingATNSimulator
): Unit = {
// Set project root
if projectRoot.isEmpty then projectRoot = Option(Path.of(inputDir))

val logFilename = filename.replaceAll("\\.[^.]+$", "") + ".log"
val atn = parser.getATN
Using.resource(FileWriter(logFilename)) { logFile =>
logFile.write("Profiling information for file: " + filename + "\n\n")

var totalTimeInPrediction = 0L
var totalLookaheadOps = 0L

profiler.getDecisionInfo.foreach { decision =>
val decisionNumber = decision.decision
val ruleIndex = atn.decisionToState.get(decisionNumber).ruleIndex
val ruleName = parser.getRuleNames()(ruleIndex)

logFile.write(s"Decision $decisionNumber ($ruleName):\n")
logFile.write(s" Invocations: ${decision.invocations}\n")
logFile.write(s" Time (ns): ${decision.timeInPrediction}\n")
logFile.write(s" SLL lookahead operations: ${decision.SLL_TotalLook}\n")
logFile.write(s" LL lookahead operations: ${decision.LL_TotalLook}\n")

val ruleTimeCost = RuleTimeCost(decision.timeInPrediction, decision.SLL_TotalLook + decision.LL_TotalLook)
totalTimeInPrediction += ruleTimeCost.predictionTime
totalLookaheadOps += ruleTimeCost.lookaheads

ruleCost.updateWith(ruleName) {
case Some(x) => Option(x + ruleTimeCost)
case None => Option(ruleTimeCost)
}
}
logFile.write(s"Total time in prediction: $totalTimeInPrediction ns\n")
logFile.write(s"Total lookahead operations: $totalLookaheadOps\n")
}
}

private def dumpSummary(): Unit = {
projectRoot match {
case Some(root) =>
val conversionFactor = 1 / 1e6
val timeUnit = "ms"
val summaryPath = root.resolve("antlr_summary.log")
implicit val widthProvider: AvailableWidthProvider = semanticcpg.defaultAvailableWidthProvider
val totalParseTime = fileCost.values.sum
val avgParseTime = totalParseTime / fileCost.size.toDouble
val mostExpensiveFileStr = fileCost.toList.sortBy(_._2).reverse.headOption.map { case (name, time) =>
f"Most Expensive File: ${root.relativize(Path.of(name))} (${time * conversionFactor}%.2f $timeUnit)"
}

val columnNames = Seq("Rule Name", s"Prediction Time ($timeUnit)", "Total Lookaheads")
val rulesByTimeTable = Table(
columnNames = columnNames,
rows = ruleCost.toList.sortBy { case (_, timeCost) => timeCost.predictionTime }.reverse.take(10).map {
case (ruleName, timeCost) =>
Seq(ruleName, f"${timeCost.predictionTime * conversionFactor}%.2f", timeCost.lookaheads.toString)
}
)
val rulesByLookaheadTable = Table(
columnNames = columnNames,
rows = ruleCost.toList.sortBy { case (_, timeCost) => timeCost.lookaheads }.reverse.take(10).map {
case (ruleName, timeCost) =>
Seq(ruleName, f"${timeCost.predictionTime * conversionFactor}%.2f", timeCost.lookaheads.toString)
}
)

Files.writeString(
summaryPath,
f"""Summary for project at '${root.getFileName}'
|Total Parsed Files: ${fileCost.size}
|Total Parse Time (CPU): ${totalParseTime * conversionFactor}%.2f ($timeUnit)
|Avg. Parse Time Per File: ${avgParseTime * conversionFactor}%.2f ($timeUnit)
|${mostExpensiveFileStr.getOrElse("")}
|
|Most Expensive Rules By Time in Prediction
|==========================================
|${rulesByTimeTable.render}
|
|Most Expensive Rules By Total SLL & LL Lookaheads
|=================================================
|${rulesByLookaheadTable.render}
|""".stripMargin,
StandardOpenOption.TRUNCATE_EXISTING,
StandardOpenOption.CREATE
)
case None => logger.warn("At least one file must be parsed for profiling information to be dumped")
}
}

}
Loading