Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better performance and feedback #171

Merged
merged 20 commits into from
Aug 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ dependencies {

// ===== Detekt =====
detektPlugins("io.gitlab.arturbosch.detekt:detekt-formatting:1.17.1")

// ===== Progress bar =====
implementation("me.tongfei:progressbar:0.9.2")
}

val generatedSourcesPath = "src/main/generated"
Expand Down
4 changes: 4 additions & 0 deletions configs/antlr_java_js_ast.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@ label:
# save to disk ASTs in the JSON format
storage:
name: json AST

# number of threads used for parsing; must be a positive integer
# defaults to 1 when omitted
numOfThreads: 4
22 changes: 16 additions & 6 deletions src/main/kotlin/astminer/common/TreeUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@ const val EMPTY_TOKEN = "EMPTY"
* The function was adopted from the original code2vec implementation in order to match their behavior:
* https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
*/

val newLineReg = "\\\\n".toRegex()
val whitespaceReg = "//s+".toRegex()
val quotesApostrophesCommasReg = "[\"',]".toRegex()
val unicodeWeirdCharReg = "\\P{Print}".toRegex()
val notALetterReg = "[^A-Za-z]".toRegex()

fun normalizeToken(token: String, defaultToken: String): String {
val cleanToken = token.lowercase()
.replace("\\\\n".toRegex(), "") // escaped new line
.replace("//s+".toRegex(), "") // whitespaces
.replace("[\"',]".toRegex(), "") // quotes, apostrophes, commas
.replace("\\P{Print}".toRegex(), "") // unicode weird characters
.replace(newLineReg, "") // escaped new line
.replace(whitespaceReg, "") // whitespaces
.replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas
.replace(unicodeWeirdCharReg, "") // unicode weird characters

val stripped = cleanToken.replace("[^A-Za-z]".toRegex(), "")
val stripped = cleanToken.replace(notALetterReg, "")

return stripped.ifEmpty {
val carefulStripped = cleanToken.replace(" ", "_")
Expand All @@ -27,9 +34,12 @@ fun normalizeToken(token: String, defaultToken: String): String {
* The function was adopted from the original code2vec implementation in order to match their behavior:
* https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
*/

val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex()

fun splitToSubtokens(token: String) = token
.trim()
.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex())
.split(splitRegex)
.map { s -> normalizeToken(s, "") }
.filter { it.isNotEmpty() }
.toList()
9 changes: 5 additions & 4 deletions src/main/kotlin/astminer/common/model/ParsingModel.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@ import java.io.File
import java.io.InputStream
import java.util.*

abstract class Node {
abstract class Node(val originalToken: String?) {
abstract val typeLabel: String
abstract val children: List<Node>
abstract val parent: Node?
abstract val originalToken: String?

val normalizedToken: String by lazy {
val normalizedToken: String =
originalToken?.let {
val subtokens = splitToSubtokens(it)
if (subtokens.isEmpty()) EMPTY_TOKEN else subtokens.joinToString(TOKEN_DELIMITER)
} ?: EMPTY_TOKEN
}

var technicalToken: String? = null

val token: String
Expand All @@ -42,13 +41,15 @@ abstract class Node {
resultList.add(this)
children.forEach { it.doTraversePreOrder(resultList) }
}

fun preOrderIterator(): Iterator<Node> = preOrder().listIterator()
open fun preOrder(): List<Node> = mutableListOf<Node>().also { doTraversePreOrder(it) }

private fun doTraversePostOrder(resultList: MutableList<Node>) {
children.forEach { it.doTraversePostOrder(resultList) }
resultList.add(this)
}

fun postOrderIterator(): Iterator<Node> = postOrder().listIterator()
open fun postOrder(): List<Node> = mutableListOf<Node>().also { doTraversePostOrder(it) }

Expand Down
37 changes: 33 additions & 4 deletions src/main/kotlin/astminer/common/model/ParsingResultModel.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,47 @@ package astminer.common.model
import astminer.parse.ParsingException
import mu.KotlinLogging
import java.io.File
import kotlin.concurrent.thread
import kotlin.math.ceil

private val logger = KotlinLogging.logger("HandlerFactory")

interface ParsingResultFactory {
fun parse(file: File): ParsingResult<out Node>

fun <T> parseFiles(files: List<File>, action: (ParsingResult<out Node>) -> T) =
fun <T> parseFiles(
files: List<File>,
action: (ParsingResult<out Node>) -> T
): List<T?> {
val results = mutableListOf<T?>()
files.map { file ->
try {
action(parse(file))
results.add(action(parse(file)))
} catch (parsingException: ParsingException) {
logger.error(parsingException) { "Failed to parse file ${file.path}" }
null
results.add(null)
}
}
return results
}

/**
 * Parses [files] concurrently: the list is split into contiguous chunks and every chunk
 * is handed to its own thread, which processes it with [parseFiles].
 *
 * Chunk results are appended to the returned list as each thread finishes, so the
 * overall order of the results is not guaranteed to follow the order of [files].
 *
 * @param files files to be parsed; an empty list yields an empty result without starting threads
 * @param numOfThreads upper bound on the number of worker threads (used to derive the chunk size)
 * @param action action to apply to every parsing result
 * @return the values collected from [parseFiles] for every chunk
 */
fun <T> parseFilesInThreads(
    files: List<File>,
    numOfThreads: Int,
    action: (ParsingResult<out Node>) -> T
): List<T?> {
    // With no files the chunk size below would be 0, which `chunked` rejects with an exception.
    if (files.isEmpty()) return emptyList()

    val results = mutableListOf<T?>()
    val chunkSize = ceil(files.size.toDouble() / numOfThreads).toInt()

    val threads = files.chunked(chunkSize).map { chunk ->
        thread {
            val chunkResults = parseFiles(chunk, action)
            // The lock has to be taken by the worker at the point of mutation:
            // `results` is a plain mutable list shared by all worker threads.
            synchronized(results) { results.addAll(chunkResults) }
        }
    }
    // Wait for every worker before handing the list back to the caller.
    threads.forEach { it.join() }
    return results
}
}

interface PreprocessingParsingResultFactory : ParsingResultFactory {
Expand All @@ -28,7 +54,10 @@ interface PreprocessingParsingResultFactory : ParsingResultFactory {
* @param files list of files to be parsed with preprocessing
* @param action action to do with parsed files (e.g. save on the disk)
*/
override fun <T> parseFiles(files: List<File>, action: (ParsingResult<out Node>) -> T) =
override fun <T> parseFiles(
files: List<File>,
action: (ParsingResult<out Node>) -> T
) =
files.map { file ->
try {
val preprocessedFile = preprocess(file)
Expand Down
12 changes: 10 additions & 2 deletions src/main/kotlin/astminer/config/PipelineConfig.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package astminer.config

import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable
import kotlinx.serialization.SerializationException

/**
* Config which defines the pipeline
Expand All @@ -14,5 +15,12 @@ data class PipelineConfig(
val parser: ParserConfig,
val filters: List<FilterConfig> = emptyList(),
@SerialName("label") val labelExtractor: LabelExtractorConfig,
val storage: StorageConfig
)
val storage: StorageConfig,
val numOfThreads: Int = 1
) {
init {
if (numOfThreads <= 0) {
throw SerializationException("Number of threads must be a positive integer")
}
}
}
4 changes: 2 additions & 2 deletions src/main/kotlin/astminer/parse/antlr/AntlrNode.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import astminer.common.model.Node
class AntlrNode(
override val typeLabel: String,
override var parent: AntlrNode?,
override val originalToken: String?
) : Node() {
originalToken: String?
) : Node(originalToken) {

override val children: MutableList<AntlrNode> = mutableListOf()

Expand Down
1 change: 0 additions & 1 deletion src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ private fun convertRuleContext(
}
}
currentNode.replaceChildren(children)

return currentNode
}

Expand Down
4 changes: 2 additions & 2 deletions src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ import com.google.common.collect.TreeMultiset
*/
class FuzzyNode(
override val typeLabel: String,
override val originalToken: String?,
originalToken: String?,
order: Int?
) : Node() {
) : Node(originalToken) {
private val order = order ?: -1
override var parent: Node? = null
private val childrenMultiset = TreeMultiset.create<FuzzyNode>(
Expand Down
4 changes: 2 additions & 2 deletions src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ import astminer.common.model.Node
import com.github.gumtreediff.tree.ITree
import com.github.gumtreediff.tree.TreeContext

class GumTreeNode(val wrappedNode: ITree, val context: TreeContext, override var parent: GumTreeNode?) : Node() {
class GumTreeNode(val wrappedNode: ITree, val context: TreeContext, override var parent: GumTreeNode?) :
Node(wrappedNode.label) {
override val typeLabel: String
get() = context.getTypeLabel(wrappedNode)

override val children: MutableList<GumTreeNode> by lazy {
wrappedNode.children.map { GumTreeNode(it, context, this) }.toMutableList()
}
override val originalToken: String = wrappedNode.label

override fun removeChildrenOfType(typeLabel: String) {
children.removeIf { it.typeLabel == typeLabel }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,16 @@ class GumTreeJavaFunctionInfo(
override val returnType: String? = root.getElementType()
override val enclosingElement: EnclosingElement<GumTreeNode>? = collectEnclosingClass()

override val modifiers: List<String> = root.children.filter { it.typeLabel == "Modifier" }.map { it.originalToken }
override val modifiers: List<String> = root
.children
.filter { it.typeLabel == "Modifier" }
.mapNotNull { it.originalToken }

override val annotations: List<String> = root
.children
.filter { it.typeLabel == "MarkerAnnotation" }
.map { it.children.first().originalToken }
.mapNotNull { it.children.first().originalToken }

override val isConstructor: Boolean = root.typeLabel == "Initializer"

private fun collectEnclosingClass(): EnclosingElement<GumTreeNode>? {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ class GumTreePythonFunctionInfo(
else -> emptyList()
}
}
return params.map { FunctionInfoParameter(it.originalToken, getElementType(it)?.originalToken) }
return params.mapNotNull {
FunctionInfoParameter(it.originalToken ?: return@mapNotNull null, getElementType(it)?.originalToken)
}
}

companion object {
Expand Down
18 changes: 15 additions & 3 deletions src/main/kotlin/astminer/pipeline/Pipeline.kt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import astminer.parse.getParsingResultFactory
import astminer.pipeline.branch.FilePipelineBranch
import astminer.pipeline.branch.FunctionPipelineBranch
import astminer.pipeline.branch.IllegalLabelExtractorException
import me.tongfei.progressbar.ProgressBar
import java.io.File

/**
Expand Down Expand Up @@ -44,18 +45,29 @@ class Pipeline(private val config: PipelineConfig) {
* Runs the pipeline that is defined in the [config].
*/
fun run() {
println("Working in ${config.numOfThreads} thread(s)")
for (language in config.parser.languages) {
println("Parsing $language")
val parsingResultFactory = getParsingResultFactory(language, config.parser.name)

println("Collecting files...")
val files = getProjectFilesWithExtension(inputDirectory, language.fileExtension)
println("${files.size} files retrieved")

val progressBar = ProgressBar("", files.size.toLong())

createStorage(language).use { storage ->
parsingResultFactory.parseFiles(files) { parseResult ->
for (labeledResult in branch.process(parseResult)) {
storage.store(labeledResult)
synchronized(storage) {
parsingResultFactory.parseFilesInThreads(files, config.numOfThreads) { parseResult ->
for (labeledResult in branch.process(parseResult)) {
storage.store(labeledResult)
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, you can update progress bar here

progressBar.step()
}
}
}
progressBar.close()
}
println("Done!")
}
}
4 changes: 1 addition & 3 deletions src/test/kotlin/astminer/common/DummyNode.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@ import java.io.File
class DummyNode(
override val typeLabel: String,
override val children: MutableList<DummyNode> = mutableListOf()
) : Node() {
) : Node(typeLabel) {

override val parent: Node? = null

override val originalToken: String = typeLabel

init {
// Tokens may change after normalization, for tests we want tokens to be unchanged
technicalToken = typeLabel
Expand Down
2 changes: 1 addition & 1 deletion src/test/kotlin/astminer/featureextraction/PrettyNode.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package astminer.featureextraction

import astminer.common.model.Node

class PrettyNode(override val typeLabel: String, override val originalToken: String) : Node() {
class PrettyNode(override val typeLabel: String, originalToken: String) : Node(originalToken) {
override var children: MutableList<PrettyNode> = ArrayList()
override var parent: PrettyNode? = null
set(value) {
Expand Down
Loading