From 65a489db4f7e0f475468fd91c3b63f3afa17c0a4 Mon Sep 17 00:00:00 2001 From: ilya Date: Sat, 23 Oct 2021 12:53:01 +0300 Subject: [PATCH 01/48] gumtree update --- build.gradle.kts | 10 +++++----- src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt | 10 +++++----- .../gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt | 11 +++++++++-- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/build.gradle.kts b/build.gradle.kts index 856f70c5..2df78a4e 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -25,11 +25,11 @@ dependencies { // ===== Parsers ===== antlr("org.antlr:antlr4:4.7.1") // https://mvnrepository.com/artifact/com.github.gumtreediff - api("com.github.gumtreediff", "core", "2.1.2") - api("com.github.gumtreediff", "client", "2.1.2") - api("com.github.gumtreediff", "gen.jdt", "2.1.2") - api("com.github.gumtreediff", "gen.srcml","2.1.2") - api("com.github.gumtreediff", "gen.python", "2.1.2") + api("com.github.gumtreediff", "core", "3.0.0") + api("com.github.gumtreediff", "client", "3.0.0") + api("com.github.gumtreediff", "gen.jdt", "3.0.0") + api("com.github.gumtreediff", "gen.srcml","3.0.0") + api("com.github.gumtreediff", "gen.python", "3.0.0") // https://github.com/javaparser/javaparser implementation("com.github.javaparser:javaparser-symbol-solver-core:3.22.1") diff --git a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt index 94e1316b..8f3a1f46 100644 --- a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt +++ b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt @@ -1,16 +1,16 @@ package astminer.parse.gumtree import astminer.common.model.Node -import com.github.gumtreediff.tree.ITree +import com.github.gumtreediff.tree.Tree import com.github.gumtreediff.tree.TreeContext -class GumTreeNode(val wrappedNode: ITree, val context: TreeContext, override var parent: GumTreeNode?) : +class GumTreeNode(val wrappedNode: Tree, override var parent: GumTreeNode?) : Node(wrappedNode.label) { override val typeLabel: String - get() = context.getTypeLabel(wrappedNode) + get() = wrappedNode.type.name override val children: MutableList by lazy { - wrappedNode.children.map { GumTreeNode(it, context, this) }.toMutableList() + wrappedNode.children.map { GumTreeNode(it, this) }.toMutableList() } override fun removeChildrenOfType(typeLabel: String) { @@ -29,4 +29,4 @@ class GumTreeNode(val wrappedNode: ITree, val context: TreeContext, override var override fun preOrder(): List = super.preOrder().map { it as GumTreeNode } } -fun wrapGumTreeNode(treeContext: TreeContext): GumTreeNode = GumTreeNode(treeContext.root, treeContext, null) +fun wrapGumTreeNode(treeContext: TreeContext): GumTreeNode = GumTreeNode(treeContext.root, null) diff --git a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt index 943c3de9..f6a211e7 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt @@ -4,6 +4,7 @@ import astminer.common.model.EnclosingElement import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo import astminer.common.model.FunctionInfoParameter +import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.gumtree.GumTreeNode import mu.KotlinLogging @@ -18,7 +19,9 @@ class GumTreeJavaJDTFunctionInfo( override val returnType: String? = root.getElementType() override val enclosingElement: EnclosingElement? = collectEnclosingClass() override val parameters: List? = - try { collectParameters() } catch (e: IllegalStateException) { + try { + collectParameters() + } catch (e: IllegalStateException) { logger.warn { e.message } null } @@ -65,12 +68,16 @@ class GumTreeJavaJDTFunctionInfo( private fun GumTreeNode.getElementName(): String = getChildOfType(TypeLabels.simpleName)?.originalToken ?: error("No name found for element") - private fun GumTreeNode.getElementType(): String? = children.firstOrNull { it.isTypeNode() }?.originalToken + private fun GumTreeNode.getElementType(): String? = children.firstOrNull { it.isTypeNode() }?.preOrder() + ?.mapNotNull { if (it.typeLabel == TypeLabels.arrayDimensions) "[]" else it.originalToken } + ?.joinToString(separator = "") private fun GumTreeNode.isTypeNode() = typeLabel.endsWith("Type") companion object { private object TypeLabels { + const val arrayType = "ArrayType" + const val arrayDimensions = "Dimension" const val simpleName = "SimpleName" const val typeDeclaration = "TypeDeclaration" const val enumDeclaration = "EnumDeclaration" From 4d88ddea3187b82ad63329df23c8af3bd878f33b Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 15:08:52 +0300 Subject: [PATCH 02/48] normalization and token logic extracted as separate class --- src/main/kotlin/astminer/common/TreeUtil.kt | 43 ------------------- .../common/model/FunctionInfoModel.kt | 2 +- .../astminer/common/model/ParsingModel.kt | 25 +++-------- .../astminer/featureextraction/TreeFeature.kt | 2 +- .../kotlin/astminer/filters/CommonFilters.kt | 2 +- .../astminer/filters/FunctionFilters.kt | 4 +- .../FunctionNameLabelExtractor.kt | 8 ++-- .../kotlin/astminer/parse/ForeignParser.kt | 8 +++- .../kotlin/astminer/parse/antlr/AntlrUtil.kt | 5 +-- .../parse/antlr/java/AntlrJavaFunctionInfo.kt | 6 +-- .../javascript/AntlrJavaScriptElementInfo.kt | 6 +-- .../parse/antlr/php/ANTLRPHPFunctionInfo.kt | 14 +++--- .../antlr/python/AntlrPythonFunctionInfo.kt | 4 +- .../parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt | 8 ++-- .../java/jdt/GumTreeJavaJDTFunctionInfo.kt | 10 ++--- .../srcML/GumTreeJavaSrcmlFunctionInfo.kt | 10 ++--- .../python/GumTreePythonFunctionInfo.kt | 4 +- .../javaparser/JavaparserFunctionInfo.kt | 12 +++--- .../parse/spoon/SpoonJavaFunctionInfo.kt | 10 ++--- .../java/TreeSitterJavaFunctionInfo.kt | 14 +++--- src/main/kotlin/astminer/paths/PathUtil.kt | 2 +- src/main/kotlin/astminer/paths/PathWorker.kt | 2 +- .../astminer/storage/ast/CsvAstStorage.kt | 4 +- .../astminer/storage/ast/JsonAstStorage.kt | 2 +- .../astminer/storage/path/PathBasedStorage.kt | 2 +- ...ilTest.kt => Code2VecNormalizationTest.kt} | 13 +++--- src/test/kotlin/astminer/common/DummyNode.kt | 2 +- .../astminer/featureextraction/PrettyNode.kt | 2 +- .../FunctionNameLabelExtractorTest.kt | 6 +-- .../GumTreePythonFunctionSplitterTest.kt | 2 +- 30 files changed, 92 insertions(+), 142 deletions(-) rename src/test/kotlin/astminer/common/{TreeUtilTest.kt => Code2VecNormalizationTest.kt} (82%) diff --git a/src/main/kotlin/astminer/common/TreeUtil.kt b/src/main/kotlin/astminer/common/TreeUtil.kt index 1a53158d..129178bf 100644 --- a/src/main/kotlin/astminer/common/TreeUtil.kt +++ b/src/main/kotlin/astminer/common/TreeUtil.kt @@ -1,45 +1,2 @@ package astminer.common -const val EMPTY_TOKEN = "EMPTY" - -/** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - -val newLineReg = "\\\\n".toRegex() -val whitespaceReg = "//s+".toRegex() -val quotesApostrophesCommasReg = "[\"',]".toRegex() -val unicodeWeirdCharReg = "\\P{Print}".toRegex() -val notALetterReg = "[^A-Za-z]".toRegex() - -fun normalizeToken(token: String, defaultToken: String): String { - val cleanToken = token.lowercase() - .replace(newLineReg, "") // escaped new line - .replace(whitespaceReg, "") // whitespaces - .replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas - .replace(unicodeWeirdCharReg, "") // unicode weird characters - - val stripped = cleanToken.replace(notALetterReg, "") - - return stripped.ifEmpty { - val carefulStripped = cleanToken.replace(" ", "_") - carefulStripped.ifEmpty { - defaultToken - } - } -} - -/** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - -val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - -fun splitToSubtokens(token: String) = token - .trim() - .split(splitRegex) - .map { s -> normalizeToken(s, "") } - .filter { it.isNotEmpty() } - .toList() diff --git a/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt b/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt index 7b2f4a9e..16b2e2e1 100644 --- a/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt +++ b/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt @@ -20,7 +20,7 @@ interface NamedTree { val nameNode: T? get() = notImplemented("nameNode") val name: String? - get() = nameNode?.originalToken + get() = nameNode?.token?.original val root: T get() = notImplemented("root") val body: T? diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index 5f6c8dfa..bbdc2f9c 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -1,31 +1,20 @@ package astminer.common.model -import astminer.common.EMPTY_TOKEN -import astminer.common.splitToSubtokens import java.io.File import java.io.InputStream -import java.util.* -abstract class Node(val originalToken: String?) { +abstract class Node(val token: Token) { + constructor(originalToken: String?): this(Token(originalToken, null)) + abstract val typeLabel: String abstract val children: List abstract val parent: Node? - val normalizedToken: String = - originalToken?.let { - val subtokens = splitToSubtokens(it) - if (subtokens.isEmpty()) EMPTY_TOKEN else subtokens.joinToString(TOKEN_DELIMITER) - } ?: EMPTY_TOKEN - - var technicalToken: String? = null - - val token: String - get() = technicalToken ?: normalizedToken - val metadata: MutableMap = HashMap() fun isLeaf() = children.isEmpty() override fun toString(): String = "$typeLabel : $token" + fun prettyPrint(indent: Int = 0, indentSymbol: String = "--") { repeat(indent) { print(indentSymbol) } println(this) @@ -52,10 +41,6 @@ abstract class Node(val originalToken: String?) { fun postOrderIterator(): Iterator = postOrder().listIterator() open fun postOrder(): List = mutableListOf().also { doTraversePostOrder(it) } - - companion object { - const val TOKEN_DELIMITER = "|" - } } /** Node simplest implementation **/ @@ -63,7 +48,7 @@ class SimpleNode( override val typeLabel: String, override val children: MutableList, override val parent: Node?, - token: String? + token: Token ) : Node(token) { override fun removeChildrenOfType(typeLabel: String) { children.removeIf { it.typeLabel == typeLabel } diff --git a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt index 9a76c7fc..3068c2e9 100644 --- a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt +++ b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt @@ -57,7 +57,7 @@ object Tokens : TreeFeature> { private fun findTokens(node: Node, tokensList: MutableList): List { node.children.forEach { findTokens(it, tokensList) } - tokensList.add(node.token) + tokensList.add(node.token.final) return tokensList } } diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index a0f47848..7f6e32d3 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -24,7 +24,7 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n */ class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter { private fun validateTree(root: Node) = - !root.preOrder().any { node -> node.token.split(Node.TOKEN_DELIMITER).size > maxWordsNumber } + !root.preOrder().any { node -> node.token.final.split(Code2VecNormalization.TOKEN_DELIMITER).size > maxWordsNumber } override fun validate(functionInfo: FunctionInfo) = validateTree(functionInfo.root) diff --git a/src/main/kotlin/astminer/filters/FunctionFilters.kt b/src/main/kotlin/astminer/filters/FunctionFilters.kt index d5316459..2edc50f4 100644 --- a/src/main/kotlin/astminer/filters/FunctionFilters.kt +++ b/src/main/kotlin/astminer/filters/FunctionFilters.kt @@ -1,9 +1,9 @@ package astminer.filters +import astminer.common.model.Code2VecNormalization import astminer.common.model.FunctionFilter import astminer.common.model.FunctionInfo import astminer.common.model.Node -import astminer.common.splitToSubtokens /** * Filter that excludes functions that have at least one of modifiers from the [excludeModifiers] list. @@ -38,7 +38,7 @@ object ConstructorFilter : FunctionFilter { class FunctionNameWordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter { override fun validate(functionInfo: FunctionInfo): Boolean { val name = functionInfo.name - return name != null && splitToSubtokens(name).size <= maxWordsNumber + return name != null && Code2VecNormalization.splitToSubtokens(name).size <= maxWordsNumber } } diff --git a/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt b/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt index 9a62e645..e5abb201 100644 --- a/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt +++ b/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt @@ -14,13 +14,13 @@ object FunctionNameLabelExtractor : FunctionLabelExtractor { private const val RECURSIVE_CALL_TOKEN = "SELF" override fun process(functionInfo: FunctionInfo): LabeledResult? { - val normalizedName = functionInfo.nameNode?.normalizedToken ?: return null + val normalizedName = functionInfo.nameNode?.token?.normalized ?: return null functionInfo.root.preOrder().forEach { node -> - if (node.originalToken == functionInfo.nameNode?.originalToken) { - node.technicalToken = RECURSIVE_CALL_TOKEN + if (node.token.original == functionInfo.nameNode?.token?.original) { + node.token.technical = RECURSIVE_CALL_TOKEN } } - functionInfo.nameNode?.technicalToken = HIDDEN_METHOD_NAME_TOKEN + functionInfo.nameNode?.token?.technical = HIDDEN_METHOD_NAME_TOKEN return LabeledResult(functionInfo.root, normalizedName, functionInfo.qualifiedPath) } } diff --git a/src/main/kotlin/astminer/parse/ForeignParser.kt b/src/main/kotlin/astminer/parse/ForeignParser.kt index d7ff2df0..b0f42447 100644 --- a/src/main/kotlin/astminer/parse/ForeignParser.kt +++ b/src/main/kotlin/astminer/parse/ForeignParser.kt @@ -2,6 +2,7 @@ package astminer.parse import astminer.common.model.Parser import astminer.common.model.SimpleNode +import astminer.common.model.Token import astminer.config.FileExtension import astminer.config.ParserType import kotlinx.serialization.Serializable @@ -57,7 +58,12 @@ private fun launchScript(args: List): String { private fun convertFromForeignTree(context: ForeignTree, rootId: Int = 0, parent: SimpleNode? = null): SimpleNode { val foreignNode = context.tree[rootId] - val node = SimpleNode(foreignNode.nodeType, mutableListOf(), parent, foreignNode.token) + val node = SimpleNode( + typeLabel = foreignNode.nodeType, + children = mutableListOf(), + parent = parent, + token = Token(foreignNode.token, null) + ) val children = foreignNode.children.map { convertFromForeignTree(context, it, node) } node.children.addAll(children) return node diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt b/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt index e9aea811..253a14c2 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt +++ b/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt @@ -1,6 +1,5 @@ package astminer.parse.antlr -import astminer.common.EMPTY_TOKEN import astminer.common.model.Node import org.antlr.v4.runtime.ParserRuleContext import org.antlr.v4.runtime.Vocabulary @@ -58,7 +57,7 @@ fun compressTree(root: AntlrNode): AntlrNode { val compressedNode = AntlrNode( root.typeLabel + "|" + child.typeLabel, root.parent, - child.originalToken + child.token.original ) compressedNode.replaceChildren(child.children) compressedNode @@ -83,7 +82,7 @@ fun AntlrNode.hasFirstLabel(label: String): Boolean = firstLabel() == label fun AntlrNode.firstLabelIn(labels: List): Boolean = labels.contains(firstLabel()) fun Node.getTokensFromSubtree(): String = - if (isLeaf()) originalToken ?: EMPTY_TOKEN else children.joinToString(separator = "") { it.getTokensFromSubtree() } + if (isLeaf()) token.original ?: "" else children.joinToString(separator = "") { it.getTokensFromSubtree() } fun AntlrNode.getItOrChildrenOfType(typeLabel: String): List = if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel).map { it } diff --git a/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt index a99fa460..def07552 100644 --- a/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt @@ -23,12 +23,12 @@ class AntlrJavaFunctionInfo(override val root: AntlrNode, override val filePath: override val modifiers: List? = root.parent?.children ?.filter { it.hasFirstLabel(METHOD_MODIFIER) && !it.hasLastLabel(METHOD_ANNOTATION) } - ?.mapNotNull { it.originalToken } + ?.mapNotNull { it.token.original } override val annotations: List? = root.parent?.children ?.filter { it.hasLastLabel(METHOD_ANNOTATION) } - ?.mapNotNull { it.getChildOfType(ANNOTATION_NAME)?.originalToken } + ?.mapNotNull { it.getChildOfType(ANNOTATION_NAME)?.token?.original } override val body: AntlrNode? = root.children.find { it.hasFirstLabel(METHOD_BODY_NODE) } @@ -51,7 +51,7 @@ class AntlrJavaFunctionInfo(override val root: AntlrNode, override val filePath: } EnclosingElement( type = enclosingType, - name = enclosingClassNode.getChildOfType(ENCLOSING_NAME_NODE)?.originalToken, + name = enclosingClassNode.getChildOfType(ENCLOSING_NAME_NODE)?.token?.original, root = enclosingClassNode ) } diff --git a/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt b/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt index d5c12555..fedf1b5a 100644 --- a/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt @@ -30,7 +30,7 @@ abstract class AntlrJavaScriptElementInfo(override val root: AntlrNode, override private fun getEnclosingElementName(enclosingRoot: AntlrNode?): String? { return enclosingRoot?.children?.firstOrNull { it.hasLastLabel(ENCLOSING_ELEMENT_NAME_NODE) - }?.originalToken + }?.token?.original } private fun getEnclosingElementType(enclosingRoot: AntlrNode): EnclosingElementType { @@ -59,8 +59,8 @@ abstract class AntlrJavaScriptElementInfo(override val root: AntlrNode, override .map { it.getChildOfType(PARAMETER_NAME_NODE) ?: it } } return parameterNameNodes.map { - check(it.originalToken != null) { "Parameter name wasn't found" } - FunctionInfoParameter(name = it.originalToken, type = null) + check(it.token.original != null) { "Parameter name wasn't found" } + FunctionInfoParameter(name = it.token.original, type = null) } } diff --git a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt index e7642cd2..b96de262 100644 --- a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt @@ -15,7 +15,9 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: override val nameNode: AntlrNode? = root.getChildOfType(FUNCTION_NAME) override val enclosingElement: EnclosingElement? = collectEnclosingElement() override val parameters: List? = - try { collectParameters() } catch (e: IllegalStateException) { + try { + collectParameters() + } catch (e: IllegalStateException) { logger.warn { e.message } null } @@ -65,18 +67,18 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: val isPassedByReference = parameterNode.getChildOfType(REFERENCE) != null if (parameterNode.hasLastLabel(PARAMETER_NAME)) { - return parameterNode.originalToken ?: error("No name was found for a parameter") + return parameterNode.token.original ?: error("No name was found for a parameter") } val varInit = parameterNode.getItOrChildrenOfType(VAR_DECLARATION).first() - val name = varInit.getItOrChildrenOfType(PARAMETER_NAME).first().originalToken + val name = varInit.getItOrChildrenOfType(PARAMETER_NAME).first().token.original ?: error("No name was found for a parameter") return (if (isPassedByReference) "&" else "") + (if (isSplattedArg) "..." else "") + name } - private fun getElementType(element: AntlrNode): String? = element.getChildOfType(TYPE)?.originalToken + private fun getElementType(element: AntlrNode): String? = element.getChildOfType(TYPE)?.token?.original private fun collectEnclosingElement(): EnclosingElement? { val enclosing = root.findEnclosingElementBy { it.isPossibleEnclosing() } ?: return null @@ -104,8 +106,8 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: private fun getEnclosingElementName(enclosing: AntlrNode): String? { return when { - enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.originalToken - enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.originalToken + enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.token?.original + enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.token?.original else -> error("No type can be associated") } } diff --git a/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt index 365dec5f..6ccf3f62 100644 --- a/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt @@ -38,7 +38,7 @@ class AntlrPythonFunctionInfo(override val root: AntlrNode, override val filePat val parameterHaveNoDefaultOrType = parameterNode.hasLastLabel(PARAMETER_NAME_NODE) val parameterNameNode = if (parameterHaveNoDefaultOrType) parameterNode else parameterNode.getChildOfType(PARAMETER_NAME_NODE) - val parameterName = parameterNameNode?.originalToken + val parameterName = parameterNameNode?.token?.original require(parameterName != null) { "Method name was not found" } val parameterType = parameterNode.getChildOfType(PARAMETER_TYPE_NODE)?.getTokensFromSubtree() @@ -63,7 +63,7 @@ class AntlrPythonFunctionInfo(override val root: AntlrNode, override val filePat EnclosingElementType.Method, EnclosingElementType.Function -> enclosingNode.getChildOfType(FUNCTION_NAME_NODE) else -> error("Enclosing node can only be function or class") - }?.originalToken + }?.token?.original return EnclosingElement( type = type, name = name, diff --git a/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt b/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt index 711293b7..72e05617 100644 --- a/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt @@ -24,7 +24,7 @@ class FuzzyCppFunctionInfo(override val root: FuzzyNode, override val filePath: private fun collectNameNode(): FuzzyNode? = root.getChildOfType(METHOD_NAME_NODE) as? FuzzyNode private fun collectReturnType(): String? = - root.getChildOfType(METHOD_RETURN_NODE)?.getChildOfType(METHOD_RETURN_TYPE_NODE)?.originalToken + root.getChildOfType(METHOD_RETURN_NODE)?.getChildOfType(METHOD_RETURN_TYPE_NODE)?.token?.original private fun collectEnclosingClass(): EnclosingElement? { val enclosingClass = findEnclosingClass() ?: return null @@ -40,13 +40,13 @@ class FuzzyCppFunctionInfo(override val root: FuzzyNode, override val filePath: root.findEnclosingElementBy { it.typeLabel == CLASS_DECLARATION_NODE } private fun findEnclosingClassName(enclosingClass: FuzzyNode): String? = - enclosingClass.getChildOfType(CLASS_NAME_NODE)?.originalToken + enclosingClass.getChildOfType(CLASS_NAME_NODE)?.token?.original private fun collectParameters(): List { val parameters = root.getChildrenOfType(METHOD_PARAMETER_NODE) return parameters.map { param -> - val type = param.getChildOfType(PARAMETER_TYPE_NODE)?.originalToken - val name = param.getChildOfType(PARAMETER_NAME_NODE)?.originalToken ?: "" + val type = param.getChildOfType(PARAMETER_TYPE_NODE)?.token?.original + val name = param.getChildOfType(PARAMETER_NAME_NODE)?.token?.original ?: "" FunctionInfoParameter(name, type) } } diff --git a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt index f6a211e7..b7dd3c09 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt @@ -29,12 +29,12 @@ class GumTreeJavaJDTFunctionInfo( override val modifiers: List = root .children .filter { it.typeLabel == "Modifier" } - .mapNotNull { it.originalToken } + .mapNotNull { it.token.original } override val annotations: List = root .children .filter { it.typeLabel == "MarkerAnnotation" } - .mapNotNull { it.children.first().originalToken } + .mapNotNull { it.children.first().token.original } override val isConstructor: Boolean = enclosingElement?.name?.equals(name) ?: false @@ -42,7 +42,7 @@ class GumTreeJavaJDTFunctionInfo( private fun collectEnclosingClass(): EnclosingElement? = extractWithLogger(logger) { val enclosingNode = getEnclosingClassNode(root.parent) ?: return@extractWithLogger null - val name = enclosingNode.getChildOfType(TypeLabels.simpleName)?.originalToken + val name = enclosingNode.getChildOfType(TypeLabels.simpleName)?.token?.original val type = when (enclosingNode.typeLabel) { TypeLabels.typeDeclaration -> EnclosingElementType.Class TypeLabels.enumDeclaration -> EnclosingElementType.Enum @@ -66,10 +66,10 @@ class GumTreeJavaJDTFunctionInfo( } private fun GumTreeNode.getElementName(): String = - getChildOfType(TypeLabels.simpleName)?.originalToken ?: error("No name found for element") + getChildOfType(TypeLabels.simpleName)?.token?.original ?: error("No name found for element") private fun GumTreeNode.getElementType(): String? = children.firstOrNull { it.isTypeNode() }?.preOrder() - ?.mapNotNull { if (it.typeLabel == TypeLabels.arrayDimensions) "[]" else it.originalToken } + ?.mapNotNull { if (it.typeLabel == TypeLabels.arrayDimensions) "[]" else it.token.original } ?.joinToString(separator = "") private fun GumTreeNode.isTypeNode() = typeLabel.endsWith("Type") diff --git a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt index 7e75468f..57fbbcfa 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt @@ -30,7 +30,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val override val annotations: List? = run { root.children.filter { it.typeLabel == ANNOTATION }.map { - val token = it.getChildOfType(NAME)?.originalToken + val token = it.getChildOfType(NAME)?.token?.original if (token == null) { logger.warn { "Annotation in function $name in file $filePath don't have a name" } return@run null @@ -42,7 +42,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val override val modifiers: List? = run { val type = checkNotNull(root.getChildOfType(TYPE)) { "Function $name in file $filePath doesn't have a type" } type.children.filter { it.typeLabel == MODIFIER }.map { - val token = it.originalToken + val token = it.token.original if (token == null) { logger.warn { "Modifier in function $name in file $filePath doesn't have a name" } return@run null @@ -60,7 +60,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val private fun assembleParameter(node: GumTreeNode): FunctionInfoParameter { val parameter = checkNotNull(node.getChildOfType(VAR_DECLARATION)) { "No variable found" } - val name = checkNotNull(parameter.getChildOfType(NAME)?.originalToken) { "Parameter name was not found" } + val name = checkNotNull(parameter.getChildOfType(NAME)?.token?.original) { "Parameter name was not found" } val type = parameter.extractType() return FunctionInfoParameter(name, type) } @@ -73,7 +73,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val } EnclosingElement( type = enclosingType, - name = this.getChildOfType(NAME)?.originalToken ?: return@extractWithLogger null, + name = this.getChildOfType(NAME)?.token?.original ?: return@extractWithLogger null, root = this ) } @@ -84,7 +84,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val if (node.typeLabel == ARRAY_BRACKETS) { "[]" } else { - checkNotNull(node.originalToken) { "No type found" } + checkNotNull(node.token.original) { "No type found" } } } } diff --git a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt index c9f77217..00f4b48e 100644 --- a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt @@ -43,7 +43,7 @@ class GumTreePythonFunctionInfo( val enclosing = findEnclosingClass() ?: return null return EnclosingElement( type = EnclosingElementType.Class, - name = enclosing.originalToken, + name = enclosing.token.original, root = enclosing ) } @@ -63,7 +63,7 @@ class GumTreePythonFunctionInfo( } } return params.mapNotNull { - FunctionInfoParameter(it.originalToken ?: return@mapNotNull null, getElementType(it)?.originalToken) + FunctionInfoParameter(it.token.original ?: return@mapNotNull null, getElementType(it)?.token?.original) } } diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt b/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt index 9d5c5d63..9d754702 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt @@ -27,7 +27,7 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil override val modifiers: List? = run { root.children.filter { it.typeLabel == MODIFIER }.map { - val token = it.originalToken + val token = it.token.original if (token == null) { logger.warn { "Modifier for function $name in file $filePath doesn't have a token" } return@run null @@ -38,7 +38,7 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil override val annotations: List? = run { root.children.filter { it.typeLabel in POSSIBLE_ANNOTATION_TYPES }.map { - val token = it.getChildOfType(ANNOTATION_NAME)?.originalToken?.split(".")?.last() + val token = it.getChildOfType(ANNOTATION_NAME)?.token?.original?.split(".")?.last() if (token == null) { logger.warn { "Annotation for function $name in file $filePath doesn't have a token" } return@run null @@ -62,8 +62,8 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil checkNotNull(possibleTypeNode) { "Couldn't find parameter type node" } val typeToken = when (possibleTypeNode.typeLabel) { ARRAY_TYPE -> getParameterType(possibleTypeNode) + ARRAY_BRACKETS - PRIMITIVE_TYPE -> possibleTypeNode.originalToken - CLASS_OR_INTERFACE_TYPE -> possibleTypeNode.getChildOfType(CLASS_NAME)?.originalToken + PRIMITIVE_TYPE -> possibleTypeNode.token.original + CLASS_OR_INTERFACE_TYPE -> possibleTypeNode.getChildOfType(CLASS_NAME)?.token?.original else -> null } checkNotNull(typeToken) { "Couldn't extract parameter type from node" } @@ -71,12 +71,12 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil } private fun getParameterName(node: JavaParserNode): String { - val name = checkNotNull(node.getChildOfType(PARAMETER_NAME)?.originalToken) { "Couldn't find parameter name" } + val name = checkNotNull(node.getChildOfType(PARAMETER_NAME)?.token?.original) { "Couldn't find parameter name" } return name.replace(ARRAY_BRACKETS_REGEX, "") } private fun JavaParserNode.assembleEnclosingClass(): EnclosingElement? = extractWithLogger(logger) { - val name = this.getChildOfType(CLASS_NAME)?.originalToken + val name = this.getChildOfType(CLASS_NAME)?.token?.original val type = when (this.typeLabel) { CLASS_OR_INTERFACE_DECLARATION -> EnclosingElementType.Class ENUM_DECLARATION -> EnclosingElementType.Enum diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt index 3247ab85..8f9b7503 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt @@ -23,11 +23,11 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: override val annotations: List? = run { root.getChildrenOfType(ANNOTATION_NODE_TYPE).map { - it.getChildOfType(TYPE_REFERENCE)?.originalToken ?: return@run null + it.getChildOfType(TYPE_REFERENCE)?.token?.original ?: return@run null } } - override val returnType: String? = root.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.originalToken + override val returnType: String? = root.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.token?.original override val body: SpoonNode? = root.getChildOfType(BLOCK) @@ -37,8 +37,8 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: root.findEnclosingElementBy { it.typeLabel in POSSIBLE_ENCLOSING_ELEMENTS }?.assembleEnclosingClass() private fun assembleParameter(parameterNode: SpoonNode): FunctionInfoParameter { - val type = parameterNode.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.originalToken - val name = parameterNode.originalToken + val type = parameterNode.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.token?.original + val name = parameterNode.token.original checkNotNull(name) { "Couldn't find parameter name token" } return FunctionInfoParameter(name, type) } @@ -49,7 +49,7 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: CLASS_DECLARATION_TYPE -> EnclosingElementType.Class else -> error("Can't find any enclosing type association") } - EnclosingElement(type, this.originalToken, root) + EnclosingElement(type, this.token.original, root) } companion object { diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 5c5360d6..57ff2d29 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,6 +1,6 @@ package astminer.parse.treesitter.java -import astminer.common.EMPTY_TOKEN +import astminer.common.model.Code2VecNormalization import astminer.common.model.* import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.findEnclosingElementBy @@ -20,7 +20,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil annotations.children .filter { it.typeLabel in possibleAnnotations } .map { annotation -> annotation.preOrder().filter { it.typeLabel in listOf(NAME, SCOPE_IDENTIFIER, DOT) } } - .map { nameNodes -> nameNodes.map { it.originalToken ?: "" } } + .map { nameNodes -> nameNodes.map { it.token.original ?: "" } } .map { nameNodesWithToken -> nameNodesWithToken.joinToString(separator = "") } } @@ -28,7 +28,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil val modifiers = root.getChildOfType(MODIFIERS) ?: return@extractWithLogger listOf() modifiers.children .filter { it.typeLabel in possibleModifiers } - .map { it.originalToken } + .map { it.token.original } .map { checkNotNull(it) { "Modifier without a token" } } } @@ -37,9 +37,9 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil parametersRoot.children.filter { it.typeLabel in possibleParameters }.map { parameter -> val possibleNameNode = parameter.getChildOfType(NAME) val name = if (possibleNameNode != null) { - possibleNameNode.originalToken + possibleNameNode.token.original } else { - parameter.getChildOfType(VARIABLE_DECLARATOR)?.getChildOfType(NAME)?.originalToken + parameter.getChildOfType(VARIABLE_DECLARATOR)?.getChildOfType(NAME)?.token?.original } checkNotNull(name) { "Can't find parameter name" } @@ -57,7 +57,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil var collectedType = returnTypeNode.getTokensFromSubtree() if (returnTypeNode.typeLabel == ARRAY_TYPE) { - collectedType = collectedType.replace(EMPTY_TOKEN, "[]") + collectedType = collectedType.replace(Code2VecNormalization.EMPTY_TOKEN, "[]") } return@run collectedType } @@ -65,7 +65,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil override val enclosingElement: EnclosingElement? = extractWithLogger(logger) { val enclosingNode = root.findEnclosingElementBy { it.typeLabel in possible_enclosings } ?: return@extractWithLogger null - val name = enclosingNode.getChildOfType(NAME)?.originalToken + val name = enclosingNode.getChildOfType(NAME)?.token?.original val type = when (enclosingNode.typeLabel) { CLASS_DECLARATION -> EnclosingElementType.Class ENUM_DECLARATION -> EnclosingElementType.Enum diff --git a/src/main/kotlin/astminer/paths/PathUtil.kt b/src/main/kotlin/astminer/paths/PathUtil.kt index ad8c9cc8..4baa90cd 100644 --- a/src/main/kotlin/astminer/paths/PathUtil.kt +++ b/src/main/kotlin/astminer/paths/PathUtil.kt @@ -2,7 +2,7 @@ package astminer.paths import astminer.common.model.* -fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token }): PathContext { +fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token.final }): PathContext { val startToken = getToken(path.upwardNodes.first()) val endToken = getToken(path.downwardNodes.last()) val astNodes = path.upwardNodes.map { OrientedNodeType(it.typeLabel, Direction.UP) } + diff --git a/src/main/kotlin/astminer/paths/PathWorker.kt b/src/main/kotlin/astminer/paths/PathWorker.kt index a19bd487..de78686e 100644 --- a/src/main/kotlin/astminer/paths/PathWorker.kt +++ b/src/main/kotlin/astminer/paths/PathWorker.kt @@ -49,7 +49,7 @@ class PathWorker { val paths: MutableList = ArrayList() iterator.forEach { currentNode -> if (currentNode.isLeaf()) { - if (currentNode.token.isNotEmpty()) { + if (currentNode.token.final.isNotEmpty()) { currentNode.setPathPieces(listOf(listOf(currentNode))) } } else { diff --git a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt index 9ef41b0b..7691ff28 100644 --- a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt @@ -28,7 +28,7 @@ class CsvAstStorage(override val outputDirectoryPath: String) : Storage { override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { for (node in labeledResult.root.preOrder()) { - tokensMap.record(node.token) + tokensMap.record(node.token.final) nodeTypesMap.record(node.typeLabel) } val writer = astsPrintWriters.getOrPut(holdout) { holdout.resolveHoldout() } @@ -55,7 +55,7 @@ class CsvAstStorage(override val outputDirectoryPath: String) : Storage { } internal fun astString(node: Node): String { - return "${tokensMap.getId(node.token)} ${nodeTypesMap.getId(node.typeLabel)}{${ + return "${tokensMap.getId(node.token.final)} ${nodeTypesMap.getId(node.typeLabel)}{${ node.children.joinToString(separator = "", transform = ::astString) }}" } diff --git a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt index 3ce2463e..15fb27fc 100644 --- a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt @@ -34,7 +34,7 @@ class JsonAstStorage(override val outputDirectoryPath: String, private val withP private data class OutputNode(val token: String, val typeLabel: String, val children: List) private fun TreeFlattener.EnumeratedNode.toOutputNode() = - OutputNode(node.token, node.typeLabel, children.map { it.id }) + OutputNode(node.token.final, node.typeLabel, children.map { it.id }) override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { val outputNodes = treeFlattener.flatten(labeledResult.root).map { it.toOutputNode() } diff --git a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt index d41ad286..fec17dc2 100644 --- a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt +++ b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt @@ -54,7 +54,7 @@ abstract class PathBasedStorage( return LabeledPathContexts( labeledResult.label, paths.map { astPath -> - toPathContext(astPath) { it.token.replace("\n", "\\n") } + toPathContext(astPath) { it.token.final.replace("\n", "\\n") } } ) } diff --git a/src/test/kotlin/astminer/common/TreeUtilTest.kt b/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt similarity index 82% rename from src/test/kotlin/astminer/common/TreeUtilTest.kt rename to src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt index 7a263f7e..504fda39 100644 --- a/src/test/kotlin/astminer/common/TreeUtilTest.kt +++ b/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt @@ -1,9 +1,10 @@ package astminer.common +import astminer.common.model.Code2VecNormalization import org.junit.Assert import org.junit.Test -class TreeUtilTest { +class Code2VecNormalizationTest { private val defaultToken = "EMPTY" @Test @@ -29,7 +30,7 @@ class TreeUtilTest { Assert.assertEquals( "All whitespace characters and punctuation should be removed, keeping only letters", expectedToken, - normalizeToken(token, defaultToken) + Code2VecNormalization.normalizeSubToken(token, defaultToken) ) } @@ -40,18 +41,18 @@ class TreeUtilTest { Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - normalizeToken(token, defaultToken) + Code2VecNormalization.normalizeSubToken(token, defaultToken) ) } @Test fun testNormalizeEmptyToken() { val token = "\n\n" - val expectedToken = EMPTY_TOKEN + val expectedToken = Code2VecNormalization.EMPTY_TOKEN Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - normalizeToken(token, defaultToken) + Code2VecNormalization.normalizeSubToken(token, defaultToken) ) } @@ -62,7 +63,7 @@ class TreeUtilTest { Assert.assertEquals( "Token with snake, camel and combined case should be split into list of its parts", expectedToken, - splitToSubtokens(token) + Code2VecNormalization.splitToSubtokens(token) ) } } diff --git a/src/test/kotlin/astminer/common/DummyNode.kt b/src/test/kotlin/astminer/common/DummyNode.kt index a00ca3ef..c158614b 100644 --- a/src/test/kotlin/astminer/common/DummyNode.kt +++ b/src/test/kotlin/astminer/common/DummyNode.kt @@ -12,7 +12,7 @@ class DummyNode( init { // Tokens may change after normalization, for tests we want tokens to be unchanged - technicalToken = typeLabel + token.technical = typeLabel } override fun removeChildrenOfType(typeLabel: String) { diff --git a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt index d0867ce0..4624535d 100644 --- a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt +++ b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt @@ -15,7 +15,7 @@ class PrettyNode(override val typeLabel: String, originalToken: String) : Node(o fun toPrettyString(indent: Int = 0, indentSymbol: String = "--"): String = with(StringBuilder()) { repeat(indent) { append(indentSymbol) } append(typeLabel) - if (token.isNotEmpty()) { + if (token.final.isNotEmpty()) { appendLine(" : $token") } else { appendLine() diff --git a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt index c51226ab..7760f7e9 100644 --- a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt +++ b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt @@ -38,20 +38,20 @@ class FunctionNameLabelExtractorTest { @Test fun `test FunctionNameProblem hides function name node token with METHOD_NAME`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.nameNode?.token) + assertEquals("METHOD_NAME", functionInfo.nameNode?.token?.final) } @Test fun `test FunctionNameProblem hides function root token with METHOD_NAME if it is the name node`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.root.token) + assertEquals("METHOD_NAME", functionInfo.root.token.final) } @Test fun `test function name problem should hide recursive call tokens with SELF`() { FunctionNameLabelExtractor.process(functionInfo) val recursiveCallNode = functionInfo.root.children.firstOrNull()?.children?.firstOrNull() - assertEquals("SELF", recursiveCallNode?.token) + assertEquals("SELF", recursiveCallNode?.token?.final) } companion object { diff --git a/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt index 124926af..52c0ccff 100644 --- a/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt @@ -134,7 +134,7 @@ class GumTreePythonFunctionSplitterTest { root.getChildOfType("body") ?.getChildOfType("Expr") ?.getChildOfType("Constant-str") - ?.originalToken + ?.token?.original ) assertEquals(4, parameters?.size) assertEquals( From b681d8434d656da7aae8aa842874e70f349d4f27 Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 15:08:52 +0300 Subject: [PATCH 03/48] normalization and token logic extracted as separate class --- src/main/kotlin/astminer/common/TreeUtil.kt | 43 ---------- .../common/model/FunctionInfoModel.kt | 2 +- .../astminer/common/model/ParsingModel.kt | 25 ++---- .../kotlin/astminer/common/model/Token.kt | 86 +++++++++++++++++++ .../astminer/featureextraction/TreeFeature.kt | 2 +- .../kotlin/astminer/filters/CommonFilters.kt | 2 +- .../astminer/filters/FunctionFilters.kt | 4 +- .../FunctionNameLabelExtractor.kt | 8 +- .../kotlin/astminer/parse/ForeignParser.kt | 8 +- .../kotlin/astminer/parse/antlr/AntlrUtil.kt | 5 +- .../parse/antlr/java/AntlrJavaFunctionInfo.kt | 6 +- .../javascript/AntlrJavaScriptElementInfo.kt | 6 +- .../parse/antlr/php/ANTLRPHPFunctionInfo.kt | 14 +-- .../antlr/python/AntlrPythonFunctionInfo.kt | 4 +- .../parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt | 8 +- .../java/jdt/GumTreeJavaJDTFunctionInfo.kt | 10 +-- .../srcML/GumTreeJavaSrcmlFunctionInfo.kt | 10 +-- .../python/GumTreePythonFunctionInfo.kt | 4 +- .../javaparser/JavaparserFunctionInfo.kt | 12 +-- .../parse/spoon/SpoonJavaFunctionInfo.kt | 10 +-- .../java/TreeSitterJavaFunctionInfo.kt | 14 +-- src/main/kotlin/astminer/paths/PathUtil.kt | 2 +- src/main/kotlin/astminer/paths/PathWorker.kt | 2 +- .../astminer/storage/ast/CsvAstStorage.kt | 4 +- .../astminer/storage/ast/JsonAstStorage.kt | 2 +- .../astminer/storage/path/PathBasedStorage.kt | 2 +- ...ilTest.kt => Code2VecNormalizationTest.kt} | 13 +-- src/test/kotlin/astminer/common/DummyNode.kt | 2 +- .../astminer/featureextraction/PrettyNode.kt | 2 +- .../FunctionNameLabelExtractorTest.kt | 6 +- .../GumTreePythonFunctionSplitterTest.kt | 2 +- 31 files changed, 178 insertions(+), 142 deletions(-) create mode 100644 src/main/kotlin/astminer/common/model/Token.kt rename src/test/kotlin/astminer/common/{TreeUtilTest.kt => Code2VecNormalizationTest.kt} (82%) diff --git a/src/main/kotlin/astminer/common/TreeUtil.kt b/src/main/kotlin/astminer/common/TreeUtil.kt index 1a53158d..129178bf 100644 --- a/src/main/kotlin/astminer/common/TreeUtil.kt +++ b/src/main/kotlin/astminer/common/TreeUtil.kt @@ -1,45 +1,2 @@ package astminer.common -const val EMPTY_TOKEN = "EMPTY" - -/** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - -val newLineReg = "\\\\n".toRegex() -val whitespaceReg = "//s+".toRegex() -val quotesApostrophesCommasReg = "[\"',]".toRegex() -val unicodeWeirdCharReg = "\\P{Print}".toRegex() -val notALetterReg = "[^A-Za-z]".toRegex() - -fun normalizeToken(token: String, defaultToken: String): String { - val cleanToken = token.lowercase() - .replace(newLineReg, "") // escaped new line - .replace(whitespaceReg, "") // whitespaces - .replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas - .replace(unicodeWeirdCharReg, "") // unicode weird characters - - val stripped = cleanToken.replace(notALetterReg, "") - - return stripped.ifEmpty { - val carefulStripped = cleanToken.replace(" ", "_") - carefulStripped.ifEmpty { - defaultToken - } - } -} - -/** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - -val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - -fun splitToSubtokens(token: String) = token - .trim() - .split(splitRegex) - .map { s -> normalizeToken(s, "") } - .filter { it.isNotEmpty() } - .toList() diff --git a/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt b/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt index 7b2f4a9e..16b2e2e1 100644 --- a/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt +++ b/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt @@ -20,7 +20,7 @@ interface NamedTree { val nameNode: T? get() = notImplemented("nameNode") val name: String? - get() = nameNode?.originalToken + get() = nameNode?.token?.original val root: T get() = notImplemented("root") val body: T? diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index 5f6c8dfa..bbdc2f9c 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -1,31 +1,20 @@ package astminer.common.model -import astminer.common.EMPTY_TOKEN -import astminer.common.splitToSubtokens import java.io.File import java.io.InputStream -import java.util.* -abstract class Node(val originalToken: String?) { +abstract class Node(val token: Token) { + constructor(originalToken: String?): this(Token(originalToken, null)) + abstract val typeLabel: String abstract val children: List abstract val parent: Node? - val normalizedToken: String = - originalToken?.let { - val subtokens = splitToSubtokens(it) - if (subtokens.isEmpty()) EMPTY_TOKEN else subtokens.joinToString(TOKEN_DELIMITER) - } ?: EMPTY_TOKEN - - var technicalToken: String? = null - - val token: String - get() = technicalToken ?: normalizedToken - val metadata: MutableMap = HashMap() fun isLeaf() = children.isEmpty() override fun toString(): String = "$typeLabel : $token" + fun prettyPrint(indent: Int = 0, indentSymbol: String = "--") { repeat(indent) { print(indentSymbol) } println(this) @@ -52,10 +41,6 @@ abstract class Node(val originalToken: String?) { fun postOrderIterator(): Iterator = postOrder().listIterator() open fun postOrder(): List = mutableListOf().also { doTraversePostOrder(it) } - - companion object { - const val TOKEN_DELIMITER = "|" - } } /** Node simplest implementation **/ @@ -63,7 +48,7 @@ class SimpleNode( override val typeLabel: String, override val children: MutableList, override val parent: Node?, - token: String? + token: Token ) : Node(token) { override fun removeChildrenOfType(typeLabel: String) { children.removeIf { it.typeLabel == typeLabel } diff --git a/src/main/kotlin/astminer/common/model/Token.kt b/src/main/kotlin/astminer/common/model/Token.kt new file mode 100644 index 00000000..ebe176ad --- /dev/null +++ b/src/main/kotlin/astminer/common/model/Token.kt @@ -0,0 +1,86 @@ +package astminer.common.model + +data class Token( + val original: String?, + val range: TokenRange?, + val normalization: Normalization = Code2VecNormalization +) { + init { + if (original == null) require(range == null) { "Token range without token was provided" } + } + + val final: String + get() = technical ?: normalized + + var technical: String? = null + + val normalized = Code2VecNormalization.normalizeToken(original) + + override fun toString(): String = final +} + +typealias Line = Int +typealias Column = Int + +data class TokenRange(val start: Pair, val end: Pair) { + init { + require(start.first >= end.first) { "Wrong line format" } + require(start.second >= end.second) { "Wrong column format" } + } +} + +interface Normalization { + fun normalizeToken(token: String?): String +} + +object Code2VecNormalization: Normalization { + const val EMPTY_TOKEN = "EMPTY" + const val TOKEN_DELIMITER = "|" + + override fun normalizeToken(token: String?): String { + if (token == null) return EMPTY_TOKEN + val subTokens = splitToSubtokens(token) + return if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER) + } + + /** + * The function was adopted from the original code2vec implementation in order to match their behavior: + * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + */ + fun splitToSubtokens(token: String) = token + .trim() + .split(splitRegex) + .map { s -> normalizeSubToken(s, "") } + .filter { it.isNotEmpty() } + .toList() + + private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() + + + /** + * The function was adopted from the original code2vec implementation in order to match their behavior: + * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + */ + fun normalizeSubToken(token: String, defaultToken: String): String { + val cleanToken = token.lowercase() + .replace(newLineReg, "") // escaped new line + .replace(whitespaceReg, "") // whitespaces + .replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas + .replace(unicodeWeirdCharReg, "") // unicode weird characters + + val stripped = cleanToken.replace(notALetterReg, "") + + return stripped.ifEmpty { + val carefulStripped = cleanToken.replace(" ", "_") + carefulStripped.ifEmpty { + defaultToken + } + } + } + + private val newLineReg = "\\\\n".toRegex() + private val whitespaceReg = "//s+".toRegex() + private val quotesApostrophesCommasReg = "[\"',]".toRegex() + private val unicodeWeirdCharReg = "\\P{Print}".toRegex() + private val notALetterReg = "[^A-Za-z]".toRegex() +} \ No newline at end of file diff --git a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt index 9a76c7fc..3068c2e9 100644 --- a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt +++ b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt @@ -57,7 +57,7 @@ object Tokens : TreeFeature> { private fun findTokens(node: Node, tokensList: MutableList): List { node.children.forEach { findTokens(it, tokensList) } - tokensList.add(node.token) + tokensList.add(node.token.final) return tokensList } } diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index a0f47848..7f6e32d3 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -24,7 +24,7 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n */ class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter { private fun validateTree(root: Node) = - !root.preOrder().any { node -> node.token.split(Node.TOKEN_DELIMITER).size > maxWordsNumber } + !root.preOrder().any { node -> node.token.final.split(Code2VecNormalization.TOKEN_DELIMITER).size > maxWordsNumber } override fun validate(functionInfo: FunctionInfo) = validateTree(functionInfo.root) diff --git a/src/main/kotlin/astminer/filters/FunctionFilters.kt b/src/main/kotlin/astminer/filters/FunctionFilters.kt index d5316459..2edc50f4 100644 --- a/src/main/kotlin/astminer/filters/FunctionFilters.kt +++ b/src/main/kotlin/astminer/filters/FunctionFilters.kt @@ -1,9 +1,9 @@ package astminer.filters +import astminer.common.model.Code2VecNormalization import astminer.common.model.FunctionFilter import astminer.common.model.FunctionInfo import astminer.common.model.Node -import astminer.common.splitToSubtokens /** * Filter that excludes functions that have at least one of modifiers from the [excludeModifiers] list. @@ -38,7 +38,7 @@ object ConstructorFilter : FunctionFilter { class FunctionNameWordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter { override fun validate(functionInfo: FunctionInfo): Boolean { val name = functionInfo.name - return name != null && splitToSubtokens(name).size <= maxWordsNumber + return name != null && Code2VecNormalization.splitToSubtokens(name).size <= maxWordsNumber } } diff --git a/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt b/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt index 9a62e645..e5abb201 100644 --- a/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt +++ b/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt @@ -14,13 +14,13 @@ object FunctionNameLabelExtractor : FunctionLabelExtractor { private const val RECURSIVE_CALL_TOKEN = "SELF" override fun process(functionInfo: FunctionInfo): LabeledResult? { - val normalizedName = functionInfo.nameNode?.normalizedToken ?: return null + val normalizedName = functionInfo.nameNode?.token?.normalized ?: return null functionInfo.root.preOrder().forEach { node -> - if (node.originalToken == functionInfo.nameNode?.originalToken) { - node.technicalToken = RECURSIVE_CALL_TOKEN + if (node.token.original == functionInfo.nameNode?.token?.original) { + node.token.technical = RECURSIVE_CALL_TOKEN } } - functionInfo.nameNode?.technicalToken = HIDDEN_METHOD_NAME_TOKEN + functionInfo.nameNode?.token?.technical = HIDDEN_METHOD_NAME_TOKEN return LabeledResult(functionInfo.root, normalizedName, functionInfo.qualifiedPath) } } diff --git a/src/main/kotlin/astminer/parse/ForeignParser.kt b/src/main/kotlin/astminer/parse/ForeignParser.kt index d7ff2df0..b0f42447 100644 --- a/src/main/kotlin/astminer/parse/ForeignParser.kt +++ b/src/main/kotlin/astminer/parse/ForeignParser.kt @@ -2,6 +2,7 @@ package astminer.parse import astminer.common.model.Parser import astminer.common.model.SimpleNode +import astminer.common.model.Token import astminer.config.FileExtension import astminer.config.ParserType import kotlinx.serialization.Serializable @@ -57,7 +58,12 @@ private fun launchScript(args: List): String { private fun convertFromForeignTree(context: ForeignTree, rootId: Int = 0, parent: SimpleNode? = null): SimpleNode { val foreignNode = context.tree[rootId] - val node = SimpleNode(foreignNode.nodeType, mutableListOf(), parent, foreignNode.token) + val node = SimpleNode( + typeLabel = foreignNode.nodeType, + children = mutableListOf(), + parent = parent, + token = Token(foreignNode.token, null) + ) val children = foreignNode.children.map { convertFromForeignTree(context, it, node) } node.children.addAll(children) return node diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt b/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt index e9aea811..253a14c2 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt +++ b/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt @@ -1,6 +1,5 @@ package astminer.parse.antlr -import astminer.common.EMPTY_TOKEN import astminer.common.model.Node import org.antlr.v4.runtime.ParserRuleContext import org.antlr.v4.runtime.Vocabulary @@ -58,7 +57,7 @@ fun compressTree(root: AntlrNode): AntlrNode { val compressedNode = AntlrNode( root.typeLabel + "|" + child.typeLabel, root.parent, - child.originalToken + child.token.original ) compressedNode.replaceChildren(child.children) compressedNode @@ -83,7 +82,7 @@ fun AntlrNode.hasFirstLabel(label: String): Boolean = firstLabel() == label fun AntlrNode.firstLabelIn(labels: List): Boolean = labels.contains(firstLabel()) fun Node.getTokensFromSubtree(): String = - if (isLeaf()) originalToken ?: EMPTY_TOKEN else children.joinToString(separator = "") { it.getTokensFromSubtree() } + if (isLeaf()) token.original ?: "" else children.joinToString(separator = "") { it.getTokensFromSubtree() } fun AntlrNode.getItOrChildrenOfType(typeLabel: String): List = if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel).map { it } diff --git a/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt index a99fa460..def07552 100644 --- a/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt @@ -23,12 +23,12 @@ class AntlrJavaFunctionInfo(override val root: AntlrNode, override val filePath: override val modifiers: List? = root.parent?.children ?.filter { it.hasFirstLabel(METHOD_MODIFIER) && !it.hasLastLabel(METHOD_ANNOTATION) } - ?.mapNotNull { it.originalToken } + ?.mapNotNull { it.token.original } override val annotations: List? = root.parent?.children ?.filter { it.hasLastLabel(METHOD_ANNOTATION) } - ?.mapNotNull { it.getChildOfType(ANNOTATION_NAME)?.originalToken } + ?.mapNotNull { it.getChildOfType(ANNOTATION_NAME)?.token?.original } override val body: AntlrNode? = root.children.find { it.hasFirstLabel(METHOD_BODY_NODE) } @@ -51,7 +51,7 @@ class AntlrJavaFunctionInfo(override val root: AntlrNode, override val filePath: } EnclosingElement( type = enclosingType, - name = enclosingClassNode.getChildOfType(ENCLOSING_NAME_NODE)?.originalToken, + name = enclosingClassNode.getChildOfType(ENCLOSING_NAME_NODE)?.token?.original, root = enclosingClassNode ) } diff --git a/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt b/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt index d5c12555..fedf1b5a 100644 --- a/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt @@ -30,7 +30,7 @@ abstract class AntlrJavaScriptElementInfo(override val root: AntlrNode, override private fun getEnclosingElementName(enclosingRoot: AntlrNode?): String? { return enclosingRoot?.children?.firstOrNull { it.hasLastLabel(ENCLOSING_ELEMENT_NAME_NODE) - }?.originalToken + }?.token?.original } private fun getEnclosingElementType(enclosingRoot: AntlrNode): EnclosingElementType { @@ -59,8 +59,8 @@ abstract class AntlrJavaScriptElementInfo(override val root: AntlrNode, override .map { it.getChildOfType(PARAMETER_NAME_NODE) ?: it } } return parameterNameNodes.map { - check(it.originalToken != null) { "Parameter name wasn't found" } - FunctionInfoParameter(name = it.originalToken, type = null) + check(it.token.original != null) { "Parameter name wasn't found" } + FunctionInfoParameter(name = it.token.original, type = null) } } diff --git a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt index e7642cd2..b96de262 100644 --- a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt @@ -15,7 +15,9 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: override val nameNode: AntlrNode? = root.getChildOfType(FUNCTION_NAME) override val enclosingElement: EnclosingElement? = collectEnclosingElement() override val parameters: List? = - try { collectParameters() } catch (e: IllegalStateException) { + try { + collectParameters() + } catch (e: IllegalStateException) { logger.warn { e.message } null } @@ -65,18 +67,18 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: val isPassedByReference = parameterNode.getChildOfType(REFERENCE) != null if (parameterNode.hasLastLabel(PARAMETER_NAME)) { - return parameterNode.originalToken ?: error("No name was found for a parameter") + return parameterNode.token.original ?: error("No name was found for a parameter") } val varInit = parameterNode.getItOrChildrenOfType(VAR_DECLARATION).first() - val name = varInit.getItOrChildrenOfType(PARAMETER_NAME).first().originalToken + val name = varInit.getItOrChildrenOfType(PARAMETER_NAME).first().token.original ?: error("No name was found for a parameter") return (if (isPassedByReference) "&" else "") + (if (isSplattedArg) "..." else "") + name } - private fun getElementType(element: AntlrNode): String? = element.getChildOfType(TYPE)?.originalToken + private fun getElementType(element: AntlrNode): String? = element.getChildOfType(TYPE)?.token?.original private fun collectEnclosingElement(): EnclosingElement? { val enclosing = root.findEnclosingElementBy { it.isPossibleEnclosing() } ?: return null @@ -104,8 +106,8 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: private fun getEnclosingElementName(enclosing: AntlrNode): String? { return when { - enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.originalToken - enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.originalToken + enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.token?.original + enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.token?.original else -> error("No type can be associated") } } diff --git a/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt index 365dec5f..6ccf3f62 100644 --- a/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt @@ -38,7 +38,7 @@ class AntlrPythonFunctionInfo(override val root: AntlrNode, override val filePat val parameterHaveNoDefaultOrType = parameterNode.hasLastLabel(PARAMETER_NAME_NODE) val parameterNameNode = if (parameterHaveNoDefaultOrType) parameterNode else parameterNode.getChildOfType(PARAMETER_NAME_NODE) - val parameterName = parameterNameNode?.originalToken + val parameterName = parameterNameNode?.token?.original require(parameterName != null) { "Method name was not found" } val parameterType = parameterNode.getChildOfType(PARAMETER_TYPE_NODE)?.getTokensFromSubtree() @@ -63,7 +63,7 @@ class AntlrPythonFunctionInfo(override val root: AntlrNode, override val filePat EnclosingElementType.Method, EnclosingElementType.Function -> enclosingNode.getChildOfType(FUNCTION_NAME_NODE) else -> error("Enclosing node can only be function or class") - }?.originalToken + }?.token?.original return EnclosingElement( type = type, name = name, diff --git a/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt b/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt index 711293b7..72e05617 100644 --- a/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt @@ -24,7 +24,7 @@ class FuzzyCppFunctionInfo(override val root: FuzzyNode, override val filePath: private fun collectNameNode(): FuzzyNode? = root.getChildOfType(METHOD_NAME_NODE) as? FuzzyNode private fun collectReturnType(): String? = - root.getChildOfType(METHOD_RETURN_NODE)?.getChildOfType(METHOD_RETURN_TYPE_NODE)?.originalToken + root.getChildOfType(METHOD_RETURN_NODE)?.getChildOfType(METHOD_RETURN_TYPE_NODE)?.token?.original private fun collectEnclosingClass(): EnclosingElement? { val enclosingClass = findEnclosingClass() ?: return null @@ -40,13 +40,13 @@ class FuzzyCppFunctionInfo(override val root: FuzzyNode, override val filePath: root.findEnclosingElementBy { it.typeLabel == CLASS_DECLARATION_NODE } private fun findEnclosingClassName(enclosingClass: FuzzyNode): String? = - enclosingClass.getChildOfType(CLASS_NAME_NODE)?.originalToken + enclosingClass.getChildOfType(CLASS_NAME_NODE)?.token?.original private fun collectParameters(): List { val parameters = root.getChildrenOfType(METHOD_PARAMETER_NODE) return parameters.map { param -> - val type = param.getChildOfType(PARAMETER_TYPE_NODE)?.originalToken - val name = param.getChildOfType(PARAMETER_NAME_NODE)?.originalToken ?: "" + val type = param.getChildOfType(PARAMETER_TYPE_NODE)?.token?.original + val name = param.getChildOfType(PARAMETER_NAME_NODE)?.token?.original ?: "" FunctionInfoParameter(name, type) } } diff --git a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt index f6a211e7..b7dd3c09 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt @@ -29,12 +29,12 @@ class GumTreeJavaJDTFunctionInfo( override val modifiers: List = root .children .filter { it.typeLabel == "Modifier" } - .mapNotNull { it.originalToken } + .mapNotNull { it.token.original } override val annotations: List = root .children .filter { it.typeLabel == "MarkerAnnotation" } - .mapNotNull { it.children.first().originalToken } + .mapNotNull { it.children.first().token.original } override val isConstructor: Boolean = enclosingElement?.name?.equals(name) ?: false @@ -42,7 +42,7 @@ class GumTreeJavaJDTFunctionInfo( private fun collectEnclosingClass(): EnclosingElement? = extractWithLogger(logger) { val enclosingNode = getEnclosingClassNode(root.parent) ?: return@extractWithLogger null - val name = enclosingNode.getChildOfType(TypeLabels.simpleName)?.originalToken + val name = enclosingNode.getChildOfType(TypeLabels.simpleName)?.token?.original val type = when (enclosingNode.typeLabel) { TypeLabels.typeDeclaration -> EnclosingElementType.Class TypeLabels.enumDeclaration -> EnclosingElementType.Enum @@ -66,10 +66,10 @@ class GumTreeJavaJDTFunctionInfo( } private fun GumTreeNode.getElementName(): String = - getChildOfType(TypeLabels.simpleName)?.originalToken ?: error("No name found for element") + getChildOfType(TypeLabels.simpleName)?.token?.original ?: error("No name found for element") private fun GumTreeNode.getElementType(): String? = children.firstOrNull { it.isTypeNode() }?.preOrder() - ?.mapNotNull { if (it.typeLabel == TypeLabels.arrayDimensions) "[]" else it.originalToken } + ?.mapNotNull { if (it.typeLabel == TypeLabels.arrayDimensions) "[]" else it.token.original } ?.joinToString(separator = "") private fun GumTreeNode.isTypeNode() = typeLabel.endsWith("Type") diff --git a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt index 7e75468f..57fbbcfa 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt @@ -30,7 +30,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val override val annotations: List? = run { root.children.filter { it.typeLabel == ANNOTATION }.map { - val token = it.getChildOfType(NAME)?.originalToken + val token = it.getChildOfType(NAME)?.token?.original if (token == null) { logger.warn { "Annotation in function $name in file $filePath don't have a name" } return@run null @@ -42,7 +42,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val override val modifiers: List? = run { val type = checkNotNull(root.getChildOfType(TYPE)) { "Function $name in file $filePath doesn't have a type" } type.children.filter { it.typeLabel == MODIFIER }.map { - val token = it.originalToken + val token = it.token.original if (token == null) { logger.warn { "Modifier in function $name in file $filePath doesn't have a name" } return@run null @@ -60,7 +60,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val private fun assembleParameter(node: GumTreeNode): FunctionInfoParameter { val parameter = checkNotNull(node.getChildOfType(VAR_DECLARATION)) { "No variable found" } - val name = checkNotNull(parameter.getChildOfType(NAME)?.originalToken) { "Parameter name was not found" } + val name = checkNotNull(parameter.getChildOfType(NAME)?.token?.original) { "Parameter name was not found" } val type = parameter.extractType() return FunctionInfoParameter(name, type) } @@ -73,7 +73,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val } EnclosingElement( type = enclosingType, - name = this.getChildOfType(NAME)?.originalToken ?: return@extractWithLogger null, + name = this.getChildOfType(NAME)?.token?.original ?: return@extractWithLogger null, root = this ) } @@ -84,7 +84,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val if (node.typeLabel == ARRAY_BRACKETS) { "[]" } else { - checkNotNull(node.originalToken) { "No type found" } + checkNotNull(node.token.original) { "No type found" } } } } diff --git a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt index c9f77217..00f4b48e 100644 --- a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt @@ -43,7 +43,7 @@ class GumTreePythonFunctionInfo( val enclosing = findEnclosingClass() ?: return null return EnclosingElement( type = EnclosingElementType.Class, - name = enclosing.originalToken, + name = enclosing.token.original, root = enclosing ) } @@ -63,7 +63,7 @@ class GumTreePythonFunctionInfo( } } return params.mapNotNull { - FunctionInfoParameter(it.originalToken ?: return@mapNotNull null, getElementType(it)?.originalToken) + FunctionInfoParameter(it.token.original ?: return@mapNotNull null, getElementType(it)?.token?.original) } } diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt b/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt index 9d5c5d63..9d754702 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt @@ -27,7 +27,7 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil override val modifiers: List? = run { root.children.filter { it.typeLabel == MODIFIER }.map { - val token = it.originalToken + val token = it.token.original if (token == null) { logger.warn { "Modifier for function $name in file $filePath doesn't have a token" } return@run null @@ -38,7 +38,7 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil override val annotations: List? = run { root.children.filter { it.typeLabel in POSSIBLE_ANNOTATION_TYPES }.map { - val token = it.getChildOfType(ANNOTATION_NAME)?.originalToken?.split(".")?.last() + val token = it.getChildOfType(ANNOTATION_NAME)?.token?.original?.split(".")?.last() if (token == null) { logger.warn { "Annotation for function $name in file $filePath doesn't have a token" } return@run null @@ -62,8 +62,8 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil checkNotNull(possibleTypeNode) { "Couldn't find parameter type node" } val typeToken = when (possibleTypeNode.typeLabel) { ARRAY_TYPE -> getParameterType(possibleTypeNode) + ARRAY_BRACKETS - PRIMITIVE_TYPE -> possibleTypeNode.originalToken - CLASS_OR_INTERFACE_TYPE -> possibleTypeNode.getChildOfType(CLASS_NAME)?.originalToken + PRIMITIVE_TYPE -> possibleTypeNode.token.original + CLASS_OR_INTERFACE_TYPE -> possibleTypeNode.getChildOfType(CLASS_NAME)?.token?.original else -> null } checkNotNull(typeToken) { "Couldn't extract parameter type from node" } @@ -71,12 +71,12 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil } private fun getParameterName(node: JavaParserNode): String { - val name = checkNotNull(node.getChildOfType(PARAMETER_NAME)?.originalToken) { "Couldn't find parameter name" } + val name = checkNotNull(node.getChildOfType(PARAMETER_NAME)?.token?.original) { "Couldn't find parameter name" } return name.replace(ARRAY_BRACKETS_REGEX, "") } private fun JavaParserNode.assembleEnclosingClass(): EnclosingElement? = extractWithLogger(logger) { - val name = this.getChildOfType(CLASS_NAME)?.originalToken + val name = this.getChildOfType(CLASS_NAME)?.token?.original val type = when (this.typeLabel) { CLASS_OR_INTERFACE_DECLARATION -> EnclosingElementType.Class ENUM_DECLARATION -> EnclosingElementType.Enum diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt index 3247ab85..8f9b7503 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt @@ -23,11 +23,11 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: override val annotations: List? = run { root.getChildrenOfType(ANNOTATION_NODE_TYPE).map { - it.getChildOfType(TYPE_REFERENCE)?.originalToken ?: return@run null + it.getChildOfType(TYPE_REFERENCE)?.token?.original ?: return@run null } } - override val returnType: String? = root.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.originalToken + override val returnType: String? = root.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.token?.original override val body: SpoonNode? = root.getChildOfType(BLOCK) @@ -37,8 +37,8 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: root.findEnclosingElementBy { it.typeLabel in POSSIBLE_ENCLOSING_ELEMENTS }?.assembleEnclosingClass() private fun assembleParameter(parameterNode: SpoonNode): FunctionInfoParameter { - val type = parameterNode.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.originalToken - val name = parameterNode.originalToken + val type = parameterNode.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.token?.original + val name = parameterNode.token.original checkNotNull(name) { "Couldn't find parameter name token" } return FunctionInfoParameter(name, type) } @@ -49,7 +49,7 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: CLASS_DECLARATION_TYPE -> EnclosingElementType.Class else -> error("Can't find any enclosing type association") } - EnclosingElement(type, this.originalToken, root) + EnclosingElement(type, this.token.original, root) } companion object { diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 5c5360d6..57ff2d29 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,6 +1,6 @@ package astminer.parse.treesitter.java -import astminer.common.EMPTY_TOKEN +import astminer.common.model.Code2VecNormalization import astminer.common.model.* import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.findEnclosingElementBy @@ -20,7 +20,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil annotations.children .filter { it.typeLabel in possibleAnnotations } .map { annotation -> annotation.preOrder().filter { it.typeLabel in listOf(NAME, SCOPE_IDENTIFIER, DOT) } } - .map { nameNodes -> nameNodes.map { it.originalToken ?: "" } } + .map { nameNodes -> nameNodes.map { it.token.original ?: "" } } .map { nameNodesWithToken -> nameNodesWithToken.joinToString(separator = "") } } @@ -28,7 +28,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil val modifiers = root.getChildOfType(MODIFIERS) ?: return@extractWithLogger listOf() modifiers.children .filter { it.typeLabel in possibleModifiers } - .map { it.originalToken } + .map { it.token.original } .map { checkNotNull(it) { "Modifier without a token" } } } @@ -37,9 +37,9 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil parametersRoot.children.filter { it.typeLabel in possibleParameters }.map { parameter -> val possibleNameNode = parameter.getChildOfType(NAME) val name = if (possibleNameNode != null) { - possibleNameNode.originalToken + possibleNameNode.token.original } else { - parameter.getChildOfType(VARIABLE_DECLARATOR)?.getChildOfType(NAME)?.originalToken + parameter.getChildOfType(VARIABLE_DECLARATOR)?.getChildOfType(NAME)?.token?.original } checkNotNull(name) { "Can't find parameter name" } @@ -57,7 +57,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil var collectedType = returnTypeNode.getTokensFromSubtree() if (returnTypeNode.typeLabel == ARRAY_TYPE) { - collectedType = collectedType.replace(EMPTY_TOKEN, "[]") + collectedType = collectedType.replace(Code2VecNormalization.EMPTY_TOKEN, "[]") } return@run collectedType } @@ -65,7 +65,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil override val enclosingElement: EnclosingElement? = extractWithLogger(logger) { val enclosingNode = root.findEnclosingElementBy { it.typeLabel in possible_enclosings } ?: return@extractWithLogger null - val name = enclosingNode.getChildOfType(NAME)?.originalToken + val name = enclosingNode.getChildOfType(NAME)?.token?.original val type = when (enclosingNode.typeLabel) { CLASS_DECLARATION -> EnclosingElementType.Class ENUM_DECLARATION -> EnclosingElementType.Enum diff --git a/src/main/kotlin/astminer/paths/PathUtil.kt b/src/main/kotlin/astminer/paths/PathUtil.kt index ad8c9cc8..4baa90cd 100644 --- a/src/main/kotlin/astminer/paths/PathUtil.kt +++ b/src/main/kotlin/astminer/paths/PathUtil.kt @@ -2,7 +2,7 @@ package astminer.paths import astminer.common.model.* -fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token }): PathContext { +fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token.final }): PathContext { val startToken = getToken(path.upwardNodes.first()) val endToken = getToken(path.downwardNodes.last()) val astNodes = path.upwardNodes.map { OrientedNodeType(it.typeLabel, Direction.UP) } + diff --git a/src/main/kotlin/astminer/paths/PathWorker.kt b/src/main/kotlin/astminer/paths/PathWorker.kt index a19bd487..de78686e 100644 --- a/src/main/kotlin/astminer/paths/PathWorker.kt +++ b/src/main/kotlin/astminer/paths/PathWorker.kt @@ -49,7 +49,7 @@ class PathWorker { val paths: MutableList = ArrayList() iterator.forEach { currentNode -> if (currentNode.isLeaf()) { - if (currentNode.token.isNotEmpty()) { + if (currentNode.token.final.isNotEmpty()) { currentNode.setPathPieces(listOf(listOf(currentNode))) } } else { diff --git a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt index 9ef41b0b..7691ff28 100644 --- a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt @@ -28,7 +28,7 @@ class CsvAstStorage(override val outputDirectoryPath: String) : Storage { override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { for (node in labeledResult.root.preOrder()) { - tokensMap.record(node.token) + tokensMap.record(node.token.final) nodeTypesMap.record(node.typeLabel) } val writer = astsPrintWriters.getOrPut(holdout) { holdout.resolveHoldout() } @@ -55,7 +55,7 @@ class CsvAstStorage(override val outputDirectoryPath: String) : Storage { } internal fun astString(node: Node): String { - return "${tokensMap.getId(node.token)} ${nodeTypesMap.getId(node.typeLabel)}{${ + return "${tokensMap.getId(node.token.final)} ${nodeTypesMap.getId(node.typeLabel)}{${ node.children.joinToString(separator = "", transform = ::astString) }}" } diff --git a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt index 3ce2463e..15fb27fc 100644 --- a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt @@ -34,7 +34,7 @@ class JsonAstStorage(override val outputDirectoryPath: String, private val withP private data class OutputNode(val token: String, val typeLabel: String, val children: List) private fun TreeFlattener.EnumeratedNode.toOutputNode() = - OutputNode(node.token, node.typeLabel, children.map { it.id }) + OutputNode(node.token.final, node.typeLabel, children.map { it.id }) override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { val outputNodes = treeFlattener.flatten(labeledResult.root).map { it.toOutputNode() } diff --git a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt index d41ad286..fec17dc2 100644 --- a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt +++ b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt @@ -54,7 +54,7 @@ abstract class PathBasedStorage( return LabeledPathContexts( labeledResult.label, paths.map { astPath -> - toPathContext(astPath) { it.token.replace("\n", "\\n") } + toPathContext(astPath) { it.token.final.replace("\n", "\\n") } } ) } diff --git a/src/test/kotlin/astminer/common/TreeUtilTest.kt b/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt similarity index 82% rename from src/test/kotlin/astminer/common/TreeUtilTest.kt rename to src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt index 7a263f7e..504fda39 100644 --- a/src/test/kotlin/astminer/common/TreeUtilTest.kt +++ b/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt @@ -1,9 +1,10 @@ package astminer.common +import astminer.common.model.Code2VecNormalization import org.junit.Assert import org.junit.Test -class TreeUtilTest { +class Code2VecNormalizationTest { private val defaultToken = "EMPTY" @Test @@ -29,7 +30,7 @@ class TreeUtilTest { Assert.assertEquals( "All whitespace characters and punctuation should be removed, keeping only letters", expectedToken, - normalizeToken(token, defaultToken) + Code2VecNormalization.normalizeSubToken(token, defaultToken) ) } @@ -40,18 +41,18 @@ class TreeUtilTest { Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - normalizeToken(token, defaultToken) + Code2VecNormalization.normalizeSubToken(token, defaultToken) ) } @Test fun testNormalizeEmptyToken() { val token = "\n\n" - val expectedToken = EMPTY_TOKEN + val expectedToken = Code2VecNormalization.EMPTY_TOKEN Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - normalizeToken(token, defaultToken) + Code2VecNormalization.normalizeSubToken(token, defaultToken) ) } @@ -62,7 +63,7 @@ class TreeUtilTest { Assert.assertEquals( "Token with snake, camel and combined case should be split into list of its parts", expectedToken, - splitToSubtokens(token) + Code2VecNormalization.splitToSubtokens(token) ) } } diff --git a/src/test/kotlin/astminer/common/DummyNode.kt b/src/test/kotlin/astminer/common/DummyNode.kt index a00ca3ef..c158614b 100644 --- a/src/test/kotlin/astminer/common/DummyNode.kt +++ b/src/test/kotlin/astminer/common/DummyNode.kt @@ -12,7 +12,7 @@ class DummyNode( init { // Tokens may change after normalization, for tests we want tokens to be unchanged - technicalToken = typeLabel + token.technical = typeLabel } override fun removeChildrenOfType(typeLabel: String) { diff --git a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt index d0867ce0..4624535d 100644 --- a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt +++ b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt @@ -15,7 +15,7 @@ class PrettyNode(override val typeLabel: String, originalToken: String) : Node(o fun toPrettyString(indent: Int = 0, indentSymbol: String = "--"): String = with(StringBuilder()) { repeat(indent) { append(indentSymbol) } append(typeLabel) - if (token.isNotEmpty()) { + if (token.final.isNotEmpty()) { appendLine(" : $token") } else { appendLine() diff --git a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt index c51226ab..7760f7e9 100644 --- a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt +++ b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt @@ -38,20 +38,20 @@ class FunctionNameLabelExtractorTest { @Test fun `test FunctionNameProblem hides function name node token with METHOD_NAME`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.nameNode?.token) + assertEquals("METHOD_NAME", functionInfo.nameNode?.token?.final) } @Test fun `test FunctionNameProblem hides function root token with METHOD_NAME if it is the name node`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.root.token) + assertEquals("METHOD_NAME", functionInfo.root.token.final) } @Test fun `test function name problem should hide recursive call tokens with SELF`() { FunctionNameLabelExtractor.process(functionInfo) val recursiveCallNode = functionInfo.root.children.firstOrNull()?.children?.firstOrNull() - assertEquals("SELF", recursiveCallNode?.token) + assertEquals("SELF", recursiveCallNode?.token?.final) } companion object { diff --git a/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt index 124926af..52c0ccff 100644 --- a/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionSplitterTest.kt @@ -134,7 +134,7 @@ class GumTreePythonFunctionSplitterTest { root.getChildOfType("body") ?.getChildOfType("Expr") ?.getChildOfType("Constant-str") - ?.originalToken + ?.token?.original ) assertEquals(4, parameters?.size) assertEquals( From 2c9e5358f7601e89a58beb830226c9a7d2f8bfee Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 15:13:47 +0300 Subject: [PATCH 04/48] renaming and refactoring --- src/main/kotlin/astminer/common/TreeUtil.kt | 2 -- .../model/{Token.kt => NormalizationModel.kt} | 29 ------------------ .../astminer/common/model/TokenModel.kt | 30 +++++++++++++++++++ 3 files changed, 30 insertions(+), 31 deletions(-) delete mode 100644 src/main/kotlin/astminer/common/TreeUtil.kt rename src/main/kotlin/astminer/common/model/{Token.kt => NormalizationModel.kt} (74%) create mode 100644 src/main/kotlin/astminer/common/model/TokenModel.kt diff --git a/src/main/kotlin/astminer/common/TreeUtil.kt b/src/main/kotlin/astminer/common/TreeUtil.kt deleted file mode 100644 index 129178bf..00000000 --- a/src/main/kotlin/astminer/common/TreeUtil.kt +++ /dev/null @@ -1,2 +0,0 @@ -package astminer.common - diff --git a/src/main/kotlin/astminer/common/model/Token.kt b/src/main/kotlin/astminer/common/model/NormalizationModel.kt similarity index 74% rename from src/main/kotlin/astminer/common/model/Token.kt rename to src/main/kotlin/astminer/common/model/NormalizationModel.kt index ebe176ad..e8c21092 100644 --- a/src/main/kotlin/astminer/common/model/Token.kt +++ b/src/main/kotlin/astminer/common/model/NormalizationModel.kt @@ -1,34 +1,5 @@ package astminer.common.model -data class Token( - val original: String?, - val range: TokenRange?, - val normalization: Normalization = Code2VecNormalization -) { - init { - if (original == null) require(range == null) { "Token range without token was provided" } - } - - val final: String - get() = technical ?: normalized - - var technical: String? = null - - val normalized = Code2VecNormalization.normalizeToken(original) - - override fun toString(): String = final -} - -typealias Line = Int -typealias Column = Int - -data class TokenRange(val start: Pair, val end: Pair) { - init { - require(start.first >= end.first) { "Wrong line format" } - require(start.second >= end.second) { "Wrong column format" } - } -} - interface Normalization { fun normalizeToken(token: String?): String } diff --git a/src/main/kotlin/astminer/common/model/TokenModel.kt b/src/main/kotlin/astminer/common/model/TokenModel.kt new file mode 100644 index 00000000..7382f50c --- /dev/null +++ b/src/main/kotlin/astminer/common/model/TokenModel.kt @@ -0,0 +1,30 @@ +package astminer.common.model + +data class Token( + val original: String?, + val range: TokenRange?, + val normalization: Normalization = Code2VecNormalization +) { + init { + if (original == null) require(range == null) { "Token range without token was provided" } + } + + val final: String + get() = technical ?: normalized + + var technical: String? = null + + val normalized = normalization.normalizeToken(original) + + override fun toString(): String = final +} + +typealias Line = Int +typealias Column = Int + +data class TokenRange(val start: Pair, val end: Pair) { + init { + require(start.first >= end.first) { "Wrong line format" } + require(start.second >= end.second) { "Wrong column format" } + } +} From c3413c595e1bc107ea438106f2178384da6257cf Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 15:42:38 +0300 Subject: [PATCH 05/48] code style fixes --- detekt.yaml | 2 ++ .../common/model/NormalizationModel.kt | 21 +++++++++---------- .../astminer/common/model/ParsingModel.kt | 5 +++-- .../astminer/common/model/TokenModel.kt | 4 ++-- .../kotlin/astminer/filters/CommonFilters.kt | 4 ++-- .../parse/antlr/php/ANTLRPHPFunctionInfo.kt | 10 ++++----- .../java/jdt/GumTreeJavaJDTFunctionInfo.kt | 1 - .../java/TreeSitterJavaFunctionInfo.kt | 2 +- 8 files changed, 24 insertions(+), 25 deletions(-) diff --git a/detekt.yaml b/detekt.yaml index 52fc010a..4818dd27 100644 --- a/detekt.yaml +++ b/detekt.yaml @@ -26,6 +26,8 @@ style: max: 5 WildcardImport: active: false + UseDataClass: + allowVars: true formatting: autoCorrect: true diff --git a/src/main/kotlin/astminer/common/model/NormalizationModel.kt b/src/main/kotlin/astminer/common/model/NormalizationModel.kt index e8c21092..c9193005 100644 --- a/src/main/kotlin/astminer/common/model/NormalizationModel.kt +++ b/src/main/kotlin/astminer/common/model/NormalizationModel.kt @@ -4,10 +4,18 @@ interface Normalization { fun normalizeToken(token: String?): String } -object Code2VecNormalization: Normalization { +object Code2VecNormalization : Normalization { const val EMPTY_TOKEN = "EMPTY" const val TOKEN_DELIMITER = "|" + private val newLineReg = "\\\\n".toRegex() + private val whitespaceReg = "//s+".toRegex() + private val quotesApostrophesCommasReg = "[\"',]".toRegex() + private val unicodeWeirdCharReg = "\\P{Print}".toRegex() + private val notALetterReg = "[^A-Za-z]".toRegex() + + private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() + override fun normalizeToken(token: String?): String { if (token == null) return EMPTY_TOKEN val subTokens = splitToSubtokens(token) @@ -25,9 +33,6 @@ object Code2VecNormalization: Normalization { .filter { it.isNotEmpty() } .toList() - private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - - /** * The function was adopted from the original code2vec implementation in order to match their behavior: * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java @@ -48,10 +53,4 @@ object Code2VecNormalization: Normalization { } } } - - private val newLineReg = "\\\\n".toRegex() - private val whitespaceReg = "//s+".toRegex() - private val quotesApostrophesCommasReg = "[\"',]".toRegex() - private val unicodeWeirdCharReg = "\\P{Print}".toRegex() - private val notALetterReg = "[^A-Za-z]".toRegex() -} \ No newline at end of file +} diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index bbdc2f9c..c5c8920e 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -4,13 +4,14 @@ import java.io.File import java.io.InputStream abstract class Node(val token: Token) { - constructor(originalToken: String?): this(Token(originalToken, null)) - abstract val typeLabel: String abstract val children: List abstract val parent: Node? val metadata: MutableMap = HashMap() + + constructor(originalToken: String?) : this(Token(originalToken, null)) + fun isLeaf() = children.isEmpty() override fun toString(): String = "$typeLabel : $token" diff --git a/src/main/kotlin/astminer/common/model/TokenModel.kt b/src/main/kotlin/astminer/common/model/TokenModel.kt index 7382f50c..2eb9b5d0 100644 --- a/src/main/kotlin/astminer/common/model/TokenModel.kt +++ b/src/main/kotlin/astminer/common/model/TokenModel.kt @@ -1,9 +1,9 @@ package astminer.common.model -data class Token( +class Token( val original: String?, val range: TokenRange?, - val normalization: Normalization = Code2VecNormalization + private val normalization: Normalization = Code2VecNormalization ) { init { if (original == null) require(range == null) { "Token range without token was provided" } diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index 7f6e32d3..16ede08c 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -23,8 +23,8 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n * Filter that excludes trees that have more words than [maxWordsNumber] in any token of their node. */ class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter { - private fun validateTree(root: Node) = - !root.preOrder().any { node -> node.token.final.split(Code2VecNormalization.TOKEN_DELIMITER).size > maxWordsNumber } + private fun validateTree(root: Node) = root.preOrder() + .none { node -> node.token.final.split(Code2VecNormalization.TOKEN_DELIMITER).size > maxWordsNumber } override fun validate(functionInfo: FunctionInfo) = validateTree(functionInfo.root) diff --git a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt index b96de262..45cacf17 100644 --- a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt @@ -104,12 +104,10 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: } } - private fun getEnclosingElementName(enclosing: AntlrNode): String? { - return when { - enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.token?.original - enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.token?.original - else -> error("No type can be associated") - } + private fun getEnclosingElementName(enclosing: AntlrNode): String? = when { + enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.token?.original + enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.token?.original + else -> error("No type can be associated") } // No check for method because method is a function diff --git a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt index b7dd3c09..e6d129a4 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt @@ -4,7 +4,6 @@ import astminer.common.model.EnclosingElement import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo import astminer.common.model.FunctionInfoParameter -import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.gumtree.GumTreeNode import mu.KotlinLogging diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 57ff2d29..4aaa7bb8 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,7 +1,7 @@ package astminer.parse.treesitter.java -import astminer.common.model.Code2VecNormalization import astminer.common.model.* +import astminer.common.model.Code2VecNormalization import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.findEnclosingElementBy import mu.KotlinLogging From b5ab209216764c3ac2d378f81222f8b019396e0f Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 20:30:59 +0300 Subject: [PATCH 06/48] init restrictions removed --- src/main/kotlin/astminer/common/model/TokenModel.kt | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/main/kotlin/astminer/common/model/TokenModel.kt b/src/main/kotlin/astminer/common/model/TokenModel.kt index 2eb9b5d0..d2de02a7 100644 --- a/src/main/kotlin/astminer/common/model/TokenModel.kt +++ b/src/main/kotlin/astminer/common/model/TokenModel.kt @@ -5,10 +5,6 @@ class Token( val range: TokenRange?, private val normalization: Normalization = Code2VecNormalization ) { - init { - if (original == null) require(range == null) { "Token range without token was provided" } - } - val final: String get() = technical ?: normalized @@ -22,9 +18,4 @@ class Token( typealias Line = Int typealias Column = Int -data class TokenRange(val start: Pair, val end: Pair) { - init { - require(start.first >= end.first) { "Wrong line format" } - require(start.second >= end.second) { "Wrong column format" } - } -} +data class TokenRange(val start: Pair, val end: Pair) From 84badf71ef0795d4d3fb56b7fbd341ee7eb89677 Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 20:31:21 +0300 Subject: [PATCH 07/48] token range added to javaparser --- .../astminer/parse/javaparser/JavaParserNode.kt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt index 4f613bc7..a6a5fd47 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt @@ -1,6 +1,8 @@ package astminer.parse.javaparser import astminer.common.model.Node +import astminer.common.model.Token +import astminer.common.model.TokenRange import com.github.javaparser.ast.expr.AssignExpr import com.github.javaparser.ast.expr.BinaryExpr import com.github.javaparser.ast.expr.Name @@ -64,10 +66,18 @@ private fun JPNode.isLeaf(): Boolean = this.childNodes.isEmpty() private fun JPNode.hasNoToken(): Boolean = !this.tokenRange.isPresent -private fun getJavaParserNodeToken(jpNode: JPNode): String? { - return when { +private fun getJavaParserNodeToken(jpNode: JPNode): Token { + val originalToken = when { jpNode is Name -> jpNode.asString() jpNode.isLeaf() -> jpNode.tokenRange.get().toString() else -> null } + val tokenRange = if (jpNode.hasNoToken()) { + null + } else { + val start = jpNode.begin.get() + val end = jpNode.end.get() + TokenRange(start.line to start.column, end.line to end.column) + } + return Token(originalToken, tokenRange) } From 8b006e53f55580edf200bd77a1a2be3c4f160662 Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 20:32:43 +0300 Subject: [PATCH 08/48] javaparser doc update --- src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt index a6a5fd47..3cfff007 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt @@ -15,8 +15,8 @@ private val logger = KotlinLogging.logger("JavaParser-Node") /** * Representation of JavaParser nodes inside `astminer` * - * @property jpNode node from javapParser. JPNode is an alias for Node from javaparser - * @property parent parent of this node. Null if it's a root. + * @param jpNode node from JavaParser. JPNode is an alias for Node from javaparser + * @param parent parent of this node. Null if it's a root. */ class JavaParserNode(jpNode: JPNode, override val parent: JavaParserNode?) : Node(getJavaParserNodeToken(jpNode)) { override val children: MutableList = From 32a69176df05c565e77df79e230c12afb432ba18 Mon Sep 17 00:00:00 2001 From: ilya Date: Sun, 24 Oct 2021 21:09:45 +0300 Subject: [PATCH 09/48] spoon token position introduced --- .../kotlin/astminer/parse/spoon/SpoonNode.kt | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt index f05fda6e..8666b3d3 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt @@ -1,12 +1,14 @@ package astminer.parse.spoon import astminer.common.model.Node +import astminer.common.model.Token +import astminer.common.model.TokenRange import spoon.reflect.code.* import spoon.reflect.declaration.CtElement import spoon.reflect.declaration.CtNamedElement import spoon.reflect.reference.CtReference -class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpoonValue()) { +class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpoonToken()) { // Turning CtImpl -> override val typeLabel = el.javaClass.simpleName.substring(startIndex = 2).dropLast(4) @@ -26,8 +28,8 @@ class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpo override fun postOrder(): List = super.postOrder().map { it as SpoonNode } } -private fun CtElement.getSpoonValue(): String? { - return when { +private fun CtElement.getSpoonToken(): Token { + val originalToken = when { this is CtNamedElement -> this.simpleName this is CtVariableAccess<*> -> this.variable.simpleName this is CtInvocation<*> -> this.executable?.simpleName @@ -40,4 +42,15 @@ private fun CtElement.getSpoonValue(): String? { this.directChildren.size == 0 -> this.toString() else -> null } + val range = if (this.position.isValidPosition) { + try { + TokenRange( + this.position.line to this.position.column, + this.position.endLine to this.position.endColumn + ) + } catch (e: NullPointerException) { + null + } + } else null + return Token(originalToken, range) } From 8d376d9af6d29c4982c9a992f68c207dbf300b2c Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 12:34:57 +0300 Subject: [PATCH 10/48] range now is a Node field + some interface rearrangement --- ...ationModel.kt => Code2VecNormalization.kt} | 6 ++--- src/main/kotlin/astminer/common/SimpleNode.kt | 23 ++++++++++++++++ .../astminer/common/model/Normalization.kt | 5 ++++ .../astminer/common/model/ParsingModel.kt | 26 +++++-------------- .../common/model/{TokenModel.kt => Token.kt} | 8 ++---- .../kotlin/astminer/filters/CommonFilters.kt | 1 + .../astminer/filters/FunctionFilters.kt | 2 +- .../kotlin/astminer/parse/ForeignParser.kt | 8 +++--- .../kotlin/astminer/parse/antlr/AntlrNode.kt | 3 +++ .../kotlin/astminer/parse/fuzzy/FuzzyNode.kt | 3 +++ .../astminer/parse/gumtree/GumTreeNode.kt | 3 +++ .../parse/javaparser/JavaParserNode.kt | 20 +++++++------- .../kotlin/astminer/parse/spoon/SpoonNode.kt | 24 +++++++---------- .../treesitter/java/TreeSitterJavaFactory.kt | 2 +- .../java/TreeSitterJavaFunctionInfo.kt | 3 ++- .../java/TreeSitterJavaFunctionSplitter.kt | 2 +- .../common/Code2VecNormalizationTest.kt | 1 - .../java/TreeSitterJavaMethodSplitterTest.kt | 2 +- 18 files changed, 77 insertions(+), 65 deletions(-) rename src/main/kotlin/astminer/common/{model/NormalizationModel.kt => Code2VecNormalization.kt} (95%) create mode 100644 src/main/kotlin/astminer/common/SimpleNode.kt create mode 100644 src/main/kotlin/astminer/common/model/Normalization.kt rename src/main/kotlin/astminer/common/model/{TokenModel.kt => Token.kt} (69%) diff --git a/src/main/kotlin/astminer/common/model/NormalizationModel.kt b/src/main/kotlin/astminer/common/Code2VecNormalization.kt similarity index 95% rename from src/main/kotlin/astminer/common/model/NormalizationModel.kt rename to src/main/kotlin/astminer/common/Code2VecNormalization.kt index c9193005..4f9b44b0 100644 --- a/src/main/kotlin/astminer/common/model/NormalizationModel.kt +++ b/src/main/kotlin/astminer/common/Code2VecNormalization.kt @@ -1,8 +1,6 @@ -package astminer.common.model +package astminer.common -interface Normalization { - fun normalizeToken(token: String?): String -} +import astminer.common.model.Normalization object Code2VecNormalization : Normalization { const val EMPTY_TOKEN = "EMPTY" diff --git a/src/main/kotlin/astminer/common/SimpleNode.kt b/src/main/kotlin/astminer/common/SimpleNode.kt new file mode 100644 index 00000000..883428c6 --- /dev/null +++ b/src/main/kotlin/astminer/common/SimpleNode.kt @@ -0,0 +1,23 @@ +package astminer.common + +import astminer.common.model.Node +import astminer.common.model.NodeRange + +/** Node simplest implementation **/ +class SimpleNode( + override val typeLabel: String, + override val children: MutableList, + override val parent: Node?, + override val range: NodeRange?, + token: String? +) : Node(token) { + override fun removeChildrenOfType(typeLabel: String) { + children.removeIf { it.typeLabel == typeLabel } + } + + override fun getChildrenOfType(typeLabel: String) = super.getChildrenOfType(typeLabel).map { it as SimpleNode } + override fun getChildOfType(typeLabel: String) = super.getChildOfType(typeLabel) as? SimpleNode + + override fun preOrder() = super.preOrder().map { it as SimpleNode } + override fun postOrder() = super.postOrder().map { it as SimpleNode } +} diff --git a/src/main/kotlin/astminer/common/model/Normalization.kt b/src/main/kotlin/astminer/common/model/Normalization.kt new file mode 100644 index 00000000..3108c796 --- /dev/null +++ b/src/main/kotlin/astminer/common/model/Normalization.kt @@ -0,0 +1,5 @@ +package astminer.common.model + +interface Normalization { + fun normalizeToken(token: String?): String +} diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index c5c8920e..22df00dd 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -3,14 +3,13 @@ package astminer.common.model import java.io.File import java.io.InputStream -abstract class Node(val token: Token) { +abstract class Node(originalToken: String?) { abstract val typeLabel: String abstract val children: List abstract val parent: Node? - + abstract val range: NodeRange? val metadata: MutableMap = HashMap() - - constructor(originalToken: String?) : this(Token(originalToken, null)) + val token = Token(originalToken) fun isLeaf() = children.isEmpty() @@ -44,23 +43,10 @@ abstract class Node(val token: Token) { open fun postOrder(): List = mutableListOf().also { doTraversePostOrder(it) } } -/** Node simplest implementation **/ -class SimpleNode( - override val typeLabel: String, - override val children: MutableList, - override val parent: Node?, - token: Token -) : Node(token) { - override fun removeChildrenOfType(typeLabel: String) { - children.removeIf { it.typeLabel == typeLabel } - } +typealias Line = Int +typealias Column = Int - override fun getChildrenOfType(typeLabel: String) = super.getChildrenOfType(typeLabel).map { it as SimpleNode } - override fun getChildOfType(typeLabel: String) = super.getChildOfType(typeLabel) as? SimpleNode - - override fun preOrder() = super.preOrder().map { it as SimpleNode } - override fun postOrder() = super.postOrder().map { it as SimpleNode } -} +data class NodeRange(val start: Pair, val end: Pair) interface Parser { /** diff --git a/src/main/kotlin/astminer/common/model/TokenModel.kt b/src/main/kotlin/astminer/common/model/Token.kt similarity index 69% rename from src/main/kotlin/astminer/common/model/TokenModel.kt rename to src/main/kotlin/astminer/common/model/Token.kt index d2de02a7..028af9ff 100644 --- a/src/main/kotlin/astminer/common/model/TokenModel.kt +++ b/src/main/kotlin/astminer/common/model/Token.kt @@ -1,8 +1,9 @@ package astminer.common.model +import astminer.common.Code2VecNormalization + class Token( val original: String?, - val range: TokenRange?, private val normalization: Normalization = Code2VecNormalization ) { val final: String @@ -14,8 +15,3 @@ class Token( override fun toString(): String = final } - -typealias Line = Int -typealias Column = Int - -data class TokenRange(val start: Pair, val end: Pair) diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index 16ede08c..e85843ae 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -1,5 +1,6 @@ package astminer.filters +import astminer.common.Code2VecNormalization import astminer.common.model.* import astminer.featureextraction.NumberOfNodes diff --git a/src/main/kotlin/astminer/filters/FunctionFilters.kt b/src/main/kotlin/astminer/filters/FunctionFilters.kt index 2edc50f4..577c66c5 100644 --- a/src/main/kotlin/astminer/filters/FunctionFilters.kt +++ b/src/main/kotlin/astminer/filters/FunctionFilters.kt @@ -1,6 +1,6 @@ package astminer.filters -import astminer.common.model.Code2VecNormalization +import astminer.common.Code2VecNormalization import astminer.common.model.FunctionFilter import astminer.common.model.FunctionInfo import astminer.common.model.Node diff --git a/src/main/kotlin/astminer/parse/ForeignParser.kt b/src/main/kotlin/astminer/parse/ForeignParser.kt index b0f42447..b5e7759a 100644 --- a/src/main/kotlin/astminer/parse/ForeignParser.kt +++ b/src/main/kotlin/astminer/parse/ForeignParser.kt @@ -1,8 +1,7 @@ package astminer.parse +import astminer.common.SimpleNode import astminer.common.model.Parser -import astminer.common.model.SimpleNode -import astminer.common.model.Token import astminer.config.FileExtension import astminer.config.ParserType import kotlinx.serialization.Serializable @@ -59,10 +58,11 @@ private fun launchScript(args: List): String { private fun convertFromForeignTree(context: ForeignTree, rootId: Int = 0, parent: SimpleNode? = null): SimpleNode { val foreignNode = context.tree[rootId] val node = SimpleNode( - typeLabel = foreignNode.nodeType, children = mutableListOf(), parent = parent, - token = Token(foreignNode.token, null) + typeLabel = foreignNode.nodeType, + token = foreignNode.token, + range = null ) val children = foreignNode.children.map { convertFromForeignTree(context, it, node) } node.children.addAll(children) diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt b/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt index cc0899e3..f81e63e4 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt +++ b/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt @@ -1,6 +1,7 @@ package astminer.parse.antlr import astminer.common.model.Node +import astminer.common.model.NodeRange class AntlrNode( override val typeLabel: String, @@ -8,6 +9,8 @@ class AntlrNode( originalToken: String? ) : Node(originalToken) { + override val range: NodeRange? = null + override val children: MutableList = mutableListOf() fun replaceChildren(newChildren: List) { diff --git a/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt b/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt index af122b38..0e6a9aed 100644 --- a/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt +++ b/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt @@ -1,6 +1,7 @@ package astminer.parse.fuzzy import astminer.common.model.Node +import astminer.common.model.NodeRange import com.google.common.collect.TreeMultiset /** @@ -23,6 +24,8 @@ class FuzzyNode( override val children get() = childrenMultiset.toList() + override val range: NodeRange? = null + fun addChild(node: FuzzyNode) { childrenMultiset.add(node) node.parent = this diff --git a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt index 8f3a1f46..22bd7858 100644 --- a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt +++ b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt @@ -1,6 +1,7 @@ package astminer.parse.gumtree import astminer.common.model.Node +import astminer.common.model.NodeRange import com.github.gumtreediff.tree.Tree import com.github.gumtreediff.tree.TreeContext @@ -13,6 +14,8 @@ class GumTreeNode(val wrappedNode: Tree, override var parent: GumTreeNode?) : wrappedNode.children.map { GumTreeNode(it, this) }.toMutableList() } + override val range: NodeRange? = null + override fun removeChildrenOfType(typeLabel: String) { children.removeIf { it.typeLabel == typeLabel } } diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt index 3cfff007..8db3f858 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt @@ -1,8 +1,7 @@ package astminer.parse.javaparser import astminer.common.model.Node -import astminer.common.model.Token -import astminer.common.model.TokenRange +import astminer.common.model.NodeRange import com.github.javaparser.ast.expr.AssignExpr import com.github.javaparser.ast.expr.BinaryExpr import com.github.javaparser.ast.expr.Name @@ -33,6 +32,12 @@ class JavaParserNode(jpNode: JPNode, override val parent: JavaParserNode?) : Nod SHORTEN_VALUES.getOrDefault(rawType, rawType) } + override val range: NodeRange? = if (jpNode.hasNoToken()) { null } else { + val start = jpNode.begin.get() + val end = jpNode.end.get() + NodeRange(start.line to start.column, end.line to end.column) + } + /** * Returns node type. Composed of `javaClass.simpleName` and * `jpNode.operator` if node is expression. @@ -66,18 +71,11 @@ private fun JPNode.isLeaf(): Boolean = this.childNodes.isEmpty() private fun JPNode.hasNoToken(): Boolean = !this.tokenRange.isPresent -private fun getJavaParserNodeToken(jpNode: JPNode): Token { +private fun getJavaParserNodeToken(jpNode: JPNode): String? { val originalToken = when { jpNode is Name -> jpNode.asString() jpNode.isLeaf() -> jpNode.tokenRange.get().toString() else -> null } - val tokenRange = if (jpNode.hasNoToken()) { - null - } else { - val start = jpNode.begin.get() - val end = jpNode.end.get() - TokenRange(start.line to start.column, end.line to end.column) - } - return Token(originalToken, tokenRange) + return originalToken } diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt index 8666b3d3..1b4c7c8d 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt @@ -1,8 +1,7 @@ package astminer.parse.spoon import astminer.common.model.Node -import astminer.common.model.Token -import astminer.common.model.TokenRange +import astminer.common.model.NodeRange import spoon.reflect.code.* import spoon.reflect.declaration.CtElement import spoon.reflect.declaration.CtNamedElement @@ -14,6 +13,13 @@ class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpo override val children = run { el.directChildren.map { SpoonNode(it, this) } }.toMutableList() + override val range: NodeRange? = if (el.position.isValidPosition && el.position != null) { + NodeRange( + el.position.line to el.position.column, + el.position.endLine to el.position.endColumn + ) + } else null + override fun removeChildrenOfType(typeLabel: String) { children.removeIf { it.typeLabel == typeLabel } } @@ -28,7 +34,7 @@ class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpo override fun postOrder(): List = super.postOrder().map { it as SpoonNode } } -private fun CtElement.getSpoonToken(): Token { +private fun CtElement.getSpoonToken(): String? { val originalToken = when { this is CtNamedElement -> this.simpleName this is CtVariableAccess<*> -> this.variable.simpleName @@ -42,15 +48,5 @@ private fun CtElement.getSpoonToken(): Token { this.directChildren.size == 0 -> this.toString() else -> null } - val range = if (this.position.isValidPosition) { - try { - TokenRange( - this.position.line to this.position.column, - this.position.endLine to this.position.endColumn - ) - } catch (e: NullPointerException) { - null - } - } else null - return Token(originalToken, range) + return originalToken } diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt index b45a6442..97d285d9 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt @@ -1,8 +1,8 @@ package astminer.parse.treesitter.java +import astminer.common.SimpleNode import astminer.common.model.ParsingResult import astminer.common.model.ParsingResultFactory -import astminer.common.model.SimpleNode import astminer.common.model.TreeFunctionSplitter import java.io.File diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 4aaa7bb8..15c5800e 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,7 +1,8 @@ package astminer.parse.treesitter.java +import astminer.common.Code2VecNormalization +import astminer.common.SimpleNode import astminer.common.model.* -import astminer.common.model.Code2VecNormalization import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.findEnclosingElementBy import mu.KotlinLogging diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt index 13866a64..a4dbb724 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt @@ -1,7 +1,7 @@ package astminer.parse.treesitter.java +import astminer.common.SimpleNode import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode import astminer.common.model.TreeFunctionSplitter class TreeSitterJavaFunctionSplitter : TreeFunctionSplitter { diff --git a/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt b/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt index 504fda39..1c0839b6 100644 --- a/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt +++ b/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt @@ -1,6 +1,5 @@ package astminer.common -import astminer.common.model.Code2VecNormalization import org.junit.Assert import org.junit.Test diff --git a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt index c72c382c..45cf31cc 100644 --- a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt @@ -1,8 +1,8 @@ package astminer.parse.treesitter.java +import astminer.common.SimpleNode import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode import org.junit.BeforeClass import org.junit.Test import java.io.File From 95400d88dfc98fdf036053af87e1c6b96c180968 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 13:24:56 +0300 Subject: [PATCH 11/48] test fixes --- src/test/kotlin/astminer/common/DummyNode.kt | 2 ++ src/test/kotlin/astminer/featureextraction/PrettyNode.kt | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/test/kotlin/astminer/common/DummyNode.kt b/src/test/kotlin/astminer/common/DummyNode.kt index c158614b..de266e0e 100644 --- a/src/test/kotlin/astminer/common/DummyNode.kt +++ b/src/test/kotlin/astminer/common/DummyNode.kt @@ -10,6 +10,8 @@ class DummyNode( override val parent: Node? = null + override val range: NodeRange? = null + init { // Tokens may change after normalization, for tests we want tokens to be unchanged token.technical = typeLabel diff --git a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt index 4624535d..5e8ff667 100644 --- a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt +++ b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt @@ -1,6 +1,7 @@ package astminer.featureextraction import astminer.common.model.Node +import astminer.common.model.NodeRange class PrettyNode(override val typeLabel: String, originalToken: String) : Node(originalToken) { override var children: MutableList = ArrayList() @@ -10,6 +11,8 @@ class PrettyNode(override val typeLabel: String, originalToken: String) : Node(o field = value } + override val range: NodeRange? = null + fun addChild(node: PrettyNode) = children.add(node) fun toPrettyString(indent: Int = 0, indentSymbol: String = "--"): String = with(StringBuilder()) { From b858836e3c9cab30a6602671bb56e0843bab1350 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 13:25:11 +0300 Subject: [PATCH 12/48] simple node default parameters added --- src/main/kotlin/astminer/common/SimpleNode.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/kotlin/astminer/common/SimpleNode.kt b/src/main/kotlin/astminer/common/SimpleNode.kt index 883428c6..48e286f1 100644 --- a/src/main/kotlin/astminer/common/SimpleNode.kt +++ b/src/main/kotlin/astminer/common/SimpleNode.kt @@ -7,8 +7,8 @@ import astminer.common.model.NodeRange class SimpleNode( override val typeLabel: String, override val children: MutableList, - override val parent: Node?, - override val range: NodeRange?, + override val parent: Node? = null, + override val range: NodeRange? = null, token: String? ) : Node(token) { override fun removeChildrenOfType(typeLabel: String) { From 4ca400dbf6063b1367b55331f522163a57948614 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 13:25:23 +0300 Subject: [PATCH 13/48] node range introduced in ANTLR --- .../kotlin/astminer/parse/antlr/AntlrNode.kt | 5 ++- .../kotlin/astminer/parse/antlr/AntlrUtil.kt | 32 ++++++++++++++++--- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt b/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt index f81e63e4..904d3152 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt +++ b/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt @@ -6,10 +6,9 @@ import astminer.common.model.NodeRange class AntlrNode( override val typeLabel: String, override var parent: AntlrNode?, - originalToken: String? -) : Node(originalToken) { - + originalToken: String?, override val range: NodeRange? = null +) : Node(originalToken) { override val children: MutableList = mutableListOf() diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt b/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt index 253a14c2..cae593f6 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt +++ b/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt @@ -1,6 +1,8 @@ package astminer.parse.antlr import astminer.common.model.Node +import astminer.common.model.NodeRange +import com.github.javaparser.TokenRange import org.antlr.v4.runtime.ParserRuleContext import org.antlr.v4.runtime.Vocabulary import org.antlr.v4.runtime.tree.ErrorNode @@ -16,7 +18,7 @@ private fun convertRuleContext( vocabulary: Vocabulary ): AntlrNode { val typeLabel = ruleNames[ruleContext.ruleIndex] - val currentNode = AntlrNode(typeLabel, parent, null) + val currentNode = AntlrNode(typeLabel, parent, null, ruleContext.getNodeRange()) val children: MutableList = ArrayList() ruleContext.children?.forEach { @@ -30,11 +32,32 @@ private fun convertRuleContext( return currentNode } +private fun ParserRuleContext.getNodeRange(): NodeRange? { + if (start == null || stop == null) return null + return NodeRange( + start.line to start.charPositionInLine, + stop.line to stop.charPositionInLine + stop.stopIndex - stop.startIndex + ) +} + private fun convertTerminal(terminalNode: TerminalNode, parent: AntlrNode?, vocabulary: Vocabulary): AntlrNode = - AntlrNode(vocabulary.getSymbolicName(terminalNode.symbol.type), parent, terminalNode.symbol.text) + AntlrNode( + vocabulary.getSymbolicName(terminalNode.symbol.type), + parent, + terminalNode.symbol.text, + terminalNode.getNodeRange() + ) + +private fun TerminalNode.getNodeRange(): NodeRange? { + if (symbol == null) return null + return NodeRange( + symbol.line to symbol.charPositionInLine, + symbol.line to symbol.charPositionInLine + symbol.stopIndex - symbol.startIndex + ) +} private fun convertErrorNode(errorNode: ErrorNode, parent: AntlrNode?): AntlrNode = - AntlrNode("Error", parent, errorNode.text) + AntlrNode("Error", parent, errorNode.text, errorNode.getNodeRange()) /** * Remove intermediate nodes that have a single child. @@ -57,7 +80,8 @@ fun compressTree(root: AntlrNode): AntlrNode { val compressedNode = AntlrNode( root.typeLabel + "|" + child.typeLabel, root.parent, - child.token.original + child.token.original, + root.range ) compressedNode.replaceChildren(child.children) compressedNode From 02192f556f29e60936123c5c2795f35e5adbc9cc Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 13:45:23 +0300 Subject: [PATCH 14/48] ANTLR util refactor --- .../antlr/{AntlrUtil.kt => conversionUtil.kt} | 22 ------------------ .../kotlin/astminer/parse/antlr/searchUtil.kt | 23 +++++++++++++++++++ 2 files changed, 23 insertions(+), 22 deletions(-) rename src/main/kotlin/astminer/parse/antlr/{AntlrUtil.kt => conversionUtil.kt} (76%) create mode 100644 src/main/kotlin/astminer/parse/antlr/searchUtil.kt diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt b/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt similarity index 76% rename from src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt rename to src/main/kotlin/astminer/parse/antlr/conversionUtil.kt index cae593f6..4ab4358a 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt +++ b/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt @@ -1,8 +1,6 @@ package astminer.parse.antlr -import astminer.common.model.Node import astminer.common.model.NodeRange -import com.github.javaparser.TokenRange import org.antlr.v4.runtime.ParserRuleContext import org.antlr.v4.runtime.Vocabulary import org.antlr.v4.runtime.tree.ErrorNode @@ -90,23 +88,3 @@ fun compressTree(root: AntlrNode): AntlrNode { root } } - -fun decompressTypeLabel(typeLabel: String) = typeLabel.split("|") - -fun AntlrNode.lastLabel() = decompressTypeLabel(typeLabel).last() - -fun AntlrNode.firstLabel() = decompressTypeLabel(typeLabel).first() - -fun AntlrNode.hasLastLabel(label: String): Boolean = lastLabel() == label - -fun AntlrNode.lastLabelIn(labels: List): Boolean = labels.contains(lastLabel()) - -fun AntlrNode.hasFirstLabel(label: String): Boolean = firstLabel() == label - -fun AntlrNode.firstLabelIn(labels: List): Boolean = labels.contains(firstLabel()) - -fun Node.getTokensFromSubtree(): String = - if (isLeaf()) token.original ?: "" else children.joinToString(separator = "") { it.getTokensFromSubtree() } - -fun AntlrNode.getItOrChildrenOfType(typeLabel: String): List = - if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel).map { it } diff --git a/src/main/kotlin/astminer/parse/antlr/searchUtil.kt b/src/main/kotlin/astminer/parse/antlr/searchUtil.kt new file mode 100644 index 00000000..88ffc78a --- /dev/null +++ b/src/main/kotlin/astminer/parse/antlr/searchUtil.kt @@ -0,0 +1,23 @@ +package astminer.parse.antlr + +import astminer.common.model.Node + +fun decompressTypeLabel(typeLabel: String) = typeLabel.split("|") + +fun AntlrNode.lastLabel() = decompressTypeLabel(typeLabel).last() + +fun AntlrNode.firstLabel() = decompressTypeLabel(typeLabel).first() + +fun AntlrNode.hasLastLabel(label: String): Boolean = lastLabel() == label + +fun AntlrNode.lastLabelIn(labels: List): Boolean = labels.contains(lastLabel()) + +fun AntlrNode.hasFirstLabel(label: String): Boolean = firstLabel() == label + +fun AntlrNode.firstLabelIn(labels: List): Boolean = labels.contains(firstLabel()) + +fun Node.getTokensFromSubtree(): String = + if (isLeaf()) token.original ?: "" else children.joinToString(separator = "") { it.getTokensFromSubtree() } + +fun AntlrNode.getItOrChildrenOfType(typeLabel: String): List = + if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel).map { it } From 8d15d549b3763fac56908bb6652942c73d88c2ca Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 14:33:53 +0300 Subject: [PATCH 15/48] spoon bug fixed --- src/main/kotlin/astminer/parse/spoon/SpoonNode.kt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt index 1b4c7c8d..99e5bd7d 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt @@ -3,6 +3,7 @@ package astminer.parse.spoon import astminer.common.model.Node import astminer.common.model.NodeRange import spoon.reflect.code.* +import spoon.reflect.cu.position.NoSourcePosition import spoon.reflect.declaration.CtElement import spoon.reflect.declaration.CtNamedElement import spoon.reflect.reference.CtReference @@ -13,7 +14,9 @@ class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpo override val children = run { el.directChildren.map { SpoonNode(it, this) } }.toMutableList() - override val range: NodeRange? = if (el.position.isValidPosition && el.position != null) { + override val range: NodeRange? = if (el.position.compilationUnit.originalSourceCode != null && + el.position !is NoSourcePosition + ) { NodeRange( el.position.line to el.position.column, el.position.endLine to el.position.endColumn From 8798c065a3741b53f5381410e3ebeb28edbed3b9 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 15:37:39 +0300 Subject: [PATCH 16/48] node range refactor --- src/main/kotlin/astminer/common/model/ParsingModel.kt | 11 ++++++++--- .../kotlin/astminer/parse/antlr/conversionUtil.kt | 9 +++++---- .../astminer/parse/javaparser/JavaParserNode.kt | 6 +++++- src/main/kotlin/astminer/parse/spoon/SpoonNode.kt | 5 +++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index 22df00dd..564d9401 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -1,5 +1,6 @@ package astminer.common.model +import kotlinx.serialization.Serializable import java.io.File import java.io.InputStream @@ -43,10 +44,14 @@ abstract class Node(originalToken: String?) { open fun postOrder(): List = mutableListOf().also { doTraversePostOrder(it) } } -typealias Line = Int -typealias Column = Int +@Serializable +data class NodeRange(val start: Position, val end: Position) { + override fun toString(): String = "[${start.line}, ${start.column}] - [${end.line}, ${end.column}]" +} + +@Serializable +data class Position(val line: Int, val column: Int) -data class NodeRange(val start: Pair, val end: Pair) interface Parser { /** diff --git a/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt b/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt index 4ab4358a..c6c12e6b 100644 --- a/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt +++ b/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt @@ -1,6 +1,7 @@ package astminer.parse.antlr import astminer.common.model.NodeRange +import astminer.common.model.Position import org.antlr.v4.runtime.ParserRuleContext import org.antlr.v4.runtime.Vocabulary import org.antlr.v4.runtime.tree.ErrorNode @@ -33,8 +34,8 @@ private fun convertRuleContext( private fun ParserRuleContext.getNodeRange(): NodeRange? { if (start == null || stop == null) return null return NodeRange( - start.line to start.charPositionInLine, - stop.line to stop.charPositionInLine + stop.stopIndex - stop.startIndex + Position(start.line, start.charPositionInLine), + Position(stop.line, stop.charPositionInLine + stop.stopIndex - stop.startIndex) ) } @@ -49,8 +50,8 @@ private fun convertTerminal(terminalNode: TerminalNode, parent: AntlrNode?, voca private fun TerminalNode.getNodeRange(): NodeRange? { if (symbol == null) return null return NodeRange( - symbol.line to symbol.charPositionInLine, - symbol.line to symbol.charPositionInLine + symbol.stopIndex - symbol.startIndex + Position(symbol.line, symbol.charPositionInLine), + Position(symbol.line, symbol.charPositionInLine + symbol.stopIndex - symbol.startIndex) ) } diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt index 8db3f858..f6af9bc0 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt @@ -2,6 +2,7 @@ package astminer.parse.javaparser import astminer.common.model.Node import astminer.common.model.NodeRange +import astminer.common.model.Position import com.github.javaparser.ast.expr.AssignExpr import com.github.javaparser.ast.expr.BinaryExpr import com.github.javaparser.ast.expr.Name @@ -35,7 +36,10 @@ class JavaParserNode(jpNode: JPNode, override val parent: JavaParserNode?) : Nod override val range: NodeRange? = if (jpNode.hasNoToken()) { null } else { val start = jpNode.begin.get() val end = jpNode.end.get() - NodeRange(start.line to start.column, end.line to end.column) + NodeRange( + Position(start.line, start.column), + Position(end.line, end.column) + ) } /** diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt index 99e5bd7d..b70fc23a 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt @@ -2,6 +2,7 @@ package astminer.parse.spoon import astminer.common.model.Node import astminer.common.model.NodeRange +import astminer.common.model.Position import spoon.reflect.code.* import spoon.reflect.cu.position.NoSourcePosition import spoon.reflect.declaration.CtElement @@ -18,8 +19,8 @@ class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpo el.position !is NoSourcePosition ) { NodeRange( - el.position.line to el.position.column, - el.position.endLine to el.position.endColumn + Position(el.position.line, el.position.column), + Position(el.position.endLine, el.position.endColumn) ) } else null From 7badac33a20c2e1fd64e7ecd00b44fec91b05f66 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 15:38:13 +0300 Subject: [PATCH 17/48] foreign parser update --- .../kotlin/astminer/parse/ForeignParser.kt | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/main/kotlin/astminer/parse/ForeignParser.kt b/src/main/kotlin/astminer/parse/ForeignParser.kt index b5e7759a..512266a2 100644 --- a/src/main/kotlin/astminer/parse/ForeignParser.kt +++ b/src/main/kotlin/astminer/parse/ForeignParser.kt @@ -1,6 +1,7 @@ package astminer.parse import astminer.common.SimpleNode +import astminer.common.model.NodeRange import astminer.common.model.Parser import astminer.config.FileExtension import astminer.config.ParserType @@ -23,17 +24,29 @@ import kotlin.io.path.createTempDirectory * { * "token": null, * "nodeType": "i_am_root", - * "children": [1,2] + * "children": [1,2], + * "range" : { + * "start" : { "line" : 0, "column" : 0 }, + * "end" : { "line" 1, "column" : 4 } + * } * }, * { * "token": "Hello", * "nodeType": "left_child", * "children": [] + * "range" : { + * "start" : { "line" : 0, "column": 0 }, + * "end" : { "line: 0, "column": 5 } + * } * }, * { * "token": "World!", * "nodeType": "right_child", - * "children": [] + * "children": [], + * "range" : { + * "start" : { "line" : 1, "column" : 0}, + * "end" : { "line" : 1, "column" : 6} + * } * } * ] * } @@ -57,12 +70,13 @@ private fun launchScript(args: List): String { private fun convertFromForeignTree(context: ForeignTree, rootId: Int = 0, parent: SimpleNode? = null): SimpleNode { val foreignNode = context.tree[rootId] + val node = SimpleNode( children = mutableListOf(), parent = parent, typeLabel = foreignNode.nodeType, token = foreignNode.token, - range = null + range = foreignNode.range ) val children = foreignNode.children.map { convertFromForeignTree(context, it, node) } node.children.addAll(children) @@ -73,7 +87,12 @@ private fun convertFromForeignTree(context: ForeignTree, rootId: Int = 0, parent private data class ForeignTree(val tree: List) @Serializable -private data class ForeignNode(val token: String?, val nodeType: String, val children: List) +private data class ForeignNode( + val token: String?, + val nodeType: String, + val range: NodeRange? = null, + val children: List +) /** Use this parser to get a tree from external script. * It uses `getTreeFromScript` and `getArguments` functions to generate From 5523bc7e839e851d7d0885058fcb7f1aaf361e81 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 15:38:50 +0300 Subject: [PATCH 18/48] added nodeRange extraction in tree sitter --- .../parse/tree_sitter/aw_tree_sitter/ast.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py index 77bd8303..354715d4 100644 --- a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py +++ b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py @@ -1,7 +1,11 @@ from tree_sitter import TreeCursor from typing import Optional, TypedDict, List -NodeAsDict = TypedDict("NodeAsDict", {"token": Optional[str], "nodeType": str, "children": List[int]}) +Position = TypedDict("Position", {"line": int, "column": int}) +NodeRange = TypedDict("NodeRange", {"start": Position, "end": Position}) +NodeAsDict = TypedDict( + "NodeAsDict", {"token": Optional[str], "nodeType": str, "range": NodeRange, "children": List[int]} +) TreeAsDict = TypedDict("TreeAsDict", {"tree": List[NodeAsDict]}) @@ -13,15 +17,26 @@ def __init__(self, cursor: TreeCursor, file_bytes: bytes): self._cursor = cursor self._file_bytes = file_bytes + def _get_current_node_range(self) -> NodeRange: + node = self._cursor.node + start = node.start_point + end = node.end_point + return { + "start": {"line": start[0], "column": start[1]}, + "end": {"line": end[0], "column": end[1]} + } + def _get_current_node_as_dict(self) -> NodeAsDict: node_type = self._cursor.node.type + node_range = self._get_current_node_range() + if len(self._cursor.node.children) == 0: node_value_bytes = self._file_bytes[self._cursor.node.start_byte : self._cursor.node.end_byte] node_value: Optional[str] = node_value_bytes.decode("utf-8") else: node_value = None - return {"token": node_value, "nodeType": node_type, "children": []} + return {"token": node_value, "nodeType": node_type, "range": node_range, "children": []} def get_tree_as_dict(self) -> TreeAsDict: depth = 0 From abbb4f1bbbd1474eb978dbb7d7d8d563cc26ddad Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 15:39:15 +0300 Subject: [PATCH 19/48] setup.py code style refactor --- src/main/python/parse/tree_sitter/setup.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/main/python/parse/tree_sitter/setup.py b/src/main/python/parse/tree_sitter/setup.py index fcf3f1db..12f23983 100644 --- a/src/main/python/parse/tree_sitter/setup.py +++ b/src/main/python/parse/tree_sitter/setup.py @@ -1,16 +1,12 @@ from setuptools import setup setup( - name='tree_sitter_astminer_wrapper', - version='1.0.0', - description='Wrapper for tree sitter python bindings for using with astminer', - packages=['aw_tree_sitter'], - license='MIT', - author='Ilya Utkin', - entry_points={ - 'console_scripts': ["aw_tree_sitter = aw_tree_sitter.main:main"] - }, - install_requires=[ - 'tree_sitter~=0.19.0' - ] -) \ No newline at end of file + name="tree_sitter_astminer_wrapper", + version="1.0.0", + description="Wrapper for tree sitter python bindings for using with astminer", + packages=["aw_tree_sitter"], + license="MIT", + author="Ilya Utkin", + entry_points={"console_scripts": ["aw_tree_sitter = aw_tree_sitter.main:main"]}, + install_requires=["tree_sitter~=0.19.0"], +) From 41942585818a295a8d08c098eb85635c73641261 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 19:02:47 +0300 Subject: [PATCH 20/48] node range support in gumtree --- .../astminer/parse/gumtree/GumTreeNode.kt | 9 +++----- .../parse/gumtree/PositionConverter.kt | 23 +++++++++++++++++++ .../gumtree/java/jdt/GumTreeJavaJDTParser.kt | 10 ++++---- .../java/srcML/GumTreeJavaSrcmlParser.kt | 9 +++++--- .../gumtree/python/GumTreePythonParser.kt | 10 ++++---- 5 files changed, 44 insertions(+), 17 deletions(-) create mode 100644 src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt diff --git a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt index 22bd7858..f2ad8689 100644 --- a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt +++ b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt @@ -3,18 +3,17 @@ package astminer.parse.gumtree import astminer.common.model.Node import astminer.common.model.NodeRange import com.github.gumtreediff.tree.Tree -import com.github.gumtreediff.tree.TreeContext -class GumTreeNode(val wrappedNode: Tree, override var parent: GumTreeNode?) : +class GumTreeNode(val wrappedNode: Tree, posConverter: PositionConverter, override var parent: GumTreeNode? = null) : Node(wrappedNode.label) { override val typeLabel: String get() = wrappedNode.type.name override val children: MutableList by lazy { - wrappedNode.children.map { GumTreeNode(it, this) }.toMutableList() + wrappedNode.children.map { GumTreeNode(it, posConverter, this) }.toMutableList() } - override val range: NodeRange? = null + override val range: NodeRange = posConverter.getRange(wrappedNode.pos, wrappedNode.endPos) override fun removeChildrenOfType(typeLabel: String) { children.removeIf { it.typeLabel == typeLabel } @@ -31,5 +30,3 @@ class GumTreeNode(val wrappedNode: Tree, override var parent: GumTreeNode?) : override fun preOrder(): List = super.preOrder().map { it as GumTreeNode } } - -fun wrapGumTreeNode(treeContext: TreeContext): GumTreeNode = GumTreeNode(treeContext.root, null) diff --git a/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt b/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt new file mode 100644 index 00000000..66f1b320 --- /dev/null +++ b/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt @@ -0,0 +1,23 @@ +package astminer.parse.gumtree + +import astminer.common.model.NodeRange +import astminer.common.model.Position + +class PositionConverter(content: String) { + private val newLineIndexes: List = + content.asSequence().mapIndexedNotNull { index, c -> if (c != '\n') null else index }.toList() + + private fun searchPosition(pos: Int): Position { + val line = newLineIndexes.binarySearch(pos) + if (line >= 0) return searchPosition(pos - 1) + if (line == -1) return Position(1, pos) + val previousNewLine = -line - 2 + return Position(previousNewLine + 2, pos - newLineIndexes[previousNewLine]) + } + + fun getRange(pos: Int, endPos: Int): NodeRange { + val start = searchPosition(pos) + val end = searchPosition(endPos) + return NodeRange(start, end) + } +} \ No newline at end of file diff --git a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt index 2bab2683..9a0eea4c 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt @@ -3,13 +3,12 @@ package astminer.parse.gumtree.java.jdt import astminer.common.model.Parser import astminer.parse.ParsingException import astminer.parse.gumtree.GumTreeNode -import astminer.parse.gumtree.wrapGumTreeNode +import astminer.parse.gumtree.PositionConverter import com.github.gumtreediff.client.Run import com.github.gumtreediff.gen.SyntaxException import com.github.gumtreediff.gen.jdt.JdtTreeGenerator import mu.KotlinLogging import java.io.InputStream -import java.io.InputStreamReader private val logger = KotlinLogging.logger("GumTree-JavaParser") @@ -19,8 +18,11 @@ class GumTreeJavaJDTParser : Parser { } override fun parseInputStream(content: InputStream): GumTreeNode = try { - val treeContext = JdtTreeGenerator().generate(InputStreamReader(content)) - wrapGumTreeNode(treeContext) + val contentAsString = content.bufferedReader().use { it.readText() } + val converter = PositionConverter(contentAsString) + val reader = contentAsString.reader() + val treeContext = JdtTreeGenerator().generate(reader) + GumTreeNode(treeContext.root, converter) } catch (e: SyntaxException) { throw ParsingException(parserType = "Gumtree", language = "Java", exc = e) } diff --git a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt index 5227f271..9bfdf48e 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt @@ -2,7 +2,7 @@ package astminer.parse.gumtree.java.srcML import astminer.common.model.Parser import astminer.parse.gumtree.GumTreeNode -import astminer.parse.gumtree.wrapGumTreeNode +import astminer.parse.gumtree.PositionConverter import com.github.gumtreediff.client.Run import com.github.gumtreediff.gen.srcml.SrcmlJavaTreeGenerator import java.io.InputStream @@ -14,7 +14,10 @@ class GumTreeJavaSrcmlParser : Parser { } override fun parseInputStream(content: InputStream): GumTreeNode { - val treeContext = SrcmlJavaTreeGenerator().generate(InputStreamReader(content)) - return wrapGumTreeNode(treeContext) + val contentAsString = content.bufferedReader().use { it.readText() } + val converter = PositionConverter(contentAsString) + val reader = contentAsString.reader() + val treeContext = SrcmlJavaTreeGenerator().generate(reader) + return GumTreeNode(treeContext.root, converter) } } diff --git a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt index fff110e4..dc998592 100644 --- a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt +++ b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt @@ -4,12 +4,11 @@ import astminer.common.model.Parser import astminer.common.model.ParserNotInstalledException import astminer.parse.ParsingException import astminer.parse.gumtree.GumTreeNode -import astminer.parse.gumtree.wrapGumTreeNode +import astminer.parse.gumtree.PositionConverter import com.github.gumtreediff.client.Run import com.github.gumtreediff.gen.python.PythonTreeGenerator import java.io.IOException import java.io.InputStream -import java.io.InputStreamReader class GumTreePythonParser : Parser { init { @@ -17,8 +16,11 @@ class GumTreePythonParser : Parser { } override fun parseInputStream(content: InputStream): GumTreeNode = try { - val context = PythonTreeGenerator().generate(InputStreamReader(content)) - wrapGumTreeNode(context) + val contentAsString = content.bufferedReader().use { it.readText() } + val converter = PositionConverter(contentAsString) + val reader = contentAsString.reader() + val context = PythonTreeGenerator().generate(reader) + GumTreeNode(context.root, converter) } catch (e: RuntimeException) { throw ParsingException("GumTree", "Python", e) } catch (e: IOException) { From dfcb5ebb7560416408c6dac43889757658ce362b Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 19:03:04 +0300 Subject: [PATCH 21/48] doc little fix --- src/main/kotlin/astminer/parse/ForeignParser.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/kotlin/astminer/parse/ForeignParser.kt b/src/main/kotlin/astminer/parse/ForeignParser.kt index 512266a2..d32bc565 100644 --- a/src/main/kotlin/astminer/parse/ForeignParser.kt +++ b/src/main/kotlin/astminer/parse/ForeignParser.kt @@ -44,8 +44,8 @@ import kotlin.io.path.createTempDirectory * "nodeType": "right_child", * "children": [], * "range" : { - * "start" : { "line" : 1, "column" : 0}, - * "end" : { "line" : 1, "column" : 6} + * "start" : { "line" : 1, "column" : 0 }, + * "end" : { "line" : 1, "column" : 6 } * } * } * ] From a4eceeeae36808edde41e83b13bfef8fa0cc3820 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 28 Oct 2021 19:04:10 +0300 Subject: [PATCH 22/48] code style fixes --- src/main/kotlin/astminer/common/model/ParsingModel.kt | 1 - src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt | 2 +- .../astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index 564d9401..dff66403 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -52,7 +52,6 @@ data class NodeRange(val start: Position, val end: Position) { @Serializable data class Position(val line: Int, val column: Int) - interface Parser { /** * Parse input stream into an AST. diff --git a/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt b/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt index 66f1b320..aa50f12b 100644 --- a/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt +++ b/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt @@ -20,4 +20,4 @@ class PositionConverter(content: String) { val end = searchPosition(endPos) return NodeRange(start, end) } -} \ No newline at end of file +} diff --git a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt index 9bfdf48e..6ee05e17 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt @@ -6,7 +6,6 @@ import astminer.parse.gumtree.PositionConverter import com.github.gumtreediff.client.Run import com.github.gumtreediff.gen.srcml.SrcmlJavaTreeGenerator import java.io.InputStream -import java.io.InputStreamReader class GumTreeJavaSrcmlParser : Parser { init { From 38e705b2375b105f89988dba780b6bf38e779f64 Mon Sep 17 00:00:00 2001 From: ilya Date: Sat, 30 Oct 2021 18:40:05 +0300 Subject: [PATCH 23/48] serial name change --- src/main/kotlin/astminer/common/model/ParsingModel.kt | 3 ++- src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index dff66403..1956fbec 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -1,5 +1,6 @@ package astminer.common.model +import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable import java.io.File import java.io.InputStream @@ -50,7 +51,7 @@ data class NodeRange(val start: Position, val end: Position) { } @Serializable -data class Position(val line: Int, val column: Int) +data class Position(@SerialName("l") val line: Int, @SerialName("c") val column: Int) interface Parser { /** diff --git a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py index 354715d4..4e4ab5cb 100644 --- a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py +++ b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py @@ -1,7 +1,7 @@ from tree_sitter import TreeCursor from typing import Optional, TypedDict, List -Position = TypedDict("Position", {"line": int, "column": int}) +Position = TypedDict("Position", {"l": int, "c": int}) NodeRange = TypedDict("NodeRange", {"start": Position, "end": Position}) NodeAsDict = TypedDict( "NodeAsDict", {"token": Optional[str], "nodeType": str, "range": NodeRange, "children": List[int]} @@ -22,8 +22,8 @@ def _get_current_node_range(self) -> NodeRange: start = node.start_point end = node.end_point return { - "start": {"line": start[0], "column": start[1]}, - "end": {"line": end[0], "column": end[1]} + "start": {"l": start[0], "c": start[1]}, + "end": {"l": end[0], "c": end[1]} } def _get_current_node_as_dict(self) -> NodeAsDict: From 734a2a9bd135bc9beaf964704de17dd1d721faa4 Mon Sep 17 00:00:00 2001 From: ilya Date: Sat, 30 Oct 2021 18:40:20 +0300 Subject: [PATCH 24/48] new option in json ast storage --- .../kotlin/astminer/config/StorageConfigs.kt | 11 ++++++-- .../astminer/storage/ast/JsonAstStorage.kt | 26 +++++++++++++------ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/main/kotlin/astminer/config/StorageConfigs.kt b/src/main/kotlin/astminer/config/StorageConfigs.kt index 42b0788b..41646003 100644 --- a/src/main/kotlin/astminer/config/StorageConfigs.kt +++ b/src/main/kotlin/astminer/config/StorageConfigs.kt @@ -42,8 +42,15 @@ class DotAstStorageConfig : StorageConfig() { */ @Serializable @SerialName("json AST") -class JsonAstStorageConfig(private val withPaths: Boolean = false) : StorageConfig() { - override fun createStorage(outputDirectoryPath: String) = JsonAstStorage(outputDirectoryPath, withPaths) +class JsonAstStorageConfig( + private val withPaths: Boolean = false, + private val withRanges: Boolean = false +) : StorageConfig() { + override fun createStorage(outputDirectoryPath: String) = + JsonAstStorage( + outputDirectoryPath, withPaths, + withRanges + ) } /** diff --git a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt index 15fb27fc..2b60b6ff 100644 --- a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt @@ -1,9 +1,6 @@ package astminer.storage.ast -import astminer.common.model.DatasetHoldout -import astminer.common.model.LabeledResult -import astminer.common.model.Node -import astminer.common.model.Storage +import astminer.common.model.* import kotlinx.serialization.Serializable import kotlinx.serialization.encodeToString import kotlinx.serialization.json.Json @@ -17,7 +14,11 @@ private typealias Id = Int * Each line in the output file is a single json object that corresponds to one of the labeled trees. * Each tree is flattened and represented as a list of nodes. */ -class JsonAstStorage(override val outputDirectoryPath: String, private val withPaths: Boolean) : Storage { +class JsonAstStorage( + override val outputDirectoryPath: String, + private val withPaths: Boolean, + private val withRanges: Boolean +) : Storage { private val treeFlattener = TreeFlattener() private val datasetWriters = mutableMapOf() @@ -28,13 +29,22 @@ class JsonAstStorage(override val outputDirectoryPath: String, private val withP } @Serializable - private data class LabeledAst(val label: String, val path: String? = null, val ast: List) + private data class LabeledAst( + val label: String, + val path: String? = null, + val ast: List + ) @Serializable - private data class OutputNode(val token: String, val typeLabel: String, val children: List) + private data class OutputNode( + val token: String, + val typeLabel: String, + val range: NodeRange? = null, + val children: List + ) private fun TreeFlattener.EnumeratedNode.toOutputNode() = - OutputNode(node.token.final, node.typeLabel, children.map { it.id }) + OutputNode(node.token.final, node.typeLabel, node.range, children.map { it.id }) override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { val outputNodes = treeFlattener.flatten(labeledResult.root).map { it.toOutputNode() } From da2faf4cbf66a0840fbd772b54e70016d5ef69ca Mon Sep 17 00:00:00 2001 From: ilya Date: Sat, 30 Oct 2021 18:42:59 +0300 Subject: [PATCH 25/48] option to disable range serialization properly added --- src/main/kotlin/astminer/config/StorageConfigs.kt | 3 ++- src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/main/kotlin/astminer/config/StorageConfigs.kt b/src/main/kotlin/astminer/config/StorageConfigs.kt index 41646003..fb3a3d04 100644 --- a/src/main/kotlin/astminer/config/StorageConfigs.kt +++ b/src/main/kotlin/astminer/config/StorageConfigs.kt @@ -48,7 +48,8 @@ class JsonAstStorageConfig( ) : StorageConfig() { override fun createStorage(outputDirectoryPath: String) = JsonAstStorage( - outputDirectoryPath, withPaths, + outputDirectoryPath, + withPaths, withRanges ) } diff --git a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt index 2b60b6ff..aef90b76 100644 --- a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt @@ -44,7 +44,12 @@ class JsonAstStorage( ) private fun TreeFlattener.EnumeratedNode.toOutputNode() = - OutputNode(node.token.final, node.typeLabel, node.range, children.map { it.id }) + OutputNode( + node.token.final, + node.typeLabel, + if (withRanges) node.range else null, + children.map { it.id } + ) override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { val outputNodes = treeFlattener.flatten(labeledResult.root).map { it.toOutputNode() } From c47517deb5946f1d67df48c9514468fa955808c7 Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 2 Nov 2021 16:25:46 +0300 Subject: [PATCH 26/48] test fixed --- .../parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt index 80901917..16755194 100644 --- a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt @@ -3,7 +3,8 @@ package astminer.parse.treesitter.java import astminer.checkExecutable import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode +import astminer.common.SimpleNode +import org.junit.Assume import org.junit.BeforeClass import org.junit.Test import java.io.File From f4bea6ed3cd68bec301420713f8cfc9ada2377d6 Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 2 Nov 2021 17:08:14 +0300 Subject: [PATCH 27/48] now it compiles --- .../astminer/parse/javalang/JavaLangFunctionInfo.kt | 11 ++++++----- .../parse/javalang/JavaLangFunctionSplitter.kt | 2 +- .../parse/javalang/JavaLangParsingResultFactory.kt | 6 +++++- .../parse/javalang/JavaLangFunctionSplitterTest.kt | 2 +- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt index 2ab68741..a72b0a41 100644 --- a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt @@ -1,5 +1,6 @@ package astminer.parse.javalang +import astminer.common.SimpleNode import astminer.common.model.* import astminer.parse.findEnclosingElementBy import mu.KotlinLogging @@ -14,14 +15,14 @@ class JavaLangFunctionInfo(override val root: SimpleNode, override val filePath: override val annotations: List? = extractWithLogger(logger) { val annotations = root.getChildOfType(ANNOTATIONS) ?: return@extractWithLogger listOf() annotations.children - .map { it.getChildOfType(NAME)?.originalToken } + .map { it.getChildOfType(NAME)?.token?.original } .map { checkNotNull(it) { "No name for annotation found" } } } override val modifiers: List? = extractWithLogger(logger) { val modifiers = root.getChildOfType(MODIFIERS) ?: return@extractWithLogger listOf() modifiers.children - .map { it.originalToken } + .map { it.token.original } .map { checkNotNull(it) { "No name for modifier found" } } } @@ -30,7 +31,7 @@ class JavaLangFunctionInfo(override val root: SimpleNode, override val filePath: parameters.children.map { parameter -> val type = parameter.children.find { it.typeLabel in possibleTypes }?.extractType() checkNotNull(type) { "Can't extract parameter type" } - val name = parameter.children.find { it.typeLabel == NAME }?.originalToken + val name = parameter.children.find { it.typeLabel == NAME }?.token?.original checkNotNull(name) { "Can't find parameter name" } return@map FunctionInfoParameter(name, type) } @@ -46,14 +47,14 @@ class JavaLangFunctionInfo(override val root: SimpleNode, override val filePath: ENUM_DECLARATION -> EnclosingElementType.Enum else -> error("No type can be associated with enclosing node type label") } - val name = enclosingNode.getChildOfType(NAME)?.originalToken + val name = enclosingNode.getChildOfType(NAME)?.token?.original EnclosingElement(type, name, enclosingNode) } override val isConstructor: Boolean = false private fun SimpleNode.extractType(): String = this.preOrder() - .mapNotNull { if (it.typeLabel == "dimensions" && it.isLeaf()) "[]" else it.originalToken } + .mapNotNull { if (it.typeLabel == "dimensions" && it.isLeaf()) "[]" else it.token.original } .joinToString(separator = "") companion object { diff --git a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt index 9a45c241..acbb5c85 100644 --- a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt +++ b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt @@ -1,7 +1,7 @@ package astminer.parse.javalang import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode +import astminer.common.SimpleNode import astminer.common.model.TreeFunctionSplitter class JavaLangFunctionSplitter : TreeFunctionSplitter { diff --git a/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt b/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt index 949337c0..61ce478a 100644 --- a/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt +++ b/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt @@ -1,6 +1,10 @@ package astminer.parse.javalang -import astminer.common.model.* +import astminer.common.SimpleNode +import astminer.common.model.Node +import astminer.common.model.ParsingResult +import astminer.common.model.ParsingResultFactory +import astminer.common.model.TreeFunctionSplitter import java.io.File object JavaLangParsingResultFactory : ParsingResultFactory { diff --git a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt index 2e015872..751e0736 100644 --- a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt @@ -2,7 +2,7 @@ package astminer.parse.javalang import astminer.checkExecutable import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode +import astminer.common.SimpleNode import org.junit.Assume import org.junit.BeforeClass import org.junit.Test From e2b97501d43962400493b86520f1e1a0d9b905a5 Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 2 Nov 2021 17:08:26 +0300 Subject: [PATCH 28/48] range support added --- .../javalang/aw_javalang/ast_generation.py | 29 +++++++++++++++++-- .../javalang/aw_javalang/tree_flattening.py | 6 ++-- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/main/python/parse/javalang/aw_javalang/ast_generation.py b/src/main/python/parse/javalang/aw_javalang/ast_generation.py index bf79b3b2..cc6ce85b 100644 --- a/src/main/python/parse/javalang/aw_javalang/ast_generation.py +++ b/src/main/python/parse/javalang/aw_javalang/ast_generation.py @@ -6,11 +6,28 @@ DUMMY_NONE_PROCESSING_IN_ITERABLE = True +@dataclass +class Position: + l: int + c: int + + +@dataclass +class Range: + start: Position + end: Position + + +def node_range(start: List[int], end: List[int]): + return Range(Position(start[0], start[1]), Position(end[0], end[1])) + + @dataclass class Node: type: str value: Optional[str] children: List["Node"] + node_range: Range def __str__(self): return self.type + (f" : {self.value}" if self.value is not None else "") @@ -38,7 +55,7 @@ def generate_presentable_AST(node: JavaLangNode, show_declined: bool = True) -> if isinstance(value, str): children.append(process_string_attribute(node, attr, value)) elif isinstance(value, list) or isinstance(value, set): - attribute_node = Node(attr, None, []) + attribute_node = Node(attr, None, [], node_range([-1, -1], [-1, -1])) attribute_node.children = process_iterable_attributes(attribute_node, attr, value, show_declined) if not attribute_node.is_leaf(): children.append(attribute_node) @@ -46,7 +63,13 @@ def generate_presentable_AST(node: JavaLangNode, show_declined: bool = True) -> children.append(process_node_attribute(node, value, show_declined)) elif (value is not None or (value is None and not IGNORE_NONE_ATTR)) and show_declined: process_declined_attribute(attr, value) - return Node(generate_node_type(node), None, children) + node_pos = node.position + if node_pos is None: + start = [-1, -1] + else: + start = [node_pos.line, node_pos.column] + return Node(type=generate_node_type(node), value=None, children=children, + node_range=node_range(start, [-1, -1])) def generate_node_type(node: JavaLangNode) -> str: @@ -72,7 +95,7 @@ def get_singular(string: str) -> str: def process_string_attribute(node: Node, attr: str, value: str) -> Node: - return Node(attr, value, []) + return Node(attr, value, [], node_range([-1, -1], [-1, -1])) def process_node_attribute(node: Node, value: JavaLangNode, show_declined: bool) -> Node: diff --git a/src/main/python/parse/javalang/aw_javalang/tree_flattening.py b/src/main/python/parse/javalang/aw_javalang/tree_flattening.py index 39b25846..236aaa35 100644 --- a/src/main/python/parse/javalang/aw_javalang/tree_flattening.py +++ b/src/main/python/parse/javalang/aw_javalang/tree_flattening.py @@ -1,5 +1,5 @@ from typing import Optional, List, Tuple -from aw_javalang.ast_generation import Node +from aw_javalang.ast_generation import Node, Range from dataclasses import dataclass @@ -13,14 +13,14 @@ class EnumeratedNode: token: Optional[str] nodeType: str children: List[int] - + range: Range class TreeSerializer: def __init__(self): self._current_id = 0 def _enumerate_tree(self, node) -> Tuple[List["EnumeratedNode"], int]: - enumerated_root = EnumeratedNode(node.value, node.type, []) + enumerated_root = EnumeratedNode(node.value, node.type, [], node.node_range) root_id = self._current_id self._current_id += 1 enumerated_tree = [enumerated_root] From 3d7ae176b19c46e7e529cb6b2442dee7774549bf Mon Sep 17 00:00:00 2001 From: ilya Date: Wed, 3 Nov 2021 17:21:50 +0300 Subject: [PATCH 29/48] now it compiles --- .../parse/gumtree/python/GumTreePythonFunctionInfo.kt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt index 76577fb2..39e9f62d 100644 --- a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt @@ -19,9 +19,9 @@ class GumTreePythonFunctionInfo( override val isConstructor: Boolean = name == CONSTRUCTOR_NAME - override val returnType = if (root.children.find { it.originalToken == RETURN_TYPE_OPERATOR } == null) { + override val returnType = if (root.children.find { it.token.original == RETURN_TYPE_OPERATOR } == null) { null - } else root.getChildrenOfType(NAME).lastOrNull()?.preOrder()?.mapNotNull { it.originalToken }?.joinToString("") + } else root.getChildrenOfType(NAME).lastOrNull()?.preOrder()?.mapNotNull { it.token.original }?.joinToString("") override val enclosingElement: EnclosingElement? = extractWithLogger(logger) { val enclosing = root.findEnclosingElementBy { it.typeLabel in possibleEnclosingElements } @@ -32,7 +32,7 @@ class GumTreePythonFunctionInfo( else -> error("No enclosing type can be associated") } EnclosingElement( - name = enclosing.getChildOfType(NAME)?.originalToken, + name = enclosing.getChildOfType(NAME)?.token?.original, type = type, root = enclosing ) @@ -43,14 +43,14 @@ class GumTreePythonFunctionInfo( parameters.children.filter { it.typeLabel == PARAMETER }.map { param -> // Simple case: param has name and possibly default if (param.getChildOfType(TYPE_DEFINITION) == null) { - val name = param.getChildOfType(NAME)?.originalToken + val name = param.getChildOfType(NAME)?.token?.original checkNotNull(name) { "Parameter has no name" } FunctionInfoParameter(name, null) } else { // Complicated case: parameter has some type val variableDef = param.getChildOfType(TYPE_DEFINITION) ?: error("Tree structure was changed while function info collection") - val name = variableDef.getChildOfType(NAME)?.originalToken + val name = variableDef.getChildOfType(NAME)?.token?.original ?: error("Parameter has no name") val type = if (variableDef.children.size > 1) variableDef.children[1].getTokensFromSubtree() else null FunctionInfoParameter(name, type) From 5844f8d02aa9d75809371870e8b3545d9edac957 Mon Sep 17 00:00:00 2001 From: ilya Date: Wed, 3 Nov 2021 17:22:33 +0300 Subject: [PATCH 30/48] code style fixes --- .../kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt | 2 +- .../astminer/parse/javalang/JavaLangFunctionSplitterTest.kt | 2 +- .../parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt index acbb5c85..f739f14a 100644 --- a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt +++ b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt @@ -1,7 +1,7 @@ package astminer.parse.javalang -import astminer.common.model.FunctionInfo import astminer.common.SimpleNode +import astminer.common.model.FunctionInfo import astminer.common.model.TreeFunctionSplitter class JavaLangFunctionSplitter : TreeFunctionSplitter { diff --git a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt index 751e0736..3c906e04 100644 --- a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt @@ -1,8 +1,8 @@ package astminer.parse.javalang import astminer.checkExecutable -import astminer.common.model.FunctionInfo import astminer.common.SimpleNode +import astminer.common.model.FunctionInfo import org.junit.Assume import org.junit.BeforeClass import org.junit.Test diff --git a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt index 16755194..25f9aca4 100644 --- a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt @@ -1,9 +1,9 @@ package astminer.parse.treesitter.java import astminer.checkExecutable +import astminer.common.SimpleNode import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo -import astminer.common.SimpleNode import org.junit.Assume import org.junit.BeforeClass import org.junit.Test From beab3d2f955d9a5e1f832a6950fa0190170951f6 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 4 Nov 2021 17:28:19 +0300 Subject: [PATCH 31/48] position tests added --- .../antlr/java/JavaFunctionSplitterTest.kt | 31 ++++++++++++++++++ .../javalang/JavaLangFunctionSplitterTest.kt | 32 +++++++++++++++++++ .../JavaparserMethodSplitterTest.kt | 31 ++++++++++++++++++ .../spoon/SpoonJavaFunctionSplitterTest.kt | 30 +++++++++++++++++ .../java/TreeSitterJavaMethodSplitterTest.kt | 31 ++++++++++++++++++ 5 files changed, 155 insertions(+) diff --git a/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt index d9039bb3..d14900cf 100644 --- a/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt @@ -7,6 +7,7 @@ import java.io.File import kotlin.test.BeforeTest import kotlin.test.assertEquals import kotlin.test.assertNotNull +import kotlin.test.assertTrue class JavaFunctionSplitterTest { @@ -154,10 +155,40 @@ class JavaFunctionSplitterTest { assertEquals(setOf("Deprecated"), annotations.toSet()) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 val functionSplitter = JavaFunctionSplitter() val parser = JavaParser() + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 18 to 20, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) } } diff --git a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt index 3c906e04..f2b582ea 100644 --- a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt @@ -3,12 +3,14 @@ package astminer.parse.javalang import astminer.checkExecutable import astminer.common.SimpleNode import astminer.common.model.FunctionInfo +import astminer.parse.antlr.java.JavaFunctionSplitterTest import org.junit.Assume import org.junit.BeforeClass import org.junit.Test import java.io.File import kotlin.test.assertEquals import kotlin.test.assertNotNull +import kotlin.test.assertTrue internal class JavaLangFunctionSplitterTest { @Test @@ -146,12 +148,42 @@ internal class JavaLangFunctionSplitterTest { assertEquals(setOf("Deprecated"), annotations.toSet()) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { private const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 private val functionSplitter = JavaLangFunctionSplitter() val parser = JavaLangParser() lateinit var functionInfos: Collection> + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 18 to 20, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) @BeforeClass @JvmStatic diff --git a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt index 41b729a8..52123e35 100644 --- a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt @@ -2,6 +2,7 @@ package astminer.parse.javaparser import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo +import astminer.parse.javalang.JavaLangFunctionSplitterTest import org.junit.BeforeClass import org.junit.Test import java.io.File @@ -133,12 +134,42 @@ internal class JavaparserMethodSplitterTest { testAnnotationsMatches("functionWithModifiersAndAnnotations", setOf("Deprecated")) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { private const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 private val functionSplitter = JavaparserMethodSplitter() val parser = JavaParserParseWrapper() var functionInfos: Collection> = listOf() + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 18 to 20, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) @BeforeClass @JvmStatic diff --git a/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt index ef8990ae..adc2bf16 100644 --- a/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt @@ -147,10 +147,40 @@ internal class SpoonJavaFunctionSplitterTest { assertTrue(blankFunction.isBlank()) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 val functionSplitter = SpoonJavaFunctionSplitter() val parser = SpoonJavaParser() + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 18 to 20, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) } } diff --git a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt index 25f9aca4..e1bee530 100644 --- a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt @@ -4,6 +4,7 @@ import astminer.checkExecutable import astminer.common.SimpleNode import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo +import astminer.parse.javalang.JavaLangFunctionSplitterTest import org.junit.Assume import org.junit.BeforeClass import org.junit.Test @@ -155,12 +156,42 @@ class TreeSitterJavaMethodSplitterTest { testAnnotationsMatches("functionWithModifiersAndAnnotations", setOf("Deprecated")) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { private const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 private val functionSplitter = TreeSitterJavaFunctionSplitter() val parser = TreeSitterJavaParser() var functionInfos: Collection> = listOf() + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 18 to 20, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) @BeforeClass @JvmStatic From c1df042879f2a6f17dc930965abf3c8b81060b98 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 4 Nov 2021 17:28:30 +0300 Subject: [PATCH 32/48] tree sitter bug fix --- src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py index 4e4ab5cb..6b703330 100644 --- a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py +++ b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py @@ -22,8 +22,8 @@ def _get_current_node_range(self) -> NodeRange: start = node.start_point end = node.end_point return { - "start": {"l": start[0], "c": start[1]}, - "end": {"l": end[0], "c": end[1]} + "start": {"l": start[0] + 1, "c": start[1] + 1}, + "end": {"l": end[0] + 1, "c": end[1] + 1} } def _get_current_node_as_dict(self) -> NodeAsDict: From 383425ea13184f6832f79c4956977adb4d00f638 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 4 Nov 2021 17:29:17 +0300 Subject: [PATCH 33/48] unused imports removed --- .../astminer/parse/javalang/JavaLangFunctionSplitterTest.kt | 1 - .../astminer/parse/javaparser/JavaparserMethodSplitterTest.kt | 1 - .../parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt | 1 - 3 files changed, 3 deletions(-) diff --git a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt index f2b582ea..fc21dc9b 100644 --- a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt @@ -3,7 +3,6 @@ package astminer.parse.javalang import astminer.checkExecutable import astminer.common.SimpleNode import astminer.common.model.FunctionInfo -import astminer.parse.antlr.java.JavaFunctionSplitterTest import org.junit.Assume import org.junit.BeforeClass import org.junit.Test diff --git a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt index 52123e35..4d58e905 100644 --- a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt @@ -2,7 +2,6 @@ package astminer.parse.javaparser import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo -import astminer.parse.javalang.JavaLangFunctionSplitterTest import org.junit.BeforeClass import org.junit.Test import java.io.File diff --git a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt index e1bee530..ddc65fce 100644 --- a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt @@ -4,7 +4,6 @@ import astminer.checkExecutable import astminer.common.SimpleNode import astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo -import astminer.parse.javalang.JavaLangFunctionSplitterTest import org.junit.Assume import org.junit.BeforeClass import org.junit.Test From 2ba6befee3b2919d06d3f2588b1c01c11479e6be Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 4 Nov 2021 17:34:57 +0300 Subject: [PATCH 34/48] expected function positions changed --- .../antlr/java/JavaFunctionSplitterTest.kt | 2 +- .../javalang/JavaLangFunctionSplitterTest.kt | 24 +++---------------- .../JavaparserMethodSplitterTest.kt | 2 +- .../spoon/SpoonJavaFunctionSplitterTest.kt | 2 +- .../java/TreeSitterJavaMethodSplitterTest.kt | 2 +- 5 files changed, 7 insertions(+), 25 deletions(-) diff --git a/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt index d14900cf..ef060293 100644 --- a/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt @@ -179,7 +179,7 @@ class JavaFunctionSplitterTest { 8 to 10, 12 to 14, 16 to 16, - 18 to 20, + 19 to 19, 22 to 22, 24 to 24, 26 to 26, diff --git a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt index fc21dc9b..968f169e 100644 --- a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt @@ -152,10 +152,8 @@ internal class JavaLangFunctionSplitterTest { assertTrue( functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { val actualStart = it.first.start.line - val actualEnd = it.first.end.line - val expectedStart = it.second.first - val expectedEnd = it.second.second - (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + val expectedStart = it.second + actualStart == expectedStart } ) } @@ -166,23 +164,7 @@ internal class JavaLangFunctionSplitterTest { private val functionSplitter = JavaLangFunctionSplitter() val parser = JavaLangParser() lateinit var functionInfos: Collection> - val functionLinePositions = listOf( - 2 to 2, - 4 to 6, - 8 to 10, - 12 to 14, - 16 to 16, - 18 to 20, - 22 to 22, - 24 to 24, - 26 to 26, - 28 to 28, - 30 to 31, - 33 to 35, - 37 to 38, - 42 to 42, - 44 to 44 - ) + val functionLinePositions = listOf(2, 4, 8, 12, 16, 19, 22, 24, 26, 28, 30, 33, 37, 42, 44) @BeforeClass @JvmStatic diff --git a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt index 4d58e905..ee2e55c7 100644 --- a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt @@ -158,7 +158,7 @@ internal class JavaparserMethodSplitterTest { 8 to 10, 12 to 14, 16 to 16, - 18 to 20, + 19 to 19, 22 to 22, 24 to 24, 26 to 26, diff --git a/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt index adc2bf16..9e197f79 100644 --- a/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt @@ -171,7 +171,7 @@ internal class SpoonJavaFunctionSplitterTest { 8 to 10, 12 to 14, 16 to 16, - 18 to 20, + 19 to 19, 22 to 22, 24 to 24, 26 to 26, diff --git a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt index ddc65fce..1c9dfea1 100644 --- a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt @@ -180,7 +180,7 @@ class TreeSitterJavaMethodSplitterTest { 8 to 10, 12 to 14, 16 to 16, - 18 to 20, + 19 to 19, 22 to 22, 24 to 24, 26 to 26, From edc24d3a6f56ded776aed0c793a6ccedcdcc73bb Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 4 Nov 2021 17:37:22 +0300 Subject: [PATCH 35/48] positions in javalang test adjusted --- .../astminer/parse/javalang/JavaLangFunctionSplitterTest.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt index 968f169e..4a8aecea 100644 --- a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt @@ -164,7 +164,7 @@ internal class JavaLangFunctionSplitterTest { private val functionSplitter = JavaLangFunctionSplitter() val parser = JavaLangParser() lateinit var functionInfos: Collection> - val functionLinePositions = listOf(2, 4, 8, 12, 16, 19, 22, 24, 26, 28, 30, 33, 37, 42, 44) + val functionLinePositions = listOf(2, 4, 8, 12, 16, 19, 22, 24, 26, 28, 31, 35, 38, 42, 44) @BeforeClass @JvmStatic From 396ab209cf358b2546772b481300a0d9f30b5fc3 Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 4 Nov 2021 17:43:30 +0300 Subject: [PATCH 36/48] doc fix --- src/main/kotlin/astminer/parse/ForeignParser.kt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/kotlin/astminer/parse/ForeignParser.kt b/src/main/kotlin/astminer/parse/ForeignParser.kt index d32bc565..48e8f0a4 100644 --- a/src/main/kotlin/astminer/parse/ForeignParser.kt +++ b/src/main/kotlin/astminer/parse/ForeignParser.kt @@ -26,8 +26,8 @@ import kotlin.io.path.createTempDirectory * "nodeType": "i_am_root", * "children": [1,2], * "range" : { - * "start" : { "line" : 0, "column" : 0 }, - * "end" : { "line" 1, "column" : 4 } + * "start" : { "l" : 0, "c" : 0 }, + * "end" : { "l" 1, "c" : 4 } * } * }, * { @@ -35,8 +35,8 @@ import kotlin.io.path.createTempDirectory * "nodeType": "left_child", * "children": [] * "range" : { - * "start" : { "line" : 0, "column": 0 }, - * "end" : { "line: 0, "column": 5 } + * "start" : { "l" : 0, "c": 0 }, + * "end" : { "l": 0, "c": 5 } * } * }, * { @@ -44,8 +44,8 @@ import kotlin.io.path.createTempDirectory * "nodeType": "right_child", * "children": [], * "range" : { - * "start" : { "line" : 1, "column" : 0 }, - * "end" : { "line" : 1, "column" : 6 } + * "start" : { "l" : 1, "c" : 0 }, + * "end" : { "l" : 1, "c" : 6 } * } * } * ] From 2186e8986bdcfc3f6f472d096c8375b443ff8162 Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 9 Nov 2021 21:11:22 +0300 Subject: [PATCH 37/48] Normalization refactor --- ...ode2VecNormalization.kt => TokenNormalization.kt} | 6 ++---- .../kotlin/astminer/common/model/Normalization.kt | 5 ----- src/main/kotlin/astminer/common/model/Token.kt | 9 +++------ src/main/kotlin/astminer/filters/CommonFilters.kt | 4 ++-- src/main/kotlin/astminer/filters/FunctionFilters.kt | 4 ++-- .../treesitter/java/TreeSitterJavaFunctionInfo.kt | 4 ++-- ...ormalizationTest.kt => TokenNormalizationTest.kt} | 12 ++++++------ 7 files changed, 17 insertions(+), 27 deletions(-) rename src/main/kotlin/astminer/common/{Code2VecNormalization.kt => TokenNormalization.kt} (93%) delete mode 100644 src/main/kotlin/astminer/common/model/Normalization.kt rename src/test/kotlin/astminer/common/{Code2VecNormalizationTest.kt => TokenNormalizationTest.kt} (84%) diff --git a/src/main/kotlin/astminer/common/Code2VecNormalization.kt b/src/main/kotlin/astminer/common/TokenNormalization.kt similarity index 93% rename from src/main/kotlin/astminer/common/Code2VecNormalization.kt rename to src/main/kotlin/astminer/common/TokenNormalization.kt index 4f9b44b0..637a673a 100644 --- a/src/main/kotlin/astminer/common/Code2VecNormalization.kt +++ b/src/main/kotlin/astminer/common/TokenNormalization.kt @@ -1,8 +1,6 @@ package astminer.common -import astminer.common.model.Normalization - -object Code2VecNormalization : Normalization { +object TokenNormalization { const val EMPTY_TOKEN = "EMPTY" const val TOKEN_DELIMITER = "|" @@ -14,7 +12,7 @@ object Code2VecNormalization : Normalization { private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - override fun normalizeToken(token: String?): String { + fun normalizeToken(token: String?): String { if (token == null) return EMPTY_TOKEN val subTokens = splitToSubtokens(token) return if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER) diff --git a/src/main/kotlin/astminer/common/model/Normalization.kt b/src/main/kotlin/astminer/common/model/Normalization.kt deleted file mode 100644 index 3108c796..00000000 --- a/src/main/kotlin/astminer/common/model/Normalization.kt +++ /dev/null @@ -1,5 +0,0 @@ -package astminer.common.model - -interface Normalization { - fun normalizeToken(token: String?): String -} diff --git a/src/main/kotlin/astminer/common/model/Token.kt b/src/main/kotlin/astminer/common/model/Token.kt index 028af9ff..9fcf5efb 100644 --- a/src/main/kotlin/astminer/common/model/Token.kt +++ b/src/main/kotlin/astminer/common/model/Token.kt @@ -1,17 +1,14 @@ package astminer.common.model -import astminer.common.Code2VecNormalization +import astminer.common.TokenNormalization -class Token( - val original: String?, - private val normalization: Normalization = Code2VecNormalization -) { +class Token(val original: String?) { val final: String get() = technical ?: normalized var technical: String? = null - val normalized = normalization.normalizeToken(original) + val normalized = TokenNormalization.normalizeToken(original) override fun toString(): String = final } diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index e85843ae..acee202c 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -1,6 +1,6 @@ package astminer.filters -import astminer.common.Code2VecNormalization +import astminer.common.TokenNormalization import astminer.common.model.* import astminer.featureextraction.NumberOfNodes @@ -25,7 +25,7 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n */ class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter { private fun validateTree(root: Node) = root.preOrder() - .none { node -> node.token.final.split(Code2VecNormalization.TOKEN_DELIMITER).size > maxWordsNumber } + .none { node -> node.token.final.split(TokenNormalization.TOKEN_DELIMITER).size > maxWordsNumber } override fun validate(functionInfo: FunctionInfo) = validateTree(functionInfo.root) diff --git a/src/main/kotlin/astminer/filters/FunctionFilters.kt b/src/main/kotlin/astminer/filters/FunctionFilters.kt index 577c66c5..6616c1b1 100644 --- a/src/main/kotlin/astminer/filters/FunctionFilters.kt +++ b/src/main/kotlin/astminer/filters/FunctionFilters.kt @@ -1,6 +1,6 @@ package astminer.filters -import astminer.common.Code2VecNormalization +import astminer.common.TokenNormalization import astminer.common.model.FunctionFilter import astminer.common.model.FunctionInfo import astminer.common.model.Node @@ -38,7 +38,7 @@ object ConstructorFilter : FunctionFilter { class FunctionNameWordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter { override fun validate(functionInfo: FunctionInfo): Boolean { val name = functionInfo.name - return name != null && Code2VecNormalization.splitToSubtokens(name).size <= maxWordsNumber + return name != null && TokenNormalization.splitToSubtokens(name).size <= maxWordsNumber } } diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 15c5800e..8417fc69 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,6 +1,6 @@ package astminer.parse.treesitter.java -import astminer.common.Code2VecNormalization +import astminer.common.TokenNormalization import astminer.common.SimpleNode import astminer.common.model.* import astminer.parse.antlr.getTokensFromSubtree @@ -58,7 +58,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil var collectedType = returnTypeNode.getTokensFromSubtree() if (returnTypeNode.typeLabel == ARRAY_TYPE) { - collectedType = collectedType.replace(Code2VecNormalization.EMPTY_TOKEN, "[]") + collectedType = collectedType.replace(TokenNormalization.EMPTY_TOKEN, "[]") } return@run collectedType } diff --git a/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt similarity index 84% rename from src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt rename to src/test/kotlin/astminer/common/TokenNormalizationTest.kt index 1c0839b6..823b51cd 100644 --- a/src/test/kotlin/astminer/common/Code2VecNormalizationTest.kt +++ b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt @@ -3,7 +3,7 @@ package astminer.common import org.junit.Assert import org.junit.Test -class Code2VecNormalizationTest { +class TokenNormalizationTest { private val defaultToken = "EMPTY" @Test @@ -29,7 +29,7 @@ class Code2VecNormalizationTest { Assert.assertEquals( "All whitespace characters and punctuation should be removed, keeping only letters", expectedToken, - Code2VecNormalization.normalizeSubToken(token, defaultToken) + TokenNormalization.normalizeSubToken(token, defaultToken) ) } @@ -40,18 +40,18 @@ class Code2VecNormalizationTest { Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - Code2VecNormalization.normalizeSubToken(token, defaultToken) + TokenNormalization.normalizeSubToken(token, defaultToken) ) } @Test fun testNormalizeEmptyToken() { val token = "\n\n" - val expectedToken = Code2VecNormalization.EMPTY_TOKEN + val expectedToken = TokenNormalization.EMPTY_TOKEN Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - Code2VecNormalization.normalizeSubToken(token, defaultToken) + TokenNormalization.normalizeSubToken(token, defaultToken) ) } @@ -62,7 +62,7 @@ class Code2VecNormalizationTest { Assert.assertEquals( "Token with snake, camel and combined case should be split into list of its parts", expectedToken, - Code2VecNormalization.splitToSubtokens(token) + TokenNormalization.splitToSubtokens(token) ) } } From 8a64df894f3d10444267902e29053e1368fb0de2 Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 9 Nov 2021 21:11:55 +0300 Subject: [PATCH 38/48] EMPTY token updated --- src/main/kotlin/astminer/common/TokenNormalization.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/kotlin/astminer/common/TokenNormalization.kt b/src/main/kotlin/astminer/common/TokenNormalization.kt index 637a673a..27a6b557 100644 --- a/src/main/kotlin/astminer/common/TokenNormalization.kt +++ b/src/main/kotlin/astminer/common/TokenNormalization.kt @@ -1,7 +1,7 @@ package astminer.common object TokenNormalization { - const val EMPTY_TOKEN = "EMPTY" + const val EMPTY_TOKEN = "" const val TOKEN_DELIMITER = "|" private val newLineReg = "\\\\n".toRegex() From 5898169875ad72a56e83eae1e141bded72cb25c9 Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 9 Nov 2021 21:17:59 +0300 Subject: [PATCH 39/48] file renamed and redundant map deleted --- .../parse/antlr/{searchUtil.kt => compressedTreesUtil.kt} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/main/kotlin/astminer/parse/antlr/{searchUtil.kt => compressedTreesUtil.kt} (96%) diff --git a/src/main/kotlin/astminer/parse/antlr/searchUtil.kt b/src/main/kotlin/astminer/parse/antlr/compressedTreesUtil.kt similarity index 96% rename from src/main/kotlin/astminer/parse/antlr/searchUtil.kt rename to src/main/kotlin/astminer/parse/antlr/compressedTreesUtil.kt index 88ffc78a..1ebec8e4 100644 --- a/src/main/kotlin/astminer/parse/antlr/searchUtil.kt +++ b/src/main/kotlin/astminer/parse/antlr/compressedTreesUtil.kt @@ -20,4 +20,4 @@ fun Node.getTokensFromSubtree(): String = if (isLeaf()) token.original ?: "" else children.joinToString(separator = "") { it.getTokensFromSubtree() } fun AntlrNode.getItOrChildrenOfType(typeLabel: String): List = - if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel).map { it } + if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel) From cbb70f05113c8072a3a7d6a82fbb11c58c74adef Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 9 Nov 2021 21:19:26 +0300 Subject: [PATCH 40/48] code style fixes --- .../parse/treesitter/java/TreeSitterJavaFunctionInfo.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 8417fc69..7965ec19 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,7 +1,7 @@ package astminer.parse.treesitter.java -import astminer.common.TokenNormalization import astminer.common.SimpleNode +import astminer.common.TokenNormalization import astminer.common.model.* import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.findEnclosingElementBy From f2a96a6784403683bba56c6fe2994180b23bd0ba Mon Sep 17 00:00:00 2001 From: ilya Date: Tue, 9 Nov 2021 22:24:59 +0300 Subject: [PATCH 41/48] test fixed --- src/test/kotlin/astminer/common/TokenNormalizationTest.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/kotlin/astminer/common/TokenNormalizationTest.kt b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt index 823b51cd..5d4c4e51 100644 --- a/src/test/kotlin/astminer/common/TokenNormalizationTest.kt +++ b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt @@ -4,7 +4,7 @@ import org.junit.Assert import org.junit.Test class TokenNormalizationTest { - private val defaultToken = "EMPTY" + private val defaultToken = TokenNormalization.EMPTY_TOKEN @Test fun testPreOrder() { From 64219d26c3adf1a63010708c260bc675bcae8d1e Mon Sep 17 00:00:00 2001 From: ilya Date: Wed, 10 Nov 2021 20:52:12 +0300 Subject: [PATCH 42/48] normalization object deleted --- .../astminer/common/TokenNormalization.kt | 91 +++++++++---------- .../kotlin/astminer/common/model/Token.kt | 4 +- .../kotlin/astminer/filters/CommonFilters.kt | 4 +- .../astminer/filters/FunctionFilters.kt | 4 +- .../java/TreeSitterJavaFunctionInfo.kt | 4 +- .../astminer/common/TokenNormalizationTest.kt | 12 +-- 6 files changed, 59 insertions(+), 60 deletions(-) diff --git a/src/main/kotlin/astminer/common/TokenNormalization.kt b/src/main/kotlin/astminer/common/TokenNormalization.kt index 27a6b557..d10b7624 100644 --- a/src/main/kotlin/astminer/common/TokenNormalization.kt +++ b/src/main/kotlin/astminer/common/TokenNormalization.kt @@ -1,52 +1,51 @@ package astminer.common -object TokenNormalization { - const val EMPTY_TOKEN = "" - const val TOKEN_DELIMITER = "|" - - private val newLineReg = "\\\\n".toRegex() - private val whitespaceReg = "//s+".toRegex() - private val quotesApostrophesCommasReg = "[\"',]".toRegex() - private val unicodeWeirdCharReg = "\\P{Print}".toRegex() - private val notALetterReg = "[^A-Za-z]".toRegex() - - private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - - fun normalizeToken(token: String?): String { - if (token == null) return EMPTY_TOKEN - val subTokens = splitToSubtokens(token) - return if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER) - } +const val EMPTY_TOKEN = "" +const val TOKEN_DELIMITER = "|" + +private val newLineReg = "\\\\n".toRegex() +private val whitespaceReg = "//s+".toRegex() +private val quotesApostrophesCommasReg = "[\"',]".toRegex() +private val unicodeWeirdCharReg = "\\P{Print}".toRegex() +private val notALetterReg = "[^A-Za-z]".toRegex() + +private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - /** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - fun splitToSubtokens(token: String) = token - .trim() - .split(splitRegex) - .map { s -> normalizeSubToken(s, "") } - .filter { it.isNotEmpty() } - .toList() - - /** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - fun normalizeSubToken(token: String, defaultToken: String): String { - val cleanToken = token.lowercase() - .replace(newLineReg, "") // escaped new line - .replace(whitespaceReg, "") // whitespaces - .replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas - .replace(unicodeWeirdCharReg, "") // unicode weird characters - - val stripped = cleanToken.replace(notALetterReg, "") - - return stripped.ifEmpty { - val carefulStripped = cleanToken.replace(" ", "_") - carefulStripped.ifEmpty { - defaultToken - } +fun normalizeToken(token: String?): String { + if (token == null) return EMPTY_TOKEN + val subTokens = splitToSubtokens(token) + return if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER) +} + +/** + * The function was adopted from the original code2vec implementation in order to match their behavior: + * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + */ +fun splitToSubtokens(token: String) = token + .trim() + .split(splitRegex) + .map { s -> normalizeSubToken(s, "") } + .filter { it.isNotEmpty() } + .toList() + +/** + * The function was adopted from the original code2vec implementation in order to match their behavior: + * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + */ +fun normalizeSubToken(token: String, defaultToken: String): String { + val cleanToken = token.lowercase() + .replace(newLineReg, "") // escaped new line + .replace(whitespaceReg, "") // whitespaces + .replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas + .replace(unicodeWeirdCharReg, "") // unicode weird characters + + val stripped = cleanToken.replace(notALetterReg, "") + + return stripped.ifEmpty { + val carefulStripped = cleanToken.replace(" ", "_") + carefulStripped.ifEmpty { + defaultToken } } } + diff --git a/src/main/kotlin/astminer/common/model/Token.kt b/src/main/kotlin/astminer/common/model/Token.kt index 9fcf5efb..7ed2eeff 100644 --- a/src/main/kotlin/astminer/common/model/Token.kt +++ b/src/main/kotlin/astminer/common/model/Token.kt @@ -1,6 +1,6 @@ package astminer.common.model -import astminer.common.TokenNormalization +import astminer.common.normalizeToken class Token(val original: String?) { val final: String @@ -8,7 +8,7 @@ class Token(val original: String?) { var technical: String? = null - val normalized = TokenNormalization.normalizeToken(original) + val normalized = normalizeToken(original) override fun toString(): String = final } diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index acee202c..0e2a66ab 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -1,6 +1,6 @@ package astminer.filters -import astminer.common.TokenNormalization +import astminer.common.TOKEN_DELIMITER import astminer.common.model.* import astminer.featureextraction.NumberOfNodes @@ -25,7 +25,7 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n */ class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter { private fun validateTree(root: Node) = root.preOrder() - .none { node -> node.token.final.split(TokenNormalization.TOKEN_DELIMITER).size > maxWordsNumber } + .none { node -> node.token.final.split(TOKEN_DELIMITER).size > maxWordsNumber } override fun validate(functionInfo: FunctionInfo) = validateTree(functionInfo.root) diff --git a/src/main/kotlin/astminer/filters/FunctionFilters.kt b/src/main/kotlin/astminer/filters/FunctionFilters.kt index 6616c1b1..d5316459 100644 --- a/src/main/kotlin/astminer/filters/FunctionFilters.kt +++ b/src/main/kotlin/astminer/filters/FunctionFilters.kt @@ -1,9 +1,9 @@ package astminer.filters -import astminer.common.TokenNormalization import astminer.common.model.FunctionFilter import astminer.common.model.FunctionInfo import astminer.common.model.Node +import astminer.common.splitToSubtokens /** * Filter that excludes functions that have at least one of modifiers from the [excludeModifiers] list. @@ -38,7 +38,7 @@ object ConstructorFilter : FunctionFilter { class FunctionNameWordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter { override fun validate(functionInfo: FunctionInfo): Boolean { val name = functionInfo.name - return name != null && TokenNormalization.splitToSubtokens(name).size <= maxWordsNumber + return name != null && splitToSubtokens(name).size <= maxWordsNumber } } diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 7965ec19..b987c4c8 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,7 +1,7 @@ package astminer.parse.treesitter.java +import astminer.common.EMPTY_TOKEN import astminer.common.SimpleNode -import astminer.common.TokenNormalization import astminer.common.model.* import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.findEnclosingElementBy @@ -58,7 +58,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil var collectedType = returnTypeNode.getTokensFromSubtree() if (returnTypeNode.typeLabel == ARRAY_TYPE) { - collectedType = collectedType.replace(TokenNormalization.EMPTY_TOKEN, "[]") + collectedType = collectedType.replace(EMPTY_TOKEN, "[]") } return@run collectedType } diff --git a/src/test/kotlin/astminer/common/TokenNormalizationTest.kt b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt index 5d4c4e51..ca0f0132 100644 --- a/src/test/kotlin/astminer/common/TokenNormalizationTest.kt +++ b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt @@ -4,7 +4,7 @@ import org.junit.Assert import org.junit.Test class TokenNormalizationTest { - private val defaultToken = TokenNormalization.EMPTY_TOKEN + private val defaultToken = EMPTY_TOKEN @Test fun testPreOrder() { @@ -29,7 +29,7 @@ class TokenNormalizationTest { Assert.assertEquals( "All whitespace characters and punctuation should be removed, keeping only letters", expectedToken, - TokenNormalization.normalizeSubToken(token, defaultToken) + normalizeSubToken(token, defaultToken) ) } @@ -40,18 +40,18 @@ class TokenNormalizationTest { Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - TokenNormalization.normalizeSubToken(token, defaultToken) + normalizeSubToken(token, defaultToken) ) } @Test fun testNormalizeEmptyToken() { val token = "\n\n" - val expectedToken = TokenNormalization.EMPTY_TOKEN + val expectedToken = EMPTY_TOKEN Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - TokenNormalization.normalizeSubToken(token, defaultToken) + normalizeSubToken(token, defaultToken) ) } @@ -62,7 +62,7 @@ class TokenNormalizationTest { Assert.assertEquals( "Token with snake, camel and combined case should be split into list of its parts", expectedToken, - TokenNormalization.splitToSubtokens(token) + splitToSubtokens(token) ) } } From 66716c077b4cbf8e64774953c4c1d4f93d25af49 Mon Sep 17 00:00:00 2001 From: ilya Date: Wed, 10 Nov 2021 20:53:13 +0300 Subject: [PATCH 43/48] removed empty line --- src/main/kotlin/astminer/common/TokenNormalization.kt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/kotlin/astminer/common/TokenNormalization.kt b/src/main/kotlin/astminer/common/TokenNormalization.kt index d10b7624..920f9117 100644 --- a/src/main/kotlin/astminer/common/TokenNormalization.kt +++ b/src/main/kotlin/astminer/common/TokenNormalization.kt @@ -48,4 +48,3 @@ fun normalizeSubToken(token: String, defaultToken: String): String { } } } - From bd76ac16323287072f4d8de4ff6f225df97ce640 Mon Sep 17 00:00:00 2001 From: ilya Date: Wed, 10 Nov 2021 21:07:21 +0300 Subject: [PATCH 44/48] documentation added --- .../astminer/common/TokenNormalization.kt | 19 +++++++++++-------- .../kotlin/astminer/common/model/Token.kt | 9 +++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/main/kotlin/astminer/common/TokenNormalization.kt b/src/main/kotlin/astminer/common/TokenNormalization.kt index 920f9117..94e930a3 100644 --- a/src/main/kotlin/astminer/common/TokenNormalization.kt +++ b/src/main/kotlin/astminer/common/TokenNormalization.kt @@ -3,14 +3,9 @@ package astminer.common const val EMPTY_TOKEN = "" const val TOKEN_DELIMITER = "|" -private val newLineReg = "\\\\n".toRegex() -private val whitespaceReg = "//s+".toRegex() -private val quotesApostrophesCommasReg = "[\"',]".toRegex() -private val unicodeWeirdCharReg = "\\P{Print}".toRegex() -private val notALetterReg = "[^A-Za-z]".toRegex() - -private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - +/** Splits tokens in sub-tokens and normalizes them by removing new lines, whitespaces, quotes etc + * @see splitToSubtokens + * @see normalizeSubToken**/ fun normalizeToken(token: String?): String { if (token == null) return EMPTY_TOKEN val subTokens = splitToSubtokens(token) @@ -28,6 +23,8 @@ fun splitToSubtokens(token: String) = token .filter { it.isNotEmpty() } .toList() +private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() + /** * The function was adopted from the original code2vec implementation in order to match their behavior: * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java @@ -48,3 +45,9 @@ fun normalizeSubToken(token: String, defaultToken: String): String { } } } + +private val newLineReg = "\\\\n".toRegex() +private val whitespaceReg = "//s+".toRegex() +private val quotesApostrophesCommasReg = "[\"',]".toRegex() +private val unicodeWeirdCharReg = "\\P{Print}".toRegex() +private val notALetterReg = "[^A-Za-z]".toRegex() diff --git a/src/main/kotlin/astminer/common/model/Token.kt b/src/main/kotlin/astminer/common/model/Token.kt index 7ed2eeff..7204673a 100644 --- a/src/main/kotlin/astminer/common/model/Token.kt +++ b/src/main/kotlin/astminer/common/model/Token.kt @@ -3,11 +3,20 @@ package astminer.common.model import astminer.common.normalizeToken class Token(val original: String?) { + /** Final token after all normalizations and shadowing + * @see technical + * @see normalized **/ val final: String get() = technical ?: normalized + /** Token that shadows any original or normalized token + * and have the most priority in calculating final token + * that will be saved. It can be useful when it's necessary to hide something + * (for example method name in method name prediction problem) **/ var technical: String? = null + /** Original token after string normalization + * @see normalizeToken **/ val normalized = normalizeToken(original) override fun toString(): String = final From 95c33e5e7ad31f5cf5e3e08a223e9c34855bd14d Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 11 Nov 2021 20:57:15 +0300 Subject: [PATCH 45/48] doc and normalization fixes --- detekt.yaml | 2 - .../astminer/common/TokenNormalization.kt | 31 +++++++------ .../kotlin/astminer/common/model/Token.kt | 44 ++++++++++++------- .../astminer/featureextraction/TreeFeature.kt | 2 +- .../kotlin/astminer/filters/CommonFilters.kt | 2 +- src/main/kotlin/astminer/paths/PathUtil.kt | 2 +- src/main/kotlin/astminer/paths/PathWorker.kt | 2 +- .../astminer/storage/ast/CsvAstStorage.kt | 4 +- .../astminer/storage/ast/JsonAstStorage.kt | 2 +- .../astminer/storage/path/PathBasedStorage.kt | 2 +- .../astminer/common/TokenNormalizationTest.kt | 10 ++--- .../astminer/featureextraction/PrettyNode.kt | 2 +- .../FunctionNameLabelExtractorTest.kt | 6 +-- 13 files changed, 61 insertions(+), 50 deletions(-) diff --git a/detekt.yaml b/detekt.yaml index 4818dd27..52fc010a 100644 --- a/detekt.yaml +++ b/detekt.yaml @@ -26,8 +26,6 @@ style: max: 5 WildcardImport: active: false - UseDataClass: - allowVars: true formatting: autoCorrect: true diff --git a/src/main/kotlin/astminer/common/TokenNormalization.kt b/src/main/kotlin/astminer/common/TokenNormalization.kt index 94e930a3..816cb922 100644 --- a/src/main/kotlin/astminer/common/TokenNormalization.kt +++ b/src/main/kotlin/astminer/common/TokenNormalization.kt @@ -2,41 +2,40 @@ package astminer.common const val EMPTY_TOKEN = "" const val TOKEN_DELIMITER = "|" - -/** Splits tokens in sub-tokens and normalizes them by removing new lines, whitespaces, quotes etc - * @see splitToSubtokens - * @see normalizeSubToken**/ -fun normalizeToken(token: String?): String { - if (token == null) return EMPTY_TOKEN - val subTokens = splitToSubtokens(token) - return if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER) -} +const val EMPTY_STRING = "" /** + * Splits token into subtokens by commonly used practice, i.e. `camelCase` or `snake_case`. + * Returns a list of not empty, normalized subtokens. * The function was adopted from the original code2vec implementation in order to match their behavior: * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + * @see normalizeToken */ fun splitToSubtokens(token: String) = token .trim() .split(splitRegex) - .map { s -> normalizeSubToken(s, "") } + .map { s -> normalizeToken(s, EMPTY_STRING) } .filter { it.isNotEmpty() } .toList() private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() /** + * Normalize token by conversion to lower case, removing the new line, + * whitespace, quotes, and other weird Unicode characters. * The function was adopted from the original code2vec implementation in order to match their behavior: * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + * @param token Token to normalize + * @param defaultToken If the token is empty after the normalization process, it will be replaced with the default token */ -fun normalizeSubToken(token: String, defaultToken: String): String { +fun normalizeToken(token: String, defaultToken: String): String { val cleanToken = token.lowercase() - .replace(newLineReg, "") // escaped new line - .replace(whitespaceReg, "") // whitespaces - .replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas - .replace(unicodeWeirdCharReg, "") // unicode weird characters + .replace(newLineReg, EMPTY_STRING) // escaped new line + .replace(whitespaceReg, EMPTY_STRING) // whitespaces + .replace(quotesApostrophesCommasReg, EMPTY_STRING) // quotes, apostrophes, commas + .replace(unicodeWeirdCharReg, EMPTY_STRING) // unicode weird characters - val stripped = cleanToken.replace(notALetterReg, "") + val stripped = cleanToken.replace(notALetterReg, EMPTY_STRING) return stripped.ifEmpty { val carefulStripped = cleanToken.replace(" ", "_") diff --git a/src/main/kotlin/astminer/common/model/Token.kt b/src/main/kotlin/astminer/common/model/Token.kt index 7204673a..fd367bfd 100644 --- a/src/main/kotlin/astminer/common/model/Token.kt +++ b/src/main/kotlin/astminer/common/model/Token.kt @@ -1,23 +1,37 @@ package astminer.common.model -import astminer.common.normalizeToken +import astminer.common.* +/** + * Class to wrap logic with token processing. + * It is responsible for token normalization or replacing it with technical information. + * Use `token.original` to access the original token. + */ class Token(val original: String?) { - /** Final token after all normalizations and shadowing - * @see technical - * @see normalized **/ - val final: String - get() = technical ?: normalized - - /** Token that shadows any original or normalized token - * and have the most priority in calculating final token - * that will be saved. It can be useful when it's necessary to hide something - * (for example method name in method name prediction problem) **/ + /** + * Technical token is used to shadow the original token with mining pipeline specific value. + * For example, for the method name prediction problem + * we want to set technical `` token to hide real method name. + */ var technical: String? = null - /** Original token after string normalization - * @see normalizeToken **/ - val normalized = normalizeToken(original) + /** + * Original token with normalization applied + * @see normalizeToken + */ + val normalized = run { + if (original == null) return@run EMPTY_TOKEN + val subTokens = splitToSubtokens(original) + if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER) + } + + /** + * Access to the final representation of the token after normalization and other preprocessing. + * It returns technical assign token if it exists or normalized token otherwise. + * @see technical + * @see normalized + */ + fun final() = technical ?: normalized - override fun toString(): String = final + override fun toString(): String = final() } diff --git a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt index 3068c2e9..4def1751 100644 --- a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt +++ b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt @@ -57,7 +57,7 @@ object Tokens : TreeFeature> { private fun findTokens(node: Node, tokensList: MutableList): List { node.children.forEach { findTokens(it, tokensList) } - tokensList.add(node.token.final) + tokensList.add(node.token.final()) return tokensList } } diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index 0e2a66ab..470b25ef 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -25,7 +25,7 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n */ class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter { private fun validateTree(root: Node) = root.preOrder() - .none { node -> node.token.final.split(TOKEN_DELIMITER).size > maxWordsNumber } + .none { node -> node.token.final().split(TOKEN_DELIMITER).size > maxWordsNumber } override fun validate(functionInfo: FunctionInfo) = validateTree(functionInfo.root) diff --git a/src/main/kotlin/astminer/paths/PathUtil.kt b/src/main/kotlin/astminer/paths/PathUtil.kt index 4baa90cd..5dd456a5 100644 --- a/src/main/kotlin/astminer/paths/PathUtil.kt +++ b/src/main/kotlin/astminer/paths/PathUtil.kt @@ -2,7 +2,7 @@ package astminer.paths import astminer.common.model.* -fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token.final }): PathContext { +fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token.final() }): PathContext { val startToken = getToken(path.upwardNodes.first()) val endToken = getToken(path.downwardNodes.last()) val astNodes = path.upwardNodes.map { OrientedNodeType(it.typeLabel, Direction.UP) } + diff --git a/src/main/kotlin/astminer/paths/PathWorker.kt b/src/main/kotlin/astminer/paths/PathWorker.kt index de78686e..221427fa 100644 --- a/src/main/kotlin/astminer/paths/PathWorker.kt +++ b/src/main/kotlin/astminer/paths/PathWorker.kt @@ -49,7 +49,7 @@ class PathWorker { val paths: MutableList = ArrayList() iterator.forEach { currentNode -> if (currentNode.isLeaf()) { - if (currentNode.token.final.isNotEmpty()) { + if (currentNode.token.final().isNotEmpty()) { currentNode.setPathPieces(listOf(listOf(currentNode))) } } else { diff --git a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt index 7691ff28..47f426df 100644 --- a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt @@ -28,7 +28,7 @@ class CsvAstStorage(override val outputDirectoryPath: String) : Storage { override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { for (node in labeledResult.root.preOrder()) { - tokensMap.record(node.token.final) + tokensMap.record(node.token.final()) nodeTypesMap.record(node.typeLabel) } val writer = astsPrintWriters.getOrPut(holdout) { holdout.resolveHoldout() } @@ -55,7 +55,7 @@ class CsvAstStorage(override val outputDirectoryPath: String) : Storage { } internal fun astString(node: Node): String { - return "${tokensMap.getId(node.token.final)} ${nodeTypesMap.getId(node.typeLabel)}{${ + return "${tokensMap.getId(node.token.final())} ${nodeTypesMap.getId(node.typeLabel)}{${ node.children.joinToString(separator = "", transform = ::astString) }}" } diff --git a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt index aef90b76..f2609109 100644 --- a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt @@ -45,7 +45,7 @@ class JsonAstStorage( private fun TreeFlattener.EnumeratedNode.toOutputNode() = OutputNode( - node.token.final, + node.token.final(), node.typeLabel, if (withRanges) node.range else null, children.map { it.id } diff --git a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt index fec17dc2..ed0f8a71 100644 --- a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt +++ b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt @@ -54,7 +54,7 @@ abstract class PathBasedStorage( return LabeledPathContexts( labeledResult.label, paths.map { astPath -> - toPathContext(astPath) { it.token.final.replace("\n", "\\n") } + toPathContext(astPath) { it.token.final().replace("\n", "\\n") } } ) } diff --git a/src/test/kotlin/astminer/common/TokenNormalizationTest.kt b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt index ca0f0132..939d2bcc 100644 --- a/src/test/kotlin/astminer/common/TokenNormalizationTest.kt +++ b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt @@ -25,11 +25,11 @@ class TokenNormalizationTest { @Test fun testNormalizeTokenCleaning() { val token = " Token THAT \n contains Whi\"t,es''pace characters!!!and pu.n.c.t.u.a.tion \n" - val expectedToken = "token" + "that" + "contains" + "whitespace" + "characters" + "and" + "punctuation" + val normalizedSubTokens = listOf("token", "that", "contains", "whitespace", "characters", "and", "punctuation") Assert.assertEquals( "All whitespace characters and punctuation should be removed, keeping only letters", - expectedToken, - normalizeSubToken(token, defaultToken) + normalizedSubTokens.joinToString(""), + normalizeToken(token, defaultToken) ) } @@ -40,7 +40,7 @@ class TokenNormalizationTest { Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - normalizeSubToken(token, defaultToken) + normalizeToken(token, defaultToken) ) } @@ -51,7 +51,7 @@ class TokenNormalizationTest { Assert.assertEquals( "Token without letters have whitespaces replaced with underscores", expectedToken, - normalizeSubToken(token, defaultToken) + normalizeToken(token, defaultToken) ) } diff --git a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt index 5e8ff667..80691b9e 100644 --- a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt +++ b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt @@ -18,7 +18,7 @@ class PrettyNode(override val typeLabel: String, originalToken: String) : Node(o fun toPrettyString(indent: Int = 0, indentSymbol: String = "--"): String = with(StringBuilder()) { repeat(indent) { append(indentSymbol) } append(typeLabel) - if (token.final.isNotEmpty()) { + if (token.final().isNotEmpty()) { appendLine(" : $token") } else { appendLine() diff --git a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt index 7760f7e9..2b42100f 100644 --- a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt +++ b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt @@ -38,20 +38,20 @@ class FunctionNameLabelExtractorTest { @Test fun `test FunctionNameProblem hides function name node token with METHOD_NAME`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.nameNode?.token?.final) + assertEquals("METHOD_NAME", functionInfo.nameNode?.token?.final()) } @Test fun `test FunctionNameProblem hides function root token with METHOD_NAME if it is the name node`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.root.token.final) + assertEquals("METHOD_NAME", functionInfo.root.token.final()) } @Test fun `test function name problem should hide recursive call tokens with SELF`() { FunctionNameLabelExtractor.process(functionInfo) val recursiveCallNode = functionInfo.root.children.firstOrNull()?.children?.firstOrNull() - assertEquals("SELF", recursiveCallNode?.token?.final) + assertEquals("SELF", recursiveCallNode?.token?.final()) } companion object { From bf5914b0b8940fcfb47dd9fbe2aaa9e610a6a17d Mon Sep 17 00:00:00 2001 From: ilya Date: Thu, 11 Nov 2021 21:20:45 +0300 Subject: [PATCH 46/48] docs update --- docs/storages.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/storages.md b/docs/storages.md index ef2ae0bf..4134a543 100644 --- a/docs/storages.md +++ b/docs/storages.md @@ -52,6 +52,24 @@ In this format, each line represents an AST with its [label](label_extractors.md ] ``` +Possible configuration options for Json storage: + +```yaml +storage: + name: json AST + withPaths: true +``` + +_For every saved tree also save its filepath._ + +```yaml +storage: + name: json AST + withRanges: true +``` + +_For each node, store the start and end positions of the code snippet that this node represents._ + ## Path-based representations Path-based representation was introduced by [Alon et al.](https://arxiv.org/abs/1803.09544). From 5f86b14ce51ec974c4aba2718eec6de2be0e30b8 Mon Sep 17 00:00:00 2001 From: Egor Spirin Date: Fri, 12 Nov 2021 12:51:13 +0300 Subject: [PATCH 47/48] Improve storage documentation --- docs/storages.md | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/docs/storages.md b/docs/storages.md index 4134a543..03a05536 100644 --- a/docs/storages.md +++ b/docs/storages.md @@ -37,6 +37,8 @@ Saves each tree with its label in the JSON lines format inspired by the [150k Py ```yaml name: json AST + withPaths: true # can be omitted + withRanges: true # can be omitted ``` In this format, each line represents an AST with its [label](label_extractors.md), path, and all vertices: @@ -50,25 +52,12 @@ In this format, each line represents an AST with its [label](label_extractors.md { "token": "class", "typeLabel": "TypeDeclaration", "children": [2, 3, 4] }, ... ] +} ``` Possible configuration options for Json storage: - -```yaml -storage: - name: json AST - withPaths: true -``` - -_For every saved tree also save its filepath._ - -```yaml -storage: - name: json AST - withRanges: true -``` - -_For each node, store the start and end positions of the code snippet that this node represents._ +1. `withPaths` allows for each tree to save the path to the file where it appears. Default: `false`. +2. `withRanges` allows for each node to save start and end positions in the corresponding source code. Default: `false`. ## Path-based representations From 37ae18560f8e427467992643ace835da90dabbc0 Mon Sep 17 00:00:00 2001 From: Egor Spirin Date: Fri, 12 Nov 2021 12:55:34 +0300 Subject: [PATCH 48/48] Change snippet format --- docs/storages.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/storages.md b/docs/storages.md index 03a05536..2e2c098d 100644 --- a/docs/storages.md +++ b/docs/storages.md @@ -43,7 +43,7 @@ Saves each tree with its label in the JSON lines format inspired by the [150k Py In this format, each line represents an AST with its [label](label_extractors.md), path, and all vertices: -```json +```json lines { "label": "1.java", "path": "src/test/resources/examples/1.java",