diff --git a/docs/storages.md b/docs/storages.md index ef2ae0bf..2e2c098d 100644 --- a/docs/storages.md +++ b/docs/storages.md @@ -37,11 +37,13 @@ Saves each tree with its label in the JSON lines format inspired by the [150k Py ```yaml name: json AST + withPaths: true # can be omitted + withRanges: true # can be omitted ``` In this format, each line represents an AST with its [label](label_extractors.md), path, and all vertices: -```json +```json lines { "label": "1.java", "path": "src/test/resources/examples/1.java", @@ -50,8 +52,13 @@ In this format, each line represents an AST with its [label](label_extractors.md { "token": "class", "typeLabel": "TypeDeclaration", "children": [2, 3, 4] }, ... ] +} ``` +Possible configuration options for Json storage: +1. `withPaths` allows for each tree to save the path to the file where it appears. Default: `false`. +2. `withRanges` allows for each node to save start and end positions in the corresponding source code. Default: `false`. + ## Path-based representations Path-based representation was introduced by [Alon et al.](https://arxiv.org/abs/1803.09544). diff --git a/src/main/kotlin/astminer/common/SimpleNode.kt b/src/main/kotlin/astminer/common/SimpleNode.kt new file mode 100644 index 00000000..48e286f1 --- /dev/null +++ b/src/main/kotlin/astminer/common/SimpleNode.kt @@ -0,0 +1,23 @@ +package astminer.common + +import astminer.common.model.Node +import astminer.common.model.NodeRange + +/** Node simplest implementation **/ +class SimpleNode( + override val typeLabel: String, + override val children: MutableList, + override val parent: Node? = null, + override val range: NodeRange? = null, + token: String? 
+) : Node(token) { + override fun removeChildrenOfType(typeLabel: String) { + children.removeIf { it.typeLabel == typeLabel } + } + + override fun getChildrenOfType(typeLabel: String) = super.getChildrenOfType(typeLabel).map { it as SimpleNode } + override fun getChildOfType(typeLabel: String) = super.getChildOfType(typeLabel) as? SimpleNode + + override fun preOrder() = super.preOrder().map { it as SimpleNode } + override fun postOrder() = super.postOrder().map { it as SimpleNode } +} diff --git a/src/main/kotlin/astminer/common/TokenNormalization.kt b/src/main/kotlin/astminer/common/TokenNormalization.kt new file mode 100644 index 00000000..816cb922 --- /dev/null +++ b/src/main/kotlin/astminer/common/TokenNormalization.kt @@ -0,0 +1,52 @@ +package astminer.common + +const val EMPTY_TOKEN = "" +const val TOKEN_DELIMITER = "|" +const val EMPTY_STRING = "" + +/** + * Splits token into subtokens by commonly used practice, i.e. `camelCase` or `snake_case`. + * Returns a list of not empty, normalized subtokens. + * The function was adopted from the original code2vec implementation in order to match their behavior: + * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + * @see normalizeToken + */ +fun splitToSubtokens(token: String) = token + .trim() + .split(splitRegex) + .map { s -> normalizeToken(s, EMPTY_STRING) } + .filter { it.isNotEmpty() } + .toList() + +private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() + +/** + * Normalize token by conversion to lower case, removing the new line, + * whitespace, quotes, and other weird Unicode characters. 
+ * The function was adopted from the original code2vec implementation in order to match their behavior: + * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java + * @param token Token to normalize + * @param defaultToken If the token is empty after the normalization process, it will be replaced with the default token + */ +fun normalizeToken(token: String, defaultToken: String): String { + val cleanToken = token.lowercase() + .replace(newLineReg, EMPTY_STRING) // escaped new line + .replace(whitespaceReg, EMPTY_STRING) // whitespaces + .replace(quotesApostrophesCommasReg, EMPTY_STRING) // quotes, apostrophes, commas + .replace(unicodeWeirdCharReg, EMPTY_STRING) // unicode weird characters + + val stripped = cleanToken.replace(notALetterReg, EMPTY_STRING) + + return stripped.ifEmpty { + val carefulStripped = cleanToken.replace(" ", "_") + carefulStripped.ifEmpty { + defaultToken + } + } +} + +private val newLineReg = "\\\\n".toRegex() +private val whitespaceReg = "//s+".toRegex() +private val quotesApostrophesCommasReg = "[\"',]".toRegex() +private val unicodeWeirdCharReg = "\\P{Print}".toRegex() +private val notALetterReg = "[^A-Za-z]".toRegex() diff --git a/src/main/kotlin/astminer/common/TreeUtil.kt b/src/main/kotlin/astminer/common/TreeUtil.kt deleted file mode 100644 index 1a53158d..00000000 --- a/src/main/kotlin/astminer/common/TreeUtil.kt +++ /dev/null @@ -1,45 +0,0 @@ -package astminer.common - -const val EMPTY_TOKEN = "EMPTY" - -/** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - -val newLineReg = "\\\\n".toRegex() -val whitespaceReg = "//s+".toRegex() -val quotesApostrophesCommasReg = "[\"',]".toRegex() -val unicodeWeirdCharReg = "\\P{Print}".toRegex() -val notALetterReg = "[^A-Za-z]".toRegex() - -fun 
normalizeToken(token: String, defaultToken: String): String { - val cleanToken = token.lowercase() - .replace(newLineReg, "") // escaped new line - .replace(whitespaceReg, "") // whitespaces - .replace(quotesApostrophesCommasReg, "") // quotes, apostrophes, commas - .replace(unicodeWeirdCharReg, "") // unicode weird characters - - val stripped = cleanToken.replace(notALetterReg, "") - - return stripped.ifEmpty { - val carefulStripped = cleanToken.replace(" ", "_") - carefulStripped.ifEmpty { - defaultToken - } - } -} - -/** - * The function was adopted from the original code2vec implementation in order to match their behavior: - * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java - */ - -val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex() - -fun splitToSubtokens(token: String) = token - .trim() - .split(splitRegex) - .map { s -> normalizeToken(s, "") } - .filter { it.isNotEmpty() } - .toList() diff --git a/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt b/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt index 7b2f4a9e..16b2e2e1 100644 --- a/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt +++ b/src/main/kotlin/astminer/common/model/FunctionInfoModel.kt @@ -20,7 +20,7 @@ interface NamedTree { val nameNode: T? get() = notImplemented("nameNode") val name: String? - get() = nameNode?.originalToken + get() = nameNode?.token?.original val root: T get() = notImplemented("root") val body: T? 
diff --git a/src/main/kotlin/astminer/common/model/ParsingModel.kt b/src/main/kotlin/astminer/common/model/ParsingModel.kt index 5f6c8dfa..1956fbec 100644 --- a/src/main/kotlin/astminer/common/model/ParsingModel.kt +++ b/src/main/kotlin/astminer/common/model/ParsingModel.kt @@ -1,31 +1,22 @@ package astminer.common.model -import astminer.common.EMPTY_TOKEN -import astminer.common.splitToSubtokens +import kotlinx.serialization.SerialName +import kotlinx.serialization.Serializable import java.io.File import java.io.InputStream -import java.util.* -abstract class Node(val originalToken: String?) { +abstract class Node(originalToken: String?) { abstract val typeLabel: String abstract val children: List abstract val parent: Node? - - val normalizedToken: String = - originalToken?.let { - val subtokens = splitToSubtokens(it) - if (subtokens.isEmpty()) EMPTY_TOKEN else subtokens.joinToString(TOKEN_DELIMITER) - } ?: EMPTY_TOKEN - - var technicalToken: String? = null - - val token: String - get() = technicalToken ?: normalizedToken - + abstract val range: NodeRange? val metadata: MutableMap = HashMap() + val token = Token(originalToken) + fun isLeaf() = children.isEmpty() override fun toString(): String = "$typeLabel : $token" + fun prettyPrint(indent: Int = 0, indentSymbol: String = "--") { repeat(indent) { print(indentSymbol) } println(this) @@ -52,30 +43,16 @@ abstract class Node(val originalToken: String?) { fun postOrderIterator(): Iterator = postOrder().listIterator() open fun postOrder(): List = mutableListOf().also { doTraversePostOrder(it) } - - companion object { - const val TOKEN_DELIMITER = "|" - } } -/** Node simplest implementation **/ -class SimpleNode( - override val typeLabel: String, - override val children: MutableList, - override val parent: Node?, - token: String? 
-) : Node(token) { - override fun removeChildrenOfType(typeLabel: String) { - children.removeIf { it.typeLabel == typeLabel } - } - - override fun getChildrenOfType(typeLabel: String) = super.getChildrenOfType(typeLabel).map { it as SimpleNode } - override fun getChildOfType(typeLabel: String) = super.getChildOfType(typeLabel) as? SimpleNode - - override fun preOrder() = super.preOrder().map { it as SimpleNode } - override fun postOrder() = super.postOrder().map { it as SimpleNode } +@Serializable +data class NodeRange(val start: Position, val end: Position) { + override fun toString(): String = "[${start.line}, ${start.column}] - [${end.line}, ${end.column}]" } +@Serializable +data class Position(@SerialName("l") val line: Int, @SerialName("c") val column: Int) + interface Parser { /** * Parse input stream into an AST. diff --git a/src/main/kotlin/astminer/common/model/Token.kt b/src/main/kotlin/astminer/common/model/Token.kt new file mode 100644 index 00000000..fd367bfd --- /dev/null +++ b/src/main/kotlin/astminer/common/model/Token.kt @@ -0,0 +1,37 @@ +package astminer.common.model + +import astminer.common.* + +/** + * Class to wrap logic with token processing. + * It is responsible for token normalization or replacing it with technical information. + * Use `token.original` to access the original token. + */ +class Token(val original: String?) { + /** + * Technical token is used to shadow the original token with a mining-pipeline-specific value. + * For example, for the method name prediction problem + * we want to set a technical placeholder token to hide the real method name. + */ + var technical: String? 
= null + + /** + * Original token with normalization applied + * @see normalizeToken + */ + val normalized = run { + if (original == null) return@run EMPTY_TOKEN + val subTokens = splitToSubtokens(original) + if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER) + } + + /** + * Access to the final representation of the token after normalization and other preprocessing. + * It returns the technical token if one is assigned, or the normalized token otherwise. + * @see technical + * @see normalized + */ + fun final() = technical ?: normalized + + override fun toString(): String = final() +} diff --git a/src/main/kotlin/astminer/config/StorageConfigs.kt b/src/main/kotlin/astminer/config/StorageConfigs.kt index 42b0788b..fb3a3d04 100644 --- a/src/main/kotlin/astminer/config/StorageConfigs.kt +++ b/src/main/kotlin/astminer/config/StorageConfigs.kt @@ -42,8 +42,16 @@ class DotAstStorageConfig : StorageConfig() { */ @Serializable @SerialName("json AST") -class JsonAstStorageConfig(private val withPaths: Boolean = false) : StorageConfig() { - override fun createStorage(outputDirectoryPath: String) = JsonAstStorage(outputDirectoryPath, withPaths) +class JsonAstStorageConfig( + private val withPaths: Boolean = false, + private val withRanges: Boolean = false +) : StorageConfig() { + override fun createStorage(outputDirectoryPath: String) = + JsonAstStorage( + outputDirectoryPath, + withPaths, + withRanges + ) } /** diff --git a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt index 9a76c7fc..4def1751 100644 --- a/src/main/kotlin/astminer/featureextraction/TreeFeature.kt +++ b/src/main/kotlin/astminer/featureextraction/TreeFeature.kt @@ -57,7 +57,7 @@ object Tokens : TreeFeature> { private fun findTokens(node: Node, tokensList: MutableList): List { node.children.forEach { findTokens(it, tokensList) } - tokensList.add(node.token) + tokensList.add(node.token.final()) return tokensList } } 
diff --git a/src/main/kotlin/astminer/filters/CommonFilters.kt b/src/main/kotlin/astminer/filters/CommonFilters.kt index a0f47848..470b25ef 100644 --- a/src/main/kotlin/astminer/filters/CommonFilters.kt +++ b/src/main/kotlin/astminer/filters/CommonFilters.kt @@ -1,5 +1,6 @@ package astminer.filters +import astminer.common.TOKEN_DELIMITER import astminer.common.model.* import astminer.featureextraction.NumberOfNodes @@ -23,8 +24,8 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n * Filter that excludes trees that have more words than [maxWordsNumber] in any token of their node. */ class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter { - private fun validateTree(root: Node) = - !root.preOrder().any { node -> node.token.split(Node.TOKEN_DELIMITER).size > maxWordsNumber } + private fun validateTree(root: Node) = root.preOrder() + .none { node -> node.token.final().split(TOKEN_DELIMITER).size > maxWordsNumber } override fun validate(functionInfo: FunctionInfo) = validateTree(functionInfo.root) diff --git a/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt b/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt index 9a62e645..e5abb201 100644 --- a/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt +++ b/src/main/kotlin/astminer/labelextractor/FunctionNameLabelExtractor.kt @@ -14,13 +14,13 @@ object FunctionNameLabelExtractor : FunctionLabelExtractor { private const val RECURSIVE_CALL_TOKEN = "SELF" override fun process(functionInfo: FunctionInfo): LabeledResult? 
{ - val normalizedName = functionInfo.nameNode?.normalizedToken ?: return null + val normalizedName = functionInfo.nameNode?.token?.normalized ?: return null functionInfo.root.preOrder().forEach { node -> - if (node.originalToken == functionInfo.nameNode?.originalToken) { - node.technicalToken = RECURSIVE_CALL_TOKEN + if (node.token.original == functionInfo.nameNode?.token?.original) { + node.token.technical = RECURSIVE_CALL_TOKEN } } - functionInfo.nameNode?.technicalToken = HIDDEN_METHOD_NAME_TOKEN + functionInfo.nameNode?.token?.technical = HIDDEN_METHOD_NAME_TOKEN return LabeledResult(functionInfo.root, normalizedName, functionInfo.qualifiedPath) } } diff --git a/src/main/kotlin/astminer/parse/ForeignParser.kt b/src/main/kotlin/astminer/parse/ForeignParser.kt index d7ff2df0..48e8f0a4 100644 --- a/src/main/kotlin/astminer/parse/ForeignParser.kt +++ b/src/main/kotlin/astminer/parse/ForeignParser.kt @@ -1,7 +1,8 @@ package astminer.parse +import astminer.common.SimpleNode +import astminer.common.model.NodeRange import astminer.common.model.Parser -import astminer.common.model.SimpleNode import astminer.config.FileExtension import astminer.config.ParserType import kotlinx.serialization.Serializable @@ -23,17 +24,29 @@ import kotlin.io.path.createTempDirectory * { * "token": null, * "nodeType": "i_am_root", - * "children": [1,2] + * "children": [1,2], + * "range" : { + * "start" : { "l" : 0, "c" : 0 }, + * "end" : { "l" : 1, "c" : 4 } + * } * }, * { * "token": "Hello", * "nodeType": "left_child", - * "children": [] + * "children": [], + * "range" : { + * "start" : { "l" : 0, "c": 0 }, + * "end" : { "l": 0, "c": 5 } + * } * }, * { * "token": "World!", * "nodeType": "right_child", - * "children": [] + * "children": [], + * "range" : { + * "start" : { "l" : 1, "c" : 0 }, + * "end" : { "l" : 1, "c" : 6 } + * } * } * ] * } @@ -57,7 +70,14 @@ private fun launchScript(args: List): String { private fun convertFromForeignTree(context: ForeignTree, rootId: Int = 0, parent: SimpleNode? 
= null): SimpleNode { val foreignNode = context.tree[rootId] - val node = SimpleNode(foreignNode.nodeType, mutableListOf(), parent, foreignNode.token) + + val node = SimpleNode( + children = mutableListOf(), + parent = parent, + typeLabel = foreignNode.nodeType, + token = foreignNode.token, + range = foreignNode.range + ) val children = foreignNode.children.map { convertFromForeignTree(context, it, node) } node.children.addAll(children) return node @@ -67,7 +87,12 @@ private fun convertFromForeignTree(context: ForeignTree, rootId: Int = 0, parent private data class ForeignTree(val tree: List) @Serializable -private data class ForeignNode(val token: String?, val nodeType: String, val children: List) +private data class ForeignNode( + val token: String?, + val nodeType: String, + val range: NodeRange? = null, + val children: List +) /** Use this parser to get a tree from external script. * It uses `getTreeFromScript` and `getArguments` functions to generate diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt b/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt index cc0899e3..904d3152 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt +++ b/src/main/kotlin/astminer/parse/antlr/AntlrNode.kt @@ -1,11 +1,13 @@ package astminer.parse.antlr import astminer.common.model.Node +import astminer.common.model.NodeRange class AntlrNode( override val typeLabel: String, override var parent: AntlrNode?, - originalToken: String? + originalToken: String?, + override val range: NodeRange? 
= null ) : Node(originalToken) { override val children: MutableList = mutableListOf() diff --git a/src/main/kotlin/astminer/parse/antlr/compressedTreesUtil.kt b/src/main/kotlin/astminer/parse/antlr/compressedTreesUtil.kt new file mode 100644 index 00000000..1ebec8e4 --- /dev/null +++ b/src/main/kotlin/astminer/parse/antlr/compressedTreesUtil.kt @@ -0,0 +1,23 @@ +package astminer.parse.antlr + +import astminer.common.model.Node + +fun decompressTypeLabel(typeLabel: String) = typeLabel.split("|") + +fun AntlrNode.lastLabel() = decompressTypeLabel(typeLabel).last() + +fun AntlrNode.firstLabel() = decompressTypeLabel(typeLabel).first() + +fun AntlrNode.hasLastLabel(label: String): Boolean = lastLabel() == label + +fun AntlrNode.lastLabelIn(labels: List): Boolean = labels.contains(lastLabel()) + +fun AntlrNode.hasFirstLabel(label: String): Boolean = firstLabel() == label + +fun AntlrNode.firstLabelIn(labels: List): Boolean = labels.contains(firstLabel()) + +fun Node.getTokensFromSubtree(): String = + if (isLeaf()) token.original ?: "" else children.joinToString(separator = "") { it.getTokensFromSubtree() } + +fun AntlrNode.getItOrChildrenOfType(typeLabel: String): List = + if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel) diff --git a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt b/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt similarity index 64% rename from src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt rename to src/main/kotlin/astminer/parse/antlr/conversionUtil.kt index e9aea811..c6c12e6b 100644 --- a/src/main/kotlin/astminer/parse/antlr/AntlrUtil.kt +++ b/src/main/kotlin/astminer/parse/antlr/conversionUtil.kt @@ -1,7 +1,7 @@ package astminer.parse.antlr -import astminer.common.EMPTY_TOKEN -import astminer.common.model.Node +import astminer.common.model.NodeRange +import astminer.common.model.Position import org.antlr.v4.runtime.ParserRuleContext import org.antlr.v4.runtime.Vocabulary import 
org.antlr.v4.runtime.tree.ErrorNode @@ -17,7 +17,7 @@ private fun convertRuleContext( vocabulary: Vocabulary ): AntlrNode { val typeLabel = ruleNames[ruleContext.ruleIndex] - val currentNode = AntlrNode(typeLabel, parent, null) + val currentNode = AntlrNode(typeLabel, parent, null, ruleContext.getNodeRange()) val children: MutableList = ArrayList() ruleContext.children?.forEach { @@ -31,11 +31,32 @@ private fun convertRuleContext( return currentNode } +private fun ParserRuleContext.getNodeRange(): NodeRange? { + if (start == null || stop == null) return null + return NodeRange( + Position(start.line, start.charPositionInLine), + Position(stop.line, stop.charPositionInLine + stop.stopIndex - stop.startIndex) + ) +} + private fun convertTerminal(terminalNode: TerminalNode, parent: AntlrNode?, vocabulary: Vocabulary): AntlrNode = - AntlrNode(vocabulary.getSymbolicName(terminalNode.symbol.type), parent, terminalNode.symbol.text) + AntlrNode( + vocabulary.getSymbolicName(terminalNode.symbol.type), + parent, + terminalNode.symbol.text, + terminalNode.getNodeRange() + ) + +private fun TerminalNode.getNodeRange(): NodeRange? { + if (symbol == null) return null + return NodeRange( + Position(symbol.line, symbol.charPositionInLine), + Position(symbol.line, symbol.charPositionInLine + symbol.stopIndex - symbol.startIndex) + ) +} private fun convertErrorNode(errorNode: ErrorNode, parent: AntlrNode?): AntlrNode = - AntlrNode("Error", parent, errorNode.text) + AntlrNode("Error", parent, errorNode.text, errorNode.getNodeRange()) /** * Remove intermediate nodes that have a single child. 
@@ -58,7 +79,8 @@ fun compressTree(root: AntlrNode): AntlrNode { val compressedNode = AntlrNode( root.typeLabel + "|" + child.typeLabel, root.parent, - child.originalToken + child.token.original, + root.range ) compressedNode.replaceChildren(child.children) compressedNode @@ -67,23 +89,3 @@ fun compressTree(root: AntlrNode): AntlrNode { root } } - -fun decompressTypeLabel(typeLabel: String) = typeLabel.split("|") - -fun AntlrNode.lastLabel() = decompressTypeLabel(typeLabel).last() - -fun AntlrNode.firstLabel() = decompressTypeLabel(typeLabel).first() - -fun AntlrNode.hasLastLabel(label: String): Boolean = lastLabel() == label - -fun AntlrNode.lastLabelIn(labels: List): Boolean = labels.contains(lastLabel()) - -fun AntlrNode.hasFirstLabel(label: String): Boolean = firstLabel() == label - -fun AntlrNode.firstLabelIn(labels: List): Boolean = labels.contains(firstLabel()) - -fun Node.getTokensFromSubtree(): String = - if (isLeaf()) originalToken ?: EMPTY_TOKEN else children.joinToString(separator = "") { it.getTokensFromSubtree() } - -fun AntlrNode.getItOrChildrenOfType(typeLabel: String): List = - if (hasLastLabel(typeLabel)) listOf(this) else this.getChildrenOfType(typeLabel).map { it } diff --git a/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt index a99fa460..def07552 100644 --- a/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/java/AntlrJavaFunctionInfo.kt @@ -23,12 +23,12 @@ class AntlrJavaFunctionInfo(override val root: AntlrNode, override val filePath: override val modifiers: List? = root.parent?.children ?.filter { it.hasFirstLabel(METHOD_MODIFIER) && !it.hasLastLabel(METHOD_ANNOTATION) } - ?.mapNotNull { it.originalToken } + ?.mapNotNull { it.token.original } override val annotations: List? 
= root.parent?.children ?.filter { it.hasLastLabel(METHOD_ANNOTATION) } - ?.mapNotNull { it.getChildOfType(ANNOTATION_NAME)?.originalToken } + ?.mapNotNull { it.getChildOfType(ANNOTATION_NAME)?.token?.original } override val body: AntlrNode? = root.children.find { it.hasFirstLabel(METHOD_BODY_NODE) } @@ -51,7 +51,7 @@ class AntlrJavaFunctionInfo(override val root: AntlrNode, override val filePath: } EnclosingElement( type = enclosingType, - name = enclosingClassNode.getChildOfType(ENCLOSING_NAME_NODE)?.originalToken, + name = enclosingClassNode.getChildOfType(ENCLOSING_NAME_NODE)?.token?.original, root = enclosingClassNode ) } diff --git a/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt b/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt index d5c12555..fedf1b5a 100644 --- a/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/javascript/AntlrJavaScriptElementInfo.kt @@ -30,7 +30,7 @@ abstract class AntlrJavaScriptElementInfo(override val root: AntlrNode, override private fun getEnclosingElementName(enclosingRoot: AntlrNode?): String? 
{ return enclosingRoot?.children?.firstOrNull { it.hasLastLabel(ENCLOSING_ELEMENT_NAME_NODE) - }?.originalToken + }?.token?.original } private fun getEnclosingElementType(enclosingRoot: AntlrNode): EnclosingElementType { @@ -59,8 +59,8 @@ abstract class AntlrJavaScriptElementInfo(override val root: AntlrNode, override .map { it.getChildOfType(PARAMETER_NAME_NODE) ?: it } } return parameterNameNodes.map { - check(it.originalToken != null) { "Parameter name wasn't found" } - FunctionInfoParameter(name = it.originalToken, type = null) + check(it.token.original != null) { "Parameter name wasn't found" } + FunctionInfoParameter(name = it.token.original, type = null) } } diff --git a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt index e7642cd2..45cacf17 100644 --- a/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/php/ANTLRPHPFunctionInfo.kt @@ -15,7 +15,9 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: override val nameNode: AntlrNode? = root.getChildOfType(FUNCTION_NAME) override val enclosingElement: EnclosingElement? = collectEnclosingElement() override val parameters: List? 
= - try { collectParameters() } catch (e: IllegalStateException) { + try { + collectParameters() + } catch (e: IllegalStateException) { logger.warn { e.message } null } @@ -65,18 +67,18 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: val isPassedByReference = parameterNode.getChildOfType(REFERENCE) != null if (parameterNode.hasLastLabel(PARAMETER_NAME)) { - return parameterNode.originalToken ?: error("No name was found for a parameter") + return parameterNode.token.original ?: error("No name was found for a parameter") } val varInit = parameterNode.getItOrChildrenOfType(VAR_DECLARATION).first() - val name = varInit.getItOrChildrenOfType(PARAMETER_NAME).first().originalToken + val name = varInit.getItOrChildrenOfType(PARAMETER_NAME).first().token.original ?: error("No name was found for a parameter") return (if (isPassedByReference) "&" else "") + (if (isSplattedArg) "..." else "") + name } - private fun getElementType(element: AntlrNode): String? = element.getChildOfType(TYPE)?.originalToken + private fun getElementType(element: AntlrNode): String? = element.getChildOfType(TYPE)?.token?.original private fun collectEnclosingElement(): EnclosingElement? { val enclosing = root.findEnclosingElementBy { it.isPossibleEnclosing() } ?: return null @@ -102,12 +104,10 @@ class ANTLRPHPFunctionInfo(override val root: AntlrNode, override val filePath: } } - private fun getEnclosingElementName(enclosing: AntlrNode): String? { - return when { - enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.originalToken - enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.originalToken - else -> error("No type can be associated") - } + private fun getEnclosingElementName(enclosing: AntlrNode): String? 
= when { + enclosing.isFunction() || enclosing.isClass() -> enclosing.getChildOfType(FUNCTION_NAME)?.token?.original + enclosing.isAssignExpression() -> enclosing.children.find { it.hasLastLabel(PARAMETER_NAME) }?.token?.original + else -> error("No type can be associated") } // No check for method because method is a function diff --git a/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt b/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt index 365dec5f..6ccf3f62 100644 --- a/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/antlr/python/AntlrPythonFunctionInfo.kt @@ -38,7 +38,7 @@ class AntlrPythonFunctionInfo(override val root: AntlrNode, override val filePat val parameterHaveNoDefaultOrType = parameterNode.hasLastLabel(PARAMETER_NAME_NODE) val parameterNameNode = if (parameterHaveNoDefaultOrType) parameterNode else parameterNode.getChildOfType(PARAMETER_NAME_NODE) - val parameterName = parameterNameNode?.originalToken + val parameterName = parameterNameNode?.token?.original require(parameterName != null) { "Method name was not found" } val parameterType = parameterNode.getChildOfType(PARAMETER_TYPE_NODE)?.getTokensFromSubtree() @@ -63,7 +63,7 @@ class AntlrPythonFunctionInfo(override val root: AntlrNode, override val filePat EnclosingElementType.Method, EnclosingElementType.Function -> enclosingNode.getChildOfType(FUNCTION_NAME_NODE) else -> error("Enclosing node can only be function or class") - }?.originalToken + }?.token?.original return EnclosingElement( type = type, name = name, diff --git a/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt b/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt index af122b38..0e6a9aed 100644 --- a/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt +++ b/src/main/kotlin/astminer/parse/fuzzy/FuzzyNode.kt @@ -1,6 +1,7 @@ package astminer.parse.fuzzy import astminer.common.model.Node +import astminer.common.model.NodeRange import 
com.google.common.collect.TreeMultiset /** @@ -23,6 +24,8 @@ class FuzzyNode( override val children get() = childrenMultiset.toList() + override val range: NodeRange? = null + fun addChild(node: FuzzyNode) { childrenMultiset.add(node) node.parent = this diff --git a/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt b/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt index 711293b7..72e05617 100644 --- a/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/fuzzy/cpp/FuzzyCppFunctionInfo.kt @@ -24,7 +24,7 @@ class FuzzyCppFunctionInfo(override val root: FuzzyNode, override val filePath: private fun collectNameNode(): FuzzyNode? = root.getChildOfType(METHOD_NAME_NODE) as? FuzzyNode private fun collectReturnType(): String? = - root.getChildOfType(METHOD_RETURN_NODE)?.getChildOfType(METHOD_RETURN_TYPE_NODE)?.originalToken + root.getChildOfType(METHOD_RETURN_NODE)?.getChildOfType(METHOD_RETURN_TYPE_NODE)?.token?.original private fun collectEnclosingClass(): EnclosingElement? { val enclosingClass = findEnclosingClass() ?: return null @@ -40,13 +40,13 @@ class FuzzyCppFunctionInfo(override val root: FuzzyNode, override val filePath: root.findEnclosingElementBy { it.typeLabel == CLASS_DECLARATION_NODE } private fun findEnclosingClassName(enclosingClass: FuzzyNode): String? 
= - enclosingClass.getChildOfType(CLASS_NAME_NODE)?.originalToken + enclosingClass.getChildOfType(CLASS_NAME_NODE)?.token?.original private fun collectParameters(): List { val parameters = root.getChildrenOfType(METHOD_PARAMETER_NODE) return parameters.map { param -> - val type = param.getChildOfType(PARAMETER_TYPE_NODE)?.originalToken - val name = param.getChildOfType(PARAMETER_NAME_NODE)?.originalToken ?: "" + val type = param.getChildOfType(PARAMETER_TYPE_NODE)?.token?.original + val name = param.getChildOfType(PARAMETER_NAME_NODE)?.token?.original ?: "" FunctionInfoParameter(name, type) } } diff --git a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt index 9414b87a..599621d5 100644 --- a/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt +++ b/src/main/kotlin/astminer/parse/gumtree/GumTreeNode.kt @@ -1,16 +1,19 @@ package astminer.parse.gumtree import astminer.common.model.Node +import astminer.common.model.NodeRange import com.github.gumtreediff.tree.Tree -class GumTreeNode(val wrappedNode: Tree, override var parent: GumTreeNode? = null) : +class GumTreeNode(val wrappedNode: Tree, posConverter: PositionConverter, override var parent: GumTreeNode? 
= null) : Node(wrappedNode.label) { override val typeLabel: String = wrappedNode.type.name override val children: MutableList by lazy { - wrappedNode.children.map { GumTreeNode(it, this) }.toMutableList() + wrappedNode.children.map { GumTreeNode(it, posConverter, this) }.toMutableList() } + override val range: NodeRange = posConverter.getRange(wrappedNode.pos, wrappedNode.endPos) + override fun removeChildrenOfType(typeLabel: String) { children.removeIf { it.typeLabel == typeLabel } } diff --git a/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt b/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt new file mode 100644 index 00000000..aa50f12b --- /dev/null +++ b/src/main/kotlin/astminer/parse/gumtree/PositionConverter.kt @@ -0,0 +1,23 @@ +package astminer.parse.gumtree + +import astminer.common.model.NodeRange +import astminer.common.model.Position + +class PositionConverter(content: String) { + private val newLineIndexes: List = + content.asSequence().mapIndexedNotNull { index, c -> if (c != '\n') null else index }.toList() + + private fun searchPosition(pos: Int): Position { + val line = newLineIndexes.binarySearch(pos) + if (line >= 0) return searchPosition(pos - 1) + if (line == -1) return Position(1, pos) + val previousNewLine = -line - 2 + return Position(previousNewLine + 2, pos - newLineIndexes[previousNewLine]) + } + + fun getRange(pos: Int, endPos: Int): NodeRange { + val start = searchPosition(pos) + val end = searchPosition(endPos) + return NodeRange(start, end) + } +} diff --git a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt index 0fbd3078..23e479b9 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTFunctionInfo.kt @@ -22,12 +22,12 @@ class GumTreeJavaJDTFunctionInfo( override val modifiers: List = root .children 
.filter { it.typeLabel == "Modifier" } - .mapNotNull { it.originalToken } + .mapNotNull { it.token.original } override val annotations: List = root .children .filter { it.typeLabel == "MarkerAnnotation" } - .mapNotNull { it.children.first().originalToken } + .mapNotNull { it.children.first().token.original } override val isConstructor: Boolean = enclosingElement?.name?.equals(name) ?: false @@ -35,7 +35,7 @@ class GumTreeJavaJDTFunctionInfo( private fun collectEnclosingClass(): EnclosingElement? = extractWithLogger(logger) { val enclosingNode = getEnclosingClassNode(root.parent) ?: return@extractWithLogger null - val name = enclosingNode.getChildOfType(TypeLabels.simpleName)?.originalToken + val name = enclosingNode.getChildOfType(TypeLabels.simpleName)?.token?.original val type = when (enclosingNode.typeLabel) { TypeLabels.typeDeclaration -> EnclosingElementType.Class TypeLabels.enumDeclaration -> EnclosingElementType.Enum @@ -59,10 +59,10 @@ class GumTreeJavaJDTFunctionInfo( } private fun GumTreeNode.getElementName(): String = - getChildOfType(TypeLabels.simpleName)?.originalToken ?: error("No name found for element") + getChildOfType(TypeLabels.simpleName)?.token?.original ?: error("No name found for element") private fun GumTreeNode.getElementType(): String? 
= children.firstOrNull { it.isTypeNode() }?.preOrder() - ?.mapNotNull { if (it.typeLabel == TypeLabels.arrayDimensions) "[]" else it.originalToken } + ?.mapNotNull { if (it.typeLabel == TypeLabels.arrayDimensions) "[]" else it.token.original } ?.joinToString(separator = "") private fun GumTreeNode.isTypeNode() = typeLabel.endsWith("Type") diff --git a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt index ba111f14..9a0eea4c 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/jdt/GumTreeJavaJDTParser.kt @@ -3,12 +3,12 @@ package astminer.parse.gumtree.java.jdt import astminer.common.model.Parser import astminer.parse.ParsingException import astminer.parse.gumtree.GumTreeNode +import astminer.parse.gumtree.PositionConverter import com.github.gumtreediff.client.Run import com.github.gumtreediff.gen.SyntaxException import com.github.gumtreediff.gen.jdt.JdtTreeGenerator import mu.KotlinLogging import java.io.InputStream -import java.io.InputStreamReader private val logger = KotlinLogging.logger("GumTree-JavaParser") @@ -18,8 +18,11 @@ class GumTreeJavaJDTParser : Parser { } override fun parseInputStream(content: InputStream): GumTreeNode = try { - val treeContext = JdtTreeGenerator().generate(InputStreamReader(content)) - GumTreeNode(treeContext.root) + val contentAsString = content.bufferedReader().use { it.readText() } + val converter = PositionConverter(contentAsString) + val reader = contentAsString.reader() + val treeContext = JdtTreeGenerator().generate(reader) + GumTreeNode(treeContext.root, converter) } catch (e: SyntaxException) { throw ParsingException(parserType = "Gumtree", language = "Java", exc = e) } diff --git a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt 
index b9b039a8..117de3df 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlFunctionInfo.kt @@ -22,7 +22,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val override val annotations: List? = extractWithLogger(logger) { root.children.filter { it.typeLabel == ANNOTATION }.map { - val token = it.getChildOfType(NAME)?.originalToken + val token = it.getChildOfType(NAME)?.token?.original checkNotNull(token) { "Annotation doesn't have a name" } } } @@ -30,7 +30,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val override val modifiers: List? = extractWithLogger(logger) { val type = checkNotNull(root.getChildOfType(TYPE)) { "Function doesn't have a type" } type.children.filter { it.typeLabel == MODIFIER }.map { - checkNotNull(it.originalToken) { "Modifier doesn't have a name" } + checkNotNull(it.token.original) { "Modifier doesn't have a name" } } } @@ -43,7 +43,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val private fun assembleParameter(node: GumTreeNode): FunctionInfoParameter { val parameter = checkNotNull(node.getChildOfType(VAR_DECLARATION)) { "No variable found" } - val name = checkNotNull(parameter.getChildOfType(NAME)?.originalToken) { "Parameter name was not found" } + val name = checkNotNull(parameter.getChildOfType(NAME)?.token?.original) { "Parameter name was not found" } val type = parameter.extractType() return FunctionInfoParameter(name, type) } @@ -56,7 +56,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: GumTreeNode, override val } EnclosingElement( type = enclosingType, - name = this.getChildOfType(NAME)?.originalToken ?: return@extractWithLogger null, + name = this.getChildOfType(NAME)?.token?.original ?: return@extractWithLogger null, root = this ) } @@ -67,7 +67,7 @@ class GumTreeJavaSrcmlFunctionInfo(override val root: 
GumTreeNode, override val if (node.typeLabel == ARRAY_BRACKETS) { "[]" } else { - checkNotNull(node.originalToken) { "No type found" } + checkNotNull(node.token.original) { "No type found" } } } } diff --git a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt index 8cd1faea..6ee05e17 100644 --- a/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt +++ b/src/main/kotlin/astminer/parse/gumtree/java/srcML/GumTreeJavaSrcmlParser.kt @@ -2,10 +2,10 @@ package astminer.parse.gumtree.java.srcML import astminer.common.model.Parser import astminer.parse.gumtree.GumTreeNode +import astminer.parse.gumtree.PositionConverter import com.github.gumtreediff.client.Run import com.github.gumtreediff.gen.srcml.SrcmlJavaTreeGenerator import java.io.InputStream -import java.io.InputStreamReader class GumTreeJavaSrcmlParser : Parser { init { @@ -13,7 +13,10 @@ class GumTreeJavaSrcmlParser : Parser { } override fun parseInputStream(content: InputStream): GumTreeNode { - val treeContext = SrcmlJavaTreeGenerator().generate(InputStreamReader(content)) - return GumTreeNode(treeContext.root) + val contentAsString = content.bufferedReader().use { it.readText() } + val converter = PositionConverter(contentAsString) + val reader = contentAsString.reader() + val treeContext = SrcmlJavaTreeGenerator().generate(reader) + return GumTreeNode(treeContext.root, converter) } } diff --git a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt index 76577fb2..39e9f62d 100644 --- a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonFunctionInfo.kt @@ -19,9 +19,9 @@ class GumTreePythonFunctionInfo( override val isConstructor: Boolean = name == CONSTRUCTOR_NAME - override val returnType = if 
(root.children.find { it.originalToken == RETURN_TYPE_OPERATOR } == null) { + override val returnType = if (root.children.find { it.token.original == RETURN_TYPE_OPERATOR } == null) { null - } else root.getChildrenOfType(NAME).lastOrNull()?.preOrder()?.mapNotNull { it.originalToken }?.joinToString("") + } else root.getChildrenOfType(NAME).lastOrNull()?.preOrder()?.mapNotNull { it.token.original }?.joinToString("") override val enclosingElement: EnclosingElement? = extractWithLogger(logger) { val enclosing = root.findEnclosingElementBy { it.typeLabel in possibleEnclosingElements } @@ -32,7 +32,7 @@ class GumTreePythonFunctionInfo( else -> error("No enclosing type can be associated") } EnclosingElement( - name = enclosing.getChildOfType(NAME)?.originalToken, + name = enclosing.getChildOfType(NAME)?.token?.original, type = type, root = enclosing ) @@ -43,14 +43,14 @@ class GumTreePythonFunctionInfo( parameters.children.filter { it.typeLabel == PARAMETER }.map { param -> // Simple case: param has name and possibly default if (param.getChildOfType(TYPE_DEFINITION) == null) { - val name = param.getChildOfType(NAME)?.originalToken + val name = param.getChildOfType(NAME)?.token?.original checkNotNull(name) { "Parameter has no name" } FunctionInfoParameter(name, null) } else { // Complicated case: parameter has some type val variableDef = param.getChildOfType(TYPE_DEFINITION) ?: error("Tree structure was changed while function info collection") - val name = variableDef.getChildOfType(NAME)?.originalToken + val name = variableDef.getChildOfType(NAME)?.token?.original ?: error("Parameter has no name") val type = if (variableDef.children.size > 1) variableDef.children[1].getTokensFromSubtree() else null FunctionInfoParameter(name, type) diff --git a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt index 9e5f3cc6..dc998592 100644 --- 
a/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt +++ b/src/main/kotlin/astminer/parse/gumtree/python/GumTreePythonParser.kt @@ -4,11 +4,11 @@ import astminer.common.model.Parser import astminer.common.model.ParserNotInstalledException import astminer.parse.ParsingException import astminer.parse.gumtree.GumTreeNode +import astminer.parse.gumtree.PositionConverter import com.github.gumtreediff.client.Run import com.github.gumtreediff.gen.python.PythonTreeGenerator import java.io.IOException import java.io.InputStream -import java.io.InputStreamReader class GumTreePythonParser : Parser { init { @@ -16,8 +16,11 @@ class GumTreePythonParser : Parser { } override fun parseInputStream(content: InputStream): GumTreeNode = try { - val context = PythonTreeGenerator().generate(InputStreamReader(content)) - GumTreeNode(context.root) + val contentAsString = content.bufferedReader().use { it.readText() } + val converter = PositionConverter(contentAsString) + val reader = contentAsString.reader() + val context = PythonTreeGenerator().generate(reader) + GumTreeNode(context.root, converter) } catch (e: RuntimeException) { throw ParsingException("GumTree", "Python", e) } catch (e: IOException) { diff --git a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt index 2ab68741..a72b0a41 100644 --- a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionInfo.kt @@ -1,5 +1,6 @@ package astminer.parse.javalang +import astminer.common.SimpleNode import astminer.common.model.* import astminer.parse.findEnclosingElementBy import mu.KotlinLogging @@ -14,14 +15,14 @@ class JavaLangFunctionInfo(override val root: SimpleNode, override val filePath: override val annotations: List? 
= extractWithLogger(logger) { val annotations = root.getChildOfType(ANNOTATIONS) ?: return@extractWithLogger listOf() annotations.children - .map { it.getChildOfType(NAME)?.originalToken } + .map { it.getChildOfType(NAME)?.token?.original } .map { checkNotNull(it) { "No name for annotation found" } } } override val modifiers: List? = extractWithLogger(logger) { val modifiers = root.getChildOfType(MODIFIERS) ?: return@extractWithLogger listOf() modifiers.children - .map { it.originalToken } + .map { it.token.original } .map { checkNotNull(it) { "No name for modifier found" } } } @@ -30,7 +31,7 @@ class JavaLangFunctionInfo(override val root: SimpleNode, override val filePath: parameters.children.map { parameter -> val type = parameter.children.find { it.typeLabel in possibleTypes }?.extractType() checkNotNull(type) { "Can't extract parameter type" } - val name = parameter.children.find { it.typeLabel == NAME }?.originalToken + val name = parameter.children.find { it.typeLabel == NAME }?.token?.original checkNotNull(name) { "Can't find parameter name" } return@map FunctionInfoParameter(name, type) } @@ -46,14 +47,14 @@ class JavaLangFunctionInfo(override val root: SimpleNode, override val filePath: ENUM_DECLARATION -> EnclosingElementType.Enum else -> error("No type can be associated with enclosing node type label") } - val name = enclosingNode.getChildOfType(NAME)?.originalToken + val name = enclosingNode.getChildOfType(NAME)?.token?.original EnclosingElement(type, name, enclosingNode) } override val isConstructor: Boolean = false private fun SimpleNode.extractType(): String = this.preOrder() - .mapNotNull { if (it.typeLabel == "dimensions" && it.isLeaf()) "[]" else it.originalToken } + .mapNotNull { if (it.typeLabel == "dimensions" && it.isLeaf()) "[]" else it.token.original } .joinToString(separator = "") companion object { diff --git a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt 
b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt index 9a45c241..f739f14a 100644 --- a/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt +++ b/src/main/kotlin/astminer/parse/javalang/JavaLangFunctionSplitter.kt @@ -1,7 +1,7 @@ package astminer.parse.javalang +import astminer.common.SimpleNode import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode import astminer.common.model.TreeFunctionSplitter class JavaLangFunctionSplitter : TreeFunctionSplitter { diff --git a/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt b/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt index 949337c0..61ce478a 100644 --- a/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt +++ b/src/main/kotlin/astminer/parse/javalang/JavaLangParsingResultFactory.kt @@ -1,6 +1,10 @@ package astminer.parse.javalang -import astminer.common.model.* +import astminer.common.SimpleNode +import astminer.common.model.Node +import astminer.common.model.ParsingResult +import astminer.common.model.ParsingResultFactory +import astminer.common.model.TreeFunctionSplitter import java.io.File object JavaLangParsingResultFactory : ParsingResultFactory { diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt index 4f613bc7..f6af9bc0 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaParserNode.kt @@ -1,6 +1,8 @@ package astminer.parse.javaparser import astminer.common.model.Node +import astminer.common.model.NodeRange +import astminer.common.model.Position import com.github.javaparser.ast.expr.AssignExpr import com.github.javaparser.ast.expr.BinaryExpr import com.github.javaparser.ast.expr.Name @@ -13,8 +15,8 @@ private val logger = KotlinLogging.logger("JavaParser-Node") /** * Representation of JavaParser nodes inside `astminer` * - * 
@property jpNode node from javapParser. JPNode is an alias for Node from javaparser - * @property parent parent of this node. Null if it's a root. + * @param jpNode node from JavaParser. JPNode is an alias for Node from javaparser + * @param parent parent of this node. Null if it's a root. */ class JavaParserNode(jpNode: JPNode, override val parent: JavaParserNode?) : Node(getJavaParserNodeToken(jpNode)) { override val children: MutableList = @@ -31,6 +33,15 @@ class JavaParserNode(jpNode: JPNode, override val parent: JavaParserNode?) : Nod SHORTEN_VALUES.getOrDefault(rawType, rawType) } + override val range: NodeRange? = if (jpNode.hasNoToken()) { null } else { + val start = jpNode.begin.get() + val end = jpNode.end.get() + NodeRange( + Position(start.line, start.column), + Position(end.line, end.column) + ) + } + /** * Returns node type. Composed of `javaClass.simpleName` and * `jpNode.operator` if node is expression. @@ -65,9 +76,10 @@ private fun JPNode.isLeaf(): Boolean = this.childNodes.isEmpty() private fun JPNode.hasNoToken(): Boolean = !this.tokenRange.isPresent private fun getJavaParserNodeToken(jpNode: JPNode): String? { - return when { + val originalToken = when { jpNode is Name -> jpNode.asString() jpNode.isLeaf() -> jpNode.tokenRange.get().toString() else -> null } + return originalToken } diff --git a/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt b/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt index 9d5c5d63..9d754702 100644 --- a/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/javaparser/JavaparserFunctionInfo.kt @@ -27,7 +27,7 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil override val modifiers: List? 
= run { root.children.filter { it.typeLabel == MODIFIER }.map { - val token = it.originalToken + val token = it.token.original if (token == null) { logger.warn { "Modifier for function $name in file $filePath doesn't have a token" } return@run null @@ -38,7 +38,7 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil override val annotations: List? = run { root.children.filter { it.typeLabel in POSSIBLE_ANNOTATION_TYPES }.map { - val token = it.getChildOfType(ANNOTATION_NAME)?.originalToken?.split(".")?.last() + val token = it.getChildOfType(ANNOTATION_NAME)?.token?.original?.split(".")?.last() if (token == null) { logger.warn { "Annotation for function $name in file $filePath doesn't have a token" } return@run null @@ -62,8 +62,8 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil checkNotNull(possibleTypeNode) { "Couldn't find parameter type node" } val typeToken = when (possibleTypeNode.typeLabel) { ARRAY_TYPE -> getParameterType(possibleTypeNode) + ARRAY_BRACKETS - PRIMITIVE_TYPE -> possibleTypeNode.originalToken - CLASS_OR_INTERFACE_TYPE -> possibleTypeNode.getChildOfType(CLASS_NAME)?.originalToken + PRIMITIVE_TYPE -> possibleTypeNode.token.original + CLASS_OR_INTERFACE_TYPE -> possibleTypeNode.getChildOfType(CLASS_NAME)?.token?.original else -> null } checkNotNull(typeToken) { "Couldn't extract parameter type from node" } @@ -71,12 +71,12 @@ class JavaparserFunctionInfo(override val root: JavaParserNode, override val fil } private fun getParameterName(node: JavaParserNode): String { - val name = checkNotNull(node.getChildOfType(PARAMETER_NAME)?.originalToken) { "Couldn't find parameter name" } + val name = checkNotNull(node.getChildOfType(PARAMETER_NAME)?.token?.original) { "Couldn't find parameter name" } return name.replace(ARRAY_BRACKETS_REGEX, "") } private fun JavaParserNode.assembleEnclosingClass(): EnclosingElement? 
= extractWithLogger(logger) { - val name = this.getChildOfType(CLASS_NAME)?.originalToken + val name = this.getChildOfType(CLASS_NAME)?.token?.original val type = when (this.typeLabel) { CLASS_OR_INTERFACE_DECLARATION -> EnclosingElementType.Class ENUM_DECLARATION -> EnclosingElementType.Enum diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt index 3247ab85..8f9b7503 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonJavaFunctionInfo.kt @@ -23,11 +23,11 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: override val annotations: List? = run { root.getChildrenOfType(ANNOTATION_NODE_TYPE).map { - it.getChildOfType(TYPE_REFERENCE)?.originalToken ?: return@run null + it.getChildOfType(TYPE_REFERENCE)?.token?.original ?: return@run null } } - override val returnType: String? = root.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.originalToken + override val returnType: String? = root.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.token?.original override val body: SpoonNode? 
= root.getChildOfType(BLOCK) @@ -37,8 +37,8 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: root.findEnclosingElementBy { it.typeLabel in POSSIBLE_ENCLOSING_ELEMENTS }?.assembleEnclosingClass() private fun assembleParameter(parameterNode: SpoonNode): FunctionInfoParameter { - val type = parameterNode.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.originalToken - val name = parameterNode.originalToken + val type = parameterNode.children.find { it.typeLabel in POSSIBLE_PARAMETER_TYPES }?.token?.original + val name = parameterNode.token.original checkNotNull(name) { "Couldn't find parameter name token" } return FunctionInfoParameter(name, type) } @@ -49,7 +49,7 @@ class SpoonJavaFunctionInfo(override val root: SpoonNode, override val filePath: CLASS_DECLARATION_TYPE -> EnclosingElementType.Class else -> error("Can't find any enclosing type association") } - EnclosingElement(type, this.originalToken, root) + EnclosingElement(type, this.token.original, root) } companion object { diff --git a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt index f05fda6e..b70fc23a 100644 --- a/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt +++ b/src/main/kotlin/astminer/parse/spoon/SpoonNode.kt @@ -1,17 +1,29 @@ package astminer.parse.spoon import astminer.common.model.Node +import astminer.common.model.NodeRange +import astminer.common.model.Position import spoon.reflect.code.* +import spoon.reflect.cu.position.NoSourcePosition import spoon.reflect.declaration.CtElement import spoon.reflect.declaration.CtNamedElement import spoon.reflect.reference.CtReference -class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpoonValue()) { +class SpoonNode(el: CtElement, override val parent: SpoonNode?) 
: Node(el.getSpoonToken()) { // Turning CtImpl -> override val typeLabel = el.javaClass.simpleName.substring(startIndex = 2).dropLast(4) override val children = run { el.directChildren.map { SpoonNode(it, this) } }.toMutableList() + override val range: NodeRange? = if (el.position.compilationUnit.originalSourceCode != null && + el.position !is NoSourcePosition + ) { + NodeRange( + Position(el.position.line, el.position.column), + Position(el.position.endLine, el.position.endColumn) + ) + } else null + override fun removeChildrenOfType(typeLabel: String) { children.removeIf { it.typeLabel == typeLabel } } @@ -26,8 +38,8 @@ class SpoonNode(el: CtElement, override val parent: SpoonNode?) : Node(el.getSpo override fun postOrder(): List = super.postOrder().map { it as SpoonNode } } -private fun CtElement.getSpoonValue(): String? { - return when { +private fun CtElement.getSpoonToken(): String? { + val originalToken = when { this is CtNamedElement -> this.simpleName this is CtVariableAccess<*> -> this.variable.simpleName this is CtInvocation<*> -> this.executable?.simpleName @@ -40,4 +52,5 @@ private fun CtElement.getSpoonValue(): String? 
{ this.directChildren.size == 0 -> this.toString() else -> null } + return originalToken } diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt index b45a6442..97d285d9 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFactory.kt @@ -1,8 +1,8 @@ package astminer.parse.treesitter.java +import astminer.common.SimpleNode import astminer.common.model.ParsingResult import astminer.common.model.ParsingResultFactory -import astminer.common.model.SimpleNode import astminer.common.model.TreeFunctionSplitter import java.io.File diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt index 5c5360d6..b987c4c8 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionInfo.kt @@ -1,6 +1,7 @@ package astminer.parse.treesitter.java import astminer.common.EMPTY_TOKEN +import astminer.common.SimpleNode import astminer.common.model.* import astminer.parse.antlr.getTokensFromSubtree import astminer.parse.findEnclosingElementBy @@ -20,7 +21,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil annotations.children .filter { it.typeLabel in possibleAnnotations } .map { annotation -> annotation.preOrder().filter { it.typeLabel in listOf(NAME, SCOPE_IDENTIFIER, DOT) } } - .map { nameNodes -> nameNodes.map { it.originalToken ?: "" } } + .map { nameNodes -> nameNodes.map { it.token.original ?: "" } } .map { nameNodesWithToken -> nameNodesWithToken.joinToString(separator = "") } } @@ -28,7 +29,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil val modifiers = root.getChildOfType(MODIFIERS) ?: 
return@extractWithLogger listOf() modifiers.children .filter { it.typeLabel in possibleModifiers } - .map { it.originalToken } + .map { it.token.original } .map { checkNotNull(it) { "Modifier without a token" } } } @@ -37,9 +38,9 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil parametersRoot.children.filter { it.typeLabel in possibleParameters }.map { parameter -> val possibleNameNode = parameter.getChildOfType(NAME) val name = if (possibleNameNode != null) { - possibleNameNode.originalToken + possibleNameNode.token.original } else { - parameter.getChildOfType(VARIABLE_DECLARATOR)?.getChildOfType(NAME)?.originalToken + parameter.getChildOfType(VARIABLE_DECLARATOR)?.getChildOfType(NAME)?.token?.original } checkNotNull(name) { "Can't find parameter name" } @@ -65,7 +66,7 @@ class TreeSitterJavaFunctionInfo(override val root: SimpleNode, override val fil override val enclosingElement: EnclosingElement? = extractWithLogger(logger) { val enclosingNode = root.findEnclosingElementBy { it.typeLabel in possible_enclosings } ?: return@extractWithLogger null - val name = enclosingNode.getChildOfType(NAME)?.originalToken + val name = enclosingNode.getChildOfType(NAME)?.token?.original val type = when (enclosingNode.typeLabel) { CLASS_DECLARATION -> EnclosingElementType.Class ENUM_DECLARATION -> EnclosingElementType.Enum diff --git a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt index 13866a64..a4dbb724 100644 --- a/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt +++ b/src/main/kotlin/astminer/parse/treesitter/java/TreeSitterJavaFunctionSplitter.kt @@ -1,7 +1,7 @@ package astminer.parse.treesitter.java +import astminer.common.SimpleNode import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode import astminer.common.model.TreeFunctionSplitter class 
TreeSitterJavaFunctionSplitter : TreeFunctionSplitter { diff --git a/src/main/kotlin/astminer/paths/PathUtil.kt b/src/main/kotlin/astminer/paths/PathUtil.kt index ad8c9cc8..5dd456a5 100644 --- a/src/main/kotlin/astminer/paths/PathUtil.kt +++ b/src/main/kotlin/astminer/paths/PathUtil.kt @@ -2,7 +2,7 @@ package astminer.paths import astminer.common.model.* -fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token }): PathContext { +fun toPathContext(path: ASTPath, getToken: (Node) -> String = { node -> node.token.final() }): PathContext { val startToken = getToken(path.upwardNodes.first()) val endToken = getToken(path.downwardNodes.last()) val astNodes = path.upwardNodes.map { OrientedNodeType(it.typeLabel, Direction.UP) } + diff --git a/src/main/kotlin/astminer/paths/PathWorker.kt b/src/main/kotlin/astminer/paths/PathWorker.kt index a19bd487..221427fa 100644 --- a/src/main/kotlin/astminer/paths/PathWorker.kt +++ b/src/main/kotlin/astminer/paths/PathWorker.kt @@ -49,7 +49,7 @@ class PathWorker { val paths: MutableList = ArrayList() iterator.forEach { currentNode -> if (currentNode.isLeaf()) { - if (currentNode.token.isNotEmpty()) { + if (currentNode.token.final().isNotEmpty()) { currentNode.setPathPieces(listOf(listOf(currentNode))) } } else { diff --git a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt index 9ef41b0b..47f426df 100644 --- a/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/CsvAstStorage.kt @@ -28,7 +28,7 @@ class CsvAstStorage(override val outputDirectoryPath: String) : Storage { override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { for (node in labeledResult.root.preOrder()) { - tokensMap.record(node.token) + tokensMap.record(node.token.final()) nodeTypesMap.record(node.typeLabel) } val writer = astsPrintWriters.getOrPut(holdout) { holdout.resolveHoldout() } @@ -55,7 +55,7 @@ class 
CsvAstStorage(override val outputDirectoryPath: String) : Storage { } internal fun astString(node: Node): String { - return "${tokensMap.getId(node.token)} ${nodeTypesMap.getId(node.typeLabel)}{${ + return "${tokensMap.getId(node.token.final())} ${nodeTypesMap.getId(node.typeLabel)}{${ node.children.joinToString(separator = "", transform = ::astString) }}" } diff --git a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt index 3ce2463e..f2609109 100644 --- a/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt +++ b/src/main/kotlin/astminer/storage/ast/JsonAstStorage.kt @@ -1,9 +1,6 @@ package astminer.storage.ast -import astminer.common.model.DatasetHoldout -import astminer.common.model.LabeledResult -import astminer.common.model.Node -import astminer.common.model.Storage +import astminer.common.model.* import kotlinx.serialization.Serializable import kotlinx.serialization.encodeToString import kotlinx.serialization.json.Json @@ -17,7 +14,11 @@ private typealias Id = Int * Each line in the output file is a single json object that corresponds to one of the labeled trees. * Each tree is flattened and represented as a list of nodes. */ -class JsonAstStorage(override val outputDirectoryPath: String, private val withPaths: Boolean) : Storage { +class JsonAstStorage( + override val outputDirectoryPath: String, + private val withPaths: Boolean, + private val withRanges: Boolean +) : Storage { private val treeFlattener = TreeFlattener() private val datasetWriters = mutableMapOf() @@ -28,13 +29,27 @@ class JsonAstStorage(override val outputDirectoryPath: String, private val withP } @Serializable - private data class LabeledAst(val label: String, val path: String? = null, val ast: List) + private data class LabeledAst( + val label: String, + val path: String? 
= null, + val ast: List + ) @Serializable - private data class OutputNode(val token: String, val typeLabel: String, val children: List) + private data class OutputNode( + val token: String, + val typeLabel: String, + val range: NodeRange? = null, + val children: List + ) private fun TreeFlattener.EnumeratedNode.toOutputNode() = - OutputNode(node.token, node.typeLabel, children.map { it.id }) + OutputNode( + node.token.final(), + node.typeLabel, + if (withRanges) node.range else null, + children.map { it.id } + ) override fun store(labeledResult: LabeledResult, holdout: DatasetHoldout) { val outputNodes = treeFlattener.flatten(labeledResult.root).map { it.toOutputNode() } diff --git a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt index d41ad286..ed0f8a71 100644 --- a/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt +++ b/src/main/kotlin/astminer/storage/path/PathBasedStorage.kt @@ -54,7 +54,7 @@ abstract class PathBasedStorage( return LabeledPathContexts( labeledResult.label, paths.map { astPath -> - toPathContext(astPath) { it.token.replace("\n", "\\n") } + toPathContext(astPath) { it.token.final().replace("\n", "\\n") } } ) } diff --git a/src/main/python/parse/javalang/aw_javalang/ast_generation.py b/src/main/python/parse/javalang/aw_javalang/ast_generation.py index bf79b3b2..cc6ce85b 100644 --- a/src/main/python/parse/javalang/aw_javalang/ast_generation.py +++ b/src/main/python/parse/javalang/aw_javalang/ast_generation.py @@ -6,11 +6,28 @@ DUMMY_NONE_PROCESSING_IN_ITERABLE = True +@dataclass +class Position: + l: int + c: int + + +@dataclass +class Range: + start: Position + end: Position + + +def node_range(start: List[int], end: List[int]): + return Range(Position(start[0], start[1]), Position(end[0], end[1])) + + @dataclass class Node: type: str value: Optional[str] children: List["Node"] + node_range: Range def __str__(self): return self.type + (f" : {self.value}" if self.value 
is not None else "") @@ -38,7 +55,7 @@ def generate_presentable_AST(node: JavaLangNode, show_declined: bool = True) -> if isinstance(value, str): children.append(process_string_attribute(node, attr, value)) elif isinstance(value, list) or isinstance(value, set): - attribute_node = Node(attr, None, []) + attribute_node = Node(attr, None, [], node_range([-1, -1], [-1, -1])) attribute_node.children = process_iterable_attributes(attribute_node, attr, value, show_declined) if not attribute_node.is_leaf(): children.append(attribute_node) @@ -46,7 +63,13 @@ def generate_presentable_AST(node: JavaLangNode, show_declined: bool = True) -> children.append(process_node_attribute(node, value, show_declined)) elif (value is not None or (value is None and not IGNORE_NONE_ATTR)) and show_declined: process_declined_attribute(attr, value) - return Node(generate_node_type(node), None, children) + node_pos = node.position + if node_pos is None: + start = [-1, -1] + else: + start = [node_pos.line, node_pos.column] + return Node(type=generate_node_type(node), value=None, children=children, + node_range=node_range(start, [-1, -1])) def generate_node_type(node: JavaLangNode) -> str: @@ -72,7 +95,7 @@ def get_singular(string: str) -> str: def process_string_attribute(node: Node, attr: str, value: str) -> Node: - return Node(attr, value, []) + return Node(attr, value, [], node_range([-1, -1], [-1, -1])) def process_node_attribute(node: Node, value: JavaLangNode, show_declined: bool) -> Node: diff --git a/src/main/python/parse/javalang/aw_javalang/tree_flattening.py b/src/main/python/parse/javalang/aw_javalang/tree_flattening.py index 39b25846..236aaa35 100644 --- a/src/main/python/parse/javalang/aw_javalang/tree_flattening.py +++ b/src/main/python/parse/javalang/aw_javalang/tree_flattening.py @@ -1,5 +1,5 @@ from typing import Optional, List, Tuple -from aw_javalang.ast_generation import Node +from aw_javalang.ast_generation import Node, Range from dataclasses import dataclass @@ -13,14 
+13,14 @@ class EnumeratedNode: token: Optional[str] nodeType: str children: List[int] - + range: Range class TreeSerializer: def __init__(self): self._current_id = 0 def _enumerate_tree(self, node) -> Tuple[List["EnumeratedNode"], int]: - enumerated_root = EnumeratedNode(node.value, node.type, []) + enumerated_root = EnumeratedNode(node.value, node.type, [], node.node_range) root_id = self._current_id self._current_id += 1 enumerated_tree = [enumerated_root] diff --git a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py index 77bd8303..6b703330 100644 --- a/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py +++ b/src/main/python/parse/tree_sitter/aw_tree_sitter/ast.py @@ -1,7 +1,11 @@ from tree_sitter import TreeCursor from typing import Optional, TypedDict, List -NodeAsDict = TypedDict("NodeAsDict", {"token": Optional[str], "nodeType": str, "children": List[int]}) +Position = TypedDict("Position", {"l": int, "c": int}) +NodeRange = TypedDict("NodeRange", {"start": Position, "end": Position}) +NodeAsDict = TypedDict( + "NodeAsDict", {"token": Optional[str], "nodeType": str, "range": NodeRange, "children": List[int]} +) TreeAsDict = TypedDict("TreeAsDict", {"tree": List[NodeAsDict]}) @@ -13,15 +17,26 @@ def __init__(self, cursor: TreeCursor, file_bytes: bytes): self._cursor = cursor self._file_bytes = file_bytes + def _get_current_node_range(self) -> NodeRange: + node = self._cursor.node + start = node.start_point + end = node.end_point + return { + "start": {"l": start[0] + 1, "c": start[1] + 1}, + "end": {"l": end[0] + 1, "c": end[1] + 1} + } + def _get_current_node_as_dict(self) -> NodeAsDict: node_type = self._cursor.node.type + node_range = self._get_current_node_range() + if len(self._cursor.node.children) == 0: node_value_bytes = self._file_bytes[self._cursor.node.start_byte : self._cursor.node.end_byte] node_value: Optional[str] = node_value_bytes.decode("utf-8") else: node_value = None - 
return {"token": node_value, "nodeType": node_type, "children": []} + return {"token": node_value, "nodeType": node_type, "range": node_range, "children": []} def get_tree_as_dict(self) -> TreeAsDict: depth = 0 diff --git a/src/main/python/parse/tree_sitter/setup.py b/src/main/python/parse/tree_sitter/setup.py index fcf3f1db..12f23983 100644 --- a/src/main/python/parse/tree_sitter/setup.py +++ b/src/main/python/parse/tree_sitter/setup.py @@ -1,16 +1,12 @@ from setuptools import setup setup( - name='tree_sitter_astminer_wrapper', - version='1.0.0', - description='Wrapper for tree sitter python bindings for using with astminer', - packages=['aw_tree_sitter'], - license='MIT', - author='Ilya Utkin', - entry_points={ - 'console_scripts': ["aw_tree_sitter = aw_tree_sitter.main:main"] - }, - install_requires=[ - 'tree_sitter~=0.19.0' - ] -) \ No newline at end of file + name="tree_sitter_astminer_wrapper", + version="1.0.0", + description="Wrapper for tree sitter python bindings for using with astminer", + packages=["aw_tree_sitter"], + license="MIT", + author="Ilya Utkin", + entry_points={"console_scripts": ["aw_tree_sitter = aw_tree_sitter.main:main"]}, + install_requires=["tree_sitter~=0.19.0"], +) diff --git a/src/test/kotlin/astminer/common/DummyNode.kt b/src/test/kotlin/astminer/common/DummyNode.kt index a00ca3ef..de266e0e 100644 --- a/src/test/kotlin/astminer/common/DummyNode.kt +++ b/src/test/kotlin/astminer/common/DummyNode.kt @@ -10,9 +10,11 @@ class DummyNode( override val parent: Node? = null + override val range: NodeRange? 
= null + init { // Tokens may change after normalization, for tests we want tokens to be unchanged - technicalToken = typeLabel + token.technical = typeLabel } override fun removeChildrenOfType(typeLabel: String) { diff --git a/src/test/kotlin/astminer/common/TreeUtilTest.kt b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt similarity index 89% rename from src/test/kotlin/astminer/common/TreeUtilTest.kt rename to src/test/kotlin/astminer/common/TokenNormalizationTest.kt index 7a263f7e..939d2bcc 100644 --- a/src/test/kotlin/astminer/common/TreeUtilTest.kt +++ b/src/test/kotlin/astminer/common/TokenNormalizationTest.kt @@ -3,8 +3,8 @@ package astminer.common import org.junit.Assert import org.junit.Test -class TreeUtilTest { - private val defaultToken = "EMPTY" +class TokenNormalizationTest { + private val defaultToken = EMPTY_TOKEN @Test fun testPreOrder() { @@ -25,10 +25,10 @@ class TreeUtilTest { @Test fun testNormalizeTokenCleaning() { val token = " Token THAT \n contains Whi\"t,es''pace characters!!!and pu.n.c.t.u.a.tion \n" - val expectedToken = "token" + "that" + "contains" + "whitespace" + "characters" + "and" + "punctuation" + val normalizedSubTokens = listOf("token", "that", "contains", "whitespace", "characters", "and", "punctuation") Assert.assertEquals( "All whitespace characters and punctuation should be removed, keeping only letters", - expectedToken, + normalizedSubTokens.joinToString(""), normalizeToken(token, defaultToken) ) } diff --git a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt index d0867ce0..80691b9e 100644 --- a/src/test/kotlin/astminer/featureextraction/PrettyNode.kt +++ b/src/test/kotlin/astminer/featureextraction/PrettyNode.kt @@ -1,6 +1,7 @@ package astminer.featureextraction import astminer.common.model.Node +import astminer.common.model.NodeRange class PrettyNode(override val typeLabel: String, originalToken: String) : Node(originalToken) { override var 
children: MutableList = ArrayList() @@ -10,12 +11,14 @@ class PrettyNode(override val typeLabel: String, originalToken: String) : Node(o field = value } + override val range: NodeRange? = null + fun addChild(node: PrettyNode) = children.add(node) fun toPrettyString(indent: Int = 0, indentSymbol: String = "--"): String = with(StringBuilder()) { repeat(indent) { append(indentSymbol) } append(typeLabel) - if (token.isNotEmpty()) { + if (token.final().isNotEmpty()) { appendLine(" : $token") } else { appendLine() diff --git a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt index c51226ab..2b42100f 100644 --- a/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt +++ b/src/test/kotlin/astminer/labelextractor/FunctionNameLabelExtractorTest.kt @@ -38,20 +38,20 @@ class FunctionNameLabelExtractorTest { @Test fun `test FunctionNameProblem hides function name node token with METHOD_NAME`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.nameNode?.token) + assertEquals("METHOD_NAME", functionInfo.nameNode?.token?.final()) } @Test fun `test FunctionNameProblem hides function root token with METHOD_NAME if it is the name node`() { FunctionNameLabelExtractor.process(functionInfo) - assertEquals("METHOD_NAME", functionInfo.root.token) + assertEquals("METHOD_NAME", functionInfo.root.token.final()) } @Test fun `test function name problem should hide recursive call tokens with SELF`() { FunctionNameLabelExtractor.process(functionInfo) val recursiveCallNode = functionInfo.root.children.firstOrNull()?.children?.firstOrNull() - assertEquals("SELF", recursiveCallNode?.token) + assertEquals("SELF", recursiveCallNode?.token?.final()) } companion object { diff --git a/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt index 
d9039bb3..ef060293 100644 --- a/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/antlr/java/JavaFunctionSplitterTest.kt @@ -7,6 +7,7 @@ import java.io.File import kotlin.test.BeforeTest import kotlin.test.assertEquals import kotlin.test.assertNotNull +import kotlin.test.assertTrue class JavaFunctionSplitterTest { @@ -154,10 +155,40 @@ class JavaFunctionSplitterTest { assertEquals(setOf("Deprecated"), annotations.toSet()) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 val functionSplitter = JavaFunctionSplitter() val parser = JavaParser() + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 19 to 19, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) } } diff --git a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt index 2e015872..4a8aecea 100644 --- a/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javalang/JavaLangFunctionSplitterTest.kt @@ -1,14 +1,15 @@ package astminer.parse.javalang import astminer.checkExecutable +import astminer.common.SimpleNode import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode import org.junit.Assume import org.junit.BeforeClass import org.junit.Test import java.io.File import kotlin.test.assertEquals import kotlin.test.assertNotNull +import 
kotlin.test.assertTrue internal class JavaLangFunctionSplitterTest { @Test @@ -146,12 +147,24 @@ internal class JavaLangFunctionSplitterTest { assertEquals(setOf("Deprecated"), annotations.toSet()) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val expectedStart = it.second + actualStart == expectedStart + } + ) + } + companion object { private const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 private val functionSplitter = JavaLangFunctionSplitter() val parser = JavaLangParser() lateinit var functionInfos: Collection> + val functionLinePositions = listOf(2, 4, 8, 12, 16, 19, 22, 24, 26, 28, 31, 35, 38, 42, 44) @BeforeClass @JvmStatic diff --git a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt index 41b729a8..ee2e55c7 100644 --- a/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/javaparser/JavaparserMethodSplitterTest.kt @@ -133,12 +133,42 @@ internal class JavaparserMethodSplitterTest { testAnnotationsMatches("functionWithModifiersAndAnnotations", setOf("Deprecated")) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { private const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 private val functionSplitter = JavaparserMethodSplitter() val parser = JavaParserParseWrapper() var functionInfos: Collection> = listOf() + val 
functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 19 to 19, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) @BeforeClass @JvmStatic diff --git a/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt b/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt index ef8990ae..9e197f79 100644 --- a/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/spoon/SpoonJavaFunctionSplitterTest.kt @@ -147,10 +147,40 @@ internal class SpoonJavaFunctionSplitterTest { assertTrue(blankFunction.isBlank()) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 val functionSplitter = SpoonJavaFunctionSplitter() val parser = SpoonJavaParser() + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 19 to 19, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) } } diff --git a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt index 0f19631d..1c9dfea1 100644 --- a/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt +++ b/src/test/kotlin/astminer/parse/treesitter/java/TreeSitterJavaMethodSplitterTest.kt @@ -1,9 +1,9 @@ package astminer.parse.treesitter.java import astminer.checkExecutable +import astminer.common.SimpleNode import 
astminer.common.model.EnclosingElementType import astminer.common.model.FunctionInfo -import astminer.common.model.SimpleNode import org.junit.Assume import org.junit.BeforeClass import org.junit.Test @@ -155,12 +155,42 @@ class TreeSitterJavaMethodSplitterTest { testAnnotationsMatches("functionWithModifiersAndAnnotations", setOf("Deprecated")) } + @Test + fun testPositions() { + assertTrue( + functionInfos.mapNotNull { it.root.range }.zip(functionLinePositions).all { + val actualStart = it.first.start.line + val actualEnd = it.first.end.line + val expectedStart = it.second.first + val expectedEnd = it.second.second + (actualStart..actualEnd).intersect(expectedStart..expectedEnd).isNotEmpty() + } + ) + } + companion object { private const val FILE_PATH = "src/test/resources/methodSplitting/testMethodSplitting.java" const val N_FUNCTIONS = 15 private val functionSplitter = TreeSitterJavaFunctionSplitter() val parser = TreeSitterJavaParser() var functionInfos: Collection> = listOf() + val functionLinePositions = listOf( + 2 to 2, + 4 to 6, + 8 to 10, + 12 to 14, + 16 to 16, + 19 to 19, + 22 to 22, + 24 to 24, + 26 to 26, + 28 to 28, + 30 to 31, + 33 to 35, + 37 to 38, + 42 to 42, + 44 to 44 + ) @BeforeClass @JvmStatic