
Node range #194

Merged
merged 52 commits on Nov 12, 2021
Commits
65a489d
gumtree update
illided Oct 23, 2021
4d88dde
normalization and token logic extracted as separate class
illided Oct 24, 2021
b681d84
normalization and token logic extracted as separate class
illided Oct 24, 2021
b6ea814
Merge remote-tracking branch 'origin/token_range' into token_range
illided Oct 24, 2021
2c9e535
renaming and refactoring
illided Oct 24, 2021
c3413c5
code style fixes
illided Oct 24, 2021
b5ab209
init restrictions removed
illided Oct 24, 2021
84badf7
token range added to javaparser
illided Oct 24, 2021
8b006e5
javaparser doc update
illided Oct 24, 2021
32a6917
spoon token position introduced
illided Oct 24, 2021
8d376d9
range now is a Node field + some interface rearrangement
illided Oct 28, 2021
95400d8
test fixes
illided Oct 28, 2021
b858836
simple node default parameters added
illided Oct 28, 2021
4ca400d
node range introduced in ANTLR
illided Oct 28, 2021
02192f5
ANTLR util refactor
illided Oct 28, 2021
8d15d54
spoon bug fixed
illided Oct 28, 2021
8798c06
node range refactor
illided Oct 28, 2021
7badac3
foreign parser update
illided Oct 28, 2021
5523bc7
added nodeRange extraction in tree sitter
illided Oct 28, 2021
abbb4f1
setup.py code style refactor
illided Oct 28, 2021
4194258
node range support in gumtree
illided Oct 28, 2021
dfcb5eb
doc little fix
illided Oct 28, 2021
a4eceee
code style fixes
illided Oct 28, 2021
38e705b
serial name change
illided Oct 30, 2021
734a2a9
new option in json ast storage
illided Oct 30, 2021
da2faf4
option to disable range serialization properly added
illided Oct 30, 2021
d4e60b3
Merge branch 'master' into token_range
illided Nov 2, 2021
c47517d
test fixed
illided Nov 2, 2021
f4bea6e
now it compiles
illided Nov 2, 2021
e2b9750
range support added
illided Nov 2, 2021
70b820e
Merge branch 'gumtree_update' into token_range
illided Nov 3, 2021
3d7ae17
now it compiles
illided Nov 3, 2021
5844f8d
code style fixes
illided Nov 3, 2021
1f4999a
Merge branch 'master' into token_range
illided Nov 4, 2021
beab3d2
position tests added
illided Nov 4, 2021
c1df042
tree sitter bug fix
illided Nov 4, 2021
383425e
unused imports removed
illided Nov 4, 2021
2ba6bef
expected function positions changed
illided Nov 4, 2021
edc24d3
positions in javalang test adjusted
illided Nov 4, 2021
396ab20
doc fix
illided Nov 4, 2021
2186e89
Normalization refactor
illided Nov 9, 2021
8a64df8
EMPTY token updated
illided Nov 9, 2021
5898169
file renamed and redundant map deleted
illided Nov 9, 2021
cbb70f0
code style fixes
illided Nov 9, 2021
f2a96a6
test fixed
illided Nov 9, 2021
64219d2
normalization object deleted
illided Nov 10, 2021
66716c0
removed empty line
illided Nov 10, 2021
bd76ac1
documentation added
illided Nov 10, 2021
95c33e5
doc and normalization fixes
illided Nov 11, 2021
bf5914b
docs update
illided Nov 11, 2021
5f86b14
Improve storage documentation
SpirinEgor Nov 12, 2021
37ae185
Change snippet format
SpirinEgor Nov 12, 2021
9 changes: 8 additions & 1 deletion docs/storages.md
@@ -37,11 +37,13 @@ Saves each tree with its label in the JSON lines format inspired by the [150k Py…

```yaml
name: json AST
withPaths: true # can be omitted
withRanges: true # can be omitted
```

In this format, each line represents an AST with its [label](label_extractors.md), path, and all vertices:

```json
```json lines
{
"label": "1.java",
"path": "src/test/resources/examples/1.java",
@@ -50,8 +52,13 @@ In this format, each line represents an AST with its [label](label_extractors.md…
{ "token": "class", "typeLabel": "TypeDeclaration", "children": [2, 3, 4] },
...
]
}
```

Possible configuration options for the JSON storage:
1. `withPaths`: save, for each tree, the path to the file it was extracted from. Default: `false`.
2. `withRanges`: save, for each node, its start and end positions in the corresponding source code. Default: `false`.

## Path-based representations

Path-based representation was introduced by [Alon et al.](https://arxiv.org/abs/1803.09544).
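For illustration of the new `withRanges` option documented above, a serialized node could carry its source positions alongside the token and type. The `l`/`c` field names follow the `Position` serializer introduced in this PR; the `range` key name and its placement inside the node object are assumptions, since the `JsonAstStorage` changes themselves are not shown in this diff:

```json lines
{ "token": "class", "typeLabel": "TypeDeclaration", "range": { "start": { "l": 1, "c": 0 }, "end": { "l": 5, "c": 1 } }, "children": [2, 3, 4] }
```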
23 changes: 23 additions & 0 deletions src/main/kotlin/astminer/common/SimpleNode.kt
@@ -0,0 +1,23 @@
package astminer.common

import astminer.common.model.Node
import astminer.common.model.NodeRange

/** The simplest [Node] implementation. */
class SimpleNode(
override val typeLabel: String,
override val children: MutableList<SimpleNode>,
override val parent: Node? = null,
override val range: NodeRange? = null,
token: String?
) : Node(token) {
override fun removeChildrenOfType(typeLabel: String) {
children.removeIf { it.typeLabel == typeLabel }
}

override fun getChildrenOfType(typeLabel: String) = super.getChildrenOfType(typeLabel).map { it as SimpleNode }
override fun getChildOfType(typeLabel: String) = super.getChildOfType(typeLabel) as? SimpleNode

override fun preOrder() = super.preOrder().map { it as SimpleNode }
override fun postOrder() = super.postOrder().map { it as SimpleNode }
}
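A minimal sketch of constructing the new `SimpleNode` with an explicit range; the type labels, token, and positions are made up for illustration, and the child's `parent` link is left at its `null` default:

```kotlin
import astminer.common.SimpleNode
import astminer.common.model.NodeRange
import astminer.common.model.Position

fun buildExampleTree(): SimpleNode {
    // Leaf node spanning an identifier; positions are illustrative
    val name = SimpleNode(
        typeLabel = "SimpleName",
        children = mutableListOf(),
        range = NodeRange(Position(line = 1, column = 5), Position(line = 1, column = 8)),
        token = "foo"
    )
    // Root node; parent and range are optional and default to null
    return SimpleNode(
        typeLabel = "MethodDeclaration",
        children = mutableListOf(name),
        token = null
    )
}
```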
52 changes: 52 additions & 0 deletions src/main/kotlin/astminer/common/TokenNormalization.kt
@@ -0,0 +1,52 @@
package astminer.common

const val EMPTY_TOKEN = "<E>"
const val TOKEN_DELIMITER = "|"
const val EMPTY_STRING = ""

/**
* Splits a token into subtokens following commonly used conventions such as `camelCase` or `snake_case`.
* Returns a list of non-empty, normalized subtokens.
* The function was adopted from the original code2vec implementation in order to match their behavior:
* https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
* @see normalizeToken
*/
fun splitToSubtokens(token: String) = token
.trim()
.split(splitRegex)
.map { s -> normalizeToken(s, EMPTY_STRING) }
.filter { it.isNotEmpty() }
.toList()

private val splitRegex = "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+".toRegex()

/**
* Normalizes a token by converting it to lower case and removing new lines,
* whitespace, quotes, and other non-printable characters.
* The function was adopted from the original code2vec implementation in order to match their behavior:
* https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
* @param token Token to normalize
* @param defaultToken If the token is empty after the normalization process, it will be replaced with the default token
*/
fun normalizeToken(token: String, defaultToken: String): String {
val cleanToken = token.lowercase()
.replace(newLineReg, EMPTY_STRING) // escaped new line
.replace(whitespaceReg, EMPTY_STRING) // whitespaces
.replace(quotesApostrophesCommasReg, EMPTY_STRING) // quotes, apostrophes, commas
.replace(unicodeWeirdCharReg, EMPTY_STRING) // unicode weird characters

val stripped = cleanToken.replace(notALetterReg, EMPTY_STRING)

return stripped.ifEmpty {
val carefulStripped = cleanToken.replace(" ", "_")
carefulStripped.ifEmpty {
defaultToken
}
}
}

private val newLineReg = "\\\\n".toRegex()
private val whitespaceReg = "//s+".toRegex()
private val quotesApostrophesCommasReg = "[\"',]".toRegex()
private val unicodeWeirdCharReg = "\\P{Print}".toRegex()
private val notALetterReg = "[^A-Za-z]".toRegex()
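A quick sketch of the expected behavior of these helpers; the expected values in the comments are traced by hand from the regexes above rather than taken from the project's tests:

```kotlin
import astminer.common.EMPTY_TOKEN
import astminer.common.normalizeToken
import astminer.common.splitToSubtokens

fun main() {
    // camelCase is split on case boundaries and each part is normalized to lower case
    println(splitToSubtokens("tryParseJSON"))      // [try, parse, json]

    // underscores and digits also act as separators
    println(splitToSubtokens("read_file2Buffer"))  // [read, file, buffer]

    // quotes, commas, and other non-letters are stripped
    println(normalizeToken("\"Hello, World!\"", EMPTY_TOKEN))  // helloworld

    // if nothing is left after cleaning, the default token is returned
    println(normalizeToken(",", EMPTY_TOKEN))                  // <E>
}
```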
45 changes: 0 additions & 45 deletions src/main/kotlin/astminer/common/TreeUtil.kt

This file was deleted.

2 changes: 1 addition & 1 deletion src/main/kotlin/astminer/common/model/FunctionInfoModel.kt
@@ -20,7 +20,7 @@ interface NamedTree<T : Node> {
val nameNode: T?
get() = notImplemented("nameNode")
val name: String?
get() = nameNode?.originalToken
get() = nameNode?.token?.original
val root: T
get() = notImplemented("root")
val body: T?
49 changes: 13 additions & 36 deletions src/main/kotlin/astminer/common/model/ParsingModel.kt
@@ -1,31 +1,22 @@
package astminer.common.model

import astminer.common.EMPTY_TOKEN
import astminer.common.splitToSubtokens
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable
import java.io.File
import java.io.InputStream
import java.util.*

abstract class Node(val originalToken: String?) {
abstract class Node(originalToken: String?) {
abstract val typeLabel: String
abstract val children: List<Node>
abstract val parent: Node?

val normalizedToken: String =
originalToken?.let {
val subtokens = splitToSubtokens(it)
if (subtokens.isEmpty()) EMPTY_TOKEN else subtokens.joinToString(TOKEN_DELIMITER)
} ?: EMPTY_TOKEN

var technicalToken: String? = null

val token: String
get() = technicalToken ?: normalizedToken

abstract val range: NodeRange?
val metadata: MutableMap<String, Any> = HashMap()
val token = Token(originalToken)

fun isLeaf() = children.isEmpty()

override fun toString(): String = "$typeLabel : $token"

fun prettyPrint(indent: Int = 0, indentSymbol: String = "--") {
repeat(indent) { print(indentSymbol) }
println(this)
@@ -52,30 +43,16 @@ abstract class Node(val originalToken: String?) {

fun postOrderIterator(): Iterator<Node> = postOrder().listIterator()
open fun postOrder(): List<Node> = mutableListOf<Node>().also { doTraversePostOrder(it) }

companion object {
const val TOKEN_DELIMITER = "|"
}
}

/** Node simplest implementation **/
class SimpleNode(
override val typeLabel: String,
override val children: MutableList<SimpleNode>,
override val parent: Node?,
token: String?
) : Node(token) {
override fun removeChildrenOfType(typeLabel: String) {
children.removeIf { it.typeLabel == typeLabel }
}

override fun getChildrenOfType(typeLabel: String) = super.getChildrenOfType(typeLabel).map { it as SimpleNode }
override fun getChildOfType(typeLabel: String) = super.getChildOfType(typeLabel) as? SimpleNode

override fun preOrder() = super.preOrder().map { it as SimpleNode }
override fun postOrder() = super.postOrder().map { it as SimpleNode }
@Serializable
data class NodeRange(val start: Position, val end: Position) {
override fun toString(): String = "[${start.line}, ${start.column}] - [${end.line}, ${end.column}]"
}

@Serializable
data class Position(@SerialName("l") val line: Int, @SerialName("c") val column: Int)

interface Parser<T : Node> {
/**
* Parse input stream into an AST.
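Because `NodeRange` and `Position` are annotated with `@Serializable`, their JSON shape follows directly from the declarations above. A small sketch, assuming the `kotlinx-serialization-json` runtime is available as it is elsewhere in the project:

```kotlin
import astminer.common.model.NodeRange
import astminer.common.model.Position
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json

fun main() {
    // An illustrative range covering lines 1 to 3 of some file
    val range = NodeRange(start = Position(line = 1, column = 0), end = Position(line = 3, column = 1))

    // Human-readable form from NodeRange.toString()
    println(range)                       // [1, 0] - [3, 1]

    // Compact JSON form: Position fields are renamed to "l" and "c" via @SerialName
    println(Json.encodeToString(range))  // {"start":{"l":1,"c":0},"end":{"l":3,"c":1}}
}
```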
37 changes: 37 additions & 0 deletions src/main/kotlin/astminer/common/model/Token.kt
@@ -0,0 +1,37 @@
package astminer.common.model

import astminer.common.*

/**
* Wraps the token-processing logic.
* It is responsible for normalizing the token or replacing it with technical information.
* Use `token.original` to access the original token.
*/
class Token(val original: String?) {
/**
* The technical token is used to shadow the original token with a mining-pipeline-specific value.
* For example, for the method name prediction problem
* we set the technical `<METHOD_NAME>` token to hide the real method name.
*/
var technical: String? = null

/**
* Original token with normalization applied
* @see normalizeToken
*/
val normalized = run {
if (original == null) return@run EMPTY_TOKEN
val subTokens = splitToSubtokens(original)
if (subTokens.isEmpty()) EMPTY_TOKEN else subTokens.joinToString(TOKEN_DELIMITER)
}

/**
* Returns the final representation of the token after normalization and other preprocessing:
* the assigned technical token if it exists, otherwise the normalized token.
* @see technical
* @see normalized
*/
fun final() = technical ?: normalized

override fun toString(): String = final()
}
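A short sketch of the `Token` life cycle described in the comments above; the method name is made up:

```kotlin
import astminer.common.model.Token

fun main() {
    val token = Token("processUserRequest")

    println(token.original)    // processUserRequest
    println(token.normalized)  // process|user|request
    println(token.final())     // process|user|request (no technical token assigned yet)

    // Shadow the real method name, e.g. for the method name prediction pipeline
    token.technical = "<METHOD_NAME>"
    println(token.final())     // <METHOD_NAME>
    println(token)             // <METHOD_NAME>, since toString() delegates to final()
}
```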
12 changes: 10 additions & 2 deletions src/main/kotlin/astminer/config/StorageConfigs.kt
@@ -42,8 +42,16 @@ class DotAstStorageConfig : StorageConfig() {
*/
@Serializable
@SerialName("json AST")
class JsonAstStorageConfig(private val withPaths: Boolean = false) : StorageConfig() {
override fun createStorage(outputDirectoryPath: String) = JsonAstStorage(outputDirectoryPath, withPaths)
class JsonAstStorageConfig(
private val withPaths: Boolean = false,
private val withRanges: Boolean = false
) : StorageConfig() {
override fun createStorage(outputDirectoryPath: String) =
JsonAstStorage(
outputDirectoryPath,
withPaths,
withRanges
)
}

/**
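The same options can be passed programmatically; a minimal sketch, assuming the package name follows the file path and using a made-up output directory:

```kotlin
import astminer.config.JsonAstStorageConfig

fun main() {
    // Equivalent to the YAML snippet in docs/storages.md: enable both optional fields
    val config = JsonAstStorageConfig(withPaths = true, withRanges = true)

    // createStorage comes from StorageConfig; the output path here is hypothetical
    val storage = config.createStorage("output/json-ast")
    println(storage)
}
```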
2 changes: 1 addition & 1 deletion src/main/kotlin/astminer/featureextraction/TreeFeature.kt
@@ -57,7 +57,7 @@ object Tokens : TreeFeature<List<String>> {

private fun findTokens(node: Node, tokensList: MutableList<String>): List<String> {
node.children.forEach { findTokens(it, tokensList) }
tokensList.add(node.token)
tokensList.add(node.token.final())
return tokensList
}
}
5 changes: 3 additions & 2 deletions src/main/kotlin/astminer/filters/CommonFilters.kt
@@ -1,5 +1,6 @@
package astminer.filters

import astminer.common.TOKEN_DELIMITER
import astminer.common.model.*
import astminer.featureextraction.NumberOfNodes

@@ -23,8 +24,8 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n…
* Filter that excludes trees that have more words than [maxWordsNumber] in any token of their node.
*/
class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter {
private fun validateTree(root: Node) =
!root.preOrder().any { node -> node.token.split(Node.TOKEN_DELIMITER).size > maxWordsNumber }
private fun validateTree(root: Node) = root.preOrder()
.none { node -> node.token.final().split(TOKEN_DELIMITER).size > maxWordsNumber }

override fun validate(functionInfo: FunctionInfo<out Node>) = validateTree(functionInfo.root)

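To make the filter's criterion concrete, a hand-traced sketch of the word count it checks; the identifier is made up:

```kotlin
import astminer.common.TOKEN_DELIMITER
import astminer.common.model.Token

fun main() {
    // WordsNumberFilter counts "words" as delimiter-separated parts of the final token
    val token = Token("tryParseJsonString")
    val words = token.final().split(TOKEN_DELIMITER)

    println(token.final())  // try|parse|json|string
    println(words.size)     // 4, so WordsNumberFilter(maxWordsNumber = 3) would reject a tree containing this node
}
```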
@@ -14,13 +14,13 @@ object FunctionNameLabelExtractor : FunctionLabelExtractor {
private const val RECURSIVE_CALL_TOKEN = "SELF"

override fun process(functionInfo: FunctionInfo<out Node>): LabeledResult<out Node>? {
val normalizedName = functionInfo.nameNode?.normalizedToken ?: return null
val normalizedName = functionInfo.nameNode?.token?.normalized ?: return null
functionInfo.root.preOrder().forEach { node ->
if (node.originalToken == functionInfo.nameNode?.originalToken) {
node.technicalToken = RECURSIVE_CALL_TOKEN
if (node.token.original == functionInfo.nameNode?.token?.original) {
node.token.technical = RECURSIVE_CALL_TOKEN
}
}
functionInfo.nameNode?.technicalToken = HIDDEN_METHOD_NAME_TOKEN
functionInfo.nameNode?.token?.technical = HIDDEN_METHOD_NAME_TOKEN
return LabeledResult(functionInfo.root, normalizedName, functionInfo.qualifiedPath)
}
}