Skip to content

Commit

Permalink
Merge pull request #194 from JetBrains-Research/token_range
Browse files Browse the repository at this point in the history
Node range
  • Loading branch information
SpirinEgor authored Nov 12, 2021
2 parents 4968a55 + 37ae185 commit 259f730
Show file tree
Hide file tree
Showing 57 changed files with 617 additions and 248 deletions.
9 changes: 8 additions & 1 deletion docs/storages.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,13 @@ Saves each tree with its label in the JSON lines format inspired by the [150k Py

```yaml
name: json AST
withPaths: true # can be omitted
withRanges: true # can be omitted
```

In this format, each line represents an AST with its [label](label_extractors.md), path, and all vertices:

```json
```json lines
{
"label": "1.java",
"path": "src/test/resources/examples/1.java",
Expand All @@ -50,8 +52,13 @@ In this format, each line represents an AST with its [label](label_extractors.md
{ "token": "class", "typeLabel": "TypeDeclaration", "children": [2, 3, 4] },
...
]
}
```

Possible configuration options for the JSON storage:
1. `withPaths`: for each tree, also save the path to the file in which it appears. Default: `false`.
2. `withRanges`: for each node, also save its start and end positions in the corresponding source code. Default: `false`.

## Path-based representations

Path-based representation was introduced by [Alon et al.](https://arxiv.org/abs/1803.09544).
Expand Down
23 changes: 23 additions & 0 deletions src/main/kotlin/astminer/common/SimpleNode.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package astminer.common

import astminer.common.model.Node
import astminer.common.model.NodeRange

/**
 * The simplest concrete [Node]: it only stores the values passed to its constructor.
 *
 * The lookup and traversal methods delegate to [Node] and cast every result to [SimpleNode],
 * so callers get the narrower type back.
 */
class SimpleNode(
    override val typeLabel: String,
    override val children: MutableList<SimpleNode>,
    override val parent: Node? = null,
    override val range: NodeRange? = null,
    token: String?
) : Node(token) {
    /** Same nodes as the base pre-order traversal, each cast to [SimpleNode]. */
    override fun preOrder() =
        super.preOrder().map { visited -> visited as SimpleNode }

    /** Same nodes as the base post-order traversal, each cast to [SimpleNode]. */
    override fun postOrder() =
        super.postOrder().map { visited -> visited as SimpleNode }

    /** Base lookup result for [typeLabel], each element cast to [SimpleNode]. */
    override fun getChildrenOfType(typeLabel: String) =
        super.getChildrenOfType(typeLabel).map { child -> child as SimpleNode }

    /** Base lookup result for [typeLabel], or `null` when it is absent or not a [SimpleNode]. */
    override fun getChildOfType(typeLabel: String) =
        super.getChildOfType(typeLabel) as? SimpleNode

    /** Removes, in place, every direct child whose type label equals [typeLabel]. */
    override fun removeChildrenOfType(typeLabel: String) {
        children.removeIf { child -> child.typeLabel == typeLabel }
    }
}
52 changes: 52 additions & 0 deletions src/main/kotlin/astminer/common/TokenNormalization.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package astminer.common

/** Placeholder used as the normalized token when the original token is missing or yields no subtokens. */
const val EMPTY_TOKEN = "<E>"
/** Separator placed between subtokens when they are joined into one normalized token. */
const val TOKEN_DELIMITER = "|"
/** Empty string; used as the replacement when stripping characters and as the default for empty subtokens. */
const val EMPTY_STRING = ""

/**
 * Breaks [token] into normalized subtokens.
 *
 * The trimmed token is cut at every match of the file's split pattern (e.g. `camelCase` or
 * `snake_case` boundaries), each piece is passed through [normalizeToken] with an empty default,
 * and pieces that normalize to an empty string are dropped.
 *
 * The function was adopted from the original code2vec implementation in order to match their behavior:
 * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
 * @param token Token to split
 * @return Non-empty, normalized subtokens in their original order
 * @see normalizeToken
 */
fun splitToSubtokens(token: String): List<String> =
    token.trim()
        .split(splitRegex)
        .mapNotNull { piece -> normalizeToken(piece, EMPTY_STRING).takeIf { it.isNotEmpty() } }

/**
 * Subtoken boundaries: a lower-case letter followed by an upper-case one, an underscore,
 * any single digit, the position inside an upper-case run just before an upper-case letter
 * that is followed by a lower-case one, or a run of whitespace.
 */
private val splitRegex = Regex("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")

/**
 * Normalizes a single token.
 *
 * The token is lower-cased and everything matched by the file's cleanup patterns is removed
 * (in order: `newLineReg`, `whitespaceReg`, `quotesApostrophesCommasReg`, `unicodeWeirdCharReg`).
 * From that cleaned text only ASCII letters are kept. If no letters remain, the cleaned text
 * with spaces replaced by underscores is used instead; if that is empty too, [defaultToken] is returned.
 *
 * The function was adopted from the original code2vec implementation in order to match their behavior:
 * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
 * @param token Token to normalize
 * @param defaultToken Value returned when nothing is left of the token after normalization
 * @return The normalized token, or [defaultToken] when normalization leaves nothing
 */
fun normalizeToken(token: String, defaultToken: String): String {
    // The patterns are applied one after another, in exactly this order.
    val cleanupPatterns = listOf(newLineReg, whitespaceReg, quotesApostrophesCommasReg, unicodeWeirdCharReg)
    val cleaned = cleanupPatterns.fold(token.lowercase()) { text, pattern ->
        text.replace(pattern, EMPTY_STRING)
    }

    val lettersOnly = cleaned.replace(notALetterReg, EMPTY_STRING)
    if (lettersOnly.isNotEmpty()) return lettersOnly

    // No letters at all: keep the remaining characters, joining space-separated parts with underscores.
    val underscored = cleaned.replace(" ", "_")
    return underscored.ifEmpty { defaultToken }
}

/** A literal backslash followed by `n`, i.e. a new line that is still escaped inside the token text. */
private val newLineReg = Regex("\\\\n")

/**
 * NOTE(review): as written this matches two literal slashes followed by one or more `s`,
 * not whitespace (that would be `\\s+`). The space-to-underscore fallback in `normalizeToken`
 * only has an effect because spaces survive this step, so changing the pattern changes output.
 * Presumably kept to mirror the referenced code2vec implementation - confirm before touching it.
 */
private val whitespaceReg = Regex("//s+")

/** Double quotes, apostrophes and commas. */
private val quotesApostrophesCommasReg = Regex("[\"',]")

/** Any character outside the regex `Print` class. */
private val unicodeWeirdCharReg = Regex("\\P{Print}")

/** Any character that is not an ASCII letter. */
private val notALetterReg = Regex("[^A-Za-z]")
45 changes: 0 additions & 45 deletions src/main/kotlin/astminer/common/TreeUtil.kt

This file was deleted.

2 changes: 1 addition & 1 deletion src/main/kotlin/astminer/common/model/FunctionInfoModel.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ interface NamedTree<T : Node> {
val nameNode: T?
get() = notImplemented("nameNode")
val name: String?
get() = nameNode?.originalToken
get() = nameNode?.token?.original
val root: T
get() = notImplemented("root")
val body: T?
Expand Down
49 changes: 13 additions & 36 deletions src/main/kotlin/astminer/common/model/ParsingModel.kt
Original file line number Diff line number Diff line change
@@ -1,31 +1,22 @@
package astminer.common.model

import astminer.common.EMPTY_TOKEN
import astminer.common.splitToSubtokens
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable
import java.io.File
import java.io.InputStream
import java.util.*

abstract class Node(val originalToken: String?) {
abstract class Node(originalToken: String?) {
abstract val typeLabel: String
abstract val children: List<Node>
abstract val parent: Node?

val normalizedToken: String =
originalToken?.let {
val subtokens = splitToSubtokens(it)
if (subtokens.isEmpty()) EMPTY_TOKEN else subtokens.joinToString(TOKEN_DELIMITER)
} ?: EMPTY_TOKEN

var technicalToken: String? = null

val token: String
get() = technicalToken ?: normalizedToken

abstract val range: NodeRange?
val metadata: MutableMap<String, Any> = HashMap()
val token = Token(originalToken)

fun isLeaf() = children.isEmpty()

override fun toString(): String = "$typeLabel : $token"

fun prettyPrint(indent: Int = 0, indentSymbol: String = "--") {
repeat(indent) { print(indentSymbol) }
println(this)
Expand All @@ -52,30 +43,16 @@ abstract class Node(val originalToken: String?) {

fun postOrderIterator(): Iterator<Node> = postOrder().listIterator()
open fun postOrder(): List<Node> = mutableListOf<Node>().also { doTraversePostOrder(it) }

companion object {
const val TOKEN_DELIMITER = "|"
}
}

/** Node simplest implementation **/
class SimpleNode(
override val typeLabel: String,
override val children: MutableList<SimpleNode>,
override val parent: Node?,
token: String?
) : Node(token) {
override fun removeChildrenOfType(typeLabel: String) {
children.removeIf { it.typeLabel == typeLabel }
}

override fun getChildrenOfType(typeLabel: String) = super.getChildrenOfType(typeLabel).map { it as SimpleNode }
override fun getChildOfType(typeLabel: String) = super.getChildOfType(typeLabel) as? SimpleNode

override fun preOrder() = super.preOrder().map { it as SimpleNode }
override fun postOrder() = super.postOrder().map { it as SimpleNode }
/** Span of a node in the source code, given by its [start] and [end] positions. */
@Serializable
data class NodeRange(val start: Position, val end: Position) {
    /** Renders the range as `[line, column] - [line, column]`. */
    override fun toString(): String {
        val from = "[${start.line}, ${start.column}]"
        val to = "[${end.line}, ${end.column}]"
        return "$from - $to"
    }
}

/**
 * A point in the source code.
 *
 * The fields are serialized under the short names `l` ([line]) and `c` ([column]).
 */
@Serializable
data class Position(
    @SerialName("l") val line: Int,
    @SerialName("c") val column: Int
)

interface Parser<T : Node> {
/**
* Parse input stream into an AST.
Expand Down
37 changes: 37 additions & 0 deletions src/main/kotlin/astminer/common/model/Token.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package astminer.common.model

import astminer.common.*

/**
 * Holds a node's token together with its processed forms.
 *
 * It keeps the raw value ([original]), a normalized version computed once at construction
 * ([normalized]), and an optional pipeline-specific replacement ([technical]).
 * Use [final] to get the representation that should actually be emitted.
 */
class Token(val original: String?) {
    /**
     * Pipeline-specific value that shadows the original token when set.
     * For example, for the method name prediction problem
     * a technical `<METHOD_NAME>` token hides the real method name.
     */
    var technical: String? = null

    /**
     * The original token split into subtokens and joined with [TOKEN_DELIMITER].
     * Falls back to [EMPTY_TOKEN] when there is no original token or it yields no subtokens.
     * @see splitToSubtokens
     * @see normalizeToken
     */
    val normalized: String = original
        ?.let { raw -> splitToSubtokens(raw) }
        ?.takeIf { parts -> parts.isNotEmpty() }
        ?.joinToString(TOKEN_DELIMITER)
        ?: EMPTY_TOKEN

    /**
     * Final representation of the token: the technical token when one has been assigned,
     * the normalized token otherwise.
     * @see technical
     * @see normalized
     */
    fun final(): String = technical ?: normalized

    /** Same as [final]. */
    override fun toString(): String = final()
}
12 changes: 10 additions & 2 deletions src/main/kotlin/astminer/config/StorageConfigs.kt
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,16 @@ class DotAstStorageConfig : StorageConfig() {
*/
@Serializable
@SerialName("json AST")
class JsonAstStorageConfig(private val withPaths: Boolean = false) : StorageConfig() {
override fun createStorage(outputDirectoryPath: String) = JsonAstStorage(outputDirectoryPath, withPaths)
class JsonAstStorageConfig(
private val withPaths: Boolean = false,
private val withRanges: Boolean = false
) : StorageConfig() {
override fun createStorage(outputDirectoryPath: String) =
JsonAstStorage(
outputDirectoryPath,
withPaths,
withRanges
)
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/main/kotlin/astminer/featureextraction/TreeFeature.kt
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ object Tokens : TreeFeature<List<String>> {

private fun findTokens(node: Node, tokensList: MutableList<String>): List<String> {
node.children.forEach { findTokens(it, tokensList) }
tokensList.add(node.token)
tokensList.add(node.token.final())
return tokensList
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/main/kotlin/astminer/filters/CommonFilters.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package astminer.filters

import astminer.common.TOKEN_DELIMITER
import astminer.common.model.*
import astminer.featureextraction.NumberOfNodes

Expand All @@ -23,8 +24,8 @@ class TreeSizeFilter(private val minSize: Int = 0, private val maxSize: Int? = n
* Filter that excludes trees that have more words than [maxWordsNumber] in any token of their node.
*/
class WordsNumberFilter(private val maxWordsNumber: Int) : FunctionFilter, FileFilter {
private fun validateTree(root: Node) =
!root.preOrder().any { node -> node.token.split(Node.TOKEN_DELIMITER).size > maxWordsNumber }
private fun validateTree(root: Node) = root.preOrder()
.none { node -> node.token.final().split(TOKEN_DELIMITER).size > maxWordsNumber }

override fun validate(functionInfo: FunctionInfo<out Node>) = validateTree(functionInfo.root)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ object FunctionNameLabelExtractor : FunctionLabelExtractor {
private const val RECURSIVE_CALL_TOKEN = "SELF"

override fun process(functionInfo: FunctionInfo<out Node>): LabeledResult<out Node>? {
val normalizedName = functionInfo.nameNode?.normalizedToken ?: return null
val normalizedName = functionInfo.nameNode?.token?.normalized ?: return null
functionInfo.root.preOrder().forEach { node ->
if (node.originalToken == functionInfo.nameNode?.originalToken) {
node.technicalToken = RECURSIVE_CALL_TOKEN
if (node.token.original == functionInfo.nameNode?.token?.original) {
node.token.technical = RECURSIVE_CALL_TOKEN
}
}
functionInfo.nameNode?.technicalToken = HIDDEN_METHOD_NAME_TOKEN
functionInfo.nameNode?.token?.technical = HIDDEN_METHOD_NAME_TOKEN
return LabeledResult(functionInfo.root, normalizedName, functionInfo.qualifiedPath)
}
}
Loading

0 comments on commit 259f730

Please sign in to comment.