-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #194 from JetBrains-Research/token_range
Node range
- Loading branch information
Showing
57 changed files
with
617 additions
and
248 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package astminer.common | ||
|
||
import astminer.common.model.Node | ||
import astminer.common.model.NodeRange | ||
|
||
/**
 * The simplest [Node] implementation.
 *
 * It only stores the values handed to its constructor and narrows the results of the
 * lookup/traversal helpers inherited from [Node] to [SimpleNode].
 *
 * @property typeLabel label of this node's type
 * @property children mutable list of direct children; [removeChildrenOfType] edits it in place
 * @property parent parent node, `null` by default
 * @property range range of the node, `null` by default
 * @param token token passed as-is to the [Node] constructor
 */
class SimpleNode(
    override val typeLabel: String,
    override val children: MutableList<SimpleNode>,
    override val parent: Node? = null,
    override val range: NodeRange? = null,
    token: String?
) : Node(token) {
    /** Removes, in place, every direct child whose type label equals [typeLabel]. */
    override fun removeChildrenOfType(typeLabel: String) {
        children.removeIf { child -> child.typeLabel == typeLabel }
    }

    /** Same as the parent implementation, with every returned element cast to [SimpleNode]. */
    override fun getChildrenOfType(typeLabel: String) =
        super.getChildrenOfType(typeLabel).map { child -> child as SimpleNode }

    /** Same as the parent implementation; yields `null` when the result is not a [SimpleNode]. */
    override fun getChildOfType(typeLabel: String) =
        super.getChildOfType(typeLabel) as? SimpleNode

    /** Pre-order traversal from the parent implementation, with every node cast to [SimpleNode]. */
    override fun preOrder() =
        super.preOrder().map { node -> node as SimpleNode }

    /** Post-order traversal from the parent implementation, with every node cast to [SimpleNode]. */
    override fun postOrder() =
        super.postOrder().map { node -> node as SimpleNode }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package astminer.common | ||
|
||
/** Placeholder used when a token is absent or has no sub-tokens left after normalization. */
const val EMPTY_TOKEN: String = "<E>"

/** Separator placed between sub-tokens when they are joined into a single string. */
const val TOKEN_DELIMITER: String = "|"

/** The empty string; used as the replacement value when characters are stripped from a token. */
const val EMPTY_STRING: String = ""
|
||
/**
 * Breaks a token into sub-tokens following common naming practice, e.g. `camelCase` or `snake_case`.
 *
 * The token is trimmed, cut at every match of the splitting pattern, and each piece is normalized;
 * pieces that end up empty after normalization are dropped.
 * The function was adopted from the original code2vec implementation in order to match their behavior:
 * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
 *
 * @param token token to split
 * @return list of non-empty, normalized sub-tokens (possibly empty)
 * @see normalizeToken
 */
fun splitToSubtokens(token: String): List<String> = token
    .trim()
    .split(splitRegex)
    .mapNotNull { piece -> normalizeToken(piece, EMPTY_STRING).takeIf { it.isNotEmpty() } }
|
||
/**
 * Split points for a token: a lower-to-upper case boundary, an underscore, a single digit,
 * the boundary before the last capital of an upper-case run that is followed by a lower-case letter,
 * or a run of whitespace. Underscores, digits and whitespace are consumed by the split.
 */
private val splitRegex = Regex("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")
|
||
/**
 * Normalizes a token.
 *
 * The token is lower-cased and every match of the clean-up patterns (escaped new line, whitespace
 * pattern, quotes/apostrophes/commas, non-printable characters) is removed. The result is then
 * reduced to ASCII letters only. If no letters remain, the cleaned token with spaces replaced by
 * underscores is used instead, and if that is empty as well, [defaultToken] is returned.
 * The function was adopted from the original code2vec implementation in order to match their behavior:
 * https://github.com/tech-srl/code2vec/blob/master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
 *
 * @param token Token to normalize
 * @param defaultToken Value returned when the token is empty after the normalization process
 * @return the normalized token, or [defaultToken] when nothing is left
 */
fun normalizeToken(token: String, defaultToken: String): String {
    // Clean-up patterns are applied one after another, in this exact order.
    val cleanupPatterns = listOf(newLineReg, whitespaceReg, quotesApostrophesCommasReg, unicodeWeirdCharReg)
    val cleaned = cleanupPatterns.fold(token.lowercase()) { current, pattern ->
        current.replace(pattern, EMPTY_STRING)
    }

    val lettersOnly = cleaned.replace(notALetterReg, EMPTY_STRING)
    if (lettersOnly.isNotEmpty()) return lettersOnly

    // No letters at all: keep the remaining characters, only turning spaces into underscores.
    val underscored = cleaned.replace(" ", "_")
    return if (underscored.isEmpty()) defaultToken else underscored
}
|
||
/** A literal backslash followed by `n`, i.e. an escaped new line inside the token text. */
private val newLineReg = Regex("\\\\n")

/**
 * NOTE(review): this pattern matches the literal text `//` followed by one or more `s` characters,
 * not whitespace (`\\s+`). It appears to mirror the pattern used by the code2vec implementation this
 * file is adopted from, so it is kept as is — confirm before changing it, since doing so alters output.
 */
private val whitespaceReg = Regex("//s+")

/** Double quotes, apostrophes and commas. */
private val quotesApostrophesCommasReg = Regex("[\"',]")

/** Any character outside the `Print` character class. */
private val unicodeWeirdCharReg = Regex("\\P{Print}")

/** Any character that is not an ASCII letter. */
private val notALetterReg = Regex("[^A-Za-z]")
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package astminer.common.model | ||
|
||
import astminer.common.* | ||
|
||
/**
 * Wraps the logic of token processing.
 * It is responsible for normalizing a token or replacing it with technical information.
 * Use `token.original` to access the original token.
 *
 * @property original the token exactly as it was passed in; may be `null`
 */
class Token(val original: String?) {
    /**
     * Technical token that shadows the original token with a mining-pipeline-specific value.
     * For example, for the method name prediction problem
     * a technical `<METHOD_NAME>` token can be set to hide the real method name.
     */
    var technical: String? = null

    /**
     * Original token with normalization applied: its sub-tokens joined with [TOKEN_DELIMITER].
     * Equals [EMPTY_TOKEN] when the original token is `null` or yields no sub-tokens.
     * Computed once, when the instance is created.
     * @see splitToSubtokens
     * @see normalizeToken
     */
    val normalized: String = original
        ?.let { splitToSubtokens(it) }
        ?.takeIf { parts -> parts.isNotEmpty() }
        ?.joinToString(TOKEN_DELIMITER)
        ?: EMPTY_TOKEN

    /**
     * Final representation of the token after normalization and other preprocessing.
     * Returns the technical token if one is assigned, the normalized token otherwise.
     * @see technical
     * @see normalized
     */
    fun final(): String = technical ?: normalized

    /** Same as [final]. */
    override fun toString(): String = final()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.