-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add the initial tokenizer interface and implementation
- Loading branch information
Showing
26 changed files
with
1,429 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
|
||
// Build plugins: `java` is applied here; the Kotlin JVM plugin is only declared
// (pinned to `Kotlin.version`) and, because of `apply false`, is not applied to this project.
// NOTE(review): presumably subprojects apply the Kotlin plugin themselves — confirm against their build files.
plugins { | ||
java | ||
kotlin("jvm") version Kotlin.version apply false | ||
} | ||
|
||
repositories { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
65 changes: 65 additions & 0 deletions
65
kotori/src/main/kotlin/com/github/wanasit/kotori/Dictionary.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package com.github.wanasit.kotori | ||
|
||
import com.github.wanasit.kotori.mecab.MeCabDictionary | ||
|
||
|
||
/**
 * Bundles the data sources a tokenizer reads from a dictionary.
 *
 * @property terms the known terms, addressable by [TermID]
 * @property connection cost table consulted when one term directly follows another
 * @property unknownExtraction optional strategy for text not covered by [terms];
 *   `null` (the default) means no such strategy is configured
 */
class Dictionary<out T : TermEntry>(
    val terms: TermDictionary<T>,
    val connection: ConnectionCost,
    val unknownExtraction: UnknownTermExtractionStrategy? = null
) {
    companion object {

        /**
         * Loads the default dictionary by delegating to [MeCabDictionary.readFromResource].
         */
        @JvmStatic
        fun readDefaultFromResource(): Dictionary<TermEntry> = MeCabDictionary.readFromResource()
    }
}
|
||
/** Identifier of a term inside a [TermDictionary]. */
typealias TermID = Int

/**
 * TermDictionary (単語辞書): the collection of known terms.
 *
 * Iterating yields every `(TermID, entry)` pair. Example entries, written in the
 * order of the [TermEntry] fields (surface form, left ID, right ID, cost):
 * - 木, 1285 (Noun), 1285 (Noun), 7283
 * - 切る, 772 (Verb-ru), 772 (Verb-ru), 7439
 * - きる, 772 (Verb-ru), 772 (Verb-ru), 12499
 * - ...
 */
interface TermDictionary<out T : TermEntry> : Iterable<Pair<TermID, T>> {

    /**
     * Looks up the entry stored under [id]; the nullable return type lets an
     * implementation report that it has no entry for that ID.
     */
    operator fun get(id: TermID): T?
}
|
||
/**
 * A single term that can be placed into the tokenization lattice.
 */
interface TermEntry {

    /** Text of the term; its length is the default span a lattice node covers. */
    val surfaceForm: String

    /** Context ID used as the `toLeftId` argument of [ConnectionCost.lookup] when this term follows another. */
    val leftId: Int

    /** Context ID used as the `fromRightId` argument of [ConnectionCost.lookup] when another term follows this one. */
    val rightId: Int

    /** Cost of the term itself, added on top of the connection cost when a path is scored. */
    val cost: Int
}
|
||
/**
 * ConnectionCost (連接コスト), also called connectivity: the cost of placing one
 * term directly after another, keyed by their context IDs.
 *
 * e.g.
 * - 1285 (Noun) to 1285 (Noun) => 62
 * - 772 (Verb-ru) to 1285 (Noun) => 335
 * - 772 (Verb-ru) to 772 (Verb-ru) => -3713
 */
interface ConnectionCost {

    /**
     * Returns the cost of going from a term whose right ID is [fromRightId] to a
     * term whose left ID is [toLeftId].
     *
     * @return the cost, or `null`; the lattice treats `null` as "these two terms cannot be connected"
     */
    fun lookup(fromRightId: Int, toLeftId: Int): Int?
}
|
||
/**
 * Strategy for producing term candidates for text the dictionary does not know
 * (unknown / out-of-vocabulary terms).
 */
interface UnknownTermExtractionStrategy {

    /**
     * Extracts unknown-term candidates that start at [index] in [text].
     *
     * @param text the full text being tokenized
     * @param index position in [text] at which the candidates start
     * @param forceExtraction when `true`, at least one term is expected to be returned
     * @return the extracted candidates
     */
    fun extractUnknownTerms(text: String, index: Int, forceExtraction: Boolean): Iterable<TermEntry>
}
|
10 changes: 0 additions & 10 deletions
10
kotori/src/main/kotlin/com/github/wanasit/kotori/Library.kt
This file was deleted.
Oops, something went wrong.
21 changes: 21 additions & 0 deletions
21
kotori/src/main/kotlin/com/github/wanasit/kotori/Tokenizer.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package com.github.wanasit.kotori | ||
|
||
import com.github.wanasit.kotori.core.LatticeBasedTokenizer | ||
|
||
/**
 * Splits text into [Token]s.
 */
interface Tokenizer {

    /** Tokenizes [text] and returns the resulting tokens. */
    fun tokenize(text: String): List<Token>

    companion object {

        /**
         * Creates a [LatticeBasedTokenizer] backed by the dictionary returned from
         * [Dictionary.readDefaultFromResource].
         */
        @JvmStatic
        fun createDefaultTokenizer(): Tokenizer =
            LatticeBasedTokenizer(Dictionary.readDefaultFromResource())
    }
}
|
||
/**
 * One piece of tokenized text.
 */
interface Token {

    /** The token's text. */
    val text: String

    /** Position of the token; the lattice-based tokenizer fills this with the term's start index in the input. */
    val position: Int
}
75 changes: 75 additions & 0 deletions
75
kotori/src/main/kotlin/com/github/wanasit/kotori/core/Lattice.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package com.github.wanasit.kotori.core | ||
import com.github.wanasit.kotori.ConnectionCost | ||
import com.github.wanasit.kotori.TermEntry | ||
|
||
/**
 * A term placed in the lattice together with the cheapest path found to reach it.
 *
 * @property location start index of the term in the text
 * @property totalCost accumulated cost of the path that ends with this node
 * @property termEntry the term this node represents
 * @property previousNode predecessor on that path; `null` marks the start of a path
 */
data class LatticeNode(
    val location: Int,
    val totalCost: Int,
    val termEntry: TermEntry,
    val previousNode: LatticeNode?
)
|
||
/**
 * Sentinel node every lattice starts from: located at index 0, zero total cost,
 * an empty term whose IDs and cost are all 0, and no predecessor.
 */
internal val BEGIN_NODE = LatticeNode(
    location = 0,
    totalCost = 0,
    termEntry = object : TermEntry {
        override val surfaceForm = ""
        override val leftId = 0
        override val rightId = 0
        override val cost = 0
    },
    previousNode = null
)
|
||
/**
 * Lattice of term candidates over a text, used to find a minimum-cost path.
 *
 * Nodes are added left to right with [addNode]; each new node is linked to the
 * cheapest already-known node ending where it starts. [close] then picks the
 * cheapest node ending at the end of the text and returns the path leading to it.
 *
 * @param connection cost table used to score (or reject) adjacent terms
 */
class Lattice(
    private val connection: ConnectionCost
) {
    // Nodes grouped by the text index at which they end; index 0 is seeded with BEGIN_NODE.
    private val nodesAtEndIndex: MutableMap<Int, MutableList<LatticeNode>> =
        mutableMapOf(0 to mutableListOf(BEGIN_NODE))

    /**
     * Adds [term] as a node spanning `startIndex until endIndex`.
     *
     * The node is silently dropped when no existing node ends at [startIndex] with
     * a connection to [term] (i.e. the position is unreachable).
     *
     * @param term the term to place
     * @param startIndex index in the text where the term starts
     * @param endIndex index just past the term; defaults to `startIndex` plus the surface form length
     */
    fun addNode(
        term: TermEntry,
        startIndex: Int,
        endIndex: Int = startIndex + term.surfaceForm.length
    ) {
        val (prevNode, prevCost) = findCheapestConnection(startIndex, term.leftId)
            ?: return // No possible connection: ignore the term

        val node = LatticeNode(
            location = startIndex,
            totalCost = prevCost + term.cost,
            termEntry = term,
            previousNode = prevNode
        )
        nodesAtEndIndex.getOrPut(endIndex) { mutableListOf() }.add(node)
    }

    /**
     * Finishes the lattice at [endIndex] and returns the cheapest path reaching it.
     *
     * The end of the text is treated as a term with left ID 0.
     *
     * @return the nodes of the path in text order (the begin node is not included),
     *   or `null` when no node ending at [endIndex] can be connected
     */
    fun close(endIndex: Int): List<LatticeNode>? {
        val (lastNode, _) = findCheapestConnection(endIndex, 0)
            ?: return null // No possible connection: there is no path

        return traversePath(lastNode)
    }

    /**
     * Finds, among the nodes ending at [location], the one with the lowest
     * `totalCost + connection cost` towards a term with the given [leftId].
     *
     * Implemented as an explicit scan rather than `minBy`: `minBy` returned `null`
     * for empty input up to Kotlin 1.3 but throws `NoSuchElementException` since 1.7,
     * so this keeps the "no candidate" case working on every stdlib version.
     * On ties the first candidate wins, as it did with `minBy`.
     *
     * @return the best node paired with the cost including the connection, or `null` if none connects
     */
    private fun findCheapestConnection(location: Int, leftId: Int): Pair<LatticeNode, Int>? {
        var bestNode: LatticeNode? = null
        var bestCost = 0

        for (candidate in nodesAtEndIndex[location].orEmpty()) {
            // A null lookup means the two terms cannot be adjacent.
            val connectionCost = connection.lookup(candidate.termEntry.rightId, leftId) ?: continue
            val newTotalCost = candidate.totalCost + connectionCost
            if (bestNode == null || newTotalCost < bestCost) {
                bestNode = candidate
                bestCost = newTotalCost
            }
        }

        return bestNode?.let { it to bestCost }
    }

    /**
     * Walks the `previousNode` links back from [node] and returns the visited nodes
     * in forward (text) order. The node without a predecessor is left out.
     */
    private fun traversePath(node: LatticeNode): List<LatticeNode> =
        generateSequence(node) { it.previousNode }
            .takeWhile { it.previousNode != null }
            .toList()
            .reversed()
}
71 changes: 71 additions & 0 deletions
71
kotori/src/main/kotlin/com/github/wanasit/kotori/core/LatticeBasedTokenizer.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
package com.github.wanasit.kotori.core | ||
|
||
import com.github.wanasit.kotori.Dictionary | ||
import com.github.wanasit.kotori.TermID | ||
import com.github.wanasit.kotori.Token | ||
import com.github.wanasit.kotori.Tokenizer | ||
import java.lang.IllegalStateException | ||
import kotlin.math.min | ||
|
||
/**
 * [Token] produced by [LatticeBasedTokenizer].
 *
 * @property text the token's text
 * @property position the token's position
 */
class LatticeBasedToken(
    override val text: String,
    override val position: Int
) : Token
|
||
/**
 * [Tokenizer] that builds a [Lattice] of every dictionary term found in the text
 * and returns the minimum-cost path through it.
 *
 * @param dictionary source of terms, connection costs and the optional unknown-term strategy
 */
class LatticeBasedTokenizer(
    private val dictionary: Dictionary<*>
) : Tokenizer {

    // TODO More efficient matching approach
    //  e.g. prefix-tree, FST, or Automaton
    // Term IDs grouped by surface form, in dictionary iteration order.
    private val surfaceFormLookup: Map<String, List<TermID>> =
        dictionary.terms.groupBy({ (_, termEntry) -> termEntry.surfaceForm }, { (termId, _) -> termId })

    // Length of the longest surface form (0 for an empty dictionary); bounds the substring scan.
    private val longestSurfaceForm: Int =
        surfaceFormLookup.keys.fold(0) { longest, surfaceForm -> maxOf(longest, surfaceForm.length) }

    /**
     * Tokenizes [text].
     *
     * @return the tokens of the cheapest path, or an empty list when the lattice
     *   cannot be closed at the end of the text
     * @throws IllegalStateException if a term ID found via the surface-form index
     *   has no entry in the term dictionary
     */
    override fun tokenize(text: String): List<Token> {
        val lattice = Lattice(dictionary.connection)

        for (i in text.indices) {
            val termIDs = findTermsStartingAtIndex(text, i)
            for (termId in termIDs) {
                val termEntry = checkNotNull(dictionary.terms[termId]) {
                    "Term ID $termId is indexed by surface form but missing from the term dictionary"
                }
                lattice.addNode(termEntry, i)
            }

            // Extraction is forced when no dictionary term starts at this index.
            dictionary.unknownExtraction
                ?.extractUnknownTerms(text, i, termIDs.isEmpty())
                ?.forEach { lattice.addNode(it, i) }
        }

        val path = lattice.close(text.length) ?: return emptyList()
        return path.map { LatticeBasedToken(it.termEntry.surfaceForm, it.location) }
    }

    /**
     * Collects the IDs of all terms whose surface form equals a substring of [text]
     * starting at [index], trying lengths from 1 up to the longest known surface form.
     */
    private fun findTermsStartingAtIndex(text: String, index: Int): List<TermID> {
        val lastOffset = minOf(index + longestSurfaceForm, text.length)

        return (index + 1..lastOffset).flatMap { endOffset ->
            surfaceFormLookup[text.substring(index, endOffset)].orEmpty()
        }
    }
}
Oops, something went wrong.