
Commit

Add the initial tokenizer interface and implementation
wanasit committed May 8, 2020
1 parent faa28f9 commit 8a801fd
Showing 26 changed files with 1,429 additions and 27 deletions.
2 changes: 2 additions & 0 deletions build.gradle.kts
@@ -1,5 +1,7 @@

plugins {
java
kotlin("jvm") version Kotlin.version apply false
}

repositories {
11 changes: 8 additions & 3 deletions kotori/build.gradle.kts
@@ -1,14 +1,19 @@
import org.jetbrains.kotlin.gradle.tasks.KotlinCompile

plugins {
`java-library`
kotlin("jvm") version Kotlin.version
kotlin("jvm")
}

repositories {
// Use jcenter for resolving dependencies.
// You can declare any Maven/Ivy/file repository here.
jcenter()
}

// Compile bytecode to Java 8 (the default target is Java 6)
tasks.withType<KotlinCompile> {
kotlinOptions.jvmTarget = "1.8"
}

dependencies {
implementation(Kotlin.Dependencies.stdlib)

65 changes: 65 additions & 0 deletions kotori/src/main/kotlin/com/github/wanasit/kotori/Dictionary.kt
@@ -0,0 +1,65 @@
package com.github.wanasit.kotori

import com.github.wanasit.kotori.mecab.MeCabDictionary


class Dictionary<out T: TermEntry> (
val terms: TermDictionary<T>,
val connection: ConnectionCost,
val unknownExtraction: UnknownTermExtractionStrategy? = null
) {
companion object {

@JvmStatic
fun readDefaultFromResource(): Dictionary<TermEntry> {
return MeCabDictionary.readFromResource()
}
}
}

/**
* TermDictionary (単語辞書)
*
* e.g.
* - 木, 1285 (Noun), 1285 (Noun), 7283
* - 切る, 772 (Verb-ru), 772 (Verb-ru), 7439
* - きる, 772 (Verb-ru), 772 (Verb-ru), 12499
* - ...
*/
typealias TermID = Int
interface TermDictionary<out T: TermEntry> : Iterable<Pair<TermID, T>> {
operator fun get(id: TermID): T?
}

interface TermEntry {
val surfaceForm: String
val leftId: Int
val rightId: Int
val cost: Int
}

/**
* ConnectionCost (連接コスト) or Connectivity
*
* e.g.
* - 1285 (Noun) to 1285 (Noun) => 62
* - 772 (Verb-ru) to 1285 (Noun) => 335
* - 772 (Verb-ru) to 772 (Verb-ru) => -3713
*/
interface ConnectionCost {
fun lookup(fromRightId: Int, toLeftId: Int): Int?
}

/**
* Unknown or Out-of-Vocabulary terms handling strategy
*/
interface UnknownTermExtractionStrategy {

/**
* Extract unknown terms from `text` at `index`
*
* @param forceExtraction when set, at least one term is expected to be extracted
*/
fun extractUnknownTerms(text: String, index: Int, forceExtraction: Boolean): Iterable<TermEntry>
}
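
The three interfaces above are small enough to implement by hand for testing. A minimal sketch of a toy in-memory dictionary follows; SimpleTermEntry, SimpleTermDictionary, and ZeroConnectionCost are hypothetical names used only for illustration and are not part of this commit.

// A hypothetical in-memory TermEntry, for illustration only.
data class SimpleTermEntry(
        override val surfaceForm: String,
        override val leftId: Int,
        override val rightId: Int,
        override val cost: Int) : TermEntry

// A hypothetical TermDictionary backed by a plain list; the list index doubles as the TermID.
class SimpleTermDictionary(private val entries: List<SimpleTermEntry>) : TermDictionary<SimpleTermEntry> {
    override fun get(id: TermID): SimpleTermEntry? = entries.getOrNull(id)
    override fun iterator(): Iterator<Pair<TermID, SimpleTermEntry>> =
            entries.mapIndexed { index, entry -> index to entry }.iterator()
}

// A hypothetical ConnectionCost that allows every connection at cost zero.
object ZeroConnectionCost : ConnectionCost {
    override fun lookup(fromRightId: Int, toLeftId: Int): Int? = 0
}

// A toy dictionary with three terms and no unknown-term handling.
val toyDictionary = Dictionary(
        terms = SimpleTermDictionary(listOf(
                SimpleTermEntry("木", 0, 0, 100),
                SimpleTermEntry("を", 0, 0, 50),
                SimpleTermEntry("切る", 0, 0, 100))),
        connection = ZeroConnectionCost)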

10 changes: 0 additions & 10 deletions kotori/src/main/kotlin/com/github/wanasit/kotori/Library.kt

This file was deleted.

21 changes: 21 additions & 0 deletions kotori/src/main/kotlin/com/github/wanasit/kotori/Tokenizer.kt
@@ -0,0 +1,21 @@
package com.github.wanasit.kotori

import com.github.wanasit.kotori.core.LatticeBasedTokenizer

interface Tokenizer {

fun tokenize(text: String): List<Token>

companion object {
@JvmStatic
fun createDefaultTokenizer(): Tokenizer {
val defaultDictionary = Dictionary.readDefaultFromResource()
return LatticeBasedTokenizer(defaultDictionary)
}
}
}

interface Token {
val text: String
val position: Int
}
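
A quick usage sketch of the interface above, assuming the default dictionary resource loaded by Dictionary.readDefaultFromResource() is available on the classpath; the sample sentence reuses the terms from the Dictionary doc comment and is only illustrative.

fun main() {
    val tokenizer = Tokenizer.createDefaultTokenizer()
    val tokens = tokenizer.tokenize("木を切る")
    tokens.forEach { println("${it.position}: ${it.text}") }
}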
75 changes: 75 additions & 0 deletions kotori/src/main/kotlin/com/github/wanasit/kotori/core/Lattice.kt
@@ -0,0 +1,75 @@
package com.github.wanasit.kotori.core
import com.github.wanasit.kotori.ConnectionCost
import com.github.wanasit.kotori.TermEntry

data class LatticeNode(
val location: Int,
val totalCost: Int,
val termEntry: TermEntry,
val previousNode: LatticeNode?
)

internal val BEGIN_NODE = LatticeNode(0, 0, object : TermEntry {
override val surfaceForm = ""
override val leftId = 0
override val rightId = 0
override val cost = 0
}, null)

class Lattice(
private val connection: ConnectionCost
) {
private val nodesAtEndIndex: MutableMap<Int, MutableList<LatticeNode>> = mutableMapOf()
init {
nodesAtEndIndex[0] = mutableListOf(BEGIN_NODE)
}

fun addNode(term: TermEntry, startIndex: Int,
endIndex: Int = startIndex + term.surfaceForm.length) {

val (prevNode, prevCost) = findPossibleConnections(startIndex, term.leftId)
.minBy { (_, prevCost) -> prevCost }
?: return // If there is no possible connection, just ignore the term

nodesAtEndIndex.getOrPut(endIndex, { mutableListOf() })
.add(LatticeNode(
location = startIndex,
totalCost = prevCost + term.cost,
termEntry = term,
previousNode = prevNode))
}

fun close(endIndex: Int) : List<LatticeNode>? {

val (prevNode, _) = findPossibleConnections(endIndex, 0)
        .minBy { (_, prevCost) -> prevCost }
        ?: return null // If there is no possible connection, the lattice has no complete path

return traversePath(prevNode)
}

private fun findPossibleConnections(location: Int, leftId: Int): Iterable<Pair<LatticeNode, Int>> {

val results: MutableList<Pair<LatticeNode, Int>> = mutableListOf()
nodesAtEndIndex[location]?.forEach {
val connectionCost = connection.lookup(it.termEntry.rightId, leftId)
if (connectionCost != null) {
val newTotalCost = it.totalCost + connectionCost
results.add(it to newTotalCost)
}
}

return results
}

private fun traversePath(node: LatticeNode): List<LatticeNode> {
val path = mutableListOf<LatticeNode>()
var currentNode: LatticeNode = node
while (currentNode.previousNode != null) {
path.add(currentNode)
currentNode = currentNode.previousNode!!
}
return path.reversed()
}
}
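
The lattice is essentially a Viterbi-style search structure: addNode links a term to the cheapest node ending where the term starts, and close picks the cheapest node ending at the given index and walks back through previousNode. A hand-driven sketch, reusing the hypothetical SimpleTermEntry and ZeroConnectionCost from the earlier Dictionary example:

val lattice = Lattice(ZeroConnectionCost)
// "木を" split as 木 covering [0, 1) and を covering [1, 2)
lattice.addNode(SimpleTermEntry("木", 0, 0, 100), startIndex = 0)
lattice.addNode(SimpleTermEntry("を", 0, 0, 50), startIndex = 1)
val path = lattice.close(2)                          // cheapest path ending at index 2
println(path?.map { it.termEntry.surfaceForm })      // expected output: [木, を]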
71 changes: 71 additions & 0 deletions kotori/src/main/kotlin/com/github/wanasit/kotori/core/LatticeBasedTokenizer.kt
@@ -0,0 +1,71 @@
package com.github.wanasit.kotori.core

import com.github.wanasit.kotori.Dictionary
import com.github.wanasit.kotori.TermID
import com.github.wanasit.kotori.Token
import com.github.wanasit.kotori.Tokenizer
import java.lang.IllegalStateException
import kotlin.math.min

class LatticeBasedToken(
override val text: String,
override val position: Int) : Token

class LatticeBasedTokenizer(
private val dictionary: Dictionary<*>
) : Tokenizer {

override fun tokenize(text: String): List<Token> {

val lattice = Lattice(dictionary.connection)

for (i in text.indices) {
val termIDs = findTermsStartingAtIndex(text, i)
termIDs.forEach {
val termEntry = dictionary.terms[it] ?: throw IllegalStateException()
lattice.addNode(termEntry, i)
}

val unknownTerms = dictionary.unknownExtraction
?.extractUnknownTerms(text, i, termIDs.isEmpty())
?: emptyList()

unknownTerms.forEach {
lattice.addNode(it, i)
}
}

val path = lattice.close(text.length)
return path?.map { LatticeBasedToken(it.termEntry.surfaceForm, it.location) } ?: emptyList()
}

// TODO More efficient matching approach
// e.g. prefix-tree, FST, or Automaton
private val surfaceFormLookup: Map<String, List<TermID>>
private val longestSurfaceForm: Int

init {
val lookupTable: MutableMap<String, MutableList<TermID>> = mutableMapOf()
var longestSurfaceForm = 0
dictionary.terms.forEach {(termId, termEntry) ->
longestSurfaceForm = maxOf(longestSurfaceForm, termEntry.surfaceForm.length)
lookupTable.getOrPut(termEntry.surfaceForm, { mutableListOf() })
.add(termId)
}

this.surfaceFormLookup = lookupTable
this.longestSurfaceForm = longestSurfaceForm
}

private fun findTermsStartingAtIndex(text: String, index: Int): List<TermID> {

val result = mutableListOf<TermID>()
val lastOffset = min(index + this.longestSurfaceForm, text.length)

for (endOffset in index + 1..lastOffset) {
result.addAll(surfaceFormLookup[text.substring(index, endOffset)] ?: listOf())
}

return result
}
}
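
Tying the pieces together: the tokenizer above scans each character index, looks up every dictionary surface form that starts there (bounded by the longest surface form), and lets the lattice choose the cheapest segmentation. A usage sketch with the hypothetical toyDictionary built in the earlier Dictionary example:

val tokenizer = LatticeBasedTokenizer(toyDictionary)
val tokens = tokenizer.tokenize("木を切る")
println(tokens.map { "${it.text}@${it.position}" })  // expected output: [木@0, を@1, 切る@2]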
