Add the initial tokenizer interface and implementation

wanasit · May 8, 2020 · 8a801fd · 8a801fd
1 parent faa28f9
commit 8a801fd
Show file tree

Hide file tree

Showing 26 changed files with 1,429 additions and 27 deletions.
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -1,5 +1,7 @@
+
 plugins {
     java
+    kotlin("jvm") version Kotlin.version apply false
 }
 
 repositories {

diff --git a/kotori/build.gradle.kts b/kotori/build.gradle.kts
@@ -1,14 +1,19 @@
+import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+
 plugins {
     `java-library`
-    kotlin("jvm") version Kotlin.version
+    kotlin("jvm")
 }
 
 repositories {
-    // Use jcenter for resolving dependencies.
-    // You can declare any Maven/Ivy/file repository here.
     jcenter()
 }
 
+// Make every Kotlin compile task emit Java 8 bytecode (default is java 6).
+tasks.withType<KotlinCompile> {
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+}
+
 dependencies {
     implementation(Kotlin.Dependencies.stdlib)
 

diff --git a/kotori/src/main/kotlin/com/github/wanasit/kotori/Dictionary.kt b/kotori/src/main/kotlin/com/github/wanasit/kotori/Dictionary.kt
@@ -0,0 +1,65 @@
+package com.github.wanasit.kotori
+
+import com.github.wanasit.kotori.mecab.MeCabDictionary
+
+
+/**
+ * Bundles the data a tokenizer works from: the known terms, the costs of connecting
+ * them, and an optional strategy for text that matches no known term.
+ *
+ * @property terms the known term entries, addressable by [TermID]
+ * @property connection cost table consulted when one term follows another
+ * @property unknownExtraction strategy for out-of-vocabulary text; defaults to `null` (none)
+ */
+class Dictionary<out T : TermEntry>(
+        val terms: TermDictionary<T>,
+        val connection: ConnectionCost,
+        val unknownExtraction: UnknownTermExtractionStrategy? = null
+) {
+    companion object {
+
+        /** Loads the default dictionary by delegating to [MeCabDictionary.readFromResource]. */
+        @JvmStatic
+        fun readDefaultFromResource(): Dictionary<TermEntry> =
+                MeCabDictionary.readFromResource()
+    }
+}
+
+/** Identifier of a single entry inside a [TermDictionary]. */
+typealias TermID = Int
+
+/**
+ * TermDictionary (Japanese: 単語辞書, "word dictionary")
+ *
+ * Each entry is a [TermEntry]: a surface form, a left id, a right id and a cost, e.g.
+ * - 木, 1285 (Noun), 1285 (Noun), 7283
+ * - 切る, 772 (Verb-ru), 772 (Verb-ru), 7439
+ * - きる, 772 (Verb-ru), 772 (Verb-ru), 12499
+ * - ...
+ *
+ * Iterating the dictionary yields every entry paired with its [TermID].
+ */
+interface TermDictionary<out T : TermEntry> : Iterable<Pair<TermID, T>> {
+
+    /**
+     * Looks up the entry stored under [id].
+     *
+     * @return the entry, or `null` (presumably when no entry has that id; confirm against implementations)
+     */
+    operator fun get(id: TermID): T?
+}
+
+/**
+ * A single dictionary term together with the numbers used to score it in a lattice.
+ */
+interface TermEntry {
+    // The text this term matches in the input; its length is the default span of the term.
+    val surfaceForm: String
+    // Id matched against the preceding term's rightId when looking up a connection cost.
+    val leftId: Int
+    // Id matched against the following term's leftId when looking up a connection cost.
+    val rightId: Int
+    // Cost of using this term; it is added to the running total of the path that reaches it.
+    val cost: Int
+}
+
+/**
+ * ConnectionCost (Japanese: 連接コスト, "connection cost"), also called connectivity.
+ *
+ * Gives the cost of placing one term directly after another, keyed by the right id
+ * of the earlier term and the left id of the later term, e.g.
+ * - 1285 (Noun) to 1285 (Noun) => 62
+ * - 772 (Verb-ru) to 1285 (Noun) => 335
+ * - 772 (Verb-ru) to 772 (Verb-ru) => -3713
+ */
+interface ConnectionCost {
+    /**
+     * Returns the cost of connecting a term whose right id is [fromRightId] to a term
+     * whose left id is [toLeftId]. Callers treat a `null` result as "these two terms
+     * cannot be connected".
+     */
+    fun lookup(fromRightId: Int, toLeftId: Int): Int?
+}
+
+/**
+ * Strategy for handling unknown (out-of-vocabulary) terms, i.e. text that the term
+ * dictionary may not cover.
+ */
+interface UnknownTermExtractionStrategy {
+
+    /**
+     * Extracts unknown terms from `text` starting at `index`.
+     *
+     * @param text the full input text being tokenized
+     * @param index the position in `text` at which extracted terms start
+     * @param forceExtraction when set, at least one term is expected to be extracted;
+     *        the tokenizer sets it when the dictionary produced no term at `index`
+     * @return the extracted terms; may be empty
+     */
+    fun extractUnknownTerms(text: String, index: Int, forceExtraction: Boolean): Iterable<TermEntry>
+}
+
diff --git a/kotori/src/main/kotlin/com/github/wanasit/kotori/Library.kt b/kotori/src/main/kotlin/com/github/wanasit/kotori/Library.kt
diff --git a/kotori/src/main/kotlin/com/github/wanasit/kotori/Tokenizer.kt b/kotori/src/main/kotlin/com/github/wanasit/kotori/Tokenizer.kt
@@ -0,0 +1,21 @@
+package com.github.wanasit.kotori
+
+import com.github.wanasit.kotori.core.LatticeBasedTokenizer
+
+/**
+ * Splits a text into [Token]s.
+ */
+interface Tokenizer {
+
+    /** Breaks [text] into a list of tokens. */
+    fun tokenize(text: String): List<Token>
+
+    companion object {
+
+        /** Builds a [LatticeBasedTokenizer] backed by the default dictionary resource. */
+        @JvmStatic
+        fun createDefaultTokenizer(): Tokenizer =
+                LatticeBasedTokenizer(Dictionary.readDefaultFromResource())
+    }
+}
+
+/**
+ * One piece of text produced by a [Tokenizer].
+ */
+interface Token {
+    // The token's text.
+    val text: String
+    // Location of the token; the lattice-based tokenizer fills this with the index in
+    // the input text at which the token starts.
+    val position: Int
+}
diff --git a/kotori/src/main/kotlin/com/github/wanasit/kotori/core/Lattice.kt b/kotori/src/main/kotlin/com/github/wanasit/kotori/core/Lattice.kt
@@ -0,0 +1,75 @@
+package com.github.wanasit.kotori.core
+import com.github.wanasit.kotori.ConnectionCost
+import com.github.wanasit.kotori.TermEntry
+
+/**
+ * A term placed in a [Lattice], linked to the predecessor chosen for it.
+ *
+ * @property location index in the text at which the term starts
+ * @property totalCost accumulated cost of the path ending in this node: the predecessor's
+ *           total, plus the connection cost, plus the term's own cost
+ * @property termEntry the term this node represents
+ * @property previousNode the predecessor on the path, or `null` for the begin node
+ */
+data class LatticeNode(
+        val location: Int,
+        val totalCost: Int,
+        val termEntry: TermEntry,
+        val previousNode: LatticeNode?
+)
+
+/**
+ * Sentinel node that every lattice path starts from: an empty term at index 0 with
+ * zero cost, zero left/right ids and no predecessor.
+ */
+internal val BEGIN_NODE = LatticeNode(
+        location = 0,
+        totalCost = 0,
+        termEntry = object : TermEntry {
+            override val surfaceForm = ""
+            override val leftId = 0
+            override val rightId = 0
+            override val cost = 0
+        },
+        previousNode = null)
+
+/**
+ * Collects candidate terms over a text and remembers, for each one, the cheapest
+ * path that reaches it.
+ *
+ * Nodes are grouped by the index at which they end. A term is attached to the cheapest
+ * node already ending at its start index, so predecessors must be added before the
+ * terms that follow them (i.e. add terms in order of increasing start index).
+ *
+ * @param connection cost table deciding whether, and at what cost, two terms may be adjacent
+ */
+class Lattice(
+        private val connection: ConnectionCost
+) {
+    // Every lattice starts with BEGIN_NODE ending at index 0.
+    private val nodesAtEndIndex: MutableMap<Int, MutableList<LatticeNode>> =
+            mutableMapOf(0 to mutableListOf(BEGIN_NODE))
+
+    /**
+     * Adds [term] as a node spanning `startIndex` until `endIndex`.
+     *
+     * The node is linked to the cheapest connectable node ending at [startIndex]. If no
+     * such node exists the term is unreachable and is silently ignored.
+     *
+     * @param term the term to add
+     * @param startIndex index in the text at which the term starts
+     * @param endIndex index at which the term ends; defaults to the start plus the
+     *        length of the term's surface form
+     */
+    fun addNode(term: TermEntry, startIndex: Int,
+                endIndex: Int = startIndex + term.surfaceForm.length) {
+
+        // If there is no possible connection, just ignore the term.
+        val (prevNode, prevCost) = findCheapestConnection(startIndex, term.leftId) ?: return
+
+        nodesAtEndIndex.getOrPut(endIndex) { mutableListOf() }
+                .add(LatticeNode(
+                        location = startIndex,
+                        totalCost = prevCost + term.cost,
+                        termEntry = term,
+                        previousNode = prevNode))
+    }
+
+    /**
+     * Finishes the lattice at [endIndex] and returns the cheapest path leading there.
+     *
+     * The end of the text is treated as a term with left id 0.
+     *
+     * @return the nodes of the cheapest path in text order, excluding the begin node,
+     *         or `null` when no node ending at [endIndex] can be connected
+     */
+    fun close(endIndex: Int): List<LatticeNode>? {
+        val (lastNode, _) = findCheapestConnection(endIndex, 0) ?: return null
+        return traversePath(lastNode)
+    }
+
+    /**
+     * Finds, among the nodes ending at [location], the one from which a term with the
+     * given [leftId] can be reached at the lowest accumulated cost.
+     *
+     * Implemented as a plain loop instead of `minBy` so the behavior does not depend on
+     * which Kotlin standard library version the project compiles against.
+     *
+     * @return the node paired with its total cost plus the connection cost, or `null`
+     *         when no node ends at [location] or none of them is connectable
+     */
+    private fun findCheapestConnection(location: Int, leftId: Int): Pair<LatticeNode, Int>? {
+        var best: Pair<LatticeNode, Int>? = null
+        for (node in nodesAtEndIndex[location].orEmpty()) {
+            // A null lookup means the two terms cannot be adjacent.
+            val connectionCost = connection.lookup(node.termEntry.rightId, leftId) ?: continue
+            val candidateCost = node.totalCost + connectionCost
+            // Strict comparison keeps the earliest-added node on ties.
+            if (best == null || candidateCost < best.second) {
+                best = node to candidateCost
+            }
+        }
+        return best
+    }
+
+    /**
+     * Follows the predecessor links back from [node] and returns the visited nodes in
+     * text order. The node without a predecessor (the begin node) is left out.
+     */
+    private fun traversePath(node: LatticeNode): List<LatticeNode> =
+            generateSequence(node) { it.previousNode }
+                    .takeWhile { it.previousNode != null }
+                    .toList()
+                    .reversed()
+}
diff --git a/kotori/src/main/kotlin/com/github/wanasit/kotori/core/LatticeBasedTokenizer.kt b/kotori/src/main/kotlin/com/github/wanasit/kotori/core/LatticeBasedTokenizer.kt
@@ -0,0 +1,71 @@
+package com.github.wanasit.kotori.core
+
+import com.github.wanasit.kotori.Dictionary
+import com.github.wanasit.kotori.TermID
+import com.github.wanasit.kotori.Token
+import com.github.wanasit.kotori.Tokenizer
+import java.lang.IllegalStateException
+import kotlin.math.min
+
+/** Plain [Token] holding the text and position produced by [LatticeBasedTokenizer]. */
+class LatticeBasedToken(
+        override val text: String,
+        override val position: Int
+) : Token
+
+/**
+ * [Tokenizer] that places every dictionary term (and every extracted unknown term)
+ * found in the text into a [Lattice] and returns the cheapest path through it.
+ *
+ * @param dictionary source of the terms, connection costs and unknown-term strategy
+ */
+class LatticeBasedTokenizer(
+        private val dictionary: Dictionary<*>
+) : Tokenizer {
+
+    // TODO More efficient matching approach
+    //  e.g. prefix-tree, FST, or Automaton
+    /** Term ids grouped by surface form, in dictionary iteration order. */
+    private val surfaceFormLookup: Map<String, List<TermID>> =
+            dictionary.terms.groupBy({ (_, entry) -> entry.surfaceForm }, { (id, _) -> id })
+
+    /** Length of the longest surface form; bounds how far ahead matching has to look. */
+    private val longestSurfaceForm: Int =
+            surfaceFormLookup.keys.fold(0) { longest, form -> maxOf(longest, form.length) }
+
+    /**
+     * Tokenizes [text].
+     *
+     * @return the tokens of the cheapest path in text order, or an empty list when the
+     *         lattice cannot be closed at the end of the text
+     * @throws IllegalStateException if a term id found via the surface-form index has
+     *         no entry in the term dictionary
+     */
+    override fun tokenize(text: String): List<Token> {
+
+        val lattice = Lattice(dictionary.connection)
+
+        for (i in text.indices) {
+            val termIDs = findTermsStartingAtIndex(text, i)
+            for (termId in termIDs) {
+                val termEntry = checkNotNull(dictionary.terms[termId]) {
+                    "Term id $termId is indexed by surface form but missing from the term dictionary"
+                }
+                lattice.addNode(termEntry, i)
+            }
+
+            // Extraction is forced only where the dictionary offered no term at all.
+            val unknownTerms = dictionary.unknownExtraction
+                    ?.extractUnknownTerms(text, i, forceExtraction = termIDs.isEmpty())
+                    ?: emptyList()
+
+            unknownTerms.forEach { lattice.addNode(it, i) }
+        }
+
+        val path = lattice.close(text.length) ?: return emptyList()
+        return path.map { LatticeBasedToken(it.termEntry.surfaceForm, it.location) }
+    }
+
+    /**
+     * Returns the ids of all dictionary terms whose surface form matches [text]
+     * starting at [index], shortest match first.
+     */
+    private fun findTermsStartingAtIndex(text: String, index: Int): List<TermID> {
+        // No surface form is longer than longestSurfaceForm, so stop looking there.
+        val lastOffset = min(index + longestSurfaceForm, text.length)
+
+        return (index + 1..lastOffset).flatMap { endOffset ->
+            surfaceFormLookup[text.substring(index, endOffset)].orEmpty()
+        }
+    }
+}