-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New default dictionary (more compact). Also improve unittest coverage…
… for the default dictionary
- Loading branch information
Showing
15 changed files
with
266 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 change: 1 addition & 0 deletions
1
...i-dictionaries/src/test/kotlin/com/github/wanasit/kotori/dictionaries/TestDictionaries.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
73 changes: 73 additions & 0 deletions
73
kotori/src/main/kotlin/com/github/wanasit/kotori/optimized/DefaultTermEntry.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
package com.github.wanasit.kotori.optimized | ||
|
||
import com.github.wanasit.kotori.TermEntry | ||
import com.github.wanasit.kotori.mecab.MeCabLikeTermFeatures | ||
import com.github.wanasit.kotori.mecab.MeCabTermFeatures | ||
import com.github.wanasit.kotori.utils.IOUtils | ||
import java.io.InputStream | ||
import java.io.OutputStream | ||
import java.lang.IllegalArgumentException | ||
|
||
/**
 * A term entry for the default dictionary.
 *
 * To keep the default dictionary compact, the serialized form stores a single
 * [contextId] per entry, on the assumption that the left and right context ids
 * of each term entry are the same. [leftId] and [rightId] therefore default to
 * [contextId].
 *
 * @property surfaceForm the surface form of the term.
 * @property contextId the context id shared by both sides; the only id that is serialized.
 * @property cost the cost of the term.
 * @property features the term features; only their `partOfSpeech` is serialized.
 * @property leftId the left context id, defaulting to [contextId].
 * @property rightId the right context id, defaulting to [contextId].
 */
data class DefaultTermEntry(
    override val surfaceForm: String,
    val contextId: Int,
    override val cost: Int,
    override val features: DefaultTermFeatures,
    override val leftId: Int = contextId,
    override val rightId: Int = contextId
) : TermEntry<DefaultTermFeatures> {

    companion object {
        /**
         * Number of ints stored per entry in the flattened int block:
         * context id, cost, part-of-speech ordinal (in that order).
         */
        private const val INTS_PER_ENTRY = 3

        /**
         * Converts any [TermEntry] carrying [DefaultTermFeatures] into a [DefaultTermEntry].
         *
         * @param other the entry to convert.
         * @return a new entry whose `contextId` is `other.leftId`.
         * @throws IllegalArgumentException if `other.leftId` differs from `other.rightId`.
         */
        fun copy(other: TermEntry<DefaultTermFeatures>): DefaultTermEntry {
            require(other.leftId == other.rightId) {
                "A default term entry must have the same left and right context ID"
            }

            return DefaultTermEntry(
                surfaceForm = other.surfaceForm,
                contextId = other.leftId,
                cost = other.cost,
                features = other.features
            )
        }

        /**
         * Reads the entries previously written by [writeToOutput].
         *
         * Layout consumed, in order: the entry count, `count * 3` ints
         * (context id, cost, part-of-speech ordinal per entry), then `count`
         * surface-form strings.
         *
         * @param inputStream the stream to read from; it is not closed here.
         * @return the decoded entries, in stored order.
         */
        fun readFromInputStream(inputStream: InputStream): Array<DefaultTermEntry> {
            val size = IOUtils.readInt(inputStream)
            val flattenTermEntry = IOUtils.readIntArray(inputStream, size * INTS_PER_ENTRY)
            val surfaceForms = IOUtils.readStringArray(inputStream, size)

            // values() returns a fresh array on every call, so fetch it once
            // instead of once per decoded entry.
            val partsOfSpeech = DefaultTermFeatures.PartOfSpeech.values()

            return Array(size) { index ->
                val offset = index * INTS_PER_ENTRY
                DefaultTermEntry(
                    surfaceForm = surfaceForms[index],
                    contextId = flattenTermEntry[offset],
                    cost = flattenTermEntry[offset + 1],
                    features = DefaultTermFeatures(
                        partOfSpeech = partsOfSpeech[flattenTermEntry[offset + 2]]
                    )
                )
            }
        }

        /**
         * Converts each entry with [copy] and then serializes the result with [writeToOutput].
         *
         * @param outputStream the stream to write to; it is not closed here.
         * @param termEntries the entries to convert and write.
         * @throws IllegalArgumentException if any entry has different left and right context ids.
         */
        fun writeToOutputAsDefaultTermEntries(
            outputStream: OutputStream,
            termEntries: Array<TermEntry<DefaultTermFeatures>>
        ) {
            writeToOutput(outputStream, Array(termEntries.size) { copy(termEntries[it]) })
        }

        /**
         * Serializes the entries in the layout read back by [readFromInputStream]:
         * the entry count, the flattened ints, then the surface forms.
         *
         * @param outputStream the stream to write to; it is not closed here.
         * @param termEntries the entries to write.
         * @throws IllegalArgumentException if an entry's `leftId` or `rightId` differs from its
         *   `contextId`. Only `contextId` is stored, so such an entry could not be restored
         *   faithfully. The check runs before anything is written.
         */
        fun writeToOutput(outputStream: OutputStream, termEntries: Array<DefaultTermEntry>) {
            val size = termEntries.size
            val surfaceForms = Array(size) { termEntries[it].surfaceForm }

            // Fill a primitive array directly rather than building a boxed list per entry.
            val flattenTermEntry = IntArray(size * INTS_PER_ENTRY)
            termEntries.forEachIndexed { index, entry ->
                require(entry.leftId == entry.contextId && entry.rightId == entry.contextId) {
                    "A default term entry must have the same left and right context ID"
                }
                val offset = index * INTS_PER_ENTRY
                flattenTermEntry[offset] = entry.contextId
                flattenTermEntry[offset + 1] = entry.cost
                flattenTermEntry[offset + 2] = entry.features.partOfSpeech.ordinal
            }

            IOUtils.writeInt(outputStream, size)
            IOUtils.writeIntArray(outputStream, flattenTermEntry, includeSize = false)
            IOUtils.writeStringArray(outputStream, surfaceForms, includeSize = false)
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
10 changes: 10 additions & 0 deletions
10
kotori/src/main/kotlin/com/github/wanasit/kotori/utils/DictionaryExtentions.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,20 @@ | ||
package com.github.wanasit.kotori.utils | ||
|
||
import com.github.wanasit.kotori.Dictionary | ||
import com.github.wanasit.kotori.TermDictionary | ||
import com.github.wanasit.kotori.TermEntry | ||
import com.github.wanasit.kotori.optimized.PlainTermEntry | ||
import com.github.wanasit.kotori.optimized.PlainToken | ||
|
||
/**
 * The term entries of this dictionary: the `second` component of every
 * element yielded by `terms`.
 */
val <F> Dictionary<F>.termEntries: List<TermEntry<F>>
    get() {
        return terms.map { element -> element.second }
    }
|
||
/** The number of terms in this dictionary, as reported by `terms.size()`. */
val <F> Dictionary<F>.size: Int
    get() {
        return terms.size()
    }
|
||
/**
 * This term dictionary viewed as a plain list of entries: the `second`
 * component of every element it yields.
 */
val <F> TermDictionary<F>.asEntries: List<TermEntry<F>>
    get() {
        return this.map { element -> element.second }
    }
|
||
/**
 * Wraps this entry in a [PlainTermEntry] whose features are replaced by
 * [PlainToken.EMPTY_FEATURES].
 */
fun TermEntry<*>.withoutFeatures(): PlainTermEntry<PlainToken.EmptyFeatures> =
    PlainTermEntry(this, PlainToken.EMPTY_FEATURES)
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
65 changes: 65 additions & 0 deletions
65
kotori/src/test/kotlin/com/github/wanasit/kotori/optimized/TestDefaultDictionary.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package com.github.wanasit.kotori.optimized | ||
|
||
import com.github.wanasit.kotori.connectionTable | ||
import com.github.wanasit.kotori.fakeTermDictionaryWithoutFeature | ||
import com.github.wanasit.kotori.optimized.unknown.UnknownTermExtractionByCharacterCategory | ||
import com.github.wanasit.kotori.utils.asEntries | ||
import com.github.wanasit.kotori.utils.termEntries | ||
import com.github.wanasit.kotori.utils.withoutFeatures | ||
import org.junit.Assert | ||
import org.junit.Assert.* | ||
import org.junit.Test | ||
|
||
/**
 * Serialization round-trip test for [DefaultDictionary].
 */
class TestDefaultDictionary {

    /**
     * Builds a small dictionary, writes it to a temporary file, reads it back,
     * and checks that the term entries (ignoring features) and two sampled
     * connection costs survive the round trip.
     */
    @Test
    fun testBasicCreationAndSerialization() {
        val terms = fakeTermDictionaryWithoutFeature {
            term("そこで", CONJ, 10)
            term("そこ", NOUN, 40)
            term("で", VERB, 40)
            term("で", ADJ, 10)
            term("はなし", NOUN, 40)
            term("は", VERB, 10)
            term("なし", NOUN, 40)
            term("終わり", NOUN, 40)
            term("になった", VERB, 40)
            term("に", ADJ, 10)
            term("なった", VERB, 40)
        }.asEntries

        val connectionCost = connectionTable {
            header(     END, NOUN, VERB, ADJ, CONJ)
            row(BEGIN,    0,   10,   10,   0,   10)
            row(NOUN,    10,   10,   40,  10,    0)
            row(VERB,    10,   10,   10,   0,   10)
            row(ADJ,     10,   10,   10,  10,   10)
            row(CONJ,     0,   10,   10,   0,   10)
        }

        val unknownExtraction: UnknownTermExtractionByCharacterCategory<DefaultTermFeatures> =
            UnknownTermExtractionByCharacterCategory.fromUnoptimizedMapping(emptyMap(), emptyMap(), emptyMap())

        val dictionary = DefaultDictionary(
            terms = PlainTermDictionary.copyOf(terms) { PlainTermEntry(it, DefaultTermFeatures()) },
            unknownExtraction = unknownExtraction,
            connection = PlainConnectionCostTable.copyOf(terms, connectionCost)
        )

        // java.io.File.createTempFile replaces the deprecated kotlin.io.createTempFile();
        // it is fully qualified so the file's import list does not need to change.
        val file = java.io.File.createTempFile("kotori-default-dictionary", ".bin")
        val readDictionary = try {
            file.outputStream().use {
                DefaultDictionary.writeToOutputStream(it, dictionary)
            }

            file.inputStream().use {
                DefaultDictionary.readFromInputStream(it)
            }
        } finally {
            // Both streams are closed by now; remove the file right away
            // instead of leaving it until JVM exit.
            file.delete()
        }

        assertEquals(
            terms.map { it.withoutFeatures() },
            readDictionary.termEntries.map { it.withoutFeatures() })
        assertEquals(connectionCost.lookup(1, 1), readDictionary.connection.lookup(1, 1))
        assertEquals(connectionCost.lookup(3, 1), readDictionary.connection.lookup(3, 1))
    }
}
Oops, something went wrong.