Skip to content
This repository has been archived by the owner on Mar 29, 2022. It is now read-only.

Commit

Permalink
(WIP) using nwenc.Encoded/Decoded data.
Browse files Browse the repository at this point in the history
  • Loading branch information
high-moctane committed Sep 29, 2019
1 parent 2b1cd71 commit 2731311
Show file tree
Hide file tree
Showing 5 changed files with 680 additions and 0 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module github.com/high-moctane/nextword

go 1.13

require github.com/high-moctane/nwenc v0.2.0
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
github.com/high-moctane/nwenc v0.1.1 h1:wmt+kAdkgAfZHj7Jlwr7m3lxq8iczXEydlnl0ZMLwSA=
github.com/high-moctane/nwenc v0.1.1/go.mod h1:6REV+Ridy2BaP88PeJ7POgCU28oE3EkOzCjhBCywdOQ=
github.com/high-moctane/nwenc v0.2.0 h1:s8+qLh5RSEgPMPa7P54vWoGPALxR+cs8TZh0lr53TB0=
github.com/high-moctane/nwenc v0.2.0/go.mod h1:6REV+Ridy2BaP88PeJ7POgCU28oE3EkOzCjhBCywdOQ=
42 changes: 42 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package main

import (
"bufio"
"fmt"
"log"
"os"
"strings"
)

const dataPathEnv = "NEXTWORD_DATA_PATH"

func main() {
log.Fatal(run())
}

func run() error {
return fmt.Errorf("serve error: %w", serve())
}

func serve() error {
sg, err := NewSuggester(os.Getenv(dataPathEnv), 100)
if err != nil {
return fmt.Errorf("cannot create suggester: %w", err)
}
defer sg.Close()

sc := bufio.NewScanner(os.Stdin)
for sc.Scan() {
candidates, err := sg.Suggest(sc.Text())
if err != nil {
return fmt.Errorf("suggest error: %w", err)
}
fmt.Println(strings.Join(candidates, " "))
}

if sc.Err() != nil {
return fmt.Errorf("read error: %w", sc.Err())
}

return nil
}
239 changes: 239 additions & 0 deletions suggester.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
package main

import (
"bufio"
"fmt"
"io"
"os"
"path/filepath"
"strings"

"github.com/high-moctane/nwenc"
)

type Suggester struct {
dataPath string
om nwenc.OffsetMapper
fOneGram *os.File
candidateLen int
}

func NewSuggester(dataPath string, candidateLen int) (*Suggester, error) {
s := &Suggester{dataPath: dataPath}

var err error
s.fOneGram, err = os.Open(s.filePath(1, ""))
if err != nil {
return nil, fmt.Errorf("cannot open %s: %w", s.filePath(1, ""), err)
}
info, err := s.fOneGram.Stat()
if err != nil {
return nil, fmt.Errorf("cannot get file info: %w", err)
}

s.om = nwenc.NewCachedSeekOffsetMapper(s.fOneGram, info.Size())

if candidateLen < 1 {
return nil, fmt.Errorf("candidateLen must be positive int, but %d", candidateLen)
}
s.candidateLen = candidateLen

return s, nil
}

func (sg *Suggester) Close() error {
return sg.fOneGram.Close()
}

func (*Suggester) fileName(n int, prefix string) string {
if n == 1 {
return "1gram.txt"
}
return fmt.Sprintf("%dgram-%s", n, prefix)
}

func (sg *Suggester) filePath(n int, prefix string) string {
return filepath.Join(sg.dataPath, sg.fileName(n, prefix))
}

func (sg *Suggester) Suggest(query string) (candidates []string, err error) {
candidates = []string{}

if query == "" {
return
}

words, prefix := sg.parseQuery(query)

// search n-gram in decscending order
for i := 0; i < len(words); i++ {
var cand []string
cand, err = sg.suggestNgram(words[i:])
if err != nil {
return
}
candidates = append(candidates, cand...)
}

// search 1gram
if prefix != "" {
var cand []string
cand, err = sg.suggest1gram(prefix)
if err != nil {
return
}
candidates = append(candidates, cand...)
}

candidates = sg.uniqCandidates(candidates)
candidates = sg.filterCandidates(candidates, prefix)
return
}

func (*Suggester) parseQuery(input string) (words []string, prefix string) {
elems := strings.Split(input, " ")

// If the end of the input is not " ", the last word in the input will be the prefix.
if elems[len(elems)-1] != "" {
prefix = elems[len(elems)-1]
elems = elems[:len(elems)-1]
}

// collect up to last 4 words
words = []string{}
for i := len(elems) - 1; i >= 0; i-- {
if elems[i] == "" {
continue
}
words = append([]string{elems[i]}, words...)
if len(words) >= 4 {
break
}
}

return
}

func (sg *Suggester) suggestNgram(words []string) (candidates []string, err error) {
return
}

func (sg *Suggester) suggest1gram(prefix string) (candidates []string, err error) {
// open 1gram file
f, err := os.Open(sg.filePath(1, ""))
if err != nil {
return
}
defer f.Close()
info, err := f.Stat()
if err != nil {
return
}

// search head offset which prefix starts
offset, err := sg.binSearch(f, info.Size(), []byte(prefix), []byte{'\n'})
if err != nil {
return
}

// read candidates
sr := io.NewSectionReader(f, offset, info.Size()-offset)
sc := bufio.NewScanner(sr)
for i := 0; i < sg.candidateLen; i++ {
sc.Scan()
if sc.Err() != nil {
err = sc.Err()
return
}
if !strings.HasPrefix(sc.Text(), prefix) {
break
}
candidates = append(candidates, sc.Text())
}

return
}

func (*Suggester) uniqCandidates(candidates []string) []string {
res := []string{}
set := map[string]bool{} // set ot candidates

for _, word := range candidates {
if set[word] {
continue
}
res = append(res, word)
set[word] = true
}

return res
}

func (*Suggester) filterCandidates(candidates []string, prefix string) []string {
res := make([]string, 0, len(candidates))
for _, word := range candidates {
if strings.HasPrefix(word, prefix) {
res = append(res, word)
}
}
return res
}

func (sg *Suggester) binSearch(r io.ReaderAt, size int64, query []byte, delim []byte) (offset int64, err error) {
var left int64
right := size

for left <= right {
mid := left + (right-left)/2

offset, err = sg.findHeadOfLine(r, mid, delim)
if err != nil {
return
}

var b []byte
b, err = sg.readBytes(r, offset, delim)
if err != nil {
return
}

cmp := sg.cmpBytes(query, b)
if cmp < 0 {
right = mid - 1
} else if cmp > 0 {
left = mid + 1
} else {
return
}
}

offset, err = sg.findHeadOfLine(r, left, delim)
if err != nil {
return
}

return
}

func (sg *Suggester) findHeadOfLine(r io.ReaderAt, offset int64, delim []byte) (head int64, err error) {
// Because the data is encoded in fixed codeword length coding,
// offsets are 0 mod len(delim).
// The initial value of head is a previous value from the offset.
delimLen := int64(len(delim))
for head = offset - offset%delimLen - delimLen; head > 0; head -= delimLen {
buf := make([]byte, delimLen)
if _, err = r.ReadAt(buf, head); err != nil {
return
}

if sg.cmpBytes(buf, delim) == 0 {
head += delimLen
return
}
}
return
}

func (*Suggester) readBytes(r io.ReaderAt, offset int64, delim []byte) (b []byte, err error) {

}
Loading

0 comments on commit 2731311

Please sign in to comment.