Skip to content
This repository has been archived by the owner on Mar 29, 2022. It is now read-only.

Commit

Permalink
add nextword.go nextword_test.go
Browse files Browse the repository at this point in the history
  • Loading branch information
high-moctane committed Nov 28, 2019
1 parent 61cffa4 commit e647478
Show file tree
Hide file tree
Showing 2 changed files with 813 additions and 0 deletions.
323 changes: 323 additions & 0 deletions nextword.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
package main

import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
)

// ReadLineBufSize is buffer size for Nextword.ReadLine
var ReadLineBufSize = 1024 // 1024 is fast (?)

// Nextword suggests next English words.
type Nextword struct {
DataPath string
ReadLineBufSize int
Params *NextwordParams
}

// NewNextword returns new Nextword. If dataPath is not valid, err will be not nil.
func NewNextword(dataPath string, params *NextwordParams) (*Nextword, error) {
fi, err := os.Stat(dataPath)
if err != nil {
return nil, errors.New(`"NEXTWORD_DATA_PATH" environment variable is not set`)
}
if !fi.IsDir() {
return nil, errors.New(`invalid "NEXTWORD_DATA_PATH"`)
}
return &Nextword{
DataPath: dataPath,
ReadLineBufSize: ReadLineBufSize,
Params: params,
}, nil
}

// Suggest suggests next English words from input. If input ends with " ",
// it returns all likely words. If not, it returns the words that begins the last
// word of input.
func (nw *Nextword) Suggest(input string) (candidates []string, err error) {
ngram, prefix := nw.parseInput(input)

// search n-gram
for i := 0; i < len(ngram); i++ {
var cand []string
cand, err = nw.searchNgram(ngram[i:])
if err != nil {
return
}

// merge
if prefix != "" {
cand = nw.filterCandidates(cand, prefix)
}
candidates = nw.mergeCandidates(candidates, cand)

// end condition
if len(candidates) > nw.Params.CandidateNum {
candidates = candidates[:nw.Params.CandidateNum]
}
if !nw.Params.Greedy && len(candidates) > 0 {
return
}
}

// search 1-gram
cand, err := nw.searchOneGram(prefix)
if err != nil {
return
}
candidates = nw.mergeCandidates(candidates, cand)
if len(candidates) > nw.Params.CandidateNum {
candidates = candidates[:nw.Params.CandidateNum]
}

return
}

// parseInput returns last ngram and prefix in the input.
func (*Nextword) parseInput(input string) (ngram []string, prefix string) {
elems := strings.Split(input, " ")

// If elems does not end with " ", the word will be prefix.
if elems[len(elems)-1] != "" {
prefix = elems[len(elems)-1]
elems = elems[:len(elems)-1]
}

// collect up to last four words.
for i := len(elems) - 1; i >= 0; i-- {
if elems[i] == "" {
continue
}
ngram = append([]string{elems[i]}, ngram...)
if len(ngram) >= 4 {
break
}
}

return
}

// searchNgram search next English from ngram.
func (nw *Nextword) searchNgram(ngram []string) (candidates []string, err error) {
fname, ok := nw.ngramFileName(ngram)
if !ok {
return
}

// open
path := filepath.Join(nw.DataPath, fname)
f, err := os.Open(path)
if err != nil {
return
}
defer f.Close()
fi, err := os.Stat(path)
if err != nil {
return
}

// search
query := strings.Join(ngram, " ") + "\t"
offset, err := nw.binarySearch(f, 0, fi.Size(), query)
if err != nil {
err = nw.removeEOF(err)
return
}

line, err := nw.readLine(f, offset)
if err != nil {
err = nw.removeEOF(err)
return
}
if !strings.HasPrefix(line, query) {
return
}
candidates = strings.Split(strings.Split(line, "\t")[1], " ")

return
}

// ngramFileName returns appropriate file name from ngram.
func (*Nextword) ngramFileName(ngram []string) (fname string, ok bool) {
initial := strings.ToLower(string([]rune(ngram[0])[0]))
if initial < "a" && "z" < initial {
return
}

fname = fmt.Sprintf("%dgram-%s.txt", len(ngram)+1, initial)
ok = true
return
}

// searchOneGram search English words that begins with prefix.
func (nw *Nextword) searchOneGram(prefix string) (candidates []string, err error) {
if prefix == "" {
return
}

// open
path := filepath.Join(nw.DataPath, "1gram.txt")
f, err := os.Open(path)
if err != nil {
return
}
defer f.Close()
fi, err := os.Stat(path)
if err != nil {
return
}

// search offset
offset, err := nw.binarySearch(f, 0, fi.Size(), prefix)
if err != nil {
if err == io.EOF {
err = nil
}
return
}

// collect
for {
var line string
line, err = nw.readLine(f, offset)
if err != nil {
if err == io.EOF {
err = nil
}
return
}
if !strings.HasPrefix(line, prefix) {
break
}
candidates = append(candidates, line)
offset += int64(len(line)) + 1 // "\n"
}

return
}

// binarySearch searches query from r between left to right.
func (nw *Nextword) binarySearch(r io.ReaderAt, left, right int64, query string) (offset int64, err error) {
for left <= right {
mid := left + (right-left)/2
if mid == 0 {
offset = 0
} else {
var str string
str, err = nw.readLine(r, mid)
if err != nil {
err = nw.removeEOF(err)
return
}
offset = mid + int64(len(str)) + 1 // "\n"
}

var line string
line, err = nw.readLine(r, offset)
if err != nil {
err = nw.removeEOF(err)
return
}

if query < line {
right = mid - 1
} else if query == line {
return
} else {
left = mid + 1
}
}

mid := left + (right-left)/2
if mid == 0 {
offset = 0
} else {
var str string
str, err = nw.readLine(r, mid)
if err != nil {
err = nw.removeEOF(err)
return
}
offset = mid + int64(len(str)) + 1 // "\n"
}

return
}

// readLine reads r from offset until "\n".
func (nw *Nextword) readLine(r io.ReaderAt, offset int64) (string, error) {
strBuilder := new(strings.Builder)

for {
buf := make([]byte, nw.ReadLineBufSize)
n, err := r.ReadAt(buf, offset)
if err == io.EOF && n == 0 {
return "", err
} else if err != nil && err != io.EOF {
strBuilder.Write(buf[:n])
return strBuilder.String(), err
}

strBuilder.Write(buf[:n])

// return when buf has "\n"
for i, b := range buf[:n] {
if b == '\n' {
return strBuilder.String()[:strBuilder.Len()-n+i], nil
}
}

offset += int64(n)
}
}

// removeEOF removes io.EOF from err.
func (*Nextword) removeEOF(err error) error {
if err == io.EOF {
return nil
}
return err
}

// mergeCandidates merges two candidates
func (*Nextword) mergeCandidates(a, b []string) []string {
ret := a[:]

m := map[string]bool{}
for _, str := range a {
m[str] = true
}

for _, str := range b {
if !m[str] {
m[str] = true
ret = append(ret, str)
}
}

return ret
}

// filterCandidates filters words which do not begin with prefix from cand.
func (*Nextword) filterCandidates(cand []string, prefix string) []string {
res := make([]string, 0, len(cand))

for _, str := range cand {
if !strings.HasPrefix(str, prefix) {
continue
}
res = append(res, str)
}

return res
}

// NextwordParams is a Nextword parameter.
type NextwordParams struct {
CandidateNum int // Number of candidates
Greedy bool // If true, Nextword suggests words from all n-gram data.
}
Loading

0 comments on commit e647478

Please sign in to comment.