diff --git a/go.mod b/go.mod deleted file mode 100644 index b0660c8..0000000 --- a/go.mod +++ /dev/null @@ -1,5 +0,0 @@ -module github.com/high-moctane/nextword - -go 1.13 - -require github.com/high-moctane/nwenc v0.2.0 diff --git a/go.sum b/go.sum deleted file mode 100644 index 75b55df..0000000 --- a/go.sum +++ /dev/null @@ -1,4 +0,0 @@ -github.com/high-moctane/nwenc v0.1.1 h1:wmt+kAdkgAfZHj7Jlwr7m3lxq8iczXEydlnl0ZMLwSA= -github.com/high-moctane/nwenc v0.1.1/go.mod h1:6REV+Ridy2BaP88PeJ7POgCU28oE3EkOzCjhBCywdOQ= -github.com/high-moctane/nwenc v0.2.0 h1:s8+qLh5RSEgPMPa7P54vWoGPALxR+cs8TZh0lr53TB0= -github.com/high-moctane/nwenc v0.2.0/go.mod h1:6REV+Ridy2BaP88PeJ7POgCU28oE3EkOzCjhBCywdOQ= diff --git a/main.go b/main.go deleted file mode 100644 index c974fa6..0000000 --- a/main.go +++ /dev/null @@ -1,80 +0,0 @@ -package main - -import ( - "bufio" - "flag" - "fmt" - "os" - "strings" -) - -// default env value for the data directory path. -const envDataPath = "NEXTWORD_DATA_PATH" - -// flags -var dataPath = flag.String("data", os.Getenv(envDataPath), "path to data directory") -var maxCandidatesNum = flag.Int("candidates-num", 100, "max candidates num (positive int)") -var helpFlag = flag.Bool("h", false, "show this message") -var greedyFlag = flag.Bool("greedy", false, "show as many result as possible") - -func main() { - os.Exit(run()) -} - -func run() int { - if !parseArgs() { - help := `The nextword prints the most likely English words that follow the stdin sentence. - -The space character at the end of line plays an important role. If the line ends -with a space, the command show the next suggested words. However, if the line -ends with an alphabetic character, the suggested words start with the last word -of the line. - -This command needs an external dataset. The dataset path should be set in an -environment value "$NEXTWORD_DATA_PATH". ` - - flag.Usage() - fmt.Fprintln(os.Stderr, "") - fmt.Fprintln(os.Stderr, help) - return 1 - } - - if err := serve(); err != nil { - fmt.Fprintf(os.Stderr, "serve error: %v", err) - return 1 - } - return 0 -} - -func serve() error { - sg := NewSuggester(*dataPath, *maxCandidatesNum, *greedyFlag) - - sc := bufio.NewScanner(os.Stdin) - for sc.Scan() { - candidates, err := sg.Suggest(sc.Text()) - if err != nil { - return fmt.Errorf("suggest error: %w", err) - } - fmt.Println(strings.Join(candidates, " ")) - } - - if sc.Err() != nil { - return fmt.Errorf("read error: %w", sc.Err()) - } - - return nil -} - -func parseArgs() bool { - flag.Parse() - if *helpFlag { - return false - } - if *maxCandidatesNum < 1 { - return false - } - if *dataPath == "" { - return false - } - return true -} diff --git a/suggester.go b/suggester.go deleted file mode 100644 index 4700e5c..0000000 --- a/suggester.go +++ /dev/null @@ -1,274 +0,0 @@ -package main - -import ( - "bufio" - "fmt" - "io" - "os" - "path/filepath" - "strings" -) - -type Suggester struct { - dataPath string - candidateLen int - greedy bool -} - -func NewSuggester(dataPath string, candidateLen int, greedy bool) *Suggester { - return &Suggester{ - dataPath: dataPath, - candidateLen: candidateLen, - greedy: greedy, - } -} - -func (*Suggester) fileName(n int, prefix string) string { - if n == 1 { - return "1gram.txt" - } - return fmt.Sprintf("%dgram-%s.txt", n, prefix) -} - -func (sg *Suggester) filePath(n int, prefix string) string { - return filepath.Join(sg.dataPath, sg.fileName(n, prefix)) -} - -func (sg *Suggester) Suggest(query string) (candidates []string, err error) { - if query == "" { - return - } - - words, prefix := sg.parseQuery(query) - - // search n-gram in decscending order - for i := 0; i < len(words); i++ { - var cand []string - cand, err = sg.suggestNgram(words[i:]) - if err != nil { - return - } - candidates = append(candidates, cand...) - - // return when non greedy with non empty candidates - if !sg.greedy { - candidates = sg.filterCandidates(candidates, prefix) - if len(candidates) > sg.candidateLen { - candidates = candidates[:sg.candidateLen] - } - if len(candidates) > 0 { - return - } - } - } - - // search 1gram - if prefix != "" { - var cand []string - cand, err = sg.suggest1gram(prefix) - if err != nil { - return - } - candidates = append(candidates, cand...) - } - - // filter candidates - candidates = sg.filterCandidates(candidates, prefix) - candidates = sg.uniqCandidates(candidates) - if len(candidates) > sg.candidateLen { - candidates = candidates[:sg.candidateLen] - } - return -} - -func (*Suggester) parseQuery(input string) (words []string, prefix string) { - elems := strings.Split(input, " ") - - // If the end of the input is not " ", the last word in the input will be the prefix. - if elems[len(elems)-1] != "" { - prefix = elems[len(elems)-1] - elems = elems[:len(elems)-1] - } - - // collect up to last 4 words - words = []string{} - for i := len(elems) - 1; i >= 0; i-- { - if elems[i] == "" { - continue - } - words = append([]string{elems[i]}, words...) - if len(words) >= 4 { - break - } - } - - return -} - -func (sg *Suggester) suggestNgram(words []string) (candidates []string, err error) { - // open data - n := len(words) + 1 - initial := strings.ToLower(string([]rune(words[0])[0])) - if (initial < "A" || "Z" <= initial) && (initial < "a" || "z" <= initial) { - return - } - - f, err := os.Open(sg.filePath(n, initial)) - if err != nil { - return - } - defer f.Close() - info, err := f.Stat() - if err != nil { - return - } - - // search for a head offset which the query starts - query := strings.Join(words, " ") + "\t" - offset, err := sg.binSearch(f, info.Size(), query) - if err != nil { - return - } - - entry, err := sg.readLine(f, offset, info.Size()) - if err != nil { - return - } - if !strings.HasPrefix(entry, query) { - // no matching - return - } - - candidates = strings.Split(strings.Split(entry, "\t")[1], " ") - return -} - -func (sg *Suggester) suggest1gram(prefix string) (candidates []string, err error) { - // open 1gram file - f, err := os.Open(sg.filePath(1, "")) - if err != nil { - return - } - defer f.Close() - info, err := f.Stat() - if err != nil { - return - } - - // search for a head offset which the prefix starts - offset, err := sg.binSearch(f, info.Size(), prefix) - if err != nil { - return - } - - // read candidates - sr := io.NewSectionReader(f, offset, info.Size()-offset) - sc := bufio.NewScanner(sr) - for i := 0; i < sg.candidateLen; i++ { - if sc.Scan() { - if !strings.HasPrefix(sc.Text(), prefix) { - break - } - candidates = append(candidates, sc.Text()) - } - if sc.Err() != nil { - err = sc.Err() - return - } - } - - return -} - -func (*Suggester) uniqCandidates(candidates []string) []string { - var res []string - set := map[string]bool{} // set ot candidates - - for _, word := range candidates { - if set[word] { - continue - } - res = append(res, word) - set[word] = true - } - - return res -} - -func (*Suggester) filterCandidates(candidates []string, prefix string) []string { - var res []string - for _, word := range candidates { - if strings.HasPrefix(word, prefix) { - res = append(res, word) - } - } - return res -} - -func (sg *Suggester) binSearch(r io.ReaderAt, size int64, query string) (offset int64, err error) { - var left int64 - right := size - - for left <= right { - mid := left + (right-left)/2 - - offset, err = sg.findHeadOfLine(r, mid) - if err != nil { - return - } - - var line string - line, err = sg.readLine(r, offset, size) - if err != nil { - return - } - - if query < line { - right = mid - 1 - } else if query > line { - left = mid + 1 - } else { - return - } - } - - offset, err = sg.findHeadOfLine(r, left) - if err != nil { - return - } - - return -} - -func (sg *Suggester) findHeadOfLine(r io.ReaderAt, offset int64) (head int64, err error) { - // The initial value of head is a previous value from the offset. - for head = offset - 1; ; head-- { - if head <= 0 { - head = 0 - return - } - - buf := make([]byte, 1) - if _, err = r.ReadAt(buf, head); err != nil { - return - } - - if buf[0] == '\n' { - head++ - return - } - } -} - -func (*Suggester) readLine(r io.ReaderAt, offset, size int64) (line string, err error) { - sr := io.NewSectionReader(r, offset, size-offset) - sc := bufio.NewScanner(sr) - if sc.Scan() { - line = sc.Text() - } - if sc.Err() != nil { - err = sc.Err() - return - } - return -} diff --git a/suggester_test.go b/suggester_test.go deleted file mode 100644 index 07b113c..0000000 --- a/suggester_test.go +++ /dev/null @@ -1,508 +0,0 @@ -package main - -import ( - "os" - "path/filepath" - "reflect" - "testing" -) - -var EnvDataPathTest = os.Getenv("NEXTWORD_DATA_PATH") - -func TestSuggester_Suggest(t *testing.T) { - tests := []struct { - candidatesLen int - query string - candidates []string - err error - }{ - { - 10, - "", - nil, - nil, - }, - { - 10, - "EDM ", - []string{"is", "and", "process", "instruments", "instrument"}, - nil, - }, - { - 10, - "EDM", - []string{"EDM", "EDMA", "EDMAN", "EDMD", "EDMK", "EDMOND", "EDMONDS", - "EDMONDSON", "EDMONSON", "EDMONSTON"}, - nil, - }, - { - 20, - "you could not buy ", - []string{ - "a", "the", "it", "them", "anything", - "any", "or", "him", "his", "that", - "one", "in", "her", "land", "food", - "their", "into", "from", "this", "me", - }, - nil, - }, - { - 20, - "you could not buy ", - []string{ - "a", "the", "it", "them", "anything", - "any", "or", "him", "his", "that", - "one", "in", "her", "land", "food", - "their", "into", "from", "this", "me", - }, - nil, - }, - { - 20, - "may be until day ", - []string{ - "after", "of", "and", "in", "to", - "the", "or", "I", "when", "for", - "he", "before", "was", "is", "that", - "at", "on", "by", "with", "we", - }, - nil, - }, - { - 15, - "just for a few m", - []string{ - "minutes", "moments", "months", "more", "miles", - "minor", "men", "million", "milliseconds", "members", - "m", "m!", "m'", "m'1", "m'2", - }, - nil, - }, - { - 20, - "aaaaaaaaa bbbbbbbbbb ccccccccccc dddddddddd eeeeeeeeaaa", - nil, - nil, - }, - } - - for idx, test := range tests { - sg := NewSuggester(EnvDataPathTest, test.candidatesLen) - - cand, err := sg.Suggest(test.query) - if err != nil { - t.Errorf("[%d] expected %v, but got %v", idx, test.err, err) - } - if err != nil { - continue - } - if !reflect.DeepEqual(test.candidates, cand) { - t.Errorf("[%d] expected %v, but got %v", idx, test.candidates, cand) - } - } -} - -func BenchmarkSuggester_Suggest(b *testing.B) { - sg := NewSuggester(EnvDataPathTest, 100) - - for i := 0; i < b.N; i++ { - sg.Suggest("The quick brown fox ju") - } -} - -func TestSuggester_ParseQuery(t *testing.T) { - tests := []struct { - query string - words []string - prefix string - }{ - { - "abc", - []string{}, - "abc", - }, - { - "abc ", - []string{"abc"}, - "", - }, - { - "abc def ", - []string{"abc", "def"}, - "", - }, - { - "abc def g", - []string{"abc", "def"}, - "g", - }, - { - "abc def ghi jkl ", - []string{"abc", "def", "ghi", "jkl"}, - "", - }, - { - "abc def ghi jkl mno ", - []string{"def", "ghi", "jkl", "mno"}, - "", - }, - { - "abc def ghi jkl mno pqr", - []string{"def", "ghi", "jkl", "mno"}, - "pqr", - }, - } - - for idx, test := range tests { - s := new(Suggester) - words, prefix := s.parseQuery(test.query) - if !reflect.DeepEqual(test.words, words) { - t.Errorf("[%d] expect %v, but got %v", idx, test.words, words) - } - if test.prefix != prefix { - t.Errorf("[%d] expected %s, but got %v", idx, test.prefix, prefix) - } - } -} - -func TestSuggester_SuggestNgram(t *testing.T) { - tests := []struct { - words []string - candidates []string - }{ - { - []string{"objectivation"}, - []string{"of", "and"}, - }, - { - []string{"committee", "feels"}, - []string{"that"}, - }, - { - []string{"these", "steps", "are"}, - []string{"taken", "not", "completed"}, - }, - { - []string{"Have", "you", "seen", "or"}, - []string{"heard"}, - }, - { - []string{"brousa"}, - nil, - }, - { - []string{"I felt paint"}, - nil, - }, - { - []string{"0000000000 11111111 "}, - nil, - }, - { - []string{"🤔 🤗 "}, - nil, - }, - } - - for idx, test := range tests { - sg := NewSuggester(EnvDataPathTest, 100) - cand, err := sg.suggestNgram(test.words) - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - continue - } - if !reflect.DeepEqual(test.candidates, cand) { - t.Errorf("[%d] expected %v, but got %v", idx, test.candidates, cand) - } - } -} - -func TestSuggester_Suggest1gram(t *testing.T) { - tests := []struct { - candidatesLen int - prefix string - candidates []string - err error - }{ - { - 10, - "pound", - []string{"pound", "pound.1", "pounda", "poundage", "poundages", - "poundal", "poundals", "poundcake", "pounde", "pounded"}, - nil, - }, - { - 5, - "Kumu", - []string{"Kumu", "Kumud", "Kumuda", "Kumudini", "Kumuhonua"}, - nil, - }, - { - 5, - "NT/2000", - []string{"NT/2000", "NT/2000/XP"}, - nil, - }, - { - 5, - "Sehoraaaaaaaa", - nil, - nil, - }, - } - - for idx, test := range tests { - sg := NewSuggester(EnvDataPathTest, test.candidatesLen) - - cand, err := sg.suggest1gram(test.prefix) - if !reflect.DeepEqual(test.err, err) { - t.Errorf("[%d] expected %v, but got %v", idx, test.err, err) - } - if err != nil { - continue - } - - if !reflect.DeepEqual(test.candidates, cand) { - t.Errorf("[%d] expected %v, but got %v", idx, test.candidates, cand) - } - } -} - -func TestSuggester_UniqCandidates(t *testing.T) { - tests := []struct { - in, out []string - }{ - { - []string{}, - nil, - }, - { - []string{"abc"}, - []string{"abc"}, - }, - { - []string{"abc", "abc"}, - []string{"abc"}, - }, - { - []string{"abc", "def"}, - []string{"abc", "def"}, - }, - { - []string{"abc", "def", "abc"}, - []string{"abc", "def"}, - }, - { - []string{"abc", "def", "abc", "def", "abc"}, - []string{"abc", "def"}, - }, - } - - for idx, test := range tests { - s := new(Suggester) - out := s.uniqCandidates(test.in) - if !reflect.DeepEqual(test.out, out) { - t.Errorf("[%d] expected %v, but got %v", idx, test.out, out) - } - } -} - -func TestSuggester_FilterCandidates(t *testing.T) { - tests := []struct { - cand []string - prefix string - out []string - }{ - { - []string{}, - "", - nil, - }, - { - []string{}, - "prefix", - nil, - }, - { - []string{"abc"}, - "", - []string{"abc"}, - }, - { - []string{"abc"}, - "ab", - []string{"abc"}, - }, - { - []string{"abc"}, - "ae", - nil, - }, - { - []string{"abc", "bcd", "abd", "abe", "ade", "absent"}, - "ab", - []string{"abc", "abd", "abe", "absent"}, - }, - } - - for idx, test := range tests { - s := new(Suggester) - out := s.filterCandidates(test.cand, test.prefix) - if !reflect.DeepEqual(test.out, out) { - t.Errorf("[%d] expected %v, but got %v", idx, test.out, out) - } - } -} - -func TestSuggester_BinSearch(t *testing.T) { - tests := []struct { - filePath string - query string - offset int64 - }{ - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - "A", - 0, - }, - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - "zł", - 11346418, - }, - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - "Recu", - 4901265, - }, - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - "Latemaa", - 3303588, - }, - { - filepath.Join(EnvDataPathTest, "2gram-e.txt"), - "ELSE", - 24641, - }, - } - - sg := new(Suggester) - for idx, test := range tests { - func() { - f, err := os.Open(test.filePath) - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - defer f.Close() - info, err := f.Stat() - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - - offset, err := sg.binSearch(f, info.Size(), test.query) - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - - if test.offset != offset { - t.Errorf("[%d] expected %d, but got %d", idx, test.offset, offset) - } - }() - } -} - -func TestSuggester_FindHeadOfLine(t *testing.T) { - tests := []struct { - filePath string - offset int64 - head int64 - }{ - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - 0, - 0, - }, - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - 3, - 2, - }, - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - 30749, - 30734, - }, - } - - sg := new(Suggester) - for idx, test := range tests { - func() { - f, err := os.Open(test.filePath) - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - defer f.Close() - - head, err := sg.findHeadOfLine(f, test.offset) - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - - if test.head != head { - t.Errorf("[%d] expected %d, but got %d", idx, test.head, head) - } - }() - } -} - -func TestSuggester_ReadLine(t *testing.T) { - tests := []struct { - filePath string - offset int64 - line string - }{ - { - filepath.Join(EnvDataPathTest, "1gram.txt"), - 0, - "A", - }, - { - filepath.Join(EnvDataPathTest, "2gram-e.txt"), - 13617, - "EDUCA\tTION", - }, - } - - for idx, test := range tests { - func() { - f, err := os.Open(test.filePath) - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - defer f.Close() - info, err := f.Stat() - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - - sg := new(Suggester) - line, err := sg.readLine(f, test.offset, info.Size()) - if err != nil { - t.Errorf("[%d] unexpected error: %v", idx, err) - return - } - - if test.line != line { - t.Errorf("[%d] expected %#v, but got %#v", idx, test.line, line) - } - }() - } -}