Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: spelling correction #228

Merged
merged 25 commits into from
Dec 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 49 additions & 8 deletions src/rime/algo/syllabifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
//
#include <queue>
#include <boost/range/adaptor/reversed.hpp>
#include <rime/dict/prism.h>
#include <rime/algo/syllabifier.h>
#include <rime/dict/corrector.h>
#include <rime/dict/prism.h>
#include "syllabifier.h"

namespace rime {
using namespace corrector;

using Vertex = pair<size_t, SpellingType>;
using VertexQueue = std::priority_queue<Vertex,
Expand All @@ -35,16 +38,36 @@ int Syllabifier::BuildSyllableGraph(const string &input,
// record a visit to the vertex
if (graph->vertices.find(current_pos) == graph->vertices.end())
graph->vertices.insert(vertex); // preferred spelling type comes first
else
else {
// graph->vertices[current_pos] = std::min(vertex.second, graph->vertices[current_pos]);
continue; // discard worse spelling types
}

if (current_pos > farthest)
farthest = current_pos;
DLOG(INFO) << "current_pos: " << current_pos;

// see where we can go by advancing a syllable
vector<Prism::Match> matches;
prism.CommonPrefixSearch(input.substr(current_pos), &matches);
set<SyllableId> match_set;
auto current_input = input.substr(current_pos);
prism.CommonPrefixSearch(current_input, &matches);
for (auto &m : matches) {
match_set.insert(m.value);
}
if (enable_correction_) {
Corrections corrections;
corrector_->ToleranceSearch(prism, current_input, &corrections, 5);
for (const auto &m : corrections) {
for (auto accessor = prism.QuerySpelling(m.first); !accessor.exhausted(); accessor.Next()) {
if (accessor.properties().type == kNormalSpelling) {
matches.push_back({ m.first, m.second.length });
break;
}
}
}
}

if (!matches.empty()) {
auto& end_vertices(graph->edges[current_pos]);
for (const auto& m : matches) {
Expand All @@ -56,15 +79,15 @@ int Syllabifier::BuildSyllableGraph(const string &input,
++end_pos;
DLOG(INFO) << "end_pos: " << end_pos;
bool matches_input = (current_pos == 0 && end_pos == input.length());
SpellingMap spellings;
SpellingMap& spellings(end_vertices[end_pos]);
SpellingType end_vertex_type = kInvalidSpelling;
// when spelling algebra is enabled,
// a spelling evaluates to a set of syllables;
// otherwise, it resembles exactly the syllable itself.
SpellingAccessor accessor(prism.QuerySpelling(m.value));
while (!accessor.exhausted()) {
SyllableId syllable_id = accessor.syllable_id();
SpellingProperties props = accessor.properties();
EdgeProperties props(accessor.properties());
if (strict_spelling_ &&
matches_input &&
props.type != kNormalSpelling) {
Expand All @@ -74,20 +97,29 @@ int Syllabifier::BuildSyllableGraph(const string &input,
props.end_pos = end_pos;
// add a syllable with properties to the edge's
// spelling-to-syllable map
spellings.insert({syllable_id, props});
if (match_set.find(m.value) == match_set.end()) {
props.is_correction = true;
props.credibility = 0.01;
lotem marked this conversation as resolved.
Show resolved Hide resolved
}
auto it = spellings.find(syllable_id);
if (it == spellings.end()) {
spellings.insert({syllable_id, props});
} else {
it->second.type = std::min(it->second.type, props.type);
}
// let end_vertex_type be the best (smaller) type of spelling
// that ends at the vertex
if (end_vertex_type > props.type) {
if (end_vertex_type > props.type && !props.is_correction) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if a position can only be reached via correction?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be kept as normal spelling.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I see. The end vertex of a correction edge should be marked as normal spelling even if a worse-typed edge overlaps it.

Copy link
Member Author

@nameoverflow nameoverflow Nov 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And should the worse overlapping edge be deleted?
OK. It should keep the same behavior as when there is no correction.

end_vertex_type = props.type;
}
}
accessor.Next();
}
if (spellings.empty()) {
DLOG(INFO) << "not spelt.";
end_vertices.erase(end_pos);
continue;
}
end_vertices[end_pos].swap(spellings);
// find the best common type in a path up to the end vertex
// eg. pinyin "shurfa" has vertex type kNormalSpelling at position 3,
// kAbbreviation at position 4 and kAbbreviation at position 6
Expand Down Expand Up @@ -121,6 +153,10 @@ int Syllabifier::BuildSyllableGraph(const string &input,
// when there is a path of more favored type
SpellingType edge_type = kInvalidSpelling;
for (auto k = j->second.begin(); k != j->second.end(); ) {
if (k->second.is_correction) {
++k;
continue; // Don't care correction edges
lotem marked this conversation as resolved.
Show resolved Hide resolved
}
if (k->second.type > last_type) {
j->second.erase(k++);
}
Expand Down Expand Up @@ -245,4 +281,9 @@ void Syllabifier::Transpose(SyllableGraph* graph) {
}
}

// Installs |corrector| as the spelling corrector used by this syllabifier.
//
// Correction is switched on only when a non-null corrector is supplied.
// BuildSyllableGraph dereferences corrector_ whenever enable_correction_ is
// set, so the flag is derived from the stored pointer rather than set
// unconditionally; passing a null corrector therefore disables correction
// instead of leaving a flag that would lead to a null dereference.
void Syllabifier::EnableCorrection(an<Corrector> corrector) {
  corrector_ = std::move(corrector);
  enable_correction_ = static_cast<bool>(corrector_);
}

} // namespace rime
14 changes: 12 additions & 2 deletions src/rime/algo/syllabifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,22 @@
namespace rime {

class Prism;
class Corrector;

using SyllableId = int32_t;

using SpellingMap = map<SyllableId, SpellingProperties>;
struct EdgeProperties : SpellingProperties {
lotem marked this conversation as resolved.
Show resolved Hide resolved
EdgeProperties(SpellingProperties sup): SpellingProperties(sup) {};
EdgeProperties() = default;
bool is_correction = false;
};

using SpellingMap = map<SyllableId, EdgeProperties>;
using VertexMap = map<size_t, SpellingType>;
using EndVertexMap = map<size_t, SpellingMap>;
using EdgeMap = map<size_t, EndVertexMap>;

using SpellingPropertiesList = vector<const SpellingProperties*>;
using SpellingPropertiesList = vector<const EdgeProperties*>;
using SpellingIndex = map<SyllableId, SpellingPropertiesList>;
using SpellingIndices = map<size_t, SpellingIndex>;

Expand All @@ -49,6 +56,7 @@ class Syllabifier {
RIME_API int BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph);
RIME_API void EnableCorrection(an<Corrector> corrector);

protected:
void CheckOverlappedSpellings(SyllableGraph *graph,
Expand All @@ -58,6 +66,8 @@ class Syllabifier {
string delimiters_;
bool enable_completion_ = false;
bool strict_spelling_ = false;
an<Corrector> corrector_ = nullptr;
bool enable_correction_ = false;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this can be expressed by bool(corrector_)

};

} // namespace rime
Expand Down
2 changes: 2 additions & 0 deletions src/rime/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/optional.hpp>
#define BOOST_BIND_NO_PLACEHOLDERS
#ifdef BOOST_SIGNALS2
#include <boost/signals2/connection.hpp>
Expand Down Expand Up @@ -47,6 +48,7 @@ using std::pair;
using std::set;
using std::string;
using std::vector;
using boost::optional;

template <class Key, class T>
using hash_map = std::unordered_map<Key, T>;
Expand Down
Loading