Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

All: Resolves #4613: Improve search with Asian scripts #5018

Merged
merged 11 commits into from
Jun 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,12 @@ For more information see [Plugins](https://github.com/laurent22/joplin/blob/dev/

Joplin implements the SQLite Full Text Search (FTS4) extension. It means the content of all the notes is indexed in real time and search queries return results very fast. Both [Simple FTS Queries](https://www.sqlite.org/fts3.html#simple_fts_queries) and [Full-Text Index Queries](https://www.sqlite.org/fts3.html#full_text_index_queries) are supported. See below for the list of supported queries:

One caveat of SQLite FTS is that it does not support languages which do not use Latin word boundaries (spaces, tabs, punctuation). To solve this issue, Joplin has a custom search mode that does not use FTS but still has all of its features (multi-term search, filters, etc.). One of its drawbacks is that it can get slow on larger note collections. Also, the sorting of the results will be less accurate, as the ranking algorithm (BM25) is, for now, only implemented for FTS. Finally, in this mode there are no restrictions on using the `*` wildcard (`swim*`, `*swim` and `ast*rix` all work). This search mode is currently enabled if one of the following languages is detected:
- Chinese
- Japanese
- Korean
- Thai

## Supported queries

Search type | Description | Example
Expand Down
27 changes: 4 additions & 23 deletions packages/lib/services/searchengine/SearchEngine.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('测试')).length).toBe(1);
expect((await engine.search('测试'))[0].fields).toEqual(['body']);
expect((await engine.search('测试*'))[0].fields).toEqual(['body']);
expect((await engine.search('any:1 type:todo 测试')).length).toBe(1);
}));

it('should support queries with Japanese characters', (async () => {
Expand All @@ -398,7 +399,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('できません')).length).toBe(1);
expect((await engine.search('できません*'))[0].fields.sort()).toEqual(['body', 'title']); // usually assume that keyword was matched in body
expect((await engine.search('テスト'))[0].fields.sort()).toEqual(['body']);

expect((await engine.search('any:1 type:todo テスト')).length).toBe(1);
}));

it('should support queries with Korean characters', (async () => {
Expand All @@ -409,6 +410,7 @@ describe('services_SearchEngine', function() {

expect((await engine.search('이것은')).length).toBe(1);
expect((await engine.search('말')).length).toBe(1);
expect((await engine.search('any:1 type:todo 말')).length).toBe(1);
}));

it('should support queries with Thai characters', (async () => {
Expand All @@ -419,28 +421,7 @@ describe('services_SearchEngine', function() {

expect((await engine.search('นี่คือค')).length).toBe(1);
expect((await engine.search('ไทย')).length).toBe(1);
}));

it('should support field restricted queries with Chinese characters', (async () => {
let rows;
const n1 = await Note.save({ title: '你好', body: '我是法国人' });

await engine.syncTables();

expect((await engine.search('title:你好*')).length).toBe(1);
expect((await engine.search('title:你好*'))[0].fields).toEqual(['title']);
expect((await engine.search('body:法国人')).length).toBe(1);
expect((await engine.search('body:法国人'))[0].fields).toEqual(['body']);
expect((await engine.search('body:你好')).length).toBe(0);
expect((await engine.search('title:你好 body:法国人')).length).toBe(1);
expect((await engine.search('title:你好 body:法国人'))[0].fields.sort()).toEqual(['body', 'title']);
expect((await engine.search('title:你好 body:bla')).length).toBe(0);
expect((await engine.search('title:你好 我是')).length).toBe(1);
expect((await engine.search('title:你好 我是'))[0].fields.sort()).toEqual(['body', 'title']);
expect((await engine.search('title:bla 我是')).length).toBe(0);

// For non-alpha char, only the first field is looked at, the following ones are ignored
// expect((await engine.search('title:你好 title:hello')).length).toBe(1);
expect((await engine.search('any:1 type:todo ไทย')).length).toBe(1);
}));

it('should parse normal query strings', (async () => {
Expand Down
15 changes: 11 additions & 4 deletions packages/lib/services/searchengine/SearchEngine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export default class SearchEngine {
public static relevantFields = 'id, title, body, user_created_time, user_updated_time, is_todo, todo_completed, todo_due, parent_id, latitude, longitude, altitude, source_url';
public static SEARCH_TYPE_AUTO = 'auto';
public static SEARCH_TYPE_BASIC = 'basic';
public static SEARCH_TYPE_NONLATIN_SCRIPT = 'nonlatin';
public static SEARCH_TYPE_FTS = 'fts';

public dispatch: Function = (_o: any) => {};
Expand Down Expand Up @@ -533,6 +534,7 @@ export default class SearchEngine {

determineSearchType_(query: string, preferredSearchType: any) {
if (preferredSearchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
if (preferredSearchType === SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT) return SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT;

// If preferredSearchType is "fts" we auto-detect anyway
// because it's not always supported.
Expand All @@ -547,10 +549,15 @@ export default class SearchEngine {
const textQuery = allTerms.filter(x => x.name === 'text' || x.name == 'title' || x.name == 'body').map(x => x.value).join(' ');
const st = scriptType(textQuery);

if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
if (!Setting.value('db.ftsEnabled')) {
return SearchEngine.SEARCH_TYPE_BASIC;
}

// Non-alphabetical languages aren't supported by SQLite FTS (except with extensions which are not available on all platforms)
if (['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
return SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT;
}

return SearchEngine.SEARCH_TYPE_FTS;
}

Expand All @@ -565,7 +572,6 @@ export default class SearchEngine {
const parsedQuery = await this.parseQuery(searchString);

if (searchType === SearchEngine.SEARCH_TYPE_BASIC) {
// Non-alphabetical languages aren't supported by SQLite FTS (except with extensions which are not available on all platforms)
searchString = this.normalizeText_(searchString);
const rows = await this.basicSearch(searchString);

Expand All @@ -579,10 +585,11 @@ export default class SearchEngine {
// when searching.
// https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856

const useFts = searchType === SearchEngine.SEARCH_TYPE_FTS;
try {
const { query, params } = queryBuilder(parsedQuery.allTerms);
const { query, params } = queryBuilder(parsedQuery.allTerms, useFts);
const rows = await this.db().selectAll(query, params);
this.processResults_(rows, parsedQuery);
this.processResults_(rows, parsedQuery, !useFts);
return rows;
} catch (error) {
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
Expand Down
Loading