Skip to content

Commit

Permalink
All: Resolves #4613: Improve search with Asian scripts (#5018)
Browse files Browse the repository at this point in the history
  • Loading branch information
mablin7 authored Jun 7, 2021
1 parent 824afd4 commit 62a371b
Show file tree
Hide file tree
Showing 5 changed files with 920 additions and 883 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,12 @@ For more information see [Plugins](https://github.com/laurent22/joplin/blob/dev/

Joplin implements the SQLite Full Text Search (FTS4) extension. It means the content of all the notes is indexed in real time and search queries return results very fast. Both [Simple FTS Queries](https://www.sqlite.org/fts3.html#simple_fts_queries) and [Full-Text Index Queries](https://www.sqlite.org/fts3.html#full_text_index_queries) are supported. See below for the list of supported queries:

One caveat of SQLite FTS is that it does not support languages which do not use Latin word boundaries (spaces, tabs, punctuation). To solve this issue, Joplin has a custom search mode that does not use FTS but still has all of its features (multi-term search, filters, etc.). One of its drawbacks is that it can get slow on larger note collections. Also, the sorting of the results will be less accurate, as the ranking algorithm (BM25) is, for now, only implemented for FTS. Finally, in this mode there are no restrictions on using the `*` wildcard (`swim*`, `*swim` and `ast*rix` all work). This search mode is currently enabled if one of the following languages is detected:
- Chinese
- Japanese
- Korean
- Thai

## Supported queries

Search type | Description | Example
Expand Down
27 changes: 4 additions & 23 deletions packages/lib/services/searchengine/SearchEngine.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('测试')).length).toBe(1);
expect((await engine.search('测试'))[0].fields).toEqual(['body']);
expect((await engine.search('测试*'))[0].fields).toEqual(['body']);
expect((await engine.search('any:1 type:todo 测试')).length).toBe(1);
}));

it('should support queries with Japanese characters', (async () => {
Expand All @@ -398,7 +399,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('できません')).length).toBe(1);
expect((await engine.search('できません*'))[0].fields.sort()).toEqual(['body', 'title']); // usually assume that keyword was matched in body
expect((await engine.search('テスト'))[0].fields.sort()).toEqual(['body']);

expect((await engine.search('any:1 type:todo テスト')).length).toBe(1);
}));

it('should support queries with Korean characters', (async () => {
Expand All @@ -409,6 +410,7 @@ describe('services_SearchEngine', function() {

expect((await engine.search('이것은')).length).toBe(1);
expect((await engine.search('말')).length).toBe(1);
expect((await engine.search('any:1 type:todo 말')).length).toBe(1);
}));

it('should support queries with Thai characters', (async () => {
Expand All @@ -419,28 +421,7 @@ describe('services_SearchEngine', function() {

expect((await engine.search('นี่คือค')).length).toBe(1);
expect((await engine.search('ไทย')).length).toBe(1);
}));

it('should support field restricted queries with Chinese characters', (async () => {
let rows;
const n1 = await Note.save({ title: '你好', body: '我是法国人' });

await engine.syncTables();

expect((await engine.search('title:你好*')).length).toBe(1);
expect((await engine.search('title:你好*'))[0].fields).toEqual(['title']);
expect((await engine.search('body:法国人')).length).toBe(1);
expect((await engine.search('body:法国人'))[0].fields).toEqual(['body']);
expect((await engine.search('body:你好')).length).toBe(0);
expect((await engine.search('title:你好 body:法国人')).length).toBe(1);
expect((await engine.search('title:你好 body:法国人'))[0].fields.sort()).toEqual(['body', 'title']);
expect((await engine.search('title:你好 body:bla')).length).toBe(0);
expect((await engine.search('title:你好 我是')).length).toBe(1);
expect((await engine.search('title:你好 我是'))[0].fields.sort()).toEqual(['body', 'title']);
expect((await engine.search('title:bla 我是')).length).toBe(0);

// For non-alpha char, only the first field is looked at, the following ones are ignored
// expect((await engine.search('title:你好 title:hello')).length).toBe(1);
expect((await engine.search('any:1 type:todo ไทย')).length).toBe(1);
}));

it('should parse normal query strings', (async () => {
Expand Down
15 changes: 11 additions & 4 deletions packages/lib/services/searchengine/SearchEngine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export default class SearchEngine {
public static relevantFields = 'id, title, body, user_created_time, user_updated_time, is_todo, todo_completed, todo_due, parent_id, latitude, longitude, altitude, source_url';
public static SEARCH_TYPE_AUTO = 'auto';
public static SEARCH_TYPE_BASIC = 'basic';
public static SEARCH_TYPE_NONLATIN_SCRIPT = 'nonlatin';
public static SEARCH_TYPE_FTS = 'fts';

public dispatch: Function = (_o: any) => {};
Expand Down Expand Up @@ -533,6 +534,7 @@ export default class SearchEngine {

determineSearchType_(query: string, preferredSearchType: any) {
if (preferredSearchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
if (preferredSearchType === SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT) return SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT;

// If preferredSearchType is "fts" we auto-detect anyway
// because it's not always supported.
Expand All @@ -547,10 +549,15 @@ export default class SearchEngine {
const textQuery = allTerms.filter(x => x.name === 'text' || x.name == 'title' || x.name == 'body').map(x => x.value).join(' ');
const st = scriptType(textQuery);

if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
if (!Setting.value('db.ftsEnabled')) {
return SearchEngine.SEARCH_TYPE_BASIC;
}

// Non-alphabetical languages aren't supported by SQLite FTS (except with extensions which are not available on all platforms)
if (['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
return SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT;
}

return SearchEngine.SEARCH_TYPE_FTS;
}

Expand All @@ -565,7 +572,6 @@ export default class SearchEngine {
const parsedQuery = await this.parseQuery(searchString);

if (searchType === SearchEngine.SEARCH_TYPE_BASIC) {
// Non-alphabetical languages aren't support by SQLite FTS (except with extensions which are not available in all platforms)
searchString = this.normalizeText_(searchString);
const rows = await this.basicSearch(searchString);

Expand All @@ -579,10 +585,11 @@ export default class SearchEngine {
// when searching.
// https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856

const useFts = searchType === SearchEngine.SEARCH_TYPE_FTS;
try {
const { query, params } = queryBuilder(parsedQuery.allTerms);
const { query, params } = queryBuilder(parsedQuery.allTerms, useFts);
const rows = await this.db().selectAll(query, params);
this.processResults_(rows, parsedQuery);
this.processResults_(rows, parsedQuery, !useFts);
return rows;
} catch (error) {
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
Expand Down
Loading

0 comments on commit 62a371b

Please sign in to comment.