-
-
Notifications
You must be signed in to change notification settings - Fork 5.1k
/
Copy pathSearchEngine.ts
611 lines (494 loc) · 19.8 KB
/
SearchEngine.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
import Logger from '../../Logger';
import ItemChange from '../../models/ItemChange';
import Setting from '../../models/Setting';
import Note from '../../models/Note';
import BaseModel from '../../BaseModel';
import ItemChangeUtils from '../ItemChangeUtils';
import shim from '../../shim';
import filterParser from './filterParser';
import queryBuilder from './queryBuilder';
import { ItemChangeEntity, NoteEntity } from '../database/types';
const { sprintf } = require('sprintf-js');
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');
export default class SearchEngine {
public static instance_: SearchEngine = null;
public static relevantFields = 'id, title, body, user_created_time, user_updated_time, is_todo, todo_completed, parent_id, latitude, longitude, altitude, source_url';
public static SEARCH_TYPE_AUTO = 'auto';
public static SEARCH_TYPE_BASIC = 'basic';
public static SEARCH_TYPE_FTS = 'fts';
public dispatch: Function = (_o: any) => {};
private logger_ = new Logger();
private db_: any = null;
private isIndexing_ = false;
private syncCalls_: any[] = [];
private scheduleSyncTablesIID_: any;
static instance() {
if (SearchEngine.instance_) return SearchEngine.instance_;
SearchEngine.instance_ = new SearchEngine();
return SearchEngine.instance_;
}
setLogger(logger: Logger) {
this.logger_ = logger;
}
logger() {
return this.logger_;
}
setDb(db: any) {
this.db_ = db;
}
db() {
return this.db_;
}
noteById_(notes: NoteEntity[], noteId: string) {
for (let i = 0; i < notes.length; i++) {
if (notes[i].id === noteId) return notes[i];
}
// The note may have been deleted since the change was recorded. For example in this case:
// - Note created (Some Change object is recorded)
// - Note is deleted
// - ResourceService indexer runs.
// In that case, there will be a change for the note, but the note will be gone.
return null;
}
async rebuildIndex_() {
let noteIds: string[] = await this.db().selectAll('SELECT id FROM notes WHERE is_conflict = 0 AND encryption_applied = 0');
noteIds = noteIds.map((n: any) => n.id);
const lastChangeId = await ItemChange.lastChangeId();
// First delete content of note_normalized, in case the previous initial indexing failed
await this.db().exec('DELETE FROM notes_normalized');
while (noteIds.length) {
const currentIds = noteIds.splice(0, 100);
const notes = await Note.modelSelectAll(`
SELECT ${SearchEngine.relevantFields}
FROM notes
WHERE id IN ("${currentIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`);
const queries = [];
for (let i = 0; i < notes.length; i++) {
const note = notes[i];
const n = this.normalizeNote_(note);
queries.push({ sql: `
INSERT INTO notes_normalized(${SearchEngine.relevantFields})
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
params: [n.id, n.title, n.body, n.user_created_time, n.user_updated_time, n.is_todo, n.todo_completed, n.parent_id, n.latitude, n.longitude, n.altitude, n.source_url] }
);
}
await this.db().transactionExecBatch(queries);
}
Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
}
scheduleSyncTables() {
if (this.scheduleSyncTablesIID_) return;
this.scheduleSyncTablesIID_ = shim.setTimeout(async () => {
try {
await this.syncTables();
} catch (error) {
this.logger().error('SearchEngine::scheduleSyncTables: Error while syncing tables:', error);
}
this.scheduleSyncTablesIID_ = null;
}, 10000);
}
async rebuildIndex() {
Setting.setValue('searchEngine.lastProcessedChangeId', 0);
Setting.setValue('searchEngine.initialIndexingDone', false);
return this.syncTables();
}
async syncTables_() {
if (this.isIndexing_) return;
this.isIndexing_ = true;
this.logger().info('SearchEngine: Updating FTS table...');
await ItemChange.waitForAllSaved();
if (!Setting.value('searchEngine.initialIndexingDone')) {
await this.rebuildIndex_();
Setting.setValue('searchEngine.initialIndexingDone', true);
this.isIndexing_ = false;
return;
}
const startTime = Date.now();
const report = {
inserted: 0,
deleted: 0,
};
let lastChangeId = Setting.value('searchEngine.lastProcessedChangeId');
try {
while (true) {
const changes: ItemChangeEntity[] = await ItemChange.modelSelectAll(
`
SELECT id, item_id, type
FROM item_changes
WHERE item_type = ?
AND id > ?
ORDER BY id ASC
LIMIT 10
`,
[BaseModel.TYPE_NOTE, lastChangeId]
);
if (!changes.length) break;
const queries = [];
const noteIds = changes.map(a => a.item_id);
const notes = await Note.modelSelectAll(`
SELECT ${SearchEngine.relevantFields}
FROM notes WHERE id IN ("${noteIds.join('","')}") AND is_conflict = 0 AND encryption_applied = 0`
);
for (let i = 0; i < changes.length; i++) {
const change = changes[i];
if (change.type === ItemChange.TYPE_CREATE || change.type === ItemChange.TYPE_UPDATE) {
queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
const note = this.noteById_(notes, change.item_id);
if (note) {
const n = this.normalizeNote_(note);
queries.push({ sql: `
INSERT INTO notes_normalized(${SearchEngine.relevantFields})
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
params: [change.item_id, n.title, n.body, n.user_created_time, n.user_updated_time, n.is_todo, n.todo_completed, n.parent_id, n.latitude, n.longitude, n.altitude, n.source_url] });
report.inserted++;
}
} else if (change.type === ItemChange.TYPE_DELETE) {
queries.push({ sql: 'DELETE FROM notes_normalized WHERE id = ?', params: [change.item_id] });
report.deleted++;
} else {
throw new Error(`Invalid change type: ${change.type}`);
}
lastChangeId = change.id;
}
await this.db().transactionExecBatch(queries);
Setting.setValue('searchEngine.lastProcessedChangeId', lastChangeId);
await Setting.saveAll();
}
} catch (error) {
this.logger().error('SearchEngine: Error while processing changes:', error);
}
await ItemChangeUtils.deleteProcessedChanges();
this.logger().info(sprintf('SearchEngine: Updated FTS table in %dms. Inserted: %d. Deleted: %d', Date.now() - startTime, report.inserted, report.deleted));
this.isIndexing_ = false;
}
async syncTables() {
this.syncCalls_.push(true);
try {
await this.syncTables_();
} finally {
this.syncCalls_.pop();
}
}
async countRows() {
const sql = 'SELECT count(*) as total FROM notes_fts';
const row = await this.db().selectOne(sql);
return row && row['total'] ? row['total'] : 0;
}
fieldNamesFromOffsets_(offsets: any[]) {
const notesNormalizedFieldNames = this.db().tableFieldNames('notes_normalized');
const occurenceCount = Math.floor(offsets.length / 4);
const output: string[] = [];
for (let i = 0; i < occurenceCount; i++) {
const colIndex = offsets[i * 4];
const fieldName = notesNormalizedFieldNames[colIndex];
if (!output.includes(fieldName)) output.push(fieldName);
}
return output;
}
calculateWeight_(offsets: any[], termCount: number) {
// Offset doc: https://www.sqlite.org/fts3.html#offsets
// - If there's only one term in the query string, the content with the most matches goes on top
// - If there are multiple terms, the result with the most occurences that are closest to each others go on top.
// eg. if query is "abcd efgh", "abcd efgh" will go before "abcd XX efgh".
const occurenceCount = Math.floor(offsets.length / 4);
if (termCount === 1) return occurenceCount;
let spread = 0;
let previousDist = null;
for (let i = 0; i < occurenceCount; i++) {
const dist = offsets[i * 4 + 2];
if (previousDist !== null) {
const delta = dist - previousDist;
spread += delta;
}
previousDist = dist;
}
// Divide the number of occurences by the spread so even if a note has many times the searched terms
// but these terms are very spread appart, they'll be given a lower weight than a note that has the
// terms once or twice but just next to each others.
return occurenceCount / spread;
}
calculateWeightBM25_(rows: any[]) {
// https://www.sqlite.org/fts3.html#matchinfo
// pcnalx are the arguments passed to matchinfo
// p - The number of matchable phrases in the query.
// c - The number of user defined columns in the FTS table
// n - The number of rows in the FTS4 table.
// a - avg number of tokens in the text values stored in the column.
// l - For each column, the length of the value stored in the current
// row of the FTS4 table, in tokens.
// x - For each distinct combination of a phrase and table column, the
// following three values:
// hits_this_row
// hits_all_rows
// docs_with_hits
if (rows.length === 0) return;
const matchInfo = rows.map(row => new Uint32Array(row.matchinfo.buffer));
const generalInfo = matchInfo[0];
const K1 = 1.2;
const B = 0.75;
const TITLE_COLUMN = 1;
const BODY_COLUMN = 2;
const columns = [TITLE_COLUMN, BODY_COLUMN];
// const NUM_COLS = 12;
const numPhrases = generalInfo[0]; // p
const numColumns = generalInfo[1]; // c
const numRows = generalInfo[2]; // n
const avgTitleTokens = generalInfo[4]; // a
const avgBodyTokens = generalInfo[5];
const avgTokens = [null, avgTitleTokens, avgBodyTokens]; // we only need cols 1 and 2
const numTitleTokens = matchInfo.map(m => m[4 + numColumns]); // l
const numBodyTokens = matchInfo.map(m => m[5 + numColumns]);
const numTokens = [null, numTitleTokens, numBodyTokens];
const X = matchInfo.map(m => m.slice(27)); // x
const hitsThisRow = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 0];
// const hitsAllRows = (array, c, p) => array[3 * (c + p*NUM_COLS) + 1];
const docsWithHits = (array: any, c: number, p: number) => array[3 * (c + p * numColumns) + 2];
// if a term occurs in over half the documents in the collection
// then this model gives a negative term weight, which is presumably undesirable.
// But, assuming the use of a stop list, this normally doesn't happen,
// and the value for each summand can be given a floor of 0.
const IDF = (n: number, N: number) => Math.max(Math.log((N - n + 0.5) / (n + 0.5)), 0);
// https://en.wikipedia.org/wiki/Okapi_BM25
const BM25 = (idf: any, freq: any, numTokens: number, avgTokens: any) => {
if (avgTokens === 0) {
return 0; // To prevent division by zero
}
return idf * (freq * (K1 + 1)) / (freq + K1 * (1 - B + B * (numTokens / avgTokens)));
};
const msSinceEpoch = Math.round(new Date().getTime());
const msPerDay = 86400000;
const weightForDaysSinceLastUpdate = (row: any) => {
// BM25 weights typically range 0-10, and last updated date should weight similarly, though prioritizing recency logarithmically.
// An alpha of 200 ensures matches in the last week will show up front (11.59) and often so for matches within 2 weeks (5.99),
// but is much less of a factor at 30 days (2.84) or very little after 90 days (0.95), focusing mostly on content at that point.
if (!row.user_updated_time) {
return 0;
}
const alpha = 200;
const daysSinceLastUpdate = (msSinceEpoch - row.user_updated_time) / msPerDay;
return alpha * Math.log(1 + 1 / Math.max(daysSinceLastUpdate, 0.5));
};
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
row.weight = 0;
for (let j = 0; j < numPhrases; j++) {
columns.forEach(column => {
const rowsWithHits = docsWithHits(X[i], column, j);
const frequencyHits = hitsThisRow(X[i], column, j);
const idf = IDF(rowsWithHits, numRows);
row.weight += BM25(idf, frequencyHits, numTokens[column][i], avgTokens[column]);
});
}
row.weight += weightForDaysSinceLastUpdate(row);
}
}
processBasicSearchResults_(rows: any[], parsedQuery: any) {
const valueRegexs = parsedQuery.keys.includes('_') ? parsedQuery.terms['_'].map((term: any) => term.valueRegex || term.value) : [];
const isTitleSearch = parsedQuery.keys.includes('title');
const isOnlyTitle = parsedQuery.keys.length === 1 && isTitleSearch;
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const testTitle = (regex: any) => new RegExp(regex, 'ig').test(row.title);
const matchedFields: any = {
title: isTitleSearch || valueRegexs.some(testTitle),
body: !isOnlyTitle,
};
row.fields = Object.keys(matchedFields).filter((key: any) => matchedFields[key]);
row.weight = 0;
row.fuzziness = 0;
}
}
processResults_(rows: any[], parsedQuery: any, isBasicSearchResults = false) {
if (isBasicSearchResults) {
this.processBasicSearchResults_(rows, parsedQuery);
} else {
this.calculateWeightBM25_(rows);
for (let i = 0; i < rows.length; i++) {
const row = rows[i];
const offsets = row.offsets.split(' ').map((o: any) => Number(o));
row.fields = this.fieldNamesFromOffsets_(offsets);
}
}
rows.sort((a, b) => {
if (a.fields.includes('title') && !b.fields.includes('title')) return -1;
if (!a.fields.includes('title') && b.fields.includes('title')) return +1;
if (a.weight < b.weight) return +1;
if (a.weight > b.weight) return -1;
if (a.is_todo && a.todo_completed) return +1;
if (b.is_todo && b.todo_completed) return -1;
if (a.user_updated_time < b.user_updated_time) return +1;
if (a.user_updated_time > b.user_updated_time) return -1;
return 0;
});
}
// https://stackoverflow.com/a/13818704/561309
queryTermToRegex(term: any) {
while (term.length && term.indexOf('*') === 0) {
term = term.substr(1);
}
let regexString = pregQuote(term);
if (regexString[regexString.length - 1] === '*') {
regexString = `${regexString.substr(0, regexString.length - 2)}[^${pregQuote(' \t\n\r,.,+-*?!={}<>|:"\'()[]')}]` + '*?';
// regexString = regexString.substr(0, regexString.length - 2) + '.*?';
}
return regexString;
}
async parseQuery(query: string) {
const trimQuotes = (str: string) => str.startsWith('"') ? str.substr(1, str.length - 2) : str;
let allTerms: any[] = [];
try {
allTerms = filterParser(query);
} catch (error) {
console.warn(error);
}
const textTerms = allTerms.filter(x => x.name === 'text' && !x.negated).map(x => trimQuotes(x.value));
const titleTerms = allTerms.filter(x => x.name === 'title' && !x.negated).map(x => trimQuotes(x.value));
const bodyTerms = allTerms.filter(x => x.name === 'body' && !x.negated).map(x => trimQuotes(x.value));
const terms: any = { _: textTerms, 'title': titleTerms, 'body': bodyTerms };
// Filter terms:
// - Convert wildcards to regex
// - Remove columns with no results
// - Add count of terms
let termCount = 0;
const keys = [];
for (const col in terms) {
if (!terms.hasOwnProperty(col)) continue;
if (!terms[col].length) {
delete terms[col];
continue;
}
for (let i = terms[col].length - 1; i >= 0; i--) {
const term = terms[col][i];
// SQlLite FTS doesn't allow "*" queries and neither shall we
if (term === '*') {
terms[col].splice(i, 1);
continue;
}
if (term.indexOf('*') >= 0) {
terms[col][i] = { type: 'regex', value: term, scriptType: scriptType(term), valueRegex: this.queryTermToRegex(term) };
} else {
terms[col][i] = { type: 'text', value: term, scriptType: scriptType(term) };
}
}
termCount += terms[col].length;
keys.push(col);
}
//
// The object "allTerms" is used for query construction purposes (this contains all the filter terms)
// Since this is used for the FTS match query, we need to normalize text, title and body terms.
// Note, we're not normalizing terms like tag because these are matched using SQL LIKE operator and so we must preserve their diacritics.
//
// The object "terms" only include text, title, body terms and is used for highlighting.
// By not normalizing the text, title, body in "terms", highlighting still works correctly for words with diacritics.
//
allTerms = allTerms.map(x => {
if (x.name === 'text' || x.name === 'title' || x.name === 'body') {
return Object.assign(x, { value: this.normalizeText_(x.value) });
}
return x;
});
return {
termCount: termCount,
keys: keys,
terms: terms, // text terms
allTerms: allTerms,
any: !!allTerms.find(term => term.name === 'any'),
};
}
allParsedQueryTerms(parsedQuery: any) {
if (!parsedQuery || !parsedQuery.termCount) return [];
let output: any[] = [];
for (const col in parsedQuery.terms) {
if (!parsedQuery.terms.hasOwnProperty(col)) continue;
output = output.concat(parsedQuery.terms[col]);
}
return output;
}
normalizeText_(text: string) {
const normalizedText = text.normalize ? text.normalize() : text;
return removeDiacritics(normalizedText.toLowerCase());
}
normalizeNote_(note: NoteEntity) {
const n = Object.assign({}, note);
n.title = this.normalizeText_(n.title);
n.body = this.normalizeText_(n.body);
return n;
}
async basicSearch(query: string) {
query = query.replace(/\*/, '');
const parsedQuery = await this.parseQuery(query);
const searchOptions: any = {};
for (const key of parsedQuery.keys) {
if (parsedQuery.terms[key].length === 0) continue;
const term = parsedQuery.terms[key].map((x: any) => x.value).join(' ');
if (key === '_') searchOptions.anywherePattern = `*${term}*`;
if (key === 'title') searchOptions.titlePattern = `*${term}*`;
if (key === 'body') searchOptions.bodyPattern = `*${term}*`;
}
return Note.previews(null, searchOptions);
}
determineSearchType_(query: string, preferredSearchType: any) {
if (preferredSearchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
// If preferredSearchType is "fts" we auto-detect anyway
// because it's not always supported.
let allTerms: any[] = [];
try {
allTerms = filterParser(query);
} catch (error) {
console.warn(error);
}
const textQuery = allTerms.filter(x => x.name === 'text' || x.name == 'title' || x.name == 'body').map(x => x.value).join(' ');
const st = scriptType(textQuery);
if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
return SearchEngine.SEARCH_TYPE_BASIC;
}
return SearchEngine.SEARCH_TYPE_FTS;
}
async search(searchString: string, options: any = null) {
if (!searchString) return [];
options = Object.assign({}, {
searchType: SearchEngine.SEARCH_TYPE_AUTO,
}, options);
const searchType = this.determineSearchType_(searchString, options.searchType);
const parsedQuery = await this.parseQuery(searchString);
if (searchType === SearchEngine.SEARCH_TYPE_BASIC) {
// Non-alphabetical languages aren't support by SQLite FTS (except with extensions which are not available in all platforms)
searchString = this.normalizeText_(searchString);
const rows = await this.basicSearch(searchString);
this.processResults_(rows, parsedQuery, true);
return rows;
} else {
// SEARCH_TYPE_FTS
// FTS will ignore all special characters, like "-" in the index. So if
// we search for "this-phrase" it won't find it because it will only
// see "this phrase" in the index. Because of this, we remove the dashes
// when searching.
// https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856
try {
const { query, params } = queryBuilder(parsedQuery.allTerms);
const rows = await this.db().selectAll(query, params);
this.processResults_(rows, parsedQuery);
return rows;
} catch (error) {
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);
return [];
}
}
}
async destroy() {
if (this.scheduleSyncTablesIID_) {
shim.clearTimeout(this.scheduleSyncTablesIID_);
this.scheduleSyncTablesIID_ = null;
}
SearchEngine.instance_ = null;
return new Promise((resolve) => {
const iid = shim.setInterval(() => {
if (!this.syncCalls_.length) {
shim.clearInterval(iid);
SearchEngine.instance_ = null;
resolve(null);
}
}, 100);
});
}
}