From c30a528d634a6637dee5359e306ad289c497d26d Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sat, 1 Sep 2018 01:28:19 +0200 Subject: [PATCH] Add initial support for "Whole words" searching in the viewer As outlined in https://bugzilla.mozilla.org/show_bug.cgi?id=1282759 the internal Firefox name for the feature is `entireWord`, hence that name is used here as well for consistency (with "Whole words" being limited to the UI). Given existing limitations of the PDF.js search functionality, e.g. the existing problems of searching across "new lines", there are some edge-cases where "Whole words" searching will ignore (valid) results. However, considering that this is a pre-existing issue related to the way that the find controller joins text-content together, that shouldn't have to block this new feature in my opinion. *Please note:* In order to enable this feature in the `MOZCENTRAL` version, a small follow-up patch for [PdfjsChromeUtils.jsm](https://hg.mozilla.org/mozilla-central/file/tip/browser/extensions/pdfjs/content/PdfjsChromeUtils.jsm) will be required once this has landed in `mozilla-central`. 
--- l10n/en-US/viewer.properties | 1 + l10n/sv-SE/viewer.properties | 1 + web/app.js | 3 + web/firefoxcom.js | 8 ++- web/pdf_find_bar.js | 8 ++- web/pdf_find_controller.js | 40 +++++++++++-- web/pdf_find_utils.js | 107 +++++++++++++++++++++++++++++++++++ web/viewer.html | 2 + web/viewer.js | 1 + 9 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 web/pdf_find_utils.js diff --git a/l10n/en-US/viewer.properties b/l10n/en-US/viewer.properties index 3929482459c42b..85b5f80e90034c 100644 --- a/l10n/en-US/viewer.properties +++ b/l10n/en-US/viewer.properties @@ -165,6 +165,7 @@ find_next.title=Find the next occurrence of the phrase find_next_label=Next find_highlight=Highlight all find_match_case_label=Match case +find_entire_word_label=Whole words find_reached_top=Reached top of document, continued from bottom find_reached_bottom=Reached end of document, continued from top find_not_found=Phrase not found diff --git a/l10n/sv-SE/viewer.properties b/l10n/sv-SE/viewer.properties index 7e5c685af6ae4f..5f60c5332d3377 100644 --- a/l10n/sv-SE/viewer.properties +++ b/l10n/sv-SE/viewer.properties @@ -165,6 +165,7 @@ find_next.title=Hitta nästa förekomst av frasen find_next_label=Nästa find_highlight=Markera alla find_match_case_label=Matcha versal/gemen +find_entire_word_label=Hela ord find_reached_top=Nådde början av dokumentet, började från slutet find_reached_bottom=Nådde slutet på dokumentet, började från början find_not_found=Frasen hittades inte diff --git a/web/app.js b/web/app.js index f3d0635007775c..a4e8481ec18bab 100644 --- a/web/app.js +++ b/web/app.js @@ -1971,6 +1971,7 @@ function webViewerFind(evt) { query: evt.query, phraseSearch: evt.phraseSearch, caseSensitive: evt.caseSensitive, + entireWord: evt.entireWord, highlightAll: evt.highlightAll, findPrevious: evt.findPrevious, }); @@ -1981,6 +1982,7 @@ function webViewerFindFromUrlHash(evt) { query: evt.query, phraseSearch: evt.phraseSearch, caseSensitive: false, + entireWord: false, highlightAll: 
true, findPrevious: false, }); @@ -2117,6 +2119,7 @@ function webViewerKeyDown(evt) { query: findState.query, phraseSearch: findState.phraseSearch, caseSensitive: findState.caseSensitive, + entireWord: findState.entireWord, highlightAll: findState.highlightAll, findPrevious: cmd === 5 || cmd === 12, }); diff --git a/web/firefoxcom.js b/web/firefoxcom.js index 99026e9faac078..3f893368f22c8b 100644 --- a/web/firefoxcom.js +++ b/web/firefoxcom.js @@ -168,7 +168,8 @@ class MozL10n { 'find', 'findagain', 'findhighlightallchange', - 'findcasesensitivitychange' + 'findcasesensitivitychange', + 'findentirewordchange', ]; let handleEvent = function(evt) { if (!PDFViewerApplication.initialized) { @@ -180,13 +181,14 @@ class MozL10n { query: evt.detail.query, phraseSearch: true, caseSensitive: !!evt.detail.caseSensitive, + entireWord: !!evt.detail.entireWord, highlightAll: !!evt.detail.highlightAll, findPrevious: !!evt.detail.findPrevious, }); }; - for (let i = 0, len = events.length; i < len; i++) { - window.addEventListener(events[i], handleEvent); + for (let event of events) { + window.addEventListener(event, handleEvent); } })(); diff --git a/web/pdf_find_bar.js b/web/pdf_find_bar.js index cefd50017e906c..8a4d94aeb973f2 100644 --- a/web/pdf_find_bar.js +++ b/web/pdf_find_bar.js @@ -31,6 +31,7 @@ class PDFFindBar { this.findField = options.findField || null; this.highlightAll = options.highlightAllCheckbox || null; this.caseSensitive = options.caseSensitiveCheckbox || null; + this.entireWord = options.entireWordCheckbox || null; this.findMsg = options.findMsg || null; this.findResultsCount = options.findResultsCount || null; this.findStatusIcon = options.findStatusIcon || null; @@ -83,6 +84,10 @@ class PDFFindBar { this.dispatchEvent('casesensitivitychange'); }); + this.entireWord.addEventListener('click', () => { + this.dispatchEvent('entirewordchange'); + }); + this.eventBus.on('resize', this._adjustWidth.bind(this)); } @@ -95,8 +100,9 @@ class PDFFindBar { source: this, 
type, query: this.findField.value, - caseSensitive: this.caseSensitive.checked, phraseSearch: true, + caseSensitive: this.caseSensitive.checked, + entireWord: this.entireWord.checked, highlightAll: this.highlightAll.checked, findPrevious: findPrev, }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index d6810dfb1499d3..8ebf6833ce813f 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -14,6 +14,7 @@ */ import { createPromiseCapability } from 'pdfjs-lib'; +import { getCharacterType } from './pdf_find_utils'; import { getGlobalEventBus } from './dom_events'; import { scrollIntoView } from './ui_utils'; @@ -190,7 +191,30 @@ class PDFFindController { } } - _calculatePhraseMatch(query, pageIndex, pageContent) { + /** + * Determine if the search query constitutes a "whole word", by comparing the + * first/last character types with the preceding/following character types. + */ + _isEntireWord(content, startIdx, length) { + if (startIdx > 0) { + const first = content.charCodeAt(startIdx); + const limit = content.charCodeAt(startIdx - 1); + if (getCharacterType(first) === getCharacterType(limit)) { + return false; + } + } + const endIdx = (startIdx + length - 1); + if (endIdx < (content.length - 1)) { + const last = content.charCodeAt(endIdx); + const limit = content.charCodeAt(endIdx + 1); + if (getCharacterType(last) === getCharacterType(limit)) { + return false; + } + } + return true; + } + + _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) { let matches = []; let queryLen = query.length; let matchIdx = -queryLen; @@ -199,12 +223,15 @@ class PDFFindController { if (matchIdx === -1) { break; } + if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) { + continue; + } matches.push(matchIdx); } this.pageMatches[pageIndex] = matches; } - _calculateWordMatch(query, pageIndex, pageContent) { + _calculateWordMatch(query, pageIndex, pageContent, entireWord) { let matchesWithLength = []; // Divide the 
query into pieces and search for text in each piece. let queryArray = query.match(/\S+/g); @@ -217,6 +244,10 @@ class PDFFindController { if (matchIdx === -1) { break; } + if (entireWord && + !this._isEntireWord(pageContent, matchIdx, subqueryLen)) { + continue; + } // Other searches do not, so we store the length. matchesWithLength.push({ match: matchIdx, @@ -244,6 +275,7 @@ class PDFFindController { let query = this._normalize(this.state.query); let caseSensitive = this.state.caseSensitive; let phraseSearch = this.state.phraseSearch; + const entireWord = this.state.entireWord; let queryLen = query.length; if (queryLen === 0) { @@ -257,9 +289,9 @@ class PDFFindController { } if (phraseSearch) { - this._calculatePhraseMatch(query, pageIndex, pageContent); + this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord); } else { - this._calculateWordMatch(query, pageIndex, pageContent); + this._calculateWordMatch(query, pageIndex, pageContent, entireWord); } this._updatePage(pageIndex); diff --git a/web/pdf_find_utils.js b/web/pdf_find_utils.js new file mode 100644 index 00000000000000..a4a3a4871760f7 --- /dev/null +++ b/web/pdf_find_utils.js @@ -0,0 +1,107 @@ +/* Copyright 2018 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +const CharacterType = { + SPACE: 0, + ALPHA_LETTER: 1, + PUNCT: 2, + HAN_LETTER: 3, + KATAKANA_LETTER: 4, + HIRAGANA_LETTER: 5, + HALFWIDTH_KATAKANA_LETTER: 6, + THAI_LETTER: 7, +}; + +function isAlphabeticalScript(charCode) { + return charCode < 0x2E80; +} + +function isAscii(charCode) { + return (charCode & 0xFF80) === 0; +} + +function isAsciiAlpha(charCode) { + return (charCode >= /* a = */ 0x61 && charCode <= /* z = */ 0x7A) || + (charCode >= /* A = */ 0x41 && charCode <= /* Z = */ 0x5A); +} + +function isAsciiDigit(charCode) { + return (charCode >= /* 0 = */ 0x30 && charCode <= /* 9 = */ 0x39); +} + +function isAsciiSpace(charCode) { + return (charCode === /* SPACE = */ 0x20 || charCode === /* TAB = */ 0x09 || + charCode === /* CR = */ 0x0D || charCode === /* LF = */ 0x0A); +} + +function isHan(charCode) { + return (charCode >= 0x3400 && charCode <= 0x9FFF) || + (charCode >= 0xF900 && charCode <= 0xFAFF); +} + +function isKatakana(charCode) { + return (charCode >= 0x30A0 && charCode <= 0x30FF); +} + +function isHiragana(charCode) { + return (charCode >= 0x3040 && charCode <= 0x309F); +} + +function isHalfwidthKatakana(charCode) { + return (charCode >= 0xFF60 && charCode <= 0xFF9F); +} + +function isThai(charCode) { + return (charCode & 0xFF80) === 0x0E00; +} + +/** + * This function is based on the word-break detection implemented in: + * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp + */ +function getCharacterType(charCode) { + if (isAlphabeticalScript(charCode)) { + if (isAscii(charCode)) { + if (isAsciiSpace(charCode)) { + return CharacterType.SPACE; + } else if (isAsciiAlpha(charCode) || isAsciiDigit(charCode) || + charCode === /* UNDERSCORE = */ 0x5F) { + return CharacterType.ALPHA_LETTER; + } + return CharacterType.PUNCT; + } else if (isThai(charCode)) { + return CharacterType.THAI_LETTER; + } else if (charCode === /* NBSP = */ 0xA0) { + return CharacterType.SPACE; + } + return CharacterType.ALPHA_LETTER; + } + + if 
(isHan(charCode)) { + return CharacterType.HAN_LETTER; + } else if (isKatakana(charCode)) { + return CharacterType.KATAKANA_LETTER; + } else if (isHiragana(charCode)) { + return CharacterType.HIRAGANA_LETTER; + } else if (isHalfwidthKatakana(charCode)) { + return CharacterType.HALFWIDTH_KATAKANA_LETTER; + } + return CharacterType.ALPHA_LETTER; +} + +export { + CharacterType, + getCharacterType, +}; diff --git a/web/viewer.html b/web/viewer.html index 8f1b9e1f49c44e..f5425ccdbdb489 100644 --- a/web/viewer.html +++ b/web/viewer.html @@ -109,6 +109,8 @@ + + diff --git a/web/viewer.js b/web/viewer.js index 7a9ae61c0f6b2a..113b2799eec935 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -134,6 +134,7 @@ function getViewerConfiguration() { findField: document.getElementById('findInput'), highlightAllCheckbox: document.getElementById('findHighlightAll'), caseSensitiveCheckbox: document.getElementById('findMatchCase'), + entireWordCheckbox: document.getElementById('findEntireWord'), findMsg: document.getElementById('findMsg'), findResultsCount: document.getElementById('findResultsCount'), findStatusIcon: document.getElementById('findStatusIcon'),