diff --git a/l10n/en-US/viewer.properties b/l10n/en-US/viewer.properties index b110cf4e640f6..39f1a99bcf66a 100644 --- a/l10n/en-US/viewer.properties +++ b/l10n/en-US/viewer.properties @@ -165,6 +165,7 @@ find_next.title=Find the next occurrence of the phrase find_next_label=Next find_highlight=Highlight all find_match_case_label=Match case +find_entire_word_label=Whole words find_reached_top=Reached top of document, continued from bottom find_reached_bottom=Reached end of document, continued from top # LOCALIZATION NOTE (find_matches_count): "{{current}}" and "{{total}}" will be diff --git a/l10n/nl/viewer.properties b/l10n/nl/viewer.properties index 61fb64f603273..477d366454cab 100644 --- a/l10n/nl/viewer.properties +++ b/l10n/nl/viewer.properties @@ -165,6 +165,7 @@ find_next.title=De volgende overeenkomst van de tekst zoeken find_next_label=Volgende find_highlight=Alles markeren find_match_case_label=Hoofdlettergevoelig +find_entire_word_label=Hele woorden find_reached_top=Bovenkant van document bereikt, doorgegaan vanaf onderkant find_reached_bottom=Onderkant van document bereikt, doorgegaan vanaf bovenkant # LOCALIZATION NOTE (find_matches_count): "{{current}}" and "{{total}}" will be diff --git a/l10n/sv-SE/viewer.properties b/l10n/sv-SE/viewer.properties index eb5bbdf8ca242..3233913ed86f9 100644 --- a/l10n/sv-SE/viewer.properties +++ b/l10n/sv-SE/viewer.properties @@ -165,6 +165,7 @@ find_next.title=Hitta nästa förekomst av frasen find_next_label=Nästa find_highlight=Markera alla find_match_case_label=Matcha versal/gemen +find_entire_word_label=Hela ord find_reached_top=Nådde början av dokumentet, började från slutet find_reached_bottom=Nådde slutet på dokumentet, började från början # LOCALIZATION NOTE (find_matches_count): "{{current}}" and "{{total}}" will be diff --git a/test/unit/clitests.json b/test/unit/clitests.json index 337109e7cb2cc..a0348fb7e15cf 100644 --- a/test/unit/clitests.json +++ b/test/unit/clitests.json @@ -25,6 +25,7 @@ 
"network_utils_spec.js", "node_stream_spec.js", "parser_spec.js", + "pdf_find_utils_spec.js", "pdf_history.js", "primitives_spec.js", "stream_spec.js", diff --git a/test/unit/jasmine-boot.js b/test/unit/jasmine-boot.js index 5297a56ee51e2..ef87c76ee48a2 100644 --- a/test/unit/jasmine-boot.js +++ b/test/unit/jasmine-boot.js @@ -67,6 +67,7 @@ function initializePDFJS(callback) { 'pdfjs-test/unit/network_spec', 'pdfjs-test/unit/network_utils_spec', 'pdfjs-test/unit/parser_spec', + 'pdfjs-test/unit/pdf_find_utils_spec', 'pdfjs-test/unit/pdf_history_spec', 'pdfjs-test/unit/primitives_spec', 'pdfjs-test/unit/stream_spec', diff --git a/test/unit/pdf_find_utils_spec.js b/test/unit/pdf_find_utils_spec.js new file mode 100644 index 0000000000000..9ec5715177052 --- /dev/null +++ b/test/unit/pdf_find_utils_spec.js @@ -0,0 +1,56 @@ +/* Copyright 2018 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { CharacterType, getCharacterType } from '../../web/pdf_find_utils'; + +describe('pdf_find_utils', function() { + describe('getCharacterType', function() { + it('gets expected character types', function() { + const characters = { + 'A': CharacterType.ALPHA_LETTER, + 'a': CharacterType.ALPHA_LETTER, + '0': CharacterType.ALPHA_LETTER, + '5': CharacterType.ALPHA_LETTER, + '\xC4': CharacterType.ALPHA_LETTER, // 'Ä' + '\xE4': CharacterType.ALPHA_LETTER, // 'ä' + '_': CharacterType.ALPHA_LETTER, + ' ': CharacterType.SPACE, + '\t': CharacterType.SPACE, + '\r': CharacterType.SPACE, + '\n': CharacterType.SPACE, + '\xA0': CharacterType.SPACE, + '-': CharacterType.PUNCT, + ',': CharacterType.PUNCT, + '.': CharacterType.PUNCT, + ';': CharacterType.PUNCT, + ':': CharacterType.PUNCT, + '\u2122': CharacterType.ALPHA_LETTER, // trademark + '\u0E25': CharacterType.THAI_LETTER, + '\u4000': CharacterType.HAN_LETTER, + '\uF950': CharacterType.HAN_LETTER, + '\u30C0': CharacterType.KATAKANA_LETTER, + '\u3050': CharacterType.HIRAGANA_LETTER, + '\uFF80': CharacterType.HALFWIDTH_KATAKANA_LETTER, + }; + + for (const character in characters) { + const charCode = character.charCodeAt(0); + const type = characters[character]; + + expect(getCharacterType(charCode)).toEqual(type); + } + }); + }); +}); diff --git a/web/app.js b/web/app.js index ba1de89c4f684..1cc90cdb7adff 100644 --- a/web/app.js +++ b/web/app.js @@ -1959,6 +1959,7 @@ function webViewerFind(evt) { query: evt.query, phraseSearch: evt.phraseSearch, caseSensitive: evt.caseSensitive, + entireWord: evt.entireWord, highlightAll: evt.highlightAll, findPrevious: evt.findPrevious, }); @@ -1969,6 +1970,7 @@ function webViewerFindFromUrlHash(evt) { query: evt.query, phraseSearch: evt.phraseSearch, caseSensitive: false, + entireWord: false, highlightAll: true, findPrevious: false, }); @@ -2105,6 +2107,7 @@ function webViewerKeyDown(evt) { query: findState.query, phraseSearch: findState.phraseSearch, caseSensitive: 
findState.caseSensitive, + entireWord: findState.entireWord, highlightAll: findState.highlightAll, findPrevious: cmd === 5 || cmd === 12, }); diff --git a/web/firefoxcom.js b/web/firefoxcom.js index 80553cea9d958..46c75db6a856b 100644 --- a/web/firefoxcom.js +++ b/web/firefoxcom.js @@ -167,7 +167,8 @@ class MozL10n { 'find', 'findagain', 'findhighlightallchange', - 'findcasesensitivitychange' + 'findcasesensitivitychange', + 'findentirewordchange', ]; let handleEvent = function(evt) { if (!PDFViewerApplication.initialized) { @@ -179,13 +180,14 @@ class MozL10n { query: evt.detail.query, phraseSearch: true, caseSensitive: !!evt.detail.caseSensitive, + entireWord: !!evt.detail.entireWord, highlightAll: !!evt.detail.highlightAll, findPrevious: !!evt.detail.findPrevious, }); }; - for (let i = 0, len = events.length; i < len; i++) { - window.addEventListener(events[i], handleEvent); + for (let event of events) { + window.addEventListener(event, handleEvent); } })(); diff --git a/web/pdf_find_bar.js b/web/pdf_find_bar.js index 53e2c02012d76..795b725e7a878 100644 --- a/web/pdf_find_bar.js +++ b/web/pdf_find_bar.js @@ -33,6 +33,7 @@ class PDFFindBar { this.findField = options.findField || null; this.highlightAll = options.highlightAllCheckbox || null; this.caseSensitive = options.caseSensitiveCheckbox || null; + this.entireWord = options.entireWordCheckbox || null; this.findMsg = options.findMsg || null; this.findResultsCount = options.findResultsCount || null; this.findStatusIcon = options.findStatusIcon || null; @@ -85,6 +86,10 @@ class PDFFindBar { this.dispatchEvent('casesensitivitychange'); }); + this.entireWord.addEventListener('click', () => { + this.dispatchEvent('entirewordchange'); + }); + this.eventBus.on('resize', this._adjustWidth.bind(this)); } @@ -97,8 +102,9 @@ class PDFFindBar { source: this, type, query: this.findField.value, - caseSensitive: this.caseSensitive.checked, phraseSearch: true, + caseSensitive: this.caseSensitive.checked, + entireWord: 
this.entireWord.checked, highlightAll: this.highlightAll.checked, findPrevious: findPrev, }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 2dcf82c6aa8c9..2f5e919114109 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -14,6 +14,7 @@ */ import { createPromiseCapability } from 'pdfjs-lib'; +import { getCharacterType } from './pdf_find_utils'; import { getGlobalEventBus } from './dom_events'; import { scrollIntoView } from './ui_utils'; @@ -190,7 +191,30 @@ class PDFFindController { } } - _calculatePhraseMatch(query, pageIndex, pageContent) { + /** + * Determine if the search query constitutes a "whole word", by comparing the + * first/last character type with the preceding/following character type. + */ + _isEntireWord(content, startIdx, length) { + if (startIdx > 0) { + const first = content.charCodeAt(startIdx); + const limit = content.charCodeAt(startIdx - 1); + if (getCharacterType(first) === getCharacterType(limit)) { + return false; + } + } + const endIdx = (startIdx + length - 1); + if (endIdx < (content.length - 1)) { + const last = content.charCodeAt(endIdx); + const limit = content.charCodeAt(endIdx + 1); + if (getCharacterType(last) === getCharacterType(limit)) { + return false; + } + } + return true; + } + + _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) { let matches = []; let queryLen = query.length; let matchIdx = -queryLen; @@ -199,12 +223,15 @@ class PDFFindController { if (matchIdx === -1) { break; } + if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) { + continue; + } matches.push(matchIdx); } this.pageMatches[pageIndex] = matches; } - _calculateWordMatch(query, pageIndex, pageContent) { + _calculateWordMatch(query, pageIndex, pageContent, entireWord) { let matchesWithLength = []; // Divide the query into pieces and search for text in each piece. 
let queryArray = query.match(/\S+/g); @@ -217,6 +244,10 @@ class PDFFindController { if (matchIdx === -1) { break; } + if (entireWord && + !this._isEntireWord(pageContent, matchIdx, subqueryLen)) { + continue; + } // Other searches do not, so we store the length. matchesWithLength.push({ match: matchIdx, @@ -244,6 +275,7 @@ class PDFFindController { let query = this._normalize(this.state.query); let caseSensitive = this.state.caseSensitive; let phraseSearch = this.state.phraseSearch; + const entireWord = this.state.entireWord; let queryLen = query.length; if (queryLen === 0) { @@ -257,9 +289,9 @@ class PDFFindController { } if (phraseSearch) { - this._calculatePhraseMatch(query, pageIndex, pageContent); + this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord); } else { - this._calculateWordMatch(query, pageIndex, pageContent); + this._calculateWordMatch(query, pageIndex, pageContent, entireWord); } this._updatePage(pageIndex); diff --git a/web/pdf_find_utils.js b/web/pdf_find_utils.js new file mode 100644 index 0000000000000..419684a5e435b --- /dev/null +++ b/web/pdf_find_utils.js @@ -0,0 +1,107 @@ +/* Copyright 2018 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +const CharacterType = { + SPACE: 0, + ALPHA_LETTER: 1, + PUNCT: 2, + HAN_LETTER: 3, + KATAKANA_LETTER: 4, + HIRAGANA_LETTER: 5, + HALFWIDTH_KATAKANA_LETTER: 6, + THAI_LETTER: 7, +}; + +function isAlphabeticalScript(charCode) { + return charCode < 0x2E80; +} + +function isAscii(charCode) { + return (charCode & 0xFF80) === 0; +} + +function isAsciiAlpha(charCode) { + return (charCode >= /* a = */ 0x61 && charCode <= /* z = */ 0x7A) || + (charCode >= /* A = */ 0x41 && charCode <= /* Z = */ 0x5A); +} + +function isAsciiDigit(charCode) { + return (charCode >= /* 0 = */ 0x30 && charCode <= /* 9 = */ 0x39); +} + +function isAsciiSpace(charCode) { + return (charCode === /* SPACE = */ 0x20 || charCode === /* TAB = */ 0x09 || + charCode === /* CR = */ 0x0D || charCode === /* LF = */ 0x0A); +} + +function isHan(charCode) { + return (charCode >= 0x3400 && charCode <= 0x9FFF) || + (charCode >= 0xF900 && charCode <= 0xFAFF); +} + +function isKatakana(charCode) { + return (charCode >= 0x30A0 && charCode <= 0x30FF); +} + +function isHiragana(charCode) { + return (charCode >= 0x3040 && charCode <= 0x309F); +} + +function isHalfwidthKatakana(charCode) { + return (charCode >= 0xFF60 && charCode <= 0xFF9F); +} + +function isThai(charCode) { + return (charCode & 0xFF80) === 0x0E00; +} + +/** + * This function is based on the word-break detection implemented in: + * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp + */ +function getCharacterType(charCode) { + if (isAlphabeticalScript(charCode)) { + if (isAscii(charCode)) { + if (isAsciiSpace(charCode)) { + return CharacterType.SPACE; + } else if (isAsciiAlpha(charCode) || isAsciiDigit(charCode) || + charCode === /* UNDERSCORE = */ 0x5F) { + return CharacterType.ALPHA_LETTER; + } + return CharacterType.PUNCT; + } else if (isThai(charCode)) { + return CharacterType.THAI_LETTER; + } else if (charCode === /* NBSP = */ 0xA0) { + return CharacterType.SPACE; + } + return CharacterType.ALPHA_LETTER; + } + + if 
(isHan(charCode)) { + return CharacterType.HAN_LETTER; + } else if (isKatakana(charCode)) { + return CharacterType.KATAKANA_LETTER; + } else if (isHiragana(charCode)) { + return CharacterType.HIRAGANA_LETTER; + } else if (isHalfwidthKatakana(charCode)) { + return CharacterType.HALFWIDTH_KATAKANA_LETTER; + } + return CharacterType.ALPHA_LETTER; +} + +export { + CharacterType, + getCharacterType, +}; diff --git a/web/viewer.html b/web/viewer.html index 32469d113cffd..f2a284108a5e5 100644 --- a/web/viewer.html +++ b/web/viewer.html @@ -104,15 +104,19 @@ -