diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index bc70eeb441c392..a827ae72c02a41 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -6,6 +6,7 @@ !TrueType_without_cmap.pdf !franz.pdf !franz_2.pdf +!fraction-highlight.pdf !german-umlaut-r.pdf !xref_command_missing.pdf !issue1155r.pdf diff --git a/test/pdfs/fraction-highlight.pdf b/test/pdfs/fraction-highlight.pdf new file mode 100644 index 00000000000000..09b7c474a25ea0 Binary files /dev/null and b/test/pdfs/fraction-highlight.pdf differ diff --git a/test/unit/clitests_helper.js b/test/unit/clitests_helper.js index 398d6a336e43f9..981af09a421493 100644 --- a/test/unit/clitests_helper.js +++ b/test/unit/clitests_helper.js @@ -18,6 +18,9 @@ import { isNodeJS } from "../../src/shared/is_node.js"; import { PDFNodeStream } from "../../src/display/node_stream.js"; import { setPDFNetworkStreamFactory } from "../../src/display/api.js"; +// Sets longer timeout, similar to `jasmine-boot.js`. +jasmine.DEFAULT_TIMEOUT_INTERVAL = 30000; + // Ensure that this script only runs in Node.js environments. if (!isNodeJS) { throw new Error( diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index f4ae3a35b1e3a9..4106d3345e67b7 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -19,6 +19,8 @@ import { getDocument } from "../../src/display/api.js"; import { PDFFindController } from "../../web/pdf_find_controller.js"; import { SimpleLinkService } from "../../web/pdf_link_service.js"; +const tracemonkeyFileName = "tracemonkey.pdf"; + class MockLinkService extends SimpleLinkService { constructor() { super(); @@ -44,89 +46,106 @@ class MockLinkService extends SimpleLinkService { } } -describe("pdf_find_controller", function () { - let eventBus; - let pdfFindController; - - beforeEach(function (done) { - const loadingTask = getDocument(buildGetDocumentParams("tracemonkey.pdf")); - loadingTask.promise.then(function (pdfDocument) { - eventBus = new EventBus(); +async function initializePdfFindController(filename) { + const loadingTask = getDocument( + buildGetDocumentParams(filename || tracemonkeyFileName) + ); + const pdfDocument = await loadingTask.promise; - const linkService = new MockLinkService(); - linkService.setDocument(pdfDocument); + const eventBus = new EventBus(); - pdfFindController = new PDFFindController({ - linkService, - eventBus, - }); - pdfFindController.setDocument(pdfDocument); // Enable searching. + const linkService = new MockLinkService(); + linkService.setDocument(pdfDocument); - done(); - }); + const pdfFindController = new PDFFindController({ + linkService, + eventBus, }); + pdfFindController.setDocument(pdfDocument); // Enable searching. - afterEach(function () { - eventBus = null; - pdfFindController = null; - }); + return { eventBus, pdfFindController, loadingTask }; +} - function testSearch({ parameters, matchesPerPage, selectedMatch }) { - return new Promise(function (resolve) { - pdfFindController.executeCommand("find", parameters); - - // The `updatefindmatchescount` event is only emitted if the page contains - // at least one match for the query, so the last non-zero item in the - // matches per page array corresponds to the page for which the final - // `updatefindmatchescount` event is emitted. If this happens, we know - // that any subsequent pages won't trigger the event anymore and we - // can start comparing the matches per page. This logic is necessary - // because we call the `pdfFindController.pageMatches` getter directly - // after receiving the event and the underlying `_pageMatches` array - // is only extended when a page is processed, so it will only contain - // entries for the pages processed until the time when the final event - // was emitted. - let totalPages = matchesPerPage.length; - for (let i = totalPages - 1; i >= 0; i--) { - if (matchesPerPage[i] > 0) { - totalPages = i + 1; - break; - } +function testSearch({ + eventBus, + pdfFindController, + parameters, + matchesPerPage, + selectedMatch, + pageMatches = null, + pageMatchesLength = null, +}) { + return new Promise(function (resolve) { + pdfFindController.executeCommand("find", parameters); + + // The `updatefindmatchescount` event is only emitted if the page contains + // at least one match for the query, so the last non-zero item in the + // matches per page array corresponds to the page for which the final + // `updatefindmatchescount` event is emitted. If this happens, we know + // that any subsequent pages won't trigger the event anymore and we + // can start comparing the matches per page. This logic is necessary + // because we call the `pdfFindController.pageMatches` getter directly + // after receiving the event and the underlying `_pageMatches` array + // is only extended when a page is processed, so it will only contain + // entries for the pages processed until the time when the final event + // was emitted. + let totalPages = matchesPerPage.length; + for (let i = totalPages - 1; i >= 0; i--) { + if (matchesPerPage[i] > 0) { + totalPages = i + 1; + break; } + } - const totalMatches = matchesPerPage.reduce((a, b) => { - return a + b; - }); - - eventBus.on( - "updatefindmatchescount", - function onUpdateFindMatchesCount(evt) { - if (pdfFindController.pageMatches.length !== totalPages) { - return; - } - eventBus.off("updatefindmatchescount", onUpdateFindMatchesCount); - - expect(evt.matchesCount.total).toBe(totalMatches); - for (let i = 0; i < totalPages; i++) { - expect(pdfFindController.pageMatches[i].length).toEqual( - matchesPerPage[i] - ); - } - expect(pdfFindController.selected.pageIdx).toEqual( - selectedMatch.pageIndex - ); - expect(pdfFindController.selected.matchIdx).toEqual( - selectedMatch.matchIndex + const totalMatches = matchesPerPage.reduce((a, b) => { + return a + b; + }); + + eventBus.on( + "updatefindmatchescount", + function onUpdateFindMatchesCount(evt) { + if (pdfFindController.pageMatches.length !== totalPages) { + return; + } + eventBus.off("updatefindmatchescount", onUpdateFindMatchesCount); + + expect(evt.matchesCount.total).toBe(totalMatches); + for (let i = 0; i < totalPages; i++) { + expect(pdfFindController.pageMatches[i].length).toEqual( + matchesPerPage[i] ); + } + expect(pdfFindController.selected.pageIdx).toEqual( + selectedMatch.pageIndex + ); + expect(pdfFindController.selected.matchIdx).toEqual( + selectedMatch.matchIndex + ); - resolve(); + if (pageMatches) { + expect(pdfFindController.pageMatches).toEqual(pageMatches); + expect(pdfFindController.pageMatchesLength).toEqual( + pageMatchesLength + ); } - ); - }); - } - it("performs a normal search", function (done) { - testSearch({ + resolve(); + } + ); + }); +} + +describe("pdf_find_controller", function () { + it("performs a normal search", async function () { + const { + eventBus, + pdfFindController, + loadingTask, + } = await initializePdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, parameters: { query: "Dynamic", caseSensitive: false, @@ -139,14 +158,24 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - }).then(done); + }); + + return loadingTask.destroy(); }); - it("performs a normal search and finds the previous result", function (done) { + it("performs a normal search and finds the previous result", async function () { // Page 14 (with page index 13) contains five results. By default, the // first result (match index 0) is selected, so the previous result // should be the fifth result (match index 4). - testSearch({ + const { + eventBus, + pdfFindController, + loadingTask, + } = await initializePdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, parameters: { query: "conference", caseSensitive: false, @@ -159,11 +188,21 @@ describe("pdf_find_controller", function () { pageIndex: 13, matchIndex: 4, }, - }).then(done); + }); + + return loadingTask.destroy(); }); - it("performs a case sensitive search", function (done) { - testSearch({ + it("performs a case sensitive search", async function () { + const { + eventBus, + pdfFindController, + loadingTask, + } = await initializePdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, parameters: { query: "Dynamic", caseSensitive: true, @@ -176,13 +215,23 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - }).then(done); + }); + + return loadingTask.destroy(); }); - it("performs an entire word search", function (done) { + it("performs an entire word search", async function () { // Page 13 contains both 'Government' and 'Governmental', so the latter // should not be found with entire word search. - testSearch({ + const { + eventBus, + pdfFindController, + loadingTask, + } = await initializePdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, parameters: { query: "Government", caseSensitive: false, @@ -195,13 +244,23 @@ describe("pdf_find_controller", function () { pageIndex: 12, matchIndex: 0, }, - }).then(done); + }); + + return loadingTask.destroy(); }); - it("performs a multiple term (no phrase) search", function (done) { + it("performs a multiple term (no phrase) search", async function () { // Page 9 contains 'alternate' and pages 6 and 9 contain 'solution'. // Both should be found for multiple term (no phrase) search. - testSearch({ + const { + eventBus, + pdfFindController, + loadingTask, + } = await initializePdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, parameters: { query: "alternate solution", caseSensitive: false, @@ -214,6 +273,37 @@ describe("pdf_find_controller", function () { pageIndex: 5, matchIndex: 0, }, - }).then(done); + }); + + return loadingTask.destroy(); + }); + + it("performs a normal search, where the text is normalized", async function () { + const { + eventBus, + pdfFindController, + loadingTask, + } = await initializePdfFindController("fraction-highlight.pdf"); + + await testSearch({ + eventBus, + pdfFindController, + parameters: { + query: "fraction", + caseSensitive: false, + entireWord: false, + phraseSearch: true, + findPrevious: false, + }, + matchesPerPage: [3], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[19, 48, 66]], + pageMatchesLength: [[8, 8, 8]], + }); + + return loadingTask.destroy(); }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index a306b7e848179c..100dee201bd997 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -49,9 +49,40 @@ function normalize(text) { const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); normalizationRegex = new RegExp(`[${replace}]`, "g"); } - return text.replace(normalizationRegex, function (ch) { - return CHARACTERS_TO_NORMALIZE[ch]; + let diffs = null; + const normalizedText = text.replace(normalizationRegex, function (ch, index) { + const normalizedCh = CHARACTERS_TO_NORMALIZE[ch], + diff = normalizedCh.length - ch.length; + if (diff !== 0) { + (diffs ||= []).push([index, diff]); + } + return normalizedCh; }); + + return [normalizedText, diffs]; +} + +// Determine the original, non-normalized, match index such that highlighting of +// search results is correct in the `textLayer` for strings containing e.g. "½" +// characters; essentially "inverting" the result of the `normalize` function. +function getOriginalIndex(matchIndex, diffs = null) { + if (!diffs) { + return matchIndex; + } + let totalDiff = 0; + for (const [index, diff] of diffs) { + const currentIndex = index + totalDiff; + + if (currentIndex >= matchIndex) { + break; + } + if (currentIndex + diff > matchIndex) { + totalDiff += matchIndex - currentIndex; + break; + } + totalDiff += diff; + } + return matchIndex - totalDiff; } /** @@ -215,6 +246,7 @@ class PDFFindController { }; this._extractTextPromises = []; this._pageContents = []; // Stores the normalized text for each page. + this._pageDiffs = []; this._matchesCountTotal = 0; this._pagesToSearch = null; this._pendingFindMatches = Object.create(null); @@ -232,7 +264,7 @@ class PDFFindController { get _query() { if (this._state.query !== this._rawQuery) { this._rawQuery = this._state.query; - this._normalizedQuery = normalize(this._state.query); + [this._normalizedQuery] = normalize(this._state.query); } return this._normalizedQuery; } @@ -349,8 +381,9 @@ class PDFFindController { return true; } - _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) { - const matches = []; + _calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) { + const matches = [], + matchesLength = []; const queryLen = query.length; let matchIdx = -queryLen; @@ -362,12 +395,19 @@ class PDFFindController { if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) { continue; } - matches.push(matchIdx); + const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs), + matchEnd = matchIdx + queryLen - 1, + originalQueryLen = + getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1; + + matches.push(originalMatchIdx); + matchesLength.push(originalQueryLen); } this._pageMatches[pageIndex] = matches; + this._pageMatchesLength[pageIndex] = matchesLength; } - _calculateWordMatch(query, pageIndex, pageContent, entireWord) { + _calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) { const matchesWithLength = []; // Divide the query into pieces and search for text in each piece. @@ -388,10 +428,15 @@ class PDFFindController { ) { continue; } + const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs), + matchEnd = matchIdx + subqueryLen - 1, + originalQueryLen = + getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1; + // Other searches do not, so we store the length. matchesWithLength.push({ - match: matchIdx, - matchLength: subqueryLen, + match: originalMatchIdx, + matchLength: originalQueryLen, skipped: false, }); } @@ -412,6 +457,7 @@ class PDFFindController { _calculateMatch(pageIndex) { let pageContent = this._pageContents[pageIndex]; + const pageDiffs = this._pageDiffs[pageIndex]; let query = this._query; const { caseSensitive, entireWord, phraseSearch } = this._state; @@ -426,9 +472,21 @@ class PDFFindController { } if (phraseSearch) { - this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord); + this._calculatePhraseMatch( + query, + pageIndex, + pageContent, + pageDiffs, + entireWord + ); } else { - this._calculateWordMatch(query, pageIndex, pageContent, entireWord); + this._calculateWordMatch( + query, + pageIndex, + pageContent, + pageDiffs, + entireWord + ); } // When `highlightAll` is set, ensure that the matches on previously @@ -478,7 +536,9 @@ class PDFFindController { } // Store the normalized page content (text items) as one string. - this._pageContents[i] = normalize(strBuf.join("")); + [this._pageContents[i], this._pageDiffs[i]] = normalize( + strBuf.join("") + ); extractTextCapability.resolve(i); }, reason => { @@ -488,6 +548,7 @@ class PDFFindController { ); // Page error -- assuming no text content. this._pageContents[i] = ""; + this._pageDiffs[i] = null; extractTextCapability.resolve(i); } ); diff --git a/web/text_layer_builder.js b/web/text_layer_builder.js index 183241b609007e..f2a59edf725646 100644 --- a/web/text_layer_builder.js +++ b/web/text_layer_builder.js @@ -161,12 +161,11 @@ class TextLayerBuilder { if (!matches) { return []; } - const { findController, textContentItemsStr } = this; + const { textContentItemsStr } = this; let i = 0, iIndex = 0; const end = textContentItemsStr.length - 1; - const queryLen = findController.state.query.length; const result = []; for (let m = 0, mm = matches.length; m < mm; m++) { @@ -191,13 +190,7 @@ class TextLayerBuilder { }; // Calculate the end position. - if (matchesLength) { - // Multiterm search. - matchIdx += matchesLength[m]; - } else { - // Phrase search. - matchIdx += queryLen; - } + matchIdx += matchesLength[m]; // Somewhat the same array as above, but use > instead of >= to get // the end position right.