Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Copy/Paste #5783

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,9 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
styles: Object.create(null)
};
var bidiTexts = textContent.items;
// At the end of each textChunk, auto insert spaces based on:
var SPACE_FACTOR_CHUNKS = 0.6;
// If performing a spacedText operation, auto insert spaces based on:
var SPACE_FACTOR = 0.35;
var MULTI_SPACE_FACTOR = 1.5;

Expand Down Expand Up @@ -1037,8 +1040,73 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
} else {
textChunk.height += Math.abs(height * scaleCtmX * scaleLineX);
}
addSpaceIfNecessary(textChunk, font);
return textChunk;
}
/**
 * Derives the point that is used to compare where two text chunks sit.
 *
 * @param {Object} chunk - Text chunk; only its `transform` array (six
 *   entries) is read.
 * @param {Object} font - Font data; `vertical`, `ascent` and `descent` are
 *   read when present.
 * @returns {{x: number, y: number}} `y` is `transform[5]` unchanged; `x` is
 *   `transform[4]`, shifted by the scaled ascent times the sine of the
 *   rotation whenever the rotation is not zero.
 */
function getChunkPosition(chunk, font) {
  var matrix = chunk.transform;

  // Rotation of the chunk, with a quarter turn added for vertical fonts.
  var rotation = Math.atan2(matrix[1], matrix[0]);
  if (font.vertical) {
    rotation += Math.PI / 2;
  }

  // The length of the (matrix[2], matrix[3]) vector is taken as the height.
  var ascentOffset = Math.sqrt((matrix[2] * matrix[2]) +
                               (matrix[3] * matrix[3]));
  // Scale the height by the ascent, or by (1 + descent) when the font has no
  // usable ascent; with neither value the plain height is kept.
  if (font.ascent) {
    ascentOffset = font.ascent * ascentOffset;
  } else if (font.descent) {
    ascentOffset = (1 + font.descent) * ascentOffset;
  }

  var x = matrix[4];
  if (rotation !== 0) {
    x = x + (ascentOffset * Math.sin(rotation));
  }
  return { x: x, y: matrix[5] };
}
/**
 * Appends a single space to the last chunk stored in `bidiTexts` when
 * `newChunk` looks like it starts a new line or a new word.
 *
 * Nothing is returned; the only effect is a possible `' '` appended to the
 * `str` of the last entry of `bidiTexts`.
 *
 * @param {Object} newChunk - Chunk about to follow the last stored chunk;
 *   its `str`, `transform`, `width` and `height` are read.
 * @param {Object} font - Font passed to `getChunkPosition` for BOTH the last
 *   chunk and the new chunk.
 */
function addSpaceIfNecessary(newChunk, font) {
// If the new chunk starts with a space or a hyphen, it does not need one.
if (newChunk.str[0] === ' ' || newChunk.str[0] === '-') {
return;
}
// No previous chunk exists, so there is nothing to append a space to.
if (bidiTexts.length === 0) {
return;
}
// If the last chunk is empty or ends with a space or a hyphen, it does not
// need one.
var lastChunk = bidiTexts[bidiTexts.length - 1];
if (lastChunk.str.length === 0) {
return;
}
var lastChar = lastChunk.str[lastChunk.str.length - 1];
if (lastChar === ' ' || lastChar === '-') {
return;
}
// NOTE(review): the same `font` is used for both chunks; confirm that the
// last chunk can never have been produced with a different font.
var lastPosition = getChunkPosition(lastChunk, font);
var newPosition = getChunkPosition(newChunk, font);
var yDiff = Math.abs(lastPosition.y - newPosition.y);
if (yDiff >= lastChunk.height || yDiff >= newChunk.height) {
// On different lines, add a space.
lastChunk.str += ' ';
} else {
var wordSpacing = textState.wordSpacing; // Try default wordSpacing.
if (wordSpacing === 0) {
// Heuristic for wordSpacing: the average width per character of the new
// chunk, scaled by SPACE_FACTOR_CHUNKS.
wordSpacing = newChunk.width / newChunk.str.length *
SPACE_FACTOR_CHUNKS;
}
var addSpace;
if (newPosition.x >= lastPosition.x) {
// Left to right. Add a space if next is past wordSpacing.
addSpace = newPosition.x >= lastPosition.x + lastChunk.width +
wordSpacing;
} else {
// Right to left. Add space if next is before start.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typ: sart -> start ?

addSpace = lastPosition.x >= newPosition.x + newChunk.width +
wordSpacing;
}
if (addSpace) {
lastChunk.str += ' ';
}
}
}

var timeSlotManager = new TimeSlotManager();

Expand Down Expand Up @@ -1121,6 +1189,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textState.translateTextMatrix(0, offset);
textChunk.height += offset;
}
// Automatically insert spaces if the shift is big enough.
if (items[j] < 0 && textState.font.spaceWidth > 0) {
var fakeSpaces = -items[j] / textState.font.spaceWidth;
if (fakeSpaces > MULTI_SPACE_FACTOR) {
Expand Down
1 change: 1 addition & 0 deletions test/pdfs/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,4 @@
!issue5481.pdf
!issue5567.pdf
!issue5701.pdf
!US6205527_page1.pdf
Binary file added test/pdfs/US6205527_page1.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ describe('api', function() {
expect(!!data.items).toEqual(true);
expect(data.items.length).toEqual(7);
expect(!!data.styles).toEqual(true);

// Make sure the text is ordered properly.
expect(data.items[1].str).toEqual('Table Of Content ');
expect(data.items[6].str.replace(/^\s+/,'')).toEqual('page 1 / 3');
});
});
it('gets operator list', function() {
Expand Down
78 changes: 78 additions & 0 deletions test/unit/text_layer_spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
/* globals PDFJS, expect, it, describe, Promise, combineUrl, waitsFor,
isArray, MissingPDFException */

'use strict';

/**
 * Makes the running spec wait (via `waitsFor`, up to 20 seconds) until
 * `promise` has settled.
 *
 * On fulfilment `successCallback` is invoked with the fulfilment value. On
 * rejection a failing expectation is recorded and the wait ends right away
 * instead of running into the timeout.
 *
 * @param {Promise} promise - The promise to wait for.
 * @param {function(*)} successCallback - Called with the fulfilment value.
 */
function waitsForPromiseResolved(promise, successCallback) {
  // Settlement is tracked with its own flag rather than by inspecting the
  // value, so a promise that fulfils with `undefined` still ends the wait.
  var settled = false;
  promise.then(function(val) {
    // Flag first: the wait must end even if the callback throws.
    settled = true;
    successCallback(val);
  },
  function(error) {
    settled = true;
    // Shouldn't get here.
    expect(false).toEqual(true);
  });
  waitsFor(function() {
    return settled;
  }, 20000);
}

/**
 * Starts loading the first page of a PDF and makes the running spec wait
 * for it.
 *
 * @param {string} pdf - URL of the PDF, resolved against
 *   `window.location.href` with `combineUrl`.
 * @returns {{promise: Promise, page: *}} `promise` settles with the result
 *   of `doc.getPage(1)`; `page` is `undefined` until that promise fulfils
 *   and is then set to the same page value.
 */
function getPageOneOf(pdf) {
  var pdfURL = combineUrl(window.location.href, pdf);
  // Chain the load and the page lookup directly instead of forwarding the
  // result through a hand-made promise: this way a failure in either step
  // rejects `pagePromise` rather than leaving it pending forever.
  var pagePromise = PDFJS.getDocument(pdfURL).then(function(doc) {
    return doc.getPage(1);
  });
  var page = {
    promise: pagePromise,
    // Filled in below once the page has been loaded.
    page: undefined
  };
  waitsForPromiseResolved(pagePromise, function(data) {
    page.page = data;
  });
  return page;
}

// Specs checking that the text extracted from page one of the test PDFs
// contains the expected snippets, i.e. that the chunks come out in reading
// order with spaces inserted between them.
describe('text-extract', function() {
  /**
   * Loads page one of `pdf`, joins the `str` of all its text items and hands
   * the resulting string to `checkText`.
   */
  function withPageOneText(pdf, checkText) {
    var page = getPageOneOf(pdf);
    waitsForPromiseResolved(page.promise, function (pdfPage) {
      var textPromise = pdfPage.getTextContent();
      waitsForPromiseResolved(textPromise, function (textContent) {
        expect(!!textContent.items).toEqual(true);
        var text = textContent.items.map(function (item) {
          return item.str;
        }).join('');
        checkText(text);
      });
    });
  }

  /**
   * Expects `snippet` to occur anywhere in `text`. The comparison is against
   * -1 so that a match at the very start of the text (index 0) also counts.
   */
  function expectContains(text, snippet) {
    expect(text.indexOf(snippet) !== -1).toEqual(true);
  }

  it('patent', function () {
    withPageOneText('../pdfs/US6205527_page1.pdf', function (text) {
      // Make sure the text is ordered properly.
      expectContains(text, 'Disclosed is an apparatus, a system, a');
      expectContains(text, 'device to the computer system; (b) preparing ' +
        'a storage. media of the peripheral storage');
    });
  });

  it('tracemonkey', function () {
    withPageOneText('../pdfs/tracemonkey.pdf', function (text) {
      // Make sure the text is ordered properly.
      expectContains(text, 'no concrete type information is available');
      expectContains(text, 'difficult to com-pile than statically ');
      expectContains(text, 'this work for personal or classroom use is');
    });
  });
});
1 change: 1 addition & 0 deletions test/unit/unit_test.html
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
<script src="stream_spec.js"></script>
<script src="parser_spec.js"></script>
<script src="api_spec.js"></script>
<script src="text_layer_spec.js"></script>
<script src="metadata_spec.js"></script>
<script src="util_spec.js"></script>
<script src="cmap_spec.js"></script>
Expand Down