Skip to content

Commit

Permalink
Improve copy/paste by inserting spaces into textChunks if we deem it …
Browse files Browse the repository at this point in the history
…appropriate.

 Add test re same. PR #5783.
  • Loading branch information
speedplane committed Mar 8, 2015
1 parent fa0f09b commit e9ad0b4
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 0 deletions.
69 changes: 69 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,9 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
styles: Object.create(null)
};
var bidiTexts = textContent.items;
// At the end of each textChunk, auto insert spaces based on:
var SPACE_FACTOR_CHUNKS = 0.6;
// If performing a spacedText operation, auto insert spaces based on:
var SPACE_FACTOR = 0.35;
var MULTI_SPACE_FACTOR = 1.5;

Expand Down Expand Up @@ -1037,8 +1040,73 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
} else {
textChunk.height += Math.abs(height * scaleCtmX * scaleLineX);
}
addSpaceIfNecessary(textChunk, font);
return textChunk;
}
/**
 * Derives a comparison position for a text chunk from its transform.
 *
 * The rotation is taken from the first two transform entries (plus a quarter
 * turn for vertical fonts). The length of the vector formed by entries 2 and
 * 3 is scaled by the font ascent, or by `1 + descent` when no ascent is set,
 * and used to shift `x` whenever the rotation is non-zero.
 *
 * @param {Object} chunk - Text chunk; only `chunk.transform` is read.
 * @param {Object} font - Font whose `vertical`, `ascent` and `descent`
 *   properties are consulted.
 * @returns {{x: number, y: number}} The chunk position.
 */
function getChunkPosition(chunk, font) {
  var matrix = chunk.transform;

  var rotation = Math.atan2(matrix[1], matrix[0]);
  if (font.vertical) {
    rotation += Math.PI / 2;
  }

  // Base height: length of the vector made of transform entries 2 and 3.
  var ascentLength = Math.sqrt((matrix[2] * matrix[2]) +
                               (matrix[3] * matrix[3]));
  // Scale by the ascent when the font has one; otherwise fall back to the
  // descent (if any).
  if (font.ascent) {
    ascentLength = font.ascent * ascentLength;
  } else if (font.descent) {
    ascentLength = (1 + font.descent) * ascentLength;
  }

  var position = {
    x: matrix[4],
    y: matrix[5]
  };
  // Unrotated chunks keep the raw x translation untouched.
  if (rotation !== 0) {
    position.x = matrix[4] + (ascentLength * Math.sin(rotation));
  }
  return position;
}
/**
 * Appends a single space to the most recent entry of `bidiTexts` when the
 * incoming chunk looks like it starts a new word.
 *
 * Nothing is appended when the new chunk begins with, or the previous chunk
 * ends with, a space or a hyphen, when there is no previous chunk, or when
 * the previous chunk is empty.
 *
 * @param {Object} newChunk - Chunk about to be added; `str`, `width`,
 *   `height` and (via getChunkPosition) `transform` are read.
 * @param {Object} font - Font passed to getChunkPosition for both chunks.
 */
function addSpaceIfNecessary(newChunk, font) {
  // A leading space or hyphen on the new chunk already separates it.
  var leadingChar = newChunk.str[0];
  if (leadingChar === ' ' || leadingChar === '-') {
    return;
  }

  var previousCount = bidiTexts.length;
  if (previousCount === 0) {
    return;
  }

  var lastChunk = bidiTexts[previousCount - 1];
  var lastLength = lastChunk.str.length;
  if (lastLength === 0) {
    return;
  }
  // Likewise, a trailing space or hyphen on the previous chunk is enough.
  var trailingChar = lastChunk.str[lastLength - 1];
  if (trailingChar === ' ' || trailingChar === '-') {
    return;
  }

  var lastPosition = getChunkPosition(lastChunk, font);
  var newPosition = getChunkPosition(newChunk, font);

  // Chunks whose vertical offset reaches either chunk height are treated as
  // being on different lines and are always separated.
  var verticalGap = Math.abs(lastPosition.y - newPosition.y);
  if (verticalGap >= lastChunk.height || verticalGap >= newChunk.height) {
    lastChunk.str += ' ';
    return;
  }

  // Same line: prefer the text state's word spacing and fall back to a
  // fraction of the new chunk's average character width.
  var minGap = textState.wordSpacing;
  if (minGap === 0) {
    minGap = newChunk.width / newChunk.str.length * SPACE_FACTOR_CHUNKS;
  }

  // Pick which chunk comes first horizontally. Left to right, the previous
  // chunk leads; right to left, the new chunk does.
  var leftToRight = newPosition.x >= lastPosition.x;
  var leadEnd = leftToRight ? lastPosition.x + lastChunk.width :
                              newPosition.x + newChunk.width;
  var followStart = leftToRight ? newPosition.x : lastPosition.x;

  // Separate when the following chunk starts beyond the leading chunk's end
  // by at least the minimum gap.
  if (followStart >= leadEnd + minGap) {
    lastChunk.str += ' ';
  }
}

var timeSlotManager = new TimeSlotManager();

Expand Down Expand Up @@ -1121,6 +1189,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textState.translateTextMatrix(0, offset);
textChunk.height += offset;
}
// Automatically insert spaces if the shift is big enough.
if (items[j] < 0 && textState.font.spaceWidth > 0) {
var fakeSpaces = -items[j] / textState.font.spaceWidth;
if (fakeSpaces > MULTI_SPACE_FACTOR) {
Expand Down
1 change: 1 addition & 0 deletions test/pdfs/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,4 @@
!issue5481.pdf
!issue5567.pdf
!issue5701.pdf
!US6205527_page1.pdf
Binary file added test/pdfs/US6205527_page1.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ describe('api', function() {
expect(!!data.items).toEqual(true);
expect(data.items.length).toEqual(7);
expect(!!data.styles).toEqual(true);

// Make sure the text is ordered properly.
expect(data.items[1].str).toEqual('Table Of Content ');
expect(data.items[6].str.replace(/^\s+/,'')).toEqual('page 1 / 3');
});
});
it('gets operator list', function() {
Expand Down
78 changes: 78 additions & 0 deletions test/unit/text_layer_spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
/* globals PDFJS, expect, it, describe, Promise, combineUrl, waitsFor,
isArray, MissingPDFException */

'use strict';

/**
 * Blocks the current spec (via `waitsFor`) until `promise` settles.
 *
 * On fulfilment the value is handed to `successCallback`. On rejection a
 * failing expectation is recorded. In both cases the wait ends as soon as the
 * promise settles; completion is tracked with an explicit flag rather than by
 * inspecting the resolved value, so a promise that resolves with `undefined`
 * or rejects no longer stalls the spec until the timeout.
 *
 * @param {Promise} promise - The promise to wait for.
 * @param {function(*)} successCallback - Invoked with the fulfilment value.
 */
function waitsForPromiseResolved(promise, successCallback) {
  var settled = false;
  promise.then(function(value) {
    // Mark completion first so the waiter is released even if the callback
    // throws.
    settled = true;
    successCallback(value);
  },
  function(error) {
    // Shouldn't get here.
    settled = true;
    expect(false).toEqual(true);
  });
  waitsFor(function() {
    return settled;
  }, 20000);
}

/**
 * Starts loading the first page of a PDF and returns a handle to it.
 *
 * @param {string} pdf - PDF location, resolved against the current page URL.
 * @returns {{promise: Promise, page: Object}} `promise` settles with the
 *   first page (or rejects if the document or page fails to load); `page` is
 *   `null` until the page has loaded and holds the page afterwards.
 */
function getPageOneOf(pdf) {
  var pdfURL = combineUrl(window.location.href, pdf);
  // Chain the two steps instead of wrapping them in a hand-made promise, so
  // that a failure in either step rejects the returned promise rather than
  // leaving it pending forever.
  var pagePromise = PDFJS.getDocument(pdfURL).then(function(doc) {
    return doc.getPage(1);
  });
  var handle = {
    promise: pagePromise,
    // Filled in below once the page is available.
    page: null
  };
  waitsForPromiseResolved(pagePromise, function(loadedPage) {
    handle.page = loadedPage;
  });
  return handle;
}

describe('text-extract', function() {
  /**
   * Loads page one of `pdf`, extracts its text content and expects every
   * phrase to occur somewhere in the concatenation of all item strings.
   *
   * `toContain` is used rather than `indexOf(...) > 0` so that a phrase at
   * the very start of the text also counts as found, and so that a failure
   * reports the phrase that was missing.
   *
   * @param {string} pdf - PDF location relative to the test page.
   * @param {Array.<string>} phrases - Substrings that must appear in order
   *   within the extracted text.
   */
  function expectPageOneToContain(pdf, phrases) {
    var page = getPageOneOf(pdf);
    waitsForPromiseResolved(page.promise, function (loadedPage) {
      var textPromise = loadedPage.getTextContent();
      waitsForPromiseResolved(textPromise, function (textContent) {
        expect(!!textContent.items).toEqual(true);
        var text = textContent.items.map(function (item) {
          return item.str;
        }).join('');
        // Make sure the text is ordered properly.
        phrases.forEach(function (phrase) {
          expect(text).toContain(phrase);
        });
      });
    });
  }

  it('patent', function () {
    expectPageOneToContain('../pdfs/US6205527_page1.pdf', [
      'Disclosed is an apparatus, a system, a',
      'device to the computer system; (b) preparing ' +
        'a storage. media of the peripheral storage'
    ]);
  });

  it('tracemonkey', function () {
    expectPageOneToContain('../pdfs/tracemonkey.pdf', [
      'no concrete type information is available',
      'difficult to com-pile than statically ',
      'this work for personal or classroom use is'
    ]);
  });
});
1 change: 1 addition & 0 deletions test/unit/unit_test.html
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
<script src="stream_spec.js"></script>
<script src="parser_spec.js"></script>
<script src="api_spec.js"></script>
<script src="text_layer_spec.js"></script>
<script src="metadata_spec.js"></script>
<script src="util_spec.js"></script>
<script src="cmap_spec.js"></script>
Expand Down

0 comments on commit e9ad0b4

Please sign in to comment.