Skip to content

Commit

Permalink
Improve copy/paste by inserting spaces into textChunks if we deem it …
Browse files Browse the repository at this point in the history
…appropriate.

 Add test re same. PR mozilla#5783.
  • Loading branch information
speedplane committed Mar 5, 2015
1 parent fa0f09b commit 6e56084
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 0 deletions.
58 changes: 58 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -1037,8 +1037,66 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
} else {
textChunk.height += Math.abs(height * scaleCtmX * scaleLineX);
}
addSpaceIfNecessary(textChunk, font);
return textChunk;
}
/**
 * Derives a comparison position for a text chunk from its transform.
 *
 * Reads indices 0-5 of `chunk.transform`. The rotation is taken from the
 * first two entries (plus a quarter turn when `font.vertical` is truthy) and
 * the glyph height from the length of entries 2 and 3. When the rotation is
 * non-zero, x is shifted by the ascent projected through sin(rotation).
 *
 * @param {Object} chunk - Object exposing a `transform` array.
 * @param {Object} font - Object whose `vertical`, `ascent` and `descent`
 *   properties are consulted.
 * @returns {{x: number, y: number}} The derived position.
 */
function getChunkPos(chunk, font) {
  var m = chunk.transform;

  var rotation = Math.atan2(m[1], m[0]);
  if (font.vertical) {
    rotation += Math.PI / 2;
  }

  var glyphHeight = Math.sqrt((m[2] * m[2]) + (m[3] * m[3]));

  // Prefer the font's ascent; fall back to (1 + descent), and finally to the
  // full glyph height when neither metric is set (or is zero).
  var ascent = glyphHeight;
  if (font.ascent) {
    ascent = font.ascent * glyphHeight;
  } else if (font.descent) {
    ascent = (1 + font.descent) * glyphHeight;
  }

  var pos = { x: m[4], y: m[5] };
  if (rotation !== 0) {
    pos.x = m[4] + (ascent * Math.sin(rotation));
  }
  return pos;
}
/**
 * Appends a single space to the most recent entry of `bidiTexts` when the
 * incoming chunk looks like the start of a new word or a new line.
 *
 * Nothing is appended when either side of the boundary already carries a
 * space or a hyphen, when there is no previous chunk, or when the previous
 * chunk is empty. The same `font` is used to position both chunks.
 *
 * @param {Object} newChunk - Chunk about to follow the last one; its `str`,
 *   `width`, `height` and `transform` are read.
 * @param {Object} font - Font passed through to getChunkPos for both chunks.
 */
function addSpaceIfNecessary(newChunk, font) {
  // A leading space or hyphen on the new chunk already separates the words.
  var leading = newChunk.str[0];
  if (leading === ' ' || leading === '-') {
    return;
  }

  var chunkCount = bidiTexts.length;
  if (chunkCount === 0) {
    return;
  }

  var prevChunk = bidiTexts[chunkCount - 1];
  var prevStr = prevChunk.str;
  if (prevStr.length === 0) {
    return;
  }

  // A trailing space or hyphen on the previous chunk needs no extra space.
  var trailing = prevStr[prevStr.length - 1];
  if (trailing === ' ' || trailing === '-') {
    return;
  }

  var prevPos = getChunkPos(prevChunk, font);
  var curPos = getChunkPos(newChunk, font);
  var lineGap = Math.abs(prevPos.y - curPos.y);

  var needsSpace;
  if (lineGap >= prevChunk.height || lineGap >= newChunk.height) {
    // The chunks sit on different lines.
    needsSpace = true;
  } else {
    // Use the declared word spacing when positive; otherwise apply a
    // heuristic of 60% of the new chunk's average character width.
    var minGap = textState.wordSpacing;
    if (!(minGap > 0)) {
      minGap = newChunk.width / newChunk.str.length * 0.6;
    }

    if (curPos.x >= prevPos.x) {
      // Left to right: the new chunk starts past the end of the previous one.
      needsSpace = curPos.x >= prevPos.x + prevChunk.width + minGap;
    } else {
      // Right to left: the previous chunk starts past the end of the new one.
      needsSpace = prevPos.x >= curPos.x + newChunk.width + minGap;
    }
  }

  if (needsSpace) {
    prevChunk.str += ' ';
  }
}

var timeSlotManager = new TimeSlotManager();

Expand Down
1 change: 1 addition & 0 deletions test/pdfs/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,4 @@
!issue5481.pdf
!issue5567.pdf
!issue5701.pdf
!US6205527_page1.pdf
Binary file added test/pdfs/US6205527_page1.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ describe('api', function() {
expect(!!data.items).toEqual(true);
expect(data.items.length).toEqual(7);
expect(!!data.styles).toEqual(true);

// Make sure the text is ordered properly.
expect(data.items[1].str).toEqual('Table Of Content ');
expect(data.items[6].str.replace(/^\s+/,'')).toEqual('page 1 / 3');
});
});
it('gets operator list', function() {
Expand Down
78 changes: 78 additions & 0 deletions test/unit/text_extract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
/* globals PDFJS, expect, it, describe, Promise, combineUrl, waitsFor,
isArray, MissingPDFException */

'use strict';

/**
 * Makes the current spec wait (up to 20 seconds) until `promise` settles.
 *
 * On fulfillment `successCallback` is invoked with the resolved value. On
 * rejection a failing expectation is recorded. In both cases the wait ends
 * as soon as the promise settles.
 *
 * @param {Promise} promise - The promise (or thenable) to wait for.
 * @param {function(*)} successCallback - Called with the fulfillment value.
 */
function waitsForPromiseResolved(promise, successCallback) {
  // Settlement is tracked with its own flag rather than by inspecting the
  // value: a promise may legitimately fulfill with `undefined`, and a
  // rejection must not keep the spec polling until the timeout expires.
  var settled = false;
  promise.then(function (val) {
    settled = true;
    successCallback(val);
  }, function (reason) {
    settled = true;
    // Shouldn't get here.
    expect(false).toEqual(true);
  });
  waitsFor(function () {
    return settled;
  }, 20000);
}

/**
 * Starts loading the first page of a PDF and registers a wait for it.
 *
 * @param {string} pdf - URL of the PDF, resolved against the current
 *   window location.
 * @returns {{promise: Promise, page: Object}} `promise` settles with page 1
 *   of the document (or rejects if loading the document or page fails);
 *   `page` is `null` until the promise has fulfilled and holds the loaded
 *   page afterwards.
 */
function getPageOneOf(pdf) {
  var pdfURL = combineUrl(window.location.href, pdf);
  // Chain the two steps instead of resolving a hand-built promise, so that a
  // failure in either getDocument or getPage rejects `pagePromise` rather
  // than leaving it pending forever.
  var pagePromise = Promise.resolve(PDFJS.getDocument(pdfURL)).then(
    function (doc) {
      return doc.getPage(1);
    });
  var page = {
    promise: pagePromise,
    // Filled in below once the page has been loaded.
    page: null
  };
  waitsForPromiseResolved(pagePromise, function (data) {
    page.page = data;
  });
  return page;
}

// Specs checking that text extracted from page 1 of sample PDFs contains
// known phrases, i.e. that chunks are ordered and spaced as expected.
describe('text-extract', function() {
  /**
   * Loads page 1 of `pdf`, concatenates the `str` of every text item and
   * expects each entry of `phrases` to occur in the result.
   */
  function expectPageOneToContain(pdf, phrases) {
    var page = getPageOneOf(pdf);
    waitsForPromiseResolved(page.promise, function (loadedPage) {
      var textPromise = loadedPage.getTextContent();
      waitsForPromiseResolved(textPromise, function (data) {
        expect(!!data.items).toEqual(true);
        var text = data.items.map(function (d) { return d.str; }).join('');
        // Make sure the text is ordered properly.
        phrases.forEach(function (phrase) {
          // indexOf yields -1 when the phrase is absent; a match at index 0
          // is still a match.
          expect(text.indexOf(phrase) !== -1).toEqual(true);
        });
      });
    });
  }

  it('patent', function () {
    expectPageOneToContain('../pdfs/US6205527_page1.pdf', [
      'Disclosed is an apparatus, a system, a',
      'device to the computer system; (b) preparing ' +
        'a storage. media of the peripheral storage'
    ]);
  });

  it('tracemonkey', function () {
    expectPageOneToContain('../pdfs/tracemonkey.pdf', [
      'no concrete type information is available',
      'difficult to com-pile than statically ',
      'this work for personal or classroom use is'
    ]);
  });
});
1 change: 1 addition & 0 deletions test/unit/unit_test.html
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
<script src="stream_spec.js"></script>
<script src="parser_spec.js"></script>
<script src="api_spec.js"></script>
<script src="text_extract.js"></script>
<script src="metadata_spec.js"></script>
<script src="util_spec.js"></script>
<script src="cmap_spec.js"></script>
Expand Down

0 comments on commit 6e56084

Please sign in to comment.