-
Notifications
You must be signed in to change notification settings - Fork 10.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Combine text items is off #8963
Comments
Closing as duplicate of #2140 |
I am trying to use a regex with the text of a PDF. Is it possible? |
It's possible. You need to a) preprocess text items based on their positions, or b) build regex without relying on the line breaks. |
You are right, the y-coordinate is the same for items on the same line. But I still can't deal with the random spaces it adds, splitting words in halves. Well, I guess I could disable combine text items and concatenate each symbol but... shouldn't it be just as easy to fix directly in the combine text items implementation? |
Actually the random spaces are not solved by disableCombineTextItems. :( Update: That was because of weird symbols. The y-coordinate solution should work in general but it looks kind of easy to fix for everybody. Plus it would also solve #2140 |
Workaround: <!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>PDF.js test</title>
<script src="https://mozilla.github.io/pdf.js/build/pdf.js"></script>
</head>
<body>
<input type="file" onchange="readText(this)" autofocus>
<pre id="text"></pre>
<script>
// Output element (<pre id="text">); readText() writes the extracted text into it.
var pre = document.getElementById("text");
/**
 * Reads the PDF selected in a file input and writes its text into `pre`.
 *
 * Text items of a page are concatenated; a line break is inserted whenever
 * the y-coordinate (`item.transform[5]`) differs from the previous item's.
 * Pages are joined with a line break as well.
 *
 * @param {HTMLInputElement} filePath - Despite the name, this is the file
 *   input element itself (the page passes `this` from its `onchange`).
 */
function readText(filePath) {
  "use strict";
  const file = filePath.files[0];
  if (!file) {
    // No file selected (e.g. the dialog was cancelled): nothing to read.
    return;
  }
  const reader = new FileReader();
  reader.onload = function(event) {
    PDFJS.getDocument(new Uint8Array(event.target.result)).then(function(pdf) {
      const maxPages = pdf.pdfInfo.numPages;
      const countPromises = [];
      for (let i = 1; i <= maxPages; i++) {
        countPromises.push(pdf.getPage(i).then(function(page) {
          return page.getTextContent().then(function(text) {
            let lastY;
            let str = '';
            for (const item of text.items) {
              const y = item.transform[5];
              // Compare with undefined explicitly: y === 0 is a valid
              // coordinate and must not be mistaken for "no previous item".
              if (lastY !== undefined && lastY !== y) {
                str += '\n';
              }
              str += item.str;
              lastY = y;
            }
            return str;
          });
        }));
      }
      return Promise.all(countPromises);
    }).then(function(texts) {
      // textContent, not innerHTML: the PDF text is plain text and may
      // contain characters such as "<" or "&" that must not be parsed as markup.
      pre.textContent = texts.join('\n');
    }).catch(function(error) {
      console.error(error);
      pre.textContent = `Could not extract text: ${error}`;
    });
  };
  reader.readAsArrayBuffer(file);
}
</script>
</body>
</html> |
Even though the "duplicate" is fixed, this issue still isn't. Since the API has changed in the last year, this is now the code to test it: <!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>PDF.js test</title>
<script src="https://unpkg.com/pdfjs-dist/build/pdf.min.js"></script>
</head>
<body>
<input type="file" onchange="readText(this)" autofocus>
<pre id="text"></pre>
<script>
// Output element (<pre id="text">); readText() writes the extracted text into it.
var pre = document.getElementById("text");
/**
 * Reads the PDF selected in a file input and writes its text into `pre`.
 *
 * Every text item returned by `getTextContent()` is put on its own line,
 * which makes the boundaries between items visible in the output.
 *
 * @param {HTMLInputElement} filePath - Despite the name, this is the file
 *   input element itself (the page passes `this` from its `onchange`).
 */
function readText(filePath) {
  const file = filePath.files[0];
  if (!file) {
    // No file selected (e.g. the dialog was cancelled): nothing to read.
    return;
  }
  const reader = new FileReader();
  reader.onload = function(event) {
    pdfjsLib.getDocument(event.target.result).promise.then(function(pdf) {
      const maxPages = pdf.numPages;
      const countPromises = [];
      for (let i = 1; i <= maxPages; i++) {
        countPromises.push(pdf.getPage(i).then(function(page) {
          return page.getTextContent();
        }).then(function(text) {
          return text.items.map(function(item) { return item.str; }).join('\n');
        }));
      }
      return Promise.all(countPromises);
    }).then(function(texts) {
      // textContent, not innerHTML: the PDF text is plain text and may
      // contain characters such as "<" or "&" that must not be parsed as markup.
      pre.textContent = texts.join('\n');
    }).catch(function(error) {
      console.error(error);
      pre.textContent = `Could not extract text: ${error}`;
    });
  };
  reader.readAsArrayBuffer(file);
}
</script>
</body>
</html> |
After all this time, the issue is still not fixed. This is the updated code to test it: <!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>PDF.js test</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<script src="https://unpkg.com/pdfjs-dist/build/pdf.min.js"></script>
</head>
<body>
<input type="file" onchange="readText(this)" autofocus>
<pre id="text"></pre>
<script>
"use strict";
// Output element (<pre id="text">); readText() writes the extracted text into it.
let pre = document.getElementById("text");
/**
 * Reads the PDF selected in a file input and writes its text into `pre`.
 *
 * All pages are requested in parallel. Every text item returned by
 * `getTextContent()` is put on its own line, which makes the boundaries
 * between items visible in the output.
 *
 * @param {HTMLInputElement} filePath - Despite the name, this is the file
 *   input element itself (the page passes `this` from its `onchange`).
 */
function readText(filePath) {
  const file = filePath.files[0];
  if (!file) {
    // No file selected (e.g. the dialog was cancelled): nothing to read.
    return;
  }
  const reader = new FileReader();
  reader.onload = async function(event) {
    try {
      const pdf = await pdfjsLib.getDocument(event.target.result).promise;
      const pageNumbers = Array.from({length: pdf.numPages}, (_, i) => i + 1);
      const pages = pageNumbers.map(async (index) => {
        const page = await pdf.getPage(index);
        const text = await page.getTextContent();
        return text.items.map((item) => item.str).join('\n');
      });
      // textContent, not innerHTML: the PDF text is plain text and may
      // contain characters such as "<" or "&" that must not be parsed as markup.
      pre.textContent = (await Promise.all(pages)).join('\n');
    } catch (error) {
      // Without this, a failure would surface only as an unhandled rejection.
      console.error(error);
      pre.textContent = `Could not extract text: ${error}`;
    }
  };
  reader.readAsArrayBuffer(file);
}
</script>
</body>
</html> |
Link to PDF file: Hyphenator.pdf
Configuration:
Steps to reproduce the problem:
What is the expected behavior?
A plain text version of the original PDF.
What went wrong?
Text from the same line is not combined properly.
The text was updated successfully, but these errors were encountered: