Combine text items is off #8963

luiscastro193 · 2017-09-28T12:17:31Z

Configuration:

Web browser and its version: Firefox 55.0.2 and Chrome 61.0.3163.100.
Operating system and its version: Linux Mint 18.1
PDF.js version: 1.9.602
Is an extension: No

Steps to reproduce the problem:

Try to extract the text from the PDF. For example:

<!doctype html>
<html>
	<head>
		<meta charset="utf-8">
		<title>PDF.js test</title>
		<meta name="viewport" content="width=device-width, initial-scale=1">
		<script src="https://unpkg.com/pdfjs-dist/build/pdf.min.js"></script>
	</head>
	<body>
		<input type="file" onchange="readText(this)" autofocus>
		<pre id="text"></pre>
		<script>
			"use strict";
			// Output element (the <pre id="text"> declared above) that readText fills with the extracted text.
			let pre = document.getElementById("text");
		
			/**
			 * Extracts the text of the PDF selected in a file input and shows it in `pre`.
			 *
			 * The first selected file is read as an ArrayBuffer and opened with
			 * `pdfjsLib.getDocument`; every text item's `str` is written on its own
			 * line, pages in order, so the item boundaries stay visible.
			 *
			 * @param {HTMLInputElement} filePath - file input whose `files[0]` is read.
			 */
			function readText(filePath) {
				const file = filePath.files[0];

				// Nothing selected (e.g. the picker was cancelled): nothing to read.
				if (!file)
					return;

				const reader = new FileReader();

				reader.onload = async function(event) {
					try {
						const pdf = await pdfjsLib.getDocument(event.target.result).promise;
						const pages = Array.from({length: pdf.numPages}, (_, i) => i + 1)
							.map(index => pdf.getPage(index).then(page => page.getTextContent())
								.then(text => text.items.map(item => item.str).join('\n')));
						// textContent rather than innerHTML: the extracted text is untrusted
						// and must not be parsed as markup ('<' and '&' would be mangled).
						pre.textContent = (await Promise.all(pages)).join('\n');
					} catch (error) {
						// Report load/parse failures instead of leaving an unhandled rejection.
						console.error(error);
					}
				};

				reader.readAsArrayBuffer(file);
			}
		</script>
	</body>
</html>

Compare the result with the original PDF.

What is the expected behavior?

A plain text version of the original PDF.

What went wrong?

Text from the same line is not combined properly.

yurydelendik · 2017-09-28T12:46:27Z

Closing as duplicate of #2140

luiscastro193 · 2017-09-28T14:03:22Z

I am trying to use a regex with the text of a PDF. Is it possible?

yurydelendik · 2017-09-28T14:24:14Z

I am trying to use a regex with the text of a PDF. Is it possible?

It's possible. You need to a) preprocess text items based on their positions, or b) build regex without relying on the line breaks.

luiscastro193 · 2017-09-28T18:59:23Z

You are right, the y-coordinate is the same for items on the same line. But I still can't deal with the random spaces it adds, splitting words in half.

Well, I guess I could disable combine text items and concatenate each symbol but... shouldn't it be just as easy to fix directly in the combine text items implementation?

luiscastro193 · 2017-09-28T19:04:18Z

Actually the random spaces are not solved by disableCombineTextItems. :(

Update: That was because of weird symbols. The y-coordinate solution should work in general but it looks kind of easy to fix for everybody. Plus it would also solve #2140

luiscastro193 · 2017-09-28T19:37:32Z

Workaround:

<!doctype html>
<html>
	<head>
		<meta charset="utf-8">
		<title>PDF.js test</title>
		<script src="https://mozilla.github.io/pdf.js/build/pdf.js"></script>
	</head>

	<body>
		<input type="file" onchange="readText(this)" autofocus>
		<pre id="text"></pre>
		<script>
			// Output element (the <pre id="text"> declared above) that readText fills with the extracted text.
			var pre = document.getElementById("text");
		
			/**
			 * Joins text items into one string, starting a new line whenever an item's
			 * y-coordinate (`transform[5]`) differs from the previous item's.
			 *
			 * @param {Array<{str: string, transform: number[]}>} items - text items of one page.
			 * @returns {string} the items' `str` values, with '\n' inserted at each y change.
			 */
			function joinItemsByLine(items) {
				let lastY;
				let str = '';

				for (const item of items) {
					const y = item.transform[5];

					// Test for undefined explicitly: a falsy check would treat a line at
					// y === 0 as "no previous item" and drop the line break after it.
					if (lastY !== undefined && lastY !== y)
						str += '\n';

					str += item.str;
					lastY = y;
				}

				return str;
			}

			/**
			 * Workaround: extracts the text of the PDF selected in a file input and shows
			 * it in `pre`, keeping items that share a y-coordinate on the same line.
			 *
			 * The first selected file is read as an ArrayBuffer and opened with
			 * `PDFJS.getDocument`; pages are joined with '\n' in page order.
			 *
			 * @param {HTMLInputElement} filePath - file input whose `files[0]` is read.
			 */
			function readText(filePath) {
				const file = filePath.files[0];

				// Nothing selected (e.g. the picker was cancelled): nothing to read.
				if (!file)
					return;

				const reader = new FileReader();

				reader.onload = function(event) {
					PDFJS.getDocument(new Uint8Array(event.target.result)).then(function(pdf) {
						const maxPages = pdf.pdfInfo.numPages;
						const countPromises = [];

						for (let i = 1; i <= maxPages; i++) {
							countPromises.push(pdf.getPage(i).then(function(page) {
								return page.getTextContent().then(function(text) {
									return joinItemsByLine(text.items);
								});
							}));
						}

						return Promise.all(countPromises).then(function(texts) {
							// textContent rather than innerHTML: the extracted text is untrusted
							// and must not be parsed as markup ('<' and '&' would be mangled).
							pre.textContent = texts.join('\n');
						});
					}).catch(function(error) {
						// Report load/parse failures instead of leaving an unhandled rejection.
						console.error(error);
					});
				};

				reader.readAsArrayBuffer(file);
			}
		</script>
	</body>
</html>

luiscastro193 · 2018-12-01T11:17:27Z

Even though the "duplicate" is fixed, this issue still isn't. Since the API has changed in the last year, this is now the code to test it:

<!doctype html>
<html>
	<head>
		<meta charset="utf-8">
		<title>PDF.js test</title>
		<script src="https://unpkg.com/pdfjs-dist/build/pdf.min.js"></script>
	</head>

	<body>
		<input type="file" onchange="readText(this)" autofocus>
		<pre id="text"></pre>
		<script>
			// Output element (the <pre id="text"> declared above) that readText fills with the extracted text.
			var pre = document.getElementById("text");
		
			/**
			 * Extracts the text of the PDF selected in a file input and shows it in `pre`.
			 *
			 * The first selected file is read as an ArrayBuffer and opened with
			 * `pdfjsLib.getDocument`; every text item's `str` is written on its own
			 * line, pages in order, so the item boundaries stay visible.
			 *
			 * @param {HTMLInputElement} filePath - file input whose `files[0]` is read.
			 */
			function readText(filePath) {
				const file = filePath.files[0];

				// Nothing selected (e.g. the picker was cancelled): nothing to read.
				if (!file)
					return;

				const reader = new FileReader();

				reader.onload = function(event) {
					pdfjsLib.getDocument(event.target.result).promise.then(function(pdf) {
						const maxPages = pdf.numPages;
						const countPromises = [];

						for (let i = 1; i <= maxPages; i++) {
							countPromises.push(pdf.getPage(i).then(function(page) {
								return page.getTextContent().then(function(text) {
									return text.items.map(function(item) { return item.str; }).join('\n');
								});
							}));
						}

						return Promise.all(countPromises).then(function(texts) {
							// textContent rather than innerHTML: the extracted text is untrusted
							// and must not be parsed as markup ('<' and '&' would be mangled).
							pre.textContent = texts.join('\n');
						});
					}).catch(function(error) {
						// Report load/parse failures instead of leaving an unhandled rejection.
						console.error(error);
					});
				};

				reader.readAsArrayBuffer(file);
			}
		</script>
	</body>
</html>

luiscastro193 · 2021-01-11T17:13:24Z

After all this time, the issue is still not fixed. This is the updated code to test it:

<!doctype html>
<html>
	<head>
		<meta charset="utf-8">
		<title>PDF.js test</title>
		<meta name="viewport" content="width=device-width, initial-scale=1">
		<script src="https://unpkg.com/pdfjs-dist/build/pdf.min.js"></script>
	</head>
	<body>
		<input type="file" onchange="readText(this)" autofocus>
		<pre id="text"></pre>
		<script>
			"use strict";
			// Output element (the <pre id="text"> declared above) that readText fills with the extracted text.
			let pre = document.getElementById("text");
		
			/**
			 * Extracts the text of the PDF selected in a file input and shows it in `pre`.
			 *
			 * The first selected file is read as an ArrayBuffer and opened with
			 * `pdfjsLib.getDocument`; every text item's `str` is written on its own
			 * line, pages in order, so the item boundaries stay visible.
			 *
			 * @param {HTMLInputElement} filePath - file input whose `files[0]` is read.
			 */
			function readText(filePath) {
				const file = filePath.files[0];

				// Nothing selected (e.g. the picker was cancelled): nothing to read.
				if (!file)
					return;

				const reader = new FileReader();

				reader.onload = async function(event) {
					try {
						const pdf = await pdfjsLib.getDocument(event.target.result).promise;
						const pages = Array.from({length: pdf.numPages}, (_, i) => i + 1)
							.map(index => pdf.getPage(index).then(page => page.getTextContent())
								.then(text => text.items.map(item => item.str).join('\n')));
						// textContent rather than innerHTML: the extracted text is untrusted
						// and must not be parsed as markup ('<' and '&' would be mangled).
						pre.textContent = (await Promise.all(pages)).join('\n');
					} catch (error) {
						// Report load/parse failures instead of leaving an unhandled rejection.
						console.error(error);
					}
				};

				reader.readAsArrayBuffer(file);
			}
		</script>
	</body>
</html>

yurydelendik closed this as completed Sep 28, 2017

snyk-bot mentioned this issue Dec 17, 2020

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#6

Open

yoonpsu mentioned this issue Feb 23, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#7

Open

This was referenced Mar 8, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#8

Open

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#9

Open

yoonpsu mentioned this issue Mar 15, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#10

Open

snyk-bot mentioned this issue Mar 30, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#11

Open

snyk-bot mentioned this issue Apr 17, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#12

Open

snyk-bot mentioned this issue Apr 27, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#13

Open

snyk-bot mentioned this issue May 27, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#14

Open

snyk-bot mentioned this issue Jun 8, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#15

Open

snyk-bot mentioned this issue Jun 23, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#16

Open

snyk-bot mentioned this issue Sep 16, 2021

[Snyk] Security upgrade react-scripts from 3.2.0 to 4.0.0 yoonpsu/pdf.js#18

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Combine text items is off #8963

Combine text items is off #8963

luiscastro193 commented Sep 28, 2017 •

edited

Loading

yurydelendik commented Sep 28, 2017

luiscastro193 commented Sep 28, 2017

yurydelendik commented Sep 28, 2017

luiscastro193 commented Sep 28, 2017 •

edited

Loading

luiscastro193 commented Sep 28, 2017 •

edited

Loading

luiscastro193 commented Sep 28, 2017 •

edited

Loading

luiscastro193 commented Dec 1, 2018 •

edited

Loading

luiscastro193 commented Jan 11, 2021

Combine text items is off #8963

Combine text items is off #8963

Comments

luiscastro193 commented Sep 28, 2017 • edited Loading

yurydelendik commented Sep 28, 2017

luiscastro193 commented Sep 28, 2017

yurydelendik commented Sep 28, 2017

luiscastro193 commented Sep 28, 2017 • edited Loading

luiscastro193 commented Sep 28, 2017 • edited Loading

luiscastro193 commented Sep 28, 2017 • edited Loading

luiscastro193 commented Dec 1, 2018 • edited Loading

luiscastro193 commented Jan 11, 2021

luiscastro193 commented Sep 28, 2017 •

edited

Loading

luiscastro193 commented Sep 28, 2017 •

edited

Loading

luiscastro193 commented Sep 28, 2017 •

edited

Loading

luiscastro193 commented Sep 28, 2017 •

edited

Loading

luiscastro193 commented Dec 1, 2018 •

edited

Loading