Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Cannot parse CSV with newlines #642

Merged
merged 8 commits into from
Jan 28, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@ import {
} from './io-type-implementation';

export class TextFile
extends FileSystemFile<string[]>
extends FileSystemFile<string>
implements IOTypeImplementation<IOType.TEXT_FILE>
{
public readonly ioType = IOType.TEXT_FILE;
65 changes: 64 additions & 1 deletion libs/execution/src/lib/util/file-util.spec.ts
Original file line number Diff line number Diff line change
@@ -2,14 +2,25 @@
//
// SPDX-License-Identifier: AGPL-3.0-only

import { FileExtension, MimeType } from '../types';
import * as R from '../blocks';
import { FileExtension, MimeType, TextFile } from '../types';

import {
inferFileExtensionFromContentTypeString,
inferFileExtensionFromFileExtensionString,
inferMimeTypeFromFileExtensionString,
transformTextFileLines,
} from './file-util';

function exampleTextFile(content: string): TextFile {
return new TextFile(
'exampleTextFile',
FileExtension.TXT,
MimeType.TEXT_PLAIN,
content,
);
}

describe('Validation of file-util', () => {
describe('Function inferMimeTypeFromContentTypeString', () => {
it('should diagnose no error on known mimeType', () => {
@@ -68,4 +79,56 @@ describe('Validation of file-util', () => {
expect(result).toEqual(undefined);
});
});
describe('Function transformTextFileLines', () => {
it('should diagnose no error without newline', async () => {
const file = exampleTextFile('some text content without a newline');
// eslint-disable-next-line @typescript-eslint/require-await
const spy = vi.fn(async (lines: string[]) => R.ok(lines));
const result = await transformTextFileLines(file, /\r?\n/, spy);

expect(spy).toHaveBeenCalledOnce();
expect(spy).toHaveBeenCalledWith(['some text content without a newline']);

expect(R.isOk(result)).toBe(true);
assert(R.isOk(result));

expect(result.right).toStrictEqual(file);
});
it('should diagnose no error on empty file', async () => {
const file = exampleTextFile('');

// eslint-disable-next-line @typescript-eslint/require-await
const spy = vi.fn(async (lines: string[]) => R.ok(lines));
const result = await transformTextFileLines(file, /\r?\n/, spy);

expect(spy).toHaveBeenCalledOnce();
expect(spy).toHaveBeenCalledWith([]);

expect(R.isOk(result)).toBe(true);
assert(R.isOk(result));

expect(result.right).toStrictEqual(file);
});
it('should diagnose no error on file with trailing newline', async () => {
const file = exampleTextFile(`some text content
with a
trailing newline
`);
// eslint-disable-next-line @typescript-eslint/require-await
const spy = vi.fn(async (lines: string[]) => R.ok(lines));
const result = await transformTextFileLines(file, /\r?\n/, spy);

expect(spy).toHaveBeenCalledOnce();
expect(spy).toHaveBeenCalledWith([
'some text content',
'with a ',
'trailing newline',
]);

expect(R.isOk(result)).toBe(true);
assert(R.isOk(result));

expect(result.right).toStrictEqual(file);
});
});
});
33 changes: 32 additions & 1 deletion libs/execution/src/lib/util/file-util.ts
Original file line number Diff line number Diff line change
@@ -4,7 +4,8 @@

import * as mime from 'mime-types';

import { FileExtension, MimeType } from '../types';
import * as R from '../blocks';
import { FileExtension, MimeType, TextFile } from '../types';

export function inferMimeTypeFromFileExtensionString(
fileExtension: string | undefined,
@@ -50,3 +51,33 @@ export function inferFileExtensionFromContentTypeString(
}
return undefined;
}

export async function transformTextFileLines(
file: TextFile,
lineBreakPattern: RegExp,
transformFn: (lines: string[]) => Promise<R.Result<string[]>>,
): Promise<R.Result<TextFile>> {
const lines = file.content.split(lineBreakPattern);
const lineBreak = file.content.match(lineBreakPattern)?.at(0) ?? '';

// There may be an additional empty line due to the previous splitting
let emptyNewline = false;
if (lines[lines.length - 1] === '') {
emptyNewline = true;
lines.pop();
}
TungstnBallon marked this conversation as resolved.
Show resolved Hide resolved

const newLines = await transformFn(lines);
if (R.isErr(newLines)) {
return newLines;
}

let newContent = newLines.right.join(lineBreak);
if (emptyNewline) {
newContent += lineBreak;
}

return R.ok(
new TextFile(file.name, file.extension, file.mimeType, newContent),
);
}
1 change: 0 additions & 1 deletion libs/execution/src/lib/util/index.ts
Original file line number Diff line number Diff line change
@@ -4,4 +4,3 @@

export * from './implements-static-decorator';
export * from './file-util';
export * from './string-util';
14 changes: 0 additions & 14 deletions libs/execution/src/lib/util/string-util.ts

This file was deleted.

3 changes: 1 addition & 2 deletions libs/execution/test/utils/file-util.ts
Original file line number Diff line number Diff line change
@@ -12,7 +12,6 @@ import {
TextFile,
inferFileExtensionFromFileExtensionString,
inferMimeTypeFromFileExtensionString,
splitLines,
} from '../../src';

export function createBinaryFileFromLocalFile(fileName: string): BinaryFile {
@@ -39,6 +38,6 @@ export function createTextFileFromLocalFile(fileName: string): TextFile {
path.basename(fileName),
fileExtension,
mimeType,
splitLines(fileContent, /\r?\n/),
fileContent,
);
}
36 changes: 15 additions & 21 deletions libs/extensions/std/exec/src/text-file-interpreter-executor.spec.ts
Original file line number Diff line number Diff line change
@@ -92,9 +92,9 @@ describe('Validation of TextFileInterpreterExecutor', () => {
expect(R.isErr(result)).toEqual(false);
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.TEXT_FILE);
expect(result.right.content).toEqual(
expect.arrayContaining(['Multiline ', 'Test File']),
);
expect(result.right.content).toBe(`Multiline
Test File
`);
}
});

@@ -107,24 +107,18 @@ describe('Validation of TextFileInterpreterExecutor', () => {
expect(R.isErr(result)).toEqual(false);
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.TEXT_FILE);
expect(result.right.content).toEqual(
expect.arrayContaining(['vehicle:268435857"0']),
);
}
});

it('should diagnose no error on custom lineBreak', async () => {
const text = readJvTestAsset('valid-custom-line-break.jv');

const testFile = readTestFile('test.txt');
const result = await parseAndExecuteExecutor(text, testFile);

expect(R.isErr(result)).toEqual(false);
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.TEXT_FILE);
expect(result.right.content).toEqual(
expect.arrayContaining(['Multiline \nTest', 'File\n']),
);
const expectedBytes = Buffer.from([
0xa, 0xd, 0xa, 0x3, 0x32, 0x2e, 0x30, 0x10, 0x0, 0x18, 0xe9, 0xa9, 0xba,
0xef, 0xbf, 0xbd, 0x6, 0x12, 0x45, 0xa, 0x11, 0x76, 0x65, 0x68, 0x69,
0x63, 0x6c, 0x65, 0x3a, 0x32, 0x36, 0x38, 0x34, 0x33, 0x35, 0x38, 0x35,
0x37, 0x22, 0x30, 0xa, 0xe, 0xa, 0x8, 0x31, 0x35, 0x39, 0x32, 0x33,
0x34, 0x37, 0x34, 0x2a, 0x2, 0x31, 0x30, 0x12, 0xf, 0xd, 0x27, 0xef,
0xbf, 0xbd, 0x39, 0x42, 0x15, 0xef, 0xbf, 0xbd, 0xf, 0x1f, 0xef, 0xbf,
0xbd, 0x1d, 0x0, 0x0, 0x2c, 0x43, 0x28, 0x0, 0x42, 0xb, 0xa, 0x9, 0x32,
0x36, 0x38, 0x34, 0x33, 0x35, 0x38, 0x35, 0x37,
]);
const actualBytes = Buffer.from(result.right.content);
expect(actualBytes).toStrictEqual(expectedBytes);
}
});
});
15 changes: 2 additions & 13 deletions libs/extensions/std/exec/src/text-file-interpreter-executor.ts
Original file line number Diff line number Diff line change
@@ -12,7 +12,6 @@ import {
type ExecutionContext,
TextFile,
implementsStatic,
splitLines,
} from '@jvalue/jayvee-execution';
import { IOType } from '@jvalue/jayvee-language-server';

@@ -36,25 +35,15 @@ export class TextFileInterpreterExecutor extends AbstractBlockExecutor<
'encoding',
context.valueTypeProvider.Primitives.Text,
);
const lineBreak = context.getPropertyValue(
'lineBreak',
context.valueTypeProvider.Primitives.Regex,
);

const decoder = new TextDecoder(encoding);
context.logger.logDebug(
`Decoding file content using encoding "${encoding}"`,
);
const textContent = decoder.decode(file.content);

context.logger.logDebug(
`Splitting lines using line break /${lineBreak.source}/`,
return R.ok(
new TextFile(file.name, file.extension, file.mimeType, textContent),
);
const lines = splitLines(textContent, lineBreak);
context.logger.logDebug(
`Lines were split successfully, the resulting text file has ${lines.length} lines`,
);

return R.ok(new TextFile(file.name, file.extension, file.mimeType, lines));
}
}
16 changes: 10 additions & 6 deletions libs/extensions/std/exec/src/text-line-deleter-executor.spec.ts
Original file line number Diff line number Diff line change
@@ -92,8 +92,9 @@ describe('Validation of TextLineDeleterExecutor', () => {
expect(R.isErr(result)).toEqual(false);
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.TEXT_FILE);
expect(result.right.content).toEqual(
expect.arrayContaining(['Test File']),
expect(result.right.content).toBe(
`Test File
`,
);
}
});
@@ -107,8 +108,10 @@ describe('Validation of TextLineDeleterExecutor', () => {
expect(R.isErr(result)).toEqual(false);
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.TEXT_FILE);
expect(result.right.content).toEqual(
expect.arrayContaining(['Multiline', 'Test File']),
expect(result.right.content).toBe(
`Multiline
Test File
`,
);
}
});
@@ -136,8 +139,9 @@ describe('Validation of TextLineDeleterExecutor', () => {
expect(R.isErr(result)).toEqual(false);
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.TEXT_FILE);
expect(result.right.content).toEqual(
expect.arrayContaining(['Test File']),
expect(result.right.content).toBe(
`Test File
`,
);
}
});
73 changes: 41 additions & 32 deletions libs/extensions/std/exec/src/text-line-deleter-executor.ts
Original file line number Diff line number Diff line change
@@ -12,6 +12,40 @@ import {
} from '@jvalue/jayvee-execution';
import { IOType } from '@jvalue/jayvee-language-server';

// eslint-disable-next-line @typescript-eslint/require-await
async function deleteLines(
lines: string[],
deleteIdxs: number[],
context: ExecutionContext,
): Promise<R.Result<string[]>> {
let lineIdx = 0;
for (const deleteIdx of deleteIdxs) {
if (deleteIdx > lines.length) {
return R.err({
message: `Line ${deleteIdx} does not exist in the text file, only ${lines.length} line(s) are present`,
diagnostic: {
node: context.getOrFailProperty('lines').value,
property: 'values',
index: lineIdx,
},
});
}
++lineIdx;
}

const distinctLines = new Set(deleteIdxs);
const sortedLines = [...distinctLines].sort((a, b) => a - b);

context.logger.logDebug(`Deleting line(s) ${sortedLines.join(', ')}`);

const reversedLines = sortedLines.reverse();
for (const lineToDelete of reversedLines) {
lines.splice(lineToDelete - 1, 1);
}

return R.ok(lines);
}

@implementsStatic<BlockExecutorClass>()
export class TextLineDeleterExecutor extends AbstractBlockExecutor<
IOType.TEXT_FILE,
@@ -23,48 +57,23 @@ export class TextLineDeleterExecutor extends AbstractBlockExecutor<
super(IOType.TEXT_FILE, IOType.TEXT_FILE);
}

// eslint-disable-next-line @typescript-eslint/require-await
async doExecute(
file: TextFile,
context: ExecutionContext,
): Promise<R.Result<TextFile>> {
const lines = context.getPropertyValue(
const deleteIdxs = context.getPropertyValue(
'lines',
context.valueTypeProvider.createCollectionValueTypeOf(
context.valueTypeProvider.Primitives.Integer,
),
);
const numberOfLines = file.content.length;

let lineIndex = 0;
for (const lineNumber of lines) {
if (lineNumber > numberOfLines) {
return R.err({
message: `Line ${lineNumber} does not exist in the text file, only ${file.content.length} line(s) are present`,
diagnostic: {
node: context.getOrFailProperty('lines').value,
property: 'values',
index: lineIndex,
},
});
}

++lineIndex;
}

const distinctLines = new Set(lines);
const sortedLines = [...distinctLines].sort((a, b) => a - b);

context.logger.logDebug(`Deleting line(s) ${sortedLines.join(', ')}`);

const reversedLines = sortedLines.reverse();
const newContent = [...file.content];
for (const lineToDelete of reversedLines) {
newContent.splice(lineToDelete - 1, 1);
}
const lineBreakPattern = context.getPropertyValue(
'lineBreak',
context.valueTypeProvider.Primitives.Regex,
);

return R.ok(
new TextFile(file.name, file.extension, file.mimeType, newContent),
return R.transformTextFileLines(file, lineBreakPattern, (lines) =>
deleteLines(lines, deleteIdxs, context),
);
}
}
Original file line number Diff line number Diff line change
@@ -93,7 +93,9 @@ describe('Validation of TextRangeSelectorExecutor', () => {
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.TEXT_FILE);
expect(result.right.content).toEqual(
expect.arrayContaining(['Multiline', 'Test ']),
`Multiline
Test
`,
);
}
});
28 changes: 14 additions & 14 deletions libs/extensions/std/exec/src/text-range-selector-executor.ts
Original file line number Diff line number Diff line change
@@ -23,7 +23,6 @@ export class TextRangeSelectorExecutor extends AbstractBlockExecutor<
super(IOType.TEXT_FILE, IOType.TEXT_FILE);
}

// eslint-disable-next-line @typescript-eslint/require-await
async doExecute(
file: TextFile,
context: ExecutionContext,
@@ -36,20 +35,21 @@ export class TextRangeSelectorExecutor extends AbstractBlockExecutor<
'lineTo',
context.valueTypeProvider.Primitives.Integer,
);

const numberOfLines = file.content.length;

context.logger.logDebug(
`Selecting lines from ${lineFrom} to ${
lineTo === Number.MAX_SAFE_INTEGER || lineTo >= numberOfLines
? 'the end'
: `${lineTo}`
}`,
const lineBreakPattern = context.getPropertyValue(
'lineBreak',
context.valueTypeProvider.Primitives.Regex,
);
const selectedLines = file.content.slice(lineFrom - 1, lineTo);

return R.ok(
new TextFile(file.name, file.extension, file.mimeType, selectedLines),
);
// eslint-disable-next-line @typescript-eslint/require-await
return R.transformTextFileLines(file, lineBreakPattern, async (lines) => {
context.logger.logDebug(
`Selecting lines from ${lineFrom} to ${
lineTo === Number.MAX_SAFE_INTEGER || lineTo >= lines.length
? 'the end'
: `${lineTo}`
}`,
);
return R.ok(lines.slice(lineFrom - 1, lineTo));
});
}
}
Original file line number Diff line number Diff line change
@@ -100,4 +100,27 @@ describe('Validation of CSVInterpreterExecutor', () => {
]);
}
});

it('should correctly parse newlines within csv string values', async () => {
const text = readJvTestAsset('csv-with-newline-interpreter.jv');

const testCsv = readTestFile('csv-with-newline.csv');
const result = await parseAndExecuteExecutor(text, testCsv);

expect(R.isErr(result)).toEqual(false);
if (R.isOk(result)) {
expect(result.right.ioType).toEqual(IOType.SHEET);
expect(result.right.getNumberOfColumns()).toEqual(3);
expect(result.right.getNumberOfRows()).toEqual(2);
expect(result.right.getData()).toEqual([
['C1', 'C2', 'C3'],
[
'2',
`some
text`,
TungstnBallon marked this conversation as resolved.
Show resolved Hide resolved
'true',
],
]);
}
});
});
89 changes: 43 additions & 46 deletions libs/extensions/tabular/exec/src/lib/csv-interpreter-executor.ts
Original file line number Diff line number Diff line change
@@ -2,7 +2,10 @@
//
// SPDX-License-Identifier: AGPL-3.0-only

import { parseString as parseStringAsCsv } from '@fast-csv/parse';
// eslint-disable-next-line unicorn/prefer-node-protocol
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I asked this before, why not just add the node protocol?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

importing node:assert gives me this linter error:

'node:assert' import is restricted from being used. Please use the library `assert` instead to keep browser compatibility. You might need to disable rule `unicorn/prefer-node-protocol` to do so  no-restricted-imports`

import assert from 'assert';

import { Row, parseString as parseStringAsCsv } from '@fast-csv/parse';
import { type ParserOptionsArgs } from '@fast-csv/parse/build/src/ParserOptions';
import * as R from '@jvalue/jayvee-execution';
import {
@@ -14,7 +17,6 @@ import {
implementsStatic,
} from '@jvalue/jayvee-execution';
import { IOType } from '@jvalue/jayvee-language-server';
import { either as E } from 'fp-ts';

@implementsStatic<BlockExecutorClass>()
export class CSVInterpreterExecutor extends AbstractBlockExecutor<
@@ -53,56 +55,51 @@ export class CSVInterpreterExecutor extends AbstractBlockExecutor<
quote: enclosing,
escape: enclosingEscape,
};
const csvData = await parseAsCsv(file.content, parseOptions);

if (E.isLeft(csvData)) {
return Promise.resolve(
R.err({
message: `CSV parse failed in line ${csvData.left.lineNumber}: ${csvData.left.error.message}`,
diagnostic: { node: context.getCurrentNode(), property: 'name' },
}),
);
}
const sheet = new Sheet(csvData.right);

context.logger.logDebug(`Parsing raw data as CSV-sheet successful`);
return Promise.resolve(R.ok(sheet));
}
}

async function parseAsCsv(
lines: string[],
parseOptions: ParserOptionsArgs,
): Promise<E.Either<{ error: Error; lineNumber: number }, string[][]>> {
let lineNumber = 1;
const rows: string[][] = [];
for await (const line of lines) {
const rowParseResult = await parseLineAsRow(line, parseOptions);
if (E.isLeft(rowParseResult)) {
return E.left({ error: rowParseResult.left, lineNumber });
}
rows.push(rowParseResult.right);

++lineNumber;
return parseAsCSV(file.content, parseOptions).then(
(csvData) => {
context.logger.logDebug(`Parsing raw data as CSV-sheet successful`);
return R.ok(new Sheet(csvData));
},
(reason) => {
assert(typeof reason === 'string');
return R.err({
message: reason,
diagnostic: {
node: context.getCurrentNode(),
property: 'name',
},
});
},
Comment on lines +63 to +72
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow I didn't know that you could add a second parameter to .then :D I still prefer using .catch because it think has more semantic value, but I don't mind using it.

);
}
return E.right(rows);
}

async function parseLineAsRow(
line: string,
async function parseAsCSV(
text: string,
parseOptions: ParserOptionsArgs,
): Promise<E.Either<Error, string[]>> {
return new Promise((resolve) => {
let row: string[];
parseStringAsCsv(line, parseOptions)
.on('data', (data: string[]) => {
row = data;
})
.on('error', (error) => {
resolve(E.left(error));
): Promise<string[][]> {
return new Promise((resolve, reject) => {
const rows: string[][] = [];
parseStringAsCsv(text, parseOptions)
.on('data', (row: string[]) => {
rows.push(row);
})
.on('error', (error) =>
reject(
`Unexpected error while parsing CSV: ${error.name}: ${error.message}`,
),
)
.on(
'data-invalid',
(row: Row | null, rowCount: number, reason?: string) =>
reject(
`Invalid row ${rowCount}: ${
reason ?? 'Unknwon reason'
}: ${JSON.stringify(row ?? '')}`,
),
)
.on('end', () => {
resolve(E.right(row));
resolve(rows);
});
});
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// SPDX-FileCopyrightText: 2025 Friedrich-Alexander-Universitat Erlangen-Nurnberg
//
// SPDX-License-Identifier: AGPL-3.0-only

pipeline TestPipeline {

block TestExtractor oftype TestTextFileExtractor { }

block TestBlock oftype CSVInterpreter {
enclosing: '"';
}

block TestLoader oftype TestSheetLoader { }

TestExtractor
-> TestBlock
-> TestLoader;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
C1,C2,C3
2,"some
text",true
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SPDX-FileCopyrightText: 2025 Friedrich-Alexander-Universitat Erlangen-Nurnberg

SPDX-License-Identifier: AGPL-3.0-only
Original file line number Diff line number Diff line change
@@ -4,28 +4,28 @@

/**
* Interprets an input file as a csv-file containing string-values delimited by `delimiter` and outputs a `Sheet`.
*
*
* @example Interprets an input file as a csv-file containing string-values delimited by `;` and outputs `Sheet`.
* block AgencyCSVInterpreter oftype CSVInterpreter {
* block AgencyCSVInterpreter oftype CSVInterpreter {
* delimiter: ";";
* }
*/
publish builtin blocktype CSVInterpreter {
input default oftype TextFile;
output default oftype Sheet;

/**
* The delimiter for values in the CSV file.
*/
property delimiter oftype text: ',';

/**
* The enclosing character that may be used for values in the CSV file.
*/
property enclosing oftype text: '';

/**
* The character to escape enclosing characters in values.
*/
property enclosingEscape oftype text: '';
}
}
Original file line number Diff line number Diff line change
@@ -8,14 +8,9 @@
publish builtin blocktype TextFileInterpreter {
input default oftype File;
output default oftype TextFile;

/**
* The encoding used for decoding the file contents.
*/
property encoding oftype text: 'utf-8';

/**
* The regex for identifying line breaks.
*/
property lineBreak oftype Regex: /\r?\n/;
}
Original file line number Diff line number Diff line change
@@ -8,9 +8,14 @@
publish builtin blocktype TextLineDeleter {
input default oftype TextFile;
output default oftype TextFile;

/**
* The line numbers to delete.
*/
property lines oftype Collection<integer>;

/**
* The regex for identifying line breaks.
*/
property lineBreak oftype Regex: /\r?\n/;
}
Original file line number Diff line number Diff line change
@@ -19,4 +19,9 @@ publish builtin blocktype TextRangeSelector {
* The default value is the biggest usable integer.
*/
property lineTo oftype integer: 9007199254740991;
}

/**
* The regex for identifying line breaks.
*/
property lineBreak oftype Regex: /\r?\n/;
}