Skip to content
This repository has been archived by the owner on Oct 17, 2024. It is now read-only.

Add chunked decoding support to CodePage #91

Merged
merged 5 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
## 3.1.2-dev

- Require Dart 2.19
- Add chunked decoding support (`startChunkedConversion`) for `CodePage`
encodings.

## 3.1.1

Expand Down
27 changes: 27 additions & 0 deletions lib/src/codepage.dart
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,25 @@ CodePageDecoder _createDecoder(String characters) {
return _NonBmpCodePageDecoder._(result);
}

/// An input Sink for decoders where each input byte can be considered
brianquinlan marked this conversation as resolved.
Show resolved Hide resolved
/// independently.
class _CodePageSink implements Sink<List<int>> {
brianquinlan marked this conversation as resolved.
Show resolved Hide resolved
brianquinlan marked this conversation as resolved.
Show resolved Hide resolved
final Sink<String> _output;
final String Function(List<int> input) _convert;
brianquinlan marked this conversation as resolved.
Show resolved Hide resolved

_CodePageSink(this._output, this._convert);

/// Decodes [chunk] with the conversion function given at construction and
/// forwards the resulting string to the output sink.
@override
void add(List<int> chunk) {
  final decoded = _convert(chunk);
  _output.add(decoded);
}

/// Closes the wrapped output sink.
@override
void close() => _output.close();
}

/// Code page with non-BMP characters.
class _NonBmpCodePageDecoder extends Converter<List<int>, String>
implements CodePageDecoder {
Expand Down Expand Up @@ -326,6 +345,10 @@ class _NonBmpCodePageDecoder extends Converter<List<int>, String>
}
return String.fromCharCodes(buffer);
}

/// Returns a sink that decodes each added chunk of bytes with [convert] and
/// forwards the resulting string to [sink].
@override
Sink<List<int>> startChunkedConversion(Sink<String> sink) {
  return _CodePageSink(sink, convert);
}
}

class _BmpCodePageDecoder extends Converter<List<int>, String>
Expand Down Expand Up @@ -360,6 +383,10 @@ class _BmpCodePageDecoder extends Converter<List<int>, String>
return String.fromCharCodes(codeUnits);
}

/// Returns a sink that passes every added chunk of bytes through [convert]
/// and adds the decoded string to [sink].
@override
Sink<List<int>> startChunkedConversion(Sink<String> sink) {
  return _CodePageSink(sink, convert);
}

String _convertAllowInvalid(List<int> bytes) {
var count = bytes.length;
var codeUnits = Uint16List(count);
Expand Down
116 changes: 90 additions & 26 deletions test/codepage_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

import 'dart:convert';
import 'dart:core';
import 'dart:typed_data';

import 'package:convert/convert.dart';
Expand All @@ -25,24 +27,52 @@ void main() {
latinThai,
latinArabic
]) {
test('${cp.name} codepage', () {
// All ASCII compatible.
for (var byte = 0x20; byte < 0x7f; byte++) {
expect(cp[byte], byte);
}
// Maps both directions.
for (var byte = 0; byte < 256; byte++) {
var char = cp[byte];
if (char != 0xFFFD) {
var string = String.fromCharCode(char);
expect(cp.encode(string), [byte]);
expect(cp.decode([byte]), string);
group('${cp.name} codepage', () {
test('ascii compatible', () {
for (var byte = 0x20; byte < 0x7f; byte++) {
expect(cp[byte], byte);
}
}
expect(() => cp.decode([0xfffd]), throwsA(isA<FormatException>()));
// Decode works like operator[].
expect(cp.decode(bytes, allowInvalid: true),
String.fromCharCodes([for (var i = 0; i < 256; i++) cp[i]]));
});

test('bidirectional mapping', () {
// Maps both directions.
for (var byte = 0; byte < 256; byte++) {
var char = cp[byte];
if (char != 0xFFFD) {
var string = String.fromCharCode(char);
expect(cp.encode(string), [byte]);
expect(cp.decode([byte]), string);
}
}
});

test('decode invalid characters not allowed', () {
expect(() => cp.decode([0xfffd]), throwsA(isA<FormatException>()));
});

test('decode invalid characters allowed', () {
// Decode works like operator[].
expect(cp.decode(bytes, allowInvalid: true),
String.fromCharCodes([for (var i = 0; i < 256; i++) cp[i]]));
});

test('chunked conversion', () {
late final String decodedString;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just initialize it to "" instead of being late. (I don't like late 😉 )

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like mutability ;-) I'd prefer to have late and make it final rather than lose the final specifier ;-)

final outputSink = StringConversionSink.withCallback(
(accumulated) => decodedString = accumulated);
final inputSink = cp.decoder.startChunkedConversion(outputSink);
final expected = StringBuffer();

for (var byte = 0; byte < 256; byte++) {
var char = cp[byte];
if (char != 0xFFFD) {
inputSink.add([byte]);
expected.writeCharCode(char);
}
}
inputSink.close();
expect(decodedString, expected.toString());
});
});
}
test('latin-2 roundtrip', () {
Expand All @@ -62,14 +92,48 @@ void main() {
expect(decoded, latin2text);
});

test('Custom code page', () {
var cp = CodePage('custom', "ABCDEF${"\uFFFD" * 250}");
var result = cp.encode('BADCAFE');
expect(result, [1, 0, 3, 2, 0, 5, 4]);
expect(() => cp.encode('GAD'), throwsFormatException);
expect(cp.encode('GAD', invalidCharacter: 0x3F), [0x3F, 0, 3]);
expect(cp.decode([1, 0, 3, 2, 0, 5, 4]), 'BADCAFE');
expect(() => cp.decode([6, 1, 255]), throwsFormatException);
expect(cp.decode([6, 1, 255], allowInvalid: true), '\u{FFFD}B\u{FFFD}');
group('Custom code page', () {
late final CodePage cp;

setUpAll(() => cp = CodePage('custom', "ABCDEF${"\uFFFD" * 250}"));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code pages should be immutable, so just initialize it directly, without using setUp. (I also don't like setUp 😁).

Unless you fear that it will throw?

(What's the difference between setUp and setUpAll?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

setUp runs for each embedded test, setUpAll runs once.

Done.


test('simple encode', () {
  // Each letter encodes to its index in the custom code page.
  expect(cp.encode('BADCAFE'), [1, 0, 3, 2, 0, 5, 4]);
});

test('unencodable character', () {
  // 'G' is not one of the characters in the custom code page.
  List<int> encodeUnmapped() => cp.encode('GAD');
  expect(encodeUnmapped, throwsFormatException);
});

test('unencodable character with invalidCharacter', () {
  final encoded = cp.encode('GAD', invalidCharacter: 0x3F);
  expect(encoded, [0x3F, 0, 3]);
});

test('simple decode', () {
  final decoded = cp.decode([1, 0, 3, 2, 0, 5, 4]);
  expect(decoded, 'BADCAFE');
});

test('undecodable byte', () {
  // Bytes 6 and 255 map to U+FFFD in the custom code page.
  String decodeUnmapped() => cp.decode([6, 1, 255]);
  expect(decodeUnmapped, throwsFormatException);
});

test('undecodable byte with allowInvalid', () {
  final decoded = cp.decode([6, 1, 255], allowInvalid: true);
  expect(decoded, '\u{FFFD}B\u{FFFD}');
});

test('chunked conversion', () {
  // Assigned exactly once, by the callback passed to the output sink.
  late final String decoded;
  final collector =
      StringConversionSink.withCallback((result) => decoded = result);
  final byteSink = cp.decoder.startChunkedConversion(collector);

  // Feed the bytes for 'B', 'A' and 'D' as three separate one-byte chunks.
  for (final byte in [1, 0, 3]) {
    byteSink.add([byte]);
  }
  byteSink.close();

  expect(decoded, 'BAD');
});
});
}