diff --git a/buffer/buffer.mbt b/buffer/buffer.mbt index b22db3ace..9ec49f187 100644 --- a/buffer/buffer.mbt +++ b/buffer/buffer.mbt @@ -84,11 +84,16 @@ pub fn to_string(self : T) -> String { /// Return a new unchecked string contains the data in buffer. /// Note this function does not validate the encoding of the byte sequence, /// it simply copy the bytes into a new String. -pub fn to_unchecked_string(self : T) -> String { - Bytes::from_fixedarray(self.data).to_unchecked_string( - offset=0, - length=self.len, - ) +pub fn to_unchecked_string(self : T, offset? : Int, length? : Int) -> String { + let offset = match offset { + None => 0 + Some(x) => x + } + let length = match length { + None => self.len + Some(x) => x + } + Bytes::from_fixedarray(self.data).to_unchecked_string(offset~, length~) } ///| @@ -99,6 +104,22 @@ pub fn T::new(size_hint~ : Int = 0) -> T { { data, len: 0, initial_data: data } } +///| +pub fn T::from_bytes(bytes : Bytes) -> T { + let buf = T::new(size_hint=bytes.length()) + buf.write_bytes(bytes) + buf +} + +///| +pub fn T::from_array(arr : Array[Byte]) -> T { + let buf = T::new(size_hint=arr.length()) + for byte in arr { + buf.write_byte(byte) + } + buf +} + ///| /// Write a string into buffer. pub fn write_string(self : T, value : String) -> Unit { @@ -147,10 +168,26 @@ pub fn write_sub_string( } ///| -/// Write a char into buffer. +/// Write a char into buffer as UTF16LE. pub fn write_char(self : T, value : Char) -> Unit { self.grow_if_necessary(self.len + 4) - let inc = self.data.set_utf16_char(self.len, value) + let inc = self.data.set_utf16le_char(self.len, value) + self.len += inc +} + +///| +/// Write a char into buffer as UTF16BE. +pub fn write_utf16be_char(self : T, value : Char) -> Unit { + self.grow_if_necessary(self.len + 4) + let inc = self.data.set_utf16be_char(self.len, value) + self.len += inc +} + +///| +/// Write a char into buffer as UTF8. 
+pub fn write_utf8_char(self : T, value : Char) -> Unit { + self.grow_if_necessary(self.len + 4) + let inc = self.data.set_utf8_char(self.len, value) self.len += inc } @@ -162,6 +199,11 @@ pub fn write_byte(self : T, value : Byte) -> Unit { self.len += 1 } +///| +pub fn blit(self : T, srcoff : Int, dst : T, dstoff : Int, len : Int) -> Unit { + Bytes::blit(self.to_bytes(), srcoff, dst.to_bytes(), dstoff, len) +} + ///| pub fn reset(self : T) -> Unit { self.data = self.initial_data @@ -173,7 +215,26 @@ pub fn to_bytes(self : T) -> Bytes { Bytes::from_fixedarray(self.data, len=self.len) } +///| +pub fn to_array(self : T) -> Array[Byte] { + self.to_bytes().to_array() +} + +///| +pub fn op_set(self : T, index : Int, value : Byte) -> Unit { + let len = self.length() + guard index >= 0 && index < len + self.data[index] = value +} + +///| +pub fn op_get(self : T, index : Int) -> Byte { + let len = self.length() + guard index >= 0 && index < len + self.data[index] +} + ///| pub impl Show for T with output(self, logger) { - logger.write_string(self.to_unchecked_string()) + logger.write_string(self.to_unchecked_string(offset=0, length=self.len)) } diff --git a/buffer/buffer.mbti b/buffer/buffer.mbti index 41570ecb8..d2bf8c98a 100644 --- a/buffer/buffer.mbti +++ b/buffer/buffer.mbti @@ -5,13 +5,19 @@ package moonbitlang/core/buffer // Types and methods type T impl T { + blit(Self, Int, Self, Int, Int) -> Unit + from_array(Array[Byte]) -> Self + from_bytes(Bytes) -> Self is_empty(Self) -> Bool length(Self) -> Int new(size_hint~ : Int = ..) -> Self + op_get(Self, Int) -> Byte + op_set(Self, Int, Byte) -> Unit reset(Self) -> Unit + to_array(Self) -> Array[Byte] to_bytes(Self) -> Bytes to_string(Self) -> String //deprecated - to_unchecked_string(Self) -> String + to_unchecked_string(Self, offset? : Int, length? 
: Int) -> String write_byte(Self, Byte) -> Unit write_bytes(Self, Bytes) -> Unit write_char(Self, Char) -> Unit @@ -19,6 +25,8 @@ impl T { write_string(Self, String) -> Unit write_sub_string(Self, String, Int, Int) -> Unit //deprecated write_substring(Self, String, Int, Int) -> Unit + write_utf16be_char(Self, Char) -> Unit + write_utf8_char(Self, Char) -> Unit } impl Show for T diff --git a/builtin/builtin.mbti b/builtin/builtin.mbti index 0935940a1..740afb865 100644 --- a/builtin/builtin.mbti +++ b/builtin/builtin.mbti @@ -246,6 +246,7 @@ impl Iter { tap[T](Self[T], (T) -> Unit) -> Self[T] //deprecated to_array[T](Self[T]) -> Array[T] to_string[T : Show](Self[T]) -> String + try_collect[T, E : Error](Self[Result[T, E]]) -> Array[T]!E } impl[T : Show] Show for Iter[T] @@ -667,7 +668,10 @@ impl FixedArray { op_get[T](Self[T], Int) -> T op_set[T](Self[T], Int, T) -> Unit set[T](Self[T], Int, T) -> Unit - set_utf16_char(Self[Byte], Int, Char) -> Int + set_utf16_char(Self[Byte], Int, Char) -> Int //deprecated + set_utf16be_char(Self[Byte], Int, Char) -> Int + set_utf16le_char(Self[Byte], Int, Char) -> Int + set_utf8_char(Self[Byte], Int, Char) -> Int to_json[X : ToJson](Self[X]) -> Json to_string[X : Show](Self[X]) -> String unsafe_blit[A](Self[A], Int, Self[A], Int, Int) -> Unit @@ -685,7 +689,7 @@ impl Bytes { op_equal(Bytes, Bytes) -> Bool op_get(Bytes, Int) -> Byte op_set(Bytes, Int, Byte) -> Unit - set_utf16_char(Bytes, Int, Char) -> Int + set_utf16_char(Bytes, Int, Char) -> Int //deprecated set_utf8_char(Bytes, Int, Char) -> Int //deprecated sub_string(Bytes, Int, Int) -> String //deprecated to_string(Bytes) -> String //deprecated diff --git a/builtin/bytes.mbt b/builtin/bytes.mbt index 4a54f8cde..97501aa19 100644 --- a/builtin/bytes.mbt +++ b/builtin/bytes.mbt @@ -140,7 +140,7 @@ pub fn copy(self : Bytes) -> Bytes { } ///| -/// Fill utf8 encoded char `value` into byte sequence `self`, starting at `offset`. 
+/// Fill UTF8 encoded char `value` into byte sequence `self`, starting at `offset`. /// It return the length of bytes has been written. /// @alert deprecated "The type Bytes is about to be changed to be immutable. Use `FixedArray[Byte]` or `Buffer` instead." pub fn set_utf8_char(self : Bytes, offset : Int, value : Char) -> Int { @@ -169,9 +169,40 @@ pub fn set_utf8_char(self : Bytes, offset : Int, value : Char) -> Int { } ///| -/// Fill utf16 encoded char `value` into byte sequence `self`, starting at `offset`. +pub fn set_utf8_char( + self : FixedArray[Byte], + offset : Int, + value : Char +) -> Int { + let code = value.to_uint() + if code < 0x80 { + self[offset] = ((code & 0x7F) | 0x00).to_byte() + 1 + } else if code < 0x0800 { + self[offset] = (((code >> 6) & 0x1F) | 0xC0).to_byte() + self[offset + 1] = ((code & 0x3F) | 0x80).to_byte() + 2 + } else if code < 0x010000 { + self[offset] = (((code >> 12) & 0x0F) | 0xE0).to_byte() + self[offset + 1] = (((code >> 6) & 0x3F) | 0x80).to_byte() + self[offset + 2] = ((code & 0x3F) | 0x80).to_byte() + 3 + } else if code < 0x110000 { + self[offset] = (((code >> 18) & 0x07) | 0xF0).to_byte() + self[offset + 1] = (((code >> 12) & 0x3F) | 0x80).to_byte() + self[offset + 2] = (((code >> 6) & 0x3F) | 0x80).to_byte() + self[offset + 3] = ((code & 0x3F) | 0x80).to_byte() + 4 + } else { + abort("Char out of range") + } +} + +///| +/// Fill UTF16 encoded char `value` into byte sequence `self`, starting at `offset`. /// It return the length of bytes has been written. /// @alert unsafe "Panic if the [value] is out of range" +/// @alert deprecated "The type Bytes is about to be changed to be immutable. Use `FixedArray[Byte]` or `Buffer` instead." 
pub fn set_utf16_char(self : Bytes, offset : Int, value : Char) -> Int { let code = value.to_uint() if code < 0x10000 { @@ -196,6 +227,7 @@ pub fn set_utf16_char(self : Bytes, offset : Int, value : Char) -> Int { /// Fill utf16 encoded char `value` into byte sequence `self`, starting at `offset`. /// It return the length of bytes has been written. /// @alert unsafe "Panic if the [value] is out of range" +/// @alert deprecated "Use `set_utf16le_char` instead" pub fn set_utf16_char( self : FixedArray[Byte], offset : Int, @@ -220,6 +252,62 @@ pub fn set_utf16_char( } } +///| +/// Fill UTF16LE encoded char `value` into byte sequence `self`, starting at `offset`. +/// It return the length of bytes has been written. +/// @alert unsafe "Panic if the [value] is out of range" +pub fn set_utf16le_char( + self : FixedArray[Byte], + offset : Int, + value : Char +) -> Int { + let code = value.to_uint() + if code < 0x10000 { + self[offset] = (code & 0xFF).to_byte() + self[offset + 1] = (code >> 8).to_byte() + 2 + } else if code < 0x110000 { + let hi = code - 0x10000 + let lo = (hi >> 10) | 0xD800 + let hi = (hi & 0x3FF) | 0xDC00 + self[offset] = (lo & 0xFF).to_byte() + self[offset + 1] = (lo >> 8).to_byte() + self[offset + 2] = (hi & 0xFF).to_byte() + self[offset + 3] = (hi >> 8).to_byte() + 4 + } else { + abort("Char out of range") + } +} + +///| +/// Fill UTF16BE encoded char `value` into byte sequence `self`, starting at `offset`. +/// It return the length of bytes has been written. 
+/// @alert unsafe "Panic if the [value] is out of range" +pub fn set_utf16be_char( + self : FixedArray[Byte], + offset : Int, + value : Char +) -> Int { + let code = value.to_uint() + if code < 0x10000 { + self[offset] = (code >> 8).to_byte() // high byte first (big-endian) + self[offset + 1] = (code & 0xFF).to_byte() + 2 + } else if code < 0x110000 { + let hi = code - 0x10000 + let lo = (hi >> 10) | 0xD800 + let hi = (hi & 0x3FF) | 0xDC00 + self[offset] = (lo >> 8).to_byte() + self[offset + 1] = (lo & 0xFF).to_byte() + self[offset + 2] = (hi >> 8).to_byte() + self[offset + 3] = (hi & 0xFF).to_byte() + 4 + } else { + abort("Char out of range") + } +} + ///| pub fn op_equal(self : Bytes, other : Bytes) -> Bool { if self.length() != other.length() { diff --git a/builtin/iter.mbt b/builtin/iter.mbt index 28e03ba78..75a2f6263 100644 --- a/builtin/iter.mbt +++ b/builtin/iter.mbt @@ -786,6 +786,19 @@ pub fn collect[T](self : Iter[T]) -> Array[T] { result } +///| +/// Collects the `Ok` values of the iterator into an array, raising the first `Err` encountered. +pub fn try_collect[T, E : Error](self : Iter[Result[T, E]]) -> Array[T]!E { + let result = [] + for a in self { + match a { + Ok(x) => result.push(x) + Err(e) => raise e + } + } + result +} + ///| /// Iter itself is an iterator. /// so that it works with array spread operator. e.g, `[..iter]` diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt new file mode 100644 index 000000000..19f577406 --- /dev/null +++ b/encoding/decoding.mbt @@ -0,0 +1,418 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +///| +const U_REP = '\u{FFFD}' + +///| +let utf_8_len = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, + 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +] + +///| +pub fn decode_lossy(encoding : Encoding, src : @buffer.T) -> LossyChars { + let decoder = decoder(encoding, src) + decoder +} + +///| +pub fn decode_strict(encoding : Encoding, src : @buffer.T) -> StrictChars { + let decoder = decoder(encoding, src) + decoder +} + +// Implements + +///| +fn decoder(encoding : Encoding, src : @buffer.T) -> Decoder { + let i = src + let i_pos = 0 + let i_max = src.length() - 1 + let t = @buffer.from_bytes(b"\x00\x00\x00\x00") + let t_len = 0 + let t_need = 0 + let k = match encoding { + UTF8 => decode_utf_8 + UTF16 => decode_utf_16le + UTF16LE => decode_utf_16le + UTF16BE => decode_utf_16be + } + { i, i_pos, i_max, t, t_len, t_need, k } +} + +///| +fn decode(self : Decoder) -> Decode { + (self.k)(self) +} + +///| +fn ret(self : Decoder, k : Cont, v : Decode) -> Decode { + self.k = k + v +} + +///| +fn i_rem(self : Decoder) -> Int { + self.i_max - self.i_pos + 1 +} + +///| +fn eoi(self : Decoder) -> Unit { + self.i = @buffer.new() + self.i_pos = 0 + self.i_max = @int.min_value +} + 
+///| +fn refill(self : Decoder, k : Cont) -> Decode { + // only Buffer + self.eoi() + k(self) +} + +///| +fn t_need(self : Decoder, need : Int) -> Unit { + self.t_len = 0 + self.t_need = need +} + +///| +fn t_fill(k : Cont, decoder : Decoder) -> Decode { + fn blit(decoder : Decoder, l : Int) -> Unit { + decoder.i.blit(decoder.i_pos, decoder.t, decoder.t_len, l) + decoder.i_pos = decoder.i_pos + l // consume the l bytes just copied + decoder.t_len = decoder.t_len + l // and record them as buffered in t + } + + let rem = decoder.i_rem() + if rem < 0 { // eoi + k(decoder) + } else { + let need = decoder.t_need - decoder.t_len + if rem < need { + blit(decoder, rem) + decoder.refill(@tuple.curry(t_fill)(k)) + } else { + blit(decoder, need) + k(decoder) + } + } +} + +// UTF8 + +///| +fn decode_utf_8(self : Decoder) -> Decode { + let rem = self.i_rem() + match rem.compare(0) { + // rem < 0 + -1 => Decode::End + // rem = 0 + 0 => self.refill(decode_utf_8) + // rem > 0 + 1 => { + let idx = self.i[self.i_pos].to_int() + let need = utf_8_len[idx] + if rem < need { + self.t_need(need) + t_fill(t_decode_utf_8, self) + } else { + let j = self.i_pos + if need == 0 { + self.i_pos = self.i_pos + 1 + self.ret(decode_utf_8, malformed(self.i, j, 1)) + } else { + self.i_pos = self.i_pos + need + self.ret(decode_utf_8, r_utf_8(self.i, j, need)) + } + } + } + _ => abort("unreachable") + } +} + +///| +fn t_decode_utf_8(self : Decoder) -> Decode { + if self.t_len < self.t_need { + malformed(self.t, 0, self.t_len) + } else { + r_utf_8(self.t, 0, self.t_len) + } +} + +///| +fn r_utf_8(buf : @buffer.T, offset : Int, length : Int) -> Decode { + fn uchar(c : Int) { + Uchar(Char::from_int(c)) + } + + match length { + 1 => uchar(buf[offset].to_int()) + 2 => { + let b0 = buf[offset].to_int() + let b1 = buf[offset + 1].to_int() + if (b1 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + uchar(((b0 & 0x1F) << 6) | (b1 & 0x3F)) + } + } + 3 => { + let b0 = buf[offset].to_int() + let b1 = buf[offset + 1].to_int() + let b2 = buf[offset + 2].to_int() + let c = 
((b0 & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F)) + if (b2 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + match b0 { + 0xE0 => + if b1 < 0xA0 || 0xBF < b1 { + malformed(buf, offset, length) + } else { + uchar(c) + } + 0xED => + if b1 < 0x80 || 0x9F < b1 { + malformed(buf, offset, length) + } else { + uchar(c) + } + _ => + if (b1 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + uchar(c) + } + } + } + } + 4 => { + let b0 = buf[offset].to_int() + let b1 = buf[offset + 1].to_int() + let b2 = buf[offset + 2].to_int() + let b3 = buf[offset + 3].to_int() + let c = ((b0 & 0x07) << 18) | + ((b1 & 0x3F) << 12) | + ((b2 & 0x3F) << 6) | + (b3 & 0x3F) + if (b3 >> 6) != 0b10 || (b2 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + match b0 { + 0xF0 => + if b1 < 0x90 || 0xBF < b1 { + malformed(buf, offset, length) + } else { + uchar(c) + } + 0xF4 => + if b1 < 0x80 || 0x8F < b1 { + malformed(buf, offset, length) + } else { + uchar(c) + } + _ => + if (b1 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + uchar(c) + } + } + } + } + _ => panic() + } +} + +// UTF16LE + +///| +priv enum UTF16Decode { + Hi(Int) + UTF16Malformed(String) + UTF16Uchar(Char) +} + +///| +fn decode_utf_16le(self : Decoder) -> Decode { + let rem = self.i_rem() + match rem.compare(0) { + // rem < 0 + -1 => Decode::End + // rem = 0 + 0 => self.refill(decode_utf_16le) + // rem > 0 + 1 => + if rem < 2 { + self.t_need(2) + t_fill(t_decode_utf_16le, self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + // mark + self.decode_utf_16le_lo(r_utf_16(self.i, j + 1, j)) + } + _ => abort("unreachable") + } +} + +///| +fn t_decode_utf_16le(self : Decoder) -> Decode { + if self.t_len < self.t_need { + self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len)) + } else { + self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0)) + } +} + +///| +fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode { + match v { + UTF16Uchar(u) => Uchar(u) + 
UTF16Malformed(s) => Malformed(s) + Hi(hi) => { + let rem = self.i_rem() + if rem < 2 { + self.t_need(2) + t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + r_utf_16_lo(hi, self.i, j + 1, j) + } + } + } +} + +///| +fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode { + if decoder.t_len < decoder.t_need { + decoder.ret( + decode_utf_16le, + malformed_pair(false, hi, decoder.t, 0, decoder.t_len), + ) + } else { + decoder.ret(decode_utf_16le, r_utf_16_lo(hi, decoder.t, 1, 0)) + } +} + +///| +fn r_utf_16_lo( + hi : Int, + buf : @buffer.T, + offset0 : Int, + offset1 : Int +) -> Decode { + let b0 = buf[offset0].to_int() + let b1 = buf[offset1].to_int() + let lo = (b0 << 8) | b1 + if lo < 0xDC00 || lo > 0xDFFF { + malformed_pair( + offset0 < offset1, + hi, + buf, + @int.minimum(offset0, offset1), + 2, + ) + } else { + Uchar(Char::from_int((((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000)) // add 0x10000 after combining both 10-bit halves + } +} + +///| +fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { + let b0 = buf[offset0].to_int() + let b1 = buf[offset1].to_int() + let u = (b0 << 8) | b1 + if u < 0xD800 || u > 0xDFFF { + UTF16Uchar(Char::from_int(u)) + } else if u > 0xDBFF { + UTF16Malformed( + buf.to_unchecked_string(offset=@int.minimum(offset0, offset1), length=2), + ) + } else { + Hi(u) + } +} + +// UTF16BE + +///| +fn decode_utf_16be(self : Decoder) -> Decode { + let rem = self.i_rem() + match rem.compare(0) { + // rem < 0 + -1 => Decode::End + // rem = 0 + 0 => self.refill(decode_utf_16be) + // rem > 0 + 1 => + if rem < 2 { + self.t_need(2) + t_fill(t_decode_utf_16be, self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + self.decode_utf_16be_lo(r_utf_16(self.i, j, j + 1)) + } + _ => abort("unreachable") + } +} + +///| +fn t_decode_utf_16be(self : Decoder) -> Decode { + if self.t_len < self.t_need { + self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len)) + } else { + 
self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1)) + } +} + +///| +fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode { + match decode { + UTF16Uchar(x) => self.ret(decode_utf_16be, Uchar(x)) + UTF16Malformed(x) => self.ret(decode_utf_16be, Malformed(x)) + Hi(hi) => { + let rem = self.i_rem() + if rem < 2 { + self.t_need(2) + t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + self.ret(decode_utf_16be, r_utf_16_lo(hi, self.i, j, j + 1)) + } + } + } +} + +///| +fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode { + if self.t_len < self.t_need { + self.ret(decode_utf_16be, malformed_pair(true, hi, self.t, 0, self.t_len)) + } else { + self.ret(decode_utf_16be, r_utf_16_lo(hi, self.t, 0, 1)) + } +} diff --git a/encoding/decoding_test.mbt b/encoding/decoding_test.mbt new file mode 100644 index 000000000..324cfb985 --- /dev/null +++ b/encoding/decoding_test.mbt @@ -0,0 +1,231 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +test "lossy decoding String (UTF16LE encoded) to String" { + let src = "你好👀" + let buf = @buffer.T::from_bytes(src.to_bytes()) + inspect!( + buf.to_bytes(), + content= + #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc" + , + ) + let stream = @encoding.decode_lossy(UTF16LE, buf) + inspect!(String::from_iter(stream.iter()), content=src) +} + +test "lossy decoding UTF16LE encoded data to String" { + let buf = @buffer.T::new(size_hint=10) + buf.write_bytes(b"\x60\x4f") + buf.write_bytes(b"\x7d\x59") + buf.write_bytes(b"\x3d\xd8\x40\xdc") + inspect!( + buf.to_bytes(), + content= + #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc" + , + ) + let stream = @encoding.decode_lossy(UTF16LE, buf) + inspect!(String::from_iter(stream.iter()), content="你好👀") +} + +test "lossy decoding UTF16BE encoded data to String" { + let buf = @buffer.T::new(size_hint=10) + buf.write_bytes(b"\xd8\x3d\xdc\x08") + buf.write_bytes(b"\xd8\x3d\xdc\x31") + buf.write_bytes(b"\xd8\x3d\xdc\x07") + buf.write_bytes(b"\xd8\x3d\xdc\x30") + inspect!( + buf.to_bytes(), + content= + #|b"\xd8\x3d\xdc\x08\xd8\x3d\xdc\x31\xd8\x3d\xdc\x07\xd8\x3d\xdc\x30" + , + ) + let stream = @encoding.decode_lossy(UTF16BE, buf) + inspect!(String::from_iter(stream.iter()), content="🐈🐱🐇🐰") +} + +test "lossy decoding UTF16 (alias of UTF16LE) encoded data to String" { + let buf = @buffer.T::new(size_hint=20) + buf.write_bytes(b"\x65\x18") + buf.write_bytes(b"\x20\x18") + buf.write_bytes(b"\x73\x18") + buf.write_bytes(b"\x64\x18") + buf.write_bytes(b"\x73\x18") + buf.write_bytes(b"\x36\x18") + buf.write_bytes(b"\x20\x18") + inspect!( + buf.to_bytes(), + content= + #|b"\x65\x18\x20\x18\x73\x18\x64\x18\x73\x18\x36\x18\x20\x18" + , + ) + let stream = @encoding.decode_lossy(UTF16, buf) + inspect!(String::from_iter(stream.iter()), content="ᡥᠠᡳᡤᡳᠶᠠ") +} + +test "lossy decoding UTF8 encoded data to String" { + let buf = @buffer.T::new(size_hint=10) + buf.write_bytes(b"\xe4\xbd\xa0") + buf.write_bytes(b"\xe5\xa5\xbd") + buf.write_bytes(b"\xf0\x9f\x91\x80") + 
inspect!( + buf.to_bytes(), + content= + #|b"\xe4\xbd\xa0\xe5\xa5\xbd\xf0\x9f\x91\x80" + , + ) + let stream = @encoding.decode_lossy(UTF8, buf) + inspect!(String::from_iter(stream.iter()), content="你好👀") +} + +test "lossy decoding String (UTF16LE encoded) to String" { + let src = "👋再见" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\x3d\xd8\x4b\xdc\x8d\x51\xc1\x89" + , + ) + let stream = @encoding.decode_lossy(UTF16LE, buf) + inspect!(String::from_iter(stream.iter()), content=src) +} + +test "lossy decoding UTF8 encoded data to String" { + let src = "👋再见" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81" + , + ) + let stream = @encoding.decode_lossy(UTF8, buf) + inspect!(String::from_iter(stream.iter()), content=src) +} + +test "lossy decoding UTF8 encoded data" { + let src = "👋再见" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81" + , + ) + let stream = @encoding.decode_lossy(UTF8, buf) + inspect!(stream.iter().collect(), content="['👋', '再', '见']") +} + +test "lossy decoding UTF8 encoded data with UTF16LE" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" + , + ) + let stream = @encoding.decode_lossy(UTF8, buf) + inspect!( + stream.iter().collect(), + content="['э', 'e', 'k', '<', '�', '�', 'n', '�', '�']", + ) +} + +test "lossy decoding UTF16LE encoded data with UTF8" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + 
#|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" + , + ) + let stream = @encoding.decode_lossy(UTF16LE, buf) + inspect!( + stream.iter().collect(), + content="['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏']", + ) +} + +test "strictly decoding UTF8 encoded data with UTF16LE" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" + , + ) + let stream = @encoding.decode_strict(UTF8, buf) + inspect!(stream.iter().try_collect?(), content="Err(쏘)") +} + +test "strictly decoding UTF16LE encoded data with UTF8" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" + , + ) + let stream = @encoding.decode_strict(UTF16LE, buf) + inspect!( + stream.iter().try_collect?(), + content="Ok(['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏'])", + ) +} + +test "strictly decoding UTF16BE encoded data with UTF8" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" + , + ) + let stream = @encoding.decode_strict(UTF16BE, buf) + inspect!( + stream.iter().try_collect?(), + content="Ok(['', '釦', '궥', '', '较', '', '룦', '뎳', '', '辊'])", + ) +} diff --git a/encoding/encoding.mbt b/encoding/encoding.mbt new file mode 100644 index 000000000..a6afe84e4 --- /dev/null +++ b/encoding/encoding.mbt @@ -0,0 +1,35 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +pub fn encode(encoding : Encoding, src : String) -> Bytes { + // NOTE: special case: MoonBit String are already valid UTF16(LE) bytes + match encoding { + UTF16 | UTF16LE => return src.to_bytes() + _ => () + } + let buf = @buffer.T::from_bytes(src.to_bytes()) + let chars = decode_strict(UTF16LE, buf) + let new_buf = @buffer.T::new(size_hint=buf.length()) + let write = match encoding { + UTF8 => @buffer.write_utf8_char + UTF16BE => @buffer.write_utf16be_char + _ => abort("unreachable") + } + for char in chars { + // SAFETY: Assume String are always valid UTF16LE + write(new_buf, char.unwrap()) + } + new_buf.to_bytes() +} diff --git a/encoding/encoding.mbti b/encoding/encoding.mbti new file mode 100644 index 000000000..a28205bef --- /dev/null +++ b/encoding/encoding.mbti @@ -0,0 +1,36 @@ +package moonbitlang/core/encoding + +alias @moonbitlang/core/buffer as @buffer + +// Values +fn decode_lossy(Encoding, @buffer.T) -> LossyChars + +fn decode_strict(Encoding, @buffer.T) -> StrictChars + +fn encode(Encoding, String) -> Bytes + +// Types and methods +type DecodeError +impl Show for DecodeError + +pub(all) enum Encoding { + UTF8 + UTF16 + UTF16LE + UTF16BE +} + +type LossyChars +impl LossyChars { + iter(Self) -> Iter[Char] +} + +type StrictChars +impl StrictChars { + iter(Self) -> Iter[Result[Char, DecodeError]] +} + +// Type aliases + +// Traits + diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt new file mode 100644 index 000000000..2ff086c9b --- /dev/null +++ b/encoding/encoding_test.mbt @@ -0,0 +1,57 @@ +// Copyright 2024 
International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +test "encoding String to UTF8" { + let src = "你好👀" + let bytes = @encoding.encode(UTF8, src) + inspect!( + bytes, + content= + #|b"\xe4\xbd\xa0\xe5\xa5\xbd\xf0\x9f\x91\x80" + , + ) +} + +test "encoding String to UTF16 (alias of UTF16LE)" { + let src = "LISP programmers know the value of everything" + let bytes = @encoding.encode(UTF16, src) + inspect!( + bytes, + content= + #|b"\x4c\x00\x49\x00\x53\x00\x50\x00\x20\x00\x70\x00\x72\x00\x6f\x00\x67\x00\x72\x00\x61\x00\x6d\x00\x6d\x00\x65\x00\x72\x00\x73\x00\x20\x00\x6b\x00\x6e\x00\x6f\x00\x77\x00\x20\x00\x74\x00\x68\x00\x65\x00\x20\x00\x76\x00\x61\x00\x6c\x00\x75\x00\x65\x00\x20\x00\x6f\x00\x66\x00\x20\x00\x65\x00\x76\x00\x65\x00\x72\x00\x79\x00\x74\x00\x68\x00\x69\x00\x6e\x00\x67\x00" + , + ) +} + +test "encoding String to UTF16LE" { + let src = "and the cost of nothing" + let bytes = @encoding.encode(UTF16LE, src) + inspect!( + bytes, + content= + #|b"\x61\x00\x6e\x00\x64\x00\x20\x00\x74\x00\x68\x00\x65\x00\x20\x00\x63\x00\x6f\x00\x73\x00\x74\x00\x20\x00\x6f\x00\x66\x00\x20\x00\x6e\x00\x6f\x00\x74\x00\x68\x00\x69\x00\x6e\x00\x67\x00" + , + ) +} + +test "encoding String to UTF16BE" { + let src = "λf.(λx.f(x x))(λx.f(x x))" + let bytes = @encoding.encode(UTF16BE, src) + inspect!( + bytes, + content= + 
#|b"\x03\xbb\x00\x66\x00\x2e\x00\x28\x03\xbb\x00\x78\x00\x2e\x00\x66\x00\x28\x00\x78\x00\x20\x00\x78\x00\x29\x00\x29\x00\x28\x03\xbb\x00\x78\x00\x2e\x00\x66\x00\x28\x00\x78\x00\x20\x00\x78\x00\x29\x00\x29" + , + ) +} diff --git a/encoding/moon.pkg.json b/encoding/moon.pkg.json new file mode 100644 index 000000000..58dd8726e --- /dev/null +++ b/encoding/moon.pkg.json @@ -0,0 +1,16 @@ +{ + "import": [ + "moonbitlang/core/array", + "moonbitlang/core/buffer", + "moonbitlang/core/builtin", + "moonbitlang/core/bytes", + "moonbitlang/core/char", + "moonbitlang/core/coverage", + "moonbitlang/core/int", + "moonbitlang/core/result", + "moonbitlang/core/string", + "moonbitlang/core/tuple" + ], + "test-import": [ + ] +} diff --git a/encoding/types.mbt b/encoding/types.mbt new file mode 100644 index 000000000..84164f2a2 --- /dev/null +++ b/encoding/types.mbt @@ -0,0 +1,133 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +///| +typealias Cont = (Decoder) -> Decode + +///| +pub(all) enum Encoding { + UTF8 + UTF16 // alias of UTF16LE + UTF16LE + UTF16BE +} + +// Decoder + +///| +priv struct Decoder { + mut i : @buffer.T + mut i_pos : Int + mut i_max : Int + t : @buffer.T + mut t_len : Int + mut t_need : Int + mut k : Cont +} + +///| +priv enum Decode { + End + Malformed(String) + Uchar(Char) +} + +///| +fn malformed(buf : @buffer.T, offset : Int, length : Int) -> Decode { + Malformed(buf.to_unchecked_string(offset~, length~)) +} + +///| +fn malformed_pair( + be : Bool, + hi : Int, + buf : @buffer.T, + offset : Int, + length : Int +) -> Decode { + let bs1 = buf.to_unchecked_string(offset~, length~).to_bytes() + let bs0 = @buffer.from_bytes(b"\x00\x00") // length 2, so the indexed stores below pass op_set's bounds guard + let (j0, j1) = if be { (0, 1) } else { (1, 0) } + bs0[j0] = (hi >> 8).to_byte() + bs0[j1] = hi.land(0xFF).to_byte() + let arr = bs0.to_array() + arr.append(bs1.to_array()) + let bs = @buffer.from_array(arr) + Malformed(bs.to_unchecked_string(offset=0, length=bs.length())) +} + +// Chars + +///| +type LossyChars Decoder + +///| +pub fn iter(self : LossyChars) -> Iter[Char] { + Iter::new( + fn(yield_) { + loop self._.decode() { + Uchar(u) => { + if yield_(u) == IterEnd { + break IterEnd + } + continue self._.decode() + } + Malformed(_) => { + if yield_(U_REP) == IterEnd { + break IterEnd + } + continue self._.decode() + } + End => break IterEnd + } + }, + ) +} + +///| +type StrictChars Decoder + +///| +type! 
DecodeError String + +///| +pub impl Show for DecodeError with output(self, logger) { + match self { + DecodeError(err) => logger.write_string(err) + } +} + +///| +pub fn iter(self : StrictChars) -> Iter[Result[Char, DecodeError]] { + Iter::new( + fn(yield_) { + loop self._.decode() { + Uchar(u) => { + if yield_(Ok(u)) == IterEnd { + break IterEnd + } + continue self._.decode() + } + Malformed(s) => { + let err = DecodeError(s) + if yield_(Err(err)) == IterEnd { + break IterEnd + } + continue self._.decode() + } + End => break IterEnd + } + }, + ) +} diff --git a/int/int.mbt b/int/int.mbt index 54d320a8f..753d0bb6b 100644 --- a/int/int.mbt +++ b/int/int.mbt @@ -41,3 +41,12 @@ pub fn abs(self : Int) -> Int { self } } + +///| +pub fn minimum(self : Int, x : Int) -> Int { + if self > x { + x + } else { + self + } +} diff --git a/int/int.mbti b/int/int.mbti index e7694923f..cb1c6f6d4 100644 --- a/int/int.mbti +++ b/int/int.mbti @@ -10,6 +10,7 @@ let min_value : Int impl Int { abs(Int) -> Int + minimum(Int, Int) -> Int } // Type aliases diff --git a/string/string.mbt b/string/string.mbt index 2328d864c..d436ab83a 100644 --- a/string/string.mbt +++ b/string/string.mbt @@ -31,6 +31,12 @@ pub fn String::from_array(chars : Array[Char]) -> String { buf.to_string() } +///| +pub fn String::from_iter(iter : Iter[Char]) -> String { + let chars = iter.collect() + String::from_array(chars) +} + ///| /// Concatenate strings. /// diff --git a/string/string.mbti b/string/string.mbti index f21058e69..e7b8b35f0 100644 --- a/string/string.mbti +++ b/string/string.mbti @@ -14,6 +14,7 @@ impl String { ends_with(String, String) -> Bool fold[A](String, init~ : A, (A, Char) -> A) -> A from_array(Array[Char]) -> String + from_iter(Iter[Char]) -> String index_of(String, String, from~ : Int = ..) -> Int is_blank(String) -> Bool is_empty(String) -> Bool