Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Add URI encoding and decoding functions #2267

Closed
wants to merge 9 commits into from
Closed
6 changes: 4 additions & 2 deletions boa_engine/src/builtins/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub mod string;
pub mod symbol;
pub mod typed_array;
pub mod undefined;
pub mod uri;

#[cfg(feature = "console")]
pub mod console;
Expand Down Expand Up @@ -81,7 +82,7 @@ use crate::{
builtins::{
array_buffer::ArrayBuffer, async_generator::AsyncGenerator,
async_generator_function::AsyncGeneratorFunction, generator::Generator,
generator_function::GeneratorFunction, typed_array::TypedArray,
generator_function::GeneratorFunction, typed_array::TypedArray, uri::Uri,
},
property::{Attribute, PropertyDescriptor},
Context, JsValue,
Expand Down Expand Up @@ -193,7 +194,8 @@ pub fn init(context: &mut Context) {
Promise,
AsyncFunction,
AsyncGenerator,
AsyncGeneratorFunction
AsyncGeneratorFunction,
Uri
};

#[cfg(feature = "intl")]
Expand Down
5 changes: 3 additions & 2 deletions boa_engine/src/builtins/regexp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1745,7 +1745,8 @@ fn advance_string_index(s: &JsString, index: u64, unicode: bool) -> u64 {
}

// 5. Let cp be ! CodePointAt(S, index).
let (_, offset, _) = crate::builtins::string::code_point_at(s, index);
let cp = crate::builtins::string::code_point_at(s, index);

index + u64::from(offset)
// 6. Return index + cp.[[CodeUnitCount]].
index + u64::from(cp.code_unit_count)
}
36 changes: 30 additions & 6 deletions boa_engine/src/builtins/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,29 +40,53 @@ pub(crate) enum Placement {
End,
}

pub(crate) fn code_point_at(string: &JsString, position: u64) -> (u32, u8, bool) {
/// Code point information for the `code_point_at()` function.
///
/// Groups the three values that `code_point_at()` reports for a position in a string.
#[derive(Debug, Clone, Copy)]
pub(crate) struct CodePointInfo {
/// The code point at the requested position: either the single UTF-16 code unit widened to
/// `u32`, or the value combined from a leading/trailing surrogate pair.
pub(crate) code_point: u32,
/// Number of UTF-16 code units the code point occupies: `2` for a valid surrogate pair,
/// `1` otherwise.
pub(crate) code_unit_count: u8,
/// `true` when the code unit at the position is a surrogate that is not part of a valid
/// leading/trailing pair.
pub(crate) is_unpaired_surrogate: bool,
}

pub(crate) fn code_point_at(string: &JsString, position: u64) -> CodePointInfo {
let mut encoded = string.encode_utf16();
let size = encoded.clone().count() as u64;

let first = encoded
.nth(position as usize)
.expect("The callers of this function must've already checked bounds.");
if !is_leading_surrogate(first) && !is_trailing_surrogate(first) {
return (u32::from(first), 1, false);
return CodePointInfo {
code_point: u32::from(first),
code_unit_count: 1,
is_unpaired_surrogate: false,
};
}

if is_trailing_surrogate(first) || position + 1 == size {
return (u32::from(first), 1, true);
return CodePointInfo {
code_point: u32::from(first),
code_unit_count: 1,
is_unpaired_surrogate: true,
};
}

let second = encoded
.next()
.expect("The callers of this function must've already checked bounds.");
if !is_trailing_surrogate(second) {
return (u32::from(first), 1, true);
return CodePointInfo {
code_point: u32::from(first),
code_unit_count: 1,
is_unpaired_surrogate: true,
};
}
let cp = (u32::from(first) - 0xD800) * 0x400 + (u32::from(second) - 0xDC00) + 0x10000;
(cp, 2, false)
CodePointInfo {
code_point: cp,
code_unit_count: 2,
is_unpaired_surrogate: false,
}
}

/// Helper function to check if a `char` is trimmable.
Expand Down Expand Up @@ -544,7 +568,7 @@ impl String {
IntegerOrInfinity::Integer(position) if (0..size).contains(&position) => {
// 6. Let cp be ! CodePointAt(S, position).
// 7. Return 𝔽(cp.[[CodePoint]]).
Ok(code_point_at(&string, position as u64).0.into())
Ok(code_point_at(&string, position as u64).code_point.into())
}
// 5. If position < 0 or position ≥ size, return undefined.
_ => Ok(JsValue::undefined()),
Expand Down
8 changes: 7 additions & 1 deletion boa_engine/src/builtins/string/string_iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use crate::{
use boa_gc::{Finalize, Trace};
use boa_profiler::Profiler;

use super::CodePointInfo;

#[derive(Debug, Clone, Finalize, Trace)]
pub struct StringIterator {
string: JsValue,
Expand Down Expand Up @@ -61,7 +63,11 @@ impl StringIterator {
context,
));
}
let (_, code_unit_count, _) = code_point_at(&native_string, position as u64);
let CodePointInfo {
code_point: _,
code_unit_count,
is_unpaired_surrogate: _,
} = code_point_at(&native_string, position as u64);
string_iterator.next_index += i32::from(code_unit_count);
let result_string = crate::builtins::string::String::substring(
&string_iterator.string,
Expand Down
203 changes: 203 additions & 0 deletions boa_engine/src/builtins/uri/consts.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/// Every UTF-16 code unit belonging to the `uriUnescaped` grammar production.
///
/// The entries are stored in grammar order: `uriAlpha` (lowercase letters, then uppercase
/// letters), `DecimalDigit`, and finally `uriMark`.
///
/// More information:
///  - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped
pub(super) const URI_UNESCAPED: [u16; 71] = {
    // Every character is ASCII, so each byte widens losslessly into a single UTF-16 code unit.
    // The `[u8; 71]` annotation makes the compiler reject a literal of the wrong length.
    const CHARS: &[u8; 71] =
        b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.!~*'()";

    let mut units = [0_u16; 71];
    let mut idx = 0;
    // `for` loops are not available during const evaluation, hence the manual `while`.
    while idx < CHARS.len() {
        units[idx] = CHARS[idx] as u16;
        idx += 1;
    }
    units
};

/// Constant with all the reserved URI characters, plus the hash symbol (`#`).
Razican marked this conversation as resolved.
Show resolved Hide resolved
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriReserved
pub(super) const URI_RESERVED_HASH: [u16; 11] = {
    // The ten `uriReserved` characters in grammar order, followed by the extra `#`.
    // Every character is ASCII, so each byte widens losslessly into a single UTF-16 code unit;
    // the `[u8; 11]` annotation makes the compiler reject a literal of the wrong length.
    const CHARS: &[u8; 11] = b";/?:@&=+$,#";

    let mut units = [0_u16; 11];
    let mut idx = 0;
    // `for` loops are not available during const evaluation, hence the manual `while`.
    while idx < CHARS.len() {
        units[idx] = CHARS[idx] as u16;
        idx += 1;
    }
    units
};

/// Every UTF-16 code unit of the `uriUnescaped` and `uriReserved` grammar productions, plus the
/// hash symbol (`#`).
///
/// The entries are stored in this order: `uriAlpha` (lowercase letters, then uppercase
/// letters), `DecimalDigit`, `uriMark`, `uriReserved`, and finally `#`.
///
/// More information:
///  - [`uriReserved` in ECMAScript spec][uri_reserved]
///  - [`uriUnescaped` in ECMAScript spec][uri_unescaped]
///
/// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved
/// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped
pub(super) const URI_RESERVED_UNESCAPED_HASH: [u16; 82] = {
    // Every character is ASCII, so each byte widens losslessly into a single UTF-16 code unit.
    // The `[u8; 82]` annotation makes the compiler reject a literal of the wrong length.
    const CHARS: &[u8; 82] =
        b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.!~*'();/?:@&=+$,#";

    let mut units = [0_u16; 82];
    let mut idx = 0;
    // `for` loops are not available during const evaluation, hence the manual `while`.
    while idx < CHARS.len() {
        units[idx] = CHARS[idx] as u16;
        idx += 1;
    }
    units
};
Loading