From 524f5c4d0fb1e85132eedf39ebb4b1cf31ac8d35 Mon Sep 17 00:00:00 2001 From: Gordy French Date: Fri, 4 Oct 2024 10:47:28 -0700 Subject: [PATCH] Unicode computed location fix Reviewed By: captbaritone Differential Revision: D63879176 fbshipit-source-id: 9093a8916524dad92cab7a4d91bf6917218ee95c --- compiler/crates/common/src/text_source.rs | 23 +++++++++++++++++----- compiler/crates/docblock-syntax/src/lib.rs | 6 +++--- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/compiler/crates/common/src/text_source.rs b/compiler/crates/common/src/text_source.rs index 832c9ef55c878..50078a72a781d 100644 --- a/compiler/crates/common/src/text_source.rs +++ b/compiler/crates/common/src/text_source.rs @@ -74,6 +74,7 @@ impl TextSource { /** * Converts span, which is the relative indices of characters within this text source, * into the equivalent line and character number range. + * Span is bytes, not characters. */ pub fn to_span_range(&self, span: Span) -> lsp_types::Range { let start = span.start as usize; @@ -84,13 +85,15 @@ impl TextSource { let mut character = self.column_index; let mut start_position = lsp_types::Position::default(); let mut end_position = lsp_types::Position::default(); - let mut chars = self.text.chars().enumerate().peekable(); + let mut chars = self.text.chars().peekable(); + + let mut bytes_seen = 0; - while let Some((index, chr)) = chars.next() { - if index == start { + while let Some(chr) = chars.next() { + if bytes_seen == start { start_position = lsp_types::Position::new(line as u32, character as u32); } - if index == end { + if bytes_seen == end { end_position = lsp_types::Position::new(line as u32, character as u32); break; } @@ -99,7 +102,7 @@ impl TextSource { // Line terminators: https://www.ecma-international.org/ecma-262/#sec-line-terminators '\u{000A}' | '\u{000D}' | '\u{2028}' | '\u{2029}' => { // - !matches!((chr, chars.peek()), ('\u{000D}', Some((_, '\u{000D}')))) + !matches!((chr, chars.peek()), ('\u{000D}', Some('\u{000D}'))) } _ => false, }; @@ -112,6 +115,7 @@ impl TextSource { } else { character += 1; } + bytes_seen += chr.len_utf8(); } if start_position != lsp_types::Position::default() @@ -138,6 +142,15 @@ mod test { assert_eq!(range.end, lsp_types::Position::new(0, 5)); } + #[test] + fn to_range_unicode_test() { + let span = Span::new(0, 5); + let text_source = TextSource::new("☃ource", 0, 0); + let range = text_source.to_span_range(span); + assert_eq!(range.start, lsp_types::Position::new(0, 0)); + assert_eq!(range.end, lsp_types::Position::new(0, 3)); + } + #[test] fn to_range_multi_line_test() { // this range contains all characters of `fn foo ...` diff --git a/compiler/crates/docblock-syntax/src/lib.rs b/compiler/crates/docblock-syntax/src/lib.rs index b64e6da1641f1..146209356c6c8 100644 --- a/compiler/crates/docblock-syntax/src/lib.rs +++ b/compiler/crates/docblock-syntax/src/lib.rs @@ -85,7 +85,7 @@ pub fn parse_docblock( * strings with quotation marks. * * To account for this, we parse in a single pass, essentially treating each - * character as a token. This allows us to easily intemperate characters + * character as a token. This allows us to easily interpret characters * differently in different contexts. */ struct DocblockParser<'a> { @@ -283,7 +283,7 @@ impl<'a> DocblockParser<'a> { fn next(&mut self) { self.chars.next(); - self.offset += 1; + self.offset += 1; // Is this correct for unicode characters? } /// Advance over a string of characters matching predicate. @@ -307,7 +307,7 @@ impl<'a> DocblockParser<'a> { break; } } - self.offset += result.len() as u32; + self.offset += result.len() as u32; // result.len() returns byte length result }