fix: Correctly cast from UTF16 positions

Fixes #302
openlawlibrary · Jul 28, 2023 · d5a1212 · d5a1212
1 parent 6139c71
commit d5a1212
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning][semver].
 
 ### Changed
 ### Fixed
+[#304]: https://github.com/openlawlibrary/pygls/issues/304
 
 - `pygls` no longer overrides the event loop for the current thread when given an explicit loop to use. ([#334])
 - Fixed `MethodTypeNotRegisteredError` when registering a `TEXT_DOCUMENT_DID_SAVE` feature with options. ([#338])

diff --git a/pygls/workspace.py b/pygls/workspace.py
@@ -37,13 +37,17 @@
 log = logging.getLogger(__name__)
 
 
+def is_char_beyond_multilingual_plane(char: str) -> bool:
+    return ord(char) > 0xFFFF
+
+
 def utf16_unit_offset(chars: str):
     """Calculate the number of characters which need two utf-16 code units.
 
     Arguments:
         chars (str): The string to count occurrences of utf-16 code units for.
     """
-    return sum(ord(ch) > 0xFFFF for ch in chars)
+    return sum(is_char_beyond_multilingual_plane(ch) for ch in chars)
 
 
 def utf16_num_units(chars: str):
@@ -59,7 +63,7 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
     """Convert the position.character from utf-16 code units to utf-32.
 
     A python application can't use the character member of `Position`
-    directly as per specification it is represented as a zero-based line and
+    directly. As per specification it is represented as a zero-based line and
     character offset based on a UTF-16 string representation.
 
     All characters whose code point exceeds the Basic Multilingual Plane are
@@ -80,14 +84,44 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
     Returns:
         The position with `character` being converted to utf-32 code units.
     """
-    try:
-        return Position(
-            line=position.line,
-            character=position.character
-            - utf16_unit_offset(lines[position.line][:position.character])
+    if len(lines) == 0:
+        return Position(0, 0)
+    if position.line >= len(lines):
+        return Position(len(lines) - 1, utf16_num_units(lines[-1]))
+
+    _line = lines[position.line]
+    _line = _line.replace('\r\n', '\n')  # TODO: it's a bit of a hack
+    _utf16_len = utf16_num_units(_line)
+    _utf32_len = len(_line)
+
+    if _utf16_len == 0:
+        return Position(position.line, 0)
+
+    _utf16_end_of_line = utf16_num_units(_line)
+    if position.character > _utf16_end_of_line:
+        position.character = _utf16_end_of_line - 1
+
+    _utf16_index = 0
+    utf32_index = 0
+    while True:
+        _is_searching_queried_position = _utf16_index < position.character
+        _is_before_end_of_line = utf32_index < _utf32_len
+        _is_searching_for_position = (
+            _is_searching_queried_position and _is_before_end_of_line
         )
-    except IndexError:
-        return Position(line=len(lines), character=0)
+        if not _is_searching_for_position:
+            break
+
+        _current_char = _line[utf32_index]
+        _is_double_width = is_char_beyond_multilingual_plane(_current_char)
+        if _is_double_width:
+            _utf16_index += 2
+        else:
+            _utf16_index += 1
+        utf32_index += 1
+
+    position = Position(line=position.line, character=utf32_index)
+    return position
 
 
 def position_to_utf16(lines: List[str], position: Position) -> Position:
@@ -137,10 +171,11 @@ def range_from_utf16(lines: List[str], range: Range) -> Range:
     Returns:
         The range with `character` offsets being converted to utf-16 code units.
     """
-    return Range(
+    range_new = Range(
         start=position_from_utf16(lines, range.start),
         end=position_from_utf16(lines, range.end)
     )
+    return range_new
 
 
 def range_to_utf16(lines: List[str], range: Range) -> Range:
@@ -280,7 +315,7 @@ def offset_at_position(self, position: Position) -> int:
         lines = self.lines
         pos = position_from_utf16(lines, position)
         row, col = pos.line, pos.character
-        return col + sum(len(line) for line in lines[:row])
+        return col + sum(utf16_num_units(line) for line in lines[:row])
 
     @property
     def source(self) -> str:

diff --git a/tests/test_document.py b/tests/test_document.py
@@ -220,10 +220,11 @@ def test_range_from_utf16():
     range = Range(
         start=Position(line=0, character=3), end=Position(line=0, character=5)
     )
-    range_from_utf16(['x="😋"'], range)
-    assert range == Range(
-        start=Position(line=0, character=3), end=Position(line=0, character=5)
+    actual = range_from_utf16(['x="😋😋"'], range)
+    expected = Range(
+        start=Position(line=0, character=3), end=Position(line=0, character=4)
     )
+    assert actual == expected
 
 
 def test_range_to_utf16():
@@ -239,23 +240,40 @@ def test_range_to_utf16():
     range = Range(
         start=Position(line=0, character=3), end=Position(line=0, character=4)
     )
-    range_to_utf16(['x="😋"'], range)
-    assert range == Range(
-        start=Position(line=0, character=3), end=Position(line=0, character=4)
+    actual = range_to_utf16(['x="😋😋"'], range)
+    expected = Range(
+        start=Position(line=0, character=3), end=Position(line=0, character=5)
     )
+    assert actual == expected
 
 
 def test_offset_at_position(doc):
     assert doc.offset_at_position(Position(line=0, character=8)) == 8
-    assert doc.offset_at_position(Position(line=1, character=5)) == 14
+    assert doc.offset_at_position(Position(line=1, character=5)) == 12
     assert doc.offset_at_position(Position(line=2, character=0)) == 13
     assert doc.offset_at_position(Position(line=2, character=4)) == 17
     assert doc.offset_at_position(Position(line=3, character=6)) == 27
-    assert doc.offset_at_position(Position(line=3, character=7)) == 27
+    assert doc.offset_at_position(Position(line=3, character=7)) == 28
     assert doc.offset_at_position(Position(line=3, character=8)) == 28
-    assert doc.offset_at_position(Position(line=4, character=0)) == 39
-    assert doc.offset_at_position(Position(line=5, character=0)) == 39
-
+    assert doc.offset_at_position(Position(line=4, character=0)) == 40
+    assert doc.offset_at_position(Position(line=5, character=0)) == 40
+
+def test_utf16_to_utf32_position_cast(doc):
+    lines = ['', '😋😋', '']
+    assert position_from_utf16(lines, Position(line=0, character=0)) == Position(line=0, character=0)
+    assert position_from_utf16(lines, Position(line=0, character=1)) == Position(line=0, character=0)
+    assert position_from_utf16(lines, Position(line=1, character=0)) == Position(line=1, character=0)
+    assert position_from_utf16(lines, Position(line=1, character=2)) == Position(line=1, character=1)
+    assert position_from_utf16(lines, Position(line=1, character=3)) == Position(line=1, character=2)
+    assert position_from_utf16(lines, Position(line=1, character=4)) == Position(line=1, character=2)
+    assert position_from_utf16(lines, Position(line=1, character=100)) == Position(line=1, character=2)
+    assert position_from_utf16(lines, Position(line=3, character=0)) == Position(line=2, character=0)
+    assert position_from_utf16(lines, Position(line=4, character=10)) == Position(line=2, character=0)
+
+def test_position_for_line_endings(doc):
+    lines = ['x\r\n', 'y\n']
+    assert position_from_utf16(lines, Position(line=0, character=10)) == Position(line=0, character=1)
+    assert position_from_utf16(lines, Position(line=1, character=10)) == Position(line=1, character=1)
 
 def test_word_at_position(doc):
     """