diff --git a/CHANGELOG.md b/CHANGELOG.md index adf005f8..7ab08eb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning][semver]. ### Changed ### Fixed +[#304]: https://github.com/openlawlibrary/pygls/issues/304 - `pygls` no longer overrides the event loop for the current thread when given an explicit loop to use. ([#334]) - Fixed `MethodTypeNotRegisteredError` when registering a `TEXT_DOCUMENT_DID_SAVE` feature with options. ([#338]) diff --git a/pygls/workspace.py b/pygls/workspace.py index bfd5aa5a..dad73438 100644 --- a/pygls/workspace.py +++ b/pygls/workspace.py @@ -37,13 +37,17 @@ log = logging.getLogger(__name__) +def is_char_beyond_multilingual_plane(char: str) -> bool: + return ord(char) > 0xFFFF + + def utf16_unit_offset(chars: str): """Calculate the number of characters which need two utf-16 code units. Arguments: chars (str): The string to count occurrences of utf-16 code units for. """ - return sum(ord(ch) > 0xFFFF for ch in chars) + return sum(is_char_beyond_multilingual_plane(ch) for ch in chars) def utf16_num_units(chars: str): @@ -59,7 +63,7 @@ def position_from_utf16(lines: List[str], position: Position) -> Position: """Convert the position.character from utf-16 code units to utf-32. A python application can't use the character member of `Position` - directly as per specification it is represented as a zero-based line and + directly. As per specification it is represented as a zero-based line and character offset based on a UTF-16 string representation. All characters whose code point exceeds the Basic Multilingual Plane are @@ -80,14 +84,44 @@ def position_from_utf16(lines: List[str], position: Position) -> Position: Returns: The position with `character` being converted to utf-32 code units. """ - try: - return Position( - line=position.line, - character=position.character - - utf16_unit_offset(lines[position.line][:position.character]) + if len(lines) == 0: + return Position(0, 0) + if position.line >= len(lines): + return Position(len(lines) - 1, utf16_num_units(lines[-1])) + + _line = lines[position.line] + _line = _line.replace('\r\n', '\n') # TODO: it's a bit of a hack + _utf16_len = utf16_num_units(_line) + _utf32_len = len(_line) + + if _utf16_len == 0: + return Position(position.line, 0) + + _utf16_end_of_line = utf16_num_units(_line) + if position.character > _utf16_end_of_line: + position.character = _utf16_end_of_line - 1 + + _utf16_index = 0 + utf32_index = 0 + while True: + _is_searching_queried_position = _utf16_index < position.character + _is_before_end_of_line = utf32_index < _utf32_len + _is_searching_for_position = ( + _is_searching_queried_position and _is_before_end_of_line ) - except IndexError: - return Position(line=len(lines), character=0) + if not _is_searching_for_position: + break + + _current_char = _line[utf32_index] + _is_double_width = is_char_beyond_multilingual_plane(_current_char) + if _is_double_width: + _utf16_index += 2 + else: + _utf16_index += 1 + utf32_index += 1 + + position = Position(line=position.line, character=utf32_index) + return position def position_to_utf16(lines: List[str], position: Position) -> Position: @@ -137,10 +171,11 @@ def range_from_utf16(lines: List[str], range: Range) -> Range: Returns: The range with `character` offsets being converted to utf-16 code units. """ - return Range( + range_new = Range( start=position_from_utf16(lines, range.start), end=position_from_utf16(lines, range.end) ) + return range_new def range_to_utf16(lines: List[str], range: Range) -> Range: @@ -280,7 +315,7 @@ def offset_at_position(self, position: Position) -> int: lines = self.lines pos = position_from_utf16(lines, position) row, col = pos.line, pos.character - return col + sum(len(line) for line in lines[:row]) + return col + sum(utf16_num_units(line) for line in lines[:row]) @property def source(self) -> str: diff --git a/tests/test_document.py b/tests/test_document.py index 5c3c36b8..8b21ac26 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -220,10 +220,11 @@ def test_range_from_utf16(): range = Range( start=Position(line=0, character=3), end=Position(line=0, character=5) ) - range_from_utf16(['x="😋"'], range) - assert range == Range( - start=Position(line=0, character=3), end=Position(line=0, character=5) + actual = range_from_utf16(['x="😋😋"'], range) + expected = Range( + start=Position(line=0, character=3), end=Position(line=0, character=4) ) + assert actual == expected def test_range_to_utf16(): @@ -239,23 +240,40 @@ def test_range_to_utf16(): range = Range( start=Position(line=0, character=3), end=Position(line=0, character=4) ) - range_to_utf16(['x="😋"'], range) - assert range == Range( - start=Position(line=0, character=3), end=Position(line=0, character=4) + actual = range_to_utf16(['x="😋😋"'], range) + expected = Range( + start=Position(line=0, character=3), end=Position(line=0, character=5) ) + assert actual == expected def test_offset_at_position(doc): assert doc.offset_at_position(Position(line=0, character=8)) == 8 - assert doc.offset_at_position(Position(line=1, character=5)) == 14 + assert doc.offset_at_position(Position(line=1, character=5)) == 12 assert doc.offset_at_position(Position(line=2, character=0)) == 13 assert doc.offset_at_position(Position(line=2, character=4)) == 17 assert doc.offset_at_position(Position(line=3, character=6)) == 27 - assert doc.offset_at_position(Position(line=3, character=7)) == 27 + assert doc.offset_at_position(Position(line=3, character=7)) == 28 assert doc.offset_at_position(Position(line=3, character=8)) == 28 - assert doc.offset_at_position(Position(line=4, character=0)) == 39 - assert doc.offset_at_position(Position(line=5, character=0)) == 39 - + assert doc.offset_at_position(Position(line=4, character=0)) == 40 + assert doc.offset_at_position(Position(line=5, character=0)) == 40 + +def test_utf16_to_utf32_position_cast(doc): + lines = ['', '😋😋', ''] + assert position_from_utf16(lines, Position(line=0, character=0)) == Position(line=0, character=0) + assert position_from_utf16(lines, Position(line=0, character=1)) == Position(line=0, character=0) + assert position_from_utf16(lines, Position(line=1, character=0)) == Position(line=1, character=0) + assert position_from_utf16(lines, Position(line=1, character=2)) == Position(line=1, character=1) + assert position_from_utf16(lines, Position(line=1, character=3)) == Position(line=1, character=2) + assert position_from_utf16(lines, Position(line=1, character=4)) == Position(line=1, character=2) + assert position_from_utf16(lines, Position(line=1, character=100)) == Position(line=1, character=2) + assert position_from_utf16(lines, Position(line=3, character=0)) == Position(line=2, character=0) + assert position_from_utf16(lines, Position(line=4, character=10)) == Position(line=2, character=0) + +def test_position_for_line_endings(doc): + lines = ['x\r\n', 'y\n'] + assert position_from_utf16(lines, Position(line=0, character=10)) == Position(line=0, character=1) + assert position_from_utf16(lines, Position(line=1, character=10)) == Position(line=1, character=1) def test_word_at_position(doc): """