Skip to content

Commit

Permalink
fix: Correctly cast from UTF16 positions
Browse files Browse the repository at this point in the history
Fixes #302
  • Loading branch information
tombh committed Jul 28, 2023
1 parent 6139c71 commit d5a1212
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 22 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning][semver].

### Changed
### Fixed
[#304]: https://github.com/openlawlibrary/pygls/issues/304

- `pygls` no longer overrides the event loop for the current thread when given an explicit loop to use. ([#334])
- Fixed `MethodTypeNotRegisteredError` when registering a `TEXT_DOCUMENT_DID_SAVE` feature with options. ([#338])
Expand Down
57 changes: 46 additions & 11 deletions pygls/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,17 @@
log = logging.getLogger(__name__)


def is_char_beyond_multilingual_plane(char: str) -> bool:
return ord(char) > 0xFFFF


def utf16_unit_offset(chars: str):
"""Calculate the number of characters which need two utf-16 code units.
Arguments:
chars (str): The string to count occurrences of utf-16 code units for.
"""
return sum(ord(ch) > 0xFFFF for ch in chars)
return sum(is_char_beyond_multilingual_plane(ch) for ch in chars)


def utf16_num_units(chars: str):
Expand All @@ -59,7 +63,7 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
"""Convert the position.character from utf-16 code units to utf-32.
A python application can't use the character member of `Position`
directly as per specification it is represented as a zero-based line and
directly. As per specification it is represented as a zero-based line and
character offset based on a UTF-16 string representation.
All characters whose code point exceeds the Basic Multilingual Plane are
Expand All @@ -80,14 +84,44 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
Returns:
The position with `character` being converted to utf-32 code units.
"""
try:
return Position(
line=position.line,
character=position.character
- utf16_unit_offset(lines[position.line][:position.character])
if len(lines) == 0:
return Position(0, 0)
if position.line >= len(lines):
return Position(len(lines) - 1, utf16_num_units(lines[-1]))

_line = lines[position.line]
_line = _line.replace('\r\n', '\n') # TODO: it's a bit of a hack
_utf16_len = utf16_num_units(_line)
_utf32_len = len(_line)

if _utf16_len == 0:
return Position(position.line, 0)

_utf16_end_of_line = utf16_num_units(_line)
if position.character > _utf16_end_of_line:
position.character = _utf16_end_of_line - 1

_utf16_index = 0
utf32_index = 0
while True:
_is_searching_queried_position = _utf16_index < position.character
_is_before_end_of_line = utf32_index < _utf32_len
_is_searching_for_position = (
_is_searching_queried_position and _is_before_end_of_line
)
except IndexError:
return Position(line=len(lines), character=0)
if not _is_searching_for_position:
break

_current_char = _line[utf32_index]
_is_double_width = is_char_beyond_multilingual_plane(_current_char)
if _is_double_width:
_utf16_index += 2
else:
_utf16_index += 1
utf32_index += 1

position = Position(line=position.line, character=utf32_index)
return position


def position_to_utf16(lines: List[str], position: Position) -> Position:
Expand Down Expand Up @@ -137,10 +171,11 @@ def range_from_utf16(lines: List[str], range: Range) -> Range:
Returns:
The range with `character` offsets being converted to utf-16 code units.
"""
return Range(
range_new = Range(
start=position_from_utf16(lines, range.start),
end=position_from_utf16(lines, range.end)
)
return range_new


def range_to_utf16(lines: List[str], range: Range) -> Range:
Expand Down Expand Up @@ -280,7 +315,7 @@ def offset_at_position(self, position: Position) -> int:
lines = self.lines
pos = position_from_utf16(lines, position)
row, col = pos.line, pos.character
return col + sum(len(line) for line in lines[:row])
return col + sum(utf16_num_units(line) for line in lines[:row])

@property
def source(self) -> str:
Expand Down
40 changes: 29 additions & 11 deletions tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,11 @@ def test_range_from_utf16():
range = Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
)
range_from_utf16(['x="πŸ˜‹"'], range)
assert range == Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
actual = range_from_utf16(['x="πŸ˜‹πŸ˜‹"'], range)
expected = Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
)
assert actual == expected


def test_range_to_utf16():
Expand All @@ -239,23 +240,40 @@ def test_range_to_utf16():
range = Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
)
range_to_utf16(['x="πŸ˜‹"'], range)
assert range == Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
actual = range_to_utf16(['x="πŸ˜‹πŸ˜‹"'], range)
expected = Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
)
assert actual == expected


def test_offset_at_position(doc):
assert doc.offset_at_position(Position(line=0, character=8)) == 8
assert doc.offset_at_position(Position(line=1, character=5)) == 14
assert doc.offset_at_position(Position(line=1, character=5)) == 12
assert doc.offset_at_position(Position(line=2, character=0)) == 13
assert doc.offset_at_position(Position(line=2, character=4)) == 17
assert doc.offset_at_position(Position(line=3, character=6)) == 27
assert doc.offset_at_position(Position(line=3, character=7)) == 27
assert doc.offset_at_position(Position(line=3, character=7)) == 28
assert doc.offset_at_position(Position(line=3, character=8)) == 28
assert doc.offset_at_position(Position(line=4, character=0)) == 39
assert doc.offset_at_position(Position(line=5, character=0)) == 39

assert doc.offset_at_position(Position(line=4, character=0)) == 40
assert doc.offset_at_position(Position(line=5, character=0)) == 40

def test_utf16_to_utf32_position_cast(doc):
lines = ['', 'πŸ˜‹πŸ˜‹', '']
assert position_from_utf16(lines, Position(line=0, character=0)) == Position(line=0, character=0)
assert position_from_utf16(lines, Position(line=0, character=1)) == Position(line=0, character=0)
assert position_from_utf16(lines, Position(line=1, character=0)) == Position(line=1, character=0)
assert position_from_utf16(lines, Position(line=1, character=2)) == Position(line=1, character=1)
assert position_from_utf16(lines, Position(line=1, character=3)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=1, character=4)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=1, character=100)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=3, character=0)) == Position(line=2, character=0)
assert position_from_utf16(lines, Position(line=4, character=10)) == Position(line=2, character=0)

def test_position_for_line_endings(doc):
lines = ['x\r\n', 'y\n']
assert position_from_utf16(lines, Position(line=0, character=10)) == Position(line=0, character=1)
assert position_from_utf16(lines, Position(line=1, character=10)) == Position(line=1, character=1)

def test_word_at_position(doc):
"""
Expand Down

0 comments on commit d5a1212

Please sign in to comment.