Skip to content

Commit

Permalink
fix: Correctly cast from UTF16 positions
Browse files Browse the repository at this point in the history
Fixes #302
  • Loading branch information
tombh committed Dec 21, 2022
1 parent af3f318 commit 69a5960
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,4 @@ venv.bak/

# mypy
.mypy_cache
.dmypy.json
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ and this project adheres to [Semantic Versioning][semver].
### Changed
### Fixed

- Fix progress example in json extension. ([#230])
- Fix: progress example in json extension. ([#230])
- Fix: UTF16 to UTF32 character position casting ([#304])

[#230]: https://github.com/openlawlibrary/pygls/issues/230
[#304]: https://github.com/openlawlibrary/pygls/issues/304

## [1.0.0] - 2/12/2022
### Changed
Expand Down
63 changes: 53 additions & 10 deletions pygls/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,17 @@
log = logging.getLogger(__name__)


def is_char_beyond_multilingual_plane(char: str) -> bool:
return ord(char) > 0xFFFF


def utf16_unit_offset(chars: str):
"""Calculate the number of characters which need two utf-16 code units.
Arguments:
chars (str): The string to count occurrences of utf-16 code units for.
"""
return sum(ord(ch) > 0xFFFF for ch in chars)
return sum(is_char_beyond_multilingual_plane(ch) for ch in chars)


def utf16_num_units(chars: str):
Expand All @@ -59,7 +63,7 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
"""Convert the position.character from utf-16 code units to utf-32.
A python application can't use the character member of `Position`
directly as per specification it is represented as a zero-based line and
directly. As per specification it is represented as a zero-based line and
character offset based on a UTF-16 string representation.
All characters whose code point exceeds the Basic Multilingual Plane are
Expand All @@ -80,14 +84,52 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
Returns:
The position with `character` being converted to utf-32 code units.
"""
try:
return Position(
line=position.line,
character=position.character
- utf16_unit_offset(lines[position.line][:position.character])
if len(lines) == 0:
return Position(0, 0)
if position.line >= len(lines):
log.warning(
f"position_from_utf16() received out of bounds index: lines[{position.line}]"
)
except IndexError:
return Position(line=len(lines), character=0)
return Position(len(lines), 0)

_line = lines[position.line]
_utf16_len = utf16_num_units(_line)
_utf32_len = len(_line)

if _utf16_len == 0:
return Position(position.line, 0)

_utf16_end_of_line = utf16_num_units(_line) - 1
if position.character > _utf16_end_of_line:
log.warning(
f"position_from_utf16() received out of bounds index: line[{position.character}]"
)
log.debug(
f"position_from_utf16() line with {utf16_num_units(_line)} UTF16 units: `{_line}`"
)
position.character = _utf16_end_of_line

_utf16_index = 0
utf32_index = 0
while True:
_is_searching_queried_position = _utf16_index < position.character
_is_before_end_of_line = utf32_index < _utf32_len
_is_searching_for_position = (
_is_searching_queried_position and _is_before_end_of_line
)
if not _is_searching_for_position:
break

_current_char = _line[utf32_index]
_is_double_width = is_char_beyond_multilingual_plane(_current_char)
if _is_double_width:
_utf16_index += 2
else:
_utf16_index += 1
utf32_index += 1

position = Position(line=position.line, character=utf32_index)
return position


def position_to_utf16(lines: List[str], position: Position) -> Position:
Expand Down Expand Up @@ -137,10 +179,11 @@ def range_from_utf16(lines: List[str], range: Range) -> Range:
Returns:
The range with `character` offsets being converted to utf-32 code units.
"""
return Range(
range_new = Range(
start=position_from_utf16(lines, range.start),
end=position_from_utf16(lines, range.end)
)
return range_new


def range_to_utf16(lines: List[str], range: Range) -> Range:
Expand Down
29 changes: 21 additions & 8 deletions tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,11 @@ def test_range_from_utf16():
range = Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
)
range_from_utf16(['x="😋"'], range)
assert range == Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
actual = range_from_utf16(['x="😋😋"'], range)
expected = Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
)
assert actual == expected


def test_range_to_utf16():
Expand All @@ -239,23 +240,35 @@ def test_range_to_utf16():
range = Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
)
range_to_utf16(['x="😋"'], range)
assert range == Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
actual = range_to_utf16(['x="😋😋"'], range)
expected = Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
)
assert actual == expected


def test_offset_at_position(doc):
assert doc.offset_at_position(Position(line=0, character=8)) == 8
assert doc.offset_at_position(Position(line=1, character=5)) == 14
assert doc.offset_at_position(Position(line=1, character=5)) == 12
assert doc.offset_at_position(Position(line=2, character=0)) == 13
assert doc.offset_at_position(Position(line=2, character=4)) == 17
assert doc.offset_at_position(Position(line=3, character=6)) == 27
assert doc.offset_at_position(Position(line=3, character=7)) == 27
assert doc.offset_at_position(Position(line=3, character=7)) == 28
assert doc.offset_at_position(Position(line=3, character=8)) == 28
assert doc.offset_at_position(Position(line=4, character=0)) == 39
assert doc.offset_at_position(Position(line=5, character=0)) == 39

def test_utf16_to_utf32_position_cast(doc):
lines = ['', '😋😋', '']
assert position_from_utf16(lines, Position(line=0, character=0)) == Position(line=0, character=0)
assert position_from_utf16(lines, Position(line=0, character=1)) == Position(line=0, character=0)
assert position_from_utf16(lines, Position(line=1, character=0)) == Position(line=1, character=0)
assert position_from_utf16(lines, Position(line=1, character=2)) == Position(line=1, character=1)
assert position_from_utf16(lines, Position(line=1, character=3)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=1, character=4)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=1, character=100)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=3, character=0)) == Position(line=3, character=0)
assert position_from_utf16(lines, Position(line=4, character=10)) == Position(line=3, character=0)

def test_word_at_position(doc):
"""
Expand Down

0 comments on commit 69a5960

Please sign in to comment.