Skip to content

Commit

Permalink
feat: support UTF32 and UTF8 position encoding
Browse files Browse the repository at this point in the history
Contributes to #346
  • Loading branch information
tombh committed Sep 22, 2023
1 parent d1695db commit d85c41c
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 30 deletions.
2 changes: 1 addition & 1 deletion pygls/workspace/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def offset_at_position(self, client_position: types.Position) -> int:
lines, client_position
)
row, col = server_position.line, server_position.character
return col + sum(self.position.utf16_num_units(line) for line in lines[:row])
return col + sum(self.position.client_num_units(line) for line in lines[:row])

@property
def source(self) -> str:
Expand Down
41 changes: 27 additions & 14 deletions pygls/workspace/position.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,21 @@ def utf16_unit_offset(self, chars: str):
"""
return sum(self.is_char_beyond_multilingual_plane(ch) for ch in chars)

def utf16_num_units(self, chars: str):
def client_num_units(self, chars: str):
"""
Calculate the length of `str` in utf-16 code units.
Arguments:
chars (str): The string to return the length in utf-16 code units for.
"""
return len(chars) + self.utf16_unit_offset(chars)
utf32_units = len(chars)
if self.encoding == types.PositionEncodingKind.Utf32:
return utf32_units

if self.encoding == types.PositionEncodingKind.Utf8:
return utf32_units + (self.utf16_unit_offset(chars) * 2)

return utf32_units + self.utf16_unit_offset(chars)

def position_from_client_units(
self, lines: List[str], position: types.Position
Expand Down Expand Up @@ -88,24 +95,24 @@ def position_from_client_units(
if len(lines) == 0:
return types.Position(0, 0)
if position.line >= len(lines):
return types.Position(len(lines) - 1, self.utf16_num_units(lines[-1]))
return types.Position(len(lines) - 1, self.client_num_units(lines[-1]))

_line = lines[position.line]
_line = _line.replace("\r\n", "\n") # TODO: it's a bit of a hack
_utf16_len = self.utf16_num_units(_line)
_client_len = self.client_num_units(_line)
_utf32_len = len(_line)

if _utf16_len == 0:
if _client_len == 0:
return types.Position(position.line, 0)

_utf16_end_of_line = self.utf16_num_units(_line)
if position.character > _utf16_end_of_line:
position.character = _utf16_end_of_line - 1
_client_end_of_line = self.client_num_units(_line)
if position.character > _client_end_of_line:
position.character = _client_end_of_line - 1

_utf16_index = 0
_client_index = 0
utf32_index = 0
while True:
_is_searching_queried_position = _utf16_index < position.character
_is_searching_queried_position = _client_index < position.character
_is_before_end_of_line = utf32_index < _utf32_len
_is_searching_for_position = (
_is_searching_queried_position and _is_before_end_of_line
Expand All @@ -116,9 +123,13 @@ def position_from_client_units(
_current_char = _line[utf32_index]
_is_double_width = Position.is_char_beyond_multilingual_plane(_current_char)
if _is_double_width:
_utf16_index += 2
if self.encoding == types.PositionEncodingKind.Utf32:
_client_index += 1
if self.encoding == types.PositionEncodingKind.Utf8:
_client_index += 4
_client_index += 2
else:
_utf16_index += 1
_client_index += 1
utf32_index += 1

position = types.Position(line=position.line, character=utf32_index)
Expand All @@ -141,10 +152,12 @@ def position_to_client_unit(
The position with `character` being converted to UTF-[32|16|8] code units.
"""
try:
character = self.client_num_units(
lines[position.line][: position.character]
)
return types.Position(
line=position.line,
character=position.character
+ self.utf16_unit_offset(lines[position.line][: position.character]),
character=character,
)
except IndexError:
return types.Position(line=len(lines), character=0)
Expand Down
5 changes: 0 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,6 @@ def server_dir():
json_server_client = create_client_for_server("json_server.py")


@pytest.fixture
def doc():
return TextDocument(DOC_URI, DOC)


@pytest.fixture
def feature_manager():
"""Return a feature manager"""
Expand Down
72 changes: 62 additions & 10 deletions tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ def test_document_line_edit():
assert doc.source == "itsgoodbyeworld"


def test_document_lines(doc):
def test_document_lines():
doc = Document(DOC_URI, DOC)
assert len(doc.lines) == 4
assert doc.lines[0] == "document\n"

Expand Down Expand Up @@ -160,7 +161,9 @@ def test_document_no_edit():
assert doc.lines == old


def test_document_props(doc):
def test_document_props():
doc = Document(DOC_URI, DOC)

assert doc.uri == DOC_URI
assert doc.source == DOC

Expand All @@ -180,9 +183,25 @@ def test_position_from_utf16():
['x="😋"'], types.Position(line=0, character=5)
) == types.Position(line=0, character=4)

actual = types.Position(line=0, character=5)
position.position_from_client_units(['x="😋"'], actual)
assert actual == types.Position(line=0, character=5)

def test_position_from_utf32():
    """Client offsets are returned unchanged when the client uses UTF-32."""
    position = Position(encoding=types.PositionEncodingKind.Utf32)

    # Offset 3 is immediately before the emoji.
    assert position.position_from_client_units(
        ['x="😋"'], types.Position(line=0, character=3)
    ) == types.Position(line=0, character=3)

    # The emoji is a single UTF-32 unit, so offset 4 is immediately after it.
    assert position.position_from_client_units(
        ['x="😋"'], types.Position(line=0, character=4)
    ) == types.Position(line=0, character=4)


def test_position_from_utf8():
    """Client UTF-8 offsets are converted to server character offsets."""
    position = Position(encoding=types.PositionEncodingKind.Utf8)

    # Only single-byte characters precede offset 3, so it is unchanged.
    assert position.position_from_client_units(
        ['x="😋"'], types.Position(line=0, character=3)
    ) == types.Position(line=0, character=3)

    # The emoji takes four bytes in UTF-8, so byte offset 7 is the position
    # immediately after it, which is character index 4.
    assert position.position_from_client_units(
        ['x="😋"'], types.Position(line=0, character=7)
    ) == types.Position(line=0, character=4)


def test_position_to_utf16():
Expand All @@ -195,9 +214,27 @@ def test_position_to_utf16():
['x="😋"'], types.Position(line=0, character=4)
) == types.Position(line=0, character=5)

actual = types.Position(line=0, character=4)
position.position_to_client_unit(['x="😋"'], actual)
assert actual == types.Position(line=0, character=4)

def test_position_to_utf32():
    """Server character offsets are unchanged when the client uses UTF-32."""
    position = Position(encoding=types.PositionEncodingKind.Utf32)

    # Offset 3 is immediately before the emoji.
    assert position.position_to_client_unit(
        ['x="😋"'], types.Position(line=0, character=3)
    ) == types.Position(line=0, character=3)

    # The emoji counts as one UTF-32 unit, so offset 4 stays 4.
    assert position.position_to_client_unit(
        ['x="😋"'], types.Position(line=0, character=4)
    ) == types.Position(line=0, character=4)


def test_position_to_utf8():
    """Server character offsets are converted to client UTF-8 offsets."""
    position = Position(encoding=types.PositionEncodingKind.Utf8)

    # Only single-byte characters precede offset 3, so it is unchanged.
    assert position.position_to_client_unit(
        ['x="😋"'], types.Position(line=0, character=3)
    ) == types.Position(line=0, character=3)

    # NOTE(review): the prefix 'x="😋' is 7 bytes in UTF-8 (three ASCII bytes
    # plus a four-byte emoji). The expected value 6 matches the arithmetic in
    # client_num_units (character count + 2 per character beyond the BMP);
    # confirm whether 6 or 7 is the intended client offset.
    assert position.position_to_client_unit(
        ['x="😋"'], types.Position(line=0, character=4)
    ) == types.Position(line=0, character=6)


def test_range_from_utf16():
Expand Down Expand Up @@ -250,7 +287,8 @@ def test_range_to_utf16():
assert actual == expected


def test_offset_at_position(doc):
def test_offset_at_position_utf16():
doc = Document(DOC_URI, DOC)
assert doc.offset_at_position(types.Position(line=0, character=8)) == 8
assert doc.offset_at_position(types.Position(line=1, character=5)) == 12
assert doc.offset_at_position(types.Position(line=2, character=0)) == 13
Expand All @@ -262,6 +300,18 @@ def test_offset_at_position(doc):
assert doc.offset_at_position(types.Position(line=5, character=0)) == 40


def test_offset_at_position_utf32():
    """Check document offsets when the client position encoding is UTF-32."""
    utf32_doc = Document(
        DOC_URI, DOC, position_encoding=types.PositionEncodingKind.Utf32
    )
    # (line, character) -> expected offset.
    # NOTE(review): line 5 is presumably past the end of DOC, making 39 the
    # end-of-document offset -- confirm against the DOC fixture.
    expectations = (
        ((0, 8), 8),
        ((5, 0), 39),
    )
    for (line, character), expected_offset in expectations:
        queried = types.Position(line=line, character=character)
        assert utf32_doc.offset_at_position(queried) == expected_offset


def test_offset_at_position_utf8():
    """Check document offsets when the client position encoding is UTF-8."""
    utf8_doc = Document(
        DOC_URI, DOC, position_encoding=types.PositionEncodingKind.Utf8
    )
    # (line, character) -> expected offset.
    # NOTE(review): line 5 is presumably past the end of DOC, making 41 the
    # end-of-document offset -- confirm against the DOC fixture.
    expectations = (
        ((0, 8), 8),
        ((5, 0), 41),
    )
    for (line, character), expected_offset in expectations:
        queried = types.Position(line=line, character=character)
        assert utf8_doc.offset_at_position(queried) == expected_offset


def test_utf16_to_utf32_position_cast():
position = Position(encoding=types.PositionEncodingKind.Utf16)
lines = ["", "😋😋", ""]
Expand Down Expand Up @@ -305,10 +355,12 @@ def test_position_for_line_endings():
) == types.Position(line=1, character=1)


def test_word_at_position(doc):
def test_word_at_position():
"""
Return word under the cursor (or last in line if past the end)
"""
doc = Document(DOC_URI, DOC)

assert doc.word_at_position(types.Position(line=0, character=8)) == "document"
assert doc.word_at_position(types.Position(line=0, character=1000)) == "document"
assert doc.word_at_position(types.Position(line=1, character=5)) == "for"
Expand Down

0 comments on commit d85c41c

Please sign in to comment.