Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement grapheme clusters #16916

Merged
merged 19 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions .github/actions/spelling/expect/expect.txt
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ bytebuffer
cac
cacafire
CALLCONV
CANDRABINDU
capslock
CARETBLINKINGENABLED
CARRIAGERETURN
Expand All @@ -156,6 +157,7 @@ CBash
cbiex
CBN
cbt
Ccc
CCCBB
cch
CCHAR
Expand Down Expand Up @@ -293,7 +295,6 @@ CREATESTRUCT
CREATESTRUCTW
createvpack
crisman
CRLFs
crloew
CRTLIBS
csbi
Expand Down Expand Up @@ -594,6 +595,7 @@ fesb
FFAF
ffd
FFDE
FFFD
FFFDb
fgbg
FGCOLOR
Expand All @@ -614,6 +616,7 @@ FINDREGEX
FINDSTRINGEXACT
FINDUP
FIter
FITZPATRICK
FIXEDFILEINFO
Flg
flyouts
Expand Down Expand Up @@ -882,10 +885,12 @@ jconcpp
JLO
JOBOBJECT
JOBOBJECTINFOCLASS
JONGSEONG
JPN
jsoncpp
jsprovider
jumplist
JUNGSEONG
KAttrs
kawa
Kazu
Expand All @@ -904,6 +909,7 @@ keyups
KILLACTIVE
KILLFOCUS
kinda
KIYEOK
KLF
KLMNO
KLMNOPQRST
Expand Down Expand Up @@ -1013,6 +1019,7 @@ luma
lval
LVB
LVERTICAL
LVT
LWA
LWIN
lwkmvj
Expand Down Expand Up @@ -1209,6 +1216,7 @@ ntuser
NTVDM
ntverp
nugetversions
NUKTA
nullness
nullonfailure
nullopts
Expand Down Expand Up @@ -1471,7 +1479,6 @@ READMODE
rectread
redef
redefinable
Redir
redist
REDSCROLL
REFCLSID
Expand All @@ -1489,6 +1496,7 @@ renderengine
rendersize
reparented
reparenting
REPH
replatformed
Replymessage
reportfileaccesses
Expand Down Expand Up @@ -1519,6 +1527,7 @@ rgw
RIGHTALIGN
RIGHTBUTTON
riid
ris
RIS
roadmap
robomac
Expand Down Expand Up @@ -1924,6 +1933,7 @@ vga
vgaoem
viewkind
viewports
VIRAMA
Virt
VIRTTERM
vkey
Expand Down Expand Up @@ -1974,8 +1984,8 @@ wchars
WCIA
WCIW
WCSHELPER
wcsicmp
wcsrev
wcswidth
wddm
wddmcon
WDDMCONSOLECONTEXT
Expand Down Expand Up @@ -2131,6 +2141,7 @@ XFORM
XIn
XManifest
XMath
XNamespace
xorg
XPan
XResource
Expand Down Expand Up @@ -2162,6 +2173,7 @@ Zabcdefghijklmn
Zabcdefghijklmnopqrstuvwxyz
ZCmd
ZCtrl
ZWJs
zxcvbnm
ZYXWVU
ZYXWVUTd
116 changes: 75 additions & 41 deletions src/buffer/out/Row.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@
#include "Row.hpp"

#include <isa_availability.h>
#include <til/unicode.h>

#include "textBuffer.hpp"
#include "../../types/inc/GlyphWidth.hpp"
#include "../../types/inc/CodepointWidthDetector.hpp"

// It would be nice to add checked array access in the future, but it's a little annoying to do so without impacting
// performance (including Debug performance). Other languages are a little bit more ergonomic there than C++.
Expand Down Expand Up @@ -568,6 +566,7 @@ void ROW::ReplaceAttributes(const til::CoordType beginIndex, const til::CoordTyp
void ROW::ReplaceCharacters(til::CoordType columnBegin, til::CoordType width, const std::wstring_view& chars)
try
{
assert(width >= 1 && width <= 2);
WriteHelper h{ *this, columnBegin, _columnCount, chars };
if (!h.IsValid())
{
Expand Down Expand Up @@ -666,56 +665,91 @@ catch (...)

[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept
{
const auto end = chars.end();
auto& cwd = CodepointWidthDetector::Singleton();

while (it != end)
// Check if the new text joins with the existing contents of the row to form a single grapheme cluster.
if (it == chars.begin())
{
unsigned int width = 1;
auto ptr = &*it;
const auto wch = *ptr;
size_t advance = 1;
auto colPrev = colBeg;
while (colPrev > 0 && row._uncheckedIsTrailer(--colPrev))
{
}

++it;
const auto chPrev = row._uncheckedCharOffset(colPrev);
const std::wstring_view charsPrev{ row._chars.data() + chPrev, ch - chPrev };

// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
// It also allows us to skip the surrogate pair decoding at the same time.
if (wch >= 0x80)
GraphemeState state;
cwd.GraphemeNext(state, charsPrev);
cwd.GraphemeNext(state, chars);

if (state.len > 0)
{
if (til::is_surrogate(wch))
colBegDirty = colPrev;
colEnd = colPrev;

const auto width = std::max(1, state.width);
const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
if (colEndNew > colLimit)
{
if (it != end && til::is_leading_surrogate(wch) && til::is_trailing_surrogate(*it))
{
advance = 2;
++it;
}
else
{
ptr = &UNICODE_REPLACEMENT;
}
colEndDirty = colLimit;
charsConsumed = ch - chBeg;
return;
}

width = IsGlyphFullWidth({ ptr, advance }) + 1u;
}
// Fill our char-offset buffer with 1 entry containing the mapping from the
// current column (colEnd) to the start of the glyph in the string (ch)...
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(chPrev);
// ...followed by 0-N entries containing an indication that the
// columns are just a wide-glyph extension of the preceding one.
while (colEnd < colEndNew)
{
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(chPrev | CharOffsetsTrailer);
}

const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
if (colEndNew > colLimit)
{
colEndDirty = colLimit;
charsConsumed = ch - chBeg;
return;
ch += state.len;
it += state.len;
}
}
else
{
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
// and let GraphemeNext() find the next proper grapheme boundary.
--colEnd;
--ch;
--it;
}

if (const auto end = chars.end(); it != end)
{
GraphemeState state{ .beg = &*it };

// Fill our char-offset buffer with 1 entry containing the mapping from the
// current column (colEnd) to the start of the glyph in the string (ch)...
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch);
// ...followed by 0-N entries containing an indication that the
// columns are just a wide-glyph extension of the preceding one.
while (colEnd < colEndNew)
do
{
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
}
cwd.GraphemeNext(state, chars);

const auto width = std::max(1, state.width);
const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
if (colEndNew > colLimit)
{
colEndDirty = colLimit;
charsConsumed = ch - chBeg;
return;
}

// Fill our char-offset buffer with 1 entry containing the mapping from the
// current column (colEnd) to the start of the glyph in the string (ch)...
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch);
// ...followed by 0-N entries containing an indication that the
// columns are just a wide-glyph extension of the preceding one.
while (colEnd < colEndNew)
{
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
}

ch += advance;
ch += state.len;
it += state.len;
} while (it != end);
}

colEndDirty = colEnd;
Expand Down Expand Up @@ -1058,7 +1092,7 @@ std::wstring_view ROW::GetText() const noexcept

std::wstring_view ROW::GetText(til::CoordType columnBegin, til::CoordType columnEnd) const noexcept
{
const til::CoordType columns = _columnCount;
const auto columns = GetReadableColumnCount();
const auto colBeg = clamp(columnBegin, 0, columns);
const auto colEnd = clamp(columnEnd, colBeg, columns);
const size_t chBeg = _uncheckedCharOffset(gsl::narrow_cast<size_t>(colBeg));
Expand Down
Loading
Loading