-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Full support for bidirectional line breaking #7123
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,55 +2,124 @@ | |
|
||
#include <mbgl/text/bidi.hpp> | ||
#include <unicode/ubidi.h> | ||
#include <unicode/ubiditransform.h> | ||
#include <unicode/ushape.h> | ||
|
||
namespace mbgl { | ||
|
||
BiDi::BiDi() { | ||
// Takes UTF16 input in logical order and applies Arabic shaping to the input while maintaining | ||
// logical order | ||
// Output won't be intelligible until the bidirectional algorithm is applied | ||
std::u16string applyArabicShaping(const std::u16string& input) { | ||
UErrorCode errorCode = U_ZERO_ERROR; | ||
transform = ubiditransform_open(&errorCode); // Only error is failure to allocate memory, in | ||
// that case ubidi_transform would fall back to | ||
// creating transform object on the fly | ||
|
||
int32_t outputLength = | ||
u_shapeArabic(input.c_str(), static_cast<int32_t>(input.size()), NULL, 0, | ||
(U_SHAPE_LETTERS_SHAPE & U_SHAPE_LETTERS_MASK) | | ||
(U_SHAPE_TEXT_DIRECTION_LOGICAL & U_SHAPE_TEXT_DIRECTION_MASK), | ||
&errorCode); | ||
|
||
// Pre-flighting will always set U_BUFFER_OVERFLOW_ERROR | ||
errorCode = U_ZERO_ERROR; | ||
|
||
std::unique_ptr<UChar[]> outputText = std::make_unique<UChar[]>(outputLength); | ||
u_shapeArabic(input.c_str(), static_cast<int32_t>(input.size()), outputText.get(), outputLength, | ||
(U_SHAPE_LETTERS_SHAPE & U_SHAPE_LETTERS_MASK) | | ||
(U_SHAPE_TEXT_DIRECTION_LOGICAL & U_SHAPE_TEXT_DIRECTION_MASK), | ||
&errorCode); | ||
|
||
// If the algorithm fails for any reason, fall back to non-transformed text | ||
if (U_FAILURE(errorCode)) | ||
return input; | ||
|
||
return std::u16string(outputText.get(), outputLength); | ||
} | ||
|
||
ProcessedBiDiText::ProcessedBiDiText(BiDi& p_bidi) : bidi(p_bidi) { | ||
} | ||
|
||
void ProcessedBiDiText::mergeParagraphLineBreaks(std::set<int32_t>& lineBreakPoints) { | ||
int32_t paragraphCount = ubidi_countParagraphs(bidi.bidiText); | ||
for (int32_t i = 0; i < paragraphCount; i++) { | ||
UErrorCode errorCode = U_ZERO_ERROR; | ||
int32_t paragraphEndIndex; | ||
ubidi_getParagraphByIndex(bidi.bidiText, i, NULL, &paragraphEndIndex, NULL, &errorCode); | ||
|
||
if (U_FAILURE(errorCode)) | ||
throw std::runtime_error(std::string("ProcessedBiDiText::mergeParagraphLineBreaks: ") + | ||
u_errorName(errorCode)); | ||
|
||
lineBreakPoints.insert(paragraphEndIndex); | ||
} | ||
} | ||
|
||
std::vector<std::u16string> | ||
ProcessedBiDiText::applyLineBreaking(std::set<int32_t> lineBreakPoints) { | ||
// BiDi::getLine will error if called across a paragraph boundary, so we need to ensure that all | ||
// paragraph | ||
// boundaries are included in the set of line break points. The calling code might not include | ||
// the line break because it | ||
// didn't need to wrap at that point, or because the text was separated with a more exotic code | ||
// point such as (U+001C) | ||
mergeParagraphLineBreaks(lineBreakPoints); | ||
|
||
std::vector<std::u16string> transformedLines; | ||
int32_t start = 0; | ||
for (int32_t lineBreakPoint : lineBreakPoints) { | ||
transformedLines.push_back(bidi.getLine(start, lineBreakPoint)); | ||
start = lineBreakPoint; | ||
} | ||
|
||
return transformedLines; | ||
} | ||
|
||
BiDi::BiDi() { | ||
bidiText = ubidi_open(); | ||
bidiLine = ubidi_open(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why two handles? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's just the way the ICU interface works. |
||
} | ||
|
||
BiDi::~BiDi() { | ||
if (transform) | ||
ubiditransform_close(transform); | ||
if (bidiText) | ||
ubidi_close(bidiText); | ||
|
||
if (bidiLine) | ||
ubidi_close(bidiLine); | ||
} | ||
|
||
std::u16string BiDi::bidiTransform(const std::u16string& input) { | ||
ProcessedBiDiText BiDi::processText(const std::u16string& input) { | ||
UErrorCode errorCode = U_ZERO_ERROR; | ||
|
||
std::unique_ptr<UChar[]> outputText = | ||
std::make_unique<UChar[]>(input.size() * 2); // Maximum output of ubidi_transform is twice | ||
// the size of input according to | ||
// ubidi_transform.h | ||
uint32_t outputLength = ubiditransform_transform( | ||
transform, input.c_str(), static_cast<int32_t>(input.size()), outputText.get(), | ||
static_cast<int32_t>(input.size()) * 2, | ||
UBIDI_DEFAULT_LTR, // Assume input is LTR unless strong RTL characters are found | ||
UBIDI_LOGICAL, // Input is in logical order | ||
UBIDI_LTR, // Output is in "visual LTR" order | ||
UBIDI_VISUAL, // '' | ||
UBIDI_MIRRORING_ON, // Use mirroring lookups for things like parentheses that need mirroring | ||
// in RTL text | ||
U_SHAPE_LETTERS_SHAPE, // Add options here for handling numbers in bidirectional text | ||
&errorCode); | ||
ubidi_setPara(bidiText, input.c_str(), static_cast<int32_t>(input.size()), UBIDI_DEFAULT_LTR, | ||
NULL, &errorCode); | ||
|
||
// If the algorithm fails for any reason, fall back to non-transformed text | ||
if (U_FAILURE(errorCode)) | ||
return input; | ||
throw std::runtime_error(std::string("BiDi::processText: ") + u_errorName(errorCode)); | ||
|
||
return std::u16string(outputText.get(), outputLength); | ||
return ProcessedBiDiText(*this); | ||
} | ||
|
||
WritingDirection BiDi::baseWritingDirection(const std::u16string& input) { | ||
// This just looks for the first character with a strong direction property, it does not perform | ||
// the BiDi algorithm | ||
return ubidi_getBaseDirection(input.c_str(), static_cast<int32_t>(input.size())) == UBIDI_RTL | ||
? WritingDirection::RightToLeft | ||
: WritingDirection::LeftToRight; | ||
std::u16string BiDi::getLine(int32_t start, int32_t end) { | ||
UErrorCode errorCode = U_ZERO_ERROR; | ||
ubidi_setLine(bidiText, start, end, bidiLine, &errorCode); | ||
|
||
if (U_FAILURE(errorCode)) | ||
throw std::runtime_error(std::string("BiDi::getLine (setLine): ") + u_errorName(errorCode)); | ||
|
||
// Because we set UBIDI_REMOVE_BIDI_CONTROLS, the output may be smaller than what we reserve | ||
// Setting UBIDI_INSERT_LRM_FOR_NUMERIC would require | ||
// ubidi_getLength(pBiDi)+2*ubidi_countRuns(pBiDi) | ||
int32_t outputLength = ubidi_getProcessedLength(bidiLine); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
std::unique_ptr<UChar[]> outputText = std::make_unique<UChar[]>(outputLength); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can just use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, right! I left C++ world at C++98, still getting used to all the great additions... |
||
|
||
// UBIDI_DO_MIRRORING: Apply unicode mirroring of characters like parentheses | ||
// UBIDI_REMOVE_BIDI_CONTROLS: Now that all the lines are set, remove control characters so that | ||
// they don't show up on screen (some fonts have glyphs representing them) | ||
ubidi_writeReordered(bidiLine, outputText.get(), outputLength, | ||
UBIDI_DO_MIRRORING | UBIDI_REMOVE_BIDI_CONTROLS, &errorCode); | ||
|
||
if (U_FAILURE(errorCode)) | ||
throw std::runtime_error(std::string("BiDi::getLine (writeReordered): ") + u_errorName(errorCode)); | ||
|
||
return std::u16string(outputText.get(), outputLength); | ||
} | ||
|
||
} // end namespace mbgl |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,47 @@ | ||
#pragma once | ||
|
||
#include <set> | ||
#include <string> | ||
#include <vector> | ||
|
||
#include <mbgl/util/noncopyable.hpp> | ||
|
||
struct UBiDiTransform; | ||
struct UBiDi; | ||
|
||
namespace mbgl { | ||
|
||
enum class WritingDirection : bool { LeftToRight, RightToLeft }; | ||
|
||
|
||
class BiDi; | ||
|
||
std::u16string applyArabicShaping(const std::u16string&); | ||
|
||
class ProcessedBiDiText { | ||
public: | ||
ProcessedBiDiText(BiDi&); | ||
|
||
std::vector<std::u16string> applyLineBreaking(std::set<int32_t>); | ||
|
||
private: | ||
void mergeParagraphLineBreaks(std::set<int32_t>&); | ||
|
||
BiDi& bidi; | ||
}; | ||
|
||
class BiDi : private util::noncopyable { | ||
public: | ||
BiDi(); | ||
~BiDi(); | ||
|
||
std::u16string bidiTransform(const std::u16string&); | ||
WritingDirection baseWritingDirection(const std::u16string&); | ||
// Calling processText resets internal state, invalidating any existing ProcessedBiDiText | ||
// objects | ||
ProcessedBiDiText processText(const std::u16string&); | ||
|
||
friend class ProcessedBiDiText; | ||
|
||
private: | ||
UBiDiTransform* transform; | ||
std::u16string getLine(int32_t start, int32_t end); | ||
|
||
UBiDi* bidiText; | ||
UBiDi* bidiLine; | ||
}; | ||
|
||
} // end namespace mbgl |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mapbox/mapbox-gl-test-suite#186 was rebased onto that repository’s master branch, so mapbox/mapbox-gl-test-suite@d85534f has been orphaned. This line should have been changed to point to mapbox/mapbox-gl-test-suite@0c6f3e0 before this PR was merged.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, sorry about that, and thanks for fixing!