From 5509479987d4a6b482f1fef7dac29d2afc112dd6 Mon Sep 17 00:00:00 2001 From: Christian Rocha Date: Tue, 5 Sep 2023 14:50:32 -0400 Subject: [PATCH] Upgrade to Unicode v15.0.0 (and regnerate accordingly) --- eastasianwidth.go | 74 ++++++++++++++++++++--------- emojipresentation.go | 14 +++++- gen_breaktest.go | 2 +- gen_properties.go | 4 +- graphemebreak_test.go | 4 +- graphemeproperties.go | 54 ++++++++++++++++------ linebreak_test.go | 4 +- lineproperties.go | 105 +++++++++++++++++++++++++++++------------- sentencebreak_test.go | 4 +- sentenceproperties.go | 50 ++++++++++++++++---- wordbreak_test.go | 4 +- wordproperties.go | 67 ++++++++++++++++++++------- 12 files changed, 279 insertions(+), 107 deletions(-) diff --git a/eastasianwidth.go b/eastasianwidth.go index 12f7dd3..5fc54d9 100644 --- a/eastasianwidth.go +++ b/eastasianwidth.go @@ -3,11 +3,11 @@ package uniseg // eastAsianWidth are taken from -// https://www.unicode.org/Public/14.0.0/ucd/EastAsianWidth.txt +// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt // and -// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt +// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt // ("Extended_Pictographic" only) -// on July 17, 2023. See https://www.unicode.org/license.html for the Unicode +// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode // license agreement. var eastAsianWidth = [][3]int{ {0x0000, 0x001F, prN}, // Cc [32] .. @@ -504,6 +504,7 @@ var eastAsianWidth = [][3]int{ {0x0CE2, 0x0CE3, prN}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL {0x0CE6, 0x0CEF, prN}, // Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE {0x0CF1, 0x0CF2, prN}, // Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA + {0x0CF3, 0x0CF3, prN}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT {0x0D00, 0x0D01, prN}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU {0x0D02, 0x0D03, prN}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA {0x0D04, 0x0D0C, prN}, // Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L @@ -565,7 +566,7 @@ var eastAsianWidth = [][3]int{ {0x0EBD, 0x0EBD, prN}, // Lo LAO SEMIVOWEL SIGN NYO {0x0EC0, 0x0EC4, prN}, // Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI {0x0EC6, 0x0EC6, prN}, // Lm LAO KO LA - {0x0EC8, 0x0ECD, prN}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA + {0x0EC8, 0x0ECE, prN}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN {0x0ED0, 0x0ED9, prN}, // Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE {0x0EDC, 0x0EDF, prN}, // Lo [4] LAO HO NO..LAO LETTER KHMU NYO {0x0F00, 0x0F00, prN}, // Lo TIBETAN SYLLABLE OM @@ -1916,6 +1917,7 @@ var eastAsianWidth = [][3]int{ {0x10EAB, 0x10EAC, prN}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK {0x10EAD, 0x10EAD, prN}, // Pd YEZIDI HYPHENATION MARK {0x10EB0, 0x10EB1, prN}, // Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE + {0x10EFD, 0x10EFF, prN}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA {0x10F00, 0x10F1C, prN}, // Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL {0x10F1D, 0x10F26, prN}, // No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF {0x10F27, 0x10F27, prN}, // Lo OLD SOGDIAN LIGATURE AYIN-DALETH @@ -1998,6 +2000,8 @@ var eastAsianWidth = [][3]int{ {0x11236, 0x11237, prN}, // Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA {0x11238, 0x1123D, prN}, // Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN {0x1123E, 0x1123E, prN}, // Mn KHOJKI SIGN SUKUN + {0x1123F, 0x11240, prN}, // Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I + {0x11241, 0x11241, prN}, // Mn KHOJKI VOWEL SIGN VOCALIC R {0x11280, 0x11286, prN}, // Lo [7] MULTANI LETTER A..MULTANI LETTER GA {0x11288, 0x11288, prN}, // Lo MULTANI LETTER GHA {0x1128A, 0x1128D, prN}, // Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA @@ -2160,6 +2164,7 @@ var eastAsianWidth = [][3]int{ {0x11A9E, 0x11AA2, prN}, // Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 {0x11AB0, 0x11ABF, prN}, // Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA {0x11AC0, 0x11AF8, prN}, // Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL + {0x11B00, 0x11B09, prN}, // Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU {0x11C00, 0x11C08, prN}, // Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L {0x11C0A, 0x11C2E, prN}, // Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA {0x11C2F, 0x11C2F, prN}, // Mc BHAIKSUKI VOWEL SIGN AA @@ -2205,6 +2210,19 @@ var eastAsianWidth = [][3]int{ {0x11EF3, 0x11EF4, prN}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U {0x11EF5, 0x11EF6, prN}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O {0x11EF7, 0x11EF8, prN}, // Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION + {0x11F00, 0x11F01, prN}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA + {0x11F02, 0x11F02, prN}, // Lo KAWI SIGN REPHA + {0x11F03, 0x11F03, prN}, // Mc KAWI SIGN VISARGA + {0x11F04, 0x11F10, prN}, // Lo [13] KAWI LETTER A..KAWI LETTER O + {0x11F12, 0x11F33, prN}, // Lo [34] KAWI LETTER KA..KAWI LETTER JNYA + {0x11F34, 0x11F35, prN}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA + {0x11F36, 0x11F3A, prN}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R + {0x11F3E, 0x11F3F, prN}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI + {0x11F40, 0x11F40, prN}, // Mn KAWI VOWEL SIGN EU + {0x11F41, 0x11F41, prN}, // Mc KAWI SIGN KILLER + {0x11F42, 0x11F42, prN}, // Mn KAWI CONJOINER + {0x11F43, 0x11F4F, prN}, // Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL + {0x11F50, 0x11F59, prN}, // Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE {0x11FB0, 0x11FB0, prN}, // Lo LISU LETTER YHA {0x11FC0, 0x11FD4, prN}, // No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH {0x11FD5, 0x11FDC, prN}, // So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI @@ -2217,8 +2235,11 @@ var eastAsianWidth = [][3]int{ {0x12480, 0x12543, prN}, // Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU {0x12F90, 0x12FF0, prN}, // Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 {0x12FF1, 0x12FF2, prN}, // Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302 - {0x13000, 0x1342E, prN}, // Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032 - {0x13430, 0x13438, prN}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT + {0x13000, 0x1342F, prN}, // Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D + {0x13430, 0x1343F, prN}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE + {0x13440, 0x13440, prN}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY + {0x13441, 0x13446, prN}, // Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN + {0x13447, 0x13455, prN}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED {0x14400, 0x14646, prN}, // Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 {0x16800, 0x16A38, prN}, // Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ {0x16A40, 0x16A5E, prN}, // Lo [31] MRO LETTER TA..MRO LETTER TEK @@ -2263,7 +2284,9 @@ var eastAsianWidth = [][3]int{ {0x1AFFD, 0x1AFFE, prW}, // Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 {0x1B000, 0x1B0FF, prW}, // Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2 {0x1B100, 0x1B122, prW}, // Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU + {0x1B132, 0x1B132, prW}, // Lo HIRAGANA LETTER SMALL KO {0x1B150, 0x1B152, prW}, // Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO + {0x1B155, 0x1B155, prW}, // Lo KATAKANA LETTER SMALL KO {0x1B164, 0x1B167, prW}, // Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N {0x1B170, 0x1B2FB, prW}, // Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB {0x1BC00, 0x1BC6A, prN}, // Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M @@ -2294,6 +2317,7 @@ var eastAsianWidth = [][3]int{ {0x1D200, 0x1D241, prN}, // So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 {0x1D242, 0x1D244, prN}, // Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME {0x1D245, 0x1D245, prN}, // So GREEK MUSICAL LEIMMA + {0x1D2C0, 0x1D2D3, prN}, // No [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN {0x1D2E0, 0x1D2F3, prN}, // No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN {0x1D300, 0x1D356, prN}, // So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING {0x1D360, 0x1D378, prN}, // No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE @@ -2353,11 +2377,14 @@ var eastAsianWidth = [][3]int{ {0x1DF00, 0x1DF09, prN}, // Ll [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK {0x1DF0A, 0x1DF0A, prN}, // Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK {0x1DF0B, 0x1DF1E, prN}, // Ll [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL + {0x1DF25, 0x1DF2A, prN}, // Ll [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK {0x1E000, 0x1E006, prN}, // Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE {0x1E008, 0x1E018, prN}, // Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU {0x1E01B, 0x1E021, prN}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI {0x1E023, 0x1E024, prN}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS {0x1E026, 0x1E02A, prN}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA + {0x1E030, 0x1E06D, prN}, // Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE + {0x1E08F, 0x1E08F, prN}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I {0x1E100, 0x1E12C, prN}, // Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W {0x1E130, 0x1E136, prN}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D {0x1E137, 0x1E13D, prN}, // Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER @@ -2370,6 +2397,10 @@ var eastAsianWidth = [][3]int{ {0x1E2EC, 0x1E2EF, prN}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI {0x1E2F0, 0x1E2F9, prN}, // Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE {0x1E2FF, 0x1E2FF, prN}, // Sc WANCHO NGUN SIGN + {0x1E4D0, 0x1E4EA, prN}, // Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL + {0x1E4EB, 0x1E4EB, prN}, // Lm NAG MUNDARI SIGN OJOD + {0x1E4EC, 0x1E4EF, prN}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH + {0x1E4F0, 0x1E4F9, prN}, // Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE {0x1E7E0, 0x1E7E6, prN}, // Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO {0x1E7E8, 0x1E7EB, prN}, // Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE {0x1E7ED, 0x1E7EE, prN}, // Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE @@ -2498,13 +2529,14 @@ var eastAsianWidth = [][3]int{ {0x1F6D0, 0x1F6D2, prW}, // So [3] PLACE OF WORSHIP..SHOPPING TROLLEY {0x1F6D3, 0x1F6D4, prN}, // So [2] STUPA..PAGODA {0x1F6D5, 0x1F6D7, prW}, // So [3] HINDU TEMPLE..ELEVATOR - {0x1F6DD, 0x1F6DF, prW}, // So [3] PLAYGROUND SLIDE..RING BUOY + {0x1F6DC, 0x1F6DF, prW}, // So [4] WIRELESS..RING BUOY {0x1F6E0, 0x1F6EA, prN}, // So [11] HAMMER AND WRENCH..NORTHEAST-POINTING AIRPLANE {0x1F6EB, 0x1F6EC, prW}, // So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING {0x1F6F0, 0x1F6F3, prN}, // So [4] SATELLITE..PASSENGER SHIP {0x1F6F4, 0x1F6FC, prW}, // So [9] SCOOTER..ROLLER SKATE - {0x1F700, 0x1F773, prN}, // So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE - {0x1F780, 0x1F7D8, prN}, // So [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE + {0x1F700, 0x1F776, prN}, // So [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE + {0x1F77B, 0x1F77F, prN}, // So [5] HAUMEA..ORCUS + {0x1F780, 0x1F7D9, prN}, // So [90] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NINE POINTED WHITE STAR {0x1F7E0, 0x1F7EB, prW}, // So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE {0x1F7F0, 0x1F7F0, prW}, // So HEAVY EQUALS SIGN {0x1F800, 0x1F80B, prN}, // So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD @@ -2521,22 +2553,20 @@ var eastAsianWidth = [][3]int{ {0x1F947, 0x1F9FF, prW}, // So [185] FIRST PLACE MEDAL..NAZAR AMULET {0x1FA00, 0x1FA53, prN}, // So [84] NEUTRAL CHESS KING..BLACK CHESS KNIGHT-BISHOP {0x1FA60, 0x1FA6D, prN}, // So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER - {0x1FA70, 0x1FA74, prW}, // So [5] BALLET SHOES..THONG SANDAL - {0x1FA78, 0x1FA7C, prW}, // So [5] DROP OF BLOOD..CRUTCH - {0x1FA80, 0x1FA86, prW}, // So [7] YO-YO..NESTING DOLLS - {0x1FA90, 0x1FAAC, prW}, // So [29] RINGED PLANET..HAMSA - {0x1FAB0, 0x1FABA, prW}, // So [11] FLY..NEST WITH EGGS - {0x1FAC0, 0x1FAC5, prW}, // So [6] ANATOMICAL HEART..PERSON WITH CROWN - {0x1FAD0, 0x1FAD9, prW}, // So [10] BLUEBERRIES..JAR - {0x1FAE0, 0x1FAE7, prW}, // So [8] MELTING FACE..BUBBLES - {0x1FAF0, 0x1FAF6, prW}, // So [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS + {0x1FA70, 0x1FA7C, prW}, // So [13] BALLET SHOES..CRUTCH + {0x1FA80, 0x1FA88, prW}, // So [9] YO-YO..FLUTE + {0x1FA90, 0x1FABD, prW}, // So [46] RINGED PLANET..WING + {0x1FABF, 0x1FAC5, prW}, // So [7] GOOSE..PERSON WITH CROWN + {0x1FACE, 0x1FADB, prW}, // So [14] MOOSE..PEA POD + {0x1FAE0, 0x1FAE8, prW}, // So [9] MELTING FACE..SHAKING FACE + {0x1FAF0, 0x1FAF8, prW}, // So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND {0x1FB00, 0x1FB92, prN}, // So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK {0x1FB94, 0x1FBCA, prN}, // So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON {0x1FBF0, 0x1FBF9, prN}, // Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE {0x20000, 0x2A6DF, prW}, // Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF {0x2A6E0, 0x2A6FF, prW}, // Cn [32] .. - {0x2A700, 0x2B738, prW}, // Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738 - {0x2B739, 0x2B73F, prW}, // Cn [7] .. + {0x2A700, 0x2B739, prW}, // Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 + {0x2B73A, 0x2B73F, prW}, // Cn [6] .. {0x2B740, 0x2B81D, prW}, // Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D {0x2B81E, 0x2B81F, prW}, // Cn [2] .. {0x2B820, 0x2CEA1, prW}, // Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 @@ -2547,7 +2577,9 @@ var eastAsianWidth = [][3]int{ {0x2FA1E, 0x2FA1F, prW}, // Cn [2] .. {0x2FA20, 0x2FFFD, prW}, // Cn [1502] .. {0x30000, 0x3134A, prW}, // Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A - {0x3134B, 0x3FFFD, prW}, // Cn [60595] .. + {0x3134B, 0x3134F, prW}, // Cn [5] .. + {0x31350, 0x323AF, prW}, // Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF + {0x323B0, 0x3FFFD, prW}, // Cn [56398] .. {0xE0001, 0xE0001, prN}, // Cf LANGUAGE TAG {0xE0020, 0xE007F, prN}, // Cf [96] TAG SPACE..CANCEL TAG {0xE0100, 0xE01EF, prA}, // Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 diff --git a/emojipresentation.go b/emojipresentation.go index ba0e2b0..9b5f499 100644 --- a/emojipresentation.go +++ b/emojipresentation.go @@ -5,9 +5,9 @@ package uniseg // emojiPresentation are taken from // // and -// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt +// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt // ("Extended_Pictographic" only) -// on July 17, 2023. See https://www.unicode.org/license.html for the Unicode +// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode // license agreement. var emojiPresentation = [][3]int{ {0x231A, 0x231B, prEmojiPresentation}, // E0.6 [2] (โŒš..โŒ›) watch..hourglass done @@ -211,6 +211,7 @@ var emojiPresentation = [][3]int{ {0x1F6D1, 0x1F6D2, prEmojiPresentation}, // E3.0 [2] (๐Ÿ›‘..๐Ÿ›’) stop sign..shopping cart {0x1F6D5, 0x1F6D5, prEmojiPresentation}, // E12.0 [1] (๐Ÿ›•) hindu temple {0x1F6D6, 0x1F6D7, prEmojiPresentation}, // E13.0 [2] (๐Ÿ›–..๐Ÿ›—) hut..elevator + {0x1F6DC, 0x1F6DC, prEmojiPresentation}, // E15.0 [1] (๐Ÿ›œ) wireless {0x1F6DD, 0x1F6DF, prEmojiPresentation}, // E14.0 [3] (๐Ÿ›..๐Ÿ›Ÿ) playground slide..ring buoy {0x1F6EB, 0x1F6EC, prEmojiPresentation}, // E1.0 [2] (๐Ÿ›ซ..๐Ÿ›ฌ) airplane departure..airplane arrival {0x1F6F4, 0x1F6F6, prEmojiPresentation}, // E3.0 [3] (๐Ÿ›ด..๐Ÿ›ถ) kick scooter..canoe @@ -267,19 +268,28 @@ var emojiPresentation = [][3]int{ {0x1F9E7, 0x1F9FF, prEmojiPresentation}, // E11.0 [25] (๐Ÿงง..๐Ÿงฟ) red envelope..nazar amulet {0x1FA70, 0x1FA73, prEmojiPresentation}, // E12.0 [4] (๐Ÿฉฐ..๐Ÿฉณ) ballet shoes..shorts {0x1FA74, 0x1FA74, prEmojiPresentation}, // E13.0 [1] (๐Ÿฉด) thong sandal + {0x1FA75, 0x1FA77, prEmojiPresentation}, // E15.0 [3] (๐Ÿฉต..๐Ÿฉท) light blue heart..pink heart {0x1FA78, 0x1FA7A, prEmojiPresentation}, // E12.0 [3] (๐Ÿฉธ..๐Ÿฉบ) drop of blood..stethoscope {0x1FA7B, 0x1FA7C, prEmojiPresentation}, // E14.0 [2] (๐Ÿฉป..๐Ÿฉผ) x-ray..crutch {0x1FA80, 0x1FA82, prEmojiPresentation}, // E12.0 [3] (๐Ÿช€..๐Ÿช‚) yo-yo..parachute {0x1FA83, 0x1FA86, prEmojiPresentation}, // E13.0 [4] (๐Ÿชƒ..๐Ÿช†) boomerang..nesting dolls + {0x1FA87, 0x1FA88, prEmojiPresentation}, // E15.0 [2] (๐Ÿช‡..๐Ÿชˆ) maracas..flute {0x1FA90, 0x1FA95, prEmojiPresentation}, // E12.0 [6] (๐Ÿช..๐Ÿช•) ringed planet..banjo {0x1FA96, 0x1FAA8, prEmojiPresentation}, // E13.0 [19] (๐Ÿช–..๐Ÿชจ) military helmet..rock {0x1FAA9, 0x1FAAC, prEmojiPresentation}, // E14.0 [4] (๐Ÿชฉ..๐Ÿชฌ) mirror ball..hamsa + {0x1FAAD, 0x1FAAF, prEmojiPresentation}, // E15.0 [3] (๐Ÿชญ..๐Ÿชฏ) folding hand fan..khanda {0x1FAB0, 0x1FAB6, prEmojiPresentation}, // E13.0 [7] (๐Ÿชฐ..๐Ÿชถ) fly..feather {0x1FAB7, 0x1FABA, prEmojiPresentation}, // E14.0 [4] (๐Ÿชท..๐Ÿชบ) lotus..nest with eggs + {0x1FABB, 0x1FABD, prEmojiPresentation}, // E15.0 [3] (๐Ÿชป..๐Ÿชฝ) hyacinth..wing + {0x1FABF, 0x1FABF, prEmojiPresentation}, // E15.0 [1] (๐Ÿชฟ) goose {0x1FAC0, 0x1FAC2, prEmojiPresentation}, // E13.0 [3] (๐Ÿซ€..๐Ÿซ‚) anatomical heart..people hugging {0x1FAC3, 0x1FAC5, prEmojiPresentation}, // E14.0 [3] (๐Ÿซƒ..๐Ÿซ…) pregnant man..person with crown + {0x1FACE, 0x1FACF, prEmojiPresentation}, // E15.0 [2] (๐ŸซŽ..๐Ÿซ) moose..donkey {0x1FAD0, 0x1FAD6, prEmojiPresentation}, // E13.0 [7] (๐Ÿซ..๐Ÿซ–) blueberries..teapot {0x1FAD7, 0x1FAD9, prEmojiPresentation}, // E14.0 [3] (๐Ÿซ—..๐Ÿซ™) pouring liquid..jar + {0x1FADA, 0x1FADB, prEmojiPresentation}, // E15.0 [2] (๐Ÿซš..๐Ÿซ›) ginger root..pea pod {0x1FAE0, 0x1FAE7, prEmojiPresentation}, // E14.0 [8] (๐Ÿซ ..๐Ÿซง) melting face..bubbles + {0x1FAE8, 0x1FAE8, prEmojiPresentation}, // E15.0 [1] (๐Ÿซจ) shaking face {0x1FAF0, 0x1FAF6, prEmojiPresentation}, // E14.0 [7] (๐Ÿซฐ..๐Ÿซถ) hand with index finger and thumb crossed..heart hands + {0x1FAF7, 0x1FAF8, prEmojiPresentation}, // E15.0 [2] (๐Ÿซท..๐Ÿซธ) leftwards pushing hand..rightwards pushing hand } diff --git a/gen_breaktest.go b/gen_breaktest.go index c00c368..6bfbeb5 100644 --- a/gen_breaktest.go +++ b/gen_breaktest.go @@ -32,7 +32,7 @@ import ( // We want to test against a specific version rather than the latest. When the // package is upgraded to a new version, change these to generate new tests. const ( - testCaseURL = `https://www.unicode.org/Public/14.0.0/ucd/auxiliary/%s.txt` + testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt` ) func main() { diff --git a/gen_properties.go b/gen_properties.go index 8aa5683..8992d2c 100644 --- a/gen_properties.go +++ b/gen_properties.go @@ -41,8 +41,8 @@ import ( // We want to test against a specific version rather than the latest. When the // package is upgraded to a new version, change these to generate new tests. const ( - propertyURL = `https://www.unicode.org/Public/14.0.0/ucd/%s.txt` - emojiURL = `https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt` + propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt` + emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt` ) // The regular expression for a line containing a code point range property. diff --git a/graphemebreak_test.go b/graphemebreak_test.go index 8af7849..0a8d3f2 100644 --- a/graphemebreak_test.go +++ b/graphemebreak_test.go @@ -3,8 +3,8 @@ package uniseg // graphemeBreakTestCases are Grapheme testcases taken from -// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt -// on July 17, 2023. See +// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakTest.txt +// on September 5, 2023. See // https://www.unicode.org/license.html for the Unicode license agreement. var graphemeBreakTestCases = []testCase{ {original: "\u0020\u0020", expected: [][]rune{{0x0020}, {0x0020}}}, // รท [0.2] SPACE (Other) รท [999.0] SPACE (Other) รท [0.3] diff --git a/graphemeproperties.go b/graphemeproperties.go index 791ca60..0aff4a6 100644 --- a/graphemeproperties.go +++ b/graphemeproperties.go @@ -3,11 +3,11 @@ package uniseg // graphemeCodePoints are taken from -// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt +// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt // and -// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt +// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt // ("Extended_Pictographic" only) -// on July 17, 2023. See https://www.unicode.org/license.html for the Unicode +// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode // license agreement. var graphemeCodePoints = [][3]int{ {0x0000, 0x0009, prControl}, // Cc [10] .. @@ -143,6 +143,7 @@ var graphemeCodePoints = [][3]int{ {0x0CCC, 0x0CCD, prExtend}, // Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA {0x0CD5, 0x0CD6, prExtend}, // Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK {0x0CE2, 0x0CE3, prExtend}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL + {0x0CF3, 0x0CF3, prSpacingMark}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT {0x0D00, 0x0D01, prExtend}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU {0x0D02, 0x0D03, prSpacingMark}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA {0x0D3B, 0x0D3C, prExtend}, // Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA @@ -172,7 +173,7 @@ var graphemeCodePoints = [][3]int{ {0x0EB1, 0x0EB1, prExtend}, // Mn LAO VOWEL SIGN MAI KAN {0x0EB3, 0x0EB3, prSpacingMark}, // Lo LAO VOWEL SIGN AM {0x0EB4, 0x0EBC, prExtend}, // Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO - {0x0EC8, 0x0ECD, prExtend}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA + {0x0EC8, 0x0ECE, prExtend}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN {0x0F18, 0x0F19, prExtend}, // Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS {0x0F35, 0x0F35, prExtend}, // Mn TIBETAN MARK NGAS BZUNG NYI ZLA {0x0F37, 0x0F37, prExtend}, // Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS @@ -1336,6 +1337,7 @@ var graphemeCodePoints = [][3]int{ {0x10AE5, 0x10AE6, prExtend}, // Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW {0x10D24, 0x10D27, prExtend}, // Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI {0x10EAB, 0x10EAC, prExtend}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK + {0x10EFD, 0x10EFF, prExtend}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA {0x10F46, 0x10F50, prExtend}, // Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW {0x10F82, 0x10F85, prExtend}, // Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW {0x11000, 0x11000, prSpacingMark}, // Mc BRAHMI SIGN CANDRABINDU @@ -1375,6 +1377,7 @@ var graphemeCodePoints = [][3]int{ {0x11235, 0x11235, prSpacingMark}, // Mc KHOJKI SIGN VIRAMA {0x11236, 0x11237, prExtend}, // Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA {0x1123E, 0x1123E, prExtend}, // Mn KHOJKI SIGN SUKUN + {0x11241, 0x11241, prExtend}, // Mn KHOJKI VOWEL SIGN VOCALIC R {0x112DF, 0x112DF, prExtend}, // Mn KHUDAWADI SIGN ANUSVARA {0x112E0, 0x112E2, prSpacingMark}, // Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II {0x112E3, 0x112EA, prExtend}, // Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA @@ -1494,7 +1497,18 @@ var graphemeCodePoints = [][3]int{ {0x11D97, 0x11D97, prExtend}, // Mn GUNJALA GONDI VIRAMA {0x11EF3, 0x11EF4, prExtend}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U {0x11EF5, 0x11EF6, prSpacingMark}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O - {0x13430, 0x13438, prControl}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT + {0x11F00, 0x11F01, prExtend}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA + {0x11F02, 0x11F02, prPrepend}, // Lo KAWI SIGN REPHA + {0x11F03, 0x11F03, prSpacingMark}, // Mc KAWI SIGN VISARGA + {0x11F34, 0x11F35, prSpacingMark}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA + {0x11F36, 0x11F3A, prExtend}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R + {0x11F3E, 0x11F3F, prSpacingMark}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI + {0x11F40, 0x11F40, prExtend}, // Mn KAWI VOWEL SIGN EU + {0x11F41, 0x11F41, prSpacingMark}, // Mc KAWI SIGN KILLER + {0x11F42, 0x11F42, prExtend}, // Mn KAWI CONJOINER + {0x13430, 0x1343F, prControl}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE + {0x13440, 0x13440, prExtend}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY + {0x13447, 0x13455, prExtend}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED {0x16AF0, 0x16AF4, prExtend}, // Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE {0x16B30, 0x16B36, prExtend}, // Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM {0x16F4F, 0x16F4F, prExtend}, // Mn MIAO SIGN CONSONANT MODIFIER BAR @@ -1527,9 +1541,11 @@ var graphemeCodePoints = [][3]int{ {0x1E01B, 0x1E021, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI {0x1E023, 0x1E024, prExtend}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS {0x1E026, 0x1E02A, prExtend}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA + {0x1E08F, 0x1E08F, prExtend}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I {0x1E130, 0x1E136, prExtend}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D {0x1E2AE, 0x1E2AE, prExtend}, // Mn TOTO SIGN RISING TONE {0x1E2EC, 0x1E2EF, prExtend}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI + {0x1E4EC, 0x1E4EF, prExtend}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH {0x1E8D0, 0x1E8D6, prExtend}, // Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS {0x1E944, 0x1E94A, prExtend}, // Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA {0x1F000, 0x1F003, prExtendedPictographic}, // E0.0 [4] (๐Ÿ€€..๐Ÿ€ƒ) MAHJONG TILE EAST WIND..MAHJONG TILE NORTH WIND @@ -1780,7 +1796,8 @@ var graphemeCodePoints = [][3]int{ {0x1F6D3, 0x1F6D4, prExtendedPictographic}, // E0.0 [2] (๐Ÿ›“..๐Ÿ›”) STUPA..PAGODA {0x1F6D5, 0x1F6D5, prExtendedPictographic}, // E12.0 [1] (๐Ÿ›•) hindu temple {0x1F6D6, 0x1F6D7, prExtendedPictographic}, // E13.0 [2] (๐Ÿ›–..๐Ÿ›—) hut..elevator - {0x1F6D8, 0x1F6DC, prExtendedPictographic}, // E0.0 [5] (๐Ÿ›˜..๐Ÿ›œ) .. + {0x1F6D8, 0x1F6DB, prExtendedPictographic}, // E0.0 [4] (๐Ÿ›˜..๐Ÿ››) .. + {0x1F6DC, 0x1F6DC, prExtendedPictographic}, // E15.0 [1] (๐Ÿ›œ) wireless {0x1F6DD, 0x1F6DF, prExtendedPictographic}, // E14.0 [3] (๐Ÿ›..๐Ÿ›Ÿ) playground slide..ring buoy {0x1F6E0, 0x1F6E5, prExtendedPictographic}, // E0.7 [6] (๐Ÿ› ๏ธ..๐Ÿ›ฅ๏ธ) hammer and wrench..motor boat {0x1F6E6, 0x1F6E8, prExtendedPictographic}, // E0.0 [3] (๐Ÿ›ฆ..๐Ÿ›จ) UP-POINTING MILITARY AIRPLANE..UP-POINTING SMALL AIRPLANE @@ -1797,7 +1814,7 @@ var graphemeCodePoints = [][3]int{ {0x1F6FA, 0x1F6FA, prExtendedPictographic}, // E12.0 [1] (๐Ÿ›บ) auto rickshaw {0x1F6FB, 0x1F6FC, prExtendedPictographic}, // E13.0 [2] (๐Ÿ›ป..๐Ÿ›ผ) pickup truck..roller skate {0x1F6FD, 0x1F6FF, prExtendedPictographic}, // E0.0 [3] (๐Ÿ›ฝ..๐Ÿ›ฟ) .. - {0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (๐Ÿด..๐Ÿฟ) .. + {0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (๐Ÿด..๐Ÿฟ) LOT OF FORTUNE..ORCUS {0x1F7D5, 0x1F7DF, prExtendedPictographic}, // E0.0 [11] (๐ŸŸ•..๐ŸŸŸ) CIRCLED TRIANGLE.. {0x1F7E0, 0x1F7EB, prExtendedPictographic}, // E12.0 [12] (๐ŸŸ ..๐ŸŸซ) orange circle..brown square {0x1F7EC, 0x1F7EF, prExtendedPictographic}, // E0.0 [4] (๐ŸŸฌ..๐ŸŸฏ) .. @@ -1856,30 +1873,37 @@ var graphemeCodePoints = [][3]int{ {0x1FA00, 0x1FA6F, prExtendedPictographic}, // E0.0 [112] (๐Ÿจ€..๐Ÿฉฏ) NEUTRAL CHESS KING.. {0x1FA70, 0x1FA73, prExtendedPictographic}, // E12.0 [4] (๐Ÿฉฐ..๐Ÿฉณ) ballet shoes..shorts {0x1FA74, 0x1FA74, prExtendedPictographic}, // E13.0 [1] (๐Ÿฉด) thong sandal - {0x1FA75, 0x1FA77, prExtendedPictographic}, // E0.0 [3] (๐Ÿฉต..๐Ÿฉท) .. + {0x1FA75, 0x1FA77, prExtendedPictographic}, // E15.0 [3] (๐Ÿฉต..๐Ÿฉท) light blue heart..pink heart {0x1FA78, 0x1FA7A, prExtendedPictographic}, // E12.0 [3] (๐Ÿฉธ..๐Ÿฉบ) drop of blood..stethoscope {0x1FA7B, 0x1FA7C, prExtendedPictographic}, // E14.0 [2] (๐Ÿฉป..๐Ÿฉผ) x-ray..crutch {0x1FA7D, 0x1FA7F, prExtendedPictographic}, // E0.0 [3] (๐Ÿฉฝ..๐Ÿฉฟ) .. {0x1FA80, 0x1FA82, prExtendedPictographic}, // E12.0 [3] (๐Ÿช€..๐Ÿช‚) yo-yo..parachute {0x1FA83, 0x1FA86, prExtendedPictographic}, // E13.0 [4] (๐Ÿชƒ..๐Ÿช†) boomerang..nesting dolls - {0x1FA87, 0x1FA8F, prExtendedPictographic}, // E0.0 [9] (๐Ÿช‡..๐Ÿช) .. + {0x1FA87, 0x1FA88, prExtendedPictographic}, // E15.0 [2] (๐Ÿช‡..๐Ÿชˆ) maracas..flute + {0x1FA89, 0x1FA8F, prExtendedPictographic}, // E0.0 [7] (๐Ÿช‰..๐Ÿช) .. {0x1FA90, 0x1FA95, prExtendedPictographic}, // E12.0 [6] (๐Ÿช..๐Ÿช•) ringed planet..banjo {0x1FA96, 0x1FAA8, prExtendedPictographic}, // E13.0 [19] (๐Ÿช–..๐Ÿชจ) military helmet..rock {0x1FAA9, 0x1FAAC, prExtendedPictographic}, // E14.0 [4] (๐Ÿชฉ..๐Ÿชฌ) mirror ball..hamsa - {0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E0.0 [3] (๐Ÿชญ..๐Ÿชฏ) .. + {0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E15.0 [3] (๐Ÿชญ..๐Ÿชฏ) folding hand fan..khanda {0x1FAB0, 0x1FAB6, prExtendedPictographic}, // E13.0 [7] (๐Ÿชฐ..๐Ÿชถ) fly..feather {0x1FAB7, 0x1FABA, prExtendedPictographic}, // E14.0 [4] (๐Ÿชท..๐Ÿชบ) lotus..nest with eggs - {0x1FABB, 0x1FABF, prExtendedPictographic}, // E0.0 [5] (๐Ÿชป..๐Ÿชฟ) .. + {0x1FABB, 0x1FABD, prExtendedPictographic}, // E15.0 [3] (๐Ÿชป..๐Ÿชฝ) hyacinth..wing + {0x1FABE, 0x1FABE, prExtendedPictographic}, // E0.0 [1] (๐Ÿชพ) + {0x1FABF, 0x1FABF, prExtendedPictographic}, // E15.0 [1] (๐Ÿชฟ) goose {0x1FAC0, 0x1FAC2, prExtendedPictographic}, // E13.0 [3] (๐Ÿซ€..๐Ÿซ‚) anatomical heart..people hugging {0x1FAC3, 0x1FAC5, prExtendedPictographic}, // E14.0 [3] (๐Ÿซƒ..๐Ÿซ…) pregnant man..person with crown - {0x1FAC6, 0x1FACF, prExtendedPictographic}, // E0.0 [10] (๐Ÿซ†..๐Ÿซ) .. + {0x1FAC6, 0x1FACD, prExtendedPictographic}, // E0.0 [8] (๐Ÿซ†..๐Ÿซ) .. + {0x1FACE, 0x1FACF, prExtendedPictographic}, // E15.0 [2] (๐ŸซŽ..๐Ÿซ) moose..donkey {0x1FAD0, 0x1FAD6, prExtendedPictographic}, // E13.0 [7] (๐Ÿซ..๐Ÿซ–) blueberries..teapot {0x1FAD7, 0x1FAD9, prExtendedPictographic}, // E14.0 [3] (๐Ÿซ—..๐Ÿซ™) pouring liquid..jar - {0x1FADA, 0x1FADF, prExtendedPictographic}, // E0.0 [6] (๐Ÿซš..๐ŸซŸ) .. + {0x1FADA, 0x1FADB, prExtendedPictographic}, // E15.0 [2] (๐Ÿซš..๐Ÿซ›) ginger root..pea pod + {0x1FADC, 0x1FADF, prExtendedPictographic}, // E0.0 [4] (๐Ÿซœ..๐ŸซŸ) .. {0x1FAE0, 0x1FAE7, prExtendedPictographic}, // E14.0 [8] (๐Ÿซ ..๐Ÿซง) melting face..bubbles - {0x1FAE8, 0x1FAEF, prExtendedPictographic}, // E0.0 [8] (๐Ÿซจ..๐Ÿซฏ) .. + {0x1FAE8, 0x1FAE8, prExtendedPictographic}, // E15.0 [1] (๐Ÿซจ) shaking face + {0x1FAE9, 0x1FAEF, prExtendedPictographic}, // E0.0 [7] (๐Ÿซฉ..๐Ÿซฏ) .. {0x1FAF0, 0x1FAF6, prExtendedPictographic}, // E14.0 [7] (๐Ÿซฐ..๐Ÿซถ) hand with index finger and thumb crossed..heart hands - {0x1FAF7, 0x1FAFF, prExtendedPictographic}, // E0.0 [9] (๐Ÿซท..๐Ÿซฟ) .. + {0x1FAF7, 0x1FAF8, prExtendedPictographic}, // E15.0 [2] (๐Ÿซท..๐Ÿซธ) leftwards pushing hand..rightwards pushing hand + {0x1FAF9, 0x1FAFF, prExtendedPictographic}, // E0.0 [7] (๐Ÿซน..๐Ÿซฟ) .. {0x1FC00, 0x1FFFD, prExtendedPictographic}, // E0.0[1022] (๐Ÿฐ€..๐Ÿฟฝ) .. {0xE0000, 0xE0000, prControl}, // Cn {0xE0001, 0xE0001, prControl}, // Cf LANGUAGE TAG diff --git a/linebreak_test.go b/linebreak_test.go index c0a100f..e5e5448 100644 --- a/linebreak_test.go +++ b/linebreak_test.go @@ -3,8 +3,8 @@ package uniseg // lineBreakTestCases are Grapheme testcases taken from -// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/LineBreakTest.txt -// on July 17, 2023. See +// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/LineBreakTest.txt +// on September 5, 2023. See // https://www.unicode.org/license.html for the Unicode license agreement. var lineBreakTestCases = []testCase{ {original: "\u0023\u0023", expected: [][]rune{{0x0023, 0x0023}}}, // ร— [0.3] NUMBER SIGN (AL) ร— [28.0] NUMBER SIGN (AL) รท [0.3] diff --git a/lineproperties.go b/lineproperties.go index be0f22c..ac7fac4 100644 --- a/lineproperties.go +++ b/lineproperties.go @@ -3,11 +3,11 @@ package uniseg // lineBreakCodePoints are taken from -// https://www.unicode.org/Public/14.0.0/ucd/LineBreak.txt +// https://www.unicode.org/Public/15.0.0/ucd/LineBreak.txt // and -// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt +// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt // ("Extended_Pictographic" only) -// on July 17, 2023. See https://www.unicode.org/license.html for the Unicode +// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode // license agreement. var lineBreakCodePoints = [][4]int{ {0x0000, 0x0008, prCM, gcCc}, // [9] .. @@ -439,6 +439,7 @@ var lineBreakCodePoints = [][4]int{ {0x0CE2, 0x0CE3, prCM, gcMn}, // [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL {0x0CE6, 0x0CEF, prNU, gcNd}, // [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE {0x0CF1, 0x0CF2, prAL, gcLo}, // [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA + {0x0CF3, 0x0CF3, prCM, gcMc}, // KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT {0x0D00, 0x0D01, prCM, gcMn}, // [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU {0x0D02, 0x0D03, prCM, gcMc}, // [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA {0x0D04, 0x0D0C, prAL, gcLo}, // [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L @@ -500,7 +501,7 @@ var lineBreakCodePoints = [][4]int{ {0x0EBD, 0x0EBD, prSA, gcLo}, // LAO SEMIVOWEL SIGN NYO {0x0EC0, 0x0EC4, prSA, gcLo}, // [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI {0x0EC6, 0x0EC6, prSA, gcLm}, // LAO KO LA - {0x0EC8, 0x0ECD, prSA, gcMn}, // [6] LAO TONE MAI EK..LAO NIGGAHITA + {0x0EC8, 0x0ECE, prSA, gcMn}, // [7] LAO TONE MAI EK..LAO YAMAKKAN {0x0ED0, 0x0ED9, prNU, gcNd}, // [10] LAO DIGIT ZERO..LAO DIGIT NINE {0x0EDC, 0x0EDF, prSA, gcLo}, // [4] LAO HO NO..LAO LETTER KHMU NYO {0x0F00, 0x0F00, prAL, gcLo}, // TIBETAN SYLLABLE OM @@ -813,7 +814,11 @@ var lineBreakCodePoints = [][4]int{ {0x1D79, 0x1D7F, prAL, gcLl}, // [7] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER UPSILON WITH STROKE {0x1D80, 0x1D9A, prAL, gcLl}, // [27] LATIN SMALL LETTER B WITH PALATAL HOOK..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK {0x1D9B, 0x1DBF, prAL, gcLm}, // [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA - {0x1DC0, 0x1DFF, prCM, gcMn}, // [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW + {0x1DC0, 0x1DCC, prCM, gcMn}, // [13] COMBINING DOTTED GRAVE ACCENT..COMBINING MACRON-BREVE + {0x1DCD, 0x1DCD, prGL, gcMn}, // COMBINING DOUBLE CIRCUMFLEX ABOVE + {0x1DCE, 0x1DFB, prCM, gcMn}, // [46] COMBINING OGONEK ABOVE..COMBINING DELETION MARK + {0x1DFC, 0x1DFC, prGL, gcMn}, // COMBINING DOUBLE INVERTED BREVE BELOW + {0x1DFD, 0x1DFF, prCM, gcMn}, // [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW {0x1E00, 0x1EFF, prAL, gcLC}, // [256] LATIN CAPITAL LETTER A WITH RING BELOW..LATIN SMALL LETTER Y WITH LOOP {0x1F00, 0x1F15, prAL, gcLC}, // [22] GREEK SMALL LETTER ALPHA WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA {0x1F18, 0x1F1D, prAL, gcLu}, // [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA @@ -889,7 +894,7 @@ var lineBreakCodePoints = [][4]int{ {0x2054, 0x2054, prAL, gcPc}, // INVERTED UNDERTIE {0x2055, 0x2055, prAL, gcPo}, // FLOWER PUNCTUATION MARK {0x2056, 0x2056, prBA, gcPo}, // THREE DOT PUNCTUATION - {0x2057, 0x2057, prAL, gcPo}, // QUADRUPLE PRIME + {0x2057, 0x2057, prPO, gcPo}, // QUADRUPLE PRIME {0x2058, 0x205B, prBA, gcPo}, // [4] FOUR DOT PUNCTUATION..FOUR DOT MARK {0x205C, 0x205C, prAL, gcPo}, // DOTTED CROSS {0x205D, 0x205E, prBA, gcPo}, // [2] TRICOLON..VERTICAL FOUR DOTS @@ -2751,6 +2756,7 @@ var lineBreakCodePoints = [][4]int{ {0x10EAB, 0x10EAC, prCM, gcMn}, // [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK {0x10EAD, 0x10EAD, prBA, gcPd}, // YEZIDI HYPHENATION MARK {0x10EB0, 0x10EB1, prAL, gcLo}, // [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE + {0x10EFD, 0x10EFF, prCM, gcMn}, // [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA {0x10F00, 0x10F1C, prAL, gcLo}, // [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL {0x10F1D, 0x10F26, prAL, gcNo}, // [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF {0x10F27, 0x10F27, prAL, gcLo}, // OLD SOGDIAN LIGATURE AYIN-DALETH @@ -2840,6 +2846,8 @@ var lineBreakCodePoints = [][4]int{ {0x1123B, 0x1123C, prBA, gcPo}, // [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK {0x1123D, 0x1123D, prAL, gcPo}, // KHOJKI ABBREVIATION SIGN {0x1123E, 0x1123E, prCM, gcMn}, // KHOJKI SIGN SUKUN + {0x1123F, 0x11240, prAL, gcLo}, // [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I + {0x11241, 0x11241, prCM, gcMn}, // KHOJKI VOWEL SIGN VOCALIC R {0x11280, 0x11286, prAL, gcLo}, // [7] MULTANI LETTER A..MULTANI LETTER GA {0x11288, 0x11288, prAL, gcLo}, // MULTANI LETTER GHA {0x1128A, 0x1128D, prAL, gcLo}, // [4] MULTANI LETTER CA..MULTANI LETTER JJA @@ -3013,6 +3021,7 @@ var lineBreakCodePoints = [][4]int{ {0x11AA1, 0x11AA2, prBA, gcPo}, // [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2 {0x11AB0, 0x11ABF, prAL, gcLo}, // [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA {0x11AC0, 0x11AF8, prAL, gcLo}, // [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL + {0x11B00, 0x11B09, prBB, gcPo}, // [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU {0x11C00, 0x11C08, prAL, gcLo}, // [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L {0x11C0A, 0x11C2E, prAL, gcLo}, // [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA {0x11C2F, 0x11C2F, prCM, gcMc}, // BHAIKSUKI VOWEL SIGN AA @@ -3059,6 +3068,20 @@ var lineBreakCodePoints = [][4]int{ {0x11EF3, 0x11EF4, prCM, gcMn}, // [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U {0x11EF5, 0x11EF6, prCM, gcMc}, // [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O {0x11EF7, 0x11EF8, prAL, gcPo}, // [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION + {0x11F00, 0x11F01, prCM, gcMn}, // [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA + {0x11F02, 0x11F02, prAL, gcLo}, // KAWI SIGN REPHA + {0x11F03, 0x11F03, prCM, gcMc}, // KAWI SIGN VISARGA + {0x11F04, 0x11F10, prAL, gcLo}, // [13] KAWI LETTER A..KAWI LETTER O + {0x11F12, 0x11F33, prAL, gcLo}, // [34] KAWI LETTER KA..KAWI LETTER JNYA + {0x11F34, 0x11F35, prCM, gcMc}, // [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA + {0x11F36, 0x11F3A, prCM, gcMn}, // [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R + {0x11F3E, 0x11F3F, prCM, gcMc}, // [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI + {0x11F40, 0x11F40, prCM, gcMn}, // KAWI VOWEL SIGN EU + {0x11F41, 0x11F41, prCM, gcMc}, // KAWI SIGN KILLER + {0x11F42, 0x11F42, prCM, gcMn}, // KAWI CONJOINER + {0x11F43, 0x11F44, prBA, gcPo}, // [2] KAWI DANDA..KAWI DOUBLE DANDA + {0x11F45, 0x11F4F, prID, gcPo}, // [11] KAWI PUNCTUATION SECTION MARKER..KAWI PUNCTUATION CLOSING SPIRAL + {0x11F50, 0x11F59, prNU, gcNd}, // [10] KAWI DIGIT ZERO..KAWI DIGIT NINE {0x11FB0, 0x11FB0, prAL, gcLo}, // LISU LETTER YHA {0x11FC0, 0x11FD4, prAL, gcNo}, // [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH {0x11FD5, 0x11FDC, prAL, gcSo}, // [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI @@ -3084,10 +3107,18 @@ var lineBreakCodePoints = [][4]int{ {0x1328A, 0x13378, prAL, gcLo}, // [239] EGYPTIAN HIEROGLYPH O037..EGYPTIAN HIEROGLYPH V011 {0x13379, 0x13379, prOP, gcLo}, // EGYPTIAN HIEROGLYPH V011A {0x1337A, 0x1337B, prCL, gcLo}, // [2] EGYPTIAN HIEROGLYPH V011B..EGYPTIAN HIEROGLYPH V011C - {0x1337C, 0x1342E, prAL, gcLo}, // [179] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH AA032 + {0x1337C, 0x1342F, prAL, gcLo}, // [180] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH V011D {0x13430, 0x13436, prGL, gcCf}, // [7] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE {0x13437, 0x13437, prOP, gcCf}, // EGYPTIAN HIEROGLYPH BEGIN SEGMENT {0x13438, 0x13438, prCL, gcCf}, // EGYPTIAN HIEROGLYPH END SEGMENT + {0x13439, 0x1343B, prGL, gcCf}, // [3] EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM + {0x1343C, 0x1343C, prOP, gcCf}, // EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE + {0x1343D, 0x1343D, prCL, gcCf}, // EGYPTIAN HIEROGLYPH END ENCLOSURE + {0x1343E, 0x1343E, prOP, gcCf}, // EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE + {0x1343F, 0x1343F, prCL, gcCf}, // EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE + {0x13440, 0x13440, prCM, gcMn}, // EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY + {0x13441, 0x13446, prAL, gcLo}, // [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN + {0x13447, 0x13455, prCM, gcMn}, // [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED {0x14400, 0x145CD, prAL, gcLo}, // [462] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A409 {0x145CE, 0x145CE, prOP, gcLo}, // ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK {0x145CF, 0x145CF, prCL, gcLo}, // ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK @@ -3137,7 +3168,9 @@ var lineBreakCodePoints = [][4]int{ {0x1AFFD, 0x1AFFE, prAL, gcLm}, // [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 {0x1B000, 0x1B0FF, prID, gcLo}, // [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2 {0x1B100, 0x1B122, prID, gcLo}, // [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU + {0x1B132, 0x1B132, prCJ, gcLo}, // HIRAGANA LETTER SMALL KO {0x1B150, 0x1B152, prCJ, gcLo}, // [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO + {0x1B155, 0x1B155, prCJ, gcLo}, // KATAKANA LETTER SMALL KO {0x1B164, 0x1B167, prCJ, gcLo}, // [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N {0x1B170, 0x1B2FB, prID, gcLo}, // [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB {0x1BC00, 0x1BC6A, prAL, gcLo}, // [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M @@ -3168,6 +3201,7 @@ var lineBreakCodePoints = [][4]int{ {0x1D200, 0x1D241, prAL, gcSo}, // [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 {0x1D242, 0x1D244, prCM, gcMn}, // [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME {0x1D245, 0x1D245, prAL, gcSo}, // GREEK MUSICAL LEIMMA + {0x1D2C0, 0x1D2D3, prAL, gcNo}, // [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN {0x1D2E0, 0x1D2F3, prAL, gcNo}, // [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN {0x1D300, 0x1D356, prAL, gcSo}, // [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING {0x1D360, 0x1D378, prAL, gcNo}, // [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE @@ -3228,11 +3262,14 @@ var lineBreakCodePoints = [][4]int{ {0x1DF00, 0x1DF09, prAL, gcLl}, // [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK {0x1DF0A, 0x1DF0A, prAL, gcLo}, // LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK {0x1DF0B, 0x1DF1E, prAL, gcLl}, // [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL + {0x1DF25, 0x1DF2A, prAL, gcLl}, // [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK {0x1E000, 0x1E006, prCM, gcMn}, // [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE {0x1E008, 0x1E018, prCM, gcMn}, // [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU {0x1E01B, 0x1E021, prCM, gcMn}, // [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI {0x1E023, 0x1E024, prCM, gcMn}, // [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS {0x1E026, 0x1E02A, prCM, gcMn}, // [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA + {0x1E030, 0x1E06D, prAL, gcLm}, // [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE + {0x1E08F, 0x1E08F, prCM, gcMn}, // COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I {0x1E100, 0x1E12C, prAL, gcLo}, // [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W {0x1E130, 0x1E136, prCM, gcMn}, // [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D {0x1E137, 0x1E13D, prAL, gcLm}, // [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER @@ -3245,6 +3282,10 @@ var lineBreakCodePoints = [][4]int{ {0x1E2EC, 0x1E2EF, prCM, gcMn}, // [4] WANCHO TONE TUP..WANCHO TONE KOINI {0x1E2F0, 0x1E2F9, prNU, gcNd}, // [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE {0x1E2FF, 0x1E2FF, prPR, gcSc}, // WANCHO NGUN SIGN + {0x1E4D0, 0x1E4EA, prAL, gcLo}, // [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL + {0x1E4EB, 0x1E4EB, prAL, gcLm}, // NAG MUNDARI SIGN OJOD + {0x1E4EC, 0x1E4EF, prCM, gcMn}, // [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH + {0x1E4F0, 0x1E4F9, prNU, gcNd}, // [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE {0x1E7E0, 0x1E7E6, prAL, gcLo}, // [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO {0x1E7E8, 0x1E7EB, prAL, gcLo}, // [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE {0x1E7ED, 0x1E7EE, prAL, gcLo}, // [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE @@ -3412,16 +3453,18 @@ var lineBreakCodePoints = [][4]int{ {0x1F6C1, 0x1F6CB, prID, gcSo}, // [11] BATHTUB..COUCH AND LAMP {0x1F6CC, 0x1F6CC, prEB, gcSo}, // SLEEPING ACCOMMODATION {0x1F6CD, 0x1F6D7, prID, gcSo}, // [11] SHOPPING BAGS..ELEVATOR - {0x1F6D8, 0x1F6DC, prID, gcCn}, // [5] .. - {0x1F6DD, 0x1F6EC, prID, gcSo}, // [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING + {0x1F6D8, 0x1F6DB, prID, gcCn}, // [4] .. + {0x1F6DC, 0x1F6EC, prID, gcSo}, // [17] WIRELESS..AIRPLANE ARRIVING {0x1F6ED, 0x1F6EF, prID, gcCn}, // [3] .. {0x1F6F0, 0x1F6FC, prID, gcSo}, // [13] SATELLITE..ROLLER SKATE {0x1F6FD, 0x1F6FF, prID, gcCn}, // [3] .. {0x1F700, 0x1F773, prAL, gcSo}, // [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE - {0x1F774, 0x1F77F, prID, gcCn}, // [12] .. + {0x1F774, 0x1F776, prID, gcSo}, // [3] LOT OF FORTUNE..LUNAR ECLIPSE + {0x1F777, 0x1F77A, prID, gcCn}, // [4] .. + {0x1F77B, 0x1F77F, prID, gcSo}, // [5] HAUMEA..ORCUS {0x1F780, 0x1F7D4, prAL, gcSo}, // [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR - {0x1F7D5, 0x1F7D8, prID, gcSo}, // [4] CIRCLED TRIANGLE..NEGATIVE CIRCLED SQUARE - {0x1F7D9, 0x1F7DF, prID, gcCn}, // [7] .. + {0x1F7D5, 0x1F7D9, prID, gcSo}, // [5] CIRCLED TRIANGLE..NINE POINTED WHITE STAR + {0x1F7DA, 0x1F7DF, prID, gcCn}, // [6] .. {0x1F7E0, 0x1F7EB, prID, gcSo}, // [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE {0x1F7EC, 0x1F7EF, prID, gcCn}, // [4] .. {0x1F7F0, 0x1F7F0, prID, gcSo}, // HEAVY EQUALS SIGN @@ -3467,33 +3510,29 @@ var lineBreakCodePoints = [][4]int{ {0x1FA54, 0x1FA5F, prID, gcCn}, // [12] .. {0x1FA60, 0x1FA6D, prID, gcSo}, // [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER {0x1FA6E, 0x1FA6F, prID, gcCn}, // [2] .. - {0x1FA70, 0x1FA74, prID, gcSo}, // [5] BALLET SHOES..THONG SANDAL - {0x1FA75, 0x1FA77, prID, gcCn}, // [3] .. - {0x1FA78, 0x1FA7C, prID, gcSo}, // [5] DROP OF BLOOD..CRUTCH + {0x1FA70, 0x1FA7C, prID, gcSo}, // [13] BALLET SHOES..CRUTCH {0x1FA7D, 0x1FA7F, prID, gcCn}, // [3] .. - {0x1FA80, 0x1FA86, prID, gcSo}, // [7] YO-YO..NESTING DOLLS - {0x1FA87, 0x1FA8F, prID, gcCn}, // [9] .. - {0x1FA90, 0x1FAAC, prID, gcSo}, // [29] RINGED PLANET..HAMSA - {0x1FAAD, 0x1FAAF, prID, gcCn}, // [3] .. - {0x1FAB0, 0x1FABA, prID, gcSo}, // [11] FLY..NEST WITH EGGS - {0x1FABB, 0x1FABF, prID, gcCn}, // [5] .. - {0x1FAC0, 0x1FAC2, prID, gcSo}, // [3] ANATOMICAL HEART..PEOPLE HUGGING + {0x1FA80, 0x1FA88, prID, gcSo}, // [9] YO-YO..FLUTE + {0x1FA89, 0x1FA8F, prID, gcCn}, // [7] .. + {0x1FA90, 0x1FABD, prID, gcSo}, // [46] RINGED PLANET..WING + {0x1FABE, 0x1FABE, prID, gcCn}, // + {0x1FABF, 0x1FAC2, prID, gcSo}, // [4] GOOSE..PEOPLE HUGGING {0x1FAC3, 0x1FAC5, prEB, gcSo}, // [3] PREGNANT MAN..PERSON WITH CROWN - {0x1FAC6, 0x1FACF, prID, gcCn}, // [10] .. - {0x1FAD0, 0x1FAD9, prID, gcSo}, // [10] BLUEBERRIES..JAR - {0x1FADA, 0x1FADF, prID, gcCn}, // [6] .. - {0x1FAE0, 0x1FAE7, prID, gcSo}, // [8] MELTING FACE..BUBBLES - {0x1FAE8, 0x1FAEF, prID, gcCn}, // [8] .. - {0x1FAF0, 0x1FAF6, prEB, gcSo}, // [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS - {0x1FAF7, 0x1FAFF, prID, gcCn}, // [9] .. + {0x1FAC6, 0x1FACD, prID, gcCn}, // [8] .. + {0x1FACE, 0x1FADB, prID, gcSo}, // [14] MOOSE..PEA POD + {0x1FADC, 0x1FADF, prID, gcCn}, // [4] .. + {0x1FAE0, 0x1FAE8, prID, gcSo}, // [9] MELTING FACE..SHAKING FACE + {0x1FAE9, 0x1FAEF, prID, gcCn}, // [7] .. + {0x1FAF0, 0x1FAF8, prEB, gcSo}, // [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND + {0x1FAF9, 0x1FAFF, prID, gcCn}, // [7] .. {0x1FB00, 0x1FB92, prAL, gcSo}, // [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK {0x1FB94, 0x1FBCA, prAL, gcSo}, // [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON {0x1FBF0, 0x1FBF9, prNU, gcNd}, // [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE {0x1FC00, 0x1FFFD, prID, gcCn}, // [1022] .. {0x20000, 0x2A6DF, prID, gcLo}, // [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF {0x2A6E0, 0x2A6FF, prID, gcCn}, // [32] .. - {0x2A700, 0x2B738, prID, gcLo}, // [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738 - {0x2B739, 0x2B73F, prID, gcCn}, // [7] .. + {0x2A700, 0x2B739, prID, gcLo}, // [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 + {0x2B73A, 0x2B73F, prID, gcCn}, // [6] .. {0x2B740, 0x2B81D, prID, gcLo}, // [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D {0x2B81E, 0x2B81F, prID, gcCn}, // [2] .. {0x2B820, 0x2CEA1, prID, gcLo}, // [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 @@ -3504,7 +3543,9 @@ var lineBreakCodePoints = [][4]int{ {0x2FA1E, 0x2FA1F, prID, gcCn}, // [2] .. {0x2FA20, 0x2FFFD, prID, gcCn}, // [1502] .. {0x30000, 0x3134A, prID, gcLo}, // [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A - {0x3134B, 0x3FFFD, prID, gcCn}, // [60595] .. + {0x3134B, 0x3134F, prID, gcCn}, // [5] .. + {0x31350, 0x323AF, prID, gcLo}, // [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF + {0x323B0, 0x3FFFD, prID, gcCn}, // [56398] .. {0xE0001, 0xE0001, prCM, gcCf}, // LANGUAGE TAG {0xE0020, 0xE007F, prCM, gcCf}, // [96] TAG SPACE..CANCEL TAG {0xE0100, 0xE01EF, prCM, gcMn}, // [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 diff --git a/sentencebreak_test.go b/sentencebreak_test.go index 31a1dbc..6afacdb 100644 --- a/sentencebreak_test.go +++ b/sentencebreak_test.go @@ -3,8 +3,8 @@ package uniseg // sentenceBreakTestCases are Grapheme testcases taken from -// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakTest.txt -// on July 17, 2023. See +// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/SentenceBreakTest.txt +// on September 5, 2023. See // https://www.unicode.org/license.html for the Unicode license agreement. var sentenceBreakTestCases = []testCase{ {original: "\u0001\u0001", expected: [][]rune{{0x0001, 0x0001}}}, // รท [0.2] (Other) ร— [998.0] (Other) รท [0.3] diff --git a/sentenceproperties.go b/sentenceproperties.go index eeb5f44..67717ec 100644 --- a/sentenceproperties.go +++ b/sentenceproperties.go @@ -3,11 +3,11 @@ package uniseg // sentenceBreakCodePoints are taken from -// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakProperty.txt +// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/SentenceBreakProperty.txt // and -// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt +// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt // ("Extended_Pictographic" only) -// on July 17, 2023. See https://www.unicode.org/license.html for the Unicode +// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode // license agreement. var sentenceBreakCodePoints = [][3]int{ {0x0009, 0x0009, prSp}, // Cc @@ -843,6 +843,7 @@ var sentenceBreakCodePoints = [][3]int{ {0x0CE2, 0x0CE3, prExtend}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL {0x0CE6, 0x0CEF, prNumeric}, // Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE {0x0CF1, 0x0CF2, prOLetter}, // Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA + {0x0CF3, 0x0CF3, prExtend}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT {0x0D00, 0x0D01, prExtend}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU {0x0D02, 0x0D03, prExtend}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA {0x0D04, 0x0D0C, prOLetter}, // Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L @@ -896,7 +897,7 @@ var sentenceBreakCodePoints = [][3]int{ {0x0EBD, 0x0EBD, prOLetter}, // Lo LAO SEMIVOWEL SIGN NYO {0x0EC0, 0x0EC4, prOLetter}, // Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI {0x0EC6, 0x0EC6, prOLetter}, // Lm LAO KO LA - {0x0EC8, 0x0ECD, prExtend}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA + {0x0EC8, 0x0ECE, prExtend}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN {0x0ED0, 0x0ED9, prNumeric}, // Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE {0x0EDC, 0x0EDF, prOLetter}, // Lo [4] LAO HO NO..LAO LETTER KHMU NYO {0x0F00, 0x0F00, prOLetter}, // Lo TIBETAN SYLLABLE OM @@ -958,7 +959,7 @@ var sentenceBreakCodePoints = [][3]int{ {0x10C7, 0x10C7, prUpper}, // L& GEORGIAN CAPITAL LETTER YN {0x10CD, 0x10CD, prUpper}, // L& GEORGIAN CAPITAL LETTER AEN {0x10D0, 0x10FA, prOLetter}, // L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN - {0x10FC, 0x10FC, prOLetter}, // Lm MODIFIER LETTER GEORGIAN NAR + {0x10FC, 0x10FC, prLower}, // Lm MODIFIER LETTER GEORGIAN NAR {0x10FD, 0x10FF, prOLetter}, // L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN {0x1100, 0x1248, prOLetter}, // Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA {0x124A, 0x124D, prOLetter}, // Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE @@ -2034,7 +2035,7 @@ var sentenceBreakCodePoints = [][3]int{ {0xA7D7, 0xA7D7, prLower}, // L& LATIN SMALL LETTER MIDDLE SCOTS S {0xA7D8, 0xA7D8, prUpper}, // L& LATIN CAPITAL LETTER SIGMOID S {0xA7D9, 0xA7D9, prLower}, // L& LATIN SMALL LETTER SIGMOID S - {0xA7F2, 0xA7F4, prOLetter}, // Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q + {0xA7F2, 0xA7F4, prLower}, // Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q {0xA7F5, 0xA7F5, prUpper}, // L& LATIN CAPITAL LETTER REVERSED HALF H {0xA7F6, 0xA7F6, prLower}, // L& LATIN SMALL LETTER REVERSED HALF H {0xA7F7, 0xA7F7, prOLetter}, // Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I @@ -2140,7 +2141,7 @@ var sentenceBreakCodePoints = [][3]int{ {0xAB30, 0xAB5A, prLower}, // L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG {0xAB5C, 0xAB5F, prLower}, // Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK {0xAB60, 0xAB68, prLower}, // L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE - {0xAB69, 0xAB69, prOLetter}, // Lm MODIFIER LETTER SMALL TURNED W + {0xAB69, 0xAB69, prLower}, // Lm MODIFIER LETTER SMALL TURNED W {0xAB70, 0xABBF, prLower}, // L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA {0xABC0, 0xABE2, prOLetter}, // Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM {0xABE3, 0xABE4, prExtend}, // Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP @@ -2334,6 +2335,7 @@ var sentenceBreakCodePoints = [][3]int{ {0x10E80, 0x10EA9, prOLetter}, // Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET {0x10EAB, 0x10EAC, prExtend}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK {0x10EB0, 0x10EB1, prOLetter}, // Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE + {0x10EFD, 0x10EFF, prExtend}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA {0x10F00, 0x10F1C, prOLetter}, // Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL {0x10F27, 0x10F27, prOLetter}, // Lo OLD SOGDIAN LIGATURE AYIN-DALETH {0x10F30, 0x10F45, prOLetter}, // Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -2408,6 +2410,8 @@ var sentenceBreakCodePoints = [][3]int{ {0x11238, 0x11239, prSTerm}, // Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA {0x1123B, 0x1123C, prSTerm}, // Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK {0x1123E, 0x1123E, prExtend}, // Mn KHOJKI SIGN SUKUN + {0x1123F, 0x11240, prOLetter}, // Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I + {0x11241, 0x11241, prExtend}, // Mn KHOJKI VOWEL SIGN VOCALIC R {0x11280, 0x11286, prOLetter}, // Lo [7] MULTANI LETTER A..MULTANI LETTER GA {0x11288, 0x11288, prOLetter}, // Lo MULTANI LETTER GHA {0x1128A, 0x1128D, prOLetter}, // Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA @@ -2603,13 +2607,29 @@ var sentenceBreakCodePoints = [][3]int{ {0x11EF3, 0x11EF4, prExtend}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U {0x11EF5, 0x11EF6, prExtend}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O {0x11EF7, 0x11EF8, prSTerm}, // Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION + {0x11F00, 0x11F01, prExtend}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA + {0x11F02, 0x11F02, prOLetter}, // Lo KAWI SIGN REPHA + {0x11F03, 0x11F03, prExtend}, // Mc KAWI SIGN VISARGA + {0x11F04, 0x11F10, prOLetter}, // Lo [13] KAWI LETTER A..KAWI LETTER O + {0x11F12, 0x11F33, prOLetter}, // Lo [34] KAWI LETTER KA..KAWI LETTER JNYA + {0x11F34, 0x11F35, prExtend}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA + {0x11F36, 0x11F3A, prExtend}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R + {0x11F3E, 0x11F3F, prExtend}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI + {0x11F40, 0x11F40, prExtend}, // Mn KAWI VOWEL SIGN EU + {0x11F41, 0x11F41, prExtend}, // Mc KAWI SIGN KILLER + {0x11F42, 0x11F42, prExtend}, // Mn KAWI CONJOINER + {0x11F43, 0x11F44, prSTerm}, // Po [2] KAWI DANDA..KAWI DOUBLE DANDA + {0x11F50, 0x11F59, prNumeric}, // Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE {0x11FB0, 0x11FB0, prOLetter}, // Lo LISU LETTER YHA {0x12000, 0x12399, prOLetter}, // Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U {0x12400, 0x1246E, prOLetter}, // Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM {0x12480, 0x12543, prOLetter}, // Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU {0x12F90, 0x12FF0, prOLetter}, // Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 - {0x13000, 0x1342E, prOLetter}, // Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032 - {0x13430, 0x13438, prFormat}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT + {0x13000, 0x1342F, prOLetter}, // Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D + {0x13430, 0x1343F, prFormat}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE + {0x13440, 0x13440, prExtend}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY + {0x13441, 0x13446, prOLetter}, // Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN + {0x13447, 0x13455, prExtend}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED {0x14400, 0x14646, prOLetter}, // Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 {0x16800, 0x16A38, prOLetter}, // Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ {0x16A40, 0x16A5E, prOLetter}, // Lo [31] MRO LETTER TA..MRO LETTER TEK @@ -2648,7 +2668,9 @@ var sentenceBreakCodePoints = [][3]int{ {0x1AFF5, 0x1AFFB, prOLetter}, // Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 {0x1AFFD, 0x1AFFE, prOLetter}, // Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 {0x1B000, 0x1B122, prOLetter}, // Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU + {0x1B132, 0x1B132, prOLetter}, // Lo HIRAGANA LETTER SMALL KO {0x1B150, 0x1B152, prOLetter}, // Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO + {0x1B155, 0x1B155, prOLetter}, // Lo KATAKANA LETTER SMALL KO {0x1B164, 0x1B167, prOLetter}, // Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N {0x1B170, 0x1B2FB, prOLetter}, // Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB {0x1BC00, 0x1BC6A, prOLetter}, // Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M @@ -2738,11 +2760,14 @@ var sentenceBreakCodePoints = [][3]int{ {0x1DF00, 0x1DF09, prLower}, // L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK {0x1DF0A, 0x1DF0A, prOLetter}, // Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK {0x1DF0B, 0x1DF1E, prLower}, // L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL + {0x1DF25, 0x1DF2A, prLower}, // L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK {0x1E000, 0x1E006, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE {0x1E008, 0x1E018, prExtend}, // Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU {0x1E01B, 0x1E021, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI {0x1E023, 0x1E024, prExtend}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS {0x1E026, 0x1E02A, prExtend}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA + {0x1E030, 0x1E06D, prLower}, // Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE + {0x1E08F, 0x1E08F, prExtend}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I {0x1E100, 0x1E12C, prOLetter}, // Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W {0x1E130, 0x1E136, prExtend}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D {0x1E137, 0x1E13D, prOLetter}, // Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER @@ -2753,6 +2778,10 @@ var sentenceBreakCodePoints = [][3]int{ {0x1E2C0, 0x1E2EB, prOLetter}, // Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH {0x1E2EC, 0x1E2EF, prExtend}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI {0x1E2F0, 0x1E2F9, prNumeric}, // Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE + {0x1E4D0, 0x1E4EA, prOLetter}, // Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL + {0x1E4EB, 0x1E4EB, prOLetter}, // Lm NAG MUNDARI SIGN OJOD + {0x1E4EC, 0x1E4EF, prExtend}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH + {0x1E4F0, 0x1E4F9, prNumeric}, // Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE {0x1E7E0, 0x1E7E6, prOLetter}, // Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO {0x1E7E8, 0x1E7EB, prOLetter}, // Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE {0x1E7ED, 0x1E7EE, prOLetter}, // Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE @@ -2803,12 +2832,13 @@ var sentenceBreakCodePoints = [][3]int{ {0x1F676, 0x1F678, prClose}, // So [3] SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT..SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT {0x1FBF0, 0x1FBF9, prNumeric}, // Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE {0x20000, 0x2A6DF, prOLetter}, // Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF - {0x2A700, 0x2B738, prOLetter}, // Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738 + {0x2A700, 0x2B739, prOLetter}, // Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 {0x2B740, 0x2B81D, prOLetter}, // Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D {0x2B820, 0x2CEA1, prOLetter}, // Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 {0x2CEB0, 0x2EBE0, prOLetter}, // Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 {0x2F800, 0x2FA1D, prOLetter}, // Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D {0x30000, 0x3134A, prOLetter}, // Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A + {0x31350, 0x323AF, prOLetter}, // Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF {0xE0001, 0xE0001, prFormat}, // Cf LANGUAGE TAG {0xE0020, 0xE007F, prExtend}, // Cf [96] TAG SPACE..CANCEL TAG {0xE0100, 0xE01EF, prExtend}, // Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 diff --git a/wordbreak_test.go b/wordbreak_test.go index c016df1..42b9b34 100644 --- a/wordbreak_test.go +++ b/wordbreak_test.go @@ -3,8 +3,8 @@ package uniseg // wordBreakTestCases are Grapheme testcases taken from -// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakTest.txt -// on July 17, 2023. See +// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/WordBreakTest.txt +// on September 5, 2023. See // https://www.unicode.org/license.html for the Unicode license agreement. var wordBreakTestCases = []testCase{ {original: "\u0001\u0001", expected: [][]rune{{0x0001}, {0x0001}}}, // รท [0.2] (Other) รท [999.0] (Other) รท [0.3] diff --git a/wordproperties.go b/wordproperties.go index c0380c1..277ca10 100644 --- a/wordproperties.go +++ b/wordproperties.go @@ -3,11 +3,11 @@ package uniseg // workBreakCodePoints are taken from -// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt +// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/WordBreakProperty.txt // and -// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt +// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt // ("Extended_Pictographic" only) -// on July 17, 2023. See https://www.unicode.org/license.html for the Unicode +// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode // license agreement. var workBreakCodePoints = [][3]int{ {0x000A, 0x000A, prLF}, // Cc @@ -318,6 +318,7 @@ var workBreakCodePoints = [][3]int{ {0x0CE2, 0x0CE3, prExtend}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL {0x0CE6, 0x0CEF, prNumeric}, // Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE {0x0CF1, 0x0CF2, prALetter}, // Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA + {0x0CF3, 0x0CF3, prExtend}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT {0x0D00, 0x0D01, prExtend}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU {0x0D02, 0x0D03, prExtend}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA {0x0D04, 0x0D0C, prALetter}, // Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L @@ -357,7 +358,7 @@ var workBreakCodePoints = [][3]int{ {0x0E50, 0x0E59, prNumeric}, // Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE {0x0EB1, 0x0EB1, prExtend}, // Mn LAO VOWEL SIGN MAI KAN {0x0EB4, 0x0EBC, prExtend}, // Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO - {0x0EC8, 0x0ECD, prExtend}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA + {0x0EC8, 0x0ECE, prExtend}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN {0x0ED0, 0x0ED9, prNumeric}, // Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE {0x0F00, 0x0F00, prALetter}, // Lo TIBETAN SYLLABLE OM {0x0F18, 0x0F19, prExtend}, // Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS @@ -1093,6 +1094,7 @@ var workBreakCodePoints = [][3]int{ {0x10E80, 0x10EA9, prALetter}, // Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET {0x10EAB, 0x10EAC, prExtend}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK {0x10EB0, 0x10EB1, prALetter}, // Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE + {0x10EFD, 0x10EFF, prExtend}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA {0x10F00, 0x10F1C, prALetter}, // Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL {0x10F27, 0x10F27, prALetter}, // Lo OLD SOGDIAN LIGATURE AYIN-DALETH {0x10F30, 0x10F45, prALetter}, // Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -1157,6 +1159,8 @@ var workBreakCodePoints = [][3]int{ {0x11235, 0x11235, prExtend}, // Mc KHOJKI SIGN VIRAMA {0x11236, 0x11237, prExtend}, // Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA {0x1123E, 0x1123E, prExtend}, // Mn KHOJKI SIGN SUKUN + {0x1123F, 0x11240, prALetter}, // Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I + {0x11241, 0x11241, prExtend}, // Mn KHOJKI VOWEL SIGN VOCALIC R {0x11280, 0x11286, prALetter}, // Lo [7] MULTANI LETTER A..MULTANI LETTER GA {0x11288, 0x11288, prALetter}, // Lo MULTANI LETTER GHA {0x1128A, 0x1128D, prALetter}, // Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA @@ -1337,13 +1341,28 @@ var workBreakCodePoints = [][3]int{ {0x11EE0, 0x11EF2, prALetter}, // Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA {0x11EF3, 0x11EF4, prExtend}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U {0x11EF5, 0x11EF6, prExtend}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O + {0x11F00, 0x11F01, prExtend}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA + {0x11F02, 0x11F02, prALetter}, // Lo KAWI SIGN REPHA + {0x11F03, 0x11F03, prExtend}, // Mc KAWI SIGN VISARGA + {0x11F04, 0x11F10, prALetter}, // Lo [13] KAWI LETTER A..KAWI LETTER O + {0x11F12, 0x11F33, prALetter}, // Lo [34] KAWI LETTER KA..KAWI LETTER JNYA + {0x11F34, 0x11F35, prExtend}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA + {0x11F36, 0x11F3A, prExtend}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R + {0x11F3E, 0x11F3F, prExtend}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI + {0x11F40, 0x11F40, prExtend}, // Mn KAWI VOWEL SIGN EU + {0x11F41, 0x11F41, prExtend}, // Mc KAWI SIGN KILLER + {0x11F42, 0x11F42, prExtend}, // Mn KAWI CONJOINER + {0x11F50, 0x11F59, prNumeric}, // Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE {0x11FB0, 0x11FB0, prALetter}, // Lo LISU LETTER YHA {0x12000, 0x12399, prALetter}, // Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U {0x12400, 0x1246E, prALetter}, // Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM {0x12480, 0x12543, prALetter}, // Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU {0x12F90, 0x12FF0, prALetter}, // Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 - {0x13000, 0x1342E, prALetter}, // Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032 - {0x13430, 0x13438, prFormat}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT + {0x13000, 0x1342F, prALetter}, // Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D + {0x13430, 0x1343F, prFormat}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE + {0x13440, 0x13440, prExtend}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY + {0x13441, 0x13446, prALetter}, // Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN + {0x13447, 0x13455, prExtend}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED {0x14400, 0x14646, prALetter}, // Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 {0x16800, 0x16A38, prALetter}, // Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ {0x16A40, 0x16A5E, prALetter}, // Lo [31] MRO LETTER TA..MRO LETTER TEK @@ -1374,6 +1393,7 @@ var workBreakCodePoints = [][3]int{ {0x1AFFD, 0x1AFFE, prKatakana}, // Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 {0x1B000, 0x1B000, prKatakana}, // Lo KATAKANA LETTER ARCHAIC E {0x1B120, 0x1B122, prKatakana}, // Lo [3] KATAKANA LETTER ARCHAIC YI..KATAKANA LETTER ARCHAIC WU + {0x1B155, 0x1B155, prKatakana}, // Lo KATAKANA LETTER SMALL KO {0x1B164, 0x1B167, prKatakana}, // Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N {0x1BC00, 0x1BC6A, prALetter}, // Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M {0x1BC70, 0x1BC7C, prALetter}, // Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK @@ -1431,11 +1451,14 @@ var workBreakCodePoints = [][3]int{ {0x1DF00, 0x1DF09, prALetter}, // L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK {0x1DF0A, 0x1DF0A, prALetter}, // Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK {0x1DF0B, 0x1DF1E, prALetter}, // L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL + {0x1DF25, 0x1DF2A, prALetter}, // L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK {0x1E000, 0x1E006, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE {0x1E008, 0x1E018, prExtend}, // Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU {0x1E01B, 0x1E021, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI {0x1E023, 0x1E024, prExtend}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS {0x1E026, 0x1E02A, prExtend}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA + {0x1E030, 0x1E06D, prALetter}, // Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE + {0x1E08F, 0x1E08F, prExtend}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I {0x1E100, 0x1E12C, prALetter}, // Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W {0x1E130, 0x1E136, prExtend}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D {0x1E137, 0x1E13D, prALetter}, // Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER @@ -1446,6 +1469,10 @@ var workBreakCodePoints = [][3]int{ {0x1E2C0, 0x1E2EB, prALetter}, // Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH {0x1E2EC, 0x1E2EF, prExtend}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI {0x1E2F0, 0x1E2F9, prNumeric}, // Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE + {0x1E4D0, 0x1E4EA, prALetter}, // Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL + {0x1E4EB, 0x1E4EB, prALetter}, // Lm NAG MUNDARI SIGN OJOD + {0x1E4EC, 0x1E4EF, prExtend}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH + {0x1E4F0, 0x1E4F9, prNumeric}, // Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE {0x1E7E0, 0x1E7E6, prALetter}, // Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO {0x1E7E8, 0x1E7EB, prALetter}, // Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE {0x1E7ED, 0x1E7EE, prALetter}, // Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE @@ -1740,7 +1767,8 @@ var workBreakCodePoints = [][3]int{ {0x1F6D3, 0x1F6D4, prExtendedPictographic}, // E0.0 [2] (๐Ÿ›“..๐Ÿ›”) STUPA..PAGODA {0x1F6D5, 0x1F6D5, prExtendedPictographic}, // E12.0 [1] (๐Ÿ›•) hindu temple {0x1F6D6, 0x1F6D7, prExtendedPictographic}, // E13.0 [2] (๐Ÿ›–..๐Ÿ›—) hut..elevator - {0x1F6D8, 0x1F6DC, prExtendedPictographic}, // E0.0 [5] (๐Ÿ›˜..๐Ÿ›œ) .. + {0x1F6D8, 0x1F6DB, prExtendedPictographic}, // E0.0 [4] (๐Ÿ›˜..๐Ÿ››) .. + {0x1F6DC, 0x1F6DC, prExtendedPictographic}, // E15.0 [1] (๐Ÿ›œ) wireless {0x1F6DD, 0x1F6DF, prExtendedPictographic}, // E14.0 [3] (๐Ÿ›..๐Ÿ›Ÿ) playground slide..ring buoy {0x1F6E0, 0x1F6E5, prExtendedPictographic}, // E0.7 [6] (๐Ÿ› ๏ธ..๐Ÿ›ฅ๏ธ) hammer and wrench..motor boat {0x1F6E6, 0x1F6E8, prExtendedPictographic}, // E0.0 [3] (๐Ÿ›ฆ..๐Ÿ›จ) UP-POINTING MILITARY AIRPLANE..UP-POINTING SMALL AIRPLANE @@ -1757,7 +1785,7 @@ var workBreakCodePoints = [][3]int{ {0x1F6FA, 0x1F6FA, prExtendedPictographic}, // E12.0 [1] (๐Ÿ›บ) auto rickshaw {0x1F6FB, 0x1F6FC, prExtendedPictographic}, // E13.0 [2] (๐Ÿ›ป..๐Ÿ›ผ) pickup truck..roller skate {0x1F6FD, 0x1F6FF, prExtendedPictographic}, // E0.0 [3] (๐Ÿ›ฝ..๐Ÿ›ฟ) .. - {0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (๐Ÿด..๐Ÿฟ) .. + {0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (๐Ÿด..๐Ÿฟ) LOT OF FORTUNE..ORCUS {0x1F7D5, 0x1F7DF, prExtendedPictographic}, // E0.0 [11] (๐ŸŸ•..๐ŸŸŸ) CIRCLED TRIANGLE.. {0x1F7E0, 0x1F7EB, prExtendedPictographic}, // E12.0 [12] (๐ŸŸ ..๐ŸŸซ) orange circle..brown square {0x1F7EC, 0x1F7EF, prExtendedPictographic}, // E0.0 [4] (๐ŸŸฌ..๐ŸŸฏ) .. @@ -1816,30 +1844,37 @@ var workBreakCodePoints = [][3]int{ {0x1FA00, 0x1FA6F, prExtendedPictographic}, // E0.0 [112] (๐Ÿจ€..๐Ÿฉฏ) NEUTRAL CHESS KING.. {0x1FA70, 0x1FA73, prExtendedPictographic}, // E12.0 [4] (๐Ÿฉฐ..๐Ÿฉณ) ballet shoes..shorts {0x1FA74, 0x1FA74, prExtendedPictographic}, // E13.0 [1] (๐Ÿฉด) thong sandal - {0x1FA75, 0x1FA77, prExtendedPictographic}, // E0.0 [3] (๐Ÿฉต..๐Ÿฉท) .. + {0x1FA75, 0x1FA77, prExtendedPictographic}, // E15.0 [3] (๐Ÿฉต..๐Ÿฉท) light blue heart..pink heart {0x1FA78, 0x1FA7A, prExtendedPictographic}, // E12.0 [3] (๐Ÿฉธ..๐Ÿฉบ) drop of blood..stethoscope {0x1FA7B, 0x1FA7C, prExtendedPictographic}, // E14.0 [2] (๐Ÿฉป..๐Ÿฉผ) x-ray..crutch {0x1FA7D, 0x1FA7F, prExtendedPictographic}, // E0.0 [3] (๐Ÿฉฝ..๐Ÿฉฟ) .. {0x1FA80, 0x1FA82, prExtendedPictographic}, // E12.0 [3] (๐Ÿช€..๐Ÿช‚) yo-yo..parachute {0x1FA83, 0x1FA86, prExtendedPictographic}, // E13.0 [4] (๐Ÿชƒ..๐Ÿช†) boomerang..nesting dolls - {0x1FA87, 0x1FA8F, prExtendedPictographic}, // E0.0 [9] (๐Ÿช‡..๐Ÿช) .. + {0x1FA87, 0x1FA88, prExtendedPictographic}, // E15.0 [2] (๐Ÿช‡..๐Ÿชˆ) maracas..flute + {0x1FA89, 0x1FA8F, prExtendedPictographic}, // E0.0 [7] (๐Ÿช‰..๐Ÿช) .. {0x1FA90, 0x1FA95, prExtendedPictographic}, // E12.0 [6] (๐Ÿช..๐Ÿช•) ringed planet..banjo {0x1FA96, 0x1FAA8, prExtendedPictographic}, // E13.0 [19] (๐Ÿช–..๐Ÿชจ) military helmet..rock {0x1FAA9, 0x1FAAC, prExtendedPictographic}, // E14.0 [4] (๐Ÿชฉ..๐Ÿชฌ) mirror ball..hamsa - {0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E0.0 [3] (๐Ÿชญ..๐Ÿชฏ) .. + {0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E15.0 [3] (๐Ÿชญ..๐Ÿชฏ) folding hand fan..khanda {0x1FAB0, 0x1FAB6, prExtendedPictographic}, // E13.0 [7] (๐Ÿชฐ..๐Ÿชถ) fly..feather {0x1FAB7, 0x1FABA, prExtendedPictographic}, // E14.0 [4] (๐Ÿชท..๐Ÿชบ) lotus..nest with eggs - {0x1FABB, 0x1FABF, prExtendedPictographic}, // E0.0 [5] (๐Ÿชป..๐Ÿชฟ) .. + {0x1FABB, 0x1FABD, prExtendedPictographic}, // E15.0 [3] (๐Ÿชป..๐Ÿชฝ) hyacinth..wing + {0x1FABE, 0x1FABE, prExtendedPictographic}, // E0.0 [1] (๐Ÿชพ) + {0x1FABF, 0x1FABF, prExtendedPictographic}, // E15.0 [1] (๐Ÿชฟ) goose {0x1FAC0, 0x1FAC2, prExtendedPictographic}, // E13.0 [3] (๐Ÿซ€..๐Ÿซ‚) anatomical heart..people hugging {0x1FAC3, 0x1FAC5, prExtendedPictographic}, // E14.0 [3] (๐Ÿซƒ..๐Ÿซ…) pregnant man..person with crown - {0x1FAC6, 0x1FACF, prExtendedPictographic}, // E0.0 [10] (๐Ÿซ†..๐Ÿซ) .. + {0x1FAC6, 0x1FACD, prExtendedPictographic}, // E0.0 [8] (๐Ÿซ†..๐Ÿซ) .. + {0x1FACE, 0x1FACF, prExtendedPictographic}, // E15.0 [2] (๐ŸซŽ..๐Ÿซ) moose..donkey {0x1FAD0, 0x1FAD6, prExtendedPictographic}, // E13.0 [7] (๐Ÿซ..๐Ÿซ–) blueberries..teapot {0x1FAD7, 0x1FAD9, prExtendedPictographic}, // E14.0 [3] (๐Ÿซ—..๐Ÿซ™) pouring liquid..jar - {0x1FADA, 0x1FADF, prExtendedPictographic}, // E0.0 [6] (๐Ÿซš..๐ŸซŸ) .. + {0x1FADA, 0x1FADB, prExtendedPictographic}, // E15.0 [2] (๐Ÿซš..๐Ÿซ›) ginger root..pea pod + {0x1FADC, 0x1FADF, prExtendedPictographic}, // E0.0 [4] (๐Ÿซœ..๐ŸซŸ) .. {0x1FAE0, 0x1FAE7, prExtendedPictographic}, // E14.0 [8] (๐Ÿซ ..๐Ÿซง) melting face..bubbles - {0x1FAE8, 0x1FAEF, prExtendedPictographic}, // E0.0 [8] (๐Ÿซจ..๐Ÿซฏ) .. + {0x1FAE8, 0x1FAE8, prExtendedPictographic}, // E15.0 [1] (๐Ÿซจ) shaking face + {0x1FAE9, 0x1FAEF, prExtendedPictographic}, // E0.0 [7] (๐Ÿซฉ..๐Ÿซฏ) .. {0x1FAF0, 0x1FAF6, prExtendedPictographic}, // E14.0 [7] (๐Ÿซฐ..๐Ÿซถ) hand with index finger and thumb crossed..heart hands - {0x1FAF7, 0x1FAFF, prExtendedPictographic}, // E0.0 [9] (๐Ÿซท..๐Ÿซฟ) .. + {0x1FAF7, 0x1FAF8, prExtendedPictographic}, // E15.0 [2] (๐Ÿซท..๐Ÿซธ) leftwards pushing hand..rightwards pushing hand + {0x1FAF9, 0x1FAFF, prExtendedPictographic}, // E0.0 [7] (๐Ÿซน..๐Ÿซฟ) .. {0x1FBF0, 0x1FBF9, prNumeric}, // Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE {0x1FC00, 0x1FFFD, prExtendedPictographic}, // E0.0[1022] (๐Ÿฐ€..๐Ÿฟฝ) .. {0xE0001, 0xE0001, prFormat}, // Cf LANGUAGE TAG