Skip to content

Commit 8e40640

Browse files
Support Grapheme_Cluster_Break=Prepend (#62)
These characters act like combining marks, except they go before the base character instead of after it.
1 parent afab363 commit 8e40640

File tree

4 files changed

+42
-9
lines changed

4 files changed

+42
-9
lines changed

scripts/unicode.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# - ReadMe.txt
2121
# - Scripts.txt
2222
# - UnicodeData.txt
23+
# - auxiliary/GraphemeBreakProperty.txt
2324
# - emoji/emoji-data.txt
2425
# - emoji/emoji-variation-sequences.txt
2526
# - extracted/DerivedGeneralCategory.txt
@@ -526,6 +527,21 @@ def load_zero_widths() -> list[bool]:
526527
zw_map[0x0891] = True
527528
zw_map[0x08E2] = True
528529

530+
# `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]`
531+
gcb_prepend = set()
532+
load_property(
533+
"auxiliary/GraphemeBreakProperty.txt",
534+
"Prepend",
535+
lambda cp: gcb_prepend.add(cp),
536+
)
537+
load_property(
538+
"PropList.txt",
539+
"Prepended_Concatenation_Mark",
540+
lambda cp: gcb_prepend.remove(cp),
541+
)
542+
for cp in gcb_prepend:
543+
zw_map[cp] = True
544+
529545
# HANGUL CHOSEONG FILLER
530546
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
531547
# zero width. However, the expected usage is to combine it with vowel or trailing jamo

src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
//! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
7474
//! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
7575
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
76-
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
76+
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
7777
//! have width 0.
7878
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
7979
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
@@ -113,6 +113,8 @@
113113
//! - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890),
114114
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
115115
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
116+
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D)
117+
//! with the [`Grapheme_Cluster_Break=Prepend`](https://www.unicode.org/reports/tr29/#Prepend) property, that are not also [`Prepended_Concatenation_Mark`]s.
116118
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
117119
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
118120
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
@@ -132,6 +134,7 @@
132134
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
133135
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
134136
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
137+
//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
135138
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
136139
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
137140
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862

src/tables.rs

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,7 +1162,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
11621162
],
11631163
[
11641164
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15,
1165-
0x44, 0x01, 0x54, 0x55, 0x51, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
1165+
0x44, 0x01, 0x54, 0x55, 0x41, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
11661166
0x55, 0x55,
11671167
],
11681168
[
@@ -1532,7 +1532,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
15321532
],
15331533
[
15341534
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00,
1535-
0x40, 0x55, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1535+
0x40, 0x05, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
15361536
0x55, 0x55,
15371537
],
15381538
[
@@ -1587,7 +1587,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
15871587
],
15881588
[
15891589
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x15,
1590-
0x44, 0x15, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1590+
0x04, 0x11, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
15911591
0x55, 0x55,
15921592
],
15931593
[
@@ -1596,12 +1596,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
15961596
0x55, 0x55,
15971597
],
15981598
[
1599-
0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x14,
1599+
0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x04,
16001600
0x40, 0x55, 0x15, 0x55, 0x55, 0x01, 0x40, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16011601
0x55, 0x55,
16021602
],
16031603
[
1604-
0x55, 0x55, 0x05, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1604+
0x55, 0x00, 0x00, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16051605
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16061606
0x55, 0x55,
16071607
],
@@ -1617,7 +1617,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
16171617
],
16181618
[
16191619
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x40, 0x45,
1620-
0x10, 0x00, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1620+
0x10, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16211621
0x55, 0x55,
16221622
],
16231623
[
@@ -1631,7 +1631,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
16311631
0x55, 0x55,
16321632
],
16331633
[
1634-
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
1634+
0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
16351635
0x55, 0x44, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16361636
0x55, 0x55,
16371637
],
@@ -1994,7 +1994,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19941994
/// Sorted list of codepoint ranges (inclusive)
19951995
/// that are zero-width but not `Joining_Type=Transparent`
19961996
/// FIXME: can we get better compression?
1997-
static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
1997+
static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 53] = [
19981998
([0x05, 0x06, 0x00], [0x05, 0x06, 0x00]),
19991999
([0x90, 0x08, 0x00], [0x91, 0x08, 0x00]),
20002000
([0xE2, 0x08, 0x00], [0xE2, 0x08, 0x00]),
@@ -2010,6 +2010,7 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
20102010
([0xCA, 0x0C, 0x00], [0xCB, 0x0C, 0x00]),
20112011
([0xD5, 0x0C, 0x00], [0xD6, 0x0C, 0x00]),
20122012
([0x3E, 0x0D, 0x00], [0x3E, 0x0D, 0x00]),
2013+
([0x4E, 0x0D, 0x00], [0x4E, 0x0D, 0x00]),
20132014
([0x57, 0x0D, 0x00], [0x57, 0x0D, 0x00]),
20142015
([0xCF, 0x0D, 0x00], [0xCF, 0x0D, 0x00]),
20152016
([0xDF, 0x0D, 0x00], [0xDF, 0x0D, 0x00]),
@@ -2028,12 +2029,19 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
20282029
([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]),
20292030
([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]),
20302031
([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]),
2032+
([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]),
20312033
([0x3E, 0x13, 0x01], [0x3E, 0x13, 0x01]),
20322034
([0x57, 0x13, 0x01], [0x57, 0x13, 0x01]),
20332035
([0xB0, 0x14, 0x01], [0xB0, 0x14, 0x01]),
20342036
([0xBD, 0x14, 0x01], [0xBD, 0x14, 0x01]),
20352037
([0xAF, 0x15, 0x01], [0xAF, 0x15, 0x01]),
20362038
([0x30, 0x19, 0x01], [0x30, 0x19, 0x01]),
2039+
([0x3F, 0x19, 0x01], [0x3F, 0x19, 0x01]),
2040+
([0x41, 0x19, 0x01], [0x41, 0x19, 0x01]),
2041+
([0x3A, 0x1A, 0x01], [0x3A, 0x1A, 0x01]),
2042+
([0x84, 0x1A, 0x01], [0x89, 0x1A, 0x01]),
2043+
([0x46, 0x1D, 0x01], [0x46, 0x1D, 0x01]),
2044+
([0x02, 0x1F, 0x01], [0x02, 0x1F, 0x01]),
20372045
([0x65, 0xD1, 0x01], [0x65, 0xD1, 0x01]),
20382046
([0x6E, 0xD1, 0x01], [0x72, 0xD1, 0x01]),
20392047
([0x00, 0x00, 0x0E], [0x00, 0x00, 0x0E]),

tests/tests.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@ fn test_prepended_concatenation_marks() {
110110
}
111111
}
112112

113+
#[test]
114+
fn test_gcb_prepend() {
115+
assert_width!("ൎഉ", 1, 1);
116+
assert_width!("\u{11A89}", 0, 0);
117+
}
118+
113119
#[test]
114120
fn test_interlinear_annotation_chars() {
115121
assert_width!('\u{FFF9}', Some(1), Some(1));

0 commit comments

Comments
 (0)