From fb4947786821a7587f3a3255b691e47dc3f04455 Mon Sep 17 00:00:00 2001 From: changjoon-park Date: Fri, 1 May 2026 15:32:33 +0900 Subject: [PATCH] Apply titlecase mapping in str.title() for uppercase digraphs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The uppercase/titlecase branch of PyStr::title() pushed characters unchanged when starting a new word, which left Latin Extended-B digraphs (U+01F1 'DZ', U+01C4 'DŽ', etc.) in their uppercase form instead of mapping them to their distinct titlecase counterparts (U+01F2 'Dz', U+01C5 'Dž'). For ASCII letters and characters where to_titlecase is identity this had no effect, hiding the bug for the common case. Mirror the lowercase branch — which already calls to_titlecase() when starting a new word — so both branches symmetrically apply the titlecase mapping. char::to_titlecase is identity for already-titlecase and ASCII-uppercase characters, so existing cases stay correct. Also unmasks test_unicodedata.UnicodeMiscTest.test_bug_4971, which asserts exactly this behavior (`'DŽ'.title() == 'Dž'` etc.) and was marked expectedFailure with reason `+ Dž`. Closes #7527 (the only example from that issue still failing on 3.14.4; the other four examples already pass on current main). 
--- Lib/test/test_unicodedata.py | 1 - crates/vm/src/builtins/str.rs | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index bb54818ecb0..5b99cd29d33 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -359,7 +359,6 @@ def test_bug_5828(self): [0] ) - @unittest.expectedFailure # TODO: RUSTPYTHON; + Dž def test_bug_4971(self): # LETTER DZ WITH CARON: DZ, Dz, dz self.assertEqual("\u01c4".title(), "\u01c5") diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 40c9905c1e3..8a50c4a5268 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -1053,7 +1053,7 @@ impl PyStr { if previous_is_cased { title.extend(c.to_lowercase()); } else { - title.push_char(c); + title.extend(c.to_titlecase()); } previous_is_cased = true; } else { @@ -2661,6 +2661,10 @@ mod tests { ("Greek Ωppercases ...", "greek ωppercases ..."), // spell-checker:disable-next-line ("Greek ῼitlecases ...", "greek ῳitlecases ..."), + // Latin Extended-B digraphs: uppercase forms map to titlecase forms + // (e.g. U+01F1 'DZ' -> U+01F2 'Dz', U+01C4 'DŽ' -> U+01C5 'Dž'). + ("\u{01F2}", "\u{01F1}"), + ("\u{01C5}", "\u{01C4}"), ]; for (title, input) in tests { assert_eq!(PyStr::from(input).title().as_str(), Ok(title));