Skip to content

Commit 6298646

Browse files
committed
Auto merge of #122668 - Jules-Bertholet:titlecase, r=<try>
Add APIs for dealing with titlecase try-job: x86_64-gnu-aux
2 parents 562dee4 + 8d07261 commit 6298646

11 files changed

Lines changed: 594 additions & 106 deletions

File tree

library/alloc/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@
148148
#![feature(slice_range)]
149149
#![feature(std_internals)]
150150
#![feature(temporary_niche_types)]
151+
#![feature(titlecase)]
151152
#![feature(transmutability)]
152153
#![feature(trivial_clone)]
153154
#![feature(trusted_fused)]

library/core/src/char/methods.rs

Lines changed: 226 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -778,7 +778,72 @@ impl char {
778778
pub fn is_alphabetic(self) -> bool {
779779
match self {
780780
'a'..='z' | 'A'..='Z' => true,
781-
c => c > '\x7f' && unicode::Alphabetic(c),
781+
'\0'..='\u{A9}' => false,
782+
_ => unicode::Alphabetic(self),
783+
}
784+
}
785+
786+
/// Returns `true` if this `char` has the `Cased` property.
787+
/// A character is cased if and only if it is uppercase, lowercase, or titlecase.
788+
///
789+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
790+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
791+
///
792+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
793+
/// [ucd]: https://www.unicode.org/reports/tr44/
794+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
795+
///
796+
/// # Examples
797+
///
798+
/// Basic usage:
799+
///
800+
/// ```
801+
/// #![feature(titlecase)]
802+
/// assert!('A'.is_cased());
803+
/// assert!('a'.is_cased());
804+
/// assert!(!'京'.is_cased());
805+
/// ```
806+
#[must_use]
807+
#[unstable(feature = "titlecase", issue = "153892")]
808+
#[inline]
809+
pub fn is_cased(self) -> bool {
810+
match self {
811+
'a'..='z' | 'A'..='Z' => true,
812+
'\0'..='\u{A9}' => false,
813+
_ => unicode::Cased(self),
814+
}
815+
}
816+
817+
/// Returns the case of this character:
818+
/// [`Some(CharCase::Upper)`][`CharCase::Upper`] if [`self.is_uppercase()`][`char::is_uppercase`],
819+
/// [`Some(CharCase::Lower)`][`CharCase::Lower`] if [`self.is_lowercase()`][`char::is_lowercase`],
820+
/// [`Some(CharCase::Title)`][`CharCase::Title`] if [`self.is_titlecase()`][`char::is_titlecase`], and
821+
/// `None` if [`!self.is_cased()`][`char::is_cased`].
822+
///
823+
/// # Examples
824+
///
825+
/// ```
826+
/// #![feature(titlecase)]
827+
/// use core::char::CharCase;
828+
/// assert_eq!('a'.case(), Some(CharCase::Lower));
829+
/// assert_eq!('δ'.case(), Some(CharCase::Lower));
830+
/// assert_eq!('A'.case(), Some(CharCase::Upper));
831+
/// assert_eq!('Δ'.case(), Some(CharCase::Upper));
832+
/// assert_eq!('Dž'.case(), Some(CharCase::Title));
833+
/// assert_eq!('中'.case(), None);
834+
/// ```
835+
#[must_use]
836+
#[unstable(feature = "titlecase", issue = "153892")]
837+
#[inline]
838+
pub fn case(self) -> Option<CharCase> {
839+
match self {
840+
'a'..='z' => Some(CharCase::Lower),
841+
'A'..='Z' => Some(CharCase::Upper),
842+
'\0'..='\u{A9}' => None,
843+
_ if !unicode::Cased(self) => None,
844+
_ if unicode::Lowercase(self) => Some(CharCase::Lower),
845+
_ if unicode::Uppercase(self) => Some(CharCase::Upper),
846+
_ => Some(CharCase::Title),
782847
}
783848
}
784849

@@ -819,7 +884,42 @@ impl char {
819884
pub const fn is_lowercase(self) -> bool {
820885
match self {
821886
'a'..='z' => true,
822-
c => c > '\x7f' && unicode::Lowercase(c),
887+
'\0'..='\u{A9}' => false,
888+
_ => unicode::Lowercase(self),
889+
}
890+
}
891+
892+
/// Returns `true` if this `char` has the general category for titlecase letters.
893+
/// Conceptually, these characters consist of an uppercase portion followed by a lowercase portion.
894+
///
895+
/// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4
896+
/// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character
897+
/// Database][ucd] [`UnicodeData.txt`].
898+
///
899+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
900+
/// [ucd]: https://www.unicode.org/reports/tr44/
901+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
902+
///
903+
/// # Examples
904+
///
905+
/// Basic usage:
906+
///
907+
/// ```
908+
/// #![feature(titlecase)]
909+
/// assert!('Dž'.is_titlecase());
910+
/// assert!('ῼ'.is_titlecase());
911+
/// assert!(!'D'.is_titlecase());
912+
/// assert!(!'z'.is_titlecase());
913+
/// assert!(!'中'.is_titlecase());
914+
/// assert!(!' '.is_titlecase());
915+
/// ```
916+
#[must_use]
917+
#[unstable(feature = "titlecase", issue = "153892")]
918+
#[inline]
919+
pub fn is_titlecase(self) -> bool {
920+
match self {
921+
'\0'..='\u{01C4}' => false,
922+
_ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(),
823923
}
824924
}
825925

@@ -860,7 +960,8 @@ impl char {
860960
pub const fn is_uppercase(self) -> bool {
861961
match self {
862962
'A'..='Z' => true,
863-
c => c > '\x7f' && unicode::Uppercase(c),
963+
'\0'..='\u{BF}' => false,
964+
_ => unicode::Uppercase(self),
864965
}
865966
}
866967

@@ -893,7 +994,8 @@ impl char {
893994
pub const fn is_whitespace(self) -> bool {
894995
match self {
895996
' ' | '\x09'..='\x0d' => true,
896-
c => c > '\x7f' && unicode::White_Space(c),
997+
'\0'..='\u{84}' => false,
998+
_ => unicode::White_Space(self),
897999
}
8981000
}
8991001

@@ -920,10 +1022,10 @@ impl char {
9201022
#[stable(feature = "rust1", since = "1.0.0")]
9211023
#[inline]
9221024
pub fn is_alphanumeric(self) -> bool {
923-
if self.is_ascii() {
924-
self.is_ascii_alphanumeric()
925-
} else {
926-
unicode::Alphabetic(self) || unicode::N(self)
1025+
match self {
1026+
'a'..='z' | 'A'..='Z' | '0'..='9' => true,
1027+
'\0'..='\u{A9}' => false,
1028+
_ => unicode::Alphabetic(self) || unicode::N(self),
9271029
}
9281030
}
9291031

@@ -969,23 +1071,7 @@ impl char {
9691071
#[must_use]
9701072
#[inline]
9711073
pub(crate) fn is_grapheme_extended(self) -> bool {
972-
!self.is_ascii() && unicode::Grapheme_Extend(self)
973-
}
974-
975-
/// Returns `true` if this `char` has the `Cased` property.
976-
///
977-
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
978-
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
979-
///
980-
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
981-
/// [ucd]: https://www.unicode.org/reports/tr44/
982-
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
983-
#[must_use]
984-
#[inline]
985-
#[doc(hidden)]
986-
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987-
pub fn is_cased(self) -> bool {
988-
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
1074+
self > '\u{02FF}' && unicode::Grapheme_Extend(self)
9891075
}
9901076

9911077
/// Returns `true` if this `char` has the `Case_Ignorable` property.
@@ -1047,7 +1133,8 @@ impl char {
10471133
pub fn is_numeric(self) -> bool {
10481134
match self {
10491135
'0'..='9' => true,
1050-
c => c > '\x7f' && unicode::N(c),
1136+
'\0'..='\u{B1}' => false,
1137+
_ => unicode::N(self),
10511138
}
10521139
}
10531140

@@ -1110,17 +1197,123 @@ impl char {
11101197
/// // convert into themselves.
11111198
/// assert_eq!('山'.to_lowercase().to_string(), "山");
11121199
/// ```
1113-
#[must_use = "this returns the lowercase character as a new iterator, \
1200+
#[must_use = "this returns the lowercased character as a new iterator, \
11141201
without modifying the original"]
11151202
#[stable(feature = "rust1", since = "1.0.0")]
11161203
#[inline]
11171204
pub fn to_lowercase(self) -> ToLowercase {
11181205
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
11191206
}
11201207

1208+
/// Returns an iterator that yields the titlecase mapping of this `char` as one or more
1209+
/// `char`s.
1210+
///
1211+
/// This is usually, but not always, equivalent to the uppercase mapping
1212+
/// returned by [`Self::to_uppercase`]. Prefer this method when seeking to capitalize
1213+
/// Only The First Letter of a word, but use [`Self::to_uppercase`] for ALL CAPS.
1214+
///
1215+
/// If this `char` does not have a titlecase mapping, the iterator yields the same `char`.
1216+
///
1217+
/// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character
1218+
/// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`.
1219+
///
1220+
/// [ucd]: https://www.unicode.org/reports/tr44/
1221+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1222+
///
1223+
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1224+
/// the `char`(s) given by [`SpecialCasing.txt`].
1225+
///
1226+
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
1227+
///
1228+
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
1229+
/// is independent of context and language.
1230+
///
1231+
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
1232+
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
1233+
///
1234+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
1235+
///
1236+
/// # Examples
1237+
///
1238+
/// As an iterator:
1239+
///
1240+
/// ```
1241+
/// #![feature(titlecase)]
1242+
/// for c in 'ß'.to_titlecase() {
1243+
/// print!("{c}");
1244+
/// }
1245+
/// println!();
1246+
/// ```
1247+
///
1248+
/// Using `println!` directly:
1249+
///
1250+
/// ```
1251+
/// #![feature(titlecase)]
1252+
/// println!("{}", 'ß'.to_titlecase());
1253+
/// ```
1254+
///
1255+
/// Both are equivalent to:
1256+
///
1257+
/// ```
1258+
/// println!("Ss");
1259+
/// ```
1260+
///
1261+
/// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string):
1262+
///
1263+
/// ```
1264+
/// #![feature(titlecase)]
1265+
/// assert_eq!('c'.to_titlecase().to_string(), "C");
1266+
/// assert_eq!('dž'.to_titlecase().to_string(), "Dž");
1267+
/// assert_eq!('ῼ'.to_titlecase().to_string(), "ῼ");
1268+
///
1269+
/// // Sometimes the result is more than one character:
1270+
/// assert_eq!('ß'.to_titlecase().to_string(), "Ss");
1271+
///
1272+
/// // Characters that do not have separate cased forms
1273+
/// // convert into themselves.
1274+
/// assert_eq!('山'.to_titlecase().to_string(), "山");
1275+
/// ```
1276+
///
1277+
/// # Note on locale
1278+
///
1279+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
1280+
///
1281+
/// * 'Dotless': I / ı, sometimes written ï
1282+
/// * 'Dotted': İ / i
1283+
///
1284+
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
1285+
///
1286+
/// ```
1287+
/// #![feature(titlecase)]
1288+
/// let upper_i = 'i'.to_titlecase().to_string();
1289+
/// ```
1290+
///
1291+
/// The value of `upper_i` here relies on the language of the text: if we're
1292+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
1293+
/// be `"İ"`. `to_titlecase()` does not take this into account, and so:
1294+
///
1295+
/// ```
1296+
/// #![feature(titlecase)]
1297+
/// let upper_i = 'i'.to_titlecase().to_string();
1298+
///
1299+
/// assert_eq!(upper_i, "I");
1300+
/// ```
1301+
///
1302+
/// holds across languages.
1303+
#[must_use = "this returns the titlecased character as a new iterator, \
1304+
without modifying the original"]
1305+
#[unstable(feature = "titlecase", issue = "153892")]
1306+
#[inline]
1307+
pub fn to_titlecase(self) -> ToTitlecase {
1308+
ToTitlecase(CaseMappingIter::new(conversions::to_title(self)))
1309+
}
1310+
11211311
/// Returns an iterator that yields the uppercase mapping of this `char` as one or more
11221312
/// `char`s.
11231313
///
1314+
/// Prefer this method when converting a word into ALL CAPS, but consider [`Self::to_titlecase`]
1315+
/// instead if you seek to capitalize Only The First Letter.
1316+
///
11241317
/// If this `char` does not have an uppercase mapping, the iterator yields the same `char`.
11251318
///
11261319
/// If this `char` has a one-to-one uppercase mapping given by the [Unicode Character
@@ -1170,9 +1363,11 @@ impl char {
11701363
///
11711364
/// ```
11721365
/// assert_eq!('c'.to_uppercase().to_string(), "C");
1366+
/// assert_eq!('dž'.to_uppercase().to_string(), "DŽ");
11731367
///
11741368
/// // Sometimes the result is more than one character:
11751369
/// assert_eq!('ſt'.to_uppercase().to_string(), "ST");
1370+
/// assert_eq!('ῼ'.to_uppercase().to_string(), "ΩΙ");
11761371
///
11771372
/// // Characters that do not have both uppercase and lowercase
11781373
/// // convert into themselves.
@@ -1181,7 +1376,7 @@ impl char {
11811376
///
11821377
/// # Note on locale
11831378
///
1184-
/// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
1379+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
11851380
///
11861381
/// * 'Dotless': I / ı, sometimes written ï
11871382
/// * 'Dotted': İ / i
@@ -1193,7 +1388,7 @@ impl char {
11931388
/// ```
11941389
///
11951390
/// The value of `upper_i` here relies on the language of the text: if we're
1196-
/// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should
1391+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
11971392
/// be `"İ"`. `to_uppercase()` does not take this into account, and so:
11981393
///
11991394
/// ```
@@ -1203,7 +1398,7 @@ impl char {
12031398
/// ```
12041399
///
12051400
/// holds across languages.
1206-
#[must_use = "this returns the uppercase character as a new iterator, \
1401+
#[must_use = "this returns the uppercased character as a new iterator, \
12071402
without modifying the original"]
12081403
#[stable(feature = "rust1", since = "1.0.0")]
12091404
#[inline]
@@ -1446,7 +1641,7 @@ impl char {
14461641
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
14471642
#[inline]
14481643
pub const fn is_ascii_alphabetic(&self) -> bool {
1449-
matches!(*self, 'A'..='Z' | 'a'..='z')
1644+
matches!(*self, 'a'..='z' | 'A'..='Z')
14501645
}
14511646

14521647
/// Checks if the value is an ASCII uppercase character:

0 commit comments

Comments
 (0)