unicode.h source code [llvm_runtimes/build/include/c++/v1/__format/unicode.h]

1	// -- C++ --
2	//===----------------------------------------------------------------------===//
3	//
4	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5	// See https://llvm.org/LICENSE.txt for license information.
6	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7	//
8	//===----------------------------------------------------------------------===//
9
10	#ifndef _LIBCPP___FORMAT_UNICODE_H
11	#define _LIBCPP___FORMAT_UNICODE_H
12
13	#include <__assert>
14	#include <__bit/countl.h>
15	#include <__concepts/same_as.h>
16	#include <__config>
17	#include <__format/extended_grapheme_cluster_table.h>
18	#include <__format/indic_conjunct_break_table.h>
19	#include <__iterator/concepts.h>
20	#include <__iterator/readable_traits.h> // iter_value_t
21	#include <__utility/unreachable.h>
22	#include <string_view>
23
24	#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
25	# pragma GCC system_header
26	#endif
27
28	_LIBCPP_BEGIN_NAMESPACE_STD
29
30	#if _LIBCPP_STD_VER >= 20
31
32	namespace __unicode {
33
// Helper struct for the result of a consume operation.
//
// The status value for a correct code point is 0. This allows a valid value to
// be used without masking.
// When the decoding fails it knows the number of code units affected. For the
// current use-cases that value is not needed, therefore it is not stored.
// The escape routine needs the number of code units for both a valid and
// invalid character and keeps track of it itself. Doing it in this result
// unconditionally would give some overhead when the value is unneeded.
//
// The code point and the status are bit-fields that together occupy exactly
// one char32_t; the static_assert below guards that layout.
struct __consume_result {
  // When __status == __ok it contains the decoded code point.
  // Else it contains the replacement character U+FFFD
  char32_t __code_point : 31;

  enum : char32_t {
    // Consumed a well-formed code point.
    __ok = 0,
    // Encountered invalid UTF-8
    __error = 1
  } __status : 1 {__ok};
};
static_assert(sizeof(__consume_result) == sizeof(char32_t));
56
57	# if _LIBCPP_HAS_UNICODE
58
59	/// Implements the grapheme cluster boundary rules
60	///
61	/// These rules are used to implement format's width estimation as stated in
62	/// [format.string.std]/11
63	///
64	/// The Standard refers to UAX \#29 for Unicode 12.0.0
65	/// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
66	///
67	/// The data tables used are
68	/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
69	/// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
70	/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
71
// U+FFFD REPLACEMENT CHARACTER, the code point reported in place of input
// that could not be decoded.
inline constexpr char32_t __replacement_character = U'\ufffd';
73
74	// The error of a consume operation.
75	//
76	// This sets the code point to the replacement character. This code point does
77	// not participate in the grapheme clustering, so grapheme clustering code can
78	// ignore the error status and always use the code point.
79	inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
80
81	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
82	return __value >= `0xd800` && __value <= `0xdbff`;
83	}
84
85	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
86	return __value >= `0xdc00` && __value <= `0xdfff`;
87	}
88
89	// https://www.unicode.org/glossary/#surrogate_code_point
90	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
91	return __value >= `0xd800` && __value <= `0xdfff`;
92	}
93
94	// https://www.unicode.org/glossary/#code_point
95	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
96	return __value <= `0x10ffff`;
97	}
98
99	// https://www.unicode.org/glossary/#unicode_scalar_value
100	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
101	return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value);
102	}
103
104	template <contiguous_iterator _Iterator>
105	requires same_as<iter_value_t<_Iterator>, char>
106	_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
107	do {
108	if ((*__char & `0b1100'0000`) != `0b1000'0000`)
109	return false;
110	--__count;
111	++__char;
112	} while (__count);
113	return true;
114	}
115
/// Helper class to extract code points from a Unicode character range.
///
/// The stored range is a view. Only the declaration is given here; the
/// definitions are provided by specializations for the different character
/// types.
template <class _CharT> class __code_point_view;
122
123	/// UTF-8 specialization.
124	template <>
125	class __code_point_view<char> {
126	using _Iterator _LIBCPP_NODEBUG = basic_string_view<char>::const_iterator;
127
128	public:
129	_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
130	: __first_(__first), __last_(__last) {}
131
132	_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
133	_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
134
135	// https://www.unicode.org/versions/latest/ch03.pdf#G7404
136	// Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
137	//
138	// Code Points First Byte Second Byte Third Byte Fourth Byte Remarks
139	// U+0000..U+007F 00..7F U+0000..U+007F 1 code unit range
140	// C0..C1 80..BF invalid overlong encoding
141	// U+0080..U+07FF C2..DF 80..BF U+0080..U+07FF 2 code unit range
142	// E0 80..9F 80..BF invalid overlong encoding
143	// U+0800..U+0FFF E0 A0..BF 80..BF U+0800..U+FFFF 3 code unit range
144	// U+1000..U+CFFF E1..EC 80..BF 80..BF
145	// U+D000..U+D7FF ED 80..9F 80..BF
146	// U+D800..U+DFFF ED A0..BF 80..BF invalid encoding of surrogate code point
147	// U+E000..U+FFFF EE..EF 80..BF 80..BF
148	// F0 80..8F 80..BF 80..BF invalid overlong encoding
149	// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF U+10000..U+10FFFF 4 code unit range
150	// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
151	// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
152	// F4 90..BF 80..BF 80..BF U+110000.. invalid code point range
153	//
154	// Unlike other parsers, these invalid entries are tested after decoding.
155	// - The parser always needs to consume these code units
156	// - The code is optimized for well-formed UTF-8
157	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
158	_LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
159
160	// Based on the number of leading 1 bits the number of code units in the
161	// code point can be determined. See
162	// https://en.wikipedia.org/wiki/UTF-8#Encoding
163	switch (std::countl_one(t: static_cast<unsigned char>(*__first_))) {
164	case `0`:
165	return {static_cast<unsigned char>(*__first_++)};
166
167	case `2`: {
168	if (__last_ - __first_ < `2` \|\| !__unicode::__is_continuation(char: __first_ + `1`, count: `1`)) [[unlikely]]
169	break;
170
171	char32_t __value = static_cast<unsigned char>(*__first_++) & `0x1f`;
172	__value <<= `6`;
173	__value \|= static_cast<unsigned char>(*__first_++) & `0x3f`;
174
175	// These values should be encoded in 1 UTF-8 code unit.
176	if (__value < `0x0080`) [[unlikely]]
177	return __consume_result_error;
178
179	return {__value};
180	}
181
182	case `3`: {
183	if (__last_ - __first_ < `3` \|\| !__unicode::__is_continuation(char: __first_ + `1`, count: `2`)) [[unlikely]]
184	break;
185
186	char32_t __value = static_cast<unsigned char>(*__first_++) & `0x0f`;
187	__value <<= `6`;
188	__value \|= static_cast<unsigned char>(*__first_++) & `0x3f`;
189	__value <<= `6`;
190	__value \|= static_cast<unsigned char>(*__first_++) & `0x3f`;
191
192	// These values should be encoded in 1 or 2 UTF-8 code units.
193	if (__value < `0x0800`) [[unlikely]]
194	return __consume_result_error;
195
196	// A surrogate value is always encoded in 3 UTF-8 code units.
197	if (__unicode::__is_surrogate(__value)) [[unlikely]]
198	return __consume_result_error;
199
200	return {__value};
201	}
202
203	case `4`: {
204	if (__last_ - __first_ < `4` \|\| !__unicode::__is_continuation(char: __first_ + `1`, count: `3`)) [[unlikely]]
205	break;
206
207	char32_t __value = static_cast<unsigned char>(*__first_++) & `0x07`;
208	__value <<= `6`;
209	__value \|= static_cast<unsigned char>(*__first_++) & `0x3f`;
210	__value <<= `6`;
211	__value \|= static_cast<unsigned char>(*__first_++) & `0x3f`;
212	__value <<= `6`;
213	__value \|= static_cast<unsigned char>(*__first_++) & `0x3f`;
214
215	// These values should be encoded in 1, 2, or 3 UTF-8 code units.
216	if (__value < `0x10000`) [[unlikely]]
217	return __consume_result_error;
218
219	// A value too large is always encoded in 4 UTF-8 code units.
220	if (!__unicode::__is_code_point(__value)) [[unlikely]]
221	return __consume_result_error;
222
223	return {__value};
224	}
225	}
226	// An invalid number of leading ones can be garbage or a code unit in the
227	// middle of a code point. By consuming one code unit the parser may get
228	// "in sync" after a few code units.
229	++__first_;
230	return __consume_result_error;
231	}
232
233	private:
234	_Iterator __first_;
235	_Iterator __last_;
236	};
237
238	# if _LIBCPP_HAS_WIDE_CHARACTERS
239	_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) {
240	return __value >= `0xd800` && __value <= `0xdbff`;
241	}
242
243	_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
244	return __value >= `0xdc00` && __value <= `0xdfff`;
245	}
246
247	/// This specialization depends on the size of wchar_t
248	/// - 2 UTF-16 (for example Windows and AIX)
249	/// - 4 UTF-32 (for example Linux)
250	template <>
251	class __code_point_view<wchar_t> {
252	using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<wchar_t>::const_iterator;
253
254	public:
255	static_assert(sizeof(wchar_t) == `2` \|\| sizeof(wchar_t) == `4`, "sizeof(wchar_t) has a not implemented value");
256
257	_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
258	: __first_(__first), __last_(__last) {}
259
260	_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
261	_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
262
263	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
264	_LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
265
266	char32_t __value = static_cast<char32_t>(*__first_++);
267	if constexpr (sizeof(wchar_t) == `2`) {
268	if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
269	return __consume_result_error;
270
271	if (__unicode::__is_high_surrogate(__value)) {
272	if (__first_ == __last_ \|\| !__unicode::__is_low_surrogate(value: static_cast<char32_t>(*__first_))) [[unlikely]]
273	return __consume_result_error;
274
275	__value -= `0xd800`;
276	__value <<= `10`;
277	__value += static_cast<char32_t>(*__first_++) - `0xdc00`;
278	__value += `0x10000`;
279
280	if (!__unicode::__is_code_point(__value)) [[unlikely]]
281	return __consume_result_error;
282	}
283	} else {
284	if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
285	return __consume_result_error;
286	}
287
288	return {__value};
289	}
290
291	private:
292	_Iterator __first_;
293	_Iterator __last_;
294	};
295	# endif // _LIBCPP_HAS_WIDE_CHARACTERS
296
297	// State machine to implement the Extended Grapheme Cluster Boundary
298	//
299	// The exact rules may change between Unicode versions.
300	// This implements the extended rules see
301	// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
302	class __extended_grapheme_cluster_break {
303	using __EGC_property _LIBCPP_NODEBUG = __extended_grapheme_custer_property_boundary::__property;
304	using __inCB_property _LIBCPP_NODEBUG = __indic_conjunct_break::__property;
305
306	public:
307	_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_break(char32_t __first_code_point)
308	: __prev_code_point_(__first_code_point),
309	__prev_property_(__extended_grapheme_custer_property_boundary::__get_property(code_point: __first_code_point)) {
310	// Initializes the active rule.
311	if (__prev_property_ == __EGC_property::__Extended_Pictographic)
312	__active_rule_ = __rule::__GB11_emoji;
313	else if (__prev_property_ == __EGC_property::__Regional_Indicator)
314	__active_rule_ = __rule::__GB12_GB13_regional_indicator;
315	else if (__indic_conjunct_break::__get_property(code_point: __first_code_point) == __inCB_property::__Consonant)
316	__active_rule_ = __rule::__GB9c_indic_conjunct_break;
317	}
318
319	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool operator()(char32_t __next_code_point) {
320	__EGC_property __next_property = __extended_grapheme_custer_property_boundary::__get_property(code_point: __next_code_point);
321	bool __result = __evaluate(__next_code_point, __next_property);
322	__prev_code_point_ = __next_code_point;
323	__prev_property_ = __next_property;
324	return __result;
325	}
326
327	// The code point whose break propery are considered during the next
328	// evaluation cyle.
329	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr char32_t __current_code_point() const { return __prev_code_point_; }
330
331	private:
332	// The naming of the identifiers matches the Unicode standard.
333	// NOLINTBEGIN(readability-identifier-naming)
334
335	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
336	__evaluate(char32_t __next_code_point, __EGC_property __next_property) {
337	switch (__active_rule_) {
338	case __rule::__none:
339	return __evaluate_none(__next_code_point, __next_property);
340	case __rule::__GB9c_indic_conjunct_break:
341	return __evaluate_GB9c_indic_conjunct_break(__next_code_point, __next_property);
342	case __rule::__GB11_emoji:
343	return __evaluate_GB11_emoji(__next_code_point, __next_property);
344	case __rule::__GB12_GB13_regional_indicator:
345	return __evaluate_GB12_GB13_regional_indicator(__next_code_point, __next_property);
346	}
347	__libcpp_unreachable();
348	}
349
350	_LIBCPP_HIDE_FROM_ABI constexpr bool __evaluate_none(char32_t __next_code_point, __EGC_property __next_property) {
351	// * Break at the start and end of text, unless the text is empty. *
352
353	_LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__sot, "should be handled in the constructor"); // GB1
354	_LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__eot, "should be handled by our caller"); // GB2
355
356	// * Do not break between a CR and LF. Otherwise, break before and after controls. *
357	if (__prev_property_ == __EGC_property::__CR && __next_property == __EGC_property::__LF) // GB3
358	return false;
359
360	if (__prev_property_ == __EGC_property::__Control \|\| __prev_property_ == __EGC_property::__CR \|\|
361	__prev_property_ == __EGC_property::__LF) // GB4
362	return true;
363
364	if (__next_property == __EGC_property::__Control \|\| __next_property == __EGC_property::__CR \|\|
365	__next_property == __EGC_property::__LF) // GB5
366	return true;
367
368	// * Do not break Hangul syllable sequences. *
369	if (__prev_property_ == __EGC_property::__L &&
370	(__next_property == __EGC_property::__L \|\| __next_property == __EGC_property::__V \|\|
371	__next_property == __EGC_property::__LV \|\| __next_property == __EGC_property::__LVT)) // GB6
372	return false;
373
374	if ((__prev_property_ == __EGC_property::__LV \|\| __prev_property_ == __EGC_property::__V) &&
375	(__next_property == __EGC_property::__V \|\| __next_property == __EGC_property::__T)) // GB7
376	return false;
377
378	if ((__prev_property_ == __EGC_property::__LVT \|\| __prev_property_ == __EGC_property::__T) &&
379	__next_property == __EGC_property::__T) // GB8
380	return false;
381
382	// * Do not break before extending characters or ZWJ. *
383	if (__next_property == __EGC_property::__Extend \|\| __next_property == __EGC_property::__ZWJ)
384	return false; // GB9
385
386	// * Do not break before SpacingMarks, or after Prepend characters. *
387	if (__next_property == __EGC_property::__SpacingMark) // GB9a
388	return false;
389
390	if (__prev_property_ == __EGC_property::__Prepend) // GB9b
391	return false;
392
393	// * Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker. *
394	if (__indic_conjunct_break::__get_property(code_point: __next_code_point) == __inCB_property::__Consonant) {
395	__active_rule_ = __rule::__GB9c_indic_conjunct_break;
396	__GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
397	return true;
398	}
399
400	// * Do not break within emoji modifier sequences or emoji zwj sequences. *
401	if (__next_property == __EGC_property::__Extended_Pictographic) {
402	__active_rule_ = __rule::__GB11_emoji;
403	__GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
404	return true;
405	}
406
407	// * Do not break within emoji flag sequences *
408
409	// That is, do not break between regional indicator (RI) symbols if there
410	// is an odd number of RI characters before the break point.
411	if (__next_property == __EGC_property::__Regional_Indicator) { // GB12 + GB13
412	__active_rule_ = __rule::__GB12_GB13_regional_indicator;
413	return true;
414	}
415
416	// * Otherwise, break everywhere. *
417	return true; // GB999
418	}
419
420	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
421	__evaluate_GB9c_indic_conjunct_break(char32_t __next_code_point, __EGC_property __next_property) {
422	__inCB_property __break = __indic_conjunct_break::__get_property(code_point: __next_code_point);
423	if (__break == __inCB_property::__none) {
424	__active_rule_ = __rule::__none;
425	return __evaluate_none(__next_code_point, __next_property);
426	}
427
428	switch (__GB9c_indic_conjunct_break_state_) {
429	case __GB9c_indic_conjunct_break_state::__Consonant:
430	if (__break == __inCB_property::__Extend) {
431	return false;
432	}
433	if (__break == __inCB_property::__Linker) {
434	__GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Linker;
435	return false;
436	}
437	__active_rule_ = __rule::__none;
438	return __evaluate_none(__next_code_point, __next_property);
439
440	case __GB9c_indic_conjunct_break_state::__Linker:
441	if (__break == __inCB_property::__Extend) {
442	return false;
443	}
444	if (__break == __inCB_property::__Linker) {
445	return false;
446	}
447	if (__break == __inCB_property::__Consonant) {
448	__GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
449	return false;
450	}
451	__active_rule_ = __rule::__none;
452	return __evaluate_none(__next_code_point, __next_property);
453	}
454	__libcpp_unreachable();
455	}
456
457	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
458	__evaluate_GB11_emoji(char32_t __next_code_point, __EGC_property __next_property) {
459	switch (__GB11_emoji_state_) {
460	case __GB11_emoji_state::__Extended_Pictographic:
461	if (__next_property == __EGC_property::__Extend) {
462	__GB11_emoji_state_ = __GB11_emoji_state::__Extend;
463	return false;
464	}
465	[[fallthrough]];
466	case __GB11_emoji_state::__Extend:
467	if (__next_property == __EGC_property::__ZWJ) {
468	__GB11_emoji_state_ = __GB11_emoji_state::__ZWJ;
469	return false;
470	}
471	if (__next_property == __EGC_property::__Extend)
472	return false;
473	__active_rule_ = __rule::__none;
474	return __evaluate_none(__next_code_point, __next_property);
475
476	case __GB11_emoji_state::__ZWJ:
477	if (__next_property == __EGC_property::__Extended_Pictographic) {
478	__GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
479	return false;
480	}
481	__active_rule_ = __rule::__none;
482	return __evaluate_none(__next_code_point, __next_property);
483	}
484	__libcpp_unreachable();
485	}
486
487	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
488	__evaluate_GB12_GB13_regional_indicator(char32_t __next_code_point, __EGC_property __next_property) {
489	__active_rule_ = __rule::__none;
490	if (__next_property == __EGC_property::__Regional_Indicator)
491	return false;
492	return __evaluate_none(__next_code_point, __next_property);
493	}
494
495	char32_t __prev_code_point_;
496	__EGC_property __prev_property_;
497
498	enum class __rule {
499	__none,
500	__GB9c_indic_conjunct_break,
501	__GB11_emoji,
502	__GB12_GB13_regional_indicator,
503	};
504	__rule __active_rule_ = __rule::__none;
505
506	enum class __GB11_emoji_state {
507	__Extended_Pictographic,
508	__Extend,
509	__ZWJ,
510	};
511	__GB11_emoji_state __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
512
513	enum class __GB9c_indic_conjunct_break_state {
514	__Consonant,
515	__Linker,
516	};
517
518	__GB9c_indic_conjunct_break_state __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
519
520	// NOLINTEND(readability-identifier-naming)
521	};
522
523	/// Helper class to extract an extended grapheme cluster from a Unicode character range.
524	///
525	/// This function is used to determine the column width of an extended grapheme
526	/// cluster. In order to do that only the first code point is evaluated.
527	/// Therefore only this code point is extracted.
528	template <class _CharT>
529	class __extended_grapheme_cluster_view {
530	using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
531
532	public:
533	_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
534	: __code_point_view_(__first, __last), __at_break_(__code_point_view_.__consume().__code_point) {}
535
536	struct __cluster {
537	/// The first code point of the extended grapheme cluster.
538	///
539	/// The first code point is used to estimate the width of the extended
540	/// grapheme cluster.
541	char32_t __code_point_;
542
543	/// Points one beyond the last code unit in the extended grapheme cluster.
544	///
545	/// It's expected the caller has the start position and thus can determine
546	/// the code unit range of the extended grapheme cluster.
547	_Iterator __last_;
548	};
549
550	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
551	char32_t __code_point = __at_break_.__current_code_point();
552	_Iterator __position = __code_point_view_.__position();
553	while (!__code_point_view_.__at_end()) {
554	if (__at_break_(__code_point_view_.__consume().__code_point))
555	break;
556	__position = __code_point_view_.__position();
557	}
558	return {__code_point, __position};
559	}
560
561	private:
562	__code_point_view<_CharT> __code_point_view_;
563	__extended_grapheme_cluster_break __at_break_;
564	};
565
566	template <contiguous_iterator _Iterator>
567	__extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cluster_view<iter_value_t<_Iterator>>;
568
569	# else // _LIBCPP_HAS_UNICODE
570
571	// For ASCII every character is a "code point".
572	// This makes it easier to write code agnostic of the _LIBCPP_HAS_UNICODE define.
573	template <class _CharT>
574	class __code_point_view {
575	using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
576
577	public:
578	_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
579	: __first_(__first), __last_(__last) {}
580
581	_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
582	_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
583
584	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
585	_LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
586	return {static_cast<char32_t>(*__first_++)};
587	}
588
589	private:
590	_Iterator __first_;
591	_Iterator __last_;
592	};
593
594	# endif // _LIBCPP_HAS_UNICODE
595
596	} // namespace __unicode
597
598	#endif // _LIBCPP_STD_VER >= 20
599
600	_LIBCPP_END_NAMESPACE_STD
601
602	#endif // _LIBCPP___FORMAT_UNICODE_H
603

Browse the source code of llvm_runtimes/build/include/c++/v1/__format/unicode.h