UnicodeNameToCodepoint.cpp source code [llvm_projects/llvm/lib/Support/UnicodeNameToCodepoint.cpp]

1	//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2	//-- C++ --===//
3	//
4	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5	// See https://llvm.org/LICENSE.txt for license information.
6	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7	//
8	//===----------------------------------------------------------------------===//
9	//
10	// This file implements functions to map the name or alias of a unicode
11	// character to its codepoint.
12	//
13	//===----------------------------------------------------------------------===//
14
15	#include "llvm/ADT/STLExtras.h"
16	#include "llvm/ADT/StringExtras.h"
17	#include "llvm/ADT/StringRef.h"
18	#include "llvm/Support/Unicode.h"
19
20	namespace llvm {
21	namespace sys {
22	namespace unicode {
23
24	extern const char *UnicodeNameToCodepointDict;
25	extern const uint8_t *UnicodeNameToCodepointIndex;
26	extern const std::size_t UnicodeNameToCodepointIndexSize;
27	extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28
29	using BufferType = SmallString<`64`>;
30
31	struct Node {
32	bool IsRoot = false;
33	char32_t Value = `0xFFFFFFFF`;
34	uint32_t ChildrenOffset = `0`;
35	bool HasSibling = false;
36	uint32_t Size = `0`;
37	StringRef Name;
38	const Node Parent = nullptr*;
39
40	constexpr bool isValid() const {
41	return !Name.empty() \|\| Value == `0xFFFFFFFF`;
42	}
43	constexpr bool hasChildren() const { return ChildrenOffset != `0` \|\| IsRoot; }
44
45	std::string fullName() const {
46	std::string S;
47	// Reserve enough space for most unicode code points.
48	// The chosen value represent the 99th percentile of name size as of
49	// Unicode 15.0.
50	S.reserve(res_arg: `46`);
51	const Node N = this*;
52	while (N) {
53	std::reverse_copy(first: N->Name.begin(), last: N->Name.end(), result: std::back_inserter(x&: S));
54	N = N->Parent;
55	}
56	std::reverse(first: S.begin(), last: S.end());
57	return S;
58	}
59	};
60
61	static Node createRoot() {
62	Node N;
63	N.IsRoot = true;
64	N.ChildrenOffset = `1`;
65	N.Size = `1`;
66	return N;
67	}
68
69	static Node readNode(uint32_t Offset, const Node Parent = nullptr*) {
70	if (Offset == `0`)
71	return createRoot();
72
73	uint32_t Origin = Offset;
74	Node N;
75	N.Parent = Parent;
76	uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
77	if (Offset + `6` >= UnicodeNameToCodepointIndexSize)
78	return N;
79
80	bool LongName = NameInfo & `0x40`;
81	bool HasValue = NameInfo & `0x80`;
82	std::size_t Size = NameInfo & ~`0xC0`;
83	if (LongName) {
84	uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << `8`);
85	NameOffset \|= UnicodeNameToCodepointIndex[Offset++];
86	N.Name = StringRef (UnicodeNameToCodepointDict + NameOffset, Size);
87	} else {
88	N.Name = StringRef (UnicodeNameToCodepointDict + Size, `1`);
89	}
90	if (HasValue) {
91	uint8_t H = UnicodeNameToCodepointIndex[Offset++];
92	uint8_t M = UnicodeNameToCodepointIndex[Offset++];
93	uint8_t L = UnicodeNameToCodepointIndex[Offset++];
94	N.Value = ((H << `16`) \| (M << `8`) \| L) >> `3`;
95
96	bool HasChildren = L & `0x02`;
97	N.HasSibling = L & `0x01`;
98
99	if (HasChildren) {
100	N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << `16`;
101	N.ChildrenOffset \|= UnicodeNameToCodepointIndex[Offset++] << `8`;
102	N.ChildrenOffset \|= UnicodeNameToCodepointIndex[Offset++];
103	}
104	} else {
105	uint8_t H = UnicodeNameToCodepointIndex[Offset++];
106	N.HasSibling = H & `0x80`;
107	bool HasChildren = H & `0x40`;
108	H &= uint8_t(~`0xC0`);
109	if (HasChildren) {
110	N.ChildrenOffset = (H << `16`);
111	N.ChildrenOffset \|=
112	(uint32_t(UnicodeNameToCodepointIndex[Offset++]) << `8`);
113	N.ChildrenOffset \|= UnicodeNameToCodepointIndex[Offset++];
114	}
115	}
116	N.Size = Offset - Origin;
117	return N;
118	}
119
120	static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121	std::size_t &Consummed, char &PreviousCharInName,
122	bool IsPrefix = false) {
123
124	Consummed = `0`;
125	if (Strict) {
126	if (!Name.starts_with(Prefix: Needle))
127	return false;
128	Consummed = Needle.size();
129	return true;
130	}
131	if (Needle.empty())
132	return true;
133
134	auto NamePos = Name.begin();
135	auto NeedlePos = Needle.begin();
136
137	char PreviousCharInNameOrigin = PreviousCharInName;
138	char PreviousCharInNeedle = *Needle.begin();
139	auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
140	bool IsPrefix = false) {
141	while (It != End) {
142	const auto Next = std::next(It);
143	// Ignore spaces, underscore, medial hyphens
144	// The generator ensures a needle never ends (or starts) by a medial
145	// hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
146	bool Ignore =
147	It == `' '` \|\| It == `'_'` \|\|
148	(*It == `'-'` && isAlnum(C: PreviousChar) &&
149	((Next != End && isAlnum(*Next)) \|\| (Next == End && IsPrefix)));
150	PreviousChar = *It;
151	if (!Ignore)
152	break;
153	++It;
154	}
155	return It;
156	};
157
158	while (true) {
159	NamePos = IgnoreSpaces (NamePos, Name.end(), PreviousCharInName);
160	NeedlePos =
161	IgnoreSpaces (NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162	if (NeedlePos == Needle.end())
163	break;
164	if (NamePos == Name.end())
165	break;
166	if (toUpper(x: NeedlePos) != toUpper(x: NamePos))
167	break;
168	NeedlePos++;
169	NamePos++;
170	}
171	Consummed = std::distance(first: Name.begin(), last: NamePos);
172	if (NeedlePos != Needle.end()) {
173	PreviousCharInName = PreviousCharInNameOrigin;
174	}
175	return NeedlePos == Needle.end();
176	}
177
178	static std::tuple<Node, bool, uint32_t>
179	compareNode(uint32_t Offset, StringRef Name, bool Strict,
180	char PreviousCharInName, BufferType &Buffer,
181	const Node Parent = nullptr*) {
182	Node N = readNode(Offset, Parent);
183	std::size_t Consummed = `0`;
184	bool DoesStartWith = N.IsRoot \|\| startsWith(Name, Needle: N.Name, Strict, Consummed,
185	PreviousCharInName);
186	if (!DoesStartWith)
187	return std::make_tuple(args&: N, args: false, args: `0`);
188
189	if (Name.size() - Consummed == `0` && N.Value != `0xFFFFFFFF`)
190	return std::make_tuple(args&: N, args: true, args&: N.Value);
191
192	if (N.hasChildren()) {
193	uint32_t ChildOffset = N.ChildrenOffset;
194	for (;;) {
195	Node C;
196	bool Matches;
197	uint32_t Value;
198	std::tie(args&: C, args&: Matches, args&: Value) =
199	compareNode(Offset: ChildOffset, Name: Name.substr(Start: Consummed), Strict,
200	PreviousCharInName, Buffer, Parent: &N);
201	if (Matches) {
202	std::reverse_copy(first: C.Name.begin(), last: C.Name.end(),
203	result: std::back_inserter(x&: Buffer));
204	return std::make_tuple(args&: N, args: true, args&: Value);
205	}
206	ChildOffset += C.Size;
207	if (!C.HasSibling)
208	break;
209	}
210	}
211	return std::make_tuple(args&: N, args: false, args: `0`);
212	}
213
214	static std::tuple<Node, bool, uint32_t>
215	compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
216	return compareNode(Offset, Name, Strict, PreviousCharInName: `0`, Buffer);
217	}
218
219	// clang-format off
220	constexpr const char *const HangulSyllables[][`3`] = {
221	{ "G", "A", "" },
222	{ "GG", "AE", "G" },
223	{ "N", "YA", "GG" },
224	{ "D", "YAE", "GS" },
225	{ "DD", "EO", "N", },
226	{ "R", "E", "NJ" },
227	{ "M", "YEO", "NH" },
228	{ "B", "YE", "D" },
229	{ "BB", "O", "L" },
230	{ "S", "WA", "LG" },
231	{ "SS", "WAE", "LM" },
232	{ "", "OE", "LB" },
233	{ "J", "YO", "LS" },
234	{ "JJ", "U", "LT" },
235	{ "C", "WEO", "LP" },
236	{ "K", "WE", "LH" },
237	{ "T", "WI", "M" },
238	{ "P", "YU", "B" },
239	{ "H", "EU", "BS" },
240	{ `0`, "YI", "S" },
241	{ `0`, "I", "SS" },
242	{ `0`, `0`, "NG" },
243	{ `0`, `0`, "J" },
244	{ `0`, `0`, "C" },
245	{ `0`, `0`, "K" },
246	{ `0`, `0`, "T" },
247	{ `0`, `0`, "P" },
248	{ `0`, `0`, "H" }
249	};
250	// clang-format on
251
252	// Unicode 15.0
253	// 3.12 Conjoining Jamo Behavior Common constants
254	constexpr const char32_t SBase = `0xAC00`;
255	constexpr const uint32_t LCount = `19`;
256	constexpr const uint32_t VCount = `21`;
257	constexpr const uint32_t TCount = `28`;
258
259	static std::size_t findSyllable(StringRef Name, bool Strict,
260	char &PreviousInName, int &Pos, int Column) {
261	assert(Column == `0` \|\| Column == `1` \|\| Column == `2`);
262	static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
263	int Len = -`1`;
264	int Prev = PreviousInName;
265	for (std::size_t I = `0`; I < CountPerColumn[Column]; I++) {
266	StringRef Syllable(HangulSyllables[I][Column]);
267	if (int(Syllable.size()) <= Len)
268	continue;
269	std::size_t Consummed = `0`;
270	char PreviousInNameCopy = PreviousInName;
271	bool DoesStartWith =
272	startsWith(Name, Needle: Syllable, Strict, Consummed, PreviousCharInName&: PreviousInNameCopy);
273	if (!DoesStartWith)
274	continue;
275	Len = Consummed;
276	Pos = I;
277	Prev = PreviousInNameCopy;
278	}
279	if (Len == -`1`)
280	return `0`;
281	PreviousInName = Prev;
282	return size_t(Len);
283	}
284
285	static std::optional<char32_t>
286	nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
287	Buffer.clear();
288	// Hangul Syllable Decomposition
289	std::size_t Consummed = `0`;
290	char NameStart = `0`;
291	bool DoesStartWith =
292	startsWith(Name, Needle: "HANGUL SYLLABLE ", Strict, Consummed, PreviousCharInName&: NameStart);
293	if (!DoesStartWith)
294	return std::nullopt;
295	Name = Name.substr(Start: Consummed);
296	int L = -`1`, V = -`1`, T = -`1`;
297	Name = Name.substr(Start: findSyllable(Name, Strict, PreviousInName&: NameStart, Pos&: L, Column: `0`));
298	Name = Name.substr(Start: findSyllable(Name, Strict, PreviousInName&: NameStart, Pos&: V, Column: `1`));
299	Name = Name.substr(Start: findSyllable(Name, Strict, PreviousInName&: NameStart, Pos&: T, Column: `2`));
300	if (L != -`1` && V != -`1` && T != -`1` && Name.empty()) {
301	if (!Strict) {
302	Buffer.append(RHS: "HANGUL SYLLABLE ");
303	if (L != -`1`)
304	Buffer.append(RHS: HangulSyllables[L][`0`]);
305	if (V != -`1`)
306	Buffer.append(RHS: HangulSyllables[V][`1`]);
307	if (T != -`1`)
308	Buffer.append(RHS: HangulSyllables[T][`2`]);
309	}
310	return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
311	std::uint32_t(T);
312	}
313	// Otherwise, it's an illegal syllable name.
314	return std::nullopt;
315	}
316
317	struct GeneratedNamesData {
318	StringRef Prefix;
319	uint32_t Start;
320	uint32_t End;
321	};
322
323	// Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings
324	static const GeneratedNamesData GeneratedNamesDataTable[] = {
325	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x3400`, .End: `0x4DBF`},
326	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x4E00`, .End: `0x9FFF`},
327	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x20000`, .End: `0x2A6DF`},
328	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x2A700`, .End: `0x2B739`},
329	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x2B740`, .End: `0x2B81D`},
330	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x2B820`, .End: `0x2CEA1`},
331	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x2CEB0`, .End: `0x2EBE0`},
332	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x2EBF0`, .End: `0x2EE5D`},
333	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x30000`, .End: `0x3134A`},
334	{.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: `0x31350`, .End: `0x323AF`},
335	{.Prefix: "TANGUT IDEOGRAPH-", .Start: `0x17000`, .End: `0x187F7`},
336	{.Prefix: "TANGUT IDEOGRAPH-", .Start: `0x18D00`, .End: `0x18D08`},
337	{.Prefix: "KHITAN SMALL SCRIPT CHARACTER-", .Start: `0x18B00`, .End: `0x18CD5`},
338	{.Prefix: "NUSHU CHARACTER-", .Start: `0x1B170`, .End: `0x1B2FB`},
339	{.Prefix: "CJK COMPATIBILITY IDEOGRAPH-", .Start: `0xF900`, .End: `0xFA6D`},
340	{.Prefix: "CJK COMPATIBILITY IDEOGRAPH-", .Start: `0xFA70`, .End: `0xFAD9`},
341	{.Prefix: "CJK COMPATIBILITY IDEOGRAPH-", .Start: `0x2F800`, .End: `0x2FA1D`},
342	};
343
344	static std::optional<char32_t>
345	nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
346	for (auto &&Item : GeneratedNamesDataTable) {
347	Buffer.clear();
348	std::size_t Consummed = `0`;
349	char NameStart = `0`;
350	bool DoesStartWith = startsWith(Name, Needle: Item.Prefix, Strict, Consummed,
351	PreviousCharInName&: NameStart, /IsPrefix=/true);
352	if (!DoesStartWith)
353	continue;
354	auto Number = Name.substr(Start: Consummed);
355	unsigned long long V = `0`;
356	// Be consistent about mandating upper casing.
357	if (Strict &&
358	llvm::any_of(Range&: Number, P: [](char C) { return C >= `'a'` && C <= `'f'`; }))
359	return {};
360	if (getAsUnsignedInteger(Str: Number, Radix: `16`, Result&: V) \|\| V < Item.Start \|\| V > Item.End)
361	continue;
362	if (!Strict) {
363	Buffer.append(RHS: Item.Prefix);
364	Buffer.append(RHS: utohexstr(X: V, LowerCase: true));
365	}
366	return V;
367	}
368	return std::nullopt;
369	}
370
371	static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
372	BufferType &Buffer) {
373	if (Name.empty())
374	return std::nullopt;
375
376	std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
377	if (!Res)
378	Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
379	if (Res)
380	return *Res;
381
382	Buffer.clear();
383	Node Node;
384	bool Matches;
385	uint32_t Value;
386	std::tie(args&: Node, args&: Matches, args&: Value) = compareNode(Offset: `0`, Name, Strict, Buffer);
387	if (Matches) {
388	std::reverse(first: Buffer.begin(), last: Buffer.end());
389	// UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
390	// hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
391	if (!Strict && Value == `0x116c` && Name.contains_insensitive(Other: "O-E")) {
392	Buffer = "HANGUL JUNGSEONG O-E";
393	Value = `0x1180`;
394	}
395	return Value;
396	}
397	return std::nullopt;
398	}
399
400	std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
401
402	BufferType Buffer;
403	auto Opt = nameToCodepoint(Name, Strict: true, Buffer);
404	return Opt;
405	}
406
407	std::optional<LooseMatchingResult>
408	nameToCodepointLooseMatching(StringRef Name) {
409	BufferType Buffer;
410	auto Opt = nameToCodepoint(Name, Strict: false, Buffer);
411	if (!Opt)
412	return std::nullopt;
413	return LooseMatchingResult{.CodePoint: *Opt, .Name: Buffer};
414	}
415
416	// Find the unicode character whose editing distance to Pattern
417	// is shortest, using the Wagner–Fischer algorithm.
418	llvm::SmallVector<MatchForCodepointName>
419	nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
420	// We maintain a fixed size vector of matches,
421	// sorted by distance
422	// The worst match (with the biggest distance) are discarded when new elements
423	// are added.
424	std::size_t LargestEditDistance = `0`;
425	llvm::SmallVector<MatchForCodepointName> Matches;
426	Matches.reserve(N: MaxMatchesCount + `1`);
427
428	auto Insert = [&](const Node &Node, uint32_t Distance,
429	char32_t Value) -> bool {
430	if (Distance > LargestEditDistance) {
431	if (Matches.size() == MaxMatchesCount)
432	return false;
433	LargestEditDistance = Distance;
434	}
435	// To avoid allocations, the creation of the name is delayed
436	// as much as possible.
437	std::string Name;
438	auto GetName = [&] {
439	if (Name.empty())
440	Name = Node.fullName();
441	return Name;
442	};
443
444	auto It = llvm::lower_bound(
445	Range&: Matches, Value&: Distance,
446	C: [&](const MatchForCodepointName &a, std::size_t Distance) {
447	if (Distance == a.Distance)
448	return a.Name < GetName();
449	return a.Distance < Distance;
450	});
451	if (It == Matches.end() && Matches.size() == MaxMatchesCount)
452	return false;
453
454	MatchForCodepointName M{.Name: GetName(), .Distance: Distance, .Value: Value};
455	Matches.insert(I: It, Elt: std::move(M));
456	if (Matches.size() > MaxMatchesCount)
457	Matches.pop_back();
458	return true;
459	};
460
461	// We ignore case, space, hyphens, etc,
462	// in both the search pattern and the prospective names.
463	auto Normalize = [](StringRef Name) {
464	std::string Out;
465	Out.reserve(res_arg: Name.size());
466	for (char C : Name) {
467	if (isAlnum(C))
468	Out.push_back(c: toUpper(x: C));
469	}
470	return Out;
471	};
472	std::string NormalizedName = Normalize (Pattern);
473
474	// Allocate a matrix big enough for longest names.
475	const std::size_t Columns =
476	std::min(a: NormalizedName.size(), b: UnicodeNameToCodepointLargestNameSize) +
477	`1`;
478
479	LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
480	UnicodeNameToCodepointLargestNameSize + `1`;
481
482	std::vector<char> Distances(
483	Columns * (UnicodeNameToCodepointLargestNameSize + `1`), `0`);
484
485	auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
486	assert(Column < Columns);
487	assert(Row < Rows);
488	return Distances [Row * Columns + Column];
489	};
490
491	for (std::size_t I = `0`; I < Columns; I++)
492	Get (I, `0`) = I;
493
494	// Visit the childrens,
495	// Filling (and overriding) the matrix for the name fragment of each node
496	// iteratively. CompleteName is used to collect the actual name of potential
497	// match, respecting case and spacing.
498	auto VisitNode = [&](const Node &N, std::size_t Row,
499	auto &VisitNode) -> void {
500	std::size_t J = `0`;
501	for (; J < N.Name.size(); J++) {
502	if (!isAlnum(C: N.Name [J]))
503	continue;
504
505	Get (`0`, Row) = Row;
506
507	for (std::size_t I = `1`; I < Columns; I++) {
508	const int Delete = Get (I - `1`, Row) + `1`;
509	const int Insert = Get (I, Row - `1`) + `1`;
510
511	const int Replace =
512	Get (I - `1`, Row - `1`) + (NormalizedName [I - `1`] != N.Name [J] ? `1` : `0`);
513
514	Get (I, Row) = std::min(a: Insert, b: std::min(a: Delete, b: Replace));
515	}
516
517	Row++;
518	}
519
520	unsigned Cost = Get (Columns - `1`, Row - `1`);
521	if (N.Value != `0xFFFFFFFF`) {
522	Insert (N, Cost, N.Value);
523	}
524
525	if (N.hasChildren()) {
526	auto ChildOffset = N.ChildrenOffset;
527	for (;;) {
528	Node C = readNode(Offset: ChildOffset, Parent: &N);
529	ChildOffset += C.Size;
530	if (!C.isValid())
531	break;
532	VisitNode(C, Row, VisitNode);
533	if (!C.HasSibling)
534	break;
535	}
536	}
537	};
538
539	Node Root = createRoot();
540	VisitNode (Root, `1`, VisitNode);
541	return Matches;
542	}
543
544	} // namespace unicode
545
546	} // namespace sys
547	} // namespace llvm
548

Browse the source code of llvm_projects/llvm/lib/Support/UnicodeNameToCodepoint.cpp