1//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2//-*- C++ -*-===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements functions to map the name or alias of a unicode
11// character to its codepoint.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/STLExtras.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/Support/Unicode.h"
19
20namespace llvm {
21namespace sys {
22namespace unicode {
23
24extern const char *UnicodeNameToCodepointDict;
25extern const uint8_t *UnicodeNameToCodepointIndex;
26extern const std::size_t UnicodeNameToCodepointIndexSize;
27extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28
29using BufferType = SmallString<64>;
30
31struct Node {
32 bool IsRoot = false;
33 char32_t Value = 0xFFFFFFFF;
34 uint32_t ChildrenOffset = 0;
35 bool HasSibling = false;
36 uint32_t Size = 0;
37 StringRef Name;
38 const Node *Parent = nullptr;
39
40 constexpr bool isValid() const {
41 return !Name.empty() || Value == 0xFFFFFFFF;
42 }
43 constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
44
45 std::string fullName() const {
46 std::string S;
47 // Reserve enough space for most unicode code points.
48 // The chosen value represent the 99th percentile of name size as of
49 // Unicode 15.0.
50 S.reserve(res_arg: 46);
51 const Node *N = this;
52 while (N) {
53 std::reverse_copy(first: N->Name.begin(), last: N->Name.end(), result: std::back_inserter(x&: S));
54 N = N->Parent;
55 }
56 std::reverse(first: S.begin(), last: S.end());
57 return S;
58 }
59};
60
61static Node createRoot() {
62 Node N;
63 N.IsRoot = true;
64 N.ChildrenOffset = 1;
65 N.Size = 1;
66 return N;
67}
68
69static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
70 if (Offset == 0)
71 return createRoot();
72
73 uint32_t Origin = Offset;
74 Node N;
75 N.Parent = Parent;
76 uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
77 if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
78 return N;
79
80 bool LongName = NameInfo & 0x40;
81 bool HasValue = NameInfo & 0x80;
82 std::size_t Size = NameInfo & ~0xC0;
83 if (LongName) {
84 uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
85 NameOffset |= UnicodeNameToCodepointIndex[Offset++];
86 N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
87 } else {
88 N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
89 }
90 if (HasValue) {
91 uint8_t H = UnicodeNameToCodepointIndex[Offset++];
92 uint8_t M = UnicodeNameToCodepointIndex[Offset++];
93 uint8_t L = UnicodeNameToCodepointIndex[Offset++];
94 N.Value = ((H << 16) | (M << 8) | L) >> 3;
95
96 bool HasChildren = L & 0x02;
97 N.HasSibling = L & 0x01;
98
99 if (HasChildren) {
100 N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
101 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
102 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
103 }
104 } else {
105 uint8_t H = UnicodeNameToCodepointIndex[Offset++];
106 N.HasSibling = H & 0x80;
107 bool HasChildren = H & 0x40;
108 H &= uint8_t(~0xC0);
109 if (HasChildren) {
110 N.ChildrenOffset = (H << 16);
111 N.ChildrenOffset |=
112 (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
113 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
114 }
115 }
116 N.Size = Offset - Origin;
117 return N;
118}
119
120static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121 std::size_t &Consummed, char &PreviousCharInName,
122 bool IsPrefix = false) {
123
124 Consummed = 0;
125 if (Strict) {
126 if (!Name.starts_with(Prefix: Needle))
127 return false;
128 Consummed = Needle.size();
129 return true;
130 }
131 if (Needle.empty())
132 return true;
133
134 auto NamePos = Name.begin();
135 auto NeedlePos = Needle.begin();
136
137 char PreviousCharInNameOrigin = PreviousCharInName;
138 char PreviousCharInNeedle = *Needle.begin();
139 auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
140 bool IsPrefix = false) {
141 while (It != End) {
142 const auto Next = std::next(It);
143 // Ignore spaces, underscore, medial hyphens
144 // The generator ensures a needle never ends (or starts) by a medial
145 // hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
146 bool Ignore =
147 *It == ' ' || *It == '_' ||
148 (*It == '-' && isAlnum(C: PreviousChar) &&
149 ((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
150 PreviousChar = *It;
151 if (!Ignore)
152 break;
153 ++It;
154 }
155 return It;
156 };
157
158 while (true) {
159 NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160 NeedlePos =
161 IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162 if (NeedlePos == Needle.end())
163 break;
164 if (NamePos == Name.end())
165 break;
166 if (toUpper(x: *NeedlePos) != toUpper(x: *NamePos))
167 break;
168 NeedlePos++;
169 NamePos++;
170 }
171 Consummed = std::distance(first: Name.begin(), last: NamePos);
172 if (NeedlePos != Needle.end()) {
173 PreviousCharInName = PreviousCharInNameOrigin;
174 }
175 return NeedlePos == Needle.end();
176}
177
178static std::tuple<Node, bool, uint32_t>
179compareNode(uint32_t Offset, StringRef Name, bool Strict,
180 char PreviousCharInName, BufferType &Buffer,
181 const Node *Parent = nullptr) {
182 Node N = readNode(Offset, Parent);
183 std::size_t Consummed = 0;
184 bool DoesStartWith = N.IsRoot || startsWith(Name, Needle: N.Name, Strict, Consummed,
185 PreviousCharInName);
186 if (!DoesStartWith)
187 return std::make_tuple(args&: N, args: false, args: 0);
188
189 if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
190 return std::make_tuple(args&: N, args: true, args&: N.Value);
191
192 if (N.hasChildren()) {
193 uint32_t ChildOffset = N.ChildrenOffset;
194 for (;;) {
195 Node C;
196 bool Matches;
197 uint32_t Value;
198 std::tie(args&: C, args&: Matches, args&: Value) =
199 compareNode(Offset: ChildOffset, Name: Name.substr(Start: Consummed), Strict,
200 PreviousCharInName, Buffer, Parent: &N);
201 if (Matches) {
202 std::reverse_copy(first: C.Name.begin(), last: C.Name.end(),
203 result: std::back_inserter(x&: Buffer));
204 return std::make_tuple(args&: N, args: true, args&: Value);
205 }
206 ChildOffset += C.Size;
207 if (!C.HasSibling)
208 break;
209 }
210 }
211 return std::make_tuple(args&: N, args: false, args: 0);
212}
213
214static std::tuple<Node, bool, uint32_t>
215compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
216 return compareNode(Offset, Name, Strict, PreviousCharInName: 0, Buffer);
217}
218
219// clang-format off
220constexpr const char *const HangulSyllables[][3] = {
221 { "G", "A", "" },
222 { "GG", "AE", "G" },
223 { "N", "YA", "GG" },
224 { "D", "YAE", "GS" },
225 { "DD", "EO", "N", },
226 { "R", "E", "NJ" },
227 { "M", "YEO", "NH" },
228 { "B", "YE", "D" },
229 { "BB", "O", "L" },
230 { "S", "WA", "LG" },
231 { "SS", "WAE", "LM" },
232 { "", "OE", "LB" },
233 { "J", "YO", "LS" },
234 { "JJ", "U", "LT" },
235 { "C", "WEO", "LP" },
236 { "K", "WE", "LH" },
237 { "T", "WI", "M" },
238 { "P", "YU", "B" },
239 { "H", "EU", "BS" },
240 { 0, "YI", "S" },
241 { 0, "I", "SS" },
242 { 0, 0, "NG" },
243 { 0, 0, "J" },
244 { 0, 0, "C" },
245 { 0, 0, "K" },
246 { 0, 0, "T" },
247 { 0, 0, "P" },
248 { 0, 0, "H" }
249 };
250// clang-format on
251
252// Unicode 15.0
253// 3.12 Conjoining Jamo Behavior Common constants
254constexpr const char32_t SBase = 0xAC00;
255constexpr const uint32_t LCount = 19;
256constexpr const uint32_t VCount = 21;
257constexpr const uint32_t TCount = 28;
258
259static std::size_t findSyllable(StringRef Name, bool Strict,
260 char &PreviousInName, int &Pos, int Column) {
261 assert(Column == 0 || Column == 1 || Column == 2);
262 static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
263 int Len = -1;
264 int Prev = PreviousInName;
265 for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
266 StringRef Syllable(HangulSyllables[I][Column]);
267 if (int(Syllable.size()) <= Len)
268 continue;
269 std::size_t Consummed = 0;
270 char PreviousInNameCopy = PreviousInName;
271 bool DoesStartWith =
272 startsWith(Name, Needle: Syllable, Strict, Consummed, PreviousCharInName&: PreviousInNameCopy);
273 if (!DoesStartWith)
274 continue;
275 Len = Consummed;
276 Pos = I;
277 Prev = PreviousInNameCopy;
278 }
279 if (Len == -1)
280 return 0;
281 PreviousInName = Prev;
282 return size_t(Len);
283}
284
285static std::optional<char32_t>
286nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
287 Buffer.clear();
288 // Hangul Syllable Decomposition
289 std::size_t Consummed = 0;
290 char NameStart = 0;
291 bool DoesStartWith =
292 startsWith(Name, Needle: "HANGUL SYLLABLE ", Strict, Consummed, PreviousCharInName&: NameStart);
293 if (!DoesStartWith)
294 return std::nullopt;
295 Name = Name.substr(Start: Consummed);
296 int L = -1, V = -1, T = -1;
297 Name = Name.substr(Start: findSyllable(Name, Strict, PreviousInName&: NameStart, Pos&: L, Column: 0));
298 Name = Name.substr(Start: findSyllable(Name, Strict, PreviousInName&: NameStart, Pos&: V, Column: 1));
299 Name = Name.substr(Start: findSyllable(Name, Strict, PreviousInName&: NameStart, Pos&: T, Column: 2));
300 if (L != -1 && V != -1 && T != -1 && Name.empty()) {
301 if (!Strict) {
302 Buffer.append(RHS: "HANGUL SYLLABLE ");
303 if (L != -1)
304 Buffer.append(RHS: HangulSyllables[L][0]);
305 if (V != -1)
306 Buffer.append(RHS: HangulSyllables[V][1]);
307 if (T != -1)
308 Buffer.append(RHS: HangulSyllables[T][2]);
309 }
310 return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
311 std::uint32_t(T);
312 }
313 // Otherwise, it's an illegal syllable name.
314 return std::nullopt;
315}
316
317struct GeneratedNamesData {
318 StringRef Prefix;
319 uint32_t Start;
320 uint32_t End;
321};
322
323// Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings
324static const GeneratedNamesData GeneratedNamesDataTable[] = {
325 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x3400, .End: 0x4DBF},
326 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x4E00, .End: 0x9FFF},
327 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x20000, .End: 0x2A6DF},
328 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x2A700, .End: 0x2B739},
329 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x2B740, .End: 0x2B81D},
330 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x2B820, .End: 0x2CEA1},
331 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x2CEB0, .End: 0x2EBE0},
332 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x2EBF0, .End: 0x2EE5D},
333 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x30000, .End: 0x3134A},
334 {.Prefix: "CJK UNIFIED IDEOGRAPH-", .Start: 0x31350, .End: 0x323AF},
335 {.Prefix: "TANGUT IDEOGRAPH-", .Start: 0x17000, .End: 0x187F7},
336 {.Prefix: "TANGUT IDEOGRAPH-", .Start: 0x18D00, .End: 0x18D08},
337 {.Prefix: "KHITAN SMALL SCRIPT CHARACTER-", .Start: 0x18B00, .End: 0x18CD5},
338 {.Prefix: "NUSHU CHARACTER-", .Start: 0x1B170, .End: 0x1B2FB},
339 {.Prefix: "CJK COMPATIBILITY IDEOGRAPH-", .Start: 0xF900, .End: 0xFA6D},
340 {.Prefix: "CJK COMPATIBILITY IDEOGRAPH-", .Start: 0xFA70, .End: 0xFAD9},
341 {.Prefix: "CJK COMPATIBILITY IDEOGRAPH-", .Start: 0x2F800, .End: 0x2FA1D},
342};
343
344static std::optional<char32_t>
345nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
346 for (auto &&Item : GeneratedNamesDataTable) {
347 Buffer.clear();
348 std::size_t Consummed = 0;
349 char NameStart = 0;
350 bool DoesStartWith = startsWith(Name, Needle: Item.Prefix, Strict, Consummed,
351 PreviousCharInName&: NameStart, /*IsPrefix=*/true);
352 if (!DoesStartWith)
353 continue;
354 auto Number = Name.substr(Start: Consummed);
355 unsigned long long V = 0;
356 // Be consistent about mandating upper casing.
357 if (Strict &&
358 llvm::any_of(Range&: Number, P: [](char C) { return C >= 'a' && C <= 'f'; }))
359 return {};
360 if (getAsUnsignedInteger(Str: Number, Radix: 16, Result&: V) || V < Item.Start || V > Item.End)
361 continue;
362 if (!Strict) {
363 Buffer.append(RHS: Item.Prefix);
364 Buffer.append(RHS: utohexstr(X: V, LowerCase: true));
365 }
366 return V;
367 }
368 return std::nullopt;
369}
370
371static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
372 BufferType &Buffer) {
373 if (Name.empty())
374 return std::nullopt;
375
376 std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
377 if (!Res)
378 Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
379 if (Res)
380 return *Res;
381
382 Buffer.clear();
383 Node Node;
384 bool Matches;
385 uint32_t Value;
386 std::tie(args&: Node, args&: Matches, args&: Value) = compareNode(Offset: 0, Name, Strict, Buffer);
387 if (Matches) {
388 std::reverse(first: Buffer.begin(), last: Buffer.end());
389 // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
390 // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
391 if (!Strict && Value == 0x116c && Name.contains_insensitive(Other: "O-E")) {
392 Buffer = "HANGUL JUNGSEONG O-E";
393 Value = 0x1180;
394 }
395 return Value;
396 }
397 return std::nullopt;
398}
399
400std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
401
402 BufferType Buffer;
403 auto Opt = nameToCodepoint(Name, Strict: true, Buffer);
404 return Opt;
405}
406
407std::optional<LooseMatchingResult>
408nameToCodepointLooseMatching(StringRef Name) {
409 BufferType Buffer;
410 auto Opt = nameToCodepoint(Name, Strict: false, Buffer);
411 if (!Opt)
412 return std::nullopt;
413 return LooseMatchingResult{.CodePoint: *Opt, .Name: Buffer};
414}
415
416// Find the unicode character whose editing distance to Pattern
417// is shortest, using the Wagner–Fischer algorithm.
418llvm::SmallVector<MatchForCodepointName>
419nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
420 // We maintain a fixed size vector of matches,
421 // sorted by distance
422 // The worst match (with the biggest distance) are discarded when new elements
423 // are added.
424 std::size_t LargestEditDistance = 0;
425 llvm::SmallVector<MatchForCodepointName> Matches;
426 Matches.reserve(N: MaxMatchesCount + 1);
427
428 auto Insert = [&](const Node &Node, uint32_t Distance,
429 char32_t Value) -> bool {
430 if (Distance > LargestEditDistance) {
431 if (Matches.size() == MaxMatchesCount)
432 return false;
433 LargestEditDistance = Distance;
434 }
435 // To avoid allocations, the creation of the name is delayed
436 // as much as possible.
437 std::string Name;
438 auto GetName = [&] {
439 if (Name.empty())
440 Name = Node.fullName();
441 return Name;
442 };
443
444 auto It = llvm::lower_bound(
445 Range&: Matches, Value&: Distance,
446 C: [&](const MatchForCodepointName &a, std::size_t Distance) {
447 if (Distance == a.Distance)
448 return a.Name < GetName();
449 return a.Distance < Distance;
450 });
451 if (It == Matches.end() && Matches.size() == MaxMatchesCount)
452 return false;
453
454 MatchForCodepointName M{.Name: GetName(), .Distance: Distance, .Value: Value};
455 Matches.insert(I: It, Elt: std::move(M));
456 if (Matches.size() > MaxMatchesCount)
457 Matches.pop_back();
458 return true;
459 };
460
461 // We ignore case, space, hyphens, etc,
462 // in both the search pattern and the prospective names.
463 auto Normalize = [](StringRef Name) {
464 std::string Out;
465 Out.reserve(res_arg: Name.size());
466 for (char C : Name) {
467 if (isAlnum(C))
468 Out.push_back(c: toUpper(x: C));
469 }
470 return Out;
471 };
472 std::string NormalizedName = Normalize(Pattern);
473
474 // Allocate a matrix big enough for longest names.
475 const std::size_t Columns =
476 std::min(a: NormalizedName.size(), b: UnicodeNameToCodepointLargestNameSize) +
477 1;
478
479 LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
480 UnicodeNameToCodepointLargestNameSize + 1;
481
482 std::vector<char> Distances(
483 Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
484
485 auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
486 assert(Column < Columns);
487 assert(Row < Rows);
488 return Distances[Row * Columns + Column];
489 };
490
491 for (std::size_t I = 0; I < Columns; I++)
492 Get(I, 0) = I;
493
494 // Visit the childrens,
495 // Filling (and overriding) the matrix for the name fragment of each node
496 // iteratively. CompleteName is used to collect the actual name of potential
497 // match, respecting case and spacing.
498 auto VisitNode = [&](const Node &N, std::size_t Row,
499 auto &VisitNode) -> void {
500 std::size_t J = 0;
501 for (; J < N.Name.size(); J++) {
502 if (!isAlnum(C: N.Name[J]))
503 continue;
504
505 Get(0, Row) = Row;
506
507 for (std::size_t I = 1; I < Columns; I++) {
508 const int Delete = Get(I - 1, Row) + 1;
509 const int Insert = Get(I, Row - 1) + 1;
510
511 const int Replace =
512 Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
513
514 Get(I, Row) = std::min(a: Insert, b: std::min(a: Delete, b: Replace));
515 }
516
517 Row++;
518 }
519
520 unsigned Cost = Get(Columns - 1, Row - 1);
521 if (N.Value != 0xFFFFFFFF) {
522 Insert(N, Cost, N.Value);
523 }
524
525 if (N.hasChildren()) {
526 auto ChildOffset = N.ChildrenOffset;
527 for (;;) {
528 Node C = readNode(Offset: ChildOffset, Parent: &N);
529 ChildOffset += C.Size;
530 if (!C.isValid())
531 break;
532 VisitNode(C, Row, VisitNode);
533 if (!C.HasSibling)
534 break;
535 }
536 }
537 };
538
539 Node Root = createRoot();
540 VisitNode(Root, 1, VisitNode);
541 return Matches;
542}
543
544} // namespace unicode
545
546} // namespace sys
547} // namespace llvm
548