1//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10// using UnicodeData.txt and NameAliases.txt available at
11// https://unicode.org/Public/15.1.0/ucd/
12//===----------------------------------------------------------------------===//
13
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/StringExtras.h"
16#include "llvm/ADT/StringRef.h"
17#include <algorithm>
18#include <deque>
19#include <fstream>
20#include <memory>
21#include <optional>
22#include <set>
23#include <string>
24#include <unordered_map>
25#include <utility>
26#include <vector>
27
28static const llvm::StringRef Letters =
29 " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
30
31// Collect names UnicodeData.txt and AliasNames.txt
32// There may be multiple names per code points.
33static std::unordered_multimap<char32_t, std::string>
34loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
35 std::unordered_multimap<char32_t, std::string> CollectedCharacters;
36 auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
37 std::ifstream InputFile(File);
38 for (std::string Line; getline(is&: InputFile, str&: Line);) {
39 if (Line.empty() || !isxdigit(Line[0]))
40 continue;
41 auto FirstSemiPos = Line.find(c: ';');
42 if (FirstSemiPos == std::string::npos)
43 continue;
44 auto SecondSemiPos = Line.find(c: ';', pos: FirstSemiPos + 1);
45 if (SecondSemiPos == std::string::npos)
46 continue;
47 unsigned long long CodePoint;
48 if (llvm::getAsUnsignedInteger(
49 Str: llvm::StringRef(Line.c_str(), FirstSemiPos), Radix: 16, Result&: CodePoint)) {
50 continue;
51 }
52
53 std::string Name =
54 Line.substr(pos: FirstSemiPos + 1, n: SecondSemiPos - FirstSemiPos - 1);
55
56 if (!Name.empty() && Name[0] == '<') {
57 // Ignore ranges of characters, as their name is either absent or
58 // generated.
59 continue;
60 }
61
62 // Some aliases are ignored for compatibility with C++
63 if (IsAliasFile) {
64 std::string Kind = Line.substr(pos: SecondSemiPos + 1);
65 if (Kind != "control" && Kind != "correction" && Kind != "alternate")
66 continue;
67 }
68
69 auto InsertUnique = [&](char32_t CP, std::string Name) {
70 auto It = CollectedCharacters.find(x: CP);
71 while (It != std::end(cont&: CollectedCharacters) && It->first == CP) {
72 if (It->second == Name)
73 return;
74 ++It;
75 }
76 CollectedCharacters.insert(x: {CP, std::move(Name)});
77 };
78 InsertUnique(CodePoint, std::move(Name));
79 }
80 };
81
82 FromFile(NamesFile);
83 FromFile(AliasesFile, true);
84 return CollectedCharacters;
85}
86
87class Trie {
88 struct Node;
89
90public:
91 // When inserting named codepoint
92 // We create a node per character in the name.
93 // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
94 // Once all characters are inserted, the tree is compacted
95 void insert(llvm::StringRef Name, char32_t Codepoint) {
96 Node *N = Root.get();
97 bool IsBeforeMedial = false;
98 for (auto ChIt = Name.begin(); ChIt != Name.end();
99 ChIt += (IsBeforeMedial ? 3 : 1)) {
100 char Ch = *ChIt;
101 assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
102
103 std::string Label(1, Ch);
104
105 // We need to ensure a node never ends or starts by
106 // a medial hyphen as this would break the
107 // loose matching algorithm.
108 IsBeforeMedial = llvm::isAlnum(C: Ch) && ChIt + 1 != Name.end() &&
109 *(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
110 llvm::isAlnum(C: *(ChIt + 2));
111 if (IsBeforeMedial)
112 Label.assign(first: ChIt, last: ChIt + 3);
113
114 auto It = llvm::find_if(Range&: N->Children,
115 P: [&](const auto &C) { return C->Name == Label; });
116 if (It == N->Children.end()) {
117 It = N->Children.insert(position: It, x: std::make_unique<Node>(args&: Label, args&: N));
118 }
119 N = It->get();
120 }
121 N->Value = Codepoint;
122 }
123
124 void compact() { compact(N: Root.get()); }
125
126 // This creates 2 arrays of bytes from the tree:
127 // A serialized dictionary of node labels,
128 // And the nodes themselves.
129 // The name of each label is found by indexing into the dictionary.
130 // The longest names are inserted first into the dictionary,
131 // in the hope it will contain shorter labels as substring,
132 // thereby reducing duplication.
133 // We could theorically be more clever by trying to minimizing the size
134 // of the dictionary.
135 std::pair<std::string, std::vector<uint8_t>> serialize() {
136 std::set<std::string> Names = this->getNameFragments();
137 std::vector<std::string> Sorted(Names.begin(), Names.end());
138 llvm::sort(C&: Sorted, Comp: [](const auto &a, const auto &b) {
139 return a.size() > b.size();
140 });
141 std::string Dict(Letters.begin(), Letters.end());
142 Dict.reserve(res_arg: 50000);
143 for (const std::string &Name : Sorted) {
144 if (Name.size() <= 1)
145 continue;
146 if (Dict.find(str: Name) != std::string::npos)
147 continue;
148 Dict += Name;
149 }
150
151 if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
152 fprintf(stderr, format: "Dictionary too big to be serialized");
153 exit(status: 1);
154 }
155
156 auto Bytes = dumpIndex(Dict);
157 return {Dict, Bytes};
158 }
159
160 std::set<std::string> getNameFragments() {
161 std::set<std::string> Keys;
162 collectKeys(N: Root.get(), Keys);
163 return Keys;
164 }
165
166 // Maps a valid char in an Unicode character name
167 // To a 6 bits index.
168 static uint8_t letter(char C) {
169 auto Pos = Letters.find(C);
170 assert(Pos != std::string::npos &&
171 "Invalid letter in Unicode character name");
172 return Pos;
173 }
174
175 // clang-format off
176 // +================+============+======================+=============+========+===+==============+===============+
177 // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
178 // +================+============+======================+=============+========+===+==============+===============+
179 // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
180 // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
181 // clang-format on
182
183 std::vector<uint8_t> dumpIndex(const std::string &Dict) {
184 struct ChildrenOffset {
185 Node *FirstChild;
186 std::size_t Offset;
187 bool HasValue;
188 };
189
190 // Keep track of the start of each node
191 // position in the serialized data.
192 std::unordered_map<Node *, int32_t> Offsets;
193
194 // Keep track of where to write the index
195 // of the first children
196 std::vector<ChildrenOffset> ChildrenOffsets;
197 std::unordered_map<Node *, bool> SiblingTracker;
198 std::deque<Node *> AllNodes;
199 std::vector<uint8_t> Bytes;
200 Bytes.reserve(n: 250'000);
201 // This leading byte is used by the reading code to detect the root node.
202 Bytes.push_back(x: 0);
203
204 auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
205 for (std::size_t Index = 0; Index < Children.size(); Index++) {
206 const std::unique_ptr<Node> &Child = Children[Index];
207 AllNodes.push_back(x: Child.get());
208 if (Index != Children.size() - 1)
209 SiblingTracker[Child.get()] = true;
210 }
211 };
212 CollectChildren(Root->Children);
213
214 while (!AllNodes.empty()) {
215 const std::size_t Offset = Bytes.size();
216 Node *const N = AllNodes.front();
217 AllNodes.pop_front();
218
219 assert(!N->Name.empty());
220 Offsets[N] = Offset;
221
222 uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
223 // Single letter node are indexed in 6 bits
224 if (N->Name.size() == 1) {
225 FirstByte |= letter(C: N->Name[0]);
226 Bytes.push_back(x: FirstByte);
227 } else {
228 // Otherwise we use a 16 bits index
229 FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
230 Bytes.push_back(x: FirstByte);
231 auto PosInDict = Dict.find(str: N->Name);
232 assert(PosInDict != std::string::npos);
233 uint8_t Low = PosInDict;
234 uint8_t High = ((PosInDict >> 8) & 0xFF);
235 Bytes.push_back(x: High);
236 Bytes.push_back(x: Low);
237 }
238
239 const bool HasSibling = SiblingTracker.count(x: N) != 0;
240 const bool HasChildren = N->Children.size() != 0;
241
242 if (!!N->Value) {
243 uint32_t Value = (*(N->Value) << 3);
244 uint8_t H = ((Value >> 16) & 0xFF);
245 uint8_t M = ((Value >> 8) & 0xFF);
246 uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
247 uint8_t(HasChildren ? 0x02 : 0);
248
249 Bytes.push_back(x: H);
250 Bytes.push_back(x: M);
251 Bytes.push_back(x: L);
252
253 if (HasChildren) {
254 ChildrenOffsets.push_back(
255 x: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size(), .HasValue: true});
256 // index of the first children
257 Bytes.push_back(x: 0x00);
258 Bytes.push_back(x: 0x00);
259 Bytes.push_back(x: 0x00);
260 }
261 } else {
262 // When there is no value (that's most intermediate nodes)
263 // Dispense of the 3 values bytes, and only store
264 // 1 byte to track whether the node has sibling and children
265 // + 2 bytes for the index of the first children if necessary.
266 // That index also uses bytes 0-6 of the previous byte.
267 uint8_t Byte =
268 uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
269 Bytes.push_back(x: Byte);
270 if (HasChildren) {
271 ChildrenOffsets.emplace_back(
272 args: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size() - 1, .HasValue: false});
273 Bytes.push_back(x: 0x00);
274 Bytes.push_back(x: 0x00);
275 }
276 }
277 CollectChildren(N->Children);
278 }
279
280 // Once all the nodes are in the inndex
281 // Fill the bytes we left to indicate the position
282 // of the children
283 for (const ChildrenOffset &Parent : ChildrenOffsets) {
284 const auto It = Offsets.find(x: Parent.FirstChild);
285 assert(It != Offsets.end());
286 std::size_t Pos = It->second;
287 if (Parent.HasValue) {
288 Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
289 } else {
290 Bytes[Parent.Offset] =
291 Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
292 }
293 Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
294 Bytes[Parent.Offset + 2] = Pos & 0xFF;
295 }
296
297 // Add some padding so that the deserialization code
298 // doesn't try to read past the enf of the array.
299 Bytes.push_back(x: 0);
300 Bytes.push_back(x: 0);
301 Bytes.push_back(x: 0);
302 Bytes.push_back(x: 0);
303 Bytes.push_back(x: 0);
304 Bytes.push_back(x: 0);
305
306 return Bytes;
307 }
308
309private:
310 void collectKeys(Node *N, std::set<std::string> &Keys) {
311 Keys.insert(x: N->Name);
312 for (const std::unique_ptr<Node> &Child : N->Children) {
313 collectKeys(N: Child.get(), Keys);
314 }
315 }
316
317 // Merge sequences of 1-character nodes
318 // This greatly reduce the total number of nodes,
319 // and therefore the size of the index.
320 // When the tree gets serialized, we only have 5 bytes to store the
321 // size of a name. Overlong names (>32 characters) are therefore
322 // kep into separate nodes
323 void compact(Node *N) {
324 for (auto &&Child : N->Children) {
325 compact(N: Child.get());
326 }
327 if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
328 (N->Parent->Name.size() + N->Name.size() <= 32)) {
329 N->Parent->Value = N->Value;
330 N->Parent->Name += N->Name;
331 N->Parent->Children = std::move(N->Children);
332 for (std::unique_ptr<Node> &c : N->Parent->Children) {
333 c->Parent = N->Parent;
334 }
335 }
336 }
337 struct Node {
338 Node(std::string Name, Node *Parent = nullptr)
339 : Name(Name), Parent(Parent) {}
340
341 std::vector<std::unique_ptr<Node>> Children;
342 std::string Name;
343 Node *Parent = nullptr;
344 std::optional<char32_t> Value;
345 };
346
347 std::unique_ptr<Node> Root = std::make_unique<Node>(args: "");
348};
349
350extern const char *UnicodeLicense;
351
352int main(int argc, char **argv) {
353 printf(format: "Unicode name -> codepoint mapping generator\n"
354 "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
355 argv[0]);
356 printf(format: "NameAliases.txt can be found at "
357 "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
358 "UnicodeData.txt can be found at "
359 "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
360
361 if (argc != 4)
362 return EXIT_FAILURE;
363
364 FILE *Out = fopen(filename: argv[3], modes: "w");
365 if (!Out) {
366 printf(format: "Error creating output file.\n");
367 return EXIT_FAILURE;
368 }
369
370 Trie T;
371 uint32_t NameCount = 0;
372 std::size_t LongestName = 0;
373 auto Entries = loadDataFiles(NamesFile: argv[1], AliasesFile: argv[2]);
374 for (const std::pair<const char32_t, std::string> &Entry : Entries) {
375 char32_t Codepoint = Entry.first;
376 const std::string &Name = Entry.second;
377 // Ignore names which are not valid.
378 if (Name.empty() ||
379 !llvm::all_of(Range: Name, P: [](char C) { return Letters.contains(C); })) {
380 continue;
381 }
382 printf(format: "%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
383 T.insert(Name, Codepoint);
384 LongestName =
385 std::max(a: LongestName, b: std::size_t(llvm::count_if(Range: Name, P: llvm::isAlnum)));
386 NameCount++;
387 }
388 T.compact();
389
390 std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
391 const std::string &Dict = Data.first;
392 const std::vector<uint8_t> &Tree = Data.second;
393
394 fprintf(stream: Out, format: R"(
395//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
396// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
397// See https://llvm.org/LICENSE.txt for license information.
398// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
399//
400//===----------------------------------------------------------------------===//
401//
402// This file implements mapping the name of a unicode code point to its value.
403//
404// This file was generated using %s.
405// Do not edit manually.
406//
407//===----------------------------------------------------------------------===//
408%s
409
410
411
412#include "llvm/Support/Compiler.h"
413#include <cstddef>
414#include <cstdint>
415)",
416 argv[0], UnicodeLicense);
417
418 fprintf(stream: Out,
419 format: "namespace llvm { namespace sys { namespace unicode { \n"
420 "extern const char *const UnicodeNameToCodepointDict;\n"
421 "extern const uint8_t *const UnicodeNameToCodepointIndex;\n"
422 "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
423 "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
424
425 fprintf(stream: Out, format: "const char *const UnicodeNameToCodepointDict = \"%s\";\n",
426 Dict.c_str());
427
428 fprintf(stream: Out, format: "const uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
429 Tree.size() + 1);
430
431 for (auto Byte : Tree) {
432 fprintf(stream: Out, format: "0x%02x,", Byte);
433 }
434
435 fprintf(stream: Out, format: "0};");
436 fprintf(stream: Out, format: "const uint8_t *const UnicodeNameToCodepointIndex = "
437 "UnicodeNameToCodepointIndex_; \n");
438 fprintf(stream: Out, format: "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
439 Tree.size() + 1);
440 fprintf(stream: Out,
441 format: "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
442 LongestName);
443 fprintf(stream: Out, format: "\n}}}\n");
444 fclose(stream: Out);
445 printf(format: "Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
446 argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
447}
448
449const char *UnicodeLicense = R"(
450/*
451UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
452
453See Terms of Use <https://www.unicode.org/copyright.html>
454for definitions of Unicode Inc.’s Data Files and Software.
455
456NOTICE TO USER: Carefully read the following legal agreement.
457BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
458DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
459YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
460TERMS AND CONDITIONS OF THIS AGREEMENT.
461IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
462THE DATA FILES OR SOFTWARE.
463
464COPYRIGHT AND PERMISSION NOTICE
465
466Copyright © 1991-2022 Unicode, Inc. All rights reserved.
467Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
468
469Permission is hereby granted, free of charge, to any person obtaining
470a copy of the Unicode data files and any associated documentation
471(the "Data Files") or Unicode software and any associated documentation
472(the "Software") to deal in the Data Files or Software
473without restriction, including without limitation the rights to use,
474copy, modify, merge, publish, distribute, and/or sell copies of
475the Data Files or Software, and to permit persons to whom the Data Files
476or Software are furnished to do so, provided that either
477(a) this copyright and permission notice appear with all copies
478of the Data Files or Software, or
479(b) this copyright and permission notice appear in associated
480Documentation.
481
482THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
483ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
484WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
485NONINFRINGEMENT OF THIRD PARTY RIGHTS.
486IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
487NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
488DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
489DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
490TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
491PERFORMANCE OF THE DATA FILES OR SOFTWARE.
492
493Except as contained in this notice, the name of a copyright holder
494shall not be used in advertising or otherwise to promote the sale,
495use or other dealings in these Data Files or Software without prior
496written authorization of the copyright holder.
497*/
498)";
499