1//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10// using UnicodeData.txt and NameAliases.txt available at
11// https://unicode.org/Public/15.1.0/ucd/
12//===----------------------------------------------------------------------===//
13
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/StringExtras.h"
16#include "llvm/ADT/StringRef.h"
17#include <algorithm>
18#include <array>
19#include <deque>
20#include <fstream>
21#include <memory>
22#include <optional>
23#include <set>
24#include <string>
25#include <unordered_map>
26#include <utility>
27#include <vector>
28
29static const llvm::StringRef Letters =
30 " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
31
32// Collect names UnicodeData.txt and AliasNames.txt
33// There may be multiple names per code points.
34static std::unordered_multimap<char32_t, std::string>
35loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
36 std::unordered_multimap<char32_t, std::string> CollectedCharacters;
37 auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
38 std::ifstream InputFile(File);
39 for (std::string Line; getline(is&: InputFile, str&: Line);) {
40 if (Line.empty() || !isxdigit(Line[0]))
41 continue;
42 auto FirstSemiPos = Line.find(c: ';');
43 if (FirstSemiPos == std::string::npos)
44 continue;
45 auto SecondSemiPos = Line.find(c: ';', pos: FirstSemiPos + 1);
46 if (SecondSemiPos == std::string::npos)
47 continue;
48 unsigned long long CodePoint;
49 if (llvm::getAsUnsignedInteger(
50 Str: llvm::StringRef(Line.c_str(), FirstSemiPos), Radix: 16, Result&: CodePoint)) {
51 continue;
52 }
53
54 std::string Name =
55 Line.substr(pos: FirstSemiPos + 1, n: SecondSemiPos - FirstSemiPos - 1);
56
57 if (!Name.empty() && Name[0] == '<') {
58 // Ignore ranges of characters, as their name is either absent or
59 // generated.
60 continue;
61 }
62
63 // Some aliases are ignored for compatibility with C++
64 if (IsAliasFile) {
65 std::string Kind = Line.substr(pos: SecondSemiPos + 1);
66 if (Kind != "control" && Kind != "correction" && Kind != "alternate")
67 continue;
68 }
69
70 auto InsertUnique = [&](char32_t CP, std::string Name) {
71 auto It = CollectedCharacters.find(x: CP);
72 while (It != std::end(cont&: CollectedCharacters) && It->first == CP) {
73 if (It->second == Name)
74 return;
75 ++It;
76 }
77 CollectedCharacters.insert(x: {CP, std::move(Name)});
78 };
79 InsertUnique(CodePoint, std::move(Name));
80 }
81 };
82
83 FromFile(NamesFile);
84 FromFile(AliasesFile, true);
85 return CollectedCharacters;
86}
87
88class Trie {
89 struct Node;
90
91public:
92 // When inserting named codepoint
93 // We create a node per character in the name.
94 // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95 // Once all characters are inserted, the tree is compacted
96 void insert(llvm::StringRef Name, char32_t Codepoint) {
97 Node *N = Root.get();
98 bool IsBeforeMedial = false;
99 for (auto ChIt = Name.begin(); ChIt != Name.end();
100 ChIt += (IsBeforeMedial ? 3 : 1)) {
101 char Ch = *ChIt;
102 assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
103
104 std::string Label(1, Ch);
105
106 // We need to ensure a node never ends or starts by
107 // a medial hyphen as this would break the
108 // loose matching algorithm.
109 IsBeforeMedial = llvm::isAlnum(C: Ch) && ChIt + 1 != Name.end() &&
110 *(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
111 llvm::isAlnum(C: *(ChIt + 2));
112 if (IsBeforeMedial)
113 Label.assign(first: ChIt, last: ChIt + 3);
114
115 auto It = llvm::find_if(Range&: N->Children,
116 P: [&](const auto &C) { return C->Name == Label; });
117 if (It == N->Children.end()) {
118 It = N->Children.insert(position: It, x: std::make_unique<Node>(args&: Label, args&: N));
119 }
120 N = It->get();
121 }
122 N->Value = Codepoint;
123 }
124
125 void compact() { compact(N: Root.get()); }
126
127 // This creates 2 arrays of bytes from the tree:
128 // A serialized dictionary of node labels,
129 // And the nodes themselves.
130 // The name of each label is found by indexing into the dictionary.
131 // The longest names are inserted first into the dictionary,
132 // in the hope it will contain shorter labels as substring,
133 // thereby reducing duplication.
134 // We could theorically be more clever by trying to minimizing the size
135 // of the dictionary.
136 std::pair<std::string, std::vector<uint8_t>> serialize() {
137 std::set<std::string> Names = this->getNameFragments();
138 std::vector<std::string> Sorted(Names.begin(), Names.end());
139 llvm::sort(C&: Sorted, Comp: [](const auto &a, const auto &b) {
140 return a.size() > b.size();
141 });
142 std::string Dict(Letters.begin(), Letters.end());
143 Dict.reserve(res_arg: 50000);
144 for (const std::string &Name : Sorted) {
145 if (Name.size() <= 1)
146 continue;
147 if (Dict.find(str: Name) != std::string::npos)
148 continue;
149 Dict += Name;
150 }
151
152 if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
153 fprintf(stderr, format: "Dictionary too big to be serialized");
154 exit(status: 1);
155 }
156
157 auto Bytes = dumpIndex(Dict);
158 return {Dict, Bytes};
159 }
160
161 std::set<std::string> getNameFragments() {
162 std::set<std::string> Keys;
163 collectKeys(N: Root.get(), Keys);
164 return Keys;
165 }
166
167 // Maps a valid char in an Unicode character name
168 // To a 6 bits index.
169 static uint8_t letter(char C) {
170 auto Pos = Letters.find(C);
171 assert(Pos != std::string::npos &&
172 "Invalid letter in Unicode character name");
173 return Pos;
174 }
175
176 // clang-format off
177 // +================+============+======================+=============+========+===+==============+===============+
178 // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
179 // +================+============+======================+=============+========+===+==============+===============+
180 // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
181 // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
182 // clang-format on
183
184 std::vector<uint8_t> dumpIndex(const std::string &Dict) {
185 struct ChildrenOffset {
186 Node *FirstChild;
187 std::size_t Offset;
188 bool HasValue;
189 };
190
191 // Keep track of the start of each node
192 // position in the serialized data.
193 std::unordered_map<Node *, int32_t> Offsets;
194
195 // Keep track of where to write the index
196 // of the first children
197 std::vector<ChildrenOffset> ChildrenOffsets;
198 std::unordered_map<Node *, bool> SiblingTracker;
199 std::deque<Node *> AllNodes;
200 std::vector<uint8_t> Bytes;
201 Bytes.reserve(n: 250'000);
202 // This leading byte is used by the reading code to detect the root node.
203 Bytes.push_back(x: 0);
204
205 auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
206 for (std::size_t Index = 0; Index < Children.size(); Index++) {
207 const std::unique_ptr<Node> &Child = Children[Index];
208 AllNodes.push_back(x: Child.get());
209 if (Index != Children.size() - 1)
210 SiblingTracker[Child.get()] = true;
211 }
212 };
213 CollectChildren(Root->Children);
214
215 while (!AllNodes.empty()) {
216 const std::size_t Offset = Bytes.size();
217 Node *const N = AllNodes.front();
218 AllNodes.pop_front();
219
220 assert(!N->Name.empty());
221 Offsets[N] = Offset;
222
223 uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
224 // Single letter node are indexed in 6 bits
225 if (N->Name.size() == 1) {
226 FirstByte |= letter(C: N->Name[0]);
227 Bytes.push_back(x: FirstByte);
228 } else {
229 // Otherwise we use a 16 bits index
230 FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
231 Bytes.push_back(x: FirstByte);
232 auto PosInDict = Dict.find(str: N->Name);
233 assert(PosInDict != std::string::npos);
234 uint8_t Low = PosInDict;
235 uint8_t High = ((PosInDict >> 8) & 0xFF);
236 Bytes.push_back(x: High);
237 Bytes.push_back(x: Low);
238 }
239
240 const bool HasSibling = SiblingTracker.count(x: N) != 0;
241 const bool HasChildren = N->Children.size() != 0;
242
243 if (!!N->Value) {
244 uint32_t Value = (*(N->Value) << 3);
245 uint8_t H = ((Value >> 16) & 0xFF);
246 uint8_t M = ((Value >> 8) & 0xFF);
247 uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
248 uint8_t(HasChildren ? 0x02 : 0);
249
250 Bytes.push_back(x: H);
251 Bytes.push_back(x: M);
252 Bytes.push_back(x: L);
253
254 if (HasChildren) {
255 ChildrenOffsets.push_back(
256 x: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size(), .HasValue: true});
257 // index of the first children
258 Bytes.push_back(x: 0x00);
259 Bytes.push_back(x: 0x00);
260 Bytes.push_back(x: 0x00);
261 }
262 } else {
263 // When there is no value (that's most intermediate nodes)
264 // Dispense of the 3 values bytes, and only store
265 // 1 byte to track whether the node has sibling and children
266 // + 2 bytes for the index of the first children if necessary.
267 // That index also uses bytes 0-6 of the previous byte.
268 uint8_t Byte =
269 uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
270 Bytes.push_back(x: Byte);
271 if (HasChildren) {
272 ChildrenOffsets.emplace_back(
273 args: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size() - 1, .HasValue: false});
274 Bytes.push_back(x: 0x00);
275 Bytes.push_back(x: 0x00);
276 }
277 }
278 CollectChildren(N->Children);
279 }
280
281 // Once all the nodes are in the inndex
282 // Fill the bytes we left to indicate the position
283 // of the children
284 for (const ChildrenOffset &Parent : ChildrenOffsets) {
285 const auto It = Offsets.find(x: Parent.FirstChild);
286 assert(It != Offsets.end());
287 std::size_t Pos = It->second;
288 if (Parent.HasValue) {
289 Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
290 } else {
291 Bytes[Parent.Offset] =
292 Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
293 }
294 Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
295 Bytes[Parent.Offset + 2] = Pos & 0xFF;
296 }
297
298 // Add some padding so that the deserialization code
299 // doesn't try to read past the enf of the array.
300 Bytes.push_back(x: 0);
301 Bytes.push_back(x: 0);
302 Bytes.push_back(x: 0);
303 Bytes.push_back(x: 0);
304 Bytes.push_back(x: 0);
305 Bytes.push_back(x: 0);
306
307 return Bytes;
308 }
309
310private:
311 void collectKeys(Node *N, std::set<std::string> &Keys) {
312 Keys.insert(x: N->Name);
313 for (const std::unique_ptr<Node> &Child : N->Children) {
314 collectKeys(N: Child.get(), Keys);
315 }
316 }
317
318 // Merge sequences of 1-character nodes
319 // This greatly reduce the total number of nodes,
320 // and therefore the size of the index.
321 // When the tree gets serialized, we only have 5 bytes to store the
322 // size of a name. Overlong names (>32 characters) are therefore
323 // kep into separate nodes
324 void compact(Node *N) {
325 for (auto &&Child : N->Children) {
326 compact(N: Child.get());
327 }
328 if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
329 (N->Parent->Name.size() + N->Name.size() <= 32)) {
330 N->Parent->Value = N->Value;
331 N->Parent->Name += N->Name;
332 N->Parent->Children = std::move(N->Children);
333 for (std::unique_ptr<Node> &c : N->Parent->Children) {
334 c->Parent = N->Parent;
335 }
336 }
337 }
338 struct Node {
339 Node(std::string Name, Node *Parent = nullptr)
340 : Name(Name), Parent(Parent) {}
341
342 std::vector<std::unique_ptr<Node>> Children;
343 std::string Name;
344 Node *Parent = nullptr;
345 std::optional<char32_t> Value;
346 };
347
348 std::unique_ptr<Node> Root = std::make_unique<Node>(args: "");
349};
350
351extern const char *UnicodeLicense;
352
353int main(int argc, char **argv) {
354 printf(format: "Unicode name -> codepoint mapping generator\n"
355 "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
356 argv[0]);
357 printf(format: "NameAliases.txt can be found at "
358 "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
359 "UnicodeData.txt can be found at "
360 "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
361
362 if (argc != 4)
363 return EXIT_FAILURE;
364
365 FILE *Out = fopen(filename: argv[3], modes: "w");
366 if (!Out) {
367 printf(format: "Error creating output file.\n");
368 return EXIT_FAILURE;
369 }
370
371 Trie T;
372 uint32_t NameCount = 0;
373 std::size_t LongestName = 0;
374 auto Entries = loadDataFiles(NamesFile: argv[1], AliasesFile: argv[2]);
375 for (const std::pair<const char32_t, std::string> &Entry : Entries) {
376 char32_t Codepoint = Entry.first;
377 const std::string &Name = Entry.second;
378 // Ignore names which are not valid.
379 if (Name.empty() ||
380 !llvm::all_of(Range: Name, P: [](char C) { return Letters.contains(C); })) {
381 continue;
382 }
383 printf(format: "%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
384 T.insert(Name, Codepoint);
385 LongestName =
386 std::max(a: LongestName, b: std::size_t(llvm::count_if(Range: Name, P: llvm::isAlnum)));
387 NameCount++;
388 }
389 T.compact();
390
391 std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
392 const std::string &Dict = Data.first;
393 const std::vector<uint8_t> &Tree = Data.second;
394
395 fprintf(stream: Out, format: R"(
396//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
397// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
398// See https://llvm.org/LICENSE.txt for license information.
399// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
400//
401//===----------------------------------------------------------------------===//
402//
403// This file implements mapping the name of a unicode code point to its value.
404//
405// This file was generated using %s.
406// Do not edit manually.
407//
408//===----------------------------------------------------------------------===//
409%s
410
411
412
413#include "llvm/Support/Compiler.h"
414#include <cstddef>
415#include <cstdint>
416)",
417 argv[0], UnicodeLicense);
418
419 fprintf(stream: Out,
420 format: "namespace llvm { namespace sys { namespace unicode { \n"
421 "extern const char *UnicodeNameToCodepointDict;\n"
422 "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
423 "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
424 "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
425
426 fprintf(stream: Out, format: "const char* UnicodeNameToCodepointDict = \"%s\";\n",
427 Dict.c_str());
428
429 fprintf(stream: Out, format: "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
430 Tree.size() + 1);
431
432 for (auto Byte : Tree) {
433 fprintf(stream: Out, format: "0x%02x,", Byte);
434 }
435
436 fprintf(stream: Out, format: "0};");
437 fprintf(stream: Out, format: "const uint8_t* UnicodeNameToCodepointIndex = "
438 "UnicodeNameToCodepointIndex_; \n");
439 fprintf(stream: Out, format: "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
440 Tree.size() + 1);
441 fprintf(stream: Out,
442 format: "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
443 LongestName);
444 fprintf(stream: Out, format: "\n}}}\n");
445 fclose(stream: Out);
446 printf(format: "Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
447 argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
448}
449
450const char *UnicodeLicense = R"(
451/*
452UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
453
454See Terms of Use <https://www.unicode.org/copyright.html>
455for definitions of Unicode Inc.’s Data Files and Software.
456
457NOTICE TO USER: Carefully read the following legal agreement.
458BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
459DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
460YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
461TERMS AND CONDITIONS OF THIS AGREEMENT.
462IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
463THE DATA FILES OR SOFTWARE.
464
465COPYRIGHT AND PERMISSION NOTICE
466
467Copyright © 1991-2022 Unicode, Inc. All rights reserved.
468Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
469
470Permission is hereby granted, free of charge, to any person obtaining
471a copy of the Unicode data files and any associated documentation
472(the "Data Files") or Unicode software and any associated documentation
473(the "Software") to deal in the Data Files or Software
474without restriction, including without limitation the rights to use,
475copy, modify, merge, publish, distribute, and/or sell copies of
476the Data Files or Software, and to permit persons to whom the Data Files
477or Software are furnished to do so, provided that either
478(a) this copyright and permission notice appear with all copies
479of the Data Files or Software, or
480(b) this copyright and permission notice appear in associated
481Documentation.
482
483THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
484ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
485WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
486NONINFRINGEMENT OF THIRD PARTY RIGHTS.
487IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
488NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
489DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
490DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
491TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
492PERFORMANCE OF THE DATA FILES OR SOFTWARE.
493
494Except as contained in this notice, the name of a copyright holder
495shall not be used in advertising or otherwise to promote the sale,
496use or other dealings in these Data Files or Software without prior
497written authorization of the copyright holder.
498*/
499)";
500