1//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10// using UnicodeData.txt and NameAliases.txt available at
11// https://unicode.org/Public/draft/ucd/
12//===----------------------------------------------------------------------===//
13
14#include "llvm/ADT/DenseMap.h"
15#include "llvm/ADT/STLExtras.h"
16#include "llvm/ADT/SmallPtrSet.h"
17#include "llvm/ADT/StringExtras.h"
18#include "llvm/ADT/StringRef.h"
19#include <algorithm>
20#include <deque>
21#include <fstream>
22#include <memory>
23#include <optional>
24#include <set>
25#include <string>
26#include <unordered_map>
27#include <utility>
28#include <vector>
29
30static const llvm::StringRef Letters =
31 " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
32
33// Collect names UnicodeData.txt and AliasNames.txt
34// There may be multiple names per code points.
35static std::unordered_multimap<char32_t, std::string>
36loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
37 std::unordered_multimap<char32_t, std::string> CollectedCharacters;
38 auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
39 std::ifstream InputFile(File);
40 for (std::string Line; getline(is&: InputFile, str&: Line);) {
41 if (Line.empty() || !isxdigit(Line[0]))
42 continue;
43 auto FirstSemiPos = Line.find(c: ';');
44 if (FirstSemiPos == std::string::npos)
45 continue;
46 auto SecondSemiPos = Line.find(c: ';', pos: FirstSemiPos + 1);
47 if (SecondSemiPos == std::string::npos)
48 continue;
49 unsigned long long CodePoint;
50 if (llvm::getAsUnsignedInteger(
51 Str: llvm::StringRef(Line.c_str(), FirstSemiPos), Radix: 16, Result&: CodePoint)) {
52 continue;
53 }
54
55 std::string Name =
56 Line.substr(pos: FirstSemiPos + 1, n: SecondSemiPos - FirstSemiPos - 1);
57
58 if (!Name.empty() && Name[0] == '<') {
59 // Ignore ranges of characters, as their name is either absent or
60 // generated.
61 continue;
62 }
63
64 // Some aliases are ignored for compatibility with C++
65 if (IsAliasFile) {
66 std::string Kind = Line.substr(pos: SecondSemiPos + 1);
67 if (Kind != "control" && Kind != "correction" && Kind != "alternate")
68 continue;
69 }
70
71 auto InsertUnique = [&](char32_t CP, std::string Name) {
72 auto It = CollectedCharacters.find(x: CP);
73 while (It != std::end(cont&: CollectedCharacters) && It->first == CP) {
74 if (It->second == Name)
75 return;
76 ++It;
77 }
78 CollectedCharacters.insert(x: {CP, std::move(Name)});
79 };
80 InsertUnique(CodePoint, std::move(Name));
81 }
82 };
83
84 FromFile(NamesFile);
85 FromFile(AliasesFile, true);
86 return CollectedCharacters;
87}
88
89class Trie {
90 struct Node;
91
92public:
93 // When inserting named codepoint
94 // We create a node per character in the name.
95 // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
96 // Once all characters are inserted, the tree is compacted
97 void insert(llvm::StringRef Name, char32_t Codepoint) {
98 Node *N = Root.get();
99 bool IsBeforeMedial = false;
100 for (auto ChIt = Name.begin(); ChIt != Name.end();
101 ChIt += (IsBeforeMedial ? 3 : 1)) {
102 char Ch = *ChIt;
103 assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
104
105 std::string Label(1, Ch);
106
107 // We need to ensure a node never ends or starts by
108 // a medial hyphen as this would break the
109 // loose matching algorithm.
110 IsBeforeMedial = llvm::isAlnum(C: Ch) && ChIt + 1 != Name.end() &&
111 *(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
112 llvm::isAlnum(C: *(ChIt + 2));
113 if (IsBeforeMedial)
114 Label.assign(first: ChIt, last: ChIt + 3);
115
116 auto It = llvm::find_if(Range&: N->Children,
117 P: [&](const auto &C) { return C->Name == Label; });
118 if (It == N->Children.end()) {
119 It = N->Children.insert(position: It, x: std::make_unique<Node>(args&: Label, args&: N));
120 }
121 N = It->get();
122 }
123 N->Value = Codepoint;
124 }
125
126 void compact() { compact(N: Root.get()); }
127
128 // This creates 2 arrays of bytes from the tree:
129 // A serialized dictionary of node labels,
130 // And the nodes themselves.
131 // The name of each label is found by indexing into the dictionary.
132 // The longest names are inserted first into the dictionary,
133 // in the hope it will contain shorter labels as substring,
134 // thereby reducing duplication.
135 // We could theorically be more clever by trying to minimizing the size
136 // of the dictionary.
137 std::pair<std::string, std::vector<uint8_t>> serialize() {
138 std::set<std::string> Names = this->getNameFragments();
139 std::vector<std::string> Sorted(Names.begin(), Names.end());
140 llvm::sort(C&: Sorted, Comp: [](const auto &a, const auto &b) {
141 return a.size() > b.size();
142 });
143 std::string Dict(Letters.begin(), Letters.end());
144 Dict.reserve(res_arg: 50000);
145 for (const std::string &Name : Sorted) {
146 if (Name.size() <= 1)
147 continue;
148 if (Dict.find(str: Name) != std::string::npos)
149 continue;
150 Dict += Name;
151 }
152
153 if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
154 fprintf(stderr, format: "Dictionary too big to be serialized");
155 exit(status: 1);
156 }
157
158 auto Bytes = dumpIndex(Dict);
159 return {Dict, Bytes};
160 }
161
162 std::set<std::string> getNameFragments() {
163 std::set<std::string> Keys;
164 collectKeys(N: Root.get(), Keys);
165 return Keys;
166 }
167
168 // Maps a valid char in an Unicode character name
169 // To a 6 bits index.
170 static uint8_t letter(char C) {
171 auto Pos = Letters.find(C);
172 assert(Pos != std::string::npos &&
173 "Invalid letter in Unicode character name");
174 return Pos;
175 }
176
177 // clang-format off
178 // +================+============+======================+=============+========+===+==============+===============+
179 // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
180 // +================+============+======================+=============+========+===+==============+===============+
181 // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
182 // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
183 // clang-format on
184
185 std::vector<uint8_t> dumpIndex(const std::string &Dict) {
186 struct ChildrenOffset {
187 Node *FirstChild;
188 std::size_t Offset;
189 bool HasValue;
190 };
191
192 // Keep track of the start of each node
193 // position in the serialized data.
194 llvm::DenseMap<Node *, int32_t> Offsets;
195
196 // Keep track of where to write the index
197 // of the first children
198 std::vector<ChildrenOffset> ChildrenOffsets;
199 llvm::SmallPtrSet<Node *, 16> SiblingTracker;
200 std::deque<Node *> AllNodes;
201 std::vector<uint8_t> Bytes;
202 Bytes.reserve(n: 250'000);
203 // This leading byte is used by the reading code to detect the root node.
204 Bytes.push_back(x: 0);
205
206 auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
207 for (std::size_t Index = 0; Index < Children.size(); Index++) {
208 const std::unique_ptr<Node> &Child = Children[Index];
209 AllNodes.push_back(x: Child.get());
210 if (Index != Children.size() - 1)
211 SiblingTracker.insert(Ptr: Child.get());
212 }
213 };
214 CollectChildren(Root->Children);
215
216 while (!AllNodes.empty()) {
217 const std::size_t Offset = Bytes.size();
218 Node *const N = AllNodes.front();
219 AllNodes.pop_front();
220
221 assert(!N->Name.empty());
222 Offsets[N] = Offset;
223
224 uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
225 // Single letter node are indexed in 6 bits
226 if (N->Name.size() == 1) {
227 FirstByte |= letter(C: N->Name[0]);
228 Bytes.push_back(x: FirstByte);
229 } else {
230 // Otherwise we use a 16 bits index
231 FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
232 Bytes.push_back(x: FirstByte);
233 auto PosInDict = Dict.find(str: N->Name);
234 assert(PosInDict != std::string::npos);
235 uint8_t Low = PosInDict;
236 uint8_t High = ((PosInDict >> 8) & 0xFF);
237 Bytes.push_back(x: High);
238 Bytes.push_back(x: Low);
239 }
240
241 const bool HasSibling = SiblingTracker.contains(Ptr: N);
242 const bool HasChildren = N->Children.size() != 0;
243
244 if (!!N->Value) {
245 uint32_t Value = (*(N->Value) << 3);
246 uint8_t H = ((Value >> 16) & 0xFF);
247 uint8_t M = ((Value >> 8) & 0xFF);
248 uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
249 uint8_t(HasChildren ? 0x02 : 0);
250
251 Bytes.push_back(x: H);
252 Bytes.push_back(x: M);
253 Bytes.push_back(x: L);
254
255 if (HasChildren) {
256 ChildrenOffsets.push_back(
257 x: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size(), .HasValue: true});
258 // index of the first children
259 Bytes.push_back(x: 0x00);
260 Bytes.push_back(x: 0x00);
261 Bytes.push_back(x: 0x00);
262 }
263 } else {
264 // When there is no value (that's most intermediate nodes)
265 // Dispense of the 3 values bytes, and only store
266 // 1 byte to track whether the node has sibling and children
267 // + 2 bytes for the index of the first children if necessary.
268 // That index also uses bytes 0-6 of the previous byte.
269 uint8_t Byte =
270 uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
271 Bytes.push_back(x: Byte);
272 if (HasChildren) {
273 ChildrenOffsets.emplace_back(
274 args: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size() - 1, .HasValue: false});
275 Bytes.push_back(x: 0x00);
276 Bytes.push_back(x: 0x00);
277 }
278 }
279 CollectChildren(N->Children);
280 }
281
282 // Once all the nodes are in the inndex
283 // Fill the bytes we left to indicate the position
284 // of the children
285 for (const ChildrenOffset &Parent : ChildrenOffsets) {
286 const auto It = Offsets.find(Val: Parent.FirstChild);
287 assert(It != Offsets.end());
288 std::size_t Pos = It->second;
289 if (Parent.HasValue) {
290 Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
291 } else {
292 Bytes[Parent.Offset] =
293 Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
294 }
295 Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
296 Bytes[Parent.Offset + 2] = Pos & 0xFF;
297 }
298
299 // Add some padding so that the deserialization code
300 // doesn't try to read past the enf of the array.
301 Bytes.push_back(x: 0);
302 Bytes.push_back(x: 0);
303 Bytes.push_back(x: 0);
304 Bytes.push_back(x: 0);
305 Bytes.push_back(x: 0);
306 Bytes.push_back(x: 0);
307
308 return Bytes;
309 }
310
311private:
312 void collectKeys(Node *N, std::set<std::string> &Keys) {
313 Keys.insert(x: N->Name);
314 for (const std::unique_ptr<Node> &Child : N->Children) {
315 collectKeys(N: Child.get(), Keys);
316 }
317 }
318
319 // Merge sequences of 1-character nodes
320 // This greatly reduce the total number of nodes,
321 // and therefore the size of the index.
322 // When the tree gets serialized, we only have 5 bytes to store the
323 // size of a name. Overlong names (>32 characters) are therefore
324 // kep into separate nodes
325 void compact(Node *N) {
326 for (auto &&Child : N->Children) {
327 compact(N: Child.get());
328 }
329 if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
330 (N->Parent->Name.size() + N->Name.size() <= 32)) {
331 N->Parent->Value = N->Value;
332 N->Parent->Name += N->Name;
333 N->Parent->Children = std::move(N->Children);
334 for (std::unique_ptr<Node> &c : N->Parent->Children) {
335 c->Parent = N->Parent;
336 }
337 }
338 }
339 struct Node {
340 Node(std::string Name, Node *Parent = nullptr)
341 : Name(Name), Parent(Parent) {}
342
343 std::vector<std::unique_ptr<Node>> Children;
344 std::string Name;
345 Node *Parent = nullptr;
346 std::optional<char32_t> Value;
347 };
348
349 std::unique_ptr<Node> Root = std::make_unique<Node>(args: "");
350};
351
352extern const char *UnicodeLicense;
353
354int main(int argc, char **argv) {
355 printf(format: "Unicode name -> codepoint mapping generator\n"
356 "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
357 argv[0]);
358 printf(format: "NameAliases.txt can be found at "
359 "https://unicode.org/Public/draft/ucd/NameAliases.txt\n"
360 "UnicodeData.txt can be found at "
361 "https://unicode.org/Public/draft/ucd/UnicodeData.txt\n\n");
362
363 if (argc != 4)
364 return EXIT_FAILURE;
365
366 FILE *Out = fopen(filename: argv[3], modes: "w");
367 if (!Out) {
368 printf(format: "Error creating output file.\n");
369 return EXIT_FAILURE;
370 }
371
372 Trie T;
373 uint32_t NameCount = 0;
374 std::size_t LongestName = 0;
375 auto Entries = loadDataFiles(NamesFile: argv[1], AliasesFile: argv[2]);
376 for (const std::pair<const char32_t, std::string> &Entry : Entries) {
377 char32_t Codepoint = Entry.first;
378 const std::string &Name = Entry.second;
379 // Ignore names which are not valid.
380 if (Name.empty() ||
381 !llvm::all_of(Range: Name, P: [](char C) { return Letters.contains(C); })) {
382 continue;
383 }
384 printf(format: "%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
385 T.insert(Name, Codepoint);
386 LongestName =
387 std::max(a: LongestName, b: std::size_t(llvm::count_if(Range: Name, P: llvm::isAlnum)));
388 NameCount++;
389 }
390 T.compact();
391
392 std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
393 const std::string &Dict = Data.first;
394 const std::vector<uint8_t> &Tree = Data.second;
395
396 fprintf(stream: Out, format: R"(
397//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
398// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
399// See https://llvm.org/LICENSE.txt for license information.
400// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
401//
402//===----------------------------------------------------------------------===//
403//
404// This file implements mapping the name of a unicode code point to its value.
405//
406// This file was generated using %s.
407// Do not edit manually.
408//
409//===----------------------------------------------------------------------===//
410%s
411
412
413
414#include "llvm/Support/Compiler.h"
415#include <cstddef>
416#include <cstdint>
417)",
418 argv[0], UnicodeLicense);
419
420 fprintf(stream: Out,
421 format: "namespace llvm { namespace sys { namespace unicode { \n"
422 "extern const char *const UnicodeNameToCodepointDict;\n"
423 "extern const uint8_t *const UnicodeNameToCodepointIndex;\n"
424 "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
425 "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
426
427 fprintf(stream: Out, format: "const char *const UnicodeNameToCodepointDict = \"%s\";\n",
428 Dict.c_str());
429
430 fprintf(stream: Out, format: "const uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
431 Tree.size() + 1);
432
433 for (auto Byte : Tree) {
434 fprintf(stream: Out, format: "0x%02x,", Byte);
435 }
436
437 fprintf(stream: Out, format: "0};");
438 fprintf(stream: Out, format: "const uint8_t *const UnicodeNameToCodepointIndex = "
439 "UnicodeNameToCodepointIndex_; \n");
440 fprintf(stream: Out, format: "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
441 Tree.size() + 1);
442 fprintf(stream: Out,
443 format: "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
444 LongestName);
445 fprintf(stream: Out, format: "\n}}}\n");
446 fclose(stream: Out);
447 printf(format: "Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
448 argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
449}
450
451const char *UnicodeLicense = R"(
452/*
453UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
454
455See Terms of Use <https://www.unicode.org/copyright.html>
456for definitions of Unicode Inc.’s Data Files and Software.
457
458NOTICE TO USER: Carefully read the following legal agreement.
459BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
460DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
461YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
462TERMS AND CONDITIONS OF THIS AGREEMENT.
463IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
464THE DATA FILES OR SOFTWARE.
465
466COPYRIGHT AND PERMISSION NOTICE
467
468Copyright © 1991-2022 Unicode, Inc. All rights reserved.
469Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
470
471Permission is hereby granted, free of charge, to any person obtaining
472a copy of the Unicode data files and any associated documentation
473(the "Data Files") or Unicode software and any associated documentation
474(the "Software") to deal in the Data Files or Software
475without restriction, including without limitation the rights to use,
476copy, modify, merge, publish, distribute, and/or sell copies of
477the Data Files or Software, and to permit persons to whom the Data Files
478or Software are furnished to do so, provided that either
479(a) this copyright and permission notice appear with all copies
480of the Data Files or Software, or
481(b) this copyright and permission notice appear in associated
482Documentation.
483
484THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
485ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
486WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
487NONINFRINGEMENT OF THIRD PARTY RIGHTS.
488IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
489NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
490DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
491DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
492TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
493PERFORMANCE OF THE DATA FILES OR SOFTWARE.
494
495Except as contained in this notice, the name of a copyright holder
496shall not be used in advertising or otherwise to promote the sale,
497use or other dealings in these Data Files or Software without prior
498written authorization of the copyright holder.
499*/
500)";
501