1 | //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp |
10 | // using UnicodeData.txt and NameAliases.txt available at |
11 | // https://unicode.org/Public/15.1.0/ucd/ |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/ADT/STLExtras.h" |
15 | #include "llvm/ADT/StringExtras.h" |
16 | #include "llvm/ADT/StringRef.h" |
17 | #include <algorithm> |
18 | #include <array> |
19 | #include <deque> |
20 | #include <fstream> |
21 | #include <memory> |
22 | #include <optional> |
23 | #include <set> |
24 | #include <string> |
25 | #include <unordered_map> |
26 | #include <utility> |
27 | #include <vector> |
28 | |
29 | static const llvm::StringRef Letters = |
30 | " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" ; |
31 | |
32 | // Collect names UnicodeData.txt and AliasNames.txt |
33 | // There may be multiple names per code points. |
34 | static std::unordered_multimap<char32_t, std::string> |
35 | loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) { |
36 | std::unordered_multimap<char32_t, std::string> CollectedCharacters; |
37 | auto FromFile = [&](const std::string &File, bool IsAliasFile = false) { |
38 | std::ifstream InputFile(File); |
39 | for (std::string Line; getline(is&: InputFile, str&: Line);) { |
40 | if (Line.empty() || !isxdigit(Line[0])) |
41 | continue; |
42 | auto FirstSemiPos = Line.find(c: ';'); |
43 | if (FirstSemiPos == std::string::npos) |
44 | continue; |
45 | auto SecondSemiPos = Line.find(c: ';', pos: FirstSemiPos + 1); |
46 | if (SecondSemiPos == std::string::npos) |
47 | continue; |
48 | unsigned long long CodePoint; |
49 | if (llvm::getAsUnsignedInteger( |
50 | Str: llvm::StringRef(Line.c_str(), FirstSemiPos), Radix: 16, Result&: CodePoint)) { |
51 | continue; |
52 | } |
53 | |
54 | std::string Name = |
55 | Line.substr(pos: FirstSemiPos + 1, n: SecondSemiPos - FirstSemiPos - 1); |
56 | |
57 | if (!Name.empty() && Name[0] == '<') { |
58 | // Ignore ranges of characters, as their name is either absent or |
59 | // generated. |
60 | continue; |
61 | } |
62 | |
63 | // Some aliases are ignored for compatibility with C++ |
64 | if (IsAliasFile) { |
65 | std::string Kind = Line.substr(pos: SecondSemiPos + 1); |
66 | if (Kind != "control" && Kind != "correction" && Kind != "alternate" ) |
67 | continue; |
68 | } |
69 | |
70 | auto InsertUnique = [&](char32_t CP, std::string Name) { |
71 | auto It = CollectedCharacters.find(x: CP); |
72 | while (It != std::end(cont&: CollectedCharacters) && It->first == CP) { |
73 | if (It->second == Name) |
74 | return; |
75 | ++It; |
76 | } |
77 | CollectedCharacters.insert(x: {CP, std::move(Name)}); |
78 | }; |
79 | InsertUnique(CodePoint, std::move(Name)); |
80 | } |
81 | }; |
82 | |
83 | FromFile(NamesFile); |
84 | FromFile(AliasesFile, true); |
85 | return CollectedCharacters; |
86 | } |
87 | |
88 | class Trie { |
89 | struct Node; |
90 | |
91 | public: |
92 | // When inserting named codepoint |
93 | // We create a node per character in the name. |
94 | // SPARKLE becomes S <- P <- A <- R <- K <- L <- E |
95 | // Once all characters are inserted, the tree is compacted |
96 | void insert(llvm::StringRef Name, char32_t Codepoint) { |
97 | Node *N = Root.get(); |
98 | bool IsBeforeMedial = false; |
99 | for (auto ChIt = Name.begin(); ChIt != Name.end(); |
100 | ChIt += (IsBeforeMedial ? 3 : 1)) { |
101 | char Ch = *ChIt; |
102 | assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name" ); |
103 | |
104 | std::string Label(1, Ch); |
105 | |
106 | // We need to ensure a node never ends or starts by |
107 | // a medial hyphen as this would break the |
108 | // loose matching algorithm. |
109 | IsBeforeMedial = llvm::isAlnum(C: Ch) && ChIt + 1 != Name.end() && |
110 | *(ChIt + 1) == '-' && ChIt + 2 != Name.end() && |
111 | llvm::isAlnum(C: *(ChIt + 2)); |
112 | if (IsBeforeMedial) |
113 | Label.assign(first: ChIt, last: ChIt + 3); |
114 | |
115 | auto It = llvm::find_if(Range&: N->Children, |
116 | P: [&](const auto &C) { return C->Name == Label; }); |
117 | if (It == N->Children.end()) { |
118 | It = N->Children.insert(position: It, x: std::make_unique<Node>(args&: Label, args&: N)); |
119 | } |
120 | N = It->get(); |
121 | } |
122 | N->Value = Codepoint; |
123 | } |
124 | |
125 | void compact() { compact(N: Root.get()); } |
126 | |
127 | // This creates 2 arrays of bytes from the tree: |
128 | // A serialized dictionary of node labels, |
129 | // And the nodes themselves. |
130 | // The name of each label is found by indexing into the dictionary. |
131 | // The longest names are inserted first into the dictionary, |
132 | // in the hope it will contain shorter labels as substring, |
133 | // thereby reducing duplication. |
134 | // We could theorically be more clever by trying to minimizing the size |
135 | // of the dictionary. |
136 | std::pair<std::string, std::vector<uint8_t>> serialize() { |
137 | std::set<std::string> Names = this->getNameFragments(); |
138 | std::vector<std::string> Sorted(Names.begin(), Names.end()); |
139 | llvm::sort(C&: Sorted, Comp: [](const auto &a, const auto &b) { |
140 | return a.size() > b.size(); |
141 | }); |
142 | std::string Dict(Letters.begin(), Letters.end()); |
143 | Dict.reserve(res_arg: 50000); |
144 | for (const std::string &Name : Sorted) { |
145 | if (Name.size() <= 1) |
146 | continue; |
147 | if (Dict.find(str: Name) != std::string::npos) |
148 | continue; |
149 | Dict += Name; |
150 | } |
151 | |
152 | if (Dict.size() >= std::numeric_limits<uint16_t>::max()) { |
153 | fprintf(stderr, format: "Dictionary too big to be serialized" ); |
154 | exit(status: 1); |
155 | } |
156 | |
157 | auto Bytes = dumpIndex(Dict); |
158 | return {Dict, Bytes}; |
159 | } |
160 | |
161 | std::set<std::string> getNameFragments() { |
162 | std::set<std::string> Keys; |
163 | collectKeys(N: Root.get(), Keys); |
164 | return Keys; |
165 | } |
166 | |
167 | // Maps a valid char in an Unicode character name |
168 | // To a 6 bits index. |
169 | static uint8_t letter(char C) { |
170 | auto Pos = Letters.find(C); |
171 | assert(Pos != std::string::npos && |
172 | "Invalid letter in Unicode character name" ); |
173 | return Pos; |
174 | } |
175 | |
176 | // clang-format off |
177 | // +================+============+======================+=============+========+===+==============+===============+ |
178 | // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 | |
179 | // +================+============+======================+=============+========+===+==============+===============+ |
180 | // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children | |
181 | // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+ |
182 | // clang-format on |
183 | |
184 | std::vector<uint8_t> dumpIndex(const std::string &Dict) { |
185 | struct ChildrenOffset { |
186 | Node *FirstChild; |
187 | std::size_t Offset; |
188 | bool HasValue; |
189 | }; |
190 | |
191 | // Keep track of the start of each node |
192 | // position in the serialized data. |
193 | std::unordered_map<Node *, int32_t> Offsets; |
194 | |
195 | // Keep track of where to write the index |
196 | // of the first children |
197 | std::vector<ChildrenOffset> ChildrenOffsets; |
198 | std::unordered_map<Node *, bool> SiblingTracker; |
199 | std::deque<Node *> AllNodes; |
200 | std::vector<uint8_t> Bytes; |
201 | Bytes.reserve(n: 250'000); |
202 | // This leading byte is used by the reading code to detect the root node. |
203 | Bytes.push_back(x: 0); |
204 | |
205 | auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) { |
206 | for (std::size_t Index = 0; Index < Children.size(); Index++) { |
207 | const std::unique_ptr<Node> &Child = Children[Index]; |
208 | AllNodes.push_back(x: Child.get()); |
209 | if (Index != Children.size() - 1) |
210 | SiblingTracker[Child.get()] = true; |
211 | } |
212 | }; |
213 | CollectChildren(Root->Children); |
214 | |
215 | while (!AllNodes.empty()) { |
216 | const std::size_t Offset = Bytes.size(); |
217 | Node *const N = AllNodes.front(); |
218 | AllNodes.pop_front(); |
219 | |
220 | assert(!N->Name.empty()); |
221 | Offsets[N] = Offset; |
222 | |
223 | uint8_t FirstByte = (!!N->Value) ? 0x80 : 0; |
224 | // Single letter node are indexed in 6 bits |
225 | if (N->Name.size() == 1) { |
226 | FirstByte |= letter(C: N->Name[0]); |
227 | Bytes.push_back(x: FirstByte); |
228 | } else { |
229 | // Otherwise we use a 16 bits index |
230 | FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40; |
231 | Bytes.push_back(x: FirstByte); |
232 | auto PosInDict = Dict.find(str: N->Name); |
233 | assert(PosInDict != std::string::npos); |
234 | uint8_t Low = PosInDict; |
235 | uint8_t High = ((PosInDict >> 8) & 0xFF); |
236 | Bytes.push_back(x: High); |
237 | Bytes.push_back(x: Low); |
238 | } |
239 | |
240 | const bool HasSibling = SiblingTracker.count(x: N) != 0; |
241 | const bool HasChildren = N->Children.size() != 0; |
242 | |
243 | if (!!N->Value) { |
244 | uint32_t Value = (*(N->Value) << 3); |
245 | uint8_t H = ((Value >> 16) & 0xFF); |
246 | uint8_t M = ((Value >> 8) & 0xFF); |
247 | uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) | |
248 | uint8_t(HasChildren ? 0x02 : 0); |
249 | |
250 | Bytes.push_back(x: H); |
251 | Bytes.push_back(x: M); |
252 | Bytes.push_back(x: L); |
253 | |
254 | if (HasChildren) { |
255 | ChildrenOffsets.push_back( |
256 | x: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size(), .HasValue: true}); |
257 | // index of the first children |
258 | Bytes.push_back(x: 0x00); |
259 | Bytes.push_back(x: 0x00); |
260 | Bytes.push_back(x: 0x00); |
261 | } |
262 | } else { |
263 | // When there is no value (that's most intermediate nodes) |
264 | // Dispense of the 3 values bytes, and only store |
265 | // 1 byte to track whether the node has sibling and children |
266 | // + 2 bytes for the index of the first children if necessary. |
267 | // That index also uses bytes 0-6 of the previous byte. |
268 | uint8_t Byte = |
269 | uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0); |
270 | Bytes.push_back(x: Byte); |
271 | if (HasChildren) { |
272 | ChildrenOffsets.emplace_back( |
273 | args: ChildrenOffset{.FirstChild: N->Children[0].get(), .Offset: Bytes.size() - 1, .HasValue: false}); |
274 | Bytes.push_back(x: 0x00); |
275 | Bytes.push_back(x: 0x00); |
276 | } |
277 | } |
278 | CollectChildren(N->Children); |
279 | } |
280 | |
281 | // Once all the nodes are in the inndex |
282 | // Fill the bytes we left to indicate the position |
283 | // of the children |
284 | for (const ChildrenOffset &Parent : ChildrenOffsets) { |
285 | const auto It = Offsets.find(x: Parent.FirstChild); |
286 | assert(It != Offsets.end()); |
287 | std::size_t Pos = It->second; |
288 | if (Parent.HasValue) { |
289 | Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF); |
290 | } else { |
291 | Bytes[Parent.Offset] = |
292 | Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF); |
293 | } |
294 | Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF); |
295 | Bytes[Parent.Offset + 2] = Pos & 0xFF; |
296 | } |
297 | |
298 | // Add some padding so that the deserialization code |
299 | // doesn't try to read past the enf of the array. |
300 | Bytes.push_back(x: 0); |
301 | Bytes.push_back(x: 0); |
302 | Bytes.push_back(x: 0); |
303 | Bytes.push_back(x: 0); |
304 | Bytes.push_back(x: 0); |
305 | Bytes.push_back(x: 0); |
306 | |
307 | return Bytes; |
308 | } |
309 | |
310 | private: |
311 | void collectKeys(Node *N, std::set<std::string> &Keys) { |
312 | Keys.insert(x: N->Name); |
313 | for (const std::unique_ptr<Node> &Child : N->Children) { |
314 | collectKeys(N: Child.get(), Keys); |
315 | } |
316 | } |
317 | |
318 | // Merge sequences of 1-character nodes |
319 | // This greatly reduce the total number of nodes, |
320 | // and therefore the size of the index. |
321 | // When the tree gets serialized, we only have 5 bytes to store the |
322 | // size of a name. Overlong names (>32 characters) are therefore |
323 | // kep into separate nodes |
324 | void compact(Node *N) { |
325 | for (auto &&Child : N->Children) { |
326 | compact(N: Child.get()); |
327 | } |
328 | if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value && |
329 | (N->Parent->Name.size() + N->Name.size() <= 32)) { |
330 | N->Parent->Value = N->Value; |
331 | N->Parent->Name += N->Name; |
332 | N->Parent->Children = std::move(N->Children); |
333 | for (std::unique_ptr<Node> &c : N->Parent->Children) { |
334 | c->Parent = N->Parent; |
335 | } |
336 | } |
337 | } |
338 | struct Node { |
339 | Node(std::string Name, Node *Parent = nullptr) |
340 | : Name(Name), Parent(Parent) {} |
341 | |
342 | std::vector<std::unique_ptr<Node>> Children; |
343 | std::string Name; |
344 | Node *Parent = nullptr; |
345 | std::optional<char32_t> Value; |
346 | }; |
347 | |
348 | std::unique_ptr<Node> Root = std::make_unique<Node>(args: "" ); |
349 | }; |
350 | |
351 | extern const char *UnicodeLicense; |
352 | |
353 | int main(int argc, char **argv) { |
354 | printf(format: "Unicode name -> codepoint mapping generator\n" |
355 | "Usage: %s UnicodeData.txt NameAliases.txt output\n\n" , |
356 | argv[0]); |
357 | printf(format: "NameAliases.txt can be found at " |
358 | "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n" |
359 | "UnicodeData.txt can be found at " |
360 | "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n" ); |
361 | |
362 | if (argc != 4) |
363 | return EXIT_FAILURE; |
364 | |
365 | FILE *Out = fopen(filename: argv[3], modes: "w" ); |
366 | if (!Out) { |
367 | printf(format: "Error creating output file.\n" ); |
368 | return EXIT_FAILURE; |
369 | } |
370 | |
371 | Trie T; |
372 | uint32_t NameCount = 0; |
373 | std::size_t LongestName = 0; |
374 | auto Entries = loadDataFiles(NamesFile: argv[1], AliasesFile: argv[2]); |
375 | for (const std::pair<const char32_t, std::string> &Entry : Entries) { |
376 | char32_t Codepoint = Entry.first; |
377 | const std::string &Name = Entry.second; |
378 | // Ignore names which are not valid. |
379 | if (Name.empty() || |
380 | !llvm::all_of(Range: Name, P: [](char C) { return Letters.contains(C); })) { |
381 | continue; |
382 | } |
383 | printf(format: "%06x: %s\n" , static_cast<unsigned int>(Codepoint), Name.c_str()); |
384 | T.insert(Name, Codepoint); |
385 | LongestName = |
386 | std::max(a: LongestName, b: std::size_t(llvm::count_if(Range: Name, P: llvm::isAlnum))); |
387 | NameCount++; |
388 | } |
389 | T.compact(); |
390 | |
391 | std::pair<std::string, std::vector<uint8_t>> Data = T.serialize(); |
392 | const std::string &Dict = Data.first; |
393 | const std::vector<uint8_t> &Tree = Data.second; |
394 | |
395 | fprintf(stream: Out, format: R"( |
396 | //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===// |
397 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
398 | // See https://llvm.org/LICENSE.txt for license information. |
399 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
400 | // |
401 | //===----------------------------------------------------------------------===// |
402 | // |
403 | // This file implements mapping the name of a unicode code point to its value. |
404 | // |
405 | // This file was generated using %s. |
406 | // Do not edit manually. |
407 | // |
408 | //===----------------------------------------------------------------------===// |
409 | %s |
410 | |
411 | |
412 | |
413 | #include "llvm/Support/Compiler.h" |
414 | #include <cstddef> |
415 | #include <cstdint> |
416 | )" , |
417 | argv[0], UnicodeLicense); |
418 | |
419 | fprintf(stream: Out, |
420 | format: "namespace llvm { namespace sys { namespace unicode { \n" |
421 | "extern const char *UnicodeNameToCodepointDict;\n" |
422 | "extern const uint8_t *UnicodeNameToCodepointIndex;\n" |
423 | "extern const std::size_t UnicodeNameToCodepointIndexSize;\n" |
424 | "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n" ); |
425 | |
426 | fprintf(stream: Out, format: "const char* UnicodeNameToCodepointDict = \"%s\";\n" , |
427 | Dict.c_str()); |
428 | |
429 | fprintf(stream: Out, format: "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n" , |
430 | Tree.size() + 1); |
431 | |
432 | for (auto Byte : Tree) { |
433 | fprintf(stream: Out, format: "0x%02x," , Byte); |
434 | } |
435 | |
436 | fprintf(stream: Out, format: "0};" ); |
437 | fprintf(stream: Out, format: "const uint8_t* UnicodeNameToCodepointIndex = " |
438 | "UnicodeNameToCodepointIndex_; \n" ); |
439 | fprintf(stream: Out, format: "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n" , |
440 | Tree.size() + 1); |
441 | fprintf(stream: Out, |
442 | format: "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n" , |
443 | LongestName); |
444 | fprintf(stream: Out, format: "\n}}}\n" ); |
445 | fclose(stream: Out); |
446 | printf(format: "Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n" , |
447 | argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0); |
448 | } |
449 | |
450 | const char *UnicodeLicense = R"( |
451 | /* |
452 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE |
453 | |
454 | See Terms of Use <https://www.unicode.org/copyright.html> |
455 | for definitions of Unicode Inc.’s Data Files and Software. |
456 | |
457 | NOTICE TO USER: Carefully read the following legal agreement. |
458 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S |
459 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), |
460 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE |
461 | TERMS AND CONDITIONS OF THIS AGREEMENT. |
462 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE |
463 | THE DATA FILES OR SOFTWARE. |
464 | |
465 | COPYRIGHT AND PERMISSION NOTICE |
466 | |
467 | Copyright © 1991-2022 Unicode, Inc. All rights reserved. |
468 | Distributed under the Terms of Use in https://www.unicode.org/copyright.html. |
469 | |
470 | Permission is hereby granted, free of charge, to any person obtaining |
471 | a copy of the Unicode data files and any associated documentation |
472 | (the "Data Files") or Unicode software and any associated documentation |
473 | (the "Software") to deal in the Data Files or Software |
474 | without restriction, including without limitation the rights to use, |
475 | copy, modify, merge, publish, distribute, and/or sell copies of |
476 | the Data Files or Software, and to permit persons to whom the Data Files |
477 | or Software are furnished to do so, provided that either |
478 | (a) this copyright and permission notice appear with all copies |
479 | of the Data Files or Software, or |
480 | (b) this copyright and permission notice appear in associated |
481 | Documentation. |
482 | |
483 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF |
484 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE |
485 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
486 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. |
487 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS |
488 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL |
489 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, |
490 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
491 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
492 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. |
493 | |
494 | Except as contained in this notice, the name of a copyright holder |
495 | shall not be used in advertising or otherwise to promote the sale, |
496 | use or other dealings in these Data Files or Software without prior |
497 | written authorization of the copyright holder. |
498 | */ |
499 | )" ; |
500 | |