UnicodeNameMappingGenerator.cpp source code [llvm_projects/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp]

1	//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10	// using UnicodeData.txt and NameAliases.txt available at
11	// https://unicode.org/Public/15.1.0/ucd/
12	//===----------------------------------------------------------------------===//
13
14	#include "llvm/ADT/STLExtras.h"
15	#include "llvm/ADT/StringExtras.h"
16	#include "llvm/ADT/StringRef.h"
17	#include <algorithm>
18	#include <deque>
19	#include <fstream>
20	#include <memory>
21	#include <optional>
22	#include <set>
23	#include <string>
24	#include <unordered_map>
25	#include <utility>
26	#include <vector>
27
28	static const llvm::StringRef Letters =
29	" _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
30
31	// Collect names UnicodeData.txt and AliasNames.txt
32	// There may be multiple names per code points.
33	static std::unordered_multimap<char32_t, std::string>
34	loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
35	std::unordered_multimap<char32_t, std::string> CollectedCharacters;
36	auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
37	std::ifstream InputFile(File);
38	for (std::string Line; getline(is&: InputFile, str&: Line);) {
39	if (Line.empty() \|\| !isxdigit(Line [`0`]))
40	continue;
41	auto FirstSemiPos = Line.find(c: `';'`);
42	if (FirstSemiPos == std::string::npos)
43	continue;
44	auto SecondSemiPos = Line.find(c: `';'`, pos: FirstSemiPos + `1`);
45	if (SecondSemiPos == std::string::npos)
46	continue;
47	unsigned long long CodePoint;
48	if (llvm::getAsUnsignedInteger(
49	Str: llvm::StringRef (Line.c_str(), FirstSemiPos), Radix: `16`, Result&: CodePoint)) {
50	continue;
51	}
52
53	std::string Name =
54	Line.substr(pos: FirstSemiPos + `1`, n: SecondSemiPos - FirstSemiPos - `1`);
55
56	if (!Name.empty() && Name [`0`] == `'<'`) {
57	// Ignore ranges of characters, as their name is either absent or
58	// generated.
59	continue;
60	}
61
62	// Some aliases are ignored for compatibility with C++
63	if (IsAliasFile) {
64	std::string Kind = Line.substr(pos: SecondSemiPos + `1`);
65	if (Kind != "control" && Kind != "correction" && Kind != "alternate")
66	continue;
67	}
68
69	auto InsertUnique = [&](char32_t CP, std::string Name) {
70	auto It = CollectedCharacters.find(x: CP);
71	while (It != std::end(cont&: CollectedCharacters) && It ->first == CP) {
72	if (It ->second == Name)
73	return;
74	++It;
75	}
76	CollectedCharacters.insert(x: {CP, std::move(Name)});
77	};
78	InsertUnique(CodePoint, std::move(Name));
79	}
80	};
81
82	FromFile (NamesFile);
83	FromFile (AliasesFile, true);
84	return CollectedCharacters;
85	}
86
87	class Trie {
88	struct Node;
89
90	public:
91	// When inserting named codepoint
92	// We create a node per character in the name.
93	// SPARKLE becomes S <- P <- A <- R <- K <- L <- E
94	// Once all characters are inserted, the tree is compacted
95	void insert(llvm::StringRef Name, char32_t Codepoint) {
96	Node *N = Root.get();
97	bool IsBeforeMedial = false;
98	for (auto ChIt = Name.begin(); ChIt != Name.end();
99	ChIt += (IsBeforeMedial ? `3` : `1`)) {
100	char Ch = *ChIt;
101	assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
102
103	std::string Label(`1`, Ch);
104
105	// We need to ensure a node never ends or starts by
106	// a medial hyphen as this would break the
107	// loose matching algorithm.
108	IsBeforeMedial = llvm::isAlnum(C: Ch) && ChIt + `1` != Name.end() &&
109	*(ChIt + `1`) == `'-'` && ChIt + `2` != Name.end() &&
110	llvm::isAlnum(C: *(ChIt + `2`));
111	if (IsBeforeMedial)
112	Label.assign(first: ChIt, last: ChIt + `3`);
113
114	auto It = llvm::find_if(Range&: N->Children,
115	P: [&](const auto &C) { return C->Name == Label; });
116	if (It == N->Children.end()) {
117	It = N->Children.insert(position: It, x: std::make_unique<Node>(args&: Label, args&: N));
118	}
119	N = It ->get();
120	}
121	N->Value = Codepoint;
122	}
123
124	void compact() { compact(N: Root.get()); }
125
126	// This creates 2 arrays of bytes from the tree:
127	// A serialized dictionary of node labels,
128	// And the nodes themselves.
129	// The name of each label is found by indexing into the dictionary.
130	// The longest names are inserted first into the dictionary,
131	// in the hope it will contain shorter labels as substring,
132	// thereby reducing duplication.
133	// We could theorically be more clever by trying to minimizing the size
134	// of the dictionary.
135	std::pair<std::string, std::vector<uint8_t>> serialize() {
136	std::set<std::string> Names = this->getNameFragments();
137	std::vector<std::string> Sorted(Names.begin(), Names.end());
138	llvm::sort(C&: Sorted, Comp: [](const auto &a, const auto &b) {
139	return a.size() > b.size();
140	});
141	std::string Dict(Letters.begin(), Letters.end());
142	Dict.reserve(res_arg: `50000`);
143	for (const std::string &Name : Sorted) {
144	if (Name.size() <= `1`)
145	continue;
146	if (Dict.find(str: Name) != std::string::npos)
147	continue;
148	Dict += Name;
149	}
150
151	if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
152	fprintf(stderr, format: "Dictionary too big to be serialized");
153	exit(status: `1`);
154	}
155
156	auto Bytes = dumpIndex(Dict);
157	return {Dict, Bytes};
158	}
159
160	std::set<std::string> getNameFragments() {
161	std::set<std::string> Keys;
162	collectKeys(N: Root.get(), Keys);
163	return Keys;
164	}
165
166	// Maps a valid char in an Unicode character name
167	// To a 6 bits index.
168	static uint8_t letter(char C) {
169	auto Pos = Letters.find(C);
170	assert(Pos != std::string::npos &&
171	"Invalid letter in Unicode character name");
172	return Pos;
173	}
174
175	// clang-format off
176	// +================+============+======================+=============+========+===+==============+===============+
177	// \| 0 \| 1 \| 2-7 (6) \| 8-23 \| 24-44 \| \| 46 \| 47 \|
178	// +================+============+======================+=============+========+===+==============+===============+
179	// \| Has Value \| Has Long Name \| Letter OR Name Size \| Dict Index \| Value \| \| Has Sibling \| Has Children \|
180	// +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
181	// clang-format on
182
183	std::vector<uint8_t> dumpIndex(const std::string &Dict) {
184	struct ChildrenOffset {
185	Node *FirstChild;
186	std::size_t Offset;
187	bool HasValue;
188	};
189
190	// Keep track of the start of each node
191	// position in the serialized data.
192	std::unordered_map<Node *, int32_t> Offsets;
193
194	// Keep track of where to write the index
195	// of the first children
196	std::vector<ChildrenOffset> ChildrenOffsets;
197	std::unordered_map<Node , bool*> SiblingTracker;
198	std::deque<Node *> AllNodes;
199	std::vector<uint8_t> Bytes;
200	Bytes.reserve(n: `250'000`);
201	// This leading byte is used by the reading code to detect the root node.
202	Bytes.push_back(x: `0`);
203
204	auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
205	for (std::size_t Index = `0`; Index < Children.size(); Index++) {
206	const std::unique_ptr<Node> &Child = Children[Index];
207	AllNodes.push_back(x: Child.get());
208	if (Index != Children.size() - `1`)
209	SiblingTracker [Child.get()] = true;
210	}
211	};
212	CollectChildren(Root ->Children);
213
214	while (!AllNodes.empty()) {
215	const std::size_t Offset = Bytes.size();
216	Node *const N = AllNodes.front();
217	AllNodes.pop_front();
218
219	assert(!N->Name.empty());
220	Offsets [N] = Offset;
221
222	uint8_t FirstByte = (!!N->Value) ? `0x80` : `0`;
223	// Single letter node are indexed in 6 bits
224	if (N->Name.size() == `1`) {
225	FirstByte \|= letter(C: N->Name [`0`]);
226	Bytes.push_back(x: FirstByte);
227	} else {
228	// Otherwise we use a 16 bits index
229	FirstByte = FirstByte \| uint8_t(N->Name.size()) \| `0x40`;
230	Bytes.push_back(x: FirstByte);
231	auto PosInDict = Dict.find(str: N->Name);
232	assert(PosInDict != std::string::npos);
233	uint8_t Low = PosInDict;
234	uint8_t High = ((PosInDict >> `8`) & `0xFF`);
235	Bytes.push_back(x: High);
236	Bytes.push_back(x: Low);
237	}
238
239	const bool HasSibling = SiblingTracker.count(x: N) != `0`;
240	const bool HasChildren = N->Children.size() != `0`;
241
242	if (!!N->Value) {
243	uint32_t Value = (*(N->Value) << `3`);
244	uint8_t H = ((Value >> `16`) & `0xFF`);
245	uint8_t M = ((Value >> `8`) & `0xFF`);
246	uint8_t L = (Value & `0xFF`) \| uint8_t(HasSibling ? `0x01` : `0`) \|
247	uint8_t(HasChildren ? `0x02` : `0`);
248
249	Bytes.push_back(x: H);
250	Bytes.push_back(x: M);
251	Bytes.push_back(x: L);
252
253	if (HasChildren) {
254	ChildrenOffsets.push_back(
255	x: ChildrenOffset{.FirstChild: N->Children [`0`].get(), .Offset: Bytes.size(), .HasValue: true});
256	// index of the first children
257	Bytes.push_back(x: `0x00`);
258	Bytes.push_back(x: `0x00`);
259	Bytes.push_back(x: `0x00`);
260	}
261	} else {
262	// When there is no value (that's most intermediate nodes)
263	// Dispense of the 3 values bytes, and only store
264	// 1 byte to track whether the node has sibling and children
265	// + 2 bytes for the index of the first children if necessary.
266	// That index also uses bytes 0-6 of the previous byte.
267	uint8_t Byte =
268	uint8_t(HasSibling ? `0x80` : `0`) \| uint8_t(HasChildren ? `0x40` : `0`);
269	Bytes.push_back(x: Byte);
270	if (HasChildren) {
271	ChildrenOffsets.emplace_back(
272	args: ChildrenOffset{.FirstChild: N->Children [`0`].get(), .Offset: Bytes.size() - `1`, .HasValue: false});
273	Bytes.push_back(x: `0x00`);
274	Bytes.push_back(x: `0x00`);
275	}
276	}
277	CollectChildren(N->Children);
278	}
279
280	// Once all the nodes are in the inndex
281	// Fill the bytes we left to indicate the position
282	// of the children
283	for (const ChildrenOffset &Parent : ChildrenOffsets) {
284	const auto It = Offsets.find(x: Parent.FirstChild);
285	assert(It != Offsets.end());
286	std::size_t Pos = It ->second;
287	if (Parent.HasValue) {
288	Bytes [Parent.Offset] = ((Pos >> `16`) & `0xFF`);
289	} else {
290	Bytes [Parent.Offset] =
291	Bytes [Parent.Offset] \| uint8_t((Pos >> `16`) & `0xFF`);
292	}
293	Bytes [Parent.Offset + `1`] = ((Pos >> `8`) & `0xFF`);
294	Bytes [Parent.Offset + `2`] = Pos & `0xFF`;
295	}
296
297	// Add some padding so that the deserialization code
298	// doesn't try to read past the enf of the array.
299	Bytes.push_back(x: `0`);
300	Bytes.push_back(x: `0`);
301	Bytes.push_back(x: `0`);
302	Bytes.push_back(x: `0`);
303	Bytes.push_back(x: `0`);
304	Bytes.push_back(x: `0`);
305
306	return Bytes;
307	}
308
309	private:
310	void collectKeys(Node *N, std::set<std::string> &Keys) {
311	Keys.insert(x: N->Name);
312	for (const std::unique_ptr<Node> &Child : N->Children) {
313	collectKeys(N: Child.get(), Keys);
314	}
315	}
316
317	// Merge sequences of 1-character nodes
318	// This greatly reduce the total number of nodes,
319	// and therefore the size of the index.
320	// When the tree gets serialized, we only have 5 bytes to store the
321	// size of a name. Overlong names (>32 characters) are therefore
322	// kep into separate nodes
323	void compact(Node *N) {
324	for (auto &&Child : N->Children) {
325	compact(N: Child.get());
326	}
327	if (N->Parent && N->Parent->Children.size() == `1` && !N->Parent->Value &&
328	(N->Parent->Name.size() + N->Name.size() <= `32`)) {
329	N->Parent->Value = N->Value;
330	N->Parent->Name += N->Name;
331	N->Parent->Children = std::move(N->Children);
332	for (std::unique_ptr<Node> &c : N->Parent->Children) {
333	c ->Parent = N->Parent;
334	}
335	}
336	}
337	struct Node {
338	Node(std::string Name, Node Parent = nullptr*)
339	: Name (Name), Parent(Parent) {}
340
341	std::vector<std::unique_ptr<Node>> Children;
342	std::string Name;
343	Node Parent = nullptr*;
344	std::optional<char32_t> Value;
345	};
346
347	std::unique_ptr<Node> Root = std::make_unique<Node>(args: "");
348	};
349
350	extern const char *UnicodeLicense;
351
352	int main(int argc, char **argv) {
353	printf(format: "Unicode name -> codepoint mapping generator\n"
354	"Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
355	argv[`0`]);
356	printf(format: "NameAliases.txt can be found at "
357	"https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
358	"UnicodeData.txt can be found at "
359	"https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
360
361	if (argc != `4`)
362	return EXIT_FAILURE;
363
364	FILE *Out = fopen(filename: argv[`3`], modes: "w");
365	if (!Out) {
366	printf(format: "Error creating output file.\n");
367	return EXIT_FAILURE;
368	}
369
370	Trie T;
371	uint32_t NameCount = `0`;
372	std::size_t LongestName = `0`;
373	auto Entries = loadDataFiles(NamesFile: argv[`1`], AliasesFile: argv[`2`]);
374	for (const std::pair<const char32_t, std::string> &Entry : Entries) {
375	char32_t Codepoint = Entry.first;
376	const std::string &Name = Entry.second;
377	// Ignore names which are not valid.
378	if (Name.empty() \|\|
379	!llvm::all_of(Range: Name, P: [](char C) { return Letters.contains(C); })) {
380	continue;
381	}
382	printf(format: "%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
383	T.insert(Name, Codepoint);
384	LongestName =
385	std::max(a: LongestName, b: std::size_t(llvm::count_if(Range: Name, P: llvm::isAlnum)));
386	NameCount++;
387	}
388	T.compact();
389
390	std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
391	const std::string &Dict = Data.first;
392	const std::vector<uint8_t> &Tree = Data.second;
393
394	fprintf(stream: Out, format: R"(
395	//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
396	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
397	// See https://llvm.org/LICENSE.txt for license information.
398	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
399	//
400	//===----------------------------------------------------------------------===//
401	//
402	// This file implements mapping the name of a unicode code point to its value.
403	//
404	// This file was generated using %s.
405	// Do not edit manually.
406	//
407	//===----------------------------------------------------------------------===//
408	%s
409
410
411
412	#include "llvm/Support/Compiler.h"
413	#include <cstddef>
414	#include <cstdint>
415	)",
416	argv[`0`], UnicodeLicense);
417
418	fprintf(stream: Out,
419	format: "namespace llvm { namespace sys { namespace unicode { \n"
420	"extern const char *const UnicodeNameToCodepointDict;\n"
421	"extern const uint8_t *const UnicodeNameToCodepointIndex;\n"
422	"extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
423	"extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
424
425	fprintf(stream: Out, format: "const char *const UnicodeNameToCodepointDict = \"%s\";\n",
426	Dict.c_str());
427
428	fprintf(stream: Out, format: "const uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
429	Tree.size() + `1`);
430
431	for (auto Byte : Tree) {
432	fprintf(stream: Out, format: "0x%02x,", Byte);
433	}
434
435	fprintf(stream: Out, format: "0};");
436	fprintf(stream: Out, format: "const uint8_t *const UnicodeNameToCodepointIndex = "
437	"UnicodeNameToCodepointIndex_; \n");
438	fprintf(stream: Out, format: "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
439	Tree.size() + `1`);
440	fprintf(stream: Out,
441	format: "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
442	LongestName);
443	fprintf(stream: Out, format: "\n}}}\n");
444	fclose(stream: Out);
445	printf(format: "Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
446	argv[`3`], NameCount, Tree.size() / `1024.0`, Dict.size() / `1024.0`);
447	}
448
449	const char *UnicodeLicense = R"(
450	/*
451	UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
452
453	See Terms of Use <https://www.unicode.org/copyright.html>
454	for definitions of Unicode Inc.’s Data Files and Software.
455
456	NOTICE TO USER: Carefully read the following legal agreement.
457	BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
458	DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
459	YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
460	TERMS AND CONDITIONS OF THIS AGREEMENT.
461	IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
462	THE DATA FILES OR SOFTWARE.
463
464	COPYRIGHT AND PERMISSION NOTICE
465
466	Copyright © 1991-2022 Unicode, Inc. All rights reserved.
467	Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
468
469	Permission is hereby granted, free of charge, to any person obtaining
470	a copy of the Unicode data files and any associated documentation
471	(the "Data Files") or Unicode software and any associated documentation
472	(the "Software") to deal in the Data Files or Software
473	without restriction, including without limitation the rights to use,
474	copy, modify, merge, publish, distribute, and/or sell copies of
475	the Data Files or Software, and to permit persons to whom the Data Files
476	or Software are furnished to do so, provided that either
477	(a) this copyright and permission notice appear with all copies
478	of the Data Files or Software, or
479	(b) this copyright and permission notice appear in associated
480	Documentation.
481
482	THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
483	ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
484	WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
485	NONINFRINGEMENT OF THIRD PARTY RIGHTS.
486	IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
487	NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
488	DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
489	DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
490	TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
491	PERFORMANCE OF THE DATA FILES OR SOFTWARE.
492
493	Except as contained in this notice, the name of a copyright holder
494	shall not be used in advertising or otherwise to promote the sale,
495	use or other dealings in these Data Files or Software without prior
496	written authorization of the copyright holder.
497	*/
498	)";
499

Browse the source code of llvm_projects/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp