ResourceScriptToken.cpp source code [llvm_projects/llvm/tools/llvm-rc/ResourceScriptToken.cpp]

1	//===-- ResourceScriptToken.cpp ---------------------------------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===---------------------------------------------------------------------===//
8	//
9	// This file implements an interface defined in ResourceScriptToken.h.
10	// In particular, it defines an .rc script tokenizer.
11	//
12	//===---------------------------------------------------------------------===//
13
14	#include "ResourceScriptToken.h"
15	#include "llvm/ADT/StringExtras.h"
16	#include "llvm/Support/raw_ostream.h"
17
18	#include <algorithm>
19	#include <cassert>
20	#include <cctype>
21	#include <cstdlib>
22	#include <utility>
23
24	using namespace llvm;
25
26	using Kind = RCToken::Kind;
27
28	// Checks if Representation is a correct description of an RC integer.
29	// It should be a 32-bit unsigned integer, either decimal or hexadecimal
30	// (0x[0-9a-f]+). For Windres mode, it can also be octal (0[0-7]+).
31	// It might be followed by a single 'L' character (that is the difference
32	// between our representation and StringRef's one). If Representation is
33	// correct, 'true' is returned and the return value is put back in Num.
34	static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
35	size_t Length = Representation.size();
36	if (Length == `0`)
37	return false;
38	// Strip the last 'L' if unnecessary.
39	if (std::toupper(c: Representation.back()) == `'L'`)
40	Representation = Representation.drop_back(N: `1`);
41
42	return !Representation.getAsInteger<uint32_t>(Radix: `0`, Result&: Num);
43	}
44
45	RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
46	: TokenKind(RCTokenKind), TokenValue (Value) {}
47
48	uint32_t RCToken::intValue() const {
49	assert(TokenKind == Kind::Int);
50	// We assume that the token already is a correct integer (checked by
51	// rcGetAsInteger).
52	uint32_t Result;
53	bool IsSuccess = rcGetAsInteger(Representation: TokenValue, Num&: Result);
54	assert(IsSuccess);
55	(void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
56	return Result;
57	}
58
59	bool RCToken::isLongInt() const {
60	return TokenKind == Kind::Int && std::toupper(c: TokenValue.back()) == `'L'`;
61	}
62
63	StringRef RCToken::value() const { return TokenValue; }
64
65	Kind RCToken::kind() const { return TokenKind; }
66
67	bool RCToken::isLowPrecedenceBinaryOp() const {
68	switch (TokenKind) {
69	case Kind::Plus:
70	case Kind::Minus:
71	case Kind::Pipe:
72	case Kind::Amp:
73	return true;
74	default:
75	return false;
76	}
77	}
78
79	bool RCToken::isHighPrecedenceBinaryOp() const {
80	switch (TokenKind) {
81	case Kind::Asterisk:
82	case Kind::Slash:
83	return true;
84	default:
85	return false;
86	}
87	}
88
89	static Error getStringError(const Twine &message) {
90	return make_error<StringError>(Args: "Error parsing file: " + message,
91	Args: inconvertibleErrorCode());
92	}
93
94	namespace {
95
96	class Tokenizer {
97	public:
98	Tokenizer(StringRef Input, bool IsWindres)
99	: Data (Input), DataLength(Input.size()), Pos(`0`), IsWindres(IsWindres) {}
100
101	Expected<std::vector<RCToken>> run();
102
103	private:
104	// All 'advancing' methods return boolean values; if they're equal to false,
105	// the stream has ended or failed.
106	bool advance(size_t Amount = `1`);
107	bool skipWhitespaces();
108
109	// Consumes a token. If any problem occurred, a non-empty Error is returned.
110	Error consumeToken(const Kind TokenKind);
111
112	// Check if tokenizer is about to read FollowingChars.
113	bool willNowRead(StringRef FollowingChars) const;
114
115	// Check if tokenizer can start reading an identifier at current position.
116	// The original tool did non specify the rules to determine what is a correct
117	// identifier. We assume they should follow the C convention:
118	// [a-zA-Z_][a-zA-Z0-9_].*
119	bool canStartIdentifier() const;
120	// Check if tokenizer can continue reading an identifier.
121	bool canContinueIdentifier() const;
122
123	// Check if tokenizer can start reading an integer.
124	// A correct integer always starts with a 0-9 digit,
125	// can contain characters 0-9A-Fa-f (digits),
126	// Ll (marking the integer is 32-bit), Xx (marking the representation
127	// is hexadecimal). As some kind of separator should come after the
128	// integer, we can consume the integer until a non-alphanumeric
129	// character.
130	bool canStartInt() const;
131	bool canContinueInt() const;
132	void trimIntString(StringRef &Str) const;
133
134	bool canStartString() const;
135
136	// Check if tokenizer can start reading a single line comment (e.g. a comment
137	// that begins with '//')
138	bool canStartLineComment() const;
139
140	// Check if tokenizer can start or finish reading a block comment (e.g. a
141	// comment that begins with '/' and ends with '/')
142	bool canStartBlockComment() const;
143
144	// Throw away all remaining characters on the current line.
145	void skipCurrentLine();
146
147	bool streamEof() const;
148
149	// Classify the token that is about to be read from the current position.
150	Kind classifyCurrentToken() const;
151
152	// Process the Kind::Identifier token - check if it is
153	// an identifier describing a block start or end.
154	void processIdentifier(RCToken &token) const;
155
156	StringRef Data;
157	size_t DataLength, Pos;
158	bool IsWindres;
159	};
160
161	void Tokenizer::skipCurrentLine() {
162	Pos = Data.find_first_of(Chars: "\r\n", From: Pos);
163	Pos = Data.find_first_not_of(Chars: "\r\n", From: Pos);
164
165	if (Pos == StringRef::npos)
166	Pos = DataLength;
167	}
168
169	Expected<std::vector<RCToken>> Tokenizer::run() {
170	Pos = `0`;
171	std::vector<RCToken> Result;
172
173	// Consume an optional UTF-8 Byte Order Mark.
174	if (willNowRead(FollowingChars: "\xef\xbb\xbf"))
175	advance(Amount: `3`);
176
177	while (!streamEof()) {
178	if (!skipWhitespaces())
179	break;
180
181	Kind TokenKind = classifyCurrentToken();
182	if (TokenKind == Kind::Invalid)
183	return getStringError(message: "Invalid token found at position " + Twine (Pos));
184
185	const size_t TokenStart = Pos;
186	if (Error TokenError = consumeToken(TokenKind))
187	return std::move(TokenError);
188
189	// Comments are just deleted, don't bother saving them.
190	if (TokenKind == Kind::LineComment \|\| TokenKind == Kind::StartComment)
191	continue;
192
193	StringRef Contents = Data.take_front(N: Pos).drop_front(N: TokenStart);
194
195	if (TokenKind == Kind::Int)
196	trimIntString(Str&: Contents);
197
198	RCToken Token(TokenKind, Contents);
199	if (TokenKind == Kind::Identifier) {
200	processIdentifier(token&: Token);
201	} else if (TokenKind == Kind::Int) {
202	uint32_t TokenInt;
203	if (!rcGetAsInteger(Representation: Token.value(), Num&: TokenInt)) {
204	// The integer has incorrect format or cannot be represented in
205	// a 32-bit integer.
206	return getStringError(message: "Integer invalid or too large: " +
207	Token.value().str());
208	}
209	}
210
211	Result.push_back(x: Token);
212	}
213
214	return Result;
215	}
216
217	bool Tokenizer::advance(size_t Amount) {
218	Pos += Amount;
219	return !streamEof();
220	}
221
222	bool Tokenizer::skipWhitespaces() {
223	while (!streamEof() && isSpace(C: Data [Pos]))
224	advance();
225	return !streamEof();
226	}
227
228	Error Tokenizer::consumeToken(const Kind TokenKind) {
229	switch (TokenKind) {
230	// One-character token consumption.
231	#define TOKEN(Name)
232	#define SHORT_TOKEN(Name, Ch) case Kind::Name:
233	#include "ResourceScriptTokenList.def"
234	advance();
235	return Error::success();
236
237	case Kind::LineComment:
238	advance(Amount: `2`);
239	skipCurrentLine();
240	return Error::success();
241
242	case Kind::StartComment: {
243	advance(Amount: `2`);
244	auto EndPos = Data.find(Str: "*/", From: Pos);
245	if (EndPos == StringRef::npos)
246	return getStringError(
247	message: "Unclosed multi-line comment beginning at position " + Twine (Pos));
248	advance(Amount: EndPos - Pos);
249	advance(Amount: `2`);
250	return Error::success();
251	}
252	case Kind::Identifier:
253	while (!streamEof() && canContinueIdentifier())
254	advance();
255	return Error::success();
256
257	case Kind::Int:
258	while (!streamEof() && canContinueInt())
259	advance();
260	return Error::success();
261
262	case Kind::String:
263	// Consume the preceding 'L', if there is any.
264	if (std::toupper(c: Data [Pos]) == `'L'`)
265	advance();
266	// Consume the double-quote.
267	advance();
268
269	// Consume the characters until the end of the file, line or string.
270	while (true) {
271	if (streamEof()) {
272	return getStringError(message: "Unterminated string literal.");
273	} else if (Data [Pos] == `'"'`) {
274	// Consume the ending double-quote.
275	advance();
276	// However, if another '"' follows this double-quote, the string didn't
277	// end and we just included '"' into the string.
278	if (!willNowRead(FollowingChars: "\""))
279	return Error::success();
280	} else if (Data [Pos] == `'\n'`) {
281	return getStringError(message: "String literal not terminated in the line.");
282	}
283
284	advance();
285	}
286
287	case Kind::Invalid:
288	assert(false && "Cannot consume an invalid token.");
289	}
290
291	llvm_unreachable("Unknown RCToken::Kind");
292	}
293
294	bool Tokenizer::willNowRead(StringRef FollowingChars) const {
295	return Data.drop_front(N: Pos).starts_with(Prefix: FollowingChars);
296	}
297
298	bool Tokenizer::canStartIdentifier() const {
299	assert(!streamEof());
300
301	const char CurChar = Data [Pos];
302	return std::isalpha(CurChar) \|\| CurChar == `'_'` \|\| CurChar == `'.'`;
303	}
304
305	bool Tokenizer::canContinueIdentifier() const {
306	assert(!streamEof());
307	const char CurChar = Data [Pos];
308	return std::isalnum(CurChar) \|\| CurChar == `'_'` \|\| CurChar == `'.'` \|\|
309	CurChar == `'/'` \|\| CurChar == `'\\'` \|\| CurChar == `'-'`;
310	}
311
312	bool Tokenizer::canStartInt() const {
313	assert(!streamEof());
314	return std::isdigit(Data [Pos]);
315	}
316
317	bool Tokenizer::canStartBlockComment() const {
318	assert(!streamEof());
319	return Data.drop_front(N: Pos).starts_with(Prefix: "/*");
320	}
321
322	bool Tokenizer::canStartLineComment() const {
323	assert(!streamEof());
324	return Data.drop_front(N: Pos).starts_with(Prefix: "//");
325	}
326
327	bool Tokenizer::canContinueInt() const {
328	assert(!streamEof());
329	return std::isalnum(Data [Pos]);
330	}
331
332	bool Tokenizer::canStartString() const {
333	return willNowRead(FollowingChars: "\"") \|\| willNowRead(FollowingChars: "L\"") \|\| willNowRead(FollowingChars: "l\"");
334	}
335
336	bool Tokenizer::streamEof() const { return Pos == DataLength; }
337
338	Kind Tokenizer::classifyCurrentToken() const {
339	if (canStartBlockComment())
340	return Kind::StartComment;
341	if (canStartLineComment())
342	return Kind::LineComment;
343
344	if (canStartInt())
345	return Kind::Int;
346	if (canStartString())
347	return Kind::String;
348	// BEGIN and END are at this point of lexing recognized as identifiers.
349	if (canStartIdentifier())
350	return Kind::Identifier;
351
352	const char CurChar = Data [Pos];
353
354	switch (CurChar) {
355	// One-character token classification.
356	#define TOKEN(Name)
357	#define SHORT_TOKEN(Name, Ch) \
358	case Ch: \
359	return Kind::Name;
360	#include "ResourceScriptTokenList.def"
361
362	default:
363	return Kind::Invalid;
364	}
365	}
366
367	void Tokenizer::processIdentifier(RCToken &Token) const {
368	assert(Token.kind() == Kind::Identifier);
369	StringRef Name = Token.value();
370
371	if (Name.equals_insensitive(RHS: "begin"))
372	Token = RCToken (Kind::BlockBegin, Name);
373	else if (Name.equals_insensitive(RHS: "end"))
374	Token = RCToken (Kind::BlockEnd, Name);
375	}
376
377	void Tokenizer::trimIntString(StringRef &Str) const {
378	if (!IsWindres) {
379	// For compatibility with rc.exe, strip leading zeros that make the
380	// integer literal interpreted as octal.
381	//
382	// We do rely on Stringref::getAsInteger for autodetecting between
383	// decimal and hexadecimal literals, but we want to avoid interpreting
384	// literals as octal.
385	//
386	// This omits the leading zeros from the RCToken's value string entirely,
387	// which also has a visible effect when dumping the tokenizer output.
388	// Alternatively, we could store the IsWindres flag in RCToken and defer
389	// the trimming to RCToken::intValue.
390	while (Str.size() >= `2` && Str [`0`] == `'0'` && std::isdigit(Str [`1`]))
391	Str = Str.drop_front(N: `1`);
392	}
393	}
394
395	} // anonymous namespace
396
397	namespace llvm {
398
399	Expected<std::vector<RCToken>> tokenizeRC(StringRef Input, bool IsWindres) {
400	return Tokenizer (Input, IsWindres).run();
401	}
402
403	} // namespace llvm
404

Browse the source code of llvm_projects/llvm/tools/llvm-rc/ResourceScriptToken.cpp