//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//
13 | |
14 | #include "ResourceScriptToken.h" |
15 | #include "llvm/ADT/StringExtras.h" |
16 | #include "llvm/Support/raw_ostream.h" |
17 | |
18 | #include <algorithm> |
19 | #include <cassert> |
20 | #include <cctype> |
21 | #include <cstdlib> |
22 | #include <utility> |
23 | |
24 | using namespace llvm; |
25 | |
26 | using Kind = RCToken::Kind; |
27 | |
28 | // Checks if Representation is a correct description of an RC integer. |
29 | // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), |
30 | // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' |
31 | // character (that is the difference between our representation and |
32 | // StringRef's one). If Representation is correct, 'true' is returned and |
33 | // the return value is put back in Num. |
34 | static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { |
35 | size_t Length = Representation.size(); |
36 | if (Length == 0) |
37 | return false; |
38 | // Strip the last 'L' if unnecessary. |
39 | if (std::toupper(c: Representation.back()) == 'L') |
40 | Representation = Representation.drop_back(N: 1); |
41 | |
42 | return !Representation.getAsInteger<uint32_t>(Radix: 0, Result&: Num); |
43 | } |
44 | |
// Construct a token of kind RCTokenKind whose text Value is a non-owning
// view into the original input buffer.
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}
47 | |
48 | uint32_t RCToken::intValue() const { |
49 | assert(TokenKind == Kind::Int); |
50 | // We assume that the token already is a correct integer (checked by |
51 | // rcGetAsInteger). |
52 | uint32_t Result; |
53 | bool IsSuccess = rcGetAsInteger(Representation: TokenValue, Num&: Result); |
54 | assert(IsSuccess); |
55 | (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. |
56 | return Result; |
57 | } |
58 | |
59 | bool RCToken::isLongInt() const { |
60 | return TokenKind == Kind::Int && std::toupper(c: TokenValue.back()) == 'L'; |
61 | } |
62 | |
63 | StringRef RCToken::value() const { return TokenValue; } |
64 | |
65 | Kind RCToken::kind() const { return TokenKind; } |
66 | |
67 | bool RCToken::isLowPrecedenceBinaryOp() const { |
68 | switch (TokenKind) { |
69 | case Kind::Plus: |
70 | case Kind::Minus: |
71 | case Kind::Pipe: |
72 | case Kind::Amp: |
73 | return true; |
74 | default: |
75 | return false; |
76 | } |
77 | } |
78 | |
79 | bool RCToken::isHighPrecedenceBinaryOp() const { |
80 | switch (TokenKind) { |
81 | case Kind::Asterisk: |
82 | case Kind::Slash: |
83 | return true; |
84 | default: |
85 | return false; |
86 | } |
87 | } |
88 | |
89 | static Error getStringError(const Twine &message) { |
90 | return make_error<StringError>(Args: "Error parsing file: " + message, |
91 | Args: inconvertibleErrorCode()); |
92 | } |
93 | |
94 | namespace { |
95 | |
// Hand-written lexer for .rc scripts: holds the whole input buffer and a
// cursor (Pos); run() drives tokenization from start to end.
class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}

  // Tokenize the whole input; returns the token stream or a parse error.
  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  // Check if tokenizer can start reading a single line comment (e.g. a comment
  // that begins with '//')
  bool canStartLineComment() const;

  // Check if tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/')
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  // True when the cursor has reached the end of the input.
  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;         // The whole input buffer (not owned).
  size_t DataLength, Pos; // Input size and current cursor position.
};
157 | |
158 | void Tokenizer::skipCurrentLine() { |
159 | Pos = Data.find_first_of(Chars: "\r\n" , From: Pos); |
160 | Pos = Data.find_first_not_of(Chars: "\r\n" , From: Pos); |
161 | |
162 | if (Pos == StringRef::npos) |
163 | Pos = DataLength; |
164 | } |
165 | |
166 | Expected<std::vector<RCToken>> Tokenizer::run() { |
167 | Pos = 0; |
168 | std::vector<RCToken> Result; |
169 | |
170 | // Consume an optional UTF-8 Byte Order Mark. |
171 | if (willNowRead(FollowingChars: "\xef\xbb\xbf" )) |
172 | advance(Amount: 3); |
173 | |
174 | while (!streamEof()) { |
175 | if (!skipWhitespaces()) |
176 | break; |
177 | |
178 | Kind TokenKind = classifyCurrentToken(); |
179 | if (TokenKind == Kind::Invalid) |
180 | return getStringError(message: "Invalid token found at position " + Twine(Pos)); |
181 | |
182 | const size_t TokenStart = Pos; |
183 | if (Error TokenError = consumeToken(TokenKind)) |
184 | return std::move(TokenError); |
185 | |
186 | // Comments are just deleted, don't bother saving them. |
187 | if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment) |
188 | continue; |
189 | |
190 | RCToken Token(TokenKind, Data.take_front(N: Pos).drop_front(N: TokenStart)); |
191 | if (TokenKind == Kind::Identifier) { |
192 | processIdentifier(token&: Token); |
193 | } else if (TokenKind == Kind::Int) { |
194 | uint32_t TokenInt; |
195 | if (!rcGetAsInteger(Representation: Token.value(), Num&: TokenInt)) { |
196 | // The integer has incorrect format or cannot be represented in |
197 | // a 32-bit integer. |
198 | return getStringError(message: "Integer invalid or too large: " + |
199 | Token.value().str()); |
200 | } |
201 | } |
202 | |
203 | Result.push_back(x: Token); |
204 | } |
205 | |
206 | return Result; |
207 | } |
208 | |
209 | bool Tokenizer::advance(size_t Amount) { |
210 | Pos += Amount; |
211 | return !streamEof(); |
212 | } |
213 | |
214 | bool Tokenizer::skipWhitespaces() { |
215 | while (!streamEof() && isSpace(C: Data[Pos])) |
216 | advance(); |
217 | return !streamEof(); |
218 | } |
219 | |
220 | Error Tokenizer::consumeToken(const Kind TokenKind) { |
221 | switch (TokenKind) { |
222 | // One-character token consumption. |
223 | #define TOKEN(Name) |
224 | #define SHORT_TOKEN(Name, Ch) case Kind::Name: |
225 | #include "ResourceScriptTokenList.def" |
226 | advance(); |
227 | return Error::success(); |
228 | |
229 | case Kind::LineComment: |
230 | advance(Amount: 2); |
231 | skipCurrentLine(); |
232 | return Error::success(); |
233 | |
234 | case Kind::StartComment: { |
235 | advance(Amount: 2); |
236 | auto EndPos = Data.find(Str: "*/" , From: Pos); |
237 | if (EndPos == StringRef::npos) |
238 | return getStringError( |
239 | message: "Unclosed multi-line comment beginning at position " + Twine(Pos)); |
240 | advance(Amount: EndPos - Pos); |
241 | advance(Amount: 2); |
242 | return Error::success(); |
243 | } |
244 | case Kind::Identifier: |
245 | while (!streamEof() && canContinueIdentifier()) |
246 | advance(); |
247 | return Error::success(); |
248 | |
249 | case Kind::Int: |
250 | while (!streamEof() && canContinueInt()) |
251 | advance(); |
252 | return Error::success(); |
253 | |
254 | case Kind::String: |
255 | // Consume the preceding 'L', if there is any. |
256 | if (std::toupper(c: Data[Pos]) == 'L') |
257 | advance(); |
258 | // Consume the double-quote. |
259 | advance(); |
260 | |
261 | // Consume the characters until the end of the file, line or string. |
262 | while (true) { |
263 | if (streamEof()) { |
264 | return getStringError(message: "Unterminated string literal." ); |
265 | } else if (Data[Pos] == '"') { |
266 | // Consume the ending double-quote. |
267 | advance(); |
268 | // However, if another '"' follows this double-quote, the string didn't |
269 | // end and we just included '"' into the string. |
270 | if (!willNowRead(FollowingChars: "\"" )) |
271 | return Error::success(); |
272 | } else if (Data[Pos] == '\n') { |
273 | return getStringError(message: "String literal not terminated in the line." ); |
274 | } |
275 | |
276 | advance(); |
277 | } |
278 | |
279 | case Kind::Invalid: |
280 | assert(false && "Cannot consume an invalid token." ); |
281 | } |
282 | |
283 | llvm_unreachable("Unknown RCToken::Kind" ); |
284 | } |
285 | |
286 | bool Tokenizer::willNowRead(StringRef FollowingChars) const { |
287 | return Data.drop_front(N: Pos).starts_with(Prefix: FollowingChars); |
288 | } |
289 | |
290 | bool Tokenizer::canStartIdentifier() const { |
291 | assert(!streamEof()); |
292 | |
293 | const char CurChar = Data[Pos]; |
294 | return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.'; |
295 | } |
296 | |
297 | bool Tokenizer::canContinueIdentifier() const { |
298 | assert(!streamEof()); |
299 | const char CurChar = Data[Pos]; |
300 | return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' || |
301 | CurChar == '/' || CurChar == '\\' || CurChar == '-'; |
302 | } |
303 | |
304 | bool Tokenizer::canStartInt() const { |
305 | assert(!streamEof()); |
306 | return std::isdigit(Data[Pos]); |
307 | } |
308 | |
309 | bool Tokenizer::() const { |
310 | assert(!streamEof()); |
311 | return Data.drop_front(N: Pos).starts_with(Prefix: "/*" ); |
312 | } |
313 | |
314 | bool Tokenizer::() const { |
315 | assert(!streamEof()); |
316 | return Data.drop_front(N: Pos).starts_with(Prefix: "//" ); |
317 | } |
318 | |
319 | bool Tokenizer::canContinueInt() const { |
320 | assert(!streamEof()); |
321 | return std::isalnum(Data[Pos]); |
322 | } |
323 | |
324 | bool Tokenizer::canStartString() const { |
325 | return willNowRead(FollowingChars: "\"" ) || willNowRead(FollowingChars: "L\"" ) || willNowRead(FollowingChars: "l\"" ); |
326 | } |
327 | |
328 | bool Tokenizer::streamEof() const { return Pos == DataLength; } |
329 | |
330 | Kind Tokenizer::classifyCurrentToken() const { |
331 | if (canStartBlockComment()) |
332 | return Kind::StartComment; |
333 | if (canStartLineComment()) |
334 | return Kind::LineComment; |
335 | |
336 | if (canStartInt()) |
337 | return Kind::Int; |
338 | if (canStartString()) |
339 | return Kind::String; |
340 | // BEGIN and END are at this point of lexing recognized as identifiers. |
341 | if (canStartIdentifier()) |
342 | return Kind::Identifier; |
343 | |
344 | const char CurChar = Data[Pos]; |
345 | |
346 | switch (CurChar) { |
347 | // One-character token classification. |
348 | #define TOKEN(Name) |
349 | #define SHORT_TOKEN(Name, Ch) \ |
350 | case Ch: \ |
351 | return Kind::Name; |
352 | #include "ResourceScriptTokenList.def" |
353 | |
354 | default: |
355 | return Kind::Invalid; |
356 | } |
357 | } |
358 | |
359 | void Tokenizer::processIdentifier(RCToken &Token) const { |
360 | assert(Token.kind() == Kind::Identifier); |
361 | StringRef Name = Token.value(); |
362 | |
363 | if (Name.equals_insensitive(RHS: "begin" )) |
364 | Token = RCToken(Kind::BlockBegin, Name); |
365 | else if (Name.equals_insensitive(RHS: "end" )) |
366 | Token = RCToken(Kind::BlockEnd, Name); |
367 | } |
368 | |
369 | } // anonymous namespace |
370 | |
371 | namespace llvm { |
372 | |
373 | Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) { |
374 | return Tokenizer(Input).run(); |
375 | } |
376 | |
377 | } // namespace llvm |
378 | |