1//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===---------------------------------------------------------------------===//
8//
9// This file implements an interface defined in ResourceScriptToken.h.
10// In particular, it defines an .rc script tokenizer.
11//
12//===---------------------------------------------------------------------===//
13
14#include "ResourceScriptToken.h"
15#include "llvm/ADT/StringExtras.h"
16#include "llvm/Support/raw_ostream.h"
17
18#include <algorithm>
19#include <cassert>
20#include <cctype>
21#include <cstdlib>
22#include <utility>
23
24using namespace llvm;
25
26using Kind = RCToken::Kind;
27
28// Checks if Representation is a correct description of an RC integer.
29// It should be a 32-bit unsigned integer, either decimal or hexadecimal
30// (0x[0-9a-f]+). For Windres mode, it can also be octal (0[0-7]+).
31// It might be followed by a single 'L' character (that is the difference
32// between our representation and StringRef's one). If Representation is
33// correct, 'true' is returned and the return value is put back in Num.
34static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
35 size_t Length = Representation.size();
36 if (Length == 0)
37 return false;
38 // Strip the last 'L' if unnecessary.
39 if (std::toupper(c: Representation.back()) == 'L')
40 Representation = Representation.drop_back(N: 1);
41
42 return !Representation.getAsInteger<uint32_t>(Radix: 0, Result&: Num);
43}
44
45RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
46 : TokenKind(RCTokenKind), TokenValue(Value) {}
47
48uint32_t RCToken::intValue() const {
49 assert(TokenKind == Kind::Int);
50 // We assume that the token already is a correct integer (checked by
51 // rcGetAsInteger).
52 uint32_t Result;
53 bool IsSuccess = rcGetAsInteger(Representation: TokenValue, Num&: Result);
54 assert(IsSuccess);
55 (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
56 return Result;
57}
58
59bool RCToken::isLongInt() const {
60 return TokenKind == Kind::Int && std::toupper(c: TokenValue.back()) == 'L';
61}
62
63StringRef RCToken::value() const { return TokenValue; }
64
65Kind RCToken::kind() const { return TokenKind; }
66
67bool RCToken::isLowPrecedenceBinaryOp() const {
68 switch (TokenKind) {
69 case Kind::Plus:
70 case Kind::Minus:
71 case Kind::Pipe:
72 case Kind::Amp:
73 return true;
74 default:
75 return false;
76 }
77}
78
79bool RCToken::isHighPrecedenceBinaryOp() const {
80 switch (TokenKind) {
81 case Kind::Asterisk:
82 case Kind::Slash:
83 return true;
84 default:
85 return false;
86 }
87}
88
89static Error getStringError(const Twine &message) {
90 return make_error<StringError>(Args: "Error parsing file: " + message,
91 Args: inconvertibleErrorCode());
92}
93
94namespace {
95
96class Tokenizer {
97public:
98 Tokenizer(StringRef Input, bool IsWindres)
99 : Data(Input), DataLength(Input.size()), Pos(0), IsWindres(IsWindres) {}
100
101 Expected<std::vector<RCToken>> run();
102
103private:
104 // All 'advancing' methods return boolean values; if they're equal to false,
105 // the stream has ended or failed.
106 bool advance(size_t Amount = 1);
107 bool skipWhitespaces();
108
109 // Consumes a token. If any problem occurred, a non-empty Error is returned.
110 Error consumeToken(const Kind TokenKind);
111
112 // Check if tokenizer is about to read FollowingChars.
113 bool willNowRead(StringRef FollowingChars) const;
114
115 // Check if tokenizer can start reading an identifier at current position.
116 // The original tool did non specify the rules to determine what is a correct
117 // identifier. We assume they should follow the C convention:
118 // [a-zA-Z_][a-zA-Z0-9_]*.
119 bool canStartIdentifier() const;
120 // Check if tokenizer can continue reading an identifier.
121 bool canContinueIdentifier() const;
122
123 // Check if tokenizer can start reading an integer.
124 // A correct integer always starts with a 0-9 digit,
125 // can contain characters 0-9A-Fa-f (digits),
126 // Ll (marking the integer is 32-bit), Xx (marking the representation
127 // is hexadecimal). As some kind of separator should come after the
128 // integer, we can consume the integer until a non-alphanumeric
129 // character.
130 bool canStartInt() const;
131 bool canContinueInt() const;
132 void trimIntString(StringRef &Str) const;
133
134 bool canStartString() const;
135
136 // Check if tokenizer can start reading a single line comment (e.g. a comment
137 // that begins with '//')
138 bool canStartLineComment() const;
139
140 // Check if tokenizer can start or finish reading a block comment (e.g. a
141 // comment that begins with '/*' and ends with '*/')
142 bool canStartBlockComment() const;
143
144 // Throw away all remaining characters on the current line.
145 void skipCurrentLine();
146
147 bool streamEof() const;
148
149 // Classify the token that is about to be read from the current position.
150 Kind classifyCurrentToken() const;
151
152 // Process the Kind::Identifier token - check if it is
153 // an identifier describing a block start or end.
154 void processIdentifier(RCToken &token) const;
155
156 StringRef Data;
157 size_t DataLength, Pos;
158 bool IsWindres;
159};
160
161void Tokenizer::skipCurrentLine() {
162 Pos = Data.find_first_of(Chars: "\r\n", From: Pos);
163 Pos = Data.find_first_not_of(Chars: "\r\n", From: Pos);
164
165 if (Pos == StringRef::npos)
166 Pos = DataLength;
167}
168
169Expected<std::vector<RCToken>> Tokenizer::run() {
170 Pos = 0;
171 std::vector<RCToken> Result;
172
173 // Consume an optional UTF-8 Byte Order Mark.
174 if (willNowRead(FollowingChars: "\xef\xbb\xbf"))
175 advance(Amount: 3);
176
177 while (!streamEof()) {
178 if (!skipWhitespaces())
179 break;
180
181 Kind TokenKind = classifyCurrentToken();
182 if (TokenKind == Kind::Invalid)
183 return getStringError(message: "Invalid token found at position " + Twine(Pos));
184
185 const size_t TokenStart = Pos;
186 if (Error TokenError = consumeToken(TokenKind))
187 return std::move(TokenError);
188
189 // Comments are just deleted, don't bother saving them.
190 if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
191 continue;
192
193 StringRef Contents = Data.take_front(N: Pos).drop_front(N: TokenStart);
194
195 if (TokenKind == Kind::Int)
196 trimIntString(Str&: Contents);
197
198 RCToken Token(TokenKind, Contents);
199 if (TokenKind == Kind::Identifier) {
200 processIdentifier(token&: Token);
201 } else if (TokenKind == Kind::Int) {
202 uint32_t TokenInt;
203 if (!rcGetAsInteger(Representation: Token.value(), Num&: TokenInt)) {
204 // The integer has incorrect format or cannot be represented in
205 // a 32-bit integer.
206 return getStringError(message: "Integer invalid or too large: " +
207 Token.value().str());
208 }
209 }
210
211 Result.push_back(x: Token);
212 }
213
214 return Result;
215}
216
217bool Tokenizer::advance(size_t Amount) {
218 Pos += Amount;
219 return !streamEof();
220}
221
222bool Tokenizer::skipWhitespaces() {
223 while (!streamEof() && isSpace(C: Data[Pos]))
224 advance();
225 return !streamEof();
226}
227
228Error Tokenizer::consumeToken(const Kind TokenKind) {
229 switch (TokenKind) {
230 // One-character token consumption.
231#define TOKEN(Name)
232#define SHORT_TOKEN(Name, Ch) case Kind::Name:
233#include "ResourceScriptTokenList.def"
234 advance();
235 return Error::success();
236
237 case Kind::LineComment:
238 advance(Amount: 2);
239 skipCurrentLine();
240 return Error::success();
241
242 case Kind::StartComment: {
243 advance(Amount: 2);
244 auto EndPos = Data.find(Str: "*/", From: Pos);
245 if (EndPos == StringRef::npos)
246 return getStringError(
247 message: "Unclosed multi-line comment beginning at position " + Twine(Pos));
248 advance(Amount: EndPos - Pos);
249 advance(Amount: 2);
250 return Error::success();
251 }
252 case Kind::Identifier:
253 while (!streamEof() && canContinueIdentifier())
254 advance();
255 return Error::success();
256
257 case Kind::Int:
258 while (!streamEof() && canContinueInt())
259 advance();
260 return Error::success();
261
262 case Kind::String:
263 // Consume the preceding 'L', if there is any.
264 if (std::toupper(c: Data[Pos]) == 'L')
265 advance();
266 // Consume the double-quote.
267 advance();
268
269 // Consume the characters until the end of the file, line or string.
270 while (true) {
271 if (streamEof()) {
272 return getStringError(message: "Unterminated string literal.");
273 } else if (Data[Pos] == '"') {
274 // Consume the ending double-quote.
275 advance();
276 // However, if another '"' follows this double-quote, the string didn't
277 // end and we just included '"' into the string.
278 if (!willNowRead(FollowingChars: "\""))
279 return Error::success();
280 } else if (Data[Pos] == '\n') {
281 return getStringError(message: "String literal not terminated in the line.");
282 }
283
284 advance();
285 }
286
287 case Kind::Invalid:
288 assert(false && "Cannot consume an invalid token.");
289 }
290
291 llvm_unreachable("Unknown RCToken::Kind");
292}
293
294bool Tokenizer::willNowRead(StringRef FollowingChars) const {
295 return Data.drop_front(N: Pos).starts_with(Prefix: FollowingChars);
296}
297
298bool Tokenizer::canStartIdentifier() const {
299 assert(!streamEof());
300
301 const char CurChar = Data[Pos];
302 return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
303}
304
305bool Tokenizer::canContinueIdentifier() const {
306 assert(!streamEof());
307 const char CurChar = Data[Pos];
308 return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
309 CurChar == '/' || CurChar == '\\' || CurChar == '-';
310}
311
312bool Tokenizer::canStartInt() const {
313 assert(!streamEof());
314 return std::isdigit(Data[Pos]);
315}
316
317bool Tokenizer::canStartBlockComment() const {
318 assert(!streamEof());
319 return Data.drop_front(N: Pos).starts_with(Prefix: "/*");
320}
321
322bool Tokenizer::canStartLineComment() const {
323 assert(!streamEof());
324 return Data.drop_front(N: Pos).starts_with(Prefix: "//");
325}
326
327bool Tokenizer::canContinueInt() const {
328 assert(!streamEof());
329 return std::isalnum(Data[Pos]);
330}
331
332bool Tokenizer::canStartString() const {
333 return willNowRead(FollowingChars: "\"") || willNowRead(FollowingChars: "L\"") || willNowRead(FollowingChars: "l\"");
334}
335
336bool Tokenizer::streamEof() const { return Pos == DataLength; }
337
338Kind Tokenizer::classifyCurrentToken() const {
339 if (canStartBlockComment())
340 return Kind::StartComment;
341 if (canStartLineComment())
342 return Kind::LineComment;
343
344 if (canStartInt())
345 return Kind::Int;
346 if (canStartString())
347 return Kind::String;
348 // BEGIN and END are at this point of lexing recognized as identifiers.
349 if (canStartIdentifier())
350 return Kind::Identifier;
351
352 const char CurChar = Data[Pos];
353
354 switch (CurChar) {
355 // One-character token classification.
356#define TOKEN(Name)
357#define SHORT_TOKEN(Name, Ch) \
358 case Ch: \
359 return Kind::Name;
360#include "ResourceScriptTokenList.def"
361
362 default:
363 return Kind::Invalid;
364 }
365}
366
367void Tokenizer::processIdentifier(RCToken &Token) const {
368 assert(Token.kind() == Kind::Identifier);
369 StringRef Name = Token.value();
370
371 if (Name.equals_insensitive(RHS: "begin"))
372 Token = RCToken(Kind::BlockBegin, Name);
373 else if (Name.equals_insensitive(RHS: "end"))
374 Token = RCToken(Kind::BlockEnd, Name);
375}
376
377void Tokenizer::trimIntString(StringRef &Str) const {
378 if (!IsWindres) {
379 // For compatibility with rc.exe, strip leading zeros that make the
380 // integer literal interpreted as octal.
381 //
382 // We do rely on Stringref::getAsInteger for autodetecting between
383 // decimal and hexadecimal literals, but we want to avoid interpreting
384 // literals as octal.
385 //
386 // This omits the leading zeros from the RCToken's value string entirely,
387 // which also has a visible effect when dumping the tokenizer output.
388 // Alternatively, we could store the IsWindres flag in RCToken and defer
389 // the trimming to RCToken::intValue.
390 while (Str.size() >= 2 && Str[0] == '0' && std::isdigit(Str[1]))
391 Str = Str.drop_front(N: 1);
392 }
393}
394
395} // anonymous namespace
396
397namespace llvm {
398
399Expected<std::vector<RCToken>> tokenizeRC(StringRef Input, bool IsWindres) {
400 return Tokenizer(Input, IsWindres).run();
401}
402
403} // namespace llvm
404