1//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===---------------------------------------------------------------------===//
8//
9// This file implements an interface defined in ResourceScriptToken.h.
10// In particular, it defines an .rc script tokenizer.
11//
12//===---------------------------------------------------------------------===//
13
14#include "ResourceScriptToken.h"
15#include "llvm/ADT/StringExtras.h"
16#include "llvm/Support/raw_ostream.h"
17
18#include <algorithm>
19#include <cassert>
20#include <cctype>
21#include <cstdlib>
22#include <utility>
23
24using namespace llvm;
25
26using Kind = RCToken::Kind;
27
28// Checks if Representation is a correct description of an RC integer.
29// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
30// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
31// character (that is the difference between our representation and
32// StringRef's one). If Representation is correct, 'true' is returned and
33// the return value is put back in Num.
34static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
35 size_t Length = Representation.size();
36 if (Length == 0)
37 return false;
38 // Strip the last 'L' if unnecessary.
39 if (std::toupper(c: Representation.back()) == 'L')
40 Representation = Representation.drop_back(N: 1);
41
42 return !Representation.getAsInteger<uint32_t>(Radix: 0, Result&: Num);
43}
44
45RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
46 : TokenKind(RCTokenKind), TokenValue(Value) {}
47
48uint32_t RCToken::intValue() const {
49 assert(TokenKind == Kind::Int);
50 // We assume that the token already is a correct integer (checked by
51 // rcGetAsInteger).
52 uint32_t Result;
53 bool IsSuccess = rcGetAsInteger(Representation: TokenValue, Num&: Result);
54 assert(IsSuccess);
55 (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
56 return Result;
57}
58
59bool RCToken::isLongInt() const {
60 return TokenKind == Kind::Int && std::toupper(c: TokenValue.back()) == 'L';
61}
62
63StringRef RCToken::value() const { return TokenValue; }
64
65Kind RCToken::kind() const { return TokenKind; }
66
67bool RCToken::isBinaryOp() const {
68 switch (TokenKind) {
69 case Kind::Plus:
70 case Kind::Minus:
71 case Kind::Pipe:
72 case Kind::Amp:
73 return true;
74 default:
75 return false;
76 }
77}
78
79static Error getStringError(const Twine &message) {
80 return make_error<StringError>(Args: "Error parsing file: " + message,
81 Args: inconvertibleErrorCode());
82}
83
84namespace {
85
86class Tokenizer {
87public:
88 Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}
89
90 Expected<std::vector<RCToken>> run();
91
92private:
93 // All 'advancing' methods return boolean values; if they're equal to false,
94 // the stream has ended or failed.
95 bool advance(size_t Amount = 1);
96 bool skipWhitespaces();
97
98 // Consumes a token. If any problem occurred, a non-empty Error is returned.
99 Error consumeToken(const Kind TokenKind);
100
101 // Check if tokenizer is about to read FollowingChars.
102 bool willNowRead(StringRef FollowingChars) const;
103
104 // Check if tokenizer can start reading an identifier at current position.
105 // The original tool did non specify the rules to determine what is a correct
106 // identifier. We assume they should follow the C convention:
107 // [a-zA-Z_][a-zA-Z0-9_]*.
108 bool canStartIdentifier() const;
109 // Check if tokenizer can continue reading an identifier.
110 bool canContinueIdentifier() const;
111
112 // Check if tokenizer can start reading an integer.
113 // A correct integer always starts with a 0-9 digit,
114 // can contain characters 0-9A-Fa-f (digits),
115 // Ll (marking the integer is 32-bit), Xx (marking the representation
116 // is hexadecimal). As some kind of separator should come after the
117 // integer, we can consume the integer until a non-alphanumeric
118 // character.
119 bool canStartInt() const;
120 bool canContinueInt() const;
121
122 bool canStartString() const;
123
124 // Check if tokenizer can start reading a single line comment (e.g. a comment
125 // that begins with '//')
126 bool canStartLineComment() const;
127
128 // Check if tokenizer can start or finish reading a block comment (e.g. a
129 // comment that begins with '/*' and ends with '*/')
130 bool canStartBlockComment() const;
131
132 // Throw away all remaining characters on the current line.
133 void skipCurrentLine();
134
135 bool streamEof() const;
136
137 // Classify the token that is about to be read from the current position.
138 Kind classifyCurrentToken() const;
139
140 // Process the Kind::Identifier token - check if it is
141 // an identifier describing a block start or end.
142 void processIdentifier(RCToken &token) const;
143
144 StringRef Data;
145 size_t DataLength, Pos;
146};
147
148void Tokenizer::skipCurrentLine() {
149 Pos = Data.find_first_of(Chars: "\r\n", From: Pos);
150 Pos = Data.find_first_not_of(Chars: "\r\n", From: Pos);
151
152 if (Pos == StringRef::npos)
153 Pos = DataLength;
154}
155
156Expected<std::vector<RCToken>> Tokenizer::run() {
157 Pos = 0;
158 std::vector<RCToken> Result;
159
160 // Consume an optional UTF-8 Byte Order Mark.
161 if (willNowRead(FollowingChars: "\xef\xbb\xbf"))
162 advance(Amount: 3);
163
164 while (!streamEof()) {
165 if (!skipWhitespaces())
166 break;
167
168 Kind TokenKind = classifyCurrentToken();
169 if (TokenKind == Kind::Invalid)
170 return getStringError(message: "Invalid token found at position " + Twine(Pos));
171
172 const size_t TokenStart = Pos;
173 if (Error TokenError = consumeToken(TokenKind))
174 return std::move(TokenError);
175
176 // Comments are just deleted, don't bother saving them.
177 if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
178 continue;
179
180 RCToken Token(TokenKind, Data.take_front(N: Pos).drop_front(N: TokenStart));
181 if (TokenKind == Kind::Identifier) {
182 processIdentifier(token&: Token);
183 } else if (TokenKind == Kind::Int) {
184 uint32_t TokenInt;
185 if (!rcGetAsInteger(Representation: Token.value(), Num&: TokenInt)) {
186 // The integer has incorrect format or cannot be represented in
187 // a 32-bit integer.
188 return getStringError(message: "Integer invalid or too large: " +
189 Token.value().str());
190 }
191 }
192
193 Result.push_back(x: Token);
194 }
195
196 return Result;
197}
198
199bool Tokenizer::advance(size_t Amount) {
200 Pos += Amount;
201 return !streamEof();
202}
203
204bool Tokenizer::skipWhitespaces() {
205 while (!streamEof() && isSpace(C: Data[Pos]))
206 advance();
207 return !streamEof();
208}
209
210Error Tokenizer::consumeToken(const Kind TokenKind) {
211 switch (TokenKind) {
212 // One-character token consumption.
213#define TOKEN(Name)
214#define SHORT_TOKEN(Name, Ch) case Kind::Name:
215#include "ResourceScriptTokenList.def"
216 advance();
217 return Error::success();
218
219 case Kind::LineComment:
220 advance(Amount: 2);
221 skipCurrentLine();
222 return Error::success();
223
224 case Kind::StartComment: {
225 advance(Amount: 2);
226 auto EndPos = Data.find(Str: "*/", From: Pos);
227 if (EndPos == StringRef::npos)
228 return getStringError(
229 message: "Unclosed multi-line comment beginning at position " + Twine(Pos));
230 advance(Amount: EndPos - Pos);
231 advance(Amount: 2);
232 return Error::success();
233 }
234 case Kind::Identifier:
235 while (!streamEof() && canContinueIdentifier())
236 advance();
237 return Error::success();
238
239 case Kind::Int:
240 while (!streamEof() && canContinueInt())
241 advance();
242 return Error::success();
243
244 case Kind::String:
245 // Consume the preceding 'L', if there is any.
246 if (std::toupper(c: Data[Pos]) == 'L')
247 advance();
248 // Consume the double-quote.
249 advance();
250
251 // Consume the characters until the end of the file, line or string.
252 while (true) {
253 if (streamEof()) {
254 return getStringError(message: "Unterminated string literal.");
255 } else if (Data[Pos] == '"') {
256 // Consume the ending double-quote.
257 advance();
258 // However, if another '"' follows this double-quote, the string didn't
259 // end and we just included '"' into the string.
260 if (!willNowRead(FollowingChars: "\""))
261 return Error::success();
262 } else if (Data[Pos] == '\n') {
263 return getStringError(message: "String literal not terminated in the line.");
264 }
265
266 advance();
267 }
268
269 case Kind::Invalid:
270 assert(false && "Cannot consume an invalid token.");
271 }
272
273 llvm_unreachable("Unknown RCToken::Kind");
274}
275
276bool Tokenizer::willNowRead(StringRef FollowingChars) const {
277 return Data.drop_front(N: Pos).starts_with(Prefix: FollowingChars);
278}
279
280bool Tokenizer::canStartIdentifier() const {
281 assert(!streamEof());
282
283 const char CurChar = Data[Pos];
284 return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
285}
286
287bool Tokenizer::canContinueIdentifier() const {
288 assert(!streamEof());
289 const char CurChar = Data[Pos];
290 return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
291 CurChar == '/' || CurChar == '\\' || CurChar == '-';
292}
293
294bool Tokenizer::canStartInt() const {
295 assert(!streamEof());
296 return std::isdigit(Data[Pos]);
297}
298
299bool Tokenizer::canStartBlockComment() const {
300 assert(!streamEof());
301 return Data.drop_front(N: Pos).starts_with(Prefix: "/*");
302}
303
304bool Tokenizer::canStartLineComment() const {
305 assert(!streamEof());
306 return Data.drop_front(N: Pos).starts_with(Prefix: "//");
307}
308
309bool Tokenizer::canContinueInt() const {
310 assert(!streamEof());
311 return std::isalnum(Data[Pos]);
312}
313
314bool Tokenizer::canStartString() const {
315 return willNowRead(FollowingChars: "\"") || willNowRead(FollowingChars: "L\"") || willNowRead(FollowingChars: "l\"");
316}
317
318bool Tokenizer::streamEof() const { return Pos == DataLength; }
319
320Kind Tokenizer::classifyCurrentToken() const {
321 if (canStartBlockComment())
322 return Kind::StartComment;
323 if (canStartLineComment())
324 return Kind::LineComment;
325
326 if (canStartInt())
327 return Kind::Int;
328 if (canStartString())
329 return Kind::String;
330 // BEGIN and END are at this point of lexing recognized as identifiers.
331 if (canStartIdentifier())
332 return Kind::Identifier;
333
334 const char CurChar = Data[Pos];
335
336 switch (CurChar) {
337 // One-character token classification.
338#define TOKEN(Name)
339#define SHORT_TOKEN(Name, Ch) \
340 case Ch: \
341 return Kind::Name;
342#include "ResourceScriptTokenList.def"
343
344 default:
345 return Kind::Invalid;
346 }
347}
348
349void Tokenizer::processIdentifier(RCToken &Token) const {
350 assert(Token.kind() == Kind::Identifier);
351 StringRef Name = Token.value();
352
353 if (Name.equals_insensitive(RHS: "begin"))
354 Token = RCToken(Kind::BlockBegin, Name);
355 else if (Name.equals_insensitive(RHS: "end"))
356 Token = RCToken(Kind::BlockEnd, Name);
357}
358
359} // anonymous namespace
360
361namespace llvm {
362
363Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
364 return Tokenizer(Input).run();
365}
366
367} // namespace llvm
368