//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//
13 | |
14 | #include "ResourceScriptToken.h" |
15 | #include "llvm/ADT/StringExtras.h" |
16 | #include "llvm/Support/raw_ostream.h" |
17 | |
18 | #include <algorithm> |
19 | #include <cassert> |
20 | #include <cctype> |
21 | #include <cstdlib> |
22 | #include <utility> |
23 | |
24 | using namespace llvm; |
25 | |
26 | using Kind = RCToken::Kind; |
27 | |
28 | // Checks if Representation is a correct description of an RC integer. |
29 | // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), |
30 | // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' |
31 | // character (that is the difference between our representation and |
32 | // StringRef's one). If Representation is correct, 'true' is returned and |
33 | // the return value is put back in Num. |
34 | static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { |
35 | size_t Length = Representation.size(); |
36 | if (Length == 0) |
37 | return false; |
38 | // Strip the last 'L' if unnecessary. |
39 | if (std::toupper(c: Representation.back()) == 'L') |
40 | Representation = Representation.drop_back(N: 1); |
41 | |
42 | return !Representation.getAsInteger<uint32_t>(Radix: 0, Result&: Num); |
43 | } |
44 | |
// Construct a token of kind RCTokenKind whose text Value is a non-owning
// view into the original input buffer.
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}
47 | |
48 | uint32_t RCToken::intValue() const { |
49 | assert(TokenKind == Kind::Int); |
50 | // We assume that the token already is a correct integer (checked by |
51 | // rcGetAsInteger). |
52 | uint32_t Result; |
53 | bool IsSuccess = rcGetAsInteger(Representation: TokenValue, Num&: Result); |
54 | assert(IsSuccess); |
55 | (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. |
56 | return Result; |
57 | } |
58 | |
59 | bool RCToken::isLongInt() const { |
60 | return TokenKind == Kind::Int && std::toupper(c: TokenValue.back()) == 'L'; |
61 | } |
62 | |
63 | StringRef RCToken::value() const { return TokenValue; } |
64 | |
65 | Kind RCToken::kind() const { return TokenKind; } |
66 | |
67 | bool RCToken::isLowPrecedenceBinaryOp() const { |
68 | switch (TokenKind) { |
69 | case Kind::Plus: |
70 | case Kind::Minus: |
71 | case Kind::Pipe: |
72 | case Kind::Amp: |
73 | return true; |
74 | default: |
75 | return false; |
76 | } |
77 | } |
78 | |
79 | bool RCToken::isHighPrecedenceBinaryOp() const { |
80 | switch (TokenKind) { |
81 | case Kind::Asterisk: |
82 | case Kind::Slash: |
83 | return true; |
84 | default: |
85 | return false; |
86 | } |
87 | } |
88 | |
89 | static Error getStringError(const Twine &message) { |
90 | return make_error<StringError>(Args: "Error parsing file: " + message, |
91 | Args: inconvertibleErrorCode()); |
92 | } |
93 | |
94 | namespace { |
95 | |
// Hand-written lexer for .rc scripts: holds the whole input buffer and a
// cursor (Pos); run() drives tokenization from start to end.
class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}

  // Tokenize the whole input; returns the token stream or a parse error.
  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  // Check if tokenizer can start reading a single line comment (e.g. a comment
  // that begins with '//')
  bool canStartLineComment() const;

  // Check if tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/')
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  // True when the cursor has reached the end of the input.
  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;         // The whole input buffer (not owned).
  size_t DataLength, Pos; // Input size and current cursor position.
};
157 | |
158 | void Tokenizer::skipCurrentLine() { |
159 | Pos = Data.find_first_of(Chars: "\r\n" , From: Pos); |
160 | Pos = Data.find_first_not_of(Chars: "\r\n" , From: Pos); |
161 | |
162 | if (Pos == StringRef::npos) |
163 | Pos = DataLength; |
164 | } |
165 | |
166 | Expected<std::vector<RCToken>> Tokenizer::run() { |
167 | Pos = 0; |
168 | std::vector<RCToken> Result; |
169 | |
170 | // Consume an optional UTF-8 Byte Order Mark. |
171 | if (willNowRead(FollowingChars: "\xef\xbb\xbf" )) |
172 | advance(Amount: 3); |
173 | |
174 | while (!streamEof()) { |
175 | if (!skipWhitespaces()) |
176 | break; |
177 | |
178 | Kind TokenKind = classifyCurrentToken(); |
179 | if (TokenKind == Kind::Invalid) |
180 | return getStringError(message: "Invalid token found at position " + Twine(Pos)); |
181 | |
182 | const size_t TokenStart = Pos; |
183 | if (Error TokenError = consumeToken(TokenKind)) |
184 | return std::move(TokenError); |
185 | |
186 | // Comments are just deleted, don't bother saving them. |
187 | if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment) |
188 | continue; |
189 | |
190 | RCToken Token(TokenKind, Data.take_front(N: Pos).drop_front(N: TokenStart)); |
191 | if (TokenKind == Kind::Identifier) { |
192 | processIdentifier(token&: Token); |
193 | } else if (TokenKind == Kind::Int) { |
194 | uint32_t TokenInt; |
195 | if (!rcGetAsInteger(Representation: Token.value(), Num&: TokenInt)) { |
196 | // The integer has incorrect format or cannot be represented in |
197 | // a 32-bit integer. |
198 | return getStringError(message: "Integer invalid or too large: " + |
199 | Token.value().str()); |
200 | } |
201 | } |
202 | |
203 | Result.push_back(x: Token); |
204 | } |
205 | |
206 | return Result; |
207 | } |
208 | |
209 | bool Tokenizer::advance(size_t Amount) { |
210 | Pos += Amount; |
211 | return !streamEof(); |
212 | } |
213 | |
214 | bool Tokenizer::skipWhitespaces() { |
215 | while (!streamEof() && isSpace(C: Data[Pos])) |
216 | advance(); |
217 | return !streamEof(); |
218 | } |
219 | |
220 | Error Tokenizer::consumeToken(const Kind TokenKind) { |
221 | switch (TokenKind) { |
222 | // One-character token consumption. |
223 | #define TOKEN(Name) |
224 | #define SHORT_TOKEN(Name, Ch) case Kind::Name: |
225 | #include "ResourceScriptTokenList.def" |
226 | advance(); |
227 | return Error::success(); |
228 | |
229 | case Kind::LineComment: |
230 | advance(Amount: 2); |
231 | skipCurrentLine(); |
232 | return Error::success(); |
233 | |
234 | case Kind::StartComment: { |
235 | advance(Amount: 2); |
236 | auto EndPos = Data.find(Str: "*/" , From: Pos); |
237 | if (EndPos == StringRef::npos) |
238 | return getStringError( |
239 | message: "Unclosed multi-line comment beginning at position " + Twine(Pos)); |
240 | advance(Amount: EndPos - Pos); |
241 | advance(Amount: 2); |
242 | return Error::success(); |
243 | } |
244 | case Kind::Identifier: |
245 | while (!streamEof() && canContinueIdentifier()) |
246 | advance(); |
247 | return Error::success(); |
248 | |
249 | case Kind::Int: |
250 | while (!streamEof() && canContinueInt()) |
251 | advance(); |
252 | return Error::success(); |
253 | |
254 | case Kind::String: |
255 | // Consume the preceding 'L', if there is any. |
256 | if (std::toupper(c: Data[Pos]) == 'L') |
257 | advance(); |
258 | // Consume the double-quote. |
259 | advance(); |
260 | |
261 | // Consume the characters until the end of the file, line or string. |
262 | while (true) { |
263 | if (streamEof()) { |
264 | return getStringError(message: "Unterminated string literal." ); |
265 | } else if (Data[Pos] == '"') { |
266 | // Consume the ending double-quote. |
267 | advance(); |
268 | // However, if another '"' follows this double-quote, the string didn't |
269 | // end and we just included '"' into the string. |
270 | if (!willNowRead(FollowingChars: "\"" )) |
271 | return Error::success(); |
272 | } else if (Data[Pos] == '\n') { |
273 | return getStringError(message: "String literal not terminated in the line." ); |
274 | } |
275 | |
276 | advance(); |
277 | } |
278 | |
279 | case Kind::Invalid: |
280 | assert(false && "Cannot consume an invalid token." ); |
281 | } |
282 | |
283 | llvm_unreachable("Unknown RCToken::Kind" ); |
284 | } |
285 | |
286 | bool Tokenizer::willNowRead(StringRef FollowingChars) const { |
287 | return Data.drop_front(N: Pos).starts_with(Prefix: FollowingChars); |
288 | } |
289 | |
290 | bool Tokenizer::canStartIdentifier() const { |
291 | assert(!streamEof()); |
292 | |
293 | const char CurChar = Data[Pos]; |
294 | return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.'; |
295 | } |
296 | |
297 | bool Tokenizer::canContinueIdentifier() const { |
298 | assert(!streamEof()); |
299 | const char CurChar = Data[Pos]; |
300 | return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' || |
301 | CurChar == '/' || CurChar == '\\' || CurChar == '-'; |
302 | } |
303 | |
304 | bool Tokenizer::canStartInt() const { |
305 | assert(!streamEof()); |
306 | return std::isdigit(Data[Pos]); |
307 | } |
308 | |
309 | bool Tokenizer::() const { |
310 | assert(!streamEof()); |
311 | return Data.drop_front(N: Pos).starts_with(Prefix: "/*" ); |
312 | } |
313 | |
314 | bool Tokenizer::() const { |
315 | assert(!streamEof()); |
316 | return Data.drop_front(N: Pos).starts_with(Prefix: "//" ); |
317 | } |
318 | |
319 | bool Tokenizer::canContinueInt() const { |
320 | assert(!streamEof()); |
321 | return std::isalnum(Data[Pos]); |
322 | } |
323 | |
324 | bool Tokenizer::canStartString() const { |
325 | return willNowRead(FollowingChars: "\"" ) || willNowRead(FollowingChars: "L\"" ) || willNowRead(FollowingChars: "l\"" ); |
326 | } |
327 | |
328 | bool Tokenizer::streamEof() const { return Pos == DataLength; } |
329 | |
330 | Kind Tokenizer::classifyCurrentToken() const { |
331 | if (canStartBlockComment()) |
332 | return Kind::StartComment; |
333 | if (canStartLineComment()) |
334 | return Kind::LineComment; |
335 | |
336 | if (canStartInt()) |
337 | return Kind::Int; |
338 | if (canStartString()) |
339 | return Kind::String; |
340 | // BEGIN and END are at this point of lexing recognized as identifiers. |
341 | if (canStartIdentifier()) |
342 | return Kind::Identifier; |
343 | |
344 | const char CurChar = Data[Pos]; |
345 | |
346 | switch (CurChar) { |
347 | // One-character token classification. |
348 | #define TOKEN(Name) |
349 | #define SHORT_TOKEN(Name, Ch) \ |
350 | case Ch: \ |
351 | return Kind::Name; |
352 | #include "ResourceScriptTokenList.def" |
353 | |
354 | default: |
355 | return Kind::Invalid; |
356 | } |
357 | } |
358 | |
359 | void Tokenizer::processIdentifier(RCToken &Token) const { |
360 | assert(Token.kind() == Kind::Identifier); |
361 | StringRef Name = Token.value(); |
362 | |
363 | if (Name.equals_insensitive(RHS: "begin" )) |
364 | Token = RCToken(Kind::BlockBegin, Name); |
365 | else if (Name.equals_insensitive(RHS: "end" )) |
366 | Token = RCToken(Kind::BlockEnd, Name); |
367 | } |
368 | |
369 | } // anonymous namespace |
370 | |
371 | namespace llvm { |
372 | |
373 | Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) { |
374 | return Tokenizer(Input).run(); |
375 | } |
376 | |
377 | } // namespace llvm |
378 | |