//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//
13 | |
14 | #include "ResourceScriptToken.h" |
15 | #include "llvm/ADT/StringExtras.h" |
16 | #include "llvm/Support/raw_ostream.h" |
17 | |
18 | #include <algorithm> |
19 | #include <cassert> |
20 | #include <cctype> |
21 | #include <cstdlib> |
22 | #include <utility> |
23 | |
24 | using namespace llvm; |
25 | |
26 | using Kind = RCToken::Kind; |
27 | |
28 | // Checks if Representation is a correct description of an RC integer. |
29 | // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), |
30 | // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' |
31 | // character (that is the difference between our representation and |
32 | // StringRef's one). If Representation is correct, 'true' is returned and |
33 | // the return value is put back in Num. |
34 | static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { |
35 | size_t Length = Representation.size(); |
36 | if (Length == 0) |
37 | return false; |
38 | // Strip the last 'L' if unnecessary. |
39 | if (std::toupper(c: Representation.back()) == 'L') |
40 | Representation = Representation.drop_back(N: 1); |
41 | |
42 | return !Representation.getAsInteger<uint32_t>(Radix: 0, Result&: Num); |
43 | } |
44 | |
// Constructs a token of the given kind backed by the slice of input text
// (TokenValue does not own the characters; they live in the input buffer).
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}
47 | |
48 | uint32_t RCToken::intValue() const { |
49 | assert(TokenKind == Kind::Int); |
50 | // We assume that the token already is a correct integer (checked by |
51 | // rcGetAsInteger). |
52 | uint32_t Result; |
53 | bool IsSuccess = rcGetAsInteger(Representation: TokenValue, Num&: Result); |
54 | assert(IsSuccess); |
55 | (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. |
56 | return Result; |
57 | } |
58 | |
59 | bool RCToken::isLongInt() const { |
60 | return TokenKind == Kind::Int && std::toupper(c: TokenValue.back()) == 'L'; |
61 | } |
62 | |
63 | StringRef RCToken::value() const { return TokenValue; } |
64 | |
65 | Kind RCToken::kind() const { return TokenKind; } |
66 | |
67 | bool RCToken::isBinaryOp() const { |
68 | switch (TokenKind) { |
69 | case Kind::Plus: |
70 | case Kind::Minus: |
71 | case Kind::Pipe: |
72 | case Kind::Amp: |
73 | return true; |
74 | default: |
75 | return false; |
76 | } |
77 | } |
78 | |
79 | static Error getStringError(const Twine &message) { |
80 | return make_error<StringError>(Args: "Error parsing file: " + message, |
81 | Args: inconvertibleErrorCode()); |
82 | } |
83 | |
84 | namespace { |
85 | |
// Single-pass tokenizer over an .rc script. Pos is the cursor into Data
// and always points at the next unconsumed character.
class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}

  // Tokenizes the whole input; returns the token sequence or the first
  // lexing error encountered.
  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  // Check if tokenizer can start reading a string literal, optionally
  // prefixed with 'L'/'l'.
  bool canStartString() const;

  // Check if tokenizer can start reading a single line comment (e.g. a comment
  // that begins with '//')
  bool canStartLineComment() const;

  // Check if tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/')
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  // True when the cursor has reached the end of the input.
  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;          // The input being tokenized (not owned).
  size_t DataLength, Pos;  // Input size and current cursor position.
};
147 | |
148 | void Tokenizer::skipCurrentLine() { |
149 | Pos = Data.find_first_of(Chars: "\r\n" , From: Pos); |
150 | Pos = Data.find_first_not_of(Chars: "\r\n" , From: Pos); |
151 | |
152 | if (Pos == StringRef::npos) |
153 | Pos = DataLength; |
154 | } |
155 | |
156 | Expected<std::vector<RCToken>> Tokenizer::run() { |
157 | Pos = 0; |
158 | std::vector<RCToken> Result; |
159 | |
160 | // Consume an optional UTF-8 Byte Order Mark. |
161 | if (willNowRead(FollowingChars: "\xef\xbb\xbf" )) |
162 | advance(Amount: 3); |
163 | |
164 | while (!streamEof()) { |
165 | if (!skipWhitespaces()) |
166 | break; |
167 | |
168 | Kind TokenKind = classifyCurrentToken(); |
169 | if (TokenKind == Kind::Invalid) |
170 | return getStringError(message: "Invalid token found at position " + Twine(Pos)); |
171 | |
172 | const size_t TokenStart = Pos; |
173 | if (Error TokenError = consumeToken(TokenKind)) |
174 | return std::move(TokenError); |
175 | |
176 | // Comments are just deleted, don't bother saving them. |
177 | if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment) |
178 | continue; |
179 | |
180 | RCToken Token(TokenKind, Data.take_front(N: Pos).drop_front(N: TokenStart)); |
181 | if (TokenKind == Kind::Identifier) { |
182 | processIdentifier(token&: Token); |
183 | } else if (TokenKind == Kind::Int) { |
184 | uint32_t TokenInt; |
185 | if (!rcGetAsInteger(Representation: Token.value(), Num&: TokenInt)) { |
186 | // The integer has incorrect format or cannot be represented in |
187 | // a 32-bit integer. |
188 | return getStringError(message: "Integer invalid or too large: " + |
189 | Token.value().str()); |
190 | } |
191 | } |
192 | |
193 | Result.push_back(x: Token); |
194 | } |
195 | |
196 | return Result; |
197 | } |
198 | |
199 | bool Tokenizer::advance(size_t Amount) { |
200 | Pos += Amount; |
201 | return !streamEof(); |
202 | } |
203 | |
204 | bool Tokenizer::skipWhitespaces() { |
205 | while (!streamEof() && isSpace(C: Data[Pos])) |
206 | advance(); |
207 | return !streamEof(); |
208 | } |
209 | |
210 | Error Tokenizer::consumeToken(const Kind TokenKind) { |
211 | switch (TokenKind) { |
212 | // One-character token consumption. |
213 | #define TOKEN(Name) |
214 | #define SHORT_TOKEN(Name, Ch) case Kind::Name: |
215 | #include "ResourceScriptTokenList.def" |
216 | advance(); |
217 | return Error::success(); |
218 | |
219 | case Kind::LineComment: |
220 | advance(Amount: 2); |
221 | skipCurrentLine(); |
222 | return Error::success(); |
223 | |
224 | case Kind::StartComment: { |
225 | advance(Amount: 2); |
226 | auto EndPos = Data.find(Str: "*/" , From: Pos); |
227 | if (EndPos == StringRef::npos) |
228 | return getStringError( |
229 | message: "Unclosed multi-line comment beginning at position " + Twine(Pos)); |
230 | advance(Amount: EndPos - Pos); |
231 | advance(Amount: 2); |
232 | return Error::success(); |
233 | } |
234 | case Kind::Identifier: |
235 | while (!streamEof() && canContinueIdentifier()) |
236 | advance(); |
237 | return Error::success(); |
238 | |
239 | case Kind::Int: |
240 | while (!streamEof() && canContinueInt()) |
241 | advance(); |
242 | return Error::success(); |
243 | |
244 | case Kind::String: |
245 | // Consume the preceding 'L', if there is any. |
246 | if (std::toupper(c: Data[Pos]) == 'L') |
247 | advance(); |
248 | // Consume the double-quote. |
249 | advance(); |
250 | |
251 | // Consume the characters until the end of the file, line or string. |
252 | while (true) { |
253 | if (streamEof()) { |
254 | return getStringError(message: "Unterminated string literal." ); |
255 | } else if (Data[Pos] == '"') { |
256 | // Consume the ending double-quote. |
257 | advance(); |
258 | // However, if another '"' follows this double-quote, the string didn't |
259 | // end and we just included '"' into the string. |
260 | if (!willNowRead(FollowingChars: "\"" )) |
261 | return Error::success(); |
262 | } else if (Data[Pos] == '\n') { |
263 | return getStringError(message: "String literal not terminated in the line." ); |
264 | } |
265 | |
266 | advance(); |
267 | } |
268 | |
269 | case Kind::Invalid: |
270 | assert(false && "Cannot consume an invalid token." ); |
271 | } |
272 | |
273 | llvm_unreachable("Unknown RCToken::Kind" ); |
274 | } |
275 | |
276 | bool Tokenizer::willNowRead(StringRef FollowingChars) const { |
277 | return Data.drop_front(N: Pos).starts_with(Prefix: FollowingChars); |
278 | } |
279 | |
280 | bool Tokenizer::canStartIdentifier() const { |
281 | assert(!streamEof()); |
282 | |
283 | const char CurChar = Data[Pos]; |
284 | return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.'; |
285 | } |
286 | |
287 | bool Tokenizer::canContinueIdentifier() const { |
288 | assert(!streamEof()); |
289 | const char CurChar = Data[Pos]; |
290 | return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' || |
291 | CurChar == '/' || CurChar == '\\' || CurChar == '-'; |
292 | } |
293 | |
294 | bool Tokenizer::canStartInt() const { |
295 | assert(!streamEof()); |
296 | return std::isdigit(Data[Pos]); |
297 | } |
298 | |
299 | bool Tokenizer::() const { |
300 | assert(!streamEof()); |
301 | return Data.drop_front(N: Pos).starts_with(Prefix: "/*" ); |
302 | } |
303 | |
304 | bool Tokenizer::() const { |
305 | assert(!streamEof()); |
306 | return Data.drop_front(N: Pos).starts_with(Prefix: "//" ); |
307 | } |
308 | |
309 | bool Tokenizer::canContinueInt() const { |
310 | assert(!streamEof()); |
311 | return std::isalnum(Data[Pos]); |
312 | } |
313 | |
314 | bool Tokenizer::canStartString() const { |
315 | return willNowRead(FollowingChars: "\"" ) || willNowRead(FollowingChars: "L\"" ) || willNowRead(FollowingChars: "l\"" ); |
316 | } |
317 | |
318 | bool Tokenizer::streamEof() const { return Pos == DataLength; } |
319 | |
// Classifies the token beginning at Pos without consuming it. The order of
// the checks is significant: comments are recognized before anything else,
// and strings (which may start with 'L'/'l') before identifiers.
Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification, driven by the shared token list.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.def"

  default:
    return Kind::Invalid;
  }
}
348 | |
349 | void Tokenizer::processIdentifier(RCToken &Token) const { |
350 | assert(Token.kind() == Kind::Identifier); |
351 | StringRef Name = Token.value(); |
352 | |
353 | if (Name.equals_insensitive(RHS: "begin" )) |
354 | Token = RCToken(Kind::BlockBegin, Name); |
355 | else if (Name.equals_insensitive(RHS: "end" )) |
356 | Token = RCToken(Kind::BlockEnd, Name); |
357 | } |
358 | |
359 | } // anonymous namespace |
360 | |
361 | namespace llvm { |
362 | |
363 | Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) { |
364 | return Tokenizer(Input).run(); |
365 | } |
366 | |
367 | } // namespace llvm |
368 | |