//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but is ambiguous due to the
// lack of a formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when writing this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// from each other in various corner cases. We do not care much about
// efficiency because the time spent parsing linker scripts is usually
// negligible.
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
//
//===----------------------------------------------------------------------===//

#include "ScriptLexer.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>

using namespace llvm;
using namespace lld;
using namespace lld::elf;

// Returns a whole line containing the current token.
StringRef ScriptLexer::getLine() {
  StringRef s = getCurrentMB().getBuffer();
  StringRef tok = tokens[pos - 1];

  size_t pos = s.rfind('\n', tok.data() - s.data());
  if (pos != StringRef::npos)
    s = s.substr(pos + 1);
  return s.substr(0, s.find_first_of("\r\n"));
}
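
// For example, given the hypothetical input line ". = 0x1000;" with "0x1000"
// as the current token, getLine() would return ". = 0x1000;" (the line's
// contents without the trailing line break).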

// Returns 1-based line number of the current token.
size_t ScriptLexer::getLineNumber() {
  if (pos == 0)
    return 1;
  StringRef s = getCurrentMB().getBuffer();
  StringRef tok = tokens[pos - 1];
  const size_t tokOffset = tok.data() - s.data();

  // For the first token, or when going backwards, start from the beginning of
  // the buffer. If this token is after the previous token, start from the
  // previous token.
  size_t line = 1;
  size_t start = 0;
  if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) {
    start = lastLineNumberOffset;
    line = lastLineNumber;
  }

  line += s.substr(start, tokOffset - start).count('\n');

  // Store the line number of this token for reuse.
  lastLineNumberOffset = tokOffset;
  lastLineNumber = line;

  return line;
}

// Returns 0-based column number of the current token.
size_t ScriptLexer::getColumnNumber() {
  StringRef tok = tokens[pos - 1];
  return tok.data() - getLine().data();
}
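
// Because getLine() returns a slice of the same underlying buffer that the
// current token points into, the pointer difference in getColumnNumber() is
// the token's byte offset within its line. For the hypothetical line
// ". = 0x1000;" with "0x1000" as the current token, it would return 4.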

std::string ScriptLexer::getCurrentLocation() {
  std::string filename = std::string(getCurrentMB().getBufferIdentifier());
  return (filename + ":" + Twine(getLineNumber())).str();
}

ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }

// We don't want to record cascading errors. Keep only the first one.
void ScriptLexer::setError(const Twine &msg) {
  if (errorCount())
    return;

  std::string s = (getCurrentLocation() + ": " + msg).str();
  if (pos)
    s += "\n>>> " + getLine().str() + "\n>>> " +
         std::string(getColumnNumber(), ' ') + "^";
  error(s);
}
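
// As a sketch of setError's output format: a message reported while "0x" is
// the current token on line 3 of a file named "script.t" would be rendered
// roughly as below (the text "malformed number" is a hypothetical message
// passed in by a caller, not something produced in this file):
//
//   script.t:3: malformed number: 0x
//   >>> . = 0x;
//   >>>     ^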

// Split S into linker script tokens.
void ScriptLexer::tokenize(MemoryBufferRef mb) {
  std::vector<StringRef> vec;
  mbs.push_back(mb);
  StringRef s = mb.getBuffer();
  StringRef begin = s;

  for (;;) {
    s = skipSpace(s);
    if (s.empty())
      break;

    // Quoted token. Note that double-quote characters are parts of a token
    // because, in a glob match context, only unquoted tokens are interpreted
    // as glob patterns. Double-quoted tokens are literal patterns in that
    // context.
    if (s.starts_with("\"")) {
      size_t e = s.find("\"", 1);
      if (e == StringRef::npos) {
        StringRef filename = mb.getBufferIdentifier();
        size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
        error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
        return;
      }

      vec.push_back(s.take_front(e + 1));
      s = s.substr(e + 1);
      continue;
    }

    // Some operators form separate tokens.
    if (s.starts_with("<<=") || s.starts_with(">>=")) {
      vec.push_back(s.substr(0, 3));
      s = s.substr(3);
      continue;
    }
    if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
                         (s[0] == s[1] && strchr("<>&|", s[0])))) {
      vec.push_back(s.substr(0, 2));
      s = s.substr(2);
      continue;
    }

    // Unquoted token. This is more relaxed than tokens in C-like languages,
    // so that you can write "file-name.cpp" as one bare token, for example.
    size_t pos = s.find_first_not_of(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        "0123456789_.$/\\~=+[]*?-!^:");

    // A character that cannot start a word (which is usually a
    // punctuation) forms a single character token.
    if (pos == 0)
      pos = 1;
    vec.push_back(s.substr(0, pos));
    s = s.substr(pos);
  }

  tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
}
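
// To illustrate the tokenization rules above on a made-up script fragment:
// the input "SECTIONS { .text : { *(.text) } }" would be split into the
// tokens "SECTIONS", "{", ".text", ":", "{", "*", "(", ".text", ")", "}" and
// "}", because "(" and ")" are not word characters and therefore form
// single-character tokens. A quoted string such as "\"foo bar\"" would stay
// a single token, including the quotes.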

// Skip leading whitespace characters or comments.
StringRef ScriptLexer::skipSpace(StringRef s) {
  for (;;) {
    if (s.starts_with("/*")) {
      size_t e = s.find("*/", 2);
      if (e == StringRef::npos) {
        setError("unclosed comment in a linker script");
        return "";
      }
      s = s.substr(e + 2);
      continue;
    }
    if (s.starts_with("#")) {
      size_t e = s.find('\n', 1);
      if (e == StringRef::npos)
        e = s.size() - 1;
      s = s.substr(e + 1);
      continue;
    }
    size_t size = s.size();
    s = s.ltrim();
    if (s.size() == size)
      return s;
  }
}

// An erroneous token is handled as if it were the last token before EOF.
bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }

// Split a given string as an expression.
// This function returns "3", "*" and "5" for "3*5", for example.
static std::vector<StringRef> tokenizeExpr(StringRef s) {
  StringRef ops = "!~*/+-<>?^:="; // List of operators

  // Quoted strings are literal strings, so we don't want to split them.
  if (s.starts_with("\""))
    return {s};

  // Split S with operators as separators.
  std::vector<StringRef> ret;
  while (!s.empty()) {
    size_t e = s.find_first_of(ops);

    // No need to split if there is no operator.
    if (e == StringRef::npos) {
      ret.push_back(s);
      break;
    }

    // Get a token before the operator.
    if (e != 0)
      ret.push_back(s.substr(0, e));

    // Get the operator as a token.
    // Keep !=, ==, >=, <=, << and >> operators as single tokens.
    if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
        s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
        s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
      ret.push_back(s.substr(e, 2));
      s = s.substr(e + 2);
    } else {
      ret.push_back(s.substr(e, 1));
      s = s.substr(e + 1);
    }
  }
  return ret;
}
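
// A couple more illustrations of tokenizeExpr on hypothetical inputs:
// "a<=b" would be split into "a", "<=" and "b", while "2*(x+1)" would be
// split into "2", "*", "(x", "+" and "1)" because parentheses are not in
// the operator list above.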

// In contexts where expressions are expected, the lexer should apply
// different tokenization rules than the default ones. By default,
// arithmetic operator characters are regular characters, but in the
// expression context, they should be independent tokens.
//
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
// in the expression context.
//
// This function may split the current token into multiple tokens.
void ScriptLexer::maybeSplitExpr() {
  if (!inExpr || errorCount() || atEOF())
    return;

  std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
  if (v.size() == 1)
    return;
  tokens.erase(tokens.begin() + pos);
  tokens.insert(tokens.begin() + pos, v.begin(), v.end());
}

StringRef ScriptLexer::next() {
  maybeSplitExpr();

  if (errorCount())
    return "";
  if (atEOF()) {
    setError("unexpected EOF");
    return "";
  }
  return tokens[pos++];
}

StringRef ScriptLexer::peek() {
  StringRef tok = next();
  if (errorCount())
    return "";
  pos = pos - 1;
  return tok;
}

bool ScriptLexer::consume(StringRef tok) {
  if (next() == tok)
    return true;
  --pos;
  return false;
}

// Consumes Tok followed by ":". Space is allowed between Tok and ":".
bool ScriptLexer::consumeLabel(StringRef tok) {
  if (consume((tok + ":").str()))
    return true;
  if (tokens.size() >= pos + 2 && tokens[pos] == tok &&
      tokens[pos + 1] == ":") {
    pos += 2;
    return true;
  }
  return false;
}
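
// For example, consumeLabel("foo") would accept either the single token
// "foo:" or the two-token sequence "foo" ":" (which appears when whitespace
// separates the name from the colon), consuming the token(s) on success.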

void ScriptLexer::skip() { (void)next(); }

void ScriptLexer::expect(StringRef expect) {
  if (errorCount())
    return;
  StringRef tok = next();
  if (tok != expect)
    setError(expect + " expected, but got " + tok);
}
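
// A typical calling pattern in a parser built on top of this lexer (a sketch,
// not code from this file; parseSomething is a hypothetical helper):
//
//   if (consume("(")) {   // optionally enter a parenthesized form
//     parseSomething();   // handle whatever appears inside the parentheses
//     expect(")");        // report an error unless ")" is the next token
//   }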

// Returns true if S encloses T.
static bool encloses(StringRef s, StringRef t) {
  return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end();
}

MemoryBufferRef ScriptLexer::getCurrentMB() {
  // Find input buffer containing the current token.
  assert(!mbs.empty());
  if (pos == 0)
    return mbs.back();
  for (MemoryBufferRef mb : mbs)
    if (encloses(mb.getBuffer(), tokens[pos - 1]))
      return mb;
  llvm_unreachable("getCurrentMB: failed to find a token");
}