1 | //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the TokenConcatenation class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "clang/Lex/TokenConcatenation.h" |
14 | #include "clang/Basic/CharInfo.h" |
15 | #include "clang/Lex/Preprocessor.h" |
16 | #include "llvm/Support/ErrorHandling.h" |
17 | using namespace clang; |
18 | |
19 | |
20 | /// IsStringPrefix - Return true if Str is a string prefix. |
21 | /// 'L', 'u', 'U', or 'u8'. Including raw versions. |
22 | static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) { |
23 | |
24 | if (Str[0] == 'L' || |
25 | (CPlusPlus11 && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) { |
26 | |
27 | if (Str.size() == 1) |
28 | return true; // "L", "u", "U", and "R" |
29 | |
30 | // Check for raw flavors. Need to make sure the first character wasn't |
31 | // already R. Need CPlusPlus11 check for "LR". |
32 | if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus11) |
33 | return true; // "LR", "uR", "UR" |
34 | |
35 | // Check for "u8" and "u8R" |
36 | if (Str[0] == 'u' && Str[1] == '8') { |
37 | if (Str.size() == 2) return true; // "u8" |
38 | if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R" |
39 | } |
40 | } |
41 | |
42 | return false; |
43 | } |
44 | |
45 | /// IsIdentifierStringPrefix - Return true if the spelling of the token |
46 | /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions. |
47 | bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const { |
48 | const LangOptions &LangOpts = PP.getLangOpts(); |
49 | |
50 | if (!Tok.needsCleaning()) { |
51 | if (Tok.getLength() < 1 || Tok.getLength() > 3) |
52 | return false; |
53 | SourceManager &SM = PP.getSourceManager(); |
54 | const char *Ptr = SM.getCharacterData(SL: SM.getSpellingLoc(Loc: Tok.getLocation())); |
55 | return IsStringPrefix(Str: StringRef(Ptr, Tok.getLength()), |
56 | CPlusPlus11: LangOpts.CPlusPlus11); |
57 | } |
58 | |
59 | if (Tok.getLength() < 256) { |
60 | char Buffer[256]; |
61 | const char *TokPtr = Buffer; |
62 | unsigned length = PP.getSpelling(Tok, Buffer&: TokPtr); |
63 | return IsStringPrefix(Str: StringRef(TokPtr, length), CPlusPlus11: LangOpts.CPlusPlus11); |
64 | } |
65 | |
66 | return IsStringPrefix(Str: StringRef(PP.getSpelling(Tok)), CPlusPlus11: LangOpts.CPlusPlus11); |
67 | } |
68 | |
69 | TokenConcatenation::TokenConcatenation(const Preprocessor &pp) : PP(pp) { |
70 | memset(s: TokenInfo, c: 0, n: sizeof(TokenInfo)); |
71 | |
72 | // These tokens have custom code in AvoidConcat. |
73 | TokenInfo[tok::identifier ] |= aci_custom; |
74 | TokenInfo[tok::numeric_constant] |= aci_custom_firstchar; |
75 | TokenInfo[tok::period ] |= aci_custom_firstchar; |
76 | TokenInfo[tok::amp ] |= aci_custom_firstchar; |
77 | TokenInfo[tok::plus ] |= aci_custom_firstchar; |
78 | TokenInfo[tok::minus ] |= aci_custom_firstchar; |
79 | TokenInfo[tok::slash ] |= aci_custom_firstchar; |
80 | TokenInfo[tok::less ] |= aci_custom_firstchar; |
81 | TokenInfo[tok::greater ] |= aci_custom_firstchar; |
82 | TokenInfo[tok::pipe ] |= aci_custom_firstchar; |
83 | TokenInfo[tok::percent ] |= aci_custom_firstchar; |
84 | TokenInfo[tok::colon ] |= aci_custom_firstchar; |
85 | TokenInfo[tok::hash ] |= aci_custom_firstchar; |
86 | TokenInfo[tok::arrow ] |= aci_custom_firstchar; |
87 | |
88 | // These tokens have custom code in C++11 mode. |
89 | if (PP.getLangOpts().CPlusPlus11) { |
90 | TokenInfo[tok::string_literal ] |= aci_custom; |
91 | TokenInfo[tok::wide_string_literal ] |= aci_custom; |
92 | TokenInfo[tok::utf8_string_literal ] |= aci_custom; |
93 | TokenInfo[tok::utf16_string_literal] |= aci_custom; |
94 | TokenInfo[tok::utf32_string_literal] |= aci_custom; |
95 | TokenInfo[tok::char_constant ] |= aci_custom; |
96 | TokenInfo[tok::wide_char_constant ] |= aci_custom; |
97 | TokenInfo[tok::utf16_char_constant ] |= aci_custom; |
98 | TokenInfo[tok::utf32_char_constant ] |= aci_custom; |
99 | } |
100 | |
101 | // These tokens have custom code in C++17 mode. |
102 | if (PP.getLangOpts().CPlusPlus17) |
103 | TokenInfo[tok::utf8_char_constant] |= aci_custom; |
104 | |
105 | // These tokens have custom code in C++2a mode. |
106 | if (PP.getLangOpts().CPlusPlus20) |
107 | TokenInfo[tok::lessequal ] |= aci_custom_firstchar; |
108 | |
109 | // These tokens change behavior if followed by an '='. |
110 | TokenInfo[tok::amp ] |= aci_avoid_equal; // &= |
111 | TokenInfo[tok::plus ] |= aci_avoid_equal; // += |
112 | TokenInfo[tok::minus ] |= aci_avoid_equal; // -= |
113 | TokenInfo[tok::slash ] |= aci_avoid_equal; // /= |
114 | TokenInfo[tok::less ] |= aci_avoid_equal; // <= |
115 | TokenInfo[tok::greater ] |= aci_avoid_equal; // >= |
116 | TokenInfo[tok::pipe ] |= aci_avoid_equal; // |= |
117 | TokenInfo[tok::percent ] |= aci_avoid_equal; // %= |
118 | TokenInfo[tok::star ] |= aci_avoid_equal; // *= |
119 | TokenInfo[tok::exclaim ] |= aci_avoid_equal; // != |
120 | TokenInfo[tok::lessless ] |= aci_avoid_equal; // <<= |
121 | TokenInfo[tok::greatergreater] |= aci_avoid_equal; // >>= |
122 | TokenInfo[tok::caret ] |= aci_avoid_equal; // ^= |
123 | TokenInfo[tok::equal ] |= aci_avoid_equal; // == |
124 | } |
125 | |
126 | /// GetFirstChar - Get the first character of the token \arg Tok, |
127 | /// avoiding calls to getSpelling where possible. |
128 | static char GetFirstChar(const Preprocessor &PP, const Token &Tok) { |
129 | if (IdentifierInfo *II = Tok.getIdentifierInfo()) { |
130 | // Avoid spelling identifiers, the most common form of token. |
131 | return II->getNameStart()[0]; |
132 | } else if (!Tok.needsCleaning()) { |
133 | if (Tok.isLiteral() && Tok.getLiteralData()) { |
134 | return *Tok.getLiteralData(); |
135 | } else { |
136 | SourceManager &SM = PP.getSourceManager(); |
137 | return *SM.getCharacterData(SL: SM.getSpellingLoc(Loc: Tok.getLocation())); |
138 | } |
139 | } else if (Tok.getLength() < 256) { |
140 | char Buffer[256]; |
141 | const char *TokPtr = Buffer; |
142 | PP.getSpelling(Tok, Buffer&: TokPtr); |
143 | return TokPtr[0]; |
144 | } else { |
145 | return PP.getSpelling(Tok)[0]; |
146 | } |
147 | } |
148 | |
149 | /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause |
150 | /// the two individual tokens to be lexed as a single token, return true |
151 | /// (which causes a space to be printed between them). This allows the output |
152 | /// of -E mode to be lexed to the same token stream as lexing the input |
153 | /// directly would. |
154 | /// |
155 | /// This code must conservatively return true if it doesn't want to be 100% |
156 | /// accurate. This will cause the output to include extra space characters, |
157 | /// but the resulting output won't have incorrect concatenations going on. |
158 | /// Examples include "..", which we print with a space between, because we |
159 | /// don't want to track enough to tell "x.." from "...". |
160 | bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, |
161 | const Token &PrevTok, |
162 | const Token &Tok) const { |
163 | // Conservatively assume that every annotation token that has a printable |
164 | // form requires whitespace. |
165 | if (PrevTok.isAnnotation()) |
166 | return true; |
167 | |
168 | // First, check to see if the tokens were directly adjacent in the original |
169 | // source. If they were, it must be okay to stick them together: if there |
170 | // were an issue, the tokens would have been lexed differently. |
171 | SourceManager &SM = PP.getSourceManager(); |
172 | SourceLocation PrevSpellLoc = SM.getSpellingLoc(Loc: PrevTok.getLocation()); |
173 | SourceLocation SpellLoc = SM.getSpellingLoc(Loc: Tok.getLocation()); |
174 | if (PrevSpellLoc.getLocWithOffset(Offset: PrevTok.getLength()) == SpellLoc) |
175 | return false; |
176 | |
177 | tok::TokenKind PrevKind = PrevTok.getKind(); |
178 | if (!PrevTok.isAnnotation() && PrevTok.getIdentifierInfo()) |
179 | PrevKind = tok::identifier; // Language keyword or named operator. |
180 | |
181 | // Look up information on when we should avoid concatenation with prevtok. |
182 | unsigned ConcatInfo = TokenInfo[PrevKind]; |
183 | |
184 | // If prevtok never causes a problem for anything after it, return quickly. |
185 | if (ConcatInfo == 0) return false; |
186 | |
187 | if (ConcatInfo & aci_avoid_equal) { |
188 | // If the next token is '=' or '==', avoid concatenation. |
189 | if (Tok.isOneOf(K1: tok::equal, K2: tok::equalequal)) |
190 | return true; |
191 | ConcatInfo &= ~aci_avoid_equal; |
192 | } |
193 | if (Tok.isAnnotation()) { |
194 | // Modules annotation can show up when generated automatically for includes. |
195 | assert(Tok.isOneOf(tok::annot_module_include, tok::annot_module_begin, |
196 | tok::annot_module_end, tok::annot_embed) && |
197 | "unexpected annotation in AvoidConcat" ); |
198 | |
199 | ConcatInfo = 0; |
200 | if (Tok.is(K: tok::annot_embed)) |
201 | return true; |
202 | } |
203 | |
204 | if (ConcatInfo == 0) |
205 | return false; |
206 | |
207 | // Basic algorithm: we look at the first character of the second token, and |
208 | // determine whether it, if appended to the first token, would form (or |
209 | // would contribute) to a larger token if concatenated. |
210 | char FirstChar = 0; |
211 | if (ConcatInfo & aci_custom) { |
212 | // If the token does not need to know the first character, don't get it. |
213 | } else { |
214 | FirstChar = GetFirstChar(PP, Tok); |
215 | } |
216 | |
217 | switch (PrevKind) { |
218 | default: |
219 | llvm_unreachable("InitAvoidConcatTokenInfo built wrong" ); |
220 | |
221 | case tok::raw_identifier: |
222 | llvm_unreachable("tok::raw_identifier in non-raw lexing mode!" ); |
223 | |
224 | case tok::string_literal: |
225 | case tok::wide_string_literal: |
226 | case tok::utf8_string_literal: |
227 | case tok::utf16_string_literal: |
228 | case tok::utf32_string_literal: |
229 | case tok::char_constant: |
230 | case tok::wide_char_constant: |
231 | case tok::utf8_char_constant: |
232 | case tok::utf16_char_constant: |
233 | case tok::utf32_char_constant: |
234 | if (!PP.getLangOpts().CPlusPlus11) |
235 | return false; |
236 | |
237 | // In C++11, a string or character literal followed by an identifier is a |
238 | // single token. |
239 | if (Tok.getIdentifierInfo()) |
240 | return true; |
241 | |
242 | // A ud-suffix is an identifier. If the previous token ends with one, treat |
243 | // it as an identifier. |
244 | if (!PrevTok.hasUDSuffix()) |
245 | return false; |
246 | [[fallthrough]]; |
247 | case tok::identifier: // id+id or id+number or id+L"foo". |
248 | // id+'.'... will not append. |
249 | if (Tok.is(K: tok::numeric_constant)) |
250 | return GetFirstChar(PP, Tok) != '.'; |
251 | |
252 | if (Tok.getIdentifierInfo() || |
253 | Tok.isOneOf(K1: tok::wide_string_literal, Ks: tok::utf8_string_literal, |
254 | Ks: tok::utf16_string_literal, Ks: tok::utf32_string_literal, |
255 | Ks: tok::wide_char_constant, Ks: tok::utf8_char_constant, |
256 | Ks: tok::utf16_char_constant, Ks: tok::utf32_char_constant)) |
257 | return true; |
258 | |
259 | // If this isn't identifier + string, we're done. |
260 | if (Tok.isNot(K: tok::char_constant) && Tok.isNot(K: tok::string_literal)) |
261 | return false; |
262 | |
263 | // Otherwise, this is a narrow character or string. If the *identifier* |
264 | // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo". |
265 | return IsIdentifierStringPrefix(Tok: PrevTok); |
266 | |
267 | case tok::numeric_constant: |
268 | return isPreprocessingNumberBody(c: FirstChar) || |
269 | FirstChar == '+' || FirstChar == '-'; |
270 | case tok::period: // ..., .*, .1234 |
271 | return (FirstChar == '.' && PrevPrevTok.is(K: tok::period)) || |
272 | isDigit(c: FirstChar) || |
273 | (PP.getLangOpts().CPlusPlus && FirstChar == '*'); |
274 | case tok::amp: // && |
275 | return FirstChar == '&'; |
276 | case tok::plus: // ++ |
277 | return FirstChar == '+'; |
278 | case tok::minus: // --, ->, ->* |
279 | return FirstChar == '-' || FirstChar == '>'; |
280 | case tok::slash: //, /*, // |
281 | return FirstChar == '*' || FirstChar == '/'; |
282 | case tok::less: // <<, <<=, <:, <% |
283 | return FirstChar == '<' || FirstChar == ':' || FirstChar == '%'; |
284 | case tok::greater: // >>, >>= |
285 | return FirstChar == '>'; |
286 | case tok::pipe: // || |
287 | return FirstChar == '|'; |
288 | case tok::percent: // %>, %: |
289 | return FirstChar == '>' || FirstChar == ':'; |
290 | case tok::colon: // ::, :> |
291 | return FirstChar == '>' || |
292 | (PP.getLangOpts().CPlusPlus && FirstChar == ':'); |
293 | case tok::hash: // ##, #@, %:%: |
294 | return FirstChar == '#' || FirstChar == '@' || FirstChar == '%'; |
295 | case tok::arrow: // ->* |
296 | return PP.getLangOpts().CPlusPlus && FirstChar == '*'; |
297 | case tok::lessequal: // <=> (C++2a) |
298 | return PP.getLangOpts().CPlusPlus20 && FirstChar == '>'; |
299 | } |
300 | } |
301 | |