1//===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains the main building blocks of macro support in
11/// clang-format.
12///
13/// In order to not violate the requirement that clang-format can format files
14/// in isolation, clang-format's macro support uses expansions users provide
15/// as part of clang-format's style configuration.
16///
17/// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
18/// one level of expansion (\see MacroExpander for a full description of what
19/// is supported).
20///
21/// As part of parsing, clang-format uses the MacroExpander to expand the
22/// spelled token streams into expanded token streams when it encounters a
23/// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
24/// from the expanded token stream.
25/// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
26/// matches the spelled token stream into unwrapped lines that best resemble the
27/// structure of the expanded unwrapped lines. These reconstructed unwrapped
28/// lines are aliasing the tokens in the expanded token stream, so that token
29/// annotations will be reused when formatting the spelled macro calls.
30///
31/// When formatting, clang-format annotates and formats the expanded unwrapped
32/// lines first, determining the token types. Next, it formats the spelled
33/// unwrapped lines, keeping the token types fixed, while allowing other
34/// formatting decisions to change.
35///
36//===----------------------------------------------------------------------===//
37
38#ifndef CLANG_LIB_FORMAT_MACROS_H
39#define CLANG_LIB_FORMAT_MACROS_H
40
41#include <list>
42
43#include "FormatToken.h"
44#include "llvm/ADT/DenseMap.h"
45
46namespace clang {
47namespace format {
48
49struct UnwrappedLine;
50struct UnwrappedLineNode;
51
52/// Takes a set of macro definitions as strings and allows expanding calls to
53/// those macros.
54///
55/// For example:
56/// Definition: A(x, y)=x + y
57/// Call : A(int a = 1, 2)
58/// Expansion : int a = 1 + 2
59///
60/// Expansion does not check arity of the definition.
61/// If fewer arguments than expected are provided, the remaining parameters
62/// are considered empty:
63/// Call : A(a)
64/// Expansion: a +
65/// If more arguments than expected are provided, they will be discarded.
66///
67/// The expander does not support:
68/// - recursive expansion
69/// - stringification
70/// - concatenation
71/// - variadic macros
72///
73/// Furthermore, only a single expansion of each macro argument is supported,
74/// so that we cannot get conflicting formatting decisions from different
75/// expansions.
76/// Definition: A(x)=x+x
77/// Call : A(id)
78/// Expansion : id+x
79///
80class MacroExpander {
81public:
82 using ArgsList = ArrayRef<SmallVector<FormatToken *, 8>>;
83
84 /// Construct a macro expander from a set of macro definitions.
85 /// Macro definitions must be encoded as UTF-8.
86 ///
87 /// Each entry in \p Macros must conform to the following simple
88 /// macro-definition language:
89 /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
90 /// <params> ::= <id-list> | ""
91 /// <id-list> ::= <id> | <id> "," <params>
92 /// <expansion> ::= "=" <tail> | <eof>
93 /// <tail> ::= <tok> <tail> | <eof>
94 ///
95 /// Macros that cannot be parsed will be silently discarded.
96 ///
97 MacroExpander(const std::vector<std::string> &Macros,
98 SourceManager &SourceMgr, const FormatStyle &Style,
99 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
100 IdentifierTable &IdentTable);
101 ~MacroExpander();
102
103 /// Returns whether any macro \p Name is defined, regardless of overloads.
104 bool defined(StringRef Name) const;
105
106 /// Returns whetherh there is an object-like overload, i.e. where the macro
107 /// has no arguments and should not consume subsequent parentheses.
108 bool objectLike(StringRef Name) const;
109
110 /// Returns whether macro \p Name provides an overload with the given arity.
111 bool hasArity(StringRef Name, unsigned Arity) const;
112
113 /// Returns the expanded stream of format tokens for \p ID, where
114 /// each element in \p Args is a positional argument to the macro call.
115 /// If \p Args is not set, the object-like overload is used.
116 /// If \p Args is set, the overload with the arity equal to \c Args.size() is
117 /// used.
118 SmallVector<FormatToken *, 8>
119 expand(FormatToken *ID, std::optional<ArgsList> OptionalArgs) const;
120
121private:
122 struct Definition;
123 class DefinitionParser;
124
125 void parseDefinition(const std::string &Macro);
126
127 SourceManager &SourceMgr;
128 const FormatStyle &Style;
129 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
130 IdentifierTable &IdentTable;
131 SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers;
132 llvm::StringMap<llvm::DenseMap<int, Definition>> FunctionLike;
133 llvm::StringMap<Definition> ObjectLike;
134};
135
136/// Converts a sequence of UnwrappedLines containing expanded macros into a
137/// single UnwrappedLine containing the macro calls. This UnwrappedLine may be
138/// broken into child lines, in a way that best conveys the structure of the
139/// expanded code.
140///
141/// In the simplest case, a spelled UnwrappedLine contains one macro, and after
142/// expanding it we have one expanded UnwrappedLine. In general, macro
143/// expansions can span UnwrappedLines, and multiple macros can contribute
144/// tokens to the same line. We keep consuming expanded lines until:
145/// * all expansions that started have finished (we're not chopping any macros
146/// in half)
147/// * *and* we've reached the end of a *spelled* unwrapped line.
148///
149/// A single UnwrappedLine represents this chunk of code.
150///
151/// After this point, the state of the spelled/expanded stream is "in sync"
152/// (both at the start of an UnwrappedLine, with no macros open), so the
153/// Reconstructor can be thrown away and parsing can continue.
154///
155/// Given a mapping from the macro name identifier token in the macro call
156/// to the tokens of the macro call, for example:
157/// CLASSA -> CLASSA({public: void x();})
158///
159/// When getting the formatted lines of the expansion via the \c addLine method
160/// (each '->' specifies a call to \c addLine ):
161/// -> class A {
162/// -> public:
163/// -> void x();
164/// -> };
165///
166/// Creates the tree of unwrapped lines containing the macro call tokens so that
167/// the macro call tokens fit the semantic structure of the expanded formatted
168/// lines:
169/// -> CLASSA({
170/// -> public:
171/// -> void x();
172/// -> })
173class MacroCallReconstructor {
174public:
175 /// Create an Reconstructor whose resulting \p UnwrappedLine will start at
176 /// \p Level, using the map from name identifier token to the corresponding
177 /// tokens of the spelled macro call.
178 MacroCallReconstructor(
179 unsigned Level,
180 const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
181 &ActiveExpansions);
182
183 /// For the given \p Line, match all occurences of tokens expanded from a
184 /// macro to unwrapped lines in the spelled macro call so that the resulting
185 /// tree of unwrapped lines best resembles the structure of unwrapped lines
186 /// passed in via \c addLine.
187 void addLine(const UnwrappedLine &Line);
188
189 /// Check whether at the current state there is no open macro expansion
190 /// that needs to be processed to finish an macro call.
191 /// Only when \c finished() is true, \c takeResult() can be called to retrieve
192 /// the resulting \c UnwrappedLine.
193 /// If there are multiple subsequent macro calls within an unwrapped line in
194 /// the spelled token stream, the calling code may also continue to call
195 /// \c addLine() when \c finished() is true.
196 bool finished() const { return ActiveExpansions.empty(); }
197
198 /// Retrieve the formatted \c UnwrappedLine containing the orginal
199 /// macro calls, formatted according to the expanded token stream received
200 /// via \c addLine().
201 /// Generally, this line tries to have the same structure as the expanded,
202 /// formatted unwrapped lines handed in via \c addLine(), with the exception
203 /// that for multiple top-level lines, each subsequent line will be the
204 /// child of the last token in its predecessor. This representation is chosen
205 /// because it is a precondition to the formatter that we get what looks like
206 /// a single statement in a single \c UnwrappedLine (i.e. matching parens).
207 ///
208 /// If a token in a macro argument is a child of a token in the expansion,
209 /// the parent will be the corresponding token in the macro call.
210 /// For example:
211 /// #define C(a, b) class C { a b
212 /// C(int x;, int y;)
213 /// would expand to
214 /// class C { int x; int y;
215 /// where in a formatted line "int x;" and "int y;" would both be new separate
216 /// lines.
217 ///
218 /// In the result, "int x;" will be a child of the opening parenthesis in "C("
219 /// and "int y;" will be a child of the "," token:
220 /// C (
221 /// \- int x;
222 /// ,
223 /// \- int y;
224 /// )
225 UnwrappedLine takeResult() &&;
226
227private:
228 void add(FormatToken *Token, FormatToken *ExpandedParent, bool First,
229 unsigned Level);
230 void prepareParent(FormatToken *ExpandedParent, bool First, unsigned Level);
231 FormatToken *getParentInResult(FormatToken *Parent);
232 void reconstruct(FormatToken *Token);
233 void startReconstruction(FormatToken *Token);
234 bool reconstructActiveCallUntil(FormatToken *Token);
235 void endReconstruction(FormatToken *Token);
236 bool processNextReconstructed();
237 void finalize();
238
239 struct ReconstructedLine;
240
241 void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr);
242 UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level);
243 void debug(const ReconstructedLine &Line, int Level);
244 ReconstructedLine &parentLine();
245 ReconstructedLine *currentLine();
246 void debugParentMap() const;
247
248#ifndef NDEBUG
249 enum ReconstructorState {
250 Start, // No macro expansion was found in the input yet.
251 InProgress, // During a macro reconstruction.
252 Finalized, // Past macro reconstruction, the result is finalized.
253 };
254 ReconstructorState State = Start;
255#endif
256
257 // Node in which we build up the resulting unwrapped line; this type is
258 // analogous to UnwrappedLineNode.
259 struct LineNode {
260 LineNode() = default;
261 LineNode(FormatToken *Tok) : Tok(Tok) {}
262 FormatToken *Tok = nullptr;
263 SmallVector<std::unique_ptr<ReconstructedLine>> Children;
264 };
265
266 // Line in which we build up the resulting unwrapped line.
267 // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
268 // instead of rolling our own type.
269 struct ReconstructedLine {
270 explicit ReconstructedLine(unsigned Level) : Level(Level) {}
271 unsigned Level;
272 SmallVector<std::unique_ptr<LineNode>> Tokens;
273 };
274
275 // The line in which we collect the resulting reconstructed output.
276 // To reduce special cases in the algorithm, the first level of the line
277 // contains a single null token that has the reconstructed incoming
278 // lines as children.
279 // In the end, we stich the lines together so that each subsequent line
280 // is a child of the last token of the previous line. This is necessary
281 // in order to format the overall expression as a single logical line -
282 // if we created separate lines, we'd format them with their own top-level
283 // indent depending on the semantic structure, which is not desired.
284 ReconstructedLine Result;
285
286 // Stack of currently "open" lines, where each line's predecessor's last
287 // token is the parent token for that line.
288 SmallVector<ReconstructedLine *> ActiveReconstructedLines;
289
290 // Maps from the expanded token to the token that takes its place in the
291 // reconstructed token stream in terms of parent-child relationships.
292 // Note that it might take multiple steps to arrive at the correct
293 // parent in the output.
294 // Given: #define C(a, b) []() { a; b; }
295 // And a call: C(f(), g())
296 // The structure in the incoming formatted unwrapped line will be:
297 // []() {
298 // |- f();
299 // \- g();
300 // }
301 // with f and g being children of the opening brace.
302 // In the reconstructed call:
303 // C(f(), g())
304 // \- f()
305 // \- g()
306 // We want f to be a child of the opening parenthesis and g to be a child
307 // of the comma token in the macro call.
308 // Thus, we map
309 // { -> (
310 // and add
311 // ( -> ,
312 // once we're past the comma in the reconstruction.
313 llvm::DenseMap<FormatToken *, FormatToken *>
314 SpelledParentToReconstructedParent;
315
316 // Keeps track of a single expansion while we're reconstructing tokens it
317 // generated.
318 struct Expansion {
319 // The identifier token of the macro call.
320 FormatToken *ID;
321 // Our current position in the reconstruction.
322 std::list<UnwrappedLineNode>::iterator SpelledI;
323 // The end of the reconstructed token sequence.
324 std::list<UnwrappedLineNode>::iterator SpelledE;
325 };
326
327 // Stack of macro calls for which we're in the middle of an expansion.
328 SmallVector<Expansion> ActiveExpansions;
329
330 struct MacroCallState {
331 MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken,
332 FormatToken *MacroCallLParen);
333
334 ReconstructedLine *Line;
335
336 // The last token in the parent line or expansion, or nullptr if the macro
337 // expansion is on a top-level line.
338 //
339 // For example, in the macro call:
340 // auto f = []() { ID(1); };
341 // The MacroCallState for ID will have '{' as ParentLastToken.
342 //
343 // In the macro call:
344 // ID(ID(void f()));
345 // The MacroCallState of the outer ID will have nullptr as ParentLastToken,
346 // while the MacroCallState for the inner ID will have the '(' of the outer
347 // ID as ParentLastToken.
348 //
349 // In the macro call:
350 // ID2(a, ID(b));
351 // The MacroCallState of ID will have ',' as ParentLastToken.
352 FormatToken *ParentLastToken;
353
354 // The l_paren of this MacroCallState's macro call.
355 FormatToken *MacroCallLParen;
356 };
357
358 // Keeps track of the lines into which the opening brace/parenthesis &
359 // argument separating commas for each level in the macro call go in order to
360 // put the corresponding closing brace/parenthesis into the same line in the
361 // output and keep track of which parents in the expanded token stream map to
362 // which tokens in the reconstructed stream.
363 // When an opening brace/parenthesis has children, we want the structure of
364 // the output line to be:
365 // |- MACRO
366 // |- (
367 // | \- <argument>
368 // |- ,
369 // | \- <argument>
370 // \- )
371 SmallVector<MacroCallState> MacroCallStructure;
372
373 // Maps from identifier of the macro call to an unwrapped line containing
374 // all tokens of the macro call.
375 const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
376 &IdToReconstructed;
377};
378
379} // namespace format
380} // namespace clang
381
382#endif
383