| 1 | //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains the main building blocks of macro support in |
| 11 | /// clang-format. |
| 12 | /// |
| 13 | /// In order to not violate the requirement that clang-format can format files |
| 14 | /// in isolation, clang-format's macro support uses expansions users provide |
| 15 | /// as part of clang-format's style configuration. |
| 16 | /// |
| 17 | /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support |
| 18 | /// one level of expansion (\see MacroExpander for a full description of what |
| 19 | /// is supported). |
| 20 | /// |
| 21 | /// As part of parsing, clang-format uses the MacroExpander to expand the |
| 22 | /// spelled token streams into expanded token streams when it encounters a |
| 23 | /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines |
| 24 | /// from the expanded token stream. |
| 25 | /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor |
| 26 | /// matches the spelled token stream into unwrapped lines that best resemble the |
| 27 | /// structure of the expanded unwrapped lines. These reconstructed unwrapped |
| 28 | /// lines are aliasing the tokens in the expanded token stream, so that token |
| 29 | /// annotations will be reused when formatting the spelled macro calls. |
| 30 | /// |
| 31 | /// When formatting, clang-format annotates and formats the expanded unwrapped |
| 32 | /// lines first, determining the token types. Next, it formats the spelled |
| 33 | /// unwrapped lines, keeping the token types fixed, while allowing other |
| 34 | /// formatting decisions to change. |
| 35 | /// |
| 36 | //===----------------------------------------------------------------------===// |
| 37 | |
| 38 | #ifndef CLANG_LIB_FORMAT_MACROS_H |
| 39 | #define CLANG_LIB_FORMAT_MACROS_H |
| 40 | |
| 41 | #include <list> |
| 42 | |
| 43 | #include "FormatToken.h" |
| 44 | #include "llvm/ADT/DenseMap.h" |
| 45 | |
| 46 | namespace clang { |
| 47 | namespace format { |
| 48 | |
| 49 | struct UnwrappedLine; |
| 50 | struct UnwrappedLineNode; |
| 51 | |
| 52 | /// Takes a set of macro definitions as strings and allows expanding calls to |
| 53 | /// those macros. |
| 54 | /// |
| 55 | /// For example: |
| 56 | /// Definition: A(x, y)=x + y |
| 57 | /// Call : A(int a = 1, 2) |
| 58 | /// Expansion : int a = 1 + 2 |
| 59 | /// |
| 60 | /// Expansion does not check arity of the definition. |
| 61 | /// If fewer arguments than expected are provided, the remaining parameters |
| 62 | /// are considered empty: |
| 63 | /// Call : A(a) |
| 64 | /// Expansion: a + |
| 65 | /// If more arguments than expected are provided, they will be discarded. |
| 66 | /// |
| 67 | /// The expander does not support: |
| 68 | /// - recursive expansion |
| 69 | /// - stringification |
| 70 | /// - concatenation |
| 71 | /// - variadic macros |
| 72 | /// |
| 73 | /// Furthermore, only a single expansion of each macro argument is supported, |
| 74 | /// so that we cannot get conflicting formatting decisions from different |
| 75 | /// expansions. |
| 76 | /// Definition: A(x)=x+x |
| 77 | /// Call : A(id) |
| 78 | /// Expansion : id+x |
| 79 | /// |
| 80 | class MacroExpander { |
| 81 | public: |
| 82 | using ArgsList = ArrayRef<SmallVector<FormatToken *, 8>>; |
| 83 | |
| 84 | /// Construct a macro expander from a set of macro definitions. |
| 85 | /// Macro definitions must be encoded as UTF-8. |
| 86 | /// |
| 87 | /// Each entry in \p Macros must conform to the following simple |
| 88 | /// macro-definition language: |
| 89 | /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion> |
| 90 | /// <params> ::= <id-list> | "" |
| 91 | /// <id-list> ::= <id> | <id> "," <params> |
| 92 | /// <expansion> ::= "=" <tail> | <eof> |
| 93 | /// <tail> ::= <tok> <tail> | <eof> |
| 94 | /// |
| 95 | /// Macros that cannot be parsed will be silently discarded. |
| 96 | /// |
| 97 | MacroExpander(const std::vector<std::string> &Macros, |
| 98 | SourceManager &SourceMgr, const FormatStyle &Style, |
| 99 | llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, |
| 100 | IdentifierTable &IdentTable); |
| 101 | ~MacroExpander(); |
| 102 | |
| 103 | /// Returns whether any macro \p Name is defined, regardless of overloads. |
| 104 | bool defined(StringRef Name) const; |
| 105 | |
| 106 | /// Returns whether \p Name has an object-like overload, i.e. where the macro |
| 107 | /// has no arguments and should not consume subsequent parentheses. |
| 108 | bool objectLike(StringRef Name) const; |
| 109 | |
| 110 | /// Returns whether macro \p Name provides an overload with the given arity. |
| 111 | bool hasArity(StringRef Name, unsigned Arity) const; |
| 112 | |
| 113 | /// Returns the expanded stream of format tokens for \p ID, where |
| 114 | /// each element in \p OptionalArgs is a positional argument to the macro call. |
| 115 | /// If \p OptionalArgs is not set, the object-like overload is used. |
| 116 | /// If \p OptionalArgs is set, the overload with the arity equal to |
| 117 | /// \c OptionalArgs->size() is used. |
| 118 | SmallVector<FormatToken *, 8> |
| 119 | expand(FormatToken *ID, std::optional<ArgsList> OptionalArgs) const; |
| 120 | |
| 121 | private: |
| 122 | struct Definition; |
| 123 | class DefinitionParser; |
| 124 | |
| 125 | void parseDefinition(const std::string &Macro); |
| 126 | |
| 127 | SourceManager &SourceMgr; |
| 128 | const FormatStyle &Style; |
| 129 | llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator; |
| 130 | IdentifierTable &IdentTable; |
| 131 | SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers; |
| 132 | llvm::StringMap<llvm::DenseMap<int, Definition>> FunctionLike; |
| 133 | llvm::StringMap<Definition> ObjectLike; |
| 134 | }; |
| 135 | |
| 136 | /// Converts a sequence of UnwrappedLines containing expanded macros into a |
| 137 | /// single UnwrappedLine containing the macro calls. This UnwrappedLine may be |
| 138 | /// broken into child lines, in a way that best conveys the structure of the |
| 139 | /// expanded code. |
| 140 | /// |
| 141 | /// In the simplest case, a spelled UnwrappedLine contains one macro, and after |
| 142 | /// expanding it we have one expanded UnwrappedLine. In general, macro |
| 143 | /// expansions can span UnwrappedLines, and multiple macros can contribute |
| 144 | /// tokens to the same line. We keep consuming expanded lines until: |
| 145 | /// * all expansions that started have finished (we're not chopping any macros |
| 146 | /// in half) |
| 147 | /// * *and* we've reached the end of a *spelled* unwrapped line. |
| 148 | /// |
| 149 | /// A single UnwrappedLine represents this chunk of code. |
| 150 | /// |
| 151 | /// After this point, the state of the spelled/expanded stream is "in sync" |
| 152 | /// (both at the start of an UnwrappedLine, with no macros open), so the |
| 153 | /// Reconstructor can be thrown away and parsing can continue. |
| 154 | /// |
| 155 | /// Given a mapping from the macro name identifier token in the macro call |
| 156 | /// to the tokens of the macro call, for example: |
| 157 | /// CLASSA -> CLASSA({public: void x();}) |
| 158 | /// |
| 159 | /// When getting the formatted lines of the expansion via the \c addLine method |
| 160 | /// (each '->' specifies a call to \c addLine ): |
| 161 | /// -> class A { |
| 162 | /// -> public: |
| 163 | /// -> void x(); |
| 164 | /// -> }; |
| 165 | /// |
| 166 | /// Creates the tree of unwrapped lines containing the macro call tokens so that |
| 167 | /// the macro call tokens fit the semantic structure of the expanded formatted |
| 168 | /// lines: |
| 169 | /// -> CLASSA({ |
| 170 | /// -> public: |
| 171 | /// -> void x(); |
| 172 | /// -> }) |
| 173 | class MacroCallReconstructor { |
| 174 | public: |
| 175 | /// Create a Reconstructor whose resulting \c UnwrappedLine will start at |
| 176 | /// \p Level, using the map from name identifier token to the corresponding |
| 177 | /// tokens of the spelled macro call. |
| 178 | MacroCallReconstructor( |
| 179 | unsigned Level, |
| 180 | const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> |
| 181 | &ActiveExpansions); |
| 182 | |
| 183 | /// For the given \p Line, match all occurrences of tokens expanded from a |
| 184 | /// macro to unwrapped lines in the spelled macro call so that the resulting |
| 185 | /// tree of unwrapped lines best resembles the structure of unwrapped lines |
| 186 | /// passed in via \c addLine. |
| 187 | void addLine(const UnwrappedLine &Line); |
| 188 | |
| 189 | /// Check whether at the current state there is no open macro expansion |
| 190 | /// that needs to be processed to finish a macro call. |
| 191 | /// Only when \c finished() is true, \c takeResult() can be called to retrieve |
| 192 | /// the resulting \c UnwrappedLine. |
| 193 | /// If there are multiple subsequent macro calls within an unwrapped line in |
| 194 | /// the spelled token stream, the calling code may also continue to call |
| 195 | /// \c addLine() when \c finished() is true. |
| 196 | bool finished() const { return ActiveExpansions.empty(); } |
| 197 | |
| 198 | /// Retrieve the formatted \c UnwrappedLine containing the original |
| 199 | /// macro calls, formatted according to the expanded token stream received |
| 200 | /// via \c addLine(). |
| 201 | /// Generally, this line tries to have the same structure as the expanded, |
| 202 | /// formatted unwrapped lines handed in via \c addLine(), with the exception |
| 203 | /// that for multiple top-level lines, each subsequent line will be the |
| 204 | /// child of the last token in its predecessor. This representation is chosen |
| 205 | /// because it is a precondition to the formatter that we get what looks like |
| 206 | /// a single statement in a single \c UnwrappedLine (i.e. matching parens). |
| 207 | /// |
| 208 | /// If a token in a macro argument is a child of a token in the expansion, |
| 209 | /// the parent will be the corresponding token in the macro call. |
| 210 | /// For example: |
| 211 | /// #define C(a, b) class C { a b |
| 212 | /// C(int x;, int y;) |
| 213 | /// would expand to |
| 214 | /// class C { int x; int y; |
| 215 | /// where in a formatted line "int x;" and "int y;" would both be new separate |
| 216 | /// lines. |
| 217 | /// |
| 218 | /// In the result, "int x;" will be a child of the opening parenthesis in "C(" |
| 219 | /// and "int y;" will be a child of the "," token: |
| 220 | /// C ( |
| 221 | /// \- int x; |
| 222 | /// , |
| 223 | /// \- int y; |
| 224 | /// ) |
| 225 | UnwrappedLine takeResult() &&; |
| 226 | |
| 227 | private: |
| 228 | void add(FormatToken *Token, FormatToken *ExpandedParent, bool First, |
| 229 | unsigned Level); |
| 230 | void prepareParent(FormatToken *ExpandedParent, bool First, unsigned Level); |
| 231 | FormatToken *getParentInResult(FormatToken *Parent); |
| 232 | void reconstruct(FormatToken *Token); |
| 233 | void startReconstruction(FormatToken *Token); |
| 234 | bool reconstructActiveCallUntil(FormatToken *Token); |
| 235 | void endReconstruction(FormatToken *Token); |
| 236 | bool processNextReconstructed(); |
| 237 | void finalize(); |
| 238 | |
| 239 | struct ReconstructedLine; |
| 240 | |
| 241 | void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr); |
| 242 | UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level); |
| 243 | void debug(const ReconstructedLine &Line, int Level); |
| 244 | ReconstructedLine &parentLine(); |
| 245 | ReconstructedLine *currentLine(); |
| 246 | void debugParentMap() const; |
| 247 | |
| 248 | #ifndef NDEBUG |
| 249 | enum ReconstructorState { |
| 250 | Start, // No macro expansion was found in the input yet. |
| 251 | InProgress, // During a macro reconstruction. |
| 252 | Finalized, // Past macro reconstruction, the result is finalized. |
| 253 | }; |
| 254 | ReconstructorState State = Start; |
| 255 | #endif |
| 256 | |
| 257 | // Node in which we build up the resulting unwrapped line; this type is |
| 258 | // analogous to UnwrappedLineNode. |
| 259 | struct LineNode { |
| 260 | LineNode() = default; |
| 261 | LineNode(FormatToken *Tok) : Tok(Tok) {} |
| 262 | FormatToken *Tok = nullptr; |
| 263 | SmallVector<std::unique_ptr<ReconstructedLine>> Children; |
| 264 | }; |
| 265 | |
| 266 | // Line in which we build up the resulting unwrapped line. |
| 267 | // FIXME: Investigate changing UnwrappedLine to a pointer type and using it |
| 268 | // instead of rolling our own type. |
| 269 | struct ReconstructedLine { |
| 270 | explicit ReconstructedLine(unsigned Level) : Level(Level) {} |
| 271 | unsigned Level; |
| 272 | SmallVector<std::unique_ptr<LineNode>> Tokens; |
| 273 | }; |
| 274 | |
| 275 | // The line in which we collect the resulting reconstructed output. |
| 276 | // To reduce special cases in the algorithm, the first level of the line |
| 277 | // contains a single null token that has the reconstructed incoming |
| 278 | // lines as children. |
| 279 | // In the end, we stitch the lines together so that each subsequent line |
| 280 | // is a child of the last token of the previous line. This is necessary |
| 281 | // in order to format the overall expression as a single logical line - |
| 282 | // if we created separate lines, we'd format them with their own top-level |
| 283 | // indent depending on the semantic structure, which is not desired. |
| 284 | ReconstructedLine Result; |
| 285 | |
| 286 | // Stack of currently "open" lines, where each line's predecessor's last |
| 287 | // token is the parent token for that line. |
| 288 | SmallVector<ReconstructedLine *> ActiveReconstructedLines; |
| 289 | |
| 290 | // Maps from the expanded token to the token that takes its place in the |
| 291 | // reconstructed token stream in terms of parent-child relationships. |
| 292 | // Note that it might take multiple steps to arrive at the correct |
| 293 | // parent in the output. |
| 294 | // Given: #define C(a, b) []() { a; b; } |
| 295 | // And a call: C(f(), g()) |
| 296 | // The structure in the incoming formatted unwrapped line will be: |
| 297 | // []() { |
| 298 | // |- f(); |
| 299 | // \- g(); |
| 300 | // } |
| 301 | // with f and g being children of the opening brace. |
| 302 | // In the reconstructed call: |
| 303 | // C(f(), g()) |
| 304 | // \- f() |
| 305 | // \- g() |
| 306 | // We want f to be a child of the opening parenthesis and g to be a child |
| 307 | // of the comma token in the macro call. |
| 308 | // Thus, we map |
| 309 | // { -> ( |
| 310 | // and add |
| 311 | // ( -> , |
| 312 | // once we're past the comma in the reconstruction. |
| 313 | llvm::DenseMap<FormatToken *, FormatToken *> |
| 314 | SpelledParentToReconstructedParent; |
| 315 | |
| 316 | // Keeps track of a single expansion while we're reconstructing tokens it |
| 317 | // generated. |
| 318 | struct Expansion { |
| 319 | // The identifier token of the macro call. |
| 320 | FormatToken *ID; |
| 321 | // Our current position in the reconstruction. |
| 322 | std::list<UnwrappedLineNode>::iterator SpelledI; |
| 323 | // The end of the reconstructed token sequence. |
| 324 | std::list<UnwrappedLineNode>::iterator SpelledE; |
| 325 | }; |
| 326 | |
| 327 | // Stack of macro calls for which we're in the middle of an expansion. |
| 328 | SmallVector<Expansion> ActiveExpansions; |
| 329 | |
| 330 | struct MacroCallState { |
| 331 | MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken, |
| 332 | FormatToken *MacroCallLParen); |
| 333 | |
| 334 | ReconstructedLine *Line; |
| 335 | |
| 336 | // The last token in the parent line or expansion, or nullptr if the macro |
| 337 | // expansion is on a top-level line. |
| 338 | // |
| 339 | // For example, in the macro call: |
| 340 | // auto f = []() { ID(1); }; |
| 341 | // The MacroCallState for ID will have '{' as ParentLastToken. |
| 342 | // |
| 343 | // In the macro call: |
| 344 | // ID(ID(void f())); |
| 345 | // The MacroCallState of the outer ID will have nullptr as ParentLastToken, |
| 346 | // while the MacroCallState for the inner ID will have the '(' of the outer |
| 347 | // ID as ParentLastToken. |
| 348 | // |
| 349 | // In the macro call: |
| 350 | // ID2(a, ID(b)); |
| 351 | // The MacroCallState of ID will have ',' as ParentLastToken. |
| 352 | FormatToken *ParentLastToken; |
| 353 | |
| 354 | // The l_paren of this MacroCallState's macro call. |
| 355 | FormatToken *MacroCallLParen; |
| 356 | }; |
| 357 | |
| 358 | // Keeps track of the lines into which the opening brace/parenthesis & |
| 359 | // argument separating commas for each level in the macro call go in order to |
| 360 | // put the corresponding closing brace/parenthesis into the same line in the |
| 361 | // output and keep track of which parents in the expanded token stream map to |
| 362 | // which tokens in the reconstructed stream. |
| 363 | // When an opening brace/parenthesis has children, we want the structure of |
| 364 | // the output line to be: |
| 365 | // |- MACRO |
| 366 | // |- ( |
| 367 | // | \- <argument> |
| 368 | // |- , |
| 369 | // | \- <argument> |
| 370 | // \- ) |
| 371 | SmallVector<MacroCallState> MacroCallStructure; |
| 372 | |
| 373 | // Maps from identifier of the macro call to an unwrapped line containing |
| 374 | // all tokens of the macro call. |
| 375 | const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> |
| 376 | &IdToReconstructed; |
| 377 | }; |
| 378 | |
| 379 | } // namespace format |
| 380 | } // namespace clang |
| 381 | |
| 382 | #endif |
| 383 | |