1//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class can produce a generic deterministic finite state automaton (DFA),
10// given a set of possible states and transitions.
11//
12// The input transitions can be nondeterministic - this class will produce the
13// deterministic equivalent state machine.
14//
15// The generated code can run the DFA and produce an accepted / not accepted
16// state and also produce, given a sequence of transitions that results in an
17// accepted state, the sequence of intermediate states. This is useful if the
18// initial automaton was nondeterministic - it allows mapping back from the DFA
19// to the NFA.
20//
21//===----------------------------------------------------------------------===//
22
23#include "DFAEmitter.h"
24#include "Basic/SequenceToOffsetTable.h"
25#include "llvm/ADT/SmallVector.h"
26#include "llvm/ADT/StringExtras.h"
27#include "llvm/ADT/UniqueVector.h"
28#include "llvm/Support/Debug.h"
29#include "llvm/Support/raw_ostream.h"
30#include "llvm/TableGen/Record.h"
31#include "llvm/TableGen/TableGenBackend.h"
32#include <cassert>
33#include <cstdint>
34#include <deque>
35#include <map>
36#include <set>
37#include <string>
38#include <variant>
39#include <vector>
40
41#define DEBUG_TYPE "dfa-emitter"
42
43using namespace llvm;
44
45//===----------------------------------------------------------------------===//
46// DfaEmitter implementation. This is independent of the GenAutomaton backend.
47//===----------------------------------------------------------------------===//
48
49void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
50 Actions.insert(x: A);
51 NfaStates.insert(x: From);
52 NfaStates.insert(x: To);
53 NfaTransitions[{From, A}].push_back(x: To);
54 ++NumNfaTransitions;
55}
56
57void DfaEmitter::visitDfaState(const DfaState &DS) {
58 // For every possible action...
59 auto FromId = DfaStates.idFor(Entry: DS);
60 for (action_type A : Actions) {
61 DfaState NewStates;
62 DfaTransitionInfo TI;
63 // For every represented state, word pair in the original NFA...
64 for (state_type FromState : DS) {
65 // If this action is possible from this state add the transitioned-to
66 // states to NewStates.
67 auto I = NfaTransitions.find(x: {FromState, A});
68 if (I == NfaTransitions.end())
69 continue;
70 for (state_type &ToState : I->second) {
71 NewStates.push_back(Elt: ToState);
72 TI.emplace_back(Args&: FromState, Args&: ToState);
73 }
74 }
75 if (NewStates.empty())
76 continue;
77 // Sort and unique.
78 sort(C&: NewStates);
79 NewStates.erase(CS: llvm::unique(R&: NewStates), CE: NewStates.end());
80 sort(C&: TI);
81 TI.erase(CS: llvm::unique(R&: TI), CE: TI.end());
82 unsigned ToId = DfaStates.insert(Entry: NewStates);
83 DfaTransitions.emplace(args: std::pair(FromId, A), args: std::pair(ToId, TI));
84 }
85}
86
87void DfaEmitter::constructDfa() {
88 DfaState Initial(1, /*NFA initial state=*/0);
89 DfaStates.insert(Entry: Initial);
90
91 // Note that UniqueVector starts indices at 1, not zero.
92 unsigned DfaStateId = 1;
93 while (DfaStateId <= DfaStates.size()) {
94 DfaState S = DfaStates[DfaStateId];
95 visitDfaState(DS: S);
96 DfaStateId++;
97 }
98}
99
100void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
101 constructDfa();
102
103 OS << "// Input NFA has " << NfaStates.size() << " states with "
104 << NumNfaTransitions << " transitions.\n";
105 OS << "// Generated DFA has " << DfaStates.size() << " states with "
106 << DfaTransitions.size() << " transitions.\n\n";
107
108 // Implementation note: We don't bake a simple std::pair<> here as it requires
109 // significantly more effort to parse. A simple test with a large array of
110 // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
111 // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
112 // define the pair type.
113 //
114 // FIXME: It may make sense to emit these as ULEB sequences instead of
115 // pairs of uint64_t.
116 OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
117 OS << "// transition implies a set of NFA transitions. These are referred\n";
118 OS << "// to by index in " << Name << "Transitions[].\n";
119
120 SequenceToOffsetTable<DfaTransitionInfo> Table;
121 for (auto &T : DfaTransitions)
122 Table.add(Seq: T.second.second);
123 Table.layout();
124 OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
125 << "TransitionInfo = {{\n";
126 Table.emit(OS, Print: [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
127 OS << "{" << P.first << ", " << P.second << "}";
128 });
129
130 OS << "}};\n\n";
131
132 OS << "// A transition in the generated " << Name << " DFA.\n";
133 OS << "struct " << Name << "Transition {\n";
134 OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n";
135 OS << " ";
136 printActionType(OS);
137 OS << " Action; // The input symbol that causes this transition.\n";
138 OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n";
139 OS << " unsigned InfoIdx; // Start index into " << Name
140 << "TransitionInfo.\n";
141 OS << "};\n\n";
142
143 OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
144 OS << "// The initial state is 1, not zero.\n";
145 OS << "const std::array<" << Name << "Transition, " << DfaTransitions.size()
146 << "> " << Name << "Transitions = {{\n";
147 for (auto &KV : DfaTransitions) {
148 dfa_state_type From = KV.first.first;
149 dfa_state_type To = KV.second.first;
150 action_type A = KV.first.second;
151 unsigned InfoIdx = Table.get(Seq: KV.second.second);
152 OS << " {" << From << ", ";
153 printActionValue(A, OS);
154 OS << ", " << To << ", " << InfoIdx << "},\n";
155 }
156 OS << "\n}};\n\n";
157}
158
159void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
160
161void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
162
163//===----------------------------------------------------------------------===//
164// AutomatonEmitter implementation
165//===----------------------------------------------------------------------===//
166
167namespace {
168
// One action symbol of a transition: a record reference, an integer, or a
// string, depending on the TableGen type of the symbol field it was read from.
using Action = std::variant<const Record *, unsigned, std::string>;
// The symbols that together label one transition, in symbol-field order.
using ActionTuple = std::vector<Action>;
// Forward declaration; Transition's constructor takes an Automaton pointer.
class Automaton;
172
173class Transition {
174 uint64_t NewState;
175 // The tuple of actions that causes this transition.
176 ActionTuple Actions;
177 // The types of the actions; this is the same across all transitions.
178 SmallVector<std::string, 4> Types;
179
180public:
181 Transition(const Record *R, Automaton *Parent);
182 const ActionTuple &getActions() { return Actions; }
183 SmallVector<std::string, 4> getTypes() { return Types; }
184
185 bool canTransitionFrom(uint64_t State);
186 uint64_t transitionFrom(uint64_t State);
187};
188
189class Automaton {
190 const RecordKeeper &Records;
191 const Record *R;
192 std::vector<Transition> Transitions;
193 /// All possible action tuples, uniqued.
194 UniqueVector<ActionTuple> Actions;
195 /// The fields within each Transition object to find the action symbols.
196 std::vector<StringRef> ActionSymbolFields;
197
198public:
199 Automaton(const RecordKeeper &Records, const Record *R);
200 void emit(raw_ostream &OS);
201
202 ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
203 /// If the type of action A has been overridden (there exists a field
204 /// "TypeOf_A") return that, otherwise return the empty string.
205 StringRef getActionSymbolType(StringRef A);
206};
207
208class AutomatonEmitter {
209 const RecordKeeper &Records;
210
211public:
212 AutomatonEmitter(const RecordKeeper &R) : Records(R) {}
213 void run(raw_ostream &OS);
214};
215
/// A DfaEmitter implementation that can print our variant action type.
///
/// Overrides both printing hooks: the action type is written as the stored
/// type name, and an action value is written by looking up its tuple in the
/// action table supplied at construction.
class CustomDfaEmitter : public DfaEmitter {
  // Uniqued action tuples, indexed by action id in printActionValue(). Held
  // by reference, so the referenced container must outlive this emitter.
  const UniqueVector<ActionTuple> &Actions;
  // Name written by printActionType().
  std::string TypeName;

public:
  /// \param Actions  Table mapping action ids to action tuples.
  /// \param TypeName C++ type name to emit for an action.
  CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
      : Actions(Actions), TypeName(TypeName) {}

  void printActionType(raw_ostream &OS) override;
  void printActionValue(action_type A, raw_ostream &OS) override;
};
228} // namespace
229
230void AutomatonEmitter::run(raw_ostream &OS) {
231 for (const Record *R : Records.getAllDerivedDefinitions(ClassName: "GenericAutomaton")) {
232 Automaton A(Records, R);
233 OS << "#ifdef GET_" << R->getName() << "_DECL\n";
234 A.emit(OS);
235 OS << "#endif // GET_" << R->getName() << "_DECL\n";
236 }
237}
238
239Automaton::Automaton(const RecordKeeper &Records, const Record *R)
240 : Records(Records), R(R) {
241 LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
242 ActionSymbolFields = R->getValueAsListOfStrings(FieldName: "SymbolFields");
243}
244
245void Automaton::emit(raw_ostream &OS) {
246 StringRef TransitionClass = R->getValueAsString(FieldName: "TransitionClass");
247 for (const Record *T : Records.getAllDerivedDefinitions(ClassName: TransitionClass)) {
248 assert(T->isSubClassOf("Transition"));
249 Transitions.emplace_back(args&: T, args: this);
250 Actions.insert(Entry: Transitions.back().getActions());
251 }
252
253 LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size()
254 << "\n");
255 LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size()
256 << " potential transitions.\n");
257
258 StringRef Name = R->getName();
259
260 CustomDfaEmitter Emitter(Actions, Name.str() + "Action");
261 // Starting from the initial state, build up a list of possible states and
262 // transitions.
263 std::deque<uint64_t> Worklist(1, 0);
264 std::set<uint64_t> SeenStates;
265 unsigned NumTransitions = 0;
266 SeenStates.insert(x: Worklist.front());
267 while (!Worklist.empty()) {
268 uint64_t State = Worklist.front();
269 Worklist.pop_front();
270 for (Transition &T : Transitions) {
271 if (!T.canTransitionFrom(State))
272 continue;
273 uint64_t NewState = T.transitionFrom(State);
274 if (SeenStates.emplace(args&: NewState).second)
275 Worklist.emplace_back(args&: NewState);
276 ++NumTransitions;
277 Emitter.addTransition(From: State, To: NewState, A: Actions.idFor(Entry: T.getActions()));
278 }
279 }
280 LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size()
281 << " states with " << NumTransitions << " transitions.\n");
282 (void)NumTransitions;
283
284 const auto &ActionTypes = Transitions.back().getTypes();
285 OS << "// The type of an action in the " << Name << " automaton.\n";
286 if (ActionTypes.size() == 1) {
287 OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
288 } else {
289 OS << "using " << Name << "Action = std::tuple<" << join(R: ActionTypes, Separator: ", ")
290 << ">;\n";
291 }
292 OS << "\n";
293
294 Emitter.emit(Name, OS);
295}
296
297StringRef Automaton::getActionSymbolType(StringRef A) {
298 Twine Ty = "TypeOf_" + A;
299 if (!R->getValue(Name: Ty.str()))
300 return "";
301 return R->getValueAsString(FieldName: Ty.str());
302}
303
304Transition::Transition(const Record *R, Automaton *Parent) {
305 const BitsInit *NewStateInit = R->getValueAsBitsInit(FieldName: "NewState");
306 NewState = 0;
307 assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
308 "State cannot be represented in 64 bits!");
309 for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
310 if (auto *Bit = dyn_cast<BitInit>(Val: NewStateInit->getBit(Bit: I))) {
311 if (Bit->getValue())
312 NewState |= 1ULL << I;
313 }
314 }
315
316 for (StringRef A : Parent->getActionSymbolFields()) {
317 const RecordVal *SymbolV = R->getValue(Name: A);
318 if (const auto *Ty = dyn_cast<RecordRecTy>(Val: SymbolV->getType())) {
319 Actions.emplace_back(args: R->getValueAsDef(FieldName: A));
320 Types.emplace_back(Args: Ty->getAsString());
321 } else if (isa<IntRecTy>(Val: SymbolV->getType())) {
322 Actions.emplace_back(args: static_cast<unsigned>(R->getValueAsInt(FieldName: A)));
323 Types.emplace_back(Args: "unsigned");
324 } else if (isa<StringRecTy>(Val: SymbolV->getType())) {
325 Actions.emplace_back(args: R->getValueAsString(FieldName: A).str());
326 Types.emplace_back(Args: "std::string");
327 } else {
328 report_fatal_error(reason: "Unhandled symbol type!");
329 }
330
331 StringRef TypeOverride = Parent->getActionSymbolType(A);
332 if (!TypeOverride.empty())
333 Types.back() = TypeOverride.str();
334 }
335}
336
337bool Transition::canTransitionFrom(uint64_t State) {
338 if ((State & NewState) == 0)
339 // The bits we want to set are not set;
340 return true;
341 return false;
342}
343
344uint64_t Transition::transitionFrom(uint64_t State) { return State | NewState; }
345
346void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
347
348void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
349 const ActionTuple &AT = Actions[A];
350 if (AT.size() > 1)
351 OS << "{";
352 ListSeparator LS;
353 for (const auto &SingleAction : AT) {
354 OS << LS;
355 if (const auto *R = std::get_if<const Record *>(ptr: &SingleAction))
356 OS << (*R)->getName();
357 else if (const auto *S = std::get_if<std::string>(ptr: &SingleAction))
358 OS << '"' << *S << '"';
359 else
360 OS << std::get<unsigned>(v: SingleAction);
361 }
362 if (AT.size() > 1)
363 OS << "}";
364}
365
// Registers AutomatonEmitter under the option name "gen-automata".
// NOTE(review): presumably this is what selects the backend from the
// TableGen command line — confirm against TableGenBackend.h.
static TableGen::Emitter::OptClass<AutomatonEmitter>
    X("gen-automata", "Generate generic automata");
368