1 | //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the Suffix Tree class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Support/SuffixTree.h" |
14 | #include "llvm/Support/Allocator.h" |
15 | #include "llvm/Support/Casting.h" |
16 | #include "llvm/Support/SuffixTreeNode.h" |
17 | |
18 | using namespace llvm; |
19 | |
20 | /// \returns the number of elements in the substring associated with \p N. |
21 | static size_t numElementsInSubstring(const SuffixTreeNode *N) { |
22 | assert(N && "Got a null node?" ); |
23 | if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(Val: N)) |
24 | if (Internal->isRoot()) |
25 | return 0; |
26 | return N->getEndIdx() - N->getStartIdx() + 1; |
27 | } |
28 | |
29 | SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str, |
30 | bool OutlinerLeafDescendants) |
31 | : Str(Str), OutlinerLeafDescendants(OutlinerLeafDescendants) { |
32 | Root = insertRoot(); |
33 | Active.Node = Root; |
34 | |
35 | // Keep track of the number of suffixes we have to add of the current |
36 | // prefix. |
37 | unsigned SuffixesToAdd = 0; |
38 | |
39 | // Construct the suffix tree iteratively on each prefix of the string. |
40 | // PfxEndIdx is the end index of the current prefix. |
41 | // End is one past the last element in the string. |
42 | for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { |
43 | SuffixesToAdd++; |
44 | LeafEndIdx = PfxEndIdx; // Extend each of the leaves. |
45 | SuffixesToAdd = extend(EndIdx: PfxEndIdx, SuffixesToAdd); |
46 | } |
47 | |
48 | // Set the suffix indices of each leaf. |
49 | assert(Root && "Root node can't be nullptr!" ); |
50 | setSuffixIndices(); |
51 | |
52 | // Collect all leaf nodes of the suffix tree. And for each internal node, |
53 | // record the range of leaf nodes that are descendants of it. |
54 | if (OutlinerLeafDescendants) |
55 | setLeafNodes(); |
56 | } |
57 | |
58 | SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent, |
59 | unsigned StartIdx, unsigned Edge) { |
60 | assert(StartIdx <= LeafEndIdx && "String can't start after it ends!" ); |
61 | auto *N = new (LeafNodeAllocator.Allocate()) |
62 | SuffixTreeLeafNode(StartIdx, &LeafEndIdx); |
63 | Parent.Children[Edge] = N; |
64 | return N; |
65 | } |
66 | |
67 | SuffixTreeInternalNode * |
68 | SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent, |
69 | unsigned StartIdx, unsigned EndIdx, |
70 | unsigned Edge) { |
71 | assert(StartIdx <= EndIdx && "String can't start after it ends!" ); |
72 | assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) && |
73 | "Non-root internal nodes must have parents!" ); |
74 | auto *N = new (InternalNodeAllocator.Allocate()) |
75 | SuffixTreeInternalNode(StartIdx, EndIdx, Root); |
76 | if (Parent) |
77 | Parent->Children[Edge] = N; |
78 | return N; |
79 | } |
80 | |
81 | SuffixTreeInternalNode *SuffixTree::insertRoot() { |
82 | return insertInternalNode(/*Parent = */ nullptr, StartIdx: SuffixTreeNode::EmptyIdx, |
83 | EndIdx: SuffixTreeNode::EmptyIdx, /*Edge = */ 0); |
84 | } |
85 | |
86 | void SuffixTree::setSuffixIndices() { |
87 | // List of nodes we need to visit along with the current length of the |
88 | // string. |
89 | SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit; |
90 | |
91 | // Current node being visited. |
92 | SuffixTreeNode *CurrNode = Root; |
93 | |
94 | // Sum of the lengths of the nodes down the path to the current one. |
95 | unsigned CurrNodeLen = 0; |
96 | ToVisit.push_back(Elt: {CurrNode, CurrNodeLen}); |
97 | while (!ToVisit.empty()) { |
98 | std::tie(args&: CurrNode, args&: CurrNodeLen) = ToVisit.back(); |
99 | ToVisit.pop_back(); |
100 | // Length of the current node from the root down to here. |
101 | CurrNode->setConcatLen(CurrNodeLen); |
102 | if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(Val: CurrNode)) |
103 | for (auto &ChildPair : InternalNode->Children) { |
104 | assert(ChildPair.second && "Node had a null child!" ); |
105 | ToVisit.push_back( |
106 | Elt: {ChildPair.second, |
107 | CurrNodeLen + numElementsInSubstring(N: ChildPair.second)}); |
108 | } |
109 | // No children, so we are at the end of the string. |
110 | if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(Val: CurrNode)) |
111 | LeafNode->setSuffixIdx(Str.size() - CurrNodeLen); |
112 | } |
113 | } |
114 | |
115 | void SuffixTree::setLeafNodes() { |
116 | // A stack that keeps track of nodes to visit for post-order DFS traversal. |
117 | SmallVector<SuffixTreeNode *> ToVisit; |
118 | ToVisit.push_back(Elt: Root); |
119 | |
120 | // This keeps track of the index of the next leaf node to be added to |
121 | // the LeafNodes vector of the suffix tree. |
122 | unsigned LeafCounter = 0; |
123 | |
124 | // This keeps track of nodes whose children have been added to the stack. |
125 | // The value is a pair, representing a node's first and last children. |
126 | DenseMap<SuffixTreeInternalNode *, |
127 | std::pair<SuffixTreeNode *, SuffixTreeNode *>> |
128 | ChildrenMap; |
129 | |
130 | // Traverse the tree in post-order. |
131 | while (!ToVisit.empty()) { |
132 | SuffixTreeNode *CurrNode = ToVisit.pop_back_val(); |
133 | if (auto *CurrInternalNode = dyn_cast<SuffixTreeInternalNode>(Val: CurrNode)) { |
134 | // The current node is an internal node. |
135 | auto I = ChildrenMap.find(Val: CurrInternalNode); |
136 | if (I == ChildrenMap.end()) { |
137 | // This is the first time we visit this node. |
138 | // Its children have not been added to the stack yet. |
139 | // We add current node back, and add its children to the stack. |
140 | // We keep track of the first and last children of the current node. |
141 | auto J = CurrInternalNode->Children.begin(); |
142 | if (J != CurrInternalNode->Children.end()) { |
143 | ToVisit.push_back(Elt: CurrNode); |
144 | SuffixTreeNode *FirstChild = J->second; |
145 | SuffixTreeNode *LastChild = nullptr; |
146 | for (; J != CurrInternalNode->Children.end(); ++J) { |
147 | LastChild = J->second; |
148 | ToVisit.push_back(Elt: LastChild); |
149 | } |
150 | ChildrenMap[CurrInternalNode] = {FirstChild, LastChild}; |
151 | } |
152 | } else { |
153 | // This is the second time we visit this node. |
154 | // All of its children have already been processed. |
155 | // Now, we can set its LeftLeafIdx and RightLeafIdx; |
156 | auto [FirstChild, LastChild] = I->second; |
157 | // Get the first child to use its RightLeafIdx. |
158 | // The first child is the first one added to the stack, so it is |
159 | // the last one to be processed. Hence, the leaf descendants |
160 | // of the first child are assigned the largest index numbers. |
161 | CurrNode->setRightLeafIdx(FirstChild->getRightLeafIdx()); |
162 | // Get the last child to use its LeftLeafIdx. |
163 | CurrNode->setLeftLeafIdx(LastChild->getLeftLeafIdx()); |
164 | assert(CurrNode->getLeftLeafIdx() <= CurrNode->getRightLeafIdx() && |
165 | "LeftLeafIdx should not be larger than RightLeafIdx" ); |
166 | } |
167 | } else { |
168 | // The current node is a leaf node. |
169 | // We can simply set its LeftLeafIdx and RightLeafIdx. |
170 | CurrNode->setLeftLeafIdx(LeafCounter); |
171 | CurrNode->setRightLeafIdx(LeafCounter); |
172 | ++LeafCounter; |
173 | auto *CurrLeafNode = cast<SuffixTreeLeafNode>(Val: CurrNode); |
174 | LeafNodes.push_back(x: CurrLeafNode); |
175 | } |
176 | } |
177 | } |
178 | |
179 | unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) { |
180 | SuffixTreeInternalNode *NeedsLink = nullptr; |
181 | |
182 | while (SuffixesToAdd > 0) { |
183 | |
184 | // Are we waiting to add anything other than just the last character? |
185 | if (Active.Len == 0) { |
186 | // If not, then say the active index is the end index. |
187 | Active.Idx = EndIdx; |
188 | } |
189 | |
190 | assert(Active.Idx <= EndIdx && "Start index can't be after end index!" ); |
191 | |
192 | // The first character in the current substring we're looking at. |
193 | unsigned FirstChar = Str[Active.Idx]; |
194 | |
195 | // Have we inserted anything starting with FirstChar at the current node? |
196 | if (Active.Node->Children.count(Val: FirstChar) == 0) { |
197 | // If not, then we can just insert a leaf and move to the next step. |
198 | insertLeaf(Parent&: *Active.Node, StartIdx: EndIdx, Edge: FirstChar); |
199 | |
200 | // The active node is an internal node, and we visited it, so it must |
201 | // need a link if it doesn't have one. |
202 | if (NeedsLink) { |
203 | NeedsLink->setLink(Active.Node); |
204 | NeedsLink = nullptr; |
205 | } |
206 | } else { |
207 | // There's a match with FirstChar, so look for the point in the tree to |
208 | // insert a new node. |
209 | SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; |
210 | |
211 | unsigned SubstringLen = numElementsInSubstring(N: NextNode); |
212 | |
213 | // Is the current suffix we're trying to insert longer than the size of |
214 | // the child we want to move to? |
215 | if (Active.Len >= SubstringLen) { |
216 | // If yes, then consume the characters we've seen and move to the next |
217 | // node. |
218 | assert(isa<SuffixTreeInternalNode>(NextNode) && |
219 | "Expected an internal node?" ); |
220 | Active.Idx += SubstringLen; |
221 | Active.Len -= SubstringLen; |
222 | Active.Node = cast<SuffixTreeInternalNode>(Val: NextNode); |
223 | continue; |
224 | } |
225 | |
226 | // Otherwise, the suffix we're trying to insert must be contained in the |
227 | // next node we want to move to. |
228 | unsigned LastChar = Str[EndIdx]; |
229 | |
230 | // Is the string we're trying to insert a substring of the next node? |
231 | if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) { |
232 | // If yes, then we're done for this step. Remember our insertion point |
233 | // and move to the next end index. At this point, we have an implicit |
234 | // suffix tree. |
235 | if (NeedsLink && !Active.Node->isRoot()) { |
236 | NeedsLink->setLink(Active.Node); |
237 | NeedsLink = nullptr; |
238 | } |
239 | |
240 | Active.Len++; |
241 | break; |
242 | } |
243 | |
244 | // The string we're trying to insert isn't a substring of the next node, |
245 | // but matches up to a point. Split the node. |
246 | // |
247 | // For example, say we ended our search at a node n and we're trying to |
248 | // insert ABD. Then we'll create a new node s for AB, reduce n to just |
249 | // representing C, and insert a new leaf node l to represent d. This |
250 | // allows us to ensure that if n was a leaf, it remains a leaf. |
251 | // |
252 | // | ABC ---split---> | AB |
253 | // n s |
254 | // C / \ D |
255 | // n l |
256 | |
257 | // The node s from the diagram |
258 | SuffixTreeInternalNode *SplitNode = insertInternalNode( |
259 | Parent: Active.Node, StartIdx: NextNode->getStartIdx(), |
260 | EndIdx: NextNode->getStartIdx() + Active.Len - 1, Edge: FirstChar); |
261 | |
262 | // Insert the new node representing the new substring into the tree as |
263 | // a child of the split node. This is the node l from the diagram. |
264 | insertLeaf(Parent&: *SplitNode, StartIdx: EndIdx, Edge: LastChar); |
265 | |
266 | // Make the old node a child of the split node and update its start |
267 | // index. This is the node n from the diagram. |
268 | NextNode->incrementStartIdx(Inc: Active.Len); |
269 | SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode; |
270 | |
271 | // SplitNode is an internal node, update the suffix link. |
272 | if (NeedsLink) |
273 | NeedsLink->setLink(SplitNode); |
274 | |
275 | NeedsLink = SplitNode; |
276 | } |
277 | |
278 | // We've added something new to the tree, so there's one less suffix to |
279 | // add. |
280 | SuffixesToAdd--; |
281 | |
282 | if (Active.Node->isRoot()) { |
283 | if (Active.Len > 0) { |
284 | Active.Len--; |
285 | Active.Idx = EndIdx - SuffixesToAdd + 1; |
286 | } |
287 | } else { |
288 | // Start the next phase at the next smallest suffix. |
289 | Active.Node = Active.Node->getLink(); |
290 | } |
291 | } |
292 | |
293 | return SuffixesToAdd; |
294 | } |
295 | |
296 | void SuffixTree::RepeatedSubstringIterator::advance() { |
297 | // Clear the current state. If we're at the end of the range, then this |
298 | // is the state we want to be in. |
299 | RS = RepeatedSubstring(); |
300 | N = nullptr; |
301 | |
302 | // Each leaf node represents a repeat of a string. |
303 | SmallVector<unsigned> RepeatedSubstringStarts; |
304 | |
305 | // Continue visiting nodes until we find one which repeats more than once. |
306 | while (!InternalNodesToVisit.empty()) { |
307 | RepeatedSubstringStarts.clear(); |
308 | auto *Curr = InternalNodesToVisit.back(); |
309 | InternalNodesToVisit.pop_back(); |
310 | |
311 | // Keep track of the length of the string associated with the node. If |
312 | // it's too short, we'll quit. |
313 | unsigned Length = Curr->getConcatLen(); |
314 | |
315 | // Iterate over each child, saving internal nodes for visiting. |
316 | // Internal nodes represent individual strings, which may repeat. |
317 | for (auto &ChildPair : Curr->Children) |
318 | // Save all of this node's children for processing. |
319 | if (auto *InternalChild = |
320 | dyn_cast<SuffixTreeInternalNode>(Val: ChildPair.second)) |
321 | InternalNodesToVisit.push_back(Elt: InternalChild); |
322 | |
323 | // If length of repeated substring is below threshold, then skip it. |
324 | if (Length < MinLength) |
325 | continue; |
326 | |
327 | // The root never represents a repeated substring. If we're looking at |
328 | // that, then skip it. |
329 | if (Curr->isRoot()) |
330 | continue; |
331 | |
332 | // Collect leaf children or leaf descendants by OutlinerLeafDescendants. |
333 | if (OutlinerLeafDescendants) { |
334 | for (unsigned I = Curr->getLeftLeafIdx(); I <= Curr->getRightLeafIdx(); |
335 | ++I) |
336 | RepeatedSubstringStarts.push_back(Elt: LeafNodes[I]->getSuffixIdx()); |
337 | } else { |
338 | for (auto &ChildPair : Curr->Children) |
339 | if (auto *Leaf = dyn_cast<SuffixTreeLeafNode>(Val: ChildPair.second)) |
340 | RepeatedSubstringStarts.push_back(Elt: Leaf->getSuffixIdx()); |
341 | } |
342 | |
343 | // Do we have any repeated substrings? |
344 | if (RepeatedSubstringStarts.size() < 2) |
345 | continue; |
346 | |
347 | // Yes. Update the state to reflect this, and then bail out. |
348 | N = Curr; |
349 | RS.Length = Length; |
350 | for (unsigned StartIdx : RepeatedSubstringStarts) |
351 | RS.StartIndices.push_back(Elt: StartIdx); |
352 | break; |
353 | } |
354 | // At this point, either NewRS is an empty RepeatedSubstring, or it was |
355 | // set in the above loop. Similarly, N is either nullptr, or the node |
356 | // associated with NewRS. |
357 | } |
358 | |