1//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a glob pattern matcher.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Support/GlobPattern.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/Support/Errc.h"
16
17using namespace llvm;
18
19// Expands character ranges and returns a bitmap.
20// For example, "a-cf-hz" is expanded to "abcfghz".
21static Expected<BitVector> expand(StringRef S, StringRef Original) {
22 BitVector BV(256, false);
23
24 // Expand X-Y.
25 for (;;) {
26 if (S.size() < 3)
27 break;
28
29 uint8_t Start = S[0];
30 uint8_t End = S[2];
31
32 // If it doesn't start with something like X-Y,
33 // consume the first character and proceed.
34 if (S[1] != '-') {
35 BV[Start] = true;
36 S = S.substr(Start: 1);
37 continue;
38 }
39
40 // It must be in the form of X-Y.
41 // Validate it and then interpret the range.
42 if (Start > End)
43 return make_error<StringError>(Args: "invalid glob pattern: " + Original,
44 Args: errc::invalid_argument);
45
46 for (int C = Start; C <= End; ++C)
47 BV[(uint8_t)C] = true;
48 S = S.substr(Start: 3);
49 }
50
51 for (char C : S)
52 BV[(uint8_t)C] = true;
53 return BV;
54}
55
56// Identify brace expansions in S and return the list of patterns they expand
57// into.
58static Expected<SmallVector<std::string, 1>>
59parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
60 SmallVector<std::string> SubPatterns = {S.str()};
61 if (!MaxSubPatterns || !S.contains(C: '{'))
62 return std::move(SubPatterns);
63
64 struct BraceExpansion {
65 size_t Start;
66 size_t Length;
67 SmallVector<StringRef, 2> Terms;
68 };
69 SmallVector<BraceExpansion, 0> BraceExpansions;
70
71 BraceExpansion *CurrentBE = nullptr;
72 size_t TermBegin;
73 for (size_t I = 0, E = S.size(); I != E; ++I) {
74 if (S[I] == '[') {
75 I = S.find(C: ']', From: I + 2);
76 if (I == std::string::npos)
77 return make_error<StringError>(Args: "invalid glob pattern, unmatched '['",
78 Args: errc::invalid_argument);
79 } else if (S[I] == '{') {
80 if (CurrentBE)
81 return make_error<StringError>(
82 Args: "nested brace expansions are not supported",
83 Args: errc::invalid_argument);
84 CurrentBE = &BraceExpansions.emplace_back();
85 CurrentBE->Start = I;
86 TermBegin = I + 1;
87 } else if (S[I] == ',') {
88 if (!CurrentBE)
89 continue;
90 CurrentBE->Terms.push_back(Elt: S.substr(Start: TermBegin, N: I - TermBegin));
91 TermBegin = I + 1;
92 } else if (S[I] == '}') {
93 if (!CurrentBE)
94 continue;
95 if (CurrentBE->Terms.empty())
96 return make_error<StringError>(
97 Args: "empty or singleton brace expansions are not supported",
98 Args: errc::invalid_argument);
99 CurrentBE->Terms.push_back(Elt: S.substr(Start: TermBegin, N: I - TermBegin));
100 CurrentBE->Length = I - CurrentBE->Start + 1;
101 CurrentBE = nullptr;
102 } else if (S[I] == '\\') {
103 if (++I == E)
104 return make_error<StringError>(Args: "invalid glob pattern, stray '\\'",
105 Args: errc::invalid_argument);
106 }
107 }
108 if (CurrentBE)
109 return make_error<StringError>(Args: "incomplete brace expansion",
110 Args: errc::invalid_argument);
111
112 size_t NumSubPatterns = 1;
113 for (auto &BE : BraceExpansions) {
114 if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
115 NumSubPatterns = std::numeric_limits<size_t>::max();
116 break;
117 }
118 NumSubPatterns *= BE.Terms.size();
119 }
120 if (NumSubPatterns > *MaxSubPatterns)
121 return make_error<StringError>(Args: "too many brace expansions",
122 Args: errc::invalid_argument);
123 // Replace brace expansions in reverse order so that we don't invalidate
124 // earlier start indices
125 for (auto &BE : reverse(C&: BraceExpansions)) {
126 SmallVector<std::string> OrigSubPatterns;
127 std::swap(LHS&: SubPatterns, RHS&: OrigSubPatterns);
128 for (StringRef Term : BE.Terms)
129 for (StringRef Orig : OrigSubPatterns)
130 SubPatterns.emplace_back(Args&: Orig).replace(pos: BE.Start, n: BE.Length, svt: Term);
131 }
132 return std::move(SubPatterns);
133}
134
135Expected<GlobPattern>
136GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
137 GlobPattern Pat;
138
139 // Store the prefix that does not contain any metacharacter.
140 size_t PrefixSize = S.find_first_of(Chars: "?*[{\\");
141 Pat.Prefix = S.substr(Start: 0, N: PrefixSize);
142 if (PrefixSize == std::string::npos)
143 return Pat;
144 S = S.substr(Start: PrefixSize);
145
146 SmallVector<std::string, 1> SubPats;
147 if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(Value&: SubPats))
148 return std::move(Err);
149 for (StringRef SubPat : SubPats) {
150 auto SubGlobOrErr = SubGlobPattern::create(Pat: SubPat);
151 if (!SubGlobOrErr)
152 return SubGlobOrErr.takeError();
153 Pat.SubGlobs.push_back(Elt: *SubGlobOrErr);
154 }
155
156 return Pat;
157}
158
159Expected<GlobPattern::SubGlobPattern>
160GlobPattern::SubGlobPattern::create(StringRef S) {
161 SubGlobPattern Pat;
162
163 // Parse brackets.
164 Pat.Pat.assign(in_start: S.begin(), in_end: S.end());
165 for (size_t I = 0, E = S.size(); I != E; ++I) {
166 if (S[I] == '[') {
167 // ']' is allowed as the first character of a character class. '[]' is
168 // invalid. So, just skip the first character.
169 ++I;
170 size_t J = S.find(C: ']', From: I + 1);
171 if (J == StringRef::npos)
172 return make_error<StringError>(Args: "invalid glob pattern, unmatched '['",
173 Args: errc::invalid_argument);
174 StringRef Chars = S.substr(Start: I, N: J - I);
175 bool Invert = S[I] == '^' || S[I] == '!';
176 Expected<BitVector> BV =
177 Invert ? expand(S: Chars.substr(Start: 1), Original: S) : expand(S: Chars, Original: S);
178 if (!BV)
179 return BV.takeError();
180 if (Invert)
181 BV->flip();
182 Pat.Brackets.push_back(Elt: Bracket{.NextOffset: J + 1, .Bytes: std::move(*BV)});
183 I = J;
184 } else if (S[I] == '\\') {
185 if (++I == E)
186 return make_error<StringError>(Args: "invalid glob pattern, stray '\\'",
187 Args: errc::invalid_argument);
188 }
189 }
190 return Pat;
191}
192
193bool GlobPattern::match(StringRef S) const {
194 if (!S.consume_front(Prefix))
195 return false;
196 if (SubGlobs.empty() && S.empty())
197 return true;
198 for (auto &Glob : SubGlobs)
199 if (Glob.match(S))
200 return true;
201 return false;
202}
203
204// Factor the pattern into segments split by '*'. The segment is matched
205// sequentianlly by finding the first occurrence past the end of the previous
206// match.
207bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
208 const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
209 *SavedS = S;
210 const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
211 size_t B = 0, SavedB = 0;
212 while (S != End) {
213 if (P == PEnd)
214 ;
215 else if (*P == '*') {
216 // The non-* substring on the left of '*' matches the tail of S. Save the
217 // positions to be used by backtracking if we see a mismatch later.
218 SegmentBegin = ++P;
219 SavedS = S;
220 SavedB = B;
221 continue;
222 } else if (*P == '[') {
223 if (Brackets[B].Bytes[uint8_t(*S)]) {
224 P = Pat.data() + Brackets[B++].NextOffset;
225 ++S;
226 continue;
227 }
228 } else if (*P == '\\') {
229 if (*++P == *S) {
230 ++P;
231 ++S;
232 continue;
233 }
234 } else if (*P == *S || *P == '?') {
235 ++P;
236 ++S;
237 continue;
238 }
239 if (!SegmentBegin)
240 return false;
241 // We have seen a '*'. Backtrack to the saved positions. Shift the S
242 // position to probe the next starting position in the segment.
243 P = SegmentBegin;
244 S = ++SavedS;
245 B = SavedB;
246 }
247 // All bytes in Str have been matched. Return true if the rest part of Pat is
248 // empty or contains only '*'.
249 return getPat().find_first_not_of(C: '*', From: P - Pat.data()) == std::string::npos;
250}
251