1//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a glob pattern matcher.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Support/GlobPattern.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/Support/Errc.h"
16
17using namespace llvm;
18
19static constexpr char PrefixMetacharacters[] = "?*[{\\";
20static constexpr char SuffixMetacharacters[] = "?*[]{}\\";
21static constexpr char PrefixMetacharactersWithSlash[] = "?*[{\\/";
22static constexpr char SuffixMetacharactersWithSlash[] = "?*[]{}\\/";
23
24// Expands character ranges and returns a bitmap.
25// For example, "a-cf-hz" is expanded to "abcfghz".
26static Expected<BitVector> expand(StringRef S, StringRef Original) {
27 BitVector BV(256, false);
28
29 // Expand X-Y.
30 for (;;) {
31 if (S.size() < 3)
32 break;
33
34 uint8_t Start = S[0];
35 uint8_t End = S[2];
36
37 // If it doesn't start with something like X-Y,
38 // consume the first character and proceed.
39 if (S[1] != '-') {
40 BV[Start] = true;
41 S = S.substr(Start: 1);
42 continue;
43 }
44
45 // It must be in the form of X-Y.
46 // Validate it and then interpret the range.
47 if (Start > End)
48 return make_error<StringError>(Args: "invalid glob pattern: " + Original,
49 Args: errc::invalid_argument);
50
51 for (int C = Start; C <= End; ++C)
52 BV[(uint8_t)C] = true;
53 S = S.substr(Start: 3);
54 }
55
56 for (char C : S)
57 BV[(uint8_t)C] = true;
58 return BV;
59}
60
61// Identify brace expansions in S and return the list of patterns they expand
62// into.
63static Expected<SmallVector<std::string, 1>>
64parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
65 SmallVector<std::string> SubPatterns = {S.str()};
66 if (!MaxSubPatterns || !S.contains(C: '{'))
67 return std::move(SubPatterns);
68
69 struct BraceExpansion {
70 size_t Start;
71 size_t Length;
72 SmallVector<StringRef, 2> Terms;
73 };
74 SmallVector<BraceExpansion, 0> BraceExpansions;
75
76 BraceExpansion *CurrentBE = nullptr;
77 size_t TermBegin;
78 for (size_t I = 0, E = S.size(); I != E; ++I) {
79 if (S[I] == '[') {
80 I = S.find(C: ']', From: I + 2);
81 if (I == std::string::npos)
82 return make_error<StringError>(Args: "invalid glob pattern, unmatched '['",
83 Args: errc::invalid_argument);
84 } else if (S[I] == '{') {
85 if (CurrentBE)
86 return make_error<StringError>(
87 Args: "nested brace expansions are not supported",
88 Args: errc::invalid_argument);
89 CurrentBE = &BraceExpansions.emplace_back();
90 CurrentBE->Start = I;
91 TermBegin = I + 1;
92 } else if (S[I] == ',') {
93 if (!CurrentBE)
94 continue;
95 CurrentBE->Terms.push_back(Elt: S.substr(Start: TermBegin, N: I - TermBegin));
96 TermBegin = I + 1;
97 } else if (S[I] == '}') {
98 if (!CurrentBE)
99 continue;
100 if (CurrentBE->Terms.empty())
101 return make_error<StringError>(
102 Args: "empty or singleton brace expansions are not supported",
103 Args: errc::invalid_argument);
104 CurrentBE->Terms.push_back(Elt: S.substr(Start: TermBegin, N: I - TermBegin));
105 CurrentBE->Length = I - CurrentBE->Start + 1;
106 CurrentBE = nullptr;
107 } else if (S[I] == '\\') {
108 if (++I == E)
109 return make_error<StringError>(Args: "invalid glob pattern, stray '\\'",
110 Args: errc::invalid_argument);
111 }
112 }
113 if (CurrentBE)
114 return make_error<StringError>(Args: "incomplete brace expansion",
115 Args: errc::invalid_argument);
116
117 size_t NumSubPatterns = 1;
118 for (auto &BE : BraceExpansions) {
119 if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
120 NumSubPatterns = std::numeric_limits<size_t>::max();
121 break;
122 }
123 NumSubPatterns *= BE.Terms.size();
124 }
125 if (NumSubPatterns > *MaxSubPatterns)
126 return make_error<StringError>(Args: "too many brace expansions",
127 Args: errc::invalid_argument);
128 // Replace brace expansions in reverse order so that we don't invalidate
129 // earlier start indices
130 for (auto &BE : reverse(C&: BraceExpansions)) {
131 SmallVector<std::string> OrigSubPatterns;
132 std::swap(LHS&: SubPatterns, RHS&: OrigSubPatterns);
133 for (StringRef Term : BE.Terms)
134 for (StringRef Orig : OrigSubPatterns)
135 SubPatterns.emplace_back(Args&: Orig).replace(pos: BE.Start, n: BE.Length, svt: Term);
136 }
137 return std::move(SubPatterns);
138}
139
140static StringRef maxPlainSubstring(StringRef S, bool SlashAgnostic) {
141 const char *Metas =
142 SlashAgnostic ? PrefixMetacharactersWithSlash : PrefixMetacharacters;
143 StringRef Best;
144 while (!S.empty()) {
145 size_t PrefixSize = S.find_first_of(Chars: Metas);
146 if (PrefixSize == std::string::npos)
147 PrefixSize = S.size();
148
149 if (Best.size() < PrefixSize)
150 Best = S.take_front(N: PrefixSize);
151
152 S = S.drop_front(N: PrefixSize);
153
154 // It's impossible, as the first and last characters of the input string
155 // must be Glob special characters, otherwise they would be parts of
156 // the prefix or the suffix.
157 assert(!S.empty());
158
159 switch (S.front()) {
160 case '\\':
161 S = S.drop_front(N: 2);
162 break;
163 case '[': {
164 // Drop '[' and the first character which can be ']'.
165 S = S.drop_front(N: 2);
166 size_t EndBracket = S.find_first_of(Chars: "]");
167 // Should not be possible, SubGlobPattern::create should fail on invalid
168 // pattern before we get here.
169 assert(EndBracket != std::string::npos);
170 S = S.drop_front(N: EndBracket + 1);
171 break;
172 }
173 case '{':
174 // TODO: implement.
175 // Fallback to whatever is best for now.
176 return Best;
177 default:
178 S = S.drop_front(N: 1);
179 }
180 }
181
182 return Best;
183}
184
185Expected<GlobPattern> GlobPattern::create(StringRef S,
186 std::optional<size_t> MaxSubPatterns,
187 bool SlashAgnostic) {
188 GlobPattern Pat;
189 Pat.SlashAgnostic = SlashAgnostic;
190 Pat.Pattern = S;
191
192 const char *PrefixMetas =
193 SlashAgnostic ? PrefixMetacharactersWithSlash : PrefixMetacharacters;
194 const char *SuffixMetas =
195 SlashAgnostic ? SuffixMetacharactersWithSlash : SuffixMetacharacters;
196
197 // Store the prefix that does not contain any metacharacter.
198 Pat.PrefixSize = S.find_first_of(Chars: PrefixMetas);
199 if (Pat.PrefixSize == std::string::npos) {
200 Pat.PrefixSize = S.size();
201 return Pat;
202 }
203 S = S.substr(Start: Pat.PrefixSize);
204
205 // Just in case we stop on unmatched opening brackets.
206 size_t SuffixStart = S.find_last_of(Chars: SuffixMetas);
207 assert(SuffixStart != std::string::npos);
208 if (S[SuffixStart] == '\\')
209 ++SuffixStart;
210 if (SuffixStart < S.size())
211 ++SuffixStart;
212 Pat.SuffixSize = S.size() - SuffixStart;
213 S = S.substr(Start: 0, N: SuffixStart);
214
215 SmallVector<std::string, 1> SubPats;
216 if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(Value&: SubPats))
217 return std::move(Err);
218 for (StringRef SubPat : SubPats) {
219 auto SubGlobOrErr = SubGlobPattern::create(Pat: SubPat, SlashAgnostic);
220 if (!SubGlobOrErr)
221 return SubGlobOrErr.takeError();
222 Pat.SubGlobs.push_back(Elt: *SubGlobOrErr);
223 }
224
225 return Pat;
226}
227
228Expected<GlobPattern::SubGlobPattern>
229GlobPattern::SubGlobPattern::create(StringRef S, bool SlashAgnostic) {
230 SubGlobPattern Pat;
231
232 // Parse brackets.
233 Pat.Pat.assign(in_start: S.begin(), in_end: S.end());
234 for (size_t I = 0, E = S.size(); I != E; ++I) {
235 if (S[I] == '[') {
236 // ']' is allowed as the first character of a character class. '[]' is
237 // invalid. So, just skip the first character.
238 ++I;
239 size_t J = S.find(C: ']', From: I + 1);
240 if (J == StringRef::npos)
241 return make_error<StringError>(Args: "invalid glob pattern, unmatched '['",
242 Args: errc::invalid_argument);
243 StringRef Chars = S.substr(Start: I, N: J - I);
244 bool Invert = S[I] == '^' || S[I] == '!';
245 if (Invert)
246 Chars = Chars.drop_front();
247 Expected<BitVector> BVOrErr = expand(S: Chars, Original: S);
248 if (!BVOrErr)
249 return BVOrErr.takeError();
250 BitVector &BV = *BVOrErr;
251 if (SlashAgnostic && (BV['\\'] || BV['/'])) {
252 BV.set('\\');
253 BV.set('/');
254 }
255 if (Invert)
256 BV.flip();
257 Pat.Brackets.push_back(Elt: Bracket{.NextOffset: J + 1, .Bytes: std::move(BV)});
258 I = J;
259 } else if (S[I] == '\\') {
260 if (++I == E)
261 return make_error<StringError>(Args: "invalid glob pattern, stray '\\'",
262 Args: errc::invalid_argument);
263 }
264 }
265 return Pat;
266}
267
268StringRef GlobPattern::longest_substr() const {
269 return maxPlainSubstring(S: Pattern.drop_front(N: PrefixSize).drop_back(N: SuffixSize),
270 SlashAgnostic);
271}
272
273bool GlobPattern::match(StringRef S) const {
274 if (!S.consume_front(Prefix: prefix()))
275 return false;
276 if (!S.consume_back(Suffix: suffix()))
277 return false;
278 if (SubGlobs.empty() && S.empty())
279 return true;
280 for (auto &Glob : SubGlobs)
281 if (Glob.match(S, SlashAgnostic))
282 return true;
283 return false;
284}
285
286static bool matchChar(char PatC, char QueryC, bool SlashAgnostic) {
287 if (PatC == QueryC)
288 return true;
289 return SlashAgnostic && (PatC == '\\' || PatC == '/') &&
290 (QueryC == '\\' || QueryC == '/');
291}
292
293// Factor the pattern into segments split by '*'. The segment is matched
294// sequentianlly by finding the first occurrence past the end of the previous
295// match.
296bool GlobPattern::SubGlobPattern::match(StringRef Str,
297 bool SlashAgnostic) const {
298 const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
299 *SavedS = S;
300 const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
301 size_t B = 0, SavedB = 0;
302 while (S != End) {
303 if (P == PEnd)
304 ;
305 else if (*P == '*') {
306 // The non-* substring on the left of '*' matches the tail of S. Save the
307 // positions to be used by backtracking if we see a mismatch later.
308 SegmentBegin = ++P;
309 SavedS = S;
310 SavedB = B;
311 continue;
312 } else if (*P == '[') {
313 if (Brackets[B].Bytes[uint8_t(*S)]) {
314 P = Pat.data() + Brackets[B++].NextOffset;
315 ++S;
316 continue;
317 }
318 } else if (*P == '\\') {
319 if (matchChar(PatC: *++P, QueryC: *S, SlashAgnostic)) {
320 ++P;
321 ++S;
322 continue;
323 }
324 } else if (matchChar(PatC: *P, QueryC: *S, SlashAgnostic) || *P == '?') {
325 ++P;
326 ++S;
327 continue;
328 }
329 if (!SegmentBegin)
330 return false;
331 // We have seen a '*'. Backtrack to the saved positions. Shift the S
332 // position to probe the next starting position in the segment.
333 P = SegmentBegin;
334 S = ++SavedS;
335 B = SavedB;
336 }
337 // All bytes in Str have been matched. Return true if the rest part of Pat is
338 // empty or contains only '*'.
339 return getPat().find_first_not_of(C: '*', From: P - Pat.data()) == std::string::npos;
340}
341