GlobPattern.cpp source code [llvm_projects/llvm/lib/Support/GlobPattern.cpp]

1	//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements a glob pattern matcher.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/Support/GlobPattern.h"
14	#include "llvm/ADT/StringRef.h"
15	#include "llvm/Support/Errc.h"
16
17	using namespace llvm;
18
19	static constexpr char PrefixMetacharacters[] = "?*[{\\";
20	static constexpr char SuffixMetacharacters[] = "?*[]{}\\";
21	static constexpr char PrefixMetacharactersWithSlash[] = "?*[{\\/";
22	static constexpr char SuffixMetacharactersWithSlash[] = "?*[]{}\\/";
23
24	// Expands character ranges and returns a bitmap.
25	// For example, "a-cf-hz" is expanded to "abcfghz".
26	static Expected<BitVector> expand(StringRef S, StringRef Original) {
27	BitVector BV(`256`, false);
28
29	// Expand X-Y.
30	for (;;) {
31	if (S.size() < `3`)
32	break;
33
34	uint8_t Start = S [`0`];
35	uint8_t End = S [`2`];
36
37	// If it doesn't start with something like X-Y,
38	// consume the first character and proceed.
39	if (S [`1`] != `'-'`) {
40	BV [Start] = true;
41	S = S.substr(Start: `1`);
42	continue;
43	}
44
45	// It must be in the form of X-Y.
46	// Validate it and then interpret the range.
47	if (Start > End)
48	return make_error<StringError>(Args: "invalid glob pattern: " + Original,
49	Args: errc::invalid_argument);
50
51	for (int C = Start; C <= End; ++C)
52	BV [(uint8_t)C] = true;
53	S = S.substr(Start: `3`);
54	}
55
56	for (char C : S)
57	BV [(uint8_t)C] = true;
58	return BV;
59	}
60
61	// Identify brace expansions in S and return the list of patterns they expand
62	// into.
63	static Expected<SmallVector<std::string, `1`>>
64	parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
65	SmallVector<std::string> SubPatterns = {S.str()};
66	if (!MaxSubPatterns \|\| !S.contains(C: `'{'`))
67	return std::move(SubPatterns);
68
69	struct BraceExpansion {
70	size_t Start;
71	size_t Length;
72	SmallVector<StringRef, `2`> Terms;
73	};
74	SmallVector<BraceExpansion, `0`> BraceExpansions;
75
76	BraceExpansion CurrentBE = nullptr*;
77	size_t TermBegin;
78	for (size_t I = `0`, E = S.size(); I != E; ++I) {
79	if (S [I] == `'['`) {
80	I = S.find(C: `']'`, From: I + `2`);
81	if (I == std::string::npos)
82	return make_error<StringError>(Args: "invalid glob pattern, unmatched '['",
83	Args: errc::invalid_argument);
84	} else if (S [I] == `'{'`) {
85	if (CurrentBE)
86	return make_error<StringError>(
87	Args: "nested brace expansions are not supported",
88	Args: errc::invalid_argument);
89	CurrentBE = &BraceExpansions.emplace_back();
90	CurrentBE->Start = I;
91	TermBegin = I + `1`;
92	} else if (S [I] == `','`) {
93	if (!CurrentBE)
94	continue;
95	CurrentBE->Terms.push_back(Elt: S.substr(Start: TermBegin, N: I - TermBegin));
96	TermBegin = I + `1`;
97	} else if (S [I] == `'}'`) {
98	if (!CurrentBE)
99	continue;
100	if (CurrentBE->Terms.empty())
101	return make_error<StringError>(
102	Args: "empty or singleton brace expansions are not supported",
103	Args: errc::invalid_argument);
104	CurrentBE->Terms.push_back(Elt: S.substr(Start: TermBegin, N: I - TermBegin));
105	CurrentBE->Length = I - CurrentBE->Start + `1`;
106	CurrentBE = nullptr;
107	} else if (S [I] == `'\\'`) {
108	if (++I == E)
109	return make_error<StringError>(Args: "invalid glob pattern, stray '\\'",
110	Args: errc::invalid_argument);
111	}
112	}
113	if (CurrentBE)
114	return make_error<StringError>(Args: "incomplete brace expansion",
115	Args: errc::invalid_argument);
116
117	size_t NumSubPatterns = `1`;
118	for (auto &BE : BraceExpansions) {
119	if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
120	NumSubPatterns = std::numeric_limits<size_t>::max();
121	break;
122	}
123	NumSubPatterns *= BE.Terms.size();
124	}
125	if (NumSubPatterns > *MaxSubPatterns)
126	return make_error<StringError>(Args: "too many brace expansions",
127	Args: errc::invalid_argument);
128	// Replace brace expansions in reverse order so that we don't invalidate
129	// earlier start indices
130	for (auto &BE : reverse(C&: BraceExpansions)) {
131	SmallVector<std::string> OrigSubPatterns;
132	std::swap(LHS&: SubPatterns, RHS&: OrigSubPatterns);
133	for (StringRef Term : BE.Terms)
134	for (StringRef Orig : OrigSubPatterns)
135	SubPatterns.emplace_back(Args&: Orig).replace(pos: BE.Start, n: BE.Length, svt: Term);
136	}
137	return std::move(SubPatterns);
138	}
139
140	static StringRef maxPlainSubstring(StringRef S, bool SlashAgnostic) {
141	const char *Metas =
142	SlashAgnostic ? PrefixMetacharactersWithSlash : PrefixMetacharacters;
143	StringRef Best;
144	while (!S.empty()) {
145	size_t PrefixSize = S.find_first_of(Chars: Metas);
146	if (PrefixSize == std::string::npos)
147	PrefixSize = S.size();
148
149	if (Best.size() < PrefixSize)
150	Best = S.take_front(N: PrefixSize);
151
152	S = S.drop_front(N: PrefixSize);
153
154	// It's impossible, as the first and last characters of the input string
155	// must be Glob special characters, otherwise they would be parts of
156	// the prefix or the suffix.
157	assert(!S.empty());
158
159	switch (S.front()) {
160	case `'\\'`:
161	S = S.drop_front(N: `2`);
162	break;
163	case `'['`: {
164	// Drop '[' and the first character which can be ']'.
165	S = S.drop_front(N: `2`);
166	size_t EndBracket = S.find_first_of(Chars: "]");
167	// Should not be possible, SubGlobPattern::create should fail on invalid
168	// pattern before we get here.
169	assert(EndBracket != std::string::npos);
170	S = S.drop_front(N: EndBracket + `1`);
171	break;
172	}
173	case `'{'`:
174	// TODO: implement.
175	// Fallback to whatever is best for now.
176	return Best;
177	default:
178	S = S.drop_front(N: `1`);
179	}
180	}
181
182	return Best;
183	}
184
185	Expected<GlobPattern> GlobPattern::create(StringRef S,
186	std::optional<size_t> MaxSubPatterns,
187	bool SlashAgnostic) {
188	GlobPattern Pat;
189	Pat.SlashAgnostic = SlashAgnostic;
190	Pat.Pattern = S;
191
192	const char *PrefixMetas =
193	SlashAgnostic ? PrefixMetacharactersWithSlash : PrefixMetacharacters;
194	const char *SuffixMetas =
195	SlashAgnostic ? SuffixMetacharactersWithSlash : SuffixMetacharacters;
196
197	// Store the prefix that does not contain any metacharacter.
198	Pat.PrefixSize = S.find_first_of(Chars: PrefixMetas);
199	if (Pat.PrefixSize == std::string::npos) {
200	Pat.PrefixSize = S.size();
201	return Pat;
202	}
203	S = S.substr(Start: Pat.PrefixSize);
204
205	// Just in case we stop on unmatched opening brackets.
206	size_t SuffixStart = S.find_last_of(Chars: SuffixMetas);
207	assert(SuffixStart != std::string::npos);
208	if (S [SuffixStart] == `'\\'`)
209	++SuffixStart;
210	if (SuffixStart < S.size())
211	++SuffixStart;
212	Pat.SuffixSize = S.size() - SuffixStart;
213	S = S.substr(Start: `0`, N: SuffixStart);
214
215	SmallVector<std::string, `1`> SubPats;
216	if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(Value&: SubPats))
217	return std::move(Err);
218	for (StringRef SubPat : SubPats) {
219	auto SubGlobOrErr = SubGlobPattern::create(Pat: SubPat, SlashAgnostic);
220	if (!SubGlobOrErr)
221	return SubGlobOrErr.takeError();
222	Pat.SubGlobs.push_back(Elt: *SubGlobOrErr);
223	}
224
225	return Pat;
226	}
227
228	Expected<GlobPattern::SubGlobPattern>
229	GlobPattern::SubGlobPattern::create(StringRef S, bool SlashAgnostic) {
230	SubGlobPattern Pat;
231
232	// Parse brackets.
233	Pat.Pat.assign(in_start: S.begin(), in_end: S.end());
234	for (size_t I = `0`, E = S.size(); I != E; ++I) {
235	if (S [I] == `'['`) {
236	// ']' is allowed as the first character of a character class. '[]' is
237	// invalid. So, just skip the first character.
238	++I;
239	size_t J = S.find(C: `']'`, From: I + `1`);
240	if (J == StringRef::npos)
241	return make_error<StringError>(Args: "invalid glob pattern, unmatched '['",
242	Args: errc::invalid_argument);
243	StringRef Chars = S.substr(Start: I, N: J - I);
244	bool Invert = S [I] == `'^'` \|\| S [I] == `'!'`;
245	if (Invert)
246	Chars = Chars.drop_front();
247	Expected<BitVector> BVOrErr = expand(S: Chars, Original: S);
248	if (!BVOrErr)
249	return BVOrErr.takeError();
250	BitVector &BV = *BVOrErr;
251	if (SlashAgnostic && (BV [`'\\'`] \|\| BV [`'/'`])) {
252	BV.set(`'\\'`);
253	BV.set(`'/'`);
254	}
255	if (Invert)
256	BV.flip();
257	Pat.Brackets.push_back(Elt: Bracket{.NextOffset: J + `1`, .Bytes: std::move(BV)});
258	I = J;
259	} else if (S [I] == `'\\'`) {
260	if (++I == E)
261	return make_error<StringError>(Args: "invalid glob pattern, stray '\\'",
262	Args: errc::invalid_argument);
263	}
264	}
265	return Pat;
266	}
267
268	StringRef GlobPattern::longest_substr() const {
269	return maxPlainSubstring(S: Pattern.drop_front(N: PrefixSize).drop_back(N: SuffixSize),
270	SlashAgnostic);
271	}
272
273	bool GlobPattern::match(StringRef S) const {
274	if (!S.consume_front(Prefix: prefix()))
275	return false;
276	if (!S.consume_back(Suffix: suffix()))
277	return false;
278	if (SubGlobs.empty() && S.empty())
279	return true;
280	for (auto &Glob : SubGlobs)
281	if (Glob.match(S, SlashAgnostic))
282	return true;
283	return false;
284	}
285
286	static bool matchChar(char PatC, char QueryC, bool SlashAgnostic) {
287	if (PatC == QueryC)
288	return true;
289	return SlashAgnostic && (PatC == `'\\'` \|\| PatC == `'/'`) &&
290	(QueryC == `'\\'` \|\| QueryC == `'/'`);
291	}
292
293	// Factor the pattern into segments split by ''. The segment is matched*
294	// sequentianlly by finding the first occurrence past the end of the previous
295	// match.
296	bool GlobPattern::SubGlobPattern::match(StringRef Str,
297	bool SlashAgnostic) const {
298	const char P = Pat.data(), SegmentBegin = nullptr, *S = Str.data(),
299	*SavedS = S;
300	const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
301	size_t B = `0`, SavedB = `0`;
302	while (S != End) {
303	if (P == PEnd)
304	;
305	else if (P == `''`) {
306	// The non- substring on the left of '' matches the tail of S. Save the
307	// positions to be used by backtracking if we see a mismatch later.
308	SegmentBegin = ++P;
309	SavedS = S;
310	SavedB = B;
311	continue;
312	} else if (*P == `'['`) {
313	if (Brackets [B].Bytes [uint8_t(*S)]) {
314	P = Pat.data() + Brackets [B++].NextOffset;
315	++S;
316	continue;
317	}
318	} else if (*P == `'\\'`) {
319	if (matchChar(PatC: ++P, QueryC: S, SlashAgnostic)) {
320	++P;
321	++S;
322	continue;
323	}
324	} else if (matchChar(PatC: P, QueryC: S, SlashAgnostic) \|\| *P == `'?'`) {
325	++P;
326	++S;
327	continue;
328	}
329	if (!SegmentBegin)
330	return false;
331	// We have seen a ''. Backtrack to the saved positions. Shift the S*
332	// position to probe the next starting position in the segment.
333	P = SegmentBegin;
334	S = ++SavedS;
335	B = SavedB;
336	}
337	// All bytes in Str have been matched. Return true if the rest part of Pat is
338	// empty or contains only ''.*
339	return getPat().find_first_not_of(C: `'*'`, From: P - Pat.data()) == std::string::npos;
340	}
341

Browse the source code of llvm_projects/llvm/lib/Support/GlobPattern.cpp