1 | //===-- Regex.cpp - Regular Expression matcher implementation -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a POSIX regular expression matcher. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Support/Regex.h" |
14 | #include "llvm/ADT/SmallVector.h" |
15 | #include "llvm/ADT/StringRef.h" |
16 | #include "llvm/ADT/Twine.h" |
17 | #include "regex_impl.h" |
18 | |
19 | #include <cassert> |
20 | #include <string> |
21 | |
22 | using namespace llvm; |
23 | |
24 | Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} |
25 | |
26 | Regex::Regex(StringRef regex, RegexFlags Flags) { |
27 | unsigned flags = 0; |
28 | preg = new llvm_regex(); |
29 | preg->re_endp = regex.end(); |
30 | if (Flags & IgnoreCase) |
31 | flags |= REG_ICASE; |
32 | if (Flags & Newline) |
33 | flags |= REG_NEWLINE; |
34 | if (!(Flags & BasicRegex)) |
35 | flags |= REG_EXTENDED; |
36 | error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); |
37 | } |
38 | |
39 | Regex::Regex(StringRef regex, unsigned Flags) |
40 | : Regex(regex, static_cast<RegexFlags>(Flags)) {} |
41 | |
42 | Regex::Regex(Regex &®ex) { |
43 | preg = regex.preg; |
44 | error = regex.error; |
45 | regex.preg = nullptr; |
46 | regex.error = REG_BADPAT; |
47 | } |
48 | |
49 | Regex::~Regex() { |
50 | if (preg) { |
51 | llvm_regfree(preg); |
52 | delete preg; |
53 | } |
54 | } |
55 | |
56 | namespace { |
57 | |
58 | /// Utility to convert a regex error code into a human-readable string. |
59 | void RegexErrorToString(int error, struct llvm_regex *preg, |
60 | std::string &Error) { |
61 | size_t len = llvm_regerror(error, preg, nullptr, 0); |
62 | |
63 | Error.resize(n: len - 1); |
64 | llvm_regerror(error, preg, &Error[0], len); |
65 | } |
66 | |
67 | } // namespace |
68 | |
69 | bool Regex::isValid(std::string &Error) const { |
70 | if (!error) |
71 | return true; |
72 | |
73 | RegexErrorToString(error, preg, Error); |
74 | return false; |
75 | } |
76 | |
77 | /// getNumMatches - In a valid regex, return the number of parenthesized |
78 | /// matches it contains. |
79 | unsigned Regex::getNumMatches() const { |
80 | return preg->re_nsub; |
81 | } |
82 | |
83 | bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches, |
84 | std::string *Error) const { |
85 | // Reset error, if given. |
86 | if (Error && !Error->empty()) |
87 | *Error = "" ; |
88 | |
89 | // Check if the regex itself didn't successfully compile. |
90 | if (Error ? !isValid(Error&: *Error) : !isValid()) |
91 | return false; |
92 | |
93 | unsigned nmatch = Matches ? preg->re_nsub+1 : 0; |
94 | |
95 | // Update null string to empty string. |
96 | if (String.data() == nullptr) |
97 | String = "" ; |
98 | |
99 | // pmatch needs to have at least one element. |
100 | SmallVector<llvm_regmatch_t, 8> pm; |
101 | pm.resize(N: nmatch > 0 ? nmatch : 1); |
102 | pm[0].rm_so = 0; |
103 | pm[0].rm_eo = String.size(); |
104 | |
105 | int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); |
106 | |
107 | // Failure to match is not an error, it's just a normal return value. |
108 | // Any other error code is considered abnormal, and is logged in the Error. |
109 | if (rc == REG_NOMATCH) |
110 | return false; |
111 | if (rc != 0) { |
112 | if (Error) |
113 | RegexErrorToString(error, preg, Error&: *Error); |
114 | return false; |
115 | } |
116 | |
117 | // There was a match. |
118 | |
119 | if (Matches) { // match position requested |
120 | Matches->clear(); |
121 | |
122 | for (unsigned i = 0; i != nmatch; ++i) { |
123 | if (pm[i].rm_so == -1) { |
124 | // this group didn't match |
125 | Matches->push_back(Elt: StringRef()); |
126 | continue; |
127 | } |
128 | assert(pm[i].rm_eo >= pm[i].rm_so); |
129 | Matches->push_back(Elt: StringRef(String.data()+pm[i].rm_so, |
130 | pm[i].rm_eo-pm[i].rm_so)); |
131 | } |
132 | } |
133 | |
134 | return true; |
135 | } |
136 | |
137 | std::string Regex::sub(StringRef Repl, StringRef String, |
138 | std::string *Error) const { |
139 | SmallVector<StringRef, 8> Matches; |
140 | |
141 | // Return the input if there was no match. |
142 | if (!match(String, Matches: &Matches, Error)) |
143 | return std::string(String); |
144 | |
145 | // Otherwise splice in the replacement string, starting with the prefix before |
146 | // the match. |
147 | std::string Res(String.begin(), Matches[0].begin()); |
148 | |
149 | // Then the replacement string, honoring possible substitutions. |
150 | while (!Repl.empty()) { |
151 | // Skip to the next escape. |
152 | std::pair<StringRef, StringRef> Split = Repl.split(Separator: '\\'); |
153 | |
154 | // Add the skipped substring. |
155 | Res += Split.first; |
156 | |
157 | // Check for terminimation and trailing backslash. |
158 | if (Split.second.empty()) { |
159 | if (Repl.size() != Split.first.size() && |
160 | Error && Error->empty()) |
161 | *Error = "replacement string contained trailing backslash" ; |
162 | break; |
163 | } |
164 | |
165 | // Otherwise update the replacement string and interpret escapes. |
166 | Repl = Split.second; |
167 | |
168 | // FIXME: We should have a StringExtras function for mapping C99 escapes. |
169 | switch (Repl[0]) { |
170 | |
171 | // Backreference with the "\g<ref>" syntax |
172 | case 'g': |
173 | if (Repl.size() >= 4 && Repl[1] == '<') { |
174 | size_t End = Repl.find(C: '>'); |
175 | StringRef Ref = Repl.slice(Start: 2, End); |
176 | unsigned RefValue; |
177 | if (End != StringRef::npos && !Ref.getAsInteger(Radix: 10, Result&: RefValue)) { |
178 | Repl = Repl.substr(Start: End + 1); |
179 | if (RefValue < Matches.size()) |
180 | Res += Matches[RefValue]; |
181 | else if (Error && Error->empty()) |
182 | *Error = |
183 | ("invalid backreference string 'g<" + Twine(Ref) + ">'" ).str(); |
184 | break; |
185 | } |
186 | } |
187 | [[fallthrough]]; |
188 | |
189 | // Treat all unrecognized characters as self-quoting. |
190 | default: |
191 | Res += Repl[0]; |
192 | Repl = Repl.substr(Start: 1); |
193 | break; |
194 | |
195 | // Single character escapes. |
196 | case 't': |
197 | Res += '\t'; |
198 | Repl = Repl.substr(Start: 1); |
199 | break; |
200 | case 'n': |
201 | Res += '\n'; |
202 | Repl = Repl.substr(Start: 1); |
203 | break; |
204 | |
205 | // Decimal escapes are backreferences. |
206 | case '0': case '1': case '2': case '3': case '4': |
207 | case '5': case '6': case '7': case '8': case '9': { |
208 | // Extract the backreference number. |
209 | StringRef Ref = Repl.slice(Start: 0, End: Repl.find_first_not_of(Chars: "0123456789" )); |
210 | Repl = Repl.substr(Start: Ref.size()); |
211 | |
212 | unsigned RefValue; |
213 | if (!Ref.getAsInteger(Radix: 10, Result&: RefValue) && |
214 | RefValue < Matches.size()) |
215 | Res += Matches[RefValue]; |
216 | else if (Error && Error->empty()) |
217 | *Error = ("invalid backreference string '" + Twine(Ref) + "'" ).str(); |
218 | break; |
219 | } |
220 | } |
221 | } |
222 | |
223 | // And finally the suffix. |
224 | Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); |
225 | |
226 | return Res; |
227 | } |
228 | |
229 | // These are the special characters matched in functions like "p_ere_exp". |
230 | static const char RegexMetachars[] = "()^$|*+?.[]\\{}" ; |
231 | |
232 | bool Regex::isLiteralERE(StringRef Str) { |
233 | // Check for regex metacharacters. This list was derived from our regex |
234 | // implementation in regcomp.c and double checked against the POSIX extended |
235 | // regular expression specification. |
236 | return Str.find_first_of(Chars: RegexMetachars) == StringRef::npos; |
237 | } |
238 | |
239 | std::string Regex::escape(StringRef String) { |
240 | std::string RegexStr; |
241 | for (char C : String) { |
242 | if (strchr(s: RegexMetachars, c: C)) |
243 | RegexStr += '\\'; |
244 | RegexStr += C; |
245 | } |
246 | |
247 | return RegexStr; |
248 | } |
249 | |