regex.cpp source code [llvm_runtimes/libcxx/src/regex.cpp]

1	//===----------------------------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include <algorithm>
10	#include <iterator>
11	#include <regex>
12
13	_LIBCPP_BEGIN_NAMESPACE_STD
14	_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS
15
16	static const char* make_error_type_string(regex_constants::error_type ecode) {
17	switch (ecode) {
18	case regex_constants::error_collate:
19	return "The expression contained an invalid collating element name.";
20	case regex_constants::error_ctype:
21	return "The expression contained an invalid character class name.";
22	case regex_constants::error_escape:
23	return "The expression contained an invalid escaped character, or a "
24	"trailing escape.";
25	case regex_constants::error_backref:
26	return "The expression contained an invalid back reference.";
27	case regex_constants::error_brack:
28	return "The expression contained mismatched [ and ].";
29	case regex_constants::error_paren:
30	return "The expression contained mismatched ( and ).";
31	case regex_constants::error_brace:
32	return "The expression contained mismatched { and }.";
33	case regex_constants::error_badbrace:
34	return "The expression contained an invalid range in a {} expression.";
35	case regex_constants::error_range:
36	return "The expression contained an invalid character range, "
37	"such as [b-a] in most encodings.";
38	case regex_constants::error_space:
39	return "There was insufficient memory to convert the expression into "
40	"a finite state machine.";
41	case regex_constants::error_badrepeat:
42	return "One of *?+{ was not preceded by a valid regular expression.";
43	case regex_constants::error_complexity:
44	return "The complexity of an attempted match against a regular "
45	"expression exceeded a pre-set level.";
46	case regex_constants::error_stack:
47	return "There was insufficient memory to determine whether the regular "
48	"expression could match the specified character sequence.";
49	case regex_constants::__re_err_grammar:
50	return "An invalid regex grammar has been requested.";
51	case regex_constants::__re_err_empty:
52	return "An empty regex is not allowed in the POSIX grammar.";
53	case regex_constants::__re_err_parse:
54	return "The parser did not consume the entire regular expression.";
55	default:
56	break;
57	}
58	return "Unknown error type";
59	}
60
61	regex_error::regex_error(regex_constants::error_type ecode)
62	: runtime_error (make_error_type_string(ecode)), __code_(ecode) {}
63
64	regex_error::~regex_error() throw() {}
65
66	namespace {
67
// One entry of the collating-element name table: a symbolic name and the
// single character it stands for.
struct collationnames {
  const char* elem_; // symbolic name, e.g. "hyphen"
  char char_;        // character the name maps to
};

// Table of collating-element names. __get_collation_name searches it with
// std::lower_bound and a strcmp-based comparator, so the entries MUST stay
// sorted by strcmp order of elem_ in the execution character set. That is why
// the EBCDIC and ASCII variants list the same names in different orders.
#if defined(__MVS__) && !defined(__NATIVE_ASCII_F)
// EBCDIC IBM-1047
// Sorted via the EBCDIC collating sequence
const collationnames collatenames[] = {
    {"a", 0x81},
    {"alert", 0x2f},
    {"ampersand", 0x50},
    {"apostrophe", 0x7d},
    {"asterisk", 0x5c},
    {"b", 0x82},
    {"backslash", 0xe0},
    {"backspace", 0x16},
    {"c", 0x83},
    {"carriage-return", 0x0d},
    {"circumflex", 0x5f},
    {"circumflex-accent", 0x5f},
    {"colon", 0x7a},
    {"comma", 0x6b},
    {"commercial-at", 0x7c},
    {"d", 0x84},
    {"dollar-sign", 0x5b},
    {"e", 0x85},
    {"eight", 0xf8},
    {"equals-sign", 0x7e},
    {"exclamation-mark", 0x5a},
    {"f", 0x86},
    {"five", 0xf5},
    {"form-feed", 0x0c},
    {"four", 0xf4},
    {"full-stop", 0x4b},
    {"g", 0x87},
    {"grave-accent", 0x79},
    {"greater-than-sign", 0x6e},
    {"h", 0x88},
    {"hyphen", 0x60},
    {"hyphen-minus", 0x60},
    {"i", 0x89},
    {"j", 0x91},
    {"k", 0x92},
    {"l", 0x93},
    {"left-brace", 0xc0},
    {"left-curly-bracket", 0xc0},
    {"left-parenthesis", 0x4d},
    {"left-square-bracket", 0xad},
    {"less-than-sign", 0x4c},
    {"low-line", 0x6d},
    {"m", 0x94},
    {"n", 0x95},
    {"newline", 0x15},
    {"nine", 0xf9},
    {"number-sign", 0x7b},
    {"o", 0x96},
    {"one", 0xf1},
    {"p", 0x97},
    {"percent-sign", 0x6c},
    {"period", 0x4b},
    {"plus-sign", 0x4e},
    {"q", 0x98},
    {"question-mark", 0x6f},
    {"quotation-mark", 0x7f},
    {"r", 0x99},
    {"reverse-solidus", 0xe0},
    {"right-brace", 0xd0},
    {"right-curly-bracket", 0xd0},
    {"right-parenthesis", 0x5d},
    {"right-square-bracket", 0xbd},
    {"s", 0xa2},
    {"semicolon", 0x5e},
    {"seven", 0xf7},
    {"six", 0xf6},
    {"slash", 0x61},
    {"solidus", 0x61},
    {"space", 0x40},
    {"t", 0xa3},
    {"tab", 0x05},
    {"three", 0xf3},
    {"tilde", 0xa1},
    {"two", 0xf2},
    {"u", 0xa4},
    {"underscore", 0x6d},
    {"v", 0xa5},
    {"vertical-line", 0x4f},
    {"vertical-tab", 0x0b},
    {"w", 0xa6},
    {"x", 0xa7},
    {"y", 0xa8},
    {"z", 0xa9},
    {"zero", 0xf0},
    {"A", 0xc1},
    {"B", 0xc2},
    {"C", 0xc3},
    {"D", 0xc4},
    {"E", 0xc5},
    {"F", 0xc6},
    {"G", 0xc7},
    {"H", 0xc8},
    {"I", 0xc9},
    {"J", 0xd1},
    {"K", 0xd2},
    {"L", 0xd3},
    {"M", 0xd4},
    {"N", 0xd5},
    {"NUL", 0x00},
    {"O", 0xd6},
    {"P", 0xd7},
    {"Q", 0xd8},
    {"R", 0xd9},
    {"S", 0xe2},
    {"T", 0xe3},
    {"U", 0xe4},
    {"V", 0xe5},
    {"W", 0xe6},
    {"X", 0xe7},
    {"Y", 0xe8},
    {"Z", 0xe9}};
#else
// ASCII
const collationnames collatenames[] = {
    {"A", 0x41},
    {"B", 0x42},
    {"C", 0x43},
    {"D", 0x44},
    {"E", 0x45},
    {"F", 0x46},
    {"G", 0x47},
    {"H", 0x48},
    {"I", 0x49},
    {"J", 0x4a},
    {"K", 0x4b},
    {"L", 0x4c},
    {"M", 0x4d},
    {"N", 0x4e},
    {"NUL", 0x00},
    {"O", 0x4f},
    {"P", 0x50},
    {"Q", 0x51},
    {"R", 0x52},
    {"S", 0x53},
    {"T", 0x54},
    {"U", 0x55},
    {"V", 0x56},
    {"W", 0x57},
    {"X", 0x58},
    {"Y", 0x59},
    {"Z", 0x5a},
    {"a", 0x61},
    {"alert", 0x07},
    {"ampersand", 0x26},
    {"apostrophe", 0x27},
    {"asterisk", 0x2a},
    {"b", 0x62},
    {"backslash", 0x5c},
    {"backspace", 0x08},
    {"c", 0x63},
    {"carriage-return", 0x0d},
    {"circumflex", 0x5e},
    {"circumflex-accent", 0x5e},
    {"colon", 0x3a},
    {"comma", 0x2c},
    {"commercial-at", 0x40},
    {"d", 0x64},
    {"dollar-sign", 0x24},
    {"e", 0x65},
    {"eight", 0x38},
    {"equals-sign", 0x3d},
    {"exclamation-mark", 0x21},
    {"f", 0x66},
    {"five", 0x35},
    {"form-feed", 0x0c},
    {"four", 0x34},
    {"full-stop", 0x2e},
    {"g", 0x67},
    {"grave-accent", 0x60},
    {"greater-than-sign", 0x3e},
    {"h", 0x68},
    {"hyphen", 0x2d},
    {"hyphen-minus", 0x2d},
    {"i", 0x69},
    {"j", 0x6a},
    {"k", 0x6b},
    {"l", 0x6c},
    {"left-brace", 0x7b},
    {"left-curly-bracket", 0x7b},
    {"left-parenthesis", 0x28},
    {"left-square-bracket", 0x5b},
    {"less-than-sign", 0x3c},
    {"low-line", 0x5f},
    {"m", 0x6d},
    {"n", 0x6e},
    {"newline", 0x0a},
    {"nine", 0x39},
    {"number-sign", 0x23},
    {"o", 0x6f},
    {"one", 0x31},
    {"p", 0x70},
    {"percent-sign", 0x25},
    {"period", 0x2e},
    {"plus-sign", 0x2b},
    {"q", 0x71},
    {"question-mark", 0x3f},
    {"quotation-mark", 0x22},
    {"r", 0x72},
    {"reverse-solidus", 0x5c},
    {"right-brace", 0x7d},
    {"right-curly-bracket", 0x7d},
    {"right-parenthesis", 0x29},
    {"right-square-bracket", 0x5d},
    {"s", 0x73},
    {"semicolon", 0x3b},
    {"seven", 0x37},
    {"six", 0x36},
    {"slash", 0x2f},
    {"solidus", 0x2f},
    {"space", 0x20},
    {"t", 0x74},
    {"tab", 0x09},
    {"three", 0x33},
    {"tilde", 0x7e},
    {"two", 0x32},
    {"u", 0x75},
    {"underscore", 0x5f},
    {"v", 0x76},
    {"vertical-line", 0x7c},
    {"vertical-tab", 0x0b},
    {"w", 0x77},
    {"x", 0x78},
    {"y", 0x79},
    {"z", 0x7a},
    {"zero", 0x30}};
#endif
303
304	struct classnames {
305	const char* elem_;
306	regex_traits<char>::char_class_type mask_;
307	};
308
309	const classnames ClassNames[] = {
310	{.elem_: "alnum", .mask_: ctype_base::alnum},
311	{.elem_: "alpha", .mask_: ctype_base::alpha},
312	{.elem_: "blank", .mask_: ctype_base::blank},
313	{.elem_: "cntrl", .mask_: ctype_base::cntrl},
314	{.elem_: "d", .mask_: ctype_base::digit},
315	{.elem_: "digit", .mask_: ctype_base::digit},
316	{.elem_: "graph", .mask_: ctype_base::graph},
317	{.elem_: "lower", .mask_: ctype_base::lower},
318	{.elem_: "print", .mask_: ctype_base::print},
319	{.elem_: "punct", .mask_: ctype_base::punct},
320	{.elem_: "s", .mask_: ctype_base::space},
321	{.elem_: "space", .mask_: ctype_base::space},
322	{.elem_: "upper", .mask_: ctype_base::upper},
323	{.elem_: "w", .mask_: regex_traits<char>::__regex_word},
324	{.elem_: "xdigit", .mask_: ctype_base::xdigit}};
325
326	struct use_strcmp {
327	bool operator()(const collationnames& x, const char* y) const { return strcmp(s1: x.elem_, s2: y) < `0`; }
328	bool operator()(const classnames& x, const char* y) const { return strcmp(s1: x.elem_, s2: y) < `0`; }
329	};
330
331	} // namespace
332
333	string __get_collation_name(const char* s) {
334	const collationnames* i = std::lower_bound(first: begin(array: collatenames), last: end(array: collatenames), value: s, comp: use_strcmp ());
335	string r;
336	if (i != end(array: collatenames) && strcmp(s1: s, s2: i->elem_) == `0`)
337	r = char(i->char_);
338	return r;
339	}
340
341	regex_traits<char>::char_class_type __get_classname(const char* s, bool __icase) {
342	const classnames* i = std::lower_bound(first: begin(array: ClassNames), last: end(array: ClassNames), value: s, comp: use_strcmp ());
343	regex_traits<char>::char_class_type r = `0`;
344	if (i != end(array: ClassNames) && strcmp(s1: s, s2: i->elem_) == `0`) {
345	r = i->mask_;
346	if (r == regex_traits<char>::__regex_word)
347	r \|= ctype_base::alnum \| ctype_base::upper \| ctype_base::lower;
348	else if (__icase) {
349	if (r & (ctype_base::lower \| ctype_base::upper))
350	r \|= ctype_base::alpha;
351	}
352	}
353	return r;
354	}
355
356	template <>
357	void __match_any_but_newline<char>::__exec(__state& __s) const {
358	if (__s.__current_ != __s.__last_) {
359	switch (*__s.__current_) {
360	case `'\r'`:
361	case `'\n'`:
362	__s.__do_ = __state::__reject;
363	__s.__node_ = nullptr;
364	break;
365	default:
366	__s.__do_ = __state::__accept_and_consume;
367	++__s.__current_;
368	__s.__node_ = this->first();
369	break;
370	}
371	} else {
372	__s.__do_ = __state::__reject;
373	__s.__node_ = nullptr;
374	}
375	}
376
377	template <>
378	void __match_any_but_newline<wchar_t>::__exec(__state& __s) const {
379	if (__s.__current_ != __s.__last_) {
380	switch (*__s.__current_) {
381	case `'\r'`:
382	case `'\n'`:
383	case `0x2028`:
384	case `0x2029`:
385	__s.__do_ = __state::__reject;
386	__s.__node_ = nullptr;
387	break;
388	default:
389	__s.__do_ = __state::__accept_and_consume;
390	++__s.__current_;
391	__s.__node_ = this->first();
392	break;
393	}
394	} else {
395	__s.__do_ = __state::__reject;
396	__s.__node_ = nullptr;
397	}
398	}
399
400	_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS
401	_LIBCPP_END_NAMESPACE_STD
402

Browse the source code of llvm_runtimes/libcxx/src/regex.cpp