regex.cpp source code [llvm_runtimes/libcxx/src/regex.cpp]

1	//===----------------------------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include <algorithm>
10	#include <iterator>
11	#include <regex>
12
13	_LIBCPP_BEGIN_NAMESPACE_STD
14
15	static const char* make_error_type_string(regex_constants::error_type ecode) {
16	switch (ecode) {
17	case regex_constants::error_collate:
18	return "The expression contained an invalid collating element name.";
19	case regex_constants::error_ctype:
20	return "The expression contained an invalid character class name.";
21	case regex_constants::error_escape:
22	return "The expression contained an invalid escaped character, or a "
23	"trailing escape.";
24	case regex_constants::error_backref:
25	return "The expression contained an invalid back reference.";
26	case regex_constants::error_brack:
27	return "The expression contained mismatched [ and ].";
28	case regex_constants::error_paren:
29	return "The expression contained mismatched ( and ).";
30	case regex_constants::error_brace:
31	return "The expression contained mismatched { and }.";
32	case regex_constants::error_badbrace:
33	return "The expression contained an invalid range in a {} expression.";
34	case regex_constants::error_range:
35	return "The expression contained an invalid character range, "
36	"such as [b-a] in most encodings.";
37	case regex_constants::error_space:
38	return "There was insufficient memory to convert the expression into "
39	"a finite state machine.";
40	case regex_constants::error_badrepeat:
41	return "One of *?+{ was not preceded by a valid regular expression.";
42	case regex_constants::error_complexity:
43	return "The complexity of an attempted match against a regular "
44	"expression exceeded a pre-set level.";
45	case regex_constants::error_stack:
46	return "There was insufficient memory to determine whether the regular "
47	"expression could match the specified character sequence.";
48	case regex_constants::__re_err_grammar:
49	return "An invalid regex grammar has been requested.";
50	case regex_constants::__re_err_empty:
51	return "An empty regex is not allowed in the POSIX grammar.";
52	case regex_constants::__re_err_parse:
53	return "The parser did not consume the entire regular expression.";
54	default:
55	break;
56	}
57	return "Unknown error type";
58	}
59
60	regex_error::regex_error(regex_constants::error_type ecode)
61	: runtime_error(make_error_type_string(ecode)), __code_(ecode) {}
62
63	regex_error::~regex_error() throw() {}
64
65	namespace {
66
/// One row of the collating-element lookup table: a symbolic name paired
/// with the single character it stands for.
struct collationnames {
  const char* elem_; ///< NUL-terminated collating-element name (the search key).
  char char_;        ///< Character code associated with that name.
};
71
72	#if defined(__MVS__) && !defined(__NATIVE_ASCII_F)
73	// EBCDIC IBM-1047
74	// Sorted via the EBCDIC collating sequence
75	const collationnames collatenames[] = {
76	{"a", `0x81`},
77	{"alert", `0x2f`},
78	{"ampersand", `0x50`},
79	{"apostrophe", `0x7d`},
80	{"asterisk", `0x5c`},
81	{"b", `0x82`},
82	{"backslash", `0xe0`},
83	{"backspace", `0x16`},
84	{"c", `0x83`},
85	{"carriage-return", `0xd`},
86	{"circumflex", `0x5f`},
87	{"circumflex-accent", `0x5f`},
88	{"colon", `0x7a`},
89	{"comma", `0x6b`},
90	{"commercial-at", `0x7c`},
91	{"d", `0x84`},
92	{"dollar-sign", `0x5b`},
93	{"e", `0x85`},
94	{"eight", `0xf8`},
95	{"equals-sign", `0x7e`},
96	{"exclamation-mark", `0x5a`},
97	{"f", `0x86`},
98	{"five", `0xf5`},
99	{"form-feed", `0xc`},
100	{"four", `0xf4`},
101	{"full-stop", `0x4b`},
102	{"g", `0x87`},
103	{"grave-accent", `0x79`},
104	{"greater-than-sign", `0x6e`},
105	{"h", `0x88`},
106	{"hyphen", `0x60`},
107	{"hyphen-minus", `0x60`},
108	{"i", `0x89`},
109	{"j", `0x91`},
110	{"k", `0x92`},
111	{"l", `0x93`},
112	{"left-brace", `0xc0`},
113	{"left-curly-bracket", `0xc0`},
114	{"left-parenthesis", `0x4d`},
115	{"left-square-bracket", `0xad`},
116	{"less-than-sign", `0x4c`},
117	{"low-line", `0x6d`},
118	{"m", `0x94`},
119	{"n", `0x95`},
120	{"newline", `0x15`},
121	{"nine", `0xf9`},
122	{"number-sign", `0x7b`},
123	{"o", `0x96`},
124	{"one", `0xf1`},
125	{"p", `0x97`},
126	{"percent-sign", `0x6c`},
127	{"period", `0x4b`},
128	{"plus-sign", `0x4e`},
129	{"q", `0x98`},
130	{"question-mark", `0x6f`},
131	{"quotation-mark", `0x7f`},
132	{"r", `0x99`},
133	{"reverse-solidus", `0xe0`},
134	{"right-brace", `0xd0`},
135	{"right-curly-bracket", `0xd0`},
136	{"right-parenthesis", `0x5d`},
137	{"right-square-bracket", `0xbd`},
138	{"s", `0xa2`},
139	{"semicolon", `0x5e`},
140	{"seven", `0xf7`},
141	{"six", `0xf6`},
142	{"slash", `0x61`},
143	{"solidus", `0x61`},
144	{"space", `0x40`},
145	{"t", `0xa3`},
146	{"tab", `0x5`},
147	{"three", `0xf3`},
148	{"tilde", `0xa1`},
149	{"two", `0xf2`},
150	{"u", `0xa4`},
151	{"underscore", `0x6d`},
152	{"v", `0xa5`},
153	{"vertical-line", `0x4f`},
154	{"vertical-tab", `0xb`},
155	{"w", `0xa6`},
156	{"x", `0xa7`},
157	{"y", `0xa8`},
158	{"z", `0xa9`},
159	{"zero", `0xf0`},
160	{"A", `0xc1`},
161	{"B", `0xc2`},
162	{"C", `0xc3`},
163	{"D", `0xc4`},
164	{"E", `0xc5`},
165	{"F", `0xc6`},
166	{"G", `0xc7`},
167	{"H", `0xc8`},
168	{"I", `0xc9`},
169	{"J", `0xd1`},
170	{"K", `0xd2`},
171	{"L", `0xd3`},
172	{"M", `0xd4`},
173	{"N", `0xd5`},
174	{"NUL", `0`},
175	{"O", `0xd6`},
176	{"P", `0xd7`},
177	{"Q", `0xd8`},
178	{"R", `0xd9`},
179	{"S", `0xe2`},
180	{"T", `0xe3`},
181	{"U", `0xe4`},
182	{"V", `0xe5`},
183	{"W", `0xe6`},
184	{"X", `0xe7`},
185	{"Y", `0xe8`},
186	{"Z", `0xe9`}};
187	#else
188	// ASCII
189	const collationnames collatenames[] = {
190	{"A", `0x41`},
191	{"B", `0x42`},
192	{"C", `0x43`},
193	{"D", `0x44`},
194	{"E", `0x45`},
195	{"F", `0x46`},
196	{"G", `0x47`},
197	{"H", `0x48`},
198	{"I", `0x49`},
199	{"J", `0x4a`},
200	{"K", `0x4b`},
201	{"L", `0x4c`},
202	{"M", `0x4d`},
203	{"N", `0x4e`},
204	{"NUL", `0x00`},
205	{"O", `0x4f`},
206	{"P", `0x50`},
207	{"Q", `0x51`},
208	{"R", `0x52`},
209	{"S", `0x53`},
210	{"T", `0x54`},
211	{"U", `0x55`},
212	{"V", `0x56`},
213	{"W", `0x57`},
214	{"X", `0x58`},
215	{"Y", `0x59`},
216	{"Z", `0x5a`},
217	{"a", `0x61`},
218	{"alert", `0x07`},
219	{"ampersand", `0x26`},
220	{"apostrophe", `0x27`},
221	{"asterisk", `0x2a`},
222	{"b", `0x62`},
223	{"backslash", `0x5c`},
224	{"backspace", `0x08`},
225	{"c", `0x63`},
226	{"carriage-return", `0x0d`},
227	{"circumflex", `0x5e`},
228	{"circumflex-accent", `0x5e`},
229	{"colon", `0x3a`},
230	{"comma", `0x2c`},
231	{"commercial-at", `0x40`},
232	{"d", `0x64`},
233	{"dollar-sign", `0x24`},
234	{"e", `0x65`},
235	{"eight", `0x38`},
236	{"equals-sign", `0x3d`},
237	{"exclamation-mark", `0x21`},
238	{"f", `0x66`},
239	{"five", `0x35`},
240	{"form-feed", `0x0c`},
241	{"four", `0x34`},
242	{"full-stop", `0x2e`},
243	{"g", `0x67`},
244	{"grave-accent", `0x60`},
245	{"greater-than-sign", `0x3e`},
246	{"h", `0x68`},
247	{"hyphen", `0x2d`},
248	{"hyphen-minus", `0x2d`},
249	{"i", `0x69`},
250	{"j", `0x6a`},
251	{"k", `0x6b`},
252	{"l", `0x6c`},
253	{"left-brace", `0x7b`},
254	{"left-curly-bracket", `0x7b`},
255	{"left-parenthesis", `0x28`},
256	{"left-square-bracket", `0x5b`},
257	{"less-than-sign", `0x3c`},
258	{"low-line", `0x5f`},
259	{"m", `0x6d`},
260	{"n", `0x6e`},
261	{"newline", `0x0a`},
262	{"nine", `0x39`},
263	{"number-sign", `0x23`},
264	{"o", `0x6f`},
265	{"one", `0x31`},
266	{"p", `0x70`},
267	{"percent-sign", `0x25`},
268	{"period", `0x2e`},
269	{"plus-sign", `0x2b`},
270	{"q", `0x71`},
271	{"question-mark", `0x3f`},
272	{"quotation-mark", `0x22`},
273	{"r", `0x72`},
274	{"reverse-solidus", `0x5c`},
275	{"right-brace", `0x7d`},
276	{"right-curly-bracket", `0x7d`},
277	{"right-parenthesis", `0x29`},
278	{"right-square-bracket", `0x5d`},
279	{"s", `0x73`},
280	{"semicolon", `0x3b`},
281	{"seven", `0x37`},
282	{"six", `0x36`},
283	{"slash", `0x2f`},
284	{"solidus", `0x2f`},
285	{"space", `0x20`},
286	{"t", `0x74`},
287	{"tab", `0x09`},
288	{"three", `0x33`},
289	{"tilde", `0x7e`},
290	{"two", `0x32`},
291	{"u", `0x75`},
292	{"underscore", `0x5f`},
293	{"v", `0x76`},
294	{"vertical-line", `0x7c`},
295	{"vertical-tab", `0x0b`},
296	{"w", `0x77`},
297	{"x", `0x78`},
298	{"y", `0x79`},
299	{"z", `0x7a`},
300	{"zero", `0x30`}};
301	#endif
302
303	struct classnames {
304	const char* elem_;
305	regex_traits<char>::char_class_type mask_;
306	};
307
308	const classnames ClassNames[] = {
309	{"alnum", ctype_base::alnum},
310	{"alpha", ctype_base::alpha},
311	{"blank", ctype_base::blank},
312	{"cntrl", ctype_base::cntrl},
313	{"d", ctype_base::digit},
314	{"digit", ctype_base::digit},
315	{"graph", ctype_base::graph},
316	{"lower", ctype_base::lower},
317	{"print", ctype_base::print},
318	{"punct", ctype_base::punct},
319	{"s", ctype_base::space},
320	{"space", ctype_base::space},
321	{"upper", ctype_base::upper},
322	{"w", regex_traits<char>::__regex_word},
323	{"xdigit", ctype_base::xdigit}};
324
325	struct use_strcmp {
326	bool operator()(const collationnames& x, const char* y) { return strcmp(x.elem_, y) < `0`; }
327	bool operator()(const classnames& x, const char* y) { return strcmp(x.elem_, y) < `0`; }
328	};
329
330	} // namespace
331
332	string __get_collation_name(const char* s) {
333	const collationnames* i = std::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp());
334	string r;
335	if (i != end(collatenames) && strcmp(s, i->elem_) == `0`)
336	r = char(i->char_);
337	return r;
338	}
339
340	regex_traits<char>::char_class_type __get_classname(const char* s, bool __icase) {
341	const classnames* i = std::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp());
342	regex_traits<char>::char_class_type r = `0`;
343	if (i != end(ClassNames) && strcmp(s, i->elem_) == `0`) {
344	r = i->mask_;
345	if (r == regex_traits<char>::__regex_word)
346	r \|= ctype_base::alnum \| ctype_base::upper \| ctype_base::lower;
347	else if (__icase) {
348	if (r & (ctype_base::lower \| ctype_base::upper))
349	r \|= ctype_base::alpha;
350	}
351	}
352	return r;
353	}
354
355	template <>
356	void __match_any_but_newline<char>::__exec(__state& __s) const {
357	if (__s.__current_ != __s.__last_) {
358	switch (*__s.__current_) {
359	case `'\r'`:
360	case `'\n'`:
361	__s.__do_ = __state::__reject;
362	__s.__node_ = nullptr;
363	break;
364	default:
365	__s.__do_ = __state::__accept_and_consume;
366	++__s.__current_;
367	__s.__node_ = this->first();
368	break;
369	}
370	} else {
371	__s.__do_ = __state::__reject;
372	__s.__node_ = nullptr;
373	}
374	}
375
376	template <>
377	void __match_any_but_newline<wchar_t>::__exec(__state& __s) const {
378	if (__s.__current_ != __s.__last_) {
379	switch (*__s.__current_) {
380	case `'\r'`:
381	case `'\n'`:
382	case `0x2028`:
383	case `0x2029`:
384	__s.__do_ = __state::__reject;
385	__s.__node_ = nullptr;
386	break;
387	default:
388	__s.__do_ = __state::__accept_and_consume;
389	++__s.__current_;
390	__s.__node_ = this->first();
391	break;
392	}
393	} else {
394	__s.__do_ = __state::__reject;
395	__s.__node_ = nullptr;
396	}
397	}
398
399	_LIBCPP_END_NAMESPACE_STD
400

Browse the source code of llvm_runtimes/libcxx/src/regex.cpp