1// class template regex -*- C++ -*-
2
3// Copyright (C) 2013-2024 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/**
26 * @file bits/regex_scanner.tcc
27 * This is an internal header file, included by other library headers.
28 * Do not attempt to use it directly. @headername{regex}
29 */
30
31// FIXME make comments doxygen format.
32
33// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34// and awk
35// 1) grep is basic except '\n' is treated as '|'
36// 2) egrep is extended except '\n' is treated as '|'
37// 3) awk is extended except special escaping rules, and there's no
38// back-reference.
39//
40// References:
41//
42// ECMAScript: ECMA-262 15.10
43//
44// basic, extended:
45// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46//
47// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48
49namespace std _GLIBCXX_VISIBILITY(default)
50{
51_GLIBCXX_BEGIN_NAMESPACE_VERSION
52
53namespace __detail
54{
55 template<typename _CharT>
56 _Scanner<_CharT>::
57 _Scanner(const _CharT* __begin, const _CharT* __end,
58 _FlagT __flags, std::locale __loc)
59 : _ScannerBase(__flags),
60 _M_current(__begin), _M_end(__end),
61 _M_ctype(std::use_facet<_CtypeT>(__loc)),
62 _M_eat_escape(_M_is_ecma()
63 ? &_Scanner::_M_eat_escape_ecma
64 : &_Scanner::_M_eat_escape_posix)
65 { _M_advance(); }
66
67 template<typename _CharT>
68 void
69 _Scanner<_CharT>::
70 _M_advance()
71 {
72 if (_M_current == _M_end)
73 {
74 _M_token = _S_token_eof;
75 return;
76 }
77
78 if (_M_state == _S_state_normal)
79 _M_scan_normal();
80 else if (_M_state == _S_state_in_bracket)
81 _M_scan_in_bracket();
82 else if (_M_state == _S_state_in_brace)
83 _M_scan_in_brace();
84 else
85 {
86 __glibcxx_assert(!"unexpected state while processing regex");
87 }
88 }
89
90 // Differences between styles:
91 // 1) "\(", "\)", "\{" in basic. It's not escaping.
92 // 2) "(?:", "(?=", "(?!" in ECMAScript.
93 template<typename _CharT>
94 void
95 _Scanner<_CharT>::
96 _M_scan_normal()
97 {
98 auto __c = *_M_current++;
99
100 if (__builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
101 {
102 _M_token = _S_token_ord_char;
103 _M_value.assign(1, __c);
104 return;
105 }
106 if (__c == '\\')
107 {
108 if (_M_current == _M_end)
109 __throw_regex_error(
110 ecode: regex_constants::error_escape,
111 what: "Invalid escape at end of regular expression");
112
113 if (!_M_is_basic()
114 || (*_M_current != '('
115 && *_M_current != ')'
116 && *_M_current != '{'))
117 {
118 (this->*_M_eat_escape)();
119 return;
120 }
121 __c = *_M_current++;
122 }
123 if (__c == '(')
124 {
125 if (_M_is_ecma() && *_M_current == '?')
126 {
127 if (++_M_current == _M_end)
128 __throw_regex_error(ecode: regex_constants::error_paren);
129
130 if (*_M_current == ':')
131 {
132 ++_M_current;
133 _M_token = _S_token_subexpr_no_group_begin;
134 }
135 else if (*_M_current == '=')
136 {
137 ++_M_current;
138 _M_token = _S_token_subexpr_lookahead_begin;
139 _M_value.assign(1, 'p');
140 }
141 else if (*_M_current == '!')
142 {
143 ++_M_current;
144 _M_token = _S_token_subexpr_lookahead_begin;
145 _M_value.assign(1, 'n');
146 }
147 else
148 __throw_regex_error(ecode: regex_constants::error_paren,
149 what: "Invalid '(?...)' zero-width assertion "
150 "in regular expression");
151 }
152 else if (_M_flags & regex_constants::nosubs)
153 _M_token = _S_token_subexpr_no_group_begin;
154 else
155 _M_token = _S_token_subexpr_begin;
156 }
157 else if (__c == ')')
158 _M_token = _S_token_subexpr_end;
159 else if (__c == '[')
160 {
161 _M_state = _S_state_in_bracket;
162 _M_at_bracket_start = true;
163 if (_M_current != _M_end && *_M_current == '^')
164 {
165 _M_token = _S_token_bracket_neg_begin;
166 ++_M_current;
167 }
168 else
169 _M_token = _S_token_bracket_begin;
170 }
171 else if (__c == '{')
172 {
173 _M_state = _S_state_in_brace;
174 _M_token = _S_token_interval_begin;
175 }
176 else if (__builtin_expect(__c == _CharT(0), false))
177 {
178 if (!_M_is_ecma())
179 __throw_regex_error(ecode: regex_constants::_S_null);
180 _M_token = _S_token_ord_char;
181 _M_value.assign(1, __c);
182 }
183 else if (__c != ']' && __c != '}')
184 {
185 auto __it = _M_token_tbl;
186 auto __narrowc = _M_ctype.narrow(__c, '\0');
187 for (; __it->first != '\0'; ++__it)
188 if (__it->first == __narrowc)
189 {
190 _M_token = __it->second;
191 return;
192 }
193 __glibcxx_assert(!"unexpected special character in regex");
194 }
195 else
196 {
197 _M_token = _S_token_ord_char;
198 _M_value.assign(1, __c);
199 }
200 }
201
202 // Differences between styles:
203 // 1) different semantics of "[]" and "[^]".
204 // 2) Escaping in bracket expr.
205 template<typename _CharT>
206 void
207 _Scanner<_CharT>::
208 _M_scan_in_bracket()
209 {
210 if (_M_current == _M_end)
211 __throw_regex_error(ecode: regex_constants::error_brack);
212
213 auto __c = *_M_current++;
214
215 if (__c == '-')
216 _M_token = _S_token_bracket_dash;
217 else if (__c == '[')
218 {
219 if (_M_current == _M_end)
220 __throw_regex_error(ecode: regex_constants::error_brack,
221 what: "Incomplete '[[' character class in "
222 "regular expression");
223
224 if (*_M_current == '.')
225 {
226 _M_token = _S_token_collsymbol;
227 _M_eat_class(*_M_current++);
228 }
229 else if (*_M_current == ':')
230 {
231 _M_token = _S_token_char_class_name;
232 _M_eat_class(*_M_current++);
233 }
234 else if (*_M_current == '=')
235 {
236 _M_token = _S_token_equiv_class_name;
237 _M_eat_class(*_M_current++);
238 }
239 else
240 {
241 _M_token = _S_token_ord_char;
242 _M_value.assign(1, __c);
243 }
244 }
245 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
246 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
247 // `.../empty_range.cc`.
248 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
249 {
250 _M_token = _S_token_bracket_end;
251 _M_state = _S_state_normal;
252 }
253 // ECMAScript and awk permits escaping in bracket.
254 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
255 (this->*_M_eat_escape)();
256 else
257 {
258 _M_token = _S_token_ord_char;
259 _M_value.assign(1, __c);
260 }
261 _M_at_bracket_start = false;
262 }
263
264 // Differences between styles:
265 // 1) "\}" in basic style.
266 template<typename _CharT>
267 void
268 _Scanner<_CharT>::
269 _M_scan_in_brace()
270 {
271 if (_M_current == _M_end)
272 __throw_regex_error(ecode: regex_constants::error_brace);
273
274 auto __c = *_M_current++;
275
276 if (_M_ctype.is(_CtypeT::digit, __c))
277 {
278 _M_token = _S_token_dup_count;
279 _M_value.assign(1, __c);
280 while (_M_current != _M_end
281 && _M_ctype.is(_CtypeT::digit, *_M_current))
282 _M_value += *_M_current++;
283 }
284 else if (__c == ',')
285 _M_token = _S_token_comma;
286 // basic use \}.
287 else if (_M_is_basic())
288 {
289 if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
290 {
291 _M_state = _S_state_normal;
292 _M_token = _S_token_interval_end;
293 ++_M_current;
294 }
295 else
296 __throw_regex_error(ecode: regex_constants::error_badbrace);
297 }
298 else if (__c == '}')
299 {
300 _M_state = _S_state_normal;
301 _M_token = _S_token_interval_end;
302 }
303 else
304 __throw_regex_error(ecode: regex_constants::error_badbrace);
305 }
306
307 template<typename _CharT>
308 void
309 _Scanner<_CharT>::
310 _M_eat_escape_ecma()
311 {
312 if (_M_current == _M_end)
313 __throw_regex_error(ecode: regex_constants::error_escape);
314
315 auto __c = *_M_current++;
316 auto __pos = _M_find_escape(c: _M_ctype.narrow(__c, '\0'));
317
318 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
319 {
320 _M_token = _S_token_ord_char;
321 _M_value.assign(1, *__pos);
322 }
323 else if (__c == 'b')
324 {
325 _M_token = _S_token_word_bound;
326 _M_value.assign(1, 'p');
327 }
328 else if (__c == 'B')
329 {
330 _M_token = _S_token_word_bound;
331 _M_value.assign(1, 'n');
332 }
333 // N3376 28.13
334 else if (__c == 'd'
335 || __c == 'D'
336 || __c == 's'
337 || __c == 'S'
338 || __c == 'w'
339 || __c == 'W')
340 {
341 _M_token = _S_token_quoted_class;
342 _M_value.assign(1, __c);
343 }
344 else if (__c == 'c')
345 {
346 if (_M_current == _M_end)
347 __throw_regex_error(ecode: regex_constants::error_escape,
348 what: "invalid '\\cX' control character in "
349 "regular expression");
350 _M_token = _S_token_ord_char;
351 _M_value.assign(1, *_M_current++);
352 }
353 else if (__c == 'x' || __c == 'u')
354 {
355 _M_value.clear();
356 const int __n = __c == 'x' ? 2 : 4;
357 for (int __i = 0; __i < __n; __i++)
358 {
359 if (_M_current == _M_end
360 || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
361 __throw_regex_error(ecode: regex_constants::error_escape,
362 what: __n == 2
363 ? "Invalid '\\xNN' control character in "
364 "regular expression"
365 : "Invalid '\\uNNNN' control character in "
366 "regular expression");
367 _M_value += *_M_current++;
368 }
369 _M_token = _S_token_hex_num;
370 }
371 // ECMAScript recognizes multi-digit back-references.
372 else if (_M_ctype.is(_CtypeT::digit, __c))
373 {
374 _M_value.assign(1, __c);
375 while (_M_current != _M_end
376 && _M_ctype.is(_CtypeT::digit, *_M_current))
377 _M_value += *_M_current++;
378 _M_token = _S_token_backref;
379 }
380 else
381 {
382 _M_token = _S_token_ord_char;
383 _M_value.assign(1, __c);
384 }
385 }
386
387 // Differences between styles:
388 // 1) Extended doesn't support backref, but basic does.
389 template<typename _CharT>
390 void
391 _Scanner<_CharT>::
392 _M_eat_escape_posix()
393 {
394 if (_M_current == _M_end)
395 __throw_regex_error(ecode: regex_constants::error_escape);
396
397 auto __c = *_M_current;
398 auto __pos = __builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
399
400 if (__pos != nullptr && *__pos != '\0')
401 {
402 _M_token = _S_token_ord_char;
403 _M_value.assign(1, __c);
404 }
405 // We MUST judge awk before handling backrefs. There's no backref in awk.
406 else if (_M_is_awk())
407 {
408 _M_eat_escape_awk();
409 return;
410 }
411 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
412 {
413 _M_token = _S_token_backref;
414 _M_value.assign(1, __c);
415 }
416 else
417 {
418#ifdef __STRICT_ANSI__
419 // POSIX says it is undefined to escape ordinary characters
420 __throw_regex_error(ecode: regex_constants::error_escape);
421#else
422 _M_token = _S_token_ord_char;
423 _M_value.assign(1, __c);
424#endif
425 }
426 ++_M_current;
427 }
428
429 template<typename _CharT>
430 void
431 _Scanner<_CharT>::
432 _M_eat_escape_awk()
433 {
434 auto __c = *_M_current++;
435 auto __pos = _M_find_escape(c: _M_ctype.narrow(__c, '\0'));
436
437 if (__pos != nullptr)
438 {
439 _M_token = _S_token_ord_char;
440 _M_value.assign(1, *__pos);
441 }
442 // \ddd for oct representation
443 else if (_M_ctype.is(_CtypeT::digit, __c)
444 && __c != '8'
445 && __c != '9')
446 {
447 _M_value.assign(1, __c);
448 for (int __i = 0;
449 __i < 2
450 && _M_current != _M_end
451 && _M_ctype.is(_CtypeT::digit, *_M_current)
452 && *_M_current != '8'
453 && *_M_current != '9';
454 __i++)
455 _M_value += *_M_current++;
456 _M_token = _S_token_oct_num;
457 return;
458 }
459 else
460 __throw_regex_error(ecode: regex_constants::error_escape);
461 }
462
463 // Eats a character class or throws an exception.
464 // __ch could be ':', '.' or '=', _M_current is the char after ']' when
465 // returning.
466 template<typename _CharT>
467 void
468 _Scanner<_CharT>::
469 _M_eat_class(char __ch)
470 {
471 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
472 _M_value += *_M_current++;
473 if (_M_current == _M_end
474 || *_M_current++ != __ch
475 || _M_current == _M_end // skip __ch
476 || *_M_current++ != ']') // skip ']'
477 {
478 __throw_regex_error(ecode: __ch == ':' ? regex_constants::error_ctype
479 : regex_constants::error_collate);
480 }
481 }
482
483#ifdef _GLIBCXX_DEBUG
484 template<typename _CharT>
485 std::ostream&
486 _Scanner<_CharT>::
487 _M_print(std::ostream& __ostr)
488 {
489 switch (_M_token)
490 {
491 case _S_token_anychar:
492 __ostr << "any-character\n";
493 break;
494 case _S_token_backref:
495 __ostr << "backref\n";
496 break;
497 case _S_token_bracket_begin:
498 __ostr << "bracket-begin\n";
499 break;
500 case _S_token_bracket_neg_begin:
501 __ostr << "bracket-neg-begin\n";
502 break;
503 case _S_token_bracket_end:
504 __ostr << "bracket-end\n";
505 break;
506 case _S_token_char_class_name:
507 __ostr << "char-class-name \"" << _M_value << "\"\n";
508 break;
509 case _S_token_closure0:
510 __ostr << "closure0\n";
511 break;
512 case _S_token_closure1:
513 __ostr << "closure1\n";
514 break;
515 case _S_token_collsymbol:
516 __ostr << "collsymbol \"" << _M_value << "\"\n";
517 break;
518 case _S_token_comma:
519 __ostr << "comma\n";
520 break;
521 case _S_token_dup_count:
522 __ostr << "dup count: " << _M_value << "\n";
523 break;
524 case _S_token_eof:
525 __ostr << "EOF\n";
526 break;
527 case _S_token_equiv_class_name:
528 __ostr << "equiv-class-name \"" << _M_value << "\"\n";
529 break;
530 case _S_token_interval_begin:
531 __ostr << "interval begin\n";
532 break;
533 case _S_token_interval_end:
534 __ostr << "interval end\n";
535 break;
536 case _S_token_line_begin:
537 __ostr << "line begin\n";
538 break;
539 case _S_token_line_end:
540 __ostr << "line end\n";
541 break;
542 case _S_token_opt:
543 __ostr << "opt\n";
544 break;
545 case _S_token_or:
546 __ostr << "or\n";
547 break;
548 case _S_token_ord_char:
549 __ostr << "ordinary character: \"" << _M_value << "\"\n";
550 break;
551 case _S_token_subexpr_begin:
552 __ostr << "subexpr begin\n";
553 break;
554 case _S_token_subexpr_no_group_begin:
555 __ostr << "no grouping subexpr begin\n";
556 break;
557 case _S_token_subexpr_lookahead_begin:
558 __ostr << "lookahead subexpr begin\n";
559 break;
560 case _S_token_subexpr_end:
561 __ostr << "subexpr end\n";
562 break;
563 case _S_token_unknown:
564 __ostr << "-- unknown token --\n";
565 break;
566 case _S_token_oct_num:
567 __ostr << "oct number " << _M_value << "\n";
568 break;
569 case _S_token_hex_num:
570 __ostr << "hex number " << _M_value << "\n";
571 break;
572 case _S_token_quoted_class:
573 __ostr << "quoted class " << "\\" << _M_value << "\n";
574 break;
575 default:
576 _GLIBCXX_DEBUG_ASSERT(false);
577 }
578 return __ostr;
579 }
580#endif
581
582} // namespace __detail
583_GLIBCXX_END_NAMESPACE_VERSION
584} // namespace
585