1 | // class template regex -*- C++ -*- |
2 | |
3 | // Copyright (C) 2013-2022 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | /** |
26 | * @file bits/regex_scanner.h |
27 | * This is an internal header file, included by other library headers. |
28 | * Do not attempt to use it directly. @headername{regex} |
29 | */ |
30 | |
31 | namespace std _GLIBCXX_VISIBILITY(default) |
32 | { |
33 | _GLIBCXX_BEGIN_NAMESPACE_VERSION |
34 | |
35 | namespace __detail |
36 | { |
37 | /** |
38 | * @addtogroup regex-detail |
39 | * @{ |
40 | */ |
41 | |
42 | struct _ScannerBase |
43 | { |
44 | public: |
45 | /// Token types returned from the scanner. |
46 | enum _TokenT : unsigned |
47 | { |
48 | _S_token_anychar, |
49 | _S_token_ord_char, |
50 | _S_token_oct_num, |
51 | _S_token_hex_num, |
52 | _S_token_backref, |
53 | _S_token_subexpr_begin, |
54 | _S_token_subexpr_no_group_begin, |
55 | _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n' |
56 | _S_token_subexpr_end, |
57 | _S_token_bracket_begin, |
58 | _S_token_bracket_neg_begin, |
59 | _S_token_bracket_end, |
60 | _S_token_interval_begin, |
61 | _S_token_interval_end, |
62 | _S_token_quoted_class, |
63 | _S_token_char_class_name, |
64 | _S_token_collsymbol, |
65 | _S_token_equiv_class_name, |
66 | _S_token_opt, |
67 | _S_token_or, |
68 | _S_token_closure0, |
69 | _S_token_closure1, |
70 | _S_token_line_begin, |
71 | _S_token_line_end, |
72 | _S_token_word_bound, // neg if _M_value[0] == 'n' |
73 | _S_token_comma, |
74 | _S_token_dup_count, |
75 | _S_token_eof, |
76 | _S_token_bracket_dash, |
77 | _S_token_unknown = -1u |
78 | }; |
79 | |
80 | protected: |
81 | typedef regex_constants::syntax_option_type _FlagT; |
82 | |
83 | enum _StateT |
84 | { |
85 | _S_state_normal, |
86 | _S_state_in_brace, |
87 | _S_state_in_bracket, |
88 | }; |
89 | |
90 | protected: |
91 | _ScannerBase(_FlagT __flags) |
92 | : _M_state(_S_state_normal), |
93 | _M_flags(__flags), |
94 | _M_escape_tbl(_M_is_ecma() |
95 | ? _M_ecma_escape_tbl |
96 | : _M_awk_escape_tbl), |
97 | _M_spec_char(_M_is_ecma() |
98 | ? _M_ecma_spec_char |
99 | : _M_flags & regex_constants::basic |
100 | ? _M_basic_spec_char |
101 | : _M_flags & regex_constants::extended |
102 | ? _M_extended_spec_char |
103 | : _M_flags & regex_constants::grep |
104 | ? ".[\\*^$\n" |
105 | : _M_flags & regex_constants::egrep |
106 | ? ".[\\()*+?{|^$\n" |
107 | : _M_flags & regex_constants::awk |
108 | ? _M_extended_spec_char |
109 | : nullptr), |
110 | _M_at_bracket_start(false) |
111 | { __glibcxx_assert(_M_spec_char); } |
112 | |
113 | protected: |
114 | const char* |
115 | _M_find_escape(char __c) |
116 | { |
117 | auto __it = _M_escape_tbl; |
118 | for (; __it->first != '\0'; ++__it) |
119 | if (__it->first == __c) |
120 | return &__it->second; |
121 | return nullptr; |
122 | } |
123 | |
124 | bool |
125 | _M_is_ecma() const |
126 | { return _M_flags & regex_constants::ECMAScript; } |
127 | |
128 | bool |
129 | _M_is_basic() const |
130 | { return _M_flags & (regex_constants::basic | regex_constants::grep); } |
131 | |
132 | bool |
133 | _M_is_extended() const |
134 | { |
135 | return _M_flags & (regex_constants::extended |
136 | | regex_constants::egrep |
137 | | regex_constants::awk); |
138 | } |
139 | |
140 | bool |
141 | _M_is_grep() const |
142 | { return _M_flags & (regex_constants::grep | regex_constants::egrep); } |
143 | |
144 | bool |
145 | _M_is_awk() const |
146 | { return _M_flags & regex_constants::awk; } |
147 | |
148 | protected: |
149 | // TODO: Make them static in the next abi change. |
150 | const std::pair<char, _TokenT> _M_token_tbl[9] = |
151 | { |
152 | {'^', _S_token_line_begin}, |
153 | {'$', _S_token_line_end}, |
154 | {'.', _S_token_anychar}, |
155 | {'*', _S_token_closure0}, |
156 | {'+', _S_token_closure1}, |
157 | {'?', _S_token_opt}, |
158 | {'|', _S_token_or}, |
159 | {'\n', _S_token_or}, // grep and egrep |
160 | {'\0', _S_token_or}, |
161 | }; |
162 | const std::pair<char, char> _M_ecma_escape_tbl[8] = |
163 | { |
164 | {'0', '\0'}, |
165 | {'b', '\b'}, |
166 | {'f', '\f'}, |
167 | {'n', '\n'}, |
168 | {'r', '\r'}, |
169 | {'t', '\t'}, |
170 | {'v', '\v'}, |
171 | {'\0', '\0'}, |
172 | }; |
173 | const std::pair<char, char> _M_awk_escape_tbl[11] = |
174 | { |
175 | {'"', '"'}, |
176 | {'/', '/'}, |
177 | {'\\', '\\'}, |
178 | {'a', '\a'}, |
179 | {'b', '\b'}, |
180 | {'f', '\f'}, |
181 | {'n', '\n'}, |
182 | {'r', '\r'}, |
183 | {'t', '\t'}, |
184 | {'v', '\v'}, |
185 | {'\0', '\0'}, |
186 | }; |
187 | const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|" ; |
188 | const char* _M_basic_spec_char = ".[\\*^$" ; |
189 | const char* _M_extended_spec_char = ".[\\()*+?{|^$" ; |
190 | |
191 | _StateT _M_state; |
192 | _FlagT _M_flags; |
193 | _TokenT _M_token; |
194 | const std::pair<char, char>* _M_escape_tbl; |
195 | const char* _M_spec_char; |
196 | bool _M_at_bracket_start; |
197 | }; |
198 | |
199 | /** |
200 | * @brief Scans an input range for regex tokens. |
201 | * |
202 | * The %_Scanner class interprets the regular expression pattern in |
203 | * the input range passed to its constructor as a sequence of parse |
204 | * tokens passed to the regular expression compiler. The sequence |
205 | * of tokens provided depends on the flag settings passed to the |
206 | * constructor: different regular expression grammars will interpret |
207 | * the same input pattern in syntactically different ways. |
208 | */ |
209 | template<typename _CharT> |
210 | class _Scanner |
211 | : public _ScannerBase |
212 | { |
213 | public: |
214 | typedef std::basic_string<_CharT> _StringT; |
215 | typedef regex_constants::syntax_option_type _FlagT; |
216 | typedef const std::ctype<_CharT> _CtypeT; |
217 | |
218 | _Scanner(const _CharT* __begin, const _CharT* __end, |
219 | _FlagT __flags, std::locale __loc); |
220 | |
221 | void |
222 | _M_advance(); |
223 | |
224 | _TokenT |
225 | _M_get_token() const noexcept |
226 | { return _M_token; } |
227 | |
228 | const _StringT& |
229 | _M_get_value() const noexcept |
230 | { return _M_value; } |
231 | |
232 | #ifdef _GLIBCXX_DEBUG |
233 | std::ostream& |
234 | _M_print(std::ostream&); |
235 | #endif |
236 | |
237 | private: |
238 | void |
239 | _M_scan_normal(); |
240 | |
241 | void |
242 | _M_scan_in_bracket(); |
243 | |
244 | void |
245 | _M_scan_in_brace(); |
246 | |
247 | void |
248 | _M_eat_escape_ecma(); |
249 | |
250 | void |
251 | _M_eat_escape_posix(); |
252 | |
253 | void |
254 | _M_eat_escape_awk(); |
255 | |
256 | void |
257 | _M_eat_class(char); |
258 | |
259 | const _CharT* _M_current; |
260 | const _CharT* _M_end; |
261 | _CtypeT& _M_ctype; |
262 | _StringT _M_value; |
263 | void (_Scanner::* _M_eat_escape)(); |
264 | }; |
265 | |
266 | ///@} regex-detail |
267 | } // namespace __detail |
268 | _GLIBCXX_END_NAMESPACE_VERSION |
269 | } // namespace std |
270 | |
271 | #include <bits/regex_scanner.tcc> |
272 | |