| 1 | // class template regex -*- C++ -*- |
| 2 | |
| 3 | // Copyright (C) 2013-2024 Free Software Foundation, Inc. |
| 4 | // |
| 5 | // This file is part of the GNU ISO C++ Library. This library is free |
| 6 | // software; you can redistribute it and/or modify it under the |
| 7 | // terms of the GNU General Public License as published by the |
| 8 | // Free Software Foundation; either version 3, or (at your option) |
| 9 | // any later version. |
| 10 | |
| 11 | // This library is distributed in the hope that it will be useful, |
| 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | // GNU General Public License for more details. |
| 15 | |
| 16 | // Under Section 7 of GPL version 3, you are granted additional |
| 17 | // permissions described in the GCC Runtime Library Exception, version |
| 18 | // 3.1, as published by the Free Software Foundation. |
| 19 | |
| 20 | // You should have received a copy of the GNU General Public License and |
| 21 | // a copy of the GCC Runtime Library Exception along with this program; |
| 22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| 23 | // <http://www.gnu.org/licenses/>. |
| 24 | |
| 25 | /** |
| 26 | * @file bits/regex_executor.h |
| 27 | * This is an internal header file, included by other library headers. |
| 28 | * Do not attempt to use it directly. @headername{regex} |
| 29 | */ |
| 30 | |
| 31 | // FIXME convert comments to doxygen format. |
| 32 | |
| 33 | namespace std _GLIBCXX_VISIBILITY(default) |
| 34 | { |
| 35 | _GLIBCXX_BEGIN_NAMESPACE_VERSION |
| 36 | |
| 37 | namespace __detail |
| 38 | { |
| 39 | /** |
| 40 | * @addtogroup regex-detail |
| 41 | * @{ |
| 42 | */ |
| 43 | |
| 44 | /** |
| 45 | * @brief Takes a regex and an input string and does the matching. |
| 46 | * |
| 47 | * The %_Executor class has two modes: DFS mode and BFS mode, controlled |
| 48 | * by the template parameter %__dfs_mode. |
| 49 | */ |
| 50 | template<typename _BiIter, typename _Alloc, typename _TraitsT, |
| 51 | bool __dfs_mode> |
| 52 | class _Executor |
| 53 | { |
| 54 | using __search_mode = integral_constant<bool, __dfs_mode>; |
| 55 | using __dfs = true_type; |
| 56 | using __bfs = false_type; |
| 57 | |
| 58 | enum class _Match_mode : unsigned char { _Exact, _Prefix }; |
| 59 | |
| 60 | public: |
| 61 | typedef typename iterator_traits<_BiIter>::value_type _CharT; |
| 62 | typedef basic_regex<_CharT, _TraitsT> _RegexT; |
| 63 | typedef _GLIBCXX_STD_C::vector<sub_match<_BiIter>, _Alloc> _ResultsVec; |
| 64 | typedef regex_constants::match_flag_type _FlagT; |
| 65 | typedef typename _TraitsT::char_class_type _ClassT; |
| 66 | typedef _NFA<_TraitsT> _NFAT; |
| 67 | |
| 68 | public: |
| 69 | _Executor(_BiIter __begin, |
| 70 | _BiIter __end, |
| 71 | _ResultsVec& __results, |
| 72 | const _RegexT& __re, |
| 73 | _FlagT __flags) |
| 74 | : _M_cur_results(__results.get_allocator()), |
| 75 | _M_begin(__begin), |
| 76 | _M_end(__end), |
| 77 | _M_re(__re), |
| 78 | _M_nfa(*__re._M_automaton), |
| 79 | _M_results(__results), |
| 80 | _M_rep_count(_M_nfa.size()), |
| 81 | _M_states(_M_nfa._M_start(), _M_nfa.size()), |
| 82 | _M_flags(__flags) |
| 83 | { |
| 84 | using namespace regex_constants; |
| 85 | if (__flags & match_prev_avail) // ignore not_bol and not_bow |
| 86 | _M_flags &= ~(match_not_bol | match_not_bow); |
| 87 | } |
| 88 | |
| 89 | // Set matched when string exactly matches the pattern. |
| 90 | bool |
| 91 | _M_match() |
| 92 | { |
| 93 | _M_current = _M_begin; |
| 94 | return _M_main(match_mode: _Match_mode::_Exact); |
| 95 | } |
| 96 | |
| 97 | // Set matched when some prefix of the string matches the pattern. |
| 98 | bool |
| 99 | _M_search_from_first() |
| 100 | { |
| 101 | _M_current = _M_begin; |
| 102 | return _M_main(match_mode: _Match_mode::_Prefix); |
| 103 | } |
| 104 | |
| 105 | bool |
| 106 | _M_search(); |
| 107 | |
| 108 | private: |
| 109 | void |
| 110 | _M_rep_once_more(_Match_mode __match_mode, _StateIdT); |
| 111 | |
| 112 | void |
| 113 | _M_handle_repeat(_Match_mode, _StateIdT); |
| 114 | |
| 115 | void |
| 116 | _M_handle_subexpr_begin(_Match_mode, _StateIdT); |
| 117 | |
| 118 | void |
| 119 | _M_handle_subexpr_end(_Match_mode, _StateIdT); |
| 120 | |
| 121 | void |
| 122 | _M_handle_line_begin_assertion(_Match_mode, _StateIdT); |
| 123 | |
| 124 | void |
| 125 | _M_handle_line_end_assertion(_Match_mode, _StateIdT); |
| 126 | |
| 127 | void |
| 128 | _M_handle_word_boundary(_Match_mode, _StateIdT); |
| 129 | |
| 130 | void |
| 131 | _M_handle_subexpr_lookahead(_Match_mode, _StateIdT); |
| 132 | |
| 133 | void |
| 134 | _M_handle_match(_Match_mode, _StateIdT); |
| 135 | |
| 136 | void |
| 137 | _M_handle_backref(_Match_mode, _StateIdT); |
| 138 | |
| 139 | void |
| 140 | _M_handle_accept(_Match_mode, _StateIdT); |
| 141 | |
| 142 | void |
| 143 | _M_handle_alternative(_Match_mode, _StateIdT); |
| 144 | |
| 145 | void |
| 146 | _M_dfs(_Match_mode __match_mode, _StateIdT __start); |
| 147 | |
| 148 | bool |
| 149 | _M_main(_Match_mode __match_mode) |
| 150 | { return _M_main_dispatch(__match_mode, __search_mode{}); } |
| 151 | |
| 152 | bool |
| 153 | _M_main_dispatch(_Match_mode __match_mode, __dfs); |
| 154 | |
| 155 | bool |
| 156 | _M_main_dispatch(_Match_mode __match_mode, __bfs); |
| 157 | |
| 158 | bool |
| 159 | _M_is_word(_CharT __ch) const |
| 160 | { |
| 161 | static const _CharT __s[2] = { 'w' }; |
| 162 | return _M_re._M_automaton->_M_traits.isctype |
| 163 | (__ch, _M_re._M_automaton->_M_traits.lookup_classname(__s, __s+1)); |
| 164 | } |
| 165 | |
| 166 | bool |
| 167 | _M_at_begin() const |
| 168 | { |
| 169 | if (_M_current == _M_begin) |
| 170 | { |
| 171 | // match_not_bol means ^ does not match [_M_begin,_M_begin) |
| 172 | if (_M_flags & regex_constants::match_not_bol) |
| 173 | return false; |
| 174 | // match_prev_avail means _M_begin is not the start of the input. |
| 175 | if (_M_flags & regex_constants::match_prev_avail) |
| 176 | { |
| 177 | // For ECMAScript multiline matches, check if the previous |
| 178 | // character is a line terminator. |
| 179 | if (_M_match_multiline()) |
| 180 | return _M_is_line_terminator(c: *std::prev(_M_current)); |
| 181 | else |
| 182 | return false; |
| 183 | } |
| 184 | else // ^ matches at _M_begin |
| 185 | return true; |
| 186 | } |
| 187 | else if (_M_match_multiline()) |
| 188 | return _M_is_line_terminator(c: *std::prev(_M_current)); |
| 189 | else |
| 190 | return false; |
| 191 | } |
| 192 | |
| 193 | bool |
| 194 | _M_at_end() const |
| 195 | { |
| 196 | if (_M_current == _M_end) |
| 197 | return !(_M_flags & regex_constants::match_not_eol); |
| 198 | else if (_M_match_multiline()) |
| 199 | return _M_is_line_terminator(c: *_M_current); |
| 200 | else |
| 201 | return false; |
| 202 | } |
| 203 | |
| 204 | bool |
| 205 | _M_word_boundary() const; |
| 206 | |
| 207 | bool |
| 208 | _M_lookahead(_StateIdT __next); |
| 209 | |
| 210 | bool |
| 211 | _M_is_line_terminator(_CharT __c) const |
| 212 | { |
| 213 | const auto& __traits = _M_re._M_automaton->_M_traits; |
| 214 | const auto& __ct = use_facet<ctype<_CharT>>(__traits.getloc()); |
| 215 | const char __n{ __ct.narrow(__c, ' ') }; |
| 216 | if (__n == '\n') |
| 217 | return true; |
| 218 | if (_M_re._M_automaton->_M_options() & regex_constants::ECMAScript) |
| 219 | { |
| 220 | if (__n == '\r') |
| 221 | return true; |
| 222 | // FIXME: U+2028 (line separator) and U+2029 (paragraph separator) |
| 223 | } |
| 224 | return false; |
| 225 | } |
| 226 | |
| 227 | bool |
| 228 | _M_match_multiline() const noexcept |
| 229 | { |
| 230 | constexpr auto __m |
| 231 | = regex_constants::ECMAScript | regex_constants::__multiline; |
| 232 | return (_M_re._M_automaton->_M_options() & __m) == __m; |
| 233 | } |
| 234 | |
| 235 | // Holds additional information used in BFS-mode. |
| 236 | template<typename _SearchMode, typename _ResultsVec> |
| 237 | struct _State_info; |
| 238 | |
| 239 | template<typename _ResultsVec> |
| 240 | struct _State_info<__bfs, _ResultsVec> |
| 241 | { |
| 242 | explicit |
| 243 | _State_info(_StateIdT __start, size_t __n) |
| 244 | : _M_visited_states(new bool[__n]()), _M_start(__start) |
| 245 | { } |
| 246 | |
| 247 | ~_State_info() { delete[] _M_visited_states; } |
| 248 | |
| 249 | _State_info(const _State_info&) = delete; |
| 250 | _State_info& operator=(const _State_info&) = delete; |
| 251 | |
| 252 | bool _M_visited(_StateIdT __i) |
| 253 | { |
| 254 | if (_M_visited_states[__i]) |
| 255 | return true; |
| 256 | _M_visited_states[__i] = true; |
| 257 | return false; |
| 258 | } |
| 259 | |
| 260 | void _M_queue(_StateIdT __i, const _ResultsVec& __res) |
| 261 | { _M_match_queue.emplace_back(__i, __res); } |
| 262 | |
| 263 | // Dummy implementations for BFS mode. |
| 264 | _BiIter* _M_get_sol_pos() { return nullptr; } |
| 265 | |
| 266 | // Saves states that need to be considered for the next character. |
| 267 | _GLIBCXX_STD_C::vector<pair<_StateIdT, _ResultsVec>> _M_match_queue; |
| 268 | // Indicates which states are already visited. |
| 269 | bool* _M_visited_states; |
| 270 | // To record current solution. |
| 271 | _StateIdT _M_start; |
| 272 | }; |
| 273 | |
| 274 | template<typename _ResultsVec> |
| 275 | struct _State_info<__dfs, _ResultsVec> |
| 276 | { |
| 277 | explicit |
| 278 | _State_info(_StateIdT __start, size_t) : _M_start(__start) |
| 279 | { } |
| 280 | |
| 281 | // Dummy implementations for DFS mode. |
| 282 | bool _M_visited(_StateIdT) const { return false; } |
| 283 | void _M_queue(_StateIdT, const _ResultsVec&) { } |
| 284 | |
| 285 | _BiIter* _M_get_sol_pos() { return &_M_sol_pos; } |
| 286 | |
| 287 | // To record current solution. |
| 288 | _StateIdT _M_start; |
| 289 | _BiIter _M_sol_pos; |
| 290 | }; |
| 291 | |
| 292 | public: |
| 293 | _ResultsVec _M_cur_results; |
| 294 | _BiIter _M_current; |
| 295 | _BiIter _M_begin; |
| 296 | const _BiIter _M_end; |
| 297 | const _RegexT& _M_re; |
| 298 | const _NFAT& _M_nfa; |
| 299 | _ResultsVec& _M_results; |
| 300 | _GLIBCXX_STD_C::vector<pair<_BiIter, int>> _M_rep_count; |
| 301 | _State_info<__search_mode, _ResultsVec> _M_states; |
| 302 | _FlagT _M_flags; |
| 303 | // Do we have a solution so far? |
| 304 | bool _M_has_sol; |
| 305 | }; |
| 306 | |
| 307 | ///@} regex-detail |
| 308 | } // namespace __detail |
| 309 | _GLIBCXX_END_NAMESPACE_VERSION |
| 310 | } // namespace std |
| 311 | |
| 312 | #include <bits/regex_executor.tcc> |
| 313 | |