Home | History | Annotate | Line # | Download | only in bits
      1 // class template regex -*- C++ -*-
      2 
      3 // Copyright (C) 2013-2024 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the
      7 // terms of the GNU General Public License as published by the
      8 // Free Software Foundation; either version 3, or (at your option)
      9 // any later version.
     10 
     11 // This library is distributed in the hope that it will be useful,
     12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 // GNU General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 /**
     26  *  @file bits/regex_scanner.h
     27  *  This is an internal header file, included by other library headers.
     28  *  Do not attempt to use it directly. @headername{regex}
     29  */
     30 
     31 namespace std _GLIBCXX_VISIBILITY(default)
     32 {
     33 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     34 
     35 namespace __detail
     36 {
     37   /**
     38    * @addtogroup regex-detail
     39    * @{
     40    */
     41 
      42   struct _ScannerBase
      43   {
           // Part of the regex scanner that does not depend on the character
           // type: the token and state enumerations, the grammar flags, and the
           // lookup tables (escape table and special-character set) that the
           // constructor selects from those flags.
      44   public:
      45     /// Token types returned from the scanner.
      46     enum _TokenT : unsigned
      47     {
      48       _S_token_anychar,
      49       _S_token_ord_char,
      50       _S_token_oct_num,
      51       _S_token_hex_num,
      52       _S_token_backref,
      53       _S_token_subexpr_begin,
      54       _S_token_subexpr_no_group_begin,
      55       _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
      56       _S_token_subexpr_end,
      57       _S_token_bracket_begin,
      58       _S_token_bracket_neg_begin,
      59       _S_token_bracket_end,
      60       _S_token_interval_begin,
      61       _S_token_interval_end,
      62       _S_token_quoted_class,
      63       _S_token_char_class_name,
      64       _S_token_collsymbol,
      65       _S_token_equiv_class_name,
      66       _S_token_opt,
      67       _S_token_or,
      68       _S_token_closure0,
      69       _S_token_closure1,
      70       _S_token_line_begin,
      71       _S_token_line_end,
      72       _S_token_word_bound, // neg if _M_value[0] == 'n'
      73       _S_token_comma,
      74       _S_token_dup_count,
      75       _S_token_eof,
      76       _S_token_bracket_dash,
      77       _S_token_unknown = -1u // all bits set: the largest unsigned value
      78     };
      79 
      80   protected:
      81     typedef regex_constants::syntax_option_type _FlagT;
      82 
             /// Scanner states.  _Scanner declares one correspondingly named
             /// routine per state (_M_scan_normal, _M_scan_in_brace,
             /// _M_scan_in_bracket).
      83     enum _StateT
      84     {
      85       _S_state_normal,
      86       _S_state_in_brace,
      87       _S_state_in_bracket,
      88     };
      89 
      90   protected:
             /**
              * Starts in _S_state_normal, stores @p __flags and derives the
              * lookup tables from them:
              *  - escape table: the ECMAScript table when the ECMAScript flag
              *    is set, otherwise the awk table;
              *  - special characters: chosen by the first flag that is set,
              *    tested in the order ECMAScript, basic, extended, grep, egrep,
              *    awk.  The grep and egrep literals are the basic and extended
              *    sets with '\n' appended; awk reuses the extended set.
              * If none of those flags is set, _M_spec_char is null; the
              * __glibcxx_assert in the constructor body checks for that.
              *
              * The initializers of _M_escape_tbl and _M_spec_char call
              * _M_is_ecma() and read _M_flags.  This relies on members being
              * initialized in declaration order: the tables and _M_flags are
              * declared before _M_escape_tbl and _M_spec_char, so do not
              * reorder the data members.
              *
              * _M_token is not initialized by this constructor.
              * NOTE(review): presumably the derived scanner assigns it before
              * it is read -- confirm in regex_scanner.tcc.
              */
      91     _ScannerBase(_FlagT __flags)
      92     : _M_state(_S_state_normal),
      93     _M_flags(__flags),
      94     _M_escape_tbl(_M_is_ecma()
      95 		  ? _M_ecma_escape_tbl
      96 		  : _M_awk_escape_tbl),
      97     _M_spec_char(_M_is_ecma()
      98 		 ? _M_ecma_spec_char
      99 		 : _M_flags & regex_constants::basic
     100 		 ? _M_basic_spec_char
     101 		 : _M_flags & regex_constants::extended
     102 		 ? _M_extended_spec_char
     103 		 : _M_flags & regex_constants::grep
     104 		 ?  ".[\\*^$\n"
     105 		 : _M_flags & regex_constants::egrep
     106 		 ? ".[\\()*+?{|^$\n"
     107 		 : _M_flags & regex_constants::awk
     108 		 ? _M_extended_spec_char
     109 		 : nullptr),
     110     _M_at_bracket_start(false)
     111     { __glibcxx_assert(_M_spec_char); }
     112 
     113   protected:
             /// Looks up @p __c in the active escape table (_M_escape_tbl) by
             /// linear search.  Returns a pointer to the mapped character, or
             /// nullptr when the entry whose key is '\0' (the end marker) is
             /// reached first; a lookup of '\0' itself therefore yields nullptr.
             /// The mapped character can be '\0' as well (ECMAScript '0'), so
             /// the result must be tested as a pointer, not through its pointee.
     114     const char*
     115     _M_find_escape(char __c)
     116     {
     117       auto __it = _M_escape_tbl;
     118       for (; __it->first != '\0'; ++__it)
     119 	if (__it->first == __c)
     120 	  return &__it->second;
     121       return nullptr;
     122     }
     123 
             /// True if the ECMAScript flag is set.
     124     bool
     125     _M_is_ecma() const
     126     { return _M_flags & regex_constants::ECMAScript; }
     127 
             /// True if the basic or grep flag is set.
     128     bool
     129     _M_is_basic() const
     130     { return _M_flags & (regex_constants::basic | regex_constants::grep); }
     131 
             /// True if the extended, egrep or awk flag is set.
     132     bool
     133     _M_is_extended() const
     134     {
     135       return _M_flags & (regex_constants::extended
     136 			 | regex_constants::egrep
     137 			 | regex_constants::awk);
     138     }
     139 
             /// True if the grep or egrep flag is set.
     140     bool
     141     _M_is_grep() const
     142     { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
     143 
             /// True if the awk flag is set.
     144     bool
     145     _M_is_awk() const
     146     { return _M_flags & regex_constants::awk; }
     147 
     148   protected:
     149     // TODO: Make them static in the next abi change.
             // Until then these are non-static members with default member
             // initializers, so every scanner object holds its own copy.

             /// Maps single characters to token types.  The last entry has key
             /// '\0'; presumably it is the end marker, as in the escape tables
             /// below (the code that reads this table is not visible here).
     150     const std::pair<char, _TokenT> _M_token_tbl[9] =
     151       {
     152 	{'^', _S_token_line_begin},
     153 	{'$', _S_token_line_end},
     154 	{'.', _S_token_anychar},
     155 	{'*', _S_token_closure0},
     156 	{'+', _S_token_closure1},
     157 	{'?', _S_token_opt},
     158 	{'|', _S_token_or},
     159 	{'\n', _S_token_or}, // grep and egrep
     160 	{'\0', _S_token_or},
     161       };
             /// Escape table used when the ECMAScript flag is set: maps an
             /// escape letter to the character it denotes.  The entry with key
             /// '\0' is where _M_find_escape stops.
     162     const std::pair<char, char> _M_ecma_escape_tbl[8] =
     163       {
     164 	{'0', '\0'},
     165 	{'b', '\b'},
     166 	{'f', '\f'},
     167 	{'n', '\n'},
     168 	{'r', '\r'},
     169 	{'t', '\t'},
     170 	{'v', '\v'},
     171 	{'\0', '\0'},
     172       };
             /// Escape table used for every other grammar (see the constructor);
             /// same layout and end marker as the ECMAScript table.
     173     const std::pair<char, char> _M_awk_escape_tbl[11] =
     174       {
     175 	{'"', '"'},
     176 	{'/', '/'},
     177 	{'\\', '\\'},
     178 	{'a', '\a'},
     179 	{'b', '\b'},
     180 	{'f', '\f'},
     181 	{'n', '\n'},
     182 	{'r', '\r'},
     183 	{'t', '\t'},
     184 	{'v', '\v'},
     185 	{'\0', '\0'},
     186       };
             /// Special-character sets for the ECMAScript, basic and extended
             /// grammars.  The constructor stores one of these, or a literal
             /// with '\n' appended for grep/egrep, in _M_spec_char.
     187     const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
     188     const char* _M_basic_spec_char = ".[\\*^$";
     189     const char* _M_extended_spec_char = ".[\\()*+?{|^$";
     190 
             // Per-object state.  _M_escape_tbl and _M_spec_char point at the
             // tables selected by the constructor; _M_token is what
             // _Scanner::_M_get_token() returns.  The declaration order matters
             // for the constructor's initializers (see the note there).
     191     _StateT                       _M_state;
     192     _FlagT                        _M_flags;
     193     _TokenT                       _M_token;
     194     const std::pair<char, char>*  _M_escape_tbl;
     195     const char*                   _M_spec_char;
     196     bool                          _M_at_bracket_start;
     197   };
    198 
    199   /**
    200    * @brief Scans an input range for regex tokens.
    201    *
    202    * The %_Scanner class interprets the regular expression pattern in
    203    * the input range passed to its constructor as a sequence of parse
    204    * tokens passed to the regular expression compiler.  The sequence
    205    * of tokens provided depends on the flag settings passed to the
    206    * constructor: different regular expression grammars will interpret
    207    * the same input pattern in syntactically different ways.
    208    */
     209   template<typename _CharT>
     210     class _Scanner
     211     : public _ScannerBase
     212     {
     213     public:
     214       typedef std::basic_string<_CharT>                           _StringT;
     215       typedef regex_constants::syntax_option_type                 _FlagT;
     216       typedef const std::ctype<_CharT>                            _CtypeT;
     217 
               /// Takes the pattern as the pointer pair @p __begin / @p __end
               /// (presumably a half-open range), the grammar flags and a
               /// locale.  Only declared here; the definition is expected in
               /// the regex_scanner.tcc file included at the end of this header.
     218       _Scanner(const _CharT* __begin, const _CharT* __end,
     219 	       _FlagT __flags, std::locale __loc);
     220 
               /// Only declared here.  NOTE(review): presumably scans the next
               /// token and updates what _M_get_token() and _M_get_value()
               /// return -- confirm against the definition.
     221       void
     222       _M_advance();
     223 
               /// Returns the current token type (_M_token, inherited from
               /// _ScannerBase).
     224       _TokenT
     225       _M_get_token() const noexcept
     226       { return _M_token; }
     227 
               /// Returns a reference to the _M_value member; the reference is
               /// only usable while this scanner object is alive.
     228       const _StringT&
     229       _M_get_value() const noexcept
     230       { return _M_value; }
     231 
               // Declared only when _GLIBCXX_DEBUG is defined.
     232 #ifdef _GLIBCXX_DEBUG
     233       std::ostream&
     234       _M_print(std::ostream&);
     235 #endif
     236 
     237     private:
               // One scan routine per _StateT value (normal, in_bracket,
               // in_brace).  NOTE(review): presumably _M_advance() picks one
               // based on _M_state -- confirm against the definition.
     238       void
     239       _M_scan_normal();
     240 
     241       void
     242       _M_scan_in_bracket();
     243 
     244       void
     245       _M_scan_in_brace();
     246 
               // Escape handlers, one per grammar family going by their names.
               // Their signature matches the _M_eat_escape pointer-to-member
               // declared below.
     247       void
     248       _M_eat_escape_ecma();
     249 
     250       void
     251       _M_eat_escape_posix();
     252 
     253       void
     254       _M_eat_escape_awk();
     255 
               // Takes a narrow char regardless of _CharT.
     256       void
     257       _M_eat_class(char);
     258 
               // NOTE(review): _M_current and _M_end presumably track the read
               // position and the end of the pattern; _M_ctype presumably comes
               // from the locale passed to the constructor -- confirm there.
     259       const _CharT*                 _M_current;
     260       const _CharT*                 _M_end;
     261       _CtypeT&                      _M_ctype;
     262       _StringT                      _M_value;
               // Pointer to member function; its type matches the
               // _M_eat_escape_* handlers declared above.
     263       void (_Scanner::* _M_eat_escape)();
     264     };
    265 
    266  ///@} regex-detail
    267 } // namespace __detail
    268 _GLIBCXX_END_NAMESPACE_VERSION
    269 } // namespace std
    270 
    271 #include <bits/regex_scanner.tcc>
    272