Home | History | Annotate | Line # | Download | only in libcody
      1 // CODYlib		-*- mode:c++ -*-
      2 // Copyright (C) 2020 Nathan Sidwell, nathan (at) acm.org
      3 // License: Apache v2.0
      4 
      5 // Cody
      6 #include "internal.hh"
      7 // C++
      8 #include <algorithm>
      9 // C
     10 #include <cstring>
     11 // OS
     12 #include <unistd.h>
     13 #include <cerrno>
     14 
     15 // MessageBuffer code
     16 
     17 // Lines consist of words and end with a NEWLINE (0xa) char
     18 // Whitespace characters are TAB (0x9) and SPACE (0x20)
     19 // Words consist of non-whitespace chars separated by whitespace.
     20 // Multiple lines in one transaction are indicated by ending non-final
     21 // lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE
// (i.e. a continued line ends with the characters SPACE ; NEWLINE).
     23 // Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
     24 // Quoting with '...'
     25 // Anything outside of [-+_/%.a-zA-Z0-9] needs quoting
// Within quotes, control chars (below SPACE), DEL, \' and \\ need escaping.
     27 // Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?
     28 // Spaces separate words, UTF8 encoding for non-ascii chars
     29 
     30 namespace Cody {
     31 namespace Detail {
     32 
// The line-continuation marker: a lone ';' word immediately before the
// newline means another line of the same message follows.
static const char CONTINUE = S2C(u8";");
     34 
     35 void MessageBuffer::BeginLine ()
     36 {
     37   if (!buffer.empty ())
     38     {
     39       // Terminate the previous line with a continuation
     40       buffer.reserve (buffer.size () + 3);
     41       buffer.push_back (S2C(u8" "));
     42       buffer.push_back (CONTINUE);
     43       buffer.push_back (S2C(u8"\n"));
     44     }
     45   lastBol = buffer.size ();
     46 }
     47 
     48 // QUOTE means 'maybe quote', we search it for quote-needing chars
     49 
     50 void MessageBuffer::Append (char const *str, bool quote, size_t len)
     51 {
     52   if (len == ~size_t (0))
     53     len = strlen (str);
     54 
     55   if (!len && !quote)
     56     return;
     57 
     58   // We want to quote characters outside of [-+_A-Za-z0-9/%.], anything
     59   // that could remotely be shell-active.  UTF8 encoding for non-ascii.
     60   if (quote && len)
     61     {
     62       quote = false;
     63       // Scan looking for quote-needing characters.  We could just
     64       // append until we find one, but that's probably confusing
     65       for (size_t ix = len; ix--;)
     66 	{
     67 	  unsigned char c = (unsigned char)str[ix];
     68 	  if (!((c >= S2C(u8"a") && c <= S2C(u8"z"))
     69 		|| (c >= S2C(u8"A") && c <= S2C(u8"Z"))
     70 		|| (c >= S2C(u8"0") && c <= S2C(u8"9"))
     71 		|| c == S2C(u8"-") || c == S2C(u8"+") || c == S2C(u8"_")
     72 		|| c == S2C(u8"/") || c == S2C(u8"%") || c == S2C(u8".")))
     73 	    {
     74 	      quote = true;
     75 	      break;
     76 	    }
     77 	}
     78     }
     79 
     80   // Maximal length of appended string
     81   buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2);
     82 
     83   if (quote)
     84     buffer.push_back (S2C(u8"'"));
     85 
     86   for (auto *end = str + len; str != end;)
     87     {
     88       auto *e = end;
     89 
     90       if (quote)
     91 	// Look for next escape-needing char.  More relaxed than
     92 	// the earlier needs-quoting check.
     93 	for (e = str; e != end; ++e)
     94 	  {
     95 	    unsigned char c = (unsigned char)*e;
     96 	    if (c < S2C(u8" ") || c == 0x7f
     97 		|| c == S2C(u8"\\") || c == S2C(u8"'"))
     98 	      break;
     99 	  }
    100       buffer.insert (buffer.end (), str, e);
    101       str = e;
    102 
    103       if (str == end)
    104 	break;
    105 
    106       buffer.push_back (S2C(u8"\\"));
    107       switch (unsigned char c = (unsigned char)*str++)
    108 	{
    109 	case S2C(u8"\t"):
    110 	  c = S2C(u8"t");
    111 	  goto append;
    112 
    113 	case S2C(u8"\n"):
    114 	  c = S2C(u8"n");
    115 	  goto append;
    116 
    117 	case S2C(u8"'"):
    118 	case S2C(u8"\\"):
    119 	append:
    120 	  buffer.push_back (c);
    121 	  break;
    122 
    123 	default:
    124 	  // Full-on escape.  Use 2 lower-case hex chars
    125 	  for (unsigned shift = 8; shift;)
    126 	    {
    127 	      shift -= 4;
    128 
    129 	      char nibble = (c >> shift) & 0xf;
    130 	      nibble += S2C(u8"0");
    131 	      if (nibble > S2C(u8"9"))
    132 		nibble += S2C(u8"a") - (S2C(u8"9") + 1);
    133 	      buffer.push_back (nibble);
    134 	    }
    135 	}
    136     }
    137 
    138   if (quote)
    139     buffer.push_back (S2C(u8"'"));
    140 }
    141 
    142 void MessageBuffer::Append (char c)
    143 {
    144   buffer.push_back (c);
    145 }
    146 
    147 void MessageBuffer::AppendInteger (unsigned u)
    148 {
    149   // Sigh, even though std::to_string is C++11, we support building on
    150   // gcc 4.8, which is a C++11 compiler lacking std::to_string.  so
    151   // have something horrible.
    152   std::string v (20, 0);
    153   size_t len = snprintf (const_cast<char *> (v.data ()), v.size (), "%u", u);
    154   v.erase (len);
    155 
    156   AppendWord (v);
    157 }
    158 
    159 int MessageBuffer::Write (int fd) noexcept
    160 {
    161   size_t limit = buffer.size () - lastBol;
    162   ssize_t count = write (fd, &buffer.data ()[lastBol], limit);
    163 
    164   int err = 0;
    165   if (count < 0)
    166     err = errno;
    167   else
    168     {
    169       lastBol += count;
    170       if (size_t (count) != limit)
    171 	err = EAGAIN;
    172     }
    173 
    174   if (err != EAGAIN && err != EINTR)
    175     {
    176       // Reset for next message
    177       buffer.clear ();
    178       lastBol = 0;
    179     }
    180 
    181   return err;
    182 }
    183 
    184 int MessageBuffer::Read (int fd) noexcept
    185 {
    186   constexpr size_t blockSize = 200;
    187 
    188   size_t lwm = buffer.size ();
    189   size_t hwm = buffer.capacity ();
    190   if (hwm - lwm < blockSize / 2)
    191     hwm += blockSize;
    192   buffer.resize (hwm);
    193 
    194   auto iter = buffer.begin () + lwm;
    195   ssize_t count = read (fd, &*iter, hwm - lwm);
    196   buffer.resize (lwm + (count >= 0 ? count : 0));
    197 
    198   if (count < 0)
    199     return errno;
    200 
    201   if (!count)
    202     // End of file
    203     return -1;
    204 
    205   bool more = true;
    206   for (;;)
    207     {
    208       auto newline = std::find (iter, buffer.end (), S2C(u8"\n"));
    209       if (newline == buffer.end ())
    210 	break;
    211       more = newline != buffer.begin () && newline[-1] == CONTINUE;
    212       iter = newline + 1;
    213 
    214       if (iter == buffer.end ())
    215 	break;
    216 
    217       if (!more)
    218 	{
    219 	  // There is no continuation, but there are chars after the
    220 	  // newline.  Truncate the buffer and return an error
    221 	  buffer.resize (iter - buffer.begin ());
    222 	  return EINVAL;
    223 	}
    224     }
    225 
    226   return more ? EAGAIN : 0;
    227 }
    228 
// Lex the line starting at LASTBOL into words, stored in RESULT (which
// is cleared first).  LASTBOL is advanced past the line's newline.
//
// Returns:
//   ENOENT  IsAtEnd () is true, or the line held no words
//   EINVAL  the line is malformed; RESULT then holds a single string,
//           the raw text of the line (less any trailing " ;")
//   0       success, RESULT holds the decoded words
int MessageBuffer::Lex (std::vector<std::string> &result)
{
  result.clear ();

  if (IsAtEnd ())
    return ENOENT;

  // The scanning below relies on finding a newline before the end.
  Assert (buffer.back () == S2C(u8"\n"));

  auto iter = buffer.begin () + lastBol;

  // WORD is the word being accumulated, or null when between words.
  for (std::string *word = nullptr;;)
    {
      char c = *iter;

      ++iter;
      if (c == S2C(u8" ") || c == S2C(u8"\t"))
	{
	  // Whitespace ends the current word
	  word = nullptr;
	  continue;
	}

      if (c == S2C(u8"\n"))
	// End of line
	break;

      if (c == CONTINUE)
	{
	  // Line continuation
	  // It must be a word of its own, directly before the newline.
	  if (word || *iter != S2C(u8"\n"))
	    goto malformed;
	  ++iter;
	  break;
	}

      // Control characters and anything from DEL upwards are not
      // permitted in a line.
      if (c <= S2C(u8" ") || c >= 0x7f)
	goto malformed;

      if (!word)
	{
	  // First character of a new word
	  result.emplace_back ();
	  word = &result.back ();
	}

      if (c == S2C(u8"'"))
	{
	  // Quoted word
	  // Consume up to the closing quote, decoding escapes.
	  for (;;)
	    {
	      c = *iter;

	      if (c == S2C(u8"\n"))
		{
		  // Newline before the closing quote.  This is also
		  // where every other lexing error jumps to.
		malformed:;
		  result.clear ();
		  // Skip to the end of the line
		  iter = std::find (iter, buffer.end (), S2C(u8"\n"));
		  auto back = iter;
		  if (back[-1] == CONTINUE  && back[-2] == S2C(u8" "))
		    // Smells like a line continuation
		    back -= 2;
		  // Hand back the raw line text as the only result
		  result.emplace_back (&buffer[lastBol],
				       back - buffer.begin () - lastBol);
		  ++iter;
		  lastBol = iter - buffer.begin ();
		  return EINVAL;
		}

	      // SPACE is permitted inside quotes, other control
	      // characters and DEL upwards are not.
	      if (c < S2C(u8" ") || c >= 0x7f)
		goto malformed;

	      ++iter;
	      if (c == S2C(u8"'"))
		// Closing quote
		break;

	      if (c == S2C(u8"\\"))
		// escape
		switch (c = *iter)
		  {
		    case S2C(u8"\\"):
		    case S2C(u8"'"):
		      // The escaped character stands for itself
		      ++iter;
		      break;

		    case S2C(u8"n"):
		      c = S2C(u8"\n");
		      ++iter;
		      break;

		    case S2C(u8"_"):
		      // We used to escape SPACE as \_, so accept that
		      c = S2C(u8" ");
		      ++iter;
		      break;

		    case S2C(u8"t"):
		      c = S2C(u8"\t");
		      ++iter;
		      break;

		    default:
		      {
			// One or two lower-case hex digits giving the
			// character's value.  A non-hex character in
			// first position is an error, in second
			// position it just ends the escape.
			unsigned v = 0;
			for (unsigned nibble = 0; nibble != 2; nibble++)
			  {
			    c = *iter;
			    if (c < S2C(u8"0"))
			      {
				if (!nibble)
				  goto malformed;
				break;
			      }
			    else if (c <= S2C(u8"9"))
			      c -= S2C(u8"0");
			    else if (c < S2C(u8"a"))
			      {
				if (!nibble)
				  goto malformed;
				break;
			      }
			    else if (c <= S2C(u8"f"))
			      c -= S2C(u8"a") - 10;
			    else
			      {
				if (!nibble)
				  goto malformed;
				break;
			      }
			    ++iter;
			    v = (v << 4) | c;
			  }
			c = v;
		      }
		  }
	      word->push_back (c);
	    }
	}
      else
	// Unquoted character
	word->push_back (c);
    }
  // Step over the line just lexed
  lastBol = iter - buffer.begin ();
  if (result.empty ())
    return ENOENT;

  return 0;
}
    374 
    375 void MessageBuffer::LexedLine (std::string &str)
    376 {
    377   if (lastBol)
    378     {
    379       size_t pos = lastBol - 1;
    380       for (; pos; pos--)
    381 	if (buffer[pos-1] == S2C(u8"\n"))
    382 	  break;
    383 
    384       size_t end = lastBol - 1;
    385       if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(u8" "))
    386 	// Strip line continuation
    387 	end -= 2;
    388       str.append (&buffer[pos], end - pos);
    389     }
    390 }
    391 } // Detail
    392 } // Cody
    393