nucleus/library/configuration/variable_tokenizer.cpp

   1 // Name   : variable_tokenizer
   2 // Author : Chris Koeritz
   3 /*
   4 * Copyright (c) 1997-$now By Author.  This program is free software; you can  *
   5 * redistribute it and/or modify it under the terms of the GNU General Public  *
   6 * License as published by the Free Software Foundation; either version 2 of   *
   7 * the License or (at your option) any later version.  This is online at:      *
   8 *     http://www.fsf.org/copyleft/gpl.html                                    *
   9 * Please send any updates to: fred@gruntose.com                               *
  10 */
  11
  12 #include "variable_tokenizer.h"
  13
  14 #include <basis/astring.h>
  15 #include <basis/functions.h>
  16 #include <structures/stack.h>
  17 #include <structures/string_table.h>
  18 #include <structures/symbol_table.h>
  19 #include <textual/parser_bits.h>
  20
//#define DEBUG_VARIABLE_TOKENIZER
  // uncomment for noisier run.  when defined, the LOG macro below prints
  // each parsed item to standard output.

const char *SPECIAL_VALUE = " ";
  // special value stored for entries with assignment operators but no
  // value contents.  parse() substitutes this for an empty value that
  // followed an assignment operator, and find() converts it back into an
  // empty string for the caller.
  // NOTE(review): this is a modifiable pointer with external linkage in the
  // global namespace; confirm no other file refers to it before narrowing it.

// LOG(to_print) emits a debugging line when DEBUG_VARIABLE_TOKENIZER is
// defined and expands to nothing otherwise.  any earlier definition of LOG
// is discarded first.
// NOTE(review): the debugging flavor of the macro already ends in a
// semicolon, so "LOG(x);" expands to two statements in that build.
#undef LOG
#ifdef DEBUG_VARIABLE_TOKENIZER
  #include <stdio.h>
  #define LOG(to_print) printf("%s\n", astring(to_print).s());
#else
  #define LOG(to_print)
#endif
  35
  36 using namespace basis;
  37 using namespace structures;
  38 using namespace textual;
  39
  40 namespace configuration {
  41
  42 variable_tokenizer::variable_tokenizer(int max_bits)
  43 : _implementation(new string_table(max_bits)),
  44   _assignments(new astring("=")),
  45   _separators(new astring(",")),
  46   _quotes(new astring),
  47   _nesting(false),
  48   _comments(new astring),
  49   _comment_number(1),
  50   _add_spaces(false)
  51 {}
  52
  53 variable_tokenizer::variable_tokenizer(const astring &separator, const astring &assignment,
  54     int max_bits)
  55 : _implementation(new string_table(max_bits)),
  56   _assignments(new astring(assignment)),
  57   _separators(new astring(separator)),
  58   _quotes(new astring),
  59   _nesting(false),
  60   _comments(new astring),
  61   _comment_number(1),
  62   _add_spaces(false)
  63 {}
  64
  65 variable_tokenizer::variable_tokenizer(const astring &separator, const astring &assignment,
  66     const astring &quotes, bool nesting, int max_bits)
  67 : _implementation(new string_table(max_bits)),
  68   _assignments(new astring(assignment)),
  69   _separators(new astring(separator)),
  70   _quotes(new astring(quotes)),
  71   _nesting(nesting),
  72   _comments(new astring),
  73   _comment_number(1),
  74   _add_spaces(false)
  75 {}
  76
  77 variable_tokenizer::variable_tokenizer(const variable_tokenizer &to_copy)
  78 : _implementation(new string_table),
  79   _assignments(new astring),
  80   _separators(new astring),
  81   _quotes(new astring),
  82   _nesting(false),
  83   _comments(new astring),
  84   _comment_number(1),
  85   _add_spaces(false)
  86 { *this = to_copy; }
  87
  88 variable_tokenizer::~variable_tokenizer()
  89 {
  90   WHACK(_separators);
  91   WHACK(_assignments);
  92   WHACK(_implementation);
  93   WHACK(_quotes);
  94   WHACK(_comments);
  95 }
  96
  97 int variable_tokenizer::symbols() const { return _implementation->symbols(); }
  98
  99 void variable_tokenizer::set_comment_chars(const astring &comments)
 100 { *_comments = comments; }
 101
 102 const astring &variable_tokenizer::assignments() const { return *_assignments; }
 103
 104 const astring &variable_tokenizer::separators() const { return *_separators; }
 105
 106 const astring &variable_tokenizer::quotes() const { return *_quotes; }
 107
 108 bool variable_tokenizer::exists(const astring &name) const
 109 { return !!_implementation->find(name); }
 110
 111 void variable_tokenizer::reset() { _implementation->reset(); }
 112
 113 const string_table &variable_tokenizer::table() const { return *_implementation; }
 114
 115 string_table &variable_tokenizer::table() { return *_implementation; }
 116
 117 variable_tokenizer &variable_tokenizer::operator =(const variable_tokenizer &to_copy)
 118 {
 119   if (this == &to_copy) return *this;
 120   *_implementation = *to_copy._implementation;
 121   *_separators = *to_copy._separators;
 122   *_assignments = *to_copy._assignments;
 123   *_quotes = *to_copy._quotes;
 124   _nesting = to_copy._nesting;
 125   _add_spaces = to_copy._add_spaces;
 126   return *this;
 127 }
 128
 129 astring variable_tokenizer::find(const astring &name) const
 130 {
 131   astring *found = _implementation->find(name);
 132   if (!found) return "";
 133
 134   // check that the contents are not just our significator of emptiness.
 135   if (found->equal_to(SPECIAL_VALUE)) return "";
 136   return *found;
 137 }
 138
 139 bool variable_tokenizer::okay_for_variable_name(char to_check) const
 140 {
 141   if (!to_check || separator(to_check) || assignment(to_check)) return false;
 142   return true;
 143 }
 144
 145 bool variable_tokenizer::separator(char to_check) const
 146 {
 147   // special case allows a CR separator to be either flavor.
 148   if (parser_bits::is_eol(to_check)
 149       && (astring::matches(*_separators, '\n')
 150            || astring::matches(*_separators, '\r')) ) return true;
 151   return astring::matches(*_separators, to_check);
 152 }
 153
 154 bool variable_tokenizer::assignment(char to_check) const
 155 { return astring::matches(*_assignments, to_check); }
 156
 157 bool variable_tokenizer::quote_mark(char to_check) const
 158 { return astring::matches(*_quotes, to_check); }
 159
 160 bool variable_tokenizer::comment_char(char to_check) const
 161 { return astring::matches(*_comments, to_check); }
 162
 163 #define COOL to_tokenize.length()
 164   // true if the string should continue to be parsed.
 165
 166 // sets "current" to the first character in the string.
 167 #define CHOP { \
 168   current = to_tokenize[0]; \
 169   to_tokenize.zap(0, 0); \
 170 }
 171
 172 bool variable_tokenizer::parse(const astring &to_tokenize_in)
 173 {
 174   FUNCDEF("parse");
 175   astring to_tokenize(to_tokenize_in);  // de-const.
 176 //hmmm: do we need a copy?  try scooting based on a current pos.
 177
 178   astring name, value;  // accumulated during the loop.
 179   char current;  // the most recent character from to_tokenize.
 180   bool just_ate_blank_line = false;
 181     // records when we handle a blank line as a comment.
 182
 183   // loop over the string.
 184   while (COOL) {
 185     name.reset();
 186     value.reset();
 187
 188     // pre-processing to remove extra eols and white space in front.
 189     if (is_eol_a_separator() && parser_bits::is_eol(to_tokenize[0])) {
 190       CHOP;
 191       // chop any white space but don't eat any non-white space coming up.
 192       while (COOL && parser_bits::white_space(current)) {
 193         CHOP;
 194         if (!parser_bits::white_space(current)) {
 195           // oops; we ate something we shouldn't have, since it will be
 196           // chopped when we get in the main loop.
 197           to_tokenize.insert(0, astring(current, 1));
 198         }
 199       }
 200     }
 201
 202     // chop the first character off for analysis.
 203     CHOP;
 204
 205     // ignore any white space until we hit a variable or other good stuff.
 206     if (parser_bits::white_space_no_cr(current))
 207       continue;
 208
 209     // ignore eol unless they are in separator list.
 210     bool handle_as_comment = false;
 211     if (parser_bits::is_eol(current) && !is_eol_a_separator()) {
 212       continue;
 213     } else if (just_ate_blank_line && parser_bits::is_eol(current)) {
 214       just_ate_blank_line = false;
 215       continue;
 216     } else if (parser_bits::is_eol(current) && is_eol_a_separator()) {
 217 //LOG("found eol and it's a separator here");
 218       handle_as_comment = true;
 219     }
 220
 221     if (comment_char(current) || handle_as_comment) {
 222       // set our flag since we are going to eat the end of line in any case.
 223       just_ate_blank_line = true;
 224       // seek all text until next separator.
 225       while (COOL && !separator(current)) {
 226         value += current;
 227         CHOP;
 228       }
 229       // add the item with our ongoing comment number.
 230       a_sprintf name("%s%d", STRTAB_COMMENT_PREFIX, _comment_number);
 231       _implementation->add(name, value);
 232       _comment_number++;  // go to next comment number to keep unique.
 233 LOG(astring("got comment: ") + name + " -> " + value);
 234       continue;  // got our chunk, keep going.
 235     }
 236
 237     just_ate_blank_line = false;  // reset our flag.
 238
 239     // skip characters we can't use for a variable name.
 240     if (!okay_for_variable_name(current)) continue;
 241
 242     // we've found the start of a variable.
 243     while (COOL && okay_for_variable_name(current)) {
 244       // accumulate the variable name.
 245       name += current;
 246       CHOP;  // get the next character.
 247     }
 248     if (!COOL) {
 249       // we're at the end of the line, so deal with this situation.
 250       if (!separator(current) && !parser_bits::white_space(current) )
 251         name += current;  // get the character from the end of the line.
 252 LOG(astring("last add: ") + name + " -> " + value);
 253       _implementation->add(name, value);  // store what we built.
 254       continue;  // skip the rest; we're at the END of the line man.
 255     }
 256
 257     // skip spaces after variable name.
 258     while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
 259
 260     bool found_assignment = false;  // assume there isn't one.
 261     if (assignment(current)) {
 262       // we found the assignment operator and are starting on the value.
 263       CHOP;  // skip the assignment operator.
 264       found_assignment = true;
 265     }
 266
 267     // skip spaces after the assignment statement.
 268     while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
 269
 270     // track the quoting that we have to deal with in parsing a value.
 271     stack<char> q_stack(!int(_nesting));
 272       // create an unbounded stack for nesting.
 273
 274     while (COOL) {
 275       // check if the current character is a quote.
 276       bool ignore_separator = false;
 277       if (quote_mark(current)) {
 278         if (!q_stack.size()) {
 279           // nothing on the stack yet, so start accumulating.
 280           ignore_separator = true;
 281           q_stack.push(current);
 282         } else if (current == q_stack.top()) {
 283           // we got the end of this quoting.
 284           q_stack.pop();
 285           // check if we're done with any quotes.  if not, we still need to
 286           // ignore the separators.
 287           if (q_stack.size())
 288             ignore_separator = true;
 289         } else {
 290           // if we are using a bounded stack, it means we only support one
 291           // level of quoting at a time.  thus, this quote character simply
 292           // falls in as a regular character.  but if we're unbound, then
 293           // we can nest arbitrary levels of quotes.
 294           if (q_stack.kind() == stack<char>::UNBOUNDED)
 295             q_stack.push(current);
 296           // we have something on the stack already so we're still ignoring
 297           // separators.  we just don't care about this type of quote.
 298           ignore_separator = true;
 299         }
 300       } else if (q_stack.size()) {
 301         // it's not a quote but we're still trying to chow the matching
 302         // quote character.
 303         ignore_separator = true;
 304       }
 305
 306       // look for the separator.
 307       if (!ignore_separator && separator(current)) {
 308         break;
 309       }
 310
 311       // accumulate the value.
 312       value += current;
 313       CHOP;  // get the next character.
 314     }
 315     // get the last character if it's relevant.
 316     if (!separator(current) && !parser_bits::white_space(current) ) {
 317       value += current;
 318     }
 319
 320     if (found_assignment && !value) {
 321       // use our special case for empty values, since there was an assignment
 322       // operator but no value afterwards.
 323       value = SPECIAL_VALUE;
 324     }
 325
 326     // store the accumulated variable name and value, but only if the name
 327     // is non-empty.  otherwise, it's not much of a definition.
 328     if (name.t()) {
 329       // strip spaces at the end of the name.
 330       while (parser_bits::white_space_no_cr(name[name.end()]))
 331         name.zap(name.end(), name.end());
 332       // strip spaces at the end of the value unless it's the special case.
 333       if (!value.equal_to(SPECIAL_VALUE)) {
 334         while (parser_bits::white_space(value[value.end()]))
 335           value.zap(value.end(), value.end());
 336       }
 337 LOG(astring("normal add: ") + name + " -> " + value);
 338       _implementation->add(name, value);  // store what we built.
 339       just_ate_blank_line = true;  // flag that we don't want next EOL.
 340       // reset, just in case.
 341       name.reset();
 342       value.reset();
 343     }
 344   }
 345   // currently we just kind of bully through whatever string is provided and do not
 346   // flag any error conditions.  but people do like to know if it worked or not.  they can
 347   // make their own conclusions if there are not enough variables defined for their needs.
 348   return true;
 349 }
 350
 351 bool variable_tokenizer::is_eol_a_separator() const
 352 {
 353   for (int i = 0; i < _separators->length(); i++) {
 354     char sep = _separators->get(i);
 355     // correct the separator for platform when it's the end of the line.
 356     if (parser_bits::is_eol(sep)) return true;
 357   }
 358   return false;
 359 }
 360
 361 void variable_tokenizer::text_form(astring &accumulator) const
 362 {
 363   accumulator.reset();
 364   bool added_sep = false;
 365   for (int i = 0; i < _implementation->symbols(); i++) {
 366     added_sep = false;
 367     if (!string_table::is_comment(_implementation->name(i))) {
 368       // a normal assignment is here.
 369       accumulator += _implementation->name(i);
 370       if (_implementation->operator [](i).t()) {
 371         if (_add_spaces) accumulator += " ";
 372         accumulator += _assignments->get(0);
 373         if (_add_spaces) accumulator += " ";
 374         accumulator += _implementation->operator [](i);
 375       }
 376     } else {
 377       // this one is a comment.  just spit out the value.
 378       if (_implementation->operator [](i).t())
 379         accumulator += _implementation->operator [](i);
 380     }
 381     // correct the separator for platform when it's the end of the line.
 382     if (is_eol_a_separator()) {
 383       accumulator += parser_bits::platform_eol_to_chars();
 384     } else {
 385       added_sep = true;  // record that we put a separator in there.
 386       accumulator += _separators->get(0);
 387       accumulator += ' ';
 388     }
 389   }
 390   // strip the final separator and space back off, if we added them.
 391   if (added_sep)
 392     accumulator.zap(accumulator.end() - 1, accumulator.end());
 393 }
 394
 395 astring variable_tokenizer::text_form() const
 396 {
 397   astring accumulator;
 398   text_form(accumulator);
 399   return accumulator;
 400 }
 401
 402 } //namespace.
 403