feisty meow concerns codebase  2.140
variable_tokenizer.cpp
Go to the documentation of this file.
1 // Name : variable_tokenizer
2 // Author : Chris Koeritz
3 /*
4 * Copyright (c) 1997-$now By Author. This program is free software; you can *
5 * redistribute it and/or modify it under the terms of the GNU General Public *
6 * License as published by the Free Software Foundation; either version 2 of *
7 * the License or (at your option) any later version. This is online at: *
8 * http://www.fsf.org/copyleft/gpl.html *
9 * Please send any updates to: fred@gruntose.com *
10 */
11 
12 #include "variable_tokenizer.h"
13 
14 #include <basis/astring.h>
15 #include <basis/functions.h>
16 #include <structures/stack.h>
19 #include <textual/parser_bits.h>
20 
21 //#define DEBUG_VARIABLE_TOKENIZER
22  // uncomment for noisier run.
23 
24 const char *SPECIAL_VALUE = " ";
25  // special value stored for entries with assignment operators but no
26  // value contents.
27 
28 #undef LOG
29 #ifdef DEBUG_VARIABLE_TOKENIZER
30  #include <stdio.h>
31  #define LOG(to_print) printf("%s\n", astring(to_print).s());
32 #else
33  #define LOG(to_print)
34 #endif
35 
36 using namespace basis;
37 using namespace structures;
38 using namespace textual;
39 
40 namespace configuration {
41 
42 variable_tokenizer::variable_tokenizer(int max_bits)
43 : _implementation(new string_table(max_bits)),
44  _assignments(new astring("=")),
45  _separators(new astring(",")),
46  _quotes(new astring),
47  _nesting(false),
48  _comments(new astring),
49  _comment_number(1),
50  _add_spaces(false)
51 {}
52 
53 variable_tokenizer::variable_tokenizer(const astring &separator, const astring &assignment,
54  int max_bits)
55 : _implementation(new string_table(max_bits)),
56  _assignments(new astring(assignment)),
57  _separators(new astring(separator)),
58  _quotes(new astring),
59  _nesting(false),
60  _comments(new astring),
61  _comment_number(1),
62  _add_spaces(false)
63 {}
64 
65 variable_tokenizer::variable_tokenizer(const astring &separator, const astring &assignment,
66  const astring &quotes, bool nesting, int max_bits)
67 : _implementation(new string_table(max_bits)),
68  _assignments(new astring(assignment)),
69  _separators(new astring(separator)),
70  _quotes(new astring(quotes)),
71  _nesting(nesting),
72  _comments(new astring),
73  _comment_number(1),
74  _add_spaces(false)
75 {}
76 
78 : _implementation(new string_table),
79  _assignments(new astring),
80  _separators(new astring),
81  _quotes(new astring),
82  _nesting(false),
83  _comments(new astring),
84  _comment_number(1),
85  _add_spaces(false)
86 { *this = to_copy; }
87 
89 {
90  WHACK(_separators);
91  WHACK(_assignments);
92  WHACK(_implementation);
93  WHACK(_quotes);
94  WHACK(_comments);
95 }
96 
97 int variable_tokenizer::symbols() const { return _implementation->symbols(); }
98 
100 { *_comments = comments; }
101 
102 const astring &variable_tokenizer::assignments() const { return *_assignments; }
103 
104 const astring &variable_tokenizer::separators() const { return *_separators; }
105 
106 const astring &variable_tokenizer::quotes() const { return *_quotes; }
107 
108 bool variable_tokenizer::exists(const astring &name) const
109 { return !!_implementation->find(name); }
110 
111 void variable_tokenizer::reset() { _implementation->reset(); }
112 
113 const string_table &variable_tokenizer::table() const { return *_implementation; }
114 
115 string_table &variable_tokenizer::table() { return *_implementation; }
116 
118 {
119  if (this == &to_copy) return *this;
120  *_implementation = *to_copy._implementation;
121  *_separators = *to_copy._separators;
122  *_assignments = *to_copy._assignments;
123  *_quotes = *to_copy._quotes;
124  _nesting = to_copy._nesting;
125  _add_spaces = to_copy._add_spaces;
126  return *this;
127 }
128 
130 {
131  astring *found = _implementation->find(name);
132  if (!found) return "";
133 
134  // check that the contents are not just our significator of emptiness.
135  if (found->equal_to(SPECIAL_VALUE)) return "";
136  return *found;
137 }
138 
140 {
141  if (!to_check || separator(to_check) || assignment(to_check)) return false;
142  return true;
143 }
144 
145 bool variable_tokenizer::separator(char to_check) const
146 {
147  // special case allows a CR separator to be either flavor.
148  if (parser_bits::is_eol(to_check)
149  && (astring::matches(*_separators, '\n')
150  || astring::matches(*_separators, '\r')) ) return true;
151  return astring::matches(*_separators, to_check);
152 }
153 
154 bool variable_tokenizer::assignment(char to_check) const
155 { return astring::matches(*_assignments, to_check); }
156 
157 bool variable_tokenizer::quote_mark(char to_check) const
158 { return astring::matches(*_quotes, to_check); }
159 
160 bool variable_tokenizer::comment_char(char to_check) const
161 { return astring::matches(*_comments, to_check); }
162 
163 #define COOL to_tokenize.length()
164  // true if the string should continue to be parsed.
165 
166 // sets "current" to the first character in the string.
167 #define CHOP { \
168  current = to_tokenize[0]; \
169  to_tokenize.zap(0, 0); \
170 }
171 
172 bool variable_tokenizer::parse(const astring &to_tokenize_in)
173 {
174  FUNCDEF("parse");
175  astring to_tokenize(to_tokenize_in); // de-const.
176 //hmmm: do we need a copy? try scooting based on a current pos.
177 
178  astring name, value; // accumulated during the loop.
179  char current; // the most recent character from to_tokenize.
180  bool just_ate_blank_line = false;
181  // records when we handle a blank line as a comment.
182 
183  // loop over the string.
184  while (COOL) {
185  name.reset();
186  value.reset();
187 
188  // pre-processing to remove extra eols and white space in front.
189  if (is_eol_a_separator() && parser_bits::is_eol(to_tokenize[0])) {
190  CHOP;
191  // chop any white space but don't eat any non-white space coming up.
192  while (COOL && parser_bits::white_space(current)) {
193  CHOP;
194  if (!parser_bits::white_space(current)) {
195  // oops; we ate something we shouldn't have, since it will be
196  // chopped when we get in the main loop.
197  to_tokenize.insert(0, astring(current, 1));
198  }
199  }
200  }
201 
202  // chop the first character off for analysis.
203  CHOP;
204 
205  // ignore any white space until we hit a variable or other good stuff.
206  if (parser_bits::white_space_no_cr(current))
207  continue;
208 
209  // ignore eol unless they are in separator list.
210  bool handle_as_comment = false;
211  if (parser_bits::is_eol(current) && !is_eol_a_separator()) {
212  continue;
213  } else if (just_ate_blank_line && parser_bits::is_eol(current)) {
214  just_ate_blank_line = false;
215  continue;
216  } else if (parser_bits::is_eol(current) && is_eol_a_separator()) {
217 //LOG("found eol and it's a separator here");
218  handle_as_comment = true;
219  }
220 
221  if (comment_char(current) || handle_as_comment) {
222  // set our flag since we are going to eat the end of line in any case.
223  just_ate_blank_line = true;
224  // seek all text until next separator.
225  while (COOL && !separator(current)) {
226  value += current;
227  CHOP;
228  }
229  // add the item with our ongoing comment number.
230  a_sprintf name("%s%d", STRTAB_COMMENT_PREFIX, _comment_number);
231  _implementation->add(name, value);
232  _comment_number++; // go to next comment number to keep unique.
233 LOG(astring("got comment: ") + name + " -> " + value);
234  continue; // got our chunk, keep going.
235  }
236 
237  just_ate_blank_line = false; // reset our flag.
238 
239  // skip characters we can't use for a variable name.
240  if (!okay_for_variable_name(current)) continue;
241 
242  // we've found the start of a variable.
243  while (COOL && okay_for_variable_name(current)) {
244  // accumulate the variable name.
245  name += current;
246  CHOP; // get the next character.
247  }
248  if (!COOL) {
249  // we're at the end of the line, so deal with this situation.
250  if (!separator(current) && !parser_bits::white_space(current) )
251  name += current; // get the character from the end of the line.
252 LOG(astring("last add: ") + name + " -> " + value);
253  _implementation->add(name, value); // store what we built.
254  continue; // skip the rest; we're at the END of the line man.
255  }
256 
257  // skip spaces after variable name.
258  while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
259 
260  bool found_assignment = false; // assume there isn't one.
261  if (assignment(current)) {
262  // we found the assignment operator and are starting on the value.
263  CHOP; // skip the assignment operator.
264  found_assignment = true;
265  }
266 
267  // skip spaces after the assignment statement.
268  while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
269 
270  // track the quoting that we have to deal with in parsing a value.
271  stack<char> q_stack(!int(_nesting));
272  // create an unbounded stack for nesting.
273 
274  while (COOL) {
275  // check if the current character is a quote.
276  bool ignore_separator = false;
277  if (quote_mark(current)) {
278  if (!q_stack.size()) {
279  // nothing on the stack yet, so start accumulating.
280  ignore_separator = true;
281  q_stack.push(current);
282  } else if (current == q_stack.top()) {
283  // we got the end of this quoting.
284  q_stack.pop();
285  // check if we're done with any quotes. if not, we still need to
286  // ignore the separators.
287  if (q_stack.size())
288  ignore_separator = true;
289  } else {
290  // if we are using a bounded stack, it means we only support one
291  // level of quoting at a time. thus, this quote character simply
292  // falls in as a regular character. but if we're unbound, then
293  // we can nest arbitrary levels of quotes.
294  if (q_stack.kind() == stack<char>::UNBOUNDED)
295  q_stack.push(current);
296  // we have something on the stack already so we're still ignoring
297  // separators. we just don't care about this type of quote.
298  ignore_separator = true;
299  }
300  } else if (q_stack.size()) {
301  // it's not a quote but we're still trying to chow the matching
302  // quote character.
303  ignore_separator = true;
304  }
305 
306  // look for the separator.
307  if (!ignore_separator && separator(current)) {
308  break;
309  }
310 
311  // accumulate the value.
312  value += current;
313  CHOP; // get the next character.
314  }
315  // get the last character if it's relevant.
316  if (!separator(current) && !parser_bits::white_space(current) ) {
317  value += current;
318  }
319 
320  if (found_assignment && !value) {
321  // use our special case for empty values, since there was an assignment
322  // operator but no value afterwards.
323  value = SPECIAL_VALUE;
324  }
325 
326  // store the accumulated variable name and value, but only if the name
327  // is non-empty. otherwise, it's not much of a definition.
328  if (name.t()) {
329  // strip spaces at the end of the name.
330  while (parser_bits::white_space_no_cr(name[name.end()]))
331  name.zap(name.end(), name.end());
332  // strip spaces at the end of the value unless it's the special case.
333  if (!value.equal_to(SPECIAL_VALUE)) {
334  while (parser_bits::white_space(value[value.end()]))
335  value.zap(value.end(), value.end());
336  }
337 LOG(astring("normal add: ") + name + " -> " + value);
338  _implementation->add(name, value); // store what we built.
339  just_ate_blank_line = true; // flag that we don't want next EOL.
340  // reset, just in case.
341  name.reset();
342  value.reset();
343  }
344  }
345  // currently we just kind of bully through whatever string is provided and do not
346  // flag any error conditions. but people do like to know if it worked or not. they can
347  // make their own conclusions if there are not enough variables defined for their needs.
348  return true;
349 }
350 
352 {
353  for (int i = 0; i < _separators->length(); i++) {
354  char sep = _separators->get(i);
355  // correct the separator for platform when it's the end of the line.
356  if (parser_bits::is_eol(sep)) return true;
357  }
358  return false;
359 }
360 
361 void variable_tokenizer::text_form(astring &accumulator) const
362 {
363  accumulator.reset();
364  bool added_sep = false;
365  for (int i = 0; i < _implementation->symbols(); i++) {
366  added_sep = false;
367  if (!string_table::is_comment(_implementation->name(i))) {
368  // a normal assignment is here.
369  accumulator += _implementation->name(i);
370  if (_implementation->operator [](i).t()) {
371  if (_add_spaces) accumulator += " ";
372  accumulator += _assignments->get(0);
373  if (_add_spaces) accumulator += " ";
374  accumulator += _implementation->operator [](i);
375  }
376  } else {
377  // this one is a comment. just spit out the value.
378  if (_implementation->operator [](i).t())
379  accumulator += _implementation->operator [](i);
380  }
381  // correct the separator for platform when it's the end of the line.
382  if (is_eol_a_separator()) {
383  accumulator += parser_bits::platform_eol_to_chars();
384  } else {
385  added_sep = true; // record that we put a separator in there.
386  accumulator += _separators->get(0);
387  accumulator += ' ';
388  }
389  }
390  // strip the final separator and space back off, if we added them.
391  if (added_sep)
392  accumulator.zap(accumulator.end() - 1, accumulator.end());
393 }
394 
396 {
397  astring accumulator;
398  text_form(accumulator);
399  return accumulator;
400 }
401 
402 } //namespace.
403 
a_sprintf is a specialization of astring that provides printf style support.
Definition: astring.h:440
Provides a dynamically resizable ASCII character string.
Definition: astring.h:35
bool t() const
t() is a shortcut for the string being "true", as in non-empty.
Definition: astring.h:97
virtual char get(int index) const
a constant peek at the string's internals at the specified index.
Definition: astring.cpp:138
virtual void zap(int start, int end)
Deletes the characters between "start" and "end" inclusively.
Definition: astring.cpp:521
void insert(int position, const astring &to_insert)
Copies "to_insert" into "this" at the "position".
Definition: astring.cpp:892
void reset()
clears out the contents string.
Definition: astring.h:202
bool equal_to(const char *that) const
returns true if "that" is equal to this.
Definition: astring.cpp:159
int end() const
returns the index of the last (non-null) character in the string.
Definition: astring.h:86
int length() const
Returns the current length of the string.
Definition: astring.cpp:132
Manages a bank of textual definitions of variables.
variable_tokenizer & operator=(const variable_tokenizer &to_copy)
makes this variable_tokenizer identical to "to_copy".
bool okay_for_variable_name(char to_check) const
true if "to_check" is a valid variable name character.
bool comment_char(char to_check) const
true if "to_check" is a registered comment character.
bool is_eol_a_separator() const
reports whether any of the separators are an EOL character.
const structures::string_table & table() const
provides a constant peek at the string_table holding the values.
bool exists(const basis::astring &name) const
returns true if the "name" exists in the variable_tokenizer.
void reset()
clears all of the entries out.
bool assignment(char to_check) const
true if "to_check" is a valid assignment operator.
const basis::astring & separators() const
provides a peek at the separators list.
const basis::astring & assignments() const
provides a peek at the assignments list.
basis::astring find(const basis::astring &name) const
locates the value for a variable named "name" if it exists.
variable_tokenizer(int max_bits=DEFAULT_MAX_BITS)
creates a variable_tokenizer with the default characters.
bool separator(char to_check) const
true if "to_check" is a valid separator.
int symbols() const
returns the number of entries in the variable_tokenizer.
bool quote_mark(char to_check) const
true if "to_check" is a member of the quotes list.
basis::astring text_form() const
creates a new token list as a string of text.
void set_comment_chars(const basis::astring &comments)
establishes a set of characters in "comments" as the comment items.
const basis::astring & quotes() const
provides a peek at the quotes list.
bool parse(const basis::astring &to_tokenize)
parses the string using our established sentinel characters.
An abstraction that represents a stack data structure.
Definition: stack.h:30
basis::outcome push(const contents &element)
Enters a new element onto the top of the stack.
Definition: stack.h:139
int size() const
returns the size of the stack.
Definition: stack.h:127
contents & top()
Returns the top element from the stack but doesn't change the stack.
Definition: stack.h:161
stack_kinds kind() const
returns the type of stack that was constructed.
Definition: stack.h:54
basis::outcome pop()
Removes the top element on the stack.
Definition: stack.h:151
Provides a symbol_table that holds strings as the content.
Definition: string_table.h:32
const basis::astring & name(int index) const
returns the name held at the "index".
Definition: symbol_table.h:272
contents * find(const basis::astring &name) const
returns the contents held for "name" or NULL_POINTER if it wasn't found.
Definition: symbol_table.h:313
basis::outcome add(const basis::astring &name, const contents &storage)
Enters a symbol name into the table along with some contents.
Definition: symbol_table.h:383
int symbols() const
returns the number of symbols listed in the table.
Definition: symbol_table.h:241
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Definition: enhance_cpp.h:57
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
void WHACK(contents *&ptr)
deletion with clearing of the pointer.
Definition: functions.h:121
A dynamic container class that holds any kind of object via pointers.
Definition: amorph.h:55
bool is_eol(char to_check)
#define STRTAB_COMMENT_PREFIX
anything beginning with this is considered a comment.
Definition: string_table.h:52
#define CHOP
const char * SPECIAL_VALUE
#define COOL
#define LOG(to_print)