1 // Name : variable_tokenizer
2 // Author : Chris Koeritz
4 * Copyright (c) 1997-$now By Author. This program is free software; you can *
5 * redistribute it and/or modify it under the terms of the GNU General Public *
6 * License as published by the Free Software Foundation; either version 2 of *
7 * the License or (at your option) any later version. This is online at: *
8 * http://www.fsf.org/copyleft/gpl.html *
9 * Please send any updates to: fred@gruntose.com *
12 #include "variable_tokenizer.h"
14 #include <basis/astring.h>
15 #include <basis/functions.h>
16 #include <structures/stack.h>
17 #include <structures/string_table.h>
18 #include <structures/symbol_table.h>
19 #include <textual/parser_bits.h>
21 //#define DEBUG_VARIABLE_TOKENIZER
22 // uncomment for noisier run.
24 const char *SPECIAL_VALUE = " ";
25 // special value stored for entries with assignment operators but no
// value after the operator.  parse() stores this marker in place of the
// empty value, and find() reports such entries as an empty string.
29 #ifdef DEBUG_VARIABLE_TOKENIZER
// debugging aid: converts the argument to an astring and prints it on its
// own line with printf.
// NOTE(review): the macro body already ends in a semicolon, so "LOG(x);"
// expands to two statements; keep that in mind inside unbraced if/else.
31 #define LOG(to_print) printf("%s\n", astring(to_print).s());
36 using namespace basis;
37 using namespace structures;
38 using namespace textual;
40 namespace configuration {
// builds a tokenizer that uses "=" as its assignment operator and "," as
// its separator.  "max_bits" is handed to the string_table that holds the
// parsed entries.  the comment character set starts out as a
// default-constructed astring.
42 variable_tokenizer::variable_tokenizer(int max_bits)
43 : _implementation(new string_table(max_bits)),
44 _assignments(new astring("=")),
45 _separators(new astring(",")),
48 _comments(new astring),
// builds a tokenizer whose separator and assignment character sets are
// copied from the caller's strings.  "max_bits" is handed to the
// string_table that holds the parsed entries, and the comment character set
// starts out as a default-constructed astring.
53 variable_tokenizer::variable_tokenizer(const astring &separator, const astring &assignment,
55 : _implementation(new string_table(max_bits)),
56 _assignments(new astring(assignment)),
57 _separators(new astring(separator)),
60 _comments(new astring),
65 variable_tokenizer::variable_tokenizer(const astring &separator, const astring &assignment,
66 const astring "es, bool nesting, int max_bits)
67 : _implementation(new string_table(max_bits)),
68 _assignments(new astring(assignment)),
69 _separators(new astring(separator)),
70 _quotes(new astring(quotes)),
72 _comments(new astring),
// copy constructor.  the initializers only allocate fresh, default
// constructed members; none of them reads from "to_copy".
// NOTE(review): presumably the constructor body then transfers the state
// of "to_copy" (e.g. through operator =) -- confirm.
77 variable_tokenizer::variable_tokenizer(const variable_tokenizer &to_copy)
78 : _implementation(new string_table),
79 _assignments(new astring),
80 _separators(new astring),
83 _comments(new astring),
// destructor: hands the heap-allocated string table to WHACK for cleanup.
// NOTE(review): WHACK presumably deletes the object and clears the
// pointer -- confirm against the macro's definition.
88 variable_tokenizer::~variable_tokenizer()
92 WHACK(_implementation);
97 int variable_tokenizer::symbols() const { return _implementation->symbols(); }
99 void variable_tokenizer::set_comment_chars(const astring &comments)
100 { *_comments = comments; }
102 const astring &variable_tokenizer::assignments() const { return *_assignments; }
104 const astring &variable_tokenizer::separators() const { return *_separators; }
106 const astring &variable_tokenizer::quotes() const { return *_quotes; }
108 bool variable_tokenizer::exists(const astring &name) const
109 { return !!_implementation->find(name); }
111 void variable_tokenizer::reset() { _implementation->reset(); }
113 const string_table &variable_tokenizer::table() const { return *_implementation; }
115 string_table &variable_tokenizer::table() { return *_implementation; }
117 variable_tokenizer &variable_tokenizer::operator =(const variable_tokenizer &to_copy)
119 if (this == &to_copy) return *this;
120 *_implementation = *to_copy._implementation;
121 *_separators = *to_copy._separators;
122 *_assignments = *to_copy._assignments;
123 *_quotes = *to_copy._quotes;
124 _nesting = to_copy._nesting;
125 _add_spaces = to_copy._add_spaces;
129 astring variable_tokenizer::find(const astring &name) const
131 astring *found = _implementation->find(name);
132 if (!found) return "";
134 // check that the contents are not just our significator of emptiness.
135 if (found->equal_to(SPECIAL_VALUE)) return "";
139 bool variable_tokenizer::okay_for_variable_name(char to_check) const
141 if (!to_check || separator(to_check) || assignment(to_check)) return false;
145 bool variable_tokenizer::separator(char to_check) const
147 // special case allows a CR separator to be either flavor.
148 if (parser_bits::is_eol(to_check)
149 && (astring::matches(*_separators, '\n')
150 || astring::matches(*_separators, '\r')) ) return true;
151 return astring::matches(*_separators, to_check);
154 bool variable_tokenizer::assignment(char to_check) const
155 { return astring::matches(*_assignments, to_check); }
157 bool variable_tokenizer::quote_mark(char to_check) const
158 { return astring::matches(*_quotes, to_check); }
160 bool variable_tokenizer::comment_char(char to_check) const
161 { return astring::matches(*_comments, to_check); }
163 #define COOL to_tokenize.length()
164 // true if the string should continue to be parsed.
// (the length of parse()'s local "to_tokenize" string, so it is non-zero
// exactly while characters remain to be consumed.)
166 // sets "current" to the first character in the string.
// the same character is then zapped from the front of "to_tokenize", so
// each use consumes one character of input.  both names ("current" and
// "to_tokenize") must be in scope wherever the macro is used.
168 current = to_tokenize[0]; \
169 to_tokenize.zap(0, 0); \
// parses "to_tokenize_in" into name/value entries and adds them to the
// string table.  the input is copied into a local string and consumed from
// the front, one character at a time.  for each chunk of text:
//   + leading white space is skipped, and so are end-of-line characters
//     when they are not in the separator list.
//   + a chunk that starts with a comment character, or a blank line when
//     end-of-line is a separator, is stored under a generated name built
//     from STRTAB_COMMENT_PREFIX and the running comment number.
//   + otherwise a variable name is accumulated from characters accepted by
//     okay_for_variable_name(), an optional assignment operator is skipped,
//     and the value is accumulated up to the next separator; separators
//     inside quotes are ignored.
//   + an assignment with nothing after it is stored with SPECIAL_VALUE.
//   + trailing white space is stripped from the name and the value before
//     the pair is added to the table.
// no error conditions are flagged (see the note at the end of the
// function).
172 bool variable_tokenizer::parse(const astring &to_tokenize_in)
175 astring to_tokenize(to_tokenize_in); // de-const.
176 //hmmm: do we need a copy? try scooting based on a current pos.
178 astring name, value; // accumulated during the loop.
179 char current; // the most recent character from to_tokenize.
180 bool just_ate_blank_line = false;
181 // records when we handle a blank line as a comment.
183 // loop over the string.
188 // pre-processing to remove extra eols and white space in front.
189 if (is_eol_a_separator() && parser_bits::is_eol(to_tokenize[0])) {
191 // chop any white space but don't eat any non-white space coming up.
192 while (COOL && parser_bits::white_space(current)) {
194 if (!parser_bits::white_space(current)) {
195 // oops; we ate something we shouldn't have, since it will be
196 // chopped when we get in the main loop.
197 to_tokenize.insert(0, astring(current, 1));
202 // chop the first character off for analysis.
205 // ignore any white space until we hit a variable or other good stuff.
206 if (parser_bits::white_space_no_cr(current))
209 // ignore eol unless they are in separator list.
210 bool handle_as_comment = false;
211 if (parser_bits::is_eol(current) && !is_eol_a_separator()) {
213 } else if (just_ate_blank_line && parser_bits::is_eol(current)) {
214 just_ate_blank_line = false;
216 } else if (parser_bits::is_eol(current) && is_eol_a_separator()) {
217 //LOG("found eol and it's a separator here");
218 handle_as_comment = true;
221 if (comment_char(current) || handle_as_comment) {
222 // set our flag since we are going to eat the end of line in any case.
223 just_ate_blank_line = true;
224 // seek all text until next separator.
225 while (COOL && !separator(current)) {
229 // add the item with our ongoing comment number.
// the generated name is the comment prefix followed by the counter, which
// is bumped after every comment so that each comment gets its own entry.
230 a_sprintf name("%s%d", STRTAB_COMMENT_PREFIX, _comment_number);
231 _implementation->add(name, value);
232 _comment_number++; // go to next comment number to keep unique.
233 LOG(astring("got comment: ") + name + " -> " + value);
234 continue; // got our chunk, keep going.
237 just_ate_blank_line = false; // reset our flag.
239 // skip characters we can't use for a variable name.
240 if (!okay_for_variable_name(current)) continue;
242 // we've found the start of a variable.
243 while (COOL && okay_for_variable_name(current)) {
244 // accumulate the variable name.
246 CHOP; // get the next character.
249 // we're at the end of the line, so deal with this situation.
250 if (!separator(current) && !parser_bits::white_space(current) )
251 name += current; // get the character from the end of the line.
252 LOG(astring("last add: ") + name + " -> " + value);
253 _implementation->add(name, value); // store what we built.
254 continue; // skip the rest; we're at the END of the line man.
257 // skip spaces after variable name.
258 while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
260 bool found_assignment = false; // assume there isn't one.
261 if (assignment(current)) {
262 // we found the assignment operator and are starting on the value.
263 CHOP; // skip the assignment operator.
264 found_assignment = true;
267 // skip spaces after the assignment statement.
268 while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
270 // track the quoting that we have to deal with in parsing a value.
// the constructor argument is 0 when nesting is enabled and 1 when it is
// not.  NOTE(review): presumably a limit of zero yields an UNBOUNDED
// stack and 1 yields a single-slot bounded one -- confirm against stack.h.
271 stack<char> q_stack(!int(_nesting));
272 // create an unbounded stack for nesting.
275 // check if the current character is a quote.
276 bool ignore_separator = false;
277 if (quote_mark(current)) {
278 if (!q_stack.size()) {
279 // nothing on the stack yet, so start accumulating.
280 ignore_separator = true;
281 q_stack.push(current);
282 } else if (current == q_stack.top()) {
283 // we got the end of this quoting.
285 // check if we're done with any quotes. if not, we still need to
286 // ignore the separators.
288 ignore_separator = true;
290 // if we are using a bounded stack, it means we only support one
291 // level of quoting at a time. thus, this quote character simply
292 // falls in as a regular character. but if we're unbound, then
293 // we can nest arbitrary levels of quotes.
294 if (q_stack.kind() == stack<char>::UNBOUNDED)
295 q_stack.push(current);
296 // we have something on the stack already so we're still ignoring
297 // separators. we just don't care about this type of quote.
298 ignore_separator = true;
300 } else if (q_stack.size()) {
301 // it's not a quote but we're still trying to chow the matching
303 ignore_separator = true;
306 // look for the separator.
307 if (!ignore_separator && separator(current)) {
311 // accumulate the value.
313 CHOP; // get the next character.
315 // get the last character if it's relevant.
316 if (!separator(current) && !parser_bits::white_space(current) ) {
320 if (found_assignment && !value) {
321 // use our special case for empty values, since there was an assignment
322 // operator but no value afterwards.
323 value = SPECIAL_VALUE;
326 // store the accumulated variable name and value, but only if the name
327 // is non-empty. otherwise, it's not much of a definition.
329 // strip spaces at the end of the name.
330 while (parser_bits::white_space_no_cr(name[name.end()]))
331 name.zap(name.end(), name.end());
332 // strip spaces at the end of the value unless it's the special case.
333 if (!value.equal_to(SPECIAL_VALUE)) {
334 while (parser_bits::white_space(value[value.end()]))
335 value.zap(value.end(), value.end());
337 LOG(astring("normal add: ") + name + " -> " + value);
338 _implementation->add(name, value); // store what we built.
339 just_ate_blank_line = true; // flag that we don't want next EOL.
340 // reset, just in case.
345 // currently we just kind of bully through whatever string is provided and do not
346 // flag any error conditions. but people do like to know if it worked or not. they can
347 // make their own conclusions if there are not enough variables defined for their needs.
351 bool variable_tokenizer::is_eol_a_separator() const
353 for (int i = 0; i < _separators->length(); i++) {
354 char sep = _separators->get(i);
355 // correct the separator for platform when it's the end of the line.
356 if (parser_bits::is_eol(sep)) return true;
// appends a textual rendering of every table entry to "accumulator".
//   + a normal entry is written as its name, followed by the first
//     assignment character and the value when the value's t() test passes
//     (NOTE(review): presumably t() means "non-empty" -- confirm).  the
//     assignment character is padded with a space on each side when
//     _add_spaces is set.
//   + a comment entry is written as just its value.
//   + entries are delimited with the platform's end-of-line sequence when
//     end-of-line is a separator, and with the first separator character
//     otherwise.
// the separator added after the final entry is stripped off again at the
// end.
361 void variable_tokenizer::text_form(astring &accumulator) const
364 bool added_sep = false;
365 for (int i = 0; i < _implementation->symbols(); i++) {
367 if (!string_table::is_comment(_implementation->name(i))) {
368 // a normal assignment is here.
369 accumulator += _implementation->name(i);
370 if (_implementation->operator [](i).t()) {
371 if (_add_spaces) accumulator += " ";
// only the first listed assignment character is used when writing.
372 accumulator += _assignments->get(0);
373 if (_add_spaces) accumulator += " ";
374 accumulator += _implementation->operator [](i);
377 // this one is a comment. just spit out the value.
378 if (_implementation->operator [](i).t())
379 accumulator += _implementation->operator [](i);
381 // correct the separator for platform when it's the end of the line.
382 if (is_eol_a_separator()) {
383 accumulator += parser_bits::platform_eol_to_chars();
385 added_sep = true; // record that we put a separator in there.
// only the first listed separator character is used when writing.
386 accumulator += _separators->get(0);
390 // strip the final separator and space back off, if we added them.
// this zaps the last two characters of the accumulated text.
392 accumulator.zap(accumulator.end() - 1, accumulator.end());
395 astring variable_tokenizer::text_form() const
398 text_form(accumulator);