feisty meow concerns codebase  2.140
list_parsing.cpp
Go to the documentation of this file.
1 /*****************************************************************************\
2 * *
3 * Name : list_parsing *
4 * Author : Chris Koeritz *
5 * Author : Gary Hardley *
6 * *
7 *******************************************************************************
8 * Copyright (c) 2002-$now By Author. This program is free software; you can *
9 * redistribute it and/or modify it under the terms of the GNU General Public *
10 * License as published by the Free Software Foundation; either version 2 of *
11 * the License or (at your option) any later version. This is online at: *
12 * http://www.fsf.org/copyleft/gpl.html *
13 * Please send any updates to: fred@gruntose.com *
14 \*****************************************************************************/
15 
16 #include "list_parsing.h"
17 #include "parser_bits.h"
18 
19 #include <basis/astring.h>
20 #include <structures/set.h>
22 
23 #include <ctype.h>
24 #include <stdio.h>
25 
26 using namespace basis;
27 using namespace structures;
28 
29 namespace textual {
30 
// LOG prints "<class name>::<function name>: <message>" on stdout via printf.
// the expansion calls static_class_name() and reads an identifier called
// "func", so both have to be visible wherever the macro is used.  any earlier
// definition of LOG is discarded first.
31 #undef LOG
32 #define LOG(to_print) printf("%s::%s: %s\n", static_class_name(), func, astring(to_print).s())
33 
// empty destructor, defined out of line in this file.
34 list_parsing::~list_parsing() {} // needed since we use the class_name macro.
35 
36 // by Gary Hardley.
37 bool list_parsing::get_ids_from_string(const astring &to_parse, int_set &identifiers)
38 {
39  identifiers.clear(); // clear existing ids, if any.
40  int_array found;
41  bool ret = get_ids_from_string(to_parse, found);
42  if (!ret) return false;
43  for (int i = 0; i < found.length(); i++) identifiers.add(found[i]);
44  return true;
45 }
46 
47 // by Gary Hardley.
48 bool list_parsing::get_ids_from_string(const astring &to_parse,
49  int_array &identifiers)
50 {
51  identifiers.reset(); // clear existing ids, if any.
52  if (!to_parse) return false;
53  // if an empty string is passed, return an empty set.
54 
55  int last_id = -1;
56  int tmp_id;
57  bool done = false;
58  char last_separator = ' ';
59 
60  int index = 0;
61  while (!done && (index < to_parse.length())) {
62  tmp_id = 0;
63  bool got_digit = false;
64  while ( (to_parse[index] != ',') && (to_parse[index] != '-')
65  && (to_parse[index] != ' ') && (index < to_parse.length()) ) {
66  if (!isdigit(to_parse[index])) return false;
67  tmp_id *= 10;
68  tmp_id += int(to_parse[index++]) - 0x30;
69  got_digit = true;
70  }
71 
72  if (got_digit) {
73  if (tmp_id > MAXINT32) return false;
74 
75  if (last_id == -1) {
76  last_id = tmp_id;
77  identifiers += last_id;
78  } else {
79  // if the last separator was a dash, this is a range
80  if (last_separator == '-') {
81  if (tmp_id >= last_id) {
82  for (int i = last_id + 1; i <= tmp_id; i++)
83  identifiers += i;
84  }
85  else {
86  for (int i = tmp_id; i < last_id; i++)
87  identifiers += i;
88  }
89  last_id = 0;
90  last_separator = ' ';
91  } else {
92  last_id = tmp_id;
93  identifiers += last_id;
94  }
95  }
96  } else {
97  // did not read an address, to_parse[index] must be a non-digit.
98  if ( (to_parse[index] != ' ') && (to_parse[index] != '-')
99  && (to_parse[index] != ',') ) return false;
100  last_separator = to_parse[index++];
101  }
102  }
103  return true;
104 }
105 
106 //by chris koeritz.
107 astring list_parsing::put_ids_in_string(const int_set &ids, char separator)
108 {
109  astring to_return;
110  for (int i = 0; i < ids.length(); i++) {
111  to_return += a_sprintf("%d", ids[i]);
112  if (i < ids.length() - 1) {
113  to_return += separator;
114  to_return += " ";
115  }
116  }
117  return to_return;
118 }
119 
120 //by chris koeritz.
121 astring list_parsing::put_ids_in_string(const int_array &ids, char separator)
122 {
123  astring to_return;
124  for (int i = 0; i < ids.length(); i++) {
125  to_return += a_sprintf("%d", ids[i]);
126  if (i < ids.length() - 1) {
127  to_return += separator;
128  to_return += " ";
129  }
130  }
131  return to_return;
132 }
133 
134 // ensures that quotes inside the string "to_emit" are escaped.
135 astring list_parsing::emit_quoted_chunk(const astring &to_emit)
136 {
137  astring to_return('\0', 256); // avoid reallocations with large pre-alloc.
138  to_return = ""; // reset to get blank string but keep pre-alloc.
139  for (int i = 0; i < to_emit.length(); i++) {
140  char next_char = to_emit[i];
141  if ( (next_char == '"') || (next_char == '\\') )
142  to_return += "\\"; // add the escape before quote or backslash.
143  to_return += astring(next_char, 1);
144  }
145  return to_return;
146 }
147 
148 void list_parsing::create_csv_line(const string_table &to_csv, astring &target)
149 {
150  target = astring::empty_string();
151  for (int i = 0; i < to_csv.symbols(); i++) {
152  target += astring("\"") + emit_quoted_chunk(to_csv.name(i))
153  + "=" + emit_quoted_chunk(to_csv[i]) + "\"";
154  if (i < to_csv.symbols() - 1) target += ",";
155  }
156 }
157 
158 void list_parsing::create_csv_line(const string_array &to_csv, astring &target)
159 {
160  target = astring::empty_string();
161  for (int i = 0; i < to_csv.length(); i++) {
162  target += astring("\"") + emit_quoted_chunk(to_csv[i]) + "\"";
163  if (i < to_csv.length() - 1) target += ",";
164  }
165 }
166 
// we do handle escaped quotes for csv parsing, so check for backslash.
// and since we handle escaped quotes, we also have to handle escaping the
// backslash (otherwise a quoted item with a backslash as the last character
// cannot be handled appropriately, because it will be interpreted as an
// escaped quote instead).  no other escapes are implemented right now.
//
// the macro expects "to_parse", "i" and "accumulator" to be in scope and has
// to be expanded inside the loop that advances "i": when an escape sequence
// is recognized, the escaped character is appended to "accumulator", "i" is
// moved onto it and the loop is continued.  a backslash that is the final
// character of "to_parse" is left alone, and the character after it is never
// inspected, so we do not peek past the end of the string.
#define handle_escapes \
  if ( (to_parse[i] == '\\') && (i + 1 < to_parse.length()) ) { \
    if ( (to_parse[i + 1] == '"') || (to_parse[i + 1] == '\\') ) { \
      i++; \
      accumulator += to_parse[i]; \
      continue; /* skip normal handling in sequel. */ \
    } \
  }
180 
// how many slots get appended to the field array each time it runs short.
// the value is an arbitrary default.
const int ARRAY_PREFILL_AMOUNT = 7;

// stores "new_line" in the next slot of "fields".  expects "fields" and the
// int "storage_slot" (index of the last slot written, -1 for none) to be in
// scope.  the expansion is a bare brace block, so it may be used with or
// without a trailing semicolon.
#define ADD_LINE_TO_FIELDS(new_line) { \
  ++storage_slot;  /* step to the slot that receives this item. */ \
  /* grow in chunks so there is always spare room beyond the new slot. */ \
  if (storage_slot + 2 > fields.length()) \
    fields.insert(fields.length(), ARRAY_PREFILL_AMOUNT); \
  fields[storage_slot] = new_line; \
}
193 
194 //hmmm: parameterize what is meant by a quote. maybe comma too.
195 //by chris koeritz.
196 bool list_parsing::parse_csv_line(const astring &to_parse, string_array &fields)
197 {
198  FUNCDEF("parse_csv_line");
199  // the current field we're chowing. we puff it out to start with to
200  // avoid paying for expanding its memory later.
201  astring accumulator(' ', 256);
202  accumulator = astring::empty_string();
203 
204  // the state machine goes through these states until the entire string
205  // is consumed.
206  enum states { seeking_quote, eating_string, seeking_comma };
207  states state = seeking_quote;
208 
209  bool no_second_quote = false; // true if we started without a quote.
210  bool just_saw_comma = false; // true if seeking comma was the last state.
211 
212  int storage_slot = -1;
213 
214  for (int i = 0; i < to_parse.length(); i++) {
215  switch (state) {
216  case seeking_quote:
217  if (parser_bits::white_space(to_parse[i])) continue;
218  if (to_parse[i] == ',') {
219  // a missing quoted string counts as an empty string.
220  ADD_LINE_TO_FIELDS(astring::empty_string());
221  just_saw_comma = true;
222  continue;
223  }
224  just_saw_comma = false; // cancel that state.
225  if (to_parse[i] != '"') {
226  // short circuit the need for a quote.
227  accumulator += to_parse[i];
228  no_second_quote = true;
229  }
230  state = eating_string;
231  break;
232  case eating_string:
233  just_saw_comma = false; // no longer true.
234  if (no_second_quote && (to_parse[i] != ',') ) {
236  accumulator += to_parse[i];
237  } else if (!no_second_quote && (to_parse[i] != '"') ) {
239  accumulator += to_parse[i];
240  } else {
241  // we found the closing quote (or comma). add the string.
242  if (no_second_quote) {
243  state = seeking_quote;
244  just_saw_comma = true;
245  } else state = seeking_comma;
246  ADD_LINE_TO_FIELDS(accumulator)
247  accumulator = astring::empty_string();
248  no_second_quote = false;
249  }
250  break;
251  case seeking_comma:
252  if (parser_bits::white_space(to_parse[i])) continue;
253  if (to_parse[i] == ',') {
254  // we got what we wanted.
255  state = seeking_quote;
256  just_saw_comma = true;
257  continue;
258  }
259  // well, there was no comma. that's an error.
260  return false;
261  break;
262  default: {
263  LOG("erroneous state reached during csv parsing");
264  break;
265  }
266  }
267  }
268  if ( (state == eating_string) && (accumulator.length()) )
269  ADD_LINE_TO_FIELDS(accumulator)
270  else if (just_saw_comma)
271  ADD_LINE_TO_FIELDS(astring::empty_string())
272  if (fields.length() > storage_slot + 1)
273  fields.zap(storage_slot + 1, fields.last());
274  return true;
275 }
276 
277 } //namespace.
278 
279 
a_sprintf is a specialization of astring that provides printf style support.
Definition: astring.h:440
void reset(int number=0, const contents *initial_contents=NULL_POINTER)
Resizes this array and sets the contents from an array of contents.
Definition: array.h:349
int length() const
Returns the current reported length of the allocated C array.
Definition: array.h:115
Provides a dynamically resizable ASCII character string.
Definition: astring.h:35
static const astring & empty_string()
useful wherever empty strings are needed, e.g., function defaults.
Definition: astring.cpp:128
int length() const
Returns the current length of the string.
Definition: astring.cpp:132
A simple object that wraps a templated array of ints.
Definition: array.h:275
A simple object that wraps a templated set of ints.
Definition: set.h:156
bool add(const contents &to_add)
Adds a new element "to_add" to the set.
Definition: set.h:232
void clear()
Empties out this set.
Definition: set.h:55
An array of strings with some additional helpful methods.
Definition: string_array.h:32
Provides a symbol_table that holds strings as the content.
Definition: string_table.h:32
const basis::astring & name(int index) const
returns the name held at the "index".
Definition: symbol_table.h:272
int symbols() const
returns the number of symbols listed in the table.
Definition: symbol_table.h:241
#define MAXINT32
Maximum 32-bit integer value.
Definition: definitions.h:75
#define ADD_LINE_TO_FIELDS(new_line)
#define LOG(to_print)
#define handle_escapes
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
A dynamic container class that holds any kind of object via pointers.
Definition: amorph.h:55