feisty meow concerns codebase  2.140
string_manipulation.cpp
Go to the documentation of this file.
1 /*****************************************************************************\
2 * *
3 * Name : string_manipulation *
4 * Author : Chris Koeritz *
5 * *
6 *******************************************************************************
7 * Copyright (c) 2000-$now By Author. This program is free software; you can *
8 * redistribute it and/or modify it under the terms of the GNU General Public *
9 * License as published by the Free Software Foundation; either version 2 of *
10 * the License or (at your option) any later version. This is online at: *
11 * http://www.fsf.org/copyleft/gpl.html *
12 * Please send any updates to: fred@gruntose.com *
13 \*****************************************************************************/
14 
15 #include "parser_bits.h"
16 #include "string_manipulation.h"
17 
18 #include <basis/byte_array.h>
19 #include <basis/functions.h>
20 #include <basis/mutex.h>
21 #include <mathematics/chaos.h>
22 
23 using namespace basis;
24 using namespace mathematics;
25 
26 namespace textual {
27 
28 //SAFE_STATIC_CONST(astring_object, string_manipulation::splitter_finding_set,
29 // ("\t\r\n -,;?!.:"))
30 const char *splitter_finding_set = "\t\r\n -,;?!.:";
31  // any of these characters make a valid place to break a line: tab,
 // carriage return, line feed, space, hyphen, comma, semicolon, question
 // mark, exclamation point, period and colon. split_lines() hands this
 // set to find_any() to locate the next candidate break position.
32 
33 astring string_manipulation::make_random_name(int min, int max)
34 {
35  chaos rando;
36  int length = rando.inclusive(min, max);
37  // pick a size for the string.
38  astring to_return;
39  for (int i = 0; i < length; i++) {
40  int chah = rando.inclusive(0, 26);
41  // use a range one larger than alphabet size.
42  char to_add = 'a' + chah;
43  if (chah == 26) to_add = '_';
44  // patch the extra value to be a separator.
45  to_return += to_add;
46  }
47  return to_return;
48 }
49 
50 astring string_manipulation::long_line(char line_item, int repeat)
51 { return astring(line_item, repeat); }
52 
53 astring string_manipulation::indentation(int spaces)
54 {
55  astring s;
56  for (int i = 0; i < spaces; i++) s += ' ';
57  return s;
58 }
59 
60 void string_manipulation::carriage_returns_to_spaces(astring &to_strip)
61 {
62  for (int j = 0; j < to_strip.length(); j++) {
63  int original_j = j; // track where we started looking.
64  if (!parser_bits::is_eol(to_strip[j])) continue;
65  // we have found at least one CR. let's see what else there is.
66  if ( (to_strip[j] == '\r') && (to_strip[j + 1] == '\n') ) {
67  // this is looking like a DOS CR. let's skip that now.
68  j++;
69  }
70  j++; // skip the one we know is a CR.
71  if (parser_bits::is_eol(to_strip[j])) {
72  // we are seeing more than one carriage return in a row. let's
73  // truncate that down to just one.
74  j++;
75  while (parser_bits::is_eol(to_strip[j]) && (j < to_strip.length()))
76  j++; // skip to next one that might not be CR.
77  // now we think we know where there's this huge line of CRs. we will
78  // turn them all into spaces except the first.
79  to_strip[original_j] = '\n';
80  for (int k = original_j + 1; k < j; k++) to_strip[k] = ' ';
81  // put the index back so we'll start looking at the non-CR char.
82  j--;
83  continue; // now skip back out to the main loop.
84  } else {
85  // we see only one carriage return, which we will drop in favor of
86  // joining those lines together. we iterate here since we might have
87  // seen a DOS CR taking up two spaces.
88  for (int k = original_j; k < j; k++) to_strip[k] = ' ';
89  }
90  }
91 
92 }
93 
// Re-flows the text in "input_in" into "output".  Each emitted line begins
// with "min_column" spaces of indentation, and a line break is inserted
// whenever the running column count would go past "max_column".  Line
// endings in the input are first normalized by carriage_returns_to_spaces();
// after that the text is consumed one chunk at a time, where a chunk ends at
// the next character from splitter_finding_set.  A chunk that could not fit
// even on a fresh line is cut short and given a trailing hyphen.
//   input_in:   the text to re-flow; it is copied, not modified.
//   output:     overwritten with the result; left empty when fewer than two
//               columns lie between the margins.
//   min_column: indentation width and starting column for every line.
//   max_column: the column limit used for the fit tests.
// NOTE(review): the embedded numbering of this listing skips lines 127, 133,
// 296 and 301, each right after a point where trailing spaces are meant to
// be stripped -- confirm the missing statements against the real source.
94 void string_manipulation::split_lines(const astring &input_in, astring &output,
95  int min_column, int max_column)
96 {
97  output = "";
98  if (max_column - min_column + 1 < 2) return; // what's the point?
99 
100  astring input = input_in; // make a copy to work on.
101  carriage_returns_to_spaces(input);
102 
103  int col = min_column;
104  astring indent_add = indentation(min_column);
105  output = indent_add; // start with the extra space.
106 
107  bool just_had_break = false;
108  // set true if we just handled a line break in the previous loop.
109  bool put_accum_before_break = false; // true if we must postpone CR.
110  astring accumulated;
111  // holds stuff to print on next go-round.
112 
113  // now we parse across the list counting up our line size and making sure
114  // we don't go over it.
115  for (int j = 0; j < input.length(); j++) {
116 
117 //char to_print = input[j];
118 //if (parser_bits::is_eol(to_print)) to_print = '_';
119 //printf("[%d: val=%d, '%c', col=%d]\n", j, to_print, to_print, col);
120 //fflush(0);
121 
122  // handle the carriage return if it was ordered.
 // this pass emits the pending line ending and the new indentation, then
 // re-runs the loop: the j-- below cancels the loop increment, so the same
 // position is examined again unless the j++ inside the inner branch has
 // moved past an EOL character.
123  if (just_had_break) {
124  if (put_accum_before_break) {
125  output += accumulated;
126  // strip off any spaces from the end of the line.
 // NOTE(review): no stripping statement is visible after the comment above
 // in this listing (number 127 is skipped) -- confirm.
128  output += parser_bits::platform_eol_to_chars();
129  accumulated = "";
130  j++; // skip the CR that we think is there.
131  }
132  // strip off any spaces from the end of the line.
 // NOTE(review): as above, number 133 is skipped in this listing.  also
 // note that when the inner branch ran, a second line ending is appended
 // here, which leaves a blank line in the output.
134  output += parser_bits::platform_eol_to_chars();
135  col = min_column;
136  output += indent_add;
137  just_had_break = false;
138  if (accumulated.length()) {
139  output += accumulated;
140  col += accumulated.length();
141  accumulated = "";
142  }
143  j--;
144  continue;
145  }
146 
147  put_accum_before_break = false;
148 
149  // skip any spaces we've got at the current position.
150  while ( (input[j] == ' ') || (input[j] == '\t') ) {
151  j++;
152  if (j >= input.length()) break; // break out of subloop if past it.
153  }
154 
155  if (j >= input.length()) break; // we're past the end.
156 
157  // handle carriage returns when they're at the current position.
158  char current_char = input[j];
159  if (parser_bits::is_eol(current_char)) {
160  just_had_break = true; // set the state.
161  put_accum_before_break = true;
162  continue;
163  }
164 
165 //hmmm: the portion below could be called a find word break function.
166 
167  bool add_dash = false; // true if we need to break a word and add hyphen.
168  bool break_line = false; // true if we need to go to the next line.
169  bool invisible = false; // true if invisible characters were seen.
170  bool end_sentence = false; // true if there was a sentence terminator.
171  bool punctuate = false; // true if there was normal punctuation.
172  bool keep_on_line = false; // true if we want add current then break line.
173  char prior_break = '\0'; // set for real below.
174  char prior_break_plus_1 = '\0'; // ditto.
 // NOTE(review): prior_break_plus_1 is assigned below but never read in
 // the code visible here.
175 
176  // find where our next normal word break is, if possible.
177  int next_break = input.find_any(splitter_finding_set, j);
178  // if we didn't find a separator, just use the end of the string.
179  if (negative(next_break))
180  next_break = input.length() - 1;
181 
182  // now we know where we're supposed to break, but we don't know if it
183  // will all fit.
184  prior_break = input[next_break];
185  // hang onto the value before we change next_break.
186  prior_break_plus_1 = input[next_break + 1];
187  // should still be safe since we're stopping before the last zero.
 // classify the separator that ends this chunk.  whitespace and line
 // endings are left out of the chunk (next_break moves back one), while
 // punctuation stays in it.
188  switch (prior_break) {
189  case '\r': case '\n':
190  break_line = true;
191  just_had_break = true;
192  put_accum_before_break = true;
193  // intentional fall-through, so no break.
194  case '\t': case ' ':
195  invisible = true;
196  next_break--; // don't include it in what's printed.
197  break;
198  case '?': case '!': case '.':
199  end_sentence = true;
200  // if we see multiples of these, we count them as just one.
201  while ( (input[next_break + 1] == '?')
202  || (input[next_break + 1] == '!')
203  || (input[next_break + 1] == '.') ) {
204  next_break++;
205  }
206  // make sure that there's a blank area after the supposed punctuation.
207  if (!parser_bits::white_space(input[next_break + 1]))
208  end_sentence = false;
209  break;
210  case ',': case ';': case ':':
211  punctuate = true;
212  // make sure that there's a blank area after the supposed punctuation.
213  if (!parser_bits::white_space(input[next_break + 1]))
214  punctuate = false;
215  break;
216  }
217 
218  // we'll need to add some spaces for certain punctuation.
 // one column is reserved after ordinary punctuation or whitespace and
 // two columns after a sentence terminator.
219  int punct_adder = 0;
220  if (punctuate || invisible) punct_adder = 1;
221  if (end_sentence) punct_adder = 2;
222 
223  // check that we're still in bounds.
224  int chars_added = next_break - j + 1;
225  if (col + chars_added + punct_adder > max_column) {
226  // we need to break before the next breakable character.
227  break_line = true;
228  just_had_break = true;
229  if (col + chars_added <= max_column) {
230  // it will fit without the punctuation spaces, which is fine since
231  // it should be the end of the line.
232  invisible = false;
233  punctuate = false;
234  end_sentence = false;
235  punct_adder = 0;
236  keep_on_line = true;
237  } else if (min_column + chars_added > max_column) {
238  // this word won't ever fit unless we break it.
239  int chars_left = max_column - col + 1;
240  // remember to take out room for the dash also.
241  if (chars_left < 2) {
 // no room for even one character plus the dash on this line.  the
 // just_had_break flag set above makes the next pass start a new line,
 // and the j-- cancels the loop increment so this chunk is retried.
242  j--; // stay where we are.
243  continue;
244  } else {
245  next_break = j + chars_left - 2;
246  chars_added = next_break - j + 1;
247  if (next_break >= input.length())
248  next_break = input.length() - 1;
249  else if (next_break < j)
250  next_break = j;
251  add_dash = true;
252  }
253  }
254  }
255 
256  astring adding_chunk = input.substring(j, next_break);
257  // this is what we've decided the next word chunk to be added will be.
258  // we still haven't completely decided where it goes.
259 
260  if (break_line) {
261  col = min_column;
262  if (add_dash || keep_on_line) {
263  // include the previous stuff on the same line.
264  output += adding_chunk;
265  if (add_dash) output += "-";
266  j = next_break;
267  continue; // done with this case.
268  }
269 
270  // don't include the previous stuff; make it go to the next line.
 // NOTE(review): both branches below append the same literal in this
 // listing although punct_adder reserves two columns after a sentence
 // terminator; the listing may have collapsed whitespace inside the
 // literal -- confirm against the real source.
271  accumulated = adding_chunk;
272  if (punctuate || invisible) {
273  accumulated += " ";
274  } else if (end_sentence) {
275  accumulated += " ";
276  }
277  j = next_break;
278  continue;
279  }
280 
281  // add the line normally since it should fit.
282  output += adding_chunk;
283  col += chars_added + punct_adder; // add the characters added.
284  j = next_break;
285  just_had_break = false; // reset the state.
286 
287  // handle when we processed an invisible or punctuation character.
 // NOTE(review): the two literals below are identical in this listing;
 // same whitespace caveat as for the accumulated text above.
288  if (punctuate || invisible) {
289  output += " ";
290  } else if (end_sentence) {
291  output += " ";
292  }
293  }
294  // make sure we handle any leftovers.
 // text postponed to the next line but never flushed by the loop is
 // written out here on a line of its own.
295  if (accumulated.length()) {
297  output += parser_bits::platform_eol_to_chars();
298  output += indent_add;
299  output += accumulated;
300  }
 // the result always ends with a platform line ending.
302  output += parser_bits::platform_eol_to_chars();
303 }
304 
305 char string_manipulation::hex_to_char(abyte to_convert)
306 {
307  if (to_convert <= 9) return char('0' + to_convert);
308  else if ( (to_convert >= 10) && (to_convert <= 15) )
309  return char('A' - 10 + to_convert);
310  else return '?';
311 }
312 
313 abyte string_manipulation::char_to_hex(char to_convert)
314 {
315  if ( (to_convert >= '0') && (to_convert <= '9') )
316  return char(to_convert - '0');
317  else if ( (to_convert >= 'a') && (to_convert <= 'f') )
318  return char(to_convert - 'a' + 10);
319  else if ( (to_convert >= 'A') && (to_convert <= 'F') )
320  return char(to_convert - 'A' + 10);
321  else return 0;
322 }
323 
324 byte_array string_manipulation::string_to_hex(const astring &to_convert)
325 {
326  byte_array to_return(0, NULL_POINTER);
327  for (int i = 0; i < to_convert.length() / 2; i++) {
328  int str_index = i * 2;
329  abyte first_byte = char_to_hex(to_convert.get(str_index));
330  abyte second_byte = char_to_hex(to_convert.get(str_index + 1));
331  abyte to_stuff = abyte(first_byte * 16 + second_byte);
332  to_return.concatenate(to_stuff);
333  }
334  return to_return;
335 }
336 
337 astring string_manipulation::hex_to_string(const byte_array &to_convert)
338 {
339  astring to_return;
340  for (int i = 0; i < to_convert.length() * 2; i += 2) {
341  int str_index = i / 2;
342  char first_char = hex_to_char(char(to_convert.get(str_index) / 16));
343  char second_char = hex_to_char(char(to_convert.get(str_index) % 16));
344  to_return += astring(first_char, 1);
345  to_return += astring(second_char, 1);
346  }
347  return to_return;
348 }
349 
350 } //namespace.
351 
array & concatenate(const array &to_concatenate)
Appends the array "to_concatenate" onto "this" and returns "this".
Definition: array.h:379
const contents & get(int index) const
Accesses individual objects stored in "this" at the "index" position.
Definition: array.h:372
int length() const
Returns the current reported length of the allocated C array.
Definition: array.h:115
Provides a dynamically resizable ASCII character string.
Definition: astring.h:35
virtual char get(int index) const
a constant peek at the string's internals at the specified index.
Definition: astring.cpp:138
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
Definition: astring.cpp:865
void strip_spaces(how_to_strip way=FROM_BOTH_SIDES)
removes excess space characters from string's beginning, end or both.
Definition: astring.h:325
int find_any(const char *to_find, int position=0, bool reverse=false) const
searches for any of the characters in "to_find".
Definition: astring.cpp:577
int length() const
Returns the current length of the string.
Definition: astring.cpp:132
A very common template for a dynamic array of bytes.
Definition: byte_array.h:36
a platform-independent way to acquire random numbers in a specific range.
Definition: chaos.h:51
int inclusive(int low, int high) const
< Returns a pseudo-random number r, such that "low" <= r <= "high".
Definition: chaos.h:88
#define NULL_POINTER
The value representing a pointer to nothing.
Definition: definitions.h:32
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
unsigned char abyte
A fairly important unit which is seldom defined...
Definition: definitions.h:51
bool negative(const type &a)
negative returns true if "a" is less than zero.
Definition: functions.h:43
An extension to floating point primitives providing approximate equality.
Definition: averager.h:21
None split_lines(str unsplit_line)
const char * splitter_finding_set
bool is_eol(char to_check)
chaos rando