feisty meow concerns codebase 2.140
string_manipulation.cpp
Go to the documentation of this file.
1/*****************************************************************************\
2* *
3* Name : string_manipulation *
4* Author : Chris Koeritz *
5* *
6*******************************************************************************
7* Copyright (c) 2000-$now By Author. This program is free software; you can *
8* redistribute it and/or modify it under the terms of the GNU General Public *
9* License as published by the Free Software Foundation; either version 2 of *
10* the License or (at your option) any later version. This is online at: *
11* http://www.fsf.org/copyleft/gpl.html *
12* Please send any updates to: fred@gruntose.com *
13\*****************************************************************************/
14
15#include "parser_bits.h"
16#include "string_manipulation.h"
17
18#include <basis/byte_array.h>
19#include <basis/functions.h>
20#include <basis/mutex.h>
21#include <mathematics/chaos.h>
22
23using namespace basis;
24using namespace mathematics;
25
26namespace textual {
27
28//SAFE_STATIC_CONST(astring_object, string_manipulation::splitter_finding_set,
29// ("\t\r\n -,;?!.:"))
// Set of characters that split_lines searches for (through find_any) when
// looking for the next place a line may be broken: tab, carriage return,
// line feed, space, and the punctuation marks - , ; ? ! . :
30const char *splitter_finding_set = "\t\r\n -,;?!.:";
31 // any of these characters make a valid place to break a line.
32
// Body of string_manipulation::make_random_name; the signature line is not
// present in this listing.  Builds a random name whose length is drawn from
// the inclusive range [min, max] and whose characters are lowercase letters
// or underscores.
// NOTE(review): rando is not declared in the visible lines; presumably it is
// a chaos random number source defined just above this body -- confirm.
34{
36 int length = rando.inclusive(min, max);
37 // pick a size for the string.
38 astring to_return;
39 for (int i = 0; i < length; i++) {
40 int chah = rando.inclusive(0, 26);
41 // use a range one larger than alphabet size.
// values 0 through 25 select the letters a through z by offset from a.
42 char to_add = 'a' + chah;
// the 27th value (26) has no letter, so it becomes an underscore.
43 if (chah == 26) to_add = '_';
44 // patch the extra value to be a separator.
45 to_return += to_add;
46 }
47 return to_return;
48}
49
50astring string_manipulation::long_line(char line_item, int repeat)
51{ return astring(line_item, repeat); }
52
// Body of string_manipulation::indentation; the signature line is not
// present in this listing.  Returns a string holding one blank per count in
// spaces.  When spaces is zero or negative the loop never runs and the
// default-constructed string is returned unchanged.
54{
55 astring s;
// append one blank at a time until the requested width is reached.
56 for (int i = 0; i < spaces; i++) s += ' ';
57 return s;
58}
59
// Body of string_manipulation::carriage_returns_to_spaces; the signature
// line is not present in this listing.  Rewrites to_strip in place:
//   - a single end-of-line (one character, or a CR LF pair) is overwritten
//     with blanks so the two lines are joined;
//   - a run of several end-of-lines is reduced to one line feed followed by
//     blanks, so a paragraph gap survives as a single break.
// The length of the string is never changed; characters are only replaced.
61{
62 for (int j = 0; j < to_strip.length(); j++) {
63 int original_j = j; // track where we started looking.
64 if (!parser_bits::is_eol(to_strip[j])) continue;
65 // we have found at least one CR. let's see what else there is.
// NOTE(review): j + 1 can equal length() when the CR is the last character;
// this relies on that index being readable -- confirm against astring.
66 if ( (to_strip[j] == '\r') && (to_strip[j + 1] == '\n') ) {
67 // this is looking like a DOS CR. let's skip that now.
68 j++;
69 }
70 j++; // skip the one we know is a CR.
71 if (parser_bits::is_eol(to_strip[j])) {
72 // we are seeing more than one carriage return in a row. let's
73 // truncate that down to just one.
74 j++;
// NOTE(review): the character at j is examined before j is compared with
// length(); swapping the two operands would test the bound first.
75 while (parser_bits::is_eol(to_strip[j]) && (j < to_strip.length()))
76 j++; // skip to next one that might not be CR.
77 // now we think we know where there's this huge line of CRs. we will
78 // turn them all into spaces except the first.
79 to_strip[original_j] = '\n';
80 for (int k = original_j + 1; k < j; k++) to_strip[k] = ' ';
81 // put the index back so we'll start looking at the non-CR char.
// the decrement cancels the increment done by the for statement.
82 j--;
83 continue; // now skip back out to the main loop.
84 } else {
85 // we see only one carriage return, which we will drop in favor of
86 // joining those lines together. we iterate here since we might have
87 // seen a DOS CR taking up two spaces.
88 for (int k = original_j; k < j; k++) to_strip[k] = ' ';
89 }
90 }
91
92}
93
// Re-flows the text in input_in into output so that each line starts at
// min_column (as indentation) and does not run past max_column.
//
// Parameters:
//   input_in    text to format; it is copied and the copy is scanned.
//   output      receives the formatted text; it is cleared first.
//   min_column  left margin; each output line begins with this many blanks.
//   max_column  right margin used when deciding whether a chunk still fits.
//
// Outline of the visible logic:
//   - returns with an empty output if the usable width is under 2 columns.
//   - walks the input, skipping blanks and tabs, and finds the next
//     character from splitter_finding_set to delimit a chunk.
//   - counts one extra column after ordinary punctuation or white space and
//     two after a sentence terminator that is followed by white space.
//   - when a chunk does not fit, it is either kept on the line without its
//     trailing blanks, carried over in accumulated for the next line, or,
//     if it could never fit between the margins, cut short and given a dash.
//   - just_had_break and put_accum_before_break carry the pending line break
//     into the next loop pass, where the new line is started.
//
// NOTE(review): the line numbers embedded in this listing skip 101, 127-128,
// 133-134, 296-297 and 301-302, so some statements are not visible here;
// presumably these include the code that emits the line ending and the code
// that strips trailing blanks -- verify against the real source file.
94void string_manipulation::split_lines(const astring &input_in, astring &output,
95 int min_column, int max_column)
96{
97 output = "";
98 if (max_column - min_column + 1 < 2) return; // what's the point?
99
100 astring input = input_in; // make a copy to work on.
102
// col tracks the column the next chunk would start at on the current line.
103 int col = min_column;
104 astring indent_add = indentation(min_column);
105 output = indent_add; // start with the extra space.
106
107 bool just_had_break = false;
108 // set true if we just handled a line break in the previous loop.
109 bool put_accum_before_break = false; // true if we must postpone CR.
110 astring accumulated;
111 // holds stuff to print on next go-round.
112
113 // now we parse across the list counting up our line size and making sure
114 // we don't go over it.
115 for (int j = 0; j < input.length(); j++) {
116
117//char to_print = input[j];
118//if (parser_bits::is_eol(to_print)) to_print = '_';
119//printf("[%d: val=%d, '%c', col=%d]\n", j, to_print, to_print, col);
120//fflush(0);
121
122 // handle the carriage return if it was ordered.
123 if (just_had_break) {
124 if (put_accum_before_break) {
125 output += accumulated;
126 // strip off any spaces from the end of the line.
129 accumulated = "";
130 j++; // skip the CR that we think is there.
131 }
132 // strip off any spaces from the end of the line.
// start the new line: reset the column and lay down the left margin.
135 col = min_column;
136 output += indent_add;
137 just_had_break = false;
// text carried over from the previous line opens the new one.
138 if (accumulated.length()) {
139 output += accumulated;
140 col += accumulated.length();
141 accumulated = "";
142 }
// undo the increment of the for statement so the same position is rescanned.
143 j--;
144 continue;
145 }
146
147 put_accum_before_break = false;
148
149 // skip any spaces we've got at the current position.
150 while ( (input[j] == ' ') || (input[j] == '\t') ) {
151 j++;
152 if (j >= input.length()) break; // break out of subloop if past it.
153 }
154
155 if (j >= input.length()) break; // we're past the end.
156
157 // handle carriage returns when they're at the current position.
158 char current_char = input[j];
159 if (parser_bits::is_eol(current_char)) {
160 just_had_break = true; // set the state.
161 put_accum_before_break = true;
162 continue;
163 }
164
165//hmmm: the portion below could be called a find word break function.
166
167 bool add_dash = false; // true if we need to break a word and add hyphen.
168 bool break_line = false; // true if we need to go to the next line.
169 bool invisible = false; // true if invisible characters were seen.
170 bool end_sentence = false; // true if there was a sentence terminator.
171 bool punctuate = false; // true if there was normal punctuation.
172 bool keep_on_line = false; // true if we want add current then break line.
173 char prior_break = '\0'; // set for real below.
174 char prior_break_plus_1 = '\0'; // ditto.
175
176 // find where our next normal word break is, if possible.
177 int next_break = input.find_any(splitter_finding_set, j);
178 // if we didn't find a separator, just use the end of the string.
179 if (negative(next_break))
180 next_break = input.length() - 1;
181
182 // now we know where we're supposed to break, but we don't know if it
183 // will all fit.
184 prior_break = input[next_break];
185 // hang onto the value before we change next_break.
// NOTE(review): prior_break_plus_1 is assigned here but is not read again in
// the visible lines.
186 prior_break_plus_1 = input[next_break + 1];
187 // should still be safe since we're stopping before the last zero.
// classify the break character; this decides the trailing spacing and
// whether the character itself belongs to the printed chunk.
188 switch (prior_break) {
189 case '\r': case '\n':
190 break_line = true;
191 just_had_break = true;
192 put_accum_before_break = true;
193 // intentional fall-through, so no break.
194 case '\t': case ' ':
195 invisible = true;
196 next_break--; // don't include it in what's printed.
197 break;
198 case '?': case '!': case '.':
199 end_sentence = true;
200 // if we see multiples of these, we count them as just one.
201 while ( (input[next_break + 1] == '?')
202 || (input[next_break + 1] == '!')
203 || (input[next_break + 1] == '.') ) {
204 next_break++;
205 }
206 // make sure that there's a blank area after the supposed punctuation.
207 if (!parser_bits::white_space(input[next_break + 1]))
208 end_sentence = false;
209 break;
210 case ',': case ';': case ':':
211 punctuate = true;
212 // make sure that there's a blank area after the supposed punctuation.
213 if (!parser_bits::white_space(input[next_break + 1]))
214 punctuate = false;
215 break;
216 }
217
218 // we'll need to add some spaces for certain punctuation.
219 int punct_adder = 0;
220 if (punctuate || invisible) punct_adder = 1;
221 if (end_sentence) punct_adder = 2;
222
223 // check that we're still in bounds.
224 int chars_added = next_break - j + 1;
225 if (col + chars_added + punct_adder > max_column) {
226 // we need to break before the next breakable character.
227 break_line = true;
228 just_had_break = true;
229 if (col + chars_added <= max_column) {
230 // it will fit without the punctuation spaces, which is fine since
231 // it should be the end of the line.
232 invisible = false;
233 punctuate = false;
234 end_sentence = false;
235 punct_adder = 0;
236 keep_on_line = true;
237 } else if (min_column + chars_added > max_column) {
238 // this word won't ever fit unless we break it.
239 int chars_left = max_column - col + 1;
240 // remember to take out room for the dash also.
241 if (chars_left < 2) {
242 j--; // stay where we are.
243 continue;
244 } else {
// take as many characters as fit while leaving one column for the dash.
245 next_break = j + chars_left - 2;
246 chars_added = next_break - j + 1;
247 if (next_break >= input.length())
248 next_break = input.length() - 1;
249 else if (next_break < j)
250 next_break = j;
251 add_dash = true;
252 }
253 }
254 }
255
256 astring adding_chunk = input.substring(j, next_break);
257 // this is what we've decided the next word chunk to be added will be.
258 // we still haven't completely decided where it goes.
259
260 if (break_line) {
261 col = min_column;
262 if (add_dash || keep_on_line) {
263 // include the previous stuff on the same line.
264 output += adding_chunk;
265 if (add_dash) output += "-";
266 j = next_break;
267 continue; // done with this case.
268 }
269
270 // don't include the previous stuff; make it go to the next line.
271 accumulated = adding_chunk;
// NOTE(review): punct_adder counts two columns for end_sentence, yet both
// literals below show a single blank; the listing may have collapsed
// repeated spaces -- verify against the real source file.
272 if (punctuate || invisible) {
273 accumulated += " ";
274 } else if (end_sentence) {
275 accumulated += " ";
276 }
277 j = next_break;
278 continue;
279 }
280
281 // add the line normally since it should fit.
282 output += adding_chunk;
283 col += chars_added + punct_adder; // add the characters added.
284 j = next_break;
285 just_had_break = false; // reset the state.
286
287 // handle when we processed an invisible or punctuation character.
288 if (punctuate || invisible) {
289 output += " ";
290 } else if (end_sentence) {
291 output += " ";
292 }
293 }
294 // make sure we handle any leftovers.
295 if (accumulated.length()) {
298 output += indent_add;
299 output += accumulated;
300 }
303}
304
// Body of string_manipulation::hex_to_char; the signature line is not
// present in this listing.  Maps a nibble value to its hexadecimal digit:
// 0 through 9 give the digit characters, 10 through 15 give the uppercase
// letters A through F, and any other value gives a question mark.
306{
307 if (to_convert <= 9) return char('0' + to_convert);
308 else if ( (to_convert >= 10) && (to_convert <= 15) )
// subtracting 10 makes the value 10 land on the letter A.
309 return char('A' - 10 + to_convert);
310 else return '?';
311}
312
// Body of string_manipulation::char_to_hex; the signature line is not
// present in this listing.  Maps one hexadecimal digit character to its
// value: the digits give 0 through 9, and the letters a through f in either
// case give 10 through 15.  Any other character yields 0, so an invalid
// character cannot be told apart from the digit zero by the return value.
314{
315 if ( (to_convert >= '0') && (to_convert <= '9') )
316 return char(to_convert - '0');
317 else if ( (to_convert >= 'a') && (to_convert <= 'f') )
318 return char(to_convert - 'a' + 10);
319 else if ( (to_convert >= 'A') && (to_convert <= 'F') )
320 return char(to_convert - 'A' + 10);
321 else return 0;
322}
323
// Body of string_manipulation::string_to_hex; the signature line is not
// present in this listing.  Reads to_convert two characters at a time and
// turns each pair of hexadecimal digits into one byte, high nibble first.
// Because the pair count is length() / 2 with integer division, a trailing
// unpaired character is ignored.  Characters that are not hexadecimal digits
// contribute 0, following char_to_hex.
325{
// presumably an empty array to append onto -- confirm the meaning of the
// (0, NULL_POINTER) constructor arguments against byte_array.
326 byte_array to_return(0, NULL_POINTER);
327 for (int i = 0; i < to_convert.length() / 2; i++) {
328 int str_index = i * 2;
329 abyte first_byte = char_to_hex(to_convert.get(str_index));
330 abyte second_byte = char_to_hex(to_convert.get(str_index + 1));
// the first digit is the high nibble, hence the multiplication by 16.
331 abyte to_stuff = abyte(first_byte * 16 + second_byte);
332 to_return.concatenate(to_stuff);
333 }
334 return to_return;
335}
336
// Body of string_manipulation::hex_to_string; the signature line is not
// present in this listing.  Writes every element of to_convert as two
// hexadecimal digit characters, high nibble first, using hex_to_char (which
// produces uppercase letters), and returns the concatenated text.
338{
339 astring to_return;
// i counts output characters in steps of two; i / 2 is the element index.
340 for (int i = 0; i < to_convert.length() * 2; i += 2) {
341 int str_index = i / 2;
342 char first_char = hex_to_char(char(to_convert.get(str_index) / 16));
343 char second_char = hex_to_char(char(to_convert.get(str_index) % 16));
344 to_return += astring(first_char, 1);
345 to_return += astring(second_char, 1);
346 }
347 return to_return;
348}
349
350} //namespace.
351
array & concatenate(const array &to_concatenate)
Appends the array "to_concatenate" onto "this" and returns "this".
Definition array.h:379
const contents & get(int index) const
Accesses individual objects stored in "this" at the "index" position.
Definition array.h:372
int length() const
Returns the current reported length of the allocated C array.
Definition array.h:115
Provides a dynamically resizable ASCII character string.
Definition astring.h:35
virtual char get(int index) const
a constant peek at the string's internals at the specified index.
Definition astring.cpp:138
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
Definition astring.cpp:868
void strip_spaces(how_to_strip way=FROM_BOTH_SIDES)
removes excess space characters from string's beginning, end or both.
Definition astring.h:325
int find_any(const char *to_find, int position=0, bool reverse=false) const
searches for any of the characters in "to_find".
Definition astring.cpp:580
int length() const
Returns the current length of the string.
Definition astring.cpp:132
A very common template for a dynamic array of bytes.
Definition byte_array.h:36
a platform-independent way to acquire random numbers in a specific range.
Definition chaos.h:51
int inclusive(int low, int high) const
Returns a pseudo-random number r, such that "low" <= r <= "high".
Definition chaos.h:88
static bool white_space(char to_check)
returns true if the character "to_check" is considered a white space.
static bool is_eol(char to_check)
returns true if "to_check" is part of an end-of-line sequence.
static const char * platform_eol_to_chars()
provides the characters that make up this platform's line ending.
static char hex_to_char(basis::abyte to_convert)
Converts a byte between 0 and 15 into a corresponding hexadecimal character.
static basis::byte_array string_to_hex(const basis::astring &character_form)
Turns a string form of a set of hex numbers into an array of bytes.
static void split_lines(const basis::astring &input, basis::astring &output, int min_column=0, int max_column=79)
formats blocks of text for a maximum width.
static basis::astring long_line(char line_item='/', int repeat=76)
produces a long line of "line_item" characters.
static void carriage_returns_to_spaces(basis::astring &to_strip)
converts carriage returns in "to_strip" into spaces.
static basis::astring hex_to_string(const basis::byte_array &byte_form)
The inverse of string_to_hex prints "byte_form" as text.
static basis::astring make_random_name(int min=1, int max=64)
creates a random name, where the letters are between 'a' and 'z'.
static basis::abyte char_to_hex(char to_convert)
Converts a single character into the corresponding hex nibble.
static basis::astring indentation(int spaces)
Returns a string made of white space that is "spaces" long.
#define NULL_POINTER
The value representing a pointer to nothing.
Definition definitions.h:32
The guards collection helps in testing preconditions and reporting errors.
Definition array.h:30
unsigned char abyte
A fairly important unit which is seldom defined...
Definition definitions.h:51
bool negative(const type &a)
negative returns true if "a" is less than zero.
Definition functions.h:43
An extension to floating point primitives providing approximate equality.
Definition averager.h:21
const char * splitter_finding_set
chaos rando