nucleus/library/textual/string_manipulation.cpp

   1 /*****************************************************************************\
   2 *                                                                             *
   3 *  Name   : string_manipulation                                               *
   4 *  Author : Chris Koeritz                                                     *
   5 *                                                                             *
   6 *******************************************************************************
   7 * Copyright (c) 2000-$now By Author.  This program is free software; you can  *
   8 * redistribute it and/or modify it under the terms of the GNU General Public  *
   9 * License as published by the Free Software Foundation; either version 2 of   *
  10 * the License or (at your option) any later version.  This is online at:      *
  11 *     http://www.fsf.org/copyleft/gpl.html                                    *
  12 * Please send any updates to: fred@gruntose.com                               *
  13 \*****************************************************************************/
  14
  15 #include "parser_bits.h"
  16 #include "string_manipulation.h"
  17
  18 #include <basis/byte_array.h>
  19 #include <basis/functions.h>
  20 #include <basis/mutex.h>
  21 #include <mathematics/chaos.h>
  22
  23 using namespace basis;
  24 using namespace mathematics;
  25
  26 namespace textual {
  27
  28 //SAFE_STATIC_CONST(astring_object, string_manipulation::splitter_finding_set,
  29 //    ("\t\r\n -,;?!.:"))
   30 const char *splitter_finding_set = "\t\r\n -,;?!.:";
   31   // the set of characters at which a line may be broken: tab, carriage
        // return, line feed, space, dash, and the punctuation , ; ? ! . :
        // split_lines() hands this set to find_any() on its working copy of the
        // input to locate the next candidate break position.
  32
  33 astring string_manipulation::make_random_name(int min, int max)
  34 {
  35   chaos rando;
  36   int length = rando.inclusive(min, max);
  37     // pick a size for the string.
  38   astring to_return;
  39   for (int i = 0; i < length; i++) {
  40     int chah = rando.inclusive(0, 26);
  41       // use a range one larger than alphabet size.
  42     char to_add = 'a' + chah;
  43     if (chah == 26) to_add = '_';
  44       // patch the extra value to be a separator.
  45     to_return += to_add;
  46   }
  47   return to_return;
  48 }
  49
  50 astring string_manipulation::long_line(char line_item, int repeat)
  51 { return astring(line_item, repeat); }
  52
  53 astring string_manipulation::indentation(int spaces)
  54 {
  55   astring s;
  56   for (int i = 0; i < spaces; i++) s += ' ';
  57   return s;
  58 }
  59
  60 void string_manipulation::carriage_returns_to_spaces(astring &to_strip)
  61 {
  62   for (int j = 0; j < to_strip.length(); j++) {
  63     int original_j = j;  // track where we started looking.
  64     if (!parser_bits::is_eol(to_strip[j])) continue;
  65     // we have found at least one CR.  let's see what else there is.
  66     if ( (to_strip[j] == '\r') && (to_strip[j + 1] == '\n') ) {
  67       // this is looking like a DOS CR.  let's skip that now.
  68       j++;
  69     }
  70     j++;  // skip the one we know is a CR.
  71     if (parser_bits::is_eol(to_strip[j])) {
  72       // we are seeing more than one carriage return in a row.  let's
  73       // truncate that down to just one.
  74       j++;
  75       while (parser_bits::is_eol(to_strip[j]) && (j < to_strip.length()))
  76         j++;  // skip to next one that might not be CR.
  77       // now we think we know where there's this huge line of CRs.  we will
  78       // turn them all into spaces except the first.
  79       to_strip[original_j] = '\n';
  80       for (int k = original_j + 1; k < j; k++) to_strip[k] = ' ';
  81       // put the index back so we'll start looking at the non-CR char.
  82       j--;
  83       continue;  // now skip back out to the main loop.
  84     } else {
  85       // we see only one carriage return, which we will drop in favor of
  86       // joining those lines together.  we iterate here since we might have
  87       // seen a DOS CR taking up two spaces.
  88       for (int k = original_j; k < j; k++) to_strip[k] = ' ';
  89     }
  90   }
  91
  92 }
  93
  94 void string_manipulation::split_lines(const astring &input_in, astring &output,
  95     int min_column, int max_column)
  96 {
  97   output = "";
  98   if (max_column - min_column + 1 < 2) return;  // what's the point?
  99
 100   astring input = input_in;  // make a copy to work on.
 101   carriage_returns_to_spaces(input);
 102
 103   int col = min_column;
 104   astring indent_add = indentation(min_column);
 105   output = indent_add;  // start with the extra space.
 106
 107   bool just_had_break = false;
 108     // set true if we just handled a line break in the previous loop.
 109   bool put_accum_before_break = false;  // true if we must postpone CR.
 110   astring accumulated;
 111     // holds stuff to print on next go-round.
 112
 113   // now we parse across the list counting up our line size and making sure
 114   // we don't go over it.
 115   for (int j = 0; j < input.length(); j++) {
 116
 117 //char to_print = input[j];
 118 //if (parser_bits::is_eol(to_print)) to_print = '_';
 119 //printf("[%d: val=%d, '%c', col=%d]\n", j, to_print, to_print, col);
 120 //fflush(0);
 121
 122     // handle the carriage return if it was ordered.
 123     if (just_had_break) {
 124       if (put_accum_before_break) {
 125         output += accumulated;
 126         // strip off any spaces from the end of the line.
 127         output.strip_spaces(astring::FROM_END);
 128         output += parser_bits::platform_eol_to_chars();
 129         accumulated = "";
 130         j++;  // skip the CR that we think is there.
 131       }
 132       // strip off any spaces from the end of the line.
 133       output.strip_spaces(astring::FROM_END);
 134       output += parser_bits::platform_eol_to_chars();
 135       col = min_column;
 136       output += indent_add;
 137       just_had_break = false;
 138       if (accumulated.length()) {
 139         output += accumulated;
 140         col += accumulated.length();
 141         accumulated = "";
 142       }
 143       j--;
 144       continue;
 145     }
 146
 147     put_accum_before_break = false;
 148
 149     // skip any spaces we've got at the current position.
 150     while ( (input[j] == ' ') || (input[j] == '\t') ) {
 151       j++;
 152       if (j >= input.length()) break;  // break out of subloop if past it.
 153     }
 154
 155     if (j >= input.length()) break;  // we're past the end.
 156
 157     // handle carriage returns when they're at the current position.
 158     char current_char = input[j];
 159     if (parser_bits::is_eol(current_char)) {
 160       just_had_break = true;  // set the state.
 161       put_accum_before_break = true;
 162       continue;
 163     }
 164
 165 //hmmm: the portion below could be called a find word break function.
 166
 167     bool add_dash = false;  // true if we need to break a word and add hyphen.
 168     bool break_line = false;  // true if we need to go to the next line.
 169     bool invisible = false;  // true if invisible characters were seen.
 170     bool end_sentence = false;  // true if there was a sentence terminator.
 171     bool punctuate = false;  // true if there was normal punctuation.
 172     bool keep_on_line = false;  // true if we want add current then break line.
 173     char prior_break = '\0';  // set for real below.
 174     char prior_break_plus_1 = '\0';  // ditto.
 175
 176     // find where our next normal word break is, if possible.
 177     int next_break = input.find_any(splitter_finding_set, j);
 178     // if we didn't find a separator, just use the end of the string.
 179     if (negative(next_break))
 180       next_break = input.length() - 1;
 181
 182     // now we know where we're supposed to break, but we don't know if it
 183     // will all fit.
 184     prior_break = input[next_break];
 185       // hang onto the value before we change next_break.
 186     prior_break_plus_1 = input[next_break + 1];
 187       // should still be safe since we're stopping before the last zero.
 188     switch (prior_break) {
 189       case '\r': case '\n':
 190         break_line = true;
 191         just_had_break = true;
 192         put_accum_before_break = true;
 193         // intentional fall-through.
 194       case '\t': case ' ':
 195         invisible = true;
 196         next_break--;  // don't include it in what's printed.
 197         break;
 198       case '?': case '!': case '.':
 199         end_sentence = true;
 200         // if we see multiples of these, we count them as just one.
 201         while ( (input[next_break + 1] == '?')
 202             || (input[next_break + 1] == '!')
 203             || (input[next_break + 1] == '.') ) {
 204           next_break++;
 205         }
 206         // make sure that there's a blank area after the supposed punctuation.
 207         if (!parser_bits::white_space(input[next_break + 1]))
 208           end_sentence = false;
 209         break;
 210       case ',': case ';': case ':':
 211         punctuate = true;
 212         // make sure that there's a blank area after the supposed punctuation.
 213         if (!parser_bits::white_space(input[next_break + 1]))
 214           punctuate = false;
 215         break;
 216     }
 217
 218     // we'll need to add some spaces for certain punctuation.
 219     int punct_adder = 0;
 220     if (punctuate || invisible) punct_adder = 1;
 221     if (end_sentence) punct_adder = 2;
 222
 223     // check that we're still in bounds.
 224     int chars_added = next_break - j + 1;
 225     if (col + chars_added + punct_adder > max_column + 1) {
 226       // we need to break before the next breakable character.
 227       break_line = true;
 228       just_had_break = true;
 229       if (col + chars_added <= max_column + 1) {
 230         // it will fit without the punctuation spaces, which is fine since
 231         // it should be the end of the line.
 232         invisible = false;
 233         punctuate = false;
 234         end_sentence = false;
 235         punct_adder = 0;
 236         keep_on_line = true;
 237       } else if (min_column + chars_added > max_column + 1) {
 238         // this word won't ever fit unless we break it.
 239         int chars_left = max_column - col + 1;
 240           // remember to take out room for the dash also.
 241         if (chars_left < 2) {
 242           j--;  // stay where we are.
 243           continue;
 244         } else {
 245           next_break = j + chars_left - 2;
 246           chars_added = next_break - j + 1;
 247           if (next_break >= input.length())
 248             next_break = input.length() - 1;
 249           else if (next_break < j)
 250             next_break = j;
 251           add_dash = true;
 252         }
 253       }
 254     }
 255
 256     astring adding_chunk = input.substring(j, next_break);
 257       // this is what we've decided the next word chunk to be added will be.
 258       // we still haven't completely decided where it goes.
 259
 260     if (break_line) {
 261       col = min_column;
 262       if (add_dash || keep_on_line) {
 263         // include the previous stuff on the same line.
 264         output += adding_chunk;
 265         if (add_dash) output += "-";
 266         j = next_break;
 267         continue;  // done with this case.
 268       }
 269
 270       // don't include the previous stuff; make it go to the next line.
 271       accumulated = adding_chunk;
 272       if (punctuate || invisible) {
 273         accumulated += " ";
 274       } else if (end_sentence) {
 275         accumulated += "  ";
 276       }
 277       j = next_break;
 278       continue;
 279     }
 280
 281     // add the line normally since it should fit.
 282     output += adding_chunk;
 283     col += chars_added + punct_adder;  // add the characters added.
 284     j = next_break;
 285     just_had_break = false;  // reset the state.
 286
 287     // handle when we processed an invisible or punctuation character.
 288     if (punctuate || invisible) {
 289       output += " ";
 290     } else if (end_sentence) {
 291       output += "  ";
 292     }
 293   }
 294   // make sure we handle any leftovers.
 295   if (accumulated.length()) {
 296     output.strip_spaces(astring::FROM_END);
 297     output += parser_bits::platform_eol_to_chars();
 298     output += indent_add;
 299     output += accumulated;
 300   }
 301   output.strip_spaces(astring::FROM_END);
 302   output += parser_bits::platform_eol_to_chars();
 303 }
 304
 305 char string_manipulation::hex_to_char(abyte to_convert)
 306 {
 307   if (to_convert <= 9) return char('0' + to_convert);
 308   else if ( (to_convert >= 10) && (to_convert <= 15) )
 309     return char('A' - 10 + to_convert);
 310   else return '?';
 311 }
 312
 313 abyte string_manipulation::char_to_hex(char to_convert)
 314 {
 315   if ( (to_convert >= '0') && (to_convert <= '9') )
 316     return char(to_convert - '0');
 317   else if ( (to_convert >= 'a') && (to_convert <= 'f') )
 318     return char(to_convert - 'a' + 10);
 319   else if ( (to_convert >= 'A') && (to_convert <= 'F') )
 320     return char(to_convert - 'A' + 10);
 321   else return 0;
 322 }
 323
 324 byte_array string_manipulation::string_to_hex(const astring &to_convert)
 325 {
 326   byte_array to_return(0, NIL);
 327   for (int i = 0; i < to_convert.length() / 2; i++) {
 328     int str_index = i * 2;
 329     abyte first_byte = char_to_hex(to_convert.get(str_index));
 330     abyte second_byte = char_to_hex(to_convert.get(str_index + 1));
 331     abyte to_stuff = abyte(first_byte * 16 + second_byte);
 332     to_return.concatenate(to_stuff);
 333   }
 334   return to_return;
 335 }
 336
 337 astring string_manipulation::hex_to_string(const byte_array &to_convert)
 338 {
 339   astring to_return;
 340   for (int i = 0; i < to_convert.length() * 2; i += 2) {
 341     int str_index = i / 2;
 342     char first_char = hex_to_char(char(to_convert.get(str_index) / 16));
 343     char second_char = hex_to_char(char(to_convert.get(str_index) % 16));
 344     to_return += astring(first_char, 1);
 345     to_return += astring(second_char, 1);
 346   }
 347   return to_return;
 348 }
 349
 350 } //namespace.
 351