production/code_guide/string__manipulation_8cpp_source.html

 /*****************************************************************************\

 *                                                                             *

 *  Name   : string_manipulation                                               *

 *  Author : Chris Koeritz                                                     *

 *                                                                             *

 *******************************************************************************

 * Copyright (c) 2000-$now By Author.  This program is free software; you can  *

 * redistribute it and/or modify it under the terms of the GNU General Public  *

 * License as published by the Free Software Foundation; either version 2 of   *

 * the License or (at your option) any later version.  This is online at:      *

 *     http://www.fsf.org/copyleft/gpl.html                                    *

 * Please send any updates to: fred@gruntose.com                               *

 \*****************************************************************************/


 #include "parser_bits.h"

 #include "string_manipulation.h"


 #include <basis/byte_array.h>

 #include <basis/functions.h>

 #include <basis/mutex.h>

 #include <mathematics/chaos.h>


 using namespace basis;

 using namespace mathematics;


 namespace textual {


 // older definition, retained for reference only:
 //SAFE_STATIC_CONST(astring_object, string_manipulation::splitter_finding_set,
 //    ("\t\r\n -,;?!.:"))
 const char *splitter_finding_set = "\t\r\n -,;?!.:";
   // any of these characters make a valid place to break a line.  the set
   // holds tab, carriage return, newline, space, dash, comma, semicolon,
   // question mark, exclamation point, period and colon.  split_lines() hands
   // this list to astring::find_any() to locate the end of the next chunk of
   // text.


 // Builds a random name out of lowercase letters and underscores.  The number
 // of characters is itself chosen at random, using "min" and "max" as the
 // inclusive bounds handed to the random number generator.
 astring string_manipulation::make_random_name(int min, int max)
 {
   chaos randomizer;
   const int name_size = randomizer.inclusive(min, max);
     // decide up front how many characters the name will hold.
   astring name;
   int produced = 0;
   while (produced < name_size) {
     // draw from 27 possibilities: 0 through 25 select a letter and the one
     // extra value (26) stands for the underscore separator.
     const int pick = randomizer.inclusive(0, 26);
     const char next_char = (pick == 26) ? '_' : char('a' + pick);
     name += next_char;
     produced++;
   }
   return name;
 }


 // Returns the string created by the astring (char, int) constructor when it
 // is given "line_item" and "repeat"; presumably that is "line_item" repeated
 // "repeat" times.
 astring string_manipulation::long_line(char line_item, int repeat)
 {
   astring repeated(line_item, repeat);
   return repeated;
 }


 // Returns a string made of "spaces" blank characters.  A count of zero or
 // less produces an empty string.
 astring string_manipulation::indentation(int spaces)
 {
   astring padding;
   // count down the blanks that are still owed.
   for (int remaining = spaces; remaining > 0; remaining--) {
     padding += ' ';
   }
   return padding;
 }


 // Rewrites the line endings inside "to_strip" in place.  The length of the
 // string never changes; characters are only overwritten.
 //   + a lone line ending (one EOL character, or a DOS "\r\n" pair) is turned
 //     into spaces, which joins the two lines it used to separate.
 //   + a run of more than one line ending is turned into a single '\n'
 //     followed by spaces, so only one break survives.
 // Every index is range-checked before the string is read at that index, so
 // the scan never depends on what lies at or beyond the end of the string.
 void string_manipulation::carriage_returns_to_spaces(astring &to_strip)
 {
   const int total = to_strip.length();
     // fixed for the whole scan, since we only overwrite characters.
   for (int j = 0; j < total; j++) {
     if (!parser_bits::is_eol(to_strip[j])) continue;
     const int original_j = j;  // track where this run of CRs started.
     // we have found at least one CR.  let's see what else there is.
     if ( (to_strip[j] == '\r') && (j + 1 < total)
         && (to_strip[j + 1] == '\n') ) {
       // this is looking like a DOS CR.  let's skip the extra character.
       j++;
     }
     j++;  // skip the one we know is a CR.
     if ( (j < total) && parser_bits::is_eol(to_strip[j]) ) {
       // we are seeing more than one carriage return in a row.  let's
       // truncate that down to just one.
       j++;
       // check the index first, then look at the character.
       while ( (j < total) && parser_bits::is_eol(to_strip[j]) )
         j++;  // skip to next one that might not be CR.
       // now we know where this whole line of CRs is.  we will turn them all
       // into spaces except the first.
       to_strip[original_j] = '\n';
       for (int k = original_j + 1; k < j; k++) to_strip[k] = ' ';
       // put the index back so the loop increment lands on the non-CR char.
       j--;
       continue;  // now skip back out to the main loop.
     }
     // we see only one carriage return, which we will drop in favor of
     // joining those lines together.  we iterate here since we might have
     // seen a DOS CR taking up two spaces.
     for (int k = original_j; k < j; k++) to_strip[k] = ' ';
   }
 }


 // Re-wraps the text in "input_in" and stores the result in "output".
 //
 // "output" is always cleared first.  If the range from "min_column" to
 // "max_column" spans fewer than two columns, nothing else is done and
 // "output" stays empty.  Otherwise a copy of the input is passed through
 // carriage_returns_to_spaces() and then scanned chunk by chunk, where a chunk
 // ends at the next character found in splitter_finding_set (or at the end of
 // the string).  Each output line starts with "min_column" spaces, and a new
 // line is started whenever adding the next chunk would push the column
 // counter past "max_column".  A chunk that could not fit even on a fresh
 // line is cut and a "-" is appended to the part that stays on the current
 // line.  One space is emitted after blanks and after ',', ';' or ':' that are
 // followed by white space; two spaces are emitted after '?', '!' or '.' that
 // are followed by white space.  Before returning, trailing spaces are
 // stripped from "output" and the platform's end-of-line characters are
 // appended.
 void string_manipulation::split_lines(const astring &input_in, astring &output,
     int min_column, int max_column)
 {
   output = "";
   if (max_column - min_column + 1 < 2) return;  // what's the point?

   astring input = input_in;  // make a copy to work on.
   carriage_returns_to_spaces(input);
     // after this, only the line endings that routine chose to keep remain.

   int col = min_column;
     // the column counter for the line currently being built.
   astring indent_add = indentation(min_column);
     // the blanks that are placed at the start of every output line.
   output = indent_add;  // start with the extra space.

   bool just_had_break = false;
     // set true if we just handled a line break in the previous loop.
   bool put_accum_before_break = false;  // true if we must postpone CR.
   astring accumulated;
     // holds stuff to print on next go-round.

   // now we parse across the list counting up our line size and making sure
   // we don't go over it.
   for (int j = 0; j < input.length(); j++) {

 // disabled debugging output:
 //char to_print = input[j];
 //if (parser_bits::is_eol(to_print)) to_print = '_';
 //printf("[%d: val=%d, '%c', col=%d]\n", j, to_print, to_print, col);
 //fflush(0);

     // handle the carriage return if it was ordered.
     if (just_had_break) {
       if (put_accum_before_break) {
         // the pending text belongs on the line being closed, so it is
         // written out before the line is ended.
         output += accumulated;
         // strip off any spaces from the end of the line.
         output.strip_spaces(astring::FROM_END);
         output += parser_bits::platform_eol_to_chars();
         accumulated = "";
         j++;  // skip the CR that we think is there.
       }
       // strip off any spaces from the end of the line.
       output.strip_spaces(astring::FROM_END);
       output += parser_bits::platform_eol_to_chars();
       // begin the new line at the left margin.
       col = min_column;
       output += indent_add;
       just_had_break = false;
       if (accumulated.length()) {
         // text that was held back now opens the new line.
         output += accumulated;
         col += accumulated.length();
         accumulated = "";
       }
       j--;  // cancels the loop's own increment that follows the continue.
       continue;
     }

     put_accum_before_break = false;

     // skip any spaces we've got at the current position.
     while ( (input[j] == ' ') || (input[j] == '\t') ) {
       j++;
       if (j >= input.length()) break;  // break out of subloop if past it.
     }

     if (j >= input.length()) break;  // we're past the end.

     // handle carriage returns when they're at the current position.
     char current_char = input[j];
     if (parser_bits::is_eol(current_char)) {
       just_had_break = true;  // set the state.
       put_accum_before_break = true;
       continue;
     }

 //hmmm: the portion below could be called a find word break function.

     bool add_dash = false;  // true if we need to break a word and add hyphen.
     bool break_line = false;  // true if we need to go to the next line.
     bool invisible = false;  // true if invisible characters were seen.
     bool end_sentence = false;  // true if there was a sentence terminator.
     bool punctuate = false;  // true if there was normal punctuation.
     bool keep_on_line = false;  // true if we want add current then break line.
     char prior_break = '\0';  // set for real below.
     char prior_break_plus_1 = '\0';  // ditto.

     // find where our next normal word break is, if possible.
     int next_break = input.find_any(splitter_finding_set, j);
     // if we didn't find a separator, just use the end of the string.
     if (negative(next_break))
       next_break = input.length() - 1;

     // now we know where we're supposed to break, but we don't know if it
     // will all fit.
     prior_break = input[next_break];
       // hang onto the value before we change next_break.
     prior_break_plus_1 = input[next_break + 1];
       // should still be safe since we're stopping before the last zero.
       // note: this value is stored but not consulted again in this function.
     switch (prior_break) {
       case '\r': case '\n':
         // the chunk ends at a line ending; a break is needed after it.
         break_line = true;
         just_had_break = true;
         put_accum_before_break = true;
         // intentional fall-through, so no break.
       case '\t': case ' ':
         invisible = true;
         next_break--;  // don't include it in what's printed.
         break;
       case '?': case '!': case '.':
         end_sentence = true;
         // if we see multiples of these, we count them as just one.
         while ( (input[next_break + 1] == '?')
             || (input[next_break + 1] == '!')
             || (input[next_break + 1] == '.') ) {
           next_break++;
         }
         // make sure that there's a blank area after the supposed punctuation.
         if (!parser_bits::white_space(input[next_break + 1]))
           end_sentence = false;
         break;
       case ',': case ';': case ':':
         punctuate = true;
         // make sure that there's a blank area after the supposed punctuation.
         if (!parser_bits::white_space(input[next_break + 1]))
           punctuate = false;
         break;
     }

     // we'll need to add some spaces for certain punctuation.
     int punct_adder = 0;
     if (punctuate || invisible) punct_adder = 1;
     if (end_sentence) punct_adder = 2;

     // check that we're still in bounds.
     int chars_added = next_break - j + 1;
       // the size of the chunk that runs from j through next_break.
     if (col + chars_added + punct_adder > max_column) {
       // we need to break before the next breakable character.
       break_line = true;
       just_had_break = true;
       if (col + chars_added <= max_column) {
         // it will fit without the punctuation spaces, which is fine since
         // it should be the end of the line.
         invisible = false;
         punctuate = false;
         end_sentence = false;
         punct_adder = 0;
         keep_on_line = true;
       } else if (min_column + chars_added > max_column) {
         // this word won't ever fit unless we break it.
         int chars_left = max_column - col + 1;
           // remember to take out room for the dash also.
         if (chars_left < 2) {
           // no room for even one character plus the dash on this line, so
           // the same position is tried again after the break is handled.
           j--;  // stay where we are.
           continue;
         } else {
           // cut the word so that the piece and its dash use up chars_left.
           next_break = j + chars_left - 2;
           chars_added = next_break - j + 1;
           // keep the cut position inside the string and not before j.
           if (next_break >= input.length())
             next_break = input.length() - 1;
           else if (next_break < j)
             next_break = j;
           add_dash = true;
         }
       }
     }

     astring adding_chunk = input.substring(j, next_break);
       // this is what we've decided the next word chunk to be added will be.
       // we still haven't completely decided where it goes.

     if (break_line) {
       col = min_column;
       if (add_dash || keep_on_line) {
         // include the previous stuff on the same line.
         output += adding_chunk;
         if (add_dash) output += "-";
         j = next_break;
         continue;  // done with this case.
       }

       // don't include the previous stuff; make it go to the next line.
       accumulated = adding_chunk;
       if (punctuate || invisible) {
         accumulated += " ";
       } else if (end_sentence) {
         accumulated += "  ";
       }
       j = next_break;
       continue;
     }

     // add the line normally since it should fit.
     output += adding_chunk;
     col += chars_added + punct_adder;  // add the characters added.
     j = next_break;
     just_had_break = false;  // reset the state.

     // handle when we processed an invisible or punctuation character.
     if (punctuate || invisible) {
       output += " ";
     } else if (end_sentence) {
       output += "  ";
     }
   }
   // make sure we handle any leftovers.
   if (accumulated.length()) {
     // text was still being held for a new line when the input ran out, so
     // that line is started here and the text is placed on it.
     output.strip_spaces(astring::FROM_END);
     output += parser_bits::platform_eol_to_chars();
     output += indent_add;
     output += accumulated;
   }
   // finish off the last line.
   output.strip_spaces(astring::FROM_END);
   output += parser_bits::platform_eol_to_chars();
 }


 // Converts a value from 0 through 15 into the matching hexadecimal digit
 // character ('0'-'9' or uppercase 'A'-'F').  Any larger value yields '?'.
 char string_manipulation::hex_to_char(abyte to_convert)
 {
   static const char hex_digits[] = "0123456789ABCDEF";
     // indexed directly by the value being converted.
   if (to_convert > 15) return '?';  // not representable as a single digit.
   return hex_digits[to_convert];
 }


 // Converts a hexadecimal digit character into its numeric value (0 to 15).
 // Both lowercase and uppercase letters are accepted.  A character that is
 // not a hexadecimal digit produces zero.
 abyte string_manipulation::char_to_hex(char to_convert)
 {
   const char digit = to_convert;
   if ( ('0' <= digit) && (digit <= '9') ) {
     return abyte(digit - '0');
   }
   if ( ('a' <= digit) && (digit <= 'f') ) {
     return abyte(10 + (digit - 'a'));
   }
   if ( ('A' <= digit) && (digit <= 'F') ) {
     return abyte(10 + (digit - 'A'));
   }
   return 0;  // anything else is treated as a zero digit.
 }


 // Decodes a string of hexadecimal digits into bytes.  Each pair of
 // characters becomes one byte, with the first character supplying the high
 // four bits.  When the string has an odd number of characters, the final
 // unpaired character is ignored.  Characters are decoded by char_to_hex().
 byte_array string_manipulation::string_to_hex(const astring &to_convert)
 {
   byte_array decoded(0, NULL_POINTER);
   const int text_length = to_convert.length();
   // step through the text one digit pair at a time; a pair is only used
   // when its second character is still inside the string.
   for (int pos = 0; pos + 1 < text_length; pos += 2) {
     const abyte high_nibble = char_to_hex(to_convert.get(pos));
     const abyte low_nibble = char_to_hex(to_convert.get(pos + 1));
     abyte packed = abyte((high_nibble << 4) | low_nibble);
     decoded.concatenate(packed);
   }
   return decoded;
 }


 // Encodes the bytes in "to_convert" as text, two hexadecimal digits per
 // byte.  The digit for the high four bits comes first, and the digits are
 // produced by hex_to_char().
 astring string_manipulation::hex_to_string(const byte_array &to_convert)
 {
   astring hex_text;
   const int byte_count = to_convert.length();
   for (int index = 0; index < byte_count; index++) {
     const abyte current = to_convert.get(index);
     const char high_digit = hex_to_char(abyte(current >> 4));
     const char low_digit = hex_to_char(abyte(current & 0x0F));
     hex_text += astring(high_digit, 1);
     hex_text += astring(low_digit, 1);
   }
   return hex_text;
 }


 } //namespace.


byte_array.h

chaos.h

basis::array::concatenate
array & concatenate(const array &to_concatenate)
Appends the array "to_concatenate" onto "this" and returns "this".
Definition: array.h:379

basis::array::get
const contents & get(int index) const
Accesses individual objects stored in "this" at the "index" position.
Definition: array.h:372

basis::array::length
int length() const
Returns the current reported length of the allocated C array.
Definition: array.h:115

basis::astring
Provides a dynamically resizable ASCII character string.
Definition: astring.h:35

basis::astring::FROM_END
@ FROM_END
Definition: astring.h:319

basis::astring::get
virtual char get(int index) const
a constant peek at the string's internals at the specified index.
Definition: astring.cpp:138

basis::astring::substring
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
Definition: astring.cpp:865

basis::astring::strip_spaces
void strip_spaces(how_to_strip way=FROM_BOTH_SIDES)
removes excess space characters from string's beginning, end or both.
Definition: astring.h:325

basis::astring::find_any
int find_any(const char *to_find, int position=0, bool reverse=false) const
searches for any of the characters in "to_find".
Definition: astring.cpp:577

basis::astring::length
int length() const
Returns the current length of the string.
Definition: astring.cpp:132

basis::byte_array
A very common template for a dynamic array of bytes.
Definition: byte_array.h:36

mathematics::chaos
a platform-independent way to acquire random numbers in a specific range.
Definition: chaos.h:51

mathematics::chaos::inclusive
int inclusive(int low, int high) const
Returns a pseudo-random number r, such that "low" <= r <= "high".
Definition: chaos.h:88

NULL_POINTER
#define NULL_POINTER
The value representing a pointer to nothing.
Definition: definitions.h:32

functions.h

mutex.h

basis
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30

basis::abyte
unsigned char abyte
A fairly important unit which is seldom defined...
Definition: definitions.h:51

basis::negative
bool negative(const type &a)
negative returns true if "a" is less than zero.
Definition: functions.h:43

generate_aliases-new.to_add
None to_add
Definition: generate_aliases-new.py:294

mathematics
An extension to floating point primitives providing approximate equality.
Definition: averager.h:21

one_line_cert_to_pem.split_lines
None split_lines(str unsplit_line)
Definition: one_line_cert_to_pem.py:18

textual
Definition: byte_formatter.cpp:38

textual::splitter_finding_set
const char * splitter_finding_set
Definition: string_manipulation.cpp:30

is_eol
bool is_eol(char to_check)
Definition: nechung_oracle.cpp:249

parser_bits.h

string_manipulation.h

rando
chaos rando
Definition: test_string.cpp:118