basis/utf_conversion.cpp

   1 /*****************************************************************************\
   2 *                                                                             *
   3  *  Name   : utf_conversion                                                    *
   4  *  Author : Unicode, Inc. (C conversion functions)                            *
   5  *  Author : Chris Koeritz (C++ conversion classes)                            *
   6  *                                                                             *
   7  *******************************************************************************
   8  * Copyright (c) 2006-$now By Author.  This program is free software; you can  *
   9  * redistribute it and/or modify it under the terms of the GNU General Public  *
  10  * License as published by the Free Software Foundation; either version 2 of   *
  11  * the License or (at your option) any later version.  This is online at:      *
  12  *     http://www.fsf.org/copyleft/gpl.html                                    *
  13  * Please send any updates to: fred@gruntose.com                               *
  14  \*****************************************************************************/
  15
  16 //copyright below is relevant to UTF conversion methods only.
  17 /*
  18  * Copyright 2001-$now Unicode, Inc.
  19  *
  20  * Disclaimer
  21  *
  22  * This source code is provided as is by Unicode, Inc. No claims are
  23  * made as to fitness for any particular purpose. No warranties of any
  24  * kind are expressed or implied. The recipient agrees to determine
  25  * applicability of information provided. If this file has been
  26  * purchased on magnetic or optical media from Unicode, Inc., the
  27  * sole remedy for any claim will be exchange of defective media
  28  * within 90 days of receipt.
  29  *
  30  * Limitations on Rights to Redistribute This Code
  31  *
  32  * Unicode, Inc. hereby grants the right to freely use the information
  33  * supplied in this file in the creation of products supporting the
  34  * Unicode Standard, and to make copies of this file in any form
  35  * for internal or external distribution as long as this notice
  36  * remains attached.
  37  */
  38
  39 /* ---------------------------------------------------------------------
  40
  41  Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  42  Author: Mark E. Davis, 1994.
  43  Rev History: Rick McGowan, fixes & updates May 2001.
  44  Sept 2001: fixed const & error conditions per
  45  mods suggested by S. Parent & A. Lillich.
  46  June 2002: Tim Dodd added detection and handling of incomplete
  47  source sequences, enhanced error detection, added casts
  48  to eliminate compiler warnings.
  49  July 2003: slight mods to back out aggressive FFFE detection.
  50  Jan 2004: updated switches in from-UTF8 conversions.
  51  Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  52
  53  See the header file "ConvertUTF.h" for complete documentation.
  54
  55  ------------------------------------------------------------------------ */
  56
  57 #include "astring.h"
  58 #include "utf_conversion.h"
  59
  60 #include <string.h>
  61 #include <wchar.h>
  62 #ifdef CVTUTF_DEBUG
  63 #include <stdio.h>
  64 #endif
  65
  66 namespace basis {
  67
        /* Bit count used when splitting a code point into, or joining it from, surrogate halves. */
        static const int halfShift = 10; /* used for shifting by 10 bits */

        /* Subtracted from a code point before it is split into surrogates; added back when joining. */
        static const UTF32 halfBase = 0x0010000UL;
        /* Keeps the low ten bits of a value, i.e. the part carried by the low surrogate. */
        static const UTF32 halfMask = 0x3FFUL;

/* Inclusive bounds of the high-surrogate and low-surrogate code unit ranges. */
#define UNI_SUR_HIGH_START  (UTF32)0xD800
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
#define UNI_SUR_LOW_START   (UTF32)0xDC00
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
  77
  78         /* --------------------------------------------------------------------- */
  79
  80         ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart, const UTF32* sourceEnd,
  81           UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
  82         {
  83                 ConversionResult result = conversionOK;
  84                 const UTF32* source = *sourceStart;
  85                 UTF16* target = *targetStart;
  86                 while (source < sourceEnd) {
  87                         UTF32 ch;
  88                         if (target >= targetEnd) {
  89                                 result = targetExhausted;
  90                                 break;
  91                         }
  92                         ch = *source++;
  93                         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  94                                 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  95                                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  96                                         if (flags == strictConversion) {
  97                                                 --source; /* return to the illegal value itself */
  98                                                 result = sourceIllegal;
  99                                                 break;
 100                                         } else {
 101                                                 *target++ = UNI_REPLACEMENT_CHAR;
 102                                         }
 103                                 } else {
 104                                         *target++ = (UTF16)ch; /* normal case */
 105                                 }
 106                         } else if (ch > UNI_MAX_LEGAL_UTF32) {
 107                                 if (flags == strictConversion) {
 108                                         result = sourceIllegal;
 109                                 } else {
 110                                         *target++ = UNI_REPLACEMENT_CHAR;
 111                                 }
 112                         } else {
 113                                 /* target is a character in range 0xFFFF - 0x10FFFF. */
 114                                 if (target + 1 >= targetEnd) {
 115                                         --source; /* Back up source pointer! */
 116                                         result = targetExhausted;
 117                                         break;
 118                                 }
 119                                 ch -= halfBase;
 120                                 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
 121                                 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
 122                         }
 123                 }
 124                 *sourceStart = source;
 125                 *targetStart = target;
 126                 return result;
 127         }
 128
 129         /* --------------------------------------------------------------------- */
 130
 131         ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart, const UTF16* sourceEnd,
 132           UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
 133         {
 134                 ConversionResult result = conversionOK;
 135                 const UTF16* source = *sourceStart;
 136                 UTF32* target = *targetStart;
 137                 UTF32 ch, ch2;
 138                 while (source < sourceEnd) {
 139                         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
 140                         ch = *source++;
 141                         /* If we have a surrogate pair, convert to UTF32 first. */
 142                         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
 143                                 /* If the 16 bits following the high surrogate are in the source buffer... */
 144                                 if (source < sourceEnd) {
 145                                         ch2 = *source;
 146                                         /* If it's a low surrogate, convert to UTF32. */
 147                                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
 148                                                 ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
 149                                                     + halfBase;
 150                                                 ++source;
 151                                         } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
 152                                                 --source; /* return to the illegal value itself */
 153                                                 result = sourceIllegal;
 154                                                 break;
 155                                         }
 156                                 } else { /* We don't have the 16 bits following the high surrogate. */
 157                                         --source; /* return to the high surrogate */
 158                                         result = sourceExhausted;
 159                                         break;
 160                                 }
 161                         } else if (flags == strictConversion) {
 162                                 /* UTF-16 surrogate values are illegal in UTF-32 */
 163                                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
 164                                         --source; /* return to the illegal value itself */
 165                                         result = sourceIllegal;
 166                                         break;
 167                                 }
 168                         }
 169                         if (target >= targetEnd) {
 170                                 source = oldSource; /* Back up source pointer! */
 171                                 result = targetExhausted;
 172                                 break;
 173                         }
 174                         *target++ = ch;
 175                 }
 176                 *sourceStart = source;
 177                 *targetStart = target;
 178 #ifdef CVTUTF_DEBUG
 179                 if (result == sourceIllegal) {
 180                         fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
 181                         fflush(stderr);
 182                 }
 183 #endif
 184                 return result;
 185         }
 186
 187         /* --------------------------------------------------------------------- */
 188
 189         /*
 190          * Index into the table below with the first byte of a UTF-8 sequence to
 191          * get the number of trailing bytes that are supposed to follow it.
 192          * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
 193          * left as-is for anyone who may want to do such conversion, which was
 194          * allowed in earlier algorithms.
 195          */
 196         static const char trailingBytesForUTF8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 197             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 198             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 199             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 200             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 201             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 202             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 203             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 204             2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
 205
 206         /*
 207          * Magic values subtracted from a buffer value during UTF8 conversion.
 208          * This table contains as many values as there might be trailing bytes
 209          * in a UTF-8 sequence.
 210          */
 211         static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
 212             0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 213
 214         /*
 215          * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 216          * into the first byte, depending on how many bytes follow.  There are
 217          * as many entries in this table as there are UTF-8 sequence types.
 218          * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
 219          * for *legal* UTF-8 will be 4 or fewer bytes total.
 220          */
 221         static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 222
 223         /* --------------------------------------------------------------------- */
 224
 225         /* The interface converts a whole buffer to avoid function-call overhead.
 226          * Constants have been gathered. Loops & conditionals have been removed as
 227          * much as possible for efficiency, in favor of drop-through switches.
 228          * (See "Note A" at the bottom of the file for equivalent code.)
 229          * If your compiler supports it, the "isLegalUTF8" call can be turned
 230          * into an inline function.
 231          */
 232
 233         /* --------------------------------------------------------------------- */
 234
 235         ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart, const UTF16* sourceEnd,
 236           UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
 237         {
 238                 ConversionResult result = conversionOK;
 239                 const UTF16* source = *sourceStart;
 240                 UTF8* target = *targetStart;
 241                 while (source < sourceEnd) {
 242                         UTF32 ch;
 243                         unsigned short bytesToWrite = 0;
 244                         const UTF32 byteMask = 0xBF;
 245                         const UTF32 byteMark = 0x80;
 246                         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
 247                         ch = *source++;
 248                         /* If we have a surrogate pair, convert to UTF32 first. */
 249                         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
 250                                 /* If the 16 bits following the high surrogate are in the source buffer... */
 251                                 if (source < sourceEnd) {
 252                                         UTF32 ch2 = *source;
 253                                         /* If it's a low surrogate, convert to UTF32. */
 254                                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
 255                                                 ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
 256                                                     + halfBase;
 257                                                 ++source;
 258                                         } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
 259                                                 --source; /* return to the illegal value itself */
 260                                                 result = sourceIllegal;
 261                                                 break;
 262                                         }
 263                                 } else { /* We don't have the 16 bits following the high surrogate. */
 264                                         --source; /* return to the high surrogate */
 265                                         result = sourceExhausted;
 266                                         break;
 267                                 }
 268                         } else if (flags == strictConversion) {
 269                                 /* UTF-16 surrogate values are illegal in UTF-32 */
 270                                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
 271                                         --source; /* return to the illegal value itself */
 272                                         result = sourceIllegal;
 273                                         break;
 274                                 }
 275                         }
 276                         /* Figure out how many bytes the result will require */
 277                         if (ch < (UTF32)0x80) {
 278                                 bytesToWrite = 1;
 279                         } else if (ch < (UTF32)0x800) {
 280                                 bytesToWrite = 2;
 281                         } else if (ch < (UTF32)0x10000) {
 282                                 bytesToWrite = 3;
 283                         } else if (ch < (UTF32)0x110000) {
 284                                 bytesToWrite = 4;
 285                         } else {
 286                                 bytesToWrite = 3;
 287                                 ch = UNI_REPLACEMENT_CHAR;
 288                         }
 289
 290                         target += bytesToWrite;
 291                         if (target > targetEnd) {
 292                                 source = oldSource; /* Back up source pointer! */
 293                                 target -= bytesToWrite;
 294                                 result = targetExhausted;
 295                                 break;
 296                         }
 297                         switch (bytesToWrite) { /* note: everything falls through. */
 298                                 case 4:
 299                                         *--target = (UTF8)((ch | byteMark) & byteMask);
 300                                         ch >>= 6;
 301                                         // no break
 302                                 case 3:
 303                                         *--target = (UTF8)((ch | byteMark) & byteMask);
 304                                         ch >>= 6;
 305                                         // no break.
 306                                 case 2:
 307                                         *--target = (UTF8)((ch | byteMark) & byteMask);
 308                                         ch >>= 6;
 309                                         // no break.
 310                                 case 1:
 311                                         *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
 312                                         // no break.
 313                         }
 314                         target += bytesToWrite;
 315                 }
 316                 *sourceStart = source;
 317                 *targetStart = target;
 318                 return result;
 319         }
 320
 321         /* --------------------------------------------------------------------- */
 322
 323         /*
 324          * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 325          * This must be called with the length pre-determined by the first byte.
 326          * If not calling this from ConvertUTF8to*, then the length can be set by:
 327          *  length = trailingBytesForUTF8[*source]+1;
 328          * and the sequence is illegal right away if there aren't that many bytes
 329          * available.
 330          * If presented with a length > 4, this returns false.  The Unicode
 331          * definition of UTF-8 goes up to 4-byte sequences.
 332          */
 333
 334         static Booleano isLegalUTF8(const UTF8 *source, int length)
 335         {
 336                 UTF8 a;
 337                 const UTF8 *srcptr = source + length;
 338                 switch (length) {
 339                         /* Everything else falls through when "true"... */
 340                         case 4: {
 341                                 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 342                         }
 343                                 /* no break */
 344                         case 3: {
 345                                 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 346                         }
 347                                 /* no break */
 348                         case 2: {
 349                                 if ((a = (*--srcptr)) > 0xBF) return false;
 350
 351                                 switch (*source) {
 352                                         /* no fall-through in this inner switch */
 353                                         case 0xE0:
 354                                                 if (a < 0xA0) return false;
 355                                                 break;
 356                                         case 0xED:
 357                                                 if (a > 0x9F) return false;
 358                                                 break;
 359                                         case 0xF0:
 360                                                 if (a < 0x90) return false;
 361                                                 break;
 362                                         case 0xF4:
 363                                                 if (a > 0x8F) return false;
 364                                                 break;
 365                                         default:
 366                                                 if (a < 0x80) return false;
 367                                                 break;
 368                                 }
 369                         }
 370                                 /* no break */
 371                         case 1: {
 372                                 if (*source >= 0x80 && *source < 0xC2) return false;
 373                         }
 374                                 /* no break */
 375                         default: {
 376                                 return false;
 377                         }
 378                 }
 379                 if (*source > 0xF4) return false;
 380                 return true;
 381         }
 382
 383         /* --------------------------------------------------------------------- */
 384
 385         /*
 386          * Exported function to return whether a UTF-8 sequence is legal or not.
 387          * This is not used here; it's just exported.
 388          */
 389         Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
 390         {
 391                 int length = trailingBytesForUTF8[*source] + 1;
 392                 if (source + length > sourceEnd) {
 393                         return false;
 394                 }
 395                 return isLegalUTF8(source, length);
 396         }
 397
 398         /* --------------------------------------------------------------------- */
 399
 400         ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart, const UTF8* sourceEnd,
 401           UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
 402         {
 403                 ConversionResult result = conversionOK;
 404                 const UTF8* source = *sourceStart;
 405                 UTF16* target = *targetStart;
 406                 while (source < sourceEnd) {
 407                         UTF32 ch = 0;
 408                         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 409                         if (source + extraBytesToRead >= sourceEnd) {
 410                                 result = sourceExhausted;
 411                                 break;
 412                         }
 413                         /* Do this check whether lenient or strict */
 414                         if (!isLegalUTF8(source, extraBytesToRead + 1)) {
 415                                 result = sourceIllegal;
 416                                 break;
 417                         }
 418                         /*
 419                          * The cases all fall through. See "Note A" below.
 420                          */
 421                         switch (extraBytesToRead) {
 422                                 case 5:
 423                                         ch += *source++;
 424                                         ch <<= 6; /* remember, illegal UTF-8 */
 425                                         /* no break */
 426                                 case 4:
 427                                         ch += *source++;
 428                                         ch <<= 6; /* remember, illegal UTF-8 */
 429                                         /* no break */
 430                                 case 3:
 431                                         ch += *source++;
 432                                         ch <<= 6;
 433                                         /* no break */
 434                                 case 2:
 435                                         ch += *source++;
 436                                         ch <<= 6;
 437                                         /* no break */
 438                                 case 1:
 439                                         ch += *source++;
 440                                         ch <<= 6;
 441                                         /* no break */
 442                                 case 0:
 443                                         ch += *source++;
 444                                         /* no break */
 445                         }
 446                         ch -= offsetsFromUTF8[extraBytesToRead];
 447
 448                         if (target >= targetEnd) {
 449                                 source -= (extraBytesToRead + 1); /* Back up source pointer! */
 450                                 result = targetExhausted;
 451                                 break;
 452                         }
 453                         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
 454                                 /* UTF-16 surrogate values are illegal in UTF-32 */
 455                                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 456                                         if (flags == strictConversion) {
 457                                                 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
 458                                                 result = sourceIllegal;
 459                                                 break;
 460                                         } else {
 461                                                 *target++ = UNI_REPLACEMENT_CHAR;
 462                                         }
 463                                 } else {
 464                                         *target++ = (UTF16)ch; /* normal case */
 465                                 }
 466                         } else if (ch > UNI_MAX_UTF16) {
 467                                 if (flags == strictConversion) {
 468                                         result = sourceIllegal;
 469                                         source -= (extraBytesToRead + 1); /* return to the start */
 470                                         break; /* Bail out; shouldn't continue */
 471                                 } else {
 472                                         *target++ = UNI_REPLACEMENT_CHAR;
 473                                 }
 474                         } else {
 475                                 /* target is a character in range 0xFFFF - 0x10FFFF. */
 476                                 if (target + 1 >= targetEnd) {
 477                                         source -= (extraBytesToRead + 1); /* Back up source pointer! */
 478                                         result = targetExhausted;
 479                                         break;
 480                                 }
 481                                 ch -= halfBase;
 482                                 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
 483                                 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
 484                         }
 485                 }
 486                 *sourceStart = source;
 487                 *targetStart = target;
 488                 return result;
 489         }
 490
 491         /* --------------------------------------------------------------------- */
 492
 493         ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart, const UTF32* sourceEnd,
 494           UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
 495         {
 496                 ConversionResult result = conversionOK;
 497                 const UTF32* source = *sourceStart;
 498                 UTF8* target = *targetStart;
 499                 while (source < sourceEnd) {
 500                         UTF32 ch;
 501                         unsigned short bytesToWrite = 0;
 502                         const UTF32 byteMask = 0xBF;
 503                         const UTF32 byteMark = 0x80;
 504                         ch = *source++;
 505                         if (flags == strictConversion) {
 506                                 /* UTF-16 surrogate values are illegal in UTF-32 */
 507                                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 508                                         --source; /* return to the illegal value itself */
 509                                         result = sourceIllegal;
 510                                         break;
 511                                 }
 512                         }
 513                         /*
 514                          * Figure out how many bytes the result will require. Turn any
 515                          * illegally large UTF32 things (> Plane 17) into replacement chars.
 516                          */
 517                         if (ch < (UTF32)0x80) {
 518                                 bytesToWrite = 1;
 519                         } else if (ch < (UTF32)0x800) {
 520                                 bytesToWrite = 2;
 521                         } else if (ch < (UTF32)0x10000) {
 522                                 bytesToWrite = 3;
 523                         } else if (ch <= UNI_MAX_LEGAL_UTF32) {
 524                                 bytesToWrite = 4;
 525                         } else {
 526                                 bytesToWrite = 3;
 527                                 ch = UNI_REPLACEMENT_CHAR;
 528                                 result = sourceIllegal;
 529                         }
 530
 531                         target += bytesToWrite;
 532                         if (target > targetEnd) {
 533                                 --source; /* Back up source pointer! */
 534                                 target -= bytesToWrite;
 535                                 result = targetExhausted;
 536                                 break;
 537                         }
 538                         switch (bytesToWrite) { /* note: everything falls through. */
 539                                 case 4:
 540                                         *--target = (UTF8)((ch | byteMark) & byteMask);
 541                                         ch >>= 6;
 542                                         /* no break */
 543                                 case 3:
 544                                         *--target = (UTF8)((ch | byteMark) & byteMask);
 545                                         ch >>= 6;
 546                                         /* no break */
 547                                 case 2:
 548                                         *--target = (UTF8)((ch | byteMark) & byteMask);
 549                                         ch >>= 6;
 550                                         /* no break */
 551                                 case 1:
 552                                         *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
 553                                         /* no break */
 554                         }
 555                         target += bytesToWrite;
 556                 }
 557                 *sourceStart = source;
 558                 *targetStart = target;
 559                 return result;
 560         }
 561
 562         /* --------------------------------------------------------------------- */
 563
 564         ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart, const UTF8* sourceEnd,
 565           UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
 566         {
 567                 ConversionResult result = conversionOK;
 568                 const UTF8* source = *sourceStart;
 569                 UTF32* target = *targetStart;
 570                 while (source < sourceEnd) {
 571                         UTF32 ch = 0;
 572                         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 573                         if (source + extraBytesToRead >= sourceEnd) {
 574                                 result = sourceExhausted;
 575                                 break;
 576                         }
 577                         /* Do this check whether lenient or strict */
 578                         if (!isLegalUTF8(source, extraBytesToRead + 1)) {
 579                                 result = sourceIllegal;
 580                                 break;
 581                         }
 582                         /*
 583                          * The cases all fall through. See "Note A" below.
 584                          */
 585                         switch (extraBytesToRead) {
 586                                 case 5:
 587                                         ch += *source++;
 588                                         ch <<= 6;
 589                                         /* no break */
 590                                 case 4:
 591                                         ch += *source++;
 592                                         ch <<= 6;
 593                                         /* no break */
 594                                 case 3:
 595                                         ch += *source++;
 596                                         ch <<= 6;
 597                                         /* no break */
 598                                 case 2:
 599                                         ch += *source++;
 600                                         ch <<= 6;
 601                                         /* no break */
 602                                 case 1:
 603                                         ch += *source++;
 604                                         ch <<= 6;
 605                                         /* no break */
 606                                 case 0:
 607                                         ch += *source++;
 608                                         /* no break */
 609                         }
 610                         ch -= offsetsFromUTF8[extraBytesToRead];
 611
 612                         if (target >= targetEnd) {
 613                                 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
 614                                 result = targetExhausted;
 615                                 break;
 616                         }
 617                         if (ch <= UNI_MAX_LEGAL_UTF32) {
 618                                 /*
 619                                  * UTF-16 surrogate values are illegal in UTF-32, and anything
 620                                  * over Plane 17 (> 0x10FFFF) is illegal.
 621                                  */
 622                                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 623                                         if (flags == strictConversion) {
 624                                                 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
 625                                                 result = sourceIllegal;
 626                                                 break;
 627                                         } else {
 628                                                 *target++ = UNI_REPLACEMENT_CHAR;
 629                                         }
 630                                 } else {
 631                                         *target++ = ch;
 632                                 }
 633                         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
 634                                 result = sourceIllegal;
 635                                 *target++ = UNI_REPLACEMENT_CHAR;
 636                         }
 637                 }
 638                 *sourceStart = source;
 639                 *targetStart = target;
 640                 return result;
 641         }
 642
 643         /* ---------------------------------------------------------------------
 644
 645          Note A.
 646          The fall-through switches in UTF-8 reading code save a
 647          temp variable, some decrements & conditionals.  The switches
 648          are equivalent to the following loop:
 649          {
 650          int tmpBytesToRead = extraBytesToRead+1;
 651          do {
 652          ch += *source++;
 653          --tmpBytesToRead;
 654          if (tmpBytesToRead) ch <<= 6;
 655          } while (tmpBytesToRead > 0);
 656          }
 657          In UTF-8 writing code, the switches on "bytesToWrite" are
 658          similarly unrolled loops.
 659
 660          --------------------------------------------------------------------- */
 661
 662 //////////////
 663 #ifdef __cplusplus
 664
 665         transcode_to_utf16::transcode_to_utf16(const char *utf8_input)
 666                         : _orig_length(int(strlen(utf8_input)) + 1), _converted(new UTF16[_orig_length + 1])
 667         // we don't ever expect the string to get longer going to the larger data
 668         // type, so the current length should be enough.
 669         {
 670                 _result = conversionOK;
 671                 if (_orig_length == 1) {
 672                         // no length, so only provide a blank string.
 673                         _converted[0] = 0;
 674                         return;
 675                 }
 676                 memset((abyte *)_converted, 0, 2 * _orig_length);
 677                 // we use these temporary pointers since the converter resets the source
 678                 // and target pointers to the end of the conversion.  the same pattern
 679                 // is used in the code below.
 680                 const UTF8 *temp_in = (const UTF8 *)utf8_input;
 681                 UTF16 *temp_out = _converted;
 682                 _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
 683                     temp_out + _orig_length, lenientConversion);
 684         }
 685
 686         transcode_to_utf16::transcode_to_utf16(const astring &utf8_input)
 687                         : _orig_length(utf8_input.length() + 1), _converted(new UTF16[_orig_length])
 688         {
 689                 _result = conversionOK;
 690                 if (_orig_length == 1) {
 691                         // no length, so only provide a blank string.
 692                         _converted[0] = 0;
 693                         return;
 694                 }
 695                 memset((abyte *)_converted, 0, 2 * _orig_length);
 696                 const UTF8 *temp_in = (const UTF8 *)utf8_input.observe();
 697                 UTF16 *temp_out = _converted;
 698                 _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
 699                     temp_out + _orig_length, lenientConversion);
 700         }
 701
 702         transcode_to_utf16::~transcode_to_utf16()
 703         {
 704                 delete[] _converted;
 705                 _converted = NULL_POINTER;
 706         }
 707
 708         int transcode_to_utf16::length() const
 709         {
 710                 return int(wcslen((wchar_t *)_converted));
 711         }
 712
 713 //////////////
 714
 715         transcode_to_utf8::transcode_to_utf8(const UTF16 *utf16_input)
 716                         : _orig_length(int(wcslen((const wchar_t *)utf16_input))),
 717                             _new_length(_orig_length * 2 + _orig_length / 2 + 1),
 718                             // this is just an estimate.  it may be appropriate most of the time.
 719                             // whatever doesn't fit will get truncated.
 720                             _converted(new UTF8[_new_length])
 721         {
 722                 _result = conversionOK;
 723                 if (_orig_length == 0) {
 724                         // no length, so only provide a blank string.
 725                         _converted[0] = 0;
 726                         return;
 727                 }
 728                 memset(_converted, 0, _new_length);
 729                 const UTF16 *temp_in = (const UTF16 *)utf16_input;
 730                 UTF8 *temp_out = _converted;
 731                 _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
 732                     temp_out + _new_length, lenientConversion);
 733         }
 734
 735         transcode_to_utf8::transcode_to_utf8(const wchar_t *utf16_input)
 736                         : _orig_length(int(wcslen(utf16_input))),
 737                             _new_length(_orig_length * 2 + _orig_length / 2 + 1),
 738                             // this is just an estimate.  it may be appropriate most of the time.
 739                             // whatever doesn't fit will get truncated.
 740                             _converted(new UTF8[_new_length > 0 ? _new_length : 1])
 741         {
 742                 _result = conversionOK;
 743                 if (_orig_length == 0) {
 744                         // no length, so only provide a blank string.
 745                         _converted[0] = 0;
 746                         return;
 747                 }
 748                 memset(_converted, 0, _new_length);
 749                 const UTF16 *temp_in = (const UTF16 *)utf16_input;
 750                 UTF8 *temp_out = _converted;
 751                 _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
 752                     temp_out + _new_length, lenientConversion);
 753         }
 754
 755         transcode_to_utf8::~transcode_to_utf8()
 756         {
 757                 delete[] _converted;
 758                 _converted = NULL_POINTER;
 759         }
 760
 761         int transcode_to_utf8::length() const
 762         {
 763                 return int(strlen((char *)_converted));
 764         }
 765
 766         transcode_to_utf8::operator astring() const
 767         {
 768                 return astring((char *)_converted);
 769         }
 770
 771 //////////////
 772
 773         null_transcoder::null_transcoder(const char *utf8_input, bool make_own_copy)
 774                         : _make_own_copy(make_own_copy),
 775                             _converted(make_own_copy ? new UTF8[strlen(utf8_input) + 1] : (const UTF8 *)utf8_input)
 776         {
 777                 if (_make_own_copy) {
 778                         strcpy((char *)_converted, utf8_input);
 779                 }
 780         }
 781
 782         null_transcoder::null_transcoder(const astring &utf8_input, bool make_own_copy)
 783                         : _make_own_copy(make_own_copy),
 784                             _converted(
 785                                 make_own_copy ? new UTF8[utf8_input.length() + 1] : (const UTF8 *)utf8_input.s())
 786         {
 787                 if (_make_own_copy) {
 788                         strcpy((char *)_converted, utf8_input.s());
 789                 }
 790         }
 791
 792         int null_transcoder::length() const
 793         {
 794                 return int(strlen((char *)_converted));
 795         }
 796
#endif //__cplusplus
 798 }  //namespace.
 799