feisty meow concerns codebase  2.140
utf_conversion.cpp
Go to the documentation of this file.
1 /*****************************************************************************\
2 * *
3  * Name : utf_conversion *
4  * Author : Unicode, Inc. (C conversion functions) *
5  * Author : Chris Koeritz (C++ conversion classes) *
6  * *
7  *******************************************************************************
8  * Copyright (c) 2006-$now By Author. This program is free software; you can *
9  * redistribute it and/or modify it under the terms of the GNU General Public *
10  * License as published by the Free Software Foundation; either version 2 of *
11  * the License or (at your option) any later version. This is online at: *
12  * http://www.fsf.org/copyleft/gpl.html *
13  * Please send any updates to: fred@gruntose.com *
14  \*****************************************************************************/
15 
16 //copyright below is relevant to UTF conversion methods only.
17 /*
18  * Copyright 2001-$now Unicode, Inc.
19  *
20  * Disclaimer
21  *
22  * This source code is provided as is by Unicode, Inc. No claims are
23  * made as to fitness for any particular purpose. No warranties of any
24  * kind are expressed or implied. The recipient agrees to determine
25  * applicability of information provided. If this file has been
26  * purchased on magnetic or optical media from Unicode, Inc., the
27  * sole remedy for any claim will be exchange of defective media
28  * within 90 days of receipt.
29  *
30  * Limitations on Rights to Redistribute This Code
31  *
32  * Unicode, Inc. hereby grants the right to freely use the information
33  * supplied in this file in the creation of products supporting the
34  * Unicode Standard, and to make copies of this file in any form
35  * for internal or external distribution as long as this notice
36  * remains attached.
37  */
38 
39 /* ---------------------------------------------------------------------
40 
41  Conversions between UTF32, UTF-16, and UTF-8. Source code file.
42  Author: Mark E. Davis, 1994.
43  Rev History: Rick McGowan, fixes & updates May 2001.
44  Sept 2001: fixed const & error conditions per
45  mods suggested by S. Parent & A. Lillich.
46  June 2002: Tim Dodd added detection and handling of incomplete
47  source sequences, enhanced error detection, added casts
48  to eliminate compiler warnings.
49  July 2003: slight mods to back out aggressive FFFE detection.
50  Jan 2004: updated switches in from-UTF8 conversions.
51  Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
52 
53  See the header file "ConvertUTF.h" for complete documentation.
54 
55  ------------------------------------------------------------------------ */
56 
57 #include "astring.h"
58 #include "utf_conversion.h"
59 
60 #include <string.h>
61 #include <wchar.h>
62 #ifdef CVTUTF_DEBUG
63 #include <stdio.h>
64 #endif
65 
66 namespace basis {
67 
68  static const int halfShift = 10; /* used for shifting by 10 bits */
69 
70  static const UTF32 halfBase = 0x0010000UL;
71  static const UTF32 halfMask = 0x3FFUL;
72 
73 #define UNI_SUR_HIGH_START (UTF32)0xD800
74 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
75 #define UNI_SUR_LOW_START (UTF32)0xDC00
76 #define UNI_SUR_LOW_END (UTF32)0xDFFF
77 
78  /* --------------------------------------------------------------------- */
79 
80  ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart, const UTF32* sourceEnd,
81  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
82  {
84  const UTF32* source = *sourceStart;
85  UTF16* target = *targetStart;
86  while (source < sourceEnd) {
87  UTF32 ch;
88  if (target >= targetEnd) {
89  result = targetExhausted;
90  break;
91  }
92  ch = *source++;
93  if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
94  /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
95  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
96  if (flags == strictConversion) {
97  --source; /* return to the illegal value itself */
98  result = sourceIllegal;
99  break;
100  } else {
101  *target++ = UNI_REPLACEMENT_CHAR;
102  }
103  } else {
104  *target++ = (UTF16)ch; /* normal case */
105  }
106  } else if (ch > UNI_MAX_LEGAL_UTF32) {
107  if (flags == strictConversion) {
108  result = sourceIllegal;
109  } else {
110  *target++ = UNI_REPLACEMENT_CHAR;
111  }
112  } else {
113  /* target is a character in range 0xFFFF - 0x10FFFF. */
114  if (target + 1 >= targetEnd) {
115  --source; /* Back up source pointer! */
116  result = targetExhausted;
117  break;
118  }
119  ch -= halfBase;
120  *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
121  *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
122  }
123  }
124  *sourceStart = source;
125  *targetStart = target;
126  return result;
127  }
128 
129  /* --------------------------------------------------------------------- */
130 
131  ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart, const UTF16* sourceEnd,
132  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
133  {
135  const UTF16* source = *sourceStart;
136  UTF32* target = *targetStart;
137  UTF32 ch, ch2;
138  while (source < sourceEnd) {
139  const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
140  ch = *source++;
141  /* If we have a surrogate pair, convert to UTF32 first. */
142  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
143  /* If the 16 bits following the high surrogate are in the source buffer... */
144  if (source < sourceEnd) {
145  ch2 = *source;
146  /* If it's a low surrogate, convert to UTF32. */
147  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
148  ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
149  + halfBase;
150  ++source;
151  } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
152  --source; /* return to the illegal value itself */
153  result = sourceIllegal;
154  break;
155  }
156  } else { /* We don't have the 16 bits following the high surrogate. */
157  --source; /* return to the high surrogate */
158  result = sourceExhausted;
159  break;
160  }
161  } else if (flags == strictConversion) {
162  /* UTF-16 surrogate values are illegal in UTF-32 */
163  if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
164  --source; /* return to the illegal value itself */
165  result = sourceIllegal;
166  break;
167  }
168  }
169  if (target >= targetEnd) {
170  source = oldSource; /* Back up source pointer! */
171  result = targetExhausted;
172  break;
173  }
174  *target++ = ch;
175  }
176  *sourceStart = source;
177  *targetStart = target;
178 #ifdef CVTUTF_DEBUG
179  if (result == sourceIllegal) {
180  fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
181  fflush(stderr);
182  }
183 #endif
184  return result;
185  }
186 
187  /* --------------------------------------------------------------------- */
188 
/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the entry is the
 * number of trailing bytes that are supposed to follow that lead byte.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries
 * are left as-is for anyone who may want to do such conversion, which was
 * allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
  /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 - 0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xD0 - 0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xE0 - 0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0 - 0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
205 
206  /*
207  * Magic values subtracted from a buffer value during UTF8 conversion.
208  * This table contains as many values as there might be trailing bytes
209  * in a UTF-8 sequence.
210  */
211  static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
212  0x03C82080UL, 0xFA082080UL, 0x82082080UL };
213 
214  /*
215  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
216  * into the first byte, depending on how many bytes follow. There are
217  * as many entries in this table as there are UTF-8 sequence types.
218  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
219  * for *legal* UTF-8 will be 4 or fewer bytes total.
220  */
221  static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
222 
223  /* --------------------------------------------------------------------- */
224 
225  /* The interface converts a whole buffer to avoid function-call overhead.
226  * Constants have been gathered. Loops & conditionals have been removed as
227  * much as possible for efficiency, in favor of drop-through switches.
228  * (See "Note A" at the bottom of the file for equivalent code.)
229  * If your compiler supports it, the "isLegalUTF8" call can be turned
230  * into an inline function.
231  */
232 
233  /* --------------------------------------------------------------------- */
234 
235  ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart, const UTF16* sourceEnd,
236  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
237  {
239  const UTF16* source = *sourceStart;
240  UTF8* target = *targetStart;
241  while (source < sourceEnd) {
242  UTF32 ch;
243  unsigned short bytesToWrite = 0;
244  const UTF32 byteMask = 0xBF;
245  const UTF32 byteMark = 0x80;
246  const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
247  ch = *source++;
248  /* If we have a surrogate pair, convert to UTF32 first. */
249  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
250  /* If the 16 bits following the high surrogate are in the source buffer... */
251  if (source < sourceEnd) {
252  UTF32 ch2 = *source;
253  /* If it's a low surrogate, convert to UTF32. */
254  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
255  ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
256  + halfBase;
257  ++source;
258  } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
259  --source; /* return to the illegal value itself */
260  result = sourceIllegal;
261  break;
262  }
263  } else { /* We don't have the 16 bits following the high surrogate. */
264  --source; /* return to the high surrogate */
265  result = sourceExhausted;
266  break;
267  }
268  } else if (flags == strictConversion) {
269  /* UTF-16 surrogate values are illegal in UTF-32 */
270  if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
271  --source; /* return to the illegal value itself */
272  result = sourceIllegal;
273  break;
274  }
275  }
276  /* Figure out how many bytes the result will require */
277  if (ch < (UTF32)0x80) {
278  bytesToWrite = 1;
279  } else if (ch < (UTF32)0x800) {
280  bytesToWrite = 2;
281  } else if (ch < (UTF32)0x10000) {
282  bytesToWrite = 3;
283  } else if (ch < (UTF32)0x110000) {
284  bytesToWrite = 4;
285  } else {
286  bytesToWrite = 3;
288  }
289 
290  target += bytesToWrite;
291  if (target > targetEnd) {
292  source = oldSource; /* Back up source pointer! */
293  target -= bytesToWrite;
294  result = targetExhausted;
295  break;
296  }
297  switch (bytesToWrite) { /* note: everything falls through. */
298  case 4:
299  *--target = (UTF8)((ch | byteMark) & byteMask);
300  ch >>= 6;
301  // no break
302  case 3:
303  *--target = (UTF8)((ch | byteMark) & byteMask);
304  ch >>= 6;
305  // no break.
306  case 2:
307  *--target = (UTF8)((ch | byteMark) & byteMask);
308  ch >>= 6;
309  // no break.
310  case 1:
311  *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
312  // no break.
313  }
314  target += bytesToWrite;
315  }
316  *sourceStart = source;
317  *targetStart = target;
318  return result;
319  }
320 
321  /* --------------------------------------------------------------------- */
322 
323  /*
324  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
325  * This must be called with the length pre-determined by the first byte.
326  * If not calling this from ConvertUTF8to*, then the length can be set by:
327  * length = trailingBytesForUTF8[*source]+1;
328  * and the sequence is illegal right away if there aren't that many bytes
329  * available.
330  * If presented with a length > 4, this returns false. The Unicode
331  * definition of UTF-8 goes up to 4-byte sequences.
332  */
333 
334  static Booleano isLegalUTF8(const UTF8 *source, int length)
335  {
336  UTF8 a;
337  const UTF8 *srcptr = source + length;
338  switch (length) {
339  /* Everything else falls through when "true"... */
340  case 4: {
341  if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
342  }
343  /* no break */
344  case 3: {
345  if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
346  }
347  /* no break */
348  case 2: {
349  if ((a = (*--srcptr)) > 0xBF) return false;
350 
351  switch (*source) {
352  /* no fall-through in this inner switch */
353  case 0xE0:
354  if (a < 0xA0) return false;
355  break;
356  case 0xED:
357  if (a > 0x9F) return false;
358  break;
359  case 0xF0:
360  if (a < 0x90) return false;
361  break;
362  case 0xF4:
363  if (a > 0x8F) return false;
364  break;
365  default:
366  if (a < 0x80) return false;
367  break;
368  }
369  }
370  /* no break */
371  case 1: {
372  if (*source >= 0x80 && *source < 0xC2) return false;
373  }
374  /* no break */
375  default: {
376  return false;
377  }
378  }
379  if (*source > 0xF4) return false;
380  return true;
381  }
382 
383  /* --------------------------------------------------------------------- */
384 
385  /*
386  * Exported function to return whether a UTF-8 sequence is legal or not.
387  * This is not used here; it's just exported.
388  */
389  Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
390  {
391  int length = trailingBytesForUTF8[*source] + 1;
392  if (source + length > sourceEnd) {
393  return false;
394  }
395  return isLegalUTF8(source, length);
396  }
397 
398  /* --------------------------------------------------------------------- */
399 
400  ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart, const UTF8* sourceEnd,
401  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
402  {
404  const UTF8* source = *sourceStart;
405  UTF16* target = *targetStart;
406  while (source < sourceEnd) {
407  UTF32 ch = 0;
408  unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
409  if (source + extraBytesToRead >= sourceEnd) {
410  result = sourceExhausted;
411  break;
412  }
413  /* Do this check whether lenient or strict */
414  if (!isLegalUTF8(source, extraBytesToRead + 1)) {
415  result = sourceIllegal;
416  break;
417  }
418  /*
419  * The cases all fall through. See "Note A" below.
420  */
421  switch (extraBytesToRead) {
422  case 5:
423  ch += *source++;
424  ch <<= 6; /* remember, illegal UTF-8 */
425  /* no break */
426  case 4:
427  ch += *source++;
428  ch <<= 6; /* remember, illegal UTF-8 */
429  /* no break */
430  case 3:
431  ch += *source++;
432  ch <<= 6;
433  /* no break */
434  case 2:
435  ch += *source++;
436  ch <<= 6;
437  /* no break */
438  case 1:
439  ch += *source++;
440  ch <<= 6;
441  /* no break */
442  case 0:
443  ch += *source++;
444  /* no break */
445  }
446  ch -= offsetsFromUTF8[extraBytesToRead];
447 
448  if (target >= targetEnd) {
449  source -= (extraBytesToRead + 1); /* Back up source pointer! */
450  result = targetExhausted;
451  break;
452  }
453  if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
454  /* UTF-16 surrogate values are illegal in UTF-32 */
455  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
456  if (flags == strictConversion) {
457  source -= (extraBytesToRead + 1); /* return to the illegal value itself */
458  result = sourceIllegal;
459  break;
460  } else {
461  *target++ = UNI_REPLACEMENT_CHAR;
462  }
463  } else {
464  *target++ = (UTF16)ch; /* normal case */
465  }
466  } else if (ch > UNI_MAX_UTF16) {
467  if (flags == strictConversion) {
468  result = sourceIllegal;
469  source -= (extraBytesToRead + 1); /* return to the start */
470  break; /* Bail out; shouldn't continue */
471  } else {
472  *target++ = UNI_REPLACEMENT_CHAR;
473  }
474  } else {
475  /* target is a character in range 0xFFFF - 0x10FFFF. */
476  if (target + 1 >= targetEnd) {
477  source -= (extraBytesToRead + 1); /* Back up source pointer! */
478  result = targetExhausted;
479  break;
480  }
481  ch -= halfBase;
482  *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
483  *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
484  }
485  }
486  *sourceStart = source;
487  *targetStart = target;
488  return result;
489  }
490 
491  /* --------------------------------------------------------------------- */
492 
493  ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart, const UTF32* sourceEnd,
494  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
495  {
497  const UTF32* source = *sourceStart;
498  UTF8* target = *targetStart;
499  while (source < sourceEnd) {
500  UTF32 ch;
501  unsigned short bytesToWrite = 0;
502  const UTF32 byteMask = 0xBF;
503  const UTF32 byteMark = 0x80;
504  ch = *source++;
505  if (flags == strictConversion) {
506  /* UTF-16 surrogate values are illegal in UTF-32 */
507  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
508  --source; /* return to the illegal value itself */
509  result = sourceIllegal;
510  break;
511  }
512  }
513  /*
514  * Figure out how many bytes the result will require. Turn any
515  * illegally large UTF32 things (> Plane 17) into replacement chars.
516  */
517  if (ch < (UTF32)0x80) {
518  bytesToWrite = 1;
519  } else if (ch < (UTF32)0x800) {
520  bytesToWrite = 2;
521  } else if (ch < (UTF32)0x10000) {
522  bytesToWrite = 3;
523  } else if (ch <= UNI_MAX_LEGAL_UTF32) {
524  bytesToWrite = 4;
525  } else {
526  bytesToWrite = 3;
528  result = sourceIllegal;
529  }
530 
531  target += bytesToWrite;
532  if (target > targetEnd) {
533  --source; /* Back up source pointer! */
534  target -= bytesToWrite;
535  result = targetExhausted;
536  break;
537  }
538  switch (bytesToWrite) { /* note: everything falls through. */
539  case 4:
540  *--target = (UTF8)((ch | byteMark) & byteMask);
541  ch >>= 6;
542  /* no break */
543  case 3:
544  *--target = (UTF8)((ch | byteMark) & byteMask);
545  ch >>= 6;
546  /* no break */
547  case 2:
548  *--target = (UTF8)((ch | byteMark) & byteMask);
549  ch >>= 6;
550  /* no break */
551  case 1:
552  *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
553  /* no break */
554  }
555  target += bytesToWrite;
556  }
557  *sourceStart = source;
558  *targetStart = target;
559  return result;
560  }
561 
562  /* --------------------------------------------------------------------- */
563 
564  ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart, const UTF8* sourceEnd,
565  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
566  {
568  const UTF8* source = *sourceStart;
569  UTF32* target = *targetStart;
570  while (source < sourceEnd) {
571  UTF32 ch = 0;
572  unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
573  if (source + extraBytesToRead >= sourceEnd) {
574  result = sourceExhausted;
575  break;
576  }
577  /* Do this check whether lenient or strict */
578  if (!isLegalUTF8(source, extraBytesToRead + 1)) {
579  result = sourceIllegal;
580  break;
581  }
582  /*
583  * The cases all fall through. See "Note A" below.
584  */
585  switch (extraBytesToRead) {
586  case 5:
587  ch += *source++;
588  ch <<= 6;
589  /* no break */
590  case 4:
591  ch += *source++;
592  ch <<= 6;
593  /* no break */
594  case 3:
595  ch += *source++;
596  ch <<= 6;
597  /* no break */
598  case 2:
599  ch += *source++;
600  ch <<= 6;
601  /* no break */
602  case 1:
603  ch += *source++;
604  ch <<= 6;
605  /* no break */
606  case 0:
607  ch += *source++;
608  /* no break */
609  }
610  ch -= offsetsFromUTF8[extraBytesToRead];
611 
612  if (target >= targetEnd) {
613  source -= (extraBytesToRead + 1); /* Back up the source pointer! */
614  result = targetExhausted;
615  break;
616  }
617  if (ch <= UNI_MAX_LEGAL_UTF32) {
618  /*
619  * UTF-16 surrogate values are illegal in UTF-32, and anything
620  * over Plane 17 (> 0x10FFFF) is illegal.
621  */
622  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
623  if (flags == strictConversion) {
624  source -= (extraBytesToRead + 1); /* return to the illegal value itself */
625  result = sourceIllegal;
626  break;
627  } else {
628  *target++ = UNI_REPLACEMENT_CHAR;
629  }
630  } else {
631  *target++ = ch;
632  }
633  } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
634  result = sourceIllegal;
635  *target++ = UNI_REPLACEMENT_CHAR;
636  }
637  }
638  *sourceStart = source;
639  *targetStart = target;
640  return result;
641  }
642 
643  /* ---------------------------------------------------------------------
644 
645  Note A.
646  The fall-through switches in UTF-8 reading code save a
647  temp variable, some decrements & conditionals. The switches
648  are equivalent to the following loop:
649  {
650  int tmpBytesToRead = extraBytesToRead+1;
651  do {
652  ch += *source++;
653  --tmpBytesToRead;
654  if (tmpBytesToRead) ch <<= 6;
655  } while (tmpBytesToRead > 0);
656  }
657  In UTF-8 writing code, the switches on "bytesToWrite" are
658  similarly unrolled loops.
659 
660  --------------------------------------------------------------------- */
661 
663 #ifdef __cplusplus
664 
665  transcode_to_utf16::transcode_to_utf16(const char *utf8_input)
666  : _orig_length(int(strlen(utf8_input)) + 1), _converted(new UTF16[_orig_length + 1])
667  // we don't ever expect the string to get longer going to the larger data
668  // type, so the current length should be enough.
669  {
670  _result = conversionOK;
671  if (_orig_length == 1) {
672  // no length, so only provide a blank string.
673  _converted[0] = 0;
674  return;
675  }
676  memset((abyte *)_converted, 0, 2 * _orig_length);
677  // we use these temporary pointers since the converter resets the source
678  // and target pointers to the end of the conversion. the same pattern
679  // is used in the code below.
680  const UTF8 *temp_in = (const UTF8 *)utf8_input;
681  UTF16 *temp_out = _converted;
682  _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
683  temp_out + _orig_length, lenientConversion);
684  }
685 
686  transcode_to_utf16::transcode_to_utf16(const astring &utf8_input)
687  : _orig_length(utf8_input.length() + 1), _converted(new UTF16[_orig_length])
688  {
689  _result = conversionOK;
690  if (_orig_length == 1) {
691  // no length, so only provide a blank string.
692  _converted[0] = 0;
693  return;
694  }
695  memset((abyte *)_converted, 0, 2 * _orig_length);
696  const UTF8 *temp_in = (const UTF8 *)utf8_input.observe();
697  UTF16 *temp_out = _converted;
698  _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
699  temp_out + _orig_length, lenientConversion);
700  }
701 
702  transcode_to_utf16::~transcode_to_utf16()
703  {
704  delete[] _converted;
705  _converted = NULL_POINTER;
706  }
707 
708  int transcode_to_utf16::length() const
709  {
710  return int(wcslen((wchar_t *)_converted));
711  }
712 
714 
715  transcode_to_utf8::transcode_to_utf8(const UTF16 *utf16_input)
716  : _orig_length(int(wcslen((const wchar_t *)utf16_input))),
717  _new_length(_orig_length * 2 + _orig_length / 2 + 1),
718  // this is just an estimate. it may be appropriate most of the time.
719  // whatever doesn't fit will get truncated.
720  _converted(new UTF8[_new_length])
721  {
722  _result = conversionOK;
723  if (_orig_length == 0) {
724  // no length, so only provide a blank string.
725  _converted[0] = 0;
726  return;
727  }
728  memset(_converted, 0, _new_length);
729  const UTF16 *temp_in = (const UTF16 *)utf16_input;
730  UTF8 *temp_out = _converted;
731  _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
732  temp_out + _new_length, lenientConversion);
733  }
734 
735  transcode_to_utf8::transcode_to_utf8(const wchar_t *utf16_input)
736  : _orig_length(int(wcslen(utf16_input))),
737  _new_length(_orig_length * 2 + _orig_length / 2 + 1),
738  // this is just an estimate. it may be appropriate most of the time.
739  // whatever doesn't fit will get truncated.
740  _converted(new UTF8[_new_length > 0 ? _new_length : 1])
741  {
742  _result = conversionOK;
743  if (_orig_length == 0) {
744  // no length, so only provide a blank string.
745  _converted[0] = 0;
746  return;
747  }
748  memset(_converted, 0, _new_length);
749  const UTF16 *temp_in = (const UTF16 *)utf16_input;
750  UTF8 *temp_out = _converted;
751  _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
752  temp_out + _new_length, lenientConversion);
753  }
754 
755  transcode_to_utf8::~transcode_to_utf8()
756  {
757  delete[] _converted;
758  _converted = NULL_POINTER;
759  }
760 
761  int transcode_to_utf8::length() const
762  {
763  return int(strlen((char *)_converted));
764  }
765 
766  transcode_to_utf8::operator astring() const
767  {
768  return astring((char *)_converted);
769  }
770 
772 
773  null_transcoder::null_transcoder(const char *utf8_input, bool make_own_copy)
774  : _make_own_copy(make_own_copy),
775  _converted(make_own_copy ? new UTF8[strlen(utf8_input) + 1] : (const UTF8 *)utf8_input)
776  {
777  if (_make_own_copy) {
778  strcpy((char *)_converted, utf8_input);
779  }
780  }
781 
782  null_transcoder::null_transcoder(const astring &utf8_input, bool make_own_copy)
783  : _make_own_copy(make_own_copy),
784  _converted(
785  make_own_copy ? new UTF8[utf8_input.length() + 1] : (const UTF8 *)utf8_input.s())
786  {
787  if (_make_own_copy) {
788  strcpy((char *)_converted, utf8_input.s());
789  }
790  }
791 
792  int null_transcoder::length() const
793  {
794  return int(strlen((char *)_converted));
795  }
796 
797 #endif //_cplusplus
798 } //namespace.
799 
#define NULL_POINTER
The value representing a pointer to nothing.
Definition: definitions.h:32
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
@ sourceIllegal
@ sourceExhausted
@ conversionOK
@ targetExhausted
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
unsigned short UTF16
unsigned char Booleano
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
@ strictConversion
@ lenientConversion
unsigned char abyte
A fairly important unit which is seldom defined...
Definition: definitions.h:51
unsigned char UTF8
unsigned long UTF32
Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
#define UNI_SUR_LOW_START
#define UNI_SUR_HIGH_START
#define UNI_SUR_LOW_END
#define UNI_SUR_HIGH_END
Support for unicode builds.
#define UNI_REPLACEMENT_CHAR
#define UNI_MAX_UTF16
#define UNI_MAX_LEGAL_UTF32
#define UNI_MAX_BMP