feisty meow concerns codebase 2.140
utf_conversion.cpp
Go to the documentation of this file.
1/*****************************************************************************\
2* *
3 * Name : utf_conversion *
4 * Author : Unicode, Inc. (C conversion functions) *
5 * Author : Chris Koeritz (C++ conversion classes) *
6 * *
7 *******************************************************************************
8 * Copyright (c) 2006-$now By Author. This program is free software; you can *
9 * redistribute it and/or modify it under the terms of the GNU General Public *
10 * License as published by the Free Software Foundation; either version 2 of *
11 * the License or (at your option) any later version. This is online at: *
12 * http://www.fsf.org/copyleft/gpl.html *
13 * Please send any updates to: fred@gruntose.com *
14 \*****************************************************************************/
15
16//copyright below is relevant to UTF conversion methods only.
17/*
18 * Copyright 2001-$now Unicode, Inc.
19 *
20 * Disclaimer
21 *
22 * This source code is provided as is by Unicode, Inc. No claims are
23 * made as to fitness for any particular purpose. No warranties of any
24 * kind are expressed or implied. The recipient agrees to determine
25 * applicability of information provided. If this file has been
26 * purchased on magnetic or optical media from Unicode, Inc., the
27 * sole remedy for any claim will be exchange of defective media
28 * within 90 days of receipt.
29 *
30 * Limitations on Rights to Redistribute This Code
31 *
32 * Unicode, Inc. hereby grants the right to freely use the information
33 * supplied in this file in the creation of products supporting the
34 * Unicode Standard, and to make copies of this file in any form
35 * for internal or external distribution as long as this notice
36 * remains attached.
37 */
38
39/* ---------------------------------------------------------------------
40
41 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
42 Author: Mark E. Davis, 1994.
43 Rev History: Rick McGowan, fixes & updates May 2001.
44 Sept 2001: fixed const & error conditions per
45 mods suggested by S. Parent & A. Lillich.
46 June 2002: Tim Dodd added detection and handling of incomplete
47 source sequences, enhanced error detection, added casts
48 to eliminate compiler warnings.
49 July 2003: slight mods to back out aggressive FFFE detection.
50 Jan 2004: updated switches in from-UTF8 conversions.
51 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
52
53 See the header file "ConvertUTF.h" for complete documentation.
54
55 ------------------------------------------------------------------------ */
56
57#include "astring.h"
58#include "utf_conversion.h"
59
60#include <string.h>
61#include <wchar.h>
62#ifdef CVTUTF_DEBUG
63#include <stdio.h>
64#endif
65
66namespace basis {
67
68 static const int halfShift = 10; /* used for shifting by 10 bits */
69
70 static const UTF32 halfBase = 0x0010000UL;
71 static const UTF32 halfMask = 0x3FFUL;
72
73#define UNI_SUR_HIGH_START (UTF32)0xD800
74#define UNI_SUR_HIGH_END (UTF32)0xDBFF
75#define UNI_SUR_LOW_START (UTF32)0xDC00
76#define UNI_SUR_LOW_END (UTF32)0xDFFF
77
78 /* --------------------------------------------------------------------- */
79
80 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart, const UTF32* sourceEnd,
81 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
82 {
84 const UTF32* source = *sourceStart;
85 UTF16* target = *targetStart;
86 while (source < sourceEnd) {
87 UTF32 ch;
88 if (target >= targetEnd) {
89 result = targetExhausted;
90 break;
91 }
92 ch = *source++;
93 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
94 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
95 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
96 if (flags == strictConversion) {
97 --source; /* return to the illegal value itself */
98 result = sourceIllegal;
99 break;
100 } else {
101 *target++ = UNI_REPLACEMENT_CHAR;
102 }
103 } else {
104 *target++ = (UTF16)ch; /* normal case */
105 }
106 } else if (ch > UNI_MAX_LEGAL_UTF32) {
107 if (flags == strictConversion) {
108 result = sourceIllegal;
109 } else {
110 *target++ = UNI_REPLACEMENT_CHAR;
111 }
112 } else {
113 /* target is a character in range 0xFFFF - 0x10FFFF. */
114 if (target + 1 >= targetEnd) {
115 --source; /* Back up source pointer! */
116 result = targetExhausted;
117 break;
118 }
119 ch -= halfBase;
120 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
121 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
122 }
123 }
124 *sourceStart = source;
125 *targetStart = target;
126 return result;
127 }
128
129 /* --------------------------------------------------------------------- */
130
131 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart, const UTF16* sourceEnd,
132 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
133 {
135 const UTF16* source = *sourceStart;
136 UTF32* target = *targetStart;
137 UTF32 ch, ch2;
138 while (source < sourceEnd) {
139 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
140 ch = *source++;
141 /* If we have a surrogate pair, convert to UTF32 first. */
142 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
143 /* If the 16 bits following the high surrogate are in the source buffer... */
144 if (source < sourceEnd) {
145 ch2 = *source;
146 /* If it's a low surrogate, convert to UTF32. */
147 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
148 ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
149 + halfBase;
150 ++source;
151 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
152 --source; /* return to the illegal value itself */
153 result = sourceIllegal;
154 break;
155 }
156 } else { /* We don't have the 16 bits following the high surrogate. */
157 --source; /* return to the high surrogate */
158 result = sourceExhausted;
159 break;
160 }
161 } else if (flags == strictConversion) {
162 /* UTF-16 surrogate values are illegal in UTF-32 */
163 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
164 --source; /* return to the illegal value itself */
165 result = sourceIllegal;
166 break;
167 }
168 }
169 if (target >= targetEnd) {
170 source = oldSource; /* Back up source pointer! */
171 result = targetExhausted;
172 break;
173 }
174 *target++ = ch;
175 }
176 *sourceStart = source;
177 *targetStart = target;
178#ifdef CVTUTF_DEBUG
179 if (result == sourceIllegal) {
180 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
181 fflush(stderr);
182 }
183#endif
184 return result;
185 }
186
187 /* --------------------------------------------------------------------- */
188
189 /*
190 * Index into the table below with the first byte of a UTF-8 sequence to
191 * get the number of trailing bytes that are supposed to follow it.
192 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
193 * left as-is for anyone who may want to do such conversion, which was
194 * allowed in earlier algorithms.
195 */
196 static const char trailingBytesForUTF8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
203 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
204 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
205
206 /*
207 * Magic values subtracted from a buffer value during UTF8 conversion.
208 * This table contains as many values as there might be trailing bytes
209 * in a UTF-8 sequence.
210 */
211 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
212 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
213
214 /*
215 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
216 * into the first byte, depending on how many bytes follow. There are
217 * as many entries in this table as there are UTF-8 sequence types.
218 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
219 * for *legal* UTF-8 will be 4 or fewer bytes total.
220 */
221 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
222
223 /* --------------------------------------------------------------------- */
224
225 /* The interface converts a whole buffer to avoid function-call overhead.
226 * Constants have been gathered. Loops & conditionals have been removed as
227 * much as possible for efficiency, in favor of drop-through switches.
228 * (See "Note A" at the bottom of the file for equivalent code.)
229 * If your compiler supports it, the "isLegalUTF8" call can be turned
230 * into an inline function.
231 */
232
233 /* --------------------------------------------------------------------- */
234
235 ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart, const UTF16* sourceEnd,
236 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
237 {
239 const UTF16* source = *sourceStart;
240 UTF8* target = *targetStart;
241 while (source < sourceEnd) {
242 UTF32 ch;
243 unsigned short bytesToWrite = 0;
244 const UTF32 byteMask = 0xBF;
245 const UTF32 byteMark = 0x80;
246 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
247 ch = *source++;
248 /* If we have a surrogate pair, convert to UTF32 first. */
249 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
250 /* If the 16 bits following the high surrogate are in the source buffer... */
251 if (source < sourceEnd) {
252 UTF32 ch2 = *source;
253 /* If it's a low surrogate, convert to UTF32. */
254 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
255 ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
256 + halfBase;
257 ++source;
258 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
259 --source; /* return to the illegal value itself */
260 result = sourceIllegal;
261 break;
262 }
263 } else { /* We don't have the 16 bits following the high surrogate. */
264 --source; /* return to the high surrogate */
265 result = sourceExhausted;
266 break;
267 }
268 } else if (flags == strictConversion) {
269 /* UTF-16 surrogate values are illegal in UTF-32 */
270 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
271 --source; /* return to the illegal value itself */
272 result = sourceIllegal;
273 break;
274 }
275 }
276 /* Figure out how many bytes the result will require */
277 if (ch < (UTF32)0x80) {
278 bytesToWrite = 1;
279 } else if (ch < (UTF32)0x800) {
280 bytesToWrite = 2;
281 } else if (ch < (UTF32)0x10000) {
282 bytesToWrite = 3;
283 } else if (ch < (UTF32)0x110000) {
284 bytesToWrite = 4;
285 } else {
286 bytesToWrite = 3;
288 }
289
290 target += bytesToWrite;
291 if (target > targetEnd) {
292 source = oldSource; /* Back up source pointer! */
293 target -= bytesToWrite;
294 result = targetExhausted;
295 break;
296 }
297 switch (bytesToWrite) { /* note: everything falls through. */
298 case 4:
299 *--target = (UTF8)((ch | byteMark) & byteMask);
300 ch >>= 6;
301 // no break
302 case 3:
303 *--target = (UTF8)((ch | byteMark) & byteMask);
304 ch >>= 6;
305 // no break.
306 case 2:
307 *--target = (UTF8)((ch | byteMark) & byteMask);
308 ch >>= 6;
309 // no break.
310 case 1:
311 *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
312 // no break.
313 }
314 target += bytesToWrite;
315 }
316 *sourceStart = source;
317 *targetStart = target;
318 return result;
319 }
320
321 /* --------------------------------------------------------------------- */
322
323 /*
324 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
325 * This must be called with the length pre-determined by the first byte.
326 * If not calling this from ConvertUTF8to*, then the length can be set by:
327 * length = trailingBytesForUTF8[*source]+1;
328 * and the sequence is illegal right away if there aren't that many bytes
329 * available.
330 * If presented with a length > 4, this returns false. The Unicode
331 * definition of UTF-8 goes up to 4-byte sequences.
332 */
333
334 static Booleano isLegalUTF8(const UTF8 *source, int length)
335 {
336 UTF8 a;
337 const UTF8 *srcptr = source + length;
338 switch (length) {
339 /* Everything else falls through when "true"... */
340 case 4: {
341 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
342 }
343 /* no break */
344 case 3: {
345 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
346 }
347 /* no break */
348 case 2: {
349 if ((a = (*--srcptr)) > 0xBF) return false;
350
351 switch (*source) {
352 /* no fall-through in this inner switch */
353 case 0xE0:
354 if (a < 0xA0) return false;
355 break;
356 case 0xED:
357 if (a > 0x9F) return false;
358 break;
359 case 0xF0:
360 if (a < 0x90) return false;
361 break;
362 case 0xF4:
363 if (a > 0x8F) return false;
364 break;
365 default:
366 if (a < 0x80) return false;
367 break;
368 }
369 }
370 /* no break */
371 case 1: {
372 if (*source >= 0x80 && *source < 0xC2) return false;
373 }
374 /* no break */
375 default: {
376 return false;
377 }
378 }
379 if (*source > 0xF4) return false;
380 return true;
381 }
382
383 /* --------------------------------------------------------------------- */
384
385 /*
386 * Exported function to return whether a UTF-8 sequence is legal or not.
387 * This is not used here; it's just exported.
388 */
389 Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
390 {
391 int length = trailingBytesForUTF8[*source] + 1;
392 if (source + length > sourceEnd) {
393 return false;
394 }
395 return isLegalUTF8(source, length);
396 }
397
398 /* --------------------------------------------------------------------- */
399
400 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart, const UTF8* sourceEnd,
401 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
402 {
404 const UTF8* source = *sourceStart;
405 UTF16* target = *targetStart;
406 while (source < sourceEnd) {
407 UTF32 ch = 0;
408 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
409 if (source + extraBytesToRead >= sourceEnd) {
410 result = sourceExhausted;
411 break;
412 }
413 /* Do this check whether lenient or strict */
414 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
415 result = sourceIllegal;
416 break;
417 }
418 /*
419 * The cases all fall through. See "Note A" below.
420 */
421 switch (extraBytesToRead) {
422 case 5:
423 ch += *source++;
424 ch <<= 6; /* remember, illegal UTF-8 */
425 /* no break */
426 case 4:
427 ch += *source++;
428 ch <<= 6; /* remember, illegal UTF-8 */
429 /* no break */
430 case 3:
431 ch += *source++;
432 ch <<= 6;
433 /* no break */
434 case 2:
435 ch += *source++;
436 ch <<= 6;
437 /* no break */
438 case 1:
439 ch += *source++;
440 ch <<= 6;
441 /* no break */
442 case 0:
443 ch += *source++;
444 /* no break */
445 }
446 ch -= offsetsFromUTF8[extraBytesToRead];
447
448 if (target >= targetEnd) {
449 source -= (extraBytesToRead + 1); /* Back up source pointer! */
450 result = targetExhausted;
451 break;
452 }
453 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
454 /* UTF-16 surrogate values are illegal in UTF-32 */
455 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
456 if (flags == strictConversion) {
457 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
458 result = sourceIllegal;
459 break;
460 } else {
461 *target++ = UNI_REPLACEMENT_CHAR;
462 }
463 } else {
464 *target++ = (UTF16)ch; /* normal case */
465 }
466 } else if (ch > UNI_MAX_UTF16) {
467 if (flags == strictConversion) {
468 result = sourceIllegal;
469 source -= (extraBytesToRead + 1); /* return to the start */
470 break; /* Bail out; shouldn't continue */
471 } else {
472 *target++ = UNI_REPLACEMENT_CHAR;
473 }
474 } else {
475 /* target is a character in range 0xFFFF - 0x10FFFF. */
476 if (target + 1 >= targetEnd) {
477 source -= (extraBytesToRead + 1); /* Back up source pointer! */
478 result = targetExhausted;
479 break;
480 }
481 ch -= halfBase;
482 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
483 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
484 }
485 }
486 *sourceStart = source;
487 *targetStart = target;
488 return result;
489 }
490
491 /* --------------------------------------------------------------------- */
492
493 ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart, const UTF32* sourceEnd,
494 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
495 {
497 const UTF32* source = *sourceStart;
498 UTF8* target = *targetStart;
499 while (source < sourceEnd) {
500 UTF32 ch;
501 unsigned short bytesToWrite = 0;
502 const UTF32 byteMask = 0xBF;
503 const UTF32 byteMark = 0x80;
504 ch = *source++;
505 if (flags == strictConversion) {
506 /* UTF-16 surrogate values are illegal in UTF-32 */
507 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
508 --source; /* return to the illegal value itself */
509 result = sourceIllegal;
510 break;
511 }
512 }
513 /*
514 * Figure out how many bytes the result will require. Turn any
515 * illegally large UTF32 things (> Plane 17) into replacement chars.
516 */
517 if (ch < (UTF32)0x80) {
518 bytesToWrite = 1;
519 } else if (ch < (UTF32)0x800) {
520 bytesToWrite = 2;
521 } else if (ch < (UTF32)0x10000) {
522 bytesToWrite = 3;
523 } else if (ch <= UNI_MAX_LEGAL_UTF32) {
524 bytesToWrite = 4;
525 } else {
526 bytesToWrite = 3;
528 result = sourceIllegal;
529 }
530
531 target += bytesToWrite;
532 if (target > targetEnd) {
533 --source; /* Back up source pointer! */
534 target -= bytesToWrite;
535 result = targetExhausted;
536 break;
537 }
538 switch (bytesToWrite) { /* note: everything falls through. */
539 case 4:
540 *--target = (UTF8)((ch | byteMark) & byteMask);
541 ch >>= 6;
542 /* no break */
543 case 3:
544 *--target = (UTF8)((ch | byteMark) & byteMask);
545 ch >>= 6;
546 /* no break */
547 case 2:
548 *--target = (UTF8)((ch | byteMark) & byteMask);
549 ch >>= 6;
550 /* no break */
551 case 1:
552 *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
553 /* no break */
554 }
555 target += bytesToWrite;
556 }
557 *sourceStart = source;
558 *targetStart = target;
559 return result;
560 }
561
562 /* --------------------------------------------------------------------- */
563
564 ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart, const UTF8* sourceEnd,
565 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
566 {
568 const UTF8* source = *sourceStart;
569 UTF32* target = *targetStart;
570 while (source < sourceEnd) {
571 UTF32 ch = 0;
572 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
573 if (source + extraBytesToRead >= sourceEnd) {
574 result = sourceExhausted;
575 break;
576 }
577 /* Do this check whether lenient or strict */
578 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
579 result = sourceIllegal;
580 break;
581 }
582 /*
583 * The cases all fall through. See "Note A" below.
584 */
585 switch (extraBytesToRead) {
586 case 5:
587 ch += *source++;
588 ch <<= 6;
589 /* no break */
590 case 4:
591 ch += *source++;
592 ch <<= 6;
593 /* no break */
594 case 3:
595 ch += *source++;
596 ch <<= 6;
597 /* no break */
598 case 2:
599 ch += *source++;
600 ch <<= 6;
601 /* no break */
602 case 1:
603 ch += *source++;
604 ch <<= 6;
605 /* no break */
606 case 0:
607 ch += *source++;
608 /* no break */
609 }
610 ch -= offsetsFromUTF8[extraBytesToRead];
611
612 if (target >= targetEnd) {
613 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
614 result = targetExhausted;
615 break;
616 }
617 if (ch <= UNI_MAX_LEGAL_UTF32) {
618 /*
619 * UTF-16 surrogate values are illegal in UTF-32, and anything
620 * over Plane 17 (> 0x10FFFF) is illegal.
621 */
622 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
623 if (flags == strictConversion) {
624 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
625 result = sourceIllegal;
626 break;
627 } else {
628 *target++ = UNI_REPLACEMENT_CHAR;
629 }
630 } else {
631 *target++ = ch;
632 }
633 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
634 result = sourceIllegal;
635 *target++ = UNI_REPLACEMENT_CHAR;
636 }
637 }
638 *sourceStart = source;
639 *targetStart = target;
640 return result;
641 }
642
643 /* ---------------------------------------------------------------------
644
645 Note A.
646 The fall-through switches in UTF-8 reading code save a
647 temp variable, some decrements & conditionals. The switches
648 are equivalent to the following loop:
649 {
650 int tmpBytesToRead = extraBytesToRead+1;
651 do {
652 ch += *source++;
653 --tmpBytesToRead;
654 if (tmpBytesToRead) ch <<= 6;
655 } while (tmpBytesToRead > 0);
656 }
657 In UTF-8 writing code, the switches on "bytesToWrite" are
658 similarly unrolled loops.
659
660 --------------------------------------------------------------------- */
661
663#ifdef __cplusplus
664
665 transcode_to_utf16::transcode_to_utf16(const char *utf8_input)
666 : _orig_length(int(strlen(utf8_input)) + 1), _converted(new UTF16[_orig_length + 1])
667 // we don't ever expect the string to get longer going to the larger data
668 // type, so the current length should be enough.
669 {
670 _result = conversionOK;
671 if (_orig_length == 1) {
672 // no length, so only provide a blank string.
673 _converted[0] = 0;
674 return;
675 }
676 memset((abyte *)_converted, 0, 2 * _orig_length);
677 // we use these temporary pointers since the converter resets the source
678 // and target pointers to the end of the conversion. the same pattern
679 // is used in the code below.
680 const UTF8 *temp_in = (const UTF8 *)utf8_input;
681 UTF16 *temp_out = _converted;
682 _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
683 temp_out + _orig_length, lenientConversion);
684 }
685
686 transcode_to_utf16::transcode_to_utf16(const astring &utf8_input)
687 : _orig_length(utf8_input.length() + 1), _converted(new UTF16[_orig_length])
688 {
689 _result = conversionOK;
690 if (_orig_length == 1) {
691 // no length, so only provide a blank string.
692 _converted[0] = 0;
693 return;
694 }
695 memset((abyte *)_converted, 0, 2 * _orig_length);
696 const UTF8 *temp_in = (const UTF8 *)utf8_input.observe();
697 UTF16 *temp_out = _converted;
698 _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
699 temp_out + _orig_length, lenientConversion);
700 }
701
702 transcode_to_utf16::~transcode_to_utf16()
703 {
704 delete[] _converted;
705 _converted = NULL_POINTER;
706 }
707
708 int transcode_to_utf16::length() const
709 {
710 return int(wcslen((wchar_t *)_converted));
711 }
712
714
715 transcode_to_utf8::transcode_to_utf8(const UTF16 *utf16_input)
716 : _orig_length(int(wcslen((const wchar_t *)utf16_input))),
717 _new_length(_orig_length * 2 + _orig_length / 2 + 1),
718 // this is just an estimate. it may be appropriate most of the time.
719 // whatever doesn't fit will get truncated.
720 _converted(new UTF8[_new_length])
721 {
722 _result = conversionOK;
723 if (_orig_length == 0) {
724 // no length, so only provide a blank string.
725 _converted[0] = 0;
726 return;
727 }
728 memset(_converted, 0, _new_length);
729 const UTF16 *temp_in = (const UTF16 *)utf16_input;
730 UTF8 *temp_out = _converted;
731 _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
732 temp_out + _new_length, lenientConversion);
733 }
734
735 transcode_to_utf8::transcode_to_utf8(const wchar_t *utf16_input)
736 : _orig_length(int(wcslen(utf16_input))),
737 _new_length(_orig_length * 2 + _orig_length / 2 + 1),
738 // this is just an estimate. it may be appropriate most of the time.
739 // whatever doesn't fit will get truncated.
740 _converted(new UTF8[_new_length > 0 ? _new_length : 1])
741 {
742 _result = conversionOK;
743 if (_orig_length == 0) {
744 // no length, so only provide a blank string.
745 _converted[0] = 0;
746 return;
747 }
748 memset(_converted, 0, _new_length);
749 const UTF16 *temp_in = (const UTF16 *)utf16_input;
750 UTF8 *temp_out = _converted;
751 _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
752 temp_out + _new_length, lenientConversion);
753 }
754
755 transcode_to_utf8::~transcode_to_utf8()
756 {
757 delete[] _converted;
758 _converted = NULL_POINTER;
759 }
760
761 int transcode_to_utf8::length() const
762 {
763 return int(strlen((char *)_converted));
764 }
765
766 transcode_to_utf8::operator astring() const
767 {
768 return astring((char *)_converted);
769 }
770
772
773 null_transcoder::null_transcoder(const char *utf8_input, bool make_own_copy)
774 : _make_own_copy(make_own_copy),
775 _converted(make_own_copy ? new UTF8[strlen(utf8_input) + 1] : (const UTF8 *)utf8_input)
776 {
777 if (_make_own_copy) {
778 strcpy((char *)_converted, utf8_input);
779 }
780 }
781
782 null_transcoder::null_transcoder(const astring &utf8_input, bool make_own_copy)
783 : _make_own_copy(make_own_copy),
784 _converted(
785 make_own_copy ? new UTF8[utf8_input.length() + 1] : (const UTF8 *)utf8_input.s())
786 {
787 if (_make_own_copy) {
788 strcpy((char *)_converted, utf8_input.s());
789 }
790 }
791
792 int null_transcoder::length() const
793 {
794 return int(strlen((char *)_converted));
795 }
796
797#endif //_cplusplus
798} //namespace.
799
#define NULL_POINTER
The value representing a pointer to nothing.
Definition definitions.h:32
The guards collection helps in testing preconditions and reporting errors.
Definition array.h:30
@ sourceExhausted
@ targetExhausted
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
unsigned short UTF16
unsigned char Booleano
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
@ strictConversion
@ lenientConversion
unsigned char abyte
A fairly important unit which is seldom defined...
Definition definitions.h:51
unsigned char UTF8
unsigned long UTF32
Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
#define UNI_SUR_LOW_START
#define UNI_SUR_HIGH_START
#define UNI_SUR_LOW_END
#define UNI_SUR_HIGH_END
Support for unicode builds.
#define UNI_REPLACEMENT_CHAR
#define UNI_MAX_UTF16
#define UNI_MAX_LEGAL_UTF32
#define UNI_MAX_BMP