1 /*****************************************************************************\
3 * Name : utf_conversion *
4 * Author : Unicode, Inc. (C conversion functions) *
5 * Author : Chris Koeritz (C++ conversion classes) *
7 *******************************************************************************
8 * Copyright (c) 2006-$now By Author. This program is free software; you can *
9 * redistribute it and/or modify it under the terms of the GNU General Public *
10 * License as published by the Free Software Foundation; either version 2 of *
11 * the License or (at your option) any later version. This is online at: *
12 * http://www.fsf.org/copyleft/gpl.html *
13 * Please send any updates to: fred@gruntose.com *
14 \*****************************************************************************/
16 //copyright below is relevant to UTF conversion methods only.
18 * Copyright 2001-$now Unicode, Inc.
22 * This source code is provided as is by Unicode, Inc. No claims are
23 * made as to fitness for any particular purpose. No warranties of any
24 * kind are expressed or implied. The recipient agrees to determine
25 * applicability of information provided. If this file has been
26 * purchased on magnetic or optical media from Unicode, Inc., the
27 * sole remedy for any claim will be exchange of defective media
28 * within 90 days of receipt.
30 * Limitations on Rights to Redistribute This Code
32 * Unicode, Inc. hereby grants the right to freely use the information
33 * supplied in this file in the creation of products supporting the
34 * Unicode Standard, and to make copies of this file in any form
35 * for internal or external distribution as long as this notice
39 /* ---------------------------------------------------------------------
41 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
42 Author: Mark E. Davis, 1994.
43 Rev History: Rick McGowan, fixes & updates May 2001.
44 Sept 2001: fixed const & error conditions per
45 mods suggested by S. Parent & A. Lillich.
46 June 2002: Tim Dodd added detection and handling of incomplete
47 source sequences, enhanced error detection, added casts
48 to eliminate compiler warnings.
49 July 2003: slight mods to back out aggressive FFFE detection.
50 Jan 2004: updated switches in from-UTF8 conversions.
51 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
53 See the header file "ConvertUTF.h" for complete documentation.
55 ------------------------------------------------------------------------ */
58 #include "utf_conversion.h"
// Constants for UTF-16 surrogate arithmetic: values are shifted by
// halfShift bits and masked with halfMask when surrogate pairs are built or
// taken apart by the converters in this file.
68 static const int halfShift = 10; /* used for shifting by 10 bits */
// halfBase is 0x10000, the first value above 0xFFFF; halfMask (0x3FF) keeps
// the low ten bits of a value.
70 static const UTF32 halfBase = 0x0010000UL;
71 static const UTF32 halfMask = 0x3FFUL;
// Inclusive bounds of the high (0xD800-0xDBFF) and low (0xDC00-0xDFFF)
// surrogate ranges, expressed as UTF32 values.
73 #define UNI_SUR_HIGH_START (UTF32)0xD800
74 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
75 #define UNI_SUR_LOW_START (UTF32)0xDC00
76 #define UNI_SUR_LOW_END (UTF32)0xDFFF
78 /* --------------------------------------------------------------------- */
// Converts UTF-32 values in [*sourceStart, sourceEnd) into UTF-16 units
// written at *targetStart, staying below targetEnd.
//   sourceStart / targetStart: in-out; on exit they are set to the positions
//     where reading and writing stopped.
//   flags: with strictConversion, a surrogate value or a value above
//     UNI_MAX_LEGAL_UTF32 sets the result to sourceIllegal; otherwise
//     UNI_REPLACEMENT_CHAR is written in its place.
// The result starts as conversionOK and becomes targetExhausted when the
// output has no room for the next unit (or pair of units).
// NOTE(review): this view skips some original lines (the embedded numbering
// has gaps), so the loading of ch and the return statement are not shown.
80 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart, const UTF32* sourceEnd,
81 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
83 ConversionResult result = conversionOK;
// work on local cursors; the caller's pointers are only updated at the end.
84 const UTF32* source = *sourceStart;
85 UTF16* target = *targetStart;
86 while (source < sourceEnd) {
// no room for even a single UTF-16 unit.
88 if (target >= targetEnd) {
89 result = targetExhausted;
93 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
94 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
95 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
96 if (flags == strictConversion) {
97 --source; /* return to the illegal value itself */
98 result = sourceIllegal;
101 *target++ = UNI_REPLACEMENT_CHAR;
104 *target++ = (UTF16)ch; /* normal case */
106 } else if (ch > UNI_MAX_LEGAL_UTF32) {
// NOTE(review): unlike the surrogate case above, no "--source" is visible in
// this strict branch; confirm whether the caller should be left pointing at
// or after the offending value.
107 if (flags == strictConversion) {
108 result = sourceIllegal;
110 *target++ = UNI_REPLACEMENT_CHAR;
113 /* target is a character in range 0x10000 - 0x10FFFF. */
// a surrogate pair needs two output units, hence the "+ 1".
114 if (target + 1 >= targetEnd) {
115 --source; /* Back up source pointer! */
116 result = targetExhausted;
// NOTE(review): these two lines only form a valid surrogate pair if halfBase
// was subtracted from ch beforehand; that statement is not visible in this
// view -- confirm it exists.
120 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
121 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
// publish how far the conversion got.
124 *sourceStart = source;
125 *targetStart = target;
129 /* --------------------------------------------------------------------- */
// Converts UTF-16 units in [*sourceStart, sourceEnd) into UTF-32 values
// written at *targetStart, staying below targetEnd.
//   sourceStart / targetStart: in-out; on exit they are set to the positions
//     where reading and writing stopped.
//   flags: with strictConversion, an unpaired high surrogate or a stray low
//     surrogate sets the result to sourceIllegal.
// The result starts as conversionOK; it becomes sourceExhausted when a high
// surrogate is the last unit available, and targetExhausted when the output
// is full.
// NOTE(review): this view skips some original lines (the embedded numbering
// has gaps), so the loading of ch / ch2, the store to *target and the return
// statement are not shown.
131 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart, const UTF16* sourceEnd,
132 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
134 ConversionResult result = conversionOK;
// work on local cursors; the caller's pointers are only updated at the end.
135 const UTF16* source = *sourceStart;
136 UTF32* target = *targetStart;
138 while (source < sourceEnd) {
139 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
141 /* If we have a surrogate pair, convert to UTF32 first. */
142 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
143 /* If the 16 bits following the high surrogate are in the source buffer... */
144 if (source < sourceEnd) {
146 /* If it's a low surrogate, convert to UTF32. */
147 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
// the expression below continues on a line that is not visible here.
148 ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
151 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
152 --source; /* return to the illegal value itself */
153 result = sourceIllegal;
156 } else { /* We don't have the 16 bits following the high surrogate. */
157 --source; /* return to the high surrogate */
158 result = sourceExhausted;
161 } else if (flags == strictConversion) {
162 /* UTF-16 surrogate values are illegal in UTF-32 */
163 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
164 --source; /* return to the illegal value itself */
165 result = sourceIllegal;
// output full: rewind to the start of the current character so the caller
// can resume from it.
169 if (target >= targetEnd) {
170 source = oldSource; /* Back up source pointer! */
171 result = targetExhausted;
// publish how far the conversion got.
176 *sourceStart = source;
177 *targetStart = target;
// NOTE(review): this diagnostic uses ch and ch2 after the loop and writes to
// stderr; presumably it is wrapped in a debug-only guard that is not visible
// here -- confirm, and confirm the %04x conversions match the argument types.
179 if (result == sourceIllegal) {
180 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
187 /* --------------------------------------------------------------------- */
190 * Index into the table below with the first byte of a UTF-8 sequence to
191 * get the number of trailing bytes that are supposed to follow it.
192 * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
193 * left as-is for anyone who may want to do such conversion, which was
194 * allowed in earlier algorithms.
// Lookup table with one entry per possible first byte of a UTF-8 sequence;
// the entry is the number of trailing bytes expected after that byte.
// Entries are 0 for 0x00-0xBF, 1 for 0xC0-0xDF, 2 for 0xE0-0xEF, 3 for
// 0xF0-0xF7, 4 for 0xF8-0xFB and 5 for 0xFC-0xFF.
196 static const char trailingBytesForUTF8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
203 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
204 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
207 * Magic values subtracted from a buffer value during UTF8 conversion.
208 * This table contains as many values as there might be trailing bytes
209 * in a UTF-8 sequence.
// Indexed by the number of trailing bytes (0-5); the UTF-8 readers in this
// file subtract the selected entry from the accumulated value once all bytes
// of a sequence have been combined.
211 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
212 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
215 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
216 * into the first byte, depending on how many bytes follow. There are
217 * as many entries in this table as there are UTF-8 sequence types.
218 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
219 * for *legal* UTF-8 will be 4 or fewer bytes total.
// Indexed by the total number of bytes being written (bytesToWrite); the
// UTF-8 writers OR the selected entry into the first byte of the sequence.
221 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
223 /* --------------------------------------------------------------------- */
225 /* The interface converts a whole buffer to avoid function-call overhead.
226 * Constants have been gathered. Loops & conditionals have been removed as
227 * much as possible for efficiency, in favor of drop-through switches.
228 * (See "Note A" at the bottom of the file for equivalent code.)
229 * If your compiler supports it, the "isLegalUTF8" call can be turned
230 * into an inline function.
233 /* --------------------------------------------------------------------- */
// Converts UTF-16 units in [*sourceStart, sourceEnd) into UTF-8 bytes written
// at *targetStart, never writing past targetEnd.
//   sourceStart / targetStart: in-out; on exit they are set to the positions
//     where reading and writing stopped.
//   flags: with strictConversion, an unpaired high surrogate or a stray low
//     surrogate sets the result to sourceIllegal.
// The result starts as conversionOK; it becomes sourceExhausted when a high
// surrogate is the last unit available, and targetExhausted when the encoded
// character does not fit in the remaining output.
// NOTE(review): this view skips some original lines (the embedded numbering
// has gaps): the loading of ch / ch2, the bytesToWrite assignments, the case
// labels with their shifts, and the return statement are not shown.
235 ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart, const UTF16* sourceEnd,
236 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
238 ConversionResult result = conversionOK;
// work on local cursors; the caller's pointers are only updated at the end.
239 const UTF16* source = *sourceStart;
240 UTF8* target = *targetStart;
241 while (source < sourceEnd) {
243 unsigned short bytesToWrite = 0;
// (ch | byteMark) & byteMask forces the top two bits to "10" and keeps the
// low six bits of ch, i.e. it yields a byte in the range 0x80-0xBF.
244 const UTF32 byteMask = 0xBF;
245 const UTF32 byteMark = 0x80;
246 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
248 /* If we have a surrogate pair, convert to UTF32 first. */
249 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
250 /* If the 16 bits following the high surrogate are in the source buffer... */
251 if (source < sourceEnd) {
253 /* If it's a low surrogate, convert to UTF32. */
254 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
// the expression below continues on a line that is not visible here.
255 ch = ((ch - UNI_SUR_HIGH_START ) << halfShift) + (ch2 - UNI_SUR_LOW_START )
258 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
259 --source; /* return to the illegal value itself */
260 result = sourceIllegal;
263 } else { /* We don't have the 16 bits following the high surrogate. */
264 --source; /* return to the high surrogate */
265 result = sourceExhausted;
268 } else if (flags == strictConversion) {
269 /* UTF-16 surrogate values are illegal in UTF-32 */
270 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
271 --source; /* return to the illegal value itself */
272 result = sourceIllegal;
276 /* Figure out how many bytes the result will require */
277 if (ch < (UTF32)0x80) {
279 } else if (ch < (UTF32)0x800) {
281 } else if (ch < (UTF32)0x10000) {
283 } else if (ch < (UTF32)0x110000) {
// anything at or above 0x110000 is replaced rather than encoded.
287 ch = UNI_REPLACEMENT_CHAR;
// advance first, then check: target may equal targetEnd (buffer exactly
// full) but must not exceed it.
290 target += bytesToWrite;
291 if (target > targetEnd) {
292 source = oldSource; /* Back up source pointer! */
293 target -= bytesToWrite;
294 result = targetExhausted;
// the bytes are stored back-to-front, from the last trailing byte to the
// lead byte.
297 switch (bytesToWrite) { /* note: everything falls through. */
299 *--target = (UTF8)((ch | byteMark) & byteMask);
303 *--target = (UTF8)((ch | byteMark) & byteMask);
307 *--target = (UTF8)((ch | byteMark) & byteMask);
311 *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
// step forward again past the sequence that was just written backwards.
314 target += bytesToWrite;
// publish how far the conversion got.
316 *sourceStart = source;
317 *targetStart = target;
321 /* --------------------------------------------------------------------- */
324 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
325 * This must be called with the length pre-determined by the first byte.
326 * If not calling this from ConvertUTF8to*, then the length can be set by:
327 * length = trailingBytesForUTF8[*source]+1;
328 * and the sequence is illegal right away if there aren't that many bytes
330 * If presented with a length > 4, this returns false. The Unicode
331 * definition of UTF-8 goes up to 4-byte sequences.
// Checks whether the `length` bytes starting at `source` form a legal UTF-8
// sequence; the checks return false as soon as a byte is out of range.
//   source: first (lead) byte of the sequence.
//   length: total byte count of the sequence, determined by the caller.
// NOTE(review): the switch/case labels and the final return are not visible
// in this view (the embedded numbering has gaps), so which check applies to
// which length or lead byte cannot be confirmed from here.
334 static Booleano isLegalUTF8(const UTF8 *source, int length)
// start one past the last byte and walk backwards towards the lead byte.
337 const UTF8 *srcptr = source + length;
339 /* Everything else falls through when "true"... */
// trailing bytes must lie in 0x80-0xBF.
341 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
345 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
349 if ((a = (*--srcptr)) > 0xBF) return false;
352 /* no fall-through in this inner switch */
// tighter bounds on the byte just read; presumably each applies to one
// specific lead byte -- the selecting case labels are not visible here.
354 if (a < 0xA0) return false;
357 if (a > 0x9F) return false;
360 if (a < 0x90) return false;
363 if (a > 0x8F) return false;
366 if (a < 0x80) return false;
// a lead byte in 0x80-0xC1 is rejected.
372 if (*source >= 0x80 && *source < 0xC2) return false;
// a lead byte above 0xF4 is rejected.
379 if (*source > 0xF4) return false;
383 /* --------------------------------------------------------------------- */
386 * Exported function to return whether a UTF-8 sequence is legal or not.
387 * This is not used here; it's just exported.
389 Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
391 int length = trailingBytesForUTF8[*source] + 1;
392 if (source + length > sourceEnd) {
395 return isLegalUTF8(source, length);
398 /* --------------------------------------------------------------------- */
// Converts UTF-8 bytes in [*sourceStart, sourceEnd) into UTF-16 units written
// at *targetStart, staying below targetEnd.
//   sourceStart / targetStart: in-out; on exit they are set to the positions
//     where reading and writing stopped.
//   flags: with strictConversion, a decoded surrogate value or a value above
//     UNI_MAX_UTF16 sets the result to sourceIllegal; otherwise
//     UNI_REPLACEMENT_CHAR is written in its place.
// The result starts as conversionOK; it becomes sourceExhausted when the
// sequence announced by the lead byte is not fully available, sourceIllegal
// when isLegalUTF8() rejects the sequence, and targetExhausted when the
// output is full.
// NOTE(review): this view skips some original lines (the embedded numbering
// has gaps): the declaration of ch, the byte-accumulating case bodies and the
// return statement are not shown.
400 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart, const UTF8* sourceEnd,
401 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
403 ConversionResult result = conversionOK;
// work on local cursors; the caller's pointers are only updated at the end.
404 const UTF8* source = *sourceStart;
405 UTF16* target = *targetStart;
406 while (source < sourceEnd) {
// the lead byte determines how many more bytes belong to this character.
408 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
409 if (source + extraBytesToRead >= sourceEnd) {
410 result = sourceExhausted;
413 /* Do this check whether lenient or strict */
414 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
415 result = sourceIllegal;
419 * The cases all fall through. See "Note A" below.
421 switch (extraBytesToRead) {
424 ch <<= 6; /* remember, illegal UTF-8 */
428 ch <<= 6; /* remember, illegal UTF-8 */
// remove the per-length offset from the accumulated value.
446 ch -= offsetsFromUTF8[extraBytesToRead];
// output full: rewind over the whole sequence so the caller can resume at
// its lead byte.
448 if (target >= targetEnd) {
449 source -= (extraBytesToRead + 1); /* Back up source pointer! */
450 result = targetExhausted;
453 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
454 /* UTF-16 surrogate values are illegal in UTF-32 */
455 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
456 if (flags == strictConversion) {
457 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
458 result = sourceIllegal;
461 *target++ = UNI_REPLACEMENT_CHAR;
464 *target++ = (UTF16)ch; /* normal case */
466 } else if (ch > UNI_MAX_UTF16) {
467 if (flags == strictConversion) {
468 result = sourceIllegal;
469 source -= (extraBytesToRead + 1); /* return to the start */
470 break; /* Bail out; shouldn't continue */
472 *target++ = UNI_REPLACEMENT_CHAR;
475 /* target is a character in range 0x10000 - 0x10FFFF. */
// a surrogate pair needs two output units, hence the "+ 1".
476 if (target + 1 >= targetEnd) {
477 source -= (extraBytesToRead + 1); /* Back up source pointer! */
478 result = targetExhausted;
// NOTE(review): these two lines only form a valid surrogate pair if halfBase
// was subtracted from ch beforehand; that statement is not visible in this
// view -- confirm it exists.
482 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START );
483 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START );
// publish how far the conversion got.
486 *sourceStart = source;
487 *targetStart = target;
491 /* --------------------------------------------------------------------- */
// Converts UTF-32 values in [*sourceStart, sourceEnd) into UTF-8 bytes
// written at *targetStart, never writing past targetEnd.
//   sourceStart / targetStart: in-out; on exit they are set to the positions
//     where reading and writing stopped.
//   flags: with strictConversion, a surrogate value sets the result to
//     sourceIllegal.
// A value above UNI_MAX_LEGAL_UTF32 is replaced by UNI_REPLACEMENT_CHAR and
// the result is set to sourceIllegal; the result becomes targetExhausted when
// the encoded character does not fit in the remaining output.
// NOTE(review): this view skips some original lines (the embedded numbering
// has gaps): the loading of ch, the bytesToWrite assignments, the case labels
// with their shifts, and the return statement are not shown.
493 ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart, const UTF32* sourceEnd,
494 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
496 ConversionResult result = conversionOK;
// work on local cursors; the caller's pointers are only updated at the end.
497 const UTF32* source = *sourceStart;
498 UTF8* target = *targetStart;
499 while (source < sourceEnd) {
501 unsigned short bytesToWrite = 0;
// (ch | byteMark) & byteMask forces the top two bits to "10" and keeps the
// low six bits of ch, i.e. it yields a byte in the range 0x80-0xBF.
502 const UTF32 byteMask = 0xBF;
503 const UTF32 byteMark = 0x80;
505 if (flags == strictConversion) {
506 /* UTF-16 surrogate values are illegal in UTF-32 */
507 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
508 --source; /* return to the illegal value itself */
509 result = sourceIllegal;
514 * Figure out how many bytes the result will require. Turn any
515 * illegally large UTF32 things (> Plane 17) into replacement chars.
517 if (ch < (UTF32)0x80) {
519 } else if (ch < (UTF32)0x800) {
521 } else if (ch < (UTF32)0x10000) {
523 } else if (ch <= UNI_MAX_LEGAL_UTF32) {
// NOTE(review): sourceIllegal is recorded here without testing flags, so it
// appears to apply to lenient conversions as well -- confirm that is intended.
527 ch = UNI_REPLACEMENT_CHAR;
528 result = sourceIllegal;
// advance first, then check: target may equal targetEnd (buffer exactly
// full) but must not exceed it.
531 target += bytesToWrite;
532 if (target > targetEnd) {
533 --source; /* Back up source pointer! */
534 target -= bytesToWrite;
535 result = targetExhausted;
// the bytes are stored back-to-front, from the last trailing byte to the
// lead byte.
538 switch (bytesToWrite) { /* note: everything falls through. */
540 *--target = (UTF8)((ch | byteMark) & byteMask);
544 *--target = (UTF8)((ch | byteMark) & byteMask);
548 *--target = (UTF8)((ch | byteMark) & byteMask);
552 *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
// step forward again past the sequence that was just written backwards.
555 target += bytesToWrite;
// publish how far the conversion got.
557 *sourceStart = source;
558 *targetStart = target;
562 /* --------------------------------------------------------------------- */
// Converts UTF-8 bytes in [*sourceStart, sourceEnd) into UTF-32 values
// written at *targetStart, staying below targetEnd.
//   sourceStart / targetStart: in-out; on exit they are set to the positions
//     where reading and writing stopped.
//   flags: with strictConversion, a decoded surrogate value sets the result
//     to sourceIllegal; otherwise UNI_REPLACEMENT_CHAR is written instead.
// The result starts as conversionOK; it becomes sourceExhausted when the
// sequence announced by the lead byte is not fully available, sourceIllegal
// when isLegalUTF8() rejects the sequence or the decoded value exceeds
// UNI_MAX_LEGAL_UTF32, and targetExhausted when the output is full.
// NOTE(review): this view skips some original lines (the embedded numbering
// has gaps): the declaration of ch, the byte-accumulating case bodies, the
// normal-case store and the return statement are not shown.
564 ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart, const UTF8* sourceEnd,
565 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
567 ConversionResult result = conversionOK;
// work on local cursors; the caller's pointers are only updated at the end.
568 const UTF8* source = *sourceStart;
569 UTF32* target = *targetStart;
570 while (source < sourceEnd) {
// the lead byte determines how many more bytes belong to this character.
572 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
573 if (source + extraBytesToRead >= sourceEnd) {
574 result = sourceExhausted;
577 /* Do this check whether lenient or strict */
578 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
579 result = sourceIllegal;
583 * The cases all fall through. See "Note A" below.
585 switch (extraBytesToRead) {
// remove the per-length offset from the accumulated value.
610 ch -= offsetsFromUTF8[extraBytesToRead];
// output full: rewind over the whole sequence so the caller can resume at
// its lead byte.
612 if (target >= targetEnd) {
613 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
614 result = targetExhausted;
617 if (ch <= UNI_MAX_LEGAL_UTF32) {
619 * UTF-16 surrogate values are illegal in UTF-32, and anything
620 * over Plane 17 (> 0x10FFFF) is illegal.
622 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
623 if (flags == strictConversion) {
624 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
625 result = sourceIllegal;
628 *target++ = UNI_REPLACEMENT_CHAR;
633 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
// the oversized value is both flagged and replaced, whatever the flags are.
634 result = sourceIllegal;
635 *target++ = UNI_REPLACEMENT_CHAR;
// publish how far the conversion got.
638 *sourceStart = source;
639 *targetStart = target;
643 /* ---------------------------------------------------------------------
646 The fall-through switches in UTF-8 reading code save a
647 temp variable, some decrements & conditionals. The switches
648 are equivalent to the following loop:
650 int tmpBytesToRead = extraBytesToRead+1;
654 if (tmpBytesToRead) ch <<= 6;
655 } while (tmpBytesToRead > 0);
657 In UTF-8 writing code, the switches on "bytesToWrite" are
658 similarly unrolled loops.
660 --------------------------------------------------------------------- */
// builds a UTF-16 copy of the zero-terminated UTF-8 string "utf8_input".
// _orig_length counts the input bytes including the terminator, and the
// output buffer holds _orig_length + 1 UTF16 units.  the outcome of the
// conversion is kept in _result.
// NOTE(review): the body of the empty-string branch and the braces are not
// visible in this view (the embedded numbering has gaps).
665 transcode_to_utf16::transcode_to_utf16(const char *utf8_input)
666 : _orig_length(int(strlen(utf8_input)) + 1), _converted(new UTF16[_orig_length + 1])
667 // we don't ever expect the string to get longer going to the larger data
668 // type, so the current length should be enough.
670 _result = conversionOK;
// a length of 1 means only the terminator is present.
671 if (_orig_length == 1) {
672 // no length, so only provide a blank string.
// NOTE(review): the literal 2 assumes sizeof(UTF16) == 2, and only the first
// _orig_length units of the _orig_length + 1 allocated are cleared here.
676 memset((abyte *)_converted, 0, 2 * _orig_length);
677 // we use these temporary pointers since the converter resets the source
678 // and target pointers to the end of the conversion. the same pattern
679 // is used in the code below.
680 const UTF8 *temp_in = (const UTF8 *)utf8_input;
681 UTF16 *temp_out = _converted;
// the source range includes the terminating zero byte, so a complete
// conversion also writes a terminating zero unit.
682 _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
683 temp_out + _orig_length, lenientConversion);
// builds a UTF-16 copy of the UTF-8 text held in "utf8_input".
// _orig_length is the string's length() plus one, and the output buffer
// holds exactly _orig_length UTF16 units.  the outcome of the conversion is
// kept in _result.
// NOTE(review): the body of the empty-string branch and the braces are not
// visible in this view (the embedded numbering has gaps).
686 transcode_to_utf16::transcode_to_utf16(const astring &utf8_input)
687 : _orig_length(utf8_input.length() + 1), _converted(new UTF16[_orig_length])
689 _result = conversionOK;
// a length of 1 means the string itself is empty.
690 if (_orig_length == 1) {
691 // no length, so only provide a blank string.
// NOTE(review): the literal 2 assumes sizeof(UTF16) == 2.
695 memset((abyte *)_converted, 0, 2 * _orig_length);
// temporaries are used because the converter advances the pointers it is
// given.
696 const UTF8 *temp_in = (const UTF8 *)utf8_input.observe();
697 UTF16 *temp_out = _converted;
// NOTE(review): the source range covers length() + 1 bytes, which presumably
// relies on observe() returning a zero-terminated buffer -- confirm.
698 _result = ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length, &temp_out,
699 temp_out + _orig_length, lenientConversion);
// destructor; clears the _converted pointer.
// NOTE(review): the statement that releases the buffer allocated by the
// constructors is not visible in this view -- confirm it is a delete [].
702 transcode_to_utf16::~transcode_to_utf16()
705 _converted = NULL_POINTER;
708 int transcode_to_utf16::length() const
710 return int(wcslen((wchar_t *)_converted));
715 transcode_to_utf8::transcode_to_utf8(const UTF16 *utf16_input)
716 : _orig_length(int(wcslen((const wchar_t *)utf16_input))),
717 _new_length(_orig_length * 2 + _orig_length / 2 + 1),
718 // this is just an estimate. it may be appropriate most of the time.
719 // whatever doesn't fit will get truncated.
720 _converted(new UTF8[_new_length])
722 _result = conversionOK;
723 if (_orig_length == 0) {
724 // no length, so only provide a blank string.
728 memset(_converted, 0, _new_length);
729 const UTF16 *temp_in = (const UTF16 *)utf16_input;
730 UTF8 *temp_out = _converted;
731 _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
732 temp_out + _new_length, lenientConversion);
735 transcode_to_utf8::transcode_to_utf8(const wchar_t *utf16_input)
736 : _orig_length(int(wcslen(utf16_input))),
737 _new_length(_orig_length * 2 + _orig_length / 2 + 1),
738 // this is just an estimate. it may be appropriate most of the time.
739 // whatever doesn't fit will get truncated.
740 _converted(new UTF8[_new_length > 0 ? _new_length : 1])
742 _result = conversionOK;
743 if (_orig_length == 0) {
744 // no length, so only provide a blank string.
748 memset(_converted, 0, _new_length);
749 const UTF16 *temp_in = (const UTF16 *)utf16_input;
750 UTF8 *temp_out = _converted;
751 _result = ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length, &temp_out,
752 temp_out + _new_length, lenientConversion);
// destructor; clears the _converted pointer.
// NOTE(review): the statement that releases the buffer allocated by the
// constructors is not visible in this view -- confirm it is a delete [].
755 transcode_to_utf8::~transcode_to_utf8()
758 _converted = NULL_POINTER;
// returns the number of bytes in the converted UTF-8 string, as measured by
// strlen(); this depends on _converted holding a zero terminator.
761 int transcode_to_utf8::length() const
763 return int(strlen((char *)_converted));
// conversion operator: constructs an astring from the converted UTF-8 bytes,
// passing them as a plain char pointer.
766 transcode_to_utf8::operator astring() const
768 return astring((char *)_converted);
// wraps the UTF-8 string "utf8_input" without converting it.
//   make_own_copy: when true, a new buffer of strlen(utf8_input) + 1 bytes is
//     allocated and the text is copied into it; when false, _converted simply
//     points at the caller's string.
// NOTE(review): in the non-copying case the caller's string must presumably
// outlive this object -- confirm against the header's documentation.
773 null_transcoder::null_transcoder(const char *utf8_input, bool make_own_copy)
774 : _make_own_copy(make_own_copy),
775 _converted(make_own_copy ? new UTF8[strlen(utf8_input) + 1] : (const UTF8 *)utf8_input)
777 if (_make_own_copy) {
// the cast drops const so the freshly allocated buffer can be filled.
778 strcpy((char *)_converted, utf8_input);
// wraps the UTF-8 text held in "utf8_input" without converting it.
//   make_own_copy: when true, a new buffer of utf8_input.length() + 1 bytes
//     is allocated and the text from utf8_input.s() is copied into it; when
//     false, the pointer returned by utf8_input.s() is used directly.
// NOTE(review): the start of the _converted initializer is not visible in
// this view (the embedded numbering has a gap after the first initializer).
782 null_transcoder::null_transcoder(const astring &utf8_input, bool make_own_copy)
783 : _make_own_copy(make_own_copy),
785 make_own_copy ? new UTF8[utf8_input.length() + 1] : (const UTF8 *)utf8_input.s())
787 if (_make_own_copy) {
// the cast drops const so the freshly allocated buffer can be filled.
788 strcpy((char *)_converted, utf8_input.s());
// returns the number of bytes in the wrapped string, as measured by strlen().
792 int null_transcoder::length() const
794 return int(strlen((char *)_converted));