// File-local constants for surrogate arithmetic. Their use sites are not visible in
// this excerpt; presumably they are used by the UTF-16 <-> UTF-32 converters.
// halfShift: a shift distance of 10 bits.
68 static const int halfShift = 10;
// halfBase: 0x10000, the first value that does not fit in 16 bits.
70 static const UTF32 halfBase = 0x0010000UL;
// halfMask: 0x3FF, i.e. the low ten bits (the same width as halfShift).
71 static const UTF32 halfMask = 0x3FFUL;
// Inclusive bounds of the two UTF-16 surrogate ranges, each cast to UTF32.
// High surrogates span 0xD800..0xDBFF; low surrogates span 0xDC00..0xDFFF.
73 #define UNI_SUR_HIGH_START (UTF32)0xD800
74 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
75 #define UNI_SUR_LOW_START (UTF32)0xDC00
76 #define UNI_SUR_LOW_END (UTF32)0xDFFF
// Fragments of a UTF-32 -> UTF-16 conversion loop. The function header and several
// interior lines are missing from this excerpt; the pointer types are consistent with
// ConvertUTF32toUTF16.
// Work on local copies of the caller's cursors; they are written back at the end.
84 const UTF32* source = *sourceStart;
85 UTF16* target = *targetStart;
86 while (source < sourceEnd) {
// No room left in the output for even one more UTF16 unit.
88 if (target >= targetEnd) {
// Store ch as a single UTF16 unit and advance the output cursor.
104 *target++ = (
UTF16)ch;
// Stricter space check: true unless at least two UTF16 units are still free
// (presumably guarding a surrogate-pair write, which is not shown here).
114 if (target + 1 >= targetEnd) {
// Report back to the caller how far reading and writing got.
124 *sourceStart = source;
125 *targetStart = target;
// Fragments of a UTF-16 -> UTF-32 conversion loop. The function header and several
// interior lines are missing from this excerpt; the diagnostic string below names
// ConvertUTF16toUTF32.
// Work on local copies of the caller's cursors; they are written back at the end.
135 const UTF16* source = *sourceStart;
136 UTF32* target = *targetStart;
138 while (source < sourceEnd) {
// Remember where this character started (presumably so the cursor can be rewound on
// failure; the rewind itself is not shown).
139 const UTF16* oldSource = source;
// Only continue if another UTF16 unit is available (presumably the second half of a
// surrogate pair -- TODO confirm against the full source).
144 if (source < sourceEnd) {
// No room left in the output for another UTF32 value.
169 if (target >= targetEnd) {
// Report back to the caller how far reading and writing got.
176 *sourceStart = source;
177 *targetStart = target;
// Diagnostic printed to stderr with the two offending values ch and ch2.
180 fprintf(stderr,
"ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
// Lookup table with 256 entries, indexed by the first byte of a UTF-8 sequence (the
// converters index it with *source). Each entry is the number of bytes that follow
// that first byte:
//   0x00..0xBF -> 0,  0xC0..0xDF -> 1,  0xE0..0xEF -> 2,
//   0xF0..0xF7 -> 3,  0xF8..0xFB -> 4,  0xFC..0xFF -> 5.
// The table itself does not reject illegal lead bytes; isLegalUTF8 does that.
196 static const char trailingBytesForUTF8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
203 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
204 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
// Correction values indexed by the number of trailing bytes (0..5). The UTF-8 decoders
// subtract offsetsFromUTF8[extraBytesToRead] from the accumulated value ch.
// Presumably each entry cancels the marker bits of the lead and continuation bytes
// that were accumulated along with the payload -- the accumulation is not shown here.
211 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
212 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
// Marker bits OR-ed into the first byte of an encoded UTF-8 sequence, indexed by the
// total number of bytes in that sequence (bytesToWrite). Index 0 is a filler entry and
// index 1 (a single-byte sequence) adds no marker bits.
221 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
// Fragments of a UTF-16 -> UTF-8 conversion loop. The function header and several
// interior lines are missing from this excerpt; the pointer types are consistent with
// ConvertUTF16toUTF8.
// Work on local copies of the caller's cursors; they are written back at the end.
239 const UTF16* source = *sourceStart;
240 UTF8* target = *targetStart;
241 while (source < sourceEnd) {
// Number of UTF-8 bytes this character needs; set by the range checks further down
// (the assignments themselves are on lines not shown).
243 unsigned short bytesToWrite = 0;
// (x | byteMark) & byteMask sets bit 7 and clears bit 6, giving a byte of the form
// 10xxxxxx.
244 const UTF32 byteMask = 0xBF;
245 const UTF32 byteMark = 0x80;
// Remember where this character started (presumably so the cursor can be rewound on
// failure; the rewind itself is not shown).
246 const UTF16* oldSource = source;
// Only continue if another UTF16 unit is available (presumably the second half of a
// surrogate pair -- TODO confirm against the full source).
251 if (source < sourceEnd) {
// Classify ch by magnitude. The thresholds 0x80, 0x800, 0x10000 and 0x110000 are the
// upper bounds of the 1-, 2-, 3- and 4-byte UTF-8 ranges.
277 if (ch < (
UTF32)0x80) {
279 }
else if (ch < (
UTF32)0x800) {
281 }
else if (ch < (
UTF32)0x10000) {
283 }
else if (ch < (
UTF32)0x110000) {
// Advance past the whole sequence first; the bytes are then stored back to front.
// NOTE(review): the pointer is advanced before it is checked against targetEnd, so it
// can momentarily point beyond the end of the buffer.
290 target += bytesToWrite;
291 if (target > targetEnd) {
// Not enough room: undo the advance.
293 target -= bytesToWrite;
// Emit the bytes in reverse order with *--target. The case labels and the shifts of ch
// between the stores are on lines not shown.
297 switch (bytesToWrite) {
299 *--target = (
UTF8)((ch | byteMark) & byteMask);
303 *--target = (
UTF8)((ch | byteMark) & byteMask);
307 *--target = (
UTF8)((ch | byteMark) & byteMask);
// The first byte carries the length marker selected by bytesToWrite.
311 *--target = (
UTF8)(ch | firstByteMark[bytesToWrite]);
// The backward stores left target at the start of the sequence; move past it again.
314 target += bytesToWrite;
// Report back to the caller how far reading and writing got.
316 *sourceStart = source;
317 *targetStart = target;
// Fragments of the UTF-8 legality check for one sequence. `source` points at the first
// byte and `length` is the total number of bytes in the sequence (callers pass the
// trailing-byte count plus one). Returns false as soon as a byte is out of range. The
// switch/case labels that select between the checks below are on lines not shown.
334 static Booleano isLegalUTF8(
const UTF8 *source,
int length)
// Start one past the last byte and walk backwards with *--srcptr.
337 const UTF8 *srcptr = source + length;
// The byte examined here must lie in 0x80..0xBF.
341 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
345 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
// Only the upper bound is tested here; the lower bound is covered by the checks that
// follow, which refine the allowed range of `a` (presumably depending on the lead
// byte -- TODO confirm).
349 if ((a = (*--srcptr)) > 0xBF)
return false;
354 if (a < 0xA0)
return false;
357 if (a > 0x9F)
return false;
360 if (a < 0x90)
return false;
363 if (a > 0x8F)
return false;
366 if (a < 0x80)
return false;
// A first byte in 0x80..0xC1 is rejected.
372 if (*source >= 0x80 && *source < 0xC2)
return false;
// A first byte above 0xF4 is rejected.
379 if (*source > 0xF4)
return false;
// Fragments of the bounds-checked wrapper around isLegalUTF8 (the function header is
// not shown; presumably isLegalUTF8Sequence).
// Total sequence length = trailing bytes implied by the first byte, plus that byte.
391 int length = trailingBytesForUTF8[*source] + 1;
// The sequence would extend past the end of the input.
392 if (source + length > sourceEnd) {
// Otherwise the verdict comes from the per-byte check.
395 return isLegalUTF8(source, length);
// Fragments of a UTF-8 -> UTF-16 conversion loop. The function header and several
// interior lines are missing from this excerpt; the pointer types are consistent with
// ConvertUTF8toUTF16.
// Work on local copies of the caller's cursors; they are written back at the end.
404 const UTF8* source = *sourceStart;
405 UTF16* target = *targetStart;
406 while (source < sourceEnd) {
// Number of bytes that follow the first byte of this sequence.
408 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
// The sequence's last byte would lie at or beyond sourceEnd: input is truncated.
409 if (source + extraBytesToRead >= sourceEnd) {
// Validate the complete sequence (first byte + trailing bytes) before decoding it.
414 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
// Accumulate the bytes into ch (case bodies are on lines not shown).
421 switch (extraBytesToRead) {
// Subtract the per-length correction value from the accumulated ch.
446 ch -= offsetsFromUTF8[extraBytesToRead];
// No room for even one UTF16 unit: step source back by the length of this sequence
// (presumably to the start of the character just read).
448 if (target >= targetEnd) {
449 source -= (extraBytesToRead + 1);
// Another path that steps source back by the length of the sequence (its condition is
// not shown).
457 source -= (extraBytesToRead + 1);
// Store ch as a single UTF16 unit.
464 *target++ = (
UTF16)ch;
469 source -= (extraBytesToRead + 1);
// True unless at least two UTF16 units are still free (presumably guarding a
// surrogate-pair write, which is not shown); step source back here as well.
476 if (target + 1 >= targetEnd) {
477 source -= (extraBytesToRead + 1);
// Report back to the caller how far reading and writing got.
486 *sourceStart = source;
487 *targetStart = target;
// Fragments of a UTF-32 -> UTF-8 conversion loop. The function header and several
// interior lines are missing from this excerpt; the pointer types are consistent with
// ConvertUTF32toUTF8.
// Work on local copies of the caller's cursors; they are written back at the end.
497 const UTF32* source = *sourceStart;
498 UTF8* target = *targetStart;
499 while (source < sourceEnd) {
// Number of UTF-8 bytes this character needs; set by the range checks further down
// (the assignments themselves are on lines not shown).
501 unsigned short bytesToWrite = 0;
// (x | byteMark) & byteMask sets bit 7 and clears bit 6, giving a byte of the form
// 10xxxxxx.
502 const UTF32 byteMask = 0xBF;
503 const UTF32 byteMark = 0x80;
// Classify ch by magnitude. The thresholds 0x80, 0x800 and 0x10000 are the upper
// bounds of the 1-, 2- and 3-byte UTF-8 ranges; larger values are handled on lines
// not shown.
517 if (ch < (
UTF32)0x80) {
519 }
else if (ch < (
UTF32)0x800) {
521 }
else if (ch < (
UTF32)0x10000) {
// Advance past the whole sequence first; the bytes are then stored back to front.
// NOTE(review): the pointer is advanced before it is checked against targetEnd, so it
// can momentarily point beyond the end of the buffer.
531 target += bytesToWrite;
532 if (target > targetEnd) {
// Not enough room: undo the advance.
534 target -= bytesToWrite;
// Emit the bytes in reverse order with *--target. The case labels and the shifts of ch
// between the stores are on lines not shown.
538 switch (bytesToWrite) {
540 *--target = (
UTF8)((ch | byteMark) & byteMask);
544 *--target = (
UTF8)((ch | byteMark) & byteMask);
548 *--target = (
UTF8)((ch | byteMark) & byteMask);
// The first byte carries the length marker selected by bytesToWrite.
552 *--target = (
UTF8)(ch | firstByteMark[bytesToWrite]);
// The backward stores left target at the start of the sequence; move past it again.
555 target += bytesToWrite;
// Report back to the caller how far reading and writing got.
557 *sourceStart = source;
558 *targetStart = target;
// Fragments of a UTF-8 -> UTF-32 conversion loop. The function header and several
// interior lines are missing from this excerpt; the pointer types are consistent with
// ConvertUTF8toUTF32.
// Work on local copies of the caller's cursors; they are written back at the end.
568 const UTF8* source = *sourceStart;
569 UTF32* target = *targetStart;
570 while (source < sourceEnd) {
// Number of bytes that follow the first byte of this sequence.
572 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
// The sequence's last byte would lie at or beyond sourceEnd: input is truncated.
573 if (source + extraBytesToRead >= sourceEnd) {
// Validate the complete sequence (first byte + trailing bytes) before decoding it.
578 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
// Accumulate the bytes into ch (case bodies are on lines not shown).
585 switch (extraBytesToRead) {
// Subtract the per-length correction value from the accumulated ch.
610 ch -= offsetsFromUTF8[extraBytesToRead];
// No room for another UTF32 value: step source back by the length of this sequence
// (presumably to the start of the character just read).
612 if (target >= targetEnd) {
613 source -= (extraBytesToRead + 1);
// Another path that steps source back by the length of the sequence (its condition is
// not shown).
624 source -= (extraBytesToRead + 1);
// Report back to the caller how far reading and writing got.
638 *sourceStart = source;
639 *targetStart = target;
// Constructs a UTF-16 copy of a NUL-terminated UTF-8 C string (fragments; parts of the
// body, including the conversion call, are on lines not shown).
// _orig_length is strlen(utf8_input) + 1, i.e. it counts the terminator.
// NOTE(review): the array size uses _orig_length inside the initializer list. That is
// only well-defined if _orig_length is declared before _converted in the class,
// because members are initialized in declaration order -- confirm in the header.
665 transcode_to_utf16::transcode_to_utf16(
const char *utf8_input)
666 : _orig_length(int(strlen(utf8_input)) + 1), _converted(new
UTF16[_orig_length + 1])
// A length of 1 means the input string was empty (only the terminator was counted).
671 if (_orig_length == 1) {
// Zero the first 2 * _orig_length bytes of the output.
// NOTE(review): the factor 2 assumes sizeof(UTF16) == 2.
676 memset((
abyte *)_converted, 0, 2 * _orig_length);
// Cursors for the conversion routine, which takes pointers to them and advances them.
680 const UTF8 *temp_in = (
const UTF8 *)utf8_input;
681 UTF16 *temp_out = _converted;
// Constructs a UTF-16 copy of a UTF-8 astring (fragments; parts of the body, including
// the conversion call, are on lines not shown).
// _orig_length is utf8_input.length() + 1.
// NOTE(review): this overload allocates _orig_length elements, whereas the
// const char * overload allocates _orig_length + 1 -- confirm which is intended.
// NOTE(review): the array size uses _orig_length inside the initializer list. That is
// only well-defined if _orig_length is declared before _converted in the class,
// because members are initialized in declaration order -- confirm in the header.
686 transcode_to_utf16::transcode_to_utf16(
const astring &utf8_input)
687 : _orig_length(utf8_input.length() + 1), _converted(new
UTF16[_orig_length])
// A length of 1 means length() returned 0.
690 if (_orig_length == 1) {
// Zero the first 2 * _orig_length bytes of the output.
// NOTE(review): the factor 2 assumes sizeof(UTF16) == 2.
695 memset((
abyte *)_converted, 0, 2 * _orig_length);
// Cursors for the conversion routine, which takes pointers to them and advances them.
696 const UTF8 *temp_in = (
const UTF8 *)utf8_input.observe();
697 UTF16 *temp_out = _converted;
// Destructor; the body is not visible in this excerpt.
// NOTE(review): the constructors allocate _converted with new[], so this presumably
// needs to release it with delete[] -- confirm against the full source.
702 transcode_to_utf16::~transcode_to_utf16()
// Returns the number of units before the first zero terminator in the converted
// buffer, as computed by wcslen.
// NOTE(review): _converted is a UTF16 buffer viewed as wchar_t *; the count is only
// correct on platforms where wchar_t has the same width as UTF16.
708 int transcode_to_utf16::length()
const
710 return int(wcslen((
wchar_t *)_converted));
// Constructs a UTF-8 copy of a zero-terminated UTF-16 string (fragments; parts of the
// initializer list and body, including the conversion call, are on lines not shown).
// NOTE(review): the input length is measured with wcslen on a UTF16 pointer cast to
// wchar_t *, which is only correct where wchar_t has the same width as UTF16.
// NOTE(review): the buffer is sized at 2.5 bytes per input unit plus one. A UTF-16
// unit in 0x0800..0xFFFF encodes to three UTF-8 bytes, so text made mostly of such
// characters would not fit -- confirm how the conversion result is handled.
// NOTE(review): _new_length and the array size depend on earlier members; this relies
// on the members being declared in this order in the class.
715 transcode_to_utf8::transcode_to_utf8(
const UTF16 *utf16_input)
716 : _orig_length(int(wcslen((const wchar_t *)utf16_input))),
717 _new_length(_orig_length * 2 + _orig_length / 2 + 1),
720 _converted(new
UTF8[_new_length])
// Empty input.
723 if (_orig_length == 0) {
// Zero the whole output buffer.
728 memset(_converted, 0, _new_length);
// Cursors for the conversion routine, which takes pointers to them and advances them.
729 const UTF16 *temp_in = (
const UTF16 *)utf16_input;
730 UTF8 *temp_out = _converted;
// Constructs a UTF-8 copy of a zero-terminated wide string (fragments; parts of the
// initializer list and body, including the conversion call, are on lines not shown).
// NOTE(review): the wchar_t input is later reinterpreted as UTF16 *, which is only
// correct where wchar_t has the same width as UTF16.
// NOTE(review): the buffer is sized at 2.5 bytes per input unit plus one. A UTF-16
// unit in 0x0800..0xFFFF encodes to three UTF-8 bytes, so text made mostly of such
// characters would not fit -- confirm how the conversion result is handled.
// NOTE(review): _new_length and the array size depend on earlier members; this relies
// on the members being declared in this order in the class.
735 transcode_to_utf8::transcode_to_utf8(
const wchar_t *utf16_input)
736 : _orig_length(int(wcslen(utf16_input))),
737 _new_length(_orig_length * 2 + _orig_length / 2 + 1),
// Unlike the UTF16 * overload, the allocation size is clamped to at least 1.
740 _converted(new
UTF8[_new_length > 0 ? _new_length : 1])
// Empty input.
743 if (_orig_length == 0) {
// Zero _new_length bytes of the output buffer.
748 memset(_converted, 0, _new_length);
// Cursors for the conversion routine, which takes pointers to them and advances them.
749 const UTF16 *temp_in = (
const UTF16 *)utf16_input;
750 UTF8 *temp_out = _converted;
// Destructor; the body is not visible in this excerpt.
// NOTE(review): the constructors allocate _converted with new[], so this presumably
// needs to release it with delete[] -- confirm against the full source.
755 transcode_to_utf8::~transcode_to_utf8()
// Returns the number of bytes before the first NUL in the converted buffer, as
// computed by strlen.
761 int transcode_to_utf8::length()
const
763 return int(strlen((
char *)_converted));
// Conversion operator: builds an astring from the converted buffer, treated as a
// NUL-terminated char string.
766 transcode_to_utf8::operator astring()
const
768 return astring((
char *)_converted);
// Wraps a UTF-8 C string without converting it (fragments; closing lines not shown).
// When make_own_copy is true, a new buffer of strlen(utf8_input) + 1 bytes is
// allocated and the string is copied into it; otherwise _converted points directly at
// the caller's string.
// NOTE(review): in the non-copying case the caller's buffer must outlive this object.
773 null_transcoder::null_transcoder(
const char *utf8_input,
bool make_own_copy)
774 : _make_own_copy(make_own_copy),
775 _converted(make_own_copy ? new
UTF8[strlen(utf8_input) + 1] : (const
UTF8 *)utf8_input)
777 if (_make_own_copy) {
// The cast to char * is needed to write through _converted; this branch only runs when
// the buffer was allocated above.
778 strcpy((
char *)_converted, utf8_input);
// Wraps a UTF-8 astring without converting it (fragments; part of the initializer list
// and the closing lines are not shown).
// When make_own_copy is true, a new buffer of utf8_input.length() + 1 bytes is
// allocated and the string is copied into it; otherwise the pointer returned by
// utf8_input.s() is stored directly.
// NOTE(review): in the non-copying case the stored pointer presumably stays valid only
// while utf8_input is alive and unmodified -- confirm against astring's contract.
782 null_transcoder::null_transcoder(
const astring &utf8_input,
bool make_own_copy)
783 : _make_own_copy(make_own_copy),
785 make_own_copy ? new
UTF8[utf8_input.length() + 1] : (const
UTF8 *)utf8_input.s())
787 if (_make_own_copy) {
// The cast to char * is needed to write through _converted; this branch only runs when
// the buffer was allocated above.
788 strcpy((
char *)_converted, utf8_input.s());
// Returns the number of bytes before the first NUL in the held string, as computed by
// strlen.
792 int null_transcoder::length()
const
794 return int(strlen((
char *)_converted));
#define NULL_POINTER
The value representing a pointer to nothing.
The guards collection helps in testing preconditions and reporting errors.
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
unsigned char abyte
A fairly important unit which is seldom defined...
Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
#define UNI_SUR_LOW_START
#define UNI_SUR_HIGH_START
Support for Unicode builds.
#define UNI_REPLACEMENT_CHAR
#define UNI_MAX_LEGAL_UTF32