feisty meow concerns codebase  2.140
utf_conversion.h
Go to the documentation of this file.
1 #ifndef UTF_CONVERSION_GROUP
2 #define UTF_CONVERSION_GROUP
3 
4 /*****************************************************************************\
5 * *
6 * Name : utf_conversion *
7 * Author : Unicode, Inc. (C conversion functions) *
8 * Author : Chris Koeritz (C++ conversion classes) *
9 * *
10 *******************************************************************************
11 * Copyright (c) 2006-$now By Author. This program is free software; you can *
12 * redistribute it and/or modify it under the terms of the GNU General Public *
13 * License as published by the Free Software Foundation; either version 2 of *
14 * the License or (at your option) any later version. This is online at: *
15 * http://www.fsf.org/copyleft/gpl.html *
16 * Please send any updates to: fred@gruntose.com *
17 \*****************************************************************************/
18 
19 #include "astring.h"
20 #include "definitions.h"
21 
23 
29 // original copyright notice still applies to low-level conversion code:
30 /*
31  * Copyright 2001-$now Unicode, Inc.
32  *
33  * Disclaimer
34  *
35  * This source code is provided as is by Unicode, Inc. No claims are
36  * made as to fitness for any particular purpose. No warranties of any
37  * kind are expressed or implied. The recipient agrees to determine
38  * applicability of information provided. If this file has been
39  * purchased on magnetic or optical media from Unicode, Inc., the
40  * sole remedy for any claim will be exchange of defective media
41  * within 90 days of receipt.
42  *
43  * Limitations on Rights to Redistribute This Code
44  *
45  * Unicode, Inc. hereby grants the right to freely use the information
46  * supplied in this file in the creation of products supporting the
47  * Unicode Standard, and to make copies of this file in any form
48  * for internal or external distribution as long as this notice
49  * remains attached.
50  */
51 
52 /* ---------------------------------------------------------------------
53 
54  Conversions between UTF32, UTF-16, and UTF-8. Header file.
55 
56  Several functions are included here, forming a complete set of
57  conversions between the three formats. UTF-7 is not included
58  here, but is handled in a separate source file.
59 
60  Each of these routines takes pointers to input buffers and output
61  buffers. The input buffers are const.
62 
63  Each routine converts the text between *sourceStart and sourceEnd,
64  putting the result into the buffer between *targetStart and
65  targetEnd. Note: the end pointers are *after* the last item: e.g.
66  *(sourceEnd - 1) is the last item.
67 
68  The return result indicates whether the conversion was successful,
69  and if not, whether the problem was in the source or target buffers.
70  (Only the first encountered problem is indicated.)
71 
72  After the conversion, *sourceStart and *targetStart are both
73  updated to point to the end of last text successfully converted in
74  the respective buffers.
75 
76  Input parameters:
77  sourceStart - pointer to a pointer to the source buffer.
78  The contents of this are modified on return so that
79  it points at the next thing to be converted.
80  targetStart - similarly, pointer to pointer to the target buffer.
81  sourceEnd, targetEnd - respectively pointers to the ends of the
82  two buffers, for overflow checking only.
83 
84  These conversion functions take a ConversionFlags argument. When this
85  flag is set to strict, both irregular sequences and isolated surrogates
86  will cause an error. When the flag is set to lenient, both irregular
87  sequences and isolated surrogates are converted.
88 
89  Whether the flag is strict or lenient, all illegal sequences will cause
90  an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
91  or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
92  must check for illegal sequences.
93 
94  When the flag is set to lenient, characters over 0x10FFFF are converted
95  to the replacement character; otherwise (when the flag is set to strict)
96  they constitute an error.
97 
98  Output parameters:
99  The value "sourceIllegal" is returned from some routines if the input
100  sequence is malformed. When "sourceIllegal" is returned, the source
101  value will point to the illegal value that caused the problem. E.g.,
102  in UTF-8 when a sequence is malformed, it points to the start of the
103  malformed sequence.
104 
105  Author: Mark E. Davis, 1994.
106  Rev History: Rick McGowan, fixes & updates May 2001.
107  Fixes & updates, Sept 2001.
108 
109 ------------------------------------------------------------------------ */
110 
111 namespace basis {
112 
113 /* ---------------------------------------------------------------------
114  The following 4 definitions are compiler-specific.
115  The C standard does not guarantee that wchar_t has at least
116  16 bits, so wchar_t is no less portable than unsigned short!
117  All should be unsigned values to avoid sign extension during
118  bit mask & shift operations.
119 ------------------------------------------------------------------------ */
// Code-unit types shared by every conversion routine declared in this header.
// All four are unsigned, in line with the sign-extension remark in the comment
// block directly above.
// NOTE(review): "unsigned long" is only guaranteed to be at least 32 bits; on
// platforms where long is 64 bits, UTF32 is wider than a code point needs.
120 typedef unsigned long UTF32; /* at least 32 bits */
121 typedef unsigned short UTF16; /* at least 16 bits */
122 typedef unsigned char UTF8; /* typically 8 bits */
123 typedef unsigned char Booleano; /* 0 or 1 */
124 
125 //hmmm: this is some really gross stuff below, just because it's got
126 // so much of its guts hanging out. make a class out of the lower-
127 // level conversion stuff and hide all the details.
128 
129 /* Some fundamental constants */
// Code-point limits, each expressed as a UTF32 value.
// NOTE(review): every expansion is a bare cast with no enclosing parentheses,
// so the macro text is not a single parenthesized expression at the use site.
// U+FFFD, the replacement character mentioned in the lenient-conversion notes.
130 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
// Highest code point of the Basic Multilingual Plane (U+FFFF).
131 #define UNI_MAX_BMP (UTF32)0x0000FFFF
// Same numeric value as UNI_MAX_LEGAL_UTF32 below (0x10FFFF).
132 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
// Largest 31-bit value.
133 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
// 0x10FFFF; the header comment treats UTF-32 values above this as illegal.
134 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
135 
136 typedef enum {
137  conversionOK, /* conversion successful */
138  sourceExhausted, /* partial character in source, but hit end */
139  targetExhausted, /* insuff. room in target for conversion */
140  sourceIllegal /* source sequence is illegal/malformed */
142 
143 typedef enum {
147 
// Low-level conversion entry points, declared with C linkage when compiled as
// C++.  Per the header comment earlier in this file, each Convert* routine:
//   - reads from *sourceStart up to (not including) sourceEnd,
//   - writes into *targetStart up to (not including) targetEnd,
//   - advances *sourceStart and *targetStart past what was converted,
//   - returns a ConversionResult describing the first problem met, if any,
//   - takes a ConversionFlags value selecting strict or lenient handling.
// NOTE(review): these declarations sit inside "namespace basis", which is
// opened without a __cplusplus guard, so the header as shown is C++ only.
148 #ifdef __cplusplus
149 extern "C" {
150 #endif
151
// UTF-8 input, UTF-16 output.
152 ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart,
153  const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
154  ConversionFlags flags);
155
// UTF-16 input, UTF-8 output.
156 ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart,
157  const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
158  ConversionFlags flags);
159
// UTF-8 input, UTF-32 output.
160 ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart,
161  const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
162  ConversionFlags flags);
163
// UTF-32 input, UTF-8 output.
164 ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart,
165  const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
166  ConversionFlags flags);
167
// UTF-16 input, UTF-32 output.
168 ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart,
169  const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
170  ConversionFlags flags);
171
// UTF-32 input, UTF-16 output.
172 ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart,
173  const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
174  ConversionFlags flags);
175
// Returns a Booleano (0 or 1) for the bytes in [source, sourceEnd).
// NOTE(review): presumably 1 means the bytes form a legal UTF-8 sequence --
// confirm against the implementation, which is not visible here.
176 Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
177
178 #ifdef __cplusplus
179 } //extern
180 #endif //cplusplus
181 
183 
184 #ifdef __cplusplus
185 
186 // The following types and macros help to make it irrelevant what kind of
187 // win32 build is being done. They will adapt as needed to provide the
188 // types used in system calls. They are rendered harmless for other operating
189 // systems or for non-Unicode builds; this is especially useful for POSIX
190 // compliant functions that require Unicode in win32 but not in Unix systems.
191 
// Selects what the four transcoding helper macros expand to.
//   *_temp(s)          -> an unnamed temporary object built from s.
//   *_persist(name, s) -> a declaration of a named object "name" built from s.
// The UNICODE branch is currently blocked by the #error on its first line, so
// only the #else definitions can take effect in a successful build.
192 #if defined(UNICODE)
193 #error should not be in here right now --cak
196
// UTF-8 -> UTF-16, temporary object.
199  #define to_unicode_temp(s) transcode_to_utf16(s)
201
// UTF-16 -> UTF-8, temporary object.
203  #define from_unicode_temp(s) transcode_to_utf8(s)
205
// UTF-8 -> UTF-16, named object.
208  #define to_unicode_persist(name, s) transcode_to_utf16 name(s)
210
// UTF-16 -> UTF-8, named object.
211  #define from_unicode_persist(name, s) transcode_to_utf8 name(s)
212 #else
213  // these versions of the macros simply defang any conversions.
// The temp forms pass make_own_copy = false; the persist forms pass true.
214  #define to_unicode_temp(s) null_transcoder(s, false)
215  #define from_unicode_temp(s) null_transcoder(s, false)
216  #define to_unicode_persist(name, s) null_transcoder name(s, true)
217  #define from_unicode_persist(name, s) null_transcoder name(s, true)
218 #endif
219 
220 //#ifdef _MSC_VER
221 // //! sends UTF-8 information to the diagnostic view in the IDE.
222 // #define TRACE_PRINT(s) TRACE(_T("%s"), to_unicode_temp(s))
223 //#endif
224 
226 
227 // The next two classes support converting a UTF-8 string into a UTF-16
228 // string and vice-versa. They hold onto the converted string and provide
229 // operators that return it.
230 
232 
// Holds a UTF-16 buffer (_converted) built from UTF-8 input and hands it out
// through pointer conversion operators.  The constructors, destructor and
// length() are defined outside this header.
// NOTE(review): presumably the constructors run the conversion and store its
// status in _result, and the destructor releases _converted -- confirm
// against the implementation.
233 class transcode_to_utf16 : public virtual root_object
234 {
235 public:
// Builds the object from a UTF-8 C string.
236  transcode_to_utf16(const char *utf8_input);
238
// Builds the object from UTF-8 text held in an astring.
241  transcode_to_utf16(const astring &utf8_input);
243
244  virtual ~transcode_to_utf16();
245
// NOTE(review): which length this reports (original vs. converted, and in
// what unit) is not visible here -- confirm.
246  int length() const;
248
// The operators below return the internal buffer itself, not a copy.
249  operator const UTF16 * () const { return _converted; }
251  operator UTF16 * () { return _converted; }
// Same buffer, reinterpreted as flexichar via a C-style cast.
253  operator const flexichar * () const { return (const flexichar *)_converted; }
255  operator flexichar * () { return (flexichar *)_converted; }
257
// Public data member of type ConversionResult.
258  ConversionResult _result;
259 private:
// NOTE(review): presumably the length of the input text -- confirm.
260  int _orig_length;
// Buffer returned by the conversion operators above.
261  UTF16 *_converted;
262 };
263 
265 
267 
// Holds a UTF-8 buffer (_converted) built from UTF-16 input and hands it out
// through pointer conversion operators or as an astring.  The constructors,
// destructor, length() and operator astring() are defined outside this header.
// NOTE(review): presumably the constructors run the conversion and store its
// status in _result, and the destructor releases _converted -- confirm
// against the implementation.
268 class transcode_to_utf8 : public virtual root_object
269 {
270 public:
// Builds the object from a UTF16 string.
271  transcode_to_utf8(const UTF16 *utf16_input);
273
// Builds the object from a wchar_t string.
// NOTE(review): wchar_t is not 16 bits wide on every platform; how this
// overload treats wider wchar_t input is not visible here -- confirm.
276  transcode_to_utf8(const wchar_t *utf16_input);
278
279  virtual ~transcode_to_utf8();
280
// NOTE(review): presumably reports _new_length or _orig_length -- confirm
// which one against the implementation.
281  int length() const;
283
// Both operators return the internal buffer itself, not a copy.
284  operator const UTF8 * () const { return _converted; }
286  operator UTF8 * () { return _converted; }
288
// Conversion to astring; defined outside this header.
289  operator astring() const;
291
// Public data member of type ConversionResult.
292  ConversionResult _result;
293 private:
// NOTE(review): presumably input and output lengths respectively -- confirm.
294  int _orig_length;
295  int _new_length;
// Buffer returned by the conversion operators above.
296  UTF8 *_converted;
297 };
298 
300 
302 
// Stand-in used by the non-UNICODE definitions of the to_unicode_* and
// from_unicode_* macros.  It keeps a pointer (_converted) and a flag
// (_make_own_copy) saying whether the destructor must delete[] that pointer.
// The constructors and length() are defined outside this header.
// NOTE(review): no copy constructor or assignment operator is declared here;
// unless root_object prevents copying, a copy of an object whose
// _make_own_copy is true would delete[] the same buffer twice -- confirm.
303 class null_transcoder : public virtual root_object
304 {
305 public:
// NOTE(review): presumably _converted aliases the caller's text when
// make_own_copy is false and points at a private copy otherwise -- confirm.
307  null_transcoder(const char *utf8_input, bool make_own_copy);
309  null_transcoder(const astring &utf8_input, bool make_own_copy);
// Frees the buffer only when this object owns it, then clears the pointer.
// NOTE(review): unlike the two transcode_* classes, this destructor is not
// declared virtual here; it is still virtual only if root_object's
// destructor is -- confirm.
310  ~null_transcoder() {
311  if (_make_own_copy) delete [] _converted;
312  _converted = NULL_POINTER;
313  }
314
// Defined outside this header.
315  int length() const;
// Casts away the const on _converted.
// NOTE(review): writing through this pointer may modify text the object does
// not own when _make_own_copy is false -- confirm before relying on it.
316  operator char * () { return (char *)_converted; }
317  operator const char * () const { return (const char *)_converted; }
318
// Builds a new astring from _converted viewed as a C string.
319  operator astring() const { return astring((const char *)_converted); }
321
322 private:
// True when the destructor must delete[] _converted.
323  bool _make_own_copy;
// Text returned by the conversion operators above.
324  const UTF8 *_converted;
325 };
326 
327 #endif //cplusplus
328 
329 } //namespace.
330 
331 #endif // outer guard.
332 
Constants and objects used throughout HOOPLE.
#define NULL_POINTER
The value representing a pointer to nothing.
Definition: definitions.h:32
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
@ sourceIllegal
@ sourceExhausted
@ conversionOK
@ targetExhausted
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
unsigned short UTF16
unsigned char Booleano
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
@ strictConversion
@ lenientConversion
unsigned char UTF8
unsigned long UTF32
Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
char flexichar
Definition: definitions.h:58
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)