Hoyt's FORK of DemoIccMAX 2.1.17.hoyt
Documentation for Hoyt's FORK of DemoIccMAX
Loading...
Searching...
No Matches
IccConvertUTF.cpp File Reference
#include "IccConvertUTF.h"
+ Include dependency graph for IccConvertUTF.cpp:

Go to the source code of this file.

Macros

#define false   0
 
#define true   1
 
#define UNI_SUR_HIGH_END   (UTF32)0xDBFF
 
#define UNI_SUR_HIGH_START   (UTF32)0xD800
 
#define UNI_SUR_LOW_END   (UTF32)0xDFFF
 
#define UNI_SUR_LOW_START   (UTF32)0xDC00
 

Functions

icUtfConversionResult icConvertUTF16toUTF32 (const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF16toUTF32 (const UTF16 *source, const UTF16 *sourceEnd, icUtf32Vector target, UTF32 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF16toUTF8 (const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF16toUTF8 (const UTF16 *source, const UTF16 *sourceEnd, icUtf8Vector &target, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF32toUTF16 (const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF32toUTF16 (const UTF32 *source, const UTF32 *sourceEnd, icUtf16Vector &target, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF32toUTF8 (const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF32toUTF8 (const UTF32 *source, const UTF32 *sourceEnd, icUtf8Vector &target, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF8toUTF16 (const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF8toUTF16 (const UTF8 *source, const UTF8 *sourceEnd, icUtf16Vector &target, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF8toUTF32 (const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF8toUTF32 (const UTF8 *source, const UTF8 *sourceEnd, icUtf32Vector &target, icUtfConversionFlags flags)
 
Boolean icIsLegalUTF8Sequence (const UTF8 *source, const UTF8 *sourceEnd)
 
static Boolean isLegalUTF8 (const UTF8 *source, int length)
 

Variables

static const UTF8 firstByteMark [7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }
 
static const UTF32 halfBase = 0x0010000UL
 
static const UTF32 halfMask = 0x3FFUL
 
static const int halfShift = 10
 
static const UTF32 offsetsFromUTF8 [6]
 
static const char trailingBytesForUTF8 [256]
 

Macro Definition Documentation

◆ false

#define false   0

Definition at line 56 of file IccConvertUTF.cpp.

◆ true

#define true   1

Definition at line 57 of file IccConvertUTF.cpp.

◆ UNI_SUR_HIGH_END

#define UNI_SUR_HIGH_END   (UTF32)0xDBFF

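◆ UNI_SUR_HIGH_START

#define UNI_SUR_HIGH_START   (UTF32)0xD800

◆ UNI_SUR_LOW_END

#define UNI_SUR_LOW_END   (UTF32)0xDFFF

◆ UNI_SUR_LOW_START

#define UNI_SUR_LOW_START   (UTF32)0xDC00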

Function Documentation

◆ icConvertUTF16toUTF32() [1/2]

icUtfConversionResult icConvertUTF16toUTF32 ( const UTF16 ** sourceStart,
const UTF16 * sourceEnd,
UTF32 ** targetStart,
UTF32 * targetEnd,
icUtfConversionFlags flags )

Definition at line 147 of file IccConvertUTF.cpp.

149{
150 icUtfConversionResult result = conversionOK;
151 const UTF16* source = *sourceStart;
152 UTF32* target = *targetStart;
153 UTF32 ch, ch2;
154 while (source < sourceEnd) {
155 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
156 ch = *source++;
157 /* If we have a surrogate pair, convert to UTF32 first. */
158 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
159 /* If the 16 bits following the high surrogate are in the source buffer... */
160 if (source < sourceEnd) {
161 ch2 = *source;
162 /* If it's a low surrogate, convert to UTF32. */
163 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
164 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
165 + (ch2 - UNI_SUR_LOW_START) + halfBase;
166 ++source;
167 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
168 --source; /* return to the illegal value itself */
169 result = sourceIllegal;
170 break;
171 }
172 } else { /* We don't have the 16 bits following the high surrogate. */
173 --source; /* return to the high surrogate */
174 result = sourceExhausted;
175 break;
176 }
177 } else if (flags == strictConversion) {
178 /* UTF-16 surrogate values are illegal in UTF-32 */
179 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
180 --source; /* return to the illegal value itself */
181 result = sourceIllegal;
182 break;
183 }
184 }
185 if (target >= targetEnd) {
186 source = oldSource; /* Back up source pointer! */
187 result = targetExhausted; break;
188 }
189 *target++ = ch;
190 }
191 *sourceStart = source;
192 *targetStart = target;
193#ifdef CVTUTF_DEBUG
194 if (result == sourceIllegal) {
195 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
196 fflush(stderr);
197 }
198#endif
199 return result;
200}
#define UNI_SUR_LOW_START
#define UNI_SUR_HIGH_START
static const UTF32 halfBase
static const int halfShift
#define UNI_SUR_LOW_END
#define UNI_SUR_HIGH_END
uint32_t UTF32
unsigned short UTF16
icUtfConversionResult
@ targetExhausted
@ sourceIllegal
@ conversionOK
@ sourceExhausted
@ strictConversion

References conversionOK, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF16toUTF32() [2/2]

icUtfConversionResult icConvertUTF16toUTF32 ( const UTF16 * source,
const UTF16 * sourceEnd,
icUtf32Vector target,
UTF32 * targetEnd,
icUtfConversionFlags flags )

Definition at line 202 of file IccConvertUTF.cpp.

204{
205 icUtfConversionResult result = conversionOK;
206 target.clear();
207 UTF32 ch, ch2;
208 while (source < sourceEnd) {
209 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
210 ch = *source++;
211 /* If we have a surrogate pair, convert to UTF32 first. */
212 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
213 /* If the 16 bits following the high surrogate are in the source buffer... */
214 if (source < sourceEnd) {
215 ch2 = *source;
216 /* If it's a low surrogate, convert to UTF32. */
217 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
218 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
219 + (ch2 - UNI_SUR_LOW_START) + halfBase;
220 ++source;
221 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
222 --source; /* return to the illegal value itself */
223 result = sourceIllegal;
224 break;
225 }
226 } else { /* We don't have the 16 bits following the high surrogate. */
227 --source; /* return to the high surrogate */
228 result = sourceExhausted;
229 break;
230 }
231 } else if (flags == strictConversion) {
232 /* UTF-16 surrogate values are illegal in UTF-32 */
233 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
234 --source; /* return to the illegal value itself */
235 result = sourceIllegal;
236 break;
237 }
238 }
239 target.push_back(ch);
240 }
241#ifdef CVTUTF_DEBUG
242 if (result == sourceIllegal) {
243 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
244 fflush(stderr);
245 }
246#endif
247 return result;
248}

References conversionOK, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF16toUTF8() [1/2]

icUtfConversionResult icConvertUTF16toUTF8 ( const UTF16 ** sourceStart,
const UTF16 * sourceEnd,
UTF8 ** targetStart,
UTF8 * targetEnd,
icUtfConversionFlags flags )

Definition at line 299 of file IccConvertUTF.cpp.

301{
302 icUtfConversionResult result = conversionOK;
303 const UTF16* source = *sourceStart;
304 UTF8* target = *targetStart;
305 while (source < sourceEnd) {
306 UTF32 ch;
307 unsigned short bytesToWrite = 0;
308 const UTF32 byteMask = 0xBF;
309 const UTF32 byteMark = 0x80;
310 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
311 ch = *source++;
312 /* If we have a surrogate pair, convert to UTF32 first. */
313 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
314 /* If the 16 bits following the high surrogate are in the source buffer... */
315 if (source < sourceEnd) {
316 UTF32 ch2 = *source;
317 /* If it's a low surrogate, convert to UTF32. */
318 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
319 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
320 + (ch2 - UNI_SUR_LOW_START) + halfBase;
321 ++source;
322 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
323 --source; /* return to the illegal value itself */
324 result = sourceIllegal;
325 break;
326 }
327 } else { /* We don't have the 16 bits following the high surrogate. */
328 --source; /* return to the high surrogate */
329 result = sourceExhausted;
330 break;
331 }
332 } else if (flags == strictConversion) {
333 /* UTF-16 surrogate values are illegal in UTF-32 */
334 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
335 --source; /* return to the illegal value itself */
336 result = sourceIllegal;
337 break;
338 }
339 }
340 /* Figure out how many bytes the result will require */
341 if (ch < (UTF32)0x80) { bytesToWrite = 1;
342 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
343 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
344 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
345 } else { bytesToWrite = 3;
346 ch = UNI_REPLACEMENT_CHAR;
347 }
348
349 target += bytesToWrite;
350 if (target > targetEnd) {
351 source = oldSource; /* Back up source pointer! */
352 target -= bytesToWrite; result = targetExhausted; break;
353 }
354 switch (bytesToWrite) { /* note: everything falls through. */
355 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
356 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
357 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
359 }
360 target += bytesToWrite;
361 }
362 *sourceStart = source;
363 *targetStart = target;
364 return result;
365}
static const UTF8 firstByteMark[7]
#define UNI_REPLACEMENT_CHAR
unsigned char UTF8

References conversionOK, firstByteMark, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccTagUtf16Text::GetText(), icUtf16ToUtf8(), CIccTagUtf8Text::SetText(), and CIccTagZipUtf8Text::SetText().

+ Here is the caller graph for this function:

◆ icConvertUTF16toUTF8() [2/2]

icUtfConversionResult icConvertUTF16toUTF8 ( const UTF16 * source,
const UTF16 * sourceEnd,
icUtf8Vector & target,
icUtfConversionFlags flags )

Definition at line 367 of file IccConvertUTF.cpp.

369{
370 icUtfConversionResult result = conversionOK;
371 target.clear();
372 while (source < sourceEnd) {
373 UTF32 ch;
374 unsigned short bytesToWrite = 0;
375 const UTF32 byteMask = 0xBF;
376 const UTF32 byteMark = 0x80;
377 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
378 ch = *source++;
379 /* If we have a surrogate pair, convert to UTF32 first. */
380 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
381 /* If the 16 bits following the high surrogate are in the source buffer... */
382 if (source < sourceEnd) {
383 UTF32 ch2 = *source;
384 /* If it's a low surrogate, convert to UTF32. */
385 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
386 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
387 + (ch2 - UNI_SUR_LOW_START) + halfBase;
388 ++source;
389 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
390 --source; /* return to the illegal value itself */
391 result = sourceIllegal;
392 break;
393 }
394 } else { /* We don't have the 16 bits following the high surrogate. */
395 --source; /* return to the high surrogate */
396 result = sourceExhausted;
397 break;
398 }
399 } else if (flags == strictConversion) {
400 /* UTF-16 surrogate values are illegal in UTF-32 */
401 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
402 --source; /* return to the illegal value itself */
403 result = sourceIllegal;
404 break;
405 }
406 }
407 /* Figure out how many bytes the result will require */
408 if (ch < (UTF32)0x80) { bytesToWrite = 1;
409 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
410 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
411 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
412 } else { bytesToWrite = 3;
413 ch = UNI_REPLACEMENT_CHAR;
414 }
415
416 UTF8 buf[5], *ptr = &buf[bytesToWrite];
417 switch (bytesToWrite) { /* note: everything falls through. */
418 case 4: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
419 case 3: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
420 case 2: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
421 case 1: *--ptr = (UTF8)(ch | firstByteMark[bytesToWrite]);
422 }
423 switch(bytesToWrite) {
424 case 4: target.push_back(*ptr++);
425 case 3: target.push_back(*ptr++);
426 case 2: target.push_back(*ptr++);
427 case 1: target.push_back(*ptr++);
428 }
429 }
430 return result;
431}

References conversionOK, firstByteMark, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF32toUTF16() [1/2]

icUtfConversionResult icConvertUTF32toUTF16 ( const UTF32 ** sourceStart,
const UTF32 * sourceEnd,
UTF16 ** targetStart,
UTF16 * targetEnd,
icUtfConversionFlags flags )

Definition at line 61 of file IccConvertUTF.cpp.

63{
64 icUtfConversionResult result = conversionOK;
65 const UTF32* source = *sourceStart;
66 UTF16* target = *targetStart;
67 while (source < sourceEnd) {
68 UTF32 ch;
69 if (target >= targetEnd) {
70 result = targetExhausted; break;
71 }
72 ch = *source++;
73 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
74 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
76 if (flags == strictConversion) {
77 --source; /* return to the illegal value itself */
78 result = sourceIllegal;
79 break;
80 } else {
81 *target++ = UNI_REPLACEMENT_CHAR;
82 }
83 } else {
84 *target++ = (UTF16)ch; /* normal case */
85 }
86 } else if (ch > UNI_MAX_LEGAL_UTF32) {
87 if (flags == strictConversion) {
88 result = sourceIllegal;
89 } else {
90 *target++ = UNI_REPLACEMENT_CHAR;
91 }
92 } else {
93 /* target is a character in range 0xFFFF - 0x10FFFF. */
94 if (target + 1 >= targetEnd) {
95 --source; /* Back up source pointer! */
96 result = targetExhausted; break;
97 }
98 ch -= halfBase;
99 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
100 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
101 }
102 }
103 *sourceStart = source;
104 *targetStart = target;
105 return result;
106}
static const UTF32 halfMask
#define UNI_MAX_LEGAL_UTF32
#define UNI_MAX_BMP

References conversionOK, halfBase, halfMask, halfShift, sourceIllegal, strictConversion, targetExhausted, UNI_MAX_BMP, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccLocalizedUnicode::SetText().

+ Here is the caller graph for this function:

◆ icConvertUTF32toUTF16() [2/2]

icUtfConversionResult icConvertUTF32toUTF16 ( const UTF32 * source,
const UTF32 * sourceEnd,
icUtf16Vector & target,
icUtfConversionFlags flags )

Definition at line 108 of file IccConvertUTF.cpp.

110{
111 icUtfConversionResult result = conversionOK;
112 target.clear();
113 while (source < sourceEnd) {
114 UTF32 ch;
115 ch = *source++;
116 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
117 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
118 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
119 if (flags == strictConversion) {
120 --source; /* return to the illegal value itself */
121 result = sourceIllegal;
122 break;
123 } else {
124 target.push_back(UNI_REPLACEMENT_CHAR);
125 }
126 } else {
127 target.push_back((UTF16)ch); /* normal case */
128 }
129 } else if (ch > UNI_MAX_LEGAL_UTF32) {
130 if (flags == strictConversion) {
131 result = sourceIllegal;
132 } else {
133 target.push_back(UNI_REPLACEMENT_CHAR);
134 }
135 } else {
136 ch -= halfBase;
137 target.push_back((UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START));
138 target.push_back((UTF16)((ch & halfMask) + UNI_SUR_LOW_START));
139 }
140 }
141 return result;
142}

References conversionOK, halfBase, halfMask, halfShift, sourceIllegal, strictConversion, UNI_MAX_BMP, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF32toUTF8() [1/2]

icUtfConversionResult icConvertUTF32toUTF8 ( const UTF32 ** sourceStart,
const UTF32 * sourceEnd,
UTF8 ** targetStart,
UTF8 * targetEnd,
icUtfConversionFlags flags )

Definition at line 622 of file IccConvertUTF.cpp.

624{
625 icUtfConversionResult result = conversionOK;
626 const UTF32* source = *sourceStart;
627 UTF8* target = *targetStart;
628 while (source < sourceEnd) {
629 UTF32 ch;
630 unsigned short bytesToWrite = 0;
631 const UTF32 byteMask = 0xBF;
632 const UTF32 byteMark = 0x80;
633 ch = *source++;
634 if (flags == strictConversion ) {
635 /* UTF-16 surrogate values are illegal in UTF-32 */
636 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
637 --source; /* return to the illegal value itself */
638 result = sourceIllegal;
639 break;
640 }
641 }
642 /*
643 * Figure out how many bytes the result will require. Turn any
644 * illegally large UTF32 things (> Plane 17) into replacement chars.
645 */
646 if (ch < (UTF32)0x80) { bytesToWrite = 1;
647 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
648 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
649 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
650 } else { bytesToWrite = 3;
651 ch = UNI_REPLACEMENT_CHAR;
652 result = sourceIllegal;
653 }
654
655 target += bytesToWrite;
656 if (target > targetEnd) {
657 --source; /* Back up source pointer! */
658 target -= bytesToWrite; result = targetExhausted; break;
659 }
660 switch (bytesToWrite) { /* note: everything falls through. */
661 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
662 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
663 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
664 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
665 }
666 target += bytesToWrite;
667 }
668 *sourceStart = source;
669 *targetStart = target;
670 return result;
671}

References conversionOK, firstByteMark, sourceIllegal, strictConversion, targetExhausted, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

◆ icConvertUTF32toUTF8() [2/2]

icUtfConversionResult icConvertUTF32toUTF8 ( const UTF32 * source,
const UTF32 * sourceEnd,
icUtf8Vector & target,
icUtfConversionFlags flags )

Definition at line 673 of file IccConvertUTF.cpp.

675{
676 icUtfConversionResult result = conversionOK;
677 target.clear();
678 while (source < sourceEnd) {
679 UTF32 ch;
680 unsigned short bytesToWrite = 0;
681 const UTF32 byteMask = 0xBF;
682 const UTF32 byteMark = 0x80;
683 ch = *source++;
684 if (flags == strictConversion ) {
685 /* UTF-16 surrogate values are illegal in UTF-32 */
686 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
687 --source; /* return to the illegal value itself */
688 result = sourceIllegal;
689 break;
690 }
691 }
692 /*
693 * Figure out how many bytes the result will require. Turn any
694 * illegally large UTF32 things (> Plane 17) into replacement chars.
695 */
696 if (ch < (UTF32)0x80) { bytesToWrite = 1;
697 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
698 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
699 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
700 } else { bytesToWrite = 3;
701 ch = UNI_REPLACEMENT_CHAR;
702 result = sourceIllegal;
703 }
704
705 UTF8 buf[5], *ptr = &buf[bytesToWrite];
706 switch (bytesToWrite) { /* note: everything falls through. */
707 case 4: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
708 case 3: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
709 case 2: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
710 case 1: *--ptr = (UTF8)(ch | firstByteMark[bytesToWrite]);
711 }
712 switch(bytesToWrite) {
713 case 4: target.push_back(*ptr++);
714 case 3: target.push_back(*ptr++);
715 case 2: target.push_back(*ptr++);
716 case 1: target.push_back(*ptr++);
717 }
718 }
719 return result;
720}

References conversionOK, firstByteMark, sourceIllegal, strictConversion, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

◆ icConvertUTF8toUTF16() [1/2]

icUtfConversionResult icConvertUTF8toUTF16 ( const UTF8 ** sourceStart,
const UTF8 * sourceEnd,
UTF16 ** targetStart,
UTF16 * targetEnd,
icUtfConversionFlags flags )

Definition at line 489 of file IccConvertUTF.cpp.

491{
492 icUtfConversionResult result = conversionOK;
493 const UTF8* source = *sourceStart;
494 UTF16* target = *targetStart;
495 while (source < sourceEnd) {
496 UTF32 ch = 0;
497 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
498 if (source + extraBytesToRead >= sourceEnd) {
499 result = sourceExhausted; break;
500 }
501 /* Do this check whether lenient or strict */
502 if (! isLegalUTF8(source, extraBytesToRead+1)) {
503 result = sourceIllegal;
504 break;
505 }
506 /*
507 * The cases all fall through. See "Note A" below.
508 */
509 switch (extraBytesToRead) {
510 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
511 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
512 case 3: ch += *source++; ch <<= 6;
513 case 2: ch += *source++; ch <<= 6;
514 case 1: ch += *source++; ch <<= 6;
515 case 0: ch += *source++;
516 }
517 ch -= offsetsFromUTF8[extraBytesToRead];
518
519 if (target >= targetEnd) {
520 source -= (extraBytesToRead+1); /* Back up source pointer! */
521 result = targetExhausted; break;
522 }
523 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
524 /* UTF-16 surrogate values are illegal in UTF-32 */
525 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
526 if (flags == strictConversion) {
527 source -= (extraBytesToRead+1); /* return to the illegal value itself */
528 result = sourceIllegal;
529 break;
530 } else {
531 *target++ = UNI_REPLACEMENT_CHAR;
532 }
533 } else {
534 *target++ = (UTF16)ch; /* normal case */
535 }
536 } else if (ch > UNI_MAX_UTF16) {
537 if (flags == strictConversion) {
538 result = sourceIllegal;
539 source -= (extraBytesToRead+1); /* return to the start */
540 break; /* Bail out; shouldn't continue */
541 } else {
542 *target++ = UNI_REPLACEMENT_CHAR;
543 }
544 } else {
545 /* target is a character in range 0xFFFF - 0x10FFFF. */
546 if (target + 1 >= targetEnd) {
547 source -= (extraBytesToRead+1); /* Back up source pointer! */
548 result = targetExhausted; break;
549 }
550 ch -= halfBase;
551 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
552 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
553 }
554 }
555 *sourceStart = source;
556 *targetStart = target;
557 return result;
558}
static const char trailingBytesForUTF8[256]
static Boolean isLegalUTF8(const UTF8 *source, int length)
static const UTF32 offsetsFromUTF8[6]
#define UNI_MAX_UTF16

References conversionOK, halfBase, halfMask, halfShift, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, trailingBytesForUTF8, UNI_MAX_BMP, UNI_MAX_UTF16, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccUTF16String::CIccUTF16String(), CIccUTF16String::FromUtf8(), and CIccTagUtf16Text::SetText().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ icConvertUTF8toUTF16() [2/2]

icUtfConversionResult icConvertUTF8toUTF16 ( const UTF8 * source,
const UTF8 * sourceEnd,
icUtf16Vector & target,
icUtfConversionFlags flags )

Definition at line 560 of file IccConvertUTF.cpp.

562{
563 icUtfConversionResult result = conversionOK;
564 target.clear();
565 while (source < sourceEnd) {
566 UTF32 ch = 0;
567 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
568 if (source + extraBytesToRead >= sourceEnd) {
569 result = sourceExhausted; break;
570 }
571 /* Do this check whether lenient or strict */
572 if (! isLegalUTF8(source, extraBytesToRead+1)) {
573 result = sourceIllegal;
574 break;
575 }
576 /*
577 * The cases all fall through. See "Note A" below.
578 */
579 switch (extraBytesToRead) {
580 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
581 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
582 case 3: ch += *source++; ch <<= 6;
583 case 2: ch += *source++; ch <<= 6;
584 case 1: ch += *source++; ch <<= 6;
585 case 0: ch += *source++;
586 }
587 ch -= offsetsFromUTF8[extraBytesToRead];
588
589 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
590 /* UTF-16 surrogate values are illegal in UTF-32 */
591 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
592 if (flags == strictConversion) {
593 source -= (extraBytesToRead+1); /* return to the illegal value itself */
594 result = sourceIllegal;
595 break;
596 } else {
597 target.push_back(UNI_REPLACEMENT_CHAR);
598 }
599 } else {
600 target.push_back((UTF16)ch); /* normal case */
601 }
602 } else if (ch > UNI_MAX_UTF16) {
603 if (flags == strictConversion) {
604 result = sourceIllegal;
605 source -= (extraBytesToRead+1); /* return to the start */
606 break; /* Bail out; shouldn't continue */
607 } else {
608 target.push_back(UNI_REPLACEMENT_CHAR);
609 }
610 } else {
611 /* target is a character in range 0xFFFF - 0x10FFFF. */
612 ch -= halfBase;
613 target.push_back((UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START));
614 target.push_back((UTF16)((ch & halfMask) + UNI_SUR_LOW_START));
615 }
616 }
617 return result;
618}

References conversionOK, halfBase, halfMask, halfShift, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, trailingBytesForUTF8, UNI_MAX_BMP, UNI_MAX_UTF16, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

+ Here is the call graph for this function:

◆ icConvertUTF8toUTF32() [1/2]

icUtfConversionResult icConvertUTF8toUTF32 ( const UTF8 ** sourceStart,
const UTF8 * sourceEnd,
UTF32 ** targetStart,
UTF32 * targetEnd,
icUtfConversionFlags flags )

Definition at line 723 of file IccConvertUTF.cpp.

725{
726 icUtfConversionResult result = conversionOK;
727 const UTF8* source = *sourceStart;
728 UTF32* target = *targetStart;
729 while (source < sourceEnd) {
730 UTF32 ch = 0;
731 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
732 if (source + extraBytesToRead >= sourceEnd) {
733 result = sourceExhausted; break;
734 }
735 /* Do this check whether lenient or strict */
736 if (! isLegalUTF8(source, extraBytesToRead+1)) {
737 result = sourceIllegal;
738 break;
739 }
740 /*
741 * The cases all fall through. See "Note A" below.
742 */
743 switch (extraBytesToRead) {
744 case 5: ch += *source++; ch <<= 6;
745 case 4: ch += *source++; ch <<= 6;
746 case 3: ch += *source++; ch <<= 6;
747 case 2: ch += *source++; ch <<= 6;
748 case 1: ch += *source++; ch <<= 6;
749 case 0: ch += *source++;
750 }
751 ch -= offsetsFromUTF8[extraBytesToRead];
752
753 if (target >= targetEnd) {
754 source -= (extraBytesToRead+1); /* Back up the source pointer! */
755 result = targetExhausted; break;
756 }
757 if (ch <= UNI_MAX_LEGAL_UTF32) {
758 /*
759 * UTF-16 surrogate values are illegal in UTF-32, and anything
760 * over Plane 17 (> 0x10FFFF) is illegal.
761 */
762 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
763 if (flags == strictConversion) {
764 source -= (extraBytesToRead+1); /* return to the illegal value itself */
765 result = sourceIllegal;
766 break;
767 } else {
768 *target++ = UNI_REPLACEMENT_CHAR;
769 }
770 } else {
771 *target++ = ch;
772 }
773 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
774 result = sourceIllegal;
775 *target++ = UNI_REPLACEMENT_CHAR;
776 }
777 }
778 *sourceStart = source;
779 *targetStart = target;
780 return result;
781}

References conversionOK, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, trailingBytesForUTF8, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

+ Here is the call graph for this function:

◆ icConvertUTF8toUTF32() [2/2]

icUtfConversionResult icConvertUTF8toUTF32 ( const UTF8 * source,
const UTF8 * sourceEnd,
icUtf32Vector & target,
icUtfConversionFlags flags )

Definition at line 783 of file IccConvertUTF.cpp.

785{
786 icUtfConversionResult result = conversionOK;
787 target.clear();
788 while (source < sourceEnd) {
789 UTF32 ch = 0;
790 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
791 if (source + extraBytesToRead >= sourceEnd) {
792 result = sourceExhausted; break;
793 }
794 /* Do this check whether lenient or strict */
795 if (! isLegalUTF8(source, extraBytesToRead+1)) {
796 result = sourceIllegal;
797 break;
798 }
799 /*
800 * The cases all fall through. See "Note A" below.
801 */
802 switch (extraBytesToRead) {
803 case 5: ch += *source++; ch <<= 6;
804 case 4: ch += *source++; ch <<= 6;
805 case 3: ch += *source++; ch <<= 6;
806 case 2: ch += *source++; ch <<= 6;
807 case 1: ch += *source++; ch <<= 6;
808 case 0: ch += *source++;
809 }
810 ch -= offsetsFromUTF8[extraBytesToRead];
811
812 if (ch <= UNI_MAX_LEGAL_UTF32) {
813 /*
814 * UTF-16 surrogate values are illegal in UTF-32, and anything
815 * over Plane 17 (> 0x10FFFF) is illegal.
816 */
817 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
818 if (flags == strictConversion) {
819 source -= (extraBytesToRead+1); /* return to the illegal value itself */
820 result = sourceIllegal;
821 break;
822 } else {
823 target.push_back(UNI_REPLACEMENT_CHAR);
824 }
825 } else {
826 target.push_back(ch);
827 }
828 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
829 result = sourceIllegal;
830 target.push_back(UNI_REPLACEMENT_CHAR);
831 }
832 }
833
834 return result;
835}

References conversionOK, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, trailingBytesForUTF8, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

+ Here is the call graph for this function:

◆ icIsLegalUTF8Sequence()

Boolean icIsLegalUTF8Sequence ( const UTF8 * source,
const UTF8 * sourceEnd )

Definition at line 478 of file IccConvertUTF.cpp.

479{
480 int length = trailingBytesForUTF8[*source]+1;
481 if (source+length > sourceEnd) {
482 return false;
483 }
484 return isLegalUTF8(source, length);
485}

References isLegalUTF8(), and trailingBytesForUTF8.

+ Here is the call graph for this function:

◆ isLegalUTF8()

static Boolean isLegalUTF8 ( const UTF8 * source,
int length )
static

Definition at line 446 of file IccConvertUTF.cpp.

447{
448 UTF8 a;
449 const UTF8 *srcptr = source+length;
450 switch (length) {
451 default: return false;
452 /* Everything else falls through when "true"... */
453 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
454 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
455 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
456
457 switch (*source) {
458 /* no fall-through in this inner switch */
459 case 0xE0: if (a < 0xA0) return false; break;
460 case 0xED: if (a > 0x9F) return false; break;
461 case 0xF0: if (a < 0x90) return false; break;
462 case 0xF4: if (a > 0x8F) return false; break;
463 default: if (a < 0x80) return false;
464 }
465
466 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
467 }
468 if (*source > 0xF4) return false;
469 return true;
470}

Referenced by icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), icConvertUTF8toUTF32(), and icIsLegalUTF8Sequence().

+ Here is the caller graph for this function:

Variable Documentation

◆ firstByteMark

const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }
static

Definition at line 285 of file IccConvertUTF.cpp.

285{ 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

Referenced by icConvertUTF16toUTF8(), icConvertUTF16toUTF8(), icConvertUTF32toUTF8(), and icConvertUTF32toUTF8().

◆ halfBase

const UTF32 halfBase = 0x0010000UL
static

◆ halfMask

const UTF32 halfMask = 0x3FFUL
static

◆ halfShift

const int halfShift = 10
static

◆ offsetsFromUTF8

const UTF32 offsetsFromUTF8[6]
static
Initial value:
= { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL }

Definition at line 275 of file IccConvertUTF.cpp.

275 { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
276 0x03C82080UL, 0xFA082080UL, 0x82082080UL };

Referenced by icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), and icConvertUTF8toUTF32().

◆ trailingBytesForUTF8

const char trailingBytesForUTF8[256]
static
Initial value:
= {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
}

Definition at line 259 of file IccConvertUTF.cpp.

259 {
260 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
261 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
262 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
263 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
264 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
265 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
266 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
267 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
268};

Referenced by icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), icConvertUTF8toUTF32(), and icIsLegalUTF8Sequence().