Hoyt's FORK of DemoIccMAX 2.1.17.hoyt
Documentation for Hoyt's FORK of DemoIccMAX
Loading...
Searching...
No Matches
IccConvertUTF.h File Reference
#include "IccProfLibConf.h"
+ Include dependency graph for IccConvertUTF.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define UNI_MAX_BMP   (UTF32)0x0000FFFF
 
#define UNI_MAX_LEGAL_UTF32   (UTF32)0x0010FFFF
 
#define UNI_MAX_UTF16   (UTF32)0x0010FFFF
 
#define UNI_MAX_UTF32   (UTF32)0x7FFFFFFF
 
#define UNI_REPLACEMENT_CHAR   (UTF32)0x0000FFFD
 

Typedefs

typedef unsigned char Boolean
 
typedef unsigned short UTF16
 
typedef uint32_t UTF32
 
typedef unsigned char UTF8
 

Enumerations

enum  icUtfConversionFlags { strictConversion = 0 , lenientConversion }
 
enum  icUtfConversionResult { conversionOK , sourceExhausted , targetExhausted , sourceIllegal }
 

Functions

icUtfConversionResult icConvertUTF16toUTF32 (const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF16toUTF8 (const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF32toUTF16 (const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF32toUTF8 (const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF8toUTF16 (const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)
 
icUtfConversionResult icConvertUTF8toUTF32 (const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)
 
Boolean icIsLegalUTF8Sequence (const UTF8 *source, const UTF8 *sourceEnd)
 

Macro Definition Documentation

◆ UNI_MAX_BMP

#define UNI_MAX_BMP   (UTF32)0x0000FFFF

◆ UNI_MAX_LEGAL_UTF32

#define UNI_MAX_LEGAL_UTF32   (UTF32)0x0010FFFF

◆ UNI_MAX_UTF16

#define UNI_MAX_UTF16   (UTF32)0x0010FFFF

Definition at line 114 of file IccConvertUTF.h.

Referenced by icConvertUTF8toUTF16().

◆ UNI_MAX_UTF32

#define UNI_MAX_UTF32   (UTF32)0x7FFFFFFF

Definition at line 115 of file IccConvertUTF.h.

◆ UNI_REPLACEMENT_CHAR

#define UNI_REPLACEMENT_CHAR   (UTF32)0x0000FFFD

Typedef Documentation

◆ Boolean

typedef unsigned char Boolean

Definition at line 109 of file IccConvertUTF.h.

◆ UTF16

typedef unsigned short UTF16

Definition at line 107 of file IccConvertUTF.h.

◆ UTF32

typedef uint32_t UTF32

Definition at line 106 of file IccConvertUTF.h.

◆ UTF8

typedef unsigned char UTF8

Definition at line 108 of file IccConvertUTF.h.

Enumeration Type Documentation

◆ icUtfConversionFlags

Enumerator
strictConversion 
lenientConversion 

Definition at line 125 of file IccConvertUTF.h.

125 {
icUtfConversionFlags
@ strictConversion
@ lenientConversion

◆ icUtfConversionResult

Enumerator
conversionOK 
sourceExhausted 
targetExhausted 
sourceIllegal 

Definition at line 118 of file IccConvertUTF.h.

118 {
119 conversionOK, /* conversion successful */
120 sourceExhausted, /* partial character in source, but hit end */
121 targetExhausted, /* insuff. room in target for conversion */
122 sourceIllegal /* source sequence is illegal/malformed */
icUtfConversionResult
@ targetExhausted
@ sourceIllegal
@ conversionOK
@ sourceExhausted

Function Documentation

◆ icConvertUTF16toUTF32()

icUtfConversionResult icConvertUTF16toUTF32 ( const UTF16 ** sourceStart,
const UTF16 * sourceEnd,
UTF32 ** targetStart,
UTF32 * targetEnd,
icUtfConversionFlags flags )

Definition at line 147 of file IccConvertUTF.cpp.

149{
150 icUtfConversionResult result = conversionOK;
151 const UTF16* source = *sourceStart;
152 UTF32* target = *targetStart;
153 UTF32 ch, ch2;
154 while (source < sourceEnd) {
155 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
156 ch = *source++;
157 /* If we have a surrogate pair, convert to UTF32 first. */
158 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
159 /* If the 16 bits following the high surrogate are in the source buffer... */
160 if (source < sourceEnd) {
161 ch2 = *source;
162 /* If it's a low surrogate, convert to UTF32. */
163 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
164 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
165 + (ch2 - UNI_SUR_LOW_START) + halfBase;
166 ++source;
167 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
168 --source; /* return to the illegal value itself */
169 result = sourceIllegal;
170 break;
171 }
172 } else { /* We don't have the 16 bits following the high surrogate. */
173 --source; /* return to the high surrogate */
174 result = sourceExhausted;
175 break;
176 }
177 } else if (flags == strictConversion) {
178 /* UTF-16 surrogate values are illegal in UTF-32 */
179 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
180 --source; /* return to the illegal value itself */
181 result = sourceIllegal;
182 break;
183 }
184 }
185 if (target >= targetEnd) {
186 source = oldSource; /* Back up source pointer! */
187 result = targetExhausted; break;
188 }
189 *target++ = ch;
190 }
191 *sourceStart = source;
192 *targetStart = target;
193#ifdef CVTUTF_DEBUG
194 if (result == sourceIllegal) {
195 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
196 fflush(stderr);
197 }
198#endif
199 return result;
200}
#define UNI_SUR_LOW_START
#define UNI_SUR_HIGH_START
static const UTF32 halfBase
static const int halfShift
#define UNI_SUR_LOW_END
#define UNI_SUR_HIGH_END
uint32_t UTF32
unsigned short UTF16

References conversionOK, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF16toUTF8()

icUtfConversionResult icConvertUTF16toUTF8 ( const UTF16 ** sourceStart,
const UTF16 * sourceEnd,
UTF8 ** targetStart,
UTF8 * targetEnd,
icUtfConversionFlags flags )

Definition at line 299 of file IccConvertUTF.cpp.

301{
302 icUtfConversionResult result = conversionOK;
303 const UTF16* source = *sourceStart;
304 UTF8* target = *targetStart;
305 while (source < sourceEnd) {
306 UTF32 ch;
307 unsigned short bytesToWrite = 0;
308 const UTF32 byteMask = 0xBF;
309 const UTF32 byteMark = 0x80;
310 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
311 ch = *source++;
312 /* If we have a surrogate pair, convert to UTF32 first. */
313 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
314 /* If the 16 bits following the high surrogate are in the source buffer... */
315 if (source < sourceEnd) {
316 UTF32 ch2 = *source;
317 /* If it's a low surrogate, convert to UTF32. */
318 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
319 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
320 + (ch2 - UNI_SUR_LOW_START) + halfBase;
321 ++source;
322 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
323 --source; /* return to the illegal value itself */
324 result = sourceIllegal;
325 break;
326 }
327 } else { /* We don't have the 16 bits following the high surrogate. */
328 --source; /* return to the high surrogate */
329 result = sourceExhausted;
330 break;
331 }
332 } else if (flags == strictConversion) {
333 /* UTF-16 surrogate values are illegal in UTF-32 */
334 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
335 --source; /* return to the illegal value itself */
336 result = sourceIllegal;
337 break;
338 }
339 }
340 /* Figure out how many bytes the result will require */
341 if (ch < (UTF32)0x80) { bytesToWrite = 1;
342 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
343 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
344 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
345 } else { bytesToWrite = 3;
346 ch = UNI_REPLACEMENT_CHAR;
347 }
348
349 target += bytesToWrite;
350 if (target > targetEnd) {
351 source = oldSource; /* Back up source pointer! */
352 target -= bytesToWrite; result = targetExhausted; break;
353 }
354 switch (bytesToWrite) { /* note: everything falls through. */
355 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
356 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
357 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
359 }
360 target += bytesToWrite;
361 }
362 *sourceStart = source;
363 *targetStart = target;
364 return result;
365}
static const UTF8 firstByteMark[7]
#define UNI_REPLACEMENT_CHAR
unsigned char UTF8

References conversionOK, firstByteMark, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccTagUtf16Text::GetText(), icUtf16ToUtf8(), CIccTagUtf8Text::SetText(), and CIccTagZipUtf8Text::SetText().

+ Here is the caller graph for this function:

◆ icConvertUTF32toUTF16()

icUtfConversionResult icConvertUTF32toUTF16 ( const UTF32 ** sourceStart,
const UTF32 * sourceEnd,
UTF16 ** targetStart,
UTF16 * targetEnd,
icUtfConversionFlags flags )

Definition at line 61 of file IccConvertUTF.cpp.

63{
64 icUtfConversionResult result = conversionOK;
65 const UTF32* source = *sourceStart;
66 UTF16* target = *targetStart;
67 while (source < sourceEnd) {
68 UTF32 ch;
69 if (target >= targetEnd) {
70 result = targetExhausted; break;
71 }
72 ch = *source++;
73 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
74 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
76 if (flags == strictConversion) {
77 --source; /* return to the illegal value itself */
78 result = sourceIllegal;
79 break;
80 } else {
81 *target++ = UNI_REPLACEMENT_CHAR;
82 }
83 } else {
84 *target++ = (UTF16)ch; /* normal case */
85 }
86 } else if (ch > UNI_MAX_LEGAL_UTF32) {
87 if (flags == strictConversion) {
88 result = sourceIllegal;
89 } else {
90 *target++ = UNI_REPLACEMENT_CHAR;
91 }
92 } else {
93 /* target is a character in range 0x10000 - 0x10FFFF. */
94 if (target + 1 >= targetEnd) {
95 --source; /* Back up source pointer! */
96 result = targetExhausted; break;
97 }
98 ch -= halfBase;
99 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
100 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
101 }
102 }
103 *sourceStart = source;
104 *targetStart = target;
105 return result;
106}
static const UTF32 halfMask
#define UNI_MAX_LEGAL_UTF32
#define UNI_MAX_BMP

References conversionOK, halfBase, halfMask, halfShift, sourceIllegal, strictConversion, targetExhausted, UNI_MAX_BMP, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccLocalizedUnicode::SetText().

+ Here is the caller graph for this function:

◆ icConvertUTF32toUTF8()

icUtfConversionResult icConvertUTF32toUTF8 ( const UTF32 ** sourceStart,
const UTF32 * sourceEnd,
UTF8 ** targetStart,
UTF8 * targetEnd,
icUtfConversionFlags flags )

Definition at line 622 of file IccConvertUTF.cpp.

624{
625 icUtfConversionResult result = conversionOK;
626 const UTF32* source = *sourceStart;
627 UTF8* target = *targetStart;
628 while (source < sourceEnd) {
629 UTF32 ch;
630 unsigned short bytesToWrite = 0;
631 const UTF32 byteMask = 0xBF;
632 const UTF32 byteMark = 0x80;
633 ch = *source++;
634 if (flags == strictConversion ) {
635 /* UTF-16 surrogate values are illegal in UTF-32 */
636 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
637 --source; /* return to the illegal value itself */
638 result = sourceIllegal;
639 break;
640 }
641 }
642 /*
643 * Figure out how many bytes the result will require. Turn any
644 * illegally large UTF32 things (> Plane 17) into replacement chars.
645 */
646 if (ch < (UTF32)0x80) { bytesToWrite = 1;
647 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
648 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
649 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
650 } else { bytesToWrite = 3;
651 ch = UNI_REPLACEMENT_CHAR;
652 result = sourceIllegal;
653 }
654
655 target += bytesToWrite;
656 if (target > targetEnd) {
657 --source; /* Back up source pointer! */
658 target -= bytesToWrite; result = targetExhausted; break;
659 }
660 switch (bytesToWrite) { /* note: everything falls through. */
661 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
662 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
663 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
664 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
665 }
666 target += bytesToWrite;
667 }
668 *sourceStart = source;
669 *targetStart = target;
670 return result;
671}

References conversionOK, firstByteMark, sourceIllegal, strictConversion, targetExhausted, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

◆ icConvertUTF8toUTF16()

icUtfConversionResult icConvertUTF8toUTF16 ( const UTF8 ** sourceStart,
const UTF8 * sourceEnd,
UTF16 ** targetStart,
UTF16 * targetEnd,
icUtfConversionFlags flags )

Definition at line 489 of file IccConvertUTF.cpp.

491{
492 icUtfConversionResult result = conversionOK;
493 const UTF8* source = *sourceStart;
494 UTF16* target = *targetStart;
495 while (source < sourceEnd) {
496 UTF32 ch = 0;
497 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
498 if (source + extraBytesToRead >= sourceEnd) {
499 result = sourceExhausted; break;
500 }
501 /* Do this check whether lenient or strict */
502 if (! isLegalUTF8(source, extraBytesToRead+1)) {
503 result = sourceIllegal;
504 break;
505 }
506 /*
507 * The cases all fall through. See "Note A" below.
508 */
509 switch (extraBytesToRead) {
510 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
511 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
512 case 3: ch += *source++; ch <<= 6;
513 case 2: ch += *source++; ch <<= 6;
514 case 1: ch += *source++; ch <<= 6;
515 case 0: ch += *source++;
516 }
517 ch -= offsetsFromUTF8[extraBytesToRead];
518
519 if (target >= targetEnd) {
520 source -= (extraBytesToRead+1); /* Back up source pointer! */
521 result = targetExhausted; break;
522 }
523 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
524 /* UTF-16 surrogate values are illegal in UTF-32 */
525 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
526 if (flags == strictConversion) {
527 source -= (extraBytesToRead+1); /* return to the illegal value itself */
528 result = sourceIllegal;
529 break;
530 } else {
531 *target++ = UNI_REPLACEMENT_CHAR;
532 }
533 } else {
534 *target++ = (UTF16)ch; /* normal case */
535 }
536 } else if (ch > UNI_MAX_UTF16) {
537 if (flags == strictConversion) {
538 result = sourceIllegal;
539 source -= (extraBytesToRead+1); /* return to the start */
540 break; /* Bail out; shouldn't continue */
541 } else {
542 *target++ = UNI_REPLACEMENT_CHAR;
543 }
544 } else {
545 /* target is a character in range 0x10000 - 0x10FFFF. */
546 if (target + 1 >= targetEnd) {
547 source -= (extraBytesToRead+1); /* Back up source pointer! */
548 result = targetExhausted; break;
549 }
550 ch -= halfBase;
551 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
552 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
553 }
554 }
555 *sourceStart = source;
556 *targetStart = target;
557 return result;
558}
static const char trailingBytesForUTF8[256]
static Boolean isLegalUTF8(const UTF8 *source, int length)
static const UTF32 offsetsFromUTF8[6]
#define UNI_MAX_UTF16

References conversionOK, halfBase, halfMask, halfShift, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, trailingBytesForUTF8, UNI_MAX_BMP, UNI_MAX_UTF16, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccUTF16String::CIccUTF16String(), CIccUTF16String::FromUtf8(), and CIccTagUtf16Text::SetText().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ icConvertUTF8toUTF32()

icUtfConversionResult icConvertUTF8toUTF32 ( const UTF8 ** sourceStart,
const UTF8 * sourceEnd,
UTF32 ** targetStart,
UTF32 * targetEnd,
icUtfConversionFlags flags )

Definition at line 723 of file IccConvertUTF.cpp.

725{
726 icUtfConversionResult result = conversionOK;
727 const UTF8* source = *sourceStart;
728 UTF32* target = *targetStart;
729 while (source < sourceEnd) {
730 UTF32 ch = 0;
731 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
732 if (source + extraBytesToRead >= sourceEnd) {
733 result = sourceExhausted; break;
734 }
735 /* Do this check whether lenient or strict */
736 if (! isLegalUTF8(source, extraBytesToRead+1)) {
737 result = sourceIllegal;
738 break;
739 }
740 /*
741 * The cases all fall through. See "Note A" below.
742 */
743 switch (extraBytesToRead) {
744 case 5: ch += *source++; ch <<= 6;
745 case 4: ch += *source++; ch <<= 6;
746 case 3: ch += *source++; ch <<= 6;
747 case 2: ch += *source++; ch <<= 6;
748 case 1: ch += *source++; ch <<= 6;
749 case 0: ch += *source++;
750 }
751 ch -= offsetsFromUTF8[extraBytesToRead];
752
753 if (target >= targetEnd) {
754 source -= (extraBytesToRead+1); /* Back up the source pointer! */
755 result = targetExhausted; break;
756 }
757 if (ch <= UNI_MAX_LEGAL_UTF32) {
758 /*
759 * UTF-16 surrogate values are illegal in UTF-32, and anything
760 * over Plane 17 (> 0x10FFFF) is illegal.
761 */
762 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
763 if (flags == strictConversion) {
764 source -= (extraBytesToRead+1); /* return to the illegal value itself */
765 result = sourceIllegal;
766 break;
767 } else {
768 *target++ = UNI_REPLACEMENT_CHAR;
769 }
770 } else {
771 *target++ = ch;
772 }
773 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
774 result = sourceIllegal;
775 *target++ = UNI_REPLACEMENT_CHAR;
776 }
777 }
778 *sourceStart = source;
779 *targetStart = target;
780 return result;
781}

References conversionOK, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, trailingBytesForUTF8, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

+ Here is the call graph for this function:

◆ icIsLegalUTF8Sequence()

Boolean icIsLegalUTF8Sequence ( const UTF8 * source,
const UTF8 * sourceEnd )

Definition at line 478 of file IccConvertUTF.cpp.

479{
480 int length = trailingBytesForUTF8[*source]+1;
481 if (source+length > sourceEnd) {
482 return false;
483 }
484 return isLegalUTF8(source, length);
485}

References isLegalUTF8(), and trailingBytesForUTF8.

+ Here is the call graph for this function: