#include "IccConvertUTF.h"

Include dependency graph for IccConvertUTF.cpp:

Macros
#define	false 0

#define	true 1

#define	UNI_SUR_HIGH_END (UTF32)0xDBFF

#define	UNI_SUR_HIGH_START (UTF32)0xD800

#define	UNI_SUR_LOW_END (UTF32)0xDFFF

#define	UNI_SUR_LOW_START (UTF32)0xDC00

Functions
icUtfConversionResult	icConvertUTF16toUTF32 (const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF16toUTF32 (const UTF16 *source, const UTF16 *sourceEnd, icUtf32Vector target, UTF32 *targetEnd, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF16toUTF8 (const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF16toUTF8 (const UTF16 *source, const UTF16 *sourceEnd, icUtf8Vector &target, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF32toUTF16 (const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF32toUTF16 (const UTF32 *source, const UTF32 *sourceEnd, icUtf16Vector &target, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF32toUTF8 (const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF32toUTF8 (const UTF32 *source, const UTF32 *sourceEnd, icUtf8Vector &target, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF8toUTF16 (const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF8toUTF16 (const UTF8 *source, const UTF8 *sourceEnd, icUtf16Vector &target, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF8toUTF32 (const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)

icUtfConversionResult	icConvertUTF8toUTF32 (const UTF8 *source, const UTF8 *sourceEnd, icUtf32Vector &target, icUtfConversionFlags flags)

Boolean	icIsLegalUTF8Sequence (const UTF8 *source, const UTF8 *sourceEnd)

static Boolean	isLegalUTF8 (const UTF8 *source, int length)

Variables
static const UTF8	firstByteMark [7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }

static const UTF32	halfBase = 0x0010000UL

static const UTF32	halfMask = 0x3FFUL

static const int	halfShift = 10

static const UTF32	offsetsFromUTF8 [6]

static const char	trailingBytesForUTF8 [256]

Macro Definition Documentation

◆ false

#define false 0

Definition at line 56 of file IccConvertUTF.cpp.

◆ true

#define true 1

Definition at line 57 of file IccConvertUTF.cpp.

◆ UNI_SUR_HIGH_END

#define UNI_SUR_HIGH_END (UTF32)0xDBFF

Definition at line 53 of file IccConvertUTF.cpp.

Referenced by icConvertUTF16toUTF32(), icConvertUTF16toUTF32(), icConvertUTF16toUTF8(), and icConvertUTF16toUTF8().

◆ UNI_SUR_HIGH_START

#define UNI_SUR_HIGH_START (UTF32)0xD800

Definition at line 52 of file IccConvertUTF.cpp.

Referenced by icConvertUTF16toUTF32(), icConvertUTF16toUTF32(), icConvertUTF16toUTF8(), icConvertUTF16toUTF8(), icConvertUTF32toUTF16(), icConvertUTF32toUTF16(), icConvertUTF32toUTF8(), icConvertUTF32toUTF8(), icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), and icConvertUTF8toUTF32().

◆ UNI_SUR_LOW_END

#define UNI_SUR_LOW_END (UTF32)0xDFFF

Definition at line 55 of file IccConvertUTF.cpp.

Referenced by icConvertUTF16toUTF32(), icConvertUTF16toUTF32(), icConvertUTF16toUTF8(), icConvertUTF16toUTF8(), icConvertUTF32toUTF16(), icConvertUTF32toUTF16(), icConvertUTF32toUTF8(), icConvertUTF32toUTF8(), icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), and icConvertUTF8toUTF32().

◆ UNI_SUR_LOW_START

#define UNI_SUR_LOW_START (UTF32)0xDC00

Definition at line 54 of file IccConvertUTF.cpp.

Referenced by icConvertUTF16toUTF32(), icConvertUTF16toUTF32(), icConvertUTF16toUTF8(), icConvertUTF16toUTF8(), icConvertUTF32toUTF16(), icConvertUTF32toUTF16(), icConvertUTF8toUTF16(), and icConvertUTF8toUTF16().

Function Documentation

◆ icConvertUTF16toUTF32() [1/2]

icUtfConversionResult icConvertUTF16toUTF32	(	const UTF16 **	sourceStart,
		const UTF16 *	sourceEnd,
		UTF32 **	targetStart,
		UTF32 *	targetEnd,
		icUtfConversionFlags	flags )

Definition at line 147 of file IccConvertUTF.cpp.

{
  /*
   * Convert the UTF-16 code units in [*sourceStart, sourceEnd) into UTF-32
   * code points stored in [*targetStart, targetEnd).
   *
   * On return *sourceStart and *targetStart have been advanced past what was
   * consumed and written.  Result codes:
   *   conversionOK    - all input was converted;
   *   sourceExhausted - the input ends with a high surrogate whose partner is
   *                     missing (*sourceStart is left on that surrogate);
   *   sourceIllegal   - strictConversion only: an unpaired surrogate was found
   *                     (*sourceStart is left on the offending code unit);
   *   targetExhausted - the output buffer is full (*sourceStart is left on the
   *                     first code unit of the character that did not fit).
   * Without strictConversion an unpaired surrogate is copied through unchanged.
   */
  icUtfConversionResult result = conversionOK;
  const UTF16* source = *sourceStart;
  UTF32* target = *targetStart;
  /* Initialised so the CVTUTF_DEBUG trace below never reads an indeterminate value. */
  UTF32 ch = 0, ch2 = 0;
  while (source < sourceEnd) {
    const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
    ch = *source++;
    ch2 = 0; /* forget the partner that was read for an earlier character */
    /* If we have a surrogate pair, convert to UTF32 first. */
    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
      /* If the 16 bits following the high surrogate are in the source buffer... */
      if (source < sourceEnd) {
        ch2 = *source;
        /* If it's a low surrogate, convert to UTF32. */
        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
          ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
            + (ch2 - UNI_SUR_LOW_START) + halfBase;
          ++source;
        } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
          --source; /* return to the illegal value itself */
          result = sourceIllegal;
          break;
        }
      } else { /* We don't have the 16 bits following the high surrogate. */
        --source; /* return to the high surrogate */
        result = sourceExhausted;
        break;
      }
    } else if (flags == strictConversion) {
      /* UTF-16 surrogate values are illegal in UTF-32 */
      if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
        --source; /* return to the illegal value itself */
        result = sourceIllegal;
        break;
      }
    }
    if (target >= targetEnd) {
      source = oldSource; /* Back up source pointer! */
      result = targetExhausted; break;
    }
    *target++ = ch;
  }
  *sourceStart = source;
  *targetStart = target;
#ifdef CVTUTF_DEBUG
  if (result == sourceIllegal) {
    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    fflush(stderr);
  }
#endif
  return result;
}

References conversionOK, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF16toUTF32() [2/2]

icUtfConversionResult icConvertUTF16toUTF32	(	const UTF16 *	source,
		const UTF16 *	sourceEnd,
		icUtf32Vector	target,
		UTF32 *	targetEnd,
		icUtfConversionFlags	flags )

Definition at line 202 of file IccConvertUTF.cpp.

{
  /*
   * Vector flavour of the UTF-16 -> UTF-32 conversion: `target` is cleared and
   * then receives one UTF-32 code point per decoded character from
   * [source, sourceEnd).
   *
   * Result codes:
   *   conversionOK    - all input was converted;
   *   sourceExhausted - the input ends with a high surrogate whose partner is missing;
   *   sourceIllegal   - strictConversion only: an unpaired surrogate was found.
   * Without strictConversion an unpaired surrogate is appended unchanged.
   *
   * NOTE(review): in this overload `target` is received by value (the other
   * vector overloads take a reference) and `targetEnd` is never used, so
   * presumably the converted code points never reach the caller.  Correcting
   * that requires a matching change to the declaration in the header - confirm
   * against callers before changing the signature.
   */
  icUtfConversionResult result = conversionOK;
  target.clear();
  /* Initialised so the CVTUTF_DEBUG trace below never reads an indeterminate value. */
  UTF32 ch = 0, ch2 = 0;
  while (source < sourceEnd) {
    ch = *source++;
    ch2 = 0; /* forget the partner that was read for an earlier character */
    /* If we have a surrogate pair, convert to UTF32 first. */
    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
      /* If the 16 bits following the high surrogate are in the source buffer... */
      if (source < sourceEnd) {
        ch2 = *source;
        /* If it's a low surrogate, convert to UTF32. */
        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
          ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
            + (ch2 - UNI_SUR_LOW_START) + halfBase;
          ++source;
        } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
          --source; /* return to the illegal value itself */
          result = sourceIllegal;
          break;
        }
      } else { /* We don't have the 16 bits following the high surrogate. */
        --source; /* return to the high surrogate */
        result = sourceExhausted;
        break;
      }
    } else if (flags == strictConversion) {
      /* UTF-16 surrogate values are illegal in UTF-32 */
      if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
        --source; /* return to the illegal value itself */
        result = sourceIllegal;
        break;
      }
    }
    /* The vector grows on demand, so there is no target-overflow case to back out of. */
    target.push_back(ch);
  }
#ifdef CVTUTF_DEBUG
  if (result == sourceIllegal) {
    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    fflush(stderr);
  }
#endif
  return result;
}

References conversionOK, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF16toUTF8() [1/2]

icUtfConversionResult icConvertUTF16toUTF8	(	const UTF16 **	sourceStart,
		const UTF16 *	sourceEnd,
		UTF8 **	targetStart,
		UTF8 *	targetEnd,
		icUtfConversionFlags	flags )

Definition at line 299 of file IccConvertUTF.cpp.

{
  /*
   * Transcode UTF-16 from [*sourceStart, sourceEnd) into UTF-8 bytes placed in
   * [*targetStart, targetEnd).  Both start pointers are advanced past whatever
   * was consumed / produced before returning.
   *
   * Result codes:
   *   conversionOK    - everything was converted;
   *   sourceExhausted - the input stops right after a high surrogate
   *                     (input pointer left on that surrogate);
   *   sourceIllegal   - strictConversion only: unpaired surrogate
   *                     (input pointer left on it);
   *   targetExhausted - the next character does not fit in the output
   *                     (input pointer left on its first code unit).
   */
  const UTF32 trailMark = 0x80; /* sets bit 7 of a continuation byte */
  const UTF32 trailMask = 0xBF; /* then clears bit 6, giving 10xxxxxx */
  icUtfConversionResult status = conversionOK;
  const UTF16* in = *sourceStart;
  UTF8* out = *targetStart;

  while (in < sourceEnd) {
    const UTF16* const charStart = in; /* rewind point if the output is too small */
    UTF32 cp = *in++;

    if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_HIGH_END) {
      /* A high surrogate needs the following code unit to form a code point. */
      if (!(in < sourceEnd)) {
        --in; /* stay on the high surrogate */
        status = sourceExhausted;
        break;
      }
      const UTF32 partner = *in;
      if (partner >= UNI_SUR_LOW_START && partner <= UNI_SUR_LOW_END) {
        cp = ((cp - UNI_SUR_HIGH_START) << halfShift)
          + (partner - UNI_SUR_LOW_START) + halfBase;
        ++in;
      } else if (flags == strictConversion) {
        --in; /* unpaired high surrogate: point back at it */
        status = sourceIllegal;
        break;
      }
    } else if (flags == strictConversion
               && cp >= UNI_SUR_LOW_START && cp <= UNI_SUR_LOW_END) {
      --in; /* stray low surrogate: point back at it */
      status = sourceIllegal;
      break;
    }

    /* Size of the UTF-8 sequence; out-of-range values become the replacement character. */
    unsigned short byteCount = 0;
    if (cp < (UTF32)0x80) {
      byteCount = 1;
    } else if (cp < (UTF32)0x800) {
      byteCount = 2;
    } else if (cp < (UTF32)0x10000) {
      byteCount = 3;
    } else if (cp < (UTF32)0x110000) {
      byteCount = 4;
    } else {
      byteCount = 3;
      cp = UNI_REPLACEMENT_CHAR;
    }

    UTF8* const seqEnd = out + byteCount;
    if (seqEnd > targetEnd) {
      in = charStart;
      status = targetExhausted;
      break;
    }

    /* Fill the sequence from its last byte backwards, six payload bits at a time. */
    UTF8* cursor = seqEnd;
    for (unsigned short left = byteCount; left > 1; --left) {
      *--cursor = (UTF8)((cp | trailMark) & trailMask);
      cp >>= 6;
    }
    *--cursor = (UTF8)(cp | firstByteMark[byteCount]); /* lead byte carries the length tag */
    out = seqEnd;
  }

  *sourceStart = in;
  *targetStart = out;
  return status;
}

References conversionOK, firstByteMark, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccTagUtf16Text::GetText(), icUtf16ToUtf8(), CIccTagUtf8Text::SetText(), and CIccTagZipUtf8Text::SetText().

Here is the caller graph for this function:

◆ icConvertUTF16toUTF8() [2/2]

icUtfConversionResult icConvertUTF16toUTF8	(	const UTF16 *	source,
		const UTF16 *	sourceEnd,
		icUtf8Vector &	target,
		icUtfConversionFlags	flags )

Definition at line 367 of file IccConvertUTF.cpp.

{
  /*
   * Vector flavour of the UTF-16 -> UTF-8 conversion: `target` is cleared and
   * then receives the UTF-8 bytes for every character decoded from
   * [source, sourceEnd).
   *
   * Result codes:
   *   conversionOK    - all input was converted;
   *   sourceExhausted - the input ends with a high surrogate whose partner is missing;
   *   sourceIllegal   - strictConversion only: an unpaired surrogate was found.
   * `source` is a by-value pointer, so the stop position is not reported back.
   */
  icUtfConversionResult result = conversionOK;
  target.clear();
  while (source < sourceEnd) {
    UTF32 ch;
    unsigned short bytesToWrite = 0;
    const UTF32 byteMask = 0xBF;
    const UTF32 byteMark = 0x80; 
    /* No rewind pointer is kept here: the vector grows on demand, so there is
       no target-overflow case to back out of. */
    ch = *source++;
    /* If we have a surrogate pair, convert to UTF32 first. */
    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
      /* If the 16 bits following the high surrogate are in the source buffer... */
      if (source < sourceEnd) {
        UTF32 ch2 = *source;
        /* If it's a low surrogate, convert to UTF32. */
        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
          ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
            + (ch2 - UNI_SUR_LOW_START) + halfBase;
          ++source;
        } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
          --source; /* return to the illegal value itself */
          result = sourceIllegal;
          break;
        }
      } else { /* We don't have the 16 bits following the high surrogate. */
        --source; /* return to the high surrogate */
        result = sourceExhausted;
        break;
      }
    } else if (flags == strictConversion) {
      /* UTF-16 surrogate values are illegal in UTF-32 */
      if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
        --source; /* return to the illegal value itself */
        result = sourceIllegal;
        break;
      }
    }
    /* Figure out how many bytes the result will require */
    if (ch < (UTF32)0x80) {      bytesToWrite = 1;
    } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
    } else {                bytesToWrite = 3;
    ch = UNI_REPLACEMENT_CHAR;
    }
 
    /* Build the sequence back-to-front in a scratch buffer; ptr ends on buf[0]. */
    UTF8 buf[5], *ptr = &buf[bytesToWrite];
    switch (bytesToWrite) { /* note: everything falls through. */
      case 4: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
      case 3: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
      case 2: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
      case 1: *--ptr =  (UTF8)(ch | firstByteMark[bytesToWrite]);
    }
    /* Then append it front-to-back; again every case falls through. */
    switch(bytesToWrite) {
      case 4: target.push_back(*ptr++);
      case 3: target.push_back(*ptr++);
      case 2: target.push_back(*ptr++);
      case 1: target.push_back(*ptr++);
    }
  }
  return result;
}

References conversionOK, firstByteMark, halfBase, halfShift, sourceExhausted, sourceIllegal, strictConversion, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_END, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF32toUTF16() [1/2]

icUtfConversionResult icConvertUTF32toUTF16	(	const UTF32 **	sourceStart,
		const UTF32 *	sourceEnd,
		UTF16 **	targetStart,
		UTF16 *	targetEnd,
		icUtfConversionFlags	flags )

Definition at line 61 of file IccConvertUTF.cpp.

{
  /*
   * Transcode UTF-32 code points from [*sourceStart, sourceEnd) into UTF-16
   * code units placed in [*targetStart, targetEnd); both start pointers are
   * advanced past what was consumed / produced.
   *
   * Result codes:
   *   conversionOK    - everything was converted;
   *   targetExhausted - no room for the next character (input pointer left on it);
   *   sourceIllegal   - strictConversion only: a surrogate value (conversion
   *                     stops with the input pointer on it) or a value above
   *                     UNI_MAX_LEGAL_UTF32 was met.
   * In lenient mode both kinds of bad input become UNI_REPLACEMENT_CHAR.
   *
   * Note: for a value above UNI_MAX_LEGAL_UTF32 in strict mode the result is
   * set to sourceIllegal but the loop keeps running, so the value is skipped
   * and a later targetExhausted may replace the result.
   */
  icUtfConversionResult status = conversionOK;
  const UTF32* in = *sourceStart;
  UTF16* out = *targetStart;

  while (in < sourceEnd) {
    if (out >= targetEnd) {
      status = targetExhausted;
      break;
    }
    UTF32 cp = *in++;

    if (cp > UNI_MAX_BMP) {
      if (cp > UNI_MAX_LEGAL_UTF32) {
        /* Not a Unicode code point at all. */
        if (flags == strictConversion) {
          status = sourceIllegal;
        } else {
          *out++ = UNI_REPLACEMENT_CHAR;
        }
      } else {
        /* Supplementary plane: needs room for a surrogate pair. */
        if (out + 1 >= targetEnd) {
          --in; /* retry this code point on the next call */
          status = targetExhausted;
          break;
        }
        cp -= halfBase;
        *out++ = (UTF16)((cp >> halfShift) + UNI_SUR_HIGH_START);
        *out++ = (UTF16)((cp & halfMask) + UNI_SUR_LOW_START);
      }
    } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      /* Surrogate values are not legal as UTF-32 input. */
      if (flags == strictConversion) {
        --in; /* point back at the offending value */
        status = sourceIllegal;
        break;
      }
      *out++ = UNI_REPLACEMENT_CHAR;
    } else {
      *out++ = (UTF16)cp; /* ordinary BMP character */
    }
  }

  *sourceStart = in;
  *targetStart = out;
  return status;
}

References conversionOK, halfBase, halfMask, halfShift, sourceIllegal, strictConversion, targetExhausted, UNI_MAX_BMP, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccLocalizedUnicode::SetText().

Here is the caller graph for this function:

◆ icConvertUTF32toUTF16() [2/2]

icUtfConversionResult icConvertUTF32toUTF16	(	const UTF32 *	source,
		const UTF32 *	sourceEnd,
		icUtf16Vector &	target,
		icUtfConversionFlags	flags )

Definition at line 108 of file IccConvertUTF.cpp.

{
  /*
   * Vector flavour of the UTF-32 -> UTF-16 conversion: `target` is cleared and
   * then receives the UTF-16 code units for [source, sourceEnd).
   *
   * Returns conversionOK, or sourceIllegal when strictConversion is requested
   * and a surrogate value (conversion stops there) or a value above
   * UNI_MAX_LEGAL_UTF32 (value skipped, conversion continues) is met.  In
   * lenient mode both are replaced by UNI_REPLACEMENT_CHAR.
   */
  icUtfConversionResult status = conversionOK;
  target.clear();

  while (source < sourceEnd) {
    UTF32 cp = *source++;

    if (cp > UNI_MAX_BMP) {
      if (cp > UNI_MAX_LEGAL_UTF32) {
        /* Not a Unicode code point at all. */
        if (flags == strictConversion) {
          status = sourceIllegal;
        } else {
          target.push_back(UNI_REPLACEMENT_CHAR);
        }
      } else {
        /* Supplementary plane: emit a high/low surrogate pair. */
        cp -= halfBase;
        target.push_back((UTF16)((cp >> halfShift) + UNI_SUR_HIGH_START));
        target.push_back((UTF16)((cp & halfMask) + UNI_SUR_LOW_START));
      }
    } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      /* Surrogate values are not legal as UTF-32 input. */
      if (flags == strictConversion) {
        --source; /* point back at the offending value */
        status = sourceIllegal;
        break;
      }
      target.push_back(UNI_REPLACEMENT_CHAR);
    } else {
      target.push_back((UTF16)cp); /* ordinary BMP character */
    }
  }

  return status;
}

References conversionOK, halfBase, halfMask, halfShift, sourceIllegal, strictConversion, UNI_MAX_BMP, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

◆ icConvertUTF32toUTF8() [1/2]

icUtfConversionResult icConvertUTF32toUTF8	(	const UTF32 **	sourceStart,
		const UTF32 *	sourceEnd,
		UTF8 **	targetStart,
		UTF8 *	targetEnd,
		icUtfConversionFlags	flags )

Definition at line 622 of file IccConvertUTF.cpp.

{
  /*
   * Transcode UTF-32 code points from [*sourceStart, sourceEnd) into UTF-8
   * bytes placed in [*targetStart, targetEnd); both start pointers are
   * advanced past what was consumed / produced.
   *
   * Result codes:
   *   conversionOK    - everything was converted;
   *   sourceIllegal   - strictConversion only: a surrogate value was met
   *                     (conversion stops, input pointer left on it); or, in
   *                     either mode, a value above UNI_MAX_LEGAL_UTF32 was
   *                     replaced by UNI_REPLACEMENT_CHAR (conversion continues);
   *   targetExhausted - the next character does not fit (input pointer left
   *                     on it); this overrides an earlier sourceIllegal.
   */
  const UTF32 trailMark = 0x80; /* sets bit 7 of a continuation byte */
  const UTF32 trailMask = 0xBF; /* then clears bit 6, giving 10xxxxxx */
  icUtfConversionResult status = conversionOK;
  const UTF32* in = *sourceStart;
  UTF8* out = *targetStart;

  while (in < sourceEnd) {
    UTF32 cp = *in++;

    /* Surrogate values are not legal as UTF-32 input; only strict mode rejects them. */
    if (flags == strictConversion
        && cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      --in; /* point back at the offending value */
      status = sourceIllegal;
      break;
    }

    /* Sequence length; anything beyond the legal range turns into the replacement character. */
    unsigned short byteCount = 0;
    if (cp < (UTF32)0x80) {
      byteCount = 1;
    } else if (cp < (UTF32)0x800) {
      byteCount = 2;
    } else if (cp < (UTF32)0x10000) {
      byteCount = 3;
    } else if (cp <= UNI_MAX_LEGAL_UTF32) {
      byteCount = 4;
    } else {
      byteCount = 3;
      cp = UNI_REPLACEMENT_CHAR;
      status = sourceIllegal;
    }

    UTF8* const seqEnd = out + byteCount;
    if (seqEnd > targetEnd) {
      --in; /* retry this code point on the next call */
      status = targetExhausted;
      break;
    }

    /* Fill the sequence from its last byte backwards, six payload bits at a time. */
    UTF8* cursor = seqEnd;
    for (unsigned short left = byteCount; left > 1; --left) {
      *--cursor = (UTF8)((cp | trailMark) & trailMask);
      cp >>= 6;
    }
    *--cursor = (UTF8) (cp | firstByteMark[byteCount]); /* lead byte carries the length tag */
    out = seqEnd;
  }

  *sourceStart = in;
  *targetStart = out;
  return status;
}

References conversionOK, firstByteMark, sourceIllegal, strictConversion, targetExhausted, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

◆ icConvertUTF32toUTF8() [2/2]

icUtfConversionResult icConvertUTF32toUTF8	(	const UTF32 *	source,
		const UTF32 *	sourceEnd,
		icUtf8Vector &	target,
		icUtfConversionFlags	flags )

Definition at line 673 of file IccConvertUTF.cpp.

{
  /*
   * Vector flavour of the UTF-32 -> UTF-8 conversion: `target` is cleared and
   * then receives the UTF-8 bytes for [source, sourceEnd).
   *
   * Returns conversionOK, or sourceIllegal when either
   *   - strictConversion is requested and a surrogate value is met
   *     (conversion stops there), or
   *   - a value above UNI_MAX_LEGAL_UTF32 is met in any mode (it is emitted as
   *     UNI_REPLACEMENT_CHAR and conversion continues).
   */
  const UTF32 trailMark = 0x80; /* sets bit 7 of a continuation byte */
  const UTF32 trailMask = 0xBF; /* then clears bit 6, giving 10xxxxxx */
  icUtfConversionResult status = conversionOK;
  target.clear();

  while (source < sourceEnd) {
    UTF32 cp = *source++;

    /* Surrogate values are not legal as UTF-32 input; only strict mode rejects them. */
    if (flags == strictConversion
        && cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      --source; /* point back at the offending value */
      status = sourceIllegal;
      break;
    }

    /* Sequence length; anything beyond the legal range turns into the replacement character. */
    unsigned short byteCount = 0;
    if (cp < (UTF32)0x80) {
      byteCount = 1;
    } else if (cp < (UTF32)0x800) {
      byteCount = 2;
    } else if (cp < (UTF32)0x10000) {
      byteCount = 3;
    } else if (cp <= UNI_MAX_LEGAL_UTF32) {
      byteCount = 4;
    } else {
      byteCount = 3;
      cp = UNI_REPLACEMENT_CHAR;
      status = sourceIllegal;
    }

    /* Assemble the sequence back-to-front in a scratch buffer... */
    UTF8 scratch[5];
    UTF8* cursor = &scratch[byteCount];
    for (unsigned short left = byteCount; left > 1; --left) {
      *--cursor = (UTF8)((cp | trailMark) & trailMask);
      cp >>= 6;
    }
    *--cursor = (UTF8)(cp | firstByteMark[byteCount]); /* lead byte carries the length tag */

    /* ...then append it in reading order (cursor is back on scratch[0]). */
    for (unsigned short sent = 0; sent < byteCount; ++sent) {
      target.push_back(*cursor++);
    }
  }

  return status;
}

References conversionOK, firstByteMark, sourceIllegal, strictConversion, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

◆ icConvertUTF8toUTF16() [1/2]

icUtfConversionResult icConvertUTF8toUTF16	(	const UTF8 **	sourceStart,
		const UTF8 *	sourceEnd,
		UTF16 **	targetStart,
		UTF16 *	targetEnd,
		icUtfConversionFlags	flags )

Definition at line 489 of file IccConvertUTF.cpp.

{
  /*
   * Transcode UTF-8 bytes from [*sourceStart, sourceEnd) into UTF-16 code
   * units placed in [*targetStart, targetEnd); both start pointers are
   * advanced past what was consumed / produced.  Whenever conversion stops
   * early the input pointer is left on the first byte of the sequence that
   * was not converted.
   *
   * Result codes:
   *   conversionOK    - everything was converted;
   *   sourceExhausted - the last sequence is cut short by sourceEnd;
   *   sourceIllegal   - isLegalUTF8() rejected a sequence (any mode), or
   *                     strictConversion is set and the decoded value is a
   *                     surrogate or lies above UNI_MAX_UTF16;
   *   targetExhausted - no room for the next character.
   * In lenient mode surrogates / too-large values become UNI_REPLACEMENT_CHAR.
   */
  icUtfConversionResult status = conversionOK;
  const UTF8* in = *sourceStart;
  UTF16* out = *targetStart;

  while (in < sourceEnd) {
    const UTF8* const seqStart = in; /* rewind point for every early exit below */
    const unsigned short trailCount = trailingBytesForUTF8[*in];

    if (in + trailCount >= sourceEnd) {
      status = sourceExhausted;
      break;
    }
    /* Well-formedness is enforced for lenient and strict conversion alike. */
    if (! isLegalUTF8(in, trailCount+1)) {
      status = sourceIllegal;
      break;
    }

    /* Sum lead and trail bytes, shifting six bits between them; the tag bits
       that got mixed in are removed by the offsetsFromUTF8 correction. */
    UTF32 cp = 0;
    for (unsigned short left = trailCount; left > 0; --left) {
      cp += *in++;
      cp <<= 6;
    }
    cp += *in++;
    cp -= offsetsFromUTF8[trailCount];

    if (out >= targetEnd) {
      in = seqStart;
      status = targetExhausted;
      break;
    }

    if (cp > UNI_MAX_BMP) {
      if (cp > UNI_MAX_UTF16) {
        /* Beyond what UTF-16 can represent. */
        if (flags == strictConversion) {
          status = sourceIllegal;
          in = seqStart;
          break;
        }
        *out++ = UNI_REPLACEMENT_CHAR;
      } else {
        /* Supplementary plane: needs room for a surrogate pair. */
        if (out + 1 >= targetEnd) {
          in = seqStart;
          status = targetExhausted;
          break;
        }
        cp -= halfBase;
        *out++ = (UTF16)((cp >> halfShift) + UNI_SUR_HIGH_START);
        *out++ = (UTF16)((cp & halfMask) + UNI_SUR_LOW_START);
      }
    } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      /* An encoded surrogate value is not a legal character. */
      if (flags == strictConversion) {
        in = seqStart;
        status = sourceIllegal;
        break;
      }
      *out++ = UNI_REPLACEMENT_CHAR;
    } else {
      *out++ = (UTF16)cp; /* ordinary BMP character */
    }
  }

  *sourceStart = in;
  *targetStart = out;
  return status;
}

References conversionOK, halfBase, halfMask, halfShift, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, trailingBytesForUTF8, UNI_MAX_BMP, UNI_MAX_UTF16, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Referenced by CIccUTF16String::CIccUTF16String(), CIccUTF16String::FromUtf8(), and CIccTagUtf16Text::SetText().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ icConvertUTF8toUTF16() [2/2]

icUtfConversionResult icConvertUTF8toUTF16	(	const UTF8 *	source,
		const UTF8 *	sourceEnd,
		icUtf16Vector &	target,
		icUtfConversionFlags	flags )

Definition at line 560 of file IccConvertUTF.cpp.

{
  /*
   * Vector flavour of the UTF-8 -> UTF-16 conversion: `target` is cleared and
   * then receives the UTF-16 code units decoded from [source, sourceEnd).
   *
   * Result codes:
   *   conversionOK    - everything was converted;
   *   sourceExhausted - the last sequence is cut short by sourceEnd;
   *   sourceIllegal   - isLegalUTF8() rejected a sequence (any mode), or
   *                     strictConversion is set and the decoded value is a
   *                     surrogate or lies above UNI_MAX_UTF16.
   * In lenient mode surrogates / too-large values become UNI_REPLACEMENT_CHAR.
   */
  icUtfConversionResult status = conversionOK;
  target.clear();

  while (source < sourceEnd) {
    const UTF8* const seqStart = source; /* first byte of the sequence being decoded */
    const unsigned short trailCount = trailingBytesForUTF8[*source];

    if (source + trailCount >= sourceEnd) {
      status = sourceExhausted;
      break;
    }
    /* Well-formedness is enforced for lenient and strict conversion alike. */
    if (! isLegalUTF8(source, trailCount+1)) {
      status = sourceIllegal;
      break;
    }

    /* Sum lead and trail bytes, shifting six bits between them; the tag bits
       that got mixed in are removed by the offsetsFromUTF8 correction. */
    UTF32 cp = 0;
    for (unsigned short left = trailCount; left > 0; --left) {
      cp += *source++;
      cp <<= 6;
    }
    cp += *source++;
    cp -= offsetsFromUTF8[trailCount];

    if (cp > UNI_MAX_BMP) {
      if (cp > UNI_MAX_UTF16) {
        /* Beyond what UTF-16 can represent. */
        if (flags == strictConversion) {
          status = sourceIllegal;
          source = seqStart;
          break;
        }
        target.push_back(UNI_REPLACEMENT_CHAR);
      } else {
        /* Supplementary plane: emit a high/low surrogate pair. */
        cp -= halfBase;
        target.push_back((UTF16)((cp >> halfShift) + UNI_SUR_HIGH_START));
        target.push_back((UTF16)((cp & halfMask) + UNI_SUR_LOW_START));
      }
    } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      /* An encoded surrogate value is not a legal character. */
      if (flags == strictConversion) {
        source = seqStart;
        status = sourceIllegal;
        break;
      }
      target.push_back(UNI_REPLACEMENT_CHAR);
    } else {
      target.push_back((UTF16)cp); /* ordinary BMP character */
    }
  }

  return status;
}

References conversionOK, halfBase, halfMask, halfShift, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, trailingBytesForUTF8, UNI_MAX_BMP, UNI_MAX_UTF16, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, UNI_SUR_LOW_END, and UNI_SUR_LOW_START.

Here is the call graph for this function:

◆ icConvertUTF8toUTF32() [1/2]

icUtfConversionResult icConvertUTF8toUTF32	(	const UTF8 **	sourceStart,
		const UTF8 *	sourceEnd,
		UTF32 **	targetStart,
		UTF32 *	targetEnd,
		icUtfConversionFlags	flags )

Definition at line 723 of file IccConvertUTF.cpp.

{
  /*
   * Transcode UTF-8 bytes from [*sourceStart, sourceEnd) into UTF-32 code
   * points placed in [*targetStart, targetEnd); both start pointers are
   * advanced past what was consumed / produced.
   *
   * Result codes:
   *   conversionOK    - everything was converted;
   *   sourceExhausted - the last sequence is cut short by sourceEnd;
   *   sourceIllegal   - isLegalUTF8() rejected a sequence (any mode);
   *                     strictConversion is set and the decoded value is a
   *                     surrogate; or a decoded value exceeded
   *                     UNI_MAX_LEGAL_UTF32 (replaced, conversion continues);
   *   targetExhausted - no room for the next character.
   * Except for the "replaced and continued" case, an early stop leaves the
   * input pointer on the first byte of the unconverted sequence.
   */
  icUtfConversionResult status = conversionOK;
  const UTF8* in = *sourceStart;
  UTF32* out = *targetStart;

  while (in < sourceEnd) {
    const UTF8* const seqStart = in; /* rewind point for the early exits below */
    const unsigned short trailCount = trailingBytesForUTF8[*in];

    if (in + trailCount >= sourceEnd) {
      status = sourceExhausted;
      break;
    }
    /* Well-formedness is enforced for lenient and strict conversion alike. */
    if (! isLegalUTF8(in, trailCount+1)) {
      status = sourceIllegal;
      break;
    }

    /* Sum lead and trail bytes, shifting six bits between them; the tag bits
       that got mixed in are removed by the offsetsFromUTF8 correction. */
    UTF32 cp = 0;
    for (unsigned short left = trailCount; left > 0; --left) {
      cp += *in++;
      cp <<= 6;
    }
    cp += *in++;
    cp -= offsetsFromUTF8[trailCount];

    if (out >= targetEnd) {
      in = seqStart;
      status = targetExhausted;
      break;
    }

    if (cp > UNI_MAX_LEGAL_UTF32) {
      /* Past the last Unicode plane: flag it and substitute. */
      status = sourceIllegal;
      *out++ = UNI_REPLACEMENT_CHAR;
    } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      /* Surrogate values are not legal in UTF-32. */
      if (flags == strictConversion) {
        in = seqStart;
        status = sourceIllegal;
        break;
      }
      *out++ = UNI_REPLACEMENT_CHAR;
    } else {
      *out++ = cp;
    }
  }

  *sourceStart = in;
  *targetStart = out;
  return status;
}

References conversionOK, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, targetExhausted, trailingBytesForUTF8, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

Here is the call graph for this function:

◆ icConvertUTF8toUTF32() [2/2]

icUtfConversionResult icConvertUTF8toUTF32	(	const UTF8 *	source,
		const UTF8 *	sourceEnd,
		icUtf32Vector &	target,
		icUtfConversionFlags	flags )

Definition at line 783 of file IccConvertUTF.cpp.

{
  /*
   * Vector flavour of the UTF-8 -> UTF-32 conversion: `target` is cleared and
   * then receives one code point per sequence decoded from [source, sourceEnd).
   *
   * Result codes:
   *   conversionOK    - everything was converted;
   *   sourceExhausted - the last sequence is cut short by sourceEnd;
   *   sourceIllegal   - isLegalUTF8() rejected a sequence (any mode);
   *                     strictConversion is set and the decoded value is a
   *                     surrogate; or a decoded value exceeded
   *                     UNI_MAX_LEGAL_UTF32 (replaced, conversion continues).
   * In lenient mode surrogate values become UNI_REPLACEMENT_CHAR.
   */
  icUtfConversionResult status = conversionOK;
  target.clear();

  while (source < sourceEnd) {
    const UTF8* const seqStart = source; /* first byte of the sequence being decoded */
    const unsigned short trailCount = trailingBytesForUTF8[*source];

    if (source + trailCount >= sourceEnd) {
      status = sourceExhausted;
      break;
    }
    /* Well-formedness is enforced for lenient and strict conversion alike. */
    if (! isLegalUTF8(source, trailCount+1)) {
      status = sourceIllegal;
      break;
    }

    /* Sum lead and trail bytes, shifting six bits between them; the tag bits
       that got mixed in are removed by the offsetsFromUTF8 correction. */
    UTF32 cp = 0;
    for (unsigned short left = trailCount; left > 0; --left) {
      cp += *source++;
      cp <<= 6;
    }
    cp += *source++;
    cp -= offsetsFromUTF8[trailCount];

    if (cp > UNI_MAX_LEGAL_UTF32) {
      /* Past the last Unicode plane: flag it and substitute. */
      status = sourceIllegal;
      target.push_back(UNI_REPLACEMENT_CHAR);
    } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
      /* Surrogate values are not legal in UTF-32. */
      if (flags == strictConversion) {
        source = seqStart;
        status = sourceIllegal;
        break;
      }
      target.push_back(UNI_REPLACEMENT_CHAR);
    } else {
      target.push_back(cp);
    }
  }

  return status;
}

References conversionOK, isLegalUTF8(), offsetsFromUTF8, sourceExhausted, sourceIllegal, strictConversion, trailingBytesForUTF8, UNI_MAX_LEGAL_UTF32, UNI_REPLACEMENT_CHAR, UNI_SUR_HIGH_START, and UNI_SUR_LOW_END.

Here is the call graph for this function:

◆ icIsLegalUTF8Sequence()

Boolean icIsLegalUTF8Sequence	(	const UTF8 *	source,
		const UTF8 *	sourceEnd )

Definition at line 478 of file IccConvertUTF.cpp.

{
  /*
   * Report whether the bytes starting at `source` form one complete, legal
   * UTF-8 sequence that lies entirely inside [source, sourceEnd).
   * The expected length is taken from the lead byte via trailingBytesForUTF8;
   * the byte-level validation is delegated to isLegalUTF8().
   */

  /* An empty range has no lead byte to inspect; rejecting it up front avoids
     reading *source beyond the end of the caller's buffer. */
  if (source >= sourceEnd) {
    return false;
  }
  int length = trailingBytesForUTF8[*source]+1;
  /* Compare lengths rather than forming source+length, which could point
     past the end of the buffer. */
  if (length > sourceEnd - source) {
    return false;
  }
  return isLegalUTF8(source, length);
}

References isLegalUTF8(), and trailingBytesForUTF8.

Here is the call graph for this function:

◆ isLegalUTF8()

static Boolean isLegalUTF8	(	const UTF8 *	source,
		int	length )

static

Definition at line 446 of file IccConvertUTF.cpp.

{
  /*
   * Checks whether the length bytes starting at source form a single
   * well-formed UTF-8 sequence. Only lengths 1 through 4 can be legal; any
   * other length returns false.
   *
   * The bytes are examined from the last one back towards the first:
   *   - bytes 4 and 3 (when present) must be continuation bytes 0x80..0xBF;
   *   - byte 2 must be a continuation byte, further narrowed for the lead
   *     bytes 0xE0, 0xED, 0xF0 and 0xF4;
   *   - the lead byte must not be in 0x80..0xC1 and must not exceed 0xF4.
   *
   * This routine does not verify that length matches what the lead byte
   * implies; the caller is expected to pass the matching length.
   */
  UTF8 a;
  const UTF8 *srcptr = source+length;
  switch (length) {
    default: return false;
      /* Everything else falls through when "true"... */
    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
      /* fall through */
    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
      /* fall through */
      /*
       * The second byte gets the full continuation-byte range check here.
       * Checking only the upper bound would let lead bytes 0xED and 0xF4
       * accept a second byte below 0x80, because their cases in the inner
       * switch test the upper limit only.
       */
    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

      switch (*source) {
            /* no fall-through in this inner switch */
        case 0xE0: if (a < 0xA0) return false; break; /* below 0xA0 would be an overlong form */
        case 0xED: if (a > 0x9F) return false; break; /* above 0x9F would encode a surrogate */
        case 0xF0: if (a < 0x90) return false; break; /* below 0x90 would be an overlong form */
        case 0xF4: if (a > 0x8F) return false; break; /* above 0x8F would exceed U+10FFFF */
        default:   break; /* 0x80..0xBF already enforced above */
      }
      /* fall through */

    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  }
  if (*source > 0xF4) return false;
  return true;
}

Referenced by icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), icConvertUTF8toUTF32(), and icIsLegalUTF8Sequence().

Here is the caller graph for this function:

Variable Documentation

◆ firstByteMark

const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }

static

Definition at line 285 of file IccConvertUTF.cpp.

285 { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

Referenced by icConvertUTF16toUTF8(), icConvertUTF16toUTF8(), icConvertUTF32toUTF8(), and icConvertUTF32toUTF8().

◆ halfBase

const UTF32 halfBase = 0x0010000UL

static

Definition at line 49 of file IccConvertUTF.cpp.

Referenced by icConvertUTF16toUTF32(), icConvertUTF16toUTF32(), icConvertUTF16toUTF8(), icConvertUTF16toUTF8(), icConvertUTF32toUTF16(), icConvertUTF32toUTF16(), icConvertUTF8toUTF16(), and icConvertUTF8toUTF16().

◆ halfMask

const UTF32 halfMask = 0x3FFUL

static

Definition at line 50 of file IccConvertUTF.cpp.

Referenced by icConvertUTF32toUTF16(), icConvertUTF32toUTF16(), icConvertUTF8toUTF16(), and icConvertUTF8toUTF16().

◆ halfShift

const int halfShift = 10

static

Definition at line 47 of file IccConvertUTF.cpp.

Referenced by icConvertUTF16toUTF32(), icConvertUTF16toUTF32(), icConvertUTF16toUTF8(), icConvertUTF16toUTF8(), icConvertUTF32toUTF16(), icConvertUTF32toUTF16(), icConvertUTF8toUTF16(), and icConvertUTF8toUTF16().

◆ offsetsFromUTF8

const UTF32 offsetsFromUTF8[6]

static

Initial value:

= { 0x00000000UL, 0x00003080UL, 0x000E2080UL,

0x03C82080UL, 0xFA082080UL, 0x82082080UL }

Definition at line 275 of file IccConvertUTF.cpp.

275 { 0x00000000UL, 0x00003080UL, 0x000E2080UL,

276 0x03C82080UL, 0xFA082080UL, 0x82082080UL };

Referenced by icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), and icConvertUTF8toUTF32().

◆ trailingBytesForUTF8

const char trailingBytesForUTF8[256]

static

Initial value:

= {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
}

Definition at line 259 of file IccConvertUTF.cpp.

                                              {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

Referenced by icConvertUTF8toUTF16(), icConvertUTF8toUTF16(), icConvertUTF8toUTF32(), icConvertUTF8toUTF32(), and icIsLegalUTF8Sequence().

Macros

Functions

Variables

Macro Definition Documentation

◆ false

◆ true

◆ UNI_SUR_HIGH_END

◆ UNI_SUR_HIGH_START

◆ UNI_SUR_LOW_END

◆ UNI_SUR_LOW_START

Function Documentation

◆ icConvertUTF16toUTF32() [1/2]

◆ icConvertUTF16toUTF32() [2/2]

◆ icConvertUTF16toUTF8() [1/2]

◆ icConvertUTF16toUTF8() [2/2]

◆ icConvertUTF32toUTF16() [1/2]

◆ icConvertUTF32toUTF16() [2/2]

◆ icConvertUTF32toUTF8() [1/2]

◆ icConvertUTF32toUTF8() [2/2]

◆ icConvertUTF8toUTF16() [1/2]

◆ icConvertUTF8toUTF16() [2/2]

◆ icConvertUTF8toUTF32() [1/2]

◆ icConvertUTF8toUTF32() [2/2]

◆ icIsLegalUTF8Sequence()

◆ isLegalUTF8()

Variable Documentation

◆ firstByteMark

◆ halfBase

◆ halfMask

◆ halfShift

◆ offsetsFromUTF8

◆ trailingBytesForUTF8