Hoyt's FORK of DemoIccMAX 2.1.17.hoyt
Documentation for Hoyt's FORK of DemoIccMAX
Loading...
Searching...
No Matches
IccConvertUTF.cpp
Go to the documentation of this file.
1/*
2* Copyright 2001-2004 Unicode, Inc.
3*
4* Disclaimer
5*
6* This source code is provided as is by Unicode, Inc. No claims are
7* made as to fitness for any particular purpose. No warranties of any
8* kind are expressed or implied. The recipient agrees to determine
9* applicability of information provided. If this file has been
10* purchased on magnetic or optical media from Unicode, Inc., the
11* sole remedy for any claim will be exchange of defective media
12* within 90 days of receipt.
13*
14* Limitations on Rights to Redistribute This Code
15*
16* Unicode, Inc. hereby grants the right to freely use the information
17* supplied in this file in the creation of products supporting the
18* Unicode Standard, and to make copies of this file in any form
19* for internal or external distribution as long as this notice
20* remains attached.
21*/
22
23/* ---------------------------------------------------------------------
24
25Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26Author: Mark E. Davis, 1994.
27Rev History: Rick McGowan, fixes & updates May 2001.
28Sept 2001: fixed const & error conditions per
29mods suggested by S. Parent & A. Lillich.
30June 2002: Tim Dodd added detection and handling of incomplete
31source sequences, enhanced error detection, added casts
32to eliminate compiler warnings.
33July 2003: slight mods to back out aggressive FFFE detection.
34Jan 2004: updated switches in from-UTF8 conversions.
35Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37See the header file "IccConvertUTF.h" for complete documentation.
38
39------------------------------------------------------------------------ */
40
41
42#include "IccConvertUTF.h"
43#ifdef CVTUTF_DEBUG
44#include <stdio.h>
45#endif
46
47static const int halfShift = 10; /* used for shifting by 10 bits */
48
49static const UTF32 halfBase = 0x0010000UL;
50static const UTF32 halfMask = 0x3FFUL;
51
52#define UNI_SUR_HIGH_START (UTF32)0xD800
53#define UNI_SUR_HIGH_END (UTF32)0xDBFF
54#define UNI_SUR_LOW_START (UTF32)0xDC00
55#define UNI_SUR_LOW_END (UTF32)0xDFFF
56#define false 0
57#define true 1
58
59/* --------------------------------------------------------------------- */
60
61icUtfConversionResult icConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
62 UTF16** targetStart, UTF16* targetEnd, icUtfConversionFlags flags)
63{
65 const UTF32* source = *sourceStart;
66 UTF16* target = *targetStart;
67 while (source < sourceEnd) {
68 UTF32 ch;
69 if (target >= targetEnd) {
70 result = targetExhausted; break;
71 }
72 ch = *source++;
73 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
74 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
76 if (flags == strictConversion) {
77 --source; /* return to the illegal value itself */
78 result = sourceIllegal;
79 break;
80 } else {
81 *target++ = UNI_REPLACEMENT_CHAR;
82 }
83 } else {
84 *target++ = (UTF16)ch; /* normal case */
85 }
86 } else if (ch > UNI_MAX_LEGAL_UTF32) {
87 if (flags == strictConversion) {
88 result = sourceIllegal;
89 } else {
90 *target++ = UNI_REPLACEMENT_CHAR;
91 }
92 } else {
93 /* target is a character in range 0xFFFF - 0x10FFFF. */
94 if (target + 1 >= targetEnd) {
95 --source; /* Back up source pointer! */
96 result = targetExhausted; break;
97 }
98 ch -= halfBase;
99 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
100 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
101 }
102 }
103 *sourceStart = source;
104 *targetStart = target;
105 return result;
106}
107
108icUtfConversionResult icConvertUTF32toUTF16 (const UTF32* source, const UTF32* sourceEnd,
109 icUtf16Vector &target, icUtfConversionFlags flags)
110{
112 target.clear();
113 while (source < sourceEnd) {
114 UTF32 ch;
115 ch = *source++;
116 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
117 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
118 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
119 if (flags == strictConversion) {
120 --source; /* return to the illegal value itself */
121 result = sourceIllegal;
122 break;
123 } else {
124 target.push_back(UNI_REPLACEMENT_CHAR);
125 }
126 } else {
127 target.push_back((UTF16)ch); /* normal case */
128 }
129 } else if (ch > UNI_MAX_LEGAL_UTF32) {
130 if (flags == strictConversion) {
131 result = sourceIllegal;
132 } else {
133 target.push_back(UNI_REPLACEMENT_CHAR);
134 }
135 } else {
136 ch -= halfBase;
137 target.push_back((UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START));
138 target.push_back((UTF16)((ch & halfMask) + UNI_SUR_LOW_START));
139 }
140 }
141 return result;
142}
143
144
145/* --------------------------------------------------------------------- */
146
147icUtfConversionResult icConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
148 UTF32** targetStart, UTF32* targetEnd, icUtfConversionFlags flags)
149{
151 const UTF16* source = *sourceStart;
152 UTF32* target = *targetStart;
153 UTF32 ch, ch2;
154 while (source < sourceEnd) {
155 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
156 ch = *source++;
157 /* If we have a surrogate pair, convert to UTF32 first. */
158 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
159 /* If the 16 bits following the high surrogate are in the source buffer... */
160 if (source < sourceEnd) {
161 ch2 = *source;
162 /* If it's a low surrogate, convert to UTF32. */
163 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
164 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
165 + (ch2 - UNI_SUR_LOW_START) + halfBase;
166 ++source;
167 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
168 --source; /* return to the illegal value itself */
169 result = sourceIllegal;
170 break;
171 }
172 } else { /* We don't have the 16 bits following the high surrogate. */
173 --source; /* return to the high surrogate */
174 result = sourceExhausted;
175 break;
176 }
177 } else if (flags == strictConversion) {
178 /* UTF-16 surrogate values are illegal in UTF-32 */
179 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
180 --source; /* return to the illegal value itself */
181 result = sourceIllegal;
182 break;
183 }
184 }
185 if (target >= targetEnd) {
186 source = oldSource; /* Back up source pointer! */
187 result = targetExhausted; break;
188 }
189 *target++ = ch;
190 }
191 *sourceStart = source;
192 *targetStart = target;
193#ifdef CVTUTF_DEBUG
194 if (result == sourceIllegal) {
195 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
196 fflush(stderr);
197 }
198#endif
199 return result;
200}
201
202icUtfConversionResult icConvertUTF16toUTF32 (const UTF16* source, const UTF16* sourceEnd,
203 icUtf32Vector target, UTF32* targetEnd, icUtfConversionFlags flags)
204{
206 target.clear();
207 UTF32 ch, ch2;
208 while (source < sourceEnd) {
209 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
210 ch = *source++;
211 /* If we have a surrogate pair, convert to UTF32 first. */
212 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
213 /* If the 16 bits following the high surrogate are in the source buffer... */
214 if (source < sourceEnd) {
215 ch2 = *source;
216 /* If it's a low surrogate, convert to UTF32. */
217 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
218 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
219 + (ch2 - UNI_SUR_LOW_START) + halfBase;
220 ++source;
221 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
222 --source; /* return to the illegal value itself */
223 result = sourceIllegal;
224 break;
225 }
226 } else { /* We don't have the 16 bits following the high surrogate. */
227 --source; /* return to the high surrogate */
228 result = sourceExhausted;
229 break;
230 }
231 } else if (flags == strictConversion) {
232 /* UTF-16 surrogate values are illegal in UTF-32 */
233 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
234 --source; /* return to the illegal value itself */
235 result = sourceIllegal;
236 break;
237 }
238 }
239 target.push_back(ch);
240 }
241#ifdef CVTUTF_DEBUG
242 if (result == sourceIllegal) {
243 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
244 fflush(stderr);
245 }
246#endif
247 return result;
248}
249
250/* --------------------------------------------------------------------- */
251
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
269
270/*
271* Magic values subtracted from a buffer value during UTF8 conversion.
272* This table contains as many values as there might be trailing bytes
273* in a UTF-8 sequence.
274*/
275static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
2760x03C82080UL, 0xFA082080UL, 0x82082080UL };
277
278/*
279* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
280* into the first byte, depending on how many bytes follow. There are
281* as many entries in this table as there are UTF-8 sequence types.
282* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
283* for *legal* UTF-8 will be 4 or fewer bytes total.
284*/
285static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
286
287/* --------------------------------------------------------------------- */
288
289/* The interface converts a whole buffer to avoid function-call overhead.
290* Constants have been gathered. Loops & conditionals have been removed as
291* much as possible for efficiency, in favor of drop-through switches.
292* (See "Note A" at the bottom of the file for equivalent code.)
293* If your compiler supports it, the "isLegalUTF8" call can be turned
294* into an inline function.
295*/
296
297/* --------------------------------------------------------------------- */
298
299icUtfConversionResult icConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
300 UTF8** targetStart, UTF8* targetEnd, icUtfConversionFlags flags)
301{
303 const UTF16* source = *sourceStart;
304 UTF8* target = *targetStart;
305 while (source < sourceEnd) {
306 UTF32 ch;
307 unsigned short bytesToWrite = 0;
308 const UTF32 byteMask = 0xBF;
309 const UTF32 byteMark = 0x80;
310 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
311 ch = *source++;
312 /* If we have a surrogate pair, convert to UTF32 first. */
313 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
314 /* If the 16 bits following the high surrogate are in the source buffer... */
315 if (source < sourceEnd) {
316 UTF32 ch2 = *source;
317 /* If it's a low surrogate, convert to UTF32. */
318 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
319 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
320 + (ch2 - UNI_SUR_LOW_START) + halfBase;
321 ++source;
322 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
323 --source; /* return to the illegal value itself */
324 result = sourceIllegal;
325 break;
326 }
327 } else { /* We don't have the 16 bits following the high surrogate. */
328 --source; /* return to the high surrogate */
329 result = sourceExhausted;
330 break;
331 }
332 } else if (flags == strictConversion) {
333 /* UTF-16 surrogate values are illegal in UTF-32 */
334 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
335 --source; /* return to the illegal value itself */
336 result = sourceIllegal;
337 break;
338 }
339 }
340 /* Figure out how many bytes the result will require */
341 if (ch < (UTF32)0x80) { bytesToWrite = 1;
342 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
343 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
344 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
345 } else { bytesToWrite = 3;
347 }
348
349 target += bytesToWrite;
350 if (target > targetEnd) {
351 source = oldSource; /* Back up source pointer! */
352 target -= bytesToWrite; result = targetExhausted; break;
353 }
354 switch (bytesToWrite) { /* note: everything falls through. */
355 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
356 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
357 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
359 }
360 target += bytesToWrite;
361 }
362 *sourceStart = source;
363 *targetStart = target;
364 return result;
365}
366
367icUtfConversionResult icConvertUTF16toUTF8 (const UTF16* source, const UTF16* sourceEnd,
368 icUtf8Vector &target, icUtfConversionFlags flags)
369{
371 target.clear();
372 while (source < sourceEnd) {
373 UTF32 ch;
374 unsigned short bytesToWrite = 0;
375 const UTF32 byteMask = 0xBF;
376 const UTF32 byteMark = 0x80;
377 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
378 ch = *source++;
379 /* If we have a surrogate pair, convert to UTF32 first. */
380 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
381 /* If the 16 bits following the high surrogate are in the source buffer... */
382 if (source < sourceEnd) {
383 UTF32 ch2 = *source;
384 /* If it's a low surrogate, convert to UTF32. */
385 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
386 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
387 + (ch2 - UNI_SUR_LOW_START) + halfBase;
388 ++source;
389 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
390 --source; /* return to the illegal value itself */
391 result = sourceIllegal;
392 break;
393 }
394 } else { /* We don't have the 16 bits following the high surrogate. */
395 --source; /* return to the high surrogate */
396 result = sourceExhausted;
397 break;
398 }
399 } else if (flags == strictConversion) {
400 /* UTF-16 surrogate values are illegal in UTF-32 */
401 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
402 --source; /* return to the illegal value itself */
403 result = sourceIllegal;
404 break;
405 }
406 }
407 /* Figure out how many bytes the result will require */
408 if (ch < (UTF32)0x80) { bytesToWrite = 1;
409 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
410 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
411 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
412 } else { bytesToWrite = 3;
414 }
415
416 UTF8 buf[5], *ptr = &buf[bytesToWrite];
417 switch (bytesToWrite) { /* note: everything falls through. */
418 case 4: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
419 case 3: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
420 case 2: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
421 case 1: *--ptr = (UTF8)(ch | firstByteMark[bytesToWrite]);
422 }
423 switch(bytesToWrite) {
424 case 4: target.push_back(*ptr++);
425 case 3: target.push_back(*ptr++);
426 case 2: target.push_back(*ptr++);
427 case 1: target.push_back(*ptr++);
428 }
429 }
430 return result;
431}
432
433/* --------------------------------------------------------------------- */
434
435/*
436* Utility routine to tell whether a sequence of bytes is legal UTF-8.
437* This must be called with the length pre-determined by the first byte.
438* If not calling this from ConvertUTF8to*, then the length can be set by:
439* length = trailingBytesForUTF8[*source]+1;
440* and the sequence is illegal right away if there aren't that many bytes
441* available.
442* If presented with a length > 4, this returns false. The Unicode
443* definition of UTF-8 goes up to 4-byte sequences.
444*/
445
446static Boolean isLegalUTF8(const UTF8 *source, int length)
447{
448 UTF8 a;
449 const UTF8 *srcptr = source+length;
450 switch (length) {
451 default: return false;
452 /* Everything else falls through when "true"... */
453 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
454 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
455 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
456
457 switch (*source) {
458 /* no fall-through in this inner switch */
459 case 0xE0: if (a < 0xA0) return false; break;
460 case 0xED: if (a > 0x9F) return false; break;
461 case 0xF0: if (a < 0x90) return false; break;
462 case 0xF4: if (a > 0x8F) return false; break;
463 default: if (a < 0x80) return false;
464 }
465
466 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
467 }
468 if (*source > 0xF4) return false;
469 return true;
470}
471
472/* --------------------------------------------------------------------- */
473
474/*
475* Exported function to return whether a UTF-8 sequence is legal or not.
476* This is not used here; it's just exported.
477*/
478Boolean icIsLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
479{
480 int length = trailingBytesForUTF8[*source]+1;
481 if (source+length > sourceEnd) {
482 return false;
483 }
484 return isLegalUTF8(source, length);
485}
486
487/* --------------------------------------------------------------------- */
488
489icUtfConversionResult icConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
490 UTF16** targetStart, UTF16* targetEnd, icUtfConversionFlags flags)
491{
493 const UTF8* source = *sourceStart;
494 UTF16* target = *targetStart;
495 while (source < sourceEnd) {
496 UTF32 ch = 0;
497 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
498 if (source + extraBytesToRead >= sourceEnd) {
499 result = sourceExhausted; break;
500 }
501 /* Do this check whether lenient or strict */
502 if (! isLegalUTF8(source, extraBytesToRead+1)) {
503 result = sourceIllegal;
504 break;
505 }
506 /*
507 * The cases all fall through. See "Note A" below.
508 */
509 switch (extraBytesToRead) {
510 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
511 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
512 case 3: ch += *source++; ch <<= 6;
513 case 2: ch += *source++; ch <<= 6;
514 case 1: ch += *source++; ch <<= 6;
515 case 0: ch += *source++;
516 }
517 ch -= offsetsFromUTF8[extraBytesToRead];
518
519 if (target >= targetEnd) {
520 source -= (extraBytesToRead+1); /* Back up source pointer! */
521 result = targetExhausted; break;
522 }
523 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
524 /* UTF-16 surrogate values are illegal in UTF-32 */
525 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
526 if (flags == strictConversion) {
527 source -= (extraBytesToRead+1); /* return to the illegal value itself */
528 result = sourceIllegal;
529 break;
530 } else {
531 *target++ = UNI_REPLACEMENT_CHAR;
532 }
533 } else {
534 *target++ = (UTF16)ch; /* normal case */
535 }
536 } else if (ch > UNI_MAX_UTF16) {
537 if (flags == strictConversion) {
538 result = sourceIllegal;
539 source -= (extraBytesToRead+1); /* return to the start */
540 break; /* Bail out; shouldn't continue */
541 } else {
542 *target++ = UNI_REPLACEMENT_CHAR;
543 }
544 } else {
545 /* target is a character in range 0xFFFF - 0x10FFFF. */
546 if (target + 1 >= targetEnd) {
547 source -= (extraBytesToRead+1); /* Back up source pointer! */
548 result = targetExhausted; break;
549 }
550 ch -= halfBase;
551 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
552 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
553 }
554 }
555 *sourceStart = source;
556 *targetStart = target;
557 return result;
558}
559
560icUtfConversionResult icConvertUTF8toUTF16 (const UTF8* source, const UTF8* sourceEnd,
561 icUtf16Vector &target, icUtfConversionFlags flags)
562{
564 target.clear();
565 while (source < sourceEnd) {
566 UTF32 ch = 0;
567 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
568 if (source + extraBytesToRead >= sourceEnd) {
569 result = sourceExhausted; break;
570 }
571 /* Do this check whether lenient or strict */
572 if (! isLegalUTF8(source, extraBytesToRead+1)) {
573 result = sourceIllegal;
574 break;
575 }
576 /*
577 * The cases all fall through. See "Note A" below.
578 */
579 switch (extraBytesToRead) {
580 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
581 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
582 case 3: ch += *source++; ch <<= 6;
583 case 2: ch += *source++; ch <<= 6;
584 case 1: ch += *source++; ch <<= 6;
585 case 0: ch += *source++;
586 }
587 ch -= offsetsFromUTF8[extraBytesToRead];
588
589 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
590 /* UTF-16 surrogate values are illegal in UTF-32 */
591 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
592 if (flags == strictConversion) {
593 source -= (extraBytesToRead+1); /* return to the illegal value itself */
594 result = sourceIllegal;
595 break;
596 } else {
597 target.push_back(UNI_REPLACEMENT_CHAR);
598 }
599 } else {
600 target.push_back((UTF16)ch); /* normal case */
601 }
602 } else if (ch > UNI_MAX_UTF16) {
603 if (flags == strictConversion) {
604 result = sourceIllegal;
605 source -= (extraBytesToRead+1); /* return to the start */
606 break; /* Bail out; shouldn't continue */
607 } else {
608 target.push_back(UNI_REPLACEMENT_CHAR);
609 }
610 } else {
611 /* target is a character in range 0xFFFF - 0x10FFFF. */
612 ch -= halfBase;
613 target.push_back((UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START));
614 target.push_back((UTF16)((ch & halfMask) + UNI_SUR_LOW_START));
615 }
616 }
617 return result;
618}
619
620/* --------------------------------------------------------------------- */
621
622icUtfConversionResult icConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
623 UTF8** targetStart, UTF8* targetEnd, icUtfConversionFlags flags)
624{
626 const UTF32* source = *sourceStart;
627 UTF8* target = *targetStart;
628 while (source < sourceEnd) {
629 UTF32 ch;
630 unsigned short bytesToWrite = 0;
631 const UTF32 byteMask = 0xBF;
632 const UTF32 byteMark = 0x80;
633 ch = *source++;
634 if (flags == strictConversion ) {
635 /* UTF-16 surrogate values are illegal in UTF-32 */
636 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
637 --source; /* return to the illegal value itself */
638 result = sourceIllegal;
639 break;
640 }
641 }
642 /*
643 * Figure out how many bytes the result will require. Turn any
644 * illegally large UTF32 things (> Plane 17) into replacement chars.
645 */
646 if (ch < (UTF32)0x80) { bytesToWrite = 1;
647 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
648 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
649 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
650 } else { bytesToWrite = 3;
652 result = sourceIllegal;
653 }
654
655 target += bytesToWrite;
656 if (target > targetEnd) {
657 --source; /* Back up source pointer! */
658 target -= bytesToWrite; result = targetExhausted; break;
659 }
660 switch (bytesToWrite) { /* note: everything falls through. */
661 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
662 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
663 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
664 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
665 }
666 target += bytesToWrite;
667 }
668 *sourceStart = source;
669 *targetStart = target;
670 return result;
671}
672
673icUtfConversionResult icConvertUTF32toUTF8 (const UTF32* source, const UTF32* sourceEnd,
674 icUtf8Vector &target, icUtfConversionFlags flags)
675{
677 target.clear();
678 while (source < sourceEnd) {
679 UTF32 ch;
680 unsigned short bytesToWrite = 0;
681 const UTF32 byteMask = 0xBF;
682 const UTF32 byteMark = 0x80;
683 ch = *source++;
684 if (flags == strictConversion ) {
685 /* UTF-16 surrogate values are illegal in UTF-32 */
686 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
687 --source; /* return to the illegal value itself */
688 result = sourceIllegal;
689 break;
690 }
691 }
692 /*
693 * Figure out how many bytes the result will require. Turn any
694 * illegally large UTF32 things (> Plane 17) into replacement chars.
695 */
696 if (ch < (UTF32)0x80) { bytesToWrite = 1;
697 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
698 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
699 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
700 } else { bytesToWrite = 3;
702 result = sourceIllegal;
703 }
704
705 UTF8 buf[5], *ptr = &buf[bytesToWrite];
706 switch (bytesToWrite) { /* note: everything falls through. */
707 case 4: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
708 case 3: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
709 case 2: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
710 case 1: *--ptr = (UTF8)(ch | firstByteMark[bytesToWrite]);
711 }
712 switch(bytesToWrite) {
713 case 4: target.push_back(*ptr++);
714 case 3: target.push_back(*ptr++);
715 case 2: target.push_back(*ptr++);
716 case 1: target.push_back(*ptr++);
717 }
718 }
719 return result;
720}
721/* --------------------------------------------------------------------- */
722
723icUtfConversionResult icConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
724 UTF32** targetStart, UTF32* targetEnd, icUtfConversionFlags flags)
725{
727 const UTF8* source = *sourceStart;
728 UTF32* target = *targetStart;
729 while (source < sourceEnd) {
730 UTF32 ch = 0;
731 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
732 if (source + extraBytesToRead >= sourceEnd) {
733 result = sourceExhausted; break;
734 }
735 /* Do this check whether lenient or strict */
736 if (! isLegalUTF8(source, extraBytesToRead+1)) {
737 result = sourceIllegal;
738 break;
739 }
740 /*
741 * The cases all fall through. See "Note A" below.
742 */
743 switch (extraBytesToRead) {
744 case 5: ch += *source++; ch <<= 6;
745 case 4: ch += *source++; ch <<= 6;
746 case 3: ch += *source++; ch <<= 6;
747 case 2: ch += *source++; ch <<= 6;
748 case 1: ch += *source++; ch <<= 6;
749 case 0: ch += *source++;
750 }
751 ch -= offsetsFromUTF8[extraBytesToRead];
752
753 if (target >= targetEnd) {
754 source -= (extraBytesToRead+1); /* Back up the source pointer! */
755 result = targetExhausted; break;
756 }
757 if (ch <= UNI_MAX_LEGAL_UTF32) {
758 /*
759 * UTF-16 surrogate values are illegal in UTF-32, and anything
760 * over Plane 17 (> 0x10FFFF) is illegal.
761 */
762 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
763 if (flags == strictConversion) {
764 source -= (extraBytesToRead+1); /* return to the illegal value itself */
765 result = sourceIllegal;
766 break;
767 } else {
768 *target++ = UNI_REPLACEMENT_CHAR;
769 }
770 } else {
771 *target++ = ch;
772 }
773 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
774 result = sourceIllegal;
775 *target++ = UNI_REPLACEMENT_CHAR;
776 }
777 }
778 *sourceStart = source;
779 *targetStart = target;
780 return result;
781}
782
783icUtfConversionResult icConvertUTF8toUTF32 (const UTF8* source, const UTF8* sourceEnd,
784 icUtf32Vector &target, icUtfConversionFlags flags)
785{
787 target.clear();
788 while (source < sourceEnd) {
789 UTF32 ch = 0;
790 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
791 if (source + extraBytesToRead >= sourceEnd) {
792 result = sourceExhausted; break;
793 }
794 /* Do this check whether lenient or strict */
795 if (! isLegalUTF8(source, extraBytesToRead+1)) {
796 result = sourceIllegal;
797 break;
798 }
799 /*
800 * The cases all fall through. See "Note A" below.
801 */
802 switch (extraBytesToRead) {
803 case 5: ch += *source++; ch <<= 6;
804 case 4: ch += *source++; ch <<= 6;
805 case 3: ch += *source++; ch <<= 6;
806 case 2: ch += *source++; ch <<= 6;
807 case 1: ch += *source++; ch <<= 6;
808 case 0: ch += *source++;
809 }
810 ch -= offsetsFromUTF8[extraBytesToRead];
811
812 if (ch <= UNI_MAX_LEGAL_UTF32) {
813 /*
814 * UTF-16 surrogate values are illegal in UTF-32, and anything
815 * over Plane 17 (> 0x10FFFF) is illegal.
816 */
817 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
818 if (flags == strictConversion) {
819 source -= (extraBytesToRead+1); /* return to the illegal value itself */
820 result = sourceIllegal;
821 break;
822 } else {
823 target.push_back(UNI_REPLACEMENT_CHAR);
824 }
825 } else {
826 target.push_back(ch);
827 }
828 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
829 result = sourceIllegal;
830 target.push_back(UNI_REPLACEMENT_CHAR);
831 }
832 }
833
834 return result;
835}
836
837/* ---------------------------------------------------------------------
838
839Note A.
840The fall-through switches in UTF-8 reading code save a
841temp variable, some decrements & conditionals. The switches
842are equivalent to the following loop:
843{
844int tmpBytesToRead = extraBytesToRead+1;
845do {
846ch += *source++;
847--tmpBytesToRead;
848if (tmpBytesToRead) ch <<= 6;
849} while (tmpBytesToRead > 0);
850}
851In UTF-8 writing code, the switches on "bytesToWrite" are
852similarly unrolled loops.
853
854--------------------------------------------------------------------- */
Boolean icIsLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
#define UNI_SUR_LOW_START
static const char trailingBytesForUTF8[256]
static Boolean isLegalUTF8(const UTF8 *source, int length)
static const UTF32 halfMask
#define UNI_SUR_HIGH_START
icUtfConversionResult icConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)
icUtfConversionResult icConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)
icUtfConversionResult icConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)
static const UTF8 firstByteMark[7]
static const UTF32 halfBase
static const int halfShift
icUtfConversionResult icConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, icUtfConversionFlags flags)
icUtfConversionResult icConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, icUtfConversionFlags flags)
icUtfConversionResult icConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, icUtfConversionFlags flags)
#define UNI_SUR_LOW_END
#define UNI_SUR_HIGH_END
static const UTF32 offsetsFromUTF8[6]
uint32_t UTF32
#define UNI_REPLACEMENT_CHAR
unsigned short UTF16
#define UNI_MAX_UTF16
icUtfConversionResult
@ targetExhausted
@ sourceIllegal
@ conversionOK
@ sourceExhausted
#define UNI_MAX_LEGAL_UTF32
unsigned char UTF8
unsigned char Boolean
#define UNI_MAX_BMP
icUtfConversionFlags
@ strictConversion