Bug Summary

File:IccProfLib/IccConvertUTF.cpp
Warning:line 819, column 11
Value stored to 'source' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-apple-macosx14.0.0 -Wundef-prefix=TARGET_OS_ -Werror=undef-prefix -Wdeprecated-objc-isa-usage -Werror=deprecated-objc-isa-usage -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name IccConvertUTF.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=osx -analyzer-checker=security.insecureAPI.decodeValueOfObjCType -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -mframe-pointer=all -ffp-contract=on -fno-rounding-math -funwind-tables=2 -target-sdk-version=14.0 -fcompatibility-qualified-id-block-type-checking -fvisibility-inlines-hidden-static-local-var -target-cpu penryn -tune-cpu generic -debugger-tuning=lldb -target-linker-version 1015.7 -fprofile-instrument=clang -fcoverage-mapping -fcoverage-compilation-dir=/Users/xss/DemoIccMAX-hoyt-master/build/IccProfLib -resource-dir /usr/local/Cellar/llvm/17.0.3/lib/clang/17 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk -D IccProfLib2_EXPORTS -I /Users/xss/DemoIccMAX-hoyt-master/build/Cmake/../../IccProfLib -I /Developer/Headers/FlatCarbon -F/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk/System/Library/Frameworks -internal-isystem /usr/local/Cellar/llvm/17.0.3/bin/../include/c++/v1 -internal-isystem /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk/usr/local/include 
-internal-isystem /usr/local/Cellar/llvm/17.0.3/lib/clang/17/include -internal-externc-isystem /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk/usr/include -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/Users/xss/DemoIccMAX-hoyt-master/build/IccProfLib -ferror-limit 19 -fsanitize=address -fsanitize-system-ignorelist=/usr/local/Cellar/llvm/17.0.3/lib/clang/17/share/asan_ignorelist.txt -fno-sanitize-memory-param-retval -fsanitize-address-use-after-scope -fsanitize-address-globals-dead-stripping -fno-assume-sane-operator-new -stack-protector 1 -fblocks -fencode-extended-block-signature -fregister-global-dtors-with-atexit -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fmax-type-align=16 -analyzer-output=html -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /var/folders/l9/sd9kj1px4yq2wc5_lkwhlt6w0000gn/T/scan-build-2023-10-28-102616-57860-1 -x c++ /Users/xss/DemoIccMAX-hoyt-master/IccProfLib/IccConvertUTF.cpp
1/*
2* Copyright 2001-2004 Unicode, Inc.
3*
4* Disclaimer
5*
6* This source code is provided as is by Unicode, Inc. No claims are
7* made as to fitness for any particular purpose. No warranties of any
8* kind are expressed or implied. The recipient agrees to determine
9* applicability of information provided. If this file has been
10* purchased on magnetic or optical media from Unicode, Inc., the
11* sole remedy for any claim will be exchange of defective media
12* within 90 days of receipt.
13*
14* Limitations on Rights to Redistribute This Code
15*
16* Unicode, Inc. hereby grants the right to freely use the information
17* supplied in this file in the creation of products supporting the
18* Unicode Standard, and to make copies of this file in any form
19* for internal or external distribution as long as this notice
20* remains attached.
21*/
22
23/* ---------------------------------------------------------------------
24
25Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26Author: Mark E. Davis, 1994.
27Rev History: Rick McGowan, fixes & updates May 2001.
28Sept 2001: fixed const & error conditions per
29mods suggested by S. Parent & A. Lillich.
30June 2002: Tim Dodd added detection and handling of incomplete
31source sequences, enhanced error detection, added casts
32to eliminate compiler warnings.
33July 2003: slight mods to back out aggressive FFFE detection.
34Jan 2004: updated switches in from-UTF8 conversions.
35Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37See the header file "IccConvertUTF.h" for complete documentation.
38
39------------------------------------------------------------------------ */
40
41
42#include "IccConvertUTF.h"
43#ifdef CVTUTF_DEBUG
44#include <stdio.h>
45#endif
46
47static const int halfShift = 10; /* used for shifting by 10 bits */
48
49static const UTF32 halfBase = 0x0010000UL;
50static const UTF32 halfMask = 0x3FFUL;
51
52#define UNI_SUR_HIGH_START(UTF32)0xD800 (UTF32)0xD800
53#define UNI_SUR_HIGH_END(UTF32)0xDBFF (UTF32)0xDBFF
54#define UNI_SUR_LOW_START(UTF32)0xDC00 (UTF32)0xDC00
55#define UNI_SUR_LOW_END(UTF32)0xDFFF (UTF32)0xDFFF
56#define false0 0
57#define true1 1
58
59/* --------------------------------------------------------------------- */
60
61icUtfConversionResult icConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
62 UTF16** targetStart, UTF16* targetEnd, icUtfConversionFlags flags)
63{
64 icUtfConversionResult result = conversionOK;
65 const UTF32* source = *sourceStart;
66 UTF16* target = *targetStart;
67 while (source < sourceEnd) {
68 UTF32 ch;
69 if (target >= targetEnd) {
70 result = targetExhausted; break;
71 }
72 ch = *source++;
73 if (ch <= UNI_MAX_BMP(UTF32)0x0000FFFF) { /* Target is a character <= 0xFFFF */
74 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
76 if (flags == strictConversion) {
77 --source; /* return to the illegal value itself */
78 result = sourceIllegal;
79 break;
80 } else {
81 *target++ = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
82 }
83 } else {
84 *target++ = (UTF16)ch; /* normal case */
85 }
86 } else if (ch > UNI_MAX_LEGAL_UTF32(UTF32)0x0010FFFF) {
87 if (flags == strictConversion) {
88 result = sourceIllegal;
89 } else {
90 *target++ = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
91 }
92 } else {
93 /* target is a character in range 0xFFFF - 0x10FFFF. */
94 if (target + 1 >= targetEnd) {
95 --source; /* Back up source pointer! */
96 result = targetExhausted; break;
97 }
98 ch -= halfBase;
99 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START(UTF32)0xD800);
100 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START(UTF32)0xDC00);
101 }
102 }
103 *sourceStart = source;
104 *targetStart = target;
105 return result;
106}
107
108icUtfConversionResult icConvertUTF32toUTF16 (const UTF32* source, const UTF32* sourceEnd,
109 icUtf16Vector &target, icUtfConversionFlags flags)
110{
111 icUtfConversionResult result = conversionOK;
112 target.clear();
113 while (source < sourceEnd) {
114 UTF32 ch;
115 ch = *source++;
116 if (ch <= UNI_MAX_BMP(UTF32)0x0000FFFF) { /* Target is a character <= 0xFFFF */
117 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
118 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
119 if (flags == strictConversion) {
120 --source; /* return to the illegal value itself */
121 result = sourceIllegal;
122 break;
123 } else {
124 target.push_back(UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD);
125 }
126 } else {
127 target.push_back((UTF16)ch); /* normal case */
128 }
129 } else if (ch > UNI_MAX_LEGAL_UTF32(UTF32)0x0010FFFF) {
130 if (flags == strictConversion) {
131 result = sourceIllegal;
132 } else {
133 target.push_back(UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD);
134 }
135 } else {
136 ch -= halfBase;
137 target.push_back((UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START(UTF32)0xD800));
138 target.push_back((UTF16)((ch & halfMask) + UNI_SUR_LOW_START(UTF32)0xDC00));
139 }
140 }
141 return result;
142}
143
144
145/* --------------------------------------------------------------------- */
146
147icUtfConversionResult icConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
148 UTF32** targetStart, UTF32* targetEnd, icUtfConversionFlags flags)
149{
150 icUtfConversionResult result = conversionOK;
151 const UTF16* source = *sourceStart;
152 UTF32* target = *targetStart;
153 UTF32 ch, ch2;
154 while (source < sourceEnd) {
155 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
156 ch = *source++;
157 /* If we have a surrogate pair, convert to UTF32 first. */
158 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_HIGH_END(UTF32)0xDBFF) {
159 /* If the 16 bits following the high surrogate are in the source buffer... */
160 if (source < sourceEnd) {
161 ch2 = *source;
162 /* If it's a low surrogate, convert to UTF32. */
163 if (ch2 >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch2 <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
164 ch = ((ch - UNI_SUR_HIGH_START(UTF32)0xD800) << halfShift)
165 + (ch2 - UNI_SUR_LOW_START(UTF32)0xDC00) + halfBase;
166 ++source;
167 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
168 --source; /* return to the illegal value itself */
169 result = sourceIllegal;
170 break;
171 }
172 } else { /* We don't have the 16 bits following the high surrogate. */
173 --source; /* return to the high surrogate */
174 result = sourceExhausted;
175 break;
176 }
177 } else if (flags == strictConversion) {
178 /* UTF-16 surrogate values are illegal in UTF-32 */
179 if (ch >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
180 --source; /* return to the illegal value itself */
181 result = sourceIllegal;
182 break;
183 }
184 }
185 if (target >= targetEnd) {
186 source = oldSource; /* Back up source pointer! */
187 result = targetExhausted; break;
188 }
189 *target++ = ch;
190 }
191 *sourceStart = source;
192 *targetStart = target;
193#ifdef CVTUTF_DEBUG
194 if (result == sourceIllegal) {
195 fprintf(stderr__stderrp, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
196 fflush(stderr__stderrp);
197 }
198#endif
199 return result;
200}
201
202icUtfConversionResult icConvertUTF16toUTF32 (const UTF16* source, const UTF16* sourceEnd,
203 icUtf32Vector target, UTF32* targetEnd, icUtfConversionFlags flags)
204{
205 icUtfConversionResult result = conversionOK;
206 target.clear();
207 UTF32 ch, ch2;
208 while (source < sourceEnd) {
209 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
210 ch = *source++;
211 /* If we have a surrogate pair, convert to UTF32 first. */
212 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_HIGH_END(UTF32)0xDBFF) {
213 /* If the 16 bits following the high surrogate are in the source buffer... */
214 if (source < sourceEnd) {
215 ch2 = *source;
216 /* If it's a low surrogate, convert to UTF32. */
217 if (ch2 >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch2 <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
218 ch = ((ch - UNI_SUR_HIGH_START(UTF32)0xD800) << halfShift)
219 + (ch2 - UNI_SUR_LOW_START(UTF32)0xDC00) + halfBase;
220 ++source;
221 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
222 --source; /* return to the illegal value itself */
223 result = sourceIllegal;
224 break;
225 }
226 } else { /* We don't have the 16 bits following the high surrogate. */
227 --source; /* return to the high surrogate */
228 result = sourceExhausted;
229 break;
230 }
231 } else if (flags == strictConversion) {
232 /* UTF-16 surrogate values are illegal in UTF-32 */
233 if (ch >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
234 --source; /* return to the illegal value itself */
235 result = sourceIllegal;
236 break;
237 }
238 }
239 target.push_back(ch);
240 }
241#ifdef CVTUTF_DEBUG
242 if (result == sourceIllegal) {
243 fprintf(stderr__stderrp, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
244 fflush(stderr__stderrp);
245 }
246#endif
247 return result;
248}
249
250/* --------------------------------------------------------------------- */
251
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
269
270/*
271* Magic values subtracted from a buffer value during UTF8 conversion.
272* This table contains as many values as there might be trailing bytes
273* in a UTF-8 sequence.
274*/
275static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
2760x03C82080UL, 0xFA082080UL, 0x82082080UL };
277
278/*
279* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
280* into the first byte, depending on how many bytes follow. There are
281* as many entries in this table as there are UTF-8 sequence types.
282* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
283* for *legal* UTF-8 will be 4 or fewer bytes total.
284*/
285static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
286
287/* --------------------------------------------------------------------- */
288
289/* The interface converts a whole buffer to avoid function-call overhead.
290* Constants have been gathered. Loops & conditionals have been removed as
291* much as possible for efficiency, in favor of drop-through switches.
292* (See "Note A" at the bottom of the file for equivalent code.)
293* If your compiler supports it, the "isLegalUTF8" call can be turned
294* into an inline function.
295*/
296
297/* --------------------------------------------------------------------- */
298
299icUtfConversionResult icConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
300 UTF8** targetStart, UTF8* targetEnd, icUtfConversionFlags flags)
301{
302 icUtfConversionResult result = conversionOK;
303 const UTF16* source = *sourceStart;
304 UTF8* target = *targetStart;
305 while (source < sourceEnd) {
306 UTF32 ch;
307 unsigned short bytesToWrite = 0;
308 const UTF32 byteMask = 0xBF;
309 const UTF32 byteMark = 0x80;
310 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
311 ch = *source++;
312 /* If we have a surrogate pair, convert to UTF32 first. */
313 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_HIGH_END(UTF32)0xDBFF) {
314 /* If the 16 bits following the high surrogate are in the source buffer... */
315 if (source < sourceEnd) {
316 UTF32 ch2 = *source;
317 /* If it's a low surrogate, convert to UTF32. */
318 if (ch2 >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch2 <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
319 ch = ((ch - UNI_SUR_HIGH_START(UTF32)0xD800) << halfShift)
320 + (ch2 - UNI_SUR_LOW_START(UTF32)0xDC00) + halfBase;
321 ++source;
322 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
323 --source; /* return to the illegal value itself */
324 result = sourceIllegal;
325 break;
326 }
327 } else { /* We don't have the 16 bits following the high surrogate. */
328 --source; /* return to the high surrogate */
329 result = sourceExhausted;
330 break;
331 }
332 } else if (flags == strictConversion) {
333 /* UTF-16 surrogate values are illegal in UTF-32 */
334 if (ch >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
335 --source; /* return to the illegal value itself */
336 result = sourceIllegal;
337 break;
338 }
339 }
340 /* Figure out how many bytes the result will require */
341 if (ch < (UTF32)0x80) { bytesToWrite = 1;
342 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
343 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
344 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
345 } else { bytesToWrite = 3;
346 ch = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
347 }
348
349 target += bytesToWrite;
350 if (target > targetEnd) {
351 source = oldSource; /* Back up source pointer! */
352 target -= bytesToWrite; result = targetExhausted; break;
353 }
354 switch (bytesToWrite) { /* note: everything falls through. */
355 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
356 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
357 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
359 }
360 target += bytesToWrite;
361 }
362 *sourceStart = source;
363 *targetStart = target;
364 return result;
365}
366
367icUtfConversionResult icConvertUTF16toUTF8 (const UTF16* source, const UTF16* sourceEnd,
368 icUtf8Vector &target, icUtfConversionFlags flags)
369{
370 icUtfConversionResult result = conversionOK;
371 target.clear();
372 while (source < sourceEnd) {
373 UTF32 ch;
374 unsigned short bytesToWrite = 0;
375 const UTF32 byteMask = 0xBF;
376 const UTF32 byteMark = 0x80;
377 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
378 ch = *source++;
379 /* If we have a surrogate pair, convert to UTF32 first. */
380 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_HIGH_END(UTF32)0xDBFF) {
381 /* If the 16 bits following the high surrogate are in the source buffer... */
382 if (source < sourceEnd) {
383 UTF32 ch2 = *source;
384 /* If it's a low surrogate, convert to UTF32. */
385 if (ch2 >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch2 <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
386 ch = ((ch - UNI_SUR_HIGH_START(UTF32)0xD800) << halfShift)
387 + (ch2 - UNI_SUR_LOW_START(UTF32)0xDC00) + halfBase;
388 ++source;
389 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
390 --source; /* return to the illegal value itself */
391 result = sourceIllegal;
392 break;
393 }
394 } else { /* We don't have the 16 bits following the high surrogate. */
395 --source; /* return to the high surrogate */
396 result = sourceExhausted;
397 break;
398 }
399 } else if (flags == strictConversion) {
400 /* UTF-16 surrogate values are illegal in UTF-32 */
401 if (ch >= UNI_SUR_LOW_START(UTF32)0xDC00 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
402 --source; /* return to the illegal value itself */
403 result = sourceIllegal;
404 break;
405 }
406 }
407 /* Figure out how many bytes the result will require */
408 if (ch < (UTF32)0x80) { bytesToWrite = 1;
409 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
410 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
411 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
412 } else { bytesToWrite = 3;
413 ch = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
414 }
415
416 UTF8 buf[5], *ptr = &buf[bytesToWrite];
417 switch (bytesToWrite) { /* note: everything falls through. */
418 case 4: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
419 case 3: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
420 case 2: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
421 case 1: *--ptr = (UTF8)(ch | firstByteMark[bytesToWrite]);
422 }
423 switch(bytesToWrite) {
424 case 4: target.push_back(*ptr++);
425 case 3: target.push_back(*ptr++);
426 case 2: target.push_back(*ptr++);
427 case 1: target.push_back(*ptr++);
428 }
429 }
430 return result;
431}
432
433/* --------------------------------------------------------------------- */
434
435/*
436* Utility routine to tell whether a sequence of bytes is legal UTF-8.
437* This must be called with the length pre-determined by the first byte.
438* If not calling this from ConvertUTF8to*, then the length can be set by:
439* length = trailingBytesForUTF8[*source]+1;
440* and the sequence is illegal right away if there aren't that many bytes
441* available.
442* If presented with a length > 4, this returns false. The Unicode
443* definition of UTF-8 goes up to 4-byte sequences.
444*/
445
446static Boolean isLegalUTF8(const UTF8 *source, int length)
447{
448 UTF8 a;
449 const UTF8 *srcptr = source+length;
450 switch (length) {
451 default: return false0;
452 /* Everything else falls through when "true"... */
453 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false0;
454 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false0;
455 case 2: if ((a = (*--srcptr)) > 0xBF) return false0;
456
457 switch (*source) {
458 /* no fall-through in this inner switch */
459 case 0xE0: if (a < 0xA0) return false0; break;
460 case 0xED: if (a > 0x9F) return false0; break;
461 case 0xF0: if (a < 0x90) return false0; break;
462 case 0xF4: if (a > 0x8F) return false0; break;
463 default: if (a < 0x80) return false0;
464 }
465
466 case 1: if (*source >= 0x80 && *source < 0xC2) return false0;
467 }
468 if (*source > 0xF4) return false0;
469 return true1;
470}
471
472/* --------------------------------------------------------------------- */
473
474/*
475* Exported function to return whether a UTF-8 sequence is legal or not.
476* This is not used here; it's just exported.
477*/
478Boolean icIsLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
479{
480 int length = trailingBytesForUTF8[*source]+1;
481 if (source+length > sourceEnd) {
482 return false0;
483 }
484 return isLegalUTF8(source, length);
485}
486
487/* --------------------------------------------------------------------- */
488
489icUtfConversionResult icConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
490 UTF16** targetStart, UTF16* targetEnd, icUtfConversionFlags flags)
491{
492 icUtfConversionResult result = conversionOK;
493 const UTF8* source = *sourceStart;
494 UTF16* target = *targetStart;
495 while (source < sourceEnd) {
496 UTF32 ch = 0;
497 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
498 if (source + extraBytesToRead >= sourceEnd) {
499 result = sourceExhausted; break;
500 }
501 /* Do this check whether lenient or strict */
502 if (! isLegalUTF8(source, extraBytesToRead+1)) {
503 result = sourceIllegal;
504 break;
505 }
506 /*
507 * The cases all fall through. See "Note A" below.
508 */
509 switch (extraBytesToRead) {
510 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
511 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
512 case 3: ch += *source++; ch <<= 6;
513 case 2: ch += *source++; ch <<= 6;
514 case 1: ch += *source++; ch <<= 6;
515 case 0: ch += *source++;
516 }
517 ch -= offsetsFromUTF8[extraBytesToRead];
518
519 if (target >= targetEnd) {
520 source -= (extraBytesToRead+1); /* Back up source pointer! */
521 result = targetExhausted; break;
522 }
523 if (ch <= UNI_MAX_BMP(UTF32)0x0000FFFF) { /* Target is a character <= 0xFFFF */
524 /* UTF-16 surrogate values are illegal in UTF-32 */
525 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
526 if (flags == strictConversion) {
527 source -= (extraBytesToRead+1); /* return to the illegal value itself */
528 result = sourceIllegal;
529 break;
530 } else {
531 *target++ = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
532 }
533 } else {
534 *target++ = (UTF16)ch; /* normal case */
535 }
536 } else if (ch > UNI_MAX_UTF16(UTF32)0x0010FFFF) {
537 if (flags == strictConversion) {
538 result = sourceIllegal;
539 source -= (extraBytesToRead+1); /* return to the start */
540 break; /* Bail out; shouldn't continue */
541 } else {
542 *target++ = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
543 }
544 } else {
545 /* target is a character in range 0xFFFF - 0x10FFFF. */
546 if (target + 1 >= targetEnd) {
547 source -= (extraBytesToRead+1); /* Back up source pointer! */
548 result = targetExhausted; break;
549 }
550 ch -= halfBase;
551 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START(UTF32)0xD800);
552 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START(UTF32)0xDC00);
553 }
554 }
555 *sourceStart = source;
556 *targetStart = target;
557 return result;
558}
559
560icUtfConversionResult icConvertUTF8toUTF16 (const UTF8* source, const UTF8* sourceEnd,
561 icUtf16Vector &target, icUtfConversionFlags flags)
562{
563 icUtfConversionResult result = conversionOK;
564 target.clear();
565 while (source < sourceEnd) {
566 UTF32 ch = 0;
567 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
568 if (source + extraBytesToRead >= sourceEnd) {
569 result = sourceExhausted; break;
570 }
571 /* Do this check whether lenient or strict */
572 if (! isLegalUTF8(source, extraBytesToRead+1)) {
573 result = sourceIllegal;
574 break;
575 }
576 /*
577 * The cases all fall through. See "Note A" below.
578 */
579 switch (extraBytesToRead) {
580 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
581 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
582 case 3: ch += *source++; ch <<= 6;
583 case 2: ch += *source++; ch <<= 6;
584 case 1: ch += *source++; ch <<= 6;
585 case 0: ch += *source++;
586 }
587 ch -= offsetsFromUTF8[extraBytesToRead];
588
589 if (ch <= UNI_MAX_BMP(UTF32)0x0000FFFF) { /* Target is a character <= 0xFFFF */
590 /* UTF-16 surrogate values are illegal in UTF-32 */
591 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
592 if (flags == strictConversion) {
593 source -= (extraBytesToRead+1); /* return to the illegal value itself */
594 result = sourceIllegal;
595 break;
596 } else {
597 target.push_back(UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD);
598 }
599 } else {
600 target.push_back((UTF16)ch); /* normal case */
601 }
602 } else if (ch > UNI_MAX_UTF16(UTF32)0x0010FFFF) {
603 if (flags == strictConversion) {
604 result = sourceIllegal;
605 source -= (extraBytesToRead+1); /* return to the start */
606 break; /* Bail out; shouldn't continue */
607 } else {
608 target.push_back(UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD);
609 }
610 } else {
611 /* target is a character in range 0xFFFF - 0x10FFFF. */
612 ch -= halfBase;
613 target.push_back((UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START(UTF32)0xD800));
614 target.push_back((UTF16)((ch & halfMask) + UNI_SUR_LOW_START(UTF32)0xDC00));
615 }
616 }
617 return result;
618}
619
620/* --------------------------------------------------------------------- */
621
622icUtfConversionResult icConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
623 UTF8** targetStart, UTF8* targetEnd, icUtfConversionFlags flags)
624{
625 icUtfConversionResult result = conversionOK;
626 const UTF32* source = *sourceStart;
627 UTF8* target = *targetStart;
628 while (source < sourceEnd) {
629 UTF32 ch;
630 unsigned short bytesToWrite = 0;
631 const UTF32 byteMask = 0xBF;
632 const UTF32 byteMark = 0x80;
633 ch = *source++;
634 if (flags == strictConversion ) {
635 /* UTF-16 surrogate values are illegal in UTF-32 */
636 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
637 --source; /* return to the illegal value itself */
638 result = sourceIllegal;
639 break;
640 }
641 }
642 /*
643 * Figure out how many bytes the result will require. Turn any
644 * illegally large UTF32 things (> Plane 17) into replacement chars.
645 */
646 if (ch < (UTF32)0x80) { bytesToWrite = 1;
647 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
648 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
649 } else if (ch <= UNI_MAX_LEGAL_UTF32(UTF32)0x0010FFFF) { bytesToWrite = 4;
650 } else { bytesToWrite = 3;
651 ch = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
652 result = sourceIllegal;
653 }
654
655 target += bytesToWrite;
656 if (target > targetEnd) {
657 --source; /* Back up source pointer! */
658 target -= bytesToWrite; result = targetExhausted; break;
659 }
660 switch (bytesToWrite) { /* note: everything falls through. */
661 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
662 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
663 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
664 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
665 }
666 target += bytesToWrite;
667 }
668 *sourceStart = source;
669 *targetStart = target;
670 return result;
671}
672
673icUtfConversionResult icConvertUTF32toUTF8 (const UTF32* source, const UTF32* sourceEnd,
674 icUtf8Vector &target, icUtfConversionFlags flags)
675{
676 icUtfConversionResult result = conversionOK;
677 target.clear();
678 while (source < sourceEnd) {
679 UTF32 ch;
680 unsigned short bytesToWrite = 0;
681 const UTF32 byteMask = 0xBF;
682 const UTF32 byteMark = 0x80;
683 ch = *source++;
684 if (flags == strictConversion ) {
685 /* UTF-16 surrogate values are illegal in UTF-32 */
686 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
687 --source; /* return to the illegal value itself */
688 result = sourceIllegal;
689 break;
690 }
691 }
692 /*
693 * Figure out how many bytes the result will require. Turn any
694 * illegally large UTF32 things (> Plane 17) into replacement chars.
695 */
696 if (ch < (UTF32)0x80) { bytesToWrite = 1;
697 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
698 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
699 } else if (ch <= UNI_MAX_LEGAL_UTF32(UTF32)0x0010FFFF) { bytesToWrite = 4;
700 } else { bytesToWrite = 3;
701 ch = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
702 result = sourceIllegal;
703 }
704
705 UTF8 buf[5], *ptr = &buf[bytesToWrite];
706 switch (bytesToWrite) { /* note: everything falls through. */
707 case 4: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
708 case 3: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
709 case 2: *--ptr = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
710 case 1: *--ptr = (UTF8)(ch | firstByteMark[bytesToWrite]);
711 }
712 switch(bytesToWrite) {
713 case 4: target.push_back(*ptr++);
714 case 3: target.push_back(*ptr++);
715 case 2: target.push_back(*ptr++);
716 case 1: target.push_back(*ptr++);
717 }
718 }
719 return result;
720}
721/* --------------------------------------------------------------------- */
722
723icUtfConversionResult icConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
724 UTF32** targetStart, UTF32* targetEnd, icUtfConversionFlags flags)
725{
726 icUtfConversionResult result = conversionOK;
727 const UTF8* source = *sourceStart;
728 UTF32* target = *targetStart;
729 while (source < sourceEnd) {
730 UTF32 ch = 0;
731 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
732 if (source + extraBytesToRead >= sourceEnd) {
733 result = sourceExhausted; break;
734 }
735 /* Do this check whether lenient or strict */
736 if (! isLegalUTF8(source, extraBytesToRead+1)) {
737 result = sourceIllegal;
738 break;
739 }
740 /*
741 * The cases all fall through. See "Note A" below.
742 */
743 switch (extraBytesToRead) {
744 case 5: ch += *source++; ch <<= 6;
745 case 4: ch += *source++; ch <<= 6;
746 case 3: ch += *source++; ch <<= 6;
747 case 2: ch += *source++; ch <<= 6;
748 case 1: ch += *source++; ch <<= 6;
749 case 0: ch += *source++;
750 }
751 ch -= offsetsFromUTF8[extraBytesToRead];
752
753 if (target >= targetEnd) {
754 source -= (extraBytesToRead+1); /* Back up the source pointer! */
755 result = targetExhausted; break;
756 }
757 if (ch <= UNI_MAX_LEGAL_UTF32(UTF32)0x0010FFFF) {
758 /*
759 * UTF-16 surrogate values are illegal in UTF-32, and anything
760 * over Plane 17 (> 0x10FFFF) is illegal.
761 */
762 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
763 if (flags == strictConversion) {
764 source -= (extraBytesToRead+1); /* return to the illegal value itself */
765 result = sourceIllegal;
766 break;
767 } else {
768 *target++ = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
769 }
770 } else {
771 *target++ = ch;
772 }
773 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
774 result = sourceIllegal;
775 *target++ = UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD;
776 }
777 }
778 *sourceStart = source;
779 *targetStart = target;
780 return result;
781}
782
783icUtfConversionResult icConvertUTF8toUTF32 (const UTF8* source, const UTF8* sourceEnd,
784 icUtf32Vector &target, icUtfConversionFlags flags)
785{
786 icUtfConversionResult result = conversionOK;
787 target.clear();
788 while (source < sourceEnd) {
789 UTF32 ch = 0;
790 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
791 if (source + extraBytesToRead >= sourceEnd) {
792 result = sourceExhausted; break;
793 }
794 /* Do this check whether lenient or strict */
795 if (! isLegalUTF8(source, extraBytesToRead+1)) {
796 result = sourceIllegal;
797 break;
798 }
799 /*
800 * The cases all fall through. See "Note A" below.
801 */
802 switch (extraBytesToRead) {
803 case 5: ch += *source++; ch <<= 6;
804 case 4: ch += *source++; ch <<= 6;
805 case 3: ch += *source++; ch <<= 6;
806 case 2: ch += *source++; ch <<= 6;
807 case 1: ch += *source++; ch <<= 6;
808 case 0: ch += *source++;
809 }
810 ch -= offsetsFromUTF8[extraBytesToRead];
811
812 if (ch <= UNI_MAX_LEGAL_UTF32(UTF32)0x0010FFFF) {
813 /*
814 * UTF-16 surrogate values are illegal in UTF-32, and anything
815 * over Plane 17 (> 0x10FFFF) is illegal.
816 */
817 if (ch >= UNI_SUR_HIGH_START(UTF32)0xD800 && ch <= UNI_SUR_LOW_END(UTF32)0xDFFF) {
818 if (flags == strictConversion) {
819 source -= (extraBytesToRead+1); /* return to the illegal value itself */
Value stored to 'source' is never read
820 result = sourceIllegal;
821 break;
822 } else {
823 target.push_back(UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD);
824 }
825 } else {
826 target.push_back(ch);
827 }
828 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
829 result = sourceIllegal;
830 target.push_back(UNI_REPLACEMENT_CHAR(UTF32)0x0000FFFD);
831 }
832 }
833
834 return result;
835}
836
837/* ---------------------------------------------------------------------
838
839Note A.
840The fall-through switches in UTF-8 reading code save a
841temp variable, some decrements & conditionals. The switches
842are equivalent to the following loop:
843{
844int tmpBytesToRead = extraBytesToRead+1;
845do {
846ch += *source++;
847--tmpBytesToRead;
848if (tmpBytesToRead) ch <<= 6;
849} while (tmpBytesToRead > 0);
850}
851In UTF-8 writing code, the switches on "bytesToWrite" are
852similarly unrolled loops.
853
854--------------------------------------------------------------------- */