UTF8和UTF16和UTF32之间的相互转化
2013-10-21 11:46
316 查看
/* ================================================================ */
/*
File:   ConvertUTF.C
Author: Mark E. Davis
Copyright (C) 1994 Taligent, Inc. All rights reserved.

This code is copyrighted. Under the copyright laws, this code may not
be copied, in whole or part, without prior written consent of Taligent.

Taligent grants the right to use or reprint this code as long as this
ENTIRE copyright notice is reproduced in the code or reproduction.
The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
IN NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER
(INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS,
BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN IF
TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. BECAUSE
SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF LIABILITY FOR
CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY NOT APPLY
TO YOU.

RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
government is subject to restrictions as set forth in subparagraph
(c)(l)(ii) of the Rights in Technical Data and Computer Software clause
at DFARS 252.227-7013 and FAR 52.227-19.

This code may be protected by one or more U.S. and International Patents.

TRADEMARKS: Taligent and the Taligent Design Mark are registered
trademarks of Taligent, Inc.
*/
/* ================================================================ */

#include "ConvertUTF.h"

/* ================================================================ */
/* Surrogate-pair arithmetic: a value at or above halfBase is split into
   two halfShift-bit halves that are added to the high/low surrogate
   start values. */

const int halfShift = 10;
const UCS4 halfBase = 0x0010000UL;
const UCS4 halfMask = 0x3FFUL;
const UCS4 kSurrogateHighStart = 0xD800UL;
const UCS4 kSurrogateHighEnd = 0xDBFFUL;
const UCS4 kSurrogateLowStart = 0xDC00UL;
const UCS4 kSurrogateLowEnd = 0xDFFFUL;

/* ================================================================ */
/*
 * Common contract of every Convert* routine in this file:
 *
 *   - *sourceStart / *targetStart point at the first unread input unit
 *     and the first free output unit; sourceEnd / targetEnd point one
 *     past the end of each buffer.
 *   - On return *sourceStart and *targetStart have been advanced past
 *     everything that was converted.
 *   - kUTFConversionOK              all input was converted.
 *   - kUTFConversionTargetExhausted the next character does not fit in
 *     the output; *sourceStart is left AT that character so the call
 *     can be repeated with a fresh target buffer without losing it.
 *   - kUTFConversionSourceExhausted (UTF-8 input only) the input ends
 *     in the middle of a multi-byte sequence; *sourceStart is left at
 *     the lead byte of that sequence.
 */
/* ================================================================ */

/*
 * Convert UCS-4 to UTF-16.
 * Values up to kMaximumUCS2 are copied as one unit, values above
 * kMaximumUTF16 become kReplacementCharacter, everything in between is
 * written as a high/low surrogate pair.
 */
ConversionResult ConvertUCS4toUTF16 (
        UCS4** sourceStart, const UCS4* sourceEnd,
        UTF16** targetStart, const UTF16* targetEnd)
{
    ConversionResult result = kUTFConversionOK;
    UCS4* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UCS4 ch;

        if (target >= targetEnd) {
            result = kUTFConversionTargetExhausted;
            break;
        }
        ch = *source;
        if (ch <= kMaximumUCS2) {
            *target++ = (UTF16)ch;
        } else if (ch > kMaximumUTF16) {
            *target++ = (UTF16)kReplacementCharacter;
        } else {
            /* A surrogate pair needs two free units; bail out before
               consuming the source character if only one is left. */
            if (target + 1 >= targetEnd) {
                result = kUTFConversionTargetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart);
            *target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart);
        }
        ++source;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* ================================================================ */

/*
 * Convert UTF-16 to UCS-4.
 * A high surrogate immediately followed by a low surrogate is combined
 * into one value; any other unit (including an unpaired surrogate, or a
 * high surrogate that is the last unit of the input) is copied through
 * unchanged.
 */
ConversionResult ConvertUTF16toUCS4 (
        UTF16** sourceStart, UTF16* sourceEnd,
        UCS4** targetStart, const UCS4* targetEnd)
{
    ConversionResult result = kUTFConversionOK;
    UTF16* source = *sourceStart;
    UCS4* target = *targetStart;

    while (source < sourceEnd) {
        UCS4 ch;

        /* Every input character produces exactly one output unit, so
           check for room before consuming anything. */
        if (target >= targetEnd) {
            result = kUTFConversionTargetExhausted;
            break;
        }
        ch = *source++;
        if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
                && source < sourceEnd) {
            UCS4 ch2 = *source;
            if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
                ch = ((ch - kSurrogateHighStart) << halfShift)
                        + (ch2 - kSurrogateLowStart) + halfBase;
                ++source;
            }
        }
        *target++ = ch;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* ================================================================ */
/* UTF-8 lookup tables. */

/* Sum of the lead-byte and trail-byte marker bits of a sequence with N
   trail bytes, accumulated the same way the decoder accumulates the
   raw bytes; subtracting it leaves the payload bits. The last entry is
   the true sum reduced modulo 2^32. */
UCS4 offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/* Number of trail bytes that follow a given lead byte (0..5).
   Bytes 0x00..0xBF map to 0, i.e. they are taken as one-byte
   characters; stray continuation bytes are not rejected. */
char bytesFromUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/* Marker bits OR-ed into the lead byte, indexed by total sequence
   length (1..6); index 0 is unused. */
UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

/* ================================================================ */
/*
 * The UTF-8 code is similar in effect to making successive calls on
 * the mbtowc and wctomb routines in FSS-UTF. However, it is
 * considerably different in code:
 *   - it is adapted to be consistent with UTF16,
 *   - the interface converts a whole buffer to avoid function-call
 *     overhead,
 *   - constants have been gathered.
 */
/* ================================================================ */

/*
 * Encode one character as UTF-8 at *targetPtr.
 * Values above kMaximumUCS4 are replaced by kReplacementCharacter.
 * Returns 1 and advances *targetPtr on success; returns 0 and writes
 * nothing when fewer than the required bytes remain before targetEnd.
 */
static int putUTF8 (UCS4 ch, UTF8** targetPtr, const UTF8* targetEnd)
{
    const UCS4 byteMask = 0xBF;
    const UCS4 byteMark = 0x80;
    UTF8* target = *targetPtr;
    UTF8* p;
    int bytesToWrite;
    int i;

    if (ch < 0x80) {
        bytesToWrite = 1;
    } else if (ch < 0x800) {
        bytesToWrite = 2;
    } else if (ch < 0x10000) {
        bytesToWrite = 3;
    } else if (ch < 0x200000) {
        bytesToWrite = 4;
    } else if (ch < 0x4000000) {
        bytesToWrite = 5;
    } else if (ch <= kMaximumUCS4) {
        bytesToWrite = 6;
    } else {
        /* kReplacementCharacter is encoded with three bytes (the
           0x800..0xFFFF tier above); two would give a malformed
           sequence. */
        bytesToWrite = 3;
        ch = kReplacementCharacter;
    }

    /* Compare distances instead of advancing the pointer first, so no
       pointer beyond one-past-the-end of the buffer is ever formed. */
    if (targetEnd - target < bytesToWrite) {
        return 0;
    }

    /* Fill the sequence back to front: trail bytes carry six payload
       bits each, the lead byte takes what is left plus its marker. */
    p = target + bytesToWrite;
    for (i = bytesToWrite; i > 1; --i) {
        *--p = (UTF8)((ch | byteMark) & byteMask);
        ch >>= 6;
    }
    *--p = (UTF8)(ch | firstByteMark[bytesToWrite]);

    *targetPtr = target + bytesToWrite;
    return 1;
}

/*
 * Decode the UTF-8 sequence that starts at source (source < sourceEnd).
 * Returns the number of bytes the sequence occupies and stores the
 * decoded value in *chOut, or returns 0 when the sequence is cut off
 * by sourceEnd. Trail bytes are not validated.
 */
static int getUTF8 (const UTF8* source, const UTF8* sourceEnd, UCS4* chOut)
{
    int extraBytes = bytesFromUTF8[*source];
    UCS4 ch = 0;
    int i;

    /* The sequence is extraBytes + 1 bytes long, so that many bytes
       must be available; reading *sourceEnd would be out of bounds. */
    if (sourceEnd - source <= extraBytes) {
        return 0;
    }
    for (i = 0; i < extraBytes; ++i) {
        ch += *source++;
        ch <<= 6;
    }
    ch += *source;
    ch -= offsetsFromUTF8[extraBytes];

    /* The 6-byte offset is only correct modulo 2^32; masking keeps the
       result right should UCS4 be wider than 32 bits (its definition
       lives in ConvertUTF.h -- TODO confirm). */
    *chOut = ch & 0xFFFFFFFFUL;
    return extraBytes + 1;
}

/* ================================================================ */

/*
 * Convert UTF-16 to UTF-8.
 * Valid surrogate pairs are combined before encoding; an unpaired
 * surrogate is encoded as-is (three bytes).
 */
ConversionResult ConvertUTF16toUTF8 (
        UTF16** sourceStart, const UTF16* sourceEnd,
        UTF8** targetStart, const UTF8* targetEnd)
{
    ConversionResult result = kUTFConversionOK;
    UTF16* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        UTF16* next = source;   /* committed to source only on success */
        UCS4 ch = *next++;

        if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
                && next < sourceEnd) {
            UCS4 ch2 = *next;
            if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
                ch = ((ch - kSurrogateHighStart) << halfShift)
                        + (ch2 - kSurrogateLowStart) + halfBase;
                ++next;
            }
        }
        if (!putUTF8(ch, &target, targetEnd)) {
            result = kUTFConversionTargetExhausted;
            break;
        }
        source = next;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* ================================================================ */

/*
 * Convert UTF-8 to UTF-16.
 * Decoded values up to kMaximumUCS2 are written as one unit, values
 * above kMaximumUTF16 become kReplacementCharacter, everything in
 * between is written as a surrogate pair.
 */
ConversionResult ConvertUTF8toUTF16 (
        UTF8** sourceStart, const UTF8* sourceEnd,
        UTF16** targetStart, const UTF16* targetEnd)
{
    ConversionResult result = kUTFConversionOK;
    UTF8* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UCS4 ch = 0;
        int consumed = getUTF8(source, sourceEnd, &ch);

        if (consumed == 0) {
            result = kUTFConversionSourceExhausted;
            break;
        }
        if (target >= targetEnd) {
            result = kUTFConversionTargetExhausted;
            break;
        }
        if (ch <= kMaximumUCS2) {
            *target++ = (UTF16)ch;
        } else if (ch > kMaximumUTF16) {
            *target++ = (UTF16)kReplacementCharacter;
        } else {
            if (target + 1 >= targetEnd) {
                result = kUTFConversionTargetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart);
            *target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart);
        }
        /* Advance only after the character has been written. */
        source += consumed;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* ================================================================ */

/*
 * Convert UCS-4 to UTF-8.
 * As in the UTF-16 routine, a high-surrogate value immediately
 * followed by a low-surrogate value in the input is combined into one
 * character before encoding.
 */
ConversionResult ConvertUCS4toUTF8 (
        UCS4** sourceStart, const UCS4* sourceEnd,
        UTF8** targetStart, const UTF8* targetEnd)
{
    ConversionResult result = kUTFConversionOK;
    UCS4* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        UCS4* next = source;    /* committed to source only on success */
        UCS4 ch = *next++;

        if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
                && next < sourceEnd) {
            UCS4 ch2 = *next;
            if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
                ch = ((ch - kSurrogateHighStart) << halfShift)
                        + (ch2 - kSurrogateLowStart) + halfBase;
                ++next;
            }
        }
        if (!putUTF8(ch, &target, targetEnd)) {
            result = kUTFConversionTargetExhausted;
            break;
        }
        source = next;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* ================================================================ */

/*
 * Convert UTF-8 to UCS-4.
 * Each decoded value is stored directly as one UCS-4 unit (no
 * surrogate pairs are produced); values above kMaximumUCS4 become
 * kReplacementCharacter.
 */
ConversionResult ConvertUTF8toUCS4 (
        UTF8** sourceStart, const UTF8* sourceEnd,
        UCS4** targetStart, const UCS4* targetEnd)
{
    ConversionResult result = kUTFConversionOK;
    UTF8* source = *sourceStart;
    UCS4* target = *targetStart;

    while (source < sourceEnd) {
        UCS4 ch = 0;
        int consumed = getUTF8(source, sourceEnd, &ch);

        if (consumed == 0) {
            result = kUTFConversionSourceExhausted;
            break;
        }
        if (target >= targetEnd) {
            result = kUTFConversionTargetExhausted;
            break;
        }
        if (ch > kMaximumUCS4) {
            ch = kReplacementCharacter;
        }
        *target++ = ch;
        source += consumed;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
相关文章推荐
- 关于PHP通过PDO用中文条件查询MySQL的问题。
- 保证asp页面一定不会出现乱码 UTF8
- JoshChen_web格式编码UTF8-无BOM的小细节分析
- UTF8编码开发中页面空白问题的解决方法
- UTF8转成GB2312乱码问题解决方案
- MySql修改数据库编码为UTF8避免造成乱码问题
- PHP通过iconv将字符串从GBK转换为UTF8字符集
- PHP写入WRITE编码为UTF8的文件的实现代码
- php UTF8 文件的签名问题
- php下检测字符串是否是utf8编码的代码
- UTF-8 GBK UTF8 GB2312 之间的区别和关系介绍
- php读取mysql乱码,用set names XXX解决的原理分享
- PHP5+UTF8多文件上传类
- utf8的问题
- 汉字和Unicode码(utf-8)之间的转换(Pack/Unpack)
- ORACLE NLS以及多语言问题
- 纯C实现unicode-utf8互转
- Eclipse 乱码解决方案总结
- eclipse_php类和函数提示的中文部分为乱码,成功解决方法