diff options
Diffstat (limited to 'src/utf8.cpp')
-rw-r--r-- | src/utf8.cpp | 153 |
1 files changed, 153 insertions, 0 deletions
diff --git a/src/utf8.cpp b/src/utf8.cpp new file mode 100644 index 0000000..7c2e3ee --- /dev/null +++ b/src/utf8.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2004, Nate Nielsen + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions in binary form must reproduce the + * above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * * The names of contributors to this software may not be + * used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * + * CONTRIBUTORS + * Nate Nielsen <nielsen@memberwebs.com> + * + */ + +#include <wchar.h> +#include "usuals.h" + +bool transcode16to8(wchar_t ch, string& ret) +{ + ret.reserve(ret.length() + 3); + + if(ch <= 0x007F) + { + ret.append(1, (char)ch); + } + else if(ch <= 0x07FF) + { + ret.append(1, (char)(192 | (ch >> 6))); + ret.append(1, (char)(128 | (ch & 63))); + } + else + { + ret.append(1, (char)(224 | (ch >> 12))); + ret.append(1, (char)(128 | ((ch >> 6) & 63))); + ret.append(1, (char)(128 | (ch & 63))); + } + + return true; +} + +bool transcode16to8(const wstring& data, string& ret) +{ + ret.reserve(ret.length() + data.length() + (data.length() / 2)); + + // basic_string.c_str doesn't work properly everywhere + // most notably not in the g++ std library + + const wchar_t* c = data.length() ? data.data() : L""; + const wchar_t* e = c + data.length(); + + for( ; c < e; c++) + { + if(*c <= 0x007F) + { + ret.append(1, (char)*c); + } + else if(*c <= 0x07FF) + { + ret.append(1, (char)(192 | (*c >> 6))); + ret.append(1, (char)(128 | (*c & 63))); + } + else + { + ret.append(1, (char)(224 | (*c >> 12))); + ret.append(1, (char)(128 | ((*c >> 6) & 63))); + ret.append(1, (char)(128 | (*c & 63)) ); + } + } + + return true; +} + +bool transcode8to16(const string& data, wstring& ret) +{ + ret.reserve(data.length()); + + // basic_string.c_str doesn't work properly everywhere + // most notably not in the g++ std library + + const char* c = data.length() ? data.data() : ""; + const char* e = c + data.length(); + + for( ; c < e; c++) + { + // First 4 bits set + if((c[0] & 0xF8) == 0xF0 && + (c[1] & 0xC0) == 0x80 && + (c[2] & 0xC0) == 0x80 && + (c[3] & 0xC0) == 0x80) + { + ret.append(1, (wchar_t)(((wchar_t)c[0] & 7) << 18 | + ((wchar_t)c[1] & 63) << 12 | + ((wchar_t)c[2] & 63) << 6 | + ((wchar_t)c[3] & 63))); + c += 3; + } + + // First 3 bits set + else if((c[0] & 0xF0) == 0xE0 && + (c[1] & 0xC0) == 0x80 && + (c[2] & 0xC0) == 0x80) + { + ret.append(1, (wchar_t)(((wchar_t)c[0] & 15) << 12 | + ((wchar_t)c[1] & 63) << 6 | + ((wchar_t)c[2] & 63))); + c += 2; + } + + // First 2 bits set + else if((c[0] & 0xE0) == 0xC0 && + (c[1] & 0xC0) == 0x80) + { + ret.append(1, (wchar_t)(((wchar_t)c[0] & 31) << 6 | + ((wchar_t)c[1] & 63))); + c += 1; + } + + // First bit set + else if(!(c[0] & 0x80)) + { + ret.append(1, (wchar_t)c[0]); + } + + else + return false; + } + + return true; +} |