diff options
Diffstat (limited to 'src/unicode.c')
-rw-r--r-- | src/unicode.c | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/src/unicode.c b/src/unicode.c new file mode 100644 index 0000000..a8fc612 --- /dev/null +++ b/src/unicode.c @@ -0,0 +1,141 @@ + +#include <wchar.h> + +#include <wchar.h> +#include "sablo.h" + +/* + * Transcode UCS2 to UTF8. + * + * Since the nature of the transformation is that the + * resulting length is unpredictable, this function + * allocates it's own memory. + */ +char* unicode_transcode16to8(const wchar_t* src, size_t len) +{ + char* ret = NULL; + size_t alloc = 0; + size_t pos = 0; + const wchar_t* c; + const wchar_t* e; + + /* Allocate 1.25 times the length initially */ + alloc = len + (len / 4) + 1; + ret = (char*)malloc(alloc * sizeof(char)); + if(!ret) return NULL; + + c = src; + e = c + len; + + for( ; c < e; c++) + { + /* Make sure we have enough memory */ + if(pos + 4 >= alloc) + { + alloc += (len / 2) + 1; + if(!(ret = (char*)reallocf(ret, alloc * sizeof(char)))) + return NULL; + } + + /* Encode as one character */ + if(*c <= 0x007F) + { + ret[pos++] = (char)*c; + } + + /* Encode as two characters */ + else if(*c <= 0x07FF) + { + ret[pos++] = (char)(192 | (*c >> 6)); + ret[pos++] = (char)(128 | (*c & 63)); + } + + /* Encode as three characters */ + else + { + ret[pos++] = (char)(224 | (*c >> 12)); + ret[pos++] = (char)(128 | ((*c >> 6) & 63)); + ret[pos++] = (char)(128 | (*c & 63)); + } + } + + ret[pos] = NULL; + return ret; +} + +/* + * Transcode UTF-8 to UCS2 + * + * Since a semi predictable length of the resulting data is + * known, the caller should allocate the memory for this conversion. + */ +wchar_t* unicode_transcode8to16(const char* src, const wchar_t* out, size_t len) +{ + /* Note: out should always be at least as long as src in chars */ + + size_t pos = 0; + const char* c; + const char* e; + + c = src; + e = c + len; + + for( ; c < e; c++) + { + /* We never have to reallocate here. We will always + be using the same or less number of output characters + than input chars. That's just the nature of the encoding. */ + + /* First 4 bits set */ + if((c + 3) < e && + (c[0] & 0xF8) == 0xF0 && + (c[1] & 0xC0) == 0x80 && + (c[2] & 0xC0) == 0x80 && + (c[3] & 0xC0) == 0x80) + { + out[pos++] = (wchar_t)(((wchar_t)c[0] & 7) << 18 | + ((wchar_t)c[1] & 63) << 12 | + ((wchar_t)c[2] & 63) << 6 | + ((wchar_t)c[3] & 63)); + c += 3; + } + + /* First 3 bits set */ + else if((c + 2) < e && + (c[0] & 0xF0) == 0xE0 && + (c[1] & 0xC0) == 0x80 && + (c[2] & 0xC0) == 0x80) + { + out[pos++] = (wchar_t)(((wchar_t)c[0] & 15) << 12 | + ((wchar_t)c[1] & 63) << 6 | + ((wchar_t)c[2] & 63)); + c += 2; + } + + /* First 2 bits set */ + else if((c + 1) < e && + (c[0] & 0xE0) == 0xC0 && + (c[1] & 0xC0) == 0x80) + { + out[pos++] = (wchar_t)(((wchar_t)c[0] & 31) << 6 | + ((wchar_t)c[1] & 63)); + c += 1; + } + + /* First bit set */ + else if(!(c[0] & 0x80)) + { + out[pos++] = (wchar_t)c[0]; + } + + /* Invalid encoding */ + else + { + out[pos++] = L'?'; + } + } + + out[pos] = NULL; + return out; +} + |