summaryrefslogtreecommitdiff
path: root/src/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/unicode.c')
-rw-r--r--src/unicode.c141
1 files changed, 141 insertions, 0 deletions
diff --git a/src/unicode.c b/src/unicode.c
new file mode 100644
index 0000000..a8fc612
--- /dev/null
+++ b/src/unicode.c
@@ -0,0 +1,141 @@
+
+#include <wchar.h>
+
+#include <wchar.h>
+#include "sablo.h"
+
+/*
+ * Transcode UCS2 to UTF8.
+ *
+ * Since the nature of the transformation is that the
+ * resulting length is unpredictable, this function
+ * allocates it's own memory.
+ */
+char* unicode_transcode16to8(const wchar_t* src, size_t len)
+{
+ char* ret = NULL;
+ size_t alloc = 0;
+ size_t pos = 0;
+ const wchar_t* c;
+ const wchar_t* e;
+
+ /* Allocate 1.25 times the length initially */
+ alloc = len + (len / 4) + 1;
+ ret = (char*)malloc(alloc * sizeof(char));
+ if(!ret) return NULL;
+
+ c = src;
+ e = c + len;
+
+ for( ; c < e; c++)
+ {
+ /* Make sure we have enough memory */
+ if(pos + 4 >= alloc)
+ {
+ alloc += (len / 2) + 1;
+ if(!(ret = (char*)reallocf(ret, alloc * sizeof(char))))
+ return NULL;
+ }
+
+ /* Encode as one character */
+ if(*c <= 0x007F)
+ {
+ ret[pos++] = (char)*c;
+ }
+
+ /* Encode as two characters */
+ else if(*c <= 0x07FF)
+ {
+ ret[pos++] = (char)(192 | (*c >> 6));
+ ret[pos++] = (char)(128 | (*c & 63));
+ }
+
+ /* Encode as three characters */
+ else
+ {
+ ret[pos++] = (char)(224 | (*c >> 12));
+ ret[pos++] = (char)(128 | ((*c >> 6) & 63));
+ ret[pos++] = (char)(128 | (*c & 63));
+ }
+ }
+
+ ret[pos] = NULL;
+ return ret;
+}
+
+/*
+ * Transcode UTF-8 to UCS2
+ *
+ * Since a semi predictable length of the resulting data is
+ * known, the caller should allocate the memory for this conversion.
+ */
+wchar_t* unicode_transcode8to16(const char* src, const wchar_t* out, size_t len)
+{
+ /* Note: out should always be at least as long as src in chars */
+
+ size_t pos = 0;
+ const char* c;
+ const char* e;
+
+ c = src;
+ e = c + len;
+
+ for( ; c < e; c++)
+ {
+ /* We never have to reallocate here. We will always
+ be using the same or less number of output characters
+ than input chars. That's just the nature of the encoding. */
+
+ /* First 4 bits set */
+ if((c + 3) < e &&
+ (c[0] & 0xF8) == 0xF0 &&
+ (c[1] & 0xC0) == 0x80 &&
+ (c[2] & 0xC0) == 0x80 &&
+ (c[3] & 0xC0) == 0x80)
+ {
+ out[pos++] = (wchar_t)(((wchar_t)c[0] & 7) << 18 |
+ ((wchar_t)c[1] & 63) << 12 |
+ ((wchar_t)c[2] & 63) << 6 |
+ ((wchar_t)c[3] & 63));
+ c += 3;
+ }
+
+ /* First 3 bits set */
+ else if((c + 2) < e &&
+ (c[0] & 0xF0) == 0xE0 &&
+ (c[1] & 0xC0) == 0x80 &&
+ (c[2] & 0xC0) == 0x80)
+ {
+ out[pos++] = (wchar_t)(((wchar_t)c[0] & 15) << 12 |
+ ((wchar_t)c[1] & 63) << 6 |
+ ((wchar_t)c[2] & 63));
+ c += 2;
+ }
+
+ /* First 2 bits set */
+ else if((c + 1) < e &&
+ (c[0] & 0xE0) == 0xC0 &&
+ (c[1] & 0xC0) == 0x80)
+ {
+ out[pos++] = (wchar_t)(((wchar_t)c[0] & 31) << 6 |
+ ((wchar_t)c[1] & 63));
+ c += 1;
+ }
+
+ /* First bit set */
+ else if(!(c[0] & 0x80))
+ {
+ out[pos++] = (wchar_t)c[0];
+ }
+
+ /* Invalid encoding */
+ else
+ {
+ out[pos++] = L'?';
+ }
+ }
+
+ out[pos] = NULL;
+ return out;
+}
+