kmscon/src/unicode.c
David Herrmann d0672030ff unicode: implement ucs4 to utf8 encoding
This is the last glib dependency so add a short conversion helper and we
can finally drop glib. Anyway, the pango libs still depend on glib so
there is currently still a glib dependency, but it is no longer direct and
we can always choose the freetype font renderer to drop it.

Signed-off-by: David Herrmann <dh.herrmann@googlemail.com>
2012-07-15 16:29:34 +02:00

474 lines
11 KiB
C

/*
* kmscon - Unicode Handling
*
* Copyright (c) 2011 David Herrmann <dh.herrmann@googlemail.com>
* Copyright (c) 2011-2012 University of Tuebingen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* This kmscon-utf8-state-machine is based on the wayland-compositor demos:
*
* Copyright © 2008 Kristian Høgsberg
*
* Permission to use, copy, modify, distribute, and sell this software and
* its documentation for any purpose is hereby granted without fee, provided
* that the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of the copyright holders not be used in
* advertising or publicity pertaining to distribution of the software
* without specific, written prior permission. The copyright holders make
* no representations about the suitability of this software for any
* purpose. It is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
* RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
* CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Unicode Handling
* Main implementation of the symbol datatype. The symbol table contains two-way
* references. The Hash Table contains all the symbols with the symbol ucs4
* string as key and the symbol ID as value.
* The index array contains the symbol ID as key and a pointer to the ucs4
* string as value. But the hash table owns the ucs4 string.
* This allows fast implementations of *_get() and *_append() without long
* search intervals.
*
* When creating a new symbol, we simply return the UCS4 value as new symbol. We
* do not add it to our symbol table as it is only one character. However, if a
* character is appended to an existing symbol, we create a new ucs4 string and
* push the new symbol into the symbol table.
*/
/* NOTE: this file no longer uses glib directly; UCS4 to UTF-8 conversion is done by ucs4_to_utf8() below. */
#include <errno.h>
#include <inttypes.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include "log.h"
#include "static_misc.h"
#include "unicode.h"
/* Subsystem tag; presumably picked up by the log.h macros -- TODO confirm. */
#define LOG_SUBSYSTEM "unicode"
/* Maximum number of UCS4 characters a composed symbol may contain. */
#define KMSCON_UCS4_MAXLEN 10
/*
 * Largest value treated as a plain UCS4 character. Any larger value acts as
 * the terminator of a stored ucs4 string, and symbol IDs above this value
 * refer to entries of the symbol table.
 */
#define KMSCON_UCS4_MAX 0x7fffffffUL
/* U+FFFD; returned by kmscon_utf8_mach_get() when no character is available. */
#define KMSCON_UCS4_INVALID 0xfffd
/* Default symbol; also used as the fallback result of failed table lookups. */
const kmscon_symbol_t kmscon_symbol_default = 0;
/* Static fallback returned by kmscon_symbol_get_u8() on error; never freed. */
static const char default_u8[] = { 0 };
/* Serializes all accesses to the table_* variables below. */
static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Next symbol ID to hand out; set to KMSCON_UCS4_MAX + 2 by table__init(). */
static uint32_t table_next_id;
/*
 * Array of uint32_t* indexed by (symbol - (KMSCON_UCS4_MAX + 1)); slot 0 is
 * a NULL dummy entry.
 */
static struct kmscon_array *table_index;
/* Hash table mapping a ucs4 string to its symbol ID; it owns the strings. */
static struct kmscon_hashtable *table_symbols;
/*
 * Hash callback for the symbol table. Folds every element of the ucs4 string
 * \key into a "times 33" hash seeded with 5381. The string ends at the first
 * element that is greater than KMSCON_UCS4_MAX; that terminator is not hashed.
 */
static unsigned int hash_ucs4(const void *key)
{
	const uint32_t *cur;
	unsigned int hash = 5381;

	for (cur = key; *cur <= KMSCON_UCS4_MAX; ++cur)
		hash = hash * 33 + *cur;

	return hash;
}
/*
 * Equality callback for the symbol table. Compares two ucs4 strings element
 * by element; a string ends at the first element greater than
 * KMSCON_UCS4_MAX. Returns true only if both strings end at the same position
 * and all elements before that position are identical. The terminator values
 * themselves are not compared.
 */
static bool cmp_ucs4(const void *a, const void *b)
{
	const uint32_t *lhs = a;
	const uint32_t *rhs = b;
	size_t pos;

	for (pos = 0; ; ++pos) {
		bool lhs_end = lhs[pos] > KMSCON_UCS4_MAX;
		bool rhs_end = rhs[pos] > KMSCON_UCS4_MAX;

		/* equal only if both strings terminate at the same index */
		if (lhs_end || rhs_end)
			return lhs_end && rhs_end;

		if (lhs[pos] != rhs[pos])
			return false;
	}
}
/* Acquire the global symbol-table mutex. Pair with table_unlock(). */
static void table_lock(void)
{
	pthread_mutex_lock(&table_mutex);
}
/* Release the global symbol-table mutex taken by table_lock(). */
static void table_unlock(void)
{
	pthread_mutex_unlock(&table_mutex);
}
/*
 * Lazily create the symbol table (index array plus hash table). Does nothing
 * if the hash table already exists. All callers in this file hold the table
 * lock while calling this.
 *
 * Returns 0 on success, otherwise the non-zero error code reported by the
 * failing constructor. On failure no table object is left behind, so a later
 * call retries the initialization from scratch.
 */
static int table__init(void)
{
	static const uint32_t *val = NULL; /* we need a valid lvalue */
	int ret;

	if (table_symbols)
		return 0;

	/*
	 * KMSCON_UCS4_MAX + 1 maps to index 0 of table_index, which is the
	 * dummy entry, so the first real symbol ID is KMSCON_UCS4_MAX + 2.
	 */
	table_next_id = KMSCON_UCS4_MAX + 2;

	ret = kmscon_array_new(&table_index, sizeof(uint32_t*), 4);
	if (ret) {
		log_err("cannot allocate table-index");
		return ret;
	}

	/*
	 * first entry is not used so add dummy
	 * NOTE(review): the result of this push is not inspected; if it can
	 * fail, every later index would be off by one -- confirm against
	 * the kmscon_array API.
	 */
	kmscon_array_push(table_index, &val);

	ret = kmscon_hashtable_new(&table_symbols, hash_ucs4, cmp_ucs4,
				   free, NULL);
	if (ret) {
		log_err("cannot allocate symbol hashtable");
		kmscon_array_free(table_index);
		/* do not keep pointers to released or half-built objects */
		table_index = NULL;
		table_symbols = NULL;
		return ret;
	}

	return 0;
}
/*
 * Create a symbol from a single UCS4 character. A valid character is its own
 * symbol, so nothing is added to the symbol table. Values greater than
 * KMSCON_UCS4_MAX are rejected with a warning and yield the symbol 0.
 */
kmscon_symbol_t kmscon_symbol_make(uint32_t ucs4)
{
	if (ucs4 <= KMSCON_UCS4_MAX)
		return ucs4;

	log_warn("invalid ucs4 character");
	return 0;
}
/*
 * This decomposes a symbol into a ucs4 string and a size value. If \sym is a
 * valid UCS4 character, this returns a pointer to \sym and writes 1 into \size.
 * Therefore, the returned value may get destroyed if your \sym argument gets
 * destroyed.
 * If \sym is a composed ucs4 string, then the returned value points into the
 * hash table of the symbol table and lives as long as the symbol table does.
 *
 * This always returns a valid value. If an error happens (the table cannot be
 * initialized, or \sym is a composed symbol ID that was never handed out), the
 * default character is returned with a size of 1. If \size is NULL, then the
 * size value is omitted.
 *
 * The caller must hold the table lock.
 */
static const uint32_t *table__get(kmscon_symbol_t *sym, size_t *size)
{
	uint32_t *ucs4 = NULL;

	/* a plain UCS4 character is its own one-element string */
	if (*sym <= KMSCON_UCS4_MAX) {
		if (size)
			*size = 1;
		return sym;
	}

	/*
	 * Only IDs below table_next_id were ever handed out by
	 * kmscon_symbol_append(); anything else would index past the entries
	 * that were pushed to table_index, so reject it instead of reading
	 * out of bounds.
	 * NOTE(review): this assumes every push in kmscon_symbol_append()
	 * succeeded -- confirm.
	 */
	if (!table__init() && *sym < table_next_id)
		ucs4 = *KMSCON_ARRAY_AT(table_index, uint32_t*,
					*sym - (KMSCON_UCS4_MAX + 1));

	/* init failure, unknown ID or the dummy entry at index 0 */
	if (!ucs4) {
		if (size)
			*size = 1;
		return &kmscon_symbol_default;
	}

	/* the stored string ends at the first value above KMSCON_UCS4_MAX */
	if (size) {
		*size = 0;
		while (ucs4[*size] <= KMSCON_UCS4_MAX)
			++*size;
	}

	return ucs4;
}
/*
 * Locked wrapper around table__get(): decomposes \sym into its ucs4 string
 * and, if \size is non-NULL, stores the string length there. The table lock
 * is held only for the duration of the lookup. For a plain UCS4 character the
 * returned pointer is \sym itself.
 */
const uint32_t *kmscon_symbol_get(kmscon_symbol_t *sym, size_t *size)
{
	const uint32_t *ucs4;

	table_lock();
	ucs4 = table__get(sym, size);
	table_unlock();

	return ucs4;
}
/*
 * Append the UCS4 character \ucs4 to the symbol \sym and return the symbol
 * that represents the combined string. If the combined string is already in
 * the symbol table, its existing ID is returned; otherwise a copy of the
 * string is stored and a new ID is allocated from table_next_id.
 *
 * \sym is returned unchanged if the table cannot be initialized, if \ucs4 is
 * greater than KMSCON_UCS4_MAX, if \sym already holds KMSCON_UCS4_MAXLEN
 * characters, or if memory for the new string cannot be allocated.
 */
kmscon_symbol_t kmscon_symbol_append(kmscon_symbol_t sym, uint32_t ucs4)
{
	uint32_t seq[KMSCON_UCS4_MAXLEN + 1];
	uint32_t id, *stored;
	const uint32_t *base;
	kmscon_symbol_t result = sym;
	size_t len;
	void *found;

	table_lock();

	if (table__init())
		goto out;

	if (ucs4 > KMSCON_UCS4_MAX) {
		log_warn("invalid ucs4 character");
		goto out;
	}

	base = table__get(&sym, &len);
	if (len >= KMSCON_UCS4_MAXLEN)
		goto out;

	/* build "<old string> <ucs4> <terminator>" as the lookup key */
	memcpy(seq, base, len * sizeof(uint32_t));
	seq[len] = ucs4;
	seq[len + 1] = KMSCON_UCS4_MAX + 1;
	len += 2;

	if (kmscon_hashtable_find(table_symbols, &found, seq)) {
		/* the hash table stores the symbol ID in the value pointer */
		result = (uint32_t)(long)found;
		goto out;
	}

	log_debug("adding new composed symbol");

	stored = malloc(len * sizeof(uint32_t));
	if (!stored)
		goto out;
	memcpy(stored, seq, len * sizeof(uint32_t));

	/*
	 * NOTE(review): the results of the insert and push below are not
	 * inspected; confirm whether they can fail and leave the ID counter,
	 * hash table and index array out of sync.
	 */
	id = table_next_id++;
	kmscon_hashtable_insert(table_symbols, stored, (void*)(long)id);
	kmscon_array_push(table_index, &stored);
	result = id;

out:
	table_unlock();
	return result;
}
/*
 * Convert UCS4 character to UTF-8. This creates one of:
 *   0xxxxxxx
 *   110xxxxx 10xxxxxx
 *   1110xxxx 10xxxxxx 10xxxxxx
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * This is based on the same function from "terminology" from the Enlightenment
 * project. See COPYING for more information.
 *
 * \g is the character to encode and \txt must provide room for 4 bytes. The
 * number of bytes written is returned; no terminating zero byte is added.
 * Values that have no valid UTF-8 representation (the UTF-16 surrogate range
 * U+D800..U+DFFF and everything above U+10FFFF) write nothing and return 0.
 */
static size_t ucs4_to_utf8(uint32_t g, char *txt)
{
	/* neither surrogates nor values beyond U+10FFFF may be encoded */
	if (g > 0x10ffff || (g >= 0xd800 && g <= 0xdfff))
		return 0;

	if (g < 0x80) {
		txt[0] = (char)(g & 0x7f);
		return 1;
	} else if (g < 0x800) {
		txt[0] = (char)(0xc0 | ((g >> 6) & 0x1f));
		txt[1] = (char)(0x80 | (g & 0x3f));
		return 2;
	} else if (g < 0x10000) {
		txt[0] = (char)(0xe0 | ((g >> 12) & 0x0f));
		txt[1] = (char)(0x80 | ((g >> 6) & 0x3f));
		txt[2] = (char)(0x80 | (g & 0x3f));
		return 3;
	} else {
		txt[0] = (char)(0xf0 | ((g >> 18) & 0x07));
		txt[1] = (char)(0x80 | ((g >> 12) & 0x3f));
		txt[2] = (char)(0x80 | ((g >> 6) & 0x3f));
		txt[3] = (char)(0x80 | (g & 0x3f));
		return 4;
	}
}
/*
 * Convert the symbol \sym into a UTF-8 byte sequence. On success a newly
 * allocated buffer is returned and, if \size is non-NULL, the number of bytes
 * in it is stored in \size. The buffer is not zero-terminated.
 *
 * If the allocation fails or no character of the symbol could be encoded, the
 * static default_u8 buffer is returned and \size is set to its size.
 *
 * Every result must be released with kmscon_symbol_free_u8(), which knows how
 * to tell the static fallback apart from allocated buffers.
 */
const char *kmscon_symbol_get_u8(kmscon_symbol_t sym, size_t *size)
{
	const uint32_t *ucs4;
	char *val;
	size_t i, pos, len;

	ucs4 = kmscon_symbol_get(&sym, &len);

	/* ucs4_to_utf8() writes at most 4 bytes per character */
	val = malloc(4 * len);
	if (!val)
		goto err_out;

	pos = 0;
	for (i = 0; i < len; ++i)
		pos += ucs4_to_utf8(ucs4[i], &val[pos]);

	if (!pos) {
		/* nothing was encoded; do not leak the empty buffer */
		free(val);
		goto err_out;
	}

	if (size)
		*size = pos;
	return val;

err_out:
	if (size)
		*size = sizeof(default_u8);
	return default_u8;
}
/*
 * Release a buffer returned by kmscon_symbol_get_u8(). The static fallback
 * buffer default_u8 is recognized by its address and left untouched.
 */
void kmscon_symbol_free_u8(const char *s)
{
	if (s == default_u8)
		return;

	free((void*)s);
}
/* State of the incremental UTF-8 decoder driven by kmscon_utf8_mach_feed(). */
struct kmscon_utf8_mach {
/* current decoder state; one of the KMSCON_UTF8_* values */
int state;
/* character assembled so far; complete once state is KMSCON_UTF8_ACCEPT */
uint32_t ch;
};
/*
 * Allocate a new UTF-8 state machine and store it in \out. The machine is
 * zeroed and starts in the KMSCON_UTF8_START state.
 *
 * Returns 0 on success, -EINVAL if \out is NULL and -ENOMEM if the allocation
 * fails. Release the machine with kmscon_utf8_mach_free().
 */
int kmscon_utf8_mach_new(struct kmscon_utf8_mach **out)
{
	struct kmscon_utf8_mach *m;

	if (out == NULL)
		return -EINVAL;

	m = malloc(sizeof(struct kmscon_utf8_mach));
	if (m == NULL)
		return -ENOMEM;

	memset(m, 0, sizeof(struct kmscon_utf8_mach));
	m->state = KMSCON_UTF8_START;

	*out = m;
	return 0;
}
/*
 * Destroy a state machine created by kmscon_utf8_mach_new(). Passing NULL is
 * allowed and does nothing, because free(NULL) is a no-op.
 */
void kmscon_utf8_mach_free(struct kmscon_utf8_mach *mach)
{
	free(mach);
}
/*
 * Feed one input byte \ci into the UTF-8 state machine and return the new
 * state. When the returned state is KMSCON_UTF8_ACCEPT, the decoded character
 * can be read with kmscon_utf8_mach_get(). After ACCEPT or REJECT the next
 * byte is treated as the start of a new sequence. If \mach is NULL,
 * KMSCON_UTF8_START is returned.
 *
 * Only the lead bytes 0xC0 and 0xC1 are rejected as overlong here; overlong
 * three- and four-byte forms and surrogates are not detected by this function.
 */
int kmscon_utf8_mach_feed(struct kmscon_utf8_mach *mach, char ci)
{
	uint32_t c;

	if (!mach)
		return KMSCON_UTF8_START;

	/*
	 * Go through unsigned char: where plain char is signed, bytes >= 0x80
	 * would otherwise be sign-extended to 0xFFFFFFxx and the exact
	 * comparisons against 0xC0/0xC1 below could never match.
	 */
	c = (unsigned char)ci;

	switch (mach->state) {
	case KMSCON_UTF8_START:
	case KMSCON_UTF8_ACCEPT:
	case KMSCON_UTF8_REJECT:
		if (c == 0xC0 || c == 0xC1) {
			/* overlong encoding for ASCII, reject */
			mach->state = KMSCON_UTF8_REJECT;
		} else if ((c & 0x80) == 0) {
			/* single byte, accept */
			mach->ch = c;
			mach->state = KMSCON_UTF8_ACCEPT;
		} else if ((c & 0xC0) == 0x80) {
			/* parser out of sync, ignore byte */
			mach->state = KMSCON_UTF8_START;
		} else if ((c & 0xE0) == 0xC0) {
			/* start of two byte sequence */
			mach->ch = (c & 0x1F) << 6;
			mach->state = KMSCON_UTF8_EXPECT1;
		} else if ((c & 0xF0) == 0xE0) {
			/* start of three byte sequence */
			mach->ch = (c & 0x0F) << 12;
			mach->state = KMSCON_UTF8_EXPECT2;
		} else if ((c & 0xF8) == 0xF0) {
			/* start of four byte sequence */
			mach->ch = (c & 0x07) << 18;
			mach->state = KMSCON_UTF8_EXPECT3;
		} else {
			/* overlong encoding, reject */
			mach->state = KMSCON_UTF8_REJECT;
		}
		break;
	case KMSCON_UTF8_EXPECT3:
		/* three continuation bytes outstanding */
		mach->ch |= (c & 0x3F) << 12;
		if ((c & 0xC0) == 0x80)
			mach->state = KMSCON_UTF8_EXPECT2;
		else
			mach->state = KMSCON_UTF8_REJECT;
		break;
	case KMSCON_UTF8_EXPECT2:
		/* two continuation bytes outstanding */
		mach->ch |= (c & 0x3F) << 6;
		if ((c & 0xC0) == 0x80)
			mach->state = KMSCON_UTF8_EXPECT1;
		else
			mach->state = KMSCON_UTF8_REJECT;
		break;
	case KMSCON_UTF8_EXPECT1:
		/* last continuation byte completes the character */
		mach->ch |= c & 0x3F;
		if ((c & 0xC0) == 0x80)
			mach->state = KMSCON_UTF8_ACCEPT;
		else
			mach->state = KMSCON_UTF8_REJECT;
		break;
	default:
		mach->state = KMSCON_UTF8_REJECT;
		break;
	}

	return mach->state;
}
/*
 * Return the character decoded by the state machine. A character is only
 * available while the machine is in the KMSCON_UTF8_ACCEPT state; in every
 * other state, or if \mach is NULL, KMSCON_UCS4_INVALID is returned.
 */
uint32_t kmscon_utf8_mach_get(struct kmscon_utf8_mach *mach)
{
	if (mach && mach->state == KMSCON_UTF8_ACCEPT)
		return mach->ch;

	return KMSCON_UCS4_INVALID;
}
/*
 * Put the state machine back into the KMSCON_UTF8_START state. The stored
 * character value is left as it is. A NULL \mach is ignored.
 */
void kmscon_utf8_mach_reset(struct kmscon_utf8_mach *mach)
{
	if (mach)
		mach->state = KMSCON_UTF8_START;
}