kmscon/src/tsm_unicode.c
David Herrmann 17a56a24f2 tsm: unicode: do not encode invalid UTF8
We must under all conditions avoid encoding invalid UTF8. Otherwise, we
would rely on other applications to do error-recovery.
Unfortunately, this is no syntactical change but a semnatical fix as the
Unicode standard defines several codepoints which are invalid or which
must never be used in UTF8.
See the Unicode standard if you're interested in these codepoint ranges.

Signed-off-by: David Herrmann <dh.herrmann@googlemail.com>
2012-09-30 17:59:36 +02:00

572 lines
16 KiB
C

/*
* TSM - Unicode Handling
*
* Copyright (c) 2011 David Herrmann <dh.herrmann@googlemail.com>
* Copyright (c) 2011-2012 University of Tuebingen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* The tsm-utf8-state-machine is based on the wayland-compositor demos:
*
* Copyright © 2008 Kristian Høgsberg
*
* Permission to use, copy, modify, distribute, and sell this software and
* its documentation for any purpose is hereby granted without fee, provided
* that the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of the copyright holders not be used in
* advertising or publicity pertaining to distribution of the software
* without specific, written prior permission. The copyright holders make
* no representations about the suitability of this software for any
* purpose. It is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
* RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
* CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Unicode Helpers
* This implements several helpers for Unicode/UTF8/UCS4 input and output. See
* below for comments on each helper.
*/
#include <errno.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include "shl_array.h"
#include "shl_hashtable.h"
#include "tsm_unicode.h"
/*
* Unicode Symbol Handling
* The main goal of the tsm_symbol_* functions is to provide a datatype which
* can contain the representation of any printable character. This includes all
* basic Unicode characters but also combined characters.
* To avoid all the memory management we still represent a character as a single
* integer value (tsm_symbol_t) but internally we allocate a string which is
* represented by this value.
*
* A tsm_symbol_t is an integer which represents a single character point.
* For most Unicode characters this is simply the UCS4 representation. In fact,
* every UCS4 characters is a valid tsm_symbol_t object.
* However, Unicode standard allows combining marks. Therefore, some characters
* consists of more than one Unicode character.
* A global symbol-table provides all those combined characters as single
* integers. You simply create a valid base character and append your combining
* marks and the table will return a new valid tsm_symbol_t. It is no longer
* a valid UCS4 value, though. But no memory management is needed as all
* tsm_symbol_t objects are simple integers.
*
* The symbol table contains two-way
* references. The Hash Table contains all the symbols with the symbol ucs4
* string as key and the symbol ID as value.
* The index array contains the symbol ID as key and a pointer to the ucs4
* string as value. But the hash table owns the ucs4 string.
* This allows fast implementations of *_get() and *_append() without long
* search intervals.
*
* When creating a new symbol, we simply return the UCS4 value as new symbol. We
* do not add it to our symbol table as it is only one character. However, if a
* character is appended to an existing symbol, we create a new ucs4 string and
* push the new symbol into the symbol table.
*/
const tsm_symbol_t tsm_symbol_default = 0;
struct tsm_symbol_table {
unsigned long ref;
uint32_t next_id;
struct shl_array *index;
struct shl_hashtable *symbols;
};
/* TODO: remove the default context */
static struct tsm_symbol_table *tsm_symbol_table_default;
static unsigned int hash_ucs4(const void *key)
{
unsigned int val = 5381;
size_t i;
const uint32_t *ucs4 = key;
i = 0;
while (ucs4[i] <= TSM_UCS4_MAX) {
val = val * 33 + ucs4[i];
++i;
}
return val;
}
static bool cmp_ucs4(const void *a, const void *b)
{
size_t i;
const uint32_t *v1, *v2;
v1 = a;
v2 = b;
i = 0;
while (1) {
if (v1[i] > TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
return true;
if (v1[i] > TSM_UCS4_MAX && v2[i] <= TSM_UCS4_MAX)
return false;
if (v1[i] <= TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
return false;
if (v1[i] != v2[i])
return false;
++i;
}
}
int tsm_symbol_table_new(struct tsm_symbol_table **out)
{
struct tsm_symbol_table *tbl;
int ret;
static const uint32_t *val = NULL; /* we need a valid lvalue */
if (!out)
return -EINVAL;
tbl = malloc(sizeof(*tbl));
if (!tbl)
return -ENOMEM;
memset(tbl, 0, sizeof(*tbl));
tbl->ref = 1;
tbl->next_id = TSM_UCS4_MAX + 2;
ret = shl_array_new(&tbl->index, sizeof(uint32_t*), 4);
if (ret)
goto err_free;
/* first entry is not used so add dummy */
shl_array_push(tbl->index, &val);
ret = shl_hashtable_new(&tbl->symbols, hash_ucs4, cmp_ucs4,
free, NULL);
if (ret)
goto err_array;
*out = tbl;
return 0;
err_array:
shl_array_free(tbl->index);
err_free:
free(tbl);
return ret;
}
void tsm_symbol_table_ref(struct tsm_symbol_table *tbl)
{
if (!tbl || !tbl->ref)
return;
++tbl->ref;
}
void tsm_symbol_table_unref(struct tsm_symbol_table *tbl)
{
if (!tbl || !tbl->ref || --tbl->ref)
return;
shl_hashtable_free(tbl->symbols);
shl_array_free(tbl->index);
free(tbl);
}
tsm_symbol_t tsm_symbol_make(uint32_t ucs4)
{
if (ucs4 > TSM_UCS4_MAX)
return 0;
else
return ucs4;
}
/*
* This decomposes a symbol into a ucs4 string and a size value. If \sym is a
* valid UCS4 character, this returns a pointer to \sym and writes 1 into \size.
* Therefore, the returned value may get destroyed if your \sym argument gets
* destroyed.
* If \sym is a composed ucs4 string, then the returned value points into the
* hash table of the symbol table and lives as long as the symbol table does.
*
* This always returns a valid value. If an error happens, the default character
* is returned. If \size is NULL, then the size value is omitted.
*/
const uint32_t *tsm_symbol_get(struct tsm_symbol_table *tbl,
tsm_symbol_t *sym, size_t *size)
{
uint32_t *ucs4, idx;
int ret;
if (*sym <= TSM_UCS4_MAX) {
if (size)
*size = 1;
return sym;
}
if (!tbl)
tbl = tsm_symbol_table_default;
if (!tbl) {
ret = tsm_symbol_table_new(&tbl);
if (ret) {
if (size)
*size = 1;
return &tsm_symbol_default;
}
tsm_symbol_table_default = tbl;
}
idx = *sym - (TSM_UCS4_MAX + 1);
if (idx >= shl_array_get_length(tbl->index))
ucs4 = NULL;
else
ucs4 = *SHL_ARRAY_AT(tbl->index, uint32_t*, idx);
if (!ucs4) {
if (size)
*size = 1;
return &tsm_symbol_default;
}
if (size) {
*size = 0;
while (ucs4[*size] <= TSM_UCS4_MAX)
++*size;
}
return ucs4;
}
tsm_symbol_t tsm_symbol_append(struct tsm_symbol_table *tbl,
tsm_symbol_t sym, uint32_t ucs4)
{
uint32_t buf[TSM_UCS4_MAXLEN + 1], nsym, *nval;
const uint32_t *ptr;
size_t s;
void *tmp;
bool res;
int ret;
if (!tbl)
tbl = tsm_symbol_table_default;
if (!tbl) {
ret = tsm_symbol_table_new(&tbl);
if (ret)
return sym;
tsm_symbol_table_default = tbl;
}
if (ucs4 > TSM_UCS4_MAX)
return sym;
ptr = tsm_symbol_get(tbl, &sym, &s);
if (s >= TSM_UCS4_MAXLEN)
return sym;
memcpy(buf, ptr, s * sizeof(uint32_t));
buf[s++] = ucs4;
buf[s++] = TSM_UCS4_MAX + 1;
res = shl_hashtable_find(tbl->symbols, &tmp, buf);
if (res)
return (uint32_t)(long)tmp;
nval = malloc(sizeof(uint32_t) * s);
if (!nval)
return sym;
memcpy(nval, buf, s * sizeof(uint32_t));
nsym = tbl->next_id + 1;
/* Out of IDs; we actually have 2 Billion IDs so this seems
* very unlikely but lets be safe here */
if (nsym <= tbl->next_id++)
goto err_id;
ret = shl_hashtable_insert(tbl->symbols, nval, (void*)(long)nsym);
if (ret)
goto err_id;
ret = shl_array_push(tbl->index, &nval);
if (ret)
goto err_symbol;
return nsym;
err_symbol:
shl_hashtable_remove(tbl->symbols, nval);
err_id:
--tbl->next_id;
free(nval);
return sym;
}
/*
* Convert UCS4 character to UTF-8. This creates one of:
* 0xxxxxxx
* 110xxxxx 10xxxxxx
* 1110xxxx 10xxxxxx 10xxxxxx
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* This is based on the same function from "terminology" from the Enlightenment
* project. See COPYING for more information.
*
* @txt must point to a 4 byte-buffer. A number between 0 and 4 is returned and
* indicates how long the written UTF8 string is.
*
* Please note @g is a real UCS4 code and not a tsm_symbol_t object!
*
* Unicode symbols between 0xD800 and 0xDFFF are not assigned and reserved for
* UTF16 compatibility. It is an error to encode them. Same applies to numbers
* greater than 0x10FFFF, the range 0xFDD0-0xFDEF and codepoints ending with
* 0xFFFF or 0xFFFE.
*/
size_t tsm_ucs4_to_utf8(uint32_t g, char *txt)
{
if (g >= 0xd800 && g <= 0xdfff)
return 0;
if (g > 0x10ffff || (g & 0xffff) == 0xffff || (g & 0xffff) == 0xfffe)
return 0;
if (g >= 0xfdd0 && g <= 0xfdef)
return 0;
if (g < (1 << 7)) {
txt[0] = g & 0x7f;
return 1;
} else if (g < (1 << (5 + 6))) {
txt[0] = 0xc0 | ((g >> 6) & 0x1f);
txt[1] = 0x80 | ((g ) & 0x3f);
return 2;
} else if (g < (1 << (4 + 6 + 6))) {
txt[0] = 0xe0 | ((g >> 12) & 0x0f);
txt[1] = 0x80 | ((g >> 6) & 0x3f);
txt[2] = 0x80 | ((g ) & 0x3f);
return 3;
} else if (g < (1 << (3 + 6 + 6 + 6))) {
txt[0] = 0xf0 | ((g >> 18) & 0x07);
txt[1] = 0x80 | ((g >> 12) & 0x3f);
txt[2] = 0x80 | ((g >> 6) & 0x3f);
txt[3] = 0x80 | ((g ) & 0x3f);
return 4;
} else {
return 0;
}
}
char *tsm_ucs4_to_utf8_alloc(const uint32_t *ucs4, size_t len, size_t *len_out)
{
char *val;
size_t i, pos;
val = malloc(4 * len);
if (!val)
return NULL;
pos = 0;
for (i = 0; i < len; ++i)
pos += tsm_ucs4_to_utf8(ucs4[i], &val[pos]);
if (!pos) {
free(val);
return NULL;
}
if (len_out)
*len_out = pos;
return val;
}
/*
* UTF8 State Machine
* This state machine parses UTF8 and converts it into a stream of Unicode
* characters (UCS4 values). A state-machine is represented by a
* "struct tsm_utf8_mach" object. It has no global state and all functions are
* re-entrant if called with different state-machine objects.
*
* tsm_utf8_mach_new(): This creates a new state-machine and resets it to its
* initial state. Returns 0 on success.
*
* tsm_uft8_mach_free(): This destroys a state-machine and frees all internally
* allocated memory.
*
* tsm_utf8_mach_reset(): Reset a given state-machine to its initial state. This
* is the same state the machine is in after it got created.
*
* tsm_uft8_mach_feed(): Feed one byte of the UTF8 input stream into the
* state-machine. This function returns the new state of the state-machine after
* this character has been parsed. If it is TSM_UTF8_ACCEPT or TSM_UTF8_REJECT,
* then there is a pending UCS4 character that you should retrieve via
* tsm_utf8_mach_get(). If it is TSM_UTF8_ACCEPT, then a character was
* successfully parsed. If it is TSM_UTF8_REJECT, the input was invalid UTF8 and
* some error recovery was tried or a replacement character was choosen. All
* other states mean that the machine needs more input to parse the stream.
*
* tsm_utf8_mach_get(): Returns the last parsed character. It has no effect on
* the state machine so you can call it multiple times.
*
* Internally, we use TSM_UTF8_START whenever the state-machine is reset. This
* can be used to ignore the last read input or to simply reset the machine.
* TSM_UTF8_EXPECT* is used to remember how many bytes are still to be read to
* get a full UTF8 sequence.
* If an error occurs during reading, we go to state TSM_UTF8_REJECT and the
* user will read a replacement character. If further errors occur, we go to
* state TSM_UTF8_START to avoid printing multiple replacement characters for a
* single misinterpreted UTF8 sequence. However, under some circumstances it may
* happen that we stay in TSM_UTF8_REJECT and a next replacement character is
* returned.
* It is difficult to decide how to interpret wrong input but this machine seems
* to be quite good at deciding what to do. Generally, we prefer discarding or
* replacing input instead of trying to decipher ASCII values from the invalid
* data. This guarantees that we do not send wrong values to the terminal
* emulator. Some might argue that an ASCII fallback would be better. However,
* this means that we might send very weird escape-sequences to the VTE layer.
* Especially with C1 codes applications can really break many terminal features
* so we avoid any non-ASCII+non-UTF8 input to prevent this.
*/
struct tsm_utf8_mach {
int state;
uint32_t ch;
};
int tsm_utf8_mach_new(struct tsm_utf8_mach **out)
{
struct tsm_utf8_mach *mach;
if (!out)
return -EINVAL;
mach = malloc(sizeof(*mach));
if (!mach)
return -ENOMEM;
memset(mach, 0, sizeof(*mach));
mach->state = TSM_UTF8_START;
*out = mach;
return 0;
}
void tsm_utf8_mach_free(struct tsm_utf8_mach *mach)
{
if (!mach)
return;
free(mach);
}
int tsm_utf8_mach_feed(struct tsm_utf8_mach *mach, char ci)
{
uint32_t c;
if (!mach)
return TSM_UTF8_START;
c = ci;
switch (mach->state) {
case TSM_UTF8_START:
case TSM_UTF8_ACCEPT:
case TSM_UTF8_REJECT:
if (c == 0xC0 || c == 0xC1) {
/* overlong encoding for ASCII, reject */
mach->state = TSM_UTF8_REJECT;
} else if ((c & 0x80) == 0) {
/* single byte, accept */
mach->ch = c;
mach->state = TSM_UTF8_ACCEPT;
} else if ((c & 0xC0) == 0x80) {
/* parser out of sync, ignore byte */
mach->state = TSM_UTF8_START;
} else if ((c & 0xE0) == 0xC0) {
/* start of two byte sequence */
mach->ch = (c & 0x1F) << 6;
mach->state = TSM_UTF8_EXPECT1;
} else if ((c & 0xF0) == 0xE0) {
/* start of three byte sequence */
mach->ch = (c & 0x0F) << 12;
mach->state = TSM_UTF8_EXPECT2;
} else if ((c & 0xF8) == 0xF0) {
/* start of four byte sequence */
mach->ch = (c & 0x07) << 18;
mach->state = TSM_UTF8_EXPECT3;
} else {
/* overlong encoding, reject */
mach->state = TSM_UTF8_REJECT;
}
break;
case TSM_UTF8_EXPECT3:
mach->ch |= (c & 0x3F) << 12;
if ((c & 0xC0) == 0x80)
mach->state = TSM_UTF8_EXPECT2;
else
mach->state = TSM_UTF8_REJECT;
break;
case TSM_UTF8_EXPECT2:
mach->ch |= (c & 0x3F) << 6;
if ((c & 0xC0) == 0x80)
mach->state = TSM_UTF8_EXPECT1;
else
mach->state = TSM_UTF8_REJECT;
break;
case TSM_UTF8_EXPECT1:
mach->ch |= c & 0x3F;
if ((c & 0xC0) == 0x80)
mach->state = TSM_UTF8_ACCEPT;
else
mach->state = TSM_UTF8_REJECT;
break;
default:
mach->state = TSM_UTF8_REJECT;
break;
}
return mach->state;
}
uint32_t tsm_utf8_mach_get(struct tsm_utf8_mach *mach)
{
if (!mach || mach->state != TSM_UTF8_ACCEPT)
return TSM_UCS4_REPLACEMENT;
return mach->ch;
}
void tsm_utf8_mach_reset(struct tsm_utf8_mach *mach)
{
if (!mach)
return;
mach->state = TSM_UTF8_START;
}