/* * kmscon - Unicode Handling * * Copyright (c) 2011 David Herrmann * Copyright (c) 2011-2012 University of Tuebingen * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files * (the "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* * The tsm-utf8-state-machine is based on the wayland-compositor demos: * * Copyright © 2008 Kristian Høgsberg * * Permission to use, copy, modify, distribute, and sell this software and * its documentation for any purpose is hereby granted without fee, provided * that the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting * documentation, and that the name of the copyright holders not be used in * advertising or publicity pertaining to distribution of the software * without specific, written prior permission. The copyright holders make * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Unicode Helpers * This implements several helpers for Unicode/UTF8/UCS4 input and output. See * below for comments on each helper. */ #include #include #include #include #include #include "log.h" #include "shl_array.h" #include "static_misc.h" #include "unicode.h" #define LOG_SUBSYSTEM "unicode" /* * Unicode Symbol Handling * The main goal of the tsm_symbol_* functions is to provide a datatype which * can contain the representation of any printable character. This includes all * basic Unicode characters but also combined characters. * To avoid all the memory management we still represent a character as a single * integer value (tsm_symbol_t) but internally we allocate a string which is * represented by this value. * * A tsm_symbol_t is an integer which represents a single character point. * For most Unicode characters this is simply the UCS4 representation. In fact, * every UCS4 characters is a valid tsm_symbol_t object. * However, Unicode standard allows combining marks. Therefore, some characters * consists of more than one Unicode character. * A global symbol-table provides all those combined characters as single * integers. You simply create a valid base character and append your combining * marks and the table will return a new valid tsm_symbol_t. It is no longer * a valid UCS4 value, though. But no memory management is needed as all * tsm_symbol_t objects are simple integers. * * The symbol table contains two-way * references. The Hash Table contains all the symbols with the symbol ucs4 * string as key and the symbol ID as value. * The index array contains the symbol ID as key and a pointer to the ucs4 * string as value. But the hash table owns the ucs4 string. * This allows fast implementations of *_get() and *_append() without long * search intervals. * * When creating a new symbol, we simply return the UCS4 value as new symbol. We * do not add it to our symbol table as it is only one character. However, if a * character is appended to an existing symbol, we create a new ucs4 string and * push the new symbol into the symbol table. */ const tsm_symbol_t tsm_symbol_default = 0; static const char default_u8[] = { 0 }; static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER; static uint32_t table_next_id; static struct shl_array *table_index; static struct kmscon_hashtable *table_symbols; static unsigned int hash_ucs4(const void *key) { unsigned int val = 5381; size_t i; const uint32_t *ucs4 = key; i = 0; while (ucs4[i] <= TSM_UCS4_MAX) { val = val * 33 + ucs4[i]; ++i; } return val; } static bool cmp_ucs4(const void *a, const void *b) { size_t i; const uint32_t *v1, *v2; v1 = a; v2 = b; i = 0; while (1) { if (v1[i] > TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX) return true; if (v1[i] > TSM_UCS4_MAX && v2[i] <= TSM_UCS4_MAX) return false; if (v1[i] <= TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX) return false; if (v1[i] != v2[i]) return false; ++i; } } static int table__init() { static const uint32_t *val = NULL; /* we need a valid lvalue */ int ret; if (table_symbols) return 0; table_next_id = TSM_UCS4_MAX + 2; ret = shl_array_new(&table_index, sizeof(uint32_t*), 4); if (ret) { log_err("cannot allocate table-index"); return ret; } /* first entry is not used so add dummy */ shl_array_push(table_index, &val); ret = kmscon_hashtable_new(&table_symbols, hash_ucs4, cmp_ucs4, free, NULL); if (ret) { shl_array_free(table_index); return -ENOMEM; } return 0; } tsm_symbol_t tsm_symbol_make(uint32_t ucs4) { if (ucs4 > TSM_UCS4_MAX) { log_warn("invalid ucs4 character"); return 0; } else { return ucs4; } } /* * This decomposes a symbol into a ucs4 string and a size value. If \sym is a * valid UCS4 character, this returns a pointer to \sym and writes 1 into \size. * Therefore, the returned value may get destroyed if your \sym argument gets * destroyed. * If \sym is a composed ucs4 string, then the returned value points into the * hash table of the symbol table and lives as long as the symbol table does. * * This always returns a valid value. If an error happens, the default character * is returned. If \size is NULL, then the size value is omitted. */ static const uint32_t *table__get(tsm_symbol_t *sym, size_t *size) { uint32_t *ucs4; if (*sym <= TSM_UCS4_MAX) { if (size) *size = 1; return sym; } if (table__init()) { if (size) *size = 1; return &tsm_symbol_default; } ucs4 = *SHL_ARRAY_AT(table_index, uint32_t*, *sym - (TSM_UCS4_MAX + 1)); if (!ucs4) { if (size) *size = 1; return &tsm_symbol_default; } if (size) { *size = 0; while (ucs4[*size] <= TSM_UCS4_MAX) ++*size; } return ucs4; } const uint32_t *tsm_symbol_get(tsm_symbol_t *sym, size_t *size) { const uint32_t *res; pthread_mutex_lock(&table_mutex); res = table__get(sym, size); pthread_mutex_unlock(&table_mutex); return res; } tsm_symbol_t tsm_symbol_append(tsm_symbol_t sym, uint32_t ucs4) { uint32_t buf[TSM_UCS4_MAXLEN + 1], nsym, *nval; const uint32_t *ptr; size_t s; tsm_symbol_t rsym; void *tmp; bool res; pthread_mutex_lock(&table_mutex); if (table__init()) { rsym = sym; goto unlock; } if (ucs4 > TSM_UCS4_MAX) { log_warn("invalid ucs4 character"); rsym = sym; goto unlock; } ptr = table__get(&sym, &s); if (s >= TSM_UCS4_MAXLEN) { rsym = sym; goto unlock; } memcpy(buf, ptr, s * sizeof(uint32_t)); buf[s++] = ucs4; buf[s++] = TSM_UCS4_MAX + 1; res = kmscon_hashtable_find(table_symbols, &tmp, buf); if (res) { rsym = (uint32_t)(long)tmp; goto unlock; } log_debug("adding new composed symbol"); nval = malloc(sizeof(uint32_t) * s); if (!nval) { rsym = sym; goto unlock; } memcpy(nval, buf, s * sizeof(uint32_t)); nsym = table_next_id++; kmscon_hashtable_insert(table_symbols, nval, (void*)(long)nsym); shl_array_push(table_index, &nval); rsym = nsym; unlock: pthread_mutex_unlock(&table_mutex); return rsym; } /* * Convert UCS4 character to UTF-8. This creates one of: * 0xxxxxxx * 110xxxxx 10xxxxxx * 1110xxxx 10xxxxxx 10xxxxxx * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * This is based on the same function from "terminology" from the Enlightenment * project. See COPYING for more information. */ static size_t ucs4_to_utf8(uint32_t g, char *txt) { if (g < (1 << 7)) { txt[0] = g & 0x7f; return 1; } else if (g < (1 << (5 + 6))) { txt[0] = 0xc0 | ((g >> 6) & 0x1f); txt[1] = 0x80 | ((g ) & 0x3f); return 2; } else if (g < (1 << (4 + 6 + 6))) { txt[0] = 0xe0 | ((g >> 12) & 0x0f); txt[1] = 0x80 | ((g >> 6) & 0x3f); txt[2] = 0x80 | ((g ) & 0x3f); return 3; } else if (g < (1 << (3 + 6 + 6 + 6))) { txt[0] = 0xf0 | ((g >> 18) & 0x07); txt[1] = 0x80 | ((g >> 12) & 0x3f); txt[2] = 0x80 | ((g >> 6) & 0x3f); txt[3] = 0x80 | ((g ) & 0x3f); return 4; } else { return 0; } } const char *tsm_symbol_get_u8(tsm_symbol_t sym, size_t *size) { const uint32_t *ucs4; char *val; size_t i, pos, len; ucs4 = tsm_symbol_get(&sym, &len); val = malloc(4 * len); if (!val) goto err_out; pos = 0; for (i = 0; i < len; ++i) pos += ucs4_to_utf8(ucs4[i], &val[pos]); if (!pos) goto err_out; if (size) *size = pos; return val; err_out: if (size) *size = sizeof(default_u8); return default_u8; } void tsm_symbol_free_u8(const char *s) { if (s != default_u8) free((void*)s); } /* * UTF8 State Machine * This state machine parses UTF8 and converts it into a stream of Unicode * characters (UCS4 values). A state-machine is represented by a * "struct tsm_utf8_mach" object. It has no global state and all functions are * re-entrant if called with different state-machine objects. * * tsm_utf8_mach_new(): This creates a new state-machine and resets it to its * initial state. Returns 0 on success. * * tsm_uft8_mach_free(): This destroys a state-machine and frees all internally * allocated memory. * * tsm_utf8_mach_reset(): Reset a given state-machine to its initial state. This * is the same state the machine is in after it got created. * * tsm_uft8_mach_feed(): Feed one byte of the UTF8 input stream into the * state-machine. This function returns the new state of the state-machine after * this character has been parsed. If it is TSM_UTF8_ACCEPT or TSM_UTF8_REJECT, * then there is a pending UCS4 character that you should retrieve via * tsm_utf8_mach_get(). If it is TSM_UTF8_ACCEPT, then a character was * successfully parsed. If it is TSM_UTF8_REJECT, the input was invalid UTF8 and * some error recovery was tried or a replacement character was choosen. All * other states mean that the machine needs more input to parse the stream. * * tsm_utf8_mach_get(): Returns the last parsed character. It has no effect on * the state machine so you can call it multiple times. * * Internally, we use TSM_UTF8_START whenever the state-machine is reset. This * can be used to ignore the last read input or to simply reset the machine. * TSM_UTF8_EXPECT* is used to remember how many bytes are still to be read to * get a full UTF8 sequence. * If an error occurs during reading, we go to state TSM_UTF8_REJECT and the * user will read a replacement character. If further errors occur, we go to * state TSM_UTF8_START to avoid printing multiple replacement characters for a * single misinterpreted UTF8 sequence. However, under some circumstances it may * happen that we stay in TSM_UTF8_REJECT and a next replacement character is * returned. * It is difficult to decide how to interpret wrong input but this machine seems * to be quite good at deciding what to do. Generally, we prefer discarding or * replacing input instead of trying to decipher ASCII values from the invalid * data. This guarantees that we do not send wrong values to the terminal * emulator. Some might argue that an ASCII fallback would be better. However, * this means that we might send very weird escape-sequences to the VTE layer. * Especially with C1 codes applications can really break many terminal features * so we avoid any non-ASCII+non-UTF8 input to prevent this. */ struct tsm_utf8_mach { int state; uint32_t ch; }; int tsm_utf8_mach_new(struct tsm_utf8_mach **out) { struct tsm_utf8_mach *mach; if (!out) return -EINVAL; mach = malloc(sizeof(*mach)); if (!mach) return -ENOMEM; memset(mach, 0, sizeof(*mach)); mach->state = TSM_UTF8_START; *out = mach; return 0; } void tsm_utf8_mach_free(struct tsm_utf8_mach *mach) { if (!mach) return; free(mach); } int tsm_utf8_mach_feed(struct tsm_utf8_mach *mach, char ci) { uint32_t c; if (!mach) return TSM_UTF8_START; c = ci; switch (mach->state) { case TSM_UTF8_START: case TSM_UTF8_ACCEPT: case TSM_UTF8_REJECT: if (c == 0xC0 || c == 0xC1) { /* overlong encoding for ASCII, reject */ mach->state = TSM_UTF8_REJECT; } else if ((c & 0x80) == 0) { /* single byte, accept */ mach->ch = c; mach->state = TSM_UTF8_ACCEPT; } else if ((c & 0xC0) == 0x80) { /* parser out of sync, ignore byte */ mach->state = TSM_UTF8_START; } else if ((c & 0xE0) == 0xC0) { /* start of two byte sequence */ mach->ch = (c & 0x1F) << 6; mach->state = TSM_UTF8_EXPECT1; } else if ((c & 0xF0) == 0xE0) { /* start of three byte sequence */ mach->ch = (c & 0x0F) << 12; mach->state = TSM_UTF8_EXPECT2; } else if ((c & 0xF8) == 0xF0) { /* start of four byte sequence */ mach->ch = (c & 0x07) << 18; mach->state = TSM_UTF8_EXPECT3; } else { /* overlong encoding, reject */ mach->state = TSM_UTF8_REJECT; } break; case TSM_UTF8_EXPECT3: mach->ch |= (c & 0x3F) << 12; if ((c & 0xC0) == 0x80) mach->state = TSM_UTF8_EXPECT2; else mach->state = TSM_UTF8_REJECT; break; case TSM_UTF8_EXPECT2: mach->ch |= (c & 0x3F) << 6; if ((c & 0xC0) == 0x80) mach->state = TSM_UTF8_EXPECT1; else mach->state = TSM_UTF8_REJECT; break; case TSM_UTF8_EXPECT1: mach->ch |= c & 0x3F; if ((c & 0xC0) == 0x80) mach->state = TSM_UTF8_ACCEPT; else mach->state = TSM_UTF8_REJECT; break; default: mach->state = TSM_UTF8_REJECT; break; } return mach->state; } uint32_t tsm_utf8_mach_get(struct tsm_utf8_mach *mach) { if (!mach || mach->state != TSM_UTF8_ACCEPT) return TSM_UCS4_REPLACEMENT; return mach->ch; } void tsm_utf8_mach_reset(struct tsm_utf8_mach *mach) { if (!mach) return; mach->state = TSM_UTF8_START; }