/*
 * unicode: add utf8 state machine
 *
 * Origin: commit 046d3979877eabfd814ed14b5dda7cf27dba2aa7
 * Author: David Herrmann, Sun, 29 Jan 2012 14:19:49 +0100
 * Signed-off-by: David Herrmann
 * Files:  src/unicode.c (implementation), src/unicode.h (declarations)
 *
 * The state machine is used to convert a stream of UTF8 data into UCS4
 * characters. It is slightly based on the machine found in the
 * wayland-compositor demos.
 */

/*
 * This kmscon-utf8-state-machine is based on the wayland-compositor demos:
 *
 * Copyright © 2008 Kristian Høgsberg
 *
 * Permission to use, copy, modify, distribute, and sell this software and
 * its documentation for any purpose is hereby granted without fee, provided
 * that the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software
 * without specific, written prior permission. The copyright holders make
 * no representations about the suitability of this software for any
 * purpose. It is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
 * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
 * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#define KMSCON_UCS4_MAXLEN 10
#define KMSCON_UCS4_MAX 0x7fffffffUL
/* Value returned by kmscon_utf8_mach_get() when no character is available. */
#define KMSCON_UCS4_INVALID 0xfffd

/* utf8 state machine */

struct kmscon_utf8_mach;

/*
 * States reported by kmscon_utf8_mach_feed(). EXPECTn means that n more
 * continuation bytes are required to complete the current sequence.
 */
enum kmscon_utf8_mach_state {
	KMSCON_UTF8_START,
	KMSCON_UTF8_ACCEPT,
	KMSCON_UTF8_REJECT,
	KMSCON_UTF8_EXPECT1,
	KMSCON_UTF8_EXPECT2,
	KMSCON_UTF8_EXPECT3,
};

int kmscon_utf8_mach_new(struct kmscon_utf8_mach **out);
void kmscon_utf8_mach_free(struct kmscon_utf8_mach *mach);

int kmscon_utf8_mach_feed(struct kmscon_utf8_mach *mach, char c);
uint32_t kmscon_utf8_mach_get(struct kmscon_utf8_mach *mach);

/* Decoder context: current state plus the partially assembled character. */
struct kmscon_utf8_mach {
	int state;
	uint32_t ch;
};

/*
 * Allocate a new state machine and store it in *out.
 *
 * Returns 0 on success, -EINVAL if out is NULL and -ENOMEM if the
 * allocation fails. On success the caller owns the object and releases it
 * with kmscon_utf8_mach_free().
 */
int kmscon_utf8_mach_new(struct kmscon_utf8_mach **out)
{
	struct kmscon_utf8_mach *mach;

	if (!out)
		return -EINVAL;

	mach = calloc(1, sizeof(*mach));
	if (!mach)
		return -ENOMEM;

	mach->state = KMSCON_UTF8_START;

	*out = mach;
	return 0;
}

/* Release a state machine; passing NULL is a no-op. */
void kmscon_utf8_mach_free(struct kmscon_utf8_mach *mach)
{
	free(mach);
}

/*
 * Consume one continuation byte (10xxxxxx) and merge its six payload bits
 * into the character at bit position shift.
 *
 * Returns next if the byte is a valid continuation byte, otherwise
 * KMSCON_UTF8_REJECT (the pending character is left untouched).
 */
static int utf8_mach_continue(struct kmscon_utf8_mach *mach, uint32_t c,
			      unsigned int shift, int next)
{
	if ((c & 0xC0) != 0x80)
		return KMSCON_UTF8_REJECT;

	mach->ch |= (c & 0x3F) << shift;
	return next;
}

/*
 * Feed one byte of the input stream into the machine.
 *
 * Returns the new state (enum kmscon_utf8_mach_state). Once
 * KMSCON_UTF8_ACCEPT is returned the decoded character can be fetched with
 * kmscon_utf8_mach_get(). If mach is NULL, KMSCON_UTF8_START is returned.
 *
 * A byte that causes KMSCON_UTF8_REJECT is consumed; the next byte starts
 * a fresh sequence.
 */
int kmscon_utf8_mach_feed(struct kmscon_utf8_mach *mach, char ci)
{
	uint32_t c;

	if (!mach)
		return KMSCON_UTF8_START;

	/*
	 * Go through unsigned char first: where plain char is signed, bytes
	 * >= 0x80 would otherwise sign-extend to 0xFFFFFFxx and the equality
	 * checks against 0xC0/0xC1 below could never match.
	 */
	c = (unsigned char)ci;

	switch (mach->state) {
	case KMSCON_UTF8_START:
	case KMSCON_UTF8_ACCEPT:
	case KMSCON_UTF8_REJECT:
		if (c == 0xC0 || c == 0xC1) {
			/* overlong encoding for ASCII, reject */
			mach->state = KMSCON_UTF8_REJECT;
		} else if ((c & 0x80) == 0) {
			/* single byte, accept */
			mach->ch = c;
			mach->state = KMSCON_UTF8_ACCEPT;
		} else if ((c & 0xC0) == 0x80) {
			/* parser out of sync, ignore byte */
			mach->state = KMSCON_UTF8_START;
		} else if ((c & 0xE0) == 0xC0) {
			/* start of two byte sequence */
			mach->ch = (c & 0x1F) << 6;
			mach->state = KMSCON_UTF8_EXPECT1;
		} else if ((c & 0xF0) == 0xE0) {
			/* start of three byte sequence */
			mach->ch = (c & 0x0F) << 12;
			mach->state = KMSCON_UTF8_EXPECT2;
		} else if ((c & 0xF8) == 0xF0) {
			/* start of four byte sequence */
			mach->ch = (c & 0x07) << 18;
			mach->state = KMSCON_UTF8_EXPECT3;
		} else {
			/* 0xF8..0xFF never start a sequence, reject */
			mach->state = KMSCON_UTF8_REJECT;
		}
		break;
	case KMSCON_UTF8_EXPECT3:
		mach->state = utf8_mach_continue(mach, c, 12,
						 KMSCON_UTF8_EXPECT2);
		break;
	case KMSCON_UTF8_EXPECT2:
		mach->state = utf8_mach_continue(mach, c, 6,
						 KMSCON_UTF8_EXPECT1);
		break;
	case KMSCON_UTF8_EXPECT1:
		mach->state = utf8_mach_continue(mach, c, 0,
						 KMSCON_UTF8_ACCEPT);
		break;
	default:
		mach->state = KMSCON_UTF8_REJECT;
		break;
	}

	return mach->state;
}

/*
 * Return the character decoded by the last kmscon_utf8_mach_feed() call.
 *
 * Returns KMSCON_UCS4_INVALID if mach is NULL or the machine is not in the
 * KMSCON_UTF8_ACCEPT state.
 */
uint32_t kmscon_utf8_mach_get(struct kmscon_utf8_mach *mach)
{
	if (!mach || mach->state != KMSCON_UTF8_ACCEPT)
		return KMSCON_UCS4_INVALID;

	return mach->ch;
}