From 17a56a24f2cfc1ea781fa51a334c21434804cc64 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sun, 30 Sep 2012 17:59:36 +0200 Subject: [PATCH] tsm: unicode: do not encode invalid UTF8 We must under all conditions avoid encoding invalid UTF8. Otherwise, we would rely on other applications to do error-recovery. Unfortunately, this is not a syntactical change but a semantic fix as the Unicode standard defines several codepoints which are invalid or which must never be used in UTF8. See the Unicode standard if you're interested in these codepoint ranges. Signed-off-by: David Herrmann --- src/tsm_unicode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/tsm_unicode.c b/src/tsm_unicode.c index 7b54cea..914c6aa 100644 --- a/src/tsm_unicode.c +++ b/src/tsm_unicode.c @@ -344,10 +344,22 @@ err_id: * indicates how long the written UTF8 string is. * * Please note @g is a real UCS4 code and not a tsm_symbol_t object! + * + * Unicode symbols between 0xD800 and 0xDFFF are not assigned and reserved for + * UTF16 compatibility. It is an error to encode them. Same applies to numbers + * greater than 0x10FFFF, the range 0xFDD0-0xFDEF and codepoints ending with + * 0xFFFF or 0xFFFE. */ size_t tsm_ucs4_to_utf8(uint32_t g, char *txt) { + if (g >= 0xd800 && g <= 0xdfff) + return 0; + if (g > 0x10ffff || (g & 0xffff) == 0xffff || (g & 0xffff) == 0xfffe) + return 0; + if (g >= 0xfdd0 && g <= 0xfdef) + return 0; + if (g < (1 << 7)) { txt[0] = g & 0x7f; return 1;