From e297debdbd5b8abdd3f71c8fcc4f9a76445e54cf Mon Sep 17 00:00:00 2001
From: Kubat <mael.martin31@gmail.com>
Date: Mon, 18 Jan 2021 11:41:50 +0100
Subject: [PATCH] JSON: Decode strings correctly

---
Review notes (this section is ignored by git-am):
- Line breaks and indentation were rebuilt from a whitespace-collapsed
  copy of this patch; if the context does not match the tree, apply with
  `git apply --ignore-whitespace`.
- Changed relative to that copy: the range table is walked by element
  count instead of sizeof(utf8) in bytes, and __decode_json re-terminates
  the string and steps past the decoded bytes after every escape. The blob
  id after ".." on the index line predates these changes.
- Not addressed: \ud800-\udfff escapes are rejected by the encoder, so
  they are removed from the string instead of being combined into pairs.

 src/base/json.c | 112 +++++++++++++++++++++++++++++++------------------
 1 file changed, 71 insertions(+), 41 deletions(-)

diff --git a/src/base/json.c b/src/base/json.c
index 70f6b2bd..9423ed41 100644
--- a/src/base/json.c
+++ b/src/base/json.c
@@ -99,30 +99,53 @@ __hex_val(const char *c, uint32_t *result)
     return 0;
 }
 
-static inline int
-__decode_codepoint(uint32_t codepoint)
+/* Inclusive code point ranges, indexed by UTF-8 encoded length in bytes.
+ * Slot 0 is not a length: it lists the UTF-16 surrogates, which must be
+ * rejected, and is tested first because it overlaps the 3-byte range. */
+static const struct {
+    int32_t lo;
+    int32_t hi;
+} utf8[] = {
+    { 0xd800, 0xdfff }, /* forbidden range: surrogates */
+    { 0, 0x7f },
+    { 0x80, 0x7ff },
+    { 0x800, 0xffff },
+    { 0x10000, 0x10ffff }
+};
+
+/* Encode `codepoint` as UTF-8 into `p`, which must have room for 4 bytes.
+ * No terminator is written. Returns the number of bytes stored (1 to 4),
+ * or 0 when the code point is a surrogate or lies outside of Unicode. */
+static inline size_t
+__encode_codepoint(char p[4], int32_t codepoint)
 {
-    union __utf8_codepoint {
-        uint32_t codepoint;
-        char chars[4];
-    };
-
-    union __utf8_codepoint s = { .codepoint = codepoint };
-
-    /* Count # of leading 1 bits */
-    int k = s.chars[0] ? __builtin_clz(~(s.chars[0] << 24)) : 0;
-    /* All 1's with k leading 0's */
-    int mask = (1 << (8 - k)) - 1;
-    int value = s.chars[0] & mask;
-    int i = 1;
-
-    /* NOTE: k = #total bytes, and #total bytes <= 4 */
-    for (--k; k > 0 && i < 4; --k, ++i) {
-        value <<= 6;
-        value += (s.chars[i] & 0x3F);
+    /* sizeof(utf8) is a byte count: divide to get the number of ranges */
+    const size_t ranges = sizeof(utf8) / sizeof(utf8[0]);
+    size_t i, n = 0; /* number of bytes to be written */
+
+    while (n < ranges) {
+        if (utf8[n].lo <= codepoint && codepoint <= utf8[n].hi)
+            break;
+        n++;
     }
 
-    return value;
+    if (n < 1 || n >= ranges) {
+        /* Invalid! */
+        return 0;
+    }
+
+    if (n == 1)
+        p[0] = codepoint;
+
+    else {
+        for (i = n - 1; i > 0; i--) {
+            p[i] = 0x80 | (0x3f & codepoint);
+            codepoint >>= 6;
+        }
+        p[0] = (0xf00 >> n) | codepoint;
+    }
+
+    return n;
 }
 
 static inline void
@@ -130,27 +153,26 @@ __decode_json(char *string, size_t len)
 {
     char *offset = string;
     int size = 0;
-    char *target = NULL;
+    size_t written = 0;
     uint32_t codepoint = 0;
+    char *string_end = string + len;
 
     while ((offset = strchr(offset, '\\'))) {
-        len = len - (offset - string);
-        target = NULL;
-
         switch (offset[1]) {
-#define __case_single(c, sub) \
-        case c: \
-            size = 2; \
-            target = sub; \
+#define __case_single(c, sub)   \
+        case c:                 \
+            size = 2;           \
+            written = 1;        \
+            offset[0] = sub;    \
             break;
-        __case_single('"', "\"")
-        __case_single('\\', "\\")
-        __case_single('/', "/" )
-        __case_single('f', "\f")
-        __case_single('b', "\b")
-        __case_single('n', "\n")
-        __case_single('r', "\r")
-        __case_single('t', "\t")
+        __case_single('"', '\"')
+        __case_single('\\', '\\')
+        __case_single('/', '/' )
+        __case_single('f', '\f')
+        __case_single('b', '\b')
+        __case_single('n', '\n')
+        __case_single('r', '\r')
+        __case_single('t', '\t')
 #undef __case_single
 
         /* UTF-8 decode, 4 hex digits */
@@ -159,9 +181,9 @@ __decode_json(char *string, size_t len)
                 LOG_ERROR("JSON", "Invalid unicode string, abort decoding");
                 return;
             }
-            size = 5; /* u + 4 hex digits */
+            size = 6; /* \u + 4 hex digits */
             __hex_val(&offset[2], &codepoint);
-            printf("\\u%04x => %lc\n", codepoint, __decode_codepoint(codepoint));
+            written = __encode_codepoint(offset, codepoint);
             break;
 
         /* Oups */
@@ -170,9 +192,17 @@ __decode_json(char *string, size_t len)
             return;
         }
 
-        /* Simple, replace <size> characters by <target> from 'offset' */
-
+        /* Close the gap: the escape took <size> bytes, its decoded form
+         * only <written>. Re-terminate at once so that strchr never scans
+         * the stale bytes left behind the shortened string. */
+        memmove(&offset[written], &offset[size], (string_end - &offset[size]) * sizeof(char));
+        string_end -= (size - written);
+        string_end[0] = '\0';
+
+        /* Resume after the decoded bytes: a decoded '\\' is not an escape */
+        offset += written;
     }
+    string_end[0] = '\0';
 }
 
 LKT_PUSH
-- 
GitLab