From 40080adbf1e432fb05b6af0ae8002e4459c6726d Mon Sep 17 00:00:00 2001 From: Kubat <mael.martin31@gmail.com> Date: Mon, 18 Jan 2021 10:58:47 +0100 Subject: [PATCH] JSON: Almost decode utf-8 in json parser... --- inc/lektor/common.h | 20 ++++ src/base/json.c | 233 ++++++++++++++++++++++++++++++-------------- src/main/server.c | 5 + 3 files changed, 183 insertions(+), 75 deletions(-) diff --git a/inc/lektor/common.h b/inc/lektor/common.h index 21f6076d..2e26a249 100644 --- a/inc/lektor/common.h +++ b/inc/lektor/common.h @@ -5,6 +5,26 @@ #include <stdint.h> #include <stdlib.h> +#if defined(__clang__) /* Per-compiler _Pragma wrappers: LKT_PUSH and LKT_POP save and restore the diagnostic state, LKT_IGNORE_WARN_FMT_SECURITY disables -Wformat-security in between; MSVC and unknown compilers stop the build with #error. */ + #define LKT_COMPILER "clang" + #define LKT_PUSH _Pragma("clang diagnostic push") + #define LKT_IGNORE_WARN_FMT_SECURITY _Pragma("clang diagnostic ignored \"-Wformat-security\"") + #define LKT_POP _Pragma("clang diagnostic pop") + +#elif defined(__GNUC__) || defined(__GNUG__) + #define LKT_COMPILER "GCC" + #define LKT_PUSH _Pragma("GCC diagnostic push") + #define LKT_IGNORE_WARN_FMT_SECURITY _Pragma("GCC diagnostic ignored \"-Wformat-security\"") + #define LKT_POP _Pragma("GCC diagnostic pop") + +#elif defined(_MSC_VER) + #error "MS compiler not supported" + +#else + #error "Unknown compiler" + +#endif + /* Max value for any buffer, to not squash the stack.
*/ #define BUFFER_MAX 4096 diff --git a/src/base/json.c b/src/base/json.c index 976854f4..70f6b2bd 100644 --- a/src/base/json.c +++ b/src/base/json.c @@ -14,97 +14,176 @@ #define __SKIP_JSON(str) (str = &str[strspn(str, __JSON_SPACE)]) -#define __NEXT_JSON(str, len, dest) \ -{ \ - int is_paren = 0; \ - if (str[0] == __JSON_SEP) { \ - ++str; \ - const char *begin = str, *end; \ - for (;;) { \ - end = strchr(begin, __JSON_SEP); \ - if (*(end - 1) == '\\') { \ - begin = end + 1; \ - continue; \ - } \ - break; \ - } \ - len = (end - str); \ - is_paren = 1; \ - } else \ - len = strcspn(str, __JSON_SPACE __JSON_END); \ - if (level == asked_level && len < LKT_LINE_MAX - 1) \ - strncpy(dest, str, len); \ - str += len; \ - /* Also decode '\"' => '"' */ \ - if (is_paren) { \ - ++str; \ - __replace(dest, "\\\"", "\""); \ - } \ +#define __NEXT_JSON(str, len, dest, tmp, tmp_len) /* Reads the token at str into len: a __JSON_SEP-quoted run (a separator preceded by a backslash does not end it) or a bare run up to __JSON_SPACE or __JSON_END. When level == asked_level, at most LKT_LINE_MAX - 1 bytes go to dest through strncpy and are then passed to __decode_json. NOTE(review): tmp and tmp_len are never referenced in this body, and the strchr result is dereferenced without a NULL check - confirm both. */ \ +{ \ + int is_paren = 0; \ + if (str[0] == __JSON_SEP) { \ + ++str; \ + const char *begin = str, *end; \ + for (;;) { \ + end = strchr(begin, __JSON_SEP); \ + if (*(end - 1) == '\\') { \ + begin = end + 1; \ + continue; \ + } \ + break; \ + } \ + len = (end - str); \ + is_paren = 1; \ + } else \ + len = strcspn(str, __JSON_SPACE __JSON_END); \ + if (level == asked_level) { \ + size_t local_len = LKT_LINE_MAX - 1 < len ?
LKT_LINE_MAX - 1 : len; \ + strncpy(dest, str, local_len); \ + __decode_json(dest, local_len); \ + LOG_DEBUG("JSON", "Set " #dest " to %s", dest); \ + } \ + str += len; \ + if (is_paren) { \ + ++str; \ + } \ } -#define __SKIP_NEXT_JSON(str, len) \ -{ \ - int is_paren = 0; \ - if (str[0] == __JSON_SEP) { \ - ++str; \ - const char *begin = str, *end; \ - for (;;) { \ - end = strchr(begin, __JSON_SEP); \ - if (*(end - 1) == '\\') { \ - begin = end + 1; \ - continue; \ - } \ - break; \ - } \ - len = (end - str); \ - is_paren = 1; \ - } else \ - len = strcspn(str, __JSON_SPACE __JSON_BEGIN __JSON_END); \ - str += len; \ - /* Also decode '\"' => '"' */ \ - if (is_paren) { \ - ++str; \ - } \ +#define __SKIP_NEXT_JSON(str, len) \ +{ \ + int is_paren = 0; \ + if (str[0] == __JSON_SEP) { \ + ++str; \ + const char *begin = str, *end; \ + for (;;) { \ + end = strchr(begin, __JSON_SEP); \ + if (*(end - 1) == '\\') { \ + begin = end + 1; \ + continue; \ + } \ + break; \ + } \ + len = (end - str); \ + is_paren = 1; \ + } else \ + len = strcspn(str, __JSON_SPACE __JSON_BEGIN __JSON_END); \ + str += len; \ + if (is_paren) { \ + ++str; \ + } \ } -/* WARN: strlen(from) >= strlen(to) */ static inline int -__replace(char *string, const char *from, const char *to) +__hex_digit(const char c) /* Value 0..15 of one hexadecimal digit character, either case; -1 for any other character. */ { - if (strlen(from) < strlen(to)) { - LOG_ERROR("JSON", "The size of the 'to' string must be inferior " - "or equal to the size of the 'from' string"); - return 1; + if ('0' <= c && c <= '9') + return c - '0'; + if ('a' <= c && c <= 'f') + return c - 'a' + 10; + if ('A' <= c && c <= 'F') + return c - 'A' + 10; + return -1; +} + +static inline int +__hex_val(const char *c, uint32_t *result) +{ /* Parses exactly four hex digits starting at c into *result, most significant first. Returns 0 on success and 1 at the first non-hex character; *result has already been shifted at that point, so it holds a partial value on failure. */ + const char *p; + const size_t size = 4; + int digit; + *result = 0; + + for (p = c; (size_t) (p - c) < size; ++p) { + *result <<= 4; + digit = __hex_digit(*p); + if (digit < 0 || digit > 15) + return 1; + *result |= (uint8_t) digit; } - if (strstr(to, from)) { - LOG_ERROR("JSON", "At least one string '%s' has
been found in '%s', exit " - "before looping until the end of the universe", from, to); - return 1; + return 0; +} + +static inline int +__decode_codepoint(uint32_t codepoint) +{ /* Overlays the argument with four chars through a union, reads chars[0] as a UTF-8 lead byte (k leading 1 bits) and appends the low 6 bits of up to three following chars; returns the accumulated value. NOTE(review): which byte of codepoint ends up in chars[0] depends on host byte order, and the caller in __decode_json passes the number parsed from four hex digits rather than UTF-8 bytes - confirm the intended direction of the conversion. */ + union __utf8_codepoint { + uint32_t codepoint; + char chars[4]; + }; + + union __utf8_codepoint s = { .codepoint = codepoint }; + + /* Count # of leading 1 bits */ + int k = s.chars[0] ? __builtin_clz(~(s.chars[0] << 24)) : 0; + /* All 1's with k leading 0's */ + int mask = (1 << (8 - k)) - 1; + int value = s.chars[0] & mask; + int i = 1; + + /* NOTE: k = #total bytes, and #total bytes <= 4 */ + for (--k; k > 0 && i < 4; --k, ++i) { + value <<= 6; + value += (s.chars[i] & 0x3F); } - size_t step = strlen(from) - strlen(to); - char *start = string; - for (;;) { - start = strstr(start, from); - if (start == NULL || *start == '\0') + return value; +} + +static inline void +__decode_json(char *string, size_t len) +{ /* Scans string for backslash escapes. The single-character escapes pick their replacement text in target with size = 2; backslash-u requires len >= 4, parses four hex digits with __hex_val (return value ignored) and only prints the result through printf; any other escape logs an error and returns. NOTE(review): the replacement step at the end of the loop is still a placeholder comment, so string is never modified and offset is never moved past the backslash; strchr then finds the same backslash again, which looks like a non-terminating loop on any recognised escape. */ + char *offset = string; + int size = 0; + char *target = NULL; + uint32_t codepoint = 0; + + while ((offset = strchr(offset, '\\'))) { + len = len - (offset - string); + target = NULL; + + switch (offset[1]) { +#define __case_single(c, sub) \ + case c: \ + size = 2; \ + target = sub; \ + break; + __case_single('"', "\"") + __case_single('\\', "\\") + __case_single('/', "/" ) + __case_single('f', "\f") + __case_single('b', "\b") + __case_single('n', "\n") + __case_single('r', "\r") + __case_single('t', "\t") +#undef __case_single + + /* UTF-8 decode, 4 hex digits */ + case 'u': + if (len < 4) { + LOG_ERROR("JSON", "Invalid unicode string, abort decoding"); + return; + } + size = 5; /* u + 4 hex digits */ + __hex_val(&offset[2], &codepoint); + printf("\\u%04x => %lc\n", codepoint, __decode_codepoint(codepoint)); break; - strncpy(start, to, strlen(to)); - start += strlen(to); - memmove(start, start + step, strlen(start + step) * sizeof(char)); - start[strlen(start) - step] = '\0'; - } - return 0; + /* Oups */ + default: + LOG_ERROR("JSON", "Invalid json string, abort the
decoding"); + return; + } + + /* Simple, replace <size> characters by <target> from 'offset' */ + + } } +LKT_PUSH +LKT_IGNORE_WARN_FMT_SECURITY /* Diagnostics are pushed and -Wformat-security is ignored for json_parse only; the matching LKT_POP follows the function. */ int json_parse(const char *str, int asked_level, json_parse_callback call, void *user) { int level = 0; char key[LKT_LINE_MAX]; char val[LKT_LINE_MAX]; - - LOG_DEBUG("JSON", "Begin parsing of json"); + char tmp[LKT_LINE_MAX]; /* NOTE(review): tmp is only zeroed and handed to __NEXT_JSON, whose body does not use it */ for (;;) { size_t len = 0; @@ -128,13 +207,16 @@ json_parse(const char *str, int asked_level, json_parse_callback call, void *use else { memset(key, 0, sizeof(key)); memset(val, 0, sizeof(val)); + memset(tmp, 0, sizeof(tmp)); - __NEXT_JSON(str, len, key); + __NEXT_JSON(str, len, key, tmp, LKT_LINE_MAX); __SKIP_JSON(str); - __NEXT_JSON(str, len, val); + __NEXT_JSON(str, len, val, tmp, LKT_LINE_MAX); - if (asked_level == level) + if (asked_level == level) { + LOG_DEBUG("JSON", "Call with %s => %s", key, val); call(key, val, 0, user); + } } if (level <= 0) @@ -142,6 +224,7 @@ json_parse(const char *str, int asked_level, json_parse_callback call, void *use } return 1; } +LKT_POP int json_parse_get_count(const char *str, int asked_level) diff --git a/src/main/server.c b/src/main/server.c index 852059f4..55441568 100644 --- a/src/main/server.c +++ b/src/main/server.c @@ -9,6 +9,7 @@ #include <lektor/database.h> #include <lektor/commands.h> +#include <locale.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> @@ -89,6 +90,10 @@ main(int argc, char *argv[]) #endif REG_END() + RETURN_UNLESS(setlocale(LC_ALL, "en_US.UTF-8"), "Failed to set LC_ALL to en_US.UTF-8", 1); /* Locale setup, in order: LC_ALL to en_US.UTF-8, LC_CTYPE from the environment (empty name), LC_NUMERIC back to C (for mpv, per the message); presumably RETURN_UNLESS returns 1 when a call fails - confirm against its definition. */ + RETURN_UNLESS(setlocale(LC_CTYPE, ""), "Failed to set LC_CTYPE", 1); + RETURN_UNLESS(setlocale(LC_NUMERIC, "C"), "Failed to set LC_NUMERIC to C for mpv", 1); + char exe[PATH_MAX]; int autoclear, check_exclusive = 1, opt, dump_and_abort = 0; char *conf_file = safe_zero_malloc(PATH_MAX * sizeof(char)); -- GitLab