From 40080adbf1e432fb05b6af0ae8002e4459c6726d Mon Sep 17 00:00:00 2001
From: Kubat <mael.martin31@gmail.com>
Date: Mon, 18 Jan 2021 10:58:47 +0100
Subject: [PATCH] JSON: Almost decode utf-8 in json parser...

---
 inc/lektor/common.h |  20 ++++
 src/base/json.c     | 233 ++++++++++++++++++++++++++++++--------------
 src/main/server.c   |   5 +
 3 files changed, 183 insertions(+), 75 deletions(-)

diff --git a/inc/lektor/common.h b/inc/lektor/common.h
index 21f6076d..2e26a249 100644
--- a/inc/lektor/common.h
+++ b/inc/lektor/common.h
@@ -5,6 +5,26 @@
 #include <stdint.h>
 #include <stdlib.h>
 
+#if defined(__clang__)
+    #define LKT_COMPILER                    "clang"
+    #define LKT_PUSH                        _Pragma("clang diagnostic push")
+    #define LKT_IGNORE_WARN_FMT_SECURITY    _Pragma("clang diagnostic ignored \"-Wformat-security\"")
+    #define LKT_POP                         _Pragma("clang diagnostic pop")
+
+#elif defined(__GNUC__) || defined(__GNUG__)
+    #define LKT_COMPILER                    "GCC"
+    #define LKT_PUSH                        _Pragma("GCC diagnostic push")
+    #define LKT_IGNORE_WARN_FMT_SECURITY    _Pragma("GCC diagnostic ignored \"-Wformat-security\"")
+    #define LKT_POP                         _Pragma("GCC diagnostic pop")
+
+#elif defined(_MSC_VER)
+    #error "MS compiler not supported"
+
+#else
+    #error "Unknown compiler"
+
+#endif
+
 /* Max value for any buffer, to not squash the stack. */
 #define BUFFER_MAX 4096
 
diff --git a/src/base/json.c b/src/base/json.c
index 976854f4..70f6b2bd 100644
--- a/src/base/json.c
+++ b/src/base/json.c
@@ -14,97 +14,176 @@
 
 #define __SKIP_JSON(str) (str = &str[strspn(str, __JSON_SPACE)])
 
-#define __NEXT_JSON(str, len, dest)                                 \
-{                                                                   \
-    int is_paren = 0;                                               \
-    if (str[0] == __JSON_SEP) {                                     \
-        ++str;                                                      \
-        const char *begin = str, *end;                              \
-        for (;;) {                                                  \
-            end = strchr(begin, __JSON_SEP);                        \
-            if (*(end - 1) == '\\') {                               \
-                begin = end + 1;                                    \
-                continue;                                           \
-            }                                                       \
-            break;                                                  \
-        }                                                           \
-        len = (end - str);                                          \
-        is_paren = 1;                                               \
-    } else                                                          \
-        len = strcspn(str, __JSON_SPACE __JSON_END);                \
-    if (level == asked_level && len < LKT_LINE_MAX - 1)             \
-        strncpy(dest, str, len);                                    \
-    str += len;                                                     \
-    /* Also decode '\"' => '"' */                                   \
-    if (is_paren) {                                                 \
-        ++str;                                                      \
-        __replace(dest, "\\\"", "\"");                              \
-    }                                                               \
+#define __NEXT_JSON(str, len, dest, tmp, tmp_len)                               \
+{                                                                               \
+    int is_paren = 0;                                                           \
+    if (str[0] == __JSON_SEP) {                                                 \
+        ++str;                                                                  \
+        const char *begin = str, *end;                                          \
+        /* Find the closing separator, skipping the escaped ones */             \
+        while ((end = strchr(begin, __JSON_SEP)) && *(end - 1) == '\\')         \
+            begin = end + 1;                                                    \
+        /* Unterminated string: stop on the NUL byte, is_paren stays 0 */       \
+        /* because there is no closing separator to step over. */               \
+        if (end == NULL)                                                        \
+            end = str + strlen(str);                                            \
+        else                                                                    \
+            is_paren = 1;                                                       \
+        len = (end - str);                                                      \
+    } else                                                                      \
+        len = strcspn(str, __JSON_SPACE __JSON_END);                            \
+    if (level == asked_level) {                                                 \
+        size_t local_len = LKT_LINE_MAX - 1 < len ? LKT_LINE_MAX - 1 : len;     \
+        strncpy(dest, str, local_len);                                          \
+        __decode_json(dest, local_len);                                         \
+        LOG_DEBUG("JSON", "Set " #dest " to %s", dest);                         \
+    }                                                                           \
+    str += len;                                                                 \
+    if (is_paren) {                                                             \
+        ++str;                                                                  \
+    }                                                                           \
 }
 
-#define __SKIP_NEXT_JSON(str, len)                                  \
-{                                                                   \
-    int is_paren = 0;                                               \
-    if (str[0] == __JSON_SEP) {                                     \
-        ++str;                                                      \
-        const char *begin = str, *end;                              \
-        for (;;) {                                                  \
-            end = strchr(begin, __JSON_SEP);                        \
-            if (*(end - 1) == '\\') {                               \
-                begin = end + 1;                                    \
-                continue;                                           \
-            }                                                       \
-            break;                                                  \
-        }                                                           \
-        len = (end - str);                                          \
-        is_paren = 1;                                               \
-    } else                                                          \
-        len = strcspn(str, __JSON_SPACE __JSON_BEGIN __JSON_END);   \
-    str += len;                                                     \
-    /* Also decode '\"' => '"' */                                   \
-    if (is_paren) {                                                 \
-        ++str;                                                      \
-    }                                                               \
+#define __SKIP_NEXT_JSON(str, len)                                              \
+{                                                                               \
+    int is_paren = 0;                                                           \
+    if (str[0] == __JSON_SEP) {                                                 \
+        ++str;                                                                  \
+        const char *begin = str, *end;                                          \
+        /* Find the closing separator, skipping the escaped ones */             \
+        while ((end = strchr(begin, __JSON_SEP)) && *(end - 1) == '\\')         \
+            begin = end + 1;                                                    \
+        /* Unterminated string: stop on the NUL byte, is_paren stays 0 */       \
+        /* because there is no closing separator to step over. */               \
+        if (end == NULL)                                                        \
+            end = str + strlen(str);                                            \
+        else                                                                    \
+            is_paren = 1;                                                       \
+        len = (end - str);                                                      \
+    } else                                                                      \
+        len = strcspn(str, __JSON_SPACE __JSON_BEGIN __JSON_END);               \
+    str += len;                                                                 \
+    if (is_paren) {                                                             \
+        ++str;                                                                  \
+    }                                                                           \
 }
 
-/* WARN: strlen(from) >= strlen(to) */
 static inline int
-__replace(char *string, const char *from, const char *to)
+__hex_digit(const char c)
 {
-    if (strlen(from) < strlen(to)) {
-        LOG_ERROR("JSON", "The size of the 'to' string must be inferior "
-                  "or equal to the size of the 'from' string");
-        return 1;
+    if ('0' <= c && c <= '9')
+        return c - '0';
+    if ('a' <= c && c <= 'f')
+        return c - 'a' + 10;
+    if ('A' <= c && c <= 'F')
+        return c - 'A' + 10;
+    return -1;
+}
+
+static inline int
+__hex_val(const char *c, uint32_t *result)
+{
+    const char *p;
+    const size_t size = 4;
+    int digit;
+    *result = 0;
+
+    for (p = c; (size_t) (p - c) < size; ++p) {
+        *result <<= 4;
+        digit = __hex_digit(*p);
+        if (digit < 0 || digit > 15)
+            return 1;
+        *result |= (uint8_t) digit;
     }
-    if (strstr(to, from)) {
-        LOG_ERROR("JSON", "At least one string '%s' has been found in '%s', exit "
-                  "before looping until the end of the universe", from, to);
-        return 1;
+
+    return 0;
+}
+
+static inline int
+__decode_codepoint(uint32_t codepoint)
+{
+    /* Bytes are unsigned: with a signed char, a value >= 0x80 is negative and
+     * left shifting it (or overflowing an int) is undefined behaviour. */
+    union __utf8_codepoint {
+        uint32_t codepoint;
+        unsigned char chars[4];
+    };
+    union __utf8_codepoint s = { .codepoint = codepoint };
+
+    /* Count # of leading 1 bits  */
+    int k = s.chars[0] ? __builtin_clz(~((uint32_t) s.chars[0] << 24)) : 0;
+    /* All 1's with k leading 0's */
+    int mask = (1 << (8 - k)) - 1;
+    int value = s.chars[0] & mask;
+    int i = 1;
+
+    /* NOTE: k = #total bytes, and #total bytes <= 4 */
+    for (--k; k > 0 && i < 4; --k, ++i) {
+        value <<= 6;
+        value += (s.chars[i] & 0x3F);
     }
 
-    size_t step = strlen(from) - strlen(to);
-    char *start = string;
-    for (;;) {
-        start = strstr(start, from);
-        if (start == NULL || *start == '\0')
+    return value;
+}
+
+/* Scan the backslash escapes of the NUL-terminated 'string'. The bytes left are
+ * counted from the terminator at each escape, so the value received in 'len'
+ * is not relied upon. Return at the first invalid escape. */
+static inline void
+__decode_json(char *string, size_t len)
+{
+    char *offset       = string;
+    int size           = 0;
+    char *target       = NULL;
+    uint32_t codepoint = 0;
+
+    while ((offset = strchr(offset, '\\'))) {
+        len    = strlen(offset); /* Bytes left, the backslash included */
+        target = NULL;
+        switch (offset[1]) {
+#define __case_single(c, sub)   \
+            case c:             \
+                size   = 2;     \
+                target = sub;   \
+                break;
+            __case_single('"',  "\"")
+            __case_single('\\', "\\")
+            __case_single('/',  "/" )
+            __case_single('f',  "\f")
+            __case_single('b',  "\b")
+            __case_single('n',  "\n")
+            __case_single('r',  "\r")
+            __case_single('t',  "\t")
+#undef __case_single
+        /* UTF-8 decode, 4 hex digits */
+        case 'u':
+            if (len < 6 || __hex_val(&offset[2], &codepoint)) {
+                LOG_ERROR("JSON", "Invalid unicode string, abort decoding");
+                return;
+            }
+            size = 5; /* u + 4 hex digits */
+            printf("\\u%04x => %lc\n", codepoint, __decode_codepoint(codepoint));
             break;
-        strncpy(start, to, strlen(to));
-        start += strlen(to);
-        memmove(start, start + step, strlen(start + step) * sizeof(char));
-        start[strlen(start) - step] = '\0';
-    }
 
-    return 0;
+        default: /* Oups */
+            LOG_ERROR("JSON", "Invalid json string, abort the decoding");
+            return;
+        }
+        if (target) /* Close the gap left by the two-character escape */
+            memmove(offset + 1, offset + size, strlen(offset + size) + 1);
+        *offset = target ? *target : '\\'; /* Unicode escapes are kept as is */
+        offset += target ? 1 : 2; /* Always step forward, or strchr() never ends */
+    }
 }
 
+LKT_PUSH
+LKT_IGNORE_WARN_FMT_SECURITY
 int
 json_parse(const char *str, int asked_level, json_parse_callback call, void *user)
 {
     int level = 0;
     char key[LKT_LINE_MAX];
     char val[LKT_LINE_MAX];
-
-    LOG_DEBUG("JSON", "Begin parsing of json");
+    char tmp[LKT_LINE_MAX];
 
     for (;;) {
         size_t len = 0;
@@ -128,13 +207,16 @@ json_parse(const char *str, int asked_level, json_parse_callback call, void *use
         else {
             memset(key, 0, sizeof(key));
             memset(val, 0, sizeof(val));
+            memset(tmp, 0, sizeof(tmp));
 
-            __NEXT_JSON(str, len, key);
+            __NEXT_JSON(str, len, key, tmp, LKT_LINE_MAX);
             __SKIP_JSON(str);
-            __NEXT_JSON(str, len, val);
+            __NEXT_JSON(str, len, val, tmp, LKT_LINE_MAX);
 
-            if (asked_level == level)
+            if (asked_level == level) {
+                LOG_DEBUG("JSON", "Call with %s => %s", key, val);
                 call(key, val, 0, user);
+            }
         }
 
         if (level <= 0)
@@ -142,6 +224,7 @@ json_parse(const char *str, int asked_level, json_parse_callback call, void *use
     }
     return 1;
 }
+LKT_POP
 
 int
 json_parse_get_count(const char *str, int asked_level)
diff --git a/src/main/server.c b/src/main/server.c
index 852059f4..55441568 100644
--- a/src/main/server.c
+++ b/src/main/server.c
@@ -9,6 +9,7 @@
 #include <lektor/database.h>
 #include <lektor/commands.h>
 
+#include <locale.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -89,6 +90,10 @@ main(int argc, char *argv[])
 #endif
     REG_END()
 
+    RETURN_UNLESS(setlocale(LC_ALL, "en_US.UTF-8"), "Failed to set LC_ALL to en_US.UTF-8", 1);
+    RETURN_UNLESS(setlocale(LC_CTYPE, ""), "Failed to set LC_CTYPE", 1);
+    RETURN_UNLESS(setlocale(LC_NUMERIC, "C"), "Failed to set LC_NUMERIC to C for mpv", 1);
+
     char exe[PATH_MAX];
     int autoclear, check_exclusive = 1, opt, dump_and_abort = 0;
     char *conf_file = safe_zero_malloc(PATH_MAX * sizeof(char));
-- 
GitLab