#include "highlight.h"

// based on https://github.com/Theldus/kat

#include "arena.h"
#include "str.h"
#include "os.h"

/* States of the character-driven highlighter state machine. */
typedef enum {
    HL_STATE_DEFAULT,
    HL_STATE_KEYWORD,
    HL_STATE_NUMBER,
    HL_STATE_CHAR,
    HL_STATE_STRING,
    HL_STATE_COMMENT_MULTI,
    HL_STATE_PREPROCESSOR,
    HL_STATE_PREPROCESSOR_INCLUDE,
    HL_STATE_PREPROCESSOR_INCLUDE_STRING,
} hl_state_e;

/* Outcome of an insertion in the keyword hash table. */
typedef enum {
    HL_HTABLE_FAILED,
    HL_HTABLE_REPLACED,
    HL_HTABLE_ADDED,
} hl_htable_result_e;

/* Single entry of a bucket chain: keyword -> color. */
typedef struct hl_node_t {
    strview_t key;
    hl_color_e value;
    struct hl_node_t *next;
} hl_node_t;

/* Chained hash table, `count` is always a power of two (see hl_htable_init). */
typedef struct {
    hl_node_t **buckets;
    uint count;
    uint used;
    uint collisions;
} hl_hashtable_t;

static hl_hashtable_t hl_htable_init(arena_t *arena, uint pow2_exp);
static hl_htable_result_e hl_htable_add(arena_t *arena, hl_hashtable_t *table, strview_t key, hl_color_e value);
static hl_node_t *hl_htable_get(hl_hashtable_t *table, strview_t key);
static u64 hl_htable_hash(const void *bytes, usize count);

typedef struct hl_ctx_t {
    hl_state_e state;
    hl_flags_e flags;
    // stream offset right after the first character of the token being scanned
    usize kw_beg;
    strview_t colors[HL_COLOR__COUNT]; // todo: maybe should be str_t?
    outstream_t ostr;
    hl_hashtable_t kw_htable;
    bool symbol_table[256];
} hl_ctx_t;

#define KW(str, col) { { str, sizeof(str)-1 }, HL_COLOR_##col }

static hl_keyword_t hl_c_cpp_kwrds[] = {
    /* C Types. */
    KW("double",   TYPES),
    KW("int",      TYPES),
    KW("long",     TYPES),
    KW("char",     TYPES),
    KW("float",    TYPES),
    KW("short",    TYPES),
    KW("unsigned", TYPES),
    KW("signed",   TYPES),
    KW("bool",     TYPES),

    /* Common typedefs. */
    KW("int8_t",  TYPES), KW("uint8_t",  TYPES),
    KW("int16_t", TYPES), KW("uint16_t", TYPES),
    KW("int32_t", TYPES), KW("uint32_t", TYPES),
    KW("int64_t", TYPES), KW("uint64_t", TYPES),

    KW("int8",  TYPES), KW("uint8",  TYPES),
    KW("int16", TYPES), KW("uint16", TYPES),
    KW("int32", TYPES), KW("uint32", TYPES),
    KW("int64", TYPES), KW("uint64", TYPES),

    KW("i8",  TYPES), KW("u8",  TYPES),
    KW("i16", TYPES), KW("u16", TYPES),
    KW("i32", TYPES), KW("u32", TYPES),
    KW("i64", TYPES), KW("u64", TYPES),

    /* Colla keywords */
    KW("uchar",  TYPES),
    KW("ushort", TYPES),
    KW("uint",   TYPES),
    KW("usize",  TYPES),
    KW("isize",  TYPES),
    KW("byte",   TYPES),

    /* Other keywords. */
    KW("auto",     KEYWORDS), KW("struct",   KEYWORDS), KW("break",   KEYWORDS),
    KW("else",     KEYWORDS), KW("switch",   KEYWORDS), KW("case",    KEYWORDS),
    KW("enum",     KEYWORDS), KW("register", KEYWORDS), KW("typedef", KEYWORDS),
    KW("extern",   KEYWORDS), KW("return",   KEYWORDS), KW("union",   KEYWORDS),
    KW("const",    KEYWORDS), KW("continue", KEYWORDS), KW("for",     KEYWORDS),
    KW("void",     KEYWORDS), KW("default",  KEYWORDS), KW("goto",    KEYWORDS),
    KW("sizeof",   KEYWORDS), KW("volatile", KEYWORDS), KW("do",      KEYWORDS),
    KW("if",       KEYWORDS), KW("static",   KEYWORDS), KW("inline",  KEYWORDS),
    KW("while",    KEYWORDS),
};

#undef KW

/* Characters highlighted with HL_COLOR_SYMBOL unless the user overrides them. */
static bool hl_default_symbols_table[256] = {
    ['['] = true, [']'] = true, ['('] = true,
    [')'] = true, ['{'] = true, ['}'] = true,
    ['*'] = true, [':'] = true, ['='] = true,
    [';'] = true, ['-'] = true, ['>'] = true,
    ['&'] = true, ['+'] = true, ['~'] = true,
    ['!'] = true, ['/'] = true, ['%'] = true,
    ['<'] = true, ['^'] = true, ['|'] = true,
    ['?'] = true, ['#'] = true,
};

static void hl_write_char(hl_ctx_t *ctx, char c);
static void hl_write(hl_ctx_t *ctx, strview_t v);
static bool hl_is_char_keyword(char c);
static bool hl_highlight_symbol(hl_ctx_t *ctx, char c);
static hl_color_e hl_get_keyword_color(hl_ctx_t *ctx, strview_t keyword);
static bool hl_is_capitalised(strview_t string);
static strview_t hl_finish_keyword(hl_ctx_t *ctx, usize beg, instream_t *in);
static void hl_print_keyword(hl_ctx_t *ctx, strview_t keyword, hl_color_e color);

/*
 * Creates a highlighter context inside `arena`.
 *
 * Copies flags and colors from `config`, installs the default symbol
 * table and registers the built-in C/C++ keywords followed by the
 * `config->extra_kwrds` (later insertions replace earlier ones with
 * the same key).
 *
 * Returns NULL (after logging an error) when `config` is NULL.
 */
hl_ctx_t *hl_init(arena_t *arena, hl_config_t *config) {
    if (!config) {
        err("config cannot be null");
        return NULL;
    }

    hl_ctx_t *out = alloc(arena, hl_ctx_t);

    out->flags = config->flags;

    memcpy(out->symbol_table, hl_default_symbols_table, sizeof(hl_default_symbols_table));
    memcpy(out->colors, config->colors, sizeof(config->colors));

    int kw_count = arrlen(hl_c_cpp_kwrds);

    out->kw_htable = hl_htable_init(arena, 8);

    for (int i = 0; i < kw_count; ++i) {
        hl_keyword_t *kw = &hl_c_cpp_kwrds[i];
        hl_htable_add(arena, &out->kw_htable, kw->keyword, kw->color);
    }

    for (int i = 0; i < config->kwrds_count; ++i) {
        hl_keyword_t *kw = &config->extra_kwrds[i];
        hl_htable_add(arena, &out->kw_htable, kw->keyword, kw->color);
    }

    return out;
}

/*
 * Reads one character from `in` and advances the state machine,
 * writing highlighted output to `ctx->ostr` whenever a token ends.
 */
void hl_next_char(hl_ctx_t *ctx, instream_t *in) {
    char cur = istr_get(in);
    bool is_last = istr_is_finished(in);

    switch (ctx->state) {
        case HL_STATE_DEFAULT: {
            /*
             * If potential keyword.
             *
             * A valid C keyword may contain numbers, but it
             * can *not* start with one.
             */
            if (hl_is_char_keyword(cur) && !char_is_num(cur)) {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_KEYWORD;
            }
            // potential number
            else if (char_is_num(cur)) {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_NUMBER;
            }
            // potential char
            else if (cur == '\'') {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_CHAR;
            }
            // potential string
            else if (cur == '"') {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_STRING;
            }
            // line or multiline comment
            else if (cur == '/') {
                // single line comment
                if (istr_peek(in) == '/') {
                    // rewind before comment begins
                    istr_rewind_n(in, 1);

                    // comment until the end of line
                    hl_print_keyword(ctx, istr_get_line(in), HL_COLOR_COMMENT);
                }
                // multiline comment
                else if (istr_peek(in) == '*') {
                    ctx->state = HL_STATE_COMMENT_MULTI;
                    ctx->kw_beg = istr_tell(in);
                    istr_skip(in, 1); // skip *
                }
                else {
                    // maybe a symbol?
                    hl_highlight_symbol(ctx, cur);
                }
            }
            // preprocessor
            else if (cur == '#') {
                // print the # as a symbol
                hl_highlight_symbol(ctx, cur);
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_PREPROCESSOR;
            }
            // other supported symbols
            else if (hl_highlight_symbol(ctx, cur)) {
                // noop
            }
            else {
                hl_write_char(ctx, cur);
            }

            break;
        }

        case HL_STATE_KEYWORD: {
            // end of keyword, check if it really is a valid keyword
            if (!hl_is_char_keyword(cur)) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                hl_color_e kw_color = hl_get_keyword_color(ctx, keyword);

                if (kw_color != HL_COLOR__COUNT) {
                    hl_print_keyword(ctx, keyword, kw_color);

                    // maybe we should highlight this remaining char.
                    if (!hl_highlight_symbol(ctx, cur)) {
                        hl_write_char(ctx, cur);
                    }
                }
                /*
                 * If not keyword, maybe its a function call.
                 *
                 * Important to note that this is hacky and will only work
                 * if there is no space between keyword and '('.
                 */
                else if (cur == '(') {
                    hl_print_keyword(ctx, keyword, HL_COLOR_FUNC);

                    // Opening parenthesis will always be highlighted
                    hl_highlight_symbol(ctx, cur);
                }
                else {
                    if (hl_is_capitalised(keyword)) {
                        hl_print_keyword(ctx, keyword, HL_COLOR_MACRO);
                    }
                    else {
                        hl_write(ctx, keyword);
                    }
                    if (!hl_highlight_symbol(ctx, cur)) {
                        hl_write_char(ctx, cur);
                    }
                }
            }
            break;
        }

        case HL_STATE_NUMBER: {
            char c = char_lower(cur);

            /*
             * Should we end the state?.
             *
             * Very important observation:
             * Although the number highlight works fine for most (if not all)
             * of the possible cases, it also assumes that the code is written
             * correctly and the source is able to compile, meaning that:
             *
             * Numbers like: 123, 0xABC123, 12.3e4f, 123ULL....
             * will be correctly identified and highlighted
             *
             * But, 'numbers' like: 123ABC, 0xxxxABCxx123, 123UUUUU....
             * will also be highlighted.
             *
             * It also assumes that no keyword will start with a number
             * and everything starting with a number (except inside strings or
             * comments) will be a number.
             */
            // the a-f range also covers the 'b' of binary literals and
            // the 'e' of exponents
            if (!char_is_num(c) &&
                (c < 'a' || c > 'f') &&
                c != 'x' && c != 'u' &&
                c != 'l' && c != '.'
            ) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);

                // if not a valid char keyword: valid number
                if (!hl_is_char_keyword(cur)) {
                    hl_print_keyword(ctx, keyword, HL_COLOR_NUMBER);
                }
                else {
                    hl_write(ctx, keyword);
                }

                // maybe we should highlight this remaining char.
                if (!hl_highlight_symbol(ctx, cur)) {
                    hl_write_char(ctx, cur);
                }
            }
            break;
        }

        case HL_STATE_CHAR: {
            if (is_last || (cur == '\'' && istr_peek(in) != '\'')) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                // the view ends right before `cur`, include the closing quote
                keyword.len++;

                hl_print_keyword(ctx, keyword, HL_COLOR_STRING);
            }
            break;
        }

        case HL_STATE_STRING: {
            // NOTE(review): a quote preceded by a single backslash is taken
            // as escaped, so a string ending in an escaped backslash ("\\")
            // presumably does not terminate here -- confirm against the
            // stream API before changing
            if (is_last || (cur == '"' && istr_prev_prev(in) != '\\')) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                // the view ends right before `cur`, include the closing quote
                keyword.len++;

                hl_print_keyword(ctx, keyword, HL_COLOR_STRING);
            }
            break;
        }

        case HL_STATE_COMMENT_MULTI: {
            /*
             * If we are at the end of the input _or_ have identified
             * an end of comment...
             */
            bool closes = !is_last && cur == '*' && istr_peek(in) == '/';

            if (is_last || closes) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);

                // the view ends right before `cur`: without this the '*' of
                // the terminator (or the last char of an unterminated
                // comment) would be lost
                keyword.len += 1;

                if (closes) {
                    // the '/' belongs to the comment too, consume it so it
                    // is not printed a second time as a symbol
                    keyword.len += 1;
                    istr_skip(in, 1);
                }

                hl_print_keyword(ctx, keyword, HL_COLOR_COMMENT);
            }
            break;
        }

        case HL_STATE_PREPROCESSOR: {
            if (!hl_is_char_keyword(cur)) {
                hl_write_char(ctx, cur);
                // a directive ends with its line: never stay in this
                // state across a newline
                if (cur == '\n') {
                    ctx->state = HL_STATE_DEFAULT;
                }
                break;
            }

            /*
             * Tries to match the directive `str` starting at `cur`: the
             * remaining characters are read from a copy of the stream which
             * is committed only on success.
             */
            #define hl_check(str, new_state) \
                if (cur == str[0]) { \
                    instream_t temp = *in; \
                    strview_t a = { &(str[1]), sizeof(str) - 2 }; \
                    strview_t b = istr_get_view_len(&temp, a.len); \
                    if (strv_equals(a, b)) { \
                        *in = temp; \
                        hl_print_keyword(ctx, (strview_t){ str, sizeof(str) - 1 }, HL_COLOR_PREPROC); \
                        ctx->state = new_state; \
                        break; \
                    } \
                }

            if (is_last) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                hl_print_keyword(ctx, keyword, HL_COLOR_PREPROC);
                break;
            }

            // longer names must be checked before their prefixes
            hl_check("include",  HL_STATE_PREPROCESSOR_INCLUDE)
            hl_check("define",   HL_STATE_DEFAULT)
            hl_check("undef",    HL_STATE_DEFAULT)
            hl_check("ifdef",    HL_STATE_DEFAULT)
            hl_check("ifndef",   HL_STATE_DEFAULT)
            hl_check("if",       HL_STATE_DEFAULT)
            hl_check("elifdef",  HL_STATE_DEFAULT)
            hl_check("elifndef", HL_STATE_DEFAULT)
            hl_check("elif",     HL_STATE_DEFAULT)
            hl_check("else",     HL_STATE_DEFAULT)
            hl_check("endif",    HL_STATE_DEFAULT)
            hl_check("error",    HL_STATE_DEFAULT)
            hl_check("warning",  HL_STATE_DEFAULT)
            hl_check("line",     HL_STATE_DEFAULT)
            hl_check("pragma",   HL_STATE_DEFAULT)

            #undef hl_check

            // unknown directive: emit the character untouched instead of
            // silently dropping it from the output
            hl_write_char(ctx, cur);
            break;
        }

        /*
         * Preprocessor/Preprocessor include
         *
         * This is a 'dumb' preprocessor highlighter:
         * it highlights everything with the same color
         * and if and only if an '#include' is detected
         * the included header will be handled as string
         * and thus, will have the same color as the string.
         *
         * In fact, it is somehow similar to what GtkSourceView
         * does (Mousepad, Gedit...) but with one silly difference:
         * single-line/multi-line comments will not be handled
         * while inside the preprocessor state, meaning that
         * comments will also have the same color as the remaining
         * of the line, yeah, ugly.
         */
        case HL_STATE_PREPROCESSOR_INCLUDE: {
            if (cur == '<' || cur == '"' || is_last) {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_PREPROCESSOR_INCLUDE_STRING;
            }
            else {
                hl_write_char(ctx, cur);
            }
            break;
        }

        case HL_STATE_PREPROCESSOR_INCLUDE_STRING: {
            if (cur == '>' || cur == '"' || is_last) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                // include the closing delimiter
                keyword.len += 1;
                hl_print_keyword(ctx, keyword, HL_COLOR_STRING);
            }
            break;
        }
    }
}

/*
 * Highlights `data` and returns the result as a string built from an
 * output stream initialised on `arena`. The context state is reset
 * before starting.
 */
str_t hl_highlight(arena_t *arena, hl_ctx_t *ctx, strview_t data) {
    ctx->ostr = ostr_init(arena);

    ctx->state = HL_STATE_DEFAULT;
    ctx->kw_beg = 0;

    instream_t in = istr_init(data);

    while (!istr_is_finished(&in)) {
        hl_next_char(ctx, &in);
    }

    // one extra step so that a token still pending at the end of the
    // input (keyword, number) gets flushed
    hl_next_char(ctx, &in);

    return ostr_to_str(&ctx->ostr);
}

/* Enables or disables symbol highlighting for `symbol`; no-op when `ctx` is NULL. */
void hl_set_symbol_in_table(hl_ctx_t *ctx, char symbol, bool value) {
    if (!ctx) return;
    ctx->symbol_table[(unsigned char)symbol] = value;
}

/*
 * Registers (or replaces) a keyword in the context table.
 * No-op when `ctx` or `keyword` is NULL.
 */
void hl_add_keyword(arena_t *arena, hl_ctx_t *ctx, hl_keyword_t *keyword) {
    if (!ctx || !keyword) return;
    hl_htable_add(arena, &ctx->kw_htable, keyword->keyword, keyword->color);
}

//// HASH TABLE ///////////////////////////////////////////////////

/* Creates a table with 2^pow2_exp buckets allocated from `arena`. */
static hl_hashtable_t hl_htable_init(arena_t *arena, uint pow2_exp) {
    // unsigned literal: shifting a signed 1 into the sign bit is undefined
    uint count = 1u << pow2_exp;
    return (hl_hashtable_t) {
        .count = count,
        .buckets = alloc(arena, hl_node_t*, count),
    };
}

/*
 * Inserts `key` -> `value`, overwriting the value when the key is
 * already present.
 *
 * Returns HL_HTABLE_FAILED for a NULL or empty table,
 * HL_HTABLE_REPLACED or HL_HTABLE_ADDED otherwise.
 */
static hl_htable_result_e hl_htable_add(arena_t *arena, hl_hashtable_t *table, strview_t key, hl_color_e value) {
    // with zero buckets the index mask below would wrap around
    if (!table || table->count == 0) {
        return HL_HTABLE_FAILED;
    }

    if ((float)table->used >= table->count * 0.6f) {
        warn("more than 60%% of the hash table is being used: %u/%u", table->used, table->count);
    }

    u64 hash = hl_htable_hash(key.buf, key.len);
    // count is a power of two, so the mask is equivalent to a modulo
    usize index = hash & (table->count - 1);
    hl_node_t *bucket = table->buckets[index];
    if (bucket) table->collisions++;
    while (bucket) {
        // already exists
        if (strv_equals(bucket->key, key)) {
            bucket->value = value;
            return HL_HTABLE_REPLACED;
        }
        bucket = bucket->next;
    }

    // new entries are pushed at the head of the chain
    bucket = alloc(arena, hl_node_t);

    bucket->key = key;
    bucket->value = value;
    bucket->next = table->buckets[index];

    table->buckets[index] = bucket;
    table->used++;

    return HL_HTABLE_ADDED;
}

/* Returns the node stored for `key`, or NULL when absent or the table is NULL/empty. */
static hl_node_t *hl_htable_get(hl_hashtable_t *table, strview_t key) {
    if (!table || table->count == 0) {
        return NULL;
    }

    u64 hash = hl_htable_hash(key.buf, key.len);
    usize index = hash & (table->count - 1);
    hl_node_t *bucket = table->buckets[index];
    while (bucket) {
        if (strv_equals(bucket->key, key)) {
            return bucket;
        }
        bucket = bucket->next;
    }

    return NULL;
}

// uses the sdbm algorithm
static u64 hl_htable_hash(const void *bytes, usize count) {
    const u8 *data = bytes;
    u64 hash = 0;

    for (usize i = 0; i < count; ++i) {
        hash = data[i] + (hash << 6) + (hash << 16) - hash;
    }

    return hash;
}

//// STATIC FUNCTIONS /////////////////////////////////////////////

/* Writes `c` to `out`, replacing the HTML-significant characters with entities. */
static inline void hl_escape_html(outstream_t *out, char c) {
    switch (c) {
        case '&':
            ostr_puts(out, strv("&amp;"));
            break;
        case '<':
            ostr_puts(out, strv("&lt;"));
            break;
        case '>':
            ostr_puts(out, strv("&gt;"));
            break;
        default:
            ostr_putc(out, c);
            break;
    }
}

/* Writes one source character, HTML-escaped when HL_FLAG_HTML is set. */
static void hl_write_char(hl_ctx_t *ctx, char c) {
    if (ctx->flags & HL_FLAG_HTML) {
        hl_escape_html(&ctx->ostr, c);
    }
    else {
        ostr_putc(&ctx->ostr, c);
    }
}

/* Writes a run of source characters, HTML-escaped when HL_FLAG_HTML is set. */
static void hl_write(hl_ctx_t *ctx, strview_t v) {
    if (ctx->flags & HL_FLAG_HTML) {
        for (usize i = 0; i < v.len; ++i) {
            hl_escape_html(&ctx->ostr, v.buf[i]);
        }
    }
    else {
        ostr_puts(&ctx->ostr, v);
    }
}

/* True for characters that may appear inside an identifier. */
static bool hl_is_char_keyword(char c) {
    return char_is_alpha(c) || char_is_num(c) || c == '_';
}

/*
 * Prints `c` with the symbol color when it is enabled in the symbol
 * table. Returns false (printing nothing) otherwise.
 */
static bool hl_highlight_symbol(hl_ctx_t *ctx, char c) {
    if (!ctx->symbol_table[(unsigned char)c]) {
        return false;
    }

    ostr_puts(&ctx->ostr, ctx->colors[HL_COLOR_SYMBOL]);
    hl_write_char(ctx, c);
    ostr_puts(&ctx->ostr, ctx->colors[HL_COLOR_NORMAL]);

    return true;
}

/*
 * Returns the color for `keyword`, or HL_COLOR__COUNT when it is not
 * a known keyword. Anything ending in "_t" is treated as a custom type.
 */
static hl_color_e hl_get_keyword_color(hl_ctx_t *ctx, strview_t keyword) {
    // todo: make this an option?
    if (strv_ends_with_view(keyword, strv("_t"))) {
        return HL_COLOR_CUSTOM_TYPES;
    }

    hl_node_t *node = hl_htable_get(&ctx->kw_htable, keyword);
    return node ? node->value : HL_COLOR__COUNT;
}

/* True when `string` contains only uppercase letters, digits and underscores. */
static bool hl_is_capitalised(strview_t string) {
    for (usize i = 0; i < string.len; ++i) {
        char c = string.buf[i];
        if (!char_is_num(c) && c != '_' && (c < 'A' || c > 'Z')) {
            return false;
        }
    }
    return true;
}

/*
 * Ends the current token: resets the state to default and returns the
 * view going from the token's first character (`beg - 1`) up to, but
 * not including, the character that was just read.
 */
static strview_t hl_finish_keyword(hl_ctx_t *ctx, usize beg, instream_t *in) {
    ctx->state = HL_STATE_DEFAULT;
    beg -= 1;
    usize end = istr_tell(in) - 1;

    return strv(in->beg + beg, end - beg);
}

/* Prints `keyword` wrapped between `color` and the normal color. */
static void hl_print_keyword(hl_ctx_t *ctx, strview_t keyword, hl_color_e color) {
    ostr_puts(&ctx->ostr, ctx->colors[color]);
    hl_write(ctx, keyword);
    ostr_puts(&ctx->ostr, ctx->colors[HL_COLOR_NORMAL]);
}