update stuff

This commit is contained in:
alessandro bason 2025-06-15 11:32:55 +02:00
parent 6d36aa4442
commit 95d74c2ef4
13 changed files with 1196 additions and 48 deletions

629
highlight.c Normal file
View file

@ -0,0 +1,629 @@
#include "highlight.h"
// based on https://github.com/Theldus/kat
#include "arena.h"
#include "str.h"
#include "os.h"
// States of the highlighter's state machine. The current state is stored in
// hl_ctx_t.state and selects the branch taken by hl_next_char().
typedef enum {
HL_STATE_DEFAULT, // not inside any token
HL_STATE_KEYWORD, // inside an identifier that may turn out to be a keyword
HL_STATE_NUMBER, // inside a numeric literal
HL_STATE_CHAR, // inside a '...' character literal
HL_STATE_STRING, // inside a "..." string literal
HL_STATE_COMMENT_MULTI, // inside a /* ... */ comment
HL_STATE_PREPROCESSOR, // right after a '#', looking for the directive name
HL_STATE_PREPROCESSOR_INCLUDE, // after "#include", looking for the opening '<' or '"'
HL_STATE_PREPROCESSOR_INCLUDE_STRING, // inside the <...> / "..." of an #include
} hl_state_e;
// Result of hl_htable_add().
typedef enum {
HL_HTABLE_FAILED, // nothing was stored (e.g. the table pointer was NULL)
HL_HTABLE_REPLACED, // the key already existed, its value was overwritten
HL_HTABLE_ADDED, // a new node was created for the key
} hl_htable_result_e;
// One entry of the keyword hash table. Entries that land in the same bucket
// are chained through <next>.
typedef struct hl_node_t {
strview_t key; // keyword text; hl_htable_add() stores the view as given
hl_color_e value; // color used when printing this keyword
struct hl_node_t *next; // next entry in the same bucket; lookups walk the chain until NULL
} hl_node_t;
// Chained hash table mapping keyword text to a color.
typedef struct {
hl_node_t **buckets; // array of <count> bucket heads
uint count; // number of buckets; hl_htable_init() sets it to 1 << pow2_exp
uint used; // number of nodes added so far
uint collisions; // incremented by hl_htable_add() when the target bucket is not empty
} hl_hashtable_t;
// Hash table helpers, defined in the HASH TABLE section further down.

// Creates a table whose bucket array is allocated from <arena>.
static hl_hashtable_t hl_htable_init(arena_t *arena, uint pow2_exp);
// Stores <value> for <key>, overwriting the value if the key already exists.
static hl_htable_result_e hl_htable_add(arena_t *arena, hl_hashtable_t *table, strview_t key, hl_color_e value);
// Returns the node stored for <key>, or NULL when there is none.
static hl_node_t *hl_htable_get(hl_hashtable_t *table, strview_t key);
// Hashes <count> bytes starting at <bytes> (sdbm algorithm).
static u64 hl_htable_hash(const void *bytes, usize count);
// Highlighter context: configuration copied by hl_init() plus the lexer state
// that hl_highlight() resets at the start of every run.
typedef struct hl_ctx_t {
hl_state_e state; // current state of the state machine in hl_next_char()
hl_flags_e flags; // copied from the config; HL_FLAG_HTML enables HTML escaping of the output
usize kw_beg; // stream position (istr_tell) taken right after the first char of the current token
strview_t colors[HL_COLOR__COUNT]; // todo: maybe should be str_t?
outstream_t ostr; // output stream the highlighted text is written to
hl_hashtable_t kw_htable; // keyword text -> color
bool symbol_table[256]; // indexed by (unsigned char); true if the char is highlighted as a symbol
} hl_ctx_t;
// Built-in keyword list that hl_init() loads into every context's keyword
// table. KW builds one entry from a string literal and the suffix of an
// HL_COLOR_* constant; sizeof(str)-1 is the literal's length without the
// terminating NUL.
#define KW(str, col) { { str, sizeof(str)-1 }, HL_COLOR_##col }
static hl_keyword_t hl_c_cpp_kwrds[] = {
/* C Types. */
KW("double", TYPES),
KW("int", TYPES),
KW("long", TYPES),
KW("char", TYPES),
KW("float", TYPES),
KW("short", TYPES),
KW("unsigned", TYPES),
KW("signed", TYPES),
KW("bool", TYPES),
/* Common typedefs. */
KW("int8_t", TYPES), KW("uint8_t", TYPES),
KW("int16_t", TYPES), KW("uint16_t", TYPES),
KW("int32_t", TYPES), KW("uint32_t", TYPES),
KW("int64_t", TYPES), KW("uint64_t", TYPES),
KW("int8", TYPES), KW("uint8", TYPES),
KW("int16", TYPES), KW("uint16", TYPES),
KW("int32", TYPES), KW("uint32", TYPES),
KW("int64", TYPES), KW("uint64", TYPES),
KW("i8", TYPES), KW("u8", TYPES),
KW("i16", TYPES), KW("u16", TYPES),
KW("i32", TYPES), KW("u32", TYPES),
KW("i64", TYPES), KW("u64", TYPES),
/* Colla keywords */
KW("uchar", TYPES),
KW("ushort", TYPES),
KW("uint", TYPES),
KW("usize", TYPES),
KW("isize", TYPES),
KW("byte", TYPES),
/* Other keywords. */
KW("auto", KEYWORDS), KW("struct", KEYWORDS), KW("break", KEYWORDS),
KW("else", KEYWORDS), KW("switch", KEYWORDS), KW("case", KEYWORDS),
KW("enum", KEYWORDS), KW("register", KEYWORDS), KW("typedef", KEYWORDS),
KW("extern", KEYWORDS), KW("return", KEYWORDS), KW("union", KEYWORDS),
KW("const", KEYWORDS), KW("continue", KEYWORDS), KW("for", KEYWORDS),
KW("void", KEYWORDS), KW("default", KEYWORDS), KW("goto", KEYWORDS),
KW("sizeof", KEYWORDS), KW("volatile", KEYWORDS), KW("do", KEYWORDS),
KW("if", KEYWORDS), KW("static", KEYWORDS), KW("inline", KEYWORDS),
KW("while", KEYWORDS),
};
#undef KW
// Characters highlighted as symbols by default. hl_init() copies this table
// into each context's symbol_table; every entry not listed here is false.
static bool hl_default_symbols_table[256] = {
['['] = true, [']'] = true, ['('] = true,
[')'] = true, ['{'] = true, ['}'] = true,
['*'] = true, [':'] = true, ['='] = true,
[';'] = true, ['-'] = true, ['>'] = true,
['&'] = true, ['+'] = true, ['~'] = true,
['!'] = true, ['/'] = true, ['%'] = true,
['<'] = true, ['^'] = true, ['|'] = true,
['?'] = true, ['#'] = true,
};
// Internal helpers, defined in the STATIC FUNCTIONS section further down.

// Writes one character to the output, HTML-escaped when HL_FLAG_HTML is set.
static void hl_write_char(hl_ctx_t *ctx, char c);
// Writes a whole view to the output, HTML-escaped when HL_FLAG_HTML is set.
static void hl_write(hl_ctx_t *ctx, strview_t v);
// True for letters, digits and '_'.
static bool hl_is_char_keyword(char c);
// If <c> is enabled in the symbol table, writes it wrapped in the symbol
// color and returns true; otherwise writes nothing and returns false.
static bool hl_highlight_symbol(hl_ctx_t *ctx, char c);
// Returns the color to use for <keyword>, or HL_COLOR__COUNT if it has none.
static hl_color_e hl_get_keyword_color(hl_ctx_t *ctx, strview_t keyword);
// True when every character is 'A'-'Z', a digit or '_'.
static bool hl_is_capitalised(strview_t string);
// Resets the state to HL_STATE_DEFAULT and returns a view over the token
// whose start position was recorded as <beg>.
static strview_t hl_finish_keyword(hl_ctx_t *ctx, usize beg, instream_t *in);
// Writes <keyword> preceded by colors[color] and followed by the normal color.
static void hl_print_keyword(hl_ctx_t *ctx, strview_t keyword, hl_color_e color);
/*
 * Creates a highlighter context allocated from <arena>.
 *
 * Flags and colors are copied from <config>, the symbol table starts as a
 * copy of the default one, and the keyword table is filled with the
 * built-in keywords followed by config->extra_kwrds (so an extra keyword
 * overrides a built-in one with the same text).
 *
 * Returns NULL (after logging an error) when <config> is NULL.
 */
hl_ctx_t *hl_init(arena_t *arena, hl_config_t *config) {
    if (config == NULL) {
        err("<config> cannot be null");
        return NULL;
    }

    hl_ctx_t *ctx = alloc(arena, hl_ctx_t);

    ctx->flags = config->flags;
    memcpy(ctx->symbol_table, hl_default_symbols_table, sizeof(hl_default_symbols_table));
    memcpy(ctx->colors, config->colors, sizeof(config->colors));

    const int builtin_count = arrlen(hl_c_cpp_kwrds);
    ctx->kw_htable = hl_htable_init(arena, 8);

    // built-in keywords first...
    for (int k = 0; k < builtin_count; ++k) {
        hl_htable_add(arena, &ctx->kw_htable, hl_c_cpp_kwrds[k].keyword, hl_c_cpp_kwrds[k].color);
    }

    // ...then the user supplied ones
    for (int k = 0; k < config->kwrds_count; ++k) {
        hl_htable_add(arena, &ctx->kw_htable, config->extra_kwrds[k].keyword, config->extra_kwrds[k].color);
    }

    return ctx;
}
/*
 * Advances the highlighter by one input character.
 *
 * Reads one character from <in> and, depending on the lexer state stored
 * in <ctx>, either starts a token, keeps accumulating it, or finishes it
 * and writes it (wrapped in its color) to ctx->ostr. Characters that do
 * not belong to any token are written right away.
 *
 * Tokens are not buffered: only the position where a token started is
 * kept in ctx->kw_beg, and hl_finish_keyword() later builds a view into
 * the input from that position up to (but excluding) the current char.
 */
void hl_next_char(hl_ctx_t *ctx, instream_t *in) {
    char cur = istr_get(in);
    // true when the stream has nothing left after reading <cur>
    bool is_last = istr_is_finished(in);

    switch (ctx->state) {
        case HL_STATE_DEFAULT:
        {
            /*
             * If potential keyword.
             *
             * A valid C identifier may contain numbers, but *not*
             * as its first character.
             */
            if (hl_is_char_keyword(cur) && !char_is_num(cur)) {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_KEYWORD;
            }
            // potential number
            else if (char_is_num(cur)) {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_NUMBER;
            }
            // potential char
            else if (cur == '\'') {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_CHAR;
            }
            // potential string
            else if (cur == '"') {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_STRING;
            }
            // line or multiline comment
            else if (cur == '/') {
                // single line comment
                if (istr_peek(in) == '/') {
                    // rewind before comment begins
                    istr_rewind_n(in, 1);
                    // comment until the end of line
                    hl_print_keyword(ctx, istr_get_line(in), HL_COLOR_COMMENT);
                }
                // multiline comment
                else if (istr_peek(in) == '*') {
                    ctx->state = HL_STATE_COMMENT_MULTI;
                    ctx->kw_beg = istr_tell(in);
                    istr_skip(in, 1); // skip *
                }
                else {
                    // maybe a symbol?
                    hl_highlight_symbol(ctx, cur);
                }
            }
            // preprocessor
            else if (cur == '#') {
                // print the # as a symbol
                hl_highlight_symbol(ctx, cur);
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_PREPROCESSOR;
            }
            // other supported symbols
            else if (hl_highlight_symbol(ctx, cur)) {
                // noop
            }
            else {
                hl_write_char(ctx, cur);
            }
            break;
        }

        case HL_STATE_KEYWORD:
        {
            // end of keyword, check if it really is a valid keyword
            if (!hl_is_char_keyword(cur)) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                hl_color_e kw_color = hl_get_keyword_color(ctx, keyword);

                if (kw_color != HL_COLOR__COUNT) {
                    hl_print_keyword(ctx, keyword, kw_color);
                    // maybe we should highlight this remaining char.
                    if (!hl_highlight_symbol(ctx, cur)) {
                        hl_write_char(ctx, cur);
                    }
                }
                /*
                 * If not keyword, maybe its a function call.
                 *
                 * Important to note that this is hacky and will only work
                 * if there is no space between keyword and '('.
                 */
                else if (cur == '(') {
                    hl_print_keyword(ctx, keyword, HL_COLOR_FUNC);
                    // Opening parenthesis will always be highlighted
                    hl_highlight_symbol(ctx, cur);
                }
                else {
                    if (hl_is_capitalised(keyword)) {
                        hl_print_keyword(ctx, keyword, HL_COLOR_MACRO);
                    }
                    else {
                        hl_write(ctx, keyword);
                    }
                    if (!hl_highlight_symbol(ctx, cur)) {
                        hl_write_char(ctx, cur);
                    }
                }
            }
            break;
        }

        case HL_STATE_NUMBER:
        {
            char c = char_lower(cur);
            /*
             * Should we end the state?.
             *
             * Very important observation:
             * Although the number highlight works fine for most (if not all)
             * of the possible cases, it also assumes that the code is written
             * correctly and the source is able to compile, meaning that:
             *
             * Numbers like: 123, 0xABC123, 12.3e4f, 123ULL....
             * will be correctly identified and highlighted
             *
             * But, 'numbers' like: 123ABC, 0xxxxABCxx123, 123UUUUU....
             * will also be highlighted.
             *
             * It also assumes that no keyword will start with a number
             * and everything starting with a number (except inside strings or
             * comments) will be a number.
             */
            if (!char_is_num(c) &&
                (c < 'a' || c > 'f') &&
                c != 'b' && c != 'x' &&
                c != 'u' && c != 'l' &&
                c != '.'
            ) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);

                // if not a valid char keyword: valid number
                if (!hl_is_char_keyword(cur)) {
                    hl_print_keyword(ctx, keyword, HL_COLOR_NUMBER);
                }
                else {
                    hl_write(ctx, keyword);
                }

                // maybe we should highlight this remaining char.
                if (!hl_highlight_symbol(ctx, cur)) {
                    hl_write_char(ctx, cur);
                }
            }
            break;
        }

        case HL_STATE_CHAR:
        {
            if (is_last || (cur == '\'' && istr_peek(in) != '\'')) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                // the view stops before <cur>: include the closing quote
                keyword.len++;
                hl_print_keyword(ctx, keyword, HL_COLOR_STRING);
            }
            break;
        }

        case HL_STATE_STRING:
        {
            if (is_last || (cur == '"' && istr_prev_prev(in) != '\\')) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                // the view stops before <cur>: include the closing quote
                keyword.len++;
                hl_print_keyword(ctx, keyword, HL_COLOR_STRING);
            }
            break;
        }

        case HL_STATE_COMMENT_MULTI:
        {
            /*
             * If we are at the end of the input _or_ have identified
             * an end of comment...
             */
            bool is_closing = !is_last && cur == '*' && istr_peek(in) == '/';

            if (is_last || is_closing) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                if (is_closing) {
                    /*
                     * The view ends right before the '*'. Extend it over
                     * the closing "* /" pair and consume the '/', otherwise
                     * the '*' is lost and the '/' is lexed again as a
                     * stand-alone symbol.
                     */
                    keyword.len += 2;
                    istr_skip(in, 1);
                }
                hl_print_keyword(ctx, keyword, HL_COLOR_COMMENT);
            }
            break;
        }

        case HL_STATE_PREPROCESSOR:
        {
            // spaces (or anything else) between '#' and the directive name
            if (!hl_is_char_keyword(cur)) {
                hl_write_char(ctx, cur);
                break;
            }

            /*
             * If the input at <cur> spells <str>, consume it, print it with
             * the preprocessor color and switch to <new_state>.
             *
             * The 'break' leaves the enclosing switch case, which is why
             * this cannot be wrapped in a do { } while (0).
             */
            #define hl_check(str, new_state) \
                if (cur == str[0]) { \
                    instream_t temp = *in; \
                    strview_t a = { &(str[1]), sizeof(str) - 2 }; \
                    strview_t b = istr_get_view_len(&temp, a.len); \
                    if (strv_equals(a, b)) { \
                        *in = temp; \
                        hl_print_keyword(ctx, (strview_t){ str, sizeof(str) - 1 }, HL_COLOR_PREPROC); \
                        ctx->state = new_state; \
                        break; \
                    } \
                }

            // NOTE(review): kw_beg was taken right after the '#', so this
            // view starts at the '#' that was already printed as a symbol
            // and stops before <cur> -- confirm this is the intended output
            // for an input that ends right after '#' + one character.
            if (is_last) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                hl_print_keyword(ctx, keyword, HL_COLOR_PREPROC);
                break;
            }

            // longer names must come before their prefixes (ifdef before if)
            hl_check("include", HL_STATE_PREPROCESSOR_INCLUDE)
            hl_check("define",  HL_STATE_DEFAULT)
            hl_check("undef",   HL_STATE_DEFAULT)
            hl_check("ifdef",   HL_STATE_DEFAULT)
            hl_check("ifndef",  HL_STATE_DEFAULT)
            hl_check("if",      HL_STATE_DEFAULT)
            hl_check("elif",    HL_STATE_DEFAULT)
            hl_check("else",    HL_STATE_DEFAULT)
            hl_check("endif",   HL_STATE_DEFAULT)
            hl_check("error",   HL_STATE_DEFAULT)
            hl_check("warning", HL_STATE_DEFAULT)
            hl_check("line",    HL_STATE_DEFAULT)
            hl_check("pragma",  HL_STATE_DEFAULT)

            #undef hl_check

            /*
             * Not a directive we know (e.g. the operand of a stringify
             * '#x'): lex it as a regular identifier starting at <cur>.
             * Without this the characters were silently dropped and the
             * state machine stayed in the preprocessor state.
             */
            ctx->kw_beg = istr_tell(in);
            ctx->state = HL_STATE_KEYWORD;
            break;
        }

        /*
         * Preprocessor/Preprocessor include
         *
         * This is a 'dumb' preprocessor highlighter:
         * it highlights everything with the same color
         * and if and only if an '#include' is detected
         * the included header will be handled as string
         * and thus, will have the same color as the string.
         *
         * In fact, it is somehow similar to what GtkSourceView
         * does (Mousepad, Gedit...) but with one silly difference:
         * single-line/multi-line comments will not be handled
         * while inside the preprocessor state, meaning that
         * comments will also have the same color as the remaining
         * of the line, yeah, ugly.
         */
        case HL_STATE_PREPROCESSOR_INCLUDE:
        {
            if (cur == '<' || cur == '"' || is_last) {
                ctx->kw_beg = istr_tell(in);
                ctx->state = HL_STATE_PREPROCESSOR_INCLUDE_STRING;
            }
            else {
                hl_write_char(ctx, cur);
            }
            break;
        }

        case HL_STATE_PREPROCESSOR_INCLUDE_STRING:
        {
            if (cur == '>' || cur == '"' || is_last) {
                strview_t keyword = hl_finish_keyword(ctx, ctx->kw_beg, in);
                // the view stops before <cur>: include the closing delimiter
                keyword.len += 1;
                hl_print_keyword(ctx, keyword, HL_COLOR_STRING);
            }
            break;
        }
    }
}
/*
 * Highlights <data> and returns the result as a string whose output
 * stream is created from <arena>.
 *
 * The lexer state of <ctx> is reset first, so the same context can be
 * reused for several inputs. hl_next_char() is called once per remaining
 * character and then one additional time after the stream is exhausted,
 * which gives a token that reaches the end of the input a chance to be
 * written out.
 */
str_t hl_highlight(arena_t *arena, hl_ctx_t *ctx, strview_t data) {
    ctx->ostr = ostr_init(arena);
    ctx->state = HL_STATE_DEFAULT;
    ctx->kw_beg = 0;

    instream_t input = istr_init(data);

    for (;;) {
        // remember whether this is the extra call past the end
        bool exhausted = istr_is_finished(&input);
        hl_next_char(ctx, &input);
        if (exhausted) {
            break;
        }
    }

    return ostr_to_str(&ctx->ostr);
}
/*
 * Enables (value == true) or disables symbol highlighting for <symbol>
 * in this context. Does nothing when <ctx> is NULL.
 */
void hl_set_symbol_in_table(hl_ctx_t *ctx, char symbol, bool value) {
    if (ctx != NULL) {
        // go through unsigned char so that negative chars index 128..255
        unsigned char slot = (unsigned char)symbol;
        ctx->symbol_table[slot] = value;
    }
}
/*
 * Registers <keyword> in the context's keyword table, overriding the
 * color of an already registered keyword with the same text. The node is
 * allocated from <arena>.
 *
 * Does nothing when <ctx> or <keyword> is NULL, matching the NULL
 * tolerance of hl_set_symbol_in_table().
 */
void hl_add_keyword(arena_t *arena, hl_ctx_t *ctx, hl_keyword_t *keyword) {
    if (!ctx || !keyword) return;
    hl_htable_add(arena, &ctx->kw_htable, keyword->keyword, keyword->color);
}
//// HASH TABLE ///////////////////////////////////////////////////
/*
 * Creates a hash table with 2^pow2_exp buckets allocated from <arena>.
 *
 * The bucket count is kept a power of two because lookups reduce the
 * hash with a bit mask (count - 1) instead of a modulo.
 * <pow2_exp> must be smaller than the bit width of uint.
 */
static hl_hashtable_t hl_htable_init(arena_t *arena, uint pow2_exp) {
    // shift an unsigned one: shifting a signed 1 into the sign bit is undefined
    uint count = 1u << pow2_exp;
    return (hl_hashtable_t) {
        .count = count,
        .buckets = alloc(arena, hl_node_t*, count),
    };
}
/*
 * Stores <value> for <key> in <table>, overwriting the value when the key
 * is already present.
 *
 * Returns HL_HTABLE_FAILED when <table> is NULL or has no buckets,
 * HL_HTABLE_REPLACED when an existing entry was updated, and
 * HL_HTABLE_ADDED when a new node was allocated from <arena>.
 * The key view is stored as given (its characters are not copied).
 */
static hl_htable_result_e hl_htable_add(arena_t *arena, hl_hashtable_t *table, strview_t key, hl_color_e value) {
    // same guard as hl_htable_get(): with count == 0 the mask below
    // would become all ones and index far outside the bucket array
    if (!table || table->count == 0) {
        return HL_HTABLE_FAILED;
    }

    if ((float)table->used >= table->count * 0.6f) {
        warn("more than 60%% of the hash table is being used: %u/%u", table->used, table->count);
    }

    u64 hash = hl_htable_hash(key.buf, key.len);
    // assumes count is a power of two, as produced by hl_htable_init()
    usize index = hash & (table->count - 1);
    hl_node_t *head = table->buckets[index];

    for (hl_node_t *node = head; node; node = node->next) {
        // already exists
        if (strv_equals(node->key, key)) {
            node->value = value;
            return HL_HTABLE_REPLACED;
        }
    }

    // only a new key landing in an occupied bucket counts as a collision,
    // overwriting an existing key does not
    if (head) table->collisions++;

    // push the new node at the front of the bucket's chain
    hl_node_t *node = alloc(arena, hl_node_t);
    node->key = key;
    node->value = value;
    node->next = head;
    table->buckets[index] = node;
    table->used++;

    return HL_HTABLE_ADDED;
}
/*
 * Looks up <key> in <table>.
 *
 * Returns the node holding the key, or NULL when the key is not stored,
 * <table> is NULL or the table has no buckets.
 */
static hl_node_t *hl_htable_get(hl_hashtable_t *table, strview_t key) {
    if (table == NULL || table->count == 0) {
        return NULL;
    }

    u64 hash = hl_htable_hash(key.buf, key.len);
    usize slot = hash & (table->count - 1);

    // walk the chain of the selected bucket
    for (hl_node_t *node = table->buckets[slot]; node != NULL; node = node->next) {
        if (strv_equals(node->key, key)) {
            return node;
        }
    }

    return NULL;
}
/*
 * Hashes <count> bytes starting at <bytes> with the sdbm algorithm:
 * for every byte, hash = byte + (hash << 6) + (hash << 16) - hash.
 * Returns 0 for an empty input.
 */
static u64 hl_htable_hash(const void *bytes, usize count) {
    const u8 *cursor = bytes;
    u64 acc = 0;

    for (usize left = count; left > 0; --left) {
        u8 byte = *cursor;
        ++cursor;
        acc = byte + (acc << 6) + (acc << 16) - acc;
    }

    return acc;
}
//// STATIC FUNCTIONS /////////////////////////////////////////////
/*
 * Writes <c> to <out>, replacing the characters that are significant in
 * HTML markup ('&', '<', '>') with their named character references.
 *
 * The references are written with their terminating ';'. Without it the
 * markup is malformed, and a reference directly followed by letters or
 * digits from the source text is ambiguous.
 */
static inline void hl_escape_html(outstream_t *out, char c) {
    switch (c) {
        case '&':
            ostr_puts(out, strv("&amp;"));
            break;
        case '<':
            ostr_puts(out, strv("&lt;"));
            break;
        case '>':
            ostr_puts(out, strv("&gt;"));
            break;
        default:
            ostr_putc(out, c);
            break;
    }
}
/*
 * Writes a single character to the context's output stream.
 * With HL_FLAG_HTML set the character goes through hl_escape_html(),
 * otherwise it is written unchanged.
 */
static void hl_write_char(hl_ctx_t *ctx, char c) {
    outstream_t *out = &ctx->ostr;

    if (!(ctx->flags & HL_FLAG_HTML)) {
        ostr_putc(out, c);
        return;
    }

    hl_escape_html(out, c);
}
/*
 * Writes the view <v> to the context's output stream.
 * With HL_FLAG_HTML set every character goes through hl_escape_html(),
 * otherwise the view is written in one go.
 */
static void hl_write(hl_ctx_t *ctx, strview_t v) {
    outstream_t *out = &ctx->ostr;

    if (!(ctx->flags & HL_FLAG_HTML)) {
        ostr_puts(out, v);
        return;
    }

    usize at = 0;
    while (at < v.len) {
        hl_escape_html(out, v.buf[at]);
        ++at;
    }
}
/*
 * Returns true when <c> can be part of an identifier:
 * a letter, a digit or an underscore.
 */
static bool hl_is_char_keyword(char c) {
    if (char_is_alpha(c)) {
        return true;
    }
    if (char_is_num(c)) {
        return true;
    }
    return c == '_';
}
/*
 * Writes <c> wrapped in the symbol color, followed by the normal color,
 * if the character is enabled in the context's symbol table.
 *
 * Returns true when the character was written, false when it is not a
 * symbol (in which case nothing is written).
 */
static bool hl_highlight_symbol(hl_ctx_t *ctx, char c) {
    bool is_symbol = ctx->symbol_table[(unsigned char)c];

    if (is_symbol) {
        ostr_puts(&ctx->ostr, ctx->colors[HL_COLOR_SYMBOL]);
        hl_write_char(ctx, c);
        ostr_puts(&ctx->ostr, ctx->colors[HL_COLOR_NORMAL]);
    }

    return is_symbol;
}
/*
 * Returns the color to use for <keyword>, or HL_COLOR__COUNT when the
 * word is not a keyword.
 *
 * Keywords registered in the table take precedence. Only unknown words
 * fall back to the "_t" suffix heuristic, which colors them as custom
 * types. Checking the suffix first made every registered keyword ending
 * in "_t" (uint8_t, int32_t, user supplied ones...) unreachable.
 */
static hl_color_e hl_get_keyword_color(hl_ctx_t *ctx, strview_t keyword) {
    hl_node_t *node = hl_htable_get(&ctx->kw_htable, keyword);
    if (node) {
        return node->value;
    }

    // todo: make this an option?
    if (strv_ends_with_view(keyword, strv("_t"))) {
        return HL_COLOR_CUSTOM_TYPES;
    }

    return HL_COLOR__COUNT;
}
/*
 * Returns true when <string> consists only of upper case ASCII letters,
 * digits and underscores (the usual spelling of a macro name).
 * An empty view yields true.
 */
static bool hl_is_capitalised(strview_t string) {
    usize at = 0;

    while (at < string.len) {
        char ch = string.buf[at++];
        bool is_upper = ch >= 'A' && ch <= 'Z';

        if (!(char_is_num(ch) || ch == '_' || is_upper)) {
            return false;
        }
    }

    return true;
}
/*
 * Ends the current token: resets the state to HL_STATE_DEFAULT and
 * returns a view into the input covering the token.
 *
 * <beg> is the stream position recorded right after the token's first
 * character was read, hence the -1 to get back to that character. The
 * view stops right before the character that was read last, so callers
 * that want to include it extend the length themselves.
 */
static strview_t hl_finish_keyword(hl_ctx_t *ctx, usize beg, instream_t *in) {
    usize first = beg - 1;
    usize past_last = istr_tell(in) - 1;

    ctx->state = HL_STATE_DEFAULT;

    return strv(in->beg + first, past_last - first);
}
/*
 * Writes <keyword> to the output preceded by the string configured for
 * <color> and followed by the one configured for HL_COLOR_NORMAL.
 * The keyword itself goes through hl_write(), so it is HTML-escaped when
 * HL_FLAG_HTML is set; the color strings are written verbatim.
 */
static void hl_print_keyword(hl_ctx_t *ctx, strview_t keyword, hl_color_e color) {
    outstream_t *out = &ctx->ostr;
    strview_t open = ctx->colors[color];
    strview_t close = ctx->colors[HL_COLOR_NORMAL];

    ostr_puts(out, open);
    hl_write(ctx, keyword);
    ostr_puts(out, close);
}