colla/utf8.c

#include "utf8.h"

// Payload-bit masks for the lead byte of a sequence, indexed by (octets - 1):
// and-ing the lead byte with its mask strips the length prefix bits.
static const uint8 masks[] = {
    0xff >> 1, // 1 octet : keeps 7 bits -> 0x7f
    0xff >> 3, // 2 octets: keeps 5 bits -> 0x1f
    0xff >> 4, // 3 octets: keeps 4 bits -> 0x0f
    0xff >> 5, // 4 octets: keeps 3 bits -> 0x07
    0xff >> 6, // 5 octets: keeps 2 bits -> 0x03
    0xff >> 7  // 6 octets: keeps 1 bit  -> 0x01
};

// Lead-byte classification table, scanned from top to bottom by utf8Size():
// a byte c belongs to the first row for which (c & mask) == result.
static const struct {
    uint8 mask;   // bits of the byte that are inspected
    uint8 result; // value those bits must have for the row to match
    int octets;   // length of the whole sequence, -1 if the byte cannot start one
} sizes[] = {
    { 0x80, 0x00, 1 },  // 1000-0000, 0000-0000
    { 0xE0, 0xC0, 2 },  // 1110-0000, 1100-0000
    { 0xF0, 0xE0, 3 },  // 1111-0000, 1110-0000
    { 0xF8, 0xF0, 4 },  // 1111-1000, 1111-0000
    { 0xFC, 0xF8, 5 },  // 1111-1100, 1111-1000
    // the 6-octet prefix is 1111-110x; matching against 0xF8 here would
    // duplicate the 5-octet row above and make this row unreachable
    { 0xFE, 0xFC, 6 },  // 1111-1110, 1111-1100
    // catch-all: any other byte with the high bit set (continuation bytes
    // 10xx-xxxx, 0xFE, 0xFF) is not a valid first byte
    { 0x80, 0x80, -1 }, // 1000-0000, 1000-0000
};


/*
UTF-8 codepoints are encoded using the first bits of the first character

byte 1    | byte 2    | byte 3    | byte 4
0xxx xxxx |           |           |
110x xxxx | 10xx xxxx |           |
1110 xxxx | 10xx xxxx | 10xx xxxx |
1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx

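so when we decode it we first find the size of the sequence from its first byte
(1 to 4 in the table above, the lookup tables in this file also know 5 and 6)
then we apply the size-specific mask to the first byte to get the topmost bits
of the codepoint
then, for every following byte, we shift the rune left by 6 and or (|) in the
low 6 bits of that byte (byte & 0x3f)
until the sequence is finished (size is 0)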

## EXAMPLE

utf8 string (€) = 1110-0010 1000-0010 1010-1100

cp = 0000-0000 0000-0000 0000-0000 0000-0000
size = 3
mask = 0x0f -> 0000-1111
cp = *s & mask = 1110-0010 & 0000-1111 = 0000-0000 0000-0000 0000-0000 0000-0010
++s = 1000-0010

--size = 2
cp <<= 6 = 0000-0000 0000-0000 0000-0000 1000-0000
cp |= *s & 0x3f = 1000-0010 & 0011-1111 = 0000-0000 0000-0000 0000-0000 1000-0010
++s = 1010-1100

--size = 1
cp <<= 6 = 0000-0000 0000-0000 0010-0000 1000-0000
cp |= *s & 0x3f = 1010-1100 & 0011-1111 = 0000-0000 0000-0000 0010-0000 1010-1100
++s = ----------

final codepoint = 0010-0000 1010-1100
€ codepoint     = 0010-0000 1010-1100
*/

rune utf8Decode(const char **char_str) {
    const uint8 **s = (const uint8 **)char_str;

    rune ch = 0;
    // if is ascii
    if (**s < 128) {
        ch = **s;
        ++*s;
        return ch;
    }
    int size = utf8Size((const char *)*s);
    if (size == -1) {
        ++*s;
        return UTF8_INVALID;
    }
    uint8 mask = masks[size - 1];
    ch = **s & mask;
    ++*s;
    while(--size) {
        ch <<= 6;
        ch |= **s & 0x3f; // 0011-1111
        ++*s;
    }
    return ch;
}


/*
to encode a codepoint in a utf8 string we first need to find
the length of the codepoint
then we start from the rightmost byte and loop for each byte of the codepoint
using the length we got before until the first byte (which we skip)
> and (&) with 0x3f so we ignore the first two bits and keep only the low 6 bits of the codepoint
> or (|) with 0x80 so we make sure that the first two bits are 10
> bitshift the codepoint right 6

finally, we apply the correct length-mask to the first byte

## EXAMPLE

ch € = 0010-0000 1010-1100
ch < 0x10000
    first = 0xe0 = 1110-0000
    len = 3

str[2] = (ch & 0x3f) | 0x80 = 1010-1100 & 0011-1111 | 1000-0000
       = 1010-1100
ch >>= 6 = 0010-0000 1010-1100 >> 6 = 1000-0010

str[1] = (ch & 0x3f) | 0x80 = 1000-0010 & 0011-1111 | 1000-0000
       = 1000-0010
ch >>= 6 = 1000-0010 >> 6 = 0000-0010

str[0] = ch | first = 0000-0010 | 1110-0000
       = 1110-0010

str    = 1110-0010 1000-0010 1010-1100
utf8 € = 1110-0010 1000-0010 1010-1100
*/

/*
Writes the utf8 encoding of codepoint into str and returns how many bytes
were written (1 to 4). No terminator is appended and str is not bounds
checked: the caller must provide room for up to 4 bytes.
*/
usize utf8Encode(char *str, rune codepoint) {
    // start from the widest form and narrow it down
    usize octets = 4;
    uint8 lead = 0xf0;      // 1111-0000

    if (codepoint < 0x80) {
        octets = 1;
        lead = 0;           // plain ascii, no length prefix
    }
    else if (codepoint < 0x800) {
        octets = 2;
        lead = 0xc0;        // 1100-0000
    }
    else if (codepoint < 0x10000) {
        octets = 3;
        lead = 0xe0;        // 1110-0000
    }

    // fill the continuation bytes back to front, each one takes the
    // lowest 6 bits still left in the codepoint and is tagged with 10xx-xxxx
    usize pos = octets;
    while (--pos > 0) {
        str[pos] = (codepoint & 0x3f) | 0x80;
        codepoint >>= 6;
    }

    // whatever is left of the codepoint goes next to the length prefix
    str[0] = (char)(codepoint | lead);
    return octets;
}

int utf8Size(const char *str) {
    uint8 c = (uint8)*str;
    for(usize i = 0; i < (sizeof(sizes) / sizeof(*sizes)); ++i) {
        if ((c & sizes[i].mask) == sizes[i].result) {
            return sizes[i].octets;
        }
    }
    return -1;
}

/*
Returns how many bytes utf8Encode() writes for the codepoint ch (1 to 4).
*/
usize utf8CpSize(rune ch) {
    // one byte is always needed, every threshold crossed adds another one
    usize octets = 1;
    if (ch >= 0x80) {
        ++octets;
    }
    if (ch >= 0x800) {
        ++octets;
    }
    if (ch >= 0x10000) {
        ++octets;
    }
    return octets;
}