colla/utf8.c
2024-11-29 16:10:48 +01:00

172 lines
4.4 KiB
C

#include "utf8.h"
// Payload bits of the lead byte, indexed by (sequence length - 1).
// utf8Decode ands the lead byte with masks[size - 1] to strip the length prefix.
static const uint8 masks[] = {
    0xff >> 1, // 1 octet:  0111-1111
    0xff >> 3, // 2 octets: 0001-1111
    0xff >> 4, // 3 octets: 0000-1111
    0xff >> 5, // 4 octets: 0000-0111
    0xff >> 6, // 5 octets: 0000-0011
    0xff >> 7  // 6 octets: 0000-0001
};
// Lead-byte classification table used by utf8Size: a byte `c` matches a row
// when (c & mask) == result, and that row's `octets` is the sequence length.
// Rows are tested in order, so the last row only sees what is left with the
// high bit set (continuation bytes 10xx-xxxx and 0xFE/0xFF) and reports -1.
static const struct {
    uint8 mask;
    uint8 result;
    int octets;
} sizes[] = {
    { 0x80, 0x00,  1 }, // 1000-0000, 0000-0000
    { 0xE0, 0xC0,  2 }, // 1110-0000, 1100-0000
    { 0xF0, 0xE0,  3 }, // 1111-0000, 1110-0000
    { 0xF8, 0xF0,  4 }, // 1111-1000, 1111-0000
    { 0xFC, 0xF8,  5 }, // 1111-1100, 1111-1000
    // the 6-octet lead byte is 1111-110x; the result must be 0xFC, otherwise
    // this row is shadowed by the 5-octet row above and can never match
    { 0xFE, 0xFC,  6 }, // 1111-1110, 1111-1100
    { 0x80, 0x80, -1 }, // 1000-0000, 1000-0000
};
/*
UTF-8 codepoints are encoded using the first bits of the first character
byte 1 | byte 2 | byte 3 | byte 4
0xxx xxxx | | |
110x xxxx | 10xx xxxx | |
1110 xxxx | 10xx xxxx | 10xx xxxx |
1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx
so when we decode it we first find the size of the codepoint (from 1 to 4)
then we apply the mask to the first byte to get the first character
then for each remaining byte we shift the rune left by 6 and or in the low 6 bits of that byte (byte & 0x3f)
until the codepoint is finished (size is 0)
## EXAMPLE
utf8 string (€) = 1110-0010 1000-0010 1010-1100
cp = 0000-0000 0000-0000 0000-0000 0000-0000
size = 3
mask = 0x0f -> 0000-1111
cp = *s & mask = 1110-0010 & 0000-1111 = 0000-0000 0000-0000 0000-0000 0000-0010
++s = 1000-0010
--size = 2
cp <<= 6 = 0000-0000 0000-0000 0000-0000 1000-0000
cp |= *s & 0x3f = 1000-0010 & 0011-1111 = 0000-0000 0000-0000 0000-0000 1000-0010
++s = 1010-1100
--size = 1
cp <<= 6 = 0000-0000 0000-0000 0010-0000 1000-0000
cp |= *s & 0x3f = 1010-1100 & 0011-1111 = 0000-0000 0000-0000 0010-0000 1010-1100
++s = ----------
final codepoint = 0010-0000 1010-1100
€ codepoint = 0010-0000 1010-1100
*/
rune utf8Decode(const char **char_str) {
const uint8 **s = (const uint8 **)char_str;
rune ch = 0;
// if is ascii
if (**s < 128) {
ch = **s;
++*s;
return ch;
}
int size = utf8Size((const char *)*s);
if (size == -1) {
++*s;
return UTF8_INVALID;
}
uint8 mask = masks[size - 1];
ch = **s & mask;
++*s;
while(--size) {
ch <<= 6;
ch |= **s & 0x3f; // 0011-1111
++*s;
}
return ch;
}
/*
to encode a codepoint in a utf8 string we first need to find
the length of the codepoint
then we start from the rightmost byte and loop for each byte of the codepoint
using the length we got before until the first byte (which we skip)
> and (&) with 0x3f so we keep only the lowest six bits of the codepoint
> or (|) with 0x80 so we make sure that the first two bits are 10
> bitshift the codepoint right 6
finally, we apply the correct length-mask to the first byte
## EXAMPLE
ch € = 0010-0000 1010-1100
ch < 0x10000
first = 0xe0 = 1110-0000
len = 3
str[2] = (ch & 0x3f) | 0x80 = 1010-1100 & 0011-1111 | 1000-0000
= 1010-1100
ch >>= 6 = 0010-0000 1010-1100 >> 6 = 1000-0010
str[1] = (ch & 0x3f) | 0x80 = 1000-0010 & 0011-1111 | 1000-0000
= 1000-0010
ch >>= 6 = 1000-0010 >> 6 = 0000-0010
str[0] = ch | first_mask = 0000-0010 | 1110-0000
= 1110-0010
str = 1110-0010 1000-0010 1010-1100
utf8 € = 1110-0010 1000-0010 1010-1100
*/
/*
    Writes the utf8 encoding of `codepoint` into `str` and returns how many
    bytes were written (1 to 4). No NUL terminator is appended, so `str` needs
    room for up to 4 bytes.
*/
usize utf8Encode(char *str, rune codepoint) {
    // start from the widest form and narrow it down
    uint8 first = 0xf0; // 1111-0000
    usize len = 4;

    if (codepoint < 0x80) {
        first = 0;
        len = 1;
    }
    else if (codepoint < 0x800) {
        first = 0xc0;   // 1100-0000
        len = 2;
    }
    else if (codepoint < 0x10000) {
        first = 0xe0;   // 1110-0000
        len = 3;
    }

    // fill the continuation bytes back to front, 6 bits at a time:
    // keep the low 6 bits (0x3f) and tag them with the 10xx-xxxx prefix (0x80)
    usize idx = len;
    while (--idx > 0) {
        str[idx] = (codepoint & 0x3f) | 0x80;
        codepoint >>= 6;
    }

    // whatever is left of the codepoint goes into the lead byte
    str[0] = (char)(codepoint | first);
    return len;
}
int utf8Size(const char *str) {
uint8 c = (uint8)*str;
for(usize i = 0; i < (sizeof(sizes) / sizeof(*sizes)); ++i) {
if ((c & sizes[i].mask) == sizes[i].result) {
return sizes[i].octets;
}
}
return -1;
}
/*
    Returns how many bytes utf8Encode needs for `ch`: 1 below 0x80,
    2 below 0x800, 3 below 0x10000 and 4 for everything else.
*/
usize utf8CpSize(rune ch) {
    // one byte minimum, plus one for every threshold the codepoint reaches
    usize octets = 1;
    if (ch >= 0x80) {
        ++octets;
    }
    if (ch >= 0x800) {
        ++octets;
    }
    if (ch >= 0x10000) {
        ++octets;
    }
    return octets;
}