172 lines
4.4 KiB
C
172 lines
4.4 KiB
C
#include "utf8.h"
|
|
|
|
// Payload mask for the lead byte of a sequence, indexed by (length - 1):
// the longer the sequence, the fewer codepoint bits the lead byte carries.
static const uint8 masks[] = {
    [0] = 0x7f, // 1 byte : 0111-1111
    [1] = 0x1f, // 2 bytes: 0001-1111
    [2] = 0x0f, // 3 bytes: 0000-1111
    [3] = 0x07, // 4 bytes: 0000-0111
    [4] = 0x03, // 5 bytes: 0000-0011
    [5] = 0x01, // 6 bytes: 0000-0001
};
|
|
|
|
static struct {
|
|
uint8 mask;
|
|
uint8 result;
|
|
int octets;
|
|
} sizes[] = {
|
|
{ 0x80, 0x00, 1 }, // 1000-0000, 0000-0000
|
|
{ 0xE0, 0xC0, 2 }, // 1110-0000, 1100-0000
|
|
{ 0xF0, 0xE0, 3 }, // 1111-0000, 1110-0000
|
|
{ 0xF8, 0xF0, 4 }, // 1111-1000, 1111-0000
|
|
{ 0xFC, 0xF8, 5 }, // 1111-1100, 1111-1000
|
|
{ 0xFE, 0xF8, 6 }, // 1111-1110, 1111-1000
|
|
{ 0x80, 0x80, -1 }, // 1000-0000, 1000-0000
|
|
};
|
|
|
|
|
|
/*
|
|
In UTF-8 the leading bits of the first byte tell how many bytes the codepoint occupies
|
|
|
|
byte 1 | byte 2 | byte 3 | byte 4
|
|
0xxx xxxx | | |
|
|
110x xxxx | 10xx xxxx | |
|
|
1110 xxxx | 10xx xxxx | 10xx xxxx |
|
|
1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx
|
|
|
|
so when we decode it we first find the size of the codepoint (from 1 to 4)
|
|
then we apply the mask to the first byte to get the first character
|
|
then we keep shifting the rune left 6 and applying the next byte to the mask
|
|
until the codepoint is finished (size is 0)
|
|
|
|
## EXAMPLE
|
|
|
|
utf8 string (€) = 1110-0010 1000-0010 1010-1100
|
|
|
|
cp = 0000-0000 0000-0000 0000-0000 0000-0000
|
|
size = 3
|
|
mask = 0x0f -> 0000-1111
|
|
cp = *s & mask = 1110-0010 & 0000-1111 = 0000-0000 0000-0000 0000-0000 0000-0010
|
|
++s = 1000-0010
|
|
|
|
--size = 2
|
|
cp <<= 6 = 0000-0000 0000-0000 0000-0000 1000-0000
|
|
cp |= *s & 0x3f = 1000-0010 & 0011-1111 = 0000-0000 0000-0000 0000-0000 1000-0010
|
|
++s = 1010-1100
|
|
|
|
--size = 1
|
|
cp <<= 6 = 0000-0000 0000-0000 0010-0000 1000-0000
|
|
cp |= *s & 0x3f = 1010-1100 & 0011-1111 = 0000-0000 0000-0000 0010-0000 1010-1100
|
|
++s = ----------
|
|
|
|
final codepoint = 0010-0000 1010-1100
|
|
€ codepoint = 0010-0000 1010-1100
|
|
*/
|
|
|
|
rune utf8Decode(const char **char_str) {
|
|
const uint8 **s = (const uint8 **)char_str;
|
|
|
|
rune ch = 0;
|
|
// if is ascii
|
|
if (**s < 128) {
|
|
ch = **s;
|
|
++*s;
|
|
return ch;
|
|
}
|
|
int size = utf8Size((const char *)*s);
|
|
if (size == -1) {
|
|
++*s;
|
|
return UTF8_INVALID;
|
|
}
|
|
uint8 mask = masks[size - 1];
|
|
ch = **s & mask;
|
|
++*s;
|
|
while(--size) {
|
|
ch <<= 6;
|
|
ch |= **s & 0x3f; // 0011-1111
|
|
++*s;
|
|
}
|
|
return ch;
|
|
}
|
|
|
|
|
|
/*
|
|
to encode a codepoint in a utf8 string we first need to find
|
|
the length of the codepoint
|
|
then we start from the rightmost byte and loop for each byte of the codepoint
|
|
using the length we got before until the first byte (which we skip)
|
|
> and (&) with 0x3f so we keep only the low six bits (the top two bits are ignored)
|
|
> or (|) with 0x80 so we make sure that the first two bits are 10
|
|
> bitshift the codepoint right 6
|
|
|
|
finally, we apply the correct length-mask to the first byte
|
|
|
|
## EXAMPLE
|
|
|
|
ch € = 0010-0000 1010-1100
|
|
ch < 0x10000
|
|
first = 0xe0 = 1110-0000
|
|
len = 3
|
|
|
|
str[2] = (ch & 0x3f) | 0x80 = 1010-1100 & 0011-1111 | 1000-0000
|
|
= 1010-1100
|
|
ch >>= 6 = 0010-0000 1010-1100 >> 6 = 1000-0010
|
|
|
|
str[1] = (ch & 0x3f) | 0x80 = 1000-0010 & 0011-1111 | 1000-0000
|
|
= 1000-0010
|
|
ch >>= 6 = 1000-0010 >> 6 = 0000-0010
|
|
|
|
str[0] = ch | first_mask = 0000-0010 | 1110-0000
|
|
= 1110-0010
|
|
|
|
str = 1110-0010 1000-0010 1010-1100
|
|
utf8 € = 1110-0010 1000-0010 1010-1100
|
|
*/
|
|
|
|
/*
    Writes the UTF-8 encoding of codepoint into str and returns the number
    of bytes written (1 to 4). No terminator is appended; str must have
    room for the returned number of bytes.
*/
usize utf8Encode(char *str, rune codepoint) {
    usize len = 4;
    uint8 first = 0xf0;                 // 1111-0000

    // pick the shortest form that can hold the codepoint
    if (codepoint < 0x80) {
        len = 1;
        first = 0;
    }
    else if (codepoint < 0x800) {
        len = 2;
        first = 0xc0;                   // 1100-0000
    }
    else if (codepoint < 0x10000) {
        len = 3;
        first = 0xe0;                   // 1110-0000
    }

    // fill the continuation bytes back to front, six bits at a time
    char *tail = str + len - 1;
    while (tail != str) {
        // keep the low six bits (0x3f) and tag the byte as 10xx-xxxx (0x80)
        *tail = (codepoint & 0x3f) | 0x80;
        codepoint >>= 6;
        --tail;
    }

    // whatever is left of the codepoint goes in the lead byte
    *str = (char)(codepoint | first);
    return len;
}
|
|
|
|
int utf8Size(const char *str) {
|
|
uint8 c = (uint8)*str;
|
|
for(usize i = 0; i < (sizeof(sizes) / sizeof(*sizes)); ++i) {
|
|
if ((c & sizes[i].mask) == sizes[i].result) {
|
|
return sizes[i].octets;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/*
    Returns how many bytes utf8Encode needs for the codepoint ch (1 to 4).
*/
usize utf8CpSize(rune ch) {
    // one byte is always needed; each threshold crossed adds one more
    usize octets = 1;
    if (ch >= 0x80) {
        ++octets;
    }
    if (ch >= 0x800) {
        ++octets;
    }
    if (ch >= 0x10000) {
        ++octets;
    }
    return octets;
}
|