utf8.c (1764B)
1 #include "rcx.h" 2 #include "utf8.h" 3 4 #define SURROGATE_MIN 0xD800 5 #define SURROGATE_MAX 0xDFFF 6 7 static const uchar utf8byte[] = { 0x0, 0xC0, 0xE0, 0xF0}; 8 static const uchar utf8mask[] = {0x80, 0xE0, 0xF0, 0xF8}; 9 static const rune utf8min[] = { 0x0, 0x80, 0x800, 0x10000}; 10 static const rune utf8max[] = {0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; 11 12 static bool 13 utf8_overlong(rune c, usize len) { 14 return c < utf8min[len-1]; 15 } 16 17 static bool 18 utf8_encodable(rune c) { 19 return c <= RUNE_MAX && (c < SURROGATE_MIN || c > SURROGATE_MAX); 20 } 21 22 static usize 23 utf8_len(rune c) { 24 if (!utf8_encodable(c)) 25 return 0; 26 27 usize len = 1; 28 while (c > utf8max[len-1]) 29 len++; 30 return len; 31 } 32 33 usize 34 r_utf8_encode(char *s, rune c) { 35 usize len = utf8_len(c); 36 if (!s || len == 0) 37 return len; 38 39 for (usize i = len-1; i > 0; i--) { /* Continuation bytes */ 40 ((uchar *)s)[i] = 0x80 | (c & 0x3F); 41 c >>= 6; 42 } 43 ((uchar *)s)[0] = utf8byte[len-1] | (uchar)c; /* Leading byte */ 44 45 return len; 46 } 47 48 usize 49 r_utf8_decode(rune *c, char *s, usize slen) { 50 if (c) 51 *c = RUNE_BAD; 52 53 if (!s || slen == 0) /* No input? */ 54 return 0; 55 56 /* Determine encoded sequence length based on first byte */ 57 usize len = 1; 58 for (; len <= R_UTF8_SIZE; len++) { 59 if (((uchar)s[0] & utf8mask[len-1]) == utf8byte[len-1]) 60 break; 61 } 62 if (len > R_UTF8_SIZE) /* Invalid leading byte? */ 63 return 1; 64 65 /* Decode codepoint */ 66 rune r = (uchar)s[0] & ~utf8mask[len-1]; 67 usize l = MIN(len, slen); 68 for (usize i = 1; i < l; i++) { 69 if (((uchar)s[i] & 0xC0) != 0x80) /* Invalid continuation byte? */ 70 return i; 71 r = (r << 6) | ((uchar)s[i] & 0x3F); 72 } 73 if (len > slen) 74 return 0; /* Looks good so far, but not enough input */ 75 76 if (c && utf8_encodable(r) && !utf8_overlong(r, len)) 77 *c = r; 78 return len; 79 }