rcx

miscellaneous C library
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

utf8.c (1764B)


      1 #include "rcx.h"
      2 #include "utf8.h"
      3 
      4 #define SURROGATE_MIN 0xD800
      5 #define SURROGATE_MAX 0xDFFF
      6 
      7 static const uchar utf8byte[] = { 0x0,  0xC0,   0xE0,     0xF0};
      8 static const uchar utf8mask[] = {0x80,  0xE0,   0xF0,     0xF8};
      9 static const rune  utf8min[]  = { 0x0,  0x80,  0x800,  0x10000};
     10 static const rune  utf8max[]  = {0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
     11 
     12 static bool
     13 utf8_overlong(rune c, usize len) {
     14 	return c < utf8min[len-1];
     15 }
     16 
     17 static bool
     18 utf8_encodable(rune c) {
     19 	return c <= RUNE_MAX && (c < SURROGATE_MIN || c > SURROGATE_MAX);
     20 }
     21 
     22 static usize
     23 utf8_len(rune c) {
     24 	if (!utf8_encodable(c))
     25 		return 0;
     26 
     27 	usize len = 1;
     28 	while (c > utf8max[len-1])
     29 		len++;
     30 	return len;
     31 }
     32 
     33 usize
     34 r_utf8_encode(char *s, rune c) {
     35 	usize len = utf8_len(c);
     36 	if (!s || len == 0)
     37 		return len;
     38 
     39 	for (usize i = len-1; i > 0; i--) { /* Continuation bytes */
     40 		((uchar *)s)[i] = 0x80 | (c & 0x3F);
     41 		c >>= 6;
     42 	}
     43 	((uchar *)s)[0] = utf8byte[len-1] | (uchar)c; /* Leading byte */
     44 
     45 	return len;
     46 }
     47 
     48 usize
     49 r_utf8_decode(rune *c, char *s, usize slen) {
     50 	if (c)
     51 		*c = RUNE_BAD;
     52 
     53 	if (!s || slen == 0) /* No input? */
     54 		return 0;
     55 
     56 	/* Determine encoded sequence length based on first byte */
     57 	usize len = 1;
     58 	for (; len <= R_UTF8_SIZE; len++) {
     59 		if (((uchar)s[0] & utf8mask[len-1]) == utf8byte[len-1])
     60 			break;
     61 	}
     62 	if (len > R_UTF8_SIZE) /* Invalid leading byte? */
     63 		return 1;
     64 
     65 	/* Decode codepoint */
     66 	rune r = (uchar)s[0] & ~utf8mask[len-1];
     67 	usize l = MIN(len, slen);
     68 	for (usize i = 1; i < l; i++) {
     69 		if (((uchar)s[i] & 0xC0) != 0x80) /* Invalid continuation byte? */
     70 			return i;
     71 		r = (r << 6) | ((uchar)s[i] & 0x3F);
     72 	}
     73 	if (len > slen)
     74 		return 0; /* Looks good so far, but not enough input */
     75 
     76 	if (c && utf8_encodable(r) && !utf8_overlong(r, len))
     77 		*c = r;
     78 	return len;
     79 }