rcx

miscellaneous C library
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

ucattab.c (2624B)


      1 #include <errno.h>
      2 #include <stdio.h>
      3 #include <stdlib.h>
      4 #include <string.h>
      5 
      6 #include "alloc.h"
      7 #include "debug.h"
      8 #include "rcx.h"
      9 #include "log.h"
     10 #include "str.h"
     11 
     12 #define NF 15 /* Number of fields in UnicodeData.txt */
     13 
     14 u8
     15 cattoi(char *cat) {
     16 	static char ucats[] =
     17 		"Lu\0Ll\0Lt\0Lm\0Lo\0"
     18 		"Mn\0Mc\0Me\0"
     19 		"Nd\0Nl\0No\0"
     20 		"Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0"
     21 		"Sm\0Sc\0Sk\0So\0"
     22 		"Zs\0Zl\0Zp\0"
     23 		"Cc\0Cf\0Cs\0Co\0Cn";
     24 	for (usize i = 0; i < LEN(ucats); i += 3) {
     25 		if (!strcmp(cat, &ucats[i]))
     26 			return i / 3;
     27 	}
     28 	r_fatalf("bad category '%s'", cat);
     29 	return 0; /* Suppress warning */
     30 }
     31 
     32 u8 *
     33 parse_cats(char *filename) {
     34 	FILE *f = fopen(filename, "rb");
     35 	if (!f) r_fatalf("fopen: %s", strerror(errno));
     36 
     37 	u8 *cats = r_ealloc(RUNE_MAX + 1);
     38 
     39 	char line[512];
     40 	bool inrange = false;
     41 	i32 prevcp = -1;
     42 	while (fgets(line, sizeof line, f)) {
     43 		char *nl = strchr(line, '\n');
     44 		if (!nl) r_fatalf("line too long");
     45 		*nl = '\0';
     46 
     47 		char **fields = (char *[NF]){0};
     48 		if (r_str_split(&fields, line, ";", NF) != NF)
     49 			r_fatalf("line has too few fields");
     50 		i32 cp = strtol(fields[0], 0, 16);
     51 		char *name = fields[1];
     52 		u8 cat = cattoi(!strcmp(fields[2], "") ? "Cn" : fields[2]);
     53 
     54 		/* We expect UnicodeData.txt to be sorted, but I can't find any
     55 		 * guarantee of that in UAX #44. */
     56 		ASSERT(cp > prevcp);
     57 		ASSERT(!inrange || r_str_ends_with(name, "Last>"));
     58 
     59 		for (i32 c = prevcp+1; c <= cp; c++)
     60 			cats[c] = inrange || c == cp ? cat : cattoi("Cn");
     61 
     62 		inrange = r_str_ends_with(name, "First>");
     63 		prevcp = cp;
     64 	}
     65 	if (!feof(f)) r_fatalf("fgets: %s", strerror(errno));
     66 
     67 	for (i32 c = prevcp+1; c <= RUNE_MAX; c++)
     68 		cats[c] = cattoi("Cn");
     69 
     70 	fclose(f);
     71 
     72 	return cats;
     73 }
     74 
     75 int
     76 main(int argc, char **argv) {
     77 	if (argc != 2)
     78 		r_fatalf("usage: %s UNICODE_DATA_FILE", argv[0]);
     79 
     80 	u8 *cats = parse_cats(argv[1]);
     81 
     82 	printf(
     83 		"/* This file is automatically generated. */\n"
     84 		"\n"
     85 		"/* Special table to optimize for Latin 1 */\n"
     86 		"static u8 ucatl1tab[] = {"
     87 	);
     88 
     89 	for (rune i = 0; i <= 0xff; i++)
     90 		printf("%s0x%02x,", i%8 == 0 ? "\n\t" : " ", cats[i]);
     91 
     92 	printf(
     93 		"\n}; /* 256 bytes */\n"
     94 		"\n"
     95 		"/* The high byte is the category, and the low three bytes are the\n"
     96 		" * codepoint. Codepoints are listed in order, with consecutive\n"
     97 		" * entries with the same category compressed into one entry. */\n"
     98 		"static u32 ucattab[] = {"
     99 	);
    100 
    101 	usize tablen = 0;
    102 	for (rune i = 0x100; i <= RUNE_MAX; i++) {
    103 		if (i > 0x100 && cats[i] == cats[i-1])
    104 			continue;
    105 		printf("%s0x%08"PRIx32"ul,", tablen%4 == 0 ? "\n\t" : " ",
    106 				((rune) cats[i] << 24) | i);
    107 		tablen++;
    108 	}
    109 
    110 	printf("\n}; /* %zd bytes */\n", 4 * tablen);
    111 }