ucattab.c (2624B)
1 #include <errno.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 6 #include "alloc.h" 7 #include "debug.h" 8 #include "rcx.h" 9 #include "log.h" 10 #include "str.h" 11 12 #define NF 15 /* Number of fields in UnicodeData.txt */ 13 14 u8 15 cattoi(char *cat) { 16 static char ucats[] = 17 "Lu\0Ll\0Lt\0Lm\0Lo\0" 18 "Mn\0Mc\0Me\0" 19 "Nd\0Nl\0No\0" 20 "Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0" 21 "Sm\0Sc\0Sk\0So\0" 22 "Zs\0Zl\0Zp\0" 23 "Cc\0Cf\0Cs\0Co\0Cn"; 24 for (usize i = 0; i < LEN(ucats); i += 3) { 25 if (!strcmp(cat, &ucats[i])) 26 return i / 3; 27 } 28 r_fatalf("bad category '%s'", cat); 29 return 0; /* Suppress warning */ 30 } 31 32 u8 * 33 parse_cats(char *filename) { 34 FILE *f = fopen(filename, "rb"); 35 if (!f) r_fatalf("fopen: %s", strerror(errno)); 36 37 u8 *cats = r_ealloc(RUNE_MAX + 1); 38 39 char line[512]; 40 bool inrange = false; 41 i32 prevcp = -1; 42 while (fgets(line, sizeof line, f)) { 43 char *nl = strchr(line, '\n'); 44 if (!nl) r_fatalf("line too long"); 45 *nl = '\0'; 46 47 char **fields = (char *[NF]){0}; 48 if (r_str_split(&fields, line, ";", NF) != NF) 49 r_fatalf("line has too few fields"); 50 i32 cp = strtol(fields[0], 0, 16); 51 char *name = fields[1]; 52 u8 cat = cattoi(!strcmp(fields[2], "") ? "Cn" : fields[2]); 53 54 /* We expect UnicodeData.txt to be sorted, but I can't find any 55 * guarantee of that in UAX #44. */ 56 ASSERT(cp > prevcp); 57 ASSERT(!inrange || r_str_ends_with(name, "Last>")); 58 59 for (i32 c = prevcp+1; c <= cp; c++) 60 cats[c] = inrange || c == cp ? cat : cattoi("Cn"); 61 62 inrange = r_str_ends_with(name, "First>"); 63 prevcp = cp; 64 } 65 if (!feof(f)) r_fatalf("fgets: %s", strerror(errno)); 66 67 for (i32 c = prevcp+1; c <= RUNE_MAX; c++) 68 cats[c] = cattoi("Cn"); 69 70 fclose(f); 71 72 return cats; 73 } 74 75 int 76 main(int argc, char **argv) { 77 if (argc != 2) 78 r_fatalf("usage: %s UNICODE_DATA_FILE", argv[0]); 79 80 u8 *cats = parse_cats(argv[1]); 81 82 printf( 83 "/* This file is automatically generated. */\n" 84 "\n" 85 "/* Special table to optimize for Latin 1 */\n" 86 "static u8 ucatl1tab[] = {" 87 ); 88 89 for (rune i = 0; i <= 0xff; i++) 90 printf("%s0x%02x,", i%8 == 0 ? "\n\t" : " ", cats[i]); 91 92 printf( 93 "\n}; /* 256 bytes */\n" 94 "\n" 95 "/* The high byte is the category, and the low three bytes are the\n" 96 " * codepoint. Codepoints are listed in order, with consecutive\n" 97 " * entries with the same category compressed into one entry. */\n" 98 "static u32 ucattab[] = {" 99 ); 100 101 usize tablen = 0; 102 for (rune i = 0x100; i <= RUNE_MAX; i++) { 103 if (i > 0x100 && cats[i] == cats[i-1]) 104 continue; 105 printf("%s0x%08"PRIx32"ul,", tablen%4 == 0 ? "\n\t" : " ", 106 ((rune) cats[i] << 24) | i); 107 tablen++; 108 } 109 110 printf("\n}; /* %zd bytes */\n", 4 * tablen); 111 }