/* * Copyright (c) 2014 SGI. * All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* Generator for a compact trie for unicode normalization */ #include <sys/types.h> #include <stddef.h> #include <stdlib.h> #include <stdio.h> #include <assert.h> #include <string.h> #include <unistd.h> #include <errno.h> /* Default names of the in- and output files. */ #define AGE_NAME "DerivedAge.txt" #define CCC_NAME "DerivedCombiningClass.txt" #define PROP_NAME "DerivedCoreProperties.txt" #define DATA_NAME "UnicodeData.txt" #define FOLD_NAME "CaseFolding.txt" #define NORM_NAME "NormalizationCorrections.txt" #define TEST_NAME "NormalizationTest.txt" #define UTF8_NAME "utf8data.h" const char *age_name = AGE_NAME; const char *ccc_name = CCC_NAME; const char *prop_name = PROP_NAME; const char *data_name = DATA_NAME; const char *fold_name = FOLD_NAME; const char *norm_name = NORM_NAME; const char *test_name = TEST_NAME; const char *utf8_name = UTF8_NAME; int verbose = 0; /* An arbitrary line size limit on input lines. */ #define LINESIZE 1024 char line[LINESIZE]; char buf0[LINESIZE]; char buf1[LINESIZE]; char buf2[LINESIZE]; char buf3[LINESIZE]; const char *argv0; #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) /* ------------------------------------------------------------------ */ /* * Unicode version numbers consist of three parts: major, minor, and a * revision. These numbers are packed into an unsigned int to obtain * a single version number. * * To save space in the generated trie, the unicode version is not * stored directly, instead we calculate a generation number from the * unicode versions seen in the DerivedAge file, and use that as an * index into a table of unicode versions. */ #define UNICODE_MAJ_SHIFT (16) #define UNICODE_MIN_SHIFT (8) #define UNICODE_MAJ_MAX ((unsigned short)-1) #define UNICODE_MIN_MAX ((unsigned char)-1) #define UNICODE_REV_MAX ((unsigned char)-1) #define UNICODE_AGE(MAJ,MIN,REV) \ (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ ((unsigned int)(REV))) unsigned int *ages; int ages_count; unsigned int unicode_maxage; static int age_valid(unsigned int major, unsigned int minor, unsigned int revision) { if (major > UNICODE_MAJ_MAX) return 0; if (minor > UNICODE_MIN_MAX) return 0; if (revision > UNICODE_REV_MAX) return 0; return 1; } /* ------------------------------------------------------------------ */ /* * utf8trie_t * * A compact binary tree, used to decode UTF-8 characters. * * Internal nodes are one byte for the node itself, and up to three * bytes for an offset into the tree. The first byte contains the * following information: * NEXTBYTE - flag - advance to next byte if set * BITNUM - 3 bit field - the bit number to tested * OFFLEN - 2 bit field - number of bytes in the offset * if offlen == 0 (non-branching node) * RIGHTPATH - 1 bit field - set if the following node is for the * right-hand path (tested bit is set) * TRIENODE - 1 bit field - set if the following node is an internal * node, otherwise it is a leaf node * if offlen != 0 (branching node) * LEFTNODE - 1 bit field - set if the left-hand node is internal * RIGHTNODE - 1 bit field - set if the right-hand node is internal * * Due to the way utf8 works, there cannot be branching nodes with * NEXTBYTE set, and moreover those nodes always have a righthand * descendant. */ typedef unsigned char utf8trie_t; #define BITNUM 0x07 #define NEXTBYTE 0x08 #define OFFLEN 0x30 #define OFFLEN_SHIFT 4 #define RIGHTPATH 0x40 #define TRIENODE 0x80 #define RIGHTNODE 0x40 #define LEFTNODE 0x80 /* * utf8leaf_t * * The leaves of the trie are embedded in the trie, and so the same * underlying datatype, unsigned char. * * leaf[0]: The unicode version, stored as a generation number that is * an index into utf8agetab[]. With this we can filter code * points based on the unicode version in which they were * defined. The CCC of a non-defined code point is 0. * leaf[1]: Canonical Combining Class. During normalization, we need * to do a stable sort into ascending order of all characters * with a non-zero CCC that occur between two characters with * a CCC of 0, or at the begin or end of a string. * The unicode standard guarantees that all CCC values are * between 0 and 254 inclusive, which leaves 255 available as * a special value. * Code points with CCC 0 are known as stoppers. * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the * start of a NUL-terminated string that is the decomposition * of the character. * The CCC of a decomposable character is the same as the CCC * of the first character of its decomposition. * Some characters decompose as the empty string: these are * characters with the Default_Ignorable_Code_Point property. * These do affect normalization, as they all have CCC 0. * * The decompositions in the trie have been fully expanded. * * Casefolding, if applicable, is also done using decompositions. */ typedef unsigned char utf8leaf_t; #define LEAF_GEN(LEAF) ((LEAF)[0]) #define LEAF_CCC(LEAF) ((LEAF)[1]) #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) #define MAXGEN (255) #define MINCCC (0) #define MAXCCC (254) #define STOPPER (0) #define DECOMPOSE (255) #define HANGUL ((char)(255)) #define UTF8HANGULLEAF (12) struct tree; static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, const char *, size_t); static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); unsigned char *utf8data; size_t utf8data_size; utf8trie_t *nfdi; utf8trie_t *nfdicf; /* ------------------------------------------------------------------ */ /* * UTF8 valid ranges. * * The UTF-8 encoding spreads the bits of a 32bit word over several * bytes. This table gives the ranges that can be held and how they'd * be represented. * * 0x00000000 0x0000007F: 0xxxxxxx * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * * There is an additional requirement on UTF-8, in that only the * shortest representation of a 32bit value is to be used. A decoder * must not decode sequences that do not satisfy this requirement. * Thus the allowed ranges have a lower bound. * * 0x00000000 0x0000007F: 0xxxxxxx * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, * 17 planes of 65536 values. This limits the sequences actually seen * even more, to just the following. * * 0 - 0x7f: 0 0x7f * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf * * Even within those ranges not all values are allowed: the surrogates * 0xd800 - 0xdfff should never be seen. * * Note that the longest sequence seen with valid usage is 4 bytes, * the same a single UTF-32 character. This makes the UTF-8 * representation of Unicode strictly smaller than UTF-32. * * The shortest sequence requirement was introduced by: * Corrigendum #1: UTF-8 Shortest Form * It can be found here: * http://www.unicode.org/versions/corrigendum1.html * */ #define UTF8_2_BITS 0xC0 #define UTF8_3_BITS 0xE0 #define UTF8_4_BITS 0xF0 #define UTF8_N_BITS 0x80 #define UTF8_2_MASK 0xE0 #define UTF8_3_MASK 0xF0 #define UTF8_4_MASK 0xF8 #define UTF8_N_MASK 0xC0 #define UTF8_V_MASK 0x3F #define UTF8_V_SHIFT 6 static int utf8encode(char *str, unsigned int val) { int len; if (val < 0x80) { str[0] = val; len = 1; } else if (val < 0x800) { str[1] = val & UTF8_V_MASK; str[1] |= UTF8_N_BITS; val >>= UTF8_V_SHIFT; str[0] = val; str[0] |= UTF8_2_BITS; len = 2; } else if (val < 0x10000) { str[2] = val & UTF8_V_MASK; str[2] |= UTF8_N_BITS; val >>= UTF8_V_SHIFT; str[1] = val & UTF8_V_MASK; str[1] |= UTF8_N_BITS; val >>= UTF8_V_SHIFT; str[0] = val; str[0] |= UTF8_3_BITS; len = 3; } else if (val < 0x110000) { str[3] = val & UTF8_V_MASK; str[3] |= UTF8_N_BITS; val >>= UTF8_V_SHIFT; str[2] = val & UTF8_V_MASK; str[2] |= UTF8_N_BITS; val >>= UTF8_V_SHIFT; str[1] = val & UTF8_V_MASK; str[1] |= UTF8_N_BITS; val >>= UTF8_V_SHIFT; str[0] = val; str[0] |= UTF8_4_BITS; len = 4; } else { printf("%#x: illegal val\n", val); len = 0; } return len; } static unsigned int utf8decode(const char *str) { const unsigned char *s = (const unsigned char*)str; unsigned int unichar = 0; if (*s < 0x80) { unichar = *s; } else if (*s < UTF8_3_BITS) { unichar = *s++ & 0x1F; unichar <<= UTF8_V_SHIFT; unichar |= *s & 0x3F; } else if (*s < UTF8_4_BITS) { unichar = *s++ & 0x0F; unichar <<= UTF8_V_SHIFT; unichar |= *s++ & 0x3F; unichar <<= UTF8_V_SHIFT; unichar |= *s & 0x3F; } else { unichar = *s++ & 0x0F; unichar <<= UTF8_V_SHIFT; unichar |= *s++ & 0x3F; unichar <<= UTF8_V_SHIFT; unichar |= *s++ & 0x3F; unichar <<= UTF8_V_SHIFT; unichar |= *s & 0x3F; } return unichar; } static int utf32valid(unsigned int unichar) { return unichar < 0x110000; } #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) #define NODE 1 #define LEAF 0 struct tree { void *root; int childnode; const char *type; unsigned int maxage; struct tree *next; int (*leaf_equal)(void *, void *); void (*leaf_print)(void *, int); int (*leaf_mark)(void *); int (*leaf_size)(void *); int *(*leaf_index)(struct tree *, void *); unsigned char *(*leaf_emit)(void *, unsigned char *); int leafindex[0x110000]; int index; }; struct node { int index; int offset; int mark; int size; struct node *parent; void *left; void *right; unsigned char bitnum; unsigned char nextbyte; unsigned char leftnode; unsigned char rightnode; unsigned int keybits; unsigned int keymask; }; /* * Example lookup function for a tree. */ static void *lookup(struct tree *tree, const char *key) { struct node *node; void *leaf = NULL; node = tree->root; while (!leaf && node) { if (node->nextbyte) key++; if (*key & (1 << (node->bitnum & 7))) { /* Right leg */ if (node->rightnode == NODE) { node = node->right; } else if (node->rightnode == LEAF) { leaf = node->right; } else { node = NULL; } } else { /* Left leg */ if (node->leftnode == NODE) { node = node->left; } else if (node->leftnode == LEAF) { leaf = node->left; } else { node = NULL; } } } return leaf; } /* * A simple non-recursive tree walker: keep track of visits to the * left and right branches in the leftmask and rightmask. */ static void tree_walk(struct tree *tree) { struct node *node; unsigned int leftmask; unsigned int rightmask; unsigned int bitmask; int indent = 1; int nodes, singletons, leaves; nodes = singletons = leaves = 0; printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); if (tree->childnode == LEAF) { assert(tree->root); tree->leaf_print(tree->root, indent); leaves = 1; } else { assert(tree->childnode == NODE); node = tree->root; leftmask = rightmask = 0; while (node) { printf("%*snode @ %p bitnum %d nextbyte %d" " left %p right %p mask %x bits %x\n", indent, "", node, node->bitnum, node->nextbyte, node->left, node->right, node->keymask, node->keybits); nodes += 1; if (!(node->left && node->right)) singletons += 1; while (node) { bitmask = 1 << node->bitnum; if ((leftmask & bitmask) == 0) { leftmask |= bitmask; if (node->leftnode == LEAF) { assert(node->left); tree->leaf_print(node->left, indent+1); leaves += 1; } else if (node->left) { assert(node->leftnode == NODE); indent += 1; node = node->left; break; } } if ((rightmask & bitmask) == 0) { rightmask |= bitmask; if (node->rightnode == LEAF) { assert(node->right); tree->leaf_print(node->right, indent+1); leaves += 1; } else if (node->right) { assert(node->rightnode == NODE); indent += 1; node = node->right; break; } } leftmask &= ~bitmask; rightmask &= ~bitmask; node = node->parent; indent -= 1; } } } printf("nodes %d leaves %d singletons %d\n", nodes, leaves, singletons); } /* * Allocate an initialize a new internal node. */ static struct node *alloc_node(struct node *parent) { struct node *node; int bitnum; node = malloc(sizeof(*node)); node->left = node->right = NULL; node->parent = parent; node->leftnode = NODE; node->rightnode = NODE; node->keybits = 0; node->keymask = 0; node->mark = 0; node->index = 0; node->offset = -1; node->size = 4; if (node->parent) { bitnum = parent->bitnum; if ((bitnum & 7) == 0) { node->bitnum = bitnum + 7 + 8; node->nextbyte = 1; } else { node->bitnum = bitnum - 1; node->nextbyte = 0; } } else { node->bitnum = 7; node->nextbyte = 0; } return node; } /* * Insert a new leaf into the tree, and collapse any subtrees that are * fully populated and end in identical leaves. A nextbyte tagged * internal node will not be removed to preserve the tree's integrity. * Note that due to the structure of utf8, no nextbyte tagged node * will be a candidate for removal. */ static int insert(struct tree *tree, char *key, int keylen, void *leaf) { struct node *node; struct node *parent; void **cursor; int keybits; assert(keylen >= 1 && keylen <= 4); node = NULL; cursor = &tree->root; keybits = 8 * keylen; /* Insert, creating path along the way. */ while (keybits) { if (!*cursor) *cursor = alloc_node(node); node = *cursor; if (node->nextbyte) key++; if (*key & (1 << (node->bitnum & 7))) cursor = &node->right; else cursor = &node->left; keybits--; } *cursor = leaf; /* Merge subtrees if possible. */ while (node) { if (*key & (1 << (node->bitnum & 7))) node->rightnode = LEAF; else node->leftnode = LEAF; if (node->nextbyte) break; if (node->leftnode == NODE || node->rightnode == NODE) break; assert(node->left); assert(node->right); /* Compare */ if (! tree->leaf_equal(node->left, node->right)) break; /* Keep left, drop right leaf. */ leaf = node->left; /* Check in parent */ parent = node->parent; if (!parent) { /* root of tree! */ tree->root = leaf; tree->childnode = LEAF; } else if (parent->left == node) { parent->left = leaf; parent->leftnode = LEAF; if (parent->right) { parent->keymask = 0; parent->keybits = 0; } else { parent->keymask |= (1 << node->bitnum); } } else if (parent->right == node) { parent->right = leaf; parent->rightnode = LEAF; if (parent->left) { parent->keymask = 0; parent->keybits = 0; } else { parent->keymask |= (1 << node->bitnum); parent->keybits |= (1 << node->bitnum); } } else { /* internal tree error */ assert(0); } free(node); node = parent; } /* Propagate keymasks up along singleton chains. */ while (node) { parent = node->parent; if (!parent) break; /* Nix the mask for parents with two children. */ if (node->keymask == 0) { parent->keymask = 0; parent->keybits = 0; } else if (parent->left && parent->right) { parent->keymask = 0; parent->keybits = 0; } else { assert((parent->keymask & node->keymask) == 0); parent->keymask |= node->keymask; parent->keymask |= (1 << parent->bitnum); parent->keybits |= node->keybits; if (parent->right) parent->keybits |= (1 << parent->bitnum); } node = parent; } return 0; } /* * Prune internal nodes. * * Fully populated subtrees that end at the same leaf have already * been collapsed. There are still internal nodes that have for both * their left and right branches a sequence of singletons that make * identical choices and end in identical leaves. The keymask and * keybits collected in the nodes describe the choices made in these * singleton chains. When they are identical for the left and right * branch of a node, and the two leaves comare identical, the node in * question can be removed. * * Note that nodes with the nextbyte tag set will not be removed by * this to ensure tree integrity. Note as well that the structure of * utf8 ensures that these nodes would not have been candidates for * removal in any case. */ static void prune(struct tree *tree) { struct node *node; struct node *left; struct node *right; struct node *parent; void *leftleaf; void *rightleaf; unsigned int leftmask; unsigned int rightmask; unsigned int bitmask; int count; if (verbose > 0) printf("Pruning %s_%x\n", tree->type, tree->maxage); count = 0; if (tree->childnode == LEAF) return; if (!tree->root) return; leftmask = rightmask = 0; node = tree->root; while (node) { if (node->nextbyte) goto advance; if (node->leftnode == LEAF) goto advance; if (node->rightnode == LEAF) goto advance; if (!node->left) goto advance; if (!node->right) goto advance; left = node->left; right = node->right; if (left->keymask == 0) goto advance; if (right->keymask == 0) goto advance; if (left->keymask != right->keymask) goto advance; if (left->keybits != right->keybits) goto advance; leftleaf = NULL; while (!leftleaf) { assert(left->left || left->right); if (left->leftnode == LEAF) leftleaf = left->left; else if (left->rightnode == LEAF) leftleaf = left->right; else if (left->left) left = left->left; else if (left->right) left = left->right; else assert(0); } rightleaf = NULL; while (!rightleaf) { assert(right->left || right->right); if (right->leftnode == LEAF) rightleaf = right->left; else if (right->rightnode == LEAF) rightleaf = right->right; else if (right->left) right = right->left; else if (right->right) right = right->right; else assert(0); } if (! tree->leaf_equal(leftleaf, rightleaf)) goto advance; /* * This node has identical singleton-only subtrees. * Remove it. */ parent = node->parent; left = node->left; right = node->right; if (parent->left == node) parent->left = left; else if (parent->right == node) parent->right = left; else assert(0); left->parent = parent; left->keymask |= (1 << node->bitnum); node->left = NULL; while (node) { bitmask = 1 << node->bitnum; leftmask &= ~bitmask; rightmask &= ~bitmask; if (node->leftnode == NODE && node->left) { left = node->left; free(node); count++; node = left; } else if (node->rightnode == NODE && node->right) { right = node->right; free(node); count++; node = right; } else { node = NULL; } } /* Propagate keymasks up along singleton chains. */ node = parent; /* Force re-check */ bitmask = 1 << node->bitnum; leftmask &= ~bitmask; rightmask &= ~bitmask; for (;;) { if (node->left && node->right) break; if (node->left) { left = node->left; node->keymask |= left->keymask; node->keybits |= left->keybits; } if (node->right) { right = node->right; node->keymask |= right->keymask; node->keybits |= right->keybits; } node->keymask |= (1 << node->bitnum); node = node->parent; /* Force re-check */ bitmask = 1 << node->bitnum; leftmask &= ~bitmask; rightmask &= ~bitmask; } advance: bitmask = 1 << node->bitnum; if ((leftmask & bitmask) == 0 && node->leftnode == NODE && node->left) { leftmask |= bitmask; node = node->left; } else if ((rightmask & bitmask) == 0 && node->rightnode == NODE && node->right) { rightmask |= bitmask; node = node->right; } else { leftmask &= ~bitmask; rightmask &= ~bitmask; node = node->parent; } } if (verbose > 0) printf("Pruned %d nodes\n", count); } /* * Mark the nodes in the tree that lead to leaves that must be * emitted. */ static void mark_nodes(struct tree *tree) { struct node *node; struct node *n; unsigned int leftmask; unsigned int rightmask; unsigned int bitmask; int marked; marked = 0; if (verbose > 0) printf("Marking %s_%x\n", tree->type, tree->maxage); if (tree->childnode == LEAF) goto done; assert(tree->childnode == NODE); node = tree->root; leftmask = rightmask = 0; while (node) { bitmask = 1 << node->bitnum; if ((leftmask & bitmask) == 0) { leftmask |= bitmask; if (node->leftnode == LEAF) { assert(node->left); if (tree->leaf_mark(node->left)) { n = node; while (n && !n->mark) { marked++; n->mark = 1; n = n->parent; } } } else if (node->left) { assert(node->leftnode == NODE); node = node->left; continue; } } if ((rightmask & bitmask) == 0) { rightmask |= bitmask; if (node->rightnode == LEAF) { assert(node->right); if (tree->leaf_mark(node->right)) { n = node; while (n && !n->mark) { marked++; n->mark = 1; n = n->parent; } } } else if (node->right) { assert(node->rightnode == NODE); node = node->right; continue; } } leftmask &= ~bitmask; rightmask &= ~bitmask; node = node->parent; } /* second pass: left siblings and singletons */ assert(tree->childnode == NODE); node = tree->root; leftmask = rightmask = 0; while (node) { bitmask = 1 << node->bitnum; if ((leftmask & bitmask) == 0) { leftmask |= bitmask; if (node->leftnode == LEAF) { assert(node->left); if (tree->leaf_mark(node->left)) { n = node; while (n && !n->mark) { marked++; n->mark = 1; n = n->parent; } } } else if (node->left) { assert(node->leftnode == NODE); node = node->left; if (!node->mark && node->parent->mark) { marked++; node->mark = 1; } continue; } } if ((rightmask & bitmask) == 0) { rightmask |= bitmask; if (node->rightnode == LEAF) { assert(node->right); if (tree->leaf_mark(node->right)) { n = node; while (n && !n->mark) { marked++; n->mark = 1; n = n->parent; } } } else if (node->right) { assert(node->rightnode == NODE); node = node->right; if (!node->mark && node->parent->mark && !node->parent->left) { marked++; node->mark = 1; } continue; } } leftmask &= ~bitmask; rightmask &= ~bitmask; node = node->parent; } done: if (verbose > 0) printf("Marked %d nodes\n", marked); } /* * Compute the index of each node and leaf, which is the offset in the * emitted trie. These values must be pre-computed because relative * offsets between nodes are used to navigate the tree. */ static int index_nodes(struct tree *tree, int index) { struct node *node; unsigned int leftmask; unsigned int rightmask; unsigned int bitmask; int count; int indent; /* Align to a cache line (or half a cache line?). */ while (index % 64) index++; tree->index = index; indent = 1; count = 0; if (verbose > 0) printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); if (tree->childnode == LEAF) { index += tree->leaf_size(tree->root); goto done; } assert(tree->childnode == NODE); node = tree->root; leftmask = rightmask = 0; while (node) { if (!node->mark) goto skip; count++; if (node->index != index) node->index = index; index += node->size; skip: while (node) { bitmask = 1 << node->bitnum; if (node->mark && (leftmask & bitmask) == 0) { leftmask |= bitmask; if (node->leftnode == LEAF) { assert(node->left); *tree->leaf_index(tree, node->left) = index; index += tree->leaf_size(node->left); count++; } else if (node->left) { assert(node->leftnode == NODE); indent += 1; node = node->left; break; } } if (node->mark && (rightmask & bitmask) == 0) { rightmask |= bitmask; if (node->rightnode == LEAF) { assert(node->right); *tree->leaf_index(tree, node->right) = index; index += tree->leaf_size(node->right); count++; } else if (node->right) { assert(node->rightnode == NODE); indent += 1; node = node->right; break; } } leftmask &= ~bitmask; rightmask &= ~bitmask; node = node->parent; indent -= 1; } } done: /* Round up to a multiple of 16 */ while (index % 16) index++; if (verbose > 0) printf("Final index %d\n", index); return index; } /* * Mark the nodes in a subtree, helper for size_nodes(). */ static int mark_subtree(struct node *node) { int changed; if (!node || node->mark) return 0; node->mark = 1; node->index = node->parent->index; changed = 1; if (node->leftnode == NODE) changed += mark_subtree(node->left); if (node->rightnode == NODE) changed += mark_subtree(node->right); return changed; } /* * Compute the size of nodes and leaves. We start by assuming that * each node needs to store a three-byte offset. The indexes of the * nodes are calculated based on that, and then this function is * called to see if the sizes of some nodes can be reduced. This is * repeated until no more changes are seen. */ static int size_nodes(struct tree *tree) { struct tree *next; struct node *node; struct node *right; struct node *n; unsigned int leftmask; unsigned int rightmask; unsigned int bitmask; unsigned int pathbits; unsigned int pathmask; unsigned int nbit; int changed; int offset; int size; int indent; indent = 1; changed = 0; size = 0; if (verbose > 0) printf("Sizing %s_%x\n", tree->type, tree->maxage); if (tree->childnode == LEAF) goto done; assert(tree->childnode == NODE); pathbits = 0; pathmask = 0; node = tree->root; leftmask = rightmask = 0; while (node) { if (!node->mark) goto skip; offset = 0; if (!node->left || !node->right) { size = 1; } else { if (node->rightnode == NODE) { /* * If the right node is not marked, * look for a corresponding node in * the next tree. Such a node need * not exist. */ right = node->right; next = tree->next; while (!right->mark) { assert(next); n = next->root; while (n->bitnum != node->bitnum) { nbit = 1 << n->bitnum; if (!(pathmask & nbit)) break; if (pathbits & nbit) { if (n->rightnode == LEAF) break; n = n->right; } else { if (n->leftnode == LEAF) break; n = n->left; } } if (n->bitnum != node->bitnum) break; n = n->right; right = n; next = next->next; } /* Make sure the right node is marked. */ if (!right->mark) changed += mark_subtree(right); offset = right->index - node->index; } else { offset = *tree->leaf_index(tree, node->right); offset -= node->index; } assert(offset >= 0); assert(offset <= 0xffffff); if (offset <= 0xff) { size = 2; } else if (offset <= 0xffff) { size = 3; } else { /* offset <= 0xffffff */ size = 4; } } if (node->size != size || node->offset != offset) { node->size = size; node->offset = offset; changed++; } skip: while (node) { bitmask = 1 << node->bitnum; pathmask |= bitmask; if (node->mark && (leftmask & bitmask) == 0) { leftmask |= bitmask; if (node->leftnode == LEAF) { assert(node->left); } else if (node->left) { assert(node->leftnode == NODE); indent += 1; node = node->left; break; } } if (node->mark && (rightmask & bitmask) == 0) { rightmask |= bitmask; pathbits |= bitmask; if (node->rightnode == LEAF) { assert(node->right); } else if (node->right) { assert(node->rightnode == NODE); indent += 1; node = node->right; break; } } leftmask &= ~bitmask; rightmask &= ~bitmask; pathmask &= ~bitmask; pathbits &= ~bitmask; node = node->parent; indent -= 1; } } done: if (verbose > 0) printf("Found %d changes\n", changed); return changed; } /* * Emit a trie for the given tree into the data array. */ static void emit(struct tree *tree, unsigned char *data) { struct node *node; unsigned int leftmask; unsigned int rightmask; unsigned int bitmask; int offlen; int offset; int index; int indent; int size; int bytes; int leaves; int nodes[4]; unsigned char byte; nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; leaves = 0; bytes = 0; index = tree->index; data += index; indent = 1; if (verbose > 0) printf("Emitting %s_%x\n", tree->type, tree->maxage); if (tree->childnode == LEAF) { assert(tree->root); tree->leaf_emit(tree->root, data); size = tree->leaf_size(tree->root); index += size; leaves++; goto done; } assert(tree->childnode == NODE); node = tree->root; leftmask = rightmask = 0; while (node) { if (!node->mark) goto skip; assert(node->offset != -1); assert(node->index == index); byte = 0; if (node->nextbyte) byte |= NEXTBYTE; byte |= (node->bitnum & BITNUM); if (node->left && node->right) { if (node->leftnode == NODE) byte |= LEFTNODE; if (node->rightnode == NODE) byte |= RIGHTNODE; if (node->offset <= 0xff) offlen = 1; else if (node->offset <= 0xffff) offlen = 2; else offlen = 3; nodes[offlen]++; offset = node->offset; byte |= offlen << OFFLEN_SHIFT; *data++ = byte; index++; while (offlen--) { *data++ = offset & 0xff; index++; offset >>= 8; } } else if (node->left) { if (node->leftnode == NODE) byte |= TRIENODE; nodes[0]++; *data++ = byte; index++; } else if (node->right) { byte |= RIGHTNODE; if (node->rightnode == NODE) byte |= TRIENODE; nodes[0]++; *data++ = byte; index++; } else { assert(0); } skip: while (node) { bitmask = 1 << node->bitnum; if (node->mark && (leftmask & bitmask) == 0) { leftmask |= bitmask; if (node->leftnode == LEAF) { assert(node->left); data = tree->leaf_emit(node->left, data); size = tree->leaf_size(node->left); index += size; bytes += size; leaves++; } else if (node->left) { assert(node->leftnode == NODE); indent += 1; node = node->left; break; } } if (node->mark && (rightmask & bitmask) == 0) { rightmask |= bitmask; if (node->rightnode == LEAF) { assert(node->right); data = tree->leaf_emit(node->right, data); size = tree->leaf_size(node->right); index += size; bytes += size; leaves++; } else if (node->right) { assert(node->rightnode == NODE); indent += 1; node = node->right; break; } } leftmask &= ~bitmask; rightmask &= ~bitmask; node = node->parent; indent -= 1; } } done: if (verbose > 0) { printf("Emitted %d (%d) leaves", leaves, bytes); printf(" %d (%d+%d+%d+%d) nodes", nodes[0] + nodes[1] + nodes[2] + nodes[3], nodes[0], nodes[1], nodes[2], nodes[3]); printf(" %d total\n", index - tree->index); } } /* ------------------------------------------------------------------ */ /* * Unicode data. * * We need to keep track of the Canonical Combining Class, the Age, * and decompositions for a code point. * * For the Age, we store the index into the ages table. Effectively * this is a generation number that the table maps to a unicode * version. * * The correction field is used to indicate that this entry is in the * corrections array, which contains decompositions that were * corrected in later revisions. The value of the correction field is * the Unicode version in which the mapping was corrected. */ struct unicode_data { unsigned int code; int ccc; int gen; int correction; unsigned int *utf32nfdi; unsigned int *utf32nfdicf; char *utf8nfdi; char *utf8nfdicf; }; struct unicode_data unicode_data[0x110000]; struct unicode_data *corrections; int corrections_count; struct tree *nfdi_tree; struct tree *nfdicf_tree; struct tree *trees; int trees_count; /* * Check the corrections array to see if this entry was corrected at * some point. */ static struct unicode_data *corrections_lookup(struct unicode_data *u) { int i; for (i = 0; i != corrections_count; i++) if (u->code == corrections[i].code) return &corrections[i]; return u; } static int nfdi_equal(void *l, void *r) { struct unicode_data *left = l; struct unicode_data *right = r; if (left->gen != right->gen) return 0; if (left->ccc != right->ccc) return 0; if (left->utf8nfdi && right->utf8nfdi && strcmp(left->utf8nfdi, right->utf8nfdi) == 0) return 1; if (left->utf8nfdi || right->utf8nfdi) return 0; return 1; } static int nfdicf_equal(void *l, void *r) { struct unicode_data *left = l; struct unicode_data *right = r; if (left->gen != right->gen) return 0; if (left->ccc != right->ccc) return 0; if (left->utf8nfdicf && right->utf8nfdicf && strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) return 1; if (left->utf8nfdicf && right->utf8nfdicf) return 0; if (left->utf8nfdicf || right->utf8nfdicf) return 0; if (left->utf8nfdi && right->utf8nfdi && strcmp(left->utf8nfdi, right->utf8nfdi) == 0) return 1; if (left->utf8nfdi || right->utf8nfdi) return 0; return 1; } static void nfdi_print(void *l, int indent) { struct unicode_data *leaf = l; printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, leaf->code, leaf->ccc, leaf->gen); if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); else if (leaf->utf8nfdi) printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); printf("\n"); } static void nfdicf_print(void *l, int indent) { struct unicode_data *leaf = l; printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, leaf->code, leaf->ccc, leaf->gen); if (leaf->utf8nfdicf) printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); else if (leaf->utf8nfdi) printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); printf("\n"); } static int nfdi_mark(void *l) { return 1; } static int nfdicf_mark(void *l) { struct unicode_data *leaf = l; if (leaf->utf8nfdicf) return 1; return 0; } static int correction_mark(void *l) { struct unicode_data *leaf = l; return leaf->correction; } static int nfdi_size(void *l) { struct unicode_data *leaf = l; int size = 2; if (HANGUL_SYLLABLE(leaf->code)) size += 1; else if (leaf->utf8nfdi) size += strlen(leaf->utf8nfdi) + 1; return size; } static int nfdicf_size(void *l) { struct unicode_data *leaf = l; int size = 2; if (HANGUL_SYLLABLE(leaf->code)) size += 1; else if (leaf->utf8nfdicf) size += strlen(leaf->utf8nfdicf) + 1; else if (leaf->utf8nfdi) size += strlen(leaf->utf8nfdi) + 1; return size; } static int *nfdi_index(struct tree *tree, void *l) { struct unicode_data *leaf = l; return &tree->leafindex[leaf->code]; } static int *nfdicf_index(struct tree *tree, void *l) { struct unicode_data *leaf = l; return &tree->leafindex[leaf->code]; } static unsigned char *nfdi_emit(void *l, unsigned char *data) { struct unicode_data *leaf = l; unsigned char *s; *data++ = leaf->gen; if (HANGUL_SYLLABLE(leaf->code)) { *data++ = DECOMPOSE; *data++ = HANGUL; } else if (leaf->utf8nfdi) { *data++ = DECOMPOSE; s = (unsigned char*)leaf->utf8nfdi; while ((*data++ = *s++) != 0) ; } else { *data++ = leaf->ccc; } return data; } static unsigned char *nfdicf_emit(void *l, unsigned char *data) { struct unicode_data *leaf = l; unsigned char *s; *data++ = leaf->gen; if (HANGUL_SYLLABLE(leaf->code)) { *data++ = DECOMPOSE; *data++ = HANGUL; } else if (leaf->utf8nfdicf) { *data++ = DECOMPOSE; s = (unsigned char*)leaf->utf8nfdicf; while ((*data++ = *s++) != 0) ; } else if (leaf->utf8nfdi) { *data++ = DECOMPOSE; s = (unsigned char*)leaf->utf8nfdi; while ((*data++ = *s++) != 0) ; } else { *data++ = leaf->ccc; } return data; } static void utf8_create(struct unicode_data *data) { char utf[18*4+1]; char *u; unsigned int *um; int i; if (data->utf8nfdi) { assert(data->utf8nfdi[0] == HANGUL); return; } u = utf; um = data->utf32nfdi; if (um) { for (i = 0; um[i]; i++) u += utf8encode(u, um[i]); *u = '\0'; data->utf8nfdi = strdup(utf); } u = utf; um = data->utf32nfdicf; if (um) { for (i = 0; um[i]; i++) u += utf8encode(u, um[i]); *u = '\0'; if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) data->utf8nfdicf = strdup(utf); } } static void utf8_init(void) { unsigned int unichar; int i; for (unichar = 0; unichar != 0x110000; unichar++) utf8_create(&unicode_data[unichar]); for (i = 0; i != corrections_count; i++) utf8_create(&corrections[i]); } static void trees_init(void) { struct unicode_data *data; unsigned int maxage; unsigned int nextage; int count; int i; int j; /* Count the number of different ages. */ count = 0; nextage = (unsigned int)-1; do { maxage = nextage; nextage = 0; for (i = 0; i <= corrections_count; i++) { data = &corrections[i]; if (nextage < data->correction && data->correction < maxage) nextage = data->correction; } count++; } while (nextage); /* Two trees per age: nfdi and nfdicf */ trees_count = count * 2; trees = calloc(trees_count, sizeof(struct tree)); /* Assign ages to the trees. */ count = trees_count; nextage = (unsigned int)-1; do { maxage = nextage; trees[--count].maxage = maxage; trees[--count].maxage = maxage; nextage = 0; for (i = 0; i <= corrections_count; i++) { data = &corrections[i]; if (nextage < data->correction && data->correction < maxage) nextage = data->correction; } } while (nextage); /* The ages assigned above are off by one. */ for (i = 0; i != trees_count; i++) { j = 0; while (ages[j] < trees[i].maxage) j++; trees[i].maxage = ages[j-1]; } /* Set up the forwarding between trees. */ trees[trees_count-2].next = &trees[trees_count-1]; trees[trees_count-1].leaf_mark = nfdi_mark; trees[trees_count-2].leaf_mark = nfdicf_mark; for (i = 0; i != trees_count-2; i += 2) { trees[i].next = &trees[trees_count-2]; trees[i].leaf_mark = correction_mark; trees[i+1].next = &trees[trees_count-1]; trees[i+1].leaf_mark = correction_mark; } /* Assign the callouts. */ for (i = 0; i != trees_count; i += 2) { trees[i].type = "nfdicf"; trees[i].leaf_equal = nfdicf_equal; trees[i].leaf_print = nfdicf_print; trees[i].leaf_size = nfdicf_size; trees[i].leaf_index = nfdicf_index; trees[i].leaf_emit = nfdicf_emit; trees[i+1].type = "nfdi"; trees[i+1].leaf_equal = nfdi_equal; trees[i+1].leaf_print = nfdi_print; trees[i+1].leaf_size = nfdi_size; trees[i+1].leaf_index = nfdi_index; trees[i+1].leaf_emit = nfdi_emit; } /* Finish init. */ for (i = 0; i != trees_count; i++) trees[i].childnode = NODE; } static void trees_populate(void) { struct unicode_data *data; unsigned int unichar; char keyval[4]; int keylen; int i; for (i = 0; i != trees_count; i++) { if (verbose > 0) { printf("Populating %s_%x\n", trees[i].type, trees[i].maxage); } for (unichar = 0; unichar != 0x110000; unichar++) { if (unicode_data[unichar].gen < 0) continue; keylen = utf8encode(keyval, unichar); data = corrections_lookup(&unicode_data[unichar]); if (data->correction <= trees[i].maxage) data = &unicode_data[unichar]; insert(&trees[i], keyval, keylen, data); } } } static void trees_reduce(void) { int i; int size; int changed; for (i = 0; i != trees_count; i++) prune(&trees[i]); for (i = 0; i != trees_count; i++) mark_nodes(&trees[i]); do { size = 0; for (i = 0; i != trees_count; i++) size = index_nodes(&trees[i], size); changed = 0; for (i = 0; i != trees_count; i++) changed += size_nodes(&trees[i]); } while (changed); utf8data = calloc(size, 1); utf8data_size = size; for (i = 0; i != trees_count; i++) emit(&trees[i], utf8data); if (verbose > 0) { for (i = 0; i != trees_count; i++) { printf("%s_%x idx %d\n", trees[i].type, trees[i].maxage, trees[i].index); } } nfdi = utf8data + trees[trees_count-1].index; nfdicf = utf8data + trees[trees_count-2].index; nfdi_tree = &trees[trees_count-1]; nfdicf_tree = &trees[trees_count-2]; } static void verify(struct tree *tree) { struct unicode_data *data; utf8leaf_t *leaf; unsigned int unichar; char key[4]; unsigned char hangul[UTF8HANGULLEAF]; int report; int nocf; if (verbose > 0) printf("Verifying %s_%x\n", tree->type, tree->maxage); nocf = strcmp(tree->type, "nfdicf"); for (unichar = 0; unichar != 0x110000; unichar++) { report = 0; data = corrections_lookup(&unicode_data[unichar]); if (data->correction <= tree->maxage) data = &unicode_data[unichar]; utf8encode(key,unichar); leaf = utf8lookup(tree, hangul, key); if (!leaf) { if (data->gen != -1) report++; if (unichar < 0xd800 || unichar > 0xdfff) report++; } else { if (unichar >= 0xd800 && unichar <= 0xdfff) report++; if (data->gen == -1) report++; if (data->gen != LEAF_GEN(leaf)) report++; if (LEAF_CCC(leaf) == DECOMPOSE) { if (HANGUL_SYLLABLE(data->code)) { if (data->utf8nfdi[0] != HANGUL) report++; } else if (nocf) { if (!data->utf8nfdi) { report++; } else if (strcmp(data->utf8nfdi, LEAF_STR(leaf))) { report++; } } else { if (!data->utf8nfdicf && !data->utf8nfdi) { report++; } else if (data->utf8nfdicf) { if (strcmp(data->utf8nfdicf, LEAF_STR(leaf))) report++; } else if (strcmp(data->utf8nfdi, LEAF_STR(leaf))) { report++; } } } else if (data->ccc != LEAF_CCC(leaf)) { report++; } } if (report) { printf("%X code %X gen %d ccc %d" " nfdi -> \"%s\"", unichar, data->code, data->gen, data->ccc, data->utf8nfdi); if (leaf) { printf(" gen %d ccc %d" " nfdi -> \"%s\"", LEAF_GEN(leaf), LEAF_CCC(leaf), LEAF_CCC(leaf) == DECOMPOSE ? LEAF_STR(leaf) : ""); } printf("\n"); } } } static void trees_verify(void) { int i; for (i = 0; i != trees_count; i++) verify(&trees[i]); } /* ------------------------------------------------------------------ */ static void help(void) { printf("Usage: %s [options]\n", argv0); printf("\n"); printf("This program creates an a data trie used for parsing and\n"); printf("normalization of UTF-8 strings. The trie is derived from\n"); printf("a set of input files from the Unicode character database\n"); printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n"); printf("\n"); printf("The generated tree supports two normalization forms:\n"); printf("\n"); printf("\tnfdi:\n"); printf("\t- Apply unicode normalization form NFD.\n"); printf("\t- Remove any Default_Ignorable_Code_Point.\n"); printf("\n"); printf("\tnfdicf:\n"); printf("\t- Apply unicode normalization form NFD.\n"); printf("\t- Remove any Default_Ignorable_Code_Point.\n"); printf("\t- Apply a full casefold (C + F).\n"); printf("\n"); printf("These forms were chosen as being most useful when dealing\n"); printf("with file names: NFD catches most cases where characters\n"); printf("should be considered equivalent. The ignorables are mostly\n"); printf("invisible, making names hard to type.\n"); printf("\n"); printf("The options to specify the files to be used are listed\n"); printf("below with their default values, which are the names used\n"); printf("by version 11.0.0 of the Unicode Character Database.\n"); printf("\n"); printf("The input files:\n"); printf("\t-a %s\n", AGE_NAME); printf("\t-c %s\n", CCC_NAME); printf("\t-p %s\n", PROP_NAME); printf("\t-d %s\n", DATA_NAME); printf("\t-f %s\n", FOLD_NAME); printf("\t-n %s\n", NORM_NAME); printf("\n"); printf("Additionally, the generated tables are tested using:\n"); printf("\t-t %s\n", TEST_NAME); printf("\n"); printf("Finally, the output file:\n"); printf("\t-o %s\n", UTF8_NAME); printf("\n"); } static void usage(void) { help(); exit(1); } static void open_fail(const char *name, int error) { printf("Error %d opening %s: %s\n", error, name, strerror(error)); exit(1); } static void file_fail(const char *filename) { printf("Error parsing %s\n", filename); exit(1); } static void line_fail(const char *filename, const char *line) { printf("Error parsing %s:%s\n", filename, line); exit(1); } /* ------------------------------------------------------------------ */ static void print_utf32(unsigned int *utf32str) { int i; for (i = 0; utf32str[i]; i++) printf(" %X", utf32str[i]); } static void print_utf32nfdi(unsigned int unichar) { printf(" %X ->", unichar); print_utf32(unicode_data[unichar].utf32nfdi); printf("\n"); } static void print_utf32nfdicf(unsigned int unichar) { printf(" %X ->", unichar); print_utf32(unicode_data[unichar].utf32nfdicf); printf("\n"); } /* ------------------------------------------------------------------ */ static void age_init(void) { FILE *file; unsigned int first; unsigned int last; unsigned int unichar; unsigned int major; unsigned int minor; unsigned int revision; int gen; int count; int ret; if (verbose > 0) printf("Parsing %s\n", age_name); file = fopen(age_name, "r"); if (!file) open_fail(age_name, errno); count = 0; gen = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "# Age=V%d_%d_%d", &major, &minor, &revision); if (ret == 3) { ages_count++; if (verbose > 1) printf(" Age V%d_%d_%d\n", major, minor, revision); if (!age_valid(major, minor, revision)) line_fail(age_name, line); continue; } ret = sscanf(line, "# Age=V%d_%d", &major, &minor); if (ret == 2) { ages_count++; if (verbose > 1) printf(" Age V%d_%d\n", major, minor); if (!age_valid(major, minor, 0)) line_fail(age_name, line); continue; } } /* We must have found something above. */ if (verbose > 1) printf("%d age entries\n", ages_count); if (ages_count == 0 || ages_count > MAXGEN) file_fail(age_name); /* There is a 0 entry. */ ages_count++; ages = calloc(ages_count + 1, sizeof(*ages)); /* And a guard entry. */ ages[ages_count] = (unsigned int)-1; rewind(file); count = 0; gen = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "# Age=V%d_%d_%d", &major, &minor, &revision); if (ret == 3) { ages[++gen] = UNICODE_AGE(major, minor, revision); if (verbose > 1) printf(" Age V%d_%d_%d = gen %d\n", major, minor, revision, gen); if (!age_valid(major, minor, revision)) line_fail(age_name, line); continue; } ret = sscanf(line, "# Age=V%d_%d", &major, &minor); if (ret == 2) { ages[++gen] = UNICODE_AGE(major, minor, 0); if (verbose > 1) printf(" Age V%d_%d = %d\n", major, minor, gen); if (!age_valid(major, minor, 0)) line_fail(age_name, line); continue; } ret = sscanf(line, "%X..%X ; %d.%d #", &first, &last, &major, &minor); if (ret == 4) { for (unichar = first; unichar <= last; unichar++) unicode_data[unichar].gen = gen; count += 1 + last - first; if (verbose > 1) printf(" %X..%X gen %d\n", first, last, gen); if (!utf32valid(first) || !utf32valid(last)) line_fail(age_name, line); continue; } ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); if (ret == 3) { unicode_data[unichar].gen = gen; count++; if (verbose > 1) printf(" %X gen %d\n", unichar, gen); if (!utf32valid(unichar)) line_fail(age_name, line); continue; } } unicode_maxage = ages[gen]; fclose(file); /* Nix surrogate block */ if (verbose > 1) printf(" Removing surrogate block D800..DFFF\n"); for (unichar = 0xd800; unichar <= 0xdfff; unichar++) unicode_data[unichar].gen = -1; if (verbose > 0) printf("Found %d entries\n", count); if (count == 0) file_fail(age_name); } static void ccc_init(void) { FILE *file; unsigned int first; unsigned int last; unsigned int unichar; unsigned int value; int count; int ret; if (verbose > 0) printf("Parsing %s\n", ccc_name); file = fopen(ccc_name, "r"); if (!file) open_fail(ccc_name, errno); count = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value); if (ret == 3) { for (unichar = first; unichar <= last; unichar++) { unicode_data[unichar].ccc = value; count++; } if (verbose > 1) printf(" %X..%X ccc %d\n", first, last, value); if (!utf32valid(first) || !utf32valid(last)) line_fail(ccc_name, line); continue; } ret = sscanf(line, "%X ; %d #", &unichar, &value); if (ret == 2) { unicode_data[unichar].ccc = value; count++; if (verbose > 1) printf(" %X ccc %d\n", unichar, value); if (!utf32valid(unichar)) line_fail(ccc_name, line); continue; } } fclose(file); if (verbose > 0) printf("Found %d entries\n", count); if (count == 0) file_fail(ccc_name); } static int ignore_compatibility_form(char *type) { int i; char *ignored_types[] = {"font", "noBreak", "initial", "medial", "final", "isolated", "circle", "super", "sub", "vertical", "wide", "narrow", "small", "square", "fraction", "compat"}; for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) if (strcmp(type, ignored_types[i]) == 0) return 1; return 0; } static void nfdi_init(void) { FILE *file; unsigned int unichar; unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ char *s; char *type; unsigned int *um; int count; int i; int ret; if (verbose > 0) printf("Parsing %s\n", data_name); file = fopen(data_name, "r"); if (!file) open_fail(data_name, errno); count = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];", &unichar, buf0); if (ret != 2) continue; if (!utf32valid(unichar)) line_fail(data_name, line); s = buf0; /* skip over <tag> */ if (*s == '<') { type = ++s; while (*++s != '>'); *s++ = '\0'; if(ignore_compatibility_form(type)) continue; } /* decode the decomposition into UTF-32 */ i = 0; while (*s) { mapping[i] = strtoul(s, &s, 16); if (!utf32valid(mapping[i])) line_fail(data_name, line); i++; } mapping[i++] = 0; um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdi = um; if (verbose > 1) print_utf32nfdi(unichar); count++; } fclose(file); if (verbose > 0) printf("Found %d entries\n", count); if (count == 0) file_fail(data_name); } static void nfdicf_init(void) { FILE *file; unsigned int unichar; unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ char status; char *s; unsigned int *um; int i; int count; int ret; if (verbose > 0) printf("Parsing %s\n", fold_name); file = fopen(fold_name, "r"); if (!file) open_fail(fold_name, errno); count = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0); if (ret != 3) continue; if (!utf32valid(unichar)) line_fail(fold_name, line); /* Use the C+F casefold. */ if (status != 'C' && status != 'F') continue; s = buf0; if (*s == '<') while (*s++ != ' ') ; i = 0; while (*s) { mapping[i] = strtoul(s, &s, 16); if (!utf32valid(mapping[i])) line_fail(fold_name, line); i++; } mapping[i++] = 0; um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; if (verbose > 1) print_utf32nfdicf(unichar); count++; } fclose(file); if (verbose > 0) printf("Found %d entries\n", count); if (count == 0) file_fail(fold_name); } static void ignore_init(void) { FILE *file; unsigned int unichar; unsigned int first; unsigned int last; unsigned int *um; int count; int ret; if (verbose > 0) printf("Parsing %s\n", prop_name); file = fopen(prop_name, "r"); if (!file) open_fail(prop_name, errno); assert(file); count = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0); if (ret == 3) { if (strcmp(buf0, "Default_Ignorable_Code_Point")) continue; if (!utf32valid(first) || !utf32valid(last)) line_fail(prop_name, line); for (unichar = first; unichar <= last; unichar++) { free(unicode_data[unichar].utf32nfdi); um = malloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdi = um; free(unicode_data[unichar].utf32nfdicf); um = malloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdicf = um; count++; } if (verbose > 1) printf(" %X..%X Default_Ignorable_Code_Point\n", first, last); continue; } ret = sscanf(line, "%X ; %s # ", &unichar, buf0); if (ret == 2) { if (strcmp(buf0, "Default_Ignorable_Code_Point")) continue; if (!utf32valid(unichar)) line_fail(prop_name, line); free(unicode_data[unichar].utf32nfdi); um = malloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdi = um; free(unicode_data[unichar].utf32nfdicf); um = malloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdicf = um; if (verbose > 1) printf(" %X Default_Ignorable_Code_Point\n", unichar); count++; continue; } } fclose(file); if (verbose > 0) printf("Found %d entries\n", count); if (count == 0) file_fail(prop_name); } static void corrections_init(void) { FILE *file; unsigned int unichar; unsigned int major; unsigned int minor; unsigned int revision; unsigned int age; unsigned int *um; unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ char *s; int i; int count; int ret; if (verbose > 0) printf("Parsing %s\n", norm_name); file = fopen(norm_name, "r"); if (!file) open_fail(norm_name, errno); count = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", &unichar, buf0, buf1, &major, &minor, &revision); if (ret != 6) continue; if (!utf32valid(unichar) || !age_valid(major, minor, revision)) line_fail(norm_name, line); count++; } corrections = calloc(count, sizeof(struct unicode_data)); corrections_count = count; rewind(file); count = 0; while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", &unichar, buf0, buf1, &major, &minor, &revision); if (ret != 6) continue; if (!utf32valid(unichar) || !age_valid(major, minor, revision)) line_fail(norm_name, line); corrections[count] = unicode_data[unichar]; assert(corrections[count].code == unichar); age = UNICODE_AGE(major, minor, revision); corrections[count].correction = age; i = 0; s = buf0; while (*s) { mapping[i] = strtoul(s, &s, 16); if (!utf32valid(mapping[i])) line_fail(norm_name, line); i++; } mapping[i++] = 0; um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); corrections[count].utf32nfdi = um; if (verbose > 1) printf(" %X -> %s -> %s V%d_%d_%d\n", unichar, buf0, buf1, major, minor, revision); count++; } fclose(file); if (verbose > 0) printf("Found %d entries\n", count); if (count == 0) file_fail(norm_name); } /* ------------------------------------------------------------------ */ /* * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) * * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; * * SBase = 0xAC00 * LBase = 0x1100 * VBase = 0x1161 * TBase = 0x11A7 * LCount = 19 * VCount = 21 * TCount = 28 * NCount = 588 (VCount * TCount) * SCount = 11172 (LCount * NCount) * * Decomposition: * SIndex = s - SBase * * LV (Canonical/Full) * LIndex = SIndex / NCount * VIndex = (Sindex % NCount) / TCount * LPart = LBase + LIndex * VPart = VBase + VIndex * * LVT (Canonical) * LVIndex = (SIndex / TCount) * TCount * TIndex = (Sindex % TCount) * LVPart = SBase + LVIndex * TPart = TBase + TIndex * * LVT (Full) * LIndex = SIndex / NCount * VIndex = (Sindex % NCount) / TCount * TIndex = (Sindex % TCount) * LPart = LBase + LIndex * VPart = VBase + VIndex * if (TIndex == 0) { * d = <LPart, VPart> * } else { * TPart = TBase + TIndex * d = <LPart, VPart, TPart> * } * */ static void hangul_decompose(void) { unsigned int sb = 0xAC00; unsigned int lb = 0x1100; unsigned int vb = 0x1161; unsigned int tb = 0x11a7; /* unsigned int lc = 19; */ unsigned int vc = 21; unsigned int tc = 28; unsigned int nc = (vc * tc); /* unsigned int sc = (lc * nc); */ unsigned int unichar; unsigned int mapping[4]; unsigned int *um; int count; int i; if (verbose > 0) printf("Decomposing hangul\n"); /* Hangul */ count = 0; for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { unsigned int si = unichar - sb; unsigned int li = si / nc; unsigned int vi = (si % nc) / tc; unsigned int ti = si % tc; i = 0; mapping[i++] = lb + li; mapping[i++] = vb + vi; if (ti) mapping[i++] = tb + ti; mapping[i++] = 0; assert(!unicode_data[unichar].utf32nfdi); um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdi = um; assert(!unicode_data[unichar].utf32nfdicf); um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; /* * Add a cookie as a reminder that the hangul syllable * decompositions must not be stored in the generated * trie. */ unicode_data[unichar].utf8nfdi = malloc(2); unicode_data[unichar].utf8nfdi[0] = HANGUL; unicode_data[unichar].utf8nfdi[1] = '\0'; if (verbose > 1) print_utf32nfdi(unichar); count++; } if (verbose > 0) printf("Created %d entries\n", count); } static void nfdi_decompose(void) { unsigned int unichar; unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ unsigned int *um; unsigned int *dc; int count; int i; int j; int ret; if (verbose > 0) printf("Decomposing nfdi\n"); count = 0; for (unichar = 0; unichar != 0x110000; unichar++) { if (!unicode_data[unichar].utf32nfdi) continue; for (;;) { ret = 1; i = 0; um = unicode_data[unichar].utf32nfdi; while (*um) { dc = unicode_data[*um].utf32nfdi; if (dc) { for (j = 0; dc[j]; j++) mapping[i++] = dc[j]; ret = 0; } else { mapping[i++] = *um; } um++; } mapping[i++] = 0; if (ret) break; free(unicode_data[unichar].utf32nfdi); um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdi = um; } /* Add this decomposition to nfdicf if there is no entry. */ if (!unicode_data[unichar].utf32nfdicf) { um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; } if (verbose > 1) print_utf32nfdi(unichar); count++; } if (verbose > 0) printf("Processed %d entries\n", count); } static void nfdicf_decompose(void) { unsigned int unichar; unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ unsigned int *um; unsigned int *dc; int count; int i; int j; int ret; if (verbose > 0) printf("Decomposing nfdicf\n"); count = 0; for (unichar = 0; unichar != 0x110000; unichar++) { if (!unicode_data[unichar].utf32nfdicf) continue; for (;;) { ret = 1; i = 0; um = unicode_data[unichar].utf32nfdicf; while (*um) { dc = unicode_data[*um].utf32nfdicf; if (dc) { for (j = 0; dc[j]; j++) mapping[i++] = dc[j]; ret = 0; } else { mapping[i++] = *um; } um++; } mapping[i++] = 0; if (ret) break; free(unicode_data[unichar].utf32nfdicf); um = malloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; } if (verbose > 1) print_utf32nfdicf(unichar); count++; } if (verbose > 0) printf("Processed %d entries\n", count); } /* ------------------------------------------------------------------ */ int utf8agemax(struct tree *, const char *); int utf8nagemax(struct tree *, const char *, size_t); int utf8agemin(struct tree *, const char *); int utf8nagemin(struct tree *, const char *, size_t); ssize_t utf8len(struct tree *, const char *); ssize_t utf8nlen(struct tree *, const char *, size_t); struct utf8cursor; int utf8cursor(struct utf8cursor *, struct tree *, const char *); int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); int utf8byte(struct utf8cursor *); /* * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) * * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; * * SBase = 0xAC00 * LBase = 0x1100 * VBase = 0x1161 * TBase = 0x11A7 * LCount = 19 * VCount = 21 * TCount = 28 * NCount = 588 (VCount * TCount) * SCount = 11172 (LCount * NCount) * * Decomposition: * SIndex = s - SBase * * LV (Canonical/Full) * LIndex = SIndex / NCount * VIndex = (Sindex % NCount) / TCount * LPart = LBase + LIndex * VPart = VBase + VIndex * * LVT (Canonical) * LVIndex = (SIndex / TCount) * TCount * TIndex = (Sindex % TCount) * LVPart = SBase + LVIndex * TPart = TBase + TIndex * * LVT (Full) * LIndex = SIndex / NCount * VIndex = (Sindex % NCount) / TCount * TIndex = (Sindex % TCount) * LPart = LBase + LIndex * VPart = VBase + VIndex * if (TIndex == 0) { * d = <LPart, VPart> * } else { * TPart = TBase + TIndex * d = <LPart, VPart, TPart> * } */ /* Constants */ #define SB (0xAC00) #define LB (0x1100) #define VB (0x1161) #define TB (0x11A7) #define LC (19) #define VC (21) #define TC (28) #define NC (VC * TC) #define SC (LC * NC) /* Algorithmic decomposition of hangul syllable. */ static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) { unsigned int si; unsigned int li; unsigned int vi; unsigned int ti; unsigned char *h; /* Calculate the SI, LI, VI, and TI values. */ si = utf8decode(str) - SB; li = si / NC; vi = (si % NC) / TC; ti = si % TC; /* Fill in base of leaf. */ h = hangul; LEAF_GEN(h) = 2; LEAF_CCC(h) = DECOMPOSE; h += 2; /* Add LPart, a 3-byte UTF-8 sequence. */ h += utf8encode((char *)h, li + LB); /* Add VPart, a 3-byte UTF-8 sequence. */ h += utf8encode((char *)h, vi + VB); /* Add TPart if required, also a 3-byte UTF-8 sequence. */ if (ti) h += utf8encode((char *)h, ti + TB); /* Terminate string. */ h[0] = '\0'; return hangul; } /* * Use trie to scan s, touching at most len bytes. * Returns the leaf if one exists, NULL otherwise. * * A non-NULL return guarantees that the UTF-8 sequence starting at s * is well-formed and corresponds to a known unicode code point. The * shorthand for this will be "is valid UTF-8 unicode". */ static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, const char *s, size_t len) { utf8trie_t *trie; int offlen; int offset; int mask; int node; if (!tree) return NULL; if (len == 0) return NULL; node = 1; trie = utf8data + tree->index; while (node) { offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; if (*trie & NEXTBYTE) { if (--len == 0) return NULL; s++; } mask = 1 << (*trie & BITNUM); if (*s & mask) { /* Right leg */ if (offlen) { /* Right node at offset of trie */ node = (*trie & RIGHTNODE); offset = trie[offlen]; while (--offlen) { offset <<= 8; offset |= trie[offlen]; } trie += offset; } else if (*trie & RIGHTPATH) { /* Right node after this node */ node = (*trie & TRIENODE); trie++; } else { /* No right node. */ return NULL; } } else { /* Left leg */ if (offlen) { /* Left node after this node. */ node = (*trie & LEFTNODE); trie += offlen + 1; } else if (*trie & RIGHTPATH) { /* No left node. */ return NULL; } else { /* Left node after this node */ node = (*trie & TRIENODE); trie++; } } } /* * Hangul decomposition is done algorithmically. These are the * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is * always 3 bytes long, so s has been advanced twice, and the * start of the sequence is at s-2. */ if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) trie = utf8hangul(s - 2, hangul); return trie; } /* * Use trie to scan s. * Returns the leaf if one exists, NULL otherwise. * * Forwards to trie_nlookup(). */ static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, const char *s) { return utf8nlookup(tree, hangul, s, (size_t)-1); } /* * Return the number of bytes used by the current UTF-8 sequence. * Assumes the input points to the first byte of a valid UTF-8 * sequence. */ static inline int utf8clen(const char *s) { unsigned char c = *s; return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); } /* * Maximum age of any character in s. * Return -1 if s is not valid UTF-8 unicode. * Return 0 if only non-assigned code points are used. */ int utf8agemax(struct tree *tree, const char *s) { utf8leaf_t *leaf; int age = 0; int leaf_age; unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; while (*s) { leaf = utf8lookup(tree, hangul, s); if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age > age) age = leaf_age; s += utf8clen(s); } return age; } /* * Minimum age of any character in s. * Return -1 if s is not valid UTF-8 unicode. * Return 0 if non-assigned code points are used. */ int utf8agemin(struct tree *tree, const char *s) { utf8leaf_t *leaf; int age; int leaf_age; unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; age = tree->maxage; while (*s) { leaf = utf8lookup(tree, hangul, s); if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age < age) age = leaf_age; s += utf8clen(s); } return age; } /* * Maximum age of any character in s, touch at most len bytes. * Return -1 if s is not valid UTF-8 unicode. */ int utf8nagemax(struct tree *tree, const char *s, size_t len) { utf8leaf_t *leaf; int age = 0; int leaf_age; unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; while (len && *s) { leaf = utf8nlookup(tree, hangul, s, len); if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age > age) age = leaf_age; len -= utf8clen(s); s += utf8clen(s); } return age; } /* * Maximum age of any character in s, touch at most len bytes. * Return -1 if s is not valid UTF-8 unicode. */ int utf8nagemin(struct tree *tree, const char *s, size_t len) { utf8leaf_t *leaf; int leaf_age; int age; unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; age = tree->maxage; while (len && *s) { leaf = utf8nlookup(tree, hangul, s, len); if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age < age) age = leaf_age; len -= utf8clen(s); s += utf8clen(s); } return age; } /* * Length of the normalization of s. * Return -1 if s is not valid UTF-8 unicode. * * A string of Default_Ignorable_Code_Point has length 0. */ ssize_t utf8len(struct tree *tree, const char *s) { utf8leaf_t *leaf; size_t ret = 0; unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; while (*s) { leaf = utf8lookup(tree, hangul, s); if (!leaf) return -1; if (ages[LEAF_GEN(leaf)] > tree->maxage) ret += utf8clen(s); else if (LEAF_CCC(leaf) == DECOMPOSE) ret += strlen(LEAF_STR(leaf)); else ret += utf8clen(s); s += utf8clen(s); } return ret; } /* * Length of the normalization of s, touch at most len bytes. * Return -1 if s is not valid UTF-8 unicode. */ ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) { utf8leaf_t *leaf; size_t ret = 0; unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; while (len && *s) { leaf = utf8nlookup(tree, hangul, s, len); if (!leaf) return -1; if (ages[LEAF_GEN(leaf)] > tree->maxage) ret += utf8clen(s); else if (LEAF_CCC(leaf) == DECOMPOSE) ret += strlen(LEAF_STR(leaf)); else ret += utf8clen(s); len -= utf8clen(s); s += utf8clen(s); } return ret; } /* * Cursor structure used by the normalizer. */ struct utf8cursor { struct tree *tree; const char *s; const char *p; const char *ss; const char *sp; unsigned int len; unsigned int slen; short int ccc; short int nccc; unsigned int unichar; unsigned char hangul[UTF8HANGULLEAF]; }; /* * Set up an utf8cursor for use by utf8byte(). * * s : string. * len : length of s. * u8c : pointer to cursor. * trie : utf8trie_t to use for normalization. * * Returns -1 on error, 0 on success. */ int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s, size_t len) { if (!tree) return -1; if (!s) return -1; u8c->tree = tree; u8c->s = s; u8c->p = NULL; u8c->ss = NULL; u8c->sp = NULL; u8c->len = len; u8c->slen = 0; u8c->ccc = STOPPER; u8c->nccc = STOPPER; u8c->unichar = 0; /* Check we didn't clobber the maximum length. */ if (u8c->len != len) return -1; /* The first byte of s may not be an utf8 continuation. */ if (len > 0 && (*s & 0xC0) == 0x80) return -1; return 0; } /* * Set up an utf8cursor for use by utf8byte(). * * s : NUL-terminated string. * u8c : pointer to cursor. * trie : utf8trie_t to use for normalization. * * Returns -1 on error, 0 on success. */ int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s) { return utf8ncursor(u8c, tree, s, (unsigned int)-1); } /* * Get one byte from the normalized form of the string described by u8c. * * Returns the byte cast to an unsigned char on succes, and -1 on failure. * * The cursor keeps track of the location in the string in u8c->s. * When a character is decomposed, the current location is stored in * u8c->p, and u8c->s is set to the start of the decomposition. Note * that bytes from a decomposition do not count against u8c->len. * * Characters are emitted if they match the current CCC in u8c->ccc. * Hitting end-of-string while u8c->ccc == STOPPER means we're done, * and the function returns 0 in that case. * * Sorting by CCC is done by repeatedly scanning the string. The * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at * the start of the scan. The first pass finds the lowest CCC to be * emitted and stores it in u8c->nccc, the second pass emits the * characters with this CCC and finds the next lowest CCC. This limits * the number of passes to 1 + the number of different CCCs in the * sequence being scanned. * * Therefore: * u8c->p != NULL -> a decomposition is being scanned. * u8c->ss != NULL -> this is a repeating scan. * u8c->ccc == -1 -> this is the first scan of a repeating scan. */ int utf8byte(struct utf8cursor *u8c) { utf8leaf_t *leaf; int ccc; for (;;) { /* Check for the end of a decomposed character. */ if (u8c->p && *u8c->s == '\0') { u8c->s = u8c->p; u8c->p = NULL; } /* Check for end-of-string. */ if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { /* There is no next byte. */ if (u8c->ccc == STOPPER) return 0; /* End-of-string during a scan counts as a stopper. */ ccc = STOPPER; goto ccc_mismatch; } else if ((*u8c->s & 0xC0) == 0x80) { /* This is a continuation of the current character. */ if (!u8c->p) u8c->len--; return (unsigned char)*u8c->s++; } /* Look up the data for the current character. */ if (u8c->p) { leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); } else { leaf = utf8nlookup(u8c->tree, u8c->hangul, u8c->s, u8c->len); } /* No leaf found implies that the input is a binary blob. */ if (!leaf) return -1; /* Characters that are too new have CCC 0. */ if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { ccc = STOPPER; } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { u8c->len -= utf8clen(u8c->s); u8c->p = u8c->s + utf8clen(u8c->s); u8c->s = LEAF_STR(leaf); /* Empty decomposition implies CCC 0. */ if (*u8c->s == '\0') { if (u8c->ccc == STOPPER) continue; ccc = STOPPER; goto ccc_mismatch; } leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); ccc = LEAF_CCC(leaf); } u8c->unichar = utf8decode(u8c->s); /* * If this is not a stopper, then see if it updates * the next canonical class to be emitted. */ if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) u8c->nccc = ccc; /* * Return the current byte if this is the current * combining class. */ if (ccc == u8c->ccc) { if (!u8c->p) u8c->len--; return (unsigned char)*u8c->s++; } /* Current combining class mismatch. */ ccc_mismatch: if (u8c->nccc == STOPPER) { /* * Scan forward for the first canonical class * to be emitted. Save the position from * which to restart. */ assert(u8c->ccc == STOPPER); u8c->ccc = MINCCC - 1; u8c->nccc = ccc; u8c->sp = u8c->p; u8c->ss = u8c->s; u8c->slen = u8c->len; if (!u8c->p) u8c->len -= utf8clen(u8c->s); u8c->s += utf8clen(u8c->s); } else if (ccc != STOPPER) { /* Not a stopper, and not the ccc we're emitting. */ if (!u8c->p) u8c->len -= utf8clen(u8c->s); u8c->s += utf8clen(u8c->s); } else if (u8c->nccc != MAXCCC + 1) { /* At a stopper, restart for next ccc. */ u8c->ccc = u8c->nccc; u8c->nccc = MAXCCC + 1; u8c->s = u8c->ss; u8c->p = u8c->sp; u8c->len = u8c->slen; } else { /* All done, proceed from here. */ u8c->ccc = STOPPER; u8c->nccc = STOPPER; u8c->sp = NULL; u8c->ss = NULL; u8c->slen = 0; } } } /* ------------------------------------------------------------------ */ static int normalize_line(struct tree *tree) { char *s; char *t; int c; struct utf8cursor u8c; /* First test: null-terminated string. */ s = buf2; t = buf3; if (utf8cursor(&u8c, tree, s)) return -1; while ((c = utf8byte(&u8c)) > 0) if (c != (unsigned char)*t++) return -1; if (c < 0) return -1; if (*t != 0) return -1; /* Second test: length-limited string. */ s = buf2; /* Replace NUL with a value that will cause an error if seen. */ s[strlen(s) + 1] = -1; t = buf3; if (utf8cursor(&u8c, tree, s)) return -1; while ((c = utf8byte(&u8c)) > 0) if (c != (unsigned char)*t++) return -1; if (c < 0) return -1; if (*t != 0) return -1; return 0; } static void normalization_test(void) { FILE *file; unsigned int unichar; struct unicode_data *data; char *s; char *t; int ret; int ignorables; int tests = 0; int failures = 0; if (verbose > 0) printf("Parsing %s\n", test_name); /* Step one, read data from file. */ file = fopen(test_name, "r"); if (!file) open_fail(test_name, errno); while (fgets(line, LINESIZE, file)) { ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];", buf0, buf1); if (ret != 2 || *line == '#') continue; s = buf0; t = buf2; while (*s) { unichar = strtoul(s, &s, 16); t += utf8encode(t, unichar); } *t = '\0'; ignorables = 0; s = buf1; t = buf3; while (*s) { unichar = strtoul(s, &s, 16); data = &unicode_data[unichar]; if (data->utf8nfdi && !*data->utf8nfdi) ignorables = 1; else t += utf8encode(t, unichar); } *t = '\0'; tests++; if (normalize_line(nfdi_tree) < 0) { printf("Line %s -> %s", buf0, buf1); if (ignorables) printf(" (ignorables removed)"); printf(" failure\n"); failures++; } } fclose(file); if (verbose > 0) printf("Ran %d tests with %d failures\n", tests, failures); if (failures) file_fail(test_name); } /* ------------------------------------------------------------------ */ static void write_file(void) { FILE *file; int i; int j; int t; int gen; if (verbose > 0) printf("Writing %s\n", utf8_name); file = fopen(utf8_name, "w"); if (!file) open_fail(utf8_name, errno); fprintf(file, "/* This file is generated code, do not edit. */\n"); fprintf(file, "\n"); fprintf(file, "#include <linux/module.h>\n"); fprintf(file, "#include <linux/kernel.h>\n"); fprintf(file, "#include \"utf8n.h\"\n"); fprintf(file, "\n"); fprintf(file, "static const unsigned int utf8agetab[] = {\n"); for (i = 0; i != ages_count; i++) fprintf(file, "\t%#x%s\n", ages[i], ages[i] == unicode_maxage ? "" : ","); fprintf(file, "};\n"); fprintf(file, "\n"); fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n"); t = 0; for (gen = 0; gen < ages_count; gen++) { fprintf(file, "\t{ %#x, %d }%s\n", ages[gen], trees[t].index, ages[gen] == unicode_maxage ? "" : ","); if (trees[t].maxage == ages[gen]) t += 2; } fprintf(file, "};\n"); fprintf(file, "\n"); fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n"); t = 1; for (gen = 0; gen < ages_count; gen++) { fprintf(file, "\t{ %#x, %d }%s\n", ages[gen], trees[t].index, ages[gen] == unicode_maxage ? "" : ","); if (trees[t].maxage == ages[gen]) t += 2; } fprintf(file, "};\n"); fprintf(file, "\n"); fprintf(file, "static const unsigned char utf8data[%zd] = {\n", utf8data_size); t = 0; for (i = 0; i != utf8data_size; i += 16) { if (i == trees[t].index) { fprintf(file, "\t/* %s_%x */\n", trees[t].type, trees[t].maxage); if (t < trees_count-1) t++; } fprintf(file, "\t"); for (j = i; j != i + 16; j++) fprintf(file, "0x%.2x%s", utf8data[j], (j < utf8data_size -1 ? "," : "")); fprintf(file, "\n"); } fprintf(file, "};\n"); fprintf(file, "\n"); fprintf(file, "struct utf8data_table utf8_data_table = {\n"); fprintf(file, "\t.utf8agetab = utf8agetab,\n"); fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n"); fprintf(file, "\n"); fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n"); fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n"); fprintf(file, "\n"); fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n"); fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n"); fprintf(file, "\n"); fprintf(file, "\t.utf8data = utf8data,\n"); fprintf(file, "};\n"); fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); fprintf(file, "\n"); fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); fclose(file); } /* ------------------------------------------------------------------ */ int main(int argc, char *argv[]) { unsigned int unichar; int opt; argv0 = argv[0]; while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { switch (opt) { case 'a': age_name = optarg; break; case 'c': ccc_name = optarg; break; case 'd': data_name = optarg; break; case 'f': fold_name = optarg; break; case 'n': norm_name = optarg; break; case 'o': utf8_name = optarg; break; case 'p': prop_name = optarg; break; case 't': test_name = optarg; break; case 'v': verbose++; break; case 'h': help(); exit(0); default: usage(); } } if (verbose > 1) help(); for (unichar = 0; unichar != 0x110000; unichar++) unicode_data[unichar].code = unichar; age_init(); ccc_init(); nfdi_init(); nfdicf_init(); ignore_init(); corrections_init(); hangul_decompose(); nfdi_decompose(); nfdicf_decompose(); utf8_init(); trees_init(); trees_populate(); trees_reduce(); trees_verify(); /* Prevent "unused function" warning. */ (void)lookup(nfdi_tree, " "); if (verbose > 2) tree_walk(nfdi_tree); if (verbose > 2) tree_walk(nfdicf_tree); normalization_test(); write_file(); return 0; }