// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2014 SGI. * All rights reserved. */ #include "utf8n.h" int utf8version_is_supported(const struct unicode_map *um, unsigned int version) { int i = um->tables->utf8agetab_size - 1; while (i >= 0 && um->tables->utf8agetab[i] != 0) { if (version == um->tables->utf8agetab[i]) return 1; i--; } return 0; } /* * UTF-8 valid ranges. * * The UTF-8 encoding spreads the bits of a 32bit word over several * bytes. This table gives the ranges that can be held and how they'd * be represented. * * 0x00000000 0x0000007F: 0xxxxxxx * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * * There is an additional requirement on UTF-8, in that only the * shortest representation of a 32bit value is to be used. A decoder * must not decode sequences that do not satisfy this requirement. * Thus the allowed ranges have a lower bound. * * 0x00000000 0x0000007F: 0xxxxxxx * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, * 17 planes of 65536 values. This limits the sequences actually seen * even more, to just the following. * * 0 - 0x7F: 0 - 0x7F * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF * * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. * * Note that the longest sequence seen with valid usage is 4 bytes, * the same a single UTF-32 character. This makes the UTF-8 * representation of Unicode strictly smaller than UTF-32. * * The shortest sequence requirement was introduced by: * Corrigendum #1: UTF-8 Shortest Form * It can be found here: * http://www.unicode.org/versions/corrigendum1.html * */ /* * Return the number of bytes used by the current UTF-8 sequence. * Assumes the input points to the first byte of a valid UTF-8 * sequence. */ static inline int utf8clen(const char *s) { unsigned char c = *s; return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); } /* * Decode a 3-byte UTF-8 sequence. */ static unsigned int utf8decode3(const char *str) { unsigned int uc; uc = *str++ & 0x0F; uc <<= 6; uc |= *str++ & 0x3F; uc <<= 6; uc |= *str++ & 0x3F; return uc; } /* * Encode a 3-byte UTF-8 sequence. */ static int utf8encode3(char *str, unsigned int val) { str[2] = (val & 0x3F) | 0x80; val >>= 6; str[1] = (val & 0x3F) | 0x80; val >>= 6; str[0] = val | 0xE0; return 3; } /* * utf8trie_t * * A compact binary tree, used to decode UTF-8 characters. * * Internal nodes are one byte for the node itself, and up to three * bytes for an offset into the tree. The first byte contains the * following information: * NEXTBYTE - flag - advance to next byte if set * BITNUM - 3 bit field - the bit number to tested * OFFLEN - 2 bit field - number of bytes in the offset * if offlen == 0 (non-branching node) * RIGHTPATH - 1 bit field - set if the following node is for the * right-hand path (tested bit is set) * TRIENODE - 1 bit field - set if the following node is an internal * node, otherwise it is a leaf node * if offlen != 0 (branching node) * LEFTNODE - 1 bit field - set if the left-hand node is internal * RIGHTNODE - 1 bit field - set if the right-hand node is internal * * Due to the way utf8 works, there cannot be branching nodes with * NEXTBYTE set, and moreover those nodes always have a righthand * descendant. */ typedef const unsigned char utf8trie_t; #define BITNUM 0x07 #define NEXTBYTE 0x08 #define OFFLEN 0x30 #define OFFLEN_SHIFT 4 #define RIGHTPATH 0x40 #define TRIENODE 0x80 #define RIGHTNODE 0x40 #define LEFTNODE 0x80 /* * utf8leaf_t * * The leaves of the trie are embedded in the trie, and so the same * underlying datatype: unsigned char. * * leaf[0]: The unicode version, stored as a generation number that is * an index into ->utf8agetab[]. With this we can filter code * points based on the unicode version in which they were * defined. The CCC of a non-defined code point is 0. * leaf[1]: Canonical Combining Class. During normalization, we need * to do a stable sort into ascending order of all characters * with a non-zero CCC that occur between two characters with * a CCC of 0, or at the begin or end of a string. * The unicode standard guarantees that all CCC values are * between 0 and 254 inclusive, which leaves 255 available as * a special value. * Code points with CCC 0 are known as stoppers. * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the * start of a NUL-terminated string that is the decomposition * of the character. * The CCC of a decomposable character is the same as the CCC * of the first character of its decomposition. * Some characters decompose as the empty string: these are * characters with the Default_Ignorable_Code_Point property. * These do affect normalization, as they all have CCC 0. * * The decompositions in the trie have been fully expanded, with the * exception of Hangul syllables, which are decomposed algorithmically. * * Casefolding, if applicable, is also done using decompositions. * * The trie is constructed in such a way that leaves exist for all * UTF-8 sequences that match the criteria from the "UTF-8 valid * ranges" comment above, and only for those sequences. Therefore a * lookup in the trie can be used to validate the UTF-8 input. */ typedef const unsigned char utf8leaf_t; #define LEAF_GEN(LEAF) ((LEAF)[0]) #define LEAF_CCC(LEAF) ((LEAF)[1]) #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) #define MINCCC (0) #define MAXCCC (254) #define STOPPER (0) #define DECOMPOSE (255) /* Marker for hangul syllable decomposition. */ #define HANGUL ((char)(255)) /* Size of the synthesized leaf used for Hangul syllable decomposition. */ #define UTF8HANGULLEAF (12) /* * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) * * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; * * SBase = 0xAC00 * LBase = 0x1100 * VBase = 0x1161 * TBase = 0x11A7 * LCount = 19 * VCount = 21 * TCount = 28 * NCount = 588 (VCount * TCount) * SCount = 11172 (LCount * NCount) * * Decomposition: * SIndex = s - SBase * * LV (Canonical/Full) * LIndex = SIndex / NCount * VIndex = (Sindex % NCount) / TCount * LPart = LBase + LIndex * VPart = VBase + VIndex * * LVT (Canonical) * LVIndex = (SIndex / TCount) * TCount * TIndex = (Sindex % TCount) * LVPart = SBase + LVIndex * TPart = TBase + TIndex * * LVT (Full) * LIndex = SIndex / NCount * VIndex = (Sindex % NCount) / TCount * TIndex = (Sindex % TCount) * LPart = LBase + LIndex * VPart = VBase + VIndex * if (TIndex == 0) { * d = <LPart, VPart> * } else { * TPart = TBase + TIndex * d = <LPart, TPart, VPart> * } */ /* Constants */ #define SB (0xAC00) #define LB (0x1100) #define VB (0x1161) #define TB (0x11A7) #define LC (19) #define VC (21) #define TC (28) #define NC (VC * TC) #define SC (LC * NC) /* Algorithmic decomposition of hangul syllable. */ static utf8leaf_t * utf8hangul(const char *str, unsigned char *hangul) { unsigned int si; unsigned int li; unsigned int vi; unsigned int ti; unsigned char *h; /* Calculate the SI, LI, VI, and TI values. */ si = utf8decode3(str) - SB; li = si / NC; vi = (si % NC) / TC; ti = si % TC; /* Fill in base of leaf. */ h = hangul; LEAF_GEN(h) = 2; LEAF_CCC(h) = DECOMPOSE; h += 2; /* Add LPart, a 3-byte UTF-8 sequence. */ h += utf8encode3((char *)h, li + LB); /* Add VPart, a 3-byte UTF-8 sequence. */ h += utf8encode3((char *)h, vi + VB); /* Add TPart if required, also a 3-byte UTF-8 sequence. */ if (ti) h += utf8encode3((char *)h, ti + TB); /* Terminate string. */ h[0] = '\0'; return hangul; } /* * Use trie to scan s, touching at most len bytes. * Returns the leaf if one exists, NULL otherwise. * * A non-NULL return guarantees that the UTF-8 sequence starting at s * is well-formed and corresponds to a known unicode code point. The * shorthand for this will be "is valid UTF-8 unicode". */ static utf8leaf_t *utf8nlookup(const struct unicode_map *um, enum utf8_normalization n, unsigned char *hangul, const char *s, size_t len) { utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset; int offlen; int offset; int mask; int node; if (len == 0) return NULL; node = 1; while (node) { offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; if (*trie & NEXTBYTE) { if (--len == 0) return NULL; s++; } mask = 1 << (*trie & BITNUM); if (*s & mask) { /* Right leg */ if (offlen) { /* Right node at offset of trie */ node = (*trie & RIGHTNODE); offset = trie[offlen]; while (--offlen) { offset <<= 8; offset |= trie[offlen]; } trie += offset; } else if (*trie & RIGHTPATH) { /* Right node after this node */ node = (*trie & TRIENODE); trie++; } else { /* No right node. */ return NULL; } } else { /* Left leg */ if (offlen) { /* Left node after this node. */ node = (*trie & LEFTNODE); trie += offlen + 1; } else if (*trie & RIGHTPATH) { /* No left node. */ return NULL; } else { /* Left node after this node */ node = (*trie & TRIENODE); trie++; } } } /* * Hangul decomposition is done algorithmically. These are the * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is * always 3 bytes long, so s has been advanced twice, and the * start of the sequence is at s-2. */ if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) trie = utf8hangul(s - 2, hangul); return trie; } /* * Use trie to scan s. * Returns the leaf if one exists, NULL otherwise. * * Forwards to utf8nlookup(). */ static utf8leaf_t *utf8lookup(const struct unicode_map *um, enum utf8_normalization n, unsigned char *hangul, const char *s) { return utf8nlookup(um, n, hangul, s, (size_t)-1); } /* * Length of the normalization of s, touch at most len bytes. * Return -1 if s is not valid UTF-8 unicode. */ ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, const char *s, size_t len) { utf8leaf_t *leaf; size_t ret = 0; unsigned char hangul[UTF8HANGULLEAF]; while (len && *s) { leaf = utf8nlookup(um, n, hangul, s, len); if (!leaf) return -1; if (um->tables->utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage) ret += utf8clen(s); else if (LEAF_CCC(leaf) == DECOMPOSE) ret += strlen(LEAF_STR(leaf)); else ret += utf8clen(s); len -= utf8clen(s); s += utf8clen(s); } return ret; } /* * Set up an utf8cursor for use by utf8byte(). * * u8c : pointer to cursor. * data : const struct utf8data to use for normalization. * s : string. * len : length of s. * * Returns -1 on error, 0 on success. */ int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, enum utf8_normalization n, const char *s, size_t len) { if (!s) return -1; u8c->um = um; u8c->n = n; u8c->s = s; u8c->p = NULL; u8c->ss = NULL; u8c->sp = NULL; u8c->len = len; u8c->slen = 0; u8c->ccc = STOPPER; u8c->nccc = STOPPER; /* Check we didn't clobber the maximum length. */ if (u8c->len != len) return -1; /* The first byte of s may not be an utf8 continuation. */ if (len > 0 && (*s & 0xC0) == 0x80) return -1; return 0; } /* * Get one byte from the normalized form of the string described by u8c. * * Returns the byte cast to an unsigned char on succes, and -1 on failure. * * The cursor keeps track of the location in the string in u8c->s. * When a character is decomposed, the current location is stored in * u8c->p, and u8c->s is set to the start of the decomposition. Note * that bytes from a decomposition do not count against u8c->len. * * Characters are emitted if they match the current CCC in u8c->ccc. * Hitting end-of-string while u8c->ccc == STOPPER means we're done, * and the function returns 0 in that case. * * Sorting by CCC is done by repeatedly scanning the string. The * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at * the start of the scan. The first pass finds the lowest CCC to be * emitted and stores it in u8c->nccc, the second pass emits the * characters with this CCC and finds the next lowest CCC. This limits * the number of passes to 1 + the number of different CCCs in the * sequence being scanned. * * Therefore: * u8c->p != NULL -> a decomposition is being scanned. * u8c->ss != NULL -> this is a repeating scan. * u8c->ccc == -1 -> this is the first scan of a repeating scan. */ int utf8byte(struct utf8cursor *u8c) { utf8leaf_t *leaf; int ccc; for (;;) { /* Check for the end of a decomposed character. */ if (u8c->p && *u8c->s == '\0') { u8c->s = u8c->p; u8c->p = NULL; } /* Check for end-of-string. */ if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { /* There is no next byte. */ if (u8c->ccc == STOPPER) return 0; /* End-of-string during a scan counts as a stopper. */ ccc = STOPPER; goto ccc_mismatch; } else if ((*u8c->s & 0xC0) == 0x80) { /* This is a continuation of the current character. */ if (!u8c->p) u8c->len--; return (unsigned char)*u8c->s++; } /* Look up the data for the current character. */ if (u8c->p) { leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); } else { leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul, u8c->s, u8c->len); } /* No leaf found implies that the input is a binary blob. */ if (!leaf) return -1; ccc = LEAF_CCC(leaf); /* Characters that are too new have CCC 0. */ if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] > u8c->um->ntab[u8c->n]->maxage) { ccc = STOPPER; } else if (ccc == DECOMPOSE) { u8c->len -= utf8clen(u8c->s); u8c->p = u8c->s + utf8clen(u8c->s); u8c->s = LEAF_STR(leaf); /* Empty decomposition implies CCC 0. */ if (*u8c->s == '\0') { if (u8c->ccc == STOPPER) continue; ccc = STOPPER; goto ccc_mismatch; } leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); if (!leaf) return -1; ccc = LEAF_CCC(leaf); } /* * If this is not a stopper, then see if it updates * the next canonical class to be emitted. */ if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) u8c->nccc = ccc; /* * Return the current byte if this is the current * combining class. */ if (ccc == u8c->ccc) { if (!u8c->p) u8c->len--; return (unsigned char)*u8c->s++; } /* Current combining class mismatch. */ ccc_mismatch: if (u8c->nccc == STOPPER) { /* * Scan forward for the first canonical class * to be emitted. Save the position from * which to restart. */ u8c->ccc = MINCCC - 1; u8c->nccc = ccc; u8c->sp = u8c->p; u8c->ss = u8c->s; u8c->slen = u8c->len; if (!u8c->p) u8c->len -= utf8clen(u8c->s); u8c->s += utf8clen(u8c->s); } else if (ccc != STOPPER) { /* Not a stopper, and not the ccc we're emitting. */ if (!u8c->p) u8c->len -= utf8clen(u8c->s); u8c->s += utf8clen(u8c->s); } else if (u8c->nccc != MAXCCC + 1) { /* At a stopper, restart for next ccc. */ u8c->ccc = u8c->nccc; u8c->nccc = MAXCCC + 1; u8c->s = u8c->ss; u8c->p = u8c->sp; u8c->len = u8c->slen; } else { /* All done, proceed from here. */ u8c->ccc = STOPPER; u8c->nccc = STOPPER; u8c->sp = NULL; u8c->ss = NULL; u8c->slen = 0; } } } #ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE EXPORT_SYMBOL_GPL(utf8version_is_supported); EXPORT_SYMBOL_GPL(utf8nlen); EXPORT_SYMBOL_GPL(utf8ncursor); EXPORT_SYMBOL_GPL(utf8byte); #endif