diff --git a/src/regex/regc_locale.c b/src/regex/regc_locale.c new file mode 100644 index 0000000000..4e13b8488b --- /dev/null +++ b/src/regex/regc_locale.c @@ -0,0 +1,838 @@ +/* + * regc_locale.c -- + * + * This file contains locale-specific regexp routines. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * This software is copyrighted by the Regents of the University of + * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + * Corporation and other parties. The following terms apply to all files + * associated with the software unless explicitly disclaimed in + * individual files. + * + * The authors hereby grant permission to use, copy, modify, distribute, + * and license this software and its documentation for any purpose, provided + * that existing copyright notices are retained in all copies and that this + * notice is included verbatim in any distributions. No written agreement, + * license, or royalty fee is required for any of the authorized uses. + * Modifications to this software may be copyrighted by their authors + * and need not follow the licensing terms described here, provided that + * the new terms are clearly indicated on the first page of each file where + * they apply. + * + * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + * MODIFICATIONS. + * + * GOVERNMENT USE: If you are acquiring this software on behalf of the + * U.S. government, the Government shall have only "Restricted Rights" + * in the software and related documentation as defined in the Federal + * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + * are acquiring the software on behalf of the Department of Defense, the + * software shall be classified as "Commercial Computer Software" and the + * Government shall have only "Restricted Rights" as defined in Clause + * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + * authors grant the U.S. Government and others acting in its behalf + * permission to use and distribute the software in accordance with the + * terms specified in this license. + * + * $Header$ + */ + +int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum) +{ + while(*cp++ == (const char)*wp++ && --nNum){} + + return nNum; +} + +/* ASCII character-name table */ + +static struct cname +{ + char *name; + char code; +} cnames[] = + +{ + { + "NUL", '\0' + }, + { + "SOH", '\001' + }, + { + "STX", '\002' + }, + { + "ETX", '\003' + }, + { + "EOT", '\004' + }, + { + "ENQ", '\005' + }, + { + "ACK", '\006' + }, + { + "BEL", '\007' + }, + { + "alert", '\007' + }, + { + "BS", '\010' + }, + { + "backspace", '\b' + }, + { + "HT", '\011' + }, + { + "tab", '\t' + }, + { + "LF", '\012' + }, + { + "newline", '\n' + }, + { + "VT", '\013' + }, + { + "vertical-tab", '\v' + }, + { + "FF", '\014' + }, + { + "form-feed", '\f' + }, + { + "CR", '\015' + }, + { + "carriage-return", '\r' + }, + { + "SO", '\016' + }, + { + "SI", '\017' + }, + { + "DLE", '\020' + }, + { + "DC1", '\021' + }, + { + "DC2", '\022' + }, + { + "DC3", '\023' + }, + { + "DC4", '\024' + }, + { + "NAK", '\025' + }, + { + "SYN", '\026' + }, + { + "ETB", '\027' + }, + { + "CAN", '\030' + }, + { + "EM", '\031' + }, + { + "SUB", '\032' + }, + { + "ESC", '\033' + }, + { + "IS4", '\034' + }, + { + "FS", '\034' + }, + { + "IS3", '\035' + }, + { + "GS", '\035' + }, + { + "IS2", '\036' + }, + { + "RS", '\036' + }, + { + "IS1", '\037' + }, + { + "US", '\037' + }, + { + "space", ' ' + }, + { + "exclamation-mark", '!' + }, + { + "quotation-mark", '"' + }, + { + "number-sign", '#' + }, + { + "dollar-sign", '$' + }, + { + "percent-sign", '%' + }, + { + "ampersand", '&' + }, + { + "apostrophe", '\'' + }, + { + "left-parenthesis", '(' + }, + { + "right-parenthesis", ')' + }, + { + "asterisk", '*' + }, + { + "plus-sign", '+' + }, + { + "comma", ',' + }, + { + "hyphen", '-' + }, + { + "hyphen-minus", '-' + }, + { + "period", '.' + }, + { + "full-stop", '.' + }, + { + "slash", '/' + }, + { + "solidus", '/' + }, + { + "zero", '0' + }, + { + "one", '1' + }, + { + "two", '2' + }, + { + "three", '3' + }, + { + "four", '4' + }, + { + "five", '5' + }, + { + "six", '6' + }, + { + "seven", '7' + }, + { + "eight", '8' + }, + { + "nine", '9' + }, + { + "colon", ':' + }, + { + "semicolon", ';' + }, + { + "less-than-sign", '<' + }, + { + "equals-sign", '=' + }, + { + "greater-than-sign", '>' + }, + { + "question-mark", '?' + }, + { + "commercial-at", '@' + }, + { + "left-square-bracket", '[' + }, + { + "backslash", '\\' + }, + { + "reverse-solidus", '\\' + }, + { + "right-square-bracket", ']' + }, + { + "circumflex", '^' + }, + { + "circumflex-accent", '^' + }, + { + "underscore", '_' + }, + { + "low-line", '_' + }, + { + "grave-accent", '`' + }, + { + "left-brace", '{' + }, + { + "left-curly-bracket", '{' + }, + { + "vertical-line", '|' + }, + { + "right-brace", '}' + }, + { + "right-curly-bracket", '}' + }, + { + "tilde", '~' + }, + { + "DEL", '\177' + }, + { + NULL, 0 + } +}; + +/* + * some ctype functions with non-ascii-char guard + */ +static int +wx_isdigit(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c)); +} + +static int +wx_isalpha(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c)); +} + +static int +wx_isalnum(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c)); +} + +static int +wx_isupper(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c)); +} + +static int +wx_islower(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c)); +} + +static int +wx_isgraph(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c)); +} + +static int +wx_ispunct(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c)); +} + +static int +wx_isspace(wx_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c)); +} + +static wx_wchar +wx_toupper(wx_wchar c) +{ + if (c >= 0 && c <= UCHAR_MAX) + return toupper((unsigned char) c); + return c; +} + +static wx_wchar +wx_tolower(wx_wchar c) +{ + if (c >= 0 && c <= UCHAR_MAX) + return tolower((unsigned char) c); + return c; +} + + +/* + * nmcces - how many distinct MCCEs are there? + */ +static int +nmcces(struct vars * v) +{ + /* + * No multi-character collating elements defined at the moment. + */ + return 0; +} + +/* + * nleaders - how many chrs can be first chrs of MCCEs? + */ +static int +nleaders(struct vars * v) +{ + return 0; +} + +/* + * allmcces - return a cvec with all the MCCEs of the locale + */ +static struct cvec * +allmcces(struct vars * v, /* context */ + struct cvec * cv) /* this is supposed to have enough room */ +{ + return clearcvec(cv); +} + +/* + * element - map collating-element name to celt + */ +static celt +element(struct vars * v, /* context */ + chr *startp, /* points to start of name */ + chr *endp) /* points just past end of name */ +{ + struct cname *cn; + size_t len; + + /* generic: one-chr names stand for themselves */ + assert(startp < endp); + len = endp - startp; + if (len == 1) + return *startp; + + NOTE(REG_ULOCALE); + + /* search table */ + for (cn = cnames; cn->name != NULL; cn++) + { + if (strlen(cn->name) == len && + char_and_wchar_strncmp(cn->name, startp, len) == 0) + { + break; /* NOTE BREAK OUT */ + } + } + if (cn->name != NULL) + return CHR(cn->code); + + /* couldn't find it */ + ERR(REG_ECOLLATE); + return 0; +} + +/* + * range - supply cvec for a range, including legality check + */ +static struct cvec * +range(struct vars * v, /* context */ + celt a, /* range start */ + celt b, /* range end, might equal a */ + int cases) /* case-independent? */ +{ + int nchrs; + struct cvec *cv; + celt c, + lc, + uc; + + if (a != b && !before(a, b)) + { + ERR(REG_ERANGE); + return NULL; + } + + if (!cases) + { /* easy version */ + cv = getcvec(v, 0, 1, 0); + NOERRN(); + addrange(cv, a, b); + return cv; + } + + /* + * When case-independent, it's hard to decide when cvec ranges are + * usable, so for now at least, we won't try. We allocate enough + * space for two case variants plus a little extra for the two title + * case variants. + */ + + nchrs = (b - a + 1) * 2 + 4; + + cv = getcvec(v, nchrs, 0, 0); + NOERRN(); + + for (c = a; c <= b; c++) + { + addchr(cv, c); + lc = wx_tolower((chr) c); + if (c != lc) + addchr(cv, lc); + uc = wx_toupper((chr) c); + if (c != uc) + addchr(cv, uc); + } + + return cv; +} + +/* + * before - is celt x before celt y, for purposes of range legality? + */ +static int /* predicate */ +before(celt x, celt y) +{ + /* trivial because no MCCEs */ + if (x < y) + return 1; + return 0; +} + +/* + * eclass - supply cvec for an equivalence class + * Must include case counterparts on request. + */ +static struct cvec * +eclass(struct vars * v, /* context */ + celt c, /* Collating element representing the + * equivalence class. */ + int cases) /* all cases? */ +{ + struct cvec *cv; + + /* crude fake equivalence class for testing */ + if ((v->cflags & REG_FAKE) && c == 'x') + { + cv = getcvec(v, 4, 0, 0); + addchr(cv, (chr) 'x'); + addchr(cv, (chr) 'y'); + if (cases) + { + addchr(cv, (chr) 'X'); + addchr(cv, (chr) 'Y'); + } + return cv; + } + + /* otherwise, none */ + if (cases) + return allcases(v, c); + cv = getcvec(v, 1, 0, 0); + assert(cv != NULL); + addchr(cv, (chr) c); + return cv; +} + +/* + * cclass - supply cvec for a character class + * + * Must include case counterparts on request. + */ +static struct cvec * +cclass(struct vars * v, /* context */ + chr *startp, /* where the name starts */ + chr *endp, /* just past the end of the name */ + int cases) /* case-independent? */ +{ + size_t len; + struct cvec *cv = NULL; + char **namePtr; + int i, + index; + + /* + * The following arrays define the valid character class names. + */ + + static char *classNames[] = { + "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "xdigit", NULL + }; + + enum classes + { + CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, + CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT + }; + + /* + * Map the name to the corresponding enumerated value. + */ + len = endp - startp; + index = -1; + for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) + { + if (strlen(*namePtr) == len && + char_and_wchar_strncmp(*namePtr, startp, len) == 0) + { + index = i; + break; + } + } + if (index == -1) + { + ERR(REG_ECTYPE); + return NULL; + } + + /* + * Remap lower and upper to alpha if the match is case insensitive. + */ + + if (cases && + ((enum classes) index == CC_LOWER || + (enum classes) index == CC_UPPER)) + index = (int) CC_ALPHA; + + /* + * Now compute the character class contents. + * + * For the moment, assume that only char codes < 256 can be in these + * classes. + */ + + switch ((enum classes) index) + { + case CC_PRINT: + case CC_ALNUM: + cv = getcvec(v, UCHAR_MAX, 1, 0); + if (cv) + { + for (i = 0; i <= UCHAR_MAX; i++) + { + if (wx_isalpha((chr) i)) + addchr(cv, (chr) i); + } + addrange(cv, (chr) '0', (chr) '9'); + } + break; + case CC_ALPHA: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) + { + for (i = 0; i <= UCHAR_MAX; i++) + { + if (wx_isalpha((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_ASCII: + cv = getcvec(v, 0, 1, 0); + if (cv) + addrange(cv, 0, 0x7f); + break; + case CC_BLANK: + cv = getcvec(v, 2, 0, 0); + addchr(cv, '\t'); + addchr(cv, ' '); + break; + case CC_CNTRL: + cv = getcvec(v, 0, 2, 0); + addrange(cv, 0x0, 0x1f); + addrange(cv, 0x7f, 0x9f); + break; + case CC_DIGIT: + cv = getcvec(v, 0, 1, 0); + if (cv) + addrange(cv, (chr) '0', (chr) '9'); + break; + case CC_PUNCT: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) + { + for (i = 0; i <= UCHAR_MAX; i++) + { + if (wx_ispunct((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_XDIGIT: + cv = getcvec(v, 0, 3, 0); + if (cv) + { + addrange(cv, '0', '9'); + addrange(cv, 'a', 'f'); + addrange(cv, 'A', 'F'); + } + break; + case CC_SPACE: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) + { + for (i = 0; i <= UCHAR_MAX; i++) + { + if (wx_isspace((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_LOWER: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) + { + for (i = 0; i <= UCHAR_MAX; i++) + { + if (wx_islower((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_UPPER: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) + { + for (i = 0; i <= UCHAR_MAX; i++) + { + if (wx_isupper((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_GRAPH: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) + { + for (i = 0; i <= UCHAR_MAX; i++) + { + if (wx_isgraph((chr) i)) + addchr(cv, (chr) i); + } + } + break; + } + if (cv == NULL) + ERR(REG_ESPACE); + return cv; +} + +/* + * allcases - supply cvec for all case counterparts of a chr (including itself) + * + * This is a shortcut, preferably an efficient one, for simple characters; + * messy cases are done via range(). + */ +static struct cvec * +allcases(struct vars * v, /* context */ + chr pc) /* character to get case equivs of */ +{ + struct cvec *cv; + chr c = (chr) pc; + chr lc, + uc; + + lc = wx_tolower((chr) c); + uc = wx_toupper((chr) c); + + cv = getcvec(v, 2, 0, 0); + addchr(cv, lc); + if (lc != uc) + addchr(cv, uc); + return cv; +} + +/* + * cmp - chr-substring compare + * + * Backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +cmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + return memcmp(VS(x), VS(y), len * sizeof(chr)); +} + +/* + * casecmp - case-independent chr-substring compare + * + * REG_ICASE backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +casecmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + for (; len > 0; len--, x++, y++) + { + if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y))) + return 1; + } + return 0; +}