From f768cf70037ccedeffd4e2dd481694e4fbd3af7b Mon Sep 17 00:00:00 2001 From: zherczeg Date: Wed, 24 May 2017 10:14:43 +0000 Subject: [PATCH] Rework character range parsing in glob conversion. --- src/pcre2_convert.c | 254 +++++++++++++++++++++++++----------------- testdata/testinput24 | 12 ++ testdata/testoutput24 | 83 ++++++++------ 3 files changed, 215 insertions(+), 134 deletions(-) diff --git a/src/pcre2_convert.c b/src/pcre2_convert.c index 0cf74a8..1d055f8 100644 --- a/src/pcre2_convert.c +++ b/src/pcre2_convert.c @@ -423,6 +423,47 @@ out->output_size = output_size; } +/* Prints the separator into the output. + +Arguments: + out output context + separator glob separator + with_escape backslash is needed before separator +*/ + +static void +convert_glob_print_separator(pcre2_output_context *out, + PCRE2_UCHAR separator, BOOL with_escape) +{ +if (with_escape) + convert_glob_write(out, CHAR_BACKSLASH); + +convert_glob_write(out, separator); +} + + +/* Prints a wildcard into the output. + +Arguments: + out output context + separator glob separator + with_escape backslash is needed before separator +*/ + +static void +convert_glob_print_wildcard(pcre2_output_context *out, + PCRE2_UCHAR separator, BOOL with_escape) +{ +out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; +out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; +convert_glob_write_str(out, 2); + +convert_glob_print_separator(out, separator, with_escape); + +convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET); +} + + /* Parse a posix class. Arguments: @@ -519,77 +560,89 @@ Returns: 0 => success static int convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, - pcre2_output_context *out, PCRE2_UCHAR separator, BOOL with_escape) + pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator, + BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep) { +BOOL is_negative = FALSE; +BOOL separator_seen = FALSE; +BOOL has_prev_c; PCRE2_SPTR pattern = *from; -PCRE2_UCHAR c; +PCRE2_SPTR char_start = NULL; +uint32_t c, prev_c; int result, len; +(void)utf; /* Avoid compiler warning. */ + if (pattern >= pattern_end) { *from = pattern; return ERROR_MISSING_SQUARE_BRACKET; } -c = *pattern; - -if (c == CHAR_EXCLAMATION_MARK - || c == CHAR_CIRCUMFLEX_ACCENT) - { - out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; - out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; - len = 2; - } -else - { - out->out_str[0] = CHAR_LEFT_PARENTHESIS; - out->out_str[1] = CHAR_QUESTION_MARK; - out->out_str[2] = CHAR_EXCLAMATION_MARK; - len = 3; - } - -if (with_escape) - { - out->out_str[len] = CHAR_BACKSLASH; - len++; - } - -out->out_str[len] = (uint8_t) separator; - -convert_glob_write_str(out, len + 1); - -if (c == CHAR_EXCLAMATION_MARK - || c == CHAR_CIRCUMFLEX_ACCENT) +if (*pattern == CHAR_EXCLAMATION_MARK + || *pattern == CHAR_CIRCUMFLEX_ACCENT) { pattern++; + if (pattern >= pattern_end) { *from = pattern; return ERROR_MISSING_SQUARE_BRACKET; } - c = *pattern; + + is_negative = TRUE; + + out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; + out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; + len = 2; + + if (!no_wildsep) + { + if (with_escape) + { + out->out_str[len] = CHAR_BACKSLASH; + len++; + } + out->out_str[len] = (uint8_t) separator; + } + + convert_glob_write_str(out, len + 1); } else - { - out->out_str[0] = CHAR_RIGHT_PARENTHESIS; - out->out_str[1] = CHAR_LEFT_SQUARE_BRACKET; - convert_glob_write_str(out, 2); - } + convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET); -if (c == CHAR_MINUS || c == CHAR_RIGHT_SQUARE_BRACKET) +has_prev_c = FALSE; +prev_c = 0; + +if (*pattern == CHAR_RIGHT_SQUARE_BRACKET) { - convert_glob_write(out, CHAR_BACKSLASH); - convert_glob_write(out, c); + convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET); + has_prev_c = TRUE; + prev_c = CHAR_RIGHT_SQUARE_BRACKET; pattern++; } while (pattern < pattern_end) { - c = *pattern++; + char_start = pattern; + GETCHARINCTEST(c, pattern); if (c == CHAR_RIGHT_SQUARE_BRACKET) { convert_glob_write(out, c); + + if (!is_negative && !no_wildsep && separator_seen) + { + out->out_str[0] = CHAR_LEFT_PARENTHESIS; + out->out_str[1] = CHAR_QUESTION_MARK; + out->out_str[2] = CHAR_LESS_THAN_SIGN; + out->out_str[3] = CHAR_EXCLAMATION_MARK; + convert_glob_write_str(out, 4); + + convert_glob_print_separator(out, separator, with_escape); + convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); + } + *from = pattern; return 0; } @@ -605,32 +658,64 @@ while (pattern < pattern_end) pattern = *from; - /* A dash after a character class is a normal character. */ - if (pattern >= pattern_end || *pattern != CHAR_MINUS) - continue; - - c = CHAR_MINUS; - pattern++; + has_prev_c = FALSE; + prev_c = 0; + separator_seen = TRUE; + continue; } - else if (c == CHAR_MINUS) + else if (c == CHAR_MINUS && has_prev_c && + *pattern != CHAR_RIGHT_SQUARE_BRACKET) { convert_glob_write(out, CHAR_MINUS); - c = *pattern++; - if (c == CHAR_BACKSLASH) + char_start = pattern; + GETCHARINCTEST(c, pattern); + + if (pattern >= pattern_end) break; + + if (escape != 0 && c == escape) { - if (pattern >= pattern_end) break; - c = *pattern++; + char_start = pattern; + GETCHARINCTEST(c, pattern); } + else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) + { + *from = pattern; + return PCRE2_ERROR_CONVERT_SYNTAX; + } + + if (prev_c > c) + { + *from = pattern; + return PCRE2_ERROR_CONVERT_SYNTAX; + } + + if (prev_c < separator && separator < c) separator_seen = TRUE; + + has_prev_c = FALSE; + prev_c = 0; + } + else + { + if (escape != 0 && c == escape) + { + char_start = pattern; + GETCHARINCTEST(c, pattern); + + if (pattern >= pattern_end) break; + } + + has_prev_c = TRUE; + prev_c = c; } - else if (c == CHAR_BACKSLASH) - c = *pattern++; if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET || c == CHAR_BACKSLASH || c == CHAR_MINUS) convert_glob_write(out, CHAR_BACKSLASH); - convert_glob_write(out, c); + if (c == separator) separator_seen = TRUE; + + do convert_glob_write(out, *char_start++); while (char_start < pattern); } *from = pattern; @@ -638,47 +723,6 @@ return ERROR_MISSING_SQUARE_BRACKET; } -/* Prints the separator into the output. - -Arguments: - out output context - separator glob separator - with_escape backslash is needed before separator -*/ - -static void -convert_glob_print_separator(pcre2_output_context *out, - PCRE2_UCHAR separator, BOOL with_escape) -{ -if (with_escape) - convert_glob_write(out, CHAR_BACKSLASH); - -convert_glob_write(out, separator); -} - - -/* Prints a wildcard into the output. - -Arguments: - out output context - separator glob separator - with_escape backslash is needed before separator -*/ - -static void -convert_glob_print_wildcard(pcre2_output_context *out, - PCRE2_UCHAR separator, BOOL with_escape) -{ -out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; -out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; -convert_glob_write_str(out, 2); - -convert_glob_print_separator(out, separator, with_escape); - -convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET); -} - - /* Prints a (*COMMIT) into the output. Arguments: @@ -727,8 +771,8 @@ pcre2_output_context out; PCRE2_SPTR pattern_start = pattern; PCRE2_SPTR pattern_end = pattern + plength; PCRE2_UCHAR separator = ccontext->glob_separator; +PCRE2_UCHAR escape = ccontext->glob_escape; PCRE2_UCHAR c; -BOOL no_escape = ccontext->glob_escape == 0; BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0; BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0; BOOL in_atomic = FALSE; @@ -736,14 +780,16 @@ BOOL after_starstar = FALSE; BOOL with_escape, is_start; int result, len; -(void)utf; /* Avoid compiler warning */ +(void)utf; /* Avoid compiler warning. */ -if (separator >= 128) +#ifdef SUPPORT_UNICODE +if (utf && (separator >= 128 || escape >= 128)) { - /* Currently only ASCII separators are supported. */ + /* Currently only ASCII characters are supported. */ *bufflenptr = 0; return PCRE2_ERROR_CONVERT_SYNTAX; } +#endif with_escape = strchr(pcre2_escaped_literals, separator) != NULL; @@ -809,7 +855,7 @@ while (pattern < pattern_end) break; } - if (!no_escape && *pattern == ccontext->glob_escape) + if (escape != 0 && *pattern == escape) { pattern++; if (pattern >= pattern_end) @@ -908,6 +954,8 @@ while (pattern < pattern_end) out.out_str[0] = CHAR_ASTERISK; out.out_str[1] = CHAR_QUESTION_MARK; + if (pattern >= pattern_end) + out.out_str[1] = CHAR_PLUS; convert_glob_write_str(&out, 2); continue; } @@ -924,12 +972,12 @@ while (pattern < pattern_end) if (c == CHAR_LEFT_SQUARE_BRACKET) { result = convert_glob_parse_range(&pattern, pattern_end, - &out, separator, with_escape); + &out, utf, separator, with_escape, escape, no_wildsep); if (result != 0) break; continue; } - if (!no_escape && c == ccontext->glob_escape) + if (escape != 0 && c == escape) { if (pattern >= pattern_end) { diff --git a/testdata/testinput24 b/testdata/testinput24 index 20d4d5f..f96e4cc 100644 --- a/testdata/testinput24 +++ b/testdata/testinput24 @@ -227,6 +227,18 @@ /[[:alpha:][:xdigit:][:word:]]/ +"[/-/]" + +/[-----]/ + +/[------]/ + +/[!------]/ + +/[[:alpha:]-a]/ + +/[a-[:alpha:]]/ + /[[:alpha:/ /[[:alpha:]/ diff --git a/testdata/testoutput24 b/testdata/testoutput24 index dbb248e..9b4c11f 100644 --- a/testdata/testoutput24 +++ b/testdata/testoutput24 @@ -22,10 +22,10 @@ # Can't have separator in a class "[ab/cd]" -(?s)\A(?!/)[ab/cd]\z +(?s)\A[ab/cd](?/ -(?s)\A<(?!/)[a-c-d]>\z +(?s)\A<[a-c\-d]>\z 0: @@ -291,7 +294,7 @@ No match 0: <-> /a[[:digit:].]z/ -(?s)\Aa(?!/)[[:digit:].]z\z +(?s)\Aa[[:digit:].](?[^/]*?a)(?>[^/]*?\z) +(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*+\z) xx/xx/xx/xax 0: /xax xx/xx/xx/xax/xx No match /**\/*a*\/**\/*b*/ -(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*?\z) +(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*+\z) xx/xx/xx/xax/xx/xb 0: /xax/xx/xb xx/xx/xx/xax/xx/x @@ -402,10 +423,10 @@ No match #pattern convert=glob:glob_no_starstar /***/ -(?s)\A[^/]*?\z +(?s)\A[^/]*+\z /**a**/ -(?s)\A[^/]*?a(*COMMIT)[^/]*?\z +(?s)\A[^/]*?a(*COMMIT)[^/]*+\z #pattern convert=unset #pattern convert=glob:glob_no_wild_separator