From d802f1eca38e7eac3fa54e0010c880ee9962877c Mon Sep 17 00:00:00 2001 From: ph10 Date: Thu, 15 Jun 2017 16:41:44 +0000 Subject: [PATCH] Implement PCRE2_LITERAL and REG_NOSPEC. --- ChangeLog | 2 + RunTest | 2 +- doc/pcre2api.3 | 15 ++- doc/pcre2posix.3 | 22 +++-- doc/pcre2test.1 | 5 +- src/pcre2.h | 1 + src/pcre2.h.in | 1 + src/pcre2_compile.c | 216 +++++++++++++++++++++++++----------------- src/pcre2_error.c | 3 +- src/pcre2posix.c | 6 +- src/pcre2posix.h | 1 + src/pcre2test.c | 22 +++-- testdata/testinput18 | 5 + testdata/testinput2 | 35 +++++++ testdata/testinput5 | 3 + testdata/testoutput18 | 7 ++ testdata/testoutput2 | 68 ++++++++++++- testdata/testoutput5 | 4 + 18 files changed, 309 insertions(+), 109 deletions(-) diff --git a/ChangeLog b/ChangeLog index b775e16..d41ddff 100644 --- a/ChangeLog +++ b/ChangeLog @@ -187,6 +187,8 @@ starting offset greater than zero. 40. Implement the subject_literal modifier in pcre2test, and allow jitstack on pattern lines. +41. Implement PCRE2_LITERAL and use it to support REG_NOSPEC. + Version 10.23 14-February-2017 ------------------------------ diff --git a/RunTest b/RunTest index e7a3f7d..c0e3b1f 100755 --- a/RunTest +++ b/RunTest @@ -500,7 +500,7 @@ for bmode in "$test8" "$test16" "$test32"; do for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry if [ $? = 0 ] ; then - $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,101,191,192 >>testtry + $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,101,191,200 >>testtry checkresult $? 2 "$opt" fi done diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index aeaf36a..5972d3e 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "01 June 2017" "PCRE2 10.30" +.TH PCRE2API 3 "15 June 2017" "PCRE2 10.30" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1393,6 +1393,17 @@ continue over the newline. 
See also PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. +.sp + PCRE2_LITERAL +.sp +If this option is set, all meta-characters in the pattern are disabled, and it +is treated as a literal string. Matching literal strings with a regular +expression engine is not the most efficient way of doing it. If you are doing a +lot of literal matching and are worried about efficiency, you should consider +using other approaches. The only other options that are allowed with +PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, +PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, +PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. Any other options cause an error. .sp PCRE2_MATCH_UNSET_BACKREF .sp @@ -3508,6 +3519,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 01 June 2017 +Last updated: 15 June 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/doc/pcre2posix.3 b/doc/pcre2posix.3 index cce65fa..399e2a8 100644 --- a/doc/pcre2posix.3 +++ b/doc/pcre2posix.3 @@ -1,4 +1,4 @@ -.TH PCRE2POSIX 3 "05 June 2017" "PCRE2 10.30" +.TH PCRE2POSIX 3 "15 June 2017" "PCRE2 10.30" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SYNOPSIS" @@ -71,7 +71,7 @@ The function \fBregcomp()\fP is called to compile a pattern into an internal form. By default, the pattern is a C string terminated by a binary zero (but see REG_PEND below). The \fIpreg\fP argument is a pointer to a \fBregex_t\fP structure that is used as a base for storing information about -the compiled regular expression. (It is also used for input when REG_PEND is +the compiled regular expression. (It is also used for input when REG_PEND is set.) 
.P The argument \fIcflags\fP is either zero, or contains one or more of the bits @@ -93,6 +93,14 @@ compilation to the native function. The PCRE2_MULTILINE option is set when the regular expression is passed for compilation to the native function. Note that this does \fInot\fP mimic the defined POSIX behaviour for REG_NEWLINE (see the following section). +.sp + REG_NOSPEC +.sp +The PCRE2_LITERAL option is set when the regular expression is passed for +compilation to the native function. This disables all meta characters in the +pattern, causing it to be treated as a literal string. The only other options +that are allowed with REG_NOSPEC are REG_ICASE, REG_NOSUB, REG_PEND, and +REG_UTF. Note that REG_NOSPEC is not part of the POSIX standard. .sp REG_NOSUB .sp @@ -104,8 +112,8 @@ because it disables the use of back references. .sp REG_PEND .sp -If this option is set, the \fBreg_endp\fP field in the \fIpreg\fP structure -(which has the type const char *) must be set to point to the character beyond +If this option is set, the \fBre_endp\fP field in the \fIpreg\fP structure +(which has the type const char *) must be set to point to the character beyond the end of the pattern before calling \fBregcomp()\fP. The pattern itself may now contain binary zeroes, which are treated as data characters. Without REG_PEND, a binary zero terminates the pattern and the \fBre_endp\fP field is @@ -218,8 +226,8 @@ function. .sp When this option is set, the subject string is starts at \fIstring\fP + \fIpmatch[0].rm_so\fP and ends at \fIstring\fP + \fIpmatch[0].rm_eo\fP, which -should point to the first character beyond the string. There may be binary -zeroes within the subject string, and indeed, using REG_STARTEND is the only +should point to the first character beyond the string. There may be binary +zeroes within the subject string, and indeed, using REG_STARTEND is the only way to pass a subject string that contains a binary zero. 
.P Whatever the value of \fIpmatch[0].rm_so\fP, the offsets of the matched string @@ -292,6 +300,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 05 June 2017 +Last updated: 15 June 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index e92a04e..d0bcce2 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "12 June 2017" "PCRE 10.30" +.TH PCRE2TEST 1 "15 June 2017" "PCRE 10.30" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -555,6 +555,7 @@ for a description of the effects of these options. /x extended set PCRE2_EXTENDED /xx extended_more set PCRE2_EXTENDED_MORE firstline set PCRE2_FIRSTLINE + literal set PCRE2_LITERAL match_unset_backref set PCRE2_MATCH_UNSET_BACKREF /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C @@ -1834,6 +1835,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 12 June 2017 +Last updated: 15 June 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/src/pcre2.h b/src/pcre2.h index cc2a642..2024263 100644 --- a/src/pcre2.h +++ b/src/pcre2.h @@ -138,6 +138,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_ALT_VERBNAMES 0x00400000u /* C */ #define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */ #define PCRE2_EXTENDED_MORE 0x01000000u /* C */ +#define PCRE2_LITERAL 0x02000000u /* C */ /* An additional compile options word is available in the compile context. */ diff --git a/src/pcre2.h.in b/src/pcre2.h.in index b6fd9d9..ec080cc 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -138,6 +138,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_ALT_VERBNAMES 0x00400000u /* C */ #define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */ #define PCRE2_EXTENDED_MORE 0x01000000u /* C */ +#define PCRE2_LITERAL 0x02000000u /* C */ /* An additional compile options word is available in the compile context. 
*/ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 949897c..ff9261b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -696,13 +696,18 @@ static int posix_substitutes[] = { (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \ PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_ENDANCHORED|PCRE2_EXTENDED| \ - PCRE2_EXTENDED_MORE|PCRE2_FIRSTLINE| \ + PCRE2_EXTENDED_MORE|PCRE2_FIRSTLINE|PCRE2_LITERAL| \ PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \ PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ PCRE2_UTF) +#define PUBLIC_LITERAL_COMPILE_OPTIONS \ + (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \ + PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_NO_START_OPTIMIZE| \ + PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF) + /* Compile time error code numbers. They are given names so that they can more easily be tracked. When a new number is added, the tables called eint1 and eint2 in pcre2posix.c may need to be updated, and a new error text must be @@ -718,7 +723,7 @@ enum { ERR0 = COMPILE_ERROR_BASE, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90, - ERR91}; + ERR91, ERR92}; /* This is a table of start-of-pattern options such as (*UTF) and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward @@ -1613,8 +1618,8 @@ else if (c >= CHAR_8) break; - /* Fall through */ - + /* Fall through */ + /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. 
The original code used just to take the least significant 8 bits of octal numbers (I think this is what early Perls used @@ -2170,7 +2175,7 @@ the parsed pattern. Arguments: ptr current pattern pointer pcalloutptr points to a pointer to previous callout, or NULL - options the compiling options + auto_callout TRUE if auto_callouts are enabled parsed_pattern the parsed pattern pointer cb compile block @@ -2178,7 +2183,7 @@ Returns: possibly updated parsed_pattern pointer. */ static uint32_t * -manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, uint32_t options, +manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout, uint32_t *parsed_pattern, compile_block *cb) { uint32_t *previous_callout = *pcalloutptr; @@ -2186,7 +2191,7 @@ uint32_t *previous_callout = *pcalloutptr; if (previous_callout != NULL) previous_callout[2] = ptr - cb->start_pattern - (PCRE2_SIZE)previous_callout[1]; -if ((options & PCRE2_AUTO_CALLOUT) == 0) previous_callout = NULL; else +if (!auto_callout) previous_callout = NULL; else { if (previous_callout == NULL || previous_callout != parsed_pattern - 4 || @@ -2288,15 +2293,44 @@ int i; BOOL inescq = FALSE; BOOL inverbname = FALSE; BOOL utf = (options & PCRE2_UTF) != 0; +BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0; BOOL isdupname; BOOL negate_class; BOOL okquantifier = FALSE; +PCRE2_SPTR thisptr; PCRE2_SPTR name; PCRE2_SPTR ptrend = cb->end_pattern; PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ named_group *ng; -nest_save *top_nest = NULL; -nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); +nest_save *top_nest, *end_nests; + +/* If the pattern is actually a literal string, process it separately to avoid +cluttering up the main loop. 
*/ + +if ((options & PCRE2_LITERAL) != 0) + { + while (ptr < ptrend) + { + if (parsed_pattern >= parsed_pattern_end) + { + errorcode = ERR63; /* Internal error (parsed pattern overflow) */ + goto FAILED; + } + thisptr = ptr; + GETCHARINCTEST(c, ptr); + if (auto_callout) + parsed_pattern = manage_callouts(thisptr, &previous_callout, + auto_callout, parsed_pattern, cb); + PARSED_LITERAL(c, parsed_pattern); + } + *parsed_pattern = META_END; + return 0; + } + +/* Process a real regex which may contain meta-characters. */ + +top_nest = NULL; +end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); /* The size of the nest_save structure might not be a factor of the size of the workspace. Therefore we must round down end_nests so as to correctly avoid @@ -2311,8 +2345,6 @@ if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED; /* Now scan the pattern */ -*has_lookbehind = FALSE; - while (ptr < ptrend) { int prev_expect_cond_assert; @@ -2322,7 +2354,6 @@ while (ptr < ptrend) uint32_t prev_meta_quantifier; BOOL prev_okquantifier; PCRE2_SPTR tempptr; - PCRE2_SPTR thisptr; PCRE2_SIZE offset; if (parsed_pattern >= parsed_pattern_end) @@ -2334,7 +2365,7 @@ while (ptr < ptrend) if (nest_depth > cb->cx->parens_nest_limit) { errorcode = ERR19; - goto FAILED; + goto FAILED; /* Parentheses too deeply nested */ } /* Get next input character, save its position for callout handling. 
*/ @@ -2361,8 +2392,8 @@ while (ptr < ptrend) goto FAILED; } if (!inverbname && after_manual_callout-- <= 0) - parsed_pattern = manage_callouts(thisptr, &previous_callout, options, - parsed_pattern, cb); + parsed_pattern = manage_callouts(thisptr, &previous_callout, + auto_callout, parsed_pattern, cb); PARSED_LITERAL(c, parsed_pattern); meta_quantifier = 0; } @@ -2507,7 +2538,7 @@ while (ptr < ptrend) !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode)))) { if (after_manual_callout-- <= 0) - parsed_pattern = manage_callouts(thisptr, &previous_callout, options, + parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout, parsed_pattern, cb); } @@ -2601,9 +2632,9 @@ while (ptr < ptrend) goto FAILED; ptr = tempptr; if (ptr >= ptrend) c = CHAR_BACKSLASH; else - { + { GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ - } + } escape = 0; /* Treat as literal character */ } @@ -3151,10 +3182,10 @@ while (ptr < ptrend) else { - tempptr = ptr; + tempptr = ptr; escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, TRUE, cb); - + if (errorcode != 0) { CLASS_ESCAPE_FAILED: @@ -3162,12 +3193,12 @@ while (ptr < ptrend) goto FAILED; ptr = tempptr; if (ptr >= ptrend) c = CHAR_BACKSLASH; else - { + { GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ - } + } escape = 0; /* Treat as literal character */ } - + if (escape == 0) /* Escaped character code point is in c */ { char_is_literal = FALSE; @@ -3281,7 +3312,7 @@ while (ptr < ptrend) default: /* All others are not allowed in a class */ errorcode = ERR7; - ptr--; + ptr--; goto CLASS_ESCAPE_FAILED; } } @@ -4135,7 +4166,7 @@ if (inverbname && ptr >= ptrend) /* Manage callout for the final item */ -parsed_pattern = manage_callouts(ptr, &previous_callout, options, +parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout, parsed_pattern, cb); /* Terminate the parsed pattern, then return success if all groups are closed. 
@@ -6426,7 +6457,7 @@ for (;; pptr++) group_return = -1; /* Set "may match empty string" */ /* Now treat as a repeated OP_BRA. */ - /* Fall through */ + /* Fall through */ /* If previous was a bracket group, we may have to replicate it in certain cases. Note that at this point we can encounter only the "basic" @@ -8552,7 +8583,7 @@ for (;; pptr++) goto RECURSE_OR_BACKREF_LENGTH; } - /* Fall through */ + /* Fall through */ /* For groups >= 10 - picking up group twice does no harm. */ /* A true recursion implies not fixed length, but a subroutine call may @@ -8891,7 +8922,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) { BOOL utf; /* Set TRUE for UTF mode */ -BOOL has_lookbehind; /* Set TRUE if a lookbehind is found */ +BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ pcre2_real_code *re = NULL; /* What we will return */ compile_block cb; /* "Static" compile-time data */ @@ -8961,6 +8992,13 @@ if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) return NULL; } +if ((options & PCRE2_LITERAL) != 0 && + (options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0) + { + *errorptr = ERR92; + return NULL; + } + /* A NULL compile context means "use a default context" */ if (ccontext == NULL) @@ -9039,10 +9077,11 @@ for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET; /* --------------- Start looking at the pattern --------------- */ -/* Check for global one-time option settings at the start of the pattern, and -remember the offset to the actual regex. With valgrind support, make the -terminator of a zero-terminated pattern inaccessible. This catches bugs that -would otherwise only show up for non-zero-terminated patterns. */ +/* Unless PCRE2_LITERAL is set, check for global one-time option settings at +the start of the pattern, and remember the offset to the actual regex. 
With +valgrind support, make the terminator of a zero-terminated pattern +inaccessible. This catches bugs that would otherwise only show up for +non-zero-terminated patterns. */ #ifdef SUPPORT_VALGRIND if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1)); @@ -9051,72 +9090,75 @@ if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1)); ptr = pattern; skipatstart = 0; -while (patlen - skipatstart >= 2 && - ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && - ptr[skipatstart+1] == CHAR_ASTERISK) +if ((options & PCRE2_LITERAL) == 0) { - for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) + while (patlen - skipatstart >= 2 && + ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && + ptr[skipatstart+1] == CHAR_ASTERISK) { - pso *p = pso_list + i; - - if (patlen - skipatstart - 2 >= p->length && - PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0) + for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) { uint32_t c, pp; + pso *p = pso_list + i; - skipatstart += p->length + 2; - switch(p->type) + if (patlen - skipatstart - 2 >= p->length && + PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name), + p->length) == 0) { - case PSO_OPT: - cb.external_options |= p->value; - break; - - case PSO_FLG: - setflags |= p->value; - break; - - case PSO_NL: - newline = p->value; - setflags |= PCRE2_NL_SET; - break; - - case PSO_BSR: - bsr = p->value; - setflags |= PCRE2_BSR_SET; - break; - - case PSO_LIMM: - case PSO_LIMD: - case PSO_LIMH: - c = 0; - pp = skipatstart; - if (!IS_DIGIT(ptr[pp])) + skipatstart += p->length + 2; + switch(p->type) { - errorcode = ERR60; - ptr += pp; - goto HAD_EARLY_ERROR; + case PSO_OPT: + cb.external_options |= p->value; + break; + + case PSO_FLG: + setflags |= p->value; + break; + + case PSO_NL: + newline = p->value; + setflags |= PCRE2_NL_SET; + break; + + case PSO_BSR: + bsr = p->value; + setflags |= PCRE2_BSR_SET; + break; + + case PSO_LIMM: + case PSO_LIMD: + case PSO_LIMH: + c = 0; + pp = 
skipatstart; + if (!IS_DIGIT(ptr[pp])) + { + errorcode = ERR60; + ptr += pp; + goto HAD_EARLY_ERROR; + } + while (IS_DIGIT(ptr[pp])) + { + if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ + c = c*10 + (ptr[pp++] - CHAR_0); + } + if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR60; + ptr += pp; + goto HAD_EARLY_ERROR; + } + if (p->type == PSO_LIMH) limit_heap = c; + else if (p->type == PSO_LIMM) limit_match = c; + else limit_depth = c; + skipatstart += pp - skipatstart; + break; } - while (IS_DIGIT(ptr[pp])) - { - if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ - c = c*10 + (ptr[pp++] - CHAR_0); - } - if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) - { - errorcode = ERR60; - ptr += pp; - goto HAD_EARLY_ERROR; - } - if (p->type == PSO_LIMH) limit_heap = c; - else if (p->type == PSO_LIMM) limit_match = c; - else limit_depth = c; - skipatstart += pp - skipatstart; - break; + break; /* Out of the table scan loop */ } - break; /* Out of the table scan loop */ } + if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ } - if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ } /* End of pattern-start options; advance to start of real regex. */ diff --git a/src/pcre2_error.c b/src/pcre2_error.c index e8203b4..daeb2a2 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -176,7 +176,8 @@ static const unsigned char compile_error_texts[] = "internal error: unknown code in parsed pattern\0" /* 90 */ "internal error: bad code value in parsed_skip()\0" - "PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0" + "PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0" + "invalid option bits with PCRE2_LITERAL\0" ; /* Match-time and UTF error texts are in the same format. 
*/ diff --git a/src/pcre2posix.c b/src/pcre2posix.c index 0c460cb..ebe187d 100644 --- a/src/pcre2posix.c +++ b/src/pcre2posix.c @@ -142,6 +142,7 @@ static const int eint2[] = { 32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */ 37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */ 56, REG_INVARG, /* internal error: unknown newline setting */ + 92, REG_INVARG, /* invalid option bits with PCRE2_LITERAL */ }; /* Table of texts corresponding to POSIX error codes */ @@ -242,6 +243,7 @@ patlen = ((cflags & REG_PEND) != 0)? (PCRE2_SIZE)(preg->re_endp - pattern) : if ((cflags & REG_ICASE) != 0) options |= PCRE2_CASELESS; if ((cflags & REG_NEWLINE) != 0) options |= PCRE2_MULTILINE; if ((cflags & REG_DOTALL) != 0) options |= PCRE2_DOTALL; +if ((cflags & REG_NOSPEC) != 0) options |= PCRE2_LITERAL; if ((cflags & REG_UTF) != 0) options |= PCRE2_UTF; if ((cflags & REG_UCP) != 0) options |= PCRE2_UCP; if ((cflags & REG_UNGREEDY) != 0) options |= PCRE2_UNGREEDY; @@ -260,10 +262,10 @@ if (preg->re_pcre2_code == NULL) if (errorcode < COMPILE_ERROR_BASE) return REG_BADPAT; errorcode -= COMPILE_ERROR_BASE; - + if (errorcode < (int)(sizeof(eint1)/sizeof(const int))) return eint1[errorcode]; - for (i = 0; i < sizeof(eint2)/(2*sizeof(const int)); i += 2) + for (i = 0; i < sizeof(eint2)/sizeof(const int); i += 2) if (errorcode == eint2[i]) return eint2[i+1]; return REG_BADPAT; } diff --git a/src/pcre2posix.h b/src/pcre2posix.h index c17be3b..651eedd 100644 --- a/src/pcre2posix.h +++ b/src/pcre2posix.h @@ -63,6 +63,7 @@ extern "C" { #define REG_UNGREEDY 0x0200 /* NOT defined by POSIX; maps to PCRE2_UNGREEDY */ #define REG_UCP 0x0400 /* NOT defined by POSIX; maps to PCRE2_UCP */ #define REG_PEND 0x0800 /* GNU feature: pass end pattern by re_endp */ +#define REG_NOSPEC 0x1000 /* Maps to PCRE2_LITERAL */ /* This is not used by PCRE2, but by defining it we make it easier to slot PCRE2 into existing programs that make POSIX calls. 
*/ diff --git a/src/pcre2test.c b/src/pcre2test.c index ce052f6..14e9153 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -634,6 +634,7 @@ static modstruct modlist[] = { { "jitfast", MOD_PAT, MOD_CTL, CTL_JITFAST, PO(control) }, { "jitstack", MOD_PNDP, MOD_INT, 0, PO(jitstack) }, { "jitverify", MOD_PAT, MOD_CTL, CTL_JITVERIFY, PO(control) }, + { "literal", MOD_PAT, MOD_OPT, PCRE2_LITERAL, PO(options) }, { "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) }, { "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) }, { "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) }, @@ -696,8 +697,8 @@ static modstruct modlist[] = { /* Controls and options that are supported for use with the POSIX interface. */ #define POSIX_SUPPORTED_COMPILE_OPTIONS ( \ - PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_MULTILINE|PCRE2_UCP|PCRE2_UTF| \ - PCRE2_UNGREEDY) + PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_LITERAL|PCRE2_MULTILINE|PCRE2_UCP| \ + PCRE2_UTF|PCRE2_UNGREEDY) #define POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS (0) @@ -4030,7 +4031,7 @@ static void show_compile_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s %s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "", @@ -4046,6 +4047,7 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s% ((options & PCRE2_EXTENDED) != 0)? " extended" : "", ((options & PCRE2_EXTENDED_MORE) != 0)? " extended_more" : "", ((options & PCRE2_FIRSTLINE) != 0)? " firstline" : "", + ((options & PCRE2_LITERAL) != 0)? " literal" : "", ((options & PCRE2_MATCH_UNSET_BACKREF) != 0)? " match_unset_backref" : "", ((options & PCRE2_MULTILINE) != 0)? " multiline" : "", ((options & PCRE2_NEVER_BACKSLASH_C) != 0)? 
" never_backslash_c" : "", @@ -4905,6 +4907,7 @@ uint8_t *p = buffer; unsigned int delimiter = *p++; int errorcode; void *use_pat_context; +uint32_t use_forbid_utf = forbid_utf; PCRE2_SIZE patlen; PCRE2_SIZE valgrind_access_length; PCRE2_SIZE erroroffset; @@ -5263,6 +5266,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0) if ((pat_patctl.control & CTL_POSIX_NOSUB) != 0) cflags |= REG_NOSUB; if ((pat_patctl.options & PCRE2_UCP) != 0) cflags |= REG_UCP; if ((pat_patctl.options & PCRE2_CASELESS) != 0) cflags |= REG_ICASE; + if ((pat_patctl.options & PCRE2_LITERAL) != 0) cflags |= REG_NOSPEC; if ((pat_patctl.options & PCRE2_MULTILINE) != 0) cflags |= REG_NEWLINE; if ((pat_patctl.options & PCRE2_DOTALL) != 0) cflags |= REG_DOTALL; if ((pat_patctl.options & PCRE2_UNGREEDY) != 0) cflags |= REG_UNGREEDY; @@ -5534,6 +5538,11 @@ NULL context. */ use_pat_context = ((pat_patctl.control & CTL_NULLCONTEXT) != 0)? NULL : PTR(pat_context); + +/* If PCRE2_LITERAL is set, set use_forbid_utf zero because PCRE2_NEVER_UTF +and PCRE2_NEVER_UCP are invalid with it. */ + +if ((pat_patctl.options & PCRE2_LITERAL) != 0) use_forbid_utf = 0; /* Compile many times when timing. */ @@ -5545,7 +5554,8 @@ if (timeit > 0) { clock_t start_time = clock(); PCRE2_COMPILE(compiled_code, pbuffer, patlen, - pat_patctl.options|forbid_utf, &errorcode, &erroroffset, use_pat_context); + pat_patctl.options|use_forbid_utf, &errorcode, &erroroffset, + use_pat_context); time_taken += clock() - start_time; if (TEST(compiled_code, !=, NULL)) { SUB1(pcre2_code_free, compiled_code); } @@ -5558,7 +5568,7 @@ if (timeit > 0) /* A final compile that is used "for real". */ -PCRE2_COMPILE(compiled_code, pbuffer, patlen, pat_patctl.options|forbid_utf, +PCRE2_COMPILE(compiled_code, pbuffer, patlen, pat_patctl.options|use_forbid_utf, &errorcode, &erroroffset, use_pat_context); /* Call the JIT compiler if requested. 
When timing, we must free and recompile @@ -5576,7 +5586,7 @@ if (TEST(compiled_code, !=, NULL) && pat_patctl.jit != 0) clock_t start_time; SUB1(pcre2_code_free, compiled_code); PCRE2_COMPILE(compiled_code, pbuffer, patlen, - pat_patctl.options|forbid_utf, &errorcode, &erroroffset, + pat_patctl.options|use_forbid_utf, &errorcode, &erroroffset, use_pat_context); start_time = clock(); PCRE2_JIT_COMPILE(jitrc,compiled_code, pat_patctl.jit); diff --git a/testdata/testinput18 b/testdata/testinput18 index a133532..755a0c9 100644 --- a/testdata/testinput18 +++ b/testdata/testinput18 @@ -129,4 +129,9 @@ /ABC/use_length ABC +/a\b(c/literal,posix + a\\b(c + +/a\b(c/literal,posix,dotall + # End of testdata/testinput18 diff --git a/testdata/testinput2 b/testdata/testinput2 index 20d526c..64640a7 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5292,4 +5292,39 @@ a)"xI # ---------------------------------------------------------------------- +/a\b(c/literal + a\\b(c + +/a\b(c/literal,caseless + a\\b(c + a\\B(c + +/a\b(c/literal,firstline + XYYa\\b(c +\= Expect no match + X\na\\b(c + +/a\b?c/literal,use_offset_limit + XXXXa\\b?c\=offset_limit=5 +\= Expect no match + XXXXa\\b?c\=offset_limit=3 + +/a\b(c/literal,anchored,endanchored + a\\b(c +\= Expect no match + Xa\\b(c + a\\b(cX + Xa\\b(cX + +//literal,extended + +/a\b(c/literal,auto_callout,no_start_optimize + XXXXa\\b(c + +/a\b(c/literal,auto_callout + XXXXa\\b(c + +/(*CR)abc/literal + (*CR)abc + # End of testinput2 diff --git a/testdata/testinput5 b/testdata/testinput5 index dd39bb0..3931b6c 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2024,4 +2024,7 @@ # ---------------------------------------------------------------------- +/Aሴ+B/literal,utf,no_utf_check + Aሴ+B + # End of testinput5 diff --git a/testdata/testoutput18 b/testdata/testoutput18 index b02631c..d51423d 100644 --- a/testdata/testoutput18 +++ b/testdata/testoutput18 @@ -199,4 +199,11 @@ No match: POSIX code 17: match failed ABC 0: ABC 
+/a\b(c/literal,posix + a\\b(c + 0: a\b(c + +/a\b(c/literal,posix,dotall +Failed: POSIX code 16: bad argument at offset 0 + # End of testdata/testinput18 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 4a9dea9..f80bd56 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -16015,6 +16015,72 @@ Failed: error 108 at offset 4: range out of order in character class # ---------------------------------------------------------------------- +/a\b(c/literal + a\\b(c + 0: a\b(c + +/a\b(c/literal,caseless + a\\b(c + 0: a\b(c + a\\B(c + 0: a\B(c + +/a\b(c/literal,firstline + XYYa\\b(c + 0: a\b(c +\= Expect no match + X\na\\b(c +No match + +/a\b?c/literal,use_offset_limit + XXXXa\\b?c\=offset_limit=5 + 0: a\b?c +\= Expect no match + XXXXa\\b?c\=offset_limit=3 +No match + +/a\b(c/literal,anchored,endanchored + a\\b(c + 0: a\b(c +\= Expect no match + Xa\\b(c +No match + a\\b(cX +No match + Xa\\b(cX +No match + +//literal,extended +Failed: error 192 at offset 0: invalid option bits with PCRE2_LITERAL + +/a\b(c/literal,auto_callout,no_start_optimize + XXXXa\\b(c +--->XXXXa\b(c + +0 ^ a + +0 ^ a + +0 ^ a + +0 ^ a + +0 ^ a + +1 ^^ \ + +2 ^ ^ b + +3 ^ ^ ( + +4 ^ ^ + 0: a\b(c + +/a\b(c/literal,auto_callout + XXXXa\\b(c +--->XXXXa\b(c + +0 ^ a + +1 ^^ \ + +2 ^ ^ b + +3 ^ ^ ( + +4 ^ ^ + 0: a\b(c + +/(*CR)abc/literal + (*CR)abc + 0: (*CR)abc + # End of testinput2 Error -65: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data @@ -16024,4 +16090,4 @@ Error 0: PCRE2_ERROR_BADDATA (unknown error number) Error 100: no error Error 101: \ at end of pattern Error 191: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode -Error 192: PCRE2_ERROR_BADDATA (unknown error number) +Error 200: PCRE2_ERROR_BADDATA (unknown error number) diff --git a/testdata/testoutput5 b/testdata/testoutput5 index fd71dd3..619942c 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4600,4 +4600,8 @@ No match # 
---------------------------------------------------------------------- +/Aሴ+B/literal,utf,no_utf_check + Aሴ+B + 0: A\x{1234}+B + # End of testinput5