From c5c788763aade728a655505a3f66702aec740629 Mon Sep 17 00:00:00 2001 From: ph10 Date: Fri, 24 Jul 2015 18:18:05 +0000 Subject: [PATCH] Make EBCDIC [a-z] type ranges Perl compatible. --- ChangeLog | 4 +++ doc/pcre2pattern.3 | 19 +++++++--- src/pcre2_compile.c | 84 ++++++++++++++++++++++++++++++++++++--------- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/ChangeLog b/ChangeLog index c6d3480..6ea8aae 100644 --- a/ChangeLog +++ b/ChangeLog @@ -81,6 +81,10 @@ This bug was discovered by Karl Skomski with the LLVM fuzzer. very pedantic coding infelicities and a buffer overflow while checking a UTF-8 string if the final multi-byte UTF-8 character was truncated. +22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a +class, where both values are literal letters in the same case, omit the +non-letter EBCDIC code points within the range. + Version 10.20 30-June-2015 -------------------------- diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 04325c7..9f0ff4f 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21" +.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -1324,9 +1324,18 @@ sequence other than one that defines a single character appears at a point where a range ending character is expected. For example, [z-\exff] is valid, but [A-\ed] and [A-[:digit:]] are not. .P -Ranges operate in the collating sequence of character values. They can also be -used for characters specified numerically, for example [\e000-\e037]. Ranges -can include any characters that are valid for the current mode. +Ranges normally include all code points between the start and end characters, +inclusive. They can also be used for code points specified numerically, for +example [\e000-\e037]. Ranges can include any characters that are valid for the +current mode. 
+.P +There is a special case in EBCDIC environments for ranges whose end points are +both specified as literal letters in the same case. For compatibility with +Perl, EBCDIC code points within the range that are not letters are omitted. For +example, [h-k] matches only four characters, even though the codes for h and k +are 0x88 and 0x92, a range of 11 code points. However, if the range is +specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points +are included. .P If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to @@ -3367,6 +3376,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 17 July 2015 +Last updated: 24 July 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 08ea585..c25970f 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3323,38 +3323,38 @@ for (; ptr < cb->end_pattern; ptr++) goto FAILED; } break; - + /* Conditional group */ case CHAR_LEFT_PARENTHESIS: if (ptr[3] != CHAR_QUESTION_MARK) /* Not assertion or callout */ - { + { nest_depth++; ptr += 2; - break; + break; } - + /* Must be an assertion or a callout */ - + switch(ptr[4]) { case CHAR_LESS_THAN_SIGN: - if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN) + if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN) goto MISSING_ASSERTION; - /* Fall through */ + /* Fall through */ case CHAR_C: case CHAR_EXCLAMATION_MARK: case CHAR_EQUALS_SIGN: ptr++; break; - + default: - MISSING_ASSERTION: - ptr += 3; /* To improve error message */ + MISSING_ASSERTION: + ptr += 3; /* To improve error message */ errorcode = ERR28; - goto FAILED; - } + goto FAILED; + } break; case CHAR_COLON: @@ -3939,7 +3939,7 @@ for (;; ptr++) { nestptr = ptr + 7; ptr = sub_start_of_word; /* Do not combine these statements; clang's */ - ptr--; /* sanitizer moans about a negative index. 
*/ + ptr--; /* sanitizer moans about a negative index. */ continue; } @@ -3947,7 +3947,7 @@ for (;; ptr++) { nestptr = ptr + 7; ptr = sub_end_of_word; /* Do not combine these statements; clang's */ - ptr--; /* sanitizer moans about a negative index. */ + ptr--; /* sanitizer moans about a negative index. */ continue; } @@ -4046,6 +4046,9 @@ for (;; ptr++) for(;;) { PCRE2_SPTR oldptr; +#ifdef EBCDIC + BOOL range_is_literal = TRUE; +#endif if (c == CHAR_NULL && ptr >= cb->end_pattern) { @@ -4226,7 +4229,13 @@ for (;; ptr++) { escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb); if (*errorcodeptr != 0) goto FAILED; - if (escape == 0) c = ec; /* Escaped single char */ + if (escape == 0) /* Escaped single char */ + { + c = ec; +#ifdef EBCDIC + range_is_literal = FALSE; +#endif + } else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ else if (escape == ESC_N) /* \N is not supported in a class */ { @@ -4430,7 +4439,9 @@ for (;; ptr++) int descape; descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb); if (*errorcodeptr != 0) goto FAILED; - +#ifdef EBCDIC + range_is_literal = FALSE; +#endif /* 0 means a character was put into d; \b is backspace; any other special causes an error. */ @@ -4476,9 +4487,48 @@ for (;; ptr++) if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; + /* In an EBCDIC environment, Perl treats alphabetic ranges specially + because there are holes in the encoding, and simply using the range A-Z + (for example) would include the characters in the holes. This applies + only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ + +#ifdef EBCDIC + if (range_is_literal && + (cb->ctypes[c] & ctype_letter) != 0 && + (cb->ctypes[d] & ctype_letter) != 0 && + (c <= CHAR_z) == (d <= CHAR_z)) + { + uint32_t uc = (c <= CHAR_z)? 
0 : 64; + uint32_t C = c - uc; + uint32_t D = d - uc; + + if (C <= CHAR_i) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, C + uc, + ((D < CHAR_i)? D : CHAR_i) + uc); + C = CHAR_j; + } + + if (C <= D && C <= CHAR_r) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, C + uc, + ((D < CHAR_r)? D : CHAR_r) + uc); + C = CHAR_s; + } + + if (C <= D) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, C + uc, + D + uc); + } + } + else +#endif class_has_8bitchar += add_to_class(classbits, &class_uchardata, options, cb, c, d); - goto CONTINUE_CLASS; /* Go get the next char in the class */ }