Make EBCDIC [a-z] type ranges Perl compatible.
This commit is contained in:
parent
98b389e1fd
commit
c5c788763a
@ -81,6 +81,10 @@ This bug was discovered by Karl Skomski with the LLVM fuzzer.
|
||||
very pedantic coding infelicities and a buffer overflow while checking a UTF-8
|
||||
string if the final multi-byte UTF-8 character was truncated.
|
||||
|
||||
22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
|
||||
class, where both values are literal letters in the same case, omit the
|
||||
non-letter EBCDIC code points within the range.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
@ -1,4 +1,4 @@
|
||||
.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
@ -1324,9 +1324,18 @@ sequence other than one that defines a single character appears at a point
|
||||
where a range ending character is expected. For example, [z-\exff] is valid,
|
||||
but [A-\ed] and [A-[:digit:]] are not.
|
||||
.P
|
||||
Ranges operate in the collating sequence of character values. They can also be
|
||||
used for characters specified numerically, for example [\e000-\e037]. Ranges
|
||||
can include any characters that are valid for the current mode.
|
||||
Ranges normally include all code points between the start and end characters,
|
||||
inclusive. They can also be used for code points specified numerically, for
|
||||
example [\e000-\e037]. Ranges can include any characters that are valid for the
|
||||
current mode.
|
||||
.P
|
||||
There is a special case in EBCDIC environments for ranges whose end points are
|
||||
both specified as literal letters in the same case. For compatibility with
|
||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
|
||||
are included.
|
||||
.P
|
||||
If a range that includes letters is used when caseless matching is set, it
|
||||
matches the letters in either case. For example, [W-c] is equivalent to
|
||||
@ -3367,6 +3376,6 @@ Cambridge, England.
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 July 2015
|
||||
Last updated: 24 July 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
@ -3323,38 +3323,38 @@ for (; ptr < cb->end_pattern; ptr++)
|
||||
goto FAILED;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
/* Conditional group */
|
||||
|
||||
case CHAR_LEFT_PARENTHESIS:
|
||||
if (ptr[3] != CHAR_QUESTION_MARK) /* Not assertion or callout */
|
||||
{
|
||||
{
|
||||
nest_depth++;
|
||||
ptr += 2;
|
||||
break;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/* Must be an assertion or a callout */
|
||||
|
||||
|
||||
switch(ptr[4])
|
||||
{
|
||||
case CHAR_LESS_THAN_SIGN:
|
||||
if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
|
||||
if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
|
||||
goto MISSING_ASSERTION;
|
||||
/* Fall through */
|
||||
/* Fall through */
|
||||
|
||||
case CHAR_C:
|
||||
case CHAR_EXCLAMATION_MARK:
|
||||
case CHAR_EQUALS_SIGN:
|
||||
ptr++;
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
MISSING_ASSERTION:
|
||||
ptr += 3; /* To improve error message */
|
||||
MISSING_ASSERTION:
|
||||
ptr += 3; /* To improve error message */
|
||||
errorcode = ERR28;
|
||||
goto FAILED;
|
||||
}
|
||||
goto FAILED;
|
||||
}
|
||||
break;
|
||||
|
||||
case CHAR_COLON:
|
||||
@ -3939,7 +3939,7 @@ for (;; ptr++)
|
||||
{
|
||||
nestptr = ptr + 7;
|
||||
ptr = sub_start_of_word; /* Do not combine these statements; clang's */
|
||||
ptr--; /* sanitizer moans about a negative index. */
|
||||
ptr--; /* sanitizer moans about a negative index. */
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3947,7 +3947,7 @@ for (;; ptr++)
|
||||
{
|
||||
nestptr = ptr + 7;
|
||||
ptr = sub_end_of_word; /* Do not combine these statements; clang's */
|
||||
ptr--; /* sanitizer moans about a negative index. */
|
||||
ptr--; /* sanitizer moans about a negative index. */
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -4046,6 +4046,9 @@ for (;; ptr++)
|
||||
for(;;)
|
||||
{
|
||||
PCRE2_SPTR oldptr;
|
||||
#ifdef EBCDIC
|
||||
BOOL range_is_literal = TRUE;
|
||||
#endif
|
||||
|
||||
if (c == CHAR_NULL && ptr >= cb->end_pattern)
|
||||
{
|
||||
@ -4226,7 +4229,13 @@ for (;; ptr++)
|
||||
{
|
||||
escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
|
||||
if (*errorcodeptr != 0) goto FAILED;
|
||||
if (escape == 0) c = ec; /* Escaped single char */
|
||||
if (escape == 0) /* Escaped single char */
|
||||
{
|
||||
c = ec;
|
||||
#ifdef EBCDIC
|
||||
range_is_literal = FALSE;
|
||||
#endif
|
||||
}
|
||||
else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
|
||||
else if (escape == ESC_N) /* \N is not supported in a class */
|
||||
{
|
||||
@ -4430,7 +4439,9 @@ for (;; ptr++)
|
||||
int descape;
|
||||
descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
|
||||
if (*errorcodeptr != 0) goto FAILED;
|
||||
|
||||
#ifdef EBCDIC
|
||||
range_is_literal = FALSE;
|
||||
#endif
|
||||
/* 0 means a character was put into d; \b is backspace; any other
|
||||
special causes an error. */
|
||||
|
||||
@ -4476,9 +4487,48 @@ for (;; ptr++)
|
||||
|
||||
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
|
||||
|
||||
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
|
||||
because there are holes in the encoding, and simply using the range A-Z
|
||||
(for example) would include the characters in the holes. This applies
|
||||
only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
|
||||
|
||||
#ifdef EBCDIC
|
||||
if (range_is_literal &&
|
||||
(cb->ctypes[c] & ctype_letter) != 0 &&
|
||||
(cb->ctypes[d] & ctype_letter) != 0 &&
|
||||
(c <= CHAR_z) == (d <= CHAR_z))
|
||||
{
|
||||
uint32_t uc = (c <= CHAR_z)? 0 : 64;
|
||||
uint32_t C = c - uc;
|
||||
uint32_t D = d - uc;
|
||||
|
||||
if (C <= CHAR_i)
|
||||
{
|
||||
class_has_8bitchar +=
|
||||
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
|
||||
((D < CHAR_i)? D : CHAR_i) + uc);
|
||||
C = CHAR_j;
|
||||
}
|
||||
|
||||
if (C <= D && C <= CHAR_r)
|
||||
{
|
||||
class_has_8bitchar +=
|
||||
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
|
||||
((D < CHAR_r)? D : CHAR_r) + uc);
|
||||
C = CHAR_s;
|
||||
}
|
||||
|
||||
if (C <= D)
|
||||
{
|
||||
class_has_8bitchar +=
|
||||
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
|
||||
D + uc);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
class_has_8bitchar +=
|
||||
add_to_class(classbits, &class_uchardata, options, cb, c, d);
|
||||
|
||||
goto CONTINUE_CLASS; /* Go get the next char in the class */
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user