Refactor match_ref() and fix UTF-8 caseless bug.

2014-08-27 16:59:56 +00:00 · 2014-08-27 16:59:56 +00:00 · 8c0c55e3e6
commit 8c0c55e3e6
parent 41f673e0c8
4 changed files with 224 additions and 99 deletions
--- a/19
+++ b/19
@ -5,11 +5,14 @@ Version 10.0 xx-xxxx-2014
 -------------------------

 Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
-Changes prior to 10.0 are logged in the ChangeLog file for the old API.
+Changes prior to 10.0 are logged in the ChangeLog file for the old API, up to 
+item 20 for release 8.36.

 The code of the library was heavily revised as part of the new API 
 implementation. Details of each and every modification were not individually 
-logged. In addition to the API changes, the following changes were made:
+logged. In addition to the API changes, the following changes were made. They 
+are either new functionality, or bugs that were fixed after the code had been 
+forked.

 1. The test program, now called pcre2test, was re-specified and almost 
 completely re-written. Its input is not compatible with input for pcretest.
@ -23,4 +26,16 @@ not writing the function calls themselves, it is possible to check the PCRE2
 version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a 
 string such as "yesno".

+4. There are case-equivalent Unicode characters whose encodings use different 
+numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is 
+theoretically possible for this to happen in UTF-16 too.) If a backreference to 
+a group containing one of these characters was greedily repeated, and during 
+the match a backtrack occurred, the subject might be backtracked by the wrong
+number of code units. For example, if /^(\x{23a})\1*(.)/ is matched caselessly 
+(and in UTF-8 mode) against "\x{23a}\x{2c65}\x{2c65}\x{2c65}", group 2 should 
+capture the final character, which is the three bytes E2, B1, and A5 in UTF-8.
+Incorrect backtracking meant that group 2 captured only the last two bytes. 
+This bug has been fixed; the new code is slower, but it is used only when the 
+strings matched by the repetition are not all the same length.
+
 ****
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -122,39 +122,54 @@ ovector length is always a multiple of 3. */
 *          Match a back-reference                *
 *************************************************/

-/* Normally, if a back reference hasn't been set, the length that is passed is
-negative, so the match always fails. However, in JavaScript compatibility mode,
-the length passed is zero. Note that in caseless UTF-8 mode, the number of
-subject bytes matched may be different to the number of reference bytes.
+/* This function is called only when it is known that the offset lies within
+the offsets that have so far been used in the match. Note that in caseless
+UTF-8 mode, the number of subject bytes matched may be different to the number
+of reference bytes. (In theory this could also happen in UTF-16 mode, but it 
+seems unlikely.)

 Arguments:
  offset      index into the offset vector
+  offset_top  top of the used offset vector 
  eptr        pointer into the subject
-  length      length of reference to be matched (number of code units)
  mb          points to match block
  caseless    TRUE if caseless
+  lengthptr   pointer for returning the length matched 

-Returns:      >= 0 the number of subject code units matched
-              -1 no match
-              -2 partial match; always given if at end subject
+Returns:      = 0 successful match; number of code units matched is set
+              < 0 no match
+              > 0 partial match 
 */

 static int
-match_ref(int offset, register PCRE2_SPTR eptr, int length, match_block *mb,
-  BOOL caseless)
+match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr, 
+  match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr)
 {
-PCRE2_SPTR eptr_start = eptr;
-register PCRE2_SPTR p = mb->start_subject + mb->ovector[offset];
 #if defined SUPPORT_UTF
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
 #endif

-/* Always fail if reference not set (unless PCRE2_MATCH_UNSET_BACKREF is set,
-in which case the length is passed as zero). */
+register PCRE2_SPTR p;
+PCRE2_SIZE length;
+PCRE2_SPTR eptr_start = eptr;

-if (length < 0) return -1;
+/* Deal with an unset group. The default is no match, but there is an option to 
+match an empty string. */

-/* Separate the caseless and UTF case for speed. */
+if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
+  {
+  if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
+    {
+    *lengthptr = 0;
+    return 0;      /* Match */
+    }
+  else return -1;  /* No match */
+  }        
+
+/* Separate the caseless and UTF cases for speed. */
+
+p = mb->start_subject + mb->ovector[offset];
+length = mb->ovector[offset+1] - mb->ovector[offset];

 if (caseless)
  {
@ -175,7 +190,7 @@ if (caseless)
      {
      uint32_t c, d;
      const ucd_record *ur;
-      if (eptr >= mb->end_subject) return -2;   /* Partial match */
+      if (eptr >= mb->end_subject) return 1;   /* Partial match */
      GETCHARINC(c, eptr);
      GETCHARINC(d, p);
      ur = GET_UCD(d);
@ -184,7 +199,7 @@ if (caseless)
        const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
        for (;;)
          {
-          if (c < *pp) return -1;
+          if (c < *pp) return -1;  /* No match */
          if (c == *pp++) break;
          }
        }
@ -199,29 +214,31 @@ if (caseless)
    while (length-- > 0)
      {
      uint32_t cc, cp;
-      if (eptr >= mb->end_subject) return -2;   /* Partial match */
+      if (eptr >= mb->end_subject) return 1;   /* Partial match */
      cc = UCHAR21TEST(eptr);
      cp = UCHAR21TEST(p);
-      if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) return -1;
+      if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) 
+        return -1;  /* No match */
      p++;
      eptr++;
      }
    }
  }

-/* In the caseful case, we can just compare the bytes, whether or not we
-are in UTF-8 mode. */
+/* In the caseful case, we can just compare the code units, whether or not we
+are in UTF mode. */

 else
  {
  while (length-- > 0)
    {
-    if (eptr >= mb->end_subject) return -2;   /* Partial match */
-    if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
+    if (eptr >= mb->end_subject) return 1;   /* Partial match */
+    if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;  /*No match */
    }
  }

-return (int)(eptr - eptr_start);
+*lengthptr = eptr - eptr_start;
+return 0;  /* Match */
 }


@ -350,6 +367,7 @@ typedef struct heapframe {
  
  eptrblock *Xeptrb;

+  PCRE2_SIZE Xlength;
  PCRE2_SIZE Xoffset;
  PCRE2_SIZE Xoffset_top;
  PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3;
@ -370,7 +388,6 @@ typedef struct heapframe {
  int Xcodelink;
  int Xctype;
  int Xfi;
-  int Xlength;
  int Xmax;
  int Xmin;
  int Xwhere;    /* Where to jump back to */
@ -425,7 +442,7 @@ Arguments:
  callpat     the recursion point in the pattern
  mstart      pointer to the current match start position (can be modified
                by encountering \K)
-  offset_top  current top pointer
+  offset_top  current top pointer (highest ovector offset used + 1)
  mb          pointer to "static" info block for the match
  eptrb       pointer to chain of blocks containing eptr at start of
                brackets - for testing for empty matches
@ -529,7 +546,7 @@ Arguments:
   ecode       pointer to current position in compiled code
   mstart      pointer to the current match start position (can be modified
                 by encountering \K)
-   offset_top  current top pointer
+   offset_top  current top pointer (highest ovector offset used + 1)
   mb          pointer to "static" info block for the match
   eptrb       pointer to chain of blocks containing eptr at start of
                 brackets - for testing for empty matches
@ -659,6 +676,7 @@ PCRE2_SPTR pp;
 PCRE2_SPTR prev;
 PCRE2_SPTR saved_eptr;

+PCRE2_SIZE length;
 PCRE2_SIZE offset;
 PCRE2_SIZE save_offset1, save_offset2, save_offset3;

@ -676,7 +694,6 @@ PCRE2_UCHAR occhars[6];

 int codelink;
 int ctype;
-int length;
 int max;
 int min;

@ -693,13 +710,13 @@ of the local variables that are used only in localised parts of the code, but
 still need to be preserved over recursive calls of match(). These macros define
 the alternative names that are used. */

-#define allow_zero    cur_is_word
-#define cbegroup      condition
-#define code_offset   codelink
-#define condassert    condition
-#define matched_once  prev_is_word
-#define foc           number
-#define save_mark     data
+#define allow_zero      cur_is_word
+#define cbegroup        condition
+#define code_offset     codelink
+#define condassert      condition
+#define foc             number
+#define matched_once    prev_is_word
+#define save_mark       data

 /* These statements are here to stop the compiler complaining about unitialized
 variables. */
@ -2671,22 +2688,7 @@ for (;;)


    /* Match a back reference, possibly repeatedly. Look past the end of the
-    item to see if there is repeat information following. The code is similar
-    to that for character classes, but repeated for efficiency. Then obey
-    similar code to character type repeats - written out again for speed.
-    However, if the referenced string is the empty string, always treat
-    it as matched, any number of times (otherwise there could be infinite
-    loops). If the reference is unset, there are two possibilities:
-
-    (a) In the default, Perl-compatible state, set the length negative;
-    this ensures that every attempt at a match fails. We can't just fail
-    here, because of the possibility of quantifiers with zero minima.
-
-    (b) If the JavaScript compatibility flag is set, set the length to zero
-    so that the back reference matches an empty string.
-
-    Otherwise, set the length to the length of what was matched by the
-    referenced subpattern.
+    item to see if there is repeat information following.
     
    The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
    or to a non-duplicated named group. For a duplicated named group, OP_DNREF
@ -2701,20 +2703,14 @@ for (;;)
      PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
      ecode += 1 + 2*IMM2_SIZE;

-      /* Setting the default length first and initializing 'offset' avoids
-      compiler warnings in the REF_REPEAT code. */
+      /* Initializing 'offset' avoids a compiler warning in the REF_REPEAT
+      code. */
       
-      length = ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)? 0 : -1;
      offset = 0;
-
      while (count-- > 0)
        {
        offset = GET2(slot, 0) << 1;
-        if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
-          {
-          length = mb->ovector[offset+1] - mb->ovector[offset];
-          break;
-          }
+        if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET) break;
        slot += mb->name_entry_size;
        }
      }
@ -2725,10 +2721,6 @@ for (;;)
    caseless = op == OP_REFI;
    offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
    ecode += 1 + IMM2_SIZE;
-    if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
-      length = ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)? 0 : -1;
-    else
-      length = mb->ovector[offset+1] - mb->ovector[offset];
    
    /* Set up for repetition, or handle the non-repeated case */

@ -2757,25 +2749,35 @@ for (;;)
      ecode += 1 + 2 * IMM2_SIZE;
      break;

-      default:               /* No repeat follows */
-      if ((length = match_ref(offset, eptr, length, mb, caseless)) < 0)
+      default:                  /* No repeat follows */
        { 
-        if (length == -2) eptr = mb->end_subject;   /* Partial match */
-        CHECK_PARTIAL();
-        RRETURN(MATCH_NOMATCH);
+        int rc = match_ref(offset, offset_top, eptr, mb, caseless, &length);
+        if (rc != 0)
+          {
+          if (rc > 0) eptr = mb->end_subject;   /* Partial match */
+          CHECK_PARTIAL();
+          RRETURN(MATCH_NOMATCH);
+          }
        }   
      eptr += length;
      continue;              /* With the main loop */
      }

-    /* Handle repeated back references. If the length of the reference is
-    zero, just continue with the main loop. If the length is negative, it
-    means the reference is unset in non-Java-compatible mode. If the minimum is
+    /* Handle repeated back references. If a set group has length zero, just 
+    continue with the main loop, because it matches however many times. For an 
+    unset reference, in non-match-unset-backref mode, if the minimum is
    zero, we can continue at the same level without recursion. For any other
    minimum, carrying on will result in NOMATCH. */
    
-    if (length == 0) continue;
-    if (length < 0 && min == 0) continue;
+    if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
+      { 
+      if (mb->ovector[offset] == mb->ovector[offset + 1]) continue;
+      }
+    else
+      {
+      if (min == 0 && (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) == 0)
+        continue; 
+      }      

    /* First, ensure the minimum number of matches are present. We get back
    the length of the reference string explicitly rather than passing the
@ -2783,10 +2785,11 @@ for (;;)

    for (i = 1; i <= min; i++)
      {
-      int slength;
-      if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
+      PCRE2_SIZE slength;
+      int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength); 
+      if (rc != 0)
        {
-        if (slength == -2) eptr = mb->end_subject;   /* Partial match */
+        if (rc > 0) eptr = mb->end_subject;   /* Partial match */
        CHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
@ -2804,13 +2807,15 @@ for (;;)
      {
      for (fi = min;; fi++)
        {
-        int slength;
+        int rc; 
+        PCRE2_SIZE slength;
        RMATCH(eptr, ecode, offset_top, mb, eptrb, RM14);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (fi >= max) RRETURN(MATCH_NOMATCH);
-        if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
+        rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
+        if (rc != 0) 
          {
-          if (slength == -2) eptr = mb->end_subject;   /* Partial match */
+          if (rc > 0) eptr = mb->end_subject;   /* Partial match */
          CHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
@ -2819,20 +2824,26 @@ for (;;)
      /* Control never gets here */
      }

-    /* If maximizing, find the longest string and work backwards */
+    /* If maximizing, find the longest string and work backwards, as long as 
+    the matched lengths for each iteration are the same. */

    else
      {
+      BOOL samelengths = TRUE; 
      pp = eptr;
+      length = mb->ovector[offset+1] - mb->ovector[offset];
+
      for (i = min; i < max; i++)
        {
-        int slength;
-        if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
+        PCRE2_SIZE slength;
+        int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
+
+        if (rc != 0) 
          {
          /* Can't use CHECK_PARTIAL because we don't want to update eptr in
          the soft partial matching case. */

-          if (slength == -2 && mb->partial != 0 &&
+          if (rc > 0 && mb->partial != 0 &&
              mb->end_subject > mb->start_used_ptr)
            {
            mb->hitend = TRUE;
@ -2840,15 +2851,49 @@ for (;;)
            }
          break;
          }
+
+        if (slength != length) samelengths = FALSE;
        eptr += slength;
        }

-      while (eptr >= pp)
+      /* If the length matched for each repetition is the same as the length of 
+      the captured group, we can easily work backwards. This is the normal 
+      case. However, in caseless UTF-8 mode there are pairs of case-equivalent 
+      characters whose lengths (in terms of code units) differ. Because this
+      is very rare, we handle it by re-matching fewer and fewer times. */
+      
+      if (samelengths)
        { 
-        RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
-        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-        eptr -= length;
+        while (eptr >= pp)
+          {
+          RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
+          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+          eptr -= length;
+          }
        }
+        
+      /* The rare case of non-matching lengths. Re-scan the repetition for each 
+      iteration. We know that match_ref() will succeed every time. */
+       
+      else
+        {
+        max = i; 
+        for (;;)
+          {
+          RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
+          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+          if (eptr == pp) break;  /* Failed after minimal repetition */
+          eptr = pp;
+          max--; 
+          for (i = min; i < max; i++)
+            {
+            PCRE2_SIZE slength;
+            (void)match_ref(offset, offset_top, eptr, mb, caseless, &slength);
+            eptr += slength;
+            }
+          }
+        }        
+ 
      RRETURN(MATCH_NOMATCH);
      }
    /* Control never gets here */
@ -3223,7 +3268,7 @@ for (;;)
      length = 1;
      ecode++;
      GETCHARLEN(fc, ecode, length);
-      if (length > mb->end_subject - eptr)
+      if (length > (PCRE2_SIZE)(mb->end_subject - eptr))
        {
        CHECK_PARTIAL();             /* Not SCHECK_PARTIAL() */
        RRETURN(MATCH_NOMATCH);
--- a/testdata/testinput4
+++ b/testdata/testinput4
@ -2195,4 +2195,25 @@
 /^s?c/im,utf
    scat
    
+# The next four tests are for repeated caseless back references when the 
+# code unit length of the matched text is different to that of the original
+# group in the UTF-8 case.
+
+/^(\x{23a})\1*(.)/i,utf
+    \x{23a}\x{23a}\x{23a}\x{23a}
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+
+/^(\x{23a})\1*(..)/i,utf
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+
+/^(\x{23a})\1*(...)/i,utf
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+
+/^(\x{23a})\1*(....)/i,utf
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+
 # End of testinput4
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@ -3691,4 +3691,48 @@ No match
    scat
 0: sc
    
+# The next four tests are for repeated caseless back references when the 
+# code unit length of the matched text is different to that of the original
+# group in the UTF-8 case.
+
+/^(\x{23a})\1*(.)/i,utf
+    \x{23a}\x{23a}\x{23a}\x{23a}
+ 0: \x{23a}\x{23a}\x{23a}\x{23a}
+ 1: \x{23a}
+ 2: \x{23a}
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}
+ 2: \x{2c65}
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+ 0: \x{23a}\x{23a}\x{2c65}\x{23a}
+ 1: \x{23a}
+ 2: \x{23a}
+
+/^(\x{23a})\1*(..)/i,utf
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}
+ 2: \x{2c65}\x{2c65}
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+ 0: \x{23a}\x{23a}\x{2c65}\x{23a}
+ 1: \x{23a}
+ 2: \x{2c65}\x{23a}
+
+/^(\x{23a})\1*(...)/i,utf
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}
+ 2: \x{2c65}\x{2c65}\x{2c65}
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+ 0: \x{23a}\x{23a}\x{2c65}\x{23a}
+ 1: \x{23a}
+ 2: \x{23a}\x{2c65}\x{23a}
+
+/^(\x{23a})\1*(....)/i,utf
+    \x{23a}\x{2c65}\x{2c65}\x{2c65}
+No match
+    \x{23a}\x{23a}\x{2c65}\x{23a}
+No match
+
 # End of testinput4