Tidy pcre2demo.c

This commit is contained in:
ph10 2016-02-02 16:25:47 +00:00
parent acc1a9b13c
commit efb2a55e9a
12 changed files with 1116 additions and 1020 deletions

View File

@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
modifier had this effect. That option is now ignored when the POSIX API is in
use.
8. Minor tidies to the pcre2demo.c sample program, including more comments
about its 8-bit-ness.
Version 10.21 12-January-2016
-----------------------------

View File

@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
were followed by ?: but named parentheses can still be used for capturing (and
they acquire numbers in the usual way). There is no equivalent of this option
in Perl.
in Perl. Note that, if this option is set, references to capturing groups (back
references or recursion/subroutine calls) may only refer to named groups,
though the reference can be by name or by number.
<pre>
PCRE2_NO_AUTO_POSSESS
</pre>
@ -3121,9 +3123,9 @@ Cambridge, England.
</P>
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
<P>
Last updated: 16 December 2015
Last updated: 31 January 2016
<br>
Copyright &copy; 1997-2015 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline;
PCRE2_SIZE erroroffset;
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
* if the -g option is present. *
**************************************************************************/
find_all = 0;
for (i = 1; i &lt; argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break;
}
/* After the options, we require exactly two arguments, which are the pattern,
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\n");
printf("Exactly two arguments required: a regex and a subject string\n");
return 1;
}
@ -201,7 +213,7 @@ if (rc &lt; 0)
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
printf("Match succeeded at offset %d\n", (int)ovector[0]);
/*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&amp;namecount); /* where to put the answer */
if (namecount &lt;= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\n");
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
for (;;)
{
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
@ -371,7 +383,7 @@ for (;;)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */
if (crlf_is_newline &amp;&amp; /* If CRLF is a newline &amp; */
start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
subject[start_offset] == '\r' &amp;&amp;
subject[start_offset + 1] == '\n')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
if (namecount &lt;= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n");

View File

@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
<a href="#lookbehind">(described below)</a>
in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind. Neither the alternative matching function
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter.
</P>

View File

@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
expression 8-bit library. See the
<a href="pcre2api.html"><b>pcre2api</b></a>
documentation for a description of PCRE2's native API, which contains much
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit
additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
and 32-bit libraries.
</P>
<P>
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
replacement library. Other POSIX options are not even defined.
</P>
<P>
There are also some other options that are not defined by POSIX. These have
been added at the request of users who want to make use of certain
PCRE2-specific features via the POSIX calling interface.
There are also some options that are not defined by POSIX. These have been
added at the request of users who want to make use of certain PCRE2-specific
features via the POSIX calling interface.
</P>
<P>
When PCRE2 is called via these functions, it is only the API that is POSIX-like
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
<pre>
REG_NOSUB
</pre>
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed
for compilation to the native function. In addition, when a pattern that is
compiled with this flag is passed to <b>regexec()</b> for matching, the
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings
are returned.
When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
because it disables the use of back references.
<pre>
REG_UCP
</pre>
@ -241,14 +241,15 @@ mutually exclusive; the error REG_INVARG is returned.
<P>
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
<b>regexec()</b> are ignored.
<b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
</P>
<P>
If the value of <i>nmatch</i> is zero, or if the value <i>pmatch</i> is NULL,
no data about any matched strings is returned.
The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be
NULL (unless REG_STARTEND is set); in both these cases no data about any
matched strings is returned.
</P>
<P>
Otherwise,the portion of the string that was matched, and also any captured
Otherwise, the portion of the string that was matched, and also any captured
substrings, are returned via the <i>pmatch</i> argument, which points to an
array of <i>nmatch</i> structures of type <i>regmatch_t</i>, containing the
members <i>rm_so</i> and <i>rm_eo</i>. These contain the byte offset to the first
@ -290,9 +291,9 @@ Cambridge, England.
</P>
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
<P>
Last updated: 29 November 2015
Last updated: 31 January 2016
<br>
Copyright &copy; 1997-2015 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of <i>pcre2demo.c</i>.
</P>
<P>
The demonstration program, which uses the PCRE2 8-bit library, compiles the
regular expression that is its first argument, and matches it against the
subject string in its second argument. No PCRE2 options are set, and default
character tables are used. If matching succeeds, the program outputs the
portion of the subject that matched, together with the contents of any captured
substrings.
The demonstration program compiles the regular expression that is its
first argument, and matches it against the subject string in its second
argument. No PCRE2 options are set, and default character tables are used. If
matching succeeds, the program outputs the portion of the subject that matched,
together with the contents of any captured substrings.
</P>
<P>
If the -g option is given on the command line, the program then goes on to
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on.
</P>
<P>
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
</P>
<P>
If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using
this command:
a command like this:
<pre>
gcc -o pcre2demo pcre2demo.c -lpcre2-8
cc -o pcre2demo pcre2demo.c -lpcre2-8
</pre>
If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in
<i>/usr/local</i>, you can compile the demonstration program using a command
like this:
<pre>
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
</PRE>
</P>
<P>
Once you have compiled and linked the demonstration program, you can run simple
tests like this:
cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
</pre>
Once you have built the demonstration program, you can run simple tests like
this:
<pre>
./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
</pre>
Note that there is a much more comprehensive test program, called
<a href="pcre2test.html"><b>pcre2test</b>,</a>
which supports many more facilities for testing regular expressions using the
PCRE2 libraries. The
which supports many more facilities for testing regular expressions using all
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
<a href="pcre2demo.html"><b>pcre2demo</b></a>
program is provided as a simple coding example.
program is provided as a relatively simple coding example.
</P>
<P>
If you try to run
@ -73,7 +77,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris):
<pre>
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
</pre>
This is caused by the way shared library support works on those systems. You
need to add
@ -97,9 +101,9 @@ Cambridge, England.
REVISION
</b><br>
<P>
Last updated: 20 October 2014
Last updated: 02 February 2016
<br>
Copyright &copy; 1997-2014 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -98,10 +98,11 @@ further data is read.
</P>
<P>
For maximum portability, therefore, it is safest to avoid non-printing
characters in <b>pcre2test</b> input files. There is a facility for specifying a
pattern's characters as hexadecimal pairs, thus making it possible to include
binary zeroes in a pattern for testing purposes. Subject lines are processed
for backslash escapes, which makes it possible to include any data value.
characters in <b>pcre2test</b> input files. There is a facility for specifying
some or all of a pattern's characters as hexadecimal pairs, thus making it
possible to include binary zeroes in a pattern for testing purposes. Subject
lines are processed for backslash escapes, which makes it possible to include
any data value.
</P>
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
<P>
@ -559,7 +560,7 @@ about the pattern:
debug same as info,fullbincode
fullbincode show binary code with lengths
/I info show info about compiled pattern
hex pattern is coded in hexadecimal
hex unquoted characters are hexadecimal
jit[=&#60;number&#62;] use JIT
jitfast use JIT fast path
jitverify verify JIT use
@ -570,6 +571,7 @@ about the pattern:
null_context compile with a NULL context
parens_nest_limit=&#60;n&#62; set maximum parentheses depth
posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack
stackguard=&#60;number&#62; test the stackguard feature
tables=[0|1|2] select internal tables
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
default values).
</P>
<br><b>
Specifying a pattern in hex
Specifying pattern characters in hexadecimal
</b><br>
<P>
The <b>hex</b> modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted between
pairs. For example:
The <b>hex</b> modifier specifies that the characters of the pattern, except for
substrings enclosed in single or double quotes, are to be interpreted as pairs
of hexadecimal digits. This feature is provided as a way of creating patterns
that contain binary zeros and other non-printing characters. White space is
permitted between pairs of digits. For example, this pattern contains three
characters:
<pre>
/ab 32 59/hex
</pre>
This feature is provided as a way of creating patterns that contain binary zero
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
actual length of the pattern is passed.
Parts of such a pattern are taken literally if quoted. This pattern contains
nine characters, only two of which are specified in hexadecimal:
<pre>
/ab "literal" 32/hex
</pre>
Either single or double quotes may be used. There is no way of including
the quote character that delimits a substring within that substring.
</P>
<P>
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
patterns specified with the <b>hex</b> modifier, the actual length of the
pattern is passed.
</P>
<br><b>
Generating long repetitive patterns
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
Using the POSIX wrapper API
</b><br>
<P>
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX
wrapper API rather than its native API. This supports only the 8-bit library.
Note that it does not imply POSIX matching semantics; for more detail see the
The <b>/posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
PCRE2 via the POSIX wrapper API rather than its native API. When
<b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
it does not imply POSIX matching semantics; for more detail see the
<a href="pcre2posix.html"><b>pcre2posix</b></a>
documentation. When the POSIX API is being used, the following pattern
modifiers set options for the <b>regcomp()</b> function:
documentation. The following pattern modifiers set options for the
<b>regcomp()</b> function:
<pre>
caseless REG_ICASE
multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard
@ -847,7 +861,8 @@ large buffer is used.
</P>
<P>
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
below. All other modifiers cause an error.
below. All other modifiers are either ignored, with a warning message, or cause
an error.
</P>
<br><b>
Testing the stack guard feature
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any effect
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
Any other modifiers cause an error.
The other modifiers are ignored, with a warning message.
</P>
<br><b>
Setting match controls
@ -1001,7 +1016,10 @@ pattern.
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
zero_terminate pass the subject as zero-terminated
</pre>
The effects of these modifiers are described in the following sections.
The effects of these modifiers are described in the following sections. When
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
and <b>ovector</b> subject modifiers work as described below. All other
modifiers are either ignored, with a warning message, or cause an error.
</P>
<br><b>
Showing more text
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
modifier list containing only
<a href="#controlmodifiers">control modifiers</a>
that act after a pattern has been compiled. In particular, <b>hex</b>,
<b>posix</b>, and <b>push</b> are not allowed, nor are any
<b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
<a href="#optionmodifiers">option-setting modifiers.</a>
The JIT modifiers are, however, permitted. Here is an example that saves and
reloads two patterns.
@ -1660,9 +1678,9 @@ Cambridge, England.
</P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P>
Last updated: 12 December 2015
Last updated: 31 January 2016
<br>
Copyright &copy; 1997-2015 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

File diff suppressed because it is too large Load Diff

View File

@ -20,28 +20,31 @@
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline;
PCRE2_SIZE erroroffset;
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
* if the -g option is present. *
**************************************************************************/
find_all = 0;
for (i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\en", argv[i]);
return 1;
}
else break;
}
/* After the options, we require exactly two arguments, which are the pattern,
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\en");
printf("Exactly two arguments required: a regex and a subject string\en");
return 1;
}
@ -201,7 +213,7 @@ if (rc < 0)
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
printf("Match succeeded at offset %d\en", (int)ovector[0]);
/*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\en"); else
if (namecount == 0) printf("No named substrings\en"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\en");
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
for (;;)
{
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
@ -371,7 +383,7 @@ for (;;)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */
if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\er' &&
subject[start_offset + 1] == '\en')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
}
if (namecount <= 0) printf("No named substrings\en"); else
if (namecount == 0) printf("No named substrings\en"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\en");

View File

@ -1,4 +1,4 @@
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00"
.TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 SAMPLE PROGRAM"
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of \fIpcre2demo.c\fP.
.P
The demonstration program, which uses the PCRE2 8-bit library, compiles the
regular expression that is its first argument, and matches it against the
subject string in its second argument. No PCRE2 options are set, and default
character tables are used. If matching succeeds, the program outputs the
portion of the subject that matched, together with the contents of any captured
substrings.
The demonstration program compiles the regular expression that is its
first argument, and matches it against the subject string in its second
argument. No PCRE2 options are set, and default character tables are used. If
matching succeeds, the program outputs the portion of the subject that matched,
together with the contents of any captured substrings.
.P
If the -g option is given on the command line, the program then goes on to
check for further matches of the same regular expression in the same subject
string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on.
.P
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
.P
If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using
this command:
a command like this:
.sp
gcc -o pcre2demo pcre2demo.c -lpcre2-8
cc -o pcre2demo pcre2demo.c -lpcre2-8
.sp
If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
like this:
.sp
.\" JOINSH
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e
-L/usr/local/lib -lpcre2-8
cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
-L/usr/local/lib -lpcre2-8
.sp
.P
Once you have compiled and linked the demonstration program, you can run simple
tests like this:
Once you have built the demonstration program, you can run simple tests like
this:
.sp
./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
.\" HREF
\fBpcre2test\fP,
.\"
which supports many more facilities for testing regular expressions using the
PCRE2 libraries. The
which supports many more facilities for testing regular expressions using all
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
.\" HREF
\fBpcre2demo\fP
.\"
program is provided as a simple coding example.
program is provided as a relatively simple coding example.
.P
If you try to run
.\" HREF
@ -65,7 +70,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris):
.sp
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
.sp
This is caused by the way shared library support works on those systems. You
need to add
@ -89,6 +94,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 20 October 2014
Copyright (c) 1997-2014 University of Cambridge.
Last updated: 02 February 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi

View File

@ -67,10 +67,10 @@ INPUT ENCODING
For maximum portability, therefore, it is safest to avoid non-printing
characters in pcre2test input files. There is a facility for specifying
a pattern's characters as hexadecimal pairs, thus making it possible to
include binary zeroes in a pattern for testing purposes. Subject lines
are processed for backslash escapes, which makes it possible to include
any data value.
some or all of a pattern's characters as hexadecimal pairs, thus making
it possible to include binary zeroes in a pattern for testing purposes.
Subject lines are processed for backslash escapes, which makes it pos-
sible to include any data value.
COMMAND LINE OPTIONS
@ -505,7 +505,7 @@ PATTERN MODIFIERS
debug same as info,fullbincode
fullbincode show binary code with lengths
/I info show info about compiled pattern
hex pattern is coded in hexadecimal
hex unquoted characters are hexadecimal
jit[=<number>] use JIT
jitfast use JIT fast path
jitverify verify JIT use
@ -516,6 +516,7 @@ PATTERN MODIFIERS
null_context compile with a NULL context
parens_nest_limit=<n> set maximum parentheses depth
posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack
stackguard=<number> test the stackguard feature
tables=[0|1|2] select internal tables
@ -591,59 +592,70 @@ PATTERN MODIFIERS
testing that pcre2_compile() behaves correctly in this case (it uses
default values).
Specifying a pattern in hex
Specifying pattern characters in hexadecimal
The hex modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted
between pairs. For example:
The hex modifier specifies that the characters of the pattern, except
for substrings enclosed in single or double quotes, are to be inter-
preted as pairs of hexadecimal digits. This feature is provided as a
way of creating patterns that contain binary zeros and other non-print-
ing characters. White space is permitted between pairs of digits. For
example, this pattern contains three characters:
/ab 32 59/hex
This feature is provided as a way of creating patterns that contain
binary zero and other non-printing characters. By default, pcre2test
passes patterns as zero-terminated strings to pcre2_compile(), giving
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
hexadecimal, the actual length of the pattern is passed.
Parts of such a pattern are taken literally if quoted. This pattern
contains nine characters, only two of which are specified in hexadeci-
mal:
/ab "literal" 32/hex
Either single or double quotes may be used. There is no way of includ-
ing the delimiter within a substring.
By default, pcre2test passes patterns as zero-terminated strings to
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
for patterns specified with the hex modifier, the actual length of the
pattern is passed.
Generating long repetitive patterns
Some tests use long patterns that are very repetitive. Instead of cre-
ating a very long input line for such a pattern, you can use a special
repetition feature, similar to the one described for subject lines
above. If the expand modifier is present on a pattern, parts of the
Some tests use long patterns that are very repetitive. Instead of cre-
ating a very long input line for such a pattern, you can use a special
repetition feature, similar to the one described for subject lines
above. If the expand modifier is present on a pattern, parts of the
pattern that have the form
\[<characters>]{<count>}
are expanded before the pattern is passed to pcre2_compile(). For exam-
ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction
cannot be nested. An initial "\[" sequence is recognized only if "]{"
followed by decimal digits and "}" is found later in the pattern. If
cannot be nested. An initial "\[" sequence is recognized only if "]{"
followed by decimal digits and "}" is found later in the pattern. If
not, the characters remain in the pattern unaltered.
If part of an expanded pattern looks like an expansion, but is really
If part of an expanded pattern looks like an expansion, but is really
part of the actual pattern, unwanted expansion can be avoided by giving
two values in the quantifier. For example, \[AB]{6000,6000} is not rec-
ognized as an expansion item.
If the info modifier is set on an expanded pattern, the result of the
If the info modifier is set on an expanded pattern, the result of the
expansion is included in the information that is output.
JIT compilation
Just-in-time (JIT) compiling is a heavyweight optimization that can
greatly speed up pattern matching. See the pcre2jit documentation for
details. JIT compiling happens, optionally, after a pattern has been
successfully compiled into an internal form. The JIT compiler converts
Just-in-time (JIT) compiling is a heavyweight optimization that can
greatly speed up pattern matching. See the pcre2jit documentation for
details. JIT compiling happens, optionally, after a pattern has been
successfully compiled into an internal form. The JIT compiler converts
this to optimized machine code. It needs to know whether the match-time
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
because different code is generated for the different cases. See the
partial modifier in "Subject Modifiers" below for details of how these
because different code is generated for the different cases. See the
partial modifier in "Subject Modifiers" below for details of how these
options are specified for each match attempt.
JIT compilation is requested by the /jit pattern modifier, which may
JIT compilation is requested by the /jit pattern modifier, which may
optionally be followed by an equals sign and a number in the range 0 to
7. The three bits that make up the number specify which of the three
7. The three bits that make up the number specify which of the three
JIT operating modes are to be compiled:
1 compile JIT code for non-partial matching
@ -660,31 +672,31 @@ PATTERN MODIFIERS
6 soft and hard partial matching only
7 all three modes
If no number is given, 7 is assumed. The phrase "partial matching"
If no number is given, 7 is assumed. The phrase "partial matching"
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
plete match; the options enable the possibility of a partial match, but
do not require it. Note also that if you request JIT compilation only
for partial matching (for example, /jit=2) but do not set the partial
modifier on a subject line, that match will not use JIT code because
do not require it. Note also that if you request JIT compilation only
for partial matching (for example, /jit=2) but do not set the partial
modifier on a subject line, that match will not use JIT code because
none was compiled for non-partial matching.
If JIT compilation is successful, the compiled JIT code will automati-
cally be used when an appropriate type of match is run, except when
incompatible run-time options are specified. For more details, see the
pcre2jit documentation. See also the jitstack modifier below for a way
If JIT compilation is successful, the compiled JIT code will automati-
cally be used when an appropriate type of match is run, except when
incompatible run-time options are specified. For more details, see the
pcre2jit documentation. See also the jitstack modifier below for a way
of setting the size of the JIT stack.
If the jitfast modifier is specified, matching is done using the JIT
"fast path" interface, pcre2_jit_match(), which skips some of the san-
ity checks that are done by pcre2_match(), and of course does not work
when JIT is not supported. If jitfast is specified without jit, jit=7
If the jitfast modifier is specified, matching is done using the JIT
"fast path" interface, pcre2_jit_match(), which skips some of the san-
ity checks that are done by pcre2_match(), and of course does not work
when JIT is not supported. If jitfast is specified without jit, jit=7
is assumed.
If the jitverify modifier is specified, information about the compiled
pattern shows whether JIT compilation was or was not successful. If
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
tion is successful when jitverify is set, the text "(JIT)" is added to
If the jitverify modifier is specified, information about the compiled
pattern shows whether JIT compilation was or was not successful. If
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
tion is successful when jitverify is set, the text "(JIT)" is added to
the first output line after a match or non-match when JIT-compiled code
was actually used in the match.
@ -695,18 +707,18 @@ PATTERN MODIFIERS
/pattern/locale=fr_FR
The given locale is set, pcre2_maketables() is called to build a set of
character tables for the locale, and this is then passed to pcre2_com-
pile() when compiling the regular expression. The same tables are used
character tables for the locale, and this is then passed to pcre2_com-
pile() when compiling the regular expression. The same tables are used
when matching the following subject lines. The /locale modifier applies
only to the pattern on which it appears, but can be given in a #pattern
command if a default is needed. Setting a locale and alternate charac-
command if a default is needed. Setting a locale and alternate charac-
ter tables are mutually exclusive.
Showing pattern memory
The /memory modifier causes the size in bytes of the memory used to
hold the compiled pattern to be output. This does not include the size
of the pcre2_code block; it is just the actual compiled data. If the
The /memory modifier causes the size in bytes of the memory used to
hold the compiled pattern to be output. This does not include the size
of the pcre2_code block; it is just the actual compiled data. If the
pattern is subsequently passed to the JIT compiler, the size of the JIT
compiled code is also output. Here is an example:
@ -717,31 +729,31 @@ PATTERN MODIFIERS
Limiting nested parentheses
The parens_nest_limit modifier sets a limit on the depth of nested
parentheses in a pattern. Breaching the limit causes a compilation
error. The default for the library is set when PCRE2 is built, but
pcre2test sets its own default of 220, which is required for running
The parens_nest_limit modifier sets a limit on the depth of nested
parentheses in a pattern. Breaching the limit causes a compilation
error. The default for the library is set when PCRE2 is built, but
pcre2test sets its own default of 220, which is required for running
the standard test suite.
Limiting the pattern length
The max_pattern_length modifier sets a limit, in code units, to the
The max_pattern_length modifier sets a limit, in code units, to the
length of pattern that pcre2_compile() will accept. Breaching the limit
causes a compilation error. The default is the largest number a
causes a compilation error. The default is the largest number a
PCRE2_SIZE variable can hold (essentially unlimited).
Using the POSIX wrapper API
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
per API rather than its native API. This supports only the 8-bit
library. Note that it does not imply POSIX matching semantics; for
more detail see the pcre2posix documentation. When the POSIX API is
being used, the following pattern modifiers set options for the reg-
comp() function:
The /posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
the POSIX wrapper API rather than its native API. When posix_nosub is
used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
wrapper supports only the 8-bit library. Note that it does not imply
POSIX matching semantics; for more detail see the pcre2posix documenta-
tion. The following pattern modifiers set options for the regcomp()
function:
caseless REG_ICASE
multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard
@ -758,23 +770,24 @@ PATTERN MODIFIERS
been set, a large buffer is used.
The aftertext and allaftertext subject modifiers work as described
below. All other modifiers cause an error.
below. All other modifiers are either ignored, with a warning message,
or cause an error.
Testing the stack guard feature
The /stackguard modifier is used to test the use of pcre2_set_com-
pile_recursion_guard(), a function that is provided to enable stack
availability to be checked during compilation (see the pcre2api docu-
mentation for details). If the number specified by the modifier is
The /stackguard modifier is used to test the use of pcre2_set_com-
pile_recursion_guard(), a function that is provided to enable stack
availability to be checked during compilation (see the pcre2api docu-
mentation for details). If the number specified by the modifier is
greater than zero, pcre2_set_compile_recursion_guard() is called to set
up callback from pcre2_compile() to a local function. The argument it
receives is the current nesting parenthesis depth; if this is greater
up a callback from pcre2_compile() to a local function. The argument it
receives is the current nesting parenthesis depth; if this is greater
than the value given by the modifier, non-zero is returned, causing the
compilation to be aborted.
Using alternative character tables
The value specified for the /tables modifier must be one of the digits
The value specified for the /tables modifier must be one of the digits
0, 1, or 2. It causes a specific set of built-in character tables to be
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
haviour with different character tables. The digit specifies the tables
@ -785,15 +798,15 @@ PATTERN MODIFIERS
pcre2_chartables.c.dist
2 a set of tables defining ISO 8859 characters
In table 2, some characters whose codes are greater than 128 are iden-
tified as letters, digits, spaces, etc. Setting alternate character
In table 2, some characters whose codes are greater than 128 are iden-
tified as letters, digits, spaces, etc. Setting alternate character
tables and a locale are mutually exclusive.
Setting certain match controls
The following modifiers are really subject modifiers, and are described
below. However, they may be included in a pattern's modifier list, in
which case they are applied to every subject line that is processed
below. However, they may be included in a pattern's modifier list, in
which case they are applied to every subject line that is processed
with that pattern. They may not appear in #pattern commands. These mod-
ifiers do not affect the compilation process.
@ -810,20 +823,20 @@ PATTERN MODIFIERS
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
These modifiers may not appear in a #pattern command. If you want them
These modifiers may not appear in a #pattern command. If you want them
as defaults, set them in a #subject command.
Saving a compiled pattern
When a pattern with the push modifier is successfully compiled, it is
pushed onto a stack of compiled patterns, and pcre2test expects the
next line to contain a new pattern (or a command) instead of a subject
When a pattern with the push modifier is successfully compiled, it is
pushed onto a stack of compiled patterns, and pcre2test expects the
next line to contain a new pattern (or a command) instead of a subject
line. This facility is used when saving compiled patterns to a file, as
described in the section entitled "Saving and restoring compiled pat-
described in the section entitled "Saving and restoring compiled pat-
terns" below. The push modifier is incompatible with compilation modi-
fiers such as global that act at match time. Any that are specified are
ignored, with a warning message, except for replace, which causes an
error. Note that, jitverify, which is allowed, does not carry through
ignored, with a warning message, except for replace, which causes an
error. Note that jitverify, which is allowed, does not carry through
to any subsequent matching that uses this pattern.
@ -834,7 +847,7 @@ SUBJECT MODIFIERS
Setting match options
The following modifiers set options for pcre2_match() or
The following modifiers set options for pcre2_match() or
pcre2_dfa_match(). See pcre2api for a description of their effects.
anchored set PCRE2_ANCHORED
@ -848,20 +861,20 @@ SUBJECT MODIFIERS
partial_hard (or ph) set PCRE2_PARTIAL_HARD
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
The partial matching modifiers are provided with abbreviations because
The partial matching modifiers are provided with abbreviations because
they appear frequently in tests.
If the /posix modifier was present on the pattern, causing the POSIX
If the /posix or posix_nosub modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
Any other modifiers cause an error.
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
The other modifiers are ignored, with a warning message.
Setting match controls
The following modifiers affect the matching process or request addi-
tional information. Some of them may also be specified on a pattern
line (see above), in which case they apply to every subject line that
The following modifiers affect the matching process or request addi-
tional information. Some of them may also be specified on a pattern
line (see above), in which case they apply to every subject line that
is matched against that pattern.
aftertext show text after match
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
zero_terminate pass the subject as zero-terminated
The effects of these modifiers are described in the following sections.
When matching via the POSIX wrapper API, the aftertext, allaftertext,
and ovector subject modifiers work as described below. All other modi-
fiers are either ignored, with a warning message, or cause an error.
Showing more text
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
matched with the pattern, terminated as usual by an empty line or end
of file. This command may be followed by a modifier list containing
only control modifiers that act after a pattern has been compiled. In
particular, hex, posix, and push are not allowed, nor are any option-
setting modifiers. The JIT modifiers are, however permitted. Here is
an example that saves and reloads two patterns.
particular, hex, posix, posix_nosub, and push are not allowed, nor are
any option-setting modifiers. The JIT modifiers are, however, permit-
ted. Here is an example that saves and reloads two patterns.
/abc/push
/xyz/push
@ -1505,5 +1521,5 @@ AUTHOR
REVISION
Last updated: 12 December 2015
Copyright (c) 1997-2015 University of Cambridge.
Last updated: 31 January 2016
Copyright (c) 1997-2016 University of Cambridge.

View File

@ -3,28 +3,31 @@
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -39,9 +42,14 @@ the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8
@ -62,19 +70,19 @@ int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline;
PCRE2_SIZE erroroffset;
@ -89,15 +97,19 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
* if the -g option is present. *
**************************************************************************/
find_all = 0;
for (i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break;
}
/* After the options, we require exactly two arguments, which are the pattern,
@ -105,7 +117,7 @@ and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\n");
printf("Exactly two arguments required: a regex and a subject string\n");
return 1;
}
@ -184,7 +196,7 @@ if (rc < 0)
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
printf("Match succeeded at offset %d\n", (int)ovector[0]);
/*************************************************************************
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\n");
@ -313,8 +325,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
for (;;)
{
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
@ -354,7 +366,7 @@ for (;;)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */
if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\r' &&
subject[start_offset + 1] == '\n')
@ -400,7 +412,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
if (namecount <= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n");