mirror of
https://github.com/sigmasternchen/php-doc-en
synced 2025-03-16 17:08:54 +00:00

removing the others git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@78562 c90b9560-bf6c-de11-be94-00142212c4b1
2629 lines
99 KiB
XML
2629 lines
99 KiB
XML
<!-- D O N O T E D I T T H I S F I L E ! ! !
|
|
|
|
it is still here for historical reasons only
|
|
(as translators may need to check old revision diffs)
|
|
|
|
if you want to change things documented in this file
|
|
you should now edit the files found under en/reference
|
|
instead -->
|
|
|
|
<?xml version="1.0" encoding="iso-8859-1"?>
|
|
<!-- $Revision: 1.79 $ -->
|
|
<reference id="ref.pcre">
|
|
<title>Regular Expression Functions (Perl-Compatible)</title>
|
|
<titleabbrev>PCRE</titleabbrev>
|
|
|
|
<partintro>
|
|
<para id="pcre.intro">
|
|
The syntax for patterns used in these functions closely resembles
|
|
Perl. The expression should be enclosed in the delimiters, a
|
|
forward slash (/), for example. Any character can be used as a
|
|
delimiter as long as it's not alphanumeric or backslash (\). If
|
|
the delimiter character has to be used in the expression itself,
|
|
it needs to be escaped by backslash. Since PHP 4.0.4, you can also use
|
|
Perl-style (), {}, [], and &lt;&gt; matching delimiters.
|
|
</para>
|
|
<para>
|
|
The ending delimiter may be followed by various modifiers that
|
|
affect the matching.
|
|
See <link linkend="pcre.pattern.modifiers">Pattern Modifiers</link>.
|
|
</para>
|
|
<para>
|
|
PHP also supports regular expressions using a POSIX-extended syntax
|
|
using the <link linkend="ref.regex">POSIX-extended regex functions</link>.
|
|
</para>
|
|
|
|
<section id="pcre.requirements">
|
|
<title>Requirements</title>
|
|
<para>
|
|
Regular expression support is provided by the PCRE library
|
|
package, which is open source software, written by Philip Hazel,
|
|
and copyright by the University of Cambridge, England. It is
|
|
available at <ulink url="&url.pcre;">&url.pcre;</ulink>.
|
|
</para>
|
|
</section>
|
|
|
|
<section id="pcre.installation">
|
|
<title>Installation</title>
|
|
<para>
|
|
Beginning with PHP 4.2.0 these functions are enabled by default.
|
|
For older versions you have to configure and compile PHP
|
|
with <option role="configure">--with-pcre-regex[=DIR]</option> in order
|
|
to use these functions. You can disable the pcre functions with <option
|
|
role="configure">--without-pcre-regex</option>.
|
|
</para>
|
|
</section>
|
|
|
|
<section id="pcre.configuration">
|
|
<title>Runtime Configuration</title>
|
|
&no.config;
|
|
</section>
|
|
|
|
<section id="pcre.resources">
|
|
<title>Resource types</title>
|
|
&no.resource;
|
|
</section>
|
|
|
|
<section id="pcre.constants">
|
|
<title>Predefined constants</title>
|
|
<para>
|
|
<constant>PREG_PATTERN_ORDER</constant>
|
|
<constant>PREG_SET_ORDER</constant>
|
|
<constant>PREG_SPLIT_NO_EMPTY</constant>
|
|
<constant>PREG_SPLIT_DELIM_CAPTURE</constant>
|
|
</para>
|
|
</section>
|
|
|
|
<section id="pcre.examples">
|
|
<title>Examples</title>
|
|
<para>
|
|
<example>
|
|
<title>Examples of valid patterns</title>
|
|
<itemizedlist>
|
|
<listitem><simpara>/&lt;\/\w+&gt;/</simpara></listitem>
|
|
<listitem><simpara>|(\d{3})-\d+|Sm</simpara></listitem>
|
|
<listitem><simpara>/^(?i)php[34]/</simpara></listitem>
|
|
<listitem><simpara>{^\s+(\s+)?$}</simpara></listitem>
|
|
</itemizedlist>
|
|
</example>
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Examples of invalid patterns</title>
|
|
<itemizedlist>
|
|
<listitem>
|
|
<simpara>
|
|
/href='(.*)' - missing ending delimiter
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
/\w+\s*\w+/J - unknown modifier 'J'
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
1-\d3-\d3-\d4| - missing starting delimiter
|
|
</simpara>
|
|
</listitem>
|
|
</itemizedlist>
|
|
</example>
|
|
</para>
|
|
</section>
|
|
|
|
</partintro>
|
|
|
|
<refentry id="function.preg-match">
|
|
<refnamediv>
|
|
<refname>preg_match</refname>
|
|
<refpurpose>Perform a regular expression match</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>int</type><methodname>preg_match</methodname>
|
|
<methodparam><type>string</type><parameter>pattern</parameter></methodparam>
|
|
<methodparam><type>string</type><parameter>subject</parameter></methodparam>
|
|
<methodparam choice="opt"><type>array</type><parameter>matches</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<para>
|
|
Searches <parameter>subject</parameter> for a match to the regular
|
|
expression given in <parameter>pattern</parameter>.
|
|
</para>
|
|
<para>
|
|
If <parameter>matches</parameter> is provided, then it is filled
|
|
with the results of search. $matches[0] will contain the text that
|
|
matched the full pattern, $matches[1] will have the text that matched
|
|
the first captured parenthesized subpattern, and so on.
|
|
</para>
|
|
<para>
|
|
Returns &true; if a match for <parameter>pattern</parameter> was
|
|
found in the subject string, or &false; if no match was found
|
|
or an error occurred.
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Find the string of text "php"</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// the "i" after the pattern delimiter indicates a case-insensitive search
|
|
if (preg_match ("/php/i", "PHP is the web scripting language of choice.")) {
|
|
print "A match was found.";
|
|
} else {
|
|
print "A match was not found.";
|
|
}
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
<example>
|
|
<title>Find the word "web"</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// the \b in the pattern indicates a word boundary, so only the distinct
|
|
// word "web" is matched, and not a word partial like "webbing" or "cobweb"
|
|
if (preg_match ("/\bweb\b/i", "PHP is the web scripting language of choice.")) {
|
|
print "A match was found.";
|
|
} else {
|
|
print "A match was not found.";
|
|
}
|
|
if (preg_match ("/\bweb\b/i", "PHP is the website scripting language of choice.")) {
|
|
print "A match was found.";
|
|
} else {
|
|
print "A match was not found.";
|
|
}
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
<example>
|
|
<title>Getting the domain name out of a URL</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// get host name from URL
|
|
preg_match("/^(http:\/\/)?([^\/]+)/i",
|
|
"http://www.php.net/index.html", $matches);
|
|
$host = $matches[2];
|
|
// get last two segments of host name
|
|
preg_match("/[^\.\/]+\.[^\.\/]+$/",$host,$matches);
|
|
echo "domain name is: ".$matches[0]."\n";
|
|
]]>
|
|
</programlisting>
|
|
<para>
|
|
This example will produce:
|
|
<screen>
|
|
<![CDATA[
|
|
domain name is: php.net
|
|
]]>
|
|
</screen>
|
|
</para>
|
|
</example>
|
|
See also <function>preg_match_all</function>,
|
|
<function>preg_replace</function>, and
|
|
<function>preg_split</function>.
|
|
</para>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="function.preg-match-all">
|
|
<refnamediv>
|
|
<refname>preg_match_all</refname>
|
|
<refpurpose>Perform a global regular expression match</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>int</type><methodname>preg_match_all</methodname>
|
|
<methodparam><type>string</type><parameter>pattern</parameter></methodparam>
|
|
<methodparam><type>string</type><parameter>subject</parameter></methodparam>
|
|
<methodparam><type>array</type><parameter>matches</parameter></methodparam>
|
|
<methodparam choice="opt"><type>int</type><parameter>order</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<para>
|
|
Searches <parameter>subject</parameter> for all matches to the regular
|
|
expression given in <parameter>pattern</parameter> and puts them in
|
|
<parameter>matches</parameter> in the order specified by
|
|
<parameter>order</parameter>.
|
|
</para>
|
|
<para>
|
|
After the first match is found, the subsequent searches are continued
|
|
on from end of the last match.
|
|
</para>
|
|
<para>
|
|
<parameter>order</parameter> can be one of two things:
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>PREG_PATTERN_ORDER</term>
|
|
<listitem>
|
|
<para>
|
|
Orders results so that $matches[0] is an array of full
|
|
pattern matches, $matches[1] is an array of strings matched by
|
|
the first parenthesized subpattern, and so on.
|
|
<informalexample>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
preg_match_all ("|<[^>]+>(.*)</[^>]+>|U",
|
|
"<b>example: </b><div align=left>this is a test</div>",
|
|
$out, PREG_PATTERN_ORDER);
|
|
print $out[0][0].", ".$out[0][1]."\n";
|
|
print $out[1][0].", ".$out[1][1]."\n";
|
|
]]>
|
|
</programlisting>
|
|
<para>
|
|
This example will produce:
|
|
<screen>
|
|
<![CDATA[
|
|
<b>example: </b>, <div align=left>this is a test</div>
|
|
example: , this is a test
|
|
]]>
|
|
</screen>
|
|
So, $out[0] contains array of strings that matched full pattern,
|
|
and $out[1] contains array of strings enclosed by tags.
|
|
</para>
|
|
</informalexample>
|
|
</para>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>PREG_SET_ORDER</term>
|
|
<listitem>
|
|
<para>
|
|
Orders results so that $matches[0] is an array of first set
|
|
of matches, $matches[1] is an array of second set of matches,
|
|
and so on.
|
|
<informalexample>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
preg_match_all ("|<[^>]+>(.*)</[^>]+>|U",
|
|
"<b>example: </b><div align=left>this is a test</div>",
|
|
$out, PREG_SET_ORDER);
|
|
print $out[0][0].", ".$out[0][1]."\n";
|
|
print $out[1][0].", ".$out[1][1]."\n";
|
|
]]>
|
|
</programlisting>
|
|
</informalexample>
|
|
This example will produce:
|
|
<informalexample>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
<b>example: </b>, example:
|
|
<div align=left>this is a test</div>, this is a test
|
|
]]>
|
|
</programlisting>
|
|
</informalexample>
|
|
In this case, $matches[0] is the first set of matches, and
|
|
$matches[0][0] has text matched by full pattern, $matches[0][1]
|
|
has text matched by first subpattern and so on. Similarly,
|
|
$matches[1] is the second set of matches, etc.
|
|
</para>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</para>
|
|
<para>
|
|
If <parameter>order</parameter> is not specified, it is assumed
|
|
to be PREG_PATTERN_ORDER.
|
|
</para>
|
|
<para>
|
|
Returns the number of full pattern matches, or &false; if
|
|
no match is found or an error occurred.
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Getting all phone numbers out of some text.</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
preg_match_all ("/\(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{4}/x",
|
|
"Call 555-1212 or 1-800-555-1212", $phones);
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Find matching HTML tags (greedy)</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// The \\2 is an example of backreferencing. This tells pcre that
|
|
// it must match the second set of parentheses in the regular expression
|
|
// itself, which would be the ([\w]+) in this case. The extra backslash is
|
|
// required because the string is in double quotes.
|
|
$html = "<b>bold text</b><a href=howdy.html>click me</a>";
|
|
|
|
preg_match_all ("/(<([\w]+)[^>]*>)(.*)(<\/\\2>)/", $html, $matches);
|
|
|
|
for ($i=0; $i< count($matches[0]); $i++) {
|
|
echo "matched: ".$matches[0][$i]."\n";
|
|
echo "part 1: ".$matches[1][$i]."\n";
|
|
echo "part 2: ".$matches[3][$i]."\n";
|
|
echo "part 3: ".$matches[4][$i]."\n\n";
|
|
}
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
This example will produce:
|
|
<programlisting>
|
|
<![CDATA[
|
|
matched: <b>bold text</b>
|
|
part 1: <b>
|
|
part 2: bold text
|
|
part 3: </b>
|
|
|
|
matched: <a href=howdy.html>click me</a>
|
|
part 1: <a href=howdy.html>
|
|
part 2: click me
|
|
part 3: </a>
|
|
]]>
|
|
</programlisting>
|
|
</para>
|
|
<simpara>
|
|
See also <function>preg_match</function>,
|
|
<function>preg_replace</function>,
|
|
and <function>preg_split</function>.
|
|
</simpara>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="function.preg-replace">
|
|
<refnamediv>
|
|
<refname>preg_replace</refname>
|
|
<refpurpose>Perform a regular expression search and replace</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>mixed</type><methodname>preg_replace</methodname>
|
|
<methodparam><type>mixed</type><parameter>pattern</parameter></methodparam>
|
|
<methodparam><type>mixed</type><parameter>replacement</parameter></methodparam>
|
|
<methodparam><type>mixed</type><parameter>subject</parameter></methodparam>
|
|
<methodparam choice="opt"><type>int</type><parameter>limit</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<para>
|
|
Searches <parameter>subject</parameter> for matches to
|
|
<parameter>pattern</parameter> and replaces them with
|
|
<parameter>replacement</parameter>. If
|
|
<parameter>limit</parameter> is specified, then only
|
|
<parameter>limit</parameter> matches will be replaced; if
|
|
<parameter>limit</parameter> is omitted or is -1, then all
|
|
matches are replaced.
|
|
</para>
|
|
<para>
|
|
<parameter>Replacement</parameter> may contain references of the form
|
|
<literal>\\<replaceable>n</replaceable></literal> or (since PHP 4.0.4)
|
|
<literal><replaceable>$n</replaceable></literal>, with the latter form
|
|
being the preferred one. Every such reference will be replaced by the text
|
|
captured by the <replaceable>n</replaceable>'th parenthesized pattern.
|
|
<replaceable>n</replaceable> can be from 0 to 99, and
|
|
<literal>\\0</literal> or <literal>$0</literal> refers to the text matched
|
|
by the whole pattern. Opening parentheses are counted from left to right
|
|
(starting from 1) to obtain the number of the capturing subpattern.
|
|
</para>
|
|
<para>
|
|
If matches are found, the new <parameter>subject</parameter> will
|
|
be returned, otherwise <parameter>subject</parameter> will be
|
|
returned unchanged.
|
|
</para>
|
|
<para>
|
|
Every parameter to <function>preg_replace</function> can be an
|
|
array.
|
|
</para>
|
|
<para>
|
|
If <parameter>subject</parameter> is an array, then the search
|
|
and replace is performed on every entry of
|
|
<parameter>subject</parameter>, and the return value is an array
|
|
as well.
|
|
</para>
|
|
<para>
|
|
If <parameter>pattern</parameter> and
|
|
<parameter>replacement</parameter> are arrays, then
|
|
<function>preg_replace</function> takes a value from each array
|
|
and uses them to do search and replace on
|
|
<parameter>subject</parameter>. If
|
|
<parameter>replacement</parameter> has fewer values than
|
|
<parameter>pattern</parameter>, then an empty string is used for the
|
|
rest of replacement values. If <parameter>pattern</parameter>
|
|
is an array and <parameter>replacement</parameter> is a string,
|
|
then this replacement string is used for every value of
|
|
<parameter>pattern</parameter>. The converse would not make
|
|
sense, though.
|
|
</para>
|
|
<para>
|
|
<literal>/e</literal> modifier makes
|
|
<function>preg_replace</function> treat the
|
|
<parameter>replacement</parameter> parameter as PHP code after
|
|
the appropriate references substitution is done. Tip: make sure
|
|
that <parameter>replacement</parameter> constitutes a valid PHP
|
|
code string, otherwise PHP will complain about a parse error at
|
|
the line containing <function>preg_replace</function>.
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Replacing several values</title>
|
|
<programlisting>
|
|
<![CDATA[
|
|
$patterns = array ("/(19|20)(\d{2})-(\d{1,2})-(\d{1,2})/",
|
|
"/^\s*{(\w+)}\s*=/");
|
|
$replace = array ("\\3/\\4/\\1\\2", "$\\1 =");
|
|
print preg_replace ($patterns, $replace, "{startDate} = 1999-5-27");
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
This example will produce:
|
|
<programlisting>
|
|
<![CDATA[
|
|
$startDate = 5/27/1999
|
|
]]>
|
|
</programlisting>
|
|
<example>
|
|
<title>Using /e modifier</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
preg_replace ("/(<\/?)(\w+)([^>]*>)/e",
|
|
"'\\1'.strtoupper('\\2').'\\3'",
|
|
$html_body);
|
|
]]>
|
|
</programlisting>
|
|
<para>
|
|
This would capitalize all HTML tags in the input text.
|
|
</para>
|
|
</example>
|
|
<example>
|
|
<title>Convert HTML to text</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// $document should contain an HTML document.
|
|
// This will remove HTML tags, javascript sections
|
|
// and white space. It will also convert some
|
|
// common HTML entities to their text equivalent.
|
|
|
|
$search = array ("'<script[^>]*?>.*?</script>'si", // Strip out javascript
|
|
"'<[\/\!]*?[^<>]*?>'si", // Strip out html tags
|
|
"'([\r\n])[\s]+'", // Strip out white space
|
|
"'&(quot|#34);'i", // Replace html entities
|
|
"'&(amp|#38);'i",
|
|
"'&(lt|#60);'i",
|
|
"'&(gt|#62);'i",
|
|
"'&(nbsp|#160);'i",
|
|
"'&(iexcl|#161);'i",
|
|
"'&(cent|#162);'i",
|
|
"'&(pound|#163);'i",
|
|
"'&(copy|#169);'i",
|
|
"'&#(\d+);'e"); // evaluate as php
|
|
|
|
$replace = array ("",
|
|
"",
|
|
"\\1",
|
|
"\"",
|
|
"&",
|
|
"<",
|
|
">",
|
|
" ",
|
|
chr(161),
|
|
chr(162),
|
|
chr(163),
|
|
chr(169),
|
|
"chr(\\1)");
|
|
|
|
$text = preg_replace ($search, $replace, $document);
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
</para>
|
|
<note>
|
|
<para>
|
|
Parameter <parameter>limit</parameter> was added after PHP 4.0.1pl2.
|
|
</para>
|
|
</note>
|
|
<para>
|
|
See also <function>preg_match</function>,
|
|
<function>preg_match_all</function>, and
|
|
<function>preg_split</function>.
|
|
</para>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="function.preg-replace-callback">
|
|
<refnamediv>
|
|
<refname>preg_replace_callback</refname>
|
|
<refpurpose>Perform a regular expression search and replace using a callback</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>mixed</type><methodname>preg_replace_callback</methodname>
|
|
<methodparam><type>mixed</type><parameter>pattern</parameter></methodparam>
|
|
<methodparam><type>mixed</type><parameter>callback</parameter></methodparam>
|
|
<methodparam><type>mixed</type><parameter>subject</parameter></methodparam>
|
|
<methodparam choice="opt"><type>int</type><parameter>limit</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<para>
|
|
The behavior of this function is almost identical to
|
|
<function>preg_replace</function>, except for the fact that instead of
|
|
<parameter>replacement</parameter> parameter, one should specify a
|
|
<parameter>callback</parameter> that will be called and passed an array of
|
|
matched elements in the subject string. The callback should return the
|
|
replacement string. This function was added in PHP 4.0.5.
|
|
</para>
|
|
<para>
|
|
See also <function>preg_replace</function>.
|
|
</para>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="function.preg-split">
|
|
<refnamediv>
|
|
<refname>preg_split</refname>
|
|
<refpurpose>Split string by a regular expression</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>array</type><methodname>preg_split</methodname>
|
|
<methodparam><type>string</type><parameter>pattern</parameter></methodparam>
|
|
<methodparam><type>string</type><parameter>subject</parameter></methodparam>
|
|
<methodparam choice="opt"><type>int</type><parameter>limit</parameter></methodparam>
|
|
<methodparam choice="opt"><type>int</type><parameter>flags</parameter></methodparam>
|
|
</methodsynopsis>
|
|
|
|
<note>
|
|
<para>
|
|
Parameter <parameter>flags</parameter> was added in PHP 4 Beta 3.
|
|
</para>
|
|
</note>
|
|
|
|
<para>
|
|
Returns an array containing substrings of
|
|
<parameter>subject</parameter> split along boundaries matched by
|
|
<parameter>pattern</parameter>.
|
|
</para>
|
|
|
|
<para>
|
|
If <parameter>limit</parameter> is specified, then only substrings up to
|
|
<parameter>limit</parameter> are returned, and if
|
|
<parameter>limit</parameter> is -1, it actually means "no limit", which is
|
|
useful for specifying the <parameter>flags</parameter>.
|
|
</para>
|
|
|
|
<para>
|
|
<parameter>flags</parameter> can be any combination of the following flags
|
|
(combined with bitwise | operator):
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>PREG_SPLIT_NO_EMPTY</term>
|
|
<listitem>
|
|
<simpara>
|
|
If this flag is set, only non-empty pieces will be returned by
|
|
<function>preg_split</function>.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>PREG_SPLIT_DELIM_CAPTURE</term>
|
|
<listitem>
|
|
<simpara>
|
|
If this flag is set, parenthesized expression in the delimiter pattern
|
|
will be captured and returned as well. This flag was added in PHP 4.0.5.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title><function>preg_split</function> example : Get the parts of a search string.</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// split the phrase by any number of commas or space characters,
|
|
// which include " ", \r, \t, \n and \f
|
|
$keywords = preg_split ("/[\s,]+/", "hypertext language, programming");
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Splitting a string into component characters.</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
$str = 'string';
|
|
$chars = preg_split('//', $str, -1, PREG_SPLIT_NO_EMPTY);
|
|
print_r($chars);
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
</para>
|
|
<para>
|
|
See also
|
|
<function>spliti</function>,
|
|
<function>split</function>,
|
|
<function>implode</function>,
|
|
<function>preg_match</function>,
|
|
<function>preg_match_all</function>, and
|
|
<function>preg_replace</function>.
|
|
</para>
|
|
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="function.preg-quote">
|
|
<refnamediv>
|
|
<refname>preg_quote</refname>
|
|
<refpurpose>Quote regular expression characters</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>string</type><methodname>preg_quote</methodname>
|
|
<methodparam><type>string</type><parameter>str</parameter></methodparam>
|
|
<methodparam choice="opt"><type>string</type><parameter>delimiter</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<para>
|
|
<function>preg_quote</function> takes <parameter>str</parameter>
|
|
and puts a backslash in front of every character that is part of
|
|
the regular expression syntax. This is useful if you have a
|
|
run-time string that you need to match in some text and the
|
|
string may contain special regex characters.
|
|
</para>
|
|
<para>
|
|
If the optional <parameter>delimiter</parameter> is specified, it
|
|
will also be escaped. This is useful for escaping the delimiter
|
|
that is required by the PCRE functions. The / is the most commonly
|
|
used delimiter.</para>
|
|
<para>
|
|
The special regular expression characters are:
|
|
<screen>. \\ + * ? [ ^ ] $ ( ) { } = ! &lt; &gt; | :</screen>
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title><function>preg_quote</function> example</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
$keywords = "$40 for a g3/400";
|
|
$keywords = preg_quote ($keywords, "/");
|
|
echo $keywords; // returns \$40 for a g3\/400
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
<example>
|
|
<title>Italicizing a word within some text</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// In this example, preg_quote($word) is used to keep the
|
|
// asterisks from having special meaning to the regular
|
|
// expression.
|
|
|
|
$textbody = "This book is *very* difficult to find.";
|
|
$word = "*very*";
|
|
$textbody = preg_replace ("/".preg_quote($word)."/",
|
|
"<i>".$word."</i>",
|
|
$textbody);
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
</para>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="function.preg-grep">
|
|
<refnamediv>
|
|
<refname>preg_grep</refname>
|
|
<refpurpose>
|
|
Return array entries that match the pattern
|
|
</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>array</type><methodname>preg_grep</methodname>
|
|
<methodparam><type>string</type><parameter>pattern</parameter></methodparam>
|
|
<methodparam><type>array</type><parameter>input</parameter></methodparam>
|
|
</methodsynopsis>
|
|
|
|
<para>
|
|
<function>preg_grep</function> returns the array consisting of
|
|
the elements of the <parameter>input</parameter> array that match
|
|
the given <parameter>pattern</parameter>.
|
|
</para>
|
|
|
|
<para>
|
|
Since PHP 4.0.4, the results returned by <function>preg_grep</function>
|
|
are indexed using the keys from the input array. If this behavior is
|
|
undesirable, use <function>array_values</function> on the array returned by
|
|
<function>preg_grep</function> to reindex the values.
|
|
</para>
|
|
|
|
<para>
|
|
<example>
|
|
<title><function>preg_grep</function> example</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
// return all array elements
|
|
// containing floating point numbers
|
|
$fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
</para>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="pcre.pattern.modifiers">
|
|
<refnamediv>
|
|
<refname>Pattern Modifiers</refname>
|
|
<refpurpose>Describes possible modifiers in regex
|
|
patterns</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<para>
|
|
The current possible PCRE modifiers are listed below. The names
|
|
in parentheses refer to internal PCRE names for these modifiers.
|
|
</para>
|
|
<para>
|
|
<blockquote>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><emphasis>i</emphasis> (PCRE_CASELESS)</term>
|
|
<listitem>
|
|
<simpara>
|
|
If this modifier is set, letters in the pattern match both
|
|
upper and lower case letters.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>m</emphasis> (PCRE_MULTILINE)</term>
|
|
<listitem>
|
|
<simpara>
|
|
By default, PCRE treats the subject string as consisting of a
|
|
single "line" of characters (even if it actually contains
|
|
several newlines). The "start of line" metacharacter (^)
|
|
matches only at the start of the string, while the "end of
|
|
line" metacharacter ($) matches only at the end of the
|
|
string, or before a terminating newline (unless
|
|
<emphasis>D</emphasis> modifier is set). This is the same as
|
|
Perl.
|
|
</simpara>
|
|
<simpara>
|
|
When this modifier is set, the "start of line" and "end of
|
|
line" constructs match immediately following or immediately
|
|
before any newline in the subject string, respectively, as
|
|
well as at the very start and end. This is equivalent to
|
|
Perl's /m modifier. If there are no "\n" characters in a
|
|
subject string, or no occurrences of ^ or $ in a pattern,
|
|
setting this modifier has no effect.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>s</emphasis> (PCRE_DOTALL)</term>
|
|
<listitem>
|
|
<simpara>
|
|
If this modifier is set, a dot metacharacter in the pattern
|
|
matches all characters, including newlines. Without it,
|
|
newlines are excluded. This modifier is equivalent to Perl's
|
|
/s modifier. A negative class such as [^a] always matches a
|
|
newline character, independent of the setting of this
|
|
modifier.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>x</emphasis> (PCRE_EXTENDED)</term>
|
|
<listitem>
|
|
<simpara>
|
|
If this modifier is set, whitespace data characters in the
|
|
pattern are totally ignored except when escaped or inside a
|
|
character class, and characters between an unescaped #
|
|
outside a character class and the next newline character,
|
|
inclusive, are also ignored. This is equivalent to Perl's /x
|
|
modifier, and makes it possible to include comments inside
|
|
complicated patterns. Note, however, that this applies only
|
|
to data characters. Whitespace characters may never appear
|
|
within special character sequences in a pattern, for example
|
|
within the sequence (?( which introduces a conditional
|
|
subpattern.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>e</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
If this modifier is set, <function>preg_replace</function>
|
|
does normal substitution of backreferences in the
|
|
replacement string, evaluates it as PHP code, and uses the
|
|
result for replacing the search string.
|
|
</simpara>
|
|
<simpara>
|
|
Only <function>preg_replace</function> uses this modifier;
|
|
it is ignored by other PCRE functions.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>A</emphasis> (PCRE_ANCHORED)</term>
|
|
<listitem>
|
|
<simpara>
|
|
If this modifier is set, the pattern is forced to be
|
|
"anchored", that is, it is constrained to match only at the
|
|
start of the string which is being searched (the "subject
|
|
string"). This effect can also be achieved by appropriate
|
|
constructs in the pattern itself, which is the only way to
|
|
do it in Perl.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>D</emphasis> (PCRE_DOLLAR_ENDONLY)</term>
|
|
<listitem>
|
|
<simpara>
|
|
If this modifier is set, a dollar metacharacter in the pattern
|
|
matches only at the end of the subject string. Without this
|
|
modifier, a dollar also matches immediately before the final
|
|
character if it is a newline (but not before any other
|
|
newlines). This modifier is ignored if <emphasis>m</emphasis>
|
|
modifier is set. There is no equivalent to this modifier in
|
|
Perl.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>S</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
When a pattern is going to be used several times, it is
|
|
worth spending more time analyzing it in order to speed up
|
|
the time taken for matching. If this modifier is set, then
|
|
this extra analysis is performed. At present, studying a
|
|
pattern is useful only for non-anchored patterns that do not
|
|
have a single fixed starting character.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>U</emphasis> (PCRE_UNGREEDY)</term>
|
|
<listitem>
|
|
<simpara>
|
|
This modifier inverts the "greediness" of the quantifiers so
|
|
that they are not greedy by default, but become greedy if
|
|
followed by "?". It is not compatible with Perl. It can also
|
|
be set by a (?U) modifier setting within the pattern.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>X</emphasis> (PCRE_EXTRA)</term>
|
|
<listitem>
|
|
<simpara>
|
|
This modifier turns on additional functionality of PCRE that
|
|
is incompatible with Perl. Any backslash in a pattern that
|
|
is followed by a letter that has no special meaning causes
|
|
an error, thus reserving these combinations for future
|
|
expansion. By default, as in Perl, a backslash followed by a
|
|
letter with no special meaning is treated as a literal.
|
|
There are at present no other features controlled by this
|
|
modifier.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>u</emphasis> (PCRE_UTF8)</term>
|
|
<listitem>
|
|
<simpara>
|
|
This modifier turns on additional functionality of PCRE that
|
|
is incompatible with Perl. Pattern strings are treated as
|
|
UTF-8. This modifier is available from PHP 4.1.0 or greater.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</blockquote>
|
|
</para>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<refentry id="pcre.pattern.syntax">
|
|
<refnamediv>
|
|
<refname>Pattern Syntax</refname>
|
|
<refpurpose>Describes PCRE regex syntax</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<simpara>
|
|
The PCRE library is a set of functions that implement regular
|
|
expression pattern matching using the same syntax and semantics
|
|
as Perl 5, with just a few differences (see below). The current
|
|
implementation corresponds to Perl 5.005.
|
|
</simpara>
|
|
</refsect1>
|
|
|
|
<refsect1>
|
|
<title>Differences From Perl</title>
|
|
<para>
|
|
The differences described here are with respect to Perl
|
|
5.005.
|
|
<orderedlist>
|
|
<listitem>
|
|
<simpara>
|
|
By default, a whitespace character is any character that
|
|
the C library function isspace() recognizes, though it is
|
|
possible to compile PCRE with alternative character type
|
|
tables. Normally isspace() matches space, formfeed, newline,
|
|
carriage return, horizontal tab, and vertical tab. Perl 5 no
|
|
longer includes vertical tab in its set of whitespace characters.
|
|
The \v escape that was in the Perl documentation for
|
|
a long time was never in fact recognized. However, the character
|
|
itself was treated as whitespace at least up to 5.002.
|
|
In 5.004 and 5.005 it does not match \s.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
PCRE does not allow repeat quantifiers on lookahead
|
|
assertions. Perl permits them, but they do not mean what you
|
|
might think. For example, (?!a){3} does not assert that the
|
|
next three characters are not "a". It just asserts that the
|
|
next character is not "a" three times.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
Capturing subpatterns that occur inside negative lookahead
|
|
assertions are counted, but their entries in the
|
|
offsets vector are never set. Perl sets its numerical variables
|
|
from any such patterns that are matched before the
|
|
assertion fails to match something (thereby succeeding), but
|
|
only if the negative lookahead assertion contains just one
|
|
branch.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
Though binary zero characters are supported in the sub-
|
|
ject string, they are not allowed in a pattern string
|
|
because it is passed as a normal C string, terminated by
|
|
zero. The escape sequence "\0" can be used in the pattern to
|
|
represent a binary zero.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
The following Perl escape sequences are not supported:
|
|
\l, \u, \L, \U, \E, \Q. In fact these are implemented by
|
|
Perl's general string-handling and are not part of its pat-
|
|
tern matching engine.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
The Perl \G assertion is not supported as it is not
|
|
relevant to single pattern matches.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
Fairly obviously, PCRE does not support the (?{code})
|
|
construction.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
There are at the time of writing some oddities in Perl
|
|
5.005_02 concerned with the settings of captured strings
|
|
when part of a pattern is repeated. For example, matching
|
|
"aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
|
|
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2
|
|
unset. However, if the pattern is changed to
|
|
/^(aa(b(b))?)+$/ then $2 (and $3) get set.
|
|
In Perl 5.004 $2 is set in both cases, and that is also true
|
|
of PCRE. If in the future Perl changes to a consistent state
|
|
that is different, PCRE may change to follow.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
Another as yet unresolved discrepancy is that in Perl
|
|
5.005_02 the pattern /^(a)?(?(1)a|b)+$/ matches the string
|
|
"a", whereas in PCRE it does not. However, in both Perl and
|
|
PCRE /^(a)?a/ matched against "a" leaves $1 unset.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<para>
|
|
PCRE provides some extensions to the Perl regular
|
|
expression facilities:
|
|
<orderedlist>
|
|
<listitem>
|
|
<simpara>
|
|
Although lookbehind assertions must match fixed length
|
|
strings, each alternative branch of a lookbehind assertion
|
|
can match a different length of string. Perl 5.005 requires
|
|
them all to have the same length.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
If <link linkend="pcre.pattern.modifiers">PCRE_DOLLAR_ENDONLY</link> is set and
|
|
<link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link> is not
|
|
set, the $ meta-character matches only at the very end of
|
|
the string.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
If <link linkend="pcre.pattern.modifiers">PCRE_EXTRA</link> is set, a backslash followed by a letter
|
|
with no special meaning is faulted.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
If <link linkend="pcre.pattern.modifiers">PCRE_UNGREEDY</link> is set, the greediness of the repetition
|
|
quantifiers is inverted, that is, by default they are
|
|
not greedy, but if followed by a question mark they are.
|
|
</simpara>
|
|
</listitem>
|
|
</orderedlist>
|
|
</para>
|
|
</listitem>
|
|
</orderedlist>
|
|
</para>
|
|
</refsect1>
|
|
|
|
<refsect1 id="regexp.reference">
|
|
<title>Regular Expression Details</title>
|
|
<refsect2 id="regexp.introduction">
|
|
<title>Introduction</title>
|
|
<para>
|
|
The syntax and semantics of the regular expressions sup-
|
|
ported by PCRE are described below. Regular expressions are
|
|
also described in the Perl documentation and in a number of
|
|
other books, some of which have copious examples. Jeffrey
|
|
Friedl's "Mastering Regular Expressions", published by
|
|
O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
|
|
The description here is intended as reference documentation.
|
|
|
|
A regular expression is a pattern that is matched against a
|
|
subject string from left to right. Most characters stand for
|
|
themselves in a pattern, and match the corresponding charac-
|
|
ters in the subject. As a trivial example, the pattern
|
|
<literal>The quick brown fox</literal>
|
|
matches a portion of a subject string that is identical to
|
|
itself.
|
|
</para>
|
|
</refsect2>
|
|
<refsect2 id="regexp.reference.meta">
|
|
<title>Meta-characters</title>
|
|
<para>
|
|
The power of regular expressions comes from the
|
|
ability to include alternatives and repetitions in the pattern.
|
|
These are encoded in the pattern by the use of
|
|
<emphasis>meta-characters</emphasis>, which do not stand for themselves but instead
|
|
are interpreted in some special way.
|
|
</para>
|
|
<para>
|
|
There are two different sets of meta-characters: those that
|
|
are recognized anywhere in the pattern except within square
|
|
brackets, and those that are recognized in square brackets.
|
|
Outside square brackets, the meta-characters are as follows:
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><emphasis>\</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
general escape character with several uses
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>^</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
assert start of subject (or line, in multiline mode)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>$</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
assert end of subject (or line, in multiline mode)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>.</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
match any character except newline (by default)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>[</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
start character class definition
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>]</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
end character class definition
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>|</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
start of alternative branch
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>(</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
start subpattern
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>)</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
end subpattern
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>?</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
extends the meaning of (, also 0 or 1 quantifier, also quantifier minimizer
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>*</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
0 or more quantifier
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>+</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
1 or more quantifier
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>{</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
start min/max quantifier
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>}</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
end min/max quantifier
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
|
|
Part of a pattern that is in square brackets is called a
|
|
"character class". In a character class the only meta-
|
|
characters are:
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><emphasis>\</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
general escape character
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>^</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
negate the class, but only if the first character
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>-</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
indicates character range
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>]</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
terminates the character class
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
The following sections describe the use of each of the
|
|
meta-characters.
|
|
</para>
|
|
</refsect2>
|
|
<refsect2 id="regexp.reference.backslash">
|
|
<title>Backslash</title>
|
|
<para>
|
|
The backslash character has several uses. Firstly, if it is
|
|
followed by a non-alphameric character, it takes away any
|
|
special meaning that character may have. This use of
|
|
backslash as an escape character applies both inside and
|
|
outside character classes.
|
|
</para>
|
|
<para>
|
|
For example, if you want to match a "*" character, you write
|
|
"\*" in the pattern. This applies whether or not the follow-
|
|
ing character would otherwise be interpreted as a meta-
|
|
character, so it is always safe to precede a non-alphameric
|
|
with "\" to specify that it stands for itself. In particu-
|
|
lar, if you want to match a backslash, you write "\\".
|
|
</para>
|
|
<para>
|
|
If a pattern is compiled with the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> option, whitespace
|
|
in the pattern (other than in a character class) and
|
|
characters between a "#" outside a character class and the
|
|
next newline character are ignored. An escaping backslash
|
|
can be used to include a whitespace or "#" character as part
|
|
of the pattern.
|
|
</para>
|
|
<para>
|
|
A second use of backslash provides a way of encoding non-
|
|
printing characters in patterns in a visible manner. There
|
|
is no restriction on the appearance of non-printing charac-
|
|
ters, apart from the binary zero that terminates a pattern,
|
|
but when a pattern is being prepared by text editing, it is
|
|
usually easier to use one of the following escape sequences
|
|
than the binary character it represents:
|
|
</para>
|
|
<para>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><emphasis>\a</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
alarm, that is, the BEL character (hex 07)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\cx</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
"control-x", where x is any character
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\e</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
escape (hex 1B)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\f</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
formfeed (hex 0C)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\n</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
newline (hex 0A)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\r</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
carriage return (hex 0D)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\t</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
tab (hex 09)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\xhh</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
character with hex code hh
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\ddd</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
character with octal code ddd, or backreference
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</para>
|
|
<para>
|
|
The precise effect of "<literal>\cx</literal>" is as follows: if "<literal>x</literal>" is a lower
|
|
case letter, it is converted to upper case. Then bit 6 of
|
|
the character (hex 40) is inverted. Thus "<literal>\cz</literal>" becomes hex
|
|
1A, but "<literal>\c{</literal>" becomes hex 3B, while "<literal>\c;</literal>" becomes hex 7B.
|
|
</para>
|
|
<para>
|
|
After "<literal>\x</literal>", up to two hexadecimal digits are read (letters
|
|
can be in upper or lower case).
|
|
</para>
|
|
<para>
|
|
After "<literal>\0</literal>" up to two further octal digits are read. In both
|
|
cases, if there are fewer than two digits, just those that
|
|
are present are used. Thus the sequence "<literal>\0\x\07</literal>" specifies
|
|
two binary zeros followed by a BEL character. Make sure you
|
|
supply two digits after the initial zero if the character
|
|
that follows is itself an octal digit.
|
|
</para>
|
|
<para>
|
|
The handling of a backslash followed by a digit other than 0
|
|
is complicated. Outside a character class, PCRE reads it
|
|
and any following digits as a decimal number. If the number
|
|
is less than 10, or if there have been at least that many
|
|
previous capturing left parentheses in the expression, the
|
|
entire sequence is taken as a <emphasis>back</emphasis> <emphasis>reference</emphasis>. A description
|
|
of how this works is given later, following the discussion
|
|
of parenthesized subpatterns.
|
|
</para>
|
|
<para>
|
|
Inside a character class, or if the decimal number is
|
|
greater than 9 and there have not been that many capturing
|
|
subpatterns, PCRE re-reads up to three octal digits follow-
|
|
ing the backslash, and generates a single byte from the
|
|
least significant 8 bits of the value. Any subsequent digits
|
|
stand for themselves. For example:
|
|
</para>
|
|
<para>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><emphasis>\040</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is another way of writing a space
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\40</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is the same, provided there are fewer than 40
|
|
previous capturing subpatterns
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\7</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is always a back reference
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\11</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
might be a back reference, or another way of
|
|
writing a tab
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\011</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is always a tab
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\0113</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is a tab followed by the character "3"
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\113</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is the character with octal code 113 (since there
|
|
can be no more than 99 back references)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\377</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is a byte consisting entirely of 1 bits
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\81</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
is either a back reference, or a binary zero
|
|
followed by the two characters "8" and "1"
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</para>
|
|
<para>
|
|
Note that octal values of 100 or greater must not be intro-
|
|
duced by a leading zero, because no more than three octal
|
|
digits are ever read.
|
|
</para>
|
|
<para>
|
|
All the sequences that define a single byte value can be
|
|
used both inside and outside character classes. In addition,
|
|
inside a character class, the sequence "<literal>\b</literal>" is interpreted
|
|
as the backspace character (hex 08). Outside a character
|
|
class it has a different meaning (see below).
|
|
</para>
|
|
<para>
|
|
The third use of backslash is for specifying generic charac-
|
|
ter types:
|
|
</para>
|
|
<para>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><emphasis>\d</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
any decimal digit
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\D</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
any character that is not a decimal digit
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\s</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
any whitespace character
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\S</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
any character that is not a whitespace character
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\w</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
any "word" character
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\W</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
any "non-word" character
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</para>
|
|
<para>
|
|
Each pair of escape sequences partitions the complete set of
|
|
characters into two disjoint sets. Any given character
|
|
matches one, and only one, of each pair.
|
|
</para>
|
|
<para>
|
|
A "word" character is any letter or digit or the underscore
|
|
character, that is, any character which can be part of a
|
|
Perl "<literal>word</literal>". The definition of letters and digits is
|
|
controlled by PCRE's character tables, and may vary if locale-specific
|
|
matching is taking place (see "Locale support"
|
|
above). For example, in the "fr" (French) locale, some character
|
|
codes greater than 128 are used for accented letters,
|
|
and these are matched by <literal>\w</literal>.
|
|
</para>
|
|
<para>
|
|
These character type sequences can appear both inside and
|
|
outside character classes. They each match one character of
|
|
the appropriate type. If the current matching point is at
|
|
the end of the subject string, all of them fail, since there
|
|
is no character to match.
|
|
</para>
|
|
<para>
|
|
The fourth use of backslash is for certain simple asser-
|
|
tions. An assertion specifies a condition that has to be met
|
|
at a particular point in a match, without consuming any
|
|
characters from the subject string. The use of subpatterns
|
|
for more complicated assertions is described below. The
|
|
backslashed assertions are
|
|
</para>
|
|
<para>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><emphasis>\b</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
word boundary
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\B</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
not a word boundary
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\A</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
start of subject (independent of multiline mode)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\Z</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
end of subject or newline at end (independent of
|
|
multiline mode)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><emphasis>\z</emphasis></term>
|
|
<listitem>
|
|
<simpara>
|
|
end of subject (independent of multiline mode)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</para>
|
|
<para>
|
|
These assertions may not appear in character classes (but
|
|
note that "<literal>\b</literal>" has a different meaning, namely the backspace
|
|
character, inside a character class).
|
|
</para>
|
|
<para>
|
|
A word boundary is a position in the subject string where
|
|
the current character and the previous character do not both
|
|
match <literal>\w</literal> or <literal>\W</literal> (i.e. one matches
|
|
<literal>\w</literal> and the other matches
|
|
<literal>\W</literal>), or the start or end of the string if the first or last
|
|
character matches <literal>\w</literal>, respectively.
|
|
</para>
|
|
<para>
|
|
The <literal>\A</literal>, <literal>\Z</literal>, and <literal>\z</literal> assertions differ from the traditional
|
|
circumflex and dollar (described below) in that they only
|
|
ever match at the very start and end of the subject string,
|
|
whatever options are set. They are not affected by the
|
|
<link linkend="pcre.pattern.modifiers">PCRE_NOTBOL</link> or <link linkend="pcre.pattern.modifiers">PCRE_NOTEOL</link> options. The difference between
|
|
<literal>\Z</literal> and <literal>\z</literal> is that <literal>\Z</literal>
|
|
matches before a newline that is the
|
|
last character of the string as well as at the end of the
|
|
string, whereas <literal>\z</literal> matches only at the end.
|
|
</para>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.circudollar">
|
|
<title>Circumflex and dollar</title>
|
|
<literallayout>
|
|
Outside a character class, in the default matching mode, the
|
|
circumflex character is an assertion which is true only if
|
|
the current matching point is at the start of the subject
|
|
string. Inside a character class, circumflex has an entirely
|
|
different meaning (see below).
|
|
|
|
Circumflex need not be the first character of the pattern if
|
|
a number of alternatives are involved, but it should be the
|
|
first thing in each alternative in which it appears if the
|
|
pattern is ever to match that branch. If all possible alter-
|
|
natives start with a circumflex, that is, if the pattern is
|
|
constrained to match only at the start of the subject, it is
|
|
said to be an "anchored" pattern. (There are also other con-
|
|
structs that can cause a pattern to be anchored.)
|
|
|
|
A dollar character is an assertion which is true only if the
|
|
current matching point is at the end of the subject string,
|
|
or immediately before a newline character that is the last
|
|
character in the string (by default). Dollar need not be the
|
|
last character of the pattern if a number of alternatives
|
|
are involved, but it should be the last item in any branch
|
|
in which it appears. Dollar has no special meaning in a
|
|
character class.
|
|
|
|
The meaning of dollar can be changed so that it matches only
|
|
at the very end of the string, by setting the
|
|
<link linkend="pcre.pattern.modifiers">PCRE_DOLLAR_ENDONLY</link> option at compile or matching time. This
|
|
does not affect the \Z assertion.
|
|
|
|
The meanings of the circumflex and dollar characters are
|
|
changed if the <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link> option is set. When this is
|
|
the case, they match immediately after and immediately
|
|
before an internal "\n" character, respectively, in addition
|
|
to matching at the start and end of the subject string. For
|
|
example, the pattern /^abc$/ matches the subject string
|
|
"def\nabc" in multiline mode, but not otherwise. Conse-
|
|
quently, patterns that are anchored in single line mode
|
|
because all branches start with "^" are not anchored in mul-
|
|
tiline mode. The <link linkend="pcre.pattern.modifiers">PCRE_DOLLAR_ENDONLY</link> option is ignored if
|
|
<link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link> is set.
|
|
|
|
Note that the sequences \A, \Z, and \z can be used to match
|
|
the start and end of the subject in both modes, and if all
|
|
branches of a pattern start with \A it is always anchored,
|
|
whether <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link> is set or not.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.dot">
|
|
<title>Full stop</title>
|
|
<literallayout>
|
|
Outside a character class, a dot in the pattern matches any
|
|
one character in the subject, including a non-printing
|
|
character, but not (by default) newline. If the <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>
|
|
option is set, then dots match newlines as well. The han-
|
|
dling of dot is entirely independent of the handling of cir-
|
|
cumflex and dollar, the only relationship being that they
|
|
both involve newline characters. Dot has no special meaning
|
|
in a character class.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.squarebrackets">
|
|
<title>Square brackets</title>
|
|
<literallayout>
|
|
An opening square bracket introduces a character class, ter-
|
|
minated by a closing square bracket. A closing square
|
|
bracket on its own is not special. If a closing square
|
|
bracket is required as a member of the class, it should be
|
|
the first data character in the class (after an initial cir-
|
|
cumflex, if present) or escaped with a backslash.
|
|
|
|
A character class matches a single character in the subject;
|
|
the character must be in the set of characters defined by
|
|
the class, unless the first character in the class is a cir-
|
|
cumflex, in which case the subject character must not be in
|
|
the set defined by the class. If a circumflex is actually
|
|
required as a member of the class, ensure it is not the
|
|
first character, or escape it with a backslash.
|
|
|
|
For example, the character class [aeiou] matches any lower
|
|
case vowel, while [^aeiou] matches any character that is not
|
|
a lower case vowel. Note that a circumflex is just a con-
|
|
venient notation for specifying the characters which are in
|
|
the class by enumerating those that are not. It is not an
|
|
assertion: it still consumes a character from the subject
|
|
string, and fails if the current pointer is at the end of
|
|
the string.
|
|
|
|
When caseless matching is set, any letters in a class
|
|
represent both their upper case and lower case versions, so
|
|
for example, a caseless [aeiou] matches "A" as well as "a",
|
|
and a caseless [^aeiou] does not match "A", whereas a case-
|
|
ful version would.
|
|
|
|
The newline character is never treated in any special way in
|
|
character classes, whatever the setting of the <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>
|
|
or <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link> options is. A class such as [^a] will
|
|
always match a newline.
|
|
|
|
The minus (hyphen) character can be used to specify a range
|
|
of characters in a character class. For example, [d-m]
|
|
matches any letter between d and m, inclusive. If a minus
|
|
character is required in a class, it must be escaped with a
|
|
backslash or appear in a position where it cannot be inter-
|
|
preted as indicating a range, typically as the first or last
|
|
character in the class.
|
|
|
|
It is not possible to have the literal character "]" as the
|
|
end character of a range. A pattern such as [W-]46] is
|
|
interpreted as a class of two characters ("W" and "-") fol-
|
|
lowed by a literal string "46]", so it would match "W46]" or
|
|
"-46]". However, if the "]" is escaped with a backslash it
|
|
is interpreted as the end of range, so [W-\]46] is inter-
|
|
preted as a single class containing a range followed by two
|
|
separate characters. The octal or hexadecimal representation
|
|
of "]" can also be used to end a range.
|
|
|
|
Ranges operate in ASCII collating sequence. They can also be
|
|
used for characters specified numerically, for example
|
|
[\000-\037]. If a range that includes letters is used when
|
|
caseless matching is set, it matches the letters in either
|
|
case. For example, [W-c] is equivalent to [][\^_`wxyzabc],
|
|
matched caselessly, and if character tables for the "fr"
|
|
locale are in use, [\xc8-\xcb] matches accented E characters
|
|
in both cases.
|
|
|
|
The character types \d, \D, \s, \S, \w, and \W may also
|
|
appear in a character class, and add the characters that
|
|
they match to the class. For example, [\dABCDEF] matches any
|
|
hexadecimal digit. A circumflex can conveniently be used
|
|
with the upper case character types to specify a more res-
|
|
tricted set of characters than the matching lower case type.
|
|
For example, the class [^\W_] matches any letter or digit,
|
|
but not underscore.
|
|
|
|
All non-alphameric characters other than \, -, ^ (at the
|
|
start) and the terminating ] are non-special in character
|
|
classes, but it does no harm if they are escaped.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.verticalbar">
|
|
<title>Vertical bar</title>
|
|
<literallayout>
|
|
Vertical bar characters are used to separate alternative
|
|
patterns. For example, the pattern
|
|
|
|
gilbert|sullivan
|
|
|
|
matches either "gilbert" or "sullivan". Any number of alter-
|
|
natives may appear, and an empty alternative is permitted
|
|
(matching the empty string). The matching process tries
|
|
each alternative in turn, from left to right, and the first
|
|
one that succeeds is used. If the alternatives are within a
|
|
subpattern (defined below), "succeeds" means matching the
|
|
rest of the main pattern as well as the alternative in the
|
|
subpattern.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.internal-options">
|
|
<title>Internal option setting</title>
|
|
<literallayout>
|
|
The settings of <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link>,
|
|
<link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>,
|
|
<link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>,
|
|
and <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> can be changed from within the pattern by
|
|
a sequence of Perl option letters enclosed between "(?" and
|
|
")". The option letters are
|
|
|
|
i for <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link>
|
|
m for <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>
|
|
s for <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>
|
|
x for <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>
|
|
|
|
For example, (?im) sets caseless, multiline matching. It is
|
|
also possible to unset these options by preceding the letter
|
|
with a hyphen, and a combined setting and unsetting such as
|
|
(?im-sx), which sets <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link> and <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link> while
|
|
unsetting <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> and <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> , is also permitted.
|
|
If a letter appears both before and after the hyphen, the
|
|
option is unset.
|
|
|
|
The scope of these option changes depends on where in the
|
|
pattern the setting occurs. For settings that are outside
|
|
any subpattern (defined below), the effect is the same as if
|
|
the options were set or unset at the start of matching. The
|
|
following patterns all behave in exactly the same way:
|
|
|
|
(?i)abc
|
|
a(?i)bc
|
|
ab(?i)c
|
|
abc(?i)
|
|
|
|
which in turn is the same as compiling the pattern abc with
|
|
<link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link> set. In other words, such "top level" set-
|
|
tings apply to the whole pattern (unless there are other
|
|
changes inside subpatterns). If there is more than one set-
|
|
ting of the same option at top level, the rightmost setting
|
|
is used.
|
|
|
|
If an option change occurs inside a subpattern, the effect
|
|
is different. This is a change of behaviour in Perl 5.005.
|
|
An option change inside a subpattern affects only that part
|
|
of the subpattern that follows it, so
|
|
|
|
(a(?i)b)c
|
|
|
|
matches abc and aBc and no other strings (assuming
|
|
<link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link> is not used). By this means, options can be
|
|
made to have different settings in different parts of the
|
|
pattern. Any changes made in one alternative do carry on
|
|
into subsequent branches within the same subpattern. For
|
|
example,
|
|
|
|
(a(?i)b|c)
|
|
|
|
matches "ab", "aB", "c", and "C", even though when matching
|
|
"C" the first branch is abandoned before the option setting.
|
|
This is because the effects of option settings happen at
|
|
compile time. There would be some very weird behaviour oth-
|
|
erwise.
|
|
|
|
The PCRE-specific options <link linkend="pcre.pattern.modifiers">PCRE_UNGREEDY</link> and
|
|
<link linkend="pcre.pattern.modifiers">PCRE_EXTRA</link> can
|
|
be changed in the same way as the Perl-compatible options by
|
|
using the characters U and X respectively. The (?X) flag
|
|
setting is special in that it must always occur earlier in
|
|
the pattern than any of the additional features it turns on,
|
|
even when it is at top level. It is best put at the start.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.subpatterns">
|
|
<title>Subpatterns</title>
|
|
<literallayout>
|
|
Subpatterns are delimited by parentheses (round brackets),
|
|
which can be nested. Marking part of a pattern as a subpat-
|
|
tern does two things:
|
|
|
|
1. It localizes a set of alternatives. For example, the pat-
|
|
tern
|
|
|
|
cat(aract|erpillar|)
|
|
|
|
matches one of the words "cat", "cataract", or "caterpil-
|
|
lar". Without the parentheses, it would match "cataract",
|
|
"erpillar" or the empty string.
|
|
|
|
2. It sets up the subpattern as a capturing subpattern (as
|
|
defined above). When the whole pattern matches, that por-
|
|
tion of the subject string that matched the subpattern is
|
|
passed back to the caller via the <emphasis>ovector</emphasis> argument of
|
|
<function>pcre_exec</function>. Opening parentheses are counted from left to
|
|
right (starting from 1) to obtain the numbers of the captur-
|
|
ing subpatterns.
|
|
|
|
For example, if the string "the red king" is matched against
|
|
the pattern
|
|
|
|
the ((red|white) (king|queen))
|
|
|
|
the captured substrings are "red king", "red", and "king",
|
|
and are numbered 1, 2, and 3.
|
|
|
|
The fact that plain parentheses fulfil two functions is not
|
|
always helpful. There are often times when a grouping sub-
|
|
pattern is required without a capturing requirement. If an
|
|
opening parenthesis is followed by "?:", the subpattern does
|
|
not do any capturing, and is not counted when computing the
|
|
number of any subsequent capturing subpatterns. For example,
|
|
if the string "the white queen" is matched against the
|
|
pattern
|
|
|
|
the ((?:red|white) (king|queen))
|
|
|
|
the captured substrings are "white queen" and "queen", and
|
|
are numbered 1 and 2. The maximum number of captured sub-
|
|
strings is 99, and the maximum number of all subpatterns,
|
|
both capturing and non-capturing, is 200.
|
|
|
|
As a convenient shorthand, if any option settings are
|
|
required at the start of a non-capturing subpattern, the
|
|
option letters may appear between the "?" and the ":". Thus
|
|
the two patterns
|
|
|
|
(?i:saturday|sunday)
|
|
(?:(?i)saturday|sunday)
|
|
|
|
match exactly the same set of strings. Because alternative
|
|
branches are tried from left to right, and options are not
|
|
reset until the end of the subpattern is reached, an option
|
|
setting in one branch does affect subsequent branches, so
|
|
the above patterns match "SUNDAY" as well as "Saturday".
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.repetition">
|
|
<title>Repetition</title>
|
|
<literallayout>
|
|
Repetition is specified by quantifiers, which can follow any
|
|
of the following items:
|
|
|
|
a single character, possibly escaped
|
|
the . metacharacter
|
|
a character class
|
|
a back reference (see next section)
|
|
a parenthesized subpattern (unless it is an assertion -
|
|
see below)
|
|
|
|
The general repetition quantifier specifies a minimum and
|
|
maximum number of permitted matches, by giving the two
|
|
numbers in curly brackets (braces), separated by a comma.
|
|
The numbers must be less than 65536, and the first must be
|
|
less than or equal to the second. For example:
|
|
|
|
z{2,4}
|
|
|
|
matches "zz", "zzz", or "zzzz". A closing brace on its own
|
|
is not a special character. If the second number is omitted,
|
|
but the comma is present, there is no upper limit; if the
|
|
second number and the comma are both omitted, the quantifier
|
|
specifies an exact number of required matches. Thus
|
|
|
|
[aeiou]{3,}
|
|
|
|
matches at least 3 successive vowels, but may match many
|
|
more, while
|
|
|
|
\d{8}
|
|
|
|
matches exactly 8 digits. An opening curly bracket that
|
|
appears in a position where a quantifier is not allowed, or
|
|
one that does not match the syntax of a quantifier, is taken
|
|
as a literal character. For example, {,6} is not a quantif-
|
|
ier, but a literal string of four characters.
|
|
|
|
The quantifier {0} is permitted, causing the expression to
|
|
behave as if the previous item and the quantifier were not
|
|
present.
|
|
|
|
For convenience (and historical compatibility) the three
|
|
most common quantifiers have single-character abbreviations:
|
|
|
|
* is equivalent to {0,}
|
|
+ is equivalent to {1,}
|
|
? is equivalent to {0,1}
|
|
|
|
It is possible to construct infinite loops by following a
|
|
subpattern that can match no characters with a quantifier
|
|
that has no upper limit, for example:
|
|
|
|
(a?)*
|
|
|
|
Earlier versions of Perl and PCRE used to give an error at
|
|
compile time for such patterns. However, because there are
|
|
cases where this can be useful, such patterns are now
|
|
accepted, but if any repetition of the subpattern does in
|
|
fact match no characters, the loop is forcibly broken.
|
|
|
|
By default, the quantifiers are "greedy", that is, they
|
|
match as much as possible (up to the maximum number of per-
|
|
mitted times), without causing the rest of the pattern to
|
|
fail. The classic example of where this gives problems is in
|
|
trying to match comments in C programs. These appear between
|
|
the sequences /* and */ and within the sequence, individual
|
|
* and / characters may appear. An attempt to match C com-
|
|
ments by applying the pattern
|
|
|
|
/\*.*\*/
|
|
|
|
to the string
|
|
|
|
/* first comment */ not comment /* second comment */
|
|
|
|
fails, because it matches the entire string due to the
|
|
greediness of the .* item.
|
|
|
|
However, if a quantifier is followed by a question mark,
|
|
then it ceases to be greedy, and instead matches the minimum
|
|
number of times possible, so the pattern
|
|
|
|
/\*.*?\*/
|
|
|
|
does the right thing with the C comments. The meaning of the
|
|
various quantifiers is not otherwise changed, just the pre-
|
|
ferred number of matches. Do not confuse this use of ques-
|
|
tion mark with its use as a quantifier in its own right.
|
|
Because it has two uses, it can sometimes appear doubled, as
|
|
in
|
|
|
|
\d??\d
|
|
|
|
which matches one digit by preference, but can match two if
|
|
that is the only way the rest of the pattern matches.
|
|
|
|
If the <link linkend="pcre.pattern.modifiers">PCRE_UNGREEDY</link> option is set (an option which is not
|
|
available in Perl) then the quantifiers are not greedy by
|
|
default, but individual ones can be made greedy by following
|
|
them with a question mark. In other words, it inverts the
|
|
default behaviour.
|
|
|
|
When a parenthesized subpattern is quantified with a minimum
|
|
repeat count that is greater than 1 or with a limited max-
|
|
imum, more store is required for the compiled pattern, in
|
|
proportion to the size of the minimum or maximum.
|
|
|
|
If a pattern starts with .* or .{0,} and the <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>
|
|
option (equivalent to Perl's /s) is set, thus allowing the .
|
|
to match newlines, then the pattern is implicitly anchored,
|
|
because whatever follows will be tried against every charac-
|
|
ter position in the subject string, so there is no point in
|
|
retrying the overall match at any position after the first.
|
|
PCRE treats such a pattern as though it were preceded by \A.
|
|
In cases where it is known that the subject string contains
|
|
no newlines, it is worth setting <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> when the pat-
|
|
tern begins with .* in order to obtain this optimization, or
|
|
alternatively using ^ to indicate anchoring explicitly.
|
|
|
|
When a capturing subpattern is repeated, the value captured
|
|
is the substring that matched the final iteration. For exam-
|
|
ple, after
|
|
|
|
(tweedle[dume]{3}\s*)+
|
|
|
|
has matched "tweedledum tweedledee" the value of the cap-
|
|
tured substring is "tweedledee". However, if there are
|
|
nested capturing subpatterns, the corresponding captured
|
|
values may have been set in previous iterations. For exam-
|
|
ple, after
|
|
|
|
/(a|(b))+/
|
|
|
|
matches "aba" the value of the second captured substring is
|
|
"b".
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.back-references">
|
|
<title>Back references</title>
|
|
<literallayout>
|
|
Outside a character class, a backslash followed by a digit
|
|
greater than 0 (and possibly further digits) is a back
|
|
reference to a capturing subpattern earlier (i.e. to its
|
|
left) in the pattern, provided there have been that many
|
|
previous capturing left parentheses.
|
|
|
|
However, if the decimal number following the backslash is
|
|
less than 10, it is always taken as a back reference, and
|
|
causes an error only if there are not that many capturing
|
|
left parentheses in the entire pattern. In other words, the
|
|
parentheses that are referenced need not be to the left of
|
|
the reference for numbers less than 10. See the section
|
|
entitled "Backslash" above for further details of the han-
|
|
dling of digits following a backslash.
|
|
|
|
A back reference matches whatever actually matched the cap-
|
|
turing subpattern in the current subject string, rather than
|
|
anything matching the subpattern itself. So the pattern
|
|
|
|
(sens|respons)e and \1ibility
|
|
|
|
matches "sense and sensibility" and "response and responsi-
|
|
bility", but not "sense and responsibility". If caseful
|
|
matching is in force at the time of the back reference, then
|
|
the case of letters is relevant. For example,
|
|
|
|
((?i)rah)\s+\1
|
|
|
|
matches "rah rah" and "RAH RAH", but not "RAH rah", even
|
|
though the original capturing subpattern is matched case-
|
|
lessly.
|
|
|
|
There may be more than one back reference to the same sub-
|
|
pattern. If a subpattern has not actually been used in a
|
|
particular match, then any back references to it always
|
|
fail. For example, the pattern
|
|
|
|
(a|(bc))\2
|
|
|
|
always fails if it starts to match "a" rather than "bc".
|
|
Because there may be up to 99 back references, all digits
|
|
following the backslash are taken as part of a potential
|
|
back reference number. If the pattern continues with a digit
|
|
character, then some delimiter must be used to terminate the
|
|
back reference. If the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> option is set, this can
|
|
be whitespace. Otherwise an empty comment can be used.
|
|
|
|
A back reference that occurs inside the parentheses to which
|
|
it refers fails when the subpattern is first used, so, for
|
|
example, (a\1) never matches. However, such references can
|
|
be useful inside repeated subpatterns. For example, the pat-
|
|
tern
|
|
|
|
(a|b\1)+
|
|
|
|
matches any number of "a"s and also "aba", "ababbaa" etc. At
|
|
each iteration of the subpattern, the back reference matches
|
|
the character string corresponding to the previous itera-
|
|
tion. In order for this to work, the pattern must be such
|
|
that the first iteration does not need to match the back
|
|
reference. This can be done using alternation, as in the
|
|
example above, or by a quantifier with a minimum of zero.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.assertions">
|
|
<title>Assertions</title>
|
|
<literallayout>
|
|
An assertion is a test on the characters following or
|
|
preceding the current matching point that does not actually
|
|
consume any characters. The simple assertions coded as \b,
|
|
\B, \A, \Z, \z, ^ and $ are described above. More compli-
|
|
cated assertions are coded as subpatterns. There are two
|
|
kinds: those that look ahead of the current position in the
|
|
subject string, and those that look behind it.
|
|
|
|
An assertion subpattern is matched in the normal way, except
|
|
that it does not cause the current matching position to be
|
|
changed. Lookahead assertions start with (?= for positive
|
|
assertions and (?! for negative assertions. For example,
|
|
|
|
\w+(?=;)
|
|
|
|
matches a word followed by a semicolon, but does not include
|
|
the semicolon in the match, and
|
|
|
|
foo(?!bar)
|
|
|
|
matches any occurrence of "foo" that is not followed by
|
|
"bar". Note that the apparently similar pattern
|
|
|
|
(?!foo)bar
|
|
|
|
does not find an occurrence of "bar" that is preceded by
|
|
something other than "foo"; it finds any occurrence of "bar"
|
|
whatsoever, because the assertion (?!foo) is always &true;
|
|
when the next three characters are "bar". A lookbehind
|
|
assertion is needed to achieve this effect.
|
|
|
|
Lookbehind assertions start with (?<= for positive asser-
|
|
tions and (?<! for negative assertions. For example,
|
|
|
|
(?<!foo)bar
|
|
|
|
does find an occurrence of "bar" that is not preceded by
|
|
"foo". The contents of a lookbehind assertion are restricted
|
|
such that all the strings it matches must have a fixed
|
|
length. However, if there are several alternatives, they do
|
|
not all have to have the same fixed length. Thus
|
|
|
|
(?<=bullock|donkey)
|
|
|
|
is permitted, but
|
|
|
|
(?<!dogs?|cats?)
|
|
|
|
causes an error at compile time. Branches that match dif-
|
|
ferent length strings are permitted only at the top level of
|
|
a lookbehind assertion. This is an extension compared with
|
|
Perl 5.005, which requires all branches to match the same
|
|
length of string. An assertion such as
|
|
|
|
(?<=ab(c|de))
|
|
|
|
is not permitted, because its single top-level branch can
|
|
match two different lengths, but it is acceptable if rewrit-
|
|
ten to use two top-level branches:
|
|
|
|
(?<=abc|abde)
|
|
|
|
The implementation of lookbehind assertions is, for each
|
|
alternative, to temporarily move the current position back
|
|
by the fixed width and then try to match. If there are
|
|
insufficient characters before the current position, the
|
|
match is deemed to fail. Lookbehinds in conjunction with
|
|
once-only subpatterns can be particularly useful for match-
|
|
ing at the ends of strings; an example is given at the end
|
|
of the section on once-only subpatterns.
|
|
|
|
Several assertions (of any sort) may occur in succession.
|
|
For example,
|
|
|
|
(?<=\d{3})(?<!999)foo
|
|
|
|
matches "foo" preceded by three digits that are not "999".
|
|
Notice that each of the assertions is applied independently
|
|
at the same point in the subject string. First there is a
|
|
check that the previous three characters are all digits,
|
|
then there is a check that the same three characters are not
|
|
"999". This pattern does not match "foo" preceded by six
|
|
characters, the first three of which are digits and the last three
|
|
of which are not "999". For example, it doesn't match
|
|
"123abcfoo". A pattern to do that is
|
|
|
|
(?<=\d{3}...)(?<!999)foo
|
|
|
|
This time the first assertion looks at the preceding six
|
|
characters, checking that the first three are digits, and
|
|
then the second assertion checks that the preceding three
|
|
characters are not "999".
|
|
|
|
Assertions can be nested in any combination. For example,
|
|
|
|
(?<=(?<!foo)bar)baz
|
|
|
|
matches an occurrence of "baz" that is preceded by "bar"
|
|
which in turn is not preceded by "foo", while
|
|
|
|
(?<=\d{3}(?!999)...)foo
|
|
|
|
is another pattern which matches "foo" preceded by three
|
|
digits and any three characters that are not "999".
|
|
|
|
Assertion subpatterns are not capturing subpatterns, and may
|
|
not be repeated, because it makes no sense to assert the
|
|
same thing several times. If any kind of assertion contains
|
|
capturing subpatterns within it, these are counted for the
|
|
purposes of numbering the capturing subpatterns in the whole
|
|
pattern. However, substring capturing is carried out only
|
|
for positive assertions, because it does not make sense for
|
|
negative assertions.
|
|
|
|
Assertions count towards the maximum of 200 parenthesized
|
|
subpatterns.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.onlyonce">
|
|
<title>Once-only subpatterns</title>
|
|
<literallayout>
|
|
With both maximizing and minimizing repetition, failure of
|
|
what follows normally causes the repeated item to be re-
|
|
evaluated to see if a different number of repeats allows the
|
|
rest of the pattern to match. Sometimes it is useful to
|
|
prevent this, either to change the nature of the match, or
|
|
to cause it to fail earlier than it otherwise might, when the
|
|
author of the pattern knows there is no point in carrying
|
|
on.
|
|
|
|
Consider, for example, the pattern \d+foo when applied to
|
|
the subject line
|
|
|
|
123456bar
|
|
|
|
After matching all 6 digits and then failing to match "foo",
|
|
the normal action of the matcher is to try again with only 5
|
|
digits matching the \d+ item, and then with 4, and so on,
|
|
before ultimately failing. Once-only subpatterns provide the
|
|
means for specifying that once a portion of the pattern has
|
|
matched, it is not to be re-evaluated in this way, so the
|
|
matcher would give up immediately on failing to match "foo"
|
|
the first time. The notation is another kind of special
|
|
parenthesis, starting with (?> as in this example:
|
|
|
|
(?>\d+)foo
|
|
|
|
This kind of parenthesis "locks up" the part of the pattern
|
|
it contains once it has matched, and a failure further into
|
|
the pattern is prevented from backtracking into it. Back-
|
|
tracking past it to previous items, however, works as nor-
|
|
mal.
|
|
|
|
An alternative description is that a subpattern of this type
|
|
matches the string of characters that an identical stan-
|
|
dalone pattern would match, if anchored at the current point
|
|
in the subject string.
|
|
|
|
Once-only subpatterns are not capturing subpatterns. Simple
|
|
cases such as the above example can be thought of as a max-
|
|
imizing repeat that must swallow everything it can. So,
|
|
while both \d+ and \d+? are prepared to adjust the number of
|
|
digits they match in order to make the rest of the pattern
|
|
match, (?>\d+) can only match an entire sequence of digits.
|
|
|
|
This construction can of course contain arbitrarily compli-
|
|
cated subpatterns, and it can be nested.
|
|
|
|
Once-only subpatterns can be used in conjunction with look-
|
|
behind assertions to specify efficient matching at the end
|
|
of the subject string. Consider a simple pattern such as
|
|
|
|
abcd$
|
|
|
|
when applied to a long string which does not match. Because
|
|
matching proceeds from left to right, PCRE will look for
|
|
each "a" in the subject and then see if what follows matches
|
|
the rest of the pattern. If the pattern is specified as
|
|
|
|
^.*abcd$
|
|
|
|
then the initial .* matches the entire string at first, but
|
|
when this fails (because there is no following "a"), it
|
|
backtracks to match all but the last character, then all but
|
|
the last two characters, and so on. Once again the search
|
|
for "a" covers the entire string, from right to left, so we
|
|
are no better off. However, if the pattern is written as
|
|
|
|
^(?>.*)(?<=abcd)
|
|
|
|
then there can be no backtracking for the .* item; it can
|
|
match only the entire string. The subsequent lookbehind
|
|
assertion does a single test on the last four characters. If
|
|
it fails, the match fails immediately. For long strings,
|
|
this approach makes a significant difference to the process-
|
|
ing time.
|
|
|
|
When a pattern contains an unlimited repeat inside a subpat-
|
|
tern that can itself be repeated an unlimited number of
|
|
times, the use of a once-only subpattern is the only way to
|
|
avoid some failing matches taking a very long time indeed.
|
|
The pattern
|
|
|
|
(\D+|<\d+>)*[!?]
|
|
|
|
matches an unlimited number of substrings that either con-
|
|
sist of non-digits, or digits enclosed in <>, followed by
|
|
either ! or ?. When it matches, it runs quickly. However, if
|
|
it is applied to
|
|
|
|
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
|
|
|
it takes a long time before reporting failure. This is
|
|
because the string can be divided between the two repeats in
|
|
a large number of ways, and all have to be tried. (The exam-
|
|
ple used [!?] rather than a single character at the end,
|
|
because both PCRE and Perl have an optimization that allows
|
|
for fast failure when a single character is used. They
|
|
remember the last single character that is required for a
|
|
match, and fail early if it is not present in the string.)
|
|
If the pattern is changed to
|
|
|
|
((?>\D+)|<\d+>)*[!?]
|
|
|
|
sequences of non-digits cannot be broken, and failure hap-
|
|
pens quickly.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.conditional">
|
|
<title>Conditional subpatterns</title>
|
|
<literallayout>
|
|
It is possible to cause the matching process to obey a sub-
|
|
pattern conditionally or to choose between two alternative
|
|
subpatterns, depending on the result of an assertion, or
|
|
whether a previous capturing subpattern matched or not. The
|
|
two possible forms of conditional subpattern are
|
|
|
|
(?(condition)yes-pattern)
|
|
(?(condition)yes-pattern|no-pattern)
|
|
|
|
If the condition is satisfied, the yes-pattern is used; oth-
|
|
erwise the no-pattern (if present) is used. If there are
|
|
more than two alternatives in the subpattern, a compile-time
|
|
error occurs.
|
|
|
|
There are two kinds of condition. If the text between the
|
|
parentheses consists of a sequence of digits, then the
|
|
condition is satisfied if the capturing subpattern of that
|
|
number has previously matched. Consider the following pat-
|
|
tern, which contains non-significant white space to make it
|
|
more readable (assume the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> option) and to
|
|
divide it into three parts for ease of discussion:
|
|
|
|
( \( )? [^()]+ (?(1) \) )
|
|
|
|
The first part matches an optional opening parenthesis, and
|
|
if that character is present, sets it as the first captured
|
|
substring. The second part matches one or more characters
|
|
that are not parentheses. The third part is a conditional
|
|
subpattern that tests whether the first set of parentheses
|
|
matched or not. If they did, that is, if the subject started
|
|
with an opening parenthesis, the condition is &true;, and so
|
|
the yes-pattern is executed and a closing parenthesis is
|
|
required. Otherwise, since no-pattern is not present, the
|
|
subpattern matches nothing. In other words, this pattern
|
|
matches a sequence of non-parentheses, optionally enclosed
|
|
in parentheses.
|
|
|
|
If the condition is not a sequence of digits, it must be an
|
|
assertion. This may be a positive or negative lookahead or
|
|
lookbehind assertion. Consider this pattern, again contain-
|
|
ing non-significant white space, and with the two alterna-
|
|
tives on the second line:
|
|
|
|
(?(?=[^a-z]*[a-z])
|
|
\d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
|
|
|
|
The condition is a positive lookahead assertion that matches
|
|
an optional sequence of non-letters followed by a letter. In
|
|
other words, it tests for the presence of at least one
|
|
letter in the subject. If a letter is found, the subject is
|
|
matched against the first alternative; otherwise it is
|
|
matched against the second. This pattern matches strings in
|
|
one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
|
|
letters and dd are digits.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.comments">
|
|
<title>Comments</title>
|
|
<literallayout>
|
|
The sequence (?# marks the start of a comment which
|
|
continues up to the next closing parenthesis. Nested
|
|
parentheses are not permitted. The characters that make up a
|
|
comment play no part in the pattern matching at all.
|
|
|
|
If the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> option is set, an unescaped # character
|
|
outside a character class introduces a comment that contin-
|
|
ues up to the next newline character in the pattern.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.recursive">
|
|
<title>Recursive patterns</title>
|
|
<literallayout>
|
|
Consider the problem of matching a string in parentheses,
|
|
allowing for unlimited nested parentheses. Without the use
|
|
of recursion, the best that can be done is to use a pattern
|
|
that matches up to some fixed depth of nesting. It is not
|
|
possible to handle an arbitrary nesting depth. Perl 5.6 has
|
|
provided an experimental facility that allows regular
|
|
expressions to recurse (amongst other things). The special
|
|
item (?R) is provided for the specific case of recursion.
|
|
This PCRE pattern solves the parentheses problem (assume
|
|
the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> option is set so that white space is
|
|
ignored):
|
|
|
|
\( ( (?>[^()]+) | (?R) )* \)
|
|
|
|
First it matches an opening parenthesis. Then it matches any
|
|
number of substrings which can either be a sequence of non-
|
|
parentheses, or a recursive match of the pattern itself
|
|
(i.e. a correctly parenthesized substring). Finally there is
|
|
a closing parenthesis.
|
|
|
|
This particular example pattern contains nested unlimited
|
|
repeats, and so the use of a once-only subpattern for match-
|
|
ing strings of non-parentheses is important when applying
|
|
the pattern to strings that do not match. For example, when
|
|
it is applied to
|
|
|
|
(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
|
|
|
|
it yields "no match" quickly. However, if a once-only sub-
|
|
pattern is not used, the match runs for a very long time
|
|
indeed because there are so many different ways the + and *
|
|
repeats can carve up the subject, and all have to be tested
|
|
before failure can be reported.
|
|
|
|
The values set for any capturing subpatterns are those from
|
|
the outermost level of the recursion at which the subpattern
|
|
value is set. If the pattern above is matched against
|
|
|
|
(ab(cd)ef)
|
|
|
|
the value for the capturing parentheses is "ef", which is
|
|
the last value taken on at the top level. If additional
|
|
parentheses are added, giving
|
|
|
|
\( ( ( (?>[^()]+) | (?R) )* ) \)
|
|
   ^                        ^
|
|
   ^                        ^
|
|
then the string they capture
|
|
is "ab(cd)ef", the contents of the top level parentheses. If
|
|
there are more than 15 capturing parentheses in a pattern,
|
|
PCRE has to obtain extra memory to store data during a
|
|
recursion, which it does by using pcre_malloc, freeing it
|
|
via pcre_free afterwards. If no memory can be obtained, it
|
|
saves data for the first 15 capturing parentheses only, as
|
|
there is no way to give an out-of-memory error from within a
|
|
recursion.
|
|
</literallayout>
|
|
</refsect2>
|
|
|
|
<refsect2 id="regexp.reference.performances">
|
|
<title>Performance</title>
|
|
<literallayout>
|
|
Certain items that may appear in patterns are more efficient
|
|
than others. It is more efficient to use a character class
|
|
like [aeiou] than a set of alternatives such as (a|e|i|o|u).
|
|
In general, the simplest construction that provides the
|
|
required behaviour is usually the most efficient. Jeffrey
|
|
Friedl's book contains a lot of discussion about optimizing
|
|
regular expressions for efficient performance.
|
|
|
|
When a pattern begins with .* and the <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> option is
|
|
set, the pattern is implicitly anchored by PCRE, since it
|
|
can match only at the start of a subject string. However, if
|
|
<link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> is not set, PCRE cannot make this optimization,
|
|
because the . metacharacter does not then match a newline,
|
|
and if the subject string contains newlines, the pattern may
|
|
match from the character immediately following one of them
|
|
instead of from the very start. For example, the pattern
|
|
|
|
(.*) second
|
|
|
|
matches the subject "first\nand second" (where \n stands for
|
|
a newline character) with the first captured substring being
|
|
"and". In order to do this, PCRE has to retry the match
|
|
starting after every newline in the subject.
|
|
|
|
If you are using such a pattern with subject strings that do
|
|
not contain newlines, the best performance is obtained by
|
|
setting <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>, or starting the pattern with ^.* to
|
|
indicate explicit anchoring. That saves PCRE from having to
|
|
scan along the subject looking for a newline to restart at.
|
|
|
|
Beware of patterns that contain nested indefinite repeats.
|
|
These can take a long time to run when applied to a string
|
|
that does not match. Consider the pattern fragment
|
|
|
|
(a+)*
|
|
|
|
This can match "aaaa" in 33 different ways, and this number
|
|
increases very rapidly as the string gets longer. (The *
|
|
repeat can match 0, 1, 2, 3, or 4 times, and for each of
|
|
those cases other than 0, the + repeats can match different
|
|
numbers of times.) When the remainder of the pattern is such
|
|
that the entire match is going to fail, PCRE has in princi-
|
|
ple to try every possible variation, and this can take an
|
|
extremely long time.
|
|
|
|
An optimization catches some of the more simple cases such
|
|
as
|
|
|
|
(a+)*b
|
|
|
|
where a literal character follows. Before embarking on the
|
|
standard matching procedure, PCRE checks that there is a "b"
|
|
later in the subject string, and if there is not, it fails
|
|
the match immediately. However, when there is no following
|
|
literal this optimization cannot be used. You can see the
|
|
difference by comparing the behaviour of
|
|
|
|
(a+)*\d
|
|
|
|
with the pattern above. The pattern (a+)*b gives a failure almost
|
|
instantly when applied to a whole line of "a" characters,
|
|
whereas (a+)*\d takes an appreciable time with strings
|
|
longer than about 20 characters.
|
|
</literallayout>
|
|
</refsect2>
|
|
</refsect1>
|
|
</refentry>
|
|
</reference>
|
|
|
|
<!-- Keep this comment at the end of the file
|
|
Local variables:
|
|
mode: sgml
|
|
sgml-omittag:t
|
|
sgml-shorttag:t
|
|
sgml-minimize-attributes:nil
|
|
sgml-always-quote-attributes:t
|
|
sgml-indent-step:1
|
|
sgml-indent-data:t
|
|
indent-tabs-mode:nil
|
|
sgml-parent-document:nil
|
|
sgml-default-dtd-file:"../../manual.ced"
|
|
sgml-exposed-tags:nil
|
|
sgml-local-catalogs:nil
|
|
sgml-local-ecat-files:nil
|
|
End:
|
|
vim600: syn=xml fen fdm=syntax fdl=2 si
|
|
vim: et tw=78 syn=sgml
|
|
vi: ts=1 sw=1
|
|
-->
|
|
|