From 2d24777fae845ba001be4d5aba3d6d60787b2ff7 Mon Sep 17 00:00:00 2001
From: Jakub Vrana <vrana@php.net>
Date: Fri, 17 Jun 2005 11:40:21 +0000
Subject: [PATCH] PCRE 5.0

git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@188618 c90b9560-bf6c-de11-be94-00142212c4b1
---
 reference/pcre/pattern.modifiers.xml |   4 +-
 reference/pcre/pattern.syntax.xml    | 165 +++++++++++++++++++++++++--
 2 files changed, 161 insertions(+), 8 deletions(-)
diff --git a/reference/pcre/pattern.modifiers.xml b/reference/pcre/pattern.modifiers.xml
index a7c8c6d92a..4ad58a1816 100644
--- a/reference/pcre/pattern.modifiers.xml
+++ b/reference/pcre/pattern.modifiers.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="iso-8859-1"?>
-<!-- $Revision: 1.7 $ -->
+<!-- $Revision: 1.8 $ -->
 <!-- splitted from ./en/functions/pcre.xml, last change in rev 1.2 -->
 <refentry id="reference.pcre.pattern.modifiers">
  <refnamediv>
@@ -12,6 +12,7 @@
   <para>
    The current possible PCRE modifiers are listed below.  The names
    in parentheses refer to internal PCRE names for these modifiers.
+   Spaces and newlines are ignored in modifiers; other characters cause an error.
   </para>
   <para>
    <blockquote>
@@ -179,6 +180,7 @@
         is incompatible with Perl. Pattern strings are treated as
         UTF-8. This modifier is available from PHP 4.1.0 or greater
         on Unix and from PHP 4.2.3 on win32.
+        The UTF-8 validity of the pattern is checked as of PHP 4.3.5.
        </simpara>
       </listitem>
      </varlistentry>
diff --git a/reference/pcre/pattern.syntax.xml b/reference/pcre/pattern.syntax.xml
index 9e958831f7..eb3181f050 100644
--- a/reference/pcre/pattern.syntax.xml
+++ b/reference/pcre/pattern.syntax.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="iso-8859-1"?>
-<!-- $Revision: 1.8 $ -->
+<!-- $Revision: 1.9 $ -->
 <!-- splitted from ./en/functions/pcre.xml, last change in rev 1.2 -->
   <refentry id="reference.pcre.pattern.syntax">
    <refnamediv>
@@ -274,7 +274,7 @@
     </refsect2>
 
     <refsect2 id="regexp.reference.backslash">
-     <title>backslash</title>
+     <title>Backslash</title>
      <para>
       The backslash character has several uses. Firstly, if it  is
       followed by a non-alphanumeric character, it takes away any
@@ -358,6 +358,12 @@
     <para>
      After "<literal>\x</literal>", up to two hexadecimal digits are
      read (letters can be in upper or lower case).
+     In <emphasis>UTF-8 mode</emphasis>, "<literal>\x{...}</literal>" is
+     allowed, where the contents of the braces is a string of hexadecimal
+     digits. It is interpreted as a UTF-8 character whose code number is the
+     given hexadecimal number. The original hexadecimal escape sequence,
+     <literal>\xhh</literal>, matches a two-byte UTF-8 character if the value
+     is greater than 127.
     </para>
     <para>
      After "<literal>\0</literal>" up to two further octal digits are read.
@@ -545,7 +551,11 @@
       </varlistentry>
       <varlistentry>
        <term><emphasis>\z</emphasis></term>
-       <listitem><simpara>end of subject(independent of multiline mode)</simpara></listitem>
+       <listitem><simpara>end of subject (independent of multiline mode)</simpara></listitem>
+      </varlistentry>
+      <varlistentry>
+       <term><emphasis>\G</emphasis></term>
+       <listitem><simpara>first matching position in subject</simpara></listitem>
       </varlistentry>
      </variablelist>
     </para>
@@ -575,6 +585,14 @@
      newline that is the last character of the string as well as at the end of
      the string, whereas <literal>\z</literal> matches only at the end.
      </para>
+     <para>
+      The <literal>\G</literal> assertion is true only when the current
+      matching position is at the start point of the match, as specified by
+      the <parameter>offset</parameter> argument of
+      <function>preg_match</function>. It differs from <literal>\A</literal>
+      when the value of <parameter>offset</parameter> is non-zero.
+      It is available since PHP 4.3.3.
+     </para>
      
      <para>
       <literal>\Q</literal> and <literal>\E</literal> can be used to ignore
@@ -586,6 +604,116 @@
      
     </refsect2>
 
+    <refsect2 id="regexp.reference.unicode">
+     <title>Unicode character properties</title>
+     <para>
+      Since PHP 4.4.0 and 5.1.0, three
+      additional escape sequences to match generic character types are available
+      when <emphasis>UTF-8 mode</emphasis> is selected. They are:
+     </para>
+     <variablelist>
+      <varlistentry>
+       <term><emphasis>\p{xx}</emphasis></term>
+       <listitem><simpara>a character with the xx property</simpara></listitem>
+      </varlistentry>
+      <varlistentry>
+       <term><emphasis>\P{xx}</emphasis></term>
+       <listitem><simpara>a character without the xx property</simpara></listitem>
+      </varlistentry>
+      <varlistentry>
+       <term><emphasis>\X</emphasis></term>
+       <listitem><simpara>an extended Unicode sequence</simpara></listitem>
+      </varlistentry>
+     </variablelist>
+     <para>
+      The property names represented by <literal>xx</literal> above are limited to the Unicode
+      general category properties. Each character has exactly one such
+      property, specified by a two-letter abbreviation. For compatibility with
+      Perl, negation can be specified by including a circumflex between the
+      opening brace and the property name. For example, <literal>\p{^Lu}</literal> is the same
+      as <literal>\P{Lu}</literal>.
+     </para>
+     <para>
+      If only one letter is specified with <literal>\p</literal> or <literal>\P</literal>, it includes all the
+      properties that start with that letter. In this case, in the absence of
+      negation, the curly brackets in the escape sequence are optional; these
+      two examples have the same effect:
+     </para>
+     <literallayout>
+      \p{L}
+      \pL
+     </literallayout>
+     <table>
+      <title>Supported property codes</title>
+      <tgroup cols="2">
+       <tbody>
+        <row><entry><literal>C</literal></entry><entry>Other</entry></row>
+        <row><entry><literal>Cc</literal></entry><entry>Control</entry></row>
+        <row><entry><literal>Cf</literal></entry><entry>Format</entry></row>
+        <row><entry><literal>Cn</literal></entry><entry>Unassigned</entry></row>
+        <row><entry><literal>Co</literal></entry><entry>Private use</entry></row>
+        <row rowsep="1"><entry><literal>Cs</literal></entry><entry>Surrogate</entry></row>
+        <row><entry><literal>L</literal></entry><entry>Letter</entry></row>
+        <row><entry><literal>Ll</literal></entry><entry>Lower case letter</entry></row>
+        <row><entry><literal>Lm</literal></entry><entry>Modifier letter</entry></row>
+        <row><entry><literal>Lo</literal></entry><entry>Other letter</entry></row>
+        <row><entry><literal>Lt</literal></entry><entry>Title case letter</entry></row>
+        <row rowsep="1"><entry><literal>Lu</literal></entry><entry>Upper case letter</entry></row>
+        <row><entry><literal>M</literal></entry><entry>Mark</entry></row>
+        <row><entry><literal>Mc</literal></entry><entry>Spacing mark</entry></row>
+        <row><entry><literal>Me</literal></entry><entry>Enclosing mark</entry></row>
+        <row rowsep="1"><entry><literal>Mn</literal></entry><entry>Non-spacing mark</entry></row>
+        <row><entry><literal>N</literal></entry><entry>Number</entry></row>
+        <row><entry><literal>Nd</literal></entry><entry>Decimal number</entry></row>
+        <row><entry><literal>Nl</literal></entry><entry>Letter number</entry></row>
+        <row rowsep="1"><entry><literal>No</literal></entry><entry>Other number</entry></row>
+        <row><entry><literal>P</literal></entry><entry>Punctuation</entry></row>
+        <row><entry><literal>Pc</literal></entry><entry>Connector punctuation</entry></row>
+        <row><entry><literal>Pd</literal></entry><entry>Dash punctuation</entry></row>
+        <row><entry><literal>Pe</literal></entry><entry>Close punctuation</entry></row>
+        <row><entry><literal>Pf</literal></entry><entry>Final punctuation</entry></row>
+        <row><entry><literal>Pi</literal></entry><entry>Initial punctuation</entry></row>
+        <row><entry><literal>Po</literal></entry><entry>Other punctuation</entry></row>
+        <row rowsep="1"><entry><literal>Ps</literal></entry><entry>Open punctuation</entry></row>
+        <row><entry><literal>S</literal></entry><entry>Symbol</entry></row>
+        <row><entry><literal>Sc</literal></entry><entry>Currency symbol</entry></row>
+        <row><entry><literal>Sk</literal></entry><entry>Modifier symbol</entry></row>
+        <row><entry><literal>Sm</literal></entry><entry>Mathematical symbol</entry></row>
+        <row rowsep="1"><entry><literal>So</literal></entry><entry>Other symbol</entry></row>
+        <row><entry><literal>Z</literal></entry><entry>Separator</entry></row>
+        <row><entry><literal>Zl</literal></entry><entry>Line separator</entry></row>
+        <row><entry><literal>Zp</literal></entry><entry>Paragraph separator</entry></row>
+        <row><entry><literal>Zs</literal></entry><entry>Space separator</entry></row>
+       </tbody>
+      </tgroup>
+     </table>
+     <para>
+      Extended properties such as "Greek" or "InMusicalSymbols" are not
+      supported by PCRE.
+     </para>
+     <para>
+      Specifying caseless matching does not affect these escape sequences.
+      For example, <literal>\p{Lu}</literal> always matches only upper case letters.
+     </para>
+     <para>
+      The <literal>\X</literal> escape matches any number of Unicode characters that form an
+      extended Unicode sequence. <literal>\X</literal> is equivalent to
+      <literal>(?>\PM\pM*)</literal>.
+     </para>
+     <para>
+      That is, it matches a character without the "mark" property, followed
+      by zero or more characters with the "mark" property, and treats the
+      sequence as an atomic group (see below). Characters with the "mark"
+      property are typically accents that affect the preceding character.
+     </para>
+     <para>
+      Matching characters by Unicode property is not fast, because PCRE has
+      to search a structure that contains data for over fifteen thousand
+      characters. That is why the traditional escape sequences such as <literal>\d</literal> and
+      <literal>\w</literal> do not use Unicode properties in PCRE.
+     </para>
+    </refsect2>
+
     <refsect2 id="regexp.reference.circudollar">
      <title>Circumflex and dollar</title>
      <para>
@@ -646,7 +774,7 @@
     </refsect2>
 
     <refsect2 id="regexp.reference.dot">
-     <title>FULL STOP</title>
+     <title>Full stop</title>
      <para>
      Outside a character class, a dot in the pattern matches  any
      one  character  in  the  subject,  including  a non-printing
@@ -658,6 +786,11 @@
      both involve newline characters.  Dot has no special meaning
      in a character class.
      </para>
+     <para>
+      <emphasis>\C</emphasis> can be used to match a single byte. It makes sense
+      in <emphasis>UTF-8 mode</emphasis>, where the full stop matches a whole
+      character, which can consist of multiple bytes.
+     </para>
     </refsect2>
 
     <refsect2 id="regexp.reference.squarebrackets">
@@ -862,7 +995,7 @@
     </refsect2>
 
     <refsect2 id="regexp.reference.subpatterns">
-     <title>subpatterns</title>
+     <title>Subpatterns</title>
      <para>
      Subpatterns are delimited by parentheses  (round  brackets),
      which can be nested.  Marking part of a pattern as a subpattern
@@ -1119,7 +1252,7 @@
     </refsect2>
 
     <refsect2 id="regexp.reference.back-references">
-     <title>BACK REFERENCES</title>
+     <title>Back references</title>
      <para>
      Outside a character class, a backslash followed by  a  digit
      greater  than  0  (and  possibly  further  digits) is a back
@@ -1479,7 +1612,12 @@
      in parentheses.
     </para>
     <para>
-     If the condition is not a sequence of digits, it must be  an
+     If the condition is the string <literal>(R)</literal>, it is satisfied if
+     a recursive call to the pattern or subpattern has been made. At "top
+     level", the condition is false.
+    </para>
+    <para>
+     If the condition is not a sequence of digits or <literal>(R)</literal>, it must be  an
      assertion.  This  may be a positive or negative lookahead or
      lookbehind assertion. Consider this pattern, again  containing
      non-significant  white space, and with the two alternatives on
@@ -1585,6 +1723,19 @@
       for recursive subpatterns too. It is also possible to use named
       subpatterns: <literal>(?P>foo)</literal>.
      </para>
+     <para>
+      If the syntax for a recursive subpattern reference (either by number or
+      by name) is used outside the parentheses to which it refers, it operates
+      like a subroutine in a programming language. An earlier example
+      pointed out that the pattern
+      <literal>(sens|respons)e and \1ibility</literal>
+      matches "sense and sensibility" and "response and responsibility", but
+      not "sense and responsibility". If instead the pattern
+      <literal>(sens|respons)e and (?1)ibility</literal>
+      is used, it does match "sense and responsibility" as well as the other
+      two strings. Such references must, however, follow the subpattern to
+      which they refer.
+     </para>
      
     </refsect2>