From 1b7a6951af66e971da15aeafee7f21a69a592bdd Mon Sep 17 00:00:00 2001
From: Damien Seguy <dams@php.net>
Date: Tue, 29 May 2001 12:58:30 +0000
Subject: [PATCH] Added interesting functions to preg_split

git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@48832 c90b9560-bf6c-de11-be94-00142212c4b1
---
 functions/pcre.xml | 749 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 579 insertions(+), 170 deletions(-)
diff --git a/functions/pcre.xml b/functions/pcre.xml
index 32e0fa3142..8497aee4d1 100644
--- a/functions/pcre.xml
+++ b/functions/pcre.xml
@@ -491,7 +491,7 @@ $text = preg_replace ($search, $replace, $document);
     <title>Description</title>
     <funcsynopsis>
      <funcprototype>
-      <funcdef>array preg_split</funcdef>
+      <funcdef>array <function>preg_split</function></funcdef>
       <paramdef>string <parameter>pattern</parameter></paramdef>
       <paramdef>string <parameter>subject</parameter></paramdef>
       <paramdef>int 
@@ -546,30 +546,32 @@ $text = preg_replace ($search, $replace, $document);
        </varlistentry>
 	  </variablelist>
     </para>
-
+    <para>
     <example>
-     <title><function>preg_split</function> example</title>
-     <para>
-      Get the parts of a search string.
-     </para>
+     <title><function>preg_split</function> example: Get the parts of a search string.</title>
      <programlisting role="php">
 // split the phrase by any number of commas or space characters,
 // which include " ", \r, \t, \n and \f
 $keywords = preg_split ("/[\s,]+/", "hypertext language, programming");
      </programlisting>
-
-     <para>
-      Splitting a string into component characters.
-     </para>
+    </example>
+    </para>
+    <para>
+    <example>
+     <title>Splitting a string into component characters.</title>
      <programlisting role="php">
 $str = 'string';
 $chars = preg_split('//', $str, -1, PREG_SPLIT_NO_EMPTY);
 print_r($chars);
      </programlisting>
     </example>
-
+    </para>
     <para>
-     See also <function>preg_match</function>,
+     See also 
+     <function>spliti</function>,
+     <function>split</function>,
+     <function>implode</function>,
+      <function>preg_match</function>,
      <function>preg_match_all</function>, and
      <function>preg_replace</function>.
     </para>
@@ -853,21 +855,23 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
 
    <refsect1>
     <title>Description</title>
-    <literallayout>
+    <simpara>
      The PCRE library is a set of functions that implement regular
      expression pattern matching using the same syntax and semantics
      as Perl 5, with just a few differences (see below).  The current
      implementation corresponds to Perl 5.005.
-    </literallayout>
+    </simpara>
    </refsect1>
 
    <refsect1>
     <title>Differences From Perl</title>
-    <literallayout>
+    <para>
      The differences described here  are  with  respect  to  Perl
      5.005.
-
-     1. By default, a whitespace character is any character  that
+    <orderedlist>
+ 	 <listitem>
+	  <simpara>
+     By default, a whitespace character is any character  that
      the  C  library  function isspace() recognizes, though it is
      possible to compile PCRE  with  alternative  character  type
      tables. Normally isspace() matches space, formfeed, newline,
@@ -877,13 +881,19 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      a long time was never in fact recognized. However, the char-
      acter itself was treated as whitespace at least up to 5.002.
      In 5.004 and 5.005 it does not match \s.
-
-     2. PCRE does  not  allow  repeat  quantifiers  on  lookahead
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
+     PCRE does  not  allow  repeat  quantifiers  on  lookahead
      assertions. Perl permits them, but they do not mean what you
      might think. For example, (?!a){3} does not assert that  the
      next  three characters are not "a". It just asserts that the
      next character is not "a" three times.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
      3. Capturing subpatterns that occur inside  negative  looka-
      head  assertions  are  counted,  but  their  entries  in the
      offsets vector are never set. Perl sets its numerical  vari-
@@ -891,24 +901,39 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      assertion fails to match something (thereby succeeding), but
      only  if  the negative lookahead assertion contains just one
      branch.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
      4. Though binary zero characters are supported in  the  sub-
      ject  string,  they  are  not  allowed  in  a pattern string
      because it is passed as a normal  C  string,  terminated  by
      zero. The escape sequence "\0" can be used in the pattern to
      represent a binary zero.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
      5. The following Perl escape sequences  are  not  supported:
      \l,  \u,  \L,  \U,  \E, \Q. In fact these are implemented by
      Perl's general string-handling and are not part of its  pat-
      tern matching engine.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
      6. The Perl \G assertion is  not  supported  as  it  is  not
      relevant to single pattern matches.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
      7. Fairly obviously, PCRE does  not  support  the  (?{code})
      construction.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
      8. There are at the time of writing some  oddities  in  Perl
      5.005_02  concerned  with  the  settings of captured strings
      when part of a pattern is repeated.  For  example,  matching
@@ -916,42 +941,64 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves  $2
      unset.    However,    if   the   pattern   is   changed   to
      /^(aa(b(b))?)+$/ then $2 (and $3) get set.
-
      In Perl 5.004 $2 is set in both cases, and that is also true
      of PCRE. If in the future Perl changes to a consistent state
      that is different, PCRE may change to follow.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <simpara>
      9. Another as yet unresolved discrepancy  is  that  in  Perl
      5.005_02  the  pattern /^(a)?(?(1)a|b)+$/ matches the string
      "a", whereas in PCRE it does not.  However, in both Perl and
      PCRE /^(a)?a/ matched against "a" leaves $1 unset.
-
+	  </simpara>
+ 	 </listitem>
+ 	 <listitem>
+	  <para>
      10. PCRE  provides  some  extensions  to  the  Perl  regular
      expression facilities:
-
+ 	   <orderedlist>
+ 	    <listitem>
+ 	     <simpara>
      (a) Although lookbehind assertions must match  fixed  length
      strings,  each  alternative branch of a lookbehind assertion
      can match a different length of string. Perl 5.005  requires
      them all to have the same length.
-
-     (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is  not
+         </simpara>
+        </listitem>
+        <listitem>
+         <simpara>
+     (b) If <link linkend="pcre.pattern.modifiers">PCRE_DOLLAR_ENDONLY</link>  is set and 
+     <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>  is  not
      set,  the  $ meta- character matches only at the very end of
      the string.
-
-     (c) If PCRE_EXTRA is set, a backslash followed by  a  letter
+         </simpara>
+        </listitem>
+        <listitem>
+         <simpara>
+     (c) If <link linkend="pcre.pattern.modifiers">PCRE_EXTRA</link>  is set, a backslash followed by  a  letter
      with no special meaning is faulted.
-
-     (d) If PCRE_UNGREEDY is set, the greediness of  the  repeti-
+         </simpara>
+        </listitem>
+        <listitem>
+         <simpara>
+     (d) If <link linkend="pcre.pattern.modifiers">PCRE_UNGREEDY</link>  is set, the greediness of  the  repeti-
      tion  quantifiers  is inverted, that is, by default they are
      not greedy, but if followed by a question mark they are.
-    </literallayout>
+         </simpara>
+        </listitem>
+       </orderedlist>
+	  </para>
+ 	 </listitem>
+ 	</orderedlist></para>
    </refsect1>
 
    <refsect1 id="regexp.reference">
     <title>Regular Expression Details</title>
      <refsect2 id="regexp.introduction">
       <title>Introduction</title>
-      <literallayout>
+      <para>
      The syntax and semantics of  the  regular  expressions  sup-
      ported  by PCRE are described below. Regular expressions are
      also described in the Perl documentation and in a number  of
@@ -964,79 +1011,207 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      subject string from left to right. Most characters stand for
      themselves in a pattern, and match the corresponding charac-
      ters in the subject. As a trivial example, the pattern
-
-       The quick brown fox
-
+       <literal>The quick brown fox</literal>
      matches a portion of a subject string that is  identical  to
      itself.  
      </literallayout>
     </refsect2>
 	<refsect2 id="regexp.reference.meta">
 	 <title>Meta-caracters</title>
-     <literallayout>     
+     <para>     
      The  power  of  regular  expressions comes from the
      ability to include alternatives and repetitions in the  pat-
      tern.  These  are encoded in the pattern by the use of <emphasis>meta</emphasis>-
      <emphasis>characters</emphasis>, which do not stand for  themselves  but  instead
      are interpreted in some special way.
-
+    </para>
+    <para>
      There are two different sets of meta-characters: those  that
      are  recognized anywhere in the pattern except within square
      brackets, and those that are recognized in square  brackets.
      Outside square brackets, the meta-characters are as follows:
-
-       \      general escape character with several uses
-       ^      assert start of  subject  (or  line,  in  multiline
-     mode)
-       $      assert end of subject (or line, in multiline mode)
-       .      match any character except newline (by default)
-       [      start character class definition
-       |      start of alternative branch
-       (      start subpattern
-       )      end subpattern
-       ?      extends the meaning of (
-              also 0 or 1 quantifier
-              also quantifier minimizer
-       *      0 or more quantifier
-       +      1 or more quantifier
-       {      start min/max quantifier
+      <variablelist>
+       <varlistentry>
+		<term><emphasis>\</emphasis></term>
+   	    <listitem>
+	     <simpara>
+	      general escape character with several uses
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>^</emphasis></term>
+   	    <listitem>
+	     <simpara>
+	      assert start of  subject  (or  line,  in  multiline mode)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>$</emphasis></term>
+   	    <listitem>
+	     <simpara>
+	      assert end of subject (or line, in multiline mode)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>.</emphasis></term>
+   	    <listitem>
+	     <simpara>
+	      match any character except newline (by default)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>[</emphasis></term>
+   	    <listitem>
+	     <simpara>
+	       start character class definition
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>]</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          end character class definition
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>|</emphasis></term>
+   	    <listitem>
+	     <simpara>
+           start of alternative branch
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>(</emphasis></term>
+   	    <listitem>
+	     <simpara>
+           start subpattern
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>)</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          end subpattern
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>?</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          extends the meaning of (, also 0 or 1 quantifier, also quantifier minimizer
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>*</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          0 or more quantifier
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>+</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          1 or more quantifier
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>{</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          start min/max quantifier
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>}</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          end min/max quantifier
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+      </variablelist>
 
      Part of a pattern that is in square  brackets  is  called  a
      "character  class".  In  a  character  class  the only meta-
      characters are:
-
-       \      general escape character
-       ^      negate the class, but only if the first character
-       -      indicates character range
-       ]      terminates the character class
-
+      <variablelist>
+       <varlistentry>
+		<term><emphasis>\</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          general escape character
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>^</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          negate the class, but only if the first character
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>-</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          indicates character range
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>]</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          terminates the character class
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+      </variablelist>
      The following sections describe  the  use  of  each  of  the
      meta-characters.
      </literallayout>
     </refsect2>
 	<refsect2 id="regexp.reference.backslash">
 	 <title>backslash</title>
-     <literallayout>
+     <para>
      The backslash character has several uses. Firstly, if it  is
      followed  by  a  non-alphameric character, it takes away any
      special  meaning  that  character  may  have.  This  use  of
      backslash  as  an  escape  character applies both inside and
      outside character classes.
-
+    </para>
+    <para>
      For example, if you want to match a "*" character, you write
      "\*" in the pattern. This applies whether or not the follow-
      ing character would otherwise  be  interpreted  as  a  meta-
      character,  so it is always safe to precede a non-alphameric
      with "\" to specify that it stands for itself.  In  particu-
      lar, if you want to match a backslash, you write "\\".
-
-     If a pattern is compiled with the PCRE_EXTENDED option, whi-
+    </para>
+    <para>
+     If a pattern is compiled with the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>  option, whi-
      tespace in the pattern (other than in a character class) and
      characters between a "#" outside a character class  and  the
      next  newline  character  are ignored. An escaping backslash
      can be used to include a whitespace or "#" character as part
      of the pattern.
-
+    </para>
+    <para>
      A second use of backslash provides a way  of  encoding  non-
      printing  characters  in patterns in a visible manner. There
      is no restriction on the appearance of non-printing  charac-
@@ -1044,32 +1219,102 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      but when a pattern is being prepared by text editing, it  is
      usually  easier to use one of the following escape sequences
      than the binary character it represents:
-
-       \a     alarm, that is, the BEL character (hex 07)
-       \cx    "control-x", where x is any character
-       \e     escape (hex 1B)
-       \f     formfeed (hex 0C)
-       \n     newline (hex 0A)
-       \r     carriage return (hex 0D)
-       \t     tab (hex 09)
-       \xhh   character with hex code hh
-       \ddd   character with octal code ddd, or backreference
-
-     The precise effect of "\cx" is as follows: if "x" is a lower
+    </para>
+    <para>
+      <variablelist>
+       <varlistentry>
+		<term><emphasis>\a</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          alarm, that is, the BEL character (hex 07)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\cx</emphasis></term>
+   	    <listitem>
+	     <simpara>
+           "control-x", where x is any character
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\e</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          escape (hex 1B)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\f</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          formfeed (hex 0C)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\n</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          newline (hex 0A)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\r</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          carriage return (hex 0D)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\t</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          tab (hex 09)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\xhh</emphasis></term>
+   	    <listitem>
+	     <simpara>
+           character with hex code hh
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\ddd</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          character with octal code ddd, or backreference
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+      </variablelist>
+    </para>
+    <para>
+     The precise effect of "<literal>\cx</literal>" is as follows: if "<literal>x</literal>" is a lower
      case  letter,  it  is converted to upper case. Then bit 6 of
-     the character (hex 40) is inverted.  Thus "\cz" becomes  hex
-     1A, but "\c{" becomes hex 3B, while "\c;" becomes hex 7B.
-
-     After "\x", up to two hexadecimal digits are  read  (letters
+     the character (hex 40) is inverted.  Thus "<literal>\cz</literal>" becomes  hex
+     1A, but "<literal>\c{</literal>" becomes hex 3B, while "<literal>\c;</literal>" becomes hex 7B.
+    </para>
+    <para>
+     After "<literal>\x</literal>", up to two hexadecimal digits are  read  (letters
      can be in upper or lower case).
-
-     After "\0" up to two further octal digits are read. In  both
+    </para>
+    <para>
+     After "<literal>\0</literal>" up to two further octal digits are read. In  both
      cases,  if  there are fewer than two digits, just those that
-     are present are used. Thus the sequence "\0\x\07"  specifies
+     are present are used. Thus the sequence "<literal>\0\x\07</literal>"  specifies
      two binary zeros followed by a BEL character.  Make sure you
      supply two digits after the initial zero  if  the  character
      that follows is itself an octal digit.
-
+    </para>
+    <para>
      The handling of a backslash followed by a digit other than 0
      is  complicated.   Outside  a character class, PCRE reads it
      and any following digits as a decimal number. If the  number
@@ -1078,100 +1323,261 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      entire  sequence is taken as a <emphasis>back</emphasis> <emphasis>reference</emphasis>. A description
      of how this works is given later, following  the  discussion
      of parenthesized subpatterns.
-
+    </para>
+    <para>
      Inside a character  class,  or  if  the  decimal  number  is
      greater  than  9 and there have not been that many capturing
      subpatterns, PCRE re-reads up to three octal digits  follow-
      ing  the  backslash,  and  generates  a single byte from the
      least significant 8 bits of the value. Any subsequent digits
      stand for themselves.  For example:
-
-       \040   is another way of writing a space
-       \40    is the same, provided there are fewer than 40
-                 previous capturing subpatterns
-       \7     is always a back reference
-       \11    might be a back reference, or another way of
-                 writing a tab
-       \011   is always a tab
-       \0113  is a tab followed by the character "3"
-       \113   is the character with octal code 113 (since there
-                 can be no more than 99 back references)
-       \377   is a byte consisting entirely of 1 bits
-       \81    is either a back reference, or a binary zero
-                 followed by the two characters "8" and "1"
-
+    </para>
+    <para>
+      <variablelist>
+       <varlistentry>
+		<term><emphasis>\040</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          is another way of writing a space
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\40</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          is the same, provided there are fewer than 40
+          previous capturing subpatterns
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\7</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          is always a back reference
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\11</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          might be a back reference, or another way of
+          writing a tab
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\011</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          is always a tab
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\0113</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          is a tab followed by the character "3"
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\113</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          is the character with octal code 113 (since there
+          can be no more than 99 back references)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\377</emphasis></term>
+   	    <listitem>
+	     <simpara>
+           is a byte consisting entirely of 1 bits
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\81</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          is either a back reference, or a binary zero
+          followed by the two characters "8" and "1"
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+      </variablelist></para>
+    <para>
      Note that octal values of 100 or greater must not be  intro-
      duced  by  a  leading zero, because no more than three octal
      digits are ever read.
-
+    </para>
+    <para>
      All the sequences that define a single  byte  value  can  be
      used both inside and outside character classes. In addition,
-     inside a character class, the sequence "\b"  is  interpreted
+     inside a character class, the sequence "<literal>\b</literal>" is  interpreted
      as  the  backspace  character  (hex 08). Outside a character
      class it has a different meaning (see below).
-
+    </para>
+    <para>
      The third use of backslash is for specifying generic charac-
      ter types:
-
-       \d     any decimal digit
-       \D     any character that is not a decimal digit
-       \s     any whitespace character
-       \S     any character that is not a whitespace character
-       \w     any "word" character
-       \W     any "non-word" character
-
+    </para>
+    <para>
+      <variablelist>
+       <varlistentry>
+		<term><emphasis>\d</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          any decimal digit
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\D</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          any character that is not a decimal digit
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\s</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          any whitespace character
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\S</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          any character that is not a whitespace character
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\w</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          any "word" character
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\W</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          any "non-word" character
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+      </variablelist>
+    </para>
+    <para>
      Each pair of escape sequences partitions the complete set of
      characters  into  two  disjoint  sets.  Any  given character
      matches one, and only one, of each pair.
-
+    </para>
+    <para>
      A "word" character is any letter or digit or the  underscore
      character,  that  is,  any  character which can be part of a
-     Perl "word". The definition of letters and  digits  is  con-
-     trolled  by PCRE's character tables, and may vary if locale-
-     specific matching is  taking  place  (see  "Locale  support"
+     Perl "<literal>word</literal>". The definition of letters and digits is  
+     controlled  by PCRE's character tables, and may vary if locale-specific
+     matching is  taking  place  (see  "Locale  support"
      above). For example, in the "fr" (French) locale, some char-
      acter codes greater than 128 are used for accented  letters,
-     and these are matched by \w.
-
+     and these are matched by <literal>\w</literal>.
+    </para>
+    <para>
      These character type sequences can appear  both  inside  and
      outside  character classes. They each match one character of
      the appropriate type. If the current matching  point  is  at
      the end of the subject string, all of them fail, since there
      is no character to match.
-
+    </para>
+    <para>
      The fourth use of backslash is  for  certain  simple  asser-
      tions. An assertion specifies a condition that has to be met
      at a particular point in  a  match,  without  consuming  any
      characters  from  the subject string. The use of subpatterns
      for more complicated  assertions  is  described  below.  The
      backslashed assertions are
-
-       \b     word boundary
-       \B     not a word boundary
-       \A     start of subject (independent of multiline mode)
-       \Z     end of subject or newline at  end  (independent  of
-     multiline mode)
-       \z     end of subject (independent of multiline mode)
-
+    </para>
+    <para>
+      <variablelist>
+       <varlistentry>
+		<term><emphasis>\b</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          word boundary
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\B</emphasis></term>
+   	    <listitem>
+	     <simpara>
+           not a word boundary
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\A</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          start of subject (independent of multiline mode)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\Z</emphasis></term>
+   	    <listitem>
+	     <simpara>
+         end of subject or newline at  end  (independent  of
+         multiline mode)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+       <varlistentry>
+		<term><emphasis>\z</emphasis></term>
+   	    <listitem>
+	     <simpara>
+          end of subject (independent of multiline mode)
+	     </simpara>
+	    </listitem>
+       </varlistentry>
+      </variablelist>
+    </para>
+    <para>
      These assertions may not appear in  character  classes  (but
-     note that "\b" has a different meaning, namely the backspace
+     note that "<literal>\b</literal>" has a different meaning, namely the backspace
      character, inside a character class).
-
+    </para>
+    <para>
      A word boundary is a position in the  subject  string  where
      the current character and the previous character do not both
-     match \w or \W (i.e. one matches \w and  the  other  matches
-     \W),  or the start or end of the string if the first or last
+     match <literal>\w</literal> or <literal>\W</literal> (i.e. one matches 
+     <literal>\w</literal> and  the  other  matches
+     <literal>\W</literal>),  or the start or end of the string if the first or last
      character matches \w, respectively.
-
-     The \A, \Z, and \z assertions differ  from  the  traditional
+    </para>
+    <para>
+     The <literal>\A</literal>, <literal>\Z</literal>, and <literal>\z</literal> assertions differ  from  the  traditional
      circumflex  and  dollar  (described below) in that they only
      ever match at the very start and end of the subject  string,
      whatever  options  are  set.  They  are  not affected by the
-     PCRE_NOTBOL or PCRE_NOTEOL options. The  difference  between
-     \Z  and  \z  is that \Z matches before a newline that is the
+     <link linkend="pcre.pattern.modifiers">PCRE_NOTBOL</link>  or <link linkend="pcre.pattern.modifiers">PCRE_NOTEOL</link>  options. The  difference  between
+     <literal>\Z</literal>  and  <literal>\z</literal>  is that <literal>\Z</literal>
+     matches before a newline that is the
      last character of the string as well as at the  end  of  the
-     string, whereas \z matches only at the end.
-     </literallayout>
+     string, whereas <literal>\z</literal> matches only at the end.
+     </para>
     </refsect2>
 	<refsect2 id="regexp.reference.circudollar">
 	 <title>Cicumflex and dollar</title>
@@ -1202,11 +1608,11 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
 
      The meaning of dollar can be changed so that it matches only
      at   the   very   end   of   the   string,  by  setting  the
-     PCRE_DOLLAR_ENDONLY option at compile or matching time. This
+     <link linkend="pcre.pattern.modifiers">PCRE_DOLLAR_ENDONLY</link>  option at compile or matching time. This
      does not affect the \Z assertion.
 
      The meanings of the circumflex  and  dollar  characters  are
-     changed  if  the  PCRE_MULTILINE option is set. When this is
+     changed  if  the  <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>  option is set. When this is
      the case,  they  match  immediately  after  and  immediately
      before an internal "\n" character, respectively, in addition
      to matching at the start and end of the subject string.  For
@@ -1214,13 +1620,13 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      "def\nabc" in multiline  mode,  but  not  otherwise.  Conse-
      quently,  patterns  that  are  anchored  in single line mode
      because all branches start with "^" are not anchored in mul-
-     tiline  mode.  The  PCRE_DOLLAR_ENDONLY option is ignored if
-     PCRE_MULTILINE is set.
+     tiline  mode.  The  <link linkend="pcre.pattern.modifiers">PCRE_DOLLAR_ENDONLY</link>  option is ignored if
+     <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>  is set.
 
      Note that the sequences \A, \Z, and \z can be used to  match
      the  start  and end of the subject in both modes, and if all
      branches of a pattern start with \A is it  always  anchored,
-     whether PCRE_MULTILINE is set or not.
+     whether <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>  is set or not.
      </literallayout>
     </refsect2>
 	<refsect2 id="regexp.reference.dot">
@@ -1228,7 +1634,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      <literallayout>
      Outside a character class, a dot in the pattern matches  any
      one  character  in  the  subject,  including  a non-printing
-     character, but not (by default) newline.  If the PCRE_DOTALL
+     character, but not (by default) newline.  If the <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> 
      option  is  set,  then dots match newlines as well. The han-
      dling of dot is entirely independent of the handling of cir-
      cumflex  and  dollar,  the only relationship being that they
@@ -1270,8 +1676,8 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      ful version would.
 
      The newline character is never treated in any special way in
-     character  classes,  whatever the setting of the PCRE_DOTALL
-     or PCRE_MULTILINE options is. A  class  such  as  [^a]  will
+     character  classes,  whatever the setting of the <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> 
+     or <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>  options is. A  class  such  as  [^a]  will
      always match a newline.
 
      The minus (hyphen) character can be used to specify a  range
@@ -1336,21 +1742,23 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
 	<refsect2 id="regexp.reference.internal_options">
 	 <title>Internal option setting</title>
      <literallayout>
-     The settings of PCRE_CASELESS, PCRE_MULTILINE,  PCRE_DOTALL,
-     and  PCRE_EXTENDED can be changed from within the pattern by
+     The settings of <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link>,
+     <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>,
+     <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>,
+     and  <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>  can be changed from within the pattern by
      a sequence of Perl option letters enclosed between "(?"  and
      ")". The option letters are
 
-       i  for PCRE_CASELESS
-       m  for PCRE_MULTILINE
-       s  for PCRE_DOTALL
-       x  for PCRE_EXTENDED
+       i  for <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link> 
+       m  for <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link> 
+       s  for <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> 
+       x  for <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link> 
 
      For example, (?im) sets caseless, multiline matching. It  is
      also possible to unset these options by preceding the letter
      with a hyphen, and a combined setting and unsetting such  as
-     (?im-sx),  which sets PCRE_CASELESS and PCRE_MULTILINE while
-     unsetting PCRE_DOTALL and PCRE_EXTENDED, is also  permitted.
+     (?im-sx),  which sets <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link>  and <link linkend="pcre.pattern.modifiers">PCRE_MULTILINE</link>  while
+     unsetting <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>  and <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>, is also  permitted.
      If  a  letter  appears both before and after the hyphen, the
      option is unset.
 
@@ -1366,7 +1774,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
        abc(?i)
 
      which in turn is the same as compiling the pattern abc  with
-     PCRE_CASELESS  set.   In  other words, such "top level" set-
+     <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link>   set.   In  other words, such "top level" set-
      tings apply to the whole pattern  (unless  there  are  other
      changes  inside subpatterns). If there is more than one set-
      ting of the same option at top level, the rightmost  setting
@@ -1380,7 +1788,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
        (a(?i)b)c
 
      matches  abc  and  aBc  and  no  other   strings   (assuming
-     PCRE_CASELESS  is  not used).  By this means, options can be
+     <link linkend="pcre.pattern.modifiers">PCRE_CASELESS</link>   is  not used).  By this means, options can be
      made to have different settings in different  parts  of  the
      pattern.  Any  changes  made  in one alternative do carry on
      into subsequent branches within  the  same  subpattern.  For
@@ -1394,7 +1802,8 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      compile  time. There would be some very weird behaviour oth-
      erwise.
 
-     The PCRE-specific options PCRE_UNGREEDY and  PCRE_EXTRA  can
+     The PCRE-specific options <link linkend="pcre.pattern.modifiers">PCRE_UNGREEDY</link>  and  
+     <link linkend="pcre.pattern.modifiers">PCRE_EXTRA</link>   can
      be changed in the same way as the Perl-compatible options by
      using the characters U and X  respectively.  The  (?X)  flag
      setting  is  special in that it must always occur earlier in
@@ -1564,7 +1973,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      which matches one digit by preference, but can match two  if
      that is the only way the rest of the pattern matches.
 
-     If the PCRE_UNGREEDY option is set (an option which  is  not
+     If the <link linkend="pcre.pattern.modifiers">PCRE_UNGREEDY</link>  option is set (an option which  is  not
      available  in  Perl)  then the quantifiers are not greedy by
      default, but individual ones can be made greedy by following
      them  with  a  question mark. In other words, it inverts the
@@ -1575,7 +1984,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      imum, more store is required for the  compiled  pattern,  in
      proportion to the size of the minimum or maximum.
 
-     If a pattern starts with .* or  .{0,}  and  the  PCRE_DOTALL
+     If a pattern starts with .* or  .{0,}  and  the  <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link> 
      option (equivalent to Perl's /s) is set, thus allowing the .
      to match newlines, then the pattern is implicitly  anchored,
      because whatever follows will be tried against every charac-
@@ -1583,7 +1992,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      retrying  the overall match at any position after the first.
      PCRE treats such a pattern as though it were preceded by \A.
      In  cases where it is known that the subject string contains
-     no newlines, it is worth setting PCRE_DOTALL when  the  pat-
+     no newlines, it is worth setting <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>  when  the  pat-
      tern begins with .* in order to obtain this optimization, or
      alternatively using ^ to indicate anchoring explicitly.
 
@@ -1652,7 +2061,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      following the backslash are taken as  part  of  a  potential
      back reference number. If the pattern continues with a digit
      character, then some delimiter must be used to terminate the
-     back reference. If the PCRE_EXTENDED option is set, this can
+     back reference. If the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>  option is set, this can
      be whitespace.  Otherwise an empty comment can be used.
 
      A back reference that occurs inside the parentheses to which
@@ -1925,7 +2334,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      condition is satisfied if the capturing subpattern  of  that
      number  has  previously matched. Consider the following pat-
      tern, which contains non-significant white space to make  it
-     more  readable  (assume  the  PCRE_EXTENDED  option)  and to
+     more  readable  (assume  the  <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>   option)  and to
      divide it into three parts for ease of discussion:
 
        ( \( )?    [^()]+    (?(1) \) )
@@ -1970,7 +2379,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      parentheses are not permitted. The characters that make up a
      comment play no part in the pattern matching at all.
 
-     If the PCRE_EXTENDED option is set, an unescaped # character
+     If the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>  option is set, an unescaped # character
      outside  a character class introduces a comment that contin-
      ues up to the next newline character in the pattern.
      </literallayout>
@@ -1987,7 +2396,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      expressions to recurse (amongst other things).  The  special 
      item (?R) is  provided for  the specific  case of recursion. 
      This PCRE  pattern  solves the  parentheses  problem (assume 
-     the PCRE_EXTENDED option is set so that white space is 
+     the <link linkend="pcre.pattern.modifiers">PCRE_EXTENDED</link>  option is set so that white space is 
      ignored):
 
        \( ( (?>[^()]+) | (?R) )* \)
@@ -2046,10 +2455,10 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
      Friedl's  book contains a lot of discussion about optimizing
      regular expressions for efficient performance.
 
-     When a pattern begins with .* and the PCRE_DOTALL option  is
+     When a pattern begins with .* and the <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>  option  is
      set,  the  pattern  is implicitly anchored by PCRE, since it
      can match only at the start of a subject string. However, if
-     PCRE_DOTALL  is not set, PCRE cannot make this optimization,
+     <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>   is not set, PCRE cannot make this optimization,
      because the . metacharacter does not then match  a  newline,
      and if the subject string contains newlines, the pattern may
      match from the character immediately following one  of  them
@@ -2064,7 +2473,7 @@ $fl_array = preg_grep ("/^(\d+)?\.\d+$/", $array);
 
      If you are using such a pattern with subject strings that do
      not  contain  newlines,  the best performance is obtained by
-     setting PCRE_DOTALL, or starting the  pattern  with  ^.*  to
+     setting <link linkend="pcre.pattern.modifiers">PCRE_DOTALL</link>, or starting the  pattern  with  ^.*  to
      indicate  explicit anchoring. That saves PCRE from having to
      scan along the subject looking for a newline to restart at.