From 08516232aa30a89aa6fe4880761f40fb75aa1404 Mon Sep 17 00:00:00 2001
From: Hartmut Holzgraefe <hholzgra@php.net>
Date: Wed, 27 Sep 2000 20:41:17 +0000
Subject: [PATCH] more on Levenshtein ...

git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@33019 c90b9560-bf6c-de11-be94-00142212c4b1
---
 functions/strings.xml | 109 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 98 insertions(+), 11 deletions(-)
diff --git a/functions/strings.xml b/functions/strings.xml
index b22e9ae44a..7fdabd0429 100644
--- a/functions/strings.xml
+++ b/functions/strings.xml
@@ -930,15 +930,31 @@ $colon_separated = implode (":", $array);
     <title>Description</title>
     <funcsynopsis>
      <funcprototype>
-      <funcdef>int <function>levenshtein</function></funcdef>
-      <paramdef>string <parameter>str1</parameter></paramdef>
-      <paramdef>string <parameter>str2</parameter></paramdef>
+			<funcdef>int <function>levenshtein</function></funcdef>
+			<paramdef>string <parameter>str1</parameter></paramdef>
+			<paramdef>string <parameter>str2</parameter></paramdef>
+		 </funcprototype>
+		 <funcprototype>
+			<funcdef>int <function>levenshtein</function></funcdef>
+			<paramdef>string <parameter>str1</parameter></paramdef>
+			<paramdef>string <parameter>str2</parameter></paramdef>
+			<paramdef>int <parameter>cost_ins</parameter></paramdef>
+			<paramdef>int <parameter>cost_rep</parameter></paramdef>
+			<paramdef>int <parameter>cost_del</parameter></paramdef>
+		 </funcprototype>
+		 <funcprototype>
+			<funcdef>int <function>levenshtein</function></funcdef>
+			<paramdef>string <parameter>str1</parameter></paramdef>
+			<paramdef>string <parameter>str2</parameter></paramdef>
+			<paramdef>function <parameter>cost</parameter></paramdef>
      </funcprototype>
     </funcsynopsis>
     <para>
-     This function return the Levenshtein-Distance between the two
-     argument strings or -1, if one of the argument strings is longer
-     than the limit of 255 characters.
+		 This function returns the Levenshtein-Distance between the
+     two argument strings or -1, if one of the argument strings
+     is longer than the limit of 255 characters (255 should be
+		 more than enough for name or dictionary comparison, and 
+     nobody serious would be doing genetic analysis with PHP).
     </para>
     <para>
      The Levenshtein distance is defined as the minimal number of
@@ -948,13 +964,84 @@ $colon_separated = implode (":", $array);
      where <literal>n</literal> and <literal>m</literal> are the
      length of <parameter>str1</parameter> and
      <parameter>str2</parameter> (rather good when compared to
-     <function>similar_text</function>, which is O(max(n,m)**3), but
-     still expensive).
+     <function>similar_text</function>, which is O(max(n,m)**3),
+     but still expensive).  
     </para>
+	  <para>
+		 In its simplest form the function will take only the two
+     strings as parameters and will calculate just the number of
+     insert, replace and delete operations needed to transform
+     <parameter>str1</parameter> into <parameter>str2</parameter>.
+    </para>
+	  <para> 
+		 A second variant will take three additional parameters that
+		 define the cost of insert, replace and delete operations.
+     This is more general and adaptive than variant one, but not
+     as efficient.
+		</para>
+	  <para>  
+		 The third variant (which is not implemented yet) will be
+		 the most general and adaptive, but also the slowest alternative.
+		 It will call a user-supplied function that will determine the
+		 cost for every possible operation.
+		</para>
+	  <para>
+		 The user-supplied function will be called with the following 
+     arguments:
+     <itemizedlist>
+      <listitem>
+       <simpara>
+		 	  operation to apply: 'I', 'R' or 'D'
+       </simpara>
+      </listitem>
+      <listitem>
+       <simpara>
+		 	  actual character in string 1
+       </simpara>
+      </listitem>
+      <listitem>
+       <simpara>
+			  actual character in string 2
+       </simpara>
+      </listitem>
+      <listitem>
+       <simpara>
+			  position in string 1
+       </simpara>
+      </listitem>
+      <listitem>
+       <simpara>
+		 	  position in string 2
+       </simpara>
+      </listitem>
+      <listitem>
+       <simpara>
+			  remaining characters in string 1
+       </simpara>
+      </listitem>
+      <listitem>
+       <simpara>
+			  remaining characters in string 2
+       </simpara>
+      </listitem>
+     </itemizedlist>
+		 The user-supplied function has to return a positive integer
+		 describing the cost for this particular operation, but it
+		 may decide to use only some of the supplied arguments.
+		</para>
+		<para> 
+		 The user-supplied function approach offers the possibility to
+		 take into account the relevance of and/or difference between 
+     certain symbols (characters) or even the context those symbols
+     appear in to determine the cost of insert, replace and delete 
+     operations, but at the cost of losing all optimizations done
+     regarding cpu register utilization and cache misses that have
+     been worked into the other two variants. 
+		</para>
     <para>
-     See also <function>soundex</function>,
-     <function>similar_text</function> and
-     <function>metaphone</function>.
+     See also <function>soundex</function>, 
+     <function>similar_text</function>
+		 and <function>metaphone</function>.
     </para>
    </refsect1>
   </refentry>