mirror of
https://github.com/sigmasternchen/php-doc-en
synced 2025-03-27 14:28:56 +00:00

git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@78496 c90b9560-bf6c-de11-be94-00142212c4b1
148 lines
5.2 KiB
XML
148 lines
5.2 KiB
XML
<?xml version="1.0" encoding="iso-8859-1"?>
|
|
<!-- $Revision: 1.2 $ -->
|
|
<!-- split from ./en/functions/strings.xml, last change in rev 1.12 -->
|
|
<refentry id="function.levenshtein">
|
|
<refnamediv>
|
|
<refname>levenshtein</refname>
|
|
<refpurpose>
|
|
Calculate Levenshtein distance between two strings
|
|
</refpurpose>
|
|
</refnamediv>
|
|
<refsect1>
|
|
<title>Description</title>
|
|
<methodsynopsis>
|
|
<type>int</type><methodname>levenshtein</methodname>
|
|
<methodparam><type>string</type><parameter>str1</parameter></methodparam>
|
|
<methodparam><type>string</type><parameter>str2</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<methodsynopsis>
|
|
<type>int</type><methodname>levenshtein</methodname>
|
|
<methodparam><type>string</type><parameter>str1</parameter></methodparam>
|
|
<methodparam><type>string</type><parameter>str2</parameter></methodparam>
|
|
<methodparam><type>int</type><parameter>cost_ins</parameter></methodparam>
|
|
<methodparam><type>int</type><parameter>cost_rep</parameter></methodparam>
|
|
<methodparam><type>int</type><parameter>cost_del</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<methodsynopsis>
|
|
<type>int</type><methodname>levenshtein</methodname>
|
|
<methodparam><type>string</type><parameter>str1</parameter></methodparam>
|
|
<methodparam><type>string</type><parameter>str2</parameter></methodparam>
|
|
<methodparam><type>function</type><parameter>cost</parameter></methodparam>
|
|
</methodsynopsis>
|
|
<para>
|
|
This function returns the Levenshtein distance between the
|
|
two argument strings, or -1 if one of the argument strings
|
|
is longer than the limit of 255 characters (255 should be
|
|
more than enough for name or dictionary comparison, and
|
|
nobody serious would be doing genetic analysis with PHP).
|
|
</para>
|
|
<para>
|
|
The Levenshtein distance is defined as the minimal number of
|
|
characters you have to replace, insert or delete to transform
|
|
<parameter>str1</parameter> into <parameter>str2</parameter>.
|
|
The complexity of the algorithm is <literal>O(m*n)</literal>,
|
|
where <literal>n</literal> and <literal>m</literal> are the
|
|
lengths of <parameter>str1</parameter> and
|
|
<parameter>str2</parameter> (rather good when compared to
|
|
<function>similar_text</function>, which is <literal>O(max(n,m)**3)</literal>,
|
|
but still expensive).
|
|
</para>
|
|
<para>
|
|
In its simplest form the function will take only the two
|
|
strings as parameters and will calculate just the number of
|
|
insert, replace and delete operations needed to transform
|
|
<parameter>str1</parameter> into <parameter>str2</parameter>.
|
|
</para>
|
|
<para>
|
|
A second variant will take three additional parameters that
|
|
define the cost of insert, replace and delete operations. This
|
|
is more general and adaptive than variant one, but not as
|
|
efficient.
|
|
</para>
|
|
<para>
|
|
The third variant (which is not implemented yet) will be the most
|
|
general and adaptive, but also the slowest alternative. It will
|
|
call a user-supplied function that will determine the cost for
|
|
every possible operation.
|
|
</para>
|
|
<para>
|
|
The user-supplied function will be called with the following
|
|
arguments:
|
|
<itemizedlist>
|
|
<listitem>
|
|
<simpara>
|
|
operation to apply: 'I', 'R' or 'D'
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
current character in string 1
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
current character in string 2
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
position in string 1
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
position in string 2
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
remaining characters in string 1
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
remaining characters in string 2
|
|
</simpara>
|
|
</listitem>
|
|
</itemizedlist>
|
|
The user-supplied function has to return a positive integer
|
|
describing the cost for this particular operation, but it may
|
|
decide to use only some of the supplied arguments.
|
|
</para>
|
|
<para>
|
|
The user-supplied function approach offers the possibility to
|
|
take into account the relevance of and/or difference between
|
|
certain symbols (characters) or even the context those symbols
|
|
appear in to determine the cost of insert, replace and delete
|
|
operations, but at the cost of losing all optimizations done
|
|
regarding CPU register utilization and cache misses that have
|
|
been worked into the other two variants.
|
|
</para>
|
|
<para>
|
|
See also <function>soundex</function>,
|
|
<function>similar_text</function>, and
|
|
<function>metaphone</function>.
|
|
</para>
|
|
</refsect1>
|
|
</refentry>
|
|
|
|
<!-- Keep this comment at the end of the file
|
|
Local variables:
|
|
mode: sgml
|
|
sgml-omittag:t
|
|
sgml-shorttag:t
|
|
sgml-minimize-attributes:nil
|
|
sgml-always-quote-attributes:t
|
|
sgml-indent-step:1
|
|
sgml-indent-data:t
|
|
indent-tabs-mode:nil
|
|
sgml-parent-document:nil
|
|
sgml-default-dtd-file:"../../../../manual.ced"
|
|
sgml-exposed-tags:nil
|
|
sgml-local-catalogs:nil
|
|
sgml-local-ecat-files:nil
|
|
End:
|
|
vim600: syn=xml fen fdm=syntax fdl=2 si
|
|
vim: et tw=78 syn=sgml
|
|
vi: ts=1 sw=1
|
|
-->
|