mirror of
https://github.com/sigmasternchen/php-doc-en
synced 2025-03-26 13:58:55 +00:00
225 lines
6.9 KiB
XML
225 lines
6.9 KiB
XML
<?xml version="1.0" encoding="utf-8"?>
|
|
<!-- $Revision$ -->
|
|
<refentry xml:id="function.mb-detect-encoding" xmlns="http://docbook.org/ns/docbook" xmlns:xlink="http://www.w3.org/1999/xlink">
|
|
<refnamediv>
|
|
<refname>mb_detect_encoding</refname>
|
|
<refpurpose>Detect character encoding</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsect1 role="description">
|
|
&reftitle.description;
|
|
<methodsynopsis>
|
|
<type class="union"><type>string</type><type>false</type></type><methodname>mb_detect_encoding</methodname>
|
|
<methodparam><type>string</type><parameter>string</parameter></methodparam>
|
|
<methodparam choice="opt"><type class="union"><type>array</type><type>string</type><type>null</type></type><parameter>encodings</parameter><initializer>&null;</initializer></methodparam>
|
|
<methodparam choice="opt"><type>bool</type><parameter>strict</parameter><initializer>&false;</initializer></methodparam>
|
|
</methodsynopsis>
|
|
<para>
|
|
Detects the most likely character encoding for <type>string</type> <parameter>string</parameter>
|
|
from an ordered list of candidates.
|
|
</para>
|
|
<para>
|
|
Automatic detection of the intended character encoding can never be entirely reliable;
|
|
without some additional information, it is similar to decoding an encrypted string
|
|
without the key. It is always preferable to use an indication of character encoding
|
|
stored or transmitted with the data, such as a "Content-Type" HTTP header.
|
|
</para>
|
|
<para>
|
|
This function is most useful with multibyte encodings, where not all sequences of
|
|
bytes form a valid string. If the input string contains a byte sequence that is
|
|
not valid in a given encoding, that encoding will be rejected, and the next encoding checked.
|
|
</para>
|
|
</refsect1>
|
|
|
|
<refsect1 role="parameters">
|
|
&reftitle.parameters;
|
|
<para>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term><parameter>string</parameter></term>
|
|
<listitem>
|
|
<para>
|
|
The <type>string</type> being inspected.
|
|
</para>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><parameter>encodings</parameter></term>
|
|
<listitem>
|
|
<para>
|
|
A list of character encodings to try, in order. The list may be specified as
|
|
an array of strings, or as a single string of encoding names separated by commas.
|
|
</para>
|
|
<para>
|
|
If <parameter>encodings</parameter> is omitted or &null;,
|
|
the current detect_order (set with the <link linkend="ini.mbstring.detect-order">
|
|
mbstring.detect_order</link> configuration option, or the <function>mb_detect_order</function>
|
|
function) will be used.
|
|
</para>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term><parameter>strict</parameter></term>
|
|
<listitem>
|
|
<para>
|
|
Controls the behaviour when <parameter>string</parameter>
|
|
is not valid in any of the listed <parameter>encodings</parameter>.
|
|
If <parameter>strict</parameter> is set to &false;, the closest matching
|
|
encoding will be returned; if <parameter>strict</parameter> is set to &true;,
|
|
&false; will be returned.
|
|
</para>
|
|
<para>
|
|
The default value for <parameter>strict</parameter> can be set
|
|
with the <link linkend="ini.mbstring.strict-detection">
|
|
mbstring.strict_detection</link> configuration option.
|
|
</para>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</para>
|
|
</refsect1>
|
|
|
|
<refsect1 role="returnvalues">
|
|
&reftitle.returnvalues;
|
|
<para>
|
|
The detected character encoding, or &false; if the string is not valid
|
|
in any of the listed encodings.
|
|
</para>
|
|
</refsect1>
|
|
|
|
<refsect1 role="examples">
|
|
&reftitle.examples;
|
|
<para>
|
|
<example>
|
|
<title><function>mb_detect_encoding</function> example</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
<?php
|
|
// Detect character encoding with current detect_order
|
|
echo mb_detect_encoding($str);
|
|
|
|
// "auto" is expanded according to mbstring.language
|
|
echo mb_detect_encoding($str, "auto");
|
|
|
|
// Specify the "encodings" parameter as a comma-separated list
|
|
echo mb_detect_encoding($str, "JIS, eucjp-win, sjis-win");
|
|
|
|
// Use an array to specify the "encodings" parameter
|
|
$encodings = [
|
|
"ASCII",
|
|
"JIS",
|
|
"EUC-JP"
|
|
];
|
|
echo mb_detect_encoding($str, $encodings);
|
|
?>
|
|
]]>
|
|
</programlisting>
|
|
</example>
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Effect of <parameter>strict</parameter> parameter</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
<?php
|
|
// 'áéóú' encoded in ISO-8859-1
|
|
$str = "\xE1\xE9\xF3\xFA";
|
|
|
|
// The string is not valid ASCII or UTF-8, but UTF-8 is considered a closer match
|
|
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8'], false));
|
|
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8'], true));
|
|
|
|
// If a valid encoding is found, the strict parameter does not change the result
|
|
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8', 'ISO-8859-1'], false));
|
|
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8', 'ISO-8859-1'], true));
|
|
?>
|
|
]]>
|
|
</programlisting>
|
|
&example.outputs;
|
|
<screen>
|
|
<![CDATA[
|
|
string(5) "UTF-8"
|
|
bool(false)
|
|
string(10) "ISO-8859-1"
|
|
string(10) "ISO-8859-1"
|
|
]]>
|
|
</screen>
|
|
</example>
|
|
</para>
|
|
<para>
|
|
In some cases, the same sequence of bytes may form a valid string in multiple
|
|
character encodings, and it is impossible to know which interpretation was
|
|
intended. For instance, among many others, the byte sequence "\xC4\xA2" could be:
|
|
</para>
|
|
<para>
|
|
<simplelist>
|
|
<member>
|
|
"Ä¢" (U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS followed by U+00A2 CENT SIGN)
|
|
encoded in any of ISO-8859-1, ISO-8859-15, or Windows-1252
|
|
</member>
|
|
<member>
|
|
"ФЂ" (U+0424 CYRILLIC CAPITAL LETTER EF followed by U+0402 CYRILLIC CAPITAL LETTER
|
|
DJE) encoded in ISO-8859-5
|
|
</member>
|
|
<member>
|
|
"Ģ" (U+0122 LATIN CAPITAL LETTER G WITH CEDILLA) encoded in UTF-8
|
|
</member>
|
|
</simplelist>
|
|
</para>
|
|
<para>
|
|
<example>
|
|
<title>Effect of order when multiple encodings match</title>
|
|
<programlisting role="php">
|
|
<![CDATA[
|
|
<?php
|
|
$str = "\xC4\xA2";
|
|
|
|
// The string is valid in all three encodings, so the first one listed will be returned
|
|
var_dump(mb_detect_encoding($str, ['UTF-8', 'ISO-8859-1', 'ISO-8859-5']));
|
|
var_dump(mb_detect_encoding($str, ['ISO-8859-1', 'ISO-8859-5', 'UTF-8']));
|
|
var_dump(mb_detect_encoding($str, ['ISO-8859-5', 'UTF-8', 'ISO-8859-1']));
|
|
?>
|
|
]]>
|
|
</programlisting>
|
|
&example.outputs;
|
|
<screen>
|
|
<![CDATA[
|
|
string(5) "UTF-8"
|
|
string(10) "ISO-8859-1"
|
|
string(10) "ISO-8859-5"
|
|
]]>
|
|
</screen>
|
|
</example>
|
|
</para>
|
|
</refsect1>
|
|
|
|
<refsect1 role="seealso">
|
|
&reftitle.seealso;
|
|
<para>
|
|
<simplelist>
|
|
<member><function>mb_detect_order</function></member>
|
|
</simplelist>
|
|
</para>
|
|
</refsect1>
|
|
|
|
</refentry>
|
|
<!-- Keep this comment at the end of the file
|
|
Local variables:
|
|
mode: sgml
|
|
sgml-omittag:t
|
|
sgml-shorttag:t
|
|
sgml-minimize-attributes:nil
|
|
sgml-always-quote-attributes:t
|
|
sgml-indent-step:1
|
|
sgml-indent-data:t
|
|
indent-tabs-mode:nil
|
|
sgml-parent-document:nil
|
|
sgml-default-dtd-file:"~/.phpdoc/manual.ced"
|
|
sgml-exposed-tags:nil
|
|
sgml-local-catalogs:nil
|
|
sgml-local-ecat-files:nil
|
|
End:
|
|
vim600: syn=xml fen fdm=syntax fdl=2 si
|
|
vim: et tw=78 syn=sgml
|
|
vi: ts=1 sw=1
|
|
-->
|