mirror of
https://github.com/sigmasternchen/php-doc-en
synced 2025-03-16 08:58:56 +00:00

git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@346848 c90b9560-bf6c-de11-be94-00142212c4b1
506 lines
12 KiB
XML
506 lines
12 KiB
XML
<?xml version="1.0" encoding="utf-8"?>
|
|
<!-- $Revision$ -->
|
|
<chapter xml:id="mbstring.encodings" xmlns="http://docbook.org/ns/docbook" xmlns:xlink="http://www.w3.org/1999/xlink">
|
|
<title>Summaries of supported encodings</title>
|
|
<segmentedlist>
|
|
<title>Summaries of supported encodings</title>
|
|
<segtitle>Name in the IANA character set registry</segtitle>
|
|
<segtitle>Underlying character set</segtitle>
|
|
<segtitle>Description</segtitle>
|
|
<segtitle>Additional note</segtitle>
|
|
<seglistitem>
|
|
<seg>ISO-10646-UCS-4</seg>
|
|
<seg>ISO 10646</seg>
|
|
<seg>
|
|
The Universal Character Set with 31-bit code space, standardized as UCS-4
|
|
by ISO/IEC 10646. It is kept synchronized with the latest version of the
|
|
Unicode code map.
|
|
</seg>
|
|
<seg>
|
|
If this name is used in the encoding conversion facility,
|
|
the converter attempts to identify by the preceding BOM
|
|
(byte order mark) in which endianness the subsequent bytes
|
|
are represented.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-10646-UCS-4</seg>
|
|
<seg>UCS-4</seg>
|
|
<seg>
|
|
See above.
|
|
</seg>
|
|
<seg>
|
|
In contrast to <literal>UCS-4</literal>, strings are always assumed
|
|
to be in big endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-10646-UCS-4</seg>
|
|
<seg>UCS-4</seg>
|
|
<seg>
|
|
See above.
|
|
</seg>
|
|
<seg>
|
|
In contrast to <literal>UCS-4</literal>, strings are always assumed
|
|
to be in little endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-10646-UCS-2</seg>
|
|
<seg>UCS-2</seg>
|
|
<seg>
|
|
The Universal Character Set with 16-bit code space, standardized as UCS-2
|
|
by ISO/IEC 10646. It is kept synchronized with the latest version of the
|
|
Unicode code map.
|
|
</seg>
|
|
<seg>
|
|
If this name is used in the encoding conversion facility,
|
|
the converter attempts to identify by the preceding BOM
|
|
(byte order mark) in which endianness the subsequent bytes
|
|
are represented.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-10646-UCS-2</seg>
|
|
<seg>UCS-2</seg>
|
|
<seg>
|
|
See above.
|
|
</seg>
|
|
<seg>
|
|
In contrast to <literal>UCS-2</literal>, strings are always assumed
|
|
to be in big endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-10646-UCS-2</seg>
|
|
<seg>UCS-2</seg>
|
|
<seg>
|
|
See above.
|
|
</seg>
|
|
<seg>
|
|
In contrast to <literal>UCS-2</literal>, strings are always assumed
|
|
to be in little endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-32</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>
|
|
Unicode Transformation Format of 32-bit unit width, whose encoding space
|
|
refers to the Unicode codeset standard. This encoding scheme is not
|
|
identical to UCS-4 because the code space of Unicode is limited to
|
|
a 21-bit value.
|
|
</seg>
|
|
<seg>
|
|
If this name is used in the encoding conversion facility,
|
|
the converter attempts to identify by the preceding BOM
|
|
(byte order mark) in which endianness the subsequent bytes
|
|
are represented.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-32BE</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>See above.</seg>
|
|
<seg>
|
|
In contrast to <literal>UTF-32</literal>, strings are always assumed
|
|
to be in big endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-32LE</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>See above.</seg>
|
|
<seg>
|
|
In contrast to <literal>UTF-32</literal>, strings are always assumed
|
|
to be in little endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-16</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>
|
|
Unicode Transformation Format of 16-bit unit width. It is worth noting
|
|
that UTF-16 is no longer the same specification as UCS-2 because the
|
|
surrogate mechanism was introduced in Unicode 2.0 and
|
|
UTF-16 now refers to a 21-bit code space.
|
|
</seg>
|
|
<seg>
|
|
If this name is used in the encoding conversion facility,
|
|
the converter attempts to identify by the preceding BOM
|
|
(byte order mark) in which endianness the subsequent bytes
|
|
are represented.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-16BE</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>
|
|
See above.
|
|
</seg>
|
|
<seg>
|
|
In contrast to <literal>UTF-16</literal>, strings are always assumed
|
|
to be in big endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-16LE</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>
|
|
See above.
|
|
</seg>
|
|
<seg>
|
|
In contrast to <literal>UTF-16</literal>, strings are always assumed
|
|
to be in little endian form.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-8</seg>
|
|
<seg>Unicode / UCS</seg>
|
|
<seg>
|
|
Unicode Transformation Format of 8-bit unit width.
|
|
</seg>
|
|
<seg>none</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UTF-7</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>
|
|
A mail-safe transformation format of Unicode, specified in
|
|
<link xlink:href="&url.rfc;2152">RFC2152</link>.
|
|
</seg>
|
|
<seg>none</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>(none)</seg>
|
|
<seg>Unicode</seg>
|
|
<seg>
|
|
A variant of UTF-7 which is specialized for use in the
|
|
<link xlink:href="&url.rfc;3501">IMAP protocol</link>.
|
|
</seg>
|
|
<seg>none</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>
|
|
US-ASCII (preferred MIME name) / iso-ir-6 / ANSI_X3.4-1986 /
|
|
ISO_646.irv:1991 / ASCII / ISO646-US / us / IBM367 / CP367 / csASCII
|
|
</seg>
|
|
<seg>ASCII / ISO 646</seg>
|
|
<seg>
|
|
American Standard Code for Information Interchange is a commonly-used
|
|
7-bit encoding. Also standardized as an international standard, ISO 646.
|
|
</seg>
|
|
<seg>(none)</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>
|
|
EUC-JP (preferred MIME name) /
|
|
Extended_UNIX_Code_Packed_Format_for_Japanese / csEUCPkdFmtJapanese
|
|
</seg>
|
|
<seg>
|
|
Compound of US-ASCII / JIS X0201:1997 (hankaku kana part) /
|
|
JIS X0208:1990 / JIS X0212:1990
|
|
</seg>
|
|
<seg>
|
|
As its name (an abbreviation of Extended UNIX Code Packed Format for
|
|
Japanese) suggests, this encoding is mostly used on UNIX or UNIX-like
|
|
platforms. The original encoding scheme, Extended UNIX Code, is
|
|
designed on the basis of ISO 2022.
|
|
</seg>
|
|
<seg>
|
|
The character set referred to by EUC-JP is different to IBM932 / CP932,
|
|
which are used by OS/2® and Microsoft® Windows®.
|
|
For information interchange with those platforms, use EUCJP-WIN instead.
|
|
</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>Shift_JIS (preferred MIME name) / MS_Kanji / csShift_JIS</seg>
|
|
<seg>Compound of JIS X0201:1997 / JIS X0208:1997</seg>
|
|
<seg>
|
|
Shift_JIS was developed in the early 1980s, when personal Japanese word
|
|
processors were brought to market, in order to maintain
|
|
compatibility with the legacy encoding scheme JIS X 0201:1976.
|
|
According to the IANA definition the codeset of Shift_JIS is slightly
|
|
different to IBM932 / CP932. However, the names "SJIS" / "Shift_JIS" are
|
|
often wrongly used to refer to these codesets.
|
|
</seg>
|
|
<seg>For the CP932 codemap, use SJIS-WIN instead.</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>(none)</seg>
|
|
<seg>
|
|
Compound of JIS X0201:1997 / JIS X0208:1997 / IBM extensions / NEC extensions
|
|
</seg>
|
|
<seg>
|
|
While this "encoding" uses the same encoding scheme as EUC-JP,
|
|
the underlying character set is different. That is, some code points map
|
|
to different characters than EUC-JP.
|
|
</seg>
|
|
<seg>none</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>Windows-31J / csWindows31J</seg>
|
|
<seg>
|
|
Compound of JIS X0201:1997 / JIS X0208:1997 / IBM extensions / NEC extensions
|
|
</seg>
|
|
<seg>
|
|
While this "encoding" uses the same encoding scheme as
|
|
Shift_JIS, the underlying character set is different. That means some code
|
|
points map to different characters than Shift_JIS.
|
|
</seg>
|
|
<seg>(none)</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-2022-JP (preferred MIME name) / csISO2022JP</seg>
|
|
<seg>
|
|
US-ASCII / JIS X0201:1976 / JIS X0208:1978 / JIS X0208:1983
|
|
</seg>
|
|
<seg><link xlink:href="&url.rfc;1468">RFC1468</link></seg>
|
|
<seg>(none)</seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>JIS</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-1</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-2</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-3</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-4</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-5</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-6</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-7</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-8</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-9</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-10</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-13</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-14</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-15</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-8859-16</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>byte2be</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>byte2le</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>byte4be</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>byte4le</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>BASE64</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>HTML-ENTITIES</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>7bit</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>8bit</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>EUC-CN</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>CP936</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>HZ</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>EUC-TW</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>CP950</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>BIG-5</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>EUC-KR</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>UHC (CP949)</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>ISO-2022-KR</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>Windows-1251 (CP1251)</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>Windows-1252 (CP1252)</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>CP866 (IBM866)</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>KOI8-R</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
<seglistitem>
|
|
<seg>KOI8-U</seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
<seg></seg>
|
|
</seglistitem>
|
|
</segmentedlist>
|
|
</chapter>
|
|
|
|
<!-- Keep this comment at the end of the file
|
|
Local variables:
|
|
mode: sgml
|
|
sgml-omittag:t
|
|
sgml-shorttag:t
|
|
sgml-minimize-attributes:nil
|
|
sgml-always-quote-attributes:t
|
|
sgml-indent-step:1
|
|
sgml-indent-data:t
|
|
indent-tabs-mode:nil
|
|
sgml-parent-document:nil
|
|
sgml-default-dtd-file:"~/.phpdoc/manual.ced"
|
|
sgml-exposed-tags:nil
|
|
sgml-local-catalogs:nil
|
|
sgml-local-ecat-files:nil
|
|
End:
|
|
|
|
vim600: syn=xml fen fdm=syntax fdl=2 si
|
|
vim: et tw=78 syn=sgml
|
|
vi: ts=1 sw=1
|
|
-->
|