mirror of
https://github.com/sigmasternchen/php-doc-en
synced 2025-03-16 00:48:54 +00:00
- Add "summaries of supported encodings" section. Be sure to rerun "configure".
TODO: maybe I'm not using <segmentedlist> correctly. docbook experts there? - Updated the location of cjk.inf - Correct some silly typos. git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@153601 c90b9560-bf6c-de11-be94-00142212c4b1
This commit is contained in:
parent
0eed3b7b5e
commit
cee35d96cd
2 changed files with 1179 additions and 295 deletions
879
reference/mbstring/encodings.xml
Normal file
879
reference/mbstring/encodings.xml
Normal file
|
@ -0,0 +1,879 @@
|
|||
<?xml version="1.0" encoding="iso-8859-1"?>
|
||||
<!-- $Revision: 1.1 $ -->
|
||||
<section id="mbstring.encodings">
|
||||
<title>Summaries of supported encodings</title>
|
||||
<segmentedlist>
|
||||
<title>UCS-4</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>ISO-10646-UCS-4</seg>
|
||||
<seg>ISO 10646</seg>
|
||||
<seg>
|
||||
The Universal Character Set with 31-bit code space, standardized as UCS-4
|
||||
by ISO/IEC 10646. It is kept synchronized with the latest version of the
|
||||
Unicode code map.
|
||||
</seg>
|
||||
<seg>
|
||||
If this name is used in the encoding conversion facility,
|
||||
the converter attempts to identify by the preceding BOM
|
||||
(byte order mark) in which endianness the subsequent bytes
|
||||
are represented.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UCS-4BE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>ISO-10646-UCS-4</seg>
|
||||
<seg>UCS-4</seg>
|
||||
<seg>
|
||||
See above.
|
||||
</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UCS-4</literal>, strings are always assumed
|
||||
to be in big endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UCS-4LE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>ISO-10646-UCS-4</seg>
|
||||
<seg>UCS-4</seg>
|
||||
<seg>
|
||||
See above.
|
||||
</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UCS-4</literal>, strings are always assumed
|
||||
to be in little endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UCS-2</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>ISO-10646-UCS-2</seg>
|
||||
<seg>UCS-2</seg>
|
||||
<seg>
|
||||
The Universal Character Set with 16-bit code space, standardized as UCS-2
|
||||
by ISO/IEC 10646. It is kept synchronized with the latest version of the
|
||||
Unicode code map.
|
||||
</seg>
|
||||
<seg>
|
||||
If this name is used in the encoding conversion facility,
|
||||
the converter attempts to identify by the preceding BOM
|
||||
(byte order mark) in which endianness the subsequent bytes
|
||||
are represented.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UCS-2BE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>ISO-10646-UCS-2</seg>
|
||||
<seg>UCS-2</seg>
|
||||
<seg>
|
||||
See above.
|
||||
</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UCS-2</literal>, strings are always assumed
|
||||
to be in big endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UCS-2LE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>ISO-10646-UCS-2</seg>
|
||||
<seg>UCS-2</seg>
|
||||
<seg>
|
||||
See above.
|
||||
</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UCS-2</literal>, strings are always assumed
|
||||
to be in little endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-32</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-32</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>
|
||||
Unicode Transformation Format of 32-bit unit width, whose encoding space
|
||||
refers to Unicode's codeset standard. This encoding scheme isn't
|
||||
identical to UCS-4 because the code space of Unicode is limited to
|
||||
a 21-bit value.
|
||||
</seg>
|
||||
<seg>
|
||||
If this name is used in the encoding conversion facility,
|
||||
the converter attempts to identify by the preceding BOM
|
||||
(byte order mark) in which endianness the subsequent bytes
|
||||
are represented.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-32BE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-32BE</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>See above</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UTF-32</literal>, strings are always assumed
|
||||
to be in big endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-32LE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-32LE</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>See above</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UTF-32</literal>, strings are always assumed
|
||||
to be in little endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-16</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-16</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>
|
||||
Unicode Transformation Format of 16-bit unit width. It's worth noting
|
||||
that UTF-16 is no longer the same specification as UCS-2 because the
|
||||
surrogate mechanism was introduced in Unicode 2.0 and
|
||||
UTF-16 now refers to a 21-bit code space.
|
||||
</seg>
|
||||
<seg>
|
||||
If this name is used in the encoding conversion facility,
|
||||
the converter attempts to identify by the preceding BOM
|
||||
(byte order mark) in which endianness the subsequent bytes
|
||||
are represented.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-16BE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-16BE</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>
|
||||
See above.
|
||||
</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UTF-16</literal>, strings are always assumed
|
||||
to be in big endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-16LE</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-16LE</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>
|
||||
See above.
|
||||
</seg>
|
||||
<seg>
|
||||
In contrast to <literal>UTF-16</literal>, strings are always assumed
|
||||
to be in little endian form.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-8</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-8</seg>
|
||||
<seg>Unicode / UCS</seg>
|
||||
<seg>
|
||||
Unicode Transformation Format of 8-bit unit width.
|
||||
</seg>
|
||||
<seg>(none)</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF-7</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>UTF-7</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>
|
||||
A mail-safe transformation format of Unicode, specified in
|
||||
<ulink url="&url.rfc;2152">RFC2152</ulink>.
|
||||
</seg>
|
||||
<seg>(none)</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UTF7-IMAP</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>(none)</seg>
|
||||
<seg>Unicode</seg>
|
||||
<seg>
|
||||
A variant of UTF-7 which is specialized for use in the
|
||||
<ulink url="&url.rfc;3501">IMAP protocol</ulink>.
|
||||
</seg>
|
||||
<seg>(none)</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ASCII</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>
|
||||
US-ASCII (preferred MIME name) / iso-ir-6 / ANSI_X3.4-1986 /
|
||||
ISO_646.irv:1991 / ASCII / ISO646-US / us / IBM367 / CP367 / csASCII
|
||||
</seg>
|
||||
<seg>ASCII / ISO 646</seg>
|
||||
<seg>
|
||||
American Standard Code for Information Interchange is a commonly-used
|
||||
7-bit encoding. Also standardized as an international standard, ISO 646.
|
||||
</seg>
|
||||
<seg>(none)</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>EUC-JP</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>
|
||||
EUC-JP (preferred MIME name) /
|
||||
Extended_UNIX_Code_Packed_Format_for_Japanese / csEUCPkdFmtJapanese
|
||||
</seg>
|
||||
<seg>
|
||||
Compound of US-ASCII / JIS X0201:1997 (hankaku kana part) /
|
||||
JIS X0208:1990 / JIS X0212:1990
|
||||
</seg>
|
||||
<seg>
|
||||
As the name, an abbreviation of Extended UNIX Code Packed Format for
|
||||
Japanese, suggests, this encoding is mostly used on UNIX or
|
||||
similar platforms. The original encoding scheme, Extended UNIX Code, is
|
||||
designed on the basis of ISO 2022.
|
||||
</seg>
|
||||
<seg>
|
||||
The character set referred to by EUC-JP is different to IBM932 / CP932,
|
||||
which are used by OS/2® and Microsoft® Windows®.
|
||||
For information interchange with those platforms, use EUCJP-WIN instead.
|
||||
</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>SJIS</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>Shift_JIS (preferred MIME name) / MS_Kanji / csShift_JIS</seg>
|
||||
<seg>Compound of JIS X0201:1997 / JIS X0208:1997</seg>
|
||||
<seg>
|
||||
Shift_JIS was developed in the early 1980s, at the time personal Japanese word
|
||||
processors were brought into the market, in order to maintain
|
||||
compatibility with the legacy encoding scheme JIS X 0201:1976.
|
||||
According to the IANA definition the codeset of Shift_JIS is slightly
|
||||
different to IBM932 / CP932. However, the names "SJIS" / "Shift_JIS" are
|
||||
often wrongly used to refer to these codesets.
|
||||
</seg>
|
||||
<seg>For the CP932 codemap, use SJIS-WIN instead.</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>EUCJP-WIN</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>(none)</seg>
|
||||
<seg>
|
||||
Compound of JIS X0201:1997 / JIS X0208:1997 / IBM extensions / NEC extensions
|
||||
</seg>
|
||||
<seg>
|
||||
While this "encoding" uses the same encoding scheme as EUC-JP,
|
||||
the underlying character set is different. That is, some code points map
|
||||
to different characters than EUC-JP.
|
||||
</seg>
|
||||
<seg>(none)</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>SJIS-win</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>Windows-31J / csWindows31J</seg>
|
||||
<seg>
|
||||
Compound of JIS X0201:1997 / JIS X0208:1997 / IBM extensions / NEC extensions
|
||||
</seg>
|
||||
<seg>
|
||||
While this "encoding" uses the same encoding scheme as
|
||||
Shift_JIS, the underlying character set is different. That means some code
|
||||
points map to different characters than Shift_JIS.
|
||||
</seg>
|
||||
<seg>(none)</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-2022-JP</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg>ISO-2022-JP (preferred MIME name) / csISO2022JP</seg>
|
||||
<seg>
|
||||
US-ASCII / JIS X0201:1976 / JIS X0208:1978 / JIS X0208:1983
|
||||
</seg>
|
||||
<seg><ulink url="&url.rfc;1468">RFC1468</ulink></seg>
|
||||
<seg>(none)</seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>JIS</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-1</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-2</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-3</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-4</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-5</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-6</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-7</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-8</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-9</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-10</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-13</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-14</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-8859-15</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>byte2be</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>byte2le</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>byte4be</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>byte4le</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>BASE64</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>HTML-ENTITIES</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>7bit</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>8bit</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>EUC-CN</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>CP936</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>HZ</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>EUC-TW</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>CP950</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>BIG-5</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>EUC-KR</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>UHC (CP949)</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>ISO-2022-KR</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>Windows-1251 (CP1251)</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>Windows-1252 (CP1252)</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>CP866 (IBM866)</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
<segmentedlist>
|
||||
<title>KOI8-R</title>
|
||||
<segtitle>Name in the IANA character set registry</segtitle>
|
||||
<segtitle>Underlying character set</segtitle>
|
||||
<segtitle>Description</segtitle>
|
||||
<segtitle>Additional note</segtitle>
|
||||
<seglistitem>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
<seg></seg>
|
||||
</seglistitem>
|
||||
</segmentedlist>
|
||||
</section>
|
||||
|
||||
<!-- Keep this comment at the end of the file
|
||||
Local variables:
|
||||
mode: sgml
|
||||
sgml-omittag:t
|
||||
sgml-shorttag:t
|
||||
sgml-minimize-attributes:nil
|
||||
sgml-always-quote-attributes:t
|
||||
sgml-indent-step:1
|
||||
sgml-indent-data:t
|
||||
indent-tabs-mode:nil
|
||||
sgml-parent-document:nil
|
||||
sgml-default-dtd-file:"../../../manual.ced"
|
||||
sgml-exposed-tags:nil
|
||||
sgml-local-catalogs:nil
|
||||
sgml-local-ecat-files:nil
|
||||
End:
|
||||
|
||||
vim600: syn=xml fen fdm=syntax fdl=2 si
|
||||
vim: et tw=78 syn=sgml
|
||||
vi: ts=1 sw=1
|
||||
-->
|
|
@ -1,8 +1,8 @@
|
|||
<?xml version="1.0" encoding="iso-8859-1"?>
|
||||
<!-- $Revision: 1.18 $ -->
|
||||
<!-- $Revision: 1.19 $ -->
|
||||
<reference id="ref.mbstring">
|
||||
<title>Multi-Byte String Functions</title>
|
||||
<titleabbrev>Multi-Byte String</titleabbrev>
|
||||
<title>Multibyte String Functions</title>
|
||||
<titleabbrev>Multibyte String</titleabbrev>
|
||||
<partintro>
|
||||
|
||||
<section id="mbstring.intro">
|
||||
|
@ -110,7 +110,6 @@ JIS, SJIS, ISO-2022-JP, BIG-5
|
|||
scanner and the character encoding.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If you have some database connected with PHP, it is recommended that
|
||||
|
@ -148,13 +147,13 @@ JIS, SJIS, ISO-2022-JP, BIG-5
|
|||
</para>
|
||||
<note>
|
||||
<para>
|
||||
In PHP 4.3.2 or earlier versions, <literal>mbstring</literal>
|
||||
there is a limitation in this functionality that
|
||||
<literal>mbstring</literal> does not perform character encoding
|
||||
conversion in POST data if the <literal>enctype</literal> attribute in
|
||||
the <literal>form</literal> element is set to
|
||||
<literal>multipart/form-data</literal>. So you have to convert
|
||||
the incoming data by yourself in this case if necessary.
|
||||
In PHP 4.3.2 or earlier versions, there was a limitation in this
|
||||
functionality that <literal>mbstring</literal> does not perform
|
||||
character encoding conversion in POST data if the
|
||||
<literal>enctype</literal> attribute in the <literal>form</literal>
|
||||
element is set to <literal>multipart/form-data</literal>.
|
||||
So you have to convert the incoming data by yourself in this case
|
||||
if necessary.
|
||||
</para>
|
||||
<para>
|
||||
Beginning with PHP 4.3.3, if <literal>enctype</literal> for HTML form is
|
||||
|
@ -257,300 +256,306 @@ ob_start('mb_output_handler');
|
|||
</para>
|
||||
</section>
|
||||
|
||||
<section id="mbstring.encodings">
|
||||
<title>Supported Character Encodings</title>
|
||||
<simpara>
|
||||
Currently the following character encodings are supported by the
|
||||
<literal>mbstring</literal> module. Any of those Character encodings
|
||||
can be specified in the <literal>encoding</literal> parameter of
|
||||
<literal>mbstring</literal> functions.
|
||||
</simpara>
|
||||
<para>
|
||||
The following character encoding is supported in this PHP
|
||||
extension:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem><simpara>UCS-4</simpara></listitem>
|
||||
<listitem><simpara>UCS-4BE</simpara></listitem>
|
||||
<listitem><simpara>UCS-4LE</simpara></listitem>
|
||||
<listitem><simpara>UCS-2</simpara></listitem>
|
||||
<listitem><simpara>UCS-2BE</simpara></listitem>
|
||||
<listitem><simpara>UCS-2LE</simpara></listitem>
|
||||
<listitem><simpara>UTF-32</simpara></listitem>
|
||||
<listitem><simpara>UTF-32BE</simpara></listitem>
|
||||
<listitem><simpara>UTF-32LE</simpara></listitem>
|
||||
<listitem><simpara>UTF-16</simpara></listitem>
|
||||
<listitem><simpara>UTF-16BE</simpara></listitem>
|
||||
<listitem><simpara>UTF-16LE</simpara></listitem>
|
||||
<listitem><simpara>UTF-7</simpara></listitem>
|
||||
<listitem><simpara>UTF7-IMAP</simpara></listitem>
|
||||
<listitem><simpara>UTF-8</simpara></listitem>
|
||||
<listitem><simpara>ASCII</simpara></listitem>
|
||||
<listitem><simpara>EUC-JP</simpara></listitem>
|
||||
<listitem><simpara>SJIS</simpara></listitem>
|
||||
<listitem><simpara>eucJP-win</simpara></listitem>
|
||||
<listitem><simpara>SJIS-win</simpara></listitem>
|
||||
<listitem><simpara>ISO-2022-JP</simpara></listitem>
|
||||
<listitem><simpara>JIS</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-1</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-2</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-3</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-4</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-5</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-6</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-7</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-8</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-9</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-10</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-13</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-14</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-15</simpara></listitem>
|
||||
<listitem><simpara>byte2be</simpara></listitem>
|
||||
<listitem><simpara>byte2le</simpara></listitem>
|
||||
<listitem><simpara>byte4be</simpara></listitem>
|
||||
<listitem><simpara>byte4le</simpara></listitem>
|
||||
<listitem><simpara>BASE64</simpara></listitem>
|
||||
<listitem><simpara>HTML-ENTITIES</simpara></listitem>
|
||||
<listitem><simpara>7bit</simpara></listitem>
|
||||
<listitem><simpara>8bit</simpara></listitem>
|
||||
<listitem><simpara>EUC-CN</simpara></listitem>
|
||||
<listitem><simpara>CP936</simpara></listitem>
|
||||
<listitem><simpara>HZ</simpara></listitem>
|
||||
<listitem><simpara>EUC-TW</simpara></listitem>
|
||||
<listitem><simpara>CP950</simpara></listitem>
|
||||
<listitem><simpara>BIG-5</simpara></listitem>
|
||||
<listitem><simpara>EUC-KR</simpara></listitem>
|
||||
<listitem><simpara>UHC (CP949)</simpara></listitem>
|
||||
<listitem><simpara>ISO-2022-KR</simpara></listitem>
|
||||
<listitem><simpara>Windows-1251 (CP1251)</simpara></listitem>
|
||||
<listitem><simpara>Windows-1252 (CP1252)</simpara></listitem>
|
||||
<listitem><simpara>CP866 (IBM866)</simpara></listitem>
|
||||
<listitem><simpara>KOI8-R</simpara></listitem>
|
||||
</itemizedlist>
|
||||
<para>
|
||||
&php.ini; entry, which accepts encoding name,
|
||||
accepts "<literal>auto</literal>" and
|
||||
"<literal>pass</literal>" also.
|
||||
<literal>mbstring</literal> functions, which accepts encoding
|
||||
name, and accepts "<literal>auto</literal>".
|
||||
</para>
|
||||
<para>
|
||||
If "<literal>pass</literal>" is set, no character
|
||||
encoding conversion is performed.
|
||||
</para>
|
||||
<para>
|
||||
If "<literal>auto</literal>" is set, it is expanded to
|
||||
the list of encodings defined per the <link linkend="mbstring.configuration">NLS</link>.
|
||||
For instance, if the NLS is set to <literal>Japanese</literal>,
|
||||
the value is assumed to be
|
||||
"<literal>ASCII,JIS,UTF-8,EUC-JP,SJIS</literal>".
|
||||
</para>
|
||||
<para>
|
||||
See also <function>mb_detect_order</function>
|
||||
</para>
|
||||
<section id="mbstring.supported-encodings">
|
||||
<title>Supported Character Encodings</title>
|
||||
<simpara>
|
||||
Currently the following character encodings are supported by the
|
||||
<literal>mbstring</literal> module. Any of these character encodings
|
||||
can be specified in the <literal>encoding</literal> parameter of
|
||||
<literal>mbstring</literal> functions.
|
||||
</simpara>
|
||||
<para>
|
||||
The following character encodings are supported in this PHP
|
||||
extension:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem><simpara>UCS-4</simpara></listitem>
|
||||
<listitem><simpara>UCS-4BE</simpara></listitem>
|
||||
<listitem><simpara>UCS-4LE</simpara></listitem>
|
||||
<listitem><simpara>UCS-2</simpara></listitem>
|
||||
<listitem><simpara>UCS-2BE</simpara></listitem>
|
||||
<listitem><simpara>UCS-2LE</simpara></listitem>
|
||||
<listitem><simpara>UTF-32</simpara></listitem>
|
||||
<listitem><simpara>UTF-32BE</simpara></listitem>
|
||||
<listitem><simpara>UTF-32LE</simpara></listitem>
|
||||
<listitem><simpara>UTF-16</simpara></listitem>
|
||||
<listitem><simpara>UTF-16BE</simpara></listitem>
|
||||
<listitem><simpara>UTF-16LE</simpara></listitem>
|
||||
<listitem><simpara>UTF-7</simpara></listitem>
|
||||
<listitem><simpara>UTF7-IMAP</simpara></listitem>
|
||||
<listitem><simpara>UTF-8</simpara></listitem>
|
||||
<listitem><simpara>ASCII</simpara></listitem>
|
||||
<listitem><simpara>EUC-JP</simpara></listitem>
|
||||
<listitem><simpara>SJIS</simpara></listitem>
|
||||
<listitem><simpara>eucJP-win</simpara></listitem>
|
||||
<listitem><simpara>SJIS-win</simpara></listitem>
|
||||
<listitem><simpara>ISO-2022-JP</simpara></listitem>
|
||||
<listitem><simpara>JIS</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-1</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-2</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-3</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-4</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-5</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-6</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-7</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-8</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-9</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-10</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-13</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-14</simpara></listitem>
|
||||
<listitem><simpara>ISO-8859-15</simpara></listitem>
|
||||
<listitem><simpara>byte2be</simpara></listitem>
|
||||
<listitem><simpara>byte2le</simpara></listitem>
|
||||
<listitem><simpara>byte4be</simpara></listitem>
|
||||
<listitem><simpara>byte4le</simpara></listitem>
|
||||
<listitem><simpara>BASE64</simpara></listitem>
|
||||
<listitem><simpara>HTML-ENTITIES</simpara></listitem>
|
||||
<listitem><simpara>7bit</simpara></listitem>
|
||||
<listitem><simpara>8bit</simpara></listitem>
|
||||
<listitem><simpara>EUC-CN</simpara></listitem>
|
||||
<listitem><simpara>CP936</simpara></listitem>
|
||||
<listitem><simpara>HZ</simpara></listitem>
|
||||
<listitem><simpara>EUC-TW</simpara></listitem>
|
||||
<listitem><simpara>CP950</simpara></listitem>
|
||||
<listitem><simpara>BIG-5</simpara></listitem>
|
||||
<listitem><simpara>EUC-KR</simpara></listitem>
|
||||
<listitem><simpara>UHC (CP949)</simpara></listitem>
|
||||
<listitem><simpara>ISO-2022-KR</simpara></listitem>
|
||||
<listitem><simpara>Windows-1251 (CP1251)</simpara></listitem>
|
||||
<listitem><simpara>Windows-1252 (CP1252)</simpara></listitem>
|
||||
<listitem><simpara>CP866 (IBM866)</simpara></listitem>
|
||||
<listitem><simpara>KOI8-R</simpara></listitem>
|
||||
</itemizedlist>
|
||||
<para>
|
||||
Any &php.ini; entry that accepts an encoding name
|
||||
also accepts the values "<literal>auto</literal>" and
|
||||
"<literal>pass</literal>".
|
||||
Any <literal>mbstring</literal> function that accepts an encoding
|
||||
name also accepts the value "<literal>auto</literal>".
|
||||
</para>
|
||||
<para>
|
||||
If "<literal>pass</literal>" is set, no character
|
||||
encoding conversion is performed.
|
||||
</para>
|
||||
<para>
|
||||
If "<literal>auto</literal>" is set, it is expanded to
|
||||
the list of encodings defined per the <link linkend="mbstring.configuration">NLS</link>.
|
||||
For instance, if the NLS is set to <literal>Japanese</literal>,
|
||||
the value is assumed to be
|
||||
"<literal>ASCII,JIS,UTF-8,EUC-JP,SJIS</literal>".
|
||||
</para>
|
||||
<para>
|
||||
See also <function>mb_detect_order</function>.
|
||||
</para>
|
||||
</section>
|
||||
|
||||
<section id="mbstring.overload">
|
||||
<title>
|
||||
Function Overloading Feature
|
||||
</title>
|
||||
<para>
|
||||
You might often find it difficult to get an existing PHP application
|
||||
work in a given multibyte environment. That's mostly because lots of
|
||||
PHP applications out there are written with the standard
|
||||
string functions such as <function>substr</function>, which are
|
||||
known to not properly handle multibyte-encoded strings.
|
||||
</para>
|
||||
<para>
|
||||
mbstring supports 'function overloading' feature which enables
|
||||
you to add multibyte awareness to such an application without
|
||||
code modification by overloading multibyte counterparts on
|
||||
the standard string functions. For example,
|
||||
<function>mb_substr</function> is called instead of
|
||||
<function>substr</function> if function overloading is enabled.
|
||||
This feature makes it easy to port applications that only support
|
||||
single-byte encodings to a multibyte environment in many cases.
|
||||
</para>
|
||||
<para>
|
||||
To use the function overloading, set
|
||||
<literal>mbstring.func_overload</literal> in &php.ini; to a
|
||||
positive value that represents a combination of bitmasks specifying
|
||||
the categories of functions to be overloaded. It should be set
|
||||
to 1 to overload the <function>mail</function> function. 2 for string
|
||||
functions, 4 for regular expression functions. For example,
|
||||
if is set for 7, mail, strings and regular expression functions should
|
||||
be overloaded. The list of overloaded functions are shown below.
|
||||
<table>
|
||||
<title>Functions to be overloaded</title>
|
||||
<tgroup cols="3">
|
||||
<thead>
|
||||
<row>
|
||||
<entry>value of mbstring.func_overload</entry>
|
||||
<entry>original function</entry>
|
||||
<entry>overloaded function</entry>
|
||||
</row>
|
||||
</thead>
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>1</entry>
|
||||
<entry><function>mail</function></entry>
|
||||
<entry><function>mb_send_mail</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strlen</function></entry>
|
||||
<entry><function>mb_strlen</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strpos</function></entry>
|
||||
<entry><function>mb_strpos</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strrpos</function></entry>
|
||||
<entry><function>mb_strrpos</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>substr</function></entry>
|
||||
<entry><function>mb_substr</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strtolower</function></entry>
|
||||
<entry><function>mb_strtolower</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strtoupper</function></entry>
|
||||
<entry><function>mb_strtoupper</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>substr_count</function></entry>
|
||||
<entry><function>mb_substr_count</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>ereg</function></entry>
|
||||
<entry><function>mb_ereg</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>eregi</function></entry>
|
||||
<entry><function>mb_eregi</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>ereg_replace</function></entry>
|
||||
<entry><function>mb_ereg_replace</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>eregi_replace</function></entry>
|
||||
<entry><function>mb_eregi_replace</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>split</function></entry>
|
||||
<entry><function>mb_split</function></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
</para>
|
||||
<title>
|
||||
Function Overloading Feature
|
||||
</title>
|
||||
<para>
|
||||
You might often find it difficult to get an existing PHP application
|
||||
to work in a given multibyte environment. That's mostly because lots of
|
||||
PHP applications out there are written with the standard
|
||||
string functions such as <function>substr</function>, which are
|
||||
known to not properly handle multibyte-encoded strings.
|
||||
</para>
|
||||
<para>
|
||||
mbstring supports a 'function overloading' feature which enables
|
||||
you to add multibyte awareness to such an application without
|
||||
code modification by overloading multibyte counterparts on
|
||||
the standard string functions. For example,
|
||||
<function>mb_substr</function> is called instead of
|
||||
<function>substr</function> if function overloading is enabled.
|
||||
This feature makes it easy to port applications that only support
|
||||
single-byte encodings to a multibyte environment in many cases.
|
||||
</para>
|
||||
<para>
|
||||
To use the function overloading, set
|
||||
<literal>mbstring.func_overload</literal> in &php.ini; to a
|
||||
positive value that represents a combination of bitmasks specifying
|
||||
the categories of functions to be overloaded. It should be set
|
||||
to 1 to overload the <function>mail</function> function, 2 for string
|
||||
functions, and 4 for regular expression functions. For example,
|
||||
if it is set to 7, the mail, string and regular expression functions
|
||||
will all be overloaded. The list of overloaded functions is shown below.
|
||||
<table>
|
||||
<title>Functions to be overloaded</title>
|
||||
<tgroup cols="3">
|
||||
<thead>
|
||||
<row>
|
||||
<entry>value of mbstring.func_overload</entry>
|
||||
<entry>original function</entry>
|
||||
<entry>overloaded function</entry>
|
||||
</row>
|
||||
</thead>
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>1</entry>
|
||||
<entry><function>mail</function></entry>
|
||||
<entry><function>mb_send_mail</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strlen</function></entry>
|
||||
<entry><function>mb_strlen</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strpos</function></entry>
|
||||
<entry><function>mb_strpos</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strrpos</function></entry>
|
||||
<entry><function>mb_strrpos</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>substr</function></entry>
|
||||
<entry><function>mb_substr</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strtolower</function></entry>
|
||||
<entry><function>mb_strtolower</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>strtoupper</function></entry>
|
||||
<entry><function>mb_strtoupper</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>2</entry>
|
||||
<entry><function>substr_count</function></entry>
|
||||
<entry><function>mb_substr_count</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>ereg</function></entry>
|
||||
<entry><function>mb_ereg</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>eregi</function></entry>
|
||||
<entry><function>mb_eregi</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>ereg_replace</function></entry>
|
||||
<entry><function>mb_ereg_replace</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>eregi_replace</function></entry>
|
||||
<entry><function>mb_eregi_replace</function></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>4</entry>
|
||||
<entry><function>split</function></entry>
|
||||
<entry><function>mb_split</function></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
It is not recommended to use the function overloading option in
|
||||
the per-directory context, because it's not confirmed yet to be
|
||||
stable enough in a production environment and may lead to undefined
|
||||
behaviour.
|
||||
</para>
|
||||
</note>
|
||||
</section>
|
||||
|
||||
<section id="mbstring.ja-basic">
|
||||
<title>Basics of Japanese multi-byte encodings</title>
|
||||
<para>
|
||||
It is often said quite hard to figure out how Japanese texts are
|
||||
handled in the computer. This is not only because Japanese characters
|
||||
can only be represented by multibyte encodings, but because different
|
||||
encoding standards are adopted for different purposes / platforms.
|
||||
Moreover, not a few character set standards are used there, which
|
||||
are slightly different from one another. Those facts have often led
|
||||
developers to inevitable mess-up.
|
||||
</para>
|
||||
<para>
|
||||
To create a working web application that would be put in the Japanese
|
||||
environment, it is important to use the proper character encoding and
|
||||
character set for the task in hand.
|
||||
</para>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>Storage for a character can be up to six bytes</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Most of multibyte characters often appear twice as wide as
|
||||
a single-byte character on display. Those characters are called
|
||||
"zen-kaku" in Japanese which means "full width", and the other
|
||||
(narrower) characters are called "han-kaku" - means half width.
|
||||
However the graphical properties of the characters depend on
|
||||
the glyphs of the type faces used to display them or print them out.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Some character encodings use shift(escape) sequences defined
|
||||
in ISO2022 to switch the code map of the specific code area
|
||||
(<literal>00h</literal> to <literal>7fh</literal>).
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
ISO-2022-JP should be used in SMTP/NNTP, and headers and entities
|
||||
should be reencoded as per RFC requirements. Although those are not
|
||||
requisites, it's still a good idea because several popular user
|
||||
agents cannot recognize any other encoding methods.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Webpages created for mobile phone services such as
|
||||
<ulink url="http://www.eurotechnology.com/imode/faq.html">i-mode</ulink>,
|
||||
<ulink url="http://www.vodafone.jp/english/live/">Vodafone live!</ulink>, or <ulink url="http://www.au.kddi.com/english/ezweb/">ezweb</ulink>
|
||||
are supposed to use Shift_JIS.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<title>Basics of Japanese multibyte encodings</title>
|
||||
<para>
|
||||
It is often said to be quite hard to figure out how Japanese texts are
|
||||
handled in the computer. This is not only because Japanese characters
|
||||
can only be represented by multibyte encodings, but because different
|
||||
encoding standards are adopted for different purposes / platforms.
|
||||
Moreover, quite a few character set standards are in use, which
|
||||
are slightly different from one another. Those facts have often led
|
||||
developers into an inevitable mess.
|
||||
</para>
|
||||
<para>
|
||||
To create a working web application that would be put in the Japanese
|
||||
environment, it is important to use the proper character encoding and
|
||||
character set for the task in hand.
|
||||
</para>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>Storage for a character can be up to six bytes</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Most multibyte characters appear twice as wide as
|
||||
a single-byte character on display. Those characters are called
|
||||
"zen-kaku" in Japanese which means "full width", and the other
|
||||
(narrower) characters are called "han-kaku", which means "half width".
|
||||
However the graphical properties of the characters depend on
|
||||
the glyphs of the type faces used to display them or print them out.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Some character encodings use shift (escape) sequences defined
|
||||
in ISO 2022 to switch the code map of the specific code area
|
||||
(<literal>00h</literal> to <literal>7fh</literal>).
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
ISO-2022-JP should be used in SMTP/NNTP, and headers and entities
|
||||
should be reencoded as per RFC requirements. Although those are not
|
||||
requisites, it's still a good idea because several popular user
|
||||
agents cannot recognize any other encoding methods.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Webpages created for mobile phone services such as
|
||||
<ulink url="http://www.nttdocomo.com/corebiz/imode/">i-mode</ulink>,
|
||||
<ulink url="http://www.vodafone.jp/english/live/">Vodafone live!</ulink>, or <ulink url="http://www.au.kddi.com/english/ezweb/">EZweb</ulink>
|
||||
are supposed to use Shift_JIS.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</section>
|
||||
|
||||
<section id="mbstring.ref">
|
||||
<title>References</title>
|
||||
<para>
|
||||
Multibyte character encoding schemes and the related issues are very
|
||||
complicated. There should be too few space to cover in sufficient details.
|
||||
Please refer to the following URLs and other resources for
|
||||
further readings.
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Unicode materials
|
||||
</para>
|
||||
<para>
|
||||
<ulink url="&url.unicode;">&url.unicode;</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Japanese/Korean/Chinese character information
|
||||
</para>
|
||||
<para>
|
||||
<ulink url="ftp://ftp.ora.com/pub/examples/nutshell/ujip/doc/cjk.inf">
|
||||
<literal>
|
||||
ftp://ftp.ora.com/pub/examples/nutshell/ujip/doc/cjk.inf
|
||||
</literal>
|
||||
</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<title>References</title>
|
||||
<para>
|
||||
Multibyte character encoding schemes and the related issues are very
|
||||
complicated. There is too little space here to cover them in sufficient detail.
|
||||
Please refer to the following URLs and other resources for
|
||||
further reading.
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Unicode materials
|
||||
</para>
|
||||
<para>
|
||||
<ulink url="&url.unicode;">&url.unicode;</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Japanese/Korean/Chinese character information
|
||||
</para>
|
||||
<para>
|
||||
<ulink url="http://examples.oreilly.com/cjkvinfo/doc/cjk.inf">http://examples.oreilly.com/cjkvinfo/doc/cjk.inf</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</section>
|
||||
&reference.mbstring.encodings;
|
||||
|
||||
</partintro>
|
||||
|
||||
&reference.mbstring.functions;
|
||||
|
|
Loading…
Reference in a new issue