From 838941f6cce51f3beda16012eb497b26295a8238 Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Mon, 17 Oct 2016 14:31:39 +0000 Subject: [PATCH] Improve utf8_decode() and utf8_encode() documentation This rewrites the descriptions of both to clarify that they convert specifically between ISO-8859-1 and UTF-8, adds a warning about confusion with Windows-1252, and adds helpful "See also" links to other character set conversion functions. Additionally, the behaviour for invalid characters in utf8_decode() was clarified, and the description of the UTF-8 binary encoding was removed. git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@340506 c90b9560-bf6c-de11-be94-00142212c4b1 --- reference/xml/functions/utf8-decode.xml | 29 +++++++++-- reference/xml/functions/utf8-encode.xml | 67 ++++++++----------------- 2 files changed, 45 insertions(+), 51 deletions(-) diff --git a/reference/xml/functions/utf8-decode.xml b/reference/xml/functions/utf8-decode.xml index 94f5f3eec5..a83f8c26f4 100644 --- a/reference/xml/functions/utf8-decode.xml +++ b/reference/xml/functions/utf8-decode.xml @@ -16,9 +16,27 @@ stringdata - This function decodes data, assumed to be - UTF-8 encoded, to ISO-8859-1. + This function converts the string data from the + UTF-8 encoding to ISO-8859-1. Bytes + in the string which are not valid UTF-8, and + UTF-8 characters which do not exist in + ISO-8859-1 (that is, characters above + U+00FF) are replaced with ?. + + + Many web pages marked as using the ISO-8859-1 character + encoding actually use the similar Windows-1252 encoding, + and web browsers will interpret ISO-8859-1 web pages as + Windows-1252. Windows-1252 features + additional printable characters, such as the Euro sign + (€) and curly quotes (“ + ”), instead of certain ISO-8859-1 + control characters. This function will not convert such + Windows-1252 characters correctly. Use a different + function if Windows-1252 conversion is required. + + @@ -29,7 +47,7 @@ data - An UTF-8 encoded string. 
+ A UTF-8 encoded string. @@ -48,7 +66,10 @@ &reftitle.seealso; - utf8_encode (contains an explanation of UTF-8 encoding) + utf8_encode - Performs the reverse conversion + mb_convert_encoding - Converts between various character encodings, including UTF-8, ISO-8859-1 and Windows-1252 + iconv - Converts between various character encodings + recode_string - Converts between various character encodings diff --git a/reference/xml/functions/utf8-encode.xml b/reference/xml/functions/utf8-encode.xml index 6c46caf304..d350d54db4 100644 --- a/reference/xml/functions/utf8-encode.xml +++ b/reference/xml/functions/utf8-encode.xml @@ -13,53 +13,23 @@ stringdata - This function encodes the string data to - UTF-8, and returns the encoded version. - UTF-8 is a standard mechanism used by - Unicode for encoding wide - character values into a byte stream. - UTF-8 is transparent to plain ASCII - characters, is self-synchronized (meaning it is possible for a program to - figure out where in the bytestream characters start) and can be used with - normal string comparison functions for sorting and such. PHP encodes - UTF-8 characters in up to four bytes, like this: - - UTF-8 encoding - - - - bytes - bits - representation - - - - - 1 - 7 - 0bbbbbbb - - - 2 - 11 - 110bbbbb 10bbbbbb - - - 3 - 16 - 1110bbbb 10bbbbbb 10bbbbbb - - - 4 - 21 - 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb - - - -
- Each b represents a bit that can be - used to store character data. + This function converts the string data from the + ISO-8859-1 encoding to UTF-8.
+ + + Many web pages marked as using the ISO-8859-1 character + encoding actually use the similar Windows-1252 encoding, + and web browsers will interpret ISO-8859-1 web pages as + Windows-1252. Windows-1252 features + additional printable characters, such as the Euro sign + (€) and curly quotes (“ + ”), instead of certain ISO-8859-1 + control characters. This function will not convert such + Windows-1252 characters correctly. Use a different + function if Windows-1252 conversion is required. + + @@ -89,7 +59,10 @@ &reftitle.seealso; - utf8_decode + utf8_decode - Performs the reverse conversion + mb_convert_encoding - Converts between various character encodings, including UTF-8, ISO-8859-1 and Windows-1252 + iconv - Converts between various character encodings + recode_string - Converts between various character encodings