From a91fd61caa165a38ca4cb537103502e5edc81ba2 Mon Sep 17 00:00:00 2001 From: Rui Hirokawa Date: Sun, 20 May 2001 07:43:42 +0000 Subject: [PATCH] added mbstring.xml. git-svn-id: https://svn.php.net/repository/phpdoc/en/trunk@47941 c90b9560-bf6c-de11-be94-00142212c4b1 --- functions/mbstring.xml | 1402 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1402 insertions(+) create mode 100644 functions/mbstring.xml diff --git a/functions/mbstring.xml b/functions/mbstring.xml new file mode 100644 index 0000000000..a231fc4234 --- /dev/null +++ b/functions/mbstring.xml @@ -0,0 +1,1402 @@ + + Multi-Byte String Functions + Multi-Byte String + + + Introduction + + + This module is EXPERIMENTAL. Function name/API is subject to be + changed. Current conversion filter supports Japanese only. + + + + There are many languages that all characters cannot be expressed + by single byte. Multi-byte character codes are used to express + many characters for many languages. mbstring + is developed to handle Japanese characters. However, many + mbstring functions are able to handle + character codes other than Japanese. + + + Multi-byte character encoding represents single character with + consecutive bytes. Some character encoding has shift(escape) + sequences to start/end multi-byte character string. Therefore, + multi-byte character string may be destroyed when it is divided + and/or counted, unless multi-byte character encoding safe method + is used. mbstring functions support multi-byte + character safe string functions and other utility functions such + as conversion functions. + + + + Basics for Japanese multi-byte character + + Most Japanese characters need more than 1 byte for a + character. In addition to this, several character encodings are + used under Japanese environment. There are EUC-JP, Shift_JIS and + ISO-2022-JP character encoding. As Unicode is getting popular, + UTF-8 is used also. 
To develop Web applications for a Japanese + environment, it is important to use these character codes depending + on their purpose: HTTP input/output, RDBMS and E-mail. + + + + + + Storage for a character can be up to four bytes. + + + + + A multi-byte character is usually twice the width of a + single byte character. A wider character is called "zen-kaku" + - meaning full width; a narrower character is called "han-kaku" - + meaning half width. "zen-kaku" characters are usually fixed + width. + + + + + Some character encodings define shift sequences for + entering/exiting multi-byte character strings. + + + + + A database may allocate storage for characters that differs + from the size used in PHP even if the same character encoding is + used. (For example, PostgreSQL) + + + + + E-mail is supposed to use ISO-2022-JP. + + + + + "i-mode" web site is supposed to use Shift_JIS. + + + + + + + + Supported character encodings + + The following character encodings are supported in this PHP + extension : UCS-4, + UCS-4BE, UCS-4LE, + UCS-2, UCS-2BE, + UCS-2LE, UTF-32, + UTF-32BE, UTF-32LE, + UCS-2LE, UTF-16, + UTF-16BE, UTF-16LE, + UTF-8, UTF-7, + ASCII, EUC-JP, + SJIS, eucJP-win, + SJIS-win, + ISO-2022-JP(JIS), + ISO-8859-1, ISO-8859-2, + ISO-8859-3, ISO-8859-4, + ISO-8859-5, ISO-8859-6, + ISO-8859-7, ISO-8859-8, + ISO-8859-9, ISO-8859-10, + ISO-8859-13, ISO-8859-14, + ISO-8859-15. + + + + + php.ini settings + + + + + mbstring.internal_encoding defines default + internal character encoding. + + + + + mbstring.http_input defines default HTTP input + character encoding. + + + + + mbstring.http_output defines default HTTP output + character encoding. + + + + + mbstring.detect_order defines default character + encoding detection order. + + + + + mbstring.substitute_character defines character + to substitute for invalid character codes. 
+ + + + + + + <literal>php.ini</literal> setting example + +;; Set default internal encoding +mbstring.internal_encoding = UTF-8 ; Set internal encoding to UTF-8 + +;; Set default HTTP input character code +mbstring.http_input = auto ; Set HTTP input to auto +; or +; mbstring.http_input = SJIS ; Set HTTP input to SJIS +; mbstring.http_input = eucjp-win, sjis-win, UTF-8 ; Specify order + +;; Set default HTTP output character code +mbstring.http_output = UTF-8 ; Set HTTP output encoding to UTF-8 + +;; Set default character code detection order +mbstring.detect_order = auto ; Set detection order to auto +; or +; mbstring.detect_order = eucjp-win, sjis-win, UTF-8 ; Specify order + +;; Set default substitute character +mbstring.substitute_character = 12307 ; Specify character code +; or +; mbstring.substitute_character = none ; No output +; mbstring.substitute_character = long ; Output hex value + + + + + + + + + + mb_internal_encoding + + Set/Get internal character encoding + + + + Description + + + string + mb_internal_encoding + string + encoding + + + + mb_internal_encoding sets internal character + encoding to encoding. If the parameter is + omitted, it returns current internal encoding. + + + encoding is used for HTTP input character + encoding conversion, HTTP output character encoding conversion + and default character encoding for string functions defined by + mbstring module. + + + encoding: Character encoding name + + + Return Value: If encoding is + set, mb_internal_encoding returns + TRUE for success, otherwise returns + FALSE. If encoding is + omitted, it returns current character encoding name. 
+ + + + <function>mb_internal_encoding</function> example + +/* Set internal character encoding to UTF-8 */ +mb_internal_encoding("UTF-8"); + +/* Display current internal character encoding */ +echo mb_internal_encoding(); + + + + + See also mb_http_input, + mb_http_output, + mb_detect_order + + + + + + + mb_http_input + Detect HTTP input character encoding + + + Description + + + string mb_http_input + string + type + + + + + mb_http_input returns result of HTTP input + character encoding detection. + + + type: Input string specifies input + type. "G" for GET, "P" for POST, + "C" for COOKIE. If type is omitted, it returns last + input type processed. + + + Return Value: Character encoding name. + If mb_http_input does not process specified + HTTP input, it returns FALSE. + + + See also mb_internal_encoding, + mb_http_output, + mb_detect_order + + + + + + + mb_http_output + Set/Get HTTP output character encoding + + + Description + + + string mb_http_output + string + encoding + + + + + If encoding is set, + mb_http_output sets HTTP output character + encoding to encoding. Output after this + function is converted to encoding. + mb_http_output returns TRUE for success and + FALSE for failure. + + + If encoding is omitted, + mb_http_output returns current HTTP output + character encoding. + + + See also mb_internal_encoding, + mb_http_input, + mb_detect_order + + + + + + + mb_detect_order + + Set/Get character encoding detection order + + + + Description + + + array mb_detect_order + mixed + encoding-list + + + + + mb_detect_order sets automatic character + encoding detection order to encoding-list. + It returns TRUE for success, FALSE for failure. + + + encoding-list is array or comma separated + list of character encodings. ("auto" is expanded to + "ASCII, JIS, UTF-8, EUC-JP, SJIS") + + + If encoding-list is omitted, it returns + current character encoding detection order as array. + + + This setting affects mb_detect_encoding and + mb_send_mail. 
+ + + <function>mb_detect_order</function> examples + +/* Set detection order by enumerated list */ +mb_detect_order("eucjp-win,sjis-win,UTF-8"); + +/* Set detection order by array */ +$ary[] = "ASCII"; +$ary[] = "JIS"; +$ary[] = "EUC-JP"; +mb_detect_order($ary); + +/* Display current detection order */ +echo implode(", ", mb_detect_order()); + + + + + See also mb_internal_encoding, + mb_http_input, + mb_http_output, + mb_send_mail. + + + + + + + mb_substitute_character + Set/Get substitution character + + + Description + + + mixed mb_substitute_character + mixed + substchar + + + + + mb_substitute_character specifies + substitution character when input character encoding is invalid + or character code does not exist in output character + encoding. Invalid characters may be substituted with null (no output), + a string or a hex value (Unicode character code value). + + + This setting affects mb_detect_encoding + and mb_send_mail. + + + substchar : Specify Unicode value as + integer or specify as string as follows: + + + + "none" : no output + + + + + "long" : Output hex value (Example: U+3000,JIS+7E7E) + + + + + + Return Value: If substchar is set, it + returns TRUE for success, otherwise returns FALSE. If + substchar is not set, it returns Unicode + value or + "none"/"long". + + + + <function>mb_substitute_character</function> example + +/* Set with Unicode U+3013 (GETA MARK) */ +mb_substitute_character(0x3013); + +/* Set hex format */ +mb_substitute_character("long"); + +/* Display current setting */ +echo mb_substitute_character(); + + + + + + + + + mb_output_handler + + Callback function converts character encoding in output buffer + + + + Description + + + string mb_output_handler + string contents + int status + + + + mb_output_handler is + ob_start callback + function. mb_output_handler converts + characters in output buffer from internal character encoding to + HTTP output character encoding. 
+ + + contents : Output buffer contents + + + status : Output buffer status + + + Return Value: String converted + + + + <function>mb_output_handler</function> example + +mb_http_output("UTF-8"); +ob_start("mb_output_handler"); + + + + + + If you want to output some binary data such as an image from a PHP + script, you must set output encoding to "pass" using + mb_http_output. + + + + See also ob_start. + + + + + + + mb_preferred_mime_name + Get MIME charset string + + + Description + + + string mb_preferred_mime_name + string encoding + + + + mb_preferred_mime_name returns MIME + charset string for character encoding + encoding. It returns + charset string. + + + + <function>mb_preferred_mime_name</function> example + +$outputenc = "sjis-win"; +mb_http_output($outputenc); +ob_start("mb_output_handler"); +Header("Content-Type: text/html; charset=" . mb_preferred_mime_name($outputenc)); + + + + + + + + + mb_strlen + Get string length + + + Description + + + int mb_strlen + string str + string + encoding + + + + + mb_strlen returns number of characters in + string str having character encoding + encoding. A multi-byte character is + counted as 1. + + + See also mb_internal_encoding, + strlen. + + + + + + + mb_strpos + + Find position of first occurrence of string in a string + + + + Description + + + int mb_strpos + string haystack + string needle + int + offset + + string + encoding + + + + + mb_strpos returns the numeric position of + the first occurrence of needle in the + haystack string. If + needle is not found, it returns FALSE. + + + mb_strpos performs multi-byte safe + strpos operation based on number of + characters. needle position is counted + from the beginning of the haystack. First + character's position is 0. Second character position is 1, and so + on. + + + If encoding is omitted, internal + character encoding is used. mb_strrpos + accepts string for + needle where strrpos + accepts only character. + + + offset is search offset. 
If it is not + specified, 0 is used. + + + encoding is character encoding name. If it + is not specified, internal character encoding is used. + + + See also mb_strrpos, + mb_internal_encoding, + strpos. + + + + + + + mb_strrpos + + Find position of last occurrence of a string in a string + + + + Description + + + int mb_strrpos + string haystack + string needle + string + encoding + + + + + mb_strrpos returns the numeric position of + the last occurrence of needle in the + haystack string. If + needle is not found, it returns FALSE. + + + mb_strrpos performs multi-byte safe + strrpos operation based on + number of characters. needle position is + counted from the beginning of + haystack. First character's position is + 0. Second character position is 1. + + + If encoding is not set, internal encoding + is assumed. mb_strrpos accepts + string for needle where + strrpos accepts only character. + + + encoding is character encoding. If it is + not specified, internal character encoding is used. + + + See also mb_strpos, + mb_internal_encoding, + strrpos. + + + + + + + mb_substr + Get part of string + + + Description + + + string mb_substr + string str + int start + int + length + + string + encoding + + + + + mb_substr returns the portion of + str specified by the + start and + length parameters. + + + mb_substr performs multi-byte safe + substr operation based on + number of characters. Position is + counted from the beginning of + str. First character's position is + 0. Second character position is 1, and so on. + + + If encoding is omitted, internal encoding + is assumed. + + + encoding is character encoding. If it is + omitted, internal character encoding is used. + + + See also mb_strcut, + mb_internal_encoding. + + + + + + + mb_strcut + Get part of string + + + Description + + + string mb_strcut + string str + int start + int + length + + string + encoding + + + + + mb_strcut returns the portion of + str specified by the + start and + length parameters. 
+ + + mb_strcut performs equivalent operation as + mb_substr with different method. If + start position is multi-byte character's + second byte or larger, it starts from first byte of multi-byte + character. + + + It extracts a string from str that is + shorter than length AND does not end in + the middle of a multi-byte character or in the middle of a shift + sequence. + + + encoding is character encoding. If it is + not set, internal character encoding is used. + + + See also mb_substr, + mb_internal_encoding. + + + + + + + mb_strwidth + Return width of string + + + Description + + + int mb_strwidth + string str + string + encoding + + + + + mb_strwidth returns width of string + str. + + + A multi-byte character is usually twice the width of a single + byte character. + + + + + Character width + + U+0000 - U+0019 0 + U+0020 - U+1FFF 1 + U+2000 - U+FF60 2 + U+FF61 - U+FF9F 1 + U+FFA0 - 2 + + + + + encoding is character encoding. If it is + omitted, internal encoding is used. + + + See also: mb_strimwidth, + mb_internal_encoding. + + + + + + + mb_strimwidth + Get truncated string with specified width + + + Description + + + string mb_strimwidth + string str + int start + int width + string trimmarker + string + encoding + + + + + mb_strimwidth truncates string + str to specified + width. It returns truncated string. + + + If trimmarker is set, + trimmarker is appended to return value. + + + start is start position offset. Number of + characters from the beginning of string. (First character is 0) + + + trimmarker is string that is added to the + end of string when string is truncated. + + + encoding is character encoding. If it is + omitted, internal encoding is used. + + + + <function>mb_strimwidth</function> example + +$str = mb_strimwidth($str, 0, 40, "..>"); + + + + + See also: mb_strwidth, + mb_internal_encoding. 
+ + + + + + + mb_convert_encoding + Convert character encoding + + + Description + + + string mb_convert_encoding + string str + string to-encoding + mixed + from-encoding + + + + + mb_convert_encoding converts + character encoding of string str from + from-encoding to + to-encoding. + + + str : String to be converted. + + + from-encoding is specified by character + code name before conversion. it can be array or string - comma + separated enumerated list. + + + + <function>mb_convert_encoding</function> example + +/* Convert internal character encoding to SJIS */ +$str = mb_convert_encoding($str, "SJIS"); + +/* Convert EUC-JP to UTF-7 */ +$str = mb_convert_encoding($str, "UTF-7", "EUC-JP"); + +/* Auto detect encoding from JIS, eucjp-win, sjis-win, then convert str to UCS-2LE */ +$str = mb_convert_encoding($str, "UCS-2LE", "JIS, eucjp-win, sjis-win"); + +/* "auto" is expanded to "ASCII,JIS,UTF-8,EUC-JP,SJIS" */ +$str = mb_convert_encoding($str, "EUC-JP", "auto"); + + + + + + See also: mb_detect_order. + + + + + + + mb_detect_encoding + Detect character encoding + + + Description + + + string mb_detect_encoding + string str + mixed + encoding-list + + + + + mb_detect_encoding detects character + encoding in string str. It returns + detected character encoding. + + + encoding-list is list of character + encoding. Encoding order may be specified by array or comma + separated list string. + + + If encoding_list is omitted, + detect_order is used. 
+ + + + <function>mb_detect_encoding</function> example + +/* Detect character encoding with current detect_order */ +echo mb_detect_encoding($str); + +/* "auto" is expanded to "ASCII,JIS,UTF-8,EUC-JP,SJIS" */ +echo mb_detect_encoding($str, "auto"); + +/* Specify encoding_list character encoding by comma separated list */ +echo mb_detect_encoding($str, "JIS, eucjp-win, sjis-win"); + +/* Use array to specify encoding_list */ +$ary[] = "ASCII"; +$ary[] = "JIS"; +$ary[] = "EUC-JP"; +echo mb_detect_encoding($str, $ary); + + + + + See also: mb_detect_order. + + + + + + + mb_convert_kana + + Convert "kana" one from another ("zen-kaku" ,"han-kaku" and more) + + + + Description + + + string mb_convert_kana + string str + string option + mixed + encoding + + + + + mb_convert_kana performs "han-kaku" - + "zen-kaku" conversion for string str. It + returns converted string. This function is only useful for + Japanese. + + + option is conversion option. Default value + is "KV". + + + encoding is character encoding. If it is + omitted, internal character encoding is used. + + + + + Applicable Conversion Options + + option : Specify with conversion of following options. 
Default "KV" + "r" : Convert "zen-kaku" alphabets to "han-kaku" + "R" : Convert "han-kaku" alphabets to "zen-kaku" + "n" : Convert "zen-kaku" numbers to "han-kaku" + "N" : Convert "han-kaku" numbers to "zen-kaku" + "a" : Convert "zen-kaku" alphabets and numbers to "han-kaku" + "A" : Convert "han-kaku" alphabets and numbers to "zen-kaku" + (Characters included in "a", "A" options are + U+0021 - U+007E excluding U+0022, U+0027, U+005C, U+007E) + "s" : Convert "zen-kaku" space to "han-kaku" (U+3000 -> U+0020) + "S" : Convert "han-kaku" space to "zen-kaku" (U+0020 -> U+3000) + "k" : Convert "zen-kaku kata-kana" to "han-kaku kata-kana" + "K" : Convert "han-kaku kata-kana" to "zen-kaku kata-kana" + "h" : Convert "zen-kaku hira-gana" to "han-kaku kata-kana" + "H" : Convert "han-kaku kata-kana" to "zen-kaku hira-gana" + "c" : Convert "zen-kaku kata-kana" to "zen-kaku hira-gana" + "C" : Convert "zen-kaku hira-gana" to "zen-kaku kata-kana" + "V" : Collapse voiced sound notation and convert them into a character. Use with "K","H" + + + + + + <function>mb_convert_kana</function> example + +/* Convert all "kana" to "zen-kaku" "kata-kana" */ +$str = mb_convert_kana($str, "KVC"); + +/* Convert "han-kaku" "kata-kana" to "zen-kaku" "kata-kana" + and "zen-kaku" alpha-numeric to "han-kaku" */ +$str = mb_convert_kana($str, "KVa"); + + + + + + + + + mb_encode_mimeheader + Encode string for MIME header + + + Description + + + string mb_encode_mimeheader + string str + string + charset + + string + transfer-encoding + + string + linefeed + + + + + mb_encode_mimeheader converts string + str to encoded-word for header field. + It returns converted string in ASCII encoding. + + + charset is character encoding + name. Default is ISO-2022-JP. + + + transfer-encoding is transfer encoding. It + should be one of "B" (Base64) or + "Q" (Quoted-Printable). Default is + "B". + + + linefeed is end of line marker. Default is + "\r\n" (CRLF). 
+ + + <function>mb_encode_mimeheader</function> example + +$name = ""; // kanji +$mbox = "kru"; +$doma = "gtinn.mon"; +$addr = mb_encode_mimeheader($name, "UTF-7", "Q") . " <" . $mbox . "@" . $doma . ">"; +echo $addr; + + + + + See also mb_decode_mimeheader. + + + + + + + mb_decode_mimeheader + Decode string in MIME header field + + + Description + + + string mb_decode_mimeheader + string str + + + + mb_decode_mimeheader decodes encoded-word + string str in MIME header. + + + It returns decoded string in internal character encoding. + + + See also mb_encode_mimeheader. + + + + + + + mb_convert_variables + Convert character code in variable(s) + + + Description + + + string mb_convert_variables + string to-encoding + mixed from-encoding + mixed vars + + + + mb_convert_variables converts + character encoding of variables vars in + encoding from-encoding to encoding + to-encoding. It returns character encoding + before conversion for success, FALSE for failure. + + + If from-encoding is specified by + array or comma separated string, it tries to detect encoding from + from-encoding. When + from-encoding is omitted, + detect_order is used. + + + vars (3rd and larger) is reference to + variable to be converted. String, Array and Object are accepted. + + + + <function>mb_convert_variables</function> example + +/* Convert variables $post1, $post2 to internal encoding */ +$interenc = mb_internal_encoding(); +$inputenc = mb_convert_variables($interenc, "ASCII,UTF-8,SJIS-win", $post1, $post2); + + + + + + + + + mb_encode_numericentity + + Encode character to HTML numeric string reference + + + + Description + + + string mb_encode_numericentity + string str + array convmap + string + encoding + + + + + mb_encode_numericentity converts + specified character codes in string str + from character code to HTML numeric character reference. It + returns converted string. + + + convmap is an array that specifies the code area to + convert. + + + encoding is character encoding. 
+ + + <parameter>convmap</parameter> example + +$convmap = array ( + int start_code1, int end_code1, int offset1, int mask1, + int start_code2, int end_code2, int offset2, int mask2, + ........ + int start_codeN, int end_codeN, int offsetN, int maskN ); +// Specify Unicode value for start_codeN and end_codeN +// Add offsetN to value and take bit-wise 'AND' with maskN, then +// it converts value to numeric string reference. + + + + + + + <function>mb_encode_numericentity</function> example + + +/* Convert Left side of ISO-8859-1 to HTML numeric character reference */ +$convmap = array(0x80, 0xff, 0, 0xff); +$str = mb_encode_numericentity($str, $convmap, "ISO-8859-1"); + +/* Convert user defined SJIS-win code in block 95-104 to numeric + string reference */ +$convmap = array( + 0xe000, 0xe03e, 0x1040, 0xffff, + 0xe03f, 0xe0bb, 0x1041, 0xffff, + 0xe0bc, 0xe0fa, 0x1084, 0xffff, + 0xe0fb, 0xe177, 0x1085, 0xffff, + 0xe178, 0xe1b6, 0x10c8, 0xffff, + 0xe1b7, 0xe233, 0x10c9, 0xffff, + 0xe234, 0xe272, 0x110c, 0xffff, + 0xe273, 0xe2ef, 0x110d, 0xffff, + 0xe2f0, 0xe32e, 0x1150, 0xffff, + 0xe32f, 0xe3ab, 0x1151, 0xffff ); +$str = mb_encode_numericentity($str, $convmap, "sjis-win"); + + + + + See also: mb_decode_numericentity. + + + + + + + mb_decode_numericentity + + Decode HTML numeric string reference to character + + + + Description + + + string mb_decode_numericentity + string str + array convmap + string + encoding + + + + + Converts numeric string references in string + str in the specified block to characters. It + returns converted string. + + + convmap is an array that specifies the code area to + convert. + + + encoding is character encoding. + + + + <parameter>convmap</parameter> example + +$convmap = array ( + int start_code1, int end_code1, int offset1, int mask1, + int start_code2, int end_code2, int offset2, int mask2, + ........ 
+ int start_codeN, int end_codeN, int offsetN, int maskN ); +// Specify Unicode value for start_codeN and end_codeN +// Add offsetN to value and take bit-wise 'AND' with maskN, +// then convert value to numeric string reference. + + + + + See also: mb_encode_numericentity. + + + + + + + mb_send_mail + + Send mail with ISO-2022-JP character code. (Japanese specific) + + + + Description + + + boolean mb_send_mail + string to + string subject + string message + string + additional_headers + + string + additional_parameter + + + + + mb_send_mail sends email. Headers and + message are converted and encoded in ISO-2022-JP. + mb_send_mail is a wrapper + function of mail. See + mail for details. + + + to is the mail address(es) to send to. Multiple + recipients can be specified by putting a comma between each + address in to. + + + subject is the subject of the mail. + + + message is the mail message. + + + additional_headers is inserted at + the end of the header. This is typically used to add + extra headers. Multiple extra headers are separated with a + newline(\n). + + + It returns TRUE for success, otherwise it returns FALSE. + + + additional_parameter is passed by PHP as + additional data in the call to the mailer. This is useful when + setting the correct Return-Path header when using sendmail. + + + See also: mail. + + + + + + + + \ No newline at end of file