# $Id: README.UNICODE-UPGRADES.ja 372 2008-01-01 00:06:30Z m-takagi $ # EN-Revision: 1.17 # (http://cvs.php.net/viewcvs.cgi/php-src/README.UNICODE-UPGRADES?revision=1.17&view=markup) このドキュメントでは、新しい Unicode 機能に関連する API について説明しま す。また、既存の関数を Unicode 対応にするためのうまい方法についても説明 します。 まず最初に読むべきなのは README.UNICODE です。ここでは Unicode の機能や 概念について、技術的に深入りしない範囲で説明しています。 内部エンコーディング ==================== Unicode 文字列の内部エンコーディングとして UTF-16 を使用します。UTF-16 は、基本多言語面にある Unicode 文字をすべて 2 バイトで表します。現在世界 で用いられている文字のほとんどは基本多言語面に存在します。ASCII テキスト を扱う際のメモリ効率は悪くなりますが、これにより ICU とのやりとりを単純 化することができます。というのも、ICU でも内部処理に UTF-16 を使用してい るからです。 Zval 構造体の変更点 =================== IS_UNICODE 型用に、新たな構造体を共用体に追加しました。 union { .... struct { UChar *val; /* Unicode 文字列の値 */ int len; /* UChar の数 */ } ustr; .... } value; これにより二種類の文字列型を完全に別のものとして扱い、過去との互換性を保 持するようにしています。 IS_STRING や IS_UNICODE への実行時のアクセスを最適化するために、別の構造 体を定義する必要があります。 union { .... struct { /* 共通の文字列型 */ zstr val; int len; } uni; .... } value; ここで、zstr とは char*、UChar* および void* の共用体となります。 パラメータ解析 API の変更点 =========================== 新たな指定子が 6 種類 ('u', 't', 'T', 'U', 'S', 'x')、そして新たな修飾子 '&' が追加されました。 't' 指定子 ---------- この指定子は、呼び出し元が文字列型(IS_STRING, IS_UNICODE) を要求してい ることを表します。呼び出し元は文字列の値と長さ、そして型を渡す必要があ ります。 void *str; int len; zend_uchar type; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t", &str, &len, &type) == FAILURE) { return; } if (type == IS_UNICODE) { /* Unicode 文字列の処理 */ } else { /* バイナリ文字列の処理 */ } IS_STRING 型の場合は、長さはバイト数を表します。一方 IS_UNICODE 型の場 合は、長さは UChar の数を表します。その他の型 (数値や論理型など) から 文字列に変換する際には、Unicode セマンティクスの設定によって挙動が異な ります。これがオンの場合は IS_UNICODE に、それ以外の場合は IS_STRING に変換されます。 'u' 指定子 ---------- この指定子は呼び出し元が Unicode でエンコードされた文字列を要求してい ることを表します。非 Unicode 文字列が渡された場合はその文字列のコピー を作成し、自動的に Unicode 型に変換してから内部関数に渡されます。もち ろん Unicode 文字列の場合はこのような変換は行われません。 UChar *str; int len; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "u", &str, &len) == FAILURE) { return; } /* Unicode 文字列の処理 */ 'T' 指定子 ---------- この指定子は、複数の文字列を受け取って処理する関数などで便利です。各パ ラメータに対して 't' 
指定子を使用すると、異なる型の文字列が渡された場 合に面倒です。何をするにも余計なチェックが必要となってしまいます。'T' 指定子をつけたすべてのパラメータは、同じ型となります。 'T' を指定したパラメータの中にひとつでも Unicode 型の文字列が渡されると、 残りのパラメータがすべて IS_UNICODE に変換されます。それ以外の場合は、 すべての 'T' パラメータが IS_STRING に変換されます。 void *str1, *str2; int len1, len2; zend_uchar type1, type2; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "TT", &str1, &len1, &type1, &str2, &len2, &type2) == FAILURE) { return; } if (type1 == IS_UNICODE) { /* Unicode として処理します。str2 も Unicode であることが保証され ています */ } else { /* バイナリ文字列として処理します。str2 も同じ型であることが保証 されています */ } 'x' 指定子 ---------- この指定子は、Unicode セマンティクススイッチの設定に応じて 'u' あるい は 's' のいずれかと同じ働きをします。UG(unicode) がオンの場合は 'u'、 それ以外の場合は 's' として機能します。 既存の 's' 指定子の挙動も変更されています。もし Unicode 文字列が渡される と、それを自動的にコピーして実行時エンコーディングに変換し、同時に警告を 発生させます。バイナリ文字列が渡された場合は何も変換は行われません。'&' 修飾子を 's' 指定子の後に使用すると、代わりに別のコンバータを使用させる ことができます。 char *str; int len; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&", &str, &len, UG(utf8_conv)) == FAILURE) { return; } /* Unicode 文字列が渡された場合、str は UTF-8 となります */ 'U' 指定子および 'S' 指定子はそれぞれ 'u' および 's' と同様です。しかし、 渡されたパラメータに対してより厳格な型チェックを行います。'U' を指定した パラメータにバイナリ文字列が渡された場合は、自動変換を行わずに警告を発生 させます。逆方向の 'S' 指定子も同じです。 Unicode の世界の挙動 ==================== 文字列 ------ 内部の機能の多くが unicode.semantics によって制御されるようになります。 この値は、Unicode グローバル変数 UG(unicode) で取得できます。この値は、 リクエスト全体においてオンあるいはオフのいずれかになります。 さらに、大きな変更点として、新たな文字列型 IS_UNICODE が追加されています。 zval 共用体に、この型用の領域 (value.ustr) を持つようになっています。一 方、非 Unicode (バイナリ) 文字列は IS_STRING 型をそのまま使用し、zval の value.str 要素を使用します。 Unicode 文字列にアクセスするための新たなマクロが用意されています。これは Z_STRVAL/Z_STRLEN に対応するものです。 Z_USTRVAL(), Z_USTRLEN() - Unicode 型の文字列の値 (UChar*) および長さ (コードユニット数)、 つまり value.ustr.val と value.ustr.len にアクセスします。 Z_UNIVAL(), Z_UNILEN() - 値 (zstr) および長さ (型に応じた単位)、 つまり value.uni.val と value.uni.len にアクセスします。 Z_USTRCPLEN() - Unicode 型の文字列のコードポイント数 (コードユニットではありません) を取得します。このマクロはサロゲートペアを考慮して文字列を調べ、 UChar32(UTF32) のコードポイント数を返します。この値は、文字列バッファ の UChar(UTF16) コードユニット数より少なくなる可能性があります。この 値を繰り返し使用する場合は、毎回文字列を調べることを避けるためにロー カル変数に保存しておくようにしましょう。 
ZVAL_* マクロ ------------- ZVAL_STRING()/RETVAL_STRING()/RETURN_STRING() 型のマクロの 'dup' パラメ ータを少し拡張しました。現在は、代わりに以下の定義を使用することを推奨し ます。 #define ZSTR_DUPLICATE (1<<0) #define ZSTR_AUTOFREE (1<<1) ZSTR_DUPLICATE (which has a resulting value of 1) serves the same purpose as a truth value in old-style 'dup' flags. The value of 1 was specifically chosen to match the common practice of passing a 1 for this parameter. Warning: If you find extension code which uses a truth value other than one for the dup flag, its logic should be modified to explicitly pass ZSTR_DUPLICATE instead. ZSTR_AUTOFREE is used with macros such as ZVAL_RT_STRING which may populate Unicode zvals from non-unicode source strings. When UG(unicode) is on, the source string will be implicitly copied (to make a UChar* version). If the original string needed copying anyway this is fine. However if the original string was emalloc()'d and would have ordinarily been given to the engine (i.e. RETURN_STRING(estrdup("foo"), 0)) then it will need to be freed in UG(unicode) mode to avoid leaking. The ZSTR_AUTOFREE flag ensures that the original string is freed in UG(unicode) mode. ZVAL_UNICODE(pzv, str, dup), ZVAL_UNICODEL(pzv, str, str_len, dup) - Sets zval to hold a Unicode string. Takes the same parameters as ZVAL_STRING(), ZVAL_STRINGL(). ZVAL_U_STRING(conv, pzv, str, dup), ZVAL_U_STRINGL(conv, pzv, str, str_len, dup) - When UG(unicode) is off, it's equivalent to ZVAL_STRING(), ZVAL_STRINGL() and the conv parameter is ignored. When UG(unicode) is on, it sets zval to hold a Unicode representation of the passed-in string using the UConverter* specified by conv. Since a new string is always created in this case, passing ZSTR_DUPLICATE for 'dup' does not matter, but ZSTR_AUTOFREE will be used to efree the original value. ZVAL_RT_STRING(pzv, str, dup), ZVAL_RT_STRINGL(pzv, str, str_len, dup) - When UG(unicode) is off, it's equivalent to ZVAL_STRING(), ZVAL_STRINGL(). 
When UG(unicode) is on, it takes the input string, converts it to Unicode using the runtime_encoding converter and sets zval to it. Since a new string is always created in this case, passing ZSTR_DUPLICATE for 'dup' does not matter, but ZSTR_AUTOFREE will be used to efree the original value. ZVAL_ASCII_STRING(pzv, str, dup), ZVAL_ASCII_STRINGL(pzv, str, str_len, dup) - When UG(unicode) is off, it's equivalent to ZVAL_STRING(), ZVAL_STRINGL(). When UG(unicode) is on, it takes the input string, converts it to Unicode using an ASCII converter and sets zval to it. Since a new string is always created in this case, passing ZSTR_DUPLICATE for 'dup' does not matter, but ZSTR_AUTOFREE will be used to efree the original value. ZVAL_UTF8_STRING(pzv, str, dup), ZVAL_UTF8_STRINGL(pzv, str, str_len, dup) - When UG(unicode) is off, it's equivalent to ZVAL_STRING(), ZVAL_STRINGL(). When UG(unicode) is on, it takes the input string, converts it to Unicode using a UTF8 converter and sets zval to it. Since a new string is always created in this case, passing ZSTR_DUPLICATE for 'dup' does not matter, but ZSTR_AUTOFREE will be used to efree the original value. ZVAL_ZSTR(pzv, type, zstr, dup), ZVAL_ZSTRL(pzv, type, zstr, zstr_len, dup) - This macro uses 'type' to switch between calling ZVAL_STRING(pzv, zstr.s, dup) and ZVAL_UNICODE(pzv, zstr.u, dup). No conversion happens so the presence or absence of ZSTR_AUTOFREE is ignored. ZVAL_TEXT(pzv, zstr, dup), ZVAL_TEXTL(pzv, zstr, zstr_len, dup) - This macro sets the zval to hold either a Unicode or a normal string, depending on the value of UG(unicode). No conversion happens, so be certain that the string passed in matches the type expected by UG(unicode). One example of its usage would be to initialize zval to hold the name of a user function. 
ZVAL_EMPTY_UNICODE(pzv) / ZVAL_EMPTY_TEXT(pzv) - These macros work identically to ZVAL_EMPTY_STRING() with the UNICODE version always generating an IS_UNICODE zval, and the TEXT version generating a UG(unicode) dependent string type. ZVAL_UCHAR32(pzv, char) - Converts the character provided into a UChar string (which may potentially be 1 or 2 characters long in the case of surrogate pairs) and dispatches to ZVAL_UNICODEL(). As usual, for each ZVAL_* macro, there is a matching RETVAL_* and RETURN_* macro. 変換マクロ ---------- convert_to_string_with_converter(zval *op, UConverter *conv) - converts a zval to native string using the specified converter, if necessary. convert_to_unicode_with_converter(zval *op, UConverter *conv) - converts a zval to Unicode string using the specified converter, if necessary. convert_to_unicode(zval *op) - converts a zval to Unicode string. convert_to_string(zval *op) - Behaves just as it currently does, converting to IS_STRING type convert_to_text(zval *op) - converts a zval to either Unicode or native string, depending on the value of UG(unicode) switch The zend_ascii_to_unicode() function can be used to convert an ASCII char* string to Unicode. This is useful especially for inline string literals, in which case you can simply use the USTR_MAKE() macro, e.g.: UChar* ustr; ustr = USTR_MAKE("main"); If you need to initialize a few such variables, it may be more efficient to use ICU macros, which avoid the conversion, depending on the platform. See [1] for more information. USTR_FREE(zstr) can be used to free a UChar* string safely, since it checks for NULL argument. USTR_LEN() takes a zstr as its argument and, depending on the UG(unicode) value, returns its strlen() or u_strlen(). 配列の操作 ---------- The add_next_index_*(), add_index_*() and add_assoc_*() functions have been significantly expanded both to allow for the unicode type as a value and to permit various types of keys. 
Values: In the following examples, {1} represents a placeholder for the keytype and its arguments (covered later). add_{1}_unicode(zval *arr, {1}, UChar *ustr, int dup); add_{1}_unicodel(zval *arr, {1}, UChar *ustr, int ustr_len, int dup); - Works like add_{1}_string() and add_{1}_stringl() but takes a UChar* value and adds an IS_UNICODE type. add_{1}_rt_string(zval *arr, {1}, char *str, int dup); add_{1}_rt_stringl(zval *arr, {1}, char *str, int str_len, int dup); - Works like add_{1}_string() and add_{1}_stringl() but converts the char* value to Unicode using runtime encoding when UG(unicode) is on. add_{1}_ascii_string(zval *arr, {1}, char *str, int dup); add_{1}_ascii_stringl(zval *arr, {1}, char *str, int str_len, int dup); - Works like add_{1}_rt_string() and add_{1}_rt_stringl() but uses an ASCII converter rather than runtime encoding. add_{1}_utf8_string(zval *arr, {1}, char *str, int dup); add_{1}_utf8_stringl(zval *arr, {1}, char *str, int str_len, int dup); - Works like add_{1}_rt_string() and add_{1}_rt_stringl() but uses a UTF8 converter rather than runtime encoding. add_{1}_text(zval *arr, {1}, zstr str, int dup); add_{1}_textl(zval *arr, {1}, zstr str, int str_len, int dup); - Wrapper which dispatches to add_{1}_string(l)() or add_{1}_unicode(l)() depending on the setting of UG(unicode). add_{1}_zstr(zval *arr, {1}, zend_uchar type, zstr str, int dup); add_{1}_zstrl(zval *arr, {1}, zend_uchar type, zstr str, int str_len, int dup); - Works like add_{1}_text() and add_{1}_textl(), but dispatches based on 'type'. Keys: In the following example, the zval* type is used for values, however each of the value types (including those listed above) are supported. The existing key types work as they always have: add_next_index_zval(zval *arr, zval *val); add_index_zval(zval *arr, long idx, zval *val); add_assoc_zval(zval *arr, char *key, zval *val); add_assoc_zval_ex(zval *arr, char *key, int key_len, zval *val); . 
Associative keys are considered binary (IS_STRING) . Remember that key_len includes the terminating NUL The following additional methods provide unicode capable keytypes: add_u_assoc_zval(zval *arr, zend_uchar type, zstr key, zval *val); add_u_assoc_zval_ex(zval *arr, zend_uchar type, zstr key, int key_len, zval *val); . When type==IS_STRING, these behave identically to their add_assoc_zval() and add_assoc_zval_ex() counterparts. When type==IS_UNICODE, the key is considered to be Unicode (UChar*). add_rt_assoc_zval(zval *arr, char *key, zval *val); add_rt_assoc_zval_ex(zval *arr, char *key, int key_len, zval *val); . When UG(unicode) is off, these behave identically to their add_assoc_zval() and add_assoc_zval_ex() counterparts. When UG(unicode) is on, key is converted to Unicode using runtime encoding. add_ascii_assoc_zval(zval *arr, char *key, zval *val); add_ascii_assoc_zval_ex(zval *arr, char *key, int key_len, zval *val); . When UG(unicode) is off, these behave identically to their add_assoc_zval() and add_assoc_zval_ex() counterparts. When UG(unicode) is on, key is converted to Unicode using an ASCII converter. add_utf8_assoc_zval(zval *arr, char *key, zval *val); add_utf8_assoc_zval_ex(zval *arr, char *key, int key_len, zval *val); . When UG(unicode) is off, these behave identically to their add_assoc_zval() and add_assoc_zval_ex() counterparts. When UG(unicode) is on, key is converted to Unicode using a UTF8 converter. Keytype and Valuetype specification may be mixed in any combination, for example: add_utf8_assoc_ascii_stringl_ex(zval *arr, char *key, int key_len, char *val, int val_len, int dup); その他 ------ UBYTES() macro can be used to obtain the number of bytes necessary to store the given number of UChar's. The typical usage is: char *constant_name = colon + (UG(unicode)?UBYTES(2):2); コードポイントおよびコードユニット ---------------------------------- Unicode type strings are in the UTF-16 encoding where 1 Unicode character may be represented by 1 or 2 UChar's. 
Each UChar is referred to as a "code unit", and a full Unicode character as a "code point". Consequently, number of code units and number of code points for the same Unicode string may be different. This has many implications, the most important of which is that you cannot simply index the UChar* string to get the desired codepoint. The zval's value.ustr.len contains the number of code units (UChar -- UTF16). To obtain the number of code points, one can use u_countChar32() ICU API function or Z_USTRCPLEN() macro. ICU provides a number of macros for working with UTF-16 strings on the codepoint level [2]. They allow you to do things like obtain a codepoint at random code unit offset, move forward and backward over the string, etc. There are two versions of iterator macros, *_SAFE and *_UNSAFE. It is strongly recommended to use the *_SAFE versions, since they handle unpaired surrogates and check for string boundaries. Here is an example of how to move through a UChar* string and work on codepoints. UChar *str = ...; int32_t str_len = ...; UChar32 codepoint; int32_t offset = 0; while (offset < str_len) { U16_NEXT(str, offset, str_len, codepoint); /* now we have the Unicode character in codepoint */ } There is no macro to get a codepoint at a certain code point offset, but there is a Zend API function that does it. inline UChar32 zend_get_codepoint_at(UChar *str, int32_t length, int32_t n); To retrieve 3rd codepoint, you would call: zend_get_codepoint_at(str, str_len, 3); If you have a UChar32 codepoint and need to put it into a UChar* string, there is another helper function, zend_codepoint_to_uchar(). It takes a single UChar32 and converts it to a UChar sequence (1 or 2 UChar's). UChar buf[8]; UChar32 codepoint = 0x101a2; int8_t num_uchars; num_uchars = zend_codepoint_to_uchar(codepoint, buf); The return value is the number of resulting UChar's or 0, which indicates invalid codepoint. 
メモリの確保 ------------ For ease of use and to reduce possible bugs, there are memory allocation functions specific to Unicode strings. Please use them at all times when allocating UChar's. eumalloc(size) eurealloc(ptr, size) eustrndup(s, length) eustrdup(s) peumalloc(size, persistent) peurealloc(ptr, size, persistent) The size parameter refers to the number of UChar's, not bytes. zend_zstrndup(type, zstr, length) ハッシュ -------- Hashes API has been upgraded to work with Unicode and binary strings. All hash functions that worked with string keys now have their equivalent zend_u_hash_* API. The zend_u_hash_* functions take the type of the key string as the second argument. When UG(unicode) switch is on, the IS_STRING keys are upconverted to IS_UNICODE and then used in the hash lookup. A new HASH_KEY constant has been added for differentiating key types: . HASH_KEY_IS_UNICODE Note that zend_hash_get_current_key_ex() does not have a zend_u_hash_* version. It returns the key as a char* pointer; you can cast it appropriately based on the key type. 識別子およびクラスエントリ -------------------------- In Unicode mode all the identifiers are Unicode strings. This means that while various structures such as zend_class_entry, zend_function, etc store the identifier name as a char* pointer, it will actually point to UChar* string. Be careful when accessing the names of classes, functions, and such -- always check UG(unicode) before using them. 出力のフォーマット ------------------ Since UTF-16 strings frequently contain NULL bytes, you cannot simply use %s format to print them out. Towards that end, output functions such as php_printf(), spprintf(), etc now have three different formats for use with Unicode strings: %r This format treats the corresponding argument as a Unicode string. The string is automatically converted to the output encoding. If you wish to apply a different converter to the string, use %*r and pass the converter before the string argument. 
UChar *class_name = USTR_MAKE("ReflectionClass"); zend_printf("%r", class_name); spprintf(&utf8_buffer, 0, "%*r", UG(utf8_conv), class_name); %R This format requires at least two arguments: the first one specifies the type of the string to follow (IS_STRING or IS_UNICODE), and the second one - the string itself. If the string is of Unicode type, it is automatically converted to the output encoding. If you wish to apply a different converter to the string, use %*R and pass the converter before the string argument. zend_throw_exception_ex(U_CLASS_ENTRY(reflection_exception_ptr), 0 TSRMLS_CC, "Interface %R does not exist", Z_TYPE_P(class_name), Z_UNIVAL_P(class_name)); %v This format takes only one parameter, the string, but the expected string type depends on the UG(unicode) value. If the string is of Unicode type, it is automatically converted to the output encoding. If you wish to apply a different converter to the string, use %*v and pass the converter before the string argument. zend_error(E_WARNING, "%v::__toString() did not return anything", Z_OBJCE_P(object)->name); Since [v]spprintf() can only output native strings there are also the new functions [v]uspprintf() and [v]zspprintf() that create unicode strings and return the number of characters printed. That is they return the length rather than the byte size. The second pair of functions also takes an additional type parameter that allows creating a string of arbitrary type. The following example illustrates the use. Assume it fetches a unicode/native string into path, path_len and path_type in order to create sub_name, sub_len and sub_type. zstr path, sub_name; int path_len, sub_len; zend_uchar path_type, sub_type; /* fetch */ if (path.v) { sub_type = path_type; sub_len = zspprintf(path_type, &sub_name, 0, "%R%c%s", path_type, path, DEFAULT_SLASH, entry.d_name); } 関数の変更点 ============ Let's take a look at a couple of functions that have been upgraded to support new string types. 
substr() -------- This function returns part of a string based on offset and length parameters. zstr str; int str_len, cp_len; zend_uchar str_type; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "tl|l", &str, &str_len, &str_type, &f, &l) == FAILURE) { return; } The first thing we notice is that the incoming string specifier is 't', which means that we can accept either string type (IS_STRING or IS_UNICODE). The 'str' variable is declared as zstr, because it can point to either UChar* or char*. The actual type of the incoming string is stored in 'str_type' variable. if (str_type == IS_UNICODE) { cp_len = u_countChar32(str.u, str_len); } else { cp_len = str_len; } If the string is a Unicode one, we cannot rely on the str_len value to tell us the number of characters in it. Instead, we call u_countChar32() to obtain it. The next several lines normalize start and length parameters to fit within the string. Nothing new here. Then we locate the appropriate segment. if (str_type == IS_UNICODE) { int32_t start = 0, end = 0; U16_FWD_N(str.u, end, str_len, f); start = end; U16_FWD_N(str.u, end, str_len, l); RETURN_UNICODEL(str.u + start, end-start, ZSTR_DUPLICATE); Since codepoint (character) #n is not necessarily at offset #n in Unicode strings, we start at the beginning and iterate forward until we have gone through the required number of codepoints to reach the start of the segment. Then we save the location in 'start' and continue iterating through the number of codepoints specified by the length. Once that's done, we can return the segment as a Unicode string. } else { RETURN_STRINGL(str.s + f, l, ZSTR_DUPLICATE); } For native strings, we can return the segment directly. strrev() -------- Let's look at strrev() which requires somewhat more complicated upgrade. 
While one of the guidelines for upgrades is that combining sequences are not really taken into account during processing -- substr() can break them up, for example -- in this case, we actually should be concerned, because reversing a combining sequence may result in a completely different string. To illustrate: a (U+0061 LATIN SMALL LETTER A) o (U+006f LATIN SMALL LETTER O) + ' (U+0301 COMBINING ACUTE ACCENT) + _ (U+0320 COMBINING MINUS SIGN BELOW) l (U+006C LATIN SMALL LETTER L) Reversing this would result in: l (U+006C LATIN SMALL LETTER L) + _ (U+0320 COMBINING MINUS SIGN BELOW) + ' (U+0301 COMBINING ACUTE ACCENT) o (U+006f LATIN SMALL LETTER O) a (U+0061 LATIN SMALL LETTER A) All of a sudden the combining marks are being applied to 'l' instead of 'o'. To avoid this, we need to treat combining sequences as a unit, by checking the combining character class of each character with u_getCombiningClass(). strrev() obtains its single argument, a string, and unless the string is of Unicode type, processes it exactly as before, simply swapping bytes around. For the Unicode case, the magic is like this: int32_t i, x1, x2; UChar32 ch; UChar *u_s, *u_n, *u_p; u_n = eumalloc(Z_USTRLEN_PP(str)+1); u_p = u_n; u_s = Z_USTRVAL_PP(str); i = Z_USTRLEN_PP(str); while (i > 0) { U16_PREV(u_s, 0, i, ch); if (u_getCombiningClass(ch) == 0) { u_p += zend_codepoint_to_uchar(ch, u_p); } else { x2 = i; do { U16_PREV(u_s, 0, i, ch); } while (u_getCombiningClass(ch) != 0); x1 = i; while (x1 <= x2) { U16_NEXT(u_s, x1, Z_USTRLEN_PP(str), ch); u_p += zend_codepoint_to_uchar(ch, u_p); } } } *u_p = 0; The basic idea is to walk the string backwards from the end, using the U16_PREV() macro. If the combining class of the current character is 0, meaning it's a base character and not a combining mark, we simply append it to the new string. Otherwise, we save the location of the index and do a run over the characters until we get to the next one with combining class 0. 
At that point we append the sequence as is, without reversing, to the new string. Voila. Note that the code uses zend_codepoint_to_uchar() to convert full Unicode characters (UChar32 type) to 1 or 2 UTF-16 code units (UChar type). realpath() ---------- Filenames use their own converter as it's not uncommon, for example, to need to access files on a filesystem with latin1 entries while outputting UTF8 runtime content. The most common approach to parsing filenames can be found in realpath(): zval **ppfilename; char *filename; int filename_len; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Z", &ppfilename) == FAILURE || php_stream_path_param_encode(ppfilename, &filename, &filename_len, REPORT_ERRORS, FG(default_context)) == FAILURE) { return; } Here, the filename is taken first as a generic zval**, then converted (separating if necessary) and populated into local char* and int storage. The filename will be converted according to unicode.filesystem_encoding unless the wrapper specified overrides this with its own conversion function (The http:// wrapper, for example, enforces utf8 conversion). rmdir() ------- If the function accepts a context parameter, then this context should be used in place of FG(default_context) zval **ppdir, *zcontext = NULL; char *dir; int dir_len; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Z|r", &ppdir, &zcontext) == FAILURE) { return; } context = php_stream_context_from_zval(zcontext, 0); if (php_stream_path_param_encode(ppdir, &dir, &dir_len, REPORT_ERRORS, context) == FAILURE) { return; } sqlite_query() -------------- If the function's underlying library expects a particular encoding (i.e. UTF8), then the alternate form of the string parameter may be used with zend_parse_parameters(). 
char *sql; int sql_len; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&", &sql, &sql_len, UG(utf8_conv)) == FAILURE) { return; } コンバータ ========== 標準のコンバータ ---------------- The following converters (UConverter*) are initialized by Zend and are always available (regardless of UG(unicode) mode): UG(utf8_conv) UG(ascii_conv) UG(fallback_encoding_conv) - UTF8 unless overridden by INI setting unicode.fallback_encoding Additional converters will be optionally initialized depending on INI settings: UG(runtime_encoding_conv) - unicode.runtime_encoding . Unicode output generated by a script will be encoded using this converter UG(script_encoding_conv) - unicode.script_encoding . Scripts read from disk will be decoded using this converter UG(filesystem_encoding_conv) - unicode.filesystem_encoding . Filenames and paths will be encoded using this converter Since these additional converters may not be instantiated (because their INI value is not set), all uses of these converters must be wrapped in ZEND_U_CONVERTER() for safety. If the converter hasn't been instantiated, then UG(fallback_encoding_conv) will be used instead. For example, RETURN_RT_STRING("foo", ZSTR_DUPLICATE); expands out to: RETURN_U_STRING(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), "foo", ZSTR_DUPLICATE); Which uses UG(runtime_encoding_conv) if it's been set, otherwise using UG(fallback_encoding_conv). Note that the INI setting unicode.stream_encoding does not instantiate a UConverter* automatically for use by the process/thread, it stores the value as a string for use during fopen() style calls where a UConverter* is instantiated for that particular stream. 参照 ==== [1] http://icu.sourceforge.net/apiref/icu4c/ustring_8h.html#a1 [2] http://icu.sourceforge.net/apiref/icu4c/utf16_8h.html