qdL+on error.
 *
 * @ingroup php_wrappers
 */
function drupal_xml_parser_create(&$data) {
  // Default XML encoding is UTF-8.
  $encoding = 'utf-8';
  $bom = FALSE;

  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
  // The mark is stripped from $data, which the caller sees by reference.
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
    $bom = TRUE;
    $data = substr($data, 3);
  }

  // Check for an encoding declaration in the XML prolog if no BOM was found.
  // XML allows the value to be wrapped in double or single quotes and permits
  // whitespace around the equals sign, so accept all of those forms.
  if (!$bom && preg_match('/^<\?xml[^>]+encoding\s*=\s*(["\'])(.+?)\1/', $data, $match)) {
    $encoding = $match[2];
  }

  // Unsupported encodings are converted here into UTF-8.
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
  if (!in_array(strtolower($encoding), $php_supported, TRUE)) {
    $out = drupal_convert_to_utf8($data, $encoding);
    if ($out === FALSE) {
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
      return FALSE;
    }
    $encoding = 'utf-8';
    // Rewrite the declaration so it matches the converted payload.
    $data = preg_replace('/^(<\?xml[^>]+encoding)\s*=\s*(["\'])(.+?)\2/', '\\1="utf-8"', $out);
  }

  $xml_parser = xml_parser_create($encoding);
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
  return $xml_parser;
}

/**
 * Converts data to UTF-8.
 *
 * Requires the iconv, GNU recode or mbstring PHP extension. The first of
 * iconv, mbstring and recode that is available is used; if none is available
 * an error is logged through watchdog().
 *
 * @param $data
 *   The data to be converted.
 * @param $encoding
 *   The encoding that the data is in.
 *
 * @return
 *   Converted data or FALSE.
 */
function drupal_convert_to_utf8($data, $encoding) {
  // Conversion warnings are suppressed on purpose: failure is reported to the
  // caller through the return value instead.
  if (function_exists('iconv')) {
    $out = @iconv($encoding, 'utf-8', $data);
  }
  elseif (function_exists('mb_convert_encoding')) {
    $out = @mb_convert_encoding($data, 'utf-8', $encoding);
  }
  elseif (function_exists('recode_string')) {
    $out = @recode_string($encoding . '..utf-8', $data);
  }
  else {
    watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
    return FALSE;
  }

  return $out;
}

/**
 * Truncates a UTF-8-encoded string safely to a number of bytes.
* * If the end position is in the middle of a UTF-8 sequence, it scans backwards * until the beginning of the byte sequence. * * Use this function whenever you want to chop off a string at an unsure * location. On the other hand, if you're sure that you're splitting on a * character boundary (e.g. after using strpos() or similar), you can safely * use substr() instead. * * @param $string * The string to truncate. * @param $len * An upper limit on the returned string length. * * @return * The truncated string. */ function drupal_truncate_bytes($string, $len) { if (strlen($string) <= $len) { return $string; } if ((ord($string[$len]) < 0x80) || (ord($string[$len]) >= 0xC0)) { return substr($string, 0, $len); } // Scan backwards to beginning of the byte sequence. while (--$len >= 0 && ord($string[$len]) >= 0x80 && ord($string[$len]) < 0xC0); return substr($string, 0, $len); } /** * Truncates a UTF-8-encoded string safely to a number of characters. * * @param $string * The string to truncate. * @param $max_length * An upper limit on the returned string length, including trailing ellipsis * if $add_ellipsis is TRUE. * @param $wordsafe * If TRUE, attempt to truncate on a word boundary. Word boundaries are * spaces, punctuation, and Unicode characters used as word boundaries in * non-Latin languages; see PREG_CLASS_UNICODE_WORD_BOUNDARY for more * information. If a word boundary cannot be found that would make the length * of the returned string fall within length guidelines (see parameters * $max_length and $min_wordsafe_length), word boundaries are ignored. * @param $add_ellipsis * If TRUE, add t('...') to the end of the truncated string (defaults to * FALSE). The string length will still fall within $max_length. * @param $min_wordsafe_length * If $wordsafe is TRUE, the minimum acceptable length for truncation (before * adding an ellipsis, if $add_ellipsis is TRUE). Has no effect if $wordsafe * is FALSE. 
This can be used to prevent having a very short resulting string * that will not be understandable. For instance, if you are truncating the * string "See myverylongurlexample.com for more information" to a word-safe * return length of 20, the only available word boundary within 20 characters * is after the word "See", which wouldn't leave a very informative string. If * you had set $min_wordsafe_length to 10, though, the function would realise * that "See" alone is too short, and would then just truncate ignoring word * boundaries, giving you "See myverylongurl..." (assuming you had set * $add_ellipses to TRUE). * * @return string * The truncated string. */ function truncate_utf8($string, $max_length, $wordsafe = FALSE, $add_ellipsis = FALSE, $min_wordsafe_length = 1) { $ellipsis = ''; $max_length = max($max_length, 0); $min_wordsafe_length = max($min_wordsafe_length, 0); if (drupal_strlen($string) <= $max_length) { // No truncation needed, so don't add ellipsis, just return. return $string; } if ($add_ellipsis) { // Truncate ellipsis in case $max_length is small. $ellipsis = drupal_substr(t('...'), 0, $max_length); $max_length -= drupal_strlen($ellipsis); $max_length = max($max_length, 0); } if ($max_length <= $min_wordsafe_length) { // Do not attempt word-safe if lengths are bad. $wordsafe = FALSE; } if ($wordsafe) { $matches = array(); // Find the last word boundary, if there is one within $min_wordsafe_length // to $max_length characters. preg_match() is always greedy, so it will // find the longest string possible. $found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']/u', $string, $matches); if ($found) { $string = $matches[1]; } else { $string = drupal_substr($string, 0, $max_length); } } else { $string = drupal_substr($string, 0, $max_length); } if ($add_ellipsis) { $string .= $ellipsis; } return $string; } /** * Encodes MIME/HTTP header values that contain incorrectly encoded characters. 
* * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=". * * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information. * * Notes: * - Only encode strings that contain non-ASCII characters. * - We progressively cut-off a chunk with truncate_utf8(). This is to ensure * each chunk starts and ends on a character boundary. * - Using \n as the chunk separator may cause problems on some systems and may * have to be changed to \r\n or \r. * * @param $string * The header to encode. * * @return string * The mime-encoded header. * * @see mime_header_decode() */ function mime_header_encode($string) { if (preg_match('/[^\x20-\x7E]/', $string)) { $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75); $len = strlen($string); $output = ''; while ($len > 0) { $chunk = drupal_truncate_bytes($string, $chunk_size); $output .= ' =?UTF-8?B?' . base64_encode($chunk) . "?=\n"; $c = strlen($chunk); $string = substr($string, $c); $len -= $c; } return trim($output); } return $string; } /** * Decodes MIME/HTTP encoded header values. * * @param $header * The header to decode. * * @return string * The mime-decoded header. * * @see mime_header_encode() */ function mime_header_decode($header) { // First step: encoded chunks followed by other encoded chunks (need to collapse whitespace) $header = preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/', '_mime_header_decode', $header); // Second step: remaining chunks (do not collapse whitespace) return preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/', '_mime_header_decode', $header); } /** * Decodes encoded header data passed from mime_header_decode(). * * Callback for preg_replace_callback() within mime_header_decode(). * * @param $matches * The array of matches from preg_replace_callback(). * * @return string * The mime-decoded string. 
* * @see mime_header_decode() */ function _mime_header_decode($matches) { // Regexp groups: // 1: Character set name // 2: Escaping method (Q or B) // 3: Encoded data $data = ($matches[2] == 'B') ? base64_decode($matches[3]) : str_replace('_', ' ', quoted_printable_decode($matches[3])); if (strtolower($matches[1]) != 'utf-8') { $data = drupal_convert_to_utf8($data, $matches[1]); } return $data; } /** * Decodes all HTML entities (including numerical ones) to regular UTF-8 bytes. * * Double-escaped entities will only be decoded once ("&lt;" becomes "<" * , not "<"). Be careful when using this function, as decode_entities can * revert previous sanitization efforts (<script> will become