163 lines
4.3 KiB
PHP
163 lines
4.3 KiB
PHP
<?php
|
|
|
|
namespace dokuwiki\Utf8;
|
|
|
|
/**
 * Methods to convert from and to UTF-8 strings
 *
 * All code point level work is delegated to the Unicode class of this
 * namespace; this class layers HTML entity and UTF-16BE conversions on top.
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied through verbatim unless $all is set,
     * so markup characters such as & and < are NOT escaped by default.
     * Code points below 0x100 that are encoded use decimal entities (&#233;),
     * everything above uses hexadecimal entities (&#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool   $all Encode ASCII characters (below 0x80) as entities as well
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $codepoints = Unicode::fromUtf8($str);
        if (!is_array($codepoints)) {
            // NOTE(review): presumably fromUtf8() reports undecodable input with a
            // non-array value - confirm against Unicode. The result stays the
            // empty string as before, just without iterating a non-array.
            return '';
        }

        $ret = '';
        foreach ($codepoints as $cp) {
            if ($cp < 0x80 && !$all) {
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (&#38; or &#x26;) are decoded. Pass a
     * truthy $entities flag (e.g. HTML_ENTITIES) and named entities such as
     * &amp; or &lt; are decoded as well.
     *
     * Everything is decoded in one single pass, so text produced by decoding
     * an entity is never decoded a second time. This avoids the double
     * decoding you get from chaining a numeric and a named decoder:
     *
     *   fromHtml('&amp;#38;', true) -> '&#38;'   (not '&')
     *
     * Entities that cannot be decoded are left in the string untouched.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string  $str      UTF-8 encoded string
     * @param boolean $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        // Both patterns are built so that the callbacks see the same layout:
        // group 2 is the optional hex marker, group 3 the digits or the name.
        if (!$entities) {
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are handed to decodeNumericEntity(); named ones are
     * looked up in PHP's HTML 4.01 entity table. Unknown names are returned
     * unchanged.
     *
     * @param string[] $ent Regex match: [0] full entity, [1] '#' for numeric
     *                      entities, [2] hex marker, [3] digits or name
     * @return string
     */
    protected static function decodeAnyEntity($ent)
    {
        // named entity lookup table, built once per request
        static $table = null;

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if ($table === null) {
            // Ask for the table in UTF-8 explicitly: its characters can then be
            // used as they are. Running ord() over them, as was done for the
            // former single byte table, only looks at the first byte of a
            // multi byte character and yields the wrong code point.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns used by fromHtml() accept any alphanumeric run after the
     * "&#", so the digits are validated here. Anything that is not a valid
     * decimal or hexadecimal number, lies beyond U+10FFFF or cannot be
     * encoded is returned unchanged instead of being turned into a NUL
     * character or dropped.
     *
     * @param string[] $ent Regex match: [0] full entity, [2] 'x' or 'X' for
     *                      hexadecimal entities, [3] the digits
     * @return string
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];
        $isHex = ($ent[2] === 'x' || $ent[2] === 'X');

        $valid = $isHex
            ? preg_match('/^[0-9A-Fa-f]+$/D', $digits)
            : preg_match('/^[0-9]+$/D', $digits);
        if (!$valid) {
            return $ent[0];
        }

        // Bound the number of significant digits before converting, so that
        // absurdly long numbers can neither overflow nor turn into floats.
        // U+10FFFF has 6 hexadecimal or 7 decimal digits.
        $significant = ltrim($digits, '0');
        if (strlen($significant) > ($isHex ? 6 : 7)) {
            return $ent[0];
        }

        $cp = $isHex ? hexdec($significant) : (int) $significant;
        if ($cp > 0x10FFFF) {
            return $ent[0];
        }

        // NOTE(review): toUtf8() is documented here as string|false; a false
        // presumably marks code points it refuses to encode - confirm.
        $char = Unicode::toUtf8([$cp]);
        return ($char === false) ? $ent[0] : $char;
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. The fallback
     * encodes the code points itself and writes code points above U+FFFF as
     * surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool   $bom Prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        $codepoints = Unicode::fromUtf8($str);
        if (!is_array($codepoints)) {
            // NOTE(review): presumably signals undecodable input - confirm.
            // Same result as before, minus the attempt to iterate a non-array.
            return $out;
        }

        foreach ($codepoints as $cp) {
            if ($cp > 0xFFFF) {
                // pack('n') keeps the low 16 bits only, so supplementary code
                // points have to be split into a high and a low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * Reads the input as big endian 16 bit units, joins surrogate pairs into
     * the code point they represent and encodes the result as UTF-8. A byte
     * order mark is not treated specially. Unpaired surrogates are passed to
     * Unicode::toUtf8() as they are.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            return false;
        }

        // unpack() numbers its results from 1, normalize to a plain list
        $units = array_values($units);
        $total = count($units);

        $codepoints = [];
        for ($i = 0; $i < $total; $i++) {
            $unit = $units[$i];
            $isHighSurrogate = ($unit >= 0xD800 && $unit <= 0xDBFF);

            if ($isHighSurrogate && $i + 1 < $total) {
                $next = $units[$i + 1];
                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($next - 0xDC00);
                    $i++; // the low surrogate has been consumed as well
                    continue;
                }
            }

            $codepoints[] = $unit;
        }

        return Unicode::toUtf8($codepoints);
    }
}
|