205 lines
6.2 KiB
PHP
205 lines
6.2 KiB
PHP
<?php
|
|
|
|
namespace dokuwiki\Utf8;
|
|
|
|
/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and operate on PHP byte strings; offsets and
 * lengths are therefore byte based, not character based.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * An empty string counts as ASCII because it contains no high byte.
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F is present
     */
    public static function isASCII($str)
    {
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte must be followed by
     * the number of continuation bytes (10bbbbbb) it announces. Overlong
     * forms, surrogates and the historic 5 and 6 byte sequences are NOT
     * rejected, so a true result does not guarantee strictly valid UTF-8.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);

            if ($b < 0x80) {
                continue; // 0bbbbbbb - plain ASCII, nothing follows
            }

            // the lead byte determines how many continuation bytes must follow
            if (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // does not match any model
            }

            // n bytes matching 10bbbbbb have to follow
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false; // string ended early or byte is no continuation byte
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string. Every byte >= 0x80 is dropped, no
     * transliteration takes place.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string
     */
    public static function strip($str)
    {
        // byte-wise match (no /u modifier), so invalid UTF-8 input is handled as well
        return (string) preg_replace('/[^\x00-\x7F]/', '', $str);
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * In addition to the characters supplied by Table::specialChars() the
     * byte range 0x00 to 0x19 is stripped. Note that this range ends at 0x19,
     * so 0x1A to 0x1F are not covered by it.
     *
     * $additional is inserted into the character class as is (it is not
     * quoted), so callers have to escape regexp meta characters and the '/'
     * delimiter themselves.
     *
     * The pattern uses the /u modifier; according to the PHP manual
     * preg_replace() returns null when the subject is not valid UTF-8.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string The UTF8 string to strip of special chars
     * @param string $repl Replace special with this string
     * @param string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted special char list never changes, build it only once per request
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * Every byte that is not part of a well formed UTF-8 sequence is replaced
     * individually, valid sequences are copied unchanged.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        $pattern =
            '([\x00-\x7F]' .                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
            '|(.{1}))';                               # invalid byte

        // The alternation matches at every offset (valid sequence or single
        // invalid byte), so one pass consumes the whole string without any
        // repeated copying or output buffering.
        $result = preg_replace_callback(
            '/' . $pattern . '/S',
            function ($matches) use ($replace) {
                // group 2 is only present when the catch-all "invalid byte" branch matched
                return isset($matches[2]) ? $replace : $matches[0];
            },
            $str
        );

        return (string) $result;
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacement maps are taken from Table::lowerAccents() and
     * Table::upperAccents() and applied with strtr().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case negative: lower case only, positive: upper case only, 0: both
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned unchanged without consulting the
     * romanization table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) {
            return $string; // nothing to do
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * An index pointing at a continuation byte (10bbbbbb) is moved until it no
     * longer does. Indexes at or below 0 yield 0, indexes at or beyond the end
     * of the string yield the string length.
     *
     * @author chris smith <chris@jalakai.co.uk>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        if ($next) {
            // skip forward over continuation bytes to the start of the next character
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            // step back over continuation bytes to the lead byte of the current character
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }
}
|