dockerwiki/content/inc/Parsing/Lexer/ParallelRegex.php
2021-10-26 13:02:53 +02:00

204 lines
7.0 KiB
PHP

<?php
/**
* Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
* For an intro to the Lexer see:
* https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
*
* @author Marcus Baker http://www.lastcraft.com
*/
namespace dokuwiki\Parsing\Lexer;
/**
 * Compounded regular expression.
 *
 * Several patterns are joined into one alternation, each wrapped in its own
 * capturing group. Any of the contained patterns could match and when one
 * does its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns to match, stored exactly as passed to addPattern() */
    protected $patterns;
    /** @var array labels for above patterns (same index); true means "no explicit label" */
    protected $labels;
    /** @var string|null the compound regex matching all patterns, null while it needs (re)building */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case True for case sensitive, false
     *                      for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->regex = null;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex; it is rebuilt
     * on the next match() or split() call.
     *
     * @param mixed $pattern Perl style regex. Must be UTF-8
     *                       encoded. If it's a string, the (, )
     *                       lose their meaning unless they
     *                       form part of a lookahead or
     *                       lookbehind assertion.
     * @param bool|string $label Label of regex to be returned
     *                           on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match First matched portion of
     *                      subject. Set to "" when nothing matches; left
     *                      untouched when no patterns are registered.
     * @return bool|string False if no match found, label if label exists, true if not
     */
    public function match($subject, &$match)
    {
        if (count($this->patterns) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // Group $i belongs to pattern $i - 1. Groups that did not take part in
        // the match are reported as "", so compare against "" explicitly:
        // a plain truthiness test would also reject the matched text "0".
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            if ($matches[$i] !== '' && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * On a PCRE failure a message is emitted via msg() and the call is
     * treated like "no match".
     *
     * @param string $subject String to match against.
     * @param array $split The split result: array containing, pre-match, match & post-match strings.
     *                     When nothing matches it is set to array($subject, "", "").
     * @return bool|string False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            // preg_match() yields 0 for "no match" and false for an engine
            // error; preg_last_error() tells the two apart.
            switch (preg_last_error()) {
                case PREG_BACKTRACK_LIMIT_ERROR:
                    msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                    break;
                case PREG_RECURSION_LIMIT_ERROR:
                    msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                    break;
                case PREG_BAD_UTF8_ERROR:
                    msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                    break;
                case PREG_INTERNAL_ERROR:
                    msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                    break;
            }

            $split = array($subject, "", "");
            return false;
        }

        // Trailing groups that did not participate are dropped by preg_match(),
        // so the last reported group is the one of the pattern that matched.
        $idx = count($matches) - 2;

        // Cut the subject at the reported byte offset instead of running the
        // matched pattern a second time through preg_split().
        list($matched, $offset) = $matches[0];
        $split = array(
            (string) substr($subject, 0, $offset),
            $matched,
            // cast: substr() may return false instead of "" on old PHP versions
            (string) substr($subject, $offset + strlen($matched)),
        );

        return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are not modified, so the regex can be rebuilt
     * safely after further addPattern() calls.
     *
     * @return null|string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $alternatives = array();
            $cnt = count($this->patterns);
            for ($i = 0; $i < $cnt; $i++) {
                // one capturing group per pattern: group number - 1 == pattern index
                $alternatives[$i] = '(' . $this->escapePattern($this->patterns[$i]) . ')';
            }
            $this->regex = "/" . implode("|", $alternatives) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single pattern so it can be embedded in the compound regex.
     *
     * Plain "(" and ")" are turned into literals, "(?" groups are kept, and
     * "/" (the compound regex delimiter) is escaped outside of backslash
     * sequences.
     *
     * @param string $pattern pattern as passed to addPattern()
     * @return string the escaped pattern, without surrounding group or delimiters
     */
    protected function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all(
            '/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/',
            $pattern,
            $elts
        );

        $escaped = "";
        $level = 0; // number of currently open "(?" groups
        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    // NOTE: a plain "(" does not change $level, so a following
                    // ")" closes the innermost open "(?" group if there is one.
                    $escaped .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $escaped .= '\\';
                    }
                    $escaped .= ')';
                    break;
                case '(?':
                    $level++;
                    $escaped .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) == '\\') {
                        // backslash sequence: already escaped, keep verbatim
                        $escaped .= $elt;
                    } else {
                        $escaped .= str_replace('/', '\/', $elt);
                    }
            }
        }
        return $escaped;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}