Improve Remarkup parsing performance for certain large input blocks
Summary: Fixes T13487. In PHI1628, an install has a 4MB remarkup corpus which takes a long time to render. This is broadly expected, but a few reasonable improvements fell out of running it through the profiler.

Test Plan:
- Saw local cold-cache end-to-end rendering time drop from 12s to 4s for the highly secret input corpus.
- Verified output has the same hashes before/after.
- Ran all remarkup unit tests.

Maniphest Tasks: T13487

Differential Revision: https://secure.phabricator.com/D20968
This commit is contained in:
@@ -100,6 +100,9 @@ final class PhutilRemarkupNoteBlockRule extends PhutilRemarkupBlockRule {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private function getRegEx() {
|
private function getRegEx() {
|
||||||
|
static $regex;
|
||||||
|
|
||||||
|
if ($regex === null) {
|
||||||
$words = array(
|
$words = array(
|
||||||
'NOTE',
|
'NOTE',
|
||||||
'IMPORTANT',
|
'IMPORTANT',
|
||||||
@@ -111,11 +114,14 @@ final class PhutilRemarkupNoteBlockRule extends PhutilRemarkupBlockRule {
|
|||||||
}
|
}
|
||||||
$words = implode('|', $words);
|
$words = implode('|', $words);
|
||||||
|
|
||||||
return
|
$regex =
|
||||||
'/^(?:'.
|
'/^(?:'.
|
||||||
'(?:\((?P<hideword>'.$words.')\))'.
|
'(?:\((?P<hideword>'.$words.')\))'.
|
||||||
'|'.
|
'|'.
|
||||||
'(?:(?P<showword>'.$words.'):))\s*'.
|
'(?:(?P<showword>'.$words.'):))\s*'.
|
||||||
'/';
|
'/';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return $regex;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -153,33 +153,54 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
|
|||||||
$block_rules = $this->blockRules;
|
$block_rules = $this->blockRules;
|
||||||
$blocks = array();
|
$blocks = array();
|
||||||
$cursor = 0;
|
$cursor = 0;
|
||||||
$prev_block = array();
|
|
||||||
|
$can_merge = array();
|
||||||
|
foreach ($block_rules as $key => $block_rule) {
|
||||||
|
if ($block_rule instanceof PhutilRemarkupDefaultBlockRule) {
|
||||||
|
$can_merge[$key] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$last_block = null;
|
||||||
|
$last_block_key = -1;
|
||||||
|
|
||||||
|
// See T13487. For very large inputs, block separation can dominate
|
||||||
|
// runtime. This is written somewhat clumsily to attempt to handle
|
||||||
|
// very large inputs as gracefully as is practical.
|
||||||
|
|
||||||
while (isset($text[$cursor])) {
|
while (isset($text[$cursor])) {
|
||||||
$starting_cursor = $cursor;
|
$starting_cursor = $cursor;
|
||||||
foreach ($block_rules as $block_rule) {
|
foreach ($block_rules as $block_key => $block_rule) {
|
||||||
$num_lines = $block_rule->getMatchingLineCount($text, $cursor);
|
$num_lines = $block_rule->getMatchingLineCount($text, $cursor);
|
||||||
|
|
||||||
if ($num_lines) {
|
if ($num_lines) {
|
||||||
if ($blocks) {
|
$current_block = array(
|
||||||
$prev_block = last($blocks);
|
|
||||||
}
|
|
||||||
|
|
||||||
$curr_block = array(
|
|
||||||
'start' => $cursor,
|
'start' => $cursor,
|
||||||
'num_lines' => $num_lines,
|
'num_lines' => $num_lines,
|
||||||
'rule' => $block_rule,
|
'rule' => $block_rule,
|
||||||
'is_empty' => self::isEmptyBlock($text, $cursor, $num_lines),
|
'empty' => self::isEmptyBlock($text, $cursor, $num_lines),
|
||||||
'children' => array(),
|
'children' => array(),
|
||||||
|
'merge' => isset($can_merge[$block_key]),
|
||||||
);
|
);
|
||||||
|
|
||||||
if ($prev_block
|
$should_merge = self::shouldMergeParagraphBlocks(
|
||||||
&& self::shouldMergeBlocks($text, $prev_block, $curr_block)) {
|
$text,
|
||||||
$blocks[last_key($blocks)]['num_lines'] += $curr_block['num_lines'];
|
$last_block,
|
||||||
$blocks[last_key($blocks)]['is_empty'] =
|
$current_block);
|
||||||
$blocks[last_key($blocks)]['is_empty'] && $curr_block['is_empty'];
|
|
||||||
|
if ($should_merge) {
|
||||||
|
$last_block['num_lines'] =
|
||||||
|
($last_block['num_lines'] + $current_block['num_lines']);
|
||||||
|
|
||||||
|
$last_block['empty'] =
|
||||||
|
($last_block['empty'] && $current_block['empty']);
|
||||||
|
|
||||||
|
$blocks[$last_block_key] = $last_block;
|
||||||
} else {
|
} else {
|
||||||
$blocks[] = $curr_block;
|
$blocks[] = $current_block;
|
||||||
|
|
||||||
|
$last_block = $current_block;
|
||||||
|
$last_block_key++;
|
||||||
}
|
}
|
||||||
|
|
||||||
$cursor += $num_lines;
|
$cursor += $num_lines;
|
||||||
@@ -192,9 +213,20 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// See T13487. It's common for blocks to be small, and this loop seems to
|
||||||
|
// measure as faster if we manually concatenate blocks than if we
|
||||||
|
// "array_slice()" and "implode()" blocks. This is a bit muddy.
|
||||||
|
|
||||||
foreach ($blocks as $key => $block) {
|
foreach ($blocks as $key => $block) {
|
||||||
$lines = array_slice($text, $block['start'], $block['num_lines']);
|
$min = $block['start'];
|
||||||
$blocks[$key]['text'] = implode('', $lines);
|
$max = $min + $block['num_lines'];
|
||||||
|
|
||||||
|
$lines = '';
|
||||||
|
for ($ii = $min; $ii < $max; $ii++) {
|
||||||
|
$lines .= $text[$ii];
|
||||||
|
}
|
||||||
|
|
||||||
|
$blocks[$key]['text'] = $lines;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stop splitting child blocks apart if we get too deep. This arrests
|
// Stop splitting child blocks apart if we get too deep. This arrests
|
||||||
@@ -246,30 +278,48 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
|
|||||||
return $output;
|
return $output;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static function shouldMergeBlocks($text, $prev_block, $curr_block) {
|
private static function shouldMergeParagraphBlocks(
|
||||||
$block_rules = ipull(array($prev_block, $curr_block), 'rule');
|
$text,
|
||||||
|
$last_block,
|
||||||
|
$current_block) {
|
||||||
|
|
||||||
$default_rule = 'PhutilRemarkupDefaultBlockRule';
|
// If we're at the beginning of the input, we can't merge.
|
||||||
try {
|
if ($last_block === null) {
|
||||||
assert_instances_of($block_rules, $default_rule);
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// If the last block was empty keep merging
|
// If the previous block wasn't a default block, we can't merge.
|
||||||
if ($prev_block['is_empty']) {
|
if (!$last_block['merge']) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the current block isn't a default block, we can't merge.
|
||||||
|
if (!$current_block['merge']) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the last block was empty, we definitely want to merge.
|
||||||
|
if ($last_block['empty']) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If this line is blank keep merging
|
// If this block is empty, we definitely want to merge.
|
||||||
if ($curr_block['is_empty']) {
|
if ($current_block['empty']) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the current line and the last line have content, keep merging
|
// Check if the last line of the previous block or the first line of this
|
||||||
if (strlen(trim($text[$curr_block['start'] - 1]))) {
|
// block have any non-whitespace text. If they both do, we're going to
|
||||||
if (strlen(trim($text[$curr_block['start']]))) {
|
// merge.
|
||||||
|
|
||||||
|
// If either of them are a blank line or a line with only whitespace, we
|
||||||
|
// do not merge: this means we've found a paragraph break.
|
||||||
|
|
||||||
|
$tail = $text[$current_block['start'] - 1];
|
||||||
|
$head = $text[$current_block['start']];
|
||||||
|
if (strlen(trim($tail)) && strlen(trim($head))) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} catch (Exception $e) {}
|
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
|
abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
|
||||||
|
|
||||||
|
private $referencePattern;
|
||||||
|
private $embedPattern;
|
||||||
|
|
||||||
const KEY_RULE_OBJECT = 'rule.object';
|
const KEY_RULE_OBJECT = 'rule.object';
|
||||||
const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned';
|
const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned';
|
||||||
|
|
||||||
@@ -192,14 +195,20 @@ abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private function getObjectEmbedPattern() {
|
private function getObjectEmbedPattern() {
|
||||||
|
if ($this->embedPattern === null) {
|
||||||
$prefix = $this->getObjectNamePrefix();
|
$prefix = $this->getObjectNamePrefix();
|
||||||
$prefix = preg_quote($prefix);
|
$prefix = preg_quote($prefix);
|
||||||
$id = $this->getObjectIDPattern();
|
$id = $this->getObjectIDPattern();
|
||||||
|
|
||||||
return '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
|
$this->embedPattern =
|
||||||
|
'(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
|
||||||
|
}
|
||||||
|
|
||||||
|
return $this->embedPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function getObjectReferencePattern() {
|
private function getObjectReferencePattern() {
|
||||||
|
if ($this->referencePattern === null) {
|
||||||
$prefix = $this->getObjectNamePrefix();
|
$prefix = $this->getObjectNamePrefix();
|
||||||
$prefix = preg_quote($prefix);
|
$prefix = preg_quote($prefix);
|
||||||
|
|
||||||
@@ -223,7 +232,11 @@ abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
|
|||||||
// The "\b" allows us to link "(abcdef)" or similar without linking things
|
// The "\b" allows us to link "(abcdef)" or similar without linking things
|
||||||
// in the middle of words.
|
// in the middle of words.
|
||||||
|
|
||||||
return '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
|
$this->referencePattern =
|
||||||
|
'((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
|
||||||
|
}
|
||||||
|
|
||||||
|
return $this->referencePattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user