Improve Remarkup parsing performance for certain large input blocks
Summary: Fixes T13487. In PHI1628, an install has a 4MB remarkup corpus which takes a long time to render. This is broadly expected, but a few reasonable improvements fell out of running it through the profiler.

Test Plan:
- Saw local cold-cache end-to-end rendering time drop from 12s to 4s for the highly secret input corpus.
- Verified output has the same hashes before/after.
- Ran all remarkup unit tests.

Maniphest Tasks: T13487

Differential Revision: https://secure.phabricator.com/D20968
This commit is contained in:
		| @@ -100,6 +100,9 @@ final class PhutilRemarkupNoteBlockRule extends PhutilRemarkupBlockRule { | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   private function getRegEx() { |   private function getRegEx() { | ||||||
|  |     static $regex; | ||||||
|  |  | ||||||
|  |     if ($regex === null) { | ||||||
|       $words = array( |       $words = array( | ||||||
|         'NOTE', |         'NOTE', | ||||||
|         'IMPORTANT', |         'IMPORTANT', | ||||||
| @@ -111,11 +114,14 @@ final class PhutilRemarkupNoteBlockRule extends PhutilRemarkupBlockRule { | |||||||
|       } |       } | ||||||
|       $words = implode('|', $words); |       $words = implode('|', $words); | ||||||
|  |  | ||||||
|     return |       $regex = | ||||||
|         '/^(?:'. |         '/^(?:'. | ||||||
|         '(?:\((?P<hideword>'.$words.')\))'. |         '(?:\((?P<hideword>'.$words.')\))'. | ||||||
|         '|'. |         '|'. | ||||||
|         '(?:(?P<showword>'.$words.'):))\s*'. |         '(?:(?P<showword>'.$words.'):))\s*'. | ||||||
|         '/'; |         '/'; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     return $regex; | ||||||
|  |   } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -153,33 +153,54 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine { | |||||||
|     $block_rules = $this->blockRules; |     $block_rules = $this->blockRules; | ||||||
|     $blocks = array(); |     $blocks = array(); | ||||||
|     $cursor = 0; |     $cursor = 0; | ||||||
|     $prev_block = array(); |  | ||||||
|  |     $can_merge = array(); | ||||||
|  |     foreach ($block_rules as $key => $block_rule) { | ||||||
|  |       if ($block_rule instanceof PhutilRemarkupDefaultBlockRule) { | ||||||
|  |         $can_merge[$key] = true; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     $last_block = null; | ||||||
|  |     $last_block_key = -1; | ||||||
|  |  | ||||||
|  |     // See T13487. For very large inputs, block separation can dominate | ||||||
|  |     // runtime. This is written somewhat clumsily to attempt to handle | ||||||
|  |     // very large inputs as gracefully as is practical. | ||||||
|  |  | ||||||
|     while (isset($text[$cursor])) { |     while (isset($text[$cursor])) { | ||||||
|       $starting_cursor = $cursor; |       $starting_cursor = $cursor; | ||||||
|       foreach ($block_rules as $block_rule) { |       foreach ($block_rules as $block_key => $block_rule) { | ||||||
|         $num_lines = $block_rule->getMatchingLineCount($text, $cursor); |         $num_lines = $block_rule->getMatchingLineCount($text, $cursor); | ||||||
|  |  | ||||||
|         if ($num_lines) { |         if ($num_lines) { | ||||||
|           if ($blocks) { |           $current_block = array( | ||||||
|             $prev_block = last($blocks); |  | ||||||
|           } |  | ||||||
|  |  | ||||||
|           $curr_block = array( |  | ||||||
|             'start' => $cursor, |             'start' => $cursor, | ||||||
|             'num_lines' => $num_lines, |             'num_lines' => $num_lines, | ||||||
|             'rule' => $block_rule, |             'rule' => $block_rule, | ||||||
|             'is_empty' => self::isEmptyBlock($text, $cursor, $num_lines), |             'empty' => self::isEmptyBlock($text, $cursor, $num_lines), | ||||||
|             'children' => array(), |             'children' => array(), | ||||||
|  |             'merge' => isset($can_merge[$block_key]), | ||||||
|           ); |           ); | ||||||
|  |  | ||||||
|           if ($prev_block |           $should_merge = self::shouldMergeParagraphBlocks( | ||||||
|             && self::shouldMergeBlocks($text, $prev_block, $curr_block)) { |             $text, | ||||||
|             $blocks[last_key($blocks)]['num_lines'] += $curr_block['num_lines']; |             $last_block, | ||||||
|             $blocks[last_key($blocks)]['is_empty'] = |             $current_block); | ||||||
|               $blocks[last_key($blocks)]['is_empty'] && $curr_block['is_empty']; |  | ||||||
|  |           if ($should_merge) { | ||||||
|  |             $last_block['num_lines'] = | ||||||
|  |               ($last_block['num_lines'] + $current_block['num_lines']); | ||||||
|  |  | ||||||
|  |             $last_block['empty'] = | ||||||
|  |               ($last_block['empty'] && $current_block['empty']); | ||||||
|  |  | ||||||
|  |             $blocks[$last_block_key] = $last_block; | ||||||
|           } else { |           } else { | ||||||
|             $blocks[] = $curr_block; |             $blocks[] = $current_block; | ||||||
|  |  | ||||||
|  |             $last_block = $current_block; | ||||||
|  |             $last_block_key++; | ||||||
|           } |           } | ||||||
|  |  | ||||||
|           $cursor += $num_lines; |           $cursor += $num_lines; | ||||||
| @@ -192,9 +213,20 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine { | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // See T13487. It's common for blocks to be small, and this loop seems to | ||||||
|  |     // measure as faster if we manually concatenate blocks than if we | ||||||
|  |     // "array_slice()" and "implode()" blocks. This is a bit muddy. | ||||||
|  |  | ||||||
|     foreach ($blocks as $key => $block) { |     foreach ($blocks as $key => $block) { | ||||||
|       $lines = array_slice($text, $block['start'], $block['num_lines']); |       $min = $block['start']; | ||||||
|       $blocks[$key]['text'] = implode('', $lines); |       $max = $min + $block['num_lines']; | ||||||
|  |  | ||||||
|  |       $lines = ''; | ||||||
|  |       for ($ii = $min; $ii < $max; $ii++) { | ||||||
|  |         $lines .= $text[$ii]; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       $blocks[$key]['text'] = $lines; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Stop splitting child blocks apart if we get too deep. This arrests |     // Stop splitting child blocks apart if we get too deep. This arrests | ||||||
| @@ -246,30 +278,48 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine { | |||||||
|     return $output; |     return $output; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   private static function shouldMergeBlocks($text, $prev_block, $curr_block) { |   private static function shouldMergeParagraphBlocks( | ||||||
|     $block_rules = ipull(array($prev_block, $curr_block), 'rule'); |     $text, | ||||||
|  |     $last_block, | ||||||
|  |     $current_block) { | ||||||
|  |  | ||||||
|     $default_rule = 'PhutilRemarkupDefaultBlockRule'; |     // If we're at the beginning of the input, we can't merge. | ||||||
|     try { |     if ($last_block === null) { | ||||||
|       assert_instances_of($block_rules, $default_rule); |       return false; | ||||||
|  |     } | ||||||
|  |  | ||||||
|       // If the last block was empty keep merging |     // If the previous block wasn't a default block, we can't merge. | ||||||
|       if ($prev_block['is_empty']) { |     if (!$last_block['merge']) { | ||||||
|  |       return false; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // If the current block isn't a default block, we can't merge. | ||||||
|  |     if (!$current_block['merge']) { | ||||||
|  |       return false; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // If the last block was empty, we definitely want to merge. | ||||||
|  |     if ($last_block['empty']) { | ||||||
|       return true; |       return true; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|       // If this line is blank keep merging |     // If this block is empty, we definitely want to merge. | ||||||
|       if ($curr_block['is_empty']) { |     if ($current_block['empty']) { | ||||||
|       return true; |       return true; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|       // If the current line and the last line have content, keep merging |     // Check if the last line of the previous block or the first line of this | ||||||
|       if (strlen(trim($text[$curr_block['start'] - 1]))) { |     // block have any non-whitespace text. If they both do, we're going to | ||||||
|         if (strlen(trim($text[$curr_block['start']]))) { |     // merge. | ||||||
|  |  | ||||||
|  |     // If either of them are a blank line or a line with only whitespace, we | ||||||
|  |     // do not merge: this means we've found a paragraph break. | ||||||
|  |  | ||||||
|  |     $tail = $text[$current_block['start'] - 1]; | ||||||
|  |     $head = $text[$current_block['start']]; | ||||||
|  |     if (strlen(trim($tail)) && strlen(trim($head))) { | ||||||
|       return true; |       return true; | ||||||
|     } |     } | ||||||
|       } |  | ||||||
|     } catch (Exception $e) {} |  | ||||||
|  |  | ||||||
|     return false; |     return false; | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -2,6 +2,9 @@ | |||||||
|  |  | ||||||
| abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule { | abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule { | ||||||
|  |  | ||||||
|  |   private $referencePattern; | ||||||
|  |   private $embedPattern; | ||||||
|  |  | ||||||
|   const KEY_RULE_OBJECT = 'rule.object'; |   const KEY_RULE_OBJECT = 'rule.object'; | ||||||
|   const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned'; |   const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned'; | ||||||
|  |  | ||||||
| @@ -192,14 +195,20 @@ abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule { | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   private function getObjectEmbedPattern() { |   private function getObjectEmbedPattern() { | ||||||
|  |     if ($this->embedPattern === null) { | ||||||
|       $prefix = $this->getObjectNamePrefix(); |       $prefix = $this->getObjectNamePrefix(); | ||||||
|       $prefix = preg_quote($prefix); |       $prefix = preg_quote($prefix); | ||||||
|       $id = $this->getObjectIDPattern(); |       $id = $this->getObjectIDPattern(); | ||||||
|  |  | ||||||
|     return '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u'; |       $this->embedPattern = | ||||||
|  |         '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u'; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     return $this->embedPattern; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   private function getObjectReferencePattern() { |   private function getObjectReferencePattern() { | ||||||
|  |     if ($this->referencePattern === null) { | ||||||
|       $prefix = $this->getObjectNamePrefix(); |       $prefix = $this->getObjectNamePrefix(); | ||||||
|       $prefix = preg_quote($prefix); |       $prefix = preg_quote($prefix); | ||||||
|  |  | ||||||
| @@ -223,7 +232,11 @@ abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule { | |||||||
|       // The "\b" allows us to link "(abcdef)" or similar without linking things |       // The "\b" allows us to link "(abcdef)" or similar without linking things | ||||||
|       // in the middle of words. |       // in the middle of words. | ||||||
|  |  | ||||||
|     return '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u'; |       $this->referencePattern = | ||||||
|  |         '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u'; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     return $this->referencePattern; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 epriestley
					epriestley