From 3fc817d0882931f2c63a73c971febc3481610d33 Mon Sep 17 00:00:00 2001
From: epriestley
Date: Fri, 24 Jun 2011 10:46:30 -0700
Subject: [PATCH] Revert "Remove UTF-8 kludges from Differential"

This shouldn't have landed.

This reverts commit fe04d8bf70e80a19443e2f6d8a5f745a6e4b4445.
---
 .../changeset/DifferentialChangesetParser.php | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
index fb6c5befa1..b2e6b12973 100644
--- a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
+++ b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
@@ -520,6 +520,9 @@ class DifferentialChangesetParser {
       ipull($this->intra, 1),
       $new_corpus);
 
+    $this->tokenHighlight($this->oldRender);
+    $this->tokenHighlight($this->newRender);
+
     $generated = (strpos($new_corpus_block, '@'.'generated') !== false);
 
     $this->specialAttributes[self::ATTR_GENERATED] = $generated;
@@ -707,6 +710,23 @@ class DifferentialChangesetParser {
     return implode('', $result);
   }
 
+
+  protected function tokenHighlight(&$render) {
+    // TODO: This is really terribly horrible and should be fixed. We have two
+    // byte-oriented algorithms (wordwrap and intraline diff) which are not
+    // unicode-aware and can accept a valid UTF-8 string but emit an invalid
+    // one by adding markup inside the byte sequences of characters. The right
+    // fix here is to make them UTF-8 aware. Short of that, we can repair the
+    // possibly-broken UTF-8 string into a valid UTF-8 string by replacing
+    // each non-ASCII byte (0x80-0xFF) with a Unicode Replacement Character.
+    foreach ($render as $key => $text) {
+      $render[$key] = preg_replace(
+        '/[\x80-\xFF]/',
+        ''."\xEF\xBF\xBD".'',
+        $text);
+    }
+  }
+
   protected function getHighlightFuture($corpus) {
     return $this->highlightEngine->getHighlightFuture(
       $this->filetype,