From d031d3ae329b7f152ed014470285176c34a384e7 Mon Sep 17 00:00:00 2001 From: epriestley Date: Thu, 23 Jun 2011 12:01:00 -0700 Subject: [PATCH] Slightly improve UTF-8 handling in Differential Summary: See comments. I think this will fix the issue, where we end up handing off garbage to htmlspecialchars() after highlighting a file we've stuck full of \0 bytes. The right fix for this is to make wordwrap and intraline-diff utf8 aware and throw this whole thing away. I'll work on that but I think this fixes the immediate issue. Test Plan: diffed the file with a UTF-8 quote in it and got a reasonable render in Differential Reviewed By: jungejason Reviewers: jungejason, aran, tuomaspelkonen CC: aran, jungejason Differential Revision: 504 --- .../changeset/DifferentialChangesetParser.php | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php index 2e45839f14..67ecbfc556 100644 --- a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php +++ b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php @@ -186,14 +186,6 @@ class DifferentialChangesetParser { $this->parsedHunk = true; $lines = $hunk->getChanges(); - // Flatten UTF-8 into "\0". We don't support UTF-8 because the diffing - // algorithms are byte-oriented (not character oriented) and everyone seems - // to be in agreement that it's fairly reasonable not to allow UTF-8 in - // source files. These bytes will later be replaced with a "?" glyph, but - // in the meantime we replace them with "\0" since Pygments is happy to - // deal with that. 
- $lines = preg_replace('/[\x80-\xFF]/', "\0", $lines); - $lines = str_replace( array("\t", "\r\n", "\r"), array(' ', "\n", "\n"), @@ -702,11 +694,18 @@ class DifferentialChangesetParser { protected function tokenHighlight(&$render) { + // TODO: This is really terribly horrible and should be fixed. We have two + // byte-oriented algorithms (wordwrap and intraline diff) which are not + // unicode-aware and can accept a valid UTF-8 string but emit an invalid + // one by adding markup inside the byte sequences of characters. The right + // fix here is to make them UTF-8 aware. Short of that, we can repair the + // possibly-broken UTF-8 string into a valid UTF-8 string by replacing all + // UTF-8 bytes with a Unicode Replacement Character. foreach ($render as $key => $text) { - $render[$key] = str_replace( - "\0", - ''."\xEF\xBF\xBD".'', - $text); + $render[$key] = preg_replace( + '/[\x80-\xFF]/', + ''."\xEF\xBF\xBD".'', + $text); } }