From d031d3ae329b7f152ed014470285176c34a384e7 Mon Sep 17 00:00:00 2001
From: epriestley <git@epriestley.com>
Date: Thu, 23 Jun 2011 12:01:00 -0700
Subject: [PATCH] Slightly improve UTF-8 handling in Differential

Summary:
See comments. I think this will fix the issue where we end up handing off
garbage to htmlspecialchars() after highlighting a file we've stuck full of \0
bytes.

The right fix for this is to make wordwrap and intraline-diff utf8 aware and
throw this whole thing away. I'll work on that but I think this fixes the
immediate issue.

Test Plan:
Diffed the file with a UTF-8 quote in it and got a reasonable render in
Differential.

Reviewed By: jungejason
Reviewers: jungejason, aran, tuomaspelkonen
CC: aran, jungejason
Differential Revision: 504
---
 .../changeset/DifferentialChangesetParser.php | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
index 2e45839f14..67ecbfc556 100644
--- a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
+++ b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
@@ -186,14 +186,6 @@ class DifferentialChangesetParser {
     $this->parsedHunk = true;
     $lines = $hunk->getChanges();
 
-    // Flatten UTF-8 into "\0". We don't support UTF-8 because the diffing
-    // algorithms are byte-oriented (not character oriented) and everyone seems
-    // to be in agreement that it's fairly reasonable not to allow UTF-8 in
-    // source files. These bytes will later be replaced with a "?" glyph, but
-    // in the meantime we replace them with "\0" since Pygments is happy to
-    // deal with that.
-    $lines = preg_replace('/[\x80-\xFF]/', "\0", $lines);
-
     $lines = str_replace(
       array("\t", "\r\n", "\r"),
       array('  ', "\n",   "\n"),
@@ -702,11 +694,18 @@ class DifferentialChangesetParser {
 
 
   protected function tokenHighlight(&$render) {
+    // TODO: This is really terribly horrible and should be fixed. We have two
+    // byte-oriented algorithms (wordwrap and intraline diff) which are not
+    // unicode-aware and can accept a valid UTF-8 string but emit an invalid
+    // one by adding markup inside the byte sequences of characters. The right
+    // fix here is to make them UTF-8 aware. Short of that, we can repair the
+    // possibly-broken UTF-8 string into a valid UTF-8 string by replacing all
+    // non-ASCII bytes with a Unicode Replacement Character.
     foreach ($render as $key => $text) {
-      $render[$key] = str_replace(
-      "\0",
-      '<span class="uu">'."\xEF\xBF\xBD".'</span>',
-      $text);
+      $render[$key] = preg_replace(
+        '/[\x80-\xFF]/',
+        '<span class="uu">'."\xEF\xBF\xBD".'</span>',
+        $text);
     }
   }