Identify compound short search tokens in the form "xx.yy" as unqueryable in the search UI

Summary:
Ref T12928. The fulltext index can't match terms like these, so drop them from the query and show the user that there's a problem.

This doesn't fix the underlying problem, but it makes the behavior clearer.

Test Plan:
{F5053703}

{F5053704}

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12928

Differential Revision: https://secure.phabricator.com/D18254
This commit is contained in:
epriestley
2017-07-20 09:08:22 -07:00
parent e9208ed3da
commit 018d1b77bf

View File

@@ -235,7 +235,7 @@ final class PhabricatorMySQLFulltextStorageEngine
$value = $stemmer->stemToken($value);
}
if (phutil_utf8_strlen($value) < $min_length) {
if ($this->isShortToken($value, $min_length)) {
$fulltext_token->setIsShort(true);
continue;
}
@@ -549,4 +549,22 @@ final class PhabricatorMySQLFulltextStorageEngine
return array($min_len, $stopwords);
}
/**
 * Decide whether a search term is too short to be queried.
 *
 * The term is split on runs of periods and is considered short when none of
 * the resulting pieces reaches the minimum length.
 *
 * @param string Search term to examine.
 * @param int Minimum length, in characters, a piece must have to count.
 * @return bool True if no piece of the term is at least `$min_length` long.
 */
private function isShortToken($value, $min_length) {
  // NOTE: The engine tokenizes internally on periods. A term like "ab.cd",
  // where every period-separated piece is short, yields no queryable tokens
  // at all. A term like "example.py" is still meaningful because one piece
  // is long enough. See T12928.
  foreach (preg_split('/[.]+/', $value) as $segment) {
    $is_long_enough = (phutil_utf8_strlen($segment) >= $min_length);
    if ($is_long_enough) {
      // One usable piece is sufficient for the whole term to be queryable.
      return false;
    }
  }

  return true;
}
}