Implement basic ngram search for Owners Package names
Summary: Ref T9979. This uses ngrams (specifically, trigrams) to build a reasonably efficient index for substring matching. Specifically, for a package like "Example", with ID 123, we store rows like this: ``` < ex, 123> <exa, 123> <xam, 123> <amp, 123> <mpl, 123> <ple, 123> <le , 123> ``` When the user searches for `exam`, we join this table for packages with tokens `exa` and `xam`. MySQL can do this a lot more efficiently than it can process a `LIKE "%exam%"` query against a huge table. When the user searches for a one-letter or two-letter string, we only search the beginnings of words. This is probably what they want, the only thing we can do quickly, and a reasonable/expected behavior for typeaheads. Test Plan: - Ran storage upgrades and search indexer. - Searched for stuff with "name contains". - Used typeahead and got sensible results. - Searched for `aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz` and saw only 16 joins. Reviewers: chad Reviewed By: chad Maniphest Tasks: T9979 Differential Revision: https://secure.phabricator.com/D14846
This commit is contained in:
@@ -26,6 +26,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||
private $edgeLogicConstraintsAreValid = false;
|
||||
private $spacePHIDs;
|
||||
private $spaceIsArchived;
|
||||
private $ngrams = array();
|
||||
|
||||
protected function getPageCursors(array $page) {
|
||||
return array(
|
||||
@@ -253,6 +254,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||
$joins = array();
|
||||
$joins[] = $this->buildEdgeLogicJoinClause($conn);
|
||||
$joins[] = $this->buildApplicationSearchJoinClause($conn);
|
||||
$joins[] = $this->buildNgramsJoinClause($conn);
|
||||
return $joins;
|
||||
}
|
||||
|
||||
@@ -274,6 +276,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||
$where[] = $this->buildPagingClause($conn);
|
||||
$where[] = $this->buildEdgeLogicWhereClause($conn);
|
||||
$where[] = $this->buildSpacesWhereClause($conn);
|
||||
$where[] = $this->buildNgramsWhereClause($conn);
|
||||
return $where;
|
||||
}
|
||||
|
||||
@@ -324,6 +327,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||
return true;
|
||||
}
|
||||
|
||||
if ($this->shouldGroupNgramResultRows()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1345,6 +1352,138 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||
}
|
||||
|
||||
|
||||
/* -( Ngrams )------------------------------------------------------------- */
|
||||
|
||||
|
||||
/**
 * Register an ngram constraint for this query.
 *
 * Empty values (`null` or the empty string) are ignored and add no
 * constraint. Otherwise the index, the raw value, and the number of elements
 * returned by `phutil_utf8v()` for the value (presumably its length in UTF-8
 * characters -- confirm against libphutil) are recorded for use when the
 * JOIN and WHERE clauses are built.
 *
 * @param PhabricatorSearchNgrams Ngram index to search against.
 * @param string|null Text the user searched for.
 * @return this
 */
protected function withNgramsConstraint(
  PhabricatorSearchNgrams $index,
  $value) {

  // Check for null explicitly before calling strlen(): passing null to
  // strlen() is deprecated as of PHP 8.1. A null value has always meant
  // "no constraint" here, so the outcome is unchanged.
  if ($value !== null && strlen($value)) {
    $this->ngrams[] = array(
      'index' => $index,
      'value' => $value,
      'length' => count(phutil_utf8v($value)),
    );
  }

  return $this;
}
|
||||
|
||||
|
||||
/**
 * Build the JOIN clauses which apply the registered ngram constraints.
 *
 * Each constraint is expanded into a list of ngrams, and every ngram becomes
 * one JOIN against the index's ngram table:
 *
 *   - values of three or more characters use the index's 'query' ngrams;
 *   - two-character values use the index's 'prefix' ngrams;
 *   - shorter values are matched as a space followed by the value, using
 *     `LIKE %>` instead of an exact comparison.
 *
 * At most 16 ngrams are joined; when there are more, 16 are chosen at random.
 *
 * @param AphrontDatabaseConnection Connection used to escape the clauses.
 * @return list<string> JOIN clauses, one per selected ngram.
 */
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
  $terms = array();
  foreach ($this->ngrams as $constraint) {
    $ngram_index = $constraint['index'];
    $text = $constraint['value'];
    $char_count = $constraint['length'];

    if ($char_count >= 3) {
      $is_prefix = false;
      $pieces = $ngram_index->getNgramsFromString($text, 'query');
    } else if ($char_count == 2) {
      $is_prefix = false;
      $pieces = $ngram_index->getNgramsFromString($text, 'prefix');
    } else {
      // Too short to form a full ngram: anchor the value to a leading space
      // and match it with LIKE below.
      $is_prefix = true;
      $pieces = array(' '.$text);
    }

    foreach ($pieces as $piece) {
      $terms[] = array(
        'table' => $ngram_index->getTableName(),
        'ngram' => $piece,
        'prefix' => $is_prefix,
      );
    }
  }

  // MySQL permits at most 61 tables in a single query, and every ngram
  // costs one join toward that limit. For very long query strings, keep
  // only a random selection of 16 ngrams.
  if (count($terms) > 16) {
    shuffle($terms);
    $terms = array_slice($terms, 0, 16);
  }

  $primary_alias = $this->getPrimaryTableAlias();
  $id_column = $primary_alias
    ? qsprintf($conn, '%T.%T', $primary_alias, 'id')
    : qsprintf($conn, '%T', 'id');

  $joins = array();
  $counter = 0;
  foreach ($terms as $term) {
    $counter++;

    // Every join needs a distinct alias: "ngm1", "ngm2", and so on.
    $join_alias = 'ngm'.$counter;

    if (!$term['prefix']) {
      $joins[] = qsprintf(
        $conn,
        'JOIN %T %T ON %T.objectID = %Q AND %T.ngram = %s',
        $term['table'],
        $join_alias,
        $join_alias,
        $id_column,
        $join_alias,
        $term['ngram']);
      continue;
    }

    $joins[] = qsprintf(
      $conn,
      'JOIN %T %T ON %T.objectID = %Q AND %T.ngram LIKE %>',
      $term['table'],
      $join_alias,
      $join_alias,
      $id_column,
      $join_alias,
      $term['ngram']);
  }

  return $joins;
}
|
||||
|
||||
|
||||
/**
 * Build the WHERE clauses which apply the registered ngram constraints.
 *
 * For each constraint, the search value is split into tokens by the index,
 * and every token produces one `<column> LIKE %~` clause against the column
 * named by the index. The column is qualified with the primary table alias
 * when the query has one.
 *
 * @param AphrontDatabaseConnection Connection used to escape the clauses.
 * @return list<string> WHERE clauses, one per token.
 */
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
  $clauses = array();

  foreach ($this->ngrams as $constraint) {
    $ngram_index = $constraint['index'];
    $text = $constraint['value'];

    $column_name = $ngram_index->getColumnName();
    $table_alias = $this->getPrimaryTableAlias();
    $column = $table_alias
      ? qsprintf($conn, '%T.%T', $table_alias, $column_name)
      : qsprintf($conn, '%T', $column_name);

    foreach ($ngram_index->tokenizeString($text) as $token) {
      $clauses[] = qsprintf($conn, '%Q LIKE %~', $column, $token);
    }
  }

  return $clauses;
}
|
||||
|
||||
|
||||
/**
 * Report whether any ngram constraints have been registered on this query.
 *
 * @return bool True if at least one ngram constraint is present.
 */
protected function shouldGroupNgramResultRows() {
  return $this->ngrams ? true : false;
}
|
||||
|
||||
|
||||
/* -( Edge Logic )--------------------------------------------------------- */
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user