Implement basic ngram search for Owners Package names

Summary:
Ref T9979. This uses ngrams (specifically, trigrams) to build a reasonably efficient index for substring matching. Specifically, for a package like "Example", with ID 123, we store rows like this:

```
< ex, 123>
<exa, 123>
<xam, 123>
<amp, 123>
<mpl, 123>
<ple, 123>
<le , 123>
```

When the user searches for `exam`, we join this table for packages with tokens `exa` and `xam`. MySQL can do this a lot more efficiently than it can process a `LIKE "%exam%"` query against a huge table.

When the user searches for a one-letter or two-letter string, we only search the beginnings of words. This is probably what they want, the only thing we can do quickly, and a reasonable/expected behavior for typeaheads.

Test Plan:
  - Ran storage upgrades and search indexer.
  - Searched for stuff with "name contains".
  - Used typehaead and got sensible results.
  - Searched for `aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz` and saw only 16 joins.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T9979

Differential Revision: https://secure.phabricator.com/D14846
This commit is contained in:
epriestley
2015-12-21 12:22:07 -08:00
parent 5c8025c41d
commit 96fe8c0b83
18 changed files with 457 additions and 28 deletions

View File

@@ -26,6 +26,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
private $edgeLogicConstraintsAreValid = false;
private $spacePHIDs;
private $spaceIsArchived;
private $ngrams = array();
protected function getPageCursors(array $page) {
return array(
@@ -253,6 +254,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$joins = array();
$joins[] = $this->buildEdgeLogicJoinClause($conn);
$joins[] = $this->buildApplicationSearchJoinClause($conn);
$joins[] = $this->buildNgramsJoinClause($conn);
return $joins;
}
@@ -274,6 +276,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$where[] = $this->buildPagingClause($conn);
$where[] = $this->buildEdgeLogicWhereClause($conn);
$where[] = $this->buildSpacesWhereClause($conn);
$where[] = $this->buildNgramsWhereClause($conn);
return $where;
}
@@ -324,6 +327,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return true;
}
if ($this->shouldGroupNgramResultRows()) {
return true;
}
return false;
}
@@ -1345,6 +1352,138 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
}
/* -( Ngrams )------------------------------------------------------------- */
protected function withNgramsConstraint(
PhabricatorSearchNgrams $index,
$value) {
if (strlen($value)) {
$this->ngrams[] = array(
'index' => $index,
'value' => $value,
'length' => count(phutil_utf8v($value)),
);
}
return $this;
}
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
$flat = array();
foreach ($this->ngrams as $spec) {
$index = $spec['index'];
$value = $spec['value'];
$length = $spec['length'];
if ($length >= 3) {
$ngrams = $index->getNgramsFromString($value, 'query');
$prefix = false;
} else if ($length == 2) {
$ngrams = $index->getNgramsFromString($value, 'prefix');
$prefix = false;
} else {
$ngrams = array(' '.$value);
$prefix = true;
}
foreach ($ngrams as $ngram) {
$flat[] = array(
'table' => $index->getTableName(),
'ngram' => $ngram,
'prefix' => $prefix,
);
}
}
// MySQL only allows us to join a maximum of 61 tables per query. Each
// ngram is going to cost us a join toward that limit, so if the user
// specified a very long query string, just pick 16 of the ngrams
// at random.
if (count($flat) > 16) {
shuffle($flat);
$flat = array_slice($flat, 0, 16);
}
$alias = $this->getPrimaryTableAlias();
if ($alias) {
$id_column = qsprintf($conn, '%T.%T', $alias, 'id');
} else {
$id_column = qsprintf($conn, '%T', 'id');
}
$idx = 1;
$joins = array();
foreach ($flat as $spec) {
$table = $spec['table'];
$ngram = $spec['ngram'];
$prefix = $spec['prefix'];
$alias = 'ngm'.$idx++;
if ($prefix) {
$joins[] = qsprintf(
$conn,
'JOIN %T %T ON %T.objectID = %Q AND %T.ngram LIKE %>',
$table,
$alias,
$alias,
$id_column,
$alias,
$ngram);
} else {
$joins[] = qsprintf(
$conn,
'JOIN %T %T ON %T.objectID = %Q AND %T.ngram = %s',
$table,
$alias,
$alias,
$id_column,
$alias,
$ngram);
}
}
return $joins;
}
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
$where = array();
foreach ($this->ngrams as $ngram) {
$index = $ngram['index'];
$value = $ngram['value'];
$column = $index->getColumnName();
$alias = $this->getPrimaryTableAlias();
if ($alias) {
$column = qsprintf($conn, '%T.%T', $alias, $column);
} else {
$column = qsprintf($conn, '%T', $column);
}
$tokens = $index->tokenizeString($value);
foreach ($tokens as $token) {
$where[] = qsprintf(
$conn,
'%Q LIKE %~',
$column,
$token);
}
}
return $where;
}
protected function shouldGroupNgramResultRows() {
return (bool)$this->ngrams;
}
/* -( Edge Logic )--------------------------------------------------------- */