Full-text search using PostgreSQL #162

Merged
Oleg-Komarov merged 10 commits from fts into main 2024-06-03 20:07:23 +02:00
2 changed files with 114 additions and 31 deletions

View File

@@ -0,0 +1,41 @@
# Generated by Django 4.2.11 on 2024-06-03 17:18
from django.db import migrations


def create_indexes(apps, schema_editor):
    if schema_editor.connection.vendor != 'postgresql':
        return
    with schema_editor.connection.cursor() as cursor:
        cursor.execute(
            """
            CREATE INDEX extensions_fts ON extensions_extension USING
            gin ((to_tsvector('english', name) || ' ' || to_tsvector('english', description)))
            """
        )
        cursor.execute('create extension if not exists pg_trgm;')
        cursor.execute(
            """
            CREATE INDEX extensions_trgm_gin ON extensions_extension USING
            gin((((name)::text || ' '::text) || description) gin_trgm_ops);
            """
        )


def delete_indexes(apps, schema_editor):
    if schema_editor.connection.vendor != 'postgresql':
        return
    with schema_editor.connection.cursor() as cursor:
        cursor.execute('drop index extensions_fts')
        cursor.execute('drop index extensions_trgm_gin')


class Migration(migrations.Migration):

    dependencies = [
        ('extensions', '0032_extension_extensions__is_list_765936_idx_and_more'),
    ]

    operations = [
        migrations.RunPython(create_indexes, delete_indexes)
    ]
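A quick sanity check after applying this migration (not part of the change itself) is to ask the planner whether it picks up the new expression index. A minimal sketch, assuming a PostgreSQL database and a Django shell (python manage.py shell); the search term is only an example:

# Hypothetical check: confirm the planner can use the extensions_fts GIN index
# for the exact expression it was built on.
from django.db import connection

with connection.cursor() as cursor:
    cursor.execute(
        """
        EXPLAIN
        SELECT id FROM extensions_extension
        WHERE (to_tsvector('english', name) || ' ' || to_tsvector('english', description))
              @@ websearch_to_tsquery('english', %(query)s)
        """,
        {'query': 'node wrangler'},
    )
    # Expect a "Bitmap Index Scan on extensions_fts" node once the table is large
    # enough for the planner to prefer the index over a sequential scan.
    for (line,) in cursor.fetchall():
        print(line)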

View File

@@ -2,6 +2,7 @@ from collections import OrderedDict
 import logging

 from django.contrib.auth import get_user_model
+from django.db import connection
 from django.db.models import Count, Q
 from django.shortcuts import get_object_or_404, redirect
 from django.utils.translation import gettext_lazy as _
@@ -24,7 +25,15 @@ log = logging.getLogger(__name__)
 class ListedExtensionsView(ListView):
     model = Extension
-    queryset = Extension.objects.listed
+    queryset = Extension.objects.listed.prefetch_related(
+        'authors',
+        'latest_version__file',
+        'latest_version__tags',
+        'preview_set',
+        'preview_set__file',
+        'ratings',
+        'team',
+    )
     context_object_name = 'extensions'
@@ -34,20 +43,7 @@ class HomeView(ListedExtensionsView):
     def get_context_data(self, **kwargs):
         context = super().get_context_data(**kwargs)
-        q = (
-            super()
-            .get_queryset()
-            .prefetch_related(
-                'authors',
-                'latest_version__file',
-                'latest_version__tags',
-                'preview_set',
-                'preview_set__file',
-                'ratings',
-                'team',
-            )
-            .order_by('-average_score')
-        )
+        q = super().get_queryset().order_by('-average_score')
         context['addons'] = q.filter(type=EXTENSION_TYPE_CHOICES.BPY)[:8]
         context['themes'] = q.filter(type=EXTENSION_TYPE_CHOICES.THEME)[:8]
         return context
@@ -94,7 +90,7 @@ class SearchView(ListedExtensionsView):
         return sort_by

     def get_queryset(self):
-        queryset = super().get_queryset()
+        queryset = super().get_queryset().order_by(self._get_sort_by())
         if self.kwargs.get('tag_slug'):
             queryset = queryset.filter(
                 latest_version__tags__slug=self.kwargs['tag_slug']
@@ -108,26 +104,72 @@ class SearchView(ListedExtensionsView):
         if self.kwargs.get('type_slug'):
             _type = self._get_type_id_by_slug()
             queryset = queryset.filter(type=_type)
-        if 'q' in self.request.GET:
-            qs = self.request.GET['q'].split()
-            search_query = Q()
-            for token in qs:
-                search_query &= (
+
+        search_query = self.request.GET.get('q')
+        if not search_query:
+            return queryset
+
+        # WARNING: full-text search support only on postgres
+        if connection.vendor == 'postgresql':
+            queryset = self.postgres_fts(queryset, search_query)
+        else:
+            filter = Q()
+            for token in search_query.split():
+                filter &= (
                     Q(slug__icontains=token)
                     | Q(name__icontains=token)
                     | Q(description__icontains=token)
                     | Q(latest_version__tags__name__icontains=token)
                 )
-            queryset = queryset.filter(search_query).distinct()
-        return queryset.prefetch_related(
-            'authors',
-            'latest_version__file',
-            'latest_version__tags',
-            'preview_set',
-            'preview_set__file',
-            'ratings',
-            'team',
-        ).order_by(self._get_sort_by())
+            queryset = queryset.filter(filter).distinct()
+        return queryset
+
+    def postgres_fts(self, queryset, search_query):
+        """Postgres full text search (fast) and a fuzzy trigram search (slow) as a fallback.
+
+        Searches Extension name and description only, ranking name matches higher.
+        If we need to extend the functionality, it's better to consider using a different approach,
+        e.g. introduce meilisearch.
+
+        Limits the results size to 32 items (2 pages), assuming that nobody will click through many
+        pages if we failed to present the vital results on the first page.
+
+        Relies on indexed expressions:
+        CREATE INDEX extensions_fts ON extensions_extension USING
+        gin ((to_tsvector('english', name) || ' ' || to_tsvector('english', description)));
+        CREATE INDEX extensions_trgm_gin ON extensions_extension USING
+        gin((((name)::text || ' '::text) || description) gin_trgm_ops);
+        """
+        with connection.cursor() as cursor:
+            sql = """
+                select id
+                from extensions_extension
+                where (
+                    (to_tsvector('english', name) || ' ' || to_tsvector('english', description))
+                    @@ websearch_to_tsquery('english', %(query)s)
+                ) and is_listed
+                order by ts_rank(
+                    to_tsvector('english', name),
+                    websearch_to_tsquery('english', %(query)s)
+                ) desc
+                limit 32"""
+            cursor.execute(sql, {'query': search_query})
+            pks = [row[0] for row in cursor.fetchall()]
+            if not pks:
+                # fallback to fuzzy trigram search
+                sql = """
+                    select id
+                    from extensions_extension
+                    where ((name || ' ' || description) %%> %(query)s)
+                    and is_listed
+                    order by %(query)s <<<-> (name || ' ' || description)
+                    limit 32"""
+                cursor.execute(sql, {'query': search_query})
+                pks = [row[0] for row in cursor.fetchall()]
+        # pks are ordered by ranking, keep that order
+        # this approach is fine under the assumption that the list is small
+        return sorted(queryset.filter(pk__in=pks).order_by(), key=lambda x: pks.index(x.pk))
+
     def get_context_data(self, **kwargs):
         context = super().get_context_data(**kwargs)
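One detail worth calling out from the hunk above: queryset.filter(pk__in=pks) hands rows back in arbitrary database order, which is why postgres_fts re-sorts them in Python by each row's position in pks. A self-contained sketch of that re-ordering step (the Row class and the sample ids are illustrative only, not from the codebase):

# Illustrative only: reproduce the ranking-preservation trick from postgres_fts().
from dataclasses import dataclass

@dataclass
class Row:  # stand-in for an Extension instance
    pk: int
    name: str

# ids in ranking order, as the FTS or trigram query would return them
pks = [7, 3, 12]
# a pk__in filter returns rows in arbitrary database order
fetched = [Row(3, 'b'), Row(12, 'c'), Row(7, 'a')]
# re-apply the ranking by position in pks; index() is fine for a list capped at 32 items
ranked = sorted(fetched, key=lambda row: pks.index(row.pk))
assert [row.pk for row in ranked] == [7, 3, 12]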