Full-text search using postgresql #162

Merged
Oleg-Komarov merged 10 commits from fts into main 2024-06-03 20:07:23 +02:00
Showing only changes of commit 16e82fde28 - Show all commits

View File

@ -2,7 +2,7 @@ import logging
from django.conf import settings
from django.contrib.auth import get_user_model
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from django.db import connection
from django.db.models import Q
from django.shortcuts import get_object_or_404, redirect
from django.views.generic.list import ListView
@ -24,7 +24,15 @@ log = logging.getLogger(__name__)
class ListedExtensionsView(ListView):
    """Base list view over listed extensions, exposed to templates as ``extensions``."""

    model = Extension
    # Single effective definition of the base queryset. A bare
    # `Extension.objects.listed` assignment placed before this one would be
    # overwritten immediately, so only the prefetching form is kept.
    # NOTE(review): the prefetch list presumably mirrors the relations the
    # templates touch per extension -- confirm when adding or removing entries.
    queryset = Extension.objects.listed.prefetch_related(
        'authors',
        'latest_version__file',
        'latest_version__tags',
        'preview_set',
        'preview_set__file',
        'ratings',
        'team',
    )
    context_object_name = 'extensions'
@ -34,19 +42,7 @@ class HomeView(ListedExtensionsView):
def get_context_data(self, **kwargs):
    """Extend the template context with short add-on and theme listings.

    Adds two entries on top of the parent context:

    * ``addons``: at most 8 items of type ``EXTENSION_TYPE_CHOICES.BPY``;
    * ``themes``: at most 8 items of type ``EXTENSION_TYPE_CHOICES.THEME``.

    Both are filtered from the queryset returned by the parent's
    ``get_queryset()``.
    """
    context = super().get_context_data(**kwargs)
    # Use the parent's queryset as-is. A separately built
    # `.prefetch_related(...)` chain assigned to the same name just before
    # this line was a dead store (it was overwritten immediately), so it
    # has been dropped.
    listed = super().get_queryset()
    context['addons'] = listed.filter(type=EXTENSION_TYPE_CHOICES.BPY)[:8]
    context['themes'] = listed.filter(type=EXTENSION_TYPE_CHOICES.THEME)[:8]
    return context
@ -90,39 +86,68 @@ class SearchView(ListedExtensionsView):
if self.kwargs.get('type_slug'):
_type = self._get_type_id_by_slug()
queryset = queryset.filter(type=_type)
if 'q' in self.request.GET:
# using DEBUG as a shortcut for checking if we run on postgres vs sqlite
if settings.DEBUG:
qs = self.request.GET['q'].split()
search_query = Q()
for token in qs:
search_query &= (
Q(slug__icontains=token)
| Q(name__icontains=token)
| Q(description__icontains=token)
| Q(latest_version__tags__name__icontains=token)
)
queryset = queryset.filter(search_query).distinct()
else:
query = SearchQuery(self.request.GET['q'], search_type='websearch')
vector = (
SearchVector('name', weight='A')
+ SearchVector('description', weight='B')
+ SearchVector('latest_version__tags__name', weight='C')
search_query = self.request.GET.get('q')
if not search_query:
return queryset
# WARNING: full-text search support only on postgres
# using DEBUG as a shortcut for checking if we run on postgres vs sqlite
if settings.DEBUG and 0:
filter = Q()
for token in search_query.split():
filter &= (
Q(slug__icontains=token)
| Q(name__icontains=token)
| Q(description__icontains=token)
| Q(latest_version__tags__name__icontains=token)
)
rank = SearchRank(vector, query)
queryset = (
queryset.annotate(rank=rank).filter(rank__gte=0.3).distinct().order_by('-rank')
)
return queryset.prefetch_related(
'authors',
'latest_version__file',
'latest_version__tags',
'preview_set',
'preview_set__file',
'ratings',
'team',
)
queryset = queryset.filter(filter).distinct()
else:
queryset = self.postgres_fts(queryset, search_query)
return queryset
def postgres_fts(self, queryset, search_query):
    """Postgres full text search (fast) and a fuzzy trigram search (slow) as a fallback.

    The trigram query is only executed when the full-text query returns no ids.

    Searches Extension name and description only. If we need to extend the functionality,
    it's better to consider using a different approach, e.g. introduce meilisearch.

    Limits the results size to 32 items (2 pages), assuming that nobody will click through
    many pages if we failed to present the vital results on the first page.

    Relies on indexed expressions:

        CREATE INDEX extensions_fts ON extensions_extension USING
            gin ((to_tsvector('english', name) || ' ' || to_tsvector('english', description)));
        CREATE INDEX extensions_trgm_gin ON extensions_extension USING
            gin((((name)::text || ' '::text) || description) gin_trgm_ops);

    Args:
        queryset: queryset that is narrowed down to the primary keys found by the raw SQL
            search; any filters already applied to it still apply.
        search_query: the search string; it is always passed to the database as a bound
            parameter, never interpolated into the SQL text.

    Returns:
        A ``list`` (not a QuerySet) of the matching objects from ``queryset``, in the same
        order in which the database returned their ids.
    """
    fts_sql = """
        select id
        from extensions_extension
        where (
            (to_tsvector('english', name) || ' ' || to_tsvector('english', description))
            @@ websearch_to_tsquery('english', %(query)s)
        ) and is_listed
        limit 32"""
    # `%%>` is the `%>` operator: a literal percent sign has to be doubled because the
    # statement is executed with bound parameters.
    trigram_sql = """
        select id
        from extensions_extension
        where ((name || ' ' || description) %%> %(query)s)
            and is_listed
        order by %(query)s <<<-> (name || ' ' || description)
        limit 32"""
    params = {'query': search_query}

    with connection.cursor() as cursor:
        cursor.execute(fts_sql, params)
        pks = [row[0] for row in cursor.fetchall()]
        if not pks:
            # fallback to fuzzy trigram search
            cursor.execute(trigram_sql, params)
            pks = [row[0] for row in cursor.fetchall()]

    # Keep the order in which the ids came back from the database. Only the trigram
    # fallback has an explicit ORDER BY; the full-text query does not rank its results,
    # so their order is whatever Postgres happened to return.
    # Sorting in Python is fine under the assumption that the list is small (<= 32 ids);
    # the position map avoids a linear `list.index` scan for every sorted item.
    position = {pk: index for index, pk in enumerate(pks)}
    matches = queryset.filter(pk__in=pks).order_by()
    return sorted(matches, key=lambda extension: position[extension.pk])
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)