Full-text search using postgresql #162

Merged
Oleg-Komarov merged 10 commits from fts into main 2024-06-03 20:07:23 +02:00
Showing only changes of commit 16e82fde28 - Show all commits

View File

@ -2,7 +2,7 @@ import logging
from django.conf import settings from django.conf import settings
from django.contrib.auth import get_user_model from django.contrib.auth import get_user_model
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector from django.db import connection
from django.db.models import Q from django.db.models import Q
from django.shortcuts import get_object_or_404, redirect from django.shortcuts import get_object_or_404, redirect
from django.views.generic.list import ListView from django.views.generic.list import ListView
@ -24,7 +24,15 @@ log = logging.getLogger(__name__)
class ListedExtensionsView(ListView): class ListedExtensionsView(ListView):
model = Extension model = Extension
queryset = Extension.objects.listed queryset = Extension.objects.listed.prefetch_related(
'authors',
'latest_version__file',
'latest_version__tags',
'preview_set',
'preview_set__file',
'ratings',
'team',
)
context_object_name = 'extensions' context_object_name = 'extensions'
@ -34,19 +42,7 @@ class HomeView(ListedExtensionsView):
def get_context_data(self, **kwargs): def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs) context = super().get_context_data(**kwargs)
q = ( q = super().get_queryset()
super()
.get_queryset()
.prefetch_related(
'authors',
'latest_version__file',
'latest_version__tags',
'preview_set',
'preview_set__file',
'ratings',
'team',
)
)
context['addons'] = q.filter(type=EXTENSION_TYPE_CHOICES.BPY)[:8] context['addons'] = q.filter(type=EXTENSION_TYPE_CHOICES.BPY)[:8]
context['themes'] = q.filter(type=EXTENSION_TYPE_CHOICES.THEME)[:8] context['themes'] = q.filter(type=EXTENSION_TYPE_CHOICES.THEME)[:8]
return context return context
@ -90,39 +86,68 @@ class SearchView(ListedExtensionsView):
if self.kwargs.get('type_slug'): if self.kwargs.get('type_slug'):
_type = self._get_type_id_by_slug() _type = self._get_type_id_by_slug()
queryset = queryset.filter(type=_type) queryset = queryset.filter(type=_type)
if 'q' in self.request.GET:
search_query = self.request.GET.get('q')
if not search_query:
return queryset
# WARNING: full-text search is supported only on postgres
# using DEBUG as a shortcut for checking if we run on postgres vs sqlite # using DEBUG as a shortcut for checking if we run on postgres vs sqlite
if settings.DEBUG: if settings.DEBUG and 0:
qs = self.request.GET['q'].split() filter = Q()
search_query = Q() for token in search_query.split():
for token in qs: filter &= (
search_query &= (
Q(slug__icontains=token) Q(slug__icontains=token)
| Q(name__icontains=token) | Q(name__icontains=token)
| Q(description__icontains=token) | Q(description__icontains=token)
| Q(latest_version__tags__name__icontains=token) | Q(latest_version__tags__name__icontains=token)
) )
queryset = queryset.filter(search_query).distinct() queryset = queryset.filter(filter).distinct()
else: else:
query = SearchQuery(self.request.GET['q'], search_type='websearch') queryset = self.postgres_fts(queryset, search_query)
vector = ( return queryset
SearchVector('name', weight='A')
+ SearchVector('description', weight='B') def postgres_fts(self, queryset, search_query):
+ SearchVector('latest_version__tags__name', weight='C') """Postgres full text search (fast) and a fuzzy trigram search (slow) as a fallback.
)
rank = SearchRank(vector, query) Searches Extension name and description only. If we need to extend the functionality,
queryset = ( it's better to consider using a different approach, e.g. introduce meilisearch.
queryset.annotate(rank=rank).filter(rank__gte=0.3).distinct().order_by('-rank')
) Limits the results size to 32 items (2 pages), assuming that nobody will click through many
return queryset.prefetch_related( pages if we failed to present the vital results on the first page.
'authors',
'latest_version__file', Relies on indexed expressions:
'latest_version__tags', CREATE INDEX extensions_fts ON extensions_extension USING
'preview_set', gin ((to_tsvector('english', name) || ' ' || to_tsvector('english', description)));
'preview_set__file', CREATE INDEX extensions_trgm_gin ON extensions_extension USING
'ratings', gin((((name)::text || ' '::text) || description) gin_trgm_ops);
'team',
) """
with connection.cursor() as cursor:
sql = """
select id
from extensions_extension
where (
(to_tsvector('english', name) || ' ' || to_tsvector('english', description))
@@ websearch_to_tsquery('english', %(query)s)
) and is_listed
limit 32"""
cursor.execute(sql, {'query': search_query})
pks = [row[0] for row in cursor.fetchall()]
if not pks:
# no full-text matches: fall back to fuzzy trigram search
sql = """
select id
from extensions_extension
where ((name || ' ' || description) %%> %(query)s)
and is_listed
order by %(query)s <<<-> (name || ' ' || description)
limit 32"""
cursor.execute(sql, {'query': search_query})
pks = [row[0] for row in cursor.fetchall()]
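# pks are in the order the SQL query returned them (the trigram fallback orders by
# word-similarity distance; the full-text query has no ORDER BY), keep that order
# looking up pks.index() per row is fine under the assumption that the list is small (limit 32)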
return sorted(queryset.filter(pk__in=pks).order_by(), key=lambda x: pks.index(x.pk))
def get_context_data(self, **kwargs): def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs) context = super().get_context_data(**kwargs)