Full-text search using postgresql #162

Merged
Oleg-Komarov merged 10 commits from fts into main 2024-06-03 20:07:23 +02:00
Showing only changes of commit 16e82fde28 - Show all commits

View File

@ -2,7 +2,7 @@ import logging
from django.conf import settings from django.conf import settings
from django.contrib.auth import get_user_model from django.contrib.auth import get_user_model
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector from django.db import connection
from django.db.models import Q from django.db.models import Q
from django.shortcuts import get_object_or_404, redirect from django.shortcuts import get_object_or_404, redirect
from django.views.generic.list import ListView from django.views.generic.list import ListView
@ -24,7 +24,15 @@ log = logging.getLogger(__name__)
class ListedExtensionsView(ListView): class ListedExtensionsView(ListView):
model = Extension model = Extension
queryset = Extension.objects.listed queryset = Extension.objects.listed.prefetch_related(
'authors',
'latest_version__file',
'latest_version__tags',
'preview_set',
'preview_set__file',
'ratings',
'team',
)
context_object_name = 'extensions' context_object_name = 'extensions'
@ -34,19 +42,7 @@ class HomeView(ListedExtensionsView):
def get_context_data(self, **kwargs): def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs) context = super().get_context_data(**kwargs)
q = ( q = super().get_queryset()
super()
.get_queryset()
.prefetch_related(
'authors',
'latest_version__file',
'latest_version__tags',
'preview_set',
'preview_set__file',
'ratings',
'team',
)
)
context['addons'] = q.filter(type=EXTENSION_TYPE_CHOICES.BPY)[:8] context['addons'] = q.filter(type=EXTENSION_TYPE_CHOICES.BPY)[:8]
context['themes'] = q.filter(type=EXTENSION_TYPE_CHOICES.THEME)[:8] context['themes'] = q.filter(type=EXTENSION_TYPE_CHOICES.THEME)[:8]
return context return context
@ -90,39 +86,68 @@ class SearchView(ListedExtensionsView):
if self.kwargs.get('type_slug'): if self.kwargs.get('type_slug'):
_type = self._get_type_id_by_slug() _type = self._get_type_id_by_slug()
queryset = queryset.filter(type=_type) queryset = queryset.filter(type=_type)
if 'q' in self.request.GET:
# using DEBUG as a shortcut for checking if we run on postgres vs sqlite search_query = self.request.GET.get('q')
if settings.DEBUG: if not search_query:
qs = self.request.GET['q'].split() return queryset
search_query = Q()
for token in qs: # WARNING: full-text search support only on postgres
search_query &= ( # using DEBUG as a shortcut for checking if we run on postgres vs sqlite
Q(slug__icontains=token) if settings.DEBUG and 0:
| Q(name__icontains=token) filter = Q()
| Q(description__icontains=token) for token in search_query.split():
| Q(latest_version__tags__name__icontains=token) filter &= (
) Q(slug__icontains=token)
queryset = queryset.filter(search_query).distinct() | Q(name__icontains=token)
else: | Q(description__icontains=token)
query = SearchQuery(self.request.GET['q'], search_type='websearch') | Q(latest_version__tags__name__icontains=token)
vector = (
SearchVector('name', weight='A')
+ SearchVector('description', weight='B')
+ SearchVector('latest_version__tags__name', weight='C')
) )
rank = SearchRank(vector, query) queryset = queryset.filter(filter).distinct()
queryset = ( else:
queryset.annotate(rank=rank).filter(rank__gte=0.3).distinct().order_by('-rank') queryset = self.postgres_fts(queryset, search_query)
) return queryset
return queryset.prefetch_related(
'authors', def postgres_fts(self, queryset, search_query):
'latest_version__file', """Postgres full text search (fast) and a fuzzy trigram search (slow) as a fallback.
'latest_version__tags',
'preview_set', Searches Extension name and description only. If we need to extend the functionality,
'preview_set__file', it's better to consider using a different approach, e.g. introduce meilisearch.
'ratings',
'team', Limits the results size to 32 items (2 pages), assuming that nobody will click through many
) pages if we failed to present the vital results on the first page.
Relies on indexed expressions:
CREATE INDEX extensions_fts ON extensions_extension USING
gin ((to_tsvector('english', name) || ' ' || to_tsvector('english', description)));
CREATE INDEX extensions_trgm_gin ON extensions_extension USING
gin((((name)::text || ' '::text) || description) gin_trgm_ops);
"""
# Run the search as raw SQL through a cursor on the imported django.db connection. Both
# statements below receive the user's text as the bound parameter %(query)s; it is never
# interpolated into the SQL string.
with connection.cursor() as cursor:
# First pass: full-text match of the websearch-style query against name + description of
# listed extensions, capped at 32 ids. This statement has no ORDER BY, so the ids come
# back in whatever order the database produces them, not ranked by relevance.
sql = """
select id
from extensions_extension
where (
(to_tsvector('english', name) || ' ' || to_tsvector('english', description))
@@ websearch_to_tsquery('english', %(query)s)
) and is_listed
limit 32"""
cursor.execute(sql, {'query': search_query})
pks = [row[0] for row in cursor.fetchall()]
if not pks:
# Fallback to fuzzy trigram search, reached only when the full-text pass matched nothing.
# `%%` is the escaped form of a literal `%` in a parameterised statement, so the operator
# sent to the database is `%>`. Rows are ordered by their `<<<->` distance to the query.
# NOTE(review): per the pg_trgm docs `%>` filters on word similarity while `<<<->` orders
# by strict word similarity distance; confirm that mixing the two is intentional.
sql = """
select id
from extensions_extension
where ((name || ' ' || description) %%> %(query)s)
and is_listed
order by %(query)s <<<-> (name || ' ' || description)
limit 32"""
cursor.execute(sql, {'query': search_query})
pks = [row[0] for row in cursor.fetchall()]
# pks keep the order the SQL returned them in (only the trigram fallback orders explicitly).
# Re-apply that order in Python, because the ORM query below has its ordering cleared with
# an empty order_by().
# pks.index() is a linear scan per row; this approach is fine under the assumption that the
# list is small (both statements use `limit 32`).
# The value returned is a plain list produced by sorted(), not a queryset.
return sorted(queryset.filter(pk__in=pks).order_by(), key=lambda x: pks.index(x.pk))
def get_context_data(self, **kwargs): def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs) context = super().get_context_data(**kwargs)