(backend) keep ordering from fulltext search in results

Keep the ordering by score from the Find API in search/ results; the
fallback search still uses "-updated_at" ordering as the default

Refactor pagination to work with a list instead of a queryset

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-10-31 11:20:46 +01:00
parent 73ea54d02d
commit 99f41edfee
2 changed files with 275 additions and 46 deletions

View File

@@ -14,6 +14,7 @@ from django.contrib.postgres.search import TrigramSimilarity
from django.core.cache import cache
from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
from django.core.paginator import InvalidPage, Paginator
from django.core.validators import URLValidator
from django.db import connection, transaction
from django.db import models as db
@@ -36,6 +37,7 @@ from lasuite.oidc_login.decorators import refresh_oidc_access_token
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
from rest_framework.utils.urls import replace_query_param as drf_replace_query_param
from core import authentication, choices, enums, models
from core.services.ai_services import AIService
@@ -1071,12 +1073,10 @@ class DocumentViewSet(
{"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
)
def _simple_search_queryset(self, params):
def _search_simple(self, request, text):
"""
Returns a queryset filtered by the content of the document title
"""
text = params.validated_data["q"]
# As the 'list' view we get a prefiltered queryset (deleted docs are excluded)
queryset = self.get_queryset()
filterset = DocumentFilter({"title": text}, queryset=queryset)
@@ -1084,25 +1084,76 @@ class DocumentViewSet(
if not filterset.is_valid():
raise drf.exceptions.ValidationError(filterset.errors)
return filterset.filter_queryset(queryset)
queryset = filterset.filter_queryset(queryset)
def _fulltext_search_queryset(self, indexer, token, user, params):
return self.get_response_for_queryset(
queryset.order_by("-updated_at"),
context={
"request": request,
},
)
def _search_fulltext(self, indexer, request, params):
"""
Returns a queryset from the results the fulltext search of Find
"""
access_token = request.session.get("oidc_access_token")
user = request.user
text = params.validated_data["q"]
page_size = params.validated_data.get("page_size", 20)
page_number = params.validated_data.get("page", 1)
queryset = models.Document.objects.all()
# Retrieve the documents ids from Find.
results = indexer.search(
text=text,
token=token,
token=access_token,
visited=get_visited_document_ids_of(queryset, user),
page=params.validated_data.get("page", 1),
page_size=params.validated_data.get("page_size", 20),
page=1,
page_size=100,
)
return queryset.filter(pk__in=results)
docs_by_uuid = {str(d.pk): d for d in queryset.filter(pk__in=results)}
ordered_docs = [docs_by_uuid[id] for id in results]
paginator = Paginator(
ordered_docs, per_page=page_size, allow_empty_first_page=True
)
try:
page = paginator.page(page_number)
except InvalidPage as e:
raise drf.exceptions.NotFound(_("Invalid page.")) from e
serializer = self.get_serializer(
page.object_list,
many=True,
context={
"request": request,
},
)
next_url, prev_url = None, None
if page.has_next():
next_url = request.build_absolute_uri()
next_url = drf_replace_query_param(
next_url, "page", page.next_page_number()
)
if page.has_previous():
prev_url = request.build_absolute_uri()
prev_url = drf_replace_query_param(
prev_url, "page", page.previous_page_number()
)
output = {
"count": paginator.count,
"next": next_url,
"previous": prev_url,
"results": serializer.data,
}
return drf.response.Response(output)
@drf.decorators.action(detail=False, methods=["get"], url_path="search")
@method_decorator(refresh_oidc_access_token)
@@ -1118,29 +1169,17 @@ class DocumentViewSet(
The ordering is always by the most recent first.
"""
access_token = request.session.get("oidc_access_token")
user = request.user
params = serializers.SearchDocumentSerializer(data=request.query_params)
params.is_valid(raise_exception=True)
indexer = get_document_indexer()
if indexer:
queryset = self._fulltext_search_queryset(
indexer, token=access_token, user=user, params=params
)
else:
# The indexer is not configured, we fallback on a simple icontains filter by the
# model field 'title'.
queryset = self._simple_search_queryset(params)
return self._search_fulltext(indexer, request, params=params)
return self.get_response_for_queryset(
queryset.order_by("-updated_at"),
context={
"request": request,
},
)
# The indexer is not configured, we fallback on a simple icontains filter by the
# model field 'title'.
return self._search_simple(request, text=params.validated_data["q"])
@drf.decorators.action(detail=True, methods=["get"], url_path="versions")
def versions_list(self, request, *args, **kwargs):

View File

@@ -2,8 +2,11 @@
Tests for Documents API endpoint in impress's core app: list
"""
import random
from json import loads as json_loads
from django.test import RequestFactory
import pytest
import responses
from faker import Faker
@@ -16,6 +19,15 @@ fake = Faker()
pytestmark = pytest.mark.django_db
def build_search_url(**kwargs):
"""Build absolute uri for search endpoint with ORDERED query arguments"""
return (
RequestFactory()
.get("/api/v1.0/documents/search/", dict(sorted(kwargs.items())))
.build_absolute_uri()
)
@pytest.mark.parametrize("role", models.LinkRoleChoices.values)
@pytest.mark.parametrize("reach", models.LinkReachChoices.values)
@responses.activate
@@ -191,8 +203,62 @@ def test_api_documents_search_format(indexer_settings):
@responses.activate
def test_api_documents_search_pagination(indexer_settings):
"""Documents should be ordered by descending "updated_at" by default"""
@pytest.mark.parametrize(
"pagination, status, expected",
(
(
{"page": 1, "page_size": 10},
200,
{
"count": 10,
"previous": None,
"next": None,
"range": (0, None),
},
),
(
{},
200,
{
"count": 10,
"previous": None,
"next": None,
"range": (0, None),
"api_page_size": 21, # default page_size is 20
},
),
(
{"page": 2, "page_size": 10},
404,
{},
),
(
{"page": 1, "page_size": 5},
200,
{
"count": 10,
"previous": None,
"next": {"page": 2, "page_size": 5},
"range": (0, 5),
},
),
(
{"page": 2, "page_size": 5},
200,
{
"count": 10,
"previous": {"page": 1, "page_size": 5},
"next": None,
"range": (5, None),
},
),
({"page": 3, "page_size": 5}, 404, {}),
),
)
def test_api_documents_search_pagination(
indexer_settings, pagination, status, expected
):
"""Documents should be ordered by descending "score" by default"""
indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
assert get_document_indexer() is not None
@@ -202,35 +268,159 @@ def test_api_documents_search_pagination(indexer_settings):
client = APIClient()
client.force_login(user)
docs = factories.DocumentFactory.create_batch(10)
docs = factories.DocumentFactory.create_batch(10, title="alpha", users=[user])
docs_by_uuid = {str(doc.pk): doc for doc in docs}
api_results = [{"_id": id} for id in docs_by_uuid.keys()]
# reorder randomly to simulate score ordering
random.shuffle(api_results)
# Find response
# pylint: disable-next=assignment-from-none
api_search = responses.add(
responses.POST,
"http://find/api/v1.0/search",
json=[{"_id": str(doc.pk)} for doc in docs],
json=api_results,
status=200,
)
response = client.get(
"/api/v1.0/documents/search/", data={"q": "alpha", "page": 2, "page_size": 5}
"/api/v1.0/documents/search/",
data={
"q": "alpha",
**pagination,
},
)
assert response.status_code == 200
content = response.json()
results = content.pop("results")
assert len(results) == 5
assert response.status_code == status
# Check the query parameters.
assert api_search.call_count == 1
assert api_search.calls[0].response.status_code == 200
assert json_loads(api_search.calls[0].request.body) == {
"q": "alpha",
"visited": [],
"services": ["docs"],
"page_number": 2,
"page_size": 5,
"order_by": "updated_at",
"order_direction": "desc",
}
if response.status_code < 300:
previous_url = (
build_search_url(q="alpha", **expected["previous"])
if expected["previous"]
else None
)
next_url = (
build_search_url(q="alpha", **expected["next"])
if expected["next"]
else None
)
start, end = expected["range"]
content = response.json()
assert content["count"] == expected["count"]
assert content["previous"] == previous_url
assert content["next"] == next_url
results = content.pop("results")
# The find api results ordering by score is kept
assert [r["id"] for r in results] == [r["_id"] for r in api_results[start:end]]
# Check the query parameters.
assert api_search.call_count == 1
assert api_search.calls[0].response.status_code == 200
assert json_loads(api_search.calls[0].request.body) == {
"q": "alpha",
"visited": [],
"services": ["docs"],
"page_number": 1,
"page_size": 100,
"order_by": "updated_at",
"order_direction": "desc",
}
@responses.activate
@pytest.mark.parametrize(
"pagination, status, expected",
(
(
{"page": 1, "page_size": 10},
200,
{"count": 10, "previous": None, "next": None, "range": (0, None)},
),
(
{},
200,
{"count": 10, "previous": None, "next": None, "range": (0, None)},
),
(
{"page": 2, "page_size": 10},
404,
{},
),
(
{"page": 1, "page_size": 5},
200,
{
"count": 10,
"previous": None,
"next": {"page": 2, "page_size": 5},
"range": (0, 5),
},
),
(
{"page": 2, "page_size": 5},
200,
{
"count": 10,
"previous": {"page_size": 5},
"next": None,
"range": (5, None),
},
),
({"page": 3, "page_size": 5}, 404, {}),
),
)
def test_api_documents_search_pagination_endpoint_is_none(
indexer_settings, pagination, status, expected
):
"""Documents should be ordered by descending "-updated_at" by default"""
indexer_settings.SEARCH_INDEXER_QUERY_URL = None
assert get_document_indexer() is None
user = factories.UserFactory()
client = APIClient()
client.force_login(user)
factories.DocumentFactory.create_batch(10, title="alpha", users=[user])
response = client.get(
"/api/v1.0/documents/search/",
data={
"q": "alpha",
**pagination,
},
)
assert response.status_code == status
if response.status_code < 300:
previous_url = (
build_search_url(q="alpha", **expected["previous"])
if expected["previous"]
else None
)
next_url = (
build_search_url(q="alpha", **expected["next"])
if expected["next"]
else None
)
queryset = models.Document.objects.order_by("-updated_at")
start, end = expected["range"]
expected_results = [str(d.pk) for d in queryset[start:end]]
content = response.json()
assert content["count"] == expected["count"]
assert content["previous"] == previous_url
assert content["next"] == next_url
results = content.pop("results")
assert [r["id"] for r in results] == expected_results