From 99f41edfeea696c6f7820005458e6fc83c80f757 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Fri, 31 Oct 2025 11:20:46 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(backend)=20keep=20ordering=20from=20f?= =?UTF-8?q?ulltext=20search=20in=20results?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep ordering by score from Find API on search/ results and fallback search still uses "-update_at" ordering as default Refactor pagination to work with a list instead of a queryset Signed-off-by: Fabre Florian --- src/backend/core/api/viewsets.py | 89 +++++-- .../documents/test_api_documents_search.py | 232 ++++++++++++++++-- 2 files changed, 275 insertions(+), 46 deletions(-) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index c232fd14c..fb2eabc6e 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -14,6 +14,7 @@ from django.contrib.postgres.search import TrigramSimilarity from django.core.cache import cache from django.core.exceptions import ValidationError from django.core.files.storage import default_storage +from django.core.paginator import InvalidPage, Paginator from django.core.validators import URLValidator from django.db import connection, transaction from django.db import models as db @@ -36,6 +37,7 @@ from lasuite.oidc_login.decorators import refresh_oidc_access_token from rest_framework import filters, status, viewsets from rest_framework import response as drf_response from rest_framework.permissions import AllowAny +from rest_framework.utils.urls import replace_query_param as drf_replace_query_param from core import authentication, choices, enums, models from core.services.ai_services import AIService @@ -1071,12 +1073,10 @@ class DocumentViewSet( {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED ) - def _simple_search_queryset(self, params): + def _search_simple(self, request, text): """ Returns a queryset filtered by the content of the document title """ - text = params.validated_data["q"] - # As the 'list' view we get a prefiltered queryset (deleted docs are excluded) queryset = self.get_queryset() filterset = DocumentFilter({"title": text}, queryset=queryset) @@ -1084,25 +1084,76 @@ class DocumentViewSet( if not filterset.is_valid(): raise drf.exceptions.ValidationError(filterset.errors) - return filterset.filter_queryset(queryset) + queryset = filterset.filter_queryset(queryset) - def _fulltext_search_queryset(self, indexer, token, user, params): + return self.get_response_for_queryset( + queryset.order_by("-updated_at"), + context={ + "request": request, + }, + ) + + def _search_fulltext(self, indexer, request, params): """ Returns a queryset from the results the fulltext search of Find """ + access_token = request.session.get("oidc_access_token") + user = request.user text = params.validated_data["q"] + page_size = params.validated_data.get("page_size", 20) + page_number = params.validated_data.get("page", 1) queryset = models.Document.objects.all() # Retrieve the documents ids from Find. results = indexer.search( text=text, - token=token, + token=access_token, visited=get_visited_document_ids_of(queryset, user), - page=params.validated_data.get("page", 1), - page_size=params.validated_data.get("page_size", 20), + page=1, + page_size=100, ) - return queryset.filter(pk__in=results) + docs_by_uuid = {str(d.pk): d for d in queryset.filter(pk__in=results)} + ordered_docs = [docs_by_uuid[id] for id in results] + + paginator = Paginator( + ordered_docs, per_page=page_size, allow_empty_first_page=True + ) + + try: + page = paginator.page(page_number) + except InvalidPage as e: + raise drf.exceptions.NotFound(_("Invalid page.")) from e + + serializer = self.get_serializer( + page.object_list, + many=True, + context={ + "request": request, + }, + ) + next_url, prev_url = None, None + + if page.has_next(): + next_url = request.build_absolute_uri() + next_url = drf_replace_query_param( + next_url, "page", page.next_page_number() + ) + + if page.has_previous(): + prev_url = request.build_absolute_uri() + prev_url = drf_replace_query_param( + prev_url, "page", page.previous_page_number() + ) + + output = { + "count": paginator.count, + "next": next_url, + "previous": prev_url, + "results": serializer.data, + } + + return drf.response.Response(output) @drf.decorators.action(detail=False, methods=["get"], url_path="search") @method_decorator(refresh_oidc_access_token) @@ -1118,29 +1169,17 @@ class DocumentViewSet( The ordering is always by the most recent first. """ - access_token = request.session.get("oidc_access_token") - user = request.user - params = serializers.SearchDocumentSerializer(data=request.query_params) params.is_valid(raise_exception=True) indexer = get_document_indexer() if indexer: - queryset = self._fulltext_search_queryset( - indexer, token=access_token, user=user, params=params - ) - else: - # The indexer is not configured, we fallback on a simple icontains filter by the - # model field 'title'. - queryset = self._simple_search_queryset(params) + return self._search_fulltext(indexer, request, params=params) - return self.get_response_for_queryset( - queryset.order_by("-updated_at"), - context={ - "request": request, - }, - ) + # The indexer is not configured, we fallback on a simple icontains filter by the + # model field 'title'. + return self._search_simple(request, text=params.validated_data["q"]) @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index 869a8d566..465a8c91f 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -2,8 +2,11 @@ Tests for Documents API endpoint in impress's core app: list """ +import random from json import loads as json_loads +from django.test import RequestFactory + import pytest import responses from faker import Faker @@ -16,6 +19,15 @@ fake = Faker() pytestmark = pytest.mark.django_db +def build_search_url(**kwargs): + """Build absolute uri for search endpoint with ORDERED query arguments""" + return ( + RequestFactory() + .get("/api/v1.0/documents/search/", dict(sorted(kwargs.items()))) + .build_absolute_uri() + ) + + @pytest.mark.parametrize("role", models.LinkRoleChoices.values) @pytest.mark.parametrize("reach", models.LinkReachChoices.values) @responses.activate @@ -191,8 +203,62 @@ def test_api_documents_search_format(indexer_settings): @responses.activate -def test_api_documents_search_pagination(indexer_settings): - """Documents should be ordered by descending "updated_at" by default""" +@pytest.mark.parametrize( + "pagination, status, expected", + ( + ( + {"page": 1, "page_size": 10}, + 200, + { + "count": 10, + "previous": None, + "next": None, + "range": (0, None), + }, + ), + ( + {}, + 200, + { + "count": 10, + "previous": None, + "next": None, + "range": (0, None), + "api_page_size": 21, # default page_size is 20 + }, + ), + ( + {"page": 2, "page_size": 10}, + 404, + {}, + ), + ( + {"page": 1, "page_size": 5}, + 200, + { + "count": 10, + "previous": None, + "next": {"page": 2, "page_size": 5}, + "range": (0, 5), + }, + ), + ( + {"page": 2, "page_size": 5}, + 200, + { + "count": 10, + "previous": {"page": 1, "page_size": 5}, + "next": None, + "range": (5, None), + }, + ), + ({"page": 3, "page_size": 5}, 404, {}), + ), +) +def test_api_documents_search_pagination( + indexer_settings, pagination, status, expected +): + """Documents should be ordered by descending "score" by default""" indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" assert get_document_indexer() is not None @@ -202,35 +268,159 @@ def test_api_documents_search_pagination(indexer_settings): client = APIClient() client.force_login(user) - docs = factories.DocumentFactory.create_batch(10) + docs = factories.DocumentFactory.create_batch(10, title="alpha", users=[user]) + + docs_by_uuid = {str(doc.pk): doc for doc in docs} + api_results = [{"_id": id} for id in docs_by_uuid.keys()] + + # reorder randomly to simulate score ordering + random.shuffle(api_results) # Find response # pylint: disable-next=assignment-from-none api_search = responses.add( responses.POST, "http://find/api/v1.0/search", - json=[{"_id": str(doc.pk)} for doc in docs], + json=api_results, status=200, ) response = client.get( - "/api/v1.0/documents/search/", data={"q": "alpha", "page": 2, "page_size": 5} + "/api/v1.0/documents/search/", + data={ + "q": "alpha", + **pagination, + }, ) - assert response.status_code == 200 - content = response.json() - results = content.pop("results") - assert len(results) == 5 + assert response.status_code == status - # Check the query parameters. - assert api_search.call_count == 1 - assert api_search.calls[0].response.status_code == 200 - assert json_loads(api_search.calls[0].request.body) == { - "q": "alpha", - "visited": [], - "services": ["docs"], - "page_number": 2, - "page_size": 5, - "order_by": "updated_at", - "order_direction": "desc", - } + if response.status_code < 300: + previous_url = ( + build_search_url(q="alpha", **expected["previous"]) + if expected["previous"] + else None + ) + next_url = ( + build_search_url(q="alpha", **expected["next"]) + if expected["next"] + else None + ) + start, end = expected["range"] + + content = response.json() + + assert content["count"] == expected["count"] + assert content["previous"] == previous_url + assert content["next"] == next_url + + results = content.pop("results") + + # The find api results ordering by score is kept + assert [r["id"] for r in results] == [r["_id"] for r in api_results[start:end]] + + # Check the query parameters. + assert api_search.call_count == 1 + assert api_search.calls[0].response.status_code == 200 + assert json_loads(api_search.calls[0].request.body) == { + "q": "alpha", + "visited": [], + "services": ["docs"], + "page_number": 1, + "page_size": 100, + "order_by": "updated_at", + "order_direction": "desc", + } + + +@responses.activate +@pytest.mark.parametrize( + "pagination, status, expected", + ( + ( + {"page": 1, "page_size": 10}, + 200, + {"count": 10, "previous": None, "next": None, "range": (0, None)}, + ), + ( + {}, + 200, + {"count": 10, "previous": None, "next": None, "range": (0, None)}, + ), + ( + {"page": 2, "page_size": 10}, + 404, + {}, + ), + ( + {"page": 1, "page_size": 5}, + 200, + { + "count": 10, + "previous": None, + "next": {"page": 2, "page_size": 5}, + "range": (0, 5), + }, + ), + ( + {"page": 2, "page_size": 5}, + 200, + { + "count": 10, + "previous": {"page_size": 5}, + "next": None, + "range": (5, None), + }, + ), + ({"page": 3, "page_size": 5}, 404, {}), + ), +) +def test_api_documents_search_pagination_endpoint_is_none( + indexer_settings, pagination, status, expected +): + """Documents should be ordered by descending "-updated_at" by default""" + indexer_settings.SEARCH_INDEXER_QUERY_URL = None + + assert get_document_indexer() is None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + factories.DocumentFactory.create_batch(10, title="alpha", users=[user]) + + response = client.get( + "/api/v1.0/documents/search/", + data={ + "q": "alpha", + **pagination, + }, + ) + + assert response.status_code == status + + if response.status_code < 300: + previous_url = ( + build_search_url(q="alpha", **expected["previous"]) + if expected["previous"] + else None + ) + next_url = ( + build_search_url(q="alpha", **expected["next"]) + if expected["next"] + else None + ) + queryset = models.Document.objects.order_by("-updated_at") + start, end = expected["range"] + expected_results = [str(d.pk) for d in queryset[start:end]] + + content = response.json() + + assert content["count"] == expected["count"] + assert content["previous"] == previous_url + assert content["next"] == next_url + + results = content.pop("results") + + assert [r["id"] for r in results] == expected_results