From c6ed7a79f8dc9a9c190a120dca2a12d168b2d926 Mon Sep 17 00:00:00 2001
From: Fabre Florian
Date: Mon, 17 Nov 2025 15:50:14 +0100
Subject: [PATCH] =?UTF-8?q?WIP=20=E2=9C=A8(backend)=20use=20markdown=20as?=
 =?UTF-8?q?=20input=20data=20for=20indexation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Fabre Florian
---
 src/backend/core/services/search_indexers.py  | 37 +++++++++-
 .../tests/test_services_search_indexers.py    | 72 ++++++++++++++++++-
 2 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py
index a4bb9eec6..3878c86fd 100644
--- a/src/backend/core/services/search_indexers.py
+++ b/src/backend/core/services/search_indexers.py
@@ -1,5 +1,6 @@
 """Document search index management utilities and indexers"""
 
+import base64
 import logging
 from abc import ABC, abstractmethod
 from collections import defaultdict
@@ -14,6 +15,12 @@ from django.utils.module_loading import import_string
 import requests
 
 from core import models, utils
+from core.services.converter_services import (
+    ServiceUnavailableError as YProviderServiceUnavailableError,
+)
+from core.services.converter_services import (
+    YdocConverter,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -231,6 +238,32 @@ class SearchIndexer(BaseDocumentIndexer):
     Document indexer that pushes documents to La Suite Find app.
     """
 
+    def to_markdown(self, document):
+        """
+        Convert the document content to markdown with the y-provider service.
+
+        Returns an empty string when the document has no content. When the
+        y-provider service is unavailable, the error is logged and the raw text
+        from utils.base64_yjs_to_text is returned instead, so callers cannot
+        tell from the result alone which format they received.
+        """
+        base64_content = document.content
+        if not base64_content:
+            return ""
+
+        # Convert using the y-provider service
+        try:
+            return YdocConverter().convert(
+                base64.b64decode(base64_content),
+                "application/vnd.yjs.doc",
+                "text/markdown",
+            )
+        except YProviderServiceUnavailableError as e:
+            logger.error(
+                "Error getting content for document %s: %s", document.pk, e
+            )
+            return utils.base64_yjs_to_text(base64_content)
+
     def serialize_document(self, document, accesses):
         """
         Convert a Document to the JSON format expected by La Suite Find.
@@ -243,8 +276,7 @@ class SearchIndexer(BaseDocumentIndexer):
             dict: A JSON-serializable dictionary.
         """
         doc_path = document.path
-        doc_content = document.content
-        text_content = utils.base64_yjs_to_text(doc_content) if doc_content else ""
+        text_content = self.to_markdown(document)
 
         return {
             "id": str(document.id),
@@ -259,6 +291,7 @@ class SearchIndexer(BaseDocumentIndexer):
             "groups": list(accesses.get(doc_path, {}).get("teams", set())),
             "reach": document.computed_link_reach,
             "size": len(text_content.encode("utf-8")),
+            "mimetype": "text/markdown",
             "is_active": not bool(document.ancestors_deleted_at),
         }
 
diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py
index 61488a921..72b3a9a81 100644
--- a/src/backend/core/tests/test_services_search_indexers.py
+++ b/src/backend/core/tests/test_services_search_indexers.py
@@ -13,6 +13,9 @@ import responses
 from requests import HTTPError
 
 from core import factories, models, utils
+from core.services.converter_services import (
+    ServiceUnavailableError as YProviderServiceUnavailableError,
+)
 from core.services.search_indexers import (
     BaseDocumentIndexer,
     SearchIndexer,
@@ -172,7 +175,10 @@ def test_services_search_endpoint_is_empty(indexer_settings):
 
 
 @pytest.mark.usefixtures("indexer_settings")
-def test_services_search_indexers_serialize_document_returns_expected_json():
+@patch("core.services.converter_services.YdocConverter.convert")
+def test_services_search_indexers_serialize_document_returns_expected_json(
+    mock_convert,
+):
     """
     It should serialize documents with correct metadata and access control.
     """
@@ -180,6 +186,12 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
     document = factories.DocumentFactory()
     factories.DocumentFactory(parent=document)
 
+    markdown_content = (
+        f"## {document.title}\n{utils.base64_yjs_to_text(document.content)}"
+    )
+
+    mock_convert.return_value = markdown_content
+
     factories.UserDocumentAccessFactory(document=document, user=user_a)
     factories.UserDocumentAccessFactory(document=document, user=user_b)
     factories.TeamDocumentAccessFactory(document=document, team="team1")
@@ -195,6 +207,8 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
     indexer = SearchIndexer()
     result = indexer.serialize_document(document, accesses)
 
+    assert mock_convert.call_count == 1
+
     assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
     assert set(result.pop("groups")) == {"team1", "team2"}
     assert result == {
@@ -203,11 +217,63 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
         "depth": 1,
         "path": document.path,
         "numchild": 1,
-        "content": utils.base64_yjs_to_text(document.content),
+        "content": markdown_content,
+        "mimetype": "text/markdown",
         "created_at": document.created_at.isoformat(),
         "updated_at": document.updated_at.isoformat(),
         "reach": document.link_reach,
-        "size": 13,
+        "size": len(markdown_content.encode("utf-8")),
+        "is_active": True,
+    }
+
+
+@pytest.mark.usefixtures("indexer_settings")
+@patch("core.services.converter_services.YdocConverter.convert")
+def test_services_search_indexers_serialize_document_no_converter(
+    mock_convert,
+):
+    """
+    It should fall back to raw text content when the y-provider is unavailable.
+    """
+    user_a, user_b = factories.UserFactory.create_batch(2)
+    document = factories.DocumentFactory()
+    factories.DocumentFactory(parent=document)
+
+    mock_convert.side_effect = YProviderServiceUnavailableError()
+
+    text_content = utils.base64_yjs_to_text(document.content)
+
+    factories.UserDocumentAccessFactory(document=document, user=user_a)
+    factories.UserDocumentAccessFactory(document=document, user=user_b)
+    factories.TeamDocumentAccessFactory(document=document, team="team1")
+    factories.TeamDocumentAccessFactory(document=document, team="team2")
+
+    accesses = {
+        document.path: {
+            "users": {str(user_a.sub), str(user_b.sub)},
+            "teams": {"team1", "team2"},
+        }
+    }
+
+    indexer = SearchIndexer()
+    result = indexer.serialize_document(document, accesses)
+
+    assert mock_convert.call_count == 1
+
+    assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
+    assert set(result.pop("groups")) == {"team1", "team2"}
+    assert result == {
+        "id": str(document.id),
+        "title": document.title,
+        "depth": 1,
+        "path": document.path,
+        "numchild": 1,
+        "content": text_content,
+        "mimetype": "text/markdown",
+        "created_at": document.created_at.isoformat(),
+        "updated_at": document.updated_at.isoformat(),
+        "reach": document.link_reach,
+        "size": len(text_content.encode("utf-8")),
         "is_active": True,
     }
 