WIP (backend) use Markdown as input data for indexing

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-11-17 15:50:14 +01:00
parent f2106dd880
commit c6ed7a79f8
2 changed files with 104 additions and 5 deletions

View File

@@ -1,5 +1,6 @@
"""Document search index management utilities and indexers"""
import base64
import logging
from abc import ABC, abstractmethod
from collections import defaultdict
@@ -14,6 +15,12 @@ from django.utils.module_loading import import_string
import requests
from core import models, utils
from core.services.converter_services import (
ServiceUnavailableError as YProviderServiceUnavailableError,
)
from core.services.converter_services import (
YdocConverter,
)
logger = logging.getLogger(__name__)
@@ -231,6 +238,32 @@ class SearchIndexer(BaseDocumentIndexer):
Document indexer that pushes documents to La Suite Find app.
"""
def to_markdown(self, document):
    """
    Render the content of a document as markdown.

    The base64-encoded ``document.content`` is decoded and handed to
    ``YdocConverter.convert`` with the ``application/vnd.yjs.doc`` source
    type and the ``text/markdown`` target type.

    Returns:
        The converter result, or an empty string when the document has no
        content. When the conversion raises
        ``YProviderServiceUnavailableError``, the error is logged and the
        output of ``utils.base64_yjs_to_text`` is returned instead.
    """
    encoded = document.content

    # Nothing to convert: skip the converter call entirely.
    if encoded is None or len(encoded) == 0:
        return ""

    try:
        markdown = YdocConverter().convert(
            base64.b64decode(encoded),
            "application/vnd.yjs.doc",
            "text/markdown",
        )
    except YProviderServiceUnavailableError as err:
        logger.error(
            "Error getting content for document %s: %s", document.pk, err
        )
        # Fall back to the text extraction so the document still gets content.
        return utils.base64_yjs_to_text(encoded)

    return markdown
def serialize_document(self, document, accesses):
"""
Convert a Document to the JSON format expected by La Suite Find.
@@ -243,8 +276,7 @@ class SearchIndexer(BaseDocumentIndexer):
dict: A JSON-serializable dictionary.
"""
doc_path = document.path
doc_content = document.content
text_content = utils.base64_yjs_to_text(doc_content) if doc_content else ""
text_content = self.to_markdown(document)
return {
"id": str(document.id),
@@ -259,6 +291,7 @@ class SearchIndexer(BaseDocumentIndexer):
"groups": list(accesses.get(doc_path, {}).get("teams", set())),
"reach": document.computed_link_reach,
"size": len(text_content.encode("utf-8")),
"mimetype": "text/markdown",
"is_active": not bool(document.ancestors_deleted_at),
}

View File

@@ -13,6 +13,9 @@ import responses
from requests import HTTPError
from core import factories, models, utils
from core.services.converter_services import (
ServiceUnavailableError as YProviderServiceUnavailableError,
)
from core.services.search_indexers import (
BaseDocumentIndexer,
SearchIndexer,
@@ -172,7 +175,10 @@ def test_services_search_endpoint_is_empty(indexer_settings):
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_serialize_document_returns_expected_json():
@patch("core.services.converter_services.YdocConverter.convert")
def test_services_search_indexers_serialize_document_returns_expected_json(
mock_convert,
):
"""
It should serialize documents with correct metadata and access control.
"""
@@ -180,6 +186,12 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
document = factories.DocumentFactory()
factories.DocumentFactory(parent=document)
markdown_content = (
f"## {document.title}\n{utils.base64_yjs_to_text(document.content)}"
)
mock_convert.return_value = markdown_content
factories.UserDocumentAccessFactory(document=document, user=user_a)
factories.UserDocumentAccessFactory(document=document, user=user_b)
factories.TeamDocumentAccessFactory(document=document, team="team1")
@@ -195,6 +207,8 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
indexer = SearchIndexer()
result = indexer.serialize_document(document, accesses)
assert mock_convert.call_count == 1
assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
assert set(result.pop("groups")) == {"team1", "team2"}
assert result == {
@@ -203,11 +217,63 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
"depth": 1,
"path": document.path,
"numchild": 1,
"content": utils.base64_yjs_to_text(document.content),
"content": markdown_content,
"mimetype": "text/markdown",
"created_at": document.created_at.isoformat(),
"updated_at": document.updated_at.isoformat(),
"reach": document.link_reach,
"size": 13,
"size": len(markdown_content),
"is_active": True,
}
@pytest.mark.usefixtures("indexer_settings")
@patch("core.services.converter_services.YdocConverter.convert")
def test_services_search_indexers_serialize_document_no_converter(
    mock_convert,
):
    """
    It should fall back to the raw text content, while still serializing the
    metadata and access control, when the converter call raises a
    service-unavailable error.
    """
    user_a, user_b = factories.UserFactory.create_batch(2)
    document = factories.DocumentFactory()
    factories.DocumentFactory(parent=document)

    # Make every conversion attempt fail as if the service were unavailable.
    mock_convert.side_effect = YProviderServiceUnavailableError()

    # Expected fallback content: the text extracted straight from the document.
    text_content = utils.base64_yjs_to_text(document.content)

    factories.UserDocumentAccessFactory(document=document, user=user_a)
    factories.UserDocumentAccessFactory(document=document, user=user_b)
    factories.TeamDocumentAccessFactory(document=document, team="team1")
    factories.TeamDocumentAccessFactory(document=document, team="team2")

    accesses = {
        document.path: {
            "users": {str(user_a.sub), str(user_b.sub)},
            "teams": {"team1", "team2"},
        }
    }

    indexer = SearchIndexer()
    result = indexer.serialize_document(document, accesses)

    # The conversion must have been attempted exactly once before falling back.
    assert mock_convert.call_count == 1

    # Users and groups are serialized as lists built from sets: compare them
    # without depending on ordering.
    assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
    assert set(result.pop("groups")) == {"team1", "team2"}
    assert result == {
        "id": str(document.id),
        "title": document.title,
        "depth": 1,
        "path": document.path,
        "numchild": 1,
        "content": text_content,
        "mimetype": "text/markdown",
        "created_at": document.created_at.isoformat(),
        "updated_at": document.updated_at.isoformat(),
        "reach": document.link_reach,
        # The serializer reports the size in UTF-8 bytes, not in characters:
        # compare against the encoded length so non-ASCII content stays valid.
        "size": len(text_content.encode("utf-8")),
        "is_active": True,
    }