mirror of
https://github.com/suitenumerique/docs.git
synced 2026-04-25 17:15:01 +02:00
WIP ✨(backend) use markdown as input data for indexation
Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
"""Document search index management utilities and indexers"""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import defaultdict
|
||||
@@ -14,6 +15,12 @@ from django.utils.module_loading import import_string
|
||||
import requests
|
||||
|
||||
from core import models, utils
|
||||
from core.services.converter_services import (
|
||||
ServiceUnavailableError as YProviderServiceUnavailableError,
|
||||
)
|
||||
from core.services.converter_services import (
|
||||
YdocConverter,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -231,6 +238,32 @@ class SearchIndexer(BaseDocumentIndexer):
|
||||
Document indexer that pushes documents to La Suite Find app.
|
||||
"""
|
||||
|
||||
def to_markdown(self, document):
    """
    Convert the content of a document to markdown.

    The base64-encoded content is decoded and sent to the y-provider
    conversion service (``YdocConverter``), requesting an
    "application/vnd.yjs.doc" to "text/markdown" conversion.

    Args:
        document: object exposing ``content`` (base64-encoded Yjs document,
            possibly None or empty) and ``pk`` (only used for logging).

    Returns:
        The converter output, an empty string when the document has no
        content, or the result of ``utils.base64_yjs_to_text`` (raw text)
        when the conversion service is unavailable.
    """
    base64_content = document.content

    # Nothing to convert: do not call the conversion service at all.
    if not base64_content:
        return ""

    try:
        # Convert using the y-provider service
        return YdocConverter().convert(
            base64.b64decode(base64_content),
            "application/vnd.yjs.doc",
            "text/markdown",
        )
    except YProviderServiceUnavailableError as e:
        # Best effort: fall back to the raw text instead of failing.
        logger.error("Error getting content for document %s: %s", document.pk, e)
        return utils.base64_yjs_to_text(base64_content)
|
||||
|
||||
def serialize_document(self, document, accesses):
|
||||
"""
|
||||
Convert a Document to the JSON format expected by La Suite Find.
|
||||
@@ -243,8 +276,7 @@ class SearchIndexer(BaseDocumentIndexer):
|
||||
dict: A JSON-serializable dictionary.
|
||||
"""
|
||||
doc_path = document.path
|
||||
doc_content = document.content
|
||||
text_content = utils.base64_yjs_to_text(doc_content) if doc_content else ""
|
||||
text_content = self.to_markdown(document)
|
||||
|
||||
return {
|
||||
"id": str(document.id),
|
||||
@@ -259,6 +291,7 @@ class SearchIndexer(BaseDocumentIndexer):
|
||||
"groups": list(accesses.get(doc_path, {}).get("teams", set())),
|
||||
"reach": document.computed_link_reach,
|
||||
"size": len(text_content.encode("utf-8")),
|
||||
"mimetype": "text/markdown",
|
||||
"is_active": not bool(document.ancestors_deleted_at),
|
||||
}
|
||||
|
||||
|
||||
@@ -13,6 +13,9 @@ import responses
|
||||
from requests import HTTPError
|
||||
|
||||
from core import factories, models, utils
|
||||
from core.services.converter_services import (
|
||||
ServiceUnavailableError as YProviderServiceUnavailableError,
|
||||
)
|
||||
from core.services.search_indexers import (
|
||||
BaseDocumentIndexer,
|
||||
SearchIndexer,
|
||||
@@ -172,7 +175,10 @@ def test_services_search_endpoint_is_empty(indexer_settings):
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("indexer_settings")
|
||||
def test_services_search_indexers_serialize_document_returns_expected_json():
|
||||
@patch("core.services.converter_services.YdocConverter.convert")
|
||||
def test_services_search_indexers_serialize_document_returns_expected_json(
|
||||
mock_convert,
|
||||
):
|
||||
"""
|
||||
It should serialize documents with correct metadata and access control.
|
||||
"""
|
||||
@@ -180,6 +186,12 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
|
||||
document = factories.DocumentFactory()
|
||||
factories.DocumentFactory(parent=document)
|
||||
|
||||
markdown_content = (
|
||||
f"## {document.title}\n{utils.base64_yjs_to_text(document.content)}"
|
||||
)
|
||||
|
||||
mock_convert.return_value = markdown_content
|
||||
|
||||
factories.UserDocumentAccessFactory(document=document, user=user_a)
|
||||
factories.UserDocumentAccessFactory(document=document, user=user_b)
|
||||
factories.TeamDocumentAccessFactory(document=document, team="team1")
|
||||
@@ -195,6 +207,8 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
|
||||
indexer = SearchIndexer()
|
||||
result = indexer.serialize_document(document, accesses)
|
||||
|
||||
assert mock_convert.call_count == 1
|
||||
|
||||
assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
|
||||
assert set(result.pop("groups")) == {"team1", "team2"}
|
||||
assert result == {
|
||||
@@ -203,11 +217,63 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
|
||||
"depth": 1,
|
||||
"path": document.path,
|
||||
"numchild": 1,
|
||||
"content": utils.base64_yjs_to_text(document.content),
|
||||
"content": markdown_content,
|
||||
"mimetype": "text/markdown",
|
||||
"created_at": document.created_at.isoformat(),
|
||||
"updated_at": document.updated_at.isoformat(),
|
||||
"reach": document.link_reach,
|
||||
"size": 13,
|
||||
"size": len(markdown_content),
|
||||
"is_active": True,
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("indexer_settings")
@patch("core.services.converter_services.YdocConverter.convert")
def test_services_search_indexers_serialize_document_no_converter(
    mock_convert,
):
    """
    It should fall back to the raw text content of the document when the
    conversion service is unavailable, while still serializing the metadata
    and access control as usual.
    """
    user_a, user_b = factories.UserFactory.create_batch(2)
    document = factories.DocumentFactory()
    factories.DocumentFactory(parent=document)

    # Simulate the y-provider conversion service being down.
    mock_convert.side_effect = YProviderServiceUnavailableError()

    # Expected fallback content: the text extracted from the Yjs document.
    text_content = utils.base64_yjs_to_text(document.content)

    factories.UserDocumentAccessFactory(document=document, user=user_a)
    factories.UserDocumentAccessFactory(document=document, user=user_b)
    factories.TeamDocumentAccessFactory(document=document, team="team1")
    factories.TeamDocumentAccessFactory(document=document, team="team2")

    accesses = {
        document.path: {
            "users": {str(user_a.sub), str(user_b.sub)},
            "teams": {"team1", "team2"},
        }
    }

    indexer = SearchIndexer()
    result = indexer.serialize_document(document, accesses)

    # The conversion is attempted exactly once before falling back.
    assert mock_convert.call_count == 1

    # Users and groups are compared as sets: their order is not guaranteed.
    assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
    assert set(result.pop("groups")) == {"team1", "team2"}
    assert result == {
        "id": str(document.id),
        "title": document.title,
        "depth": 1,
        "path": document.path,
        "numchild": 1,
        "content": text_content,
        "mimetype": "text/markdown",
        "created_at": document.created_at.isoformat(),
        "updated_at": document.updated_at.isoformat(),
        "reach": document.link_reach,
        # The indexer reports the size in bytes of the UTF-8 encoded content.
        "size": len(text_content.encode("utf-8")),
        "is_active": True,
    }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user