Compare commits

...

2 Commits

Author SHA1 Message Date
Anthony LC
955b322a09 ⚗️(backend) text_to_yjs_base64
We want to be able to convert text to yjs base64
to be able to save it to a document.
This format will be readable by the Blocknote editor.
2024-10-16 12:14:09 +02:00
Anthony LC
1ee8e5fdba ⚗️(backend) function to extract text from base64 yjs document
Function to extract text from base64 yjs document.
Can be useful if we need to index the content
of the documents.
2024-09-20 10:43:24 +02:00
5 changed files with 85 additions and 13 deletions

View File

@@ -9,6 +9,10 @@ and this project adheres to
## [Unreleased]
## Added
- ⚗️(backend) Extract text from base64 yjs document #270
## [1.4.0] - 2024-09-17

View File

@@ -10,7 +10,7 @@ from django.core import mail
import pytest
from core.utils import email_invitation
from core.utils import email_invitation, text_to_yjs_base64, yjs_base64_to_text
pytestmark = pytest.mark.django_db
@@ -85,3 +85,34 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail):
assert email == "guest@example.com"
assert isinstance(exception, smtplib.SMTPException)
def test_yjs_base64_to_text():
    """
    Check that yjs_base64_to_text extracts the plain text of a saved document.

    The base64 string below is an example of what is saved in the database.
    It was generated from the Blocknote editor and holds a heading
    ("# *Hello*") followed by a bullet list item ("- w**or**ld").
    """
    saved_document = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
    )

    extracted_text = yjs_base64_to_text(saved_document)

    assert extracted_text == "Hello world"
def test_text_to_yjs_base64():
    """
    Check that text encoded with text_to_yjs_base64 is read back unchanged
    by yjs_base64_to_text.
    """
    encoded_document = text_to_yjs_base64("Hello world")

    decoded_text = yjs_base64_to_text(encoded_document)

    assert decoded_text == "Hello world"

View File

@@ -2,6 +2,7 @@
Utilities for the core app.
"""
import base64
import smtplib
from logging import getLogger
@@ -12,6 +13,9 @@ from django.template.loader import render_to_string
from django.utils.translation import gettext_lazy as _
from django.utils.translation import override
import y_py as Y
from bs4 import BeautifulSoup
logger = getLogger(__name__)
@@ -38,3 +42,37 @@ def email_invitation(language, email, document_id):
except smtplib.SMTPException as exception:
logger.error("invitation to %s was not sent: %s", email, exception)
def yjs_base64_to_text(base64_string):
    """
    Return the plain text held in a base64-encoded Yjs document.

    The decoded update is applied to a fresh Yjs document, the
    "document-store" XML element is rendered as a string, and its markup is
    stripped with BeautifulSoup. Text nodes are joined with a single space
    and the result is stripped of leading/trailing whitespace.
    """
    update = bytearray(base64.b64decode(base64_string))

    ydoc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(ydoc, update)  # pylint: disable=E1101

    # The string form of the XML element is parsed as HTML to drop the tags.
    markup = str(ydoc.get_xml_element("document-store"))
    text = BeautifulSoup(markup, "html.parser").get_text(separator=" ")
    return text.strip()
def text_to_yjs_base64(text: str) -> str:
    """
    Build a Yjs document holding `text` and return it base64-encoded.

    The text is stored in a single "paragraph" XML element pushed into the
    "document-store" XML element of a new Yjs document. The full document
    state is then encoded as an update and base64-encoded into a UTF-8 string.
    """
    ydoc = Y.YDoc()  # pylint: disable=E1101

    # All insertions happen within one transaction.
    with ydoc.begin_transaction() as transaction:
        root = ydoc.get_xml_element("document-store")
        paragraph = root.push_xml_element(transaction, "paragraph")
        paragraph_text = paragraph.push_xml_text(transaction)
        paragraph_text.push(transaction, text)

    # Serialise the whole document state, then make it text-safe as base64.
    state = Y.encode_state_as_update(ydoc)  # pylint: disable=E1101
    encoded = base64.b64encode(state)
    return encoded.decode("utf-8")

View File

@@ -12,7 +12,7 @@ from django.core.management.base import BaseCommand, CommandError
from faker import Faker
from core import models
from core import models, utils
from demo import defaults
@@ -127,17 +127,14 @@ def create_demo(stdout):
with Timeit(stdout, "Creating documents"):
for _ in range(defaults.NB_OBJECTS["docs"]):
queue.push(
models.Document(
title=fake.sentence(nb_words=4),
link_reach=models.LinkReachChoices.AUTHENTICATED
if random_true_with_probability(0.5)
else random.choice(models.LinkReachChoices.values),
)
)
queue.flush()
models.Document(
title=fake.sentence(nb_words=4),
content=utils.text_to_yjs_base64(fake.text()),
link_reach=models.LinkReachChoices.AUTHENTICATED
if random_true_with_probability(0.5)
else random.choice(models.LinkReachChoices.values),
).save()
with Timeit(stdout, "Creating docs accesses"):
docs_ids = list(models.Document.objects.values_list("id", flat=True))
users_ids = list(models.User.objects.values_list("id", flat=True))

View File

@@ -25,6 +25,7 @@ license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4==4.12.3",
"boto3==1.35.10",
"Brotli==1.1.0",
"celery[redis]==5.4.0",
@@ -57,6 +58,7 @@ dependencies = [
"WeasyPrint>=60.2",
"whitenoise==6.7.0",
"mozilla-django-oidc==4.0.1",
"y-py==0.5.5",
]
[project.urls]