Files
docs/src/backend/core/utils.py
Charles Englebert 0fca6db79c Integrate Find (#1834)
## Purpose

integrate Find to Docs

## Proposal

- [x]  add a `useSeachDocs` hook in charged of calling the search
endpoint.
- [x]  add a optional `path` param to the `search` route. This param
represents the parent document path in case of a sub-documents
(descendants) search.
- [x] ️return Indexer results directly without DB calls to retrieve the
Document objects. All informations necessary for display are indexed in
Find. We can skip the DB calls and improve performance.
- [x] ♻️ refactor react `DocSearchContent` components.
`DocSearchContent` and `DocSearchSubContent` are now merged a unique
component handling all search scenarios and relying on the unique
`search` route.
- [x] 🔥remove pagination logic in the Indexer. Removing the DB calls
also removes the DRF queryset object which handles the pagination. Also
we consider pagination not to be necessary for search v1.
- [x] 🔥remove the `document/<document_id>/descendants` route. This route
is not used anymore. The logic of finding the descendants are moved to
the internal `_list_descendants` method. This method is based on the
parent `path` instead of the parent `id` which has some consequence
about the user access management. Relying on the path prevents the use
of the `self.get_object()` method which used to handle the user access
logic.
- [x] handle fallback logic on DRF based title search in case of
non-configured, badly configured or failing at run time indexer.
- [x] handle language extension in `title` field. Find returns titles
with a language extension (ex: `{ title.fr: "rapport d'activité" }`
instead of `{ "title": "rapport d'activité" }`.
- [x] 🔧 add a `common.test` file to allow running the tests without
docker
- [x] ♻️ rename `SearchIndexer` -> `FindDocumentIndexer`. This class has
to do with Find in particular and the convention is more coherent with
`BaseDocumentIndexer`
- [x] ♻️ rename `SEARCH_INDEXER_URL` -> `INDEXING_URL` and
`SEARCH_INDEXER_QUERY_URL` -> `SEARCH_URL`. I found the original names
very confusing.
- [x] 🔧 update the environment variables to activate the
FindDocumentIndexer.
- [x] automate the generation of encryption key during bootstrap.
OIDC_STORE_REFRESH_TOKEN_KEY is a mandatory secret key. We can not push
it on Github and we want any contributor to be able to run the app by
only running the `make bootstrap`. We chose to generate and wright it
into the `common.local` during bootstrap.

## External contributions

Thank you for your contribution! 🎉  

Please ensure the following items are checked before submitting your
pull request:
- [x] I have read and followed the [contributing
guidelines](https://github.com/suitenumerique/docs/blob/main/CONTRIBUTING.md)
- [x] I have read and agreed to the [Code of
Conduct](https://github.com/suitenumerique/docs/blob/main/CODE_OF_CONDUCT.md)
- [x] I have signed off my commits with `git commit --signoff` (DCO
compliance)
- [x] I have signed my commits with my SSH or GPG key (`git commit -S`)
- [x] My commit messages follow the required format: `<gitmoji>(type)
title description`
- [x] I have added a changelog entry under `## [Unreleased]` section (if
noticeable change)
- [x] I have added corresponding tests for new features or bug fixes (if
applicable)

---------

Signed-off-by: charles <charles.englebert@protonmail.com>
2026-03-17 17:32:03 +01:00

171 lines
5.2 KiB
Python

"""Utils for the core app."""
import base64
import logging
import re
import time
from collections import defaultdict
from django.core.cache import cache
from django.db import models as db
from django.db.models import Subquery
import pycrdt
from bs4 import BeautifulSoup
from core import enums, models
logger = logging.getLogger(__name__)
def get_value_by_pattern(data, pattern):
"""
Get all values from keys matching a regex pattern in a dictionary.
Args:
data (dict): Source dictionary to search
pattern (str): Regex pattern to match against keys
Returns:
list: List of values for all matching keys, empty list if no matches
Example:
>>> get_value_by_pattern({"title.fr": "Bonjour", "id": 1}, r"^title\\.")
["Bonjour"]
>>> get_value_by_pattern({"title.fr": "Bonjour", "title.en": "Hello"}, r"^title\\.")
["Bonjour", "Hello"]
"""
regex = re.compile(pattern)
return [value for key, value in data.items() if regex.match(key)]
def get_ancestor_to_descendants_map(paths, steplen):
"""
Given a list of document paths, return a mapping of ancestor_path -> set of descendant_paths.
Each path is assumed to use materialized path format with fixed-length segments.
Args:
paths (list of str): List of full document paths.
steplen (int): Length of each path segment.
Returns:
dict[str, set[str]]: Mapping from ancestor path to its descendant paths (including itself).
"""
ancestor_map = defaultdict(set)
for path in paths:
for i in range(steplen, len(path) + 1, steplen):
ancestor = path[:i]
ancestor_map[ancestor].add(path)
return ancestor_map
def filter_descendants(paths, root_paths, skip_sorting=False):
"""
Filters paths to keep only those that are descendants of any path in root_paths.
A path is considered a descendant of a root path if it starts with the root path.
If `skip_sorting` is not set to True, the function will sort both lists before
processing because both `paths` and `root_paths` need to be in lexicographic order
before going through the algorithm.
Args:
paths (iterable of str): List of paths to be filtered.
root_paths (iterable of str): List of paths to check as potential prefixes.
skip_sorting (bool): If True, assumes both `paths` and `root_paths` are already sorted.
Returns:
list of str: A list of sorted paths that are descendants of any path in `root_paths`.
"""
results = []
i = 0
n = len(root_paths)
if not skip_sorting:
paths.sort()
root_paths.sort()
for path in paths:
# Try to find a matching prefix in the sorted accessible paths
while i < n:
if path.startswith(root_paths[i]):
results.append(path)
break
if root_paths[i] < path:
i += 1
else:
# If paths[i] > path, no need to keep searching
break
return results
def base64_yjs_to_xml(base64_string):
"""Extract xml from base64 yjs document."""
decoded_bytes = base64.b64decode(base64_string)
# uint8_array = bytearray(decoded_bytes)
doc = pycrdt.Doc()
doc.apply_update(decoded_bytes)
return str(doc.get("document-store", type=pycrdt.XmlFragment))
def base64_yjs_to_text(base64_string):
"""Extract text from base64 yjs document."""
blocknote_structure = base64_yjs_to_xml(base64_string)
soup = BeautifulSoup(blocknote_structure, "lxml-xml")
return soup.get_text(separator=" ", strip=True)
def extract_attachments(content):
"""Helper method to extract media paths from a document's content."""
if not content:
return []
xml_content = base64_yjs_to_xml(content)
return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, xml_content)
def get_users_sharing_documents_with_cache_key(user):
"""Generate a unique cache key for each user."""
return f"users_sharing_documents_with_{user.id}"
def users_sharing_documents_with(user):
"""
Returns a map of users sharing documents with the given user,
sorted by last shared date.
"""
start_time = time.time()
cache_key = get_users_sharing_documents_with_cache_key(user)
cached_result = cache.get(cache_key)
if cached_result is not None:
elapsed = time.time() - start_time
logger.info(
"users_sharing_documents_with cache hit for user %s (took %.3fs)",
user.id,
elapsed,
)
return cached_result
user_docs_qs = models.DocumentAccess.objects.filter(user=user).values_list(
"document_id", flat=True
)
shared_qs = (
models.DocumentAccess.objects.filter(document_id__in=Subquery(user_docs_qs))
.exclude(user=user)
.values("user")
.annotate(last_shared=db.Max("created_at"))
)
result = {item["user"]: item["last_shared"] for item in shared_qs}
cache.set(cache_key, result, 86400) # Cache for 1 day
elapsed = time.time() - start_time
logger.info(
"users_sharing_documents_with cache miss for user %s (took %.3fs)",
user.id,
elapsed,
)
return result