mirror of
https://github.com/suitenumerique/docs.git
synced 2026-05-09 00:22:46 +02:00
## Purpose integrate Find into Docs ## Proposal - [x] ✨ add a `useSeachDocs` hook in charge of calling the search endpoint. - [x] ✨ add an optional `path` param to the `search` route. This param represents the parent document path in case of a sub-document (descendants) search. - [x] ⚡️return Indexer results directly without DB calls to retrieve the Document objects. All information necessary for display is indexed in Find. We can skip the DB calls and improve performance. - [x] ♻️ refactor react `DocSearchContent` components. `DocSearchContent` and `DocSearchSubContent` are now merged into a unique component handling all search scenarios and relying on the unique `search` route. - [x] 🔥remove pagination logic in the Indexer. Removing the DB calls also removes the DRF queryset object which handles the pagination. Also we consider pagination not to be necessary for search v1. - [x] 🔥remove the `document/<document_id>/descendants` route. This route is not used anymore. The logic of finding the descendants is moved to the internal `_list_descendants` method. This method is based on the parent `path` instead of the parent `id`, which has some consequences for user access management. Relying on the path prevents the use of the `self.get_object()` method which used to handle the user access logic. - [x] ✨handle fallback logic on DRF-based title search in case of a non-configured, badly configured, or failing-at-runtime indexer. - [x] ✨handle language extension in `title` field. Find returns titles with a language extension (ex: `{ title.fr: "rapport d'activité" }` instead of `{ "title": "rapport d'activité" }`). - [x] 🔧 add a `common.test` file to allow running the tests without docker - [x] ♻️ rename `SearchIndexer` -> `FindDocumentIndexer`. This class has to do with Find in particular and the convention is more coherent with `BaseDocumentIndexer` - [x] ♻️ rename `SEARCH_INDEXER_URL` -> `INDEXING_URL` and `SEARCH_INDEXER_QUERY_URL` -> `SEARCH_URL`. 
I found the original names very confusing. - [x] 🔧 update the environment variables to activate the FindDocumentIndexer. - [x] ✨automate the generation of encryption key during bootstrap. OIDC_STORE_REFRESH_TOKEN_KEY is a mandatory secret key. We cannot push it on GitHub and we want any contributor to be able to run the app by only running the `make bootstrap`. We chose to generate and write it into the `common.local` during bootstrap. ## External contributions Thank you for your contribution! 🎉 Please ensure the following items are checked before submitting your pull request: - [x] I have read and followed the [contributing guidelines](https://github.com/suitenumerique/docs/blob/main/CONTRIBUTING.md) - [x] I have read and agreed to the [Code of Conduct](https://github.com/suitenumerique/docs/blob/main/CODE_OF_CONDUCT.md) - [x] I have signed off my commits with `git commit --signoff` (DCO compliance) - [x] I have signed my commits with my SSH or GPG key (`git commit -S`) - [x] My commit messages follow the required format: `<gitmoji>(type) title description` - [x] I have added a changelog entry under `## [Unreleased]` section (if noticeable change) - [x] I have added corresponding tests for new features or bug fixes (if applicable) --------- Signed-off-by: charles <charles.englebert@protonmail.com>
171 lines
5.2 KiB
Python
171 lines
5.2 KiB
Python
"""Utils for the core app."""
|
|
|
|
import base64
|
|
import logging
|
|
import re
|
|
import time
|
|
from collections import defaultdict
|
|
|
|
from django.core.cache import cache
|
|
from django.db import models as db
|
|
from django.db.models import Subquery
|
|
|
|
import pycrdt
|
|
from bs4 import BeautifulSoup
|
|
|
|
from core import enums, models
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_value_by_pattern(data, pattern):
    """
    Collect the values of every dictionary key matching a regex pattern.

    The pattern is matched against the *start* of each key (``re.match``
    semantics), and values are returned in dictionary insertion order.

    Args:
        data (dict): Source dictionary to search.
        pattern (str): Regex pattern matched against each key.

    Returns:
        list: Values of all matching keys; empty list when nothing matches.

    Example:
        >>> get_value_by_pattern({"title.fr": "Bonjour", "id": 1}, "title")
        ['Bonjour']
        >>> get_value_by_pattern({"title.fr": "Bonjour", "title.en": "Hello"}, "title")
        ['Bonjour', 'Hello']
    """
    # Hoist the compiled matcher so the loop does a single fast call per key.
    matcher = re.compile(pattern).match
    matching_values = []
    for key, value in data.items():
        if matcher(key):
            matching_values.append(value)
    return matching_values
|
|
|
|
|
|
def get_ancestor_to_descendants_map(paths, steplen):
    """
    Build a mapping of ancestor_path -> set of descendant paths.

    Paths use the materialized-path format: a string made of fixed-length
    segments of ``steplen`` characters. Every prefix of a path that ends on
    a segment boundary is one of its ancestors, and a path counts as a
    descendant of itself.

    Args:
        paths (list of str): Full document paths.
        steplen (int): Length of one path segment.

    Returns:
        dict[str, set[str]]: Ancestor path mapped to the set of input paths
        living under it (including the path itself).
    """
    mapping = defaultdict(set)
    for full_path in paths:
        # One entry per complete segment prefix: depth 1 is the root
        # ancestor, the last depth is the path itself.
        depth_count = len(full_path) // steplen
        for depth in range(1, depth_count + 1):
            mapping[full_path[: depth * steplen]].add(full_path)
    return mapping
|
|
|
|
|
|
def filter_descendants(paths, root_paths, skip_sorting=False):
    """
    Keep only the paths that are descendants of some path in `root_paths`.

    A path counts as a descendant of a root when it starts with that root
    (so a root is its own descendant). The scan below walks both sorted
    lists once, merge-style, which requires lexicographic order: unless
    `skip_sorting` is True, both inputs are sorted in place first.

    Args:
        paths (list of str): Candidate paths to filter; sorted in place
            unless `skip_sorting` is True.
        root_paths (list of str): Potential ancestor prefixes; sorted in
            place unless `skip_sorting` is True.
        skip_sorting (bool): Set to True when both inputs are already
            sorted, to avoid re-sorting them.

    Returns:
        list of str: Sorted subset of `paths` descending from some root.
    """
    if not skip_sorting:
        paths.sort()
        root_paths.sort()

    kept = []
    root_index = 0
    root_count = len(root_paths)

    for candidate in paths:
        # Advance through the sorted roots until one prefixes the candidate
        # or every remaining root sorts after it.
        while root_index < root_count:
            root = root_paths[root_index]
            if candidate.startswith(root):
                kept.append(candidate)
                break
            if root < candidate:
                # A root smaller than this candidate (and not its prefix)
                # cannot prefix any later, larger candidate either.
                root_index += 1
            else:
                # root > candidate: no remaining root can match it.
                break
    return kept
|
|
|
|
|
|
def base64_yjs_to_xml(base64_string):
    """Decode a base64-encoded yjs update and return its XML as a string."""
    update_bytes = base64.b64decode(base64_string)

    ydoc = pycrdt.Doc()
    ydoc.apply_update(update_bytes)
    fragment = ydoc.get("document-store", type=pycrdt.XmlFragment)
    return str(fragment)
|
|
|
|
|
|
def base64_yjs_to_text(base64_string):
    """Extract the plain text of a base64-encoded yjs document."""
    xml_structure = base64_yjs_to_xml(base64_string)
    parsed = BeautifulSoup(xml_structure, "lxml-xml")
    return parsed.get_text(separator=" ", strip=True)
|
|
|
|
|
|
def extract_attachments(content):
    """Return the media storage paths referenced in a document's yjs content."""
    if not content:
        return []

    xml_structure = base64_yjs_to_xml(content)
    return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, xml_structure)
|
|
|
|
|
|
def get_users_sharing_documents_with_cache_key(user):
    """Return the cache key, unique per user, for users_sharing_documents_with."""
    key_prefix = "users_sharing_documents_with"
    return f"{key_prefix}_{user.id}"
|
|
|
|
|
|
def users_sharing_documents_with(user):
    """
    Return a map of user id -> most recent shared date for every other user
    holding access to a document the given user can access.

    The result is cached per user for one day; each call logs whether it
    was served from cache and how long it took.
    """
    started_at = time.time()
    cache_key = get_users_sharing_documents_with_cache_key(user)

    cached = cache.get(cache_key)
    if cached is not None:
        logger.info(
            "users_sharing_documents_with cache hit for user %s (took %.3fs)",
            user.id,
            time.time() - started_at,
        )
        return cached

    # Ids of all documents the user has direct access to.
    own_document_ids = models.DocumentAccess.objects.filter(user=user).values_list(
        "document_id", flat=True
    )
    # Other users with access to any of those documents, annotated with the
    # most recent date at which such an access was created.
    sharing_accesses = (
        models.DocumentAccess.objects.filter(
            document_id__in=Subquery(own_document_ids)
        )
        .exclude(user=user)
        .values("user")
        .annotate(last_shared=db.Max("created_at"))
    )
    result = {row["user"]: row["last_shared"] for row in sharing_accesses}

    cache.set(cache_key, result, 86400)  # Cache for 1 day
    logger.info(
        "users_sharing_documents_with cache miss for user %s (took %.3fs)",
        user.id,
        time.time() - started_at,
    )
    return result
|