Mirror of https://github.com/suitenumerique/docs.git (synced 2026-04-26 01:25:05 +02:00)

Compare commits: fix/link-p ... ui/fulltex (24 commits)
Commits (24): 87771587e8, 48c312c1e6, d5d4d1a6d0, 0ee4fdcd53, a16649076a, 2c6becec97, a0433b30ff, ba9541bd29, 5632c868c2, 40452d6fbb, 59f5314116, 7b57b2f5ea, 5b413f7a7f, ec74630c3e, c9f9dd8144, a32e636f27, 154dc91dff, 87a836a5c2, 2b436f40fb, 432b8ecc22, 25e68535e8, d63d5f1464, fbc0c0d86c, e67e261208
4  .gitignore (vendored)
@@ -43,6 +43,10 @@ venv.bak/
env.d/development/*.local
env.d/terraform

# Docker
compose.override.yml
docker/auth/*.local

# npm
node_modules
CHANGELOG.md

@@ -123,6 +123,9 @@ and this project adheres to
- ♿ update labels and shared document icon accessibility #1442
- 🍱(frontend) GDPR-compliant fonts #1453
- ♻️(service-worker) improve SW registration and update handling #1473
- ✨(backend) add async indexation of documents on save (or access save) #1276
- ✨(backend) add debounce mechanism to limit indexation jobs #1276
- ✨(api) add API route to search for indexed documents in Find #1276

### Fixed
4  Makefile
@@ -247,6 +247,10 @@ demo: ## flush db then create a demo for load testing purpose
	@$(MANAGE) create_demo
.PHONY: demo

index: ## index all documents to remote search
	@$(MANAGE) index
.PHONY: index

# Nota bene: Black should come after isort just in case they don't agree...
lint: ## lint back-end python sources
lint: \
6  bin/fernetkey (executable file)
@@ -0,0 +1,6 @@
#!/usr/bin/env bash

# shellcheck source=bin/_config.sh
source "$(dirname "${BASH_SOURCE[0]}")/_config.sh"

_dc_run app-dev python -c 'from cryptography.fernet import Fernet;import sys; sys.stdout.write("\n" + Fernet.generate_key().decode() + "\n");'
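Running `bin/fernetkey` prints a freshly generated Fernet key on standard output; the same value can then be used for the `OIDC_STORE_REFRESH_TOKEN_KEY` setting described further below.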
18  compose.yml
@@ -72,6 +72,11 @@ services:
      - env.d/development/postgresql.local
    ports:
      - "8071:8000"
    networks:
      default: {}
      lasuite:
        aliases:
          - impress
    volumes:
      - ./src/backend:/app
      - ./data/static:/data/static

@@ -92,6 +97,9 @@ services:
    command: ["celery", "-A", "impress.celery_app", "worker", "-l", "DEBUG"]
    environment:
      - DJANGO_CONFIGURATION=Development
    networks:
      - default
      - lasuite
    env_file:
      - env.d/development/common
      - env.d/development/common.local

@@ -107,6 +115,11 @@ services:
    image: nginx:1.25
    ports:
      - "8083:8083"
    networks:
      default: {}
      lasuite:
        aliases:
          - nginx
    volumes:
      - ./docker/files/etc/nginx/conf.d:/etc/nginx/conf.d:ro
    depends_on:

@@ -217,3 +230,8 @@ services:
      kc_postgresql:
        condition: service_healthy
        restart: true

networks:
  lasuite:
    name: lasuite-network
    driver: bridge
@@ -12,6 +12,7 @@ flowchart TD
    Back --> DB("Database (PostgreSQL)")
    Back <--> Celery --> DB
    Back ----> S3("Minio (S3)")
    Back -- REST API --> Find
```

### Architecture decision records
@@ -93,6 +93,13 @@ These are the environment variables you can set for the `impress-backend` container

| OIDC_USERINFO_SHORTNAME_FIELD | OIDC token claims to create shortname | first_name |
| POSTHOG_KEY | Posthog key for analytics | |
| REDIS_URL | Cache url | redis://redis:6379/1 |
| SEARCH_INDEXER_CLASS | Class of the backend for document indexation & search | |
| SEARCH_INDEXER_BATCH_SIZE | Size of each batch for indexation of all documents | 100000 |
| SEARCH_INDEXER_COUNTDOWN | Minimum debounce delay of indexation jobs (in seconds) | 1 |
| SEARCH_INDEXER_URL | Find application endpoint for indexation | |
| SEARCH_INDEXER_SECRET | Token for indexation queries | |
| SEARCH_INDEXER_QUERY_URL | Find application endpoint for search | |
| SEARCH_INDEXER_QUERY_LIMIT | Maximum number of results expected from the search endpoint | 50 |
| SENTRY_DSN | Sentry host | |
| SESSION_COOKIE_AGE | Duration of the cookie session | 60*60*12 |
| SPECTACULAR_SETTINGS_ENABLE_DJANGO_DEPLOY_CHECK | | false |
41  docs/search.md (new file)
@@ -0,0 +1,41 @@
# Set up the Find search for Impress

This configuration enables the fulltext search feature for Docs:

- Each save of **core.Document** or **core.DocumentAccess** triggers the indexer.
- The `api/v1.0/documents/search/` endpoint works as a proxy to the Find API for fulltext search.

## Create an index service for Docs

Configure a **Service** for the Docs application with these settings:

- **Name**: `docs`<br>_request.auth.name of the Docs application._
- **Client id**: `impress`<br>_Name of the token audience or client_id of the Docs application._

See [how-to-use-indexer.md](how-to-use-indexer.md) for details.

## Configure settings of Docs

Add these Django settings to the Docs application to enable the feature.

```shell
SEARCH_INDEXER_CLASS="core.services.search_indexers.FindDocumentIndexer"
SEARCH_INDEXER_COUNTDOWN=10 # Debounce delay in seconds for the indexer calls.

# The token from service "docs" of the Find application (development).
SEARCH_INDEXER_SECRET="find-api-key-for-docs-with-exactly-50-chars-length"
SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"

# Search endpoint. Uses the OIDC token for authentication.
SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
# Maximum number of results expected from the search endpoint
SEARCH_INDEXER_QUERY_LIMIT=50
```

We also need to enable **OIDC token** refresh, or authentication will fail quickly.

```shell
# Store OIDC tokens in the session
OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session
OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session
OIDC_STORE_REFRESH_TOKEN_KEY = "your-32-byte-encryption-key==" # Must be a valid Fernet key (32 url-safe base64-encoded bytes)
```
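For reference, a minimal sketch of a client call to the proxy route described above, assuming a `requests` session whose cookies belong to a user already authenticated through OIDC (the host and port are hypothetical development values):

```python
import requests

# Hypothetical development host for the Docs backend.
BASE_URL = "http://localhost:8071"

session = requests.Session()
# ... authenticate the session through the OIDC login flow first ...

# Query parameters match SearchDocumentSerializer: q (required), page, page_size.
response = session.get(
    f"{BASE_URL}/api/v1.0/documents/search/",
    params={"q": "alpha", "page": 1, "page_size": 20},
    timeout=10,
)
response.raise_for_status()

# The endpoint returns a paginated payload with a "results" list.
for doc in response.json()["results"]:
    print(doc["id"], doc["title"])
```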
@@ -97,6 +97,17 @@ Production deployments differ significantly from development environments. The t
| 5433 | PostgreSQL (Keycloak) |
| 1081 | MailCatcher |

**With fulltext search service**

| Port  | Service              |
| ----- | -------------------- |
| 8081  | Find (Django)        |
| 9200  | Opensearch           |
| 9600  | Opensearch admin     |
| 5601  | Opensearch dashboard |
| 25432 | PostgreSQL (Find)    |

## 6. Sizing Guidelines

**RAM** – start at 8 GB dev / 16 GB staging / 32 GB prod. Postgres and Keycloak are the first to OOM; scale them first.
@@ -36,6 +36,7 @@ OIDC_OP_JWKS_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/c
OIDC_OP_AUTHORIZATION_ENDPOINT=http://localhost:8083/realms/impress/protocol/openid-connect/auth
OIDC_OP_TOKEN_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token
OIDC_OP_USER_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/userinfo
OIDC_OP_INTROSPECTION_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token/introspect

OIDC_RP_CLIENT_ID=impress
OIDC_RP_CLIENT_SECRET=ThisIsAnExampleKeyForDevPurposeOnly

@@ -49,6 +50,14 @@ LOGOUT_REDIRECT_URL=http://localhost:3000
OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"]
OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}

# Store OIDC tokens in the session. Needed by the search/ endpoint.
# OIDC_STORE_ACCESS_TOKEN = True
# OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session.

# Must be a valid Fernet key (32 url-safe base64-encoded bytes)
# To create one, use the bin/fernetkey command.
# OIDC_STORE_REFRESH_TOKEN_KEY="your-32-byte-encryption-key=="

# AI
AI_FEATURE_ENABLED=true
AI_BASE_URL=https://openaiendpoint.com

@@ -68,4 +77,10 @@ Y_PROVIDER_API_BASE_URL=http://y-provider-development:4444/api/
Y_PROVIDER_API_KEY=yprovider-api-key

# Theme customization
THEME_CUSTOMIZATION_CACHE_TIMEOUT=15

# Indexer (disabled)
# SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer"
SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length # Key generated by create_demo in Find app.
SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
@@ -6,4 +6,9 @@ Y_PROVIDER_API_BASE_URL=http://y-provider:4444/api/

# Throttle
API_DOCUMENT_THROTTLE_RATE=1000/min
API_CONFIG_THROTTLE_RATE=1000/min

# Store OIDC tokens in the session.
OIDC_STORE_ACCESS_TOKEN = True
OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session.
OIDC_STORE_REFRESH_TOKEN_KEY = "na1hhus-OLhq9mb9SO3R-8E4dONuMnqpZSY_SX8xcFk="
@@ -1013,3 +1013,13 @@ class ThreadSerializer(serializers.ModelSerializer):
        if request:
            return thread.get_abilities(request.user)
        return {}


class SearchDocumentSerializer(serializers.Serializer):
    """Serializer for fulltext search requests through the Find application"""

    q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
    page_size = serializers.IntegerField(
        required=False, min_value=1, max_value=50, default=20
    )
    page = serializers.IntegerField(required=False, min_value=1, default=1)
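A quick sketch of how the serializer above validates query parameters; the import path is an assumption (in the viewset it is referenced as `serializers.SearchDocumentSerializer`):

```python
# Hypothetical import path for the serializer module shown above.
from core.api.serializers import SearchDocumentSerializer

params = SearchDocumentSerializer(data={"q": "alpha", "page_size": "10"})
params.is_valid(raise_exception=True)

# Defaults are applied for missing fields; strings are coerced to integers.
assert params.validated_data == {"q": "alpha", "page_size": 10, "page": 1}
```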
@@ -22,6 +22,7 @@ from django.db.models.functions import Left, Length
from django.http import Http404, StreamingHttpResponse
from django.urls import reverse
from django.utils import timezone
from django.utils.decorators import method_decorator
from django.utils.functional import cached_property
from django.utils.text import capfirst, slugify
from django.utils.translation import gettext_lazy as _

@@ -32,6 +33,7 @@ from botocore.exceptions import ClientError
from csp.constants import NONE
from csp.decorators import csp_update
from lasuite.malware_detection import malware_detection
from lasuite.oidc_login.decorators import refresh_oidc_access_token
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny

@@ -48,6 +50,10 @@ from core.services.converter_services import (
from core.services.converter_services import (
    YdocConverter,
)
from core.services.search_indexers import (
    get_document_indexer,
    get_visited_document_ids_of,
)
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants

@@ -374,6 +380,7 @@ class DocumentViewSet(
    list_serializer_class = serializers.ListDocumentSerializer
    trashbin_serializer_class = serializers.ListDocumentSerializer
    tree_serializer_class = serializers.ListDocumentSerializer
    search_serializer_class = serializers.ListDocumentSerializer

    def get_queryset(self):
        """Get queryset performing all annotation and filtering on the document tree structure."""

@@ -1065,6 +1072,83 @@ class DocumentViewSet(
            {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
        )

    def _search_simple(self, request, text):
        """
        Return a queryset filtered by the content of the document title.
        """
        # As in the 'list' view, we get a prefiltered queryset (deleted docs are excluded)
        queryset = self.get_queryset()
        filterset = DocumentFilter({"title": text}, queryset=queryset)

        if not filterset.is_valid():
            raise drf.exceptions.ValidationError(filterset.errors)

        queryset = filterset.filter_queryset(queryset)

        return self.get_response_for_queryset(
            queryset.order_by("-updated_at"),
            context={
                "request": request,
            },
        )

    def _search_fulltext(self, indexer, request, params):
        """
        Return a response built from the results of the Find fulltext search.
        """
        access_token = request.session.get("oidc_access_token")
        user = request.user
        text = params.validated_data["q"]
        queryset = models.Document.objects.all()

        # Retrieve the document ids from Find.
        results = indexer.search(
            text=text,
            token=access_token,
            visited=get_visited_document_ids_of(queryset, user),
        )

        docs_by_uuid = {str(d.pk): d for d in queryset.filter(pk__in=results)}
        ordered_docs = [docs_by_uuid[id] for id in results]

        page = self.paginate_queryset(ordered_docs)

        serializer = self.get_serializer(
            page if page else ordered_docs,
            many=True,
            context={
                "request": request,
            },
        )

        return self.get_paginated_response(serializer.data)

    @drf.decorators.action(detail=False, methods=["get"], url_path="search")
    @method_decorator(refresh_oidc_access_token)
    def search(self, request, *args, **kwargs):
        """
        Return a DRF response containing the filtered, annotated and ordered document list.

        Applies filtering based on the request parameter 'q' from `SearchDocumentSerializer`.
        Depending on the configuration it can be:
        - a fulltext search through the opensearch indexation app "find" if the backend is
          enabled (see SEARCH_INDEXER_CLASS)
        - a filtering by the model field 'title'.

        The ordering is always by the most recent first.
        """
        params = serializers.SearchDocumentSerializer(data=request.query_params)
        params.is_valid(raise_exception=True)

        indexer = get_document_indexer()

        if indexer:
            return self._search_fulltext(indexer, request, params=params)

        # The indexer is not configured, we fall back on a simple icontains filter by
        # the model field 'title'.
        return self._search_simple(request, text=params.validated_data["q"])

    @drf.decorators.action(detail=True, methods=["get"], url_path="versions")
    def versions_list(self, request, *args, **kwargs):
        """
@@ -1,11 +1,19 @@
"""Impress Core application"""

from django.apps import AppConfig
from django.utils.translation import gettext_lazy as _


class CoreConfig(AppConfig):
    """Configuration class for the impress core app."""

    name = "core"
    app_label = "core"
    verbose_name = _("Impress core application")

    def ready(self):
        """
        Import signals when the app is ready.
        """
        # pylint: disable=import-outside-toplevel, unused-import
        from . import signals  # noqa: PLC0415
52  src/backend/core/management/commands/index.py (new file)
@@ -0,0 +1,52 @@
"""
Handle search setup that needs to be done at bootstrap time.
"""

import logging
import time

from django.core.management.base import BaseCommand, CommandError

from core.services.search_indexers import get_document_indexer

logger = logging.getLogger("docs.search.bootstrap_search")


class Command(BaseCommand):
    """Index all documents to the remote search service"""

    help = __doc__

    def add_arguments(self, parser):
        """Add an argument to control the indexation batch size."""
        parser.add_argument(
            "--batch-size",
            action="store",
            dest="batch_size",
            type=int,
            default=50,
            help="Indexation query batch size",
        )

    def handle(self, *args, **options):
        """Launch and log search index generation."""
        indexer = get_document_indexer()

        if not indexer:
            raise CommandError("The indexer is not enabled or properly configured.")

        logger.info("Starting to regenerate Find index...")
        start = time.perf_counter()
        batch_size = options["batch_size"]

        try:
            count = indexer.index(batch_size=batch_size)
        except Exception as err:
            raise CommandError("Unable to regenerate index") from err

        duration = time.perf_counter() - start
        logger.info(
            "Search index regenerated from %d document(s) in %.2f seconds.",
            count,
            duration,
        )
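The Makefile target shown earlier wraps this command (`make index`); it can also be invoked directly, e.g. `python manage.py index --batch-size 100`, to control the indexation query batch size.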
@@ -432,32 +432,35 @@ class Document(MP_Node, BaseModel):
    def save(self, *args, **kwargs):
        """Write content to object storage only if _content has changed."""
        super().save(*args, **kwargs)

        if self._content:
            self.save_content(self._content)

    def save_content(self, content):
        """Save content to object storage."""
        file_key = self.file_key
        bytes_content = content.encode("utf-8")

        # Attempt to directly check if the object exists using the storage client.
        try:
            response = default_storage.connection.meta.client.head_object(
                Bucket=default_storage.bucket_name, Key=file_key
            )
        except ClientError as excpt:
            # If the error is a 404, the object doesn't exist, so we should create it.
            if excpt.response["Error"]["Code"] == "404":
                has_changed = True
            else:
                raise
        else:
            # Compare the existing ETag with the MD5 hash of the new content.
            has_changed = (
                response["ETag"].strip('"')
                != hashlib.md5(bytes_content).hexdigest()  # noqa: S324
            )

        if has_changed:
            content_file = ContentFile(bytes_content)
            default_storage.save(file_key, content_file)

    def is_leaf(self):
        """

@@ -903,7 +906,8 @@ class Document(MP_Node, BaseModel):

        # Mark all descendants as soft deleted
        self.get_descendants().filter(ancestors_deleted_at__isnull=True).update(
            ancestors_deleted_at=self.ancestors_deleted_at,
            updated_at=self.updated_at,
        )

    @transaction.atomic
298  src/backend/core/services/search_indexers.py (new file)
@@ -0,0 +1,298 @@
"""Document search index management utilities and indexers"""

import logging
from abc import ABC, abstractmethod
from collections import defaultdict
from functools import cache

from django.conf import settings
from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.db.models import Subquery
from django.utils.module_loading import import_string

import requests

from core import models, utils

logger = logging.getLogger(__name__)


@cache
def get_document_indexer():
    """Return an instance of the indexer service if enabled and properly configured."""
    classpath = settings.SEARCH_INDEXER_CLASS

    # For this use case an empty indexer class is not an issue but a feature.
    if not classpath:
        logger.info("Document indexer is not configured (see SEARCH_INDEXER_CLASS)")
        return None

    try:
        indexer_class = import_string(classpath)
        return indexer_class()
    except ImportError as err:
        logger.error("SEARCH_INDEXER_CLASS setting is not valid: %s", err)
    except ImproperlyConfigured as err:
        logger.error("Document indexer is not properly configured: %s", err)

    return None


def get_batch_accesses_by_users_and_teams(paths):
    """
    Get accesses related to a list of document paths,
    grouped by users and teams, including all ancestor paths.
    """
    ancestor_map = utils.get_ancestor_to_descendants_map(
        paths, steplen=models.Document.steplen
    )
    ancestor_paths = list(ancestor_map.keys())

    access_qs = models.DocumentAccess.objects.filter(
        document__path__in=ancestor_paths
    ).values("document__path", "user__sub", "team")

    access_by_document_path = defaultdict(lambda: {"users": set(), "teams": set()})

    for access in access_qs:
        ancestor_path = access["document__path"]
        user_sub = access["user__sub"]
        team = access["team"]

        for descendant_path in ancestor_map.get(ancestor_path, []):
            if user_sub:
                access_by_document_path[descendant_path]["users"].add(str(user_sub))
            if team:
                access_by_document_path[descendant_path]["teams"].add(team)

    return dict(access_by_document_path)


def get_visited_document_ids_of(queryset, user):
    """
    Return the ids of the documents that have a linktrace to the user and are NOT
    owned by them. It is used to limit the opensearch responses to the public
    documents already "visited" by the user.
    """
    if isinstance(user, AnonymousUser):
        return []

    qs = models.LinkTrace.objects.filter(user=user)

    docs = (
        queryset.exclude(accesses__user=user)
        .filter(
            deleted_at__isnull=True,
            ancestors_deleted_at__isnull=True,
        )
        .filter(pk__in=Subquery(qs.values("document_id")))
        .order_by("pk")
        .distinct("pk")
    )

    return [str(id) for id in docs.values_list("pk", flat=True)]


class BaseDocumentIndexer(ABC):
    """
    Base class for document indexers.

    Handles batching and access resolution. Subclasses must implement
    `serialize_document()`, `push()` and `search_query()` to define
    backend-specific behavior.
    """

    def __init__(self):
        """
        Initialize the indexer.
        """
        self.batch_size = settings.SEARCH_INDEXER_BATCH_SIZE
        self.indexer_url = settings.SEARCH_INDEXER_URL
        self.indexer_secret = settings.SEARCH_INDEXER_SECRET
        self.search_url = settings.SEARCH_INDEXER_QUERY_URL
        self.search_limit = settings.SEARCH_INDEXER_QUERY_LIMIT

        if not self.indexer_url:
            raise ImproperlyConfigured(
                "SEARCH_INDEXER_URL must be set in Django settings."
            )

        if not self.indexer_secret:
            raise ImproperlyConfigured(
                "SEARCH_INDEXER_SECRET must be set in Django settings."
            )

        if not self.search_url:
            raise ImproperlyConfigured(
                "SEARCH_INDEXER_QUERY_URL must be set in Django settings."
            )

    def index(self, queryset=None, batch_size=None):
        """
        Fetch documents in batches, serialize them, and push to the search backend.

        Args:
            queryset (optional): Document queryset.
                Defaults to all documents without filter.
            batch_size (int, optional): Number of documents per batch.
                Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
        """
        last_id = 0
        count = 0
        queryset = queryset or models.Document.objects.all()
        batch_size = batch_size or self.batch_size

        while True:
            documents_batch = list(
                queryset.filter(
                    id__gt=last_id,
                ).order_by("id")[:batch_size]
            )

            if not documents_batch:
                break

            doc_paths = [doc.path for doc in documents_batch]
            last_id = documents_batch[-1].id
            accesses_by_document_path = get_batch_accesses_by_users_and_teams(doc_paths)

            serialized_batch = [
                self.serialize_document(document, accesses_by_document_path)
                for document in documents_batch
                if document.content or document.title
            ]

            if serialized_batch:
                self.push(serialized_batch)
                count += len(serialized_batch)

        return count

    @abstractmethod
    def serialize_document(self, document, accesses):
        """
        Convert a Document instance to a JSON-serializable format for indexing.

        Must be implemented by subclasses.
        """

    @abstractmethod
    def push(self, data):
        """
        Push a batch of serialized documents to the backend.

        Must be implemented by subclasses.
        """

    # pylint: disable-next=too-many-arguments,too-many-positional-arguments
    def search(self, text, token, visited=(), nb_results=None):
        """
        Search for documents in the Find app.
        Ensures the same default ordering as the "Docs" list: -updated_at.

        Returns the ids of the documents.

        Args:
            text (str): Text search content.
            token (str): OIDC authentication token.
            visited (list, optional):
                List of ids of active public documents with a LinkTrace.
                Defaults to an empty tuple.
            nb_results (int, optional):
                The number of results to return.
                Defaults to settings.SEARCH_INDEXER_QUERY_LIMIT (50).
        """
        nb_results = nb_results or self.search_limit
        response = self.search_query(
            data={
                "q": text,
                "visited": visited,
                "services": ["docs"],
                "nb_results": nb_results,
                "order_by": "updated_at",
                "order_direction": "desc",
            },
            token=token,
        )

        return [d["_id"] for d in response]

    @abstractmethod
    def search_query(self, data, token) -> list:
        """
        Retrieve documents from the Find app API.

        Must be implemented by subclasses.
        """


class SearchIndexer(BaseDocumentIndexer):
    """
    Document indexer that pushes documents to the La Suite Find app.
    """

    def serialize_document(self, document, accesses):
        """
        Convert a Document to the JSON format expected by La Suite Find.

        Args:
            document (Document): The document instance.
            accesses (dict): Mapping of document path to user/team accesses.

        Returns:
            dict: A JSON-serializable dictionary.
        """
        doc_path = document.path
        doc_content = document.content
        text_content = utils.base64_yjs_to_text(doc_content) if doc_content else ""

        return {
            "id": str(document.id),
            "title": document.title or "",
            "content": text_content,
            "depth": document.depth,
            "path": doc_path,
            "numchild": document.numchild,
            "created_at": document.created_at.isoformat(),
            "updated_at": document.updated_at.isoformat(),
            "users": list(accesses.get(doc_path, {}).get("users", set())),
            "groups": list(accesses.get(doc_path, {}).get("teams", set())),
            "reach": document.computed_link_reach,
            "size": len(text_content.encode("utf-8")),
            "is_active": not bool(document.ancestors_deleted_at),
        }

    def search_query(self, data, token) -> list:
        """
        Retrieve documents from the Find app API.

        Args:
            data (dict): Search data.
            token (str): OIDC token.

        Returns:
            list: The parsed JSON response (a list of result dictionaries).
        """
        response = requests.post(
            self.search_url,
            json=data,
            headers={"Authorization": f"Bearer {token}"},
            timeout=10,
        )
        response.raise_for_status()
        return response.json()

    def push(self, data):
        """
        Push a batch of documents to the Find backend.

        Args:
            data (list): List of document dictionaries.
        """
        response = requests.post(
            self.indexer_url,
            json=data,
            headers={"Authorization": f"Bearer {self.indexer_secret}"},
            timeout=10,
        )
        response.raise_for_status()
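For illustration, a minimal sketch of a custom backend built on the abstract class above; `LoggingIndexer` is hypothetical and only logs batches instead of pushing them to Find (note that `BaseDocumentIndexer.__init__` still requires the `SEARCH_INDEXER_URL`, `SEARCH_INDEXER_SECRET` and `SEARCH_INDEXER_QUERY_URL` settings to be set):

```python
import logging

from core.services.search_indexers import BaseDocumentIndexer

logger = logging.getLogger(__name__)


class LoggingIndexer(BaseDocumentIndexer):
    """Hypothetical indexer that logs serialized documents instead of pushing them."""

    def serialize_document(self, document, accesses):
        # Keep only the minimal fields; a real backend would mirror SearchIndexer.
        return {"id": str(document.id), "title": document.title or ""}

    def push(self, data):
        logger.info("Would push %d document(s) to the search backend", len(data))

    def search_query(self, data, token):
        # No remote backend to query: always return an empty result list.
        return []
```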
33  src/backend/core/signals.py (new file)
@@ -0,0 +1,33 @@
"""
Declare and configure the signals for the impress core application
"""

from functools import partial

from django.db import transaction
from django.db.models import signals
from django.dispatch import receiver

from . import models
from .tasks.search import trigger_batch_document_indexer


@receiver(signals.post_save, sender=models.Document)
def document_post_save(sender, instance, **kwargs):  # pylint: disable=unused-argument
    """
    Asynchronous call to the document indexer at the end of the transaction.

    Note: within the transaction we can have an empty content and a
    serialization error.
    """
    transaction.on_commit(partial(trigger_batch_document_indexer, instance))


@receiver(signals.post_save, sender=models.DocumentAccess)
def document_access_post_save(sender, instance, created, **kwargs):  # pylint: disable=unused-argument
    """
    Asynchronous call to the document indexer at the end of the transaction.
    """
    if not created:
        transaction.on_commit(
            partial(trigger_batch_document_indexer, instance.document)
        )
95  src/backend/core/tasks/search.py (new file)
@@ -0,0 +1,95 @@
"""Trigger document indexation using a celery task."""

from logging import getLogger

from django.conf import settings
from django.core.cache import cache
from django.db.models import Q

from django_redis.cache import RedisCache

from core import models
from core.services.search_indexers import (
    get_document_indexer,
)

from impress.celery_app import app

logger = getLogger(__name__)


@app.task
def document_indexer_task(document_id):
    """Celery task: send an indexation query for a single document."""
    indexer = get_document_indexer()

    if indexer:
        logger.info("Start document %s indexation", document_id)
        indexer.index(models.Document.objects.filter(pk=document_id))


def batch_indexer_throttle_acquire(timeout: int = 0, atomic: bool = True):
    """
    Enable the task throttle flag for a delay.
    Uses redis locks if available to ensure atomic changes.
    """
    key = "document-batch-indexer-throttle"

    # Redis is used as cache database (not in tests). Use the lock feature here
    # to ensure atomicity of changes to the throttle flag.
    if isinstance(cache, RedisCache) and atomic:
        with cache.lock(key):
            return batch_indexer_throttle_acquire(timeout, atomic=False)

    # Use add() here:
    # - set the flag and return True if it does not exist
    # - do nothing and return False if it exists
    return cache.add(key, 1, timeout=timeout)


@app.task
def batch_document_indexer_task(timestamp):
    """Celery task: send an indexation query for a batch of documents."""
    indexer = get_document_indexer()

    if indexer:
        queryset = models.Document.objects.filter(
            Q(updated_at__gte=timestamp)
            | Q(deleted_at__gte=timestamp)
            | Q(ancestors_deleted_at__gte=timestamp)
        )

        count = indexer.index(queryset)
        logger.info("Indexed %d documents", count)


def trigger_batch_document_indexer(item):
    """
    Trigger the indexation task with a debounce delay set by the
    SEARCH_INDEXER_COUNTDOWN setting.

    Args:
        item (Document): The document instance.
    """
    countdown = int(settings.SEARCH_INDEXER_COUNTDOWN)

    # DO NOT create a task if indexation is disabled
    if not settings.SEARCH_INDEXER_CLASS:
        return

    if countdown > 0:
        # While the throttle flag is set, further calls are skipped, so the
        # batch indexation task is enqueued only once per countdown window.
        if batch_indexer_throttle_acquire(timeout=countdown):
            logger.info(
                "Add task for batch document indexation from updated_at=%s in %d seconds",
                item.updated_at.isoformat(),
                countdown,
            )

            batch_document_indexer_task.apply_async(
                args=[item.updated_at], countdown=countdown
            )
        else:
            logger.info("Skip task for batch document %s indexation", item.pk)
    else:
        document_indexer_task.apply(args=[item.pk])
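To make the debounce semantics concrete, a small sketch of the `cache.add()` behaviour the throttle relies on (the key name comes from the module above; any Django cache backend works for the illustration):

```python
from django.core.cache import cache

KEY = "document-batch-indexer-throttle"

# The first caller within the window sets the flag and wins the right
# to enqueue the batch indexation task.
assert cache.add(KEY, 1, timeout=10) is True

# Every later caller during the 10-second window is throttled.
assert cache.add(KEY, 1, timeout=10) is False
```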
65  src/backend/core/tests/commands/test_index.py (new file)
@@ -0,0 +1,65 @@
"""
Unit tests for the `index` command.
"""

from operator import itemgetter
from unittest import mock

from django.core.management import CommandError, call_command
from django.db import transaction

import pytest

from core import factories
from core.services.search_indexers import SearchIndexer


@pytest.mark.django_db
@pytest.mark.usefixtures("indexer_settings")
def test_index():
    """Test the `index` command that runs the Find app indexer for all the available documents."""
    user = factories.UserFactory()
    indexer = SearchIndexer()

    with transaction.atomic():
        doc = factories.DocumentFactory()
        empty_doc = factories.DocumentFactory(title=None, content="")
        no_title_doc = factories.DocumentFactory(title=None)

        factories.UserDocumentAccessFactory(document=doc, user=user)
        factories.UserDocumentAccessFactory(document=empty_doc, user=user)
        factories.UserDocumentAccessFactory(document=no_title_doc, user=user)

    accesses = {
        str(doc.path): {"users": [user.sub]},
        str(empty_doc.path): {"users": [user.sub]},
        str(no_title_doc.path): {"users": [user.sub]},
    }

    with mock.patch.object(SearchIndexer, "push") as mock_push:
        call_command("index")

    push_call_args = [call.args[0] for call in mock_push.call_args_list]

    # called once but with a batch of docs
    mock_push.assert_called_once()

    assert sorted(push_call_args[0], key=itemgetter("id")) == sorted(
        [
            indexer.serialize_document(doc, accesses),
            indexer.serialize_document(no_title_doc, accesses),
        ],
        key=itemgetter("id"),
    )


@pytest.mark.django_db
@pytest.mark.usefixtures("indexer_settings")
def test_index_improperly_configured(indexer_settings):
    """The command should raise an exception if the indexer is not configured"""
    indexer_settings.SEARCH_INDEXER_CLASS = None

    with pytest.raises(CommandError) as err:
        call_command("index")

    assert str(err.value) == "The indexer is not enabled or properly configured."
@@ -24,3 +24,30 @@ def mock_user_teams():
        "core.models.User.teams", new_callable=mock.PropertyMock
    ) as mock_teams:
        yield mock_teams


@pytest.fixture(name="indexer_settings")
def indexer_settings_fixture(settings):
    """
    Set up valid settings for the document indexer. Clear the indexer cache.
    """

    # pylint: disable-next=import-outside-toplevel
    from core.services.search_indexers import (  # noqa: PLC0415
        get_document_indexer,
    )

    get_document_indexer.cache_clear()

    settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.SearchIndexer"
    settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest"
    settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
    settings.SEARCH_INDEXER_QUERY_URL = (
        "http://localhost:8081/api/v1.0/documents/search/"
    )
    settings.SEARCH_INDEXER_COUNTDOWN = 1

    yield settings

    # clear cache to prevent issues with other tests
    get_document_indexer.cache_clear()
425  src/backend/core/tests/documents/test_api_documents_search.py (new file)
@@ -0,0 +1,425 @@
"""
Tests for the Documents API endpoint in impress's core app: search
"""

import random
from json import loads as json_loads

from django.test import RequestFactory

import pytest
import responses
from faker import Faker
from rest_framework.test import APIClient

from core import factories, models
from core.services.search_indexers import get_document_indexer

fake = Faker()
pytestmark = pytest.mark.django_db


def build_search_url(**kwargs):
    """Build an absolute uri for the search endpoint with ORDERED query arguments"""
    return (
        RequestFactory()
        .get("/api/v1.0/documents/search/", dict(sorted(kwargs.items())))
        .build_absolute_uri()
    )


@pytest.mark.parametrize("role", models.LinkRoleChoices.values)
@pytest.mark.parametrize("reach", models.LinkReachChoices.values)
@responses.activate
def test_api_documents_search_anonymous(reach, role, indexer_settings):
    """
    Anonymous users should not retrieve any documents from search, whatever the
    link reach and link role.
    """
    indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    factories.DocumentFactory(link_reach=reach, link_role=role)

    # Find response
    responses.add(
        responses.POST,
        "http://find/api/v1.0/search",
        json=[],
        status=200,
    )

    response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"})

    assert response.status_code == 200
    assert response.json() == {
        "count": 0,
        "next": None,
        "previous": None,
        "results": [],
    }


def test_api_documents_search_endpoint_is_none(indexer_settings):
    """
    Missing SEARCH_INDEXER_QUERY_URL, so the indexer is not properly configured.
    Should fall back on the title filter.
    """
    indexer_settings.SEARCH_INDEXER_QUERY_URL = None

    assert get_document_indexer() is None

    user = factories.UserFactory()
    document = factories.DocumentFactory(title="alpha")
    access = factories.UserDocumentAccessFactory(document=document, user=user)

    client = APIClient()
    client.force_login(user)

    response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})

    assert response.status_code == 200
    content = response.json()
    results = content.pop("results")
    assert content == {
        "count": 1,
        "next": None,
        "previous": None,
    }
    assert len(results) == 1
    assert results[0] == {
        "id": str(document.id),
        "abilities": document.get_abilities(user),
        "ancestors_link_reach": None,
        "ancestors_link_role": None,
        "computed_link_reach": document.computed_link_reach,
        "computed_link_role": document.computed_link_role,
        "created_at": document.created_at.isoformat().replace("+00:00", "Z"),
        "creator": str(document.creator.id),
        "depth": 1,
        "excerpt": document.excerpt,
        "link_reach": document.link_reach,
        "link_role": document.link_role,
        "nb_accesses_ancestors": 1,
        "nb_accesses_direct": 1,
        "numchild": 0,
        "path": document.path,
        "title": document.title,
        "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
        "deleted_at": None,
        "user_role": access.role,
    }


@responses.activate
def test_api_documents_search_invalid_params(indexer_settings):
    """Invalid or missing query parameters should be rejected with a 400 error."""
    indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    response = client.get("/api/v1.0/documents/search/")

    assert response.status_code == 400
    assert response.json() == {"q": ["This field is required."]}

    response = client.get("/api/v1.0/documents/search/", data={"q": " "})

    assert response.status_code == 400
    assert response.json() == {"q": ["This field may not be blank."]}

    response = client.get(
        "/api/v1.0/documents/search/", data={"q": "any", "page": "NaN"}
    )

    assert response.status_code == 400
    assert response.json() == {"page": ["A valid integer is required."]}


@responses.activate
def test_api_documents_search_format(indexer_settings):
    """Validate the format of documents as returned by the search view."""
    indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    assert get_document_indexer() is not None

    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    user_a, user_b, user_c = factories.UserFactory.create_batch(3)
    document = factories.DocumentFactory(
        title="alpha",
        users=(user_a, user_c),
        link_traces=(user, user_b),
    )
    access = factories.UserDocumentAccessFactory(document=document, user=user)

    # Find response
    responses.add(
        responses.POST,
        "http://find/api/v1.0/search",
        json=[
            {"_id": str(document.pk)},
        ],
        status=200,
    )
    response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})

    assert response.status_code == 200
    content = response.json()
    results = content.pop("results")
    assert content == {
        "count": 1,
        "next": None,
        "previous": None,
    }
    assert len(results) == 1
    assert results[0] == {
        "id": str(document.id),
        "abilities": document.get_abilities(user),
        "ancestors_link_reach": None,
        "ancestors_link_role": None,
        "computed_link_reach": document.computed_link_reach,
        "computed_link_role": document.computed_link_role,
        "created_at": document.created_at.isoformat().replace("+00:00", "Z"),
        "creator": str(document.creator.id),
        "depth": 1,
        "excerpt": document.excerpt,
        "link_reach": document.link_reach,
        "link_role": document.link_role,
        "nb_accesses_ancestors": 3,
        "nb_accesses_direct": 3,
        "numchild": 0,
        "path": document.path,
        "title": document.title,
        "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
        "deleted_at": None,
        "user_role": access.role,
    }


@responses.activate
@pytest.mark.parametrize(
    "pagination, status, expected",
    (
        (
            {"page": 1, "page_size": 10},
            200,
            {
                "count": 10,
                "previous": None,
                "next": None,
                "range": (0, None),
            },
        ),
        (
            {},
            200,
            {
                "count": 10,
                "previous": None,
                "next": None,
                "range": (0, None),
                "api_page_size": 21,  # default page_size is 20
            },
        ),
        (
            {"page": 2, "page_size": 10},
            404,
            {},
        ),
        (
            {"page": 1, "page_size": 5},
            200,
            {
                "count": 10,
                "previous": None,
                "next": {"page": 2, "page_size": 5},
                "range": (0, 5),
            },
        ),
        (
            {"page": 2, "page_size": 5},
            200,
            {
                "count": 10,
                "previous": {"page_size": 5},
                "next": None,
                "range": (5, None),
            },
        ),
        ({"page": 3, "page_size": 5}, 404, {}),
    ),
)
def test_api_documents_search_pagination(
    indexer_settings, pagination, status, expected
):
    """Documents should be ordered by descending "score" by default"""
    indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    assert get_document_indexer() is not None

    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    docs = factories.DocumentFactory.create_batch(10, title="alpha", users=[user])

    docs_by_uuid = {str(doc.pk): doc for doc in docs}
    api_results = [{"_id": id} for id in docs_by_uuid.keys()]

    # reorder randomly to simulate score ordering
    random.shuffle(api_results)

    # Find response
    # pylint: disable-next=assignment-from-none
    api_search = responses.add(
        responses.POST,
        "http://find/api/v1.0/search",
        json=api_results,
        status=200,
    )

    response = client.get(
        "/api/v1.0/documents/search/",
        data={
            "q": "alpha",
            **pagination,
        },
    )

    assert response.status_code == status

    if response.status_code < 300:
        previous_url = (
            build_search_url(q="alpha", **expected["previous"])
            if expected["previous"]
            else None
        )
        next_url = (
            build_search_url(q="alpha", **expected["next"])
            if expected["next"]
            else None
        )
        start, end = expected["range"]

        content = response.json()

        assert content["count"] == expected["count"]
        assert content["previous"] == previous_url
        assert content["next"] == next_url

        results = content.pop("results")

        # The find api results ordering by score is kept
        assert [r["id"] for r in results] == [r["_id"] for r in api_results[start:end]]

        # Check the query parameters.
        assert api_search.call_count == 1
        assert api_search.calls[0].response.status_code == 200
        assert json_loads(api_search.calls[0].request.body) == {
            "q": "alpha",
            "visited": [],
            "services": ["docs"],
            "nb_results": 50,
            "order_by": "updated_at",
            "order_direction": "desc",
        }


@responses.activate
@pytest.mark.parametrize(
    "pagination, status, expected",
    (
        (
            {"page": 1, "page_size": 10},
            200,
            {"count": 10, "previous": None, "next": None, "range": (0, None)},
        ),
        (
            {},
            200,
            {"count": 10, "previous": None, "next": None, "range": (0, None)},
        ),
        (
            {"page": 2, "page_size": 10},
            404,
            {},
        ),
        (
            {"page": 1, "page_size": 5},
            200,
            {
                "count": 10,
                "previous": None,
                "next": {"page": 2, "page_size": 5},
                "range": (0, 5),
            },
        ),
        (
            {"page": 2, "page_size": 5},
            200,
            {
                "count": 10,
                "previous": {"page_size": 5},
                "next": None,
                "range": (5, None),
            },
        ),
        ({"page": 3, "page_size": 5}, 404, {}),
    ),
)
def test_api_documents_search_pagination_endpoint_is_none(
    indexer_settings, pagination, status, expected
):
    """Documents should be ordered by descending "-updated_at" by default"""
    indexer_settings.SEARCH_INDEXER_QUERY_URL = None

    assert get_document_indexer() is None

    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    factories.DocumentFactory.create_batch(10, title="alpha", users=[user])

    response = client.get(
        "/api/v1.0/documents/search/",
        data={
            "q": "alpha",
            **pagination,
        },
    )

    assert response.status_code == status

    if response.status_code < 300:
        previous_url = (
            build_search_url(q="alpha", **expected["previous"])
            if expected["previous"]
            else None
        )
        next_url = (
            build_search_url(q="alpha", **expected["next"])
            if expected["next"]
            else None
        )
        queryset = models.Document.objects.order_by("-updated_at")
        start, end = expected["range"]
        expected_results = [str(d.pk) for d in queryset[start:end]]

        content = response.json()

        assert content["count"] == expected["count"]
        assert content["previous"] == previous_url
        assert content["next"] == next_url

        results = content.pop("results")

        assert [r["id"] for r in results] == expected_results
441  src/backend/core/tests/test_models_documents_indexer.py (new file)
@@ -0,0 +1,441 @@
|
||||
"""
|
||||
Unit tests for the Document model
|
||||
"""
|
||||
# pylint: disable=too-many-lines
|
||||
|
||||
from operator import itemgetter
|
||||
from unittest import mock
|
||||
|
||||
from django.core.cache import cache
|
||||
from django.db import transaction
|
||||
|
||||
import pytest
|
||||
|
||||
from core import factories, models
|
||||
from core.services.search_indexers import SearchIndexer
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
def reset_batch_indexer_throttle():
|
||||
"""Reset throttle flag"""
|
||||
cache.delete("document-batch-indexer-throttle")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_throttle():
|
||||
"""Reset throttle flag before each test"""
|
||||
reset_batch_indexer_throttle()
|
||||
yield
|
||||
reset_batch_indexer_throttle()
|
||||
|
||||
|
||||
@mock.patch.object(SearchIndexer, "push")
|
||||
@pytest.mark.usefixtures("indexer_settings")
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_models_documents_post_save_indexer(mock_push):
|
||||
"""Test indexation task on document creation"""
|
||||
with transaction.atomic():
|
||||
doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
|
||||
|
||||
accesses = {}
|
||||
data = [call.args[0] for call in mock_push.call_args_list]
|
||||
|
||||
indexer = SearchIndexer()
|
||||
|
||||
assert len(data) == 1
|
||||
|
||||
# One call
|
||||
assert sorted(data[0], key=itemgetter("id")) == sorted(
|
||||
[
|
||||
indexer.serialize_document(doc1, accesses),
|
||||
indexer.serialize_document(doc2, accesses),
|
||||
indexer.serialize_document(doc3, accesses),
|
||||
],
|
||||
key=itemgetter("id"),
|
||||
)
|
||||
|
||||
# The throttle counters should be reset
|
||||
assert cache.get("document-batch-indexer-throttle") == 1
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_models_documents_post_save_indexer_no_batches(indexer_settings):
|
||||
"""Test indexation task on doculment creation, no throttle"""
|
||||
indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
|
||||
|
||||
with mock.patch.object(SearchIndexer, "push") as mock_push:
|
||||
with transaction.atomic():
|
||||
doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
|
||||
|
||||
accesses = {}
|
||||
data = [call.args[0] for call in mock_push.call_args_list]
|
||||
|
||||
indexer = SearchIndexer()
|
||||
|
||||
# 3 calls
|
||||
assert len(data) == 3
|
||||
# one document per call
|
||||
assert [len(d) for d in data] == [1] * 3
|
||||
# all documents are indexed
|
||||
assert sorted([d[0] for d in data], key=itemgetter("id")) == sorted(
|
||||
[
|
||||
indexer.serialize_document(doc1, accesses),
|
||||
indexer.serialize_document(doc2, accesses),
|
||||
indexer.serialize_document(doc3, accesses),
|
||||
],
|
||||
key=itemgetter("id"),
|
||||
)
|
||||
|
||||
# The throttle counters should be reset
|
||||
assert cache.get("file-batch-indexer-throttle") is None
|
||||
|
||||
|
||||
@mock.patch.object(SearchIndexer, "push")
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings):
|
||||
"""Task should not start an indexation when disabled"""
|
||||
indexer_settings.SEARCH_INDEXER_CLASS = None
|
||||
|
||||
user = factories.UserFactory()
|
||||
|
||||
with transaction.atomic():
|
||||
doc = factories.DocumentFactory()
|
||||
factories.UserDocumentAccessFactory(document=doc, user=user)
|
||||
|
||||
assert mock_push.assert_not_called
|
||||
|
||||
|
||||
@mock.patch.object(SearchIndexer, "push")
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_models_documents_post_save_indexer_wrongly_configured(
|
||||
mock_push, indexer_settings
|
||||
):
|
||||
"""Task should not start an indexation when disabled"""
|
||||
indexer_settings.SEARCH_INDEXER_URL = None
|
||||
|
||||
user = factories.UserFactory()
|
||||
|
||||
with transaction.atomic():
|
||||
doc = factories.DocumentFactory()
|
||||
factories.UserDocumentAccessFactory(document=doc, user=user)
|
||||
|
||||
assert mock_push.assert_not_called
|
||||
|
||||
|
||||
@mock.patch.object(SearchIndexer, "push")
|
||||
@pytest.mark.usefixtures("indexer_settings")
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_models_documents_post_save_indexer_with_accesses(mock_push):
|
||||
"""Test indexation task on document creation"""
|
||||
user = factories.UserFactory()
|
||||
|
||||
with transaction.atomic():
|
||||
doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
|
||||
|
||||
factories.UserDocumentAccessFactory(document=doc1, user=user)
|
||||
factories.UserDocumentAccessFactory(document=doc2, user=user)
|
||||
factories.UserDocumentAccessFactory(document=doc3, user=user)
|
||||
|
||||
accesses = {
|
||||
str(doc1.path): {"users": [user.sub]},
|
||||
str(doc2.path): {"users": [user.sub]},
|
||||
str(doc3.path): {"users": [user.sub]},
|
||||
}
|
||||
|
||||
data = [call.args[0] for call in mock_push.call_args_list]
|
||||
|
||||
indexer = SearchIndexer()
|
||||
|
||||
assert len(data) == 1
|
||||
assert sorted(data[0], key=itemgetter("id")) == sorted(
|
||||
[
|
||||
indexer.serialize_document(doc1, accesses),
|
||||
indexer.serialize_document(doc2, accesses),
|
||||
indexer.serialize_document(doc3, accesses),
|
||||
],
|
||||
key=itemgetter("id"),
|
||||
)
|
||||
|
||||
|
||||
@mock.patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_deleted(mock_push):
    """Indexation task on deleted or ancestor_deleted documents"""
    user = factories.UserFactory()

    with transaction.atomic():
        doc = factories.DocumentFactory(
            link_reach=models.LinkReachChoices.AUTHENTICATED
        )
        main_doc = factories.DocumentFactory(
            link_reach=models.LinkReachChoices.AUTHENTICATED
        )
        child_doc = factories.DocumentFactory(
            parent=main_doc,
            link_reach=models.LinkReachChoices.AUTHENTICATED,
        )

        factories.UserDocumentAccessFactory(document=doc, user=user)
        factories.UserDocumentAccessFactory(document=main_doc, user=user)
        factories.UserDocumentAccessFactory(document=child_doc, user=user)

    # Manually reset the throttle flag here or the next indexation will be ignored for 1 second
    reset_batch_indexer_throttle()

    with transaction.atomic():
        main_doc_deleted = models.Document.objects.get(pk=main_doc.pk)
        main_doc_deleted.soft_delete()

        child_doc_deleted = models.Document.objects.get(pk=child_doc.pk)

    main_doc_deleted.refresh_from_db()
    child_doc_deleted.refresh_from_db()

    assert main_doc_deleted.deleted_at is not None
    assert child_doc_deleted.ancestors_deleted_at is not None

    assert child_doc_deleted.deleted_at is None
    assert child_doc_deleted.ancestors_deleted_at is not None

    accesses = {
        str(doc.path): {"users": [user.sub]},
        str(main_doc_deleted.path): {"users": [user.sub]},
        str(child_doc_deleted.path): {"users": [user.sub]},
    }

    data = [call.args[0] for call in mock_push.call_args_list]

    indexer = SearchIndexer()

    assert len(data) == 2

    # First indexation on document creation
    assert sorted(data[0], key=itemgetter("id")) == sorted(
        [
            indexer.serialize_document(doc, accesses),
            indexer.serialize_document(main_doc, accesses),
            indexer.serialize_document(child_doc, accesses),
        ],
        key=itemgetter("id"),
    )

    # Even deleted items are re-indexed; in the future this could be reduced to a status update
    assert sorted(data[1], key=itemgetter("id")) == sorted(
        [
            indexer.serialize_document(main_doc_deleted, accesses),  # soft_delete()
            indexer.serialize_document(child_doc_deleted, accesses),
        ],
        key=itemgetter("id"),
    )

@pytest.mark.django_db(transaction=True)
@pytest.mark.usefixtures("indexer_settings")
def test_models_documents_indexer_hard_deleted():
    """Indexation task on hard deleted document"""
    user = factories.UserFactory()

    with transaction.atomic():
        doc = factories.DocumentFactory(
            link_reach=models.LinkReachChoices.AUTHENTICATED
        )
        factories.UserDocumentAccessFactory(document=doc, user=user)

    # Call task on deleted document.
    with mock.patch.object(SearchIndexer, "push") as mock_push:
        doc.delete()

    # Hard-deleted documents are not re-indexed.
    mock_push.assert_not_called()

@mock.patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_restored(mock_push):
    """Restart indexation task on restored documents"""
    user = factories.UserFactory()

    with transaction.atomic():
        doc = factories.DocumentFactory(
            link_reach=models.LinkReachChoices.AUTHENTICATED
        )
        doc_deleted = factories.DocumentFactory(
            link_reach=models.LinkReachChoices.AUTHENTICATED
        )
        doc_ancestor_deleted = factories.DocumentFactory(
            parent=doc_deleted,
            link_reach=models.LinkReachChoices.AUTHENTICATED,
        )

        factories.UserDocumentAccessFactory(document=doc, user=user)
        factories.UserDocumentAccessFactory(document=doc_deleted, user=user)
        factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user)

        doc_deleted.soft_delete()

    doc_deleted.refresh_from_db()
    doc_ancestor_deleted.refresh_from_db()

    assert doc_deleted.deleted_at is not None
    assert doc_deleted.ancestors_deleted_at is not None

    assert doc_ancestor_deleted.deleted_at is None
    assert doc_ancestor_deleted.ancestors_deleted_at is not None

    # Manually reset the throttle flag here or the next indexation will be ignored for 1 second
    reset_batch_indexer_throttle()

    with transaction.atomic():
        doc_restored = models.Document.objects.get(pk=doc_deleted.pk)
        doc_restored.restore()

        doc_ancestor_restored = models.Document.objects.get(pk=doc_ancestor_deleted.pk)

    assert doc_restored.deleted_at is None
    assert doc_restored.ancestors_deleted_at is None

    assert doc_ancestor_restored.deleted_at is None
    assert doc_ancestor_restored.ancestors_deleted_at is None

    accesses = {
        str(doc.path): {"users": [user.sub]},
        str(doc_deleted.path): {"users": [user.sub]},
        str(doc_ancestor_deleted.path): {"users": [user.sub]},
    }

    data = [call.args[0] for call in mock_push.call_args_list]

    indexer = SearchIndexer()

    # All docs are re-indexed
    assert len(data) == 2

    # First indexation on items creation & soft delete (in the same transaction)
    assert sorted(data[0], key=itemgetter("id")) == sorted(
        [
            indexer.serialize_document(doc, accesses),
            indexer.serialize_document(doc_deleted, accesses),
            indexer.serialize_document(doc_ancestor_deleted, accesses),
        ],
        key=itemgetter("id"),
    )

    # Restored items are re-indexed; in the future this could be reduced to a status update
    assert sorted(data[1], key=itemgetter("id")) == sorted(
        [
            indexer.serialize_document(doc_restored, accesses),  # restore()
            indexer.serialize_document(doc_ancestor_restored, accesses),
        ],
        key=itemgetter("id"),
    )

@pytest.mark.django_db(transaction=True)
@pytest.mark.usefixtures("indexer_settings")
def test_models_documents_post_save_indexer_throttle():
    """Test indexation task skipping on document update"""
    indexer = SearchIndexer()
    user = factories.UserFactory()

    with mock.patch.object(SearchIndexer, "push"):
        with transaction.atomic():
            docs = factories.DocumentFactory.create_batch(5, users=(user,))

    accesses = {str(item.path): {"users": [user.sub]} for item in docs}

    with mock.patch.object(SearchIndexer, "push") as mock_push:
        # Simulate 1 running task
        cache.set("document-batch-indexer-throttle", 1)

        # save doc to trigger the indexer, but nothing should be done since
        # the flag is up
        with transaction.atomic():
            docs[0].save()
            docs[2].save()
            docs[3].save()

        assert [call.args[0] for call in mock_push.call_args_list] == []

    with mock.patch.object(SearchIndexer, "push") as mock_push:
        # No waiting task
        cache.delete("document-batch-indexer-throttle")

        with transaction.atomic():
            docs[0].save()
            docs[2].save()
            docs[3].save()

        data = [call.args[0] for call in mock_push.call_args_list]

        # One call
        assert len(data) == 1

        assert sorted(data[0], key=itemgetter("id")) == sorted(
            [
                indexer.serialize_document(docs[0], accesses),
                indexer.serialize_document(docs[2], accesses),
                indexer.serialize_document(docs[3], accesses),
            ],
            key=itemgetter("id"),
        )

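The two scenarios above pin down the debounce contract from #1276: a cache flag marks that an indexing task is already scheduled, and saves arriving while the flag is up are coalesced into that pending task. A minimal sketch of the idea (task and helper names are assumed from the tests, not the actual implementation):

# Hypothetical sketch of the cache-based debounce exercised above
from celery import shared_task
from django.core.cache import cache

THROTTLE_KEY = "document-batch-indexer-throttle"


def schedule_batch_indexation(countdown=1):
    """Coalesce a burst of saves into a single delayed indexing task."""
    # cache.add is atomic: it only sets the key if absent, so the first save
    # in a window schedules the task and the following saves are no-ops.
    if cache.add(THROTTLE_KEY, 1):
        batch_index_documents.apply_async(countdown=countdown)


@shared_task
def batch_index_documents():
    """Index every document touched since the last run, then drop the flag."""
    cache.delete(THROTTLE_KEY)  # saves from now on reschedule a new task
    # ... gather pending documents and SearchIndexer().push() them ...
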
@pytest.mark.django_db(transaction=True)
@pytest.mark.usefixtures("indexer_settings")
def test_models_documents_access_post_save_indexer():
    """Test indexation task on DocumentAccess update"""
    users = factories.UserFactory.create_batch(3)

    with mock.patch.object(SearchIndexer, "push"):
        with transaction.atomic():
            doc = factories.DocumentFactory(users=users)
            doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by(
                "user__sub"
            )

    reset_batch_indexer_throttle()

    with mock.patch.object(SearchIndexer, "push") as mock_push:
        with transaction.atomic():
            for doc_access in doc_accesses:
                doc_access.save()

    data = [call.args[0] for call in mock_push.call_args_list]

    # One call
    assert len(data) == 1

    assert [d["id"] for d in data[0]] == [str(doc.pk)]

@pytest.mark.django_db(transaction=True)
def test_models_items_access_post_save_indexer_no_throttle(indexer_settings):
    """Test indexation task on DocumentAccess update, no throttle"""
    indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0

    users = factories.UserFactory.create_batch(3)

    with transaction.atomic():
        doc = factories.DocumentFactory(users=users)
        doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by(
            "user__sub"
        )

    reset_batch_indexer_throttle()

    with mock.patch.object(SearchIndexer, "push") as mock_push:
        with transaction.atomic():
            for doc_access in doc_accesses:
                doc_access.save()

    data = [call.args[0] for call in mock_push.call_args_list]

    # 3 calls
    assert len(data) == 3
    # one document per call
    assert [len(d) for d in data] == [1] * 3
    # the same document is indexed 3 times
    assert [d[0]["id"] for d in data] == [str(doc.pk)] * 3
635
src/backend/core/tests/test_services_search_indexers.py
Normal file
@@ -0,0 +1,635 @@
"""Tests for Documents search indexers"""

from functools import partial
from json import dumps as json_dumps
from unittest.mock import patch

from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.utils.module_loading import import_string

import pytest
import responses
from requests import HTTPError

from core import factories, models, utils
from core.services.search_indexers import (
    BaseDocumentIndexer,
    SearchIndexer,
    get_document_indexer,
    get_visited_document_ids_of,
)

pytestmark = pytest.mark.django_db


class FakeDocumentIndexer(BaseDocumentIndexer):
    """Fake indexer for test purpose"""

    def serialize_document(self, document, accesses):
        return {}

    def push(self, data):
        pass

    def search_query(self, data, token):
        return {}

def test_services_search_indexer_class_invalid(indexer_settings):
    """
    Should return None if SEARCH_INDEXER_CLASS cannot be imported.
    """
    indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown"

    assert get_document_indexer() is None

def test_services_search_indexer_class(indexer_settings):
    """
    Import indexer class defined in setting SEARCH_INDEXER_CLASS.
    """
    indexer_settings.SEARCH_INDEXER_CLASS = (
        "core.tests.test_services_search_indexers.FakeDocumentIndexer"
    )

    assert isinstance(
        get_document_indexer(),
        import_string("core.tests.test_services_search_indexers.FakeDocumentIndexer"),
    )

def test_services_search_indexer_is_configured(indexer_settings):
    """
    Should return an indexer only when the indexer class and the other
    configuration settings are valid.
    """
    indexer_settings.SEARCH_INDEXER_CLASS = None

    # None
    get_document_indexer.cache_clear()
    assert not get_document_indexer()

    # Empty
    indexer_settings.SEARCH_INDEXER_CLASS = ""

    get_document_indexer.cache_clear()
    assert not get_document_indexer()

    # Valid class
    indexer_settings.SEARCH_INDEXER_CLASS = (
        "core.services.search_indexers.SearchIndexer"
    )

    get_document_indexer.cache_clear()
    assert get_document_indexer() is not None

    # Invalid url
    indexer_settings.SEARCH_INDEXER_URL = ""

    get_document_indexer.cache_clear()
    assert not get_document_indexer()

def test_services_search_indexer_url_is_none(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_URL is None.
    """
    indexer_settings.SEARCH_INDEXER_URL = None

    with pytest.raises(ImproperlyConfigured) as exc_info:
        SearchIndexer()

    assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)


def test_services_search_indexer_url_is_empty(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_URL is an empty string.
    """
    indexer_settings.SEARCH_INDEXER_URL = ""

    with pytest.raises(ImproperlyConfigured) as exc_info:
        SearchIndexer()

    assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)


def test_services_search_indexer_secret_is_none(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_SECRET is None.
    """
    indexer_settings.SEARCH_INDEXER_SECRET = None

    with pytest.raises(ImproperlyConfigured) as exc_info:
        SearchIndexer()

    assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
        exc_info.value
    )


def test_services_search_indexer_secret_is_empty(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_SECRET is an empty string.
    """
    indexer_settings.SEARCH_INDEXER_SECRET = ""

    with pytest.raises(ImproperlyConfigured) as exc_info:
        SearchIndexer()

    assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
        exc_info.value
    )


def test_services_search_endpoint_is_none(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is None.
    """
    indexer_settings.SEARCH_INDEXER_QUERY_URL = None

    with pytest.raises(ImproperlyConfigured) as exc_info:
        SearchIndexer()

    assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
        exc_info.value
    )


def test_services_search_endpoint_is_empty(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is empty.
    """
    indexer_settings.SEARCH_INDEXER_QUERY_URL = ""

    with pytest.raises(ImproperlyConfigured) as exc_info:
        SearchIndexer()

    assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
        exc_info.value
    )

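The six tests above all assert the same failure mode, so the constructor presumably guards each setting with the same check. A sketch of that guard (the helper is hypothetical; only the error messages are pinned by the tests):

# Hypothetical shape of the configuration guard in SearchIndexer.__init__
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured


def _require_setting(name):
    """Return the setting value or fail loudly, as the tests above expect."""
    value = getattr(settings, name, None)
    if not value:  # rejects both None and the empty string
        raise ImproperlyConfigured(f"{name} must be set in Django settings.")
    return value

SearchIndexer.__init__ would then call such a helper for SEARCH_INDEXER_URL, SEARCH_INDEXER_SECRET and SEARCH_INDEXER_QUERY_URL.
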
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_serialize_document_returns_expected_json():
    """
    It should serialize documents with correct metadata and access control.
    """
    user_a, user_b = factories.UserFactory.create_batch(2)
    document = factories.DocumentFactory()
    factories.DocumentFactory(parent=document)

    factories.UserDocumentAccessFactory(document=document, user=user_a)
    factories.UserDocumentAccessFactory(document=document, user=user_b)
    factories.TeamDocumentAccessFactory(document=document, team="team1")
    factories.TeamDocumentAccessFactory(document=document, team="team2")

    accesses = {
        document.path: {
            "users": {str(user_a.sub), str(user_b.sub)},
            "teams": {"team1", "team2"},
        }
    }

    indexer = SearchIndexer()
    result = indexer.serialize_document(document, accesses)

    assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
    assert set(result.pop("groups")) == {"team1", "team2"}
    assert result == {
        "id": str(document.id),
        "title": document.title,
        "depth": 1,
        "path": document.path,
        "numchild": 1,
        "content": utils.base64_yjs_to_text(document.content),
        "created_at": document.created_at.isoformat(),
        "updated_at": document.updated_at.isoformat(),
        "reach": document.link_reach,
        "size": 13,
        "is_active": True,
    }

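Put together, the indexer emits one flat JSON object per document. For the document above, the pushed payload looks roughly like this (identifiers are placeholders, not real values):

# Illustrative serialized document (placeholder values)
{
    "id": "<document uuid>",
    "title": "<document title>",
    "depth": 1,
    "path": "<materialized path, e.g. '0001'>",
    "numchild": 1,
    "content": "<plain text extracted from the Yjs update>",
    "created_at": "2025-01-01T00:00:00+00:00",
    "updated_at": "2025-01-01T00:00:00+00:00",
    "reach": "<document.link_reach>",
    "size": 13,  # content size; 13 for the factory's default content
    "users": ["<user_a.sub>", "<user_b.sub>"],
    "groups": ["team1", "team2"],
    "is_active": True,  # False once the document or an ancestor is soft-deleted
}
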
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_serialize_document_deleted():
    """Deleted documents are marked as inactive in the serialized JSON."""
    parent = factories.DocumentFactory()
    document = factories.DocumentFactory(parent=parent)

    parent.soft_delete()
    document.refresh_from_db()

    indexer = SearchIndexer()
    result = indexer.serialize_document(document, {})

    assert result["is_active"] is False


@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_serialize_document_empty():
    """Empty documents return empty content in the serialized JSON."""
    document = factories.DocumentFactory(content="", title=None)

    indexer = SearchIndexer()
    result = indexer.serialize_document(document, {})

    assert result["content"] == ""
    assert result["title"] == ""

@responses.activate
def test_services_search_indexers_index_errors(indexer_settings):
    """
    Document indexing should propagate HTTP errors from the Find API.
    """
    factories.DocumentFactory()

    indexer_settings.SEARCH_INDEXER_URL = "http://app-find/api/v1.0/documents/index/"

    responses.add(
        responses.POST,
        "http://app-find/api/v1.0/documents/index/",
        status=401,
        body=json_dumps({"message": "Authentication failed."}),
    )

    with pytest.raises(HTTPError):
        SearchIndexer().index()

@patch.object(SearchIndexer, "push")
def test_services_search_indexers_batches_pass_only_batch_accesses(
    mock_push, indexer_settings
):
    """
    Documents indexing should be processed in batches,
    and only the access data relevant to each batch should be used.
    """
    indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2
    documents = factories.DocumentFactory.create_batch(5)

    # Attach a single user access to each document
    expected_user_subs = {}
    for document in documents:
        access = factories.UserDocumentAccessFactory(document=document)
        expected_user_subs[str(document.id)] = str(access.user.sub)

    assert SearchIndexer().index() == 5

    # Should be 3 batches: 2 + 2 + 1
    assert mock_push.call_count == 3

    seen_doc_ids = set()

    for call in mock_push.call_args_list:
        batch = call.args[0]
        assert isinstance(batch, list)

        for doc_json in batch:
            doc_id = doc_json["id"]
            seen_doc_ids.add(doc_id)

            # Only one user expected per document
            assert doc_json["users"] == [expected_user_subs[doc_id]]
            assert doc_json["groups"] == []

    # Make sure all 5 documents were indexed
    assert seen_doc_ids == {str(d.id) for d in documents}

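These assertions outline how index() presumably walks the table: chunk the queryset, resolve accesses per chunk, skip empty documents, push non-empty payloads. A sketch under those assumptions (compute_accesses is a hypothetical helper; the real method may differ):

# Hypothetical sketch of the batching loop behind SearchIndexer.index()
from django.conf import settings

from core import models


class IndexerSketch:
    def index(self, batch_size=None):
        batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE
        queryset = models.Document.objects.order_by("path")
        count = 0
        for start in range(0, queryset.count(), batch_size):
            batch = list(queryset[start : start + batch_size])
            # Accesses are resolved for this batch only, never the whole table
            accesses = self.compute_accesses(batch)  # hypothetical helper
            payload = [
                self.serialize_document(document, accesses)
                for document in batch
                if document.title or document.content  # empty documents are skipped
            ]
            if payload:  # a batch holding only empty documents is not pushed
                self.push(payload)
            count += len(payload)
        return count
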
@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_batch_size_argument(mock_push):
    """
    Documents indexing should be processed in batches,
    and the batch_size argument overrides SEARCH_INDEXER_BATCH_SIZE.
    """
    documents = factories.DocumentFactory.create_batch(5)

    # Attach a single user access to each document
    expected_user_subs = {}
    for document in documents:
        access = factories.UserDocumentAccessFactory(document=document)
        expected_user_subs[str(document.id)] = str(access.user.sub)

    assert SearchIndexer().index(batch_size=2) == 5

    # Should be 3 batches: 2 + 2 + 1
    assert mock_push.call_count == 3

    seen_doc_ids = set()

    for call in mock_push.call_args_list:
        batch = call.args[0]
        assert isinstance(batch, list)

        for doc_json in batch:
            doc_id = doc_json["id"]
            seen_doc_ids.add(doc_id)

            # Only one user expected per document
            assert doc_json["users"] == [expected_user_subs[doc_id]]
            assert doc_json["groups"] == []

    # Make sure all 5 documents were indexed
    assert seen_doc_ids == {str(d.id) for d in documents}

@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ignore_empty_documents(mock_push):
    """
    Documents with neither title nor content should be skipped
    during indexing.
    """
    document = factories.DocumentFactory()
    factories.DocumentFactory(content="", title="")
    empty_title = factories.DocumentFactory(title="")
    empty_content = factories.DocumentFactory(content="")

    assert SearchIndexer().index() == 3

    assert mock_push.call_count == 1

    # Make sure only non-empty documents are indexed
    results = {doc["id"] for doc in mock_push.call_args[0][0]}
    assert results == {
        str(d.id)
        for d in (
            document,
            empty_content,
            empty_title,
        )
    }

@patch.object(SearchIndexer, "push")
def test_services_search_indexers_skip_empty_batches(mock_push, indexer_settings):
    """
    Batches that contain only empty documents should not be pushed.
    """
    indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2

    document = factories.DocumentFactory()

    # Only empty docs
    factories.DocumentFactory.create_batch(5, content="", title="")

    assert SearchIndexer().index() == 1
    assert mock_push.call_count == 1

    results = [doc["id"] for doc in mock_push.call_args[0][0]]
    assert results == [str(document.id)]

@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_link_reach(mock_push):
    """Document accesses and reach should take into account ancestors link reaches."""
    great_grand_parent = factories.DocumentFactory(link_reach="restricted")
    grand_parent = factories.DocumentFactory(
        parent=great_grand_parent, link_reach="authenticated"
    )
    parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
    document = factories.DocumentFactory(parent=parent, link_reach="restricted")

    assert SearchIndexer().index() == 4

    results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
    assert len(results) == 4
    assert results[str(great_grand_parent.id)]["reach"] == "restricted"
    assert results[str(grand_parent.id)]["reach"] == "authenticated"
    assert results[str(parent.id)]["reach"] == "public"
    assert results[str(document.id)]["reach"] == "public"

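In other words, a document's indexed reach is the most permissive link_reach found on itself or any of its ancestors (public > authenticated > restricted). A standalone illustration of that rule (not the project's code):

# Effective reach = widest reach along the ancestor chain (illustration only)
REACH_ORDER = {"restricted": 0, "authenticated": 1, "public": 2}


def effective_reach(chain):
    """`chain` lists link_reach values from the root ancestor down to the document."""
    return max(chain, key=REACH_ORDER.__getitem__)


# Matches the assertions above:
assert effective_reach(["restricted", "authenticated", "public", "restricted"]) == "public"
assert effective_reach(["restricted", "authenticated"]) == "authenticated"
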
@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_users(mock_push):
    """Document accesses and reach should include users from ancestors."""
    user_gp, user_p, user_d = factories.UserFactory.create_batch(3)

    grand_parent = factories.DocumentFactory(users=[user_gp])
    parent = factories.DocumentFactory(parent=grand_parent, users=[user_p])
    document = factories.DocumentFactory(parent=parent, users=[user_d])

    assert SearchIndexer().index() == 3

    results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
    assert len(results) == 3
    assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)]
    assert set(results[str(parent.id)]["users"]) == {str(user_gp.sub), str(user_p.sub)}
    assert set(results[str(document.id)]["users"]) == {
        str(user_gp.sub),
        str(user_p.sub),
        str(user_d.sub),
    }


@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_teams(mock_push):
    """Document accesses and reach should include teams from ancestors."""
    grand_parent = factories.DocumentFactory(teams=["team_gp"])
    parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"])
    document = factories.DocumentFactory(parent=parent, teams=["team_d"])

    assert SearchIndexer().index() == 3

    results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
    assert len(results) == 3
    assert results[str(grand_parent.id)]["groups"] == ["team_gp"]
    assert set(results[str(parent.id)]["groups"]) == {"team_gp", "team_p"}
    assert set(results[str(document.id)]["groups"]) == {"team_gp", "team_p", "team_d"}

@patch("requests.post")
def test_push_uses_correct_url_and_data(mock_post, indexer_settings):
    """
    push() should call requests.post with the correct URL from settings,
    a 10-second timeout, and the data as JSON.
    """
    indexer_settings.SEARCH_INDEXER_URL = "http://example.com/index"

    indexer = SearchIndexer()
    sample_data = [{"id": "123", "title": "Test"}]

    mock_response = mock_post.return_value
    mock_response.raise_for_status.return_value = None  # No error

    indexer.push(sample_data)

    mock_post.assert_called_once()
    args, kwargs = mock_post.call_args

    assert args[0] == indexer_settings.SEARCH_INDEXER_URL
    assert kwargs.get("json") == sample_data
    assert kwargs.get("timeout") == 10

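Those assertions pin push() down to a single POST; a sketch consistent with them (the Authorization header is an assumption; the test only checks URL, JSON body and timeout):

# Sketch of SearchIndexer.push() as constrained by the test above
import requests
from django.conf import settings


def push(data):
    response = requests.post(
        settings.SEARCH_INDEXER_URL,
        json=data,
        # Assumed: the shared secret authenticates Docs against Find
        headers={"Authorization": f"Bearer {settings.SEARCH_INDEXER_SECRET}"},
        timeout=10,
    )
    response.raise_for_status()  # lets HTTPError propagate, as the 401 test expects
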
def test_get_visited_document_ids_of():
    """
    get_visited_document_ids_of() returns the ids of the documents viewed
    by the user but for which they have no specific access configured
    (e.g. public documents reached via a link)
    """
    user = factories.UserFactory()
    other = factories.UserFactory()
    anonymous = AnonymousUser()
    queryset = models.Document.objects.all()

    assert not get_visited_document_ids_of(queryset, anonymous)
    assert not get_visited_document_ids_of(queryset, user)

    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)

    create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)

    create_link(document=doc1)
    create_link(document=doc2)

    # The third document is not visited
    assert sorted(get_visited_document_ids_of(queryset, user)) == sorted(
        [str(doc1.pk), str(doc2.pk)]
    )

    factories.UserDocumentAccessFactory(user=other, document=doc1)
    factories.UserDocumentAccessFactory(user=user, document=doc2)

    # The second document now has a direct access for the user, so only the first is returned
    assert get_visited_document_ids_of(queryset, user) == [str(doc1.pk)]

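A plausible shape for get_visited_document_ids_of(), reverse-engineered from these assertions (the related names link_traces and accesses are assumptions; the deleted_at filters match the following test):

# Hypothetical sketch of get_visited_document_ids_of()
def get_visited_document_ids_of(queryset, user):
    """Ids of documents the user reached via a link but holds no access row for."""
    if user.is_anonymous:
        return []
    return [
        str(pk)
        for pk in queryset.filter(
            link_traces__user=user,
            link_traces__is_masked=False,
            deleted_at__isnull=True,
            ancestors_deleted_at__isnull=True,
        )
        .exclude(accesses__user=user)
        .values_list("pk", flat=True)
    ]
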
@pytest.mark.usefixtures("indexer_settings")
def test_get_visited_document_ids_of_deleted():
    """
    get_visited_document_ids_of() returns the ids of the documents viewed
    by the user only if they are not deleted.
    """
    user = factories.UserFactory()
    anonymous = AnonymousUser()
    queryset = models.Document.objects.all()

    assert not get_visited_document_ids_of(queryset, anonymous)
    assert not get_visited_document_ids_of(queryset, user)

    doc = factories.DocumentFactory()
    doc_deleted = factories.DocumentFactory()
    doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted)

    create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)

    create_link(document=doc)
    create_link(document=doc_deleted)
    create_link(document=doc_ancestor_deleted)

    # All documents are visited
    assert sorted(get_visited_document_ids_of(queryset, user)) == sorted(
        [str(doc.pk), str(doc_deleted.pk), str(doc_ancestor_deleted.pk)]
    )

    doc_deleted.soft_delete()

    # Only the first document is not deleted
    assert get_visited_document_ids_of(queryset, user) == [str(doc.pk)]

@responses.activate
def test_services_search_indexers_search_errors(indexer_settings):
    """
    Document search should propagate HTTP errors from the Find API.
    """
    factories.DocumentFactory()

    indexer_settings.SEARCH_INDEXER_QUERY_URL = (
        "http://app-find/api/v1.0/documents/search/"
    )

    responses.add(
        responses.POST,
        "http://app-find/api/v1.0/documents/search/",
        status=401,
        body=json_dumps({"message": "Authentication failed."}),
    )

    with pytest.raises(HTTPError):
        SearchIndexer().search("alpha", token="mytoken")

@patch("requests.post")
def test_services_search_indexers_search(mock_post, indexer_settings):
    """
    search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the
    document ids from linktraces.
    """
    user = factories.UserFactory()
    indexer = SearchIndexer()

    mock_response = mock_post.return_value
    mock_response.raise_for_status.return_value = None  # No error

    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)

    create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)

    create_link(document=doc1)
    create_link(document=doc2)

    visited = get_visited_document_ids_of(models.Document.objects.all(), user)

    indexer.search("alpha", visited=visited, token="mytoken")

    args, kwargs = mock_post.call_args

    assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL

    query_data = kwargs.get("json")
    assert query_data["q"] == "alpha"
    assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)])
    assert query_data["services"] == ["docs"]
    assert query_data["nb_results"] == 50
    assert query_data["order_by"] == "updated_at"
    assert query_data["order_direction"] == "desc"

    assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"}
    assert kwargs.get("timeout") == 10

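So a Find query issued on behalf of a user carries both the fulltext term and the ids of link-visited documents; concretely, the body pinned by these assertions is:

# The JSON body POSTed to SEARCH_INDEXER_QUERY_URL (illustrative ids)
{
    "q": "alpha",
    "visited": ["<doc1 uuid>", "<doc2 uuid>"],  # from the user's LinkTraces
    "services": ["docs"],
    "nb_results": 50,  # SEARCH_INDEXER_QUERY_LIMIT default
    "order_by": "updated_at",
    "order_direction": "desc",
}
# sent with headers {"Authorization": "Bearer <user token>"} and timeout=10
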
@patch("requests.post")
def test_services_search_indexers_search_nb_results(mock_post, indexer_settings):
    """
    Find API call should have nb_results == SEARCH_INDEXER_QUERY_LIMIT
    or the given nb_results argument.
    """
    indexer_settings.SEARCH_INDEXER_QUERY_LIMIT = 25

    user = factories.UserFactory()
    indexer = SearchIndexer()

    mock_response = mock_post.return_value
    mock_response.raise_for_status.return_value = None  # No error

    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)

    create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)

    create_link(document=doc1)
    create_link(document=doc2)

    visited = get_visited_document_ids_of(models.Document.objects.all(), user)

    indexer.search("alpha", visited=visited, token="mytoken")

    args, kwargs = mock_post.call_args

    assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL
    assert kwargs.get("json")["nb_results"] == 25

    # The argument overrides the setting value
    indexer.search("alpha", visited=visited, token="mytoken", nb_results=109)

    args, kwargs = mock_post.call_args

    assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL
    assert kwargs.get("json")["nb_results"] == 109
@@ -75,3 +75,28 @@ def test_utils_extract_attachments():
    base64_string = base64.b64encode(update).decode("utf-8")
    # image_key2 is missing the "/media/" part and shouldn't get extracted
    assert utils.extract_attachments(base64_string) == [image_key1, image_key3]


def test_utils_get_ancestor_to_descendants_map_single_path():
    """Test ancestor mapping of a single path."""
    paths = ["000100020005"]
    result = utils.get_ancestor_to_descendants_map(paths, steplen=4)

    assert result == {
        "0001": {"000100020005"},
        "00010002": {"000100020005"},
        "000100020005": {"000100020005"},
    }


def test_utils_get_ancestor_to_descendants_map_multiple_paths():
    """Test ancestor mapping of multiple paths with shared prefixes."""
    paths = ["000100020005", "00010003"]
    result = utils.get_ancestor_to_descendants_map(paths, steplen=4)

    assert result == {
        "0001": {"000100020005", "00010003"},
        "00010002": {"000100020005"},
        "000100020005": {"000100020005"},
        "00010003": {"00010003"},
    }

@@ -2,6 +2,7 @@

import base64
import re
from collections import defaultdict

import pycrdt
from bs4 import BeautifulSoup
@@ -9,6 +10,27 @@ from bs4 import BeautifulSoup
from core import enums


def get_ancestor_to_descendants_map(paths, steplen):
    """
    Given a list of document paths, return a mapping of ancestor_path -> set of descendant_paths.

    Each path is assumed to use materialized path format with fixed-length segments.

    Args:
        paths (list of str): List of full document paths.
        steplen (int): Length of each path segment.

    Returns:
        dict[str, set[str]]: Mapping from ancestor path to its descendant paths (including itself).
    """
    ancestor_map = defaultdict(set)
    for path in paths:
        for i in range(steplen, len(path) + 1, steplen):
            ancestor = path[:i]
            ancestor_map[ancestor].add(path)
    return ancestor_map


def filter_descendants(paths, root_paths, skip_sorting=False):
    """
    Filters paths to keep only those that are descendants of any path in root_paths.

@@ -1,16 +1,19 @@
# ruff: noqa: S311, S106
"""create_demo management command"""

import base64
import logging
import math
import random
import time
from collections import defaultdict
from uuid import uuid4

from django import db
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

import pycrdt
from faker import Faker

from core import models
@@ -27,6 +30,16 @@ def random_true_with_probability(probability):
    return random.random() < probability


def get_ydoc_for_text(text):
    """Return a ydoc from plain text for demo purposes."""
    ydoc = pycrdt.Doc()
    paragraph = pycrdt.XmlElement("p", {}, [pycrdt.XmlText(text)])
    fragment = pycrdt.XmlFragment([paragraph])
    ydoc["document-store"] = fragment
    update = ydoc.get_update()
    return base64.b64encode(update).decode("utf-8")


class BulkQueue:
    """A utility class to create Django model instances in bulk by just pushing to a queue."""

@@ -48,7 +61,7 @@ class BulkQueue:
        self.queue[objects[0]._meta.model.__name__] = []  # noqa: SLF001

    def push(self, obj):
        """Add a model instance to queue to that it gets created in bulk."""
        """Add a model instance to queue so that it gets created in bulk."""
        objects = self.queue[obj._meta.model.__name__]  # noqa: SLF001
        objects.append(obj)
        if len(objects) > self.BATCH_SIZE:
@@ -139,17 +152,19 @@ def create_demo(stdout):
        # pylint: disable=protected-access
        key = models.Document._int2str(i)  # noqa: SLF001
        padding = models.Document.alphabet[0] * (models.Document.steplen - len(key))
        queue.push(
            models.Document(
                depth=1,
                path=f"{padding}{key}",
                creator_id=random.choice(users_ids),
                title=fake.sentence(nb_words=4),
                link_reach=models.LinkReachChoices.AUTHENTICATED
                if random_true_with_probability(0.5)
                else random.choice(models.LinkReachChoices.values),
            )
        )
        title = fake.sentence(nb_words=4)
        document = models.Document(
            id=uuid4(),
            depth=1,
            path=f"{padding}{key}",
            creator_id=random.choice(users_ids),
            title=title,
            link_reach=models.LinkReachChoices.AUTHENTICATED
            if random_true_with_probability(0.5)
            else random.choice(models.LinkReachChoices.values),
        )
        document.save_content(get_ydoc_for_text(f"Content for {title:s}"))
        queue.push(document)

    queue.flush()

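get_ydoc_for_text() gives demo documents a real Yjs body, so the new indexer has text to extract. It should round-trip with the utils.base64_yjs_to_text helper used by serialize_document (a sketch; the exact whitespace handling of the extractor is assumed):

# Sketch: demo content should round-trip through the text extractor
encoded = get_ydoc_for_text("Content for Lorem ipsum")
assert "Content for Lorem ipsum" in utils.base64_yjs_to_text(encoded)
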
@@ -99,6 +99,31 @@ class Base(Configuration):
    }
    DEFAULT_AUTO_FIELD = "django.db.models.AutoField"

    # Search
    SEARCH_INDEXER_CLASS = values.Value(
        default=None,
        environ_name="SEARCH_INDEXER_CLASS",
        environ_prefix=None,
    )
    SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue(
        default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None
    )
    SEARCH_INDEXER_URL = values.Value(
        default=None, environ_name="SEARCH_INDEXER_URL", environ_prefix=None
    )
    SEARCH_INDEXER_COUNTDOWN = values.IntegerValue(
        default=1, environ_name="SEARCH_INDEXER_COUNTDOWN", environ_prefix=None
    )
    SEARCH_INDEXER_SECRET = values.Value(
        default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None
    )
    SEARCH_INDEXER_QUERY_URL = values.Value(
        default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None
    )
    SEARCH_INDEXER_QUERY_LIMIT = values.PositiveIntegerValue(
        default=50, environ_name="SEARCH_INDEXER_QUERY_LIMIT", environ_prefix=None
    )

    # Static files (CSS, JavaScript, Images)
    STATIC_URL = "/static/"
    STATIC_ROOT = os.path.join(DATA_DIR, "static")

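Since every value uses environ_prefix=None, deployments configure the indexer through bare environment variables. A plausible development configuration, with URLs borrowed from the tests (the secret is a placeholder):

# e.g. in env.d/development/common.local, illustrative values only
SEARCH_INDEXER_CLASS=core.services.search_indexers.SearchIndexer
SEARCH_INDEXER_URL=http://app-find/api/v1.0/documents/index/
SEARCH_INDEXER_QUERY_URL=http://app-find/api/v1.0/documents/search/
SEARCH_INDEXER_SECRET=<shared-secret-with-find>
SEARCH_INDEXER_BATCH_SIZE=100000
SEARCH_INDEXER_COUNTDOWN=1
SEARCH_INDEXER_QUERY_LIMIT=50
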
@@ -44,7 +44,16 @@ export const constructParams = (params: DocsParams): URLSearchParams => {
export type DocsResponse = APIList<Doc>;
export const getDocs = async (params: DocsParams): Promise<DocsResponse> => {
  const searchParams = constructParams(params);
  const response = await fetchAPI(`documents/?${searchParams.toString()}`);
  let response;

  // HACK for fulltext search feature
  if (searchParams.has('title')) {
    searchParams.set('q', searchParams.get('title') || '');
    searchParams.delete('title');
    response = await fetchAPI(`documents/search?${searchParams.toString()}`);
  } else {
    response = await fetchAPI(`documents/?${searchParams.toString()}`);
  }

  if (!response.ok) {
    throw new APIError('Failed to get the docs', await errorCauses(response));
Block a user