🚸(backend) make document search on title accent-insensitive

This should work in both cases:
- search for "vélo" when the document title contains "velo"
- search for "velo" when the document title contains "vélo"
This commit is contained in:
Samuel Paccoud - DINUM
2025-04-15 12:04:59 +02:00
parent d85d20ff8f
commit d76bd8175e
3 changed files with 83 additions and 7 deletions

View File

@@ -1,5 +1,8 @@
"""API filters for Impress' core application."""
import unicodedata

from django.db.models import CharField, Func, Value
from django.utils.translation import gettext_lazy as _

import django_filters
@@ -7,12 +10,64 @@ import django_filters
from core import models
class DocumentFilter(django_filters.FilterSet):
def remove_accents(value: str) -> str:
    """
    Remove accents from a string (vélo -> velo).

    The string is decomposed (NFD) so that every accent becomes a separate
    combining character, the nonspacing marks (Unicode category "Mn") are
    dropped, and the remainder is recomposed (NFC).

    Args:
        value: The text to strip accents from.

    Returns:
        The text without nonspacing marks, in NFC form.
    """
    decomposed = unicodedata.normalize("NFD", value)
    stripped = "".join(
        char for char in decomposed if unicodedata.category(char) != "Mn"
    )
    # NFD also splits characters that carry no accent at all (for example
    # Hangul syllables into jamo). Recompose so that such text is returned
    # unchanged instead of in a decomposed form that would not compare equal
    # to the same text in its usual composed form.
    return unicodedata.normalize("NFC", stripped)
# pylint: disable=abstract-method
class Unaccent(Func):
    """
    PostgreSQL unaccent function wrapper for use in Django ORM queries.

    This allows you to annotate a field using the unaccented version of a
    text column, enabling accent-insensitive filtering.
    """

    function = "unaccent"
    # The single argument is cast to text before being handed to unaccent().
    template = "unaccent(%(expressions)s::text)"
    output_field = CharField()
    # The template casts the whole argument list at once, so it is only valid
    # for one expression: make Django reject any other number of arguments.
    arity = 1
class AccentInsensitiveCharFilter(django_filters.CharFilter):
    """
    A custom CharFilter that performs accent-insensitive filtering.

    Both the filtered column and the searched value are passed through
    PostgreSQL's `unaccent` extension function before the lookup expression
    is applied. Case sensitivity is decided by the lookup expression itself
    (e.g. `icontains`), not by this filter.
    """

    def filter(self, qs, value):
        """
        Apply the filter to the queryset using the unaccented version of the field.

        Args:
            qs: The queryset to filter.
            value: The value to search for in the unaccented field.

        Returns:
            The queryset unchanged when `value` is empty, otherwise the
            queryset annotated with `unaccented_<field_name>` and filtered
            (or excluded, when the filter was declared with `exclude=True`)
            on that annotation.
        """
        if not value:
            return qs

        if self.distinct:
            qs = qs.distinct()

        annotated_field = f"unaccented_{self.field_name}"
        # Normalise the searched value with the same database function as the
        # column. The Python helper and PostgreSQL's unaccent rules do not
        # agree on every character (ligatures such as "œ" are left untouched
        # by Unicode decomposition), and a value normalised differently from
        # the column could never match it.
        needle = Unaccent(Value(remove_accents(value)))
        annotated = qs.annotate(**{annotated_field: Unaccent(self.field_name)})
        # get_method() returns qs.exclude or qs.filter depending on `exclude`,
        # exactly like the base django-filter implementation.
        return self.get_method(annotated)(
            **{f"{annotated_field}__{self.lookup_expr}": needle}
        )
class DocumentFilter(django_filters.FilterSet):
    """
    Filter set for documents: search on the title, ignoring accents and case.
    """

    # Substring search on the title, exposed as the "title" query parameter.
    title = AccentInsensitiveCharFilter(
        label=_("Title"),
        field_name="title",
        lookup_expr="icontains",
    )

View File

@@ -0,0 +1,14 @@
from django.db import migrations
class Migration(migrations.Migration):
    """Install the PostgreSQL `unaccent` extension in the database."""

    dependencies = [
        ("core", "0020_remove_is_public_add_field_attachments_and_duplicated_from"),
    ]

    operations = [
        # Forward: create the extension only if it is not there yet.
        # Backward: drop it again, tolerating its absence.
        migrations.RunSQL(
            sql="CREATE EXTENSION IF NOT EXISTS unaccent;",
            reverse_sql="DROP EXTENSION IF EXISTS unaccent;",
        ),
    ]

View File

@@ -7,6 +7,7 @@ from faker import Faker
from rest_framework.test import APIClient
from core import factories
from core.api.filters import remove_accents
fake = Faker()
pytestmark = pytest.mark.django_db
@@ -49,14 +50,16 @@ def test_api_documents_descendants_filter_unknown_field():
[
("Project Alpha", 1), # Exact match
("project", 2), # Partial match (case-insensitive)
("Guide", 1), # Word match within a title
("Guide", 2), # Word match within a title
("Special", 0), # No match (nonexistent keyword)
("2024", 2), # Match by numeric keyword
("", 5), # Empty string
("", 6), # Empty string
("velo", 1), # Accent-insensitive match (velo vs vélo)
("bêta", 1), # Accent-insensitive match (bêta vs beta)
],
)
def test_api_documents_descendants_filter_title(query, nb_results):
"""Authenticated users should be able to search documents by their title."""
"""Authenticated users should be able to search documents by their unaccented title."""
user = factories.UserFactory()
client = APIClient()
client.force_login(user)
@@ -70,6 +73,7 @@ def test_api_documents_descendants_filter_title(query, nb_results):
"User Guide",
"Financial Report 2024",
"Annual Review 2024",
"Guide du vélo urbain", # <-- Title with accent for accent-insensitive test
]
for title in titles:
factories.DocumentFactory(title=title, parent=document)
@@ -85,4 +89,7 @@ def test_api_documents_descendants_filter_title(query, nb_results):
# Ensure all results contain the query in their title
for result in results:
assert query.lower().strip() in result["title"].lower()
assert (
remove_accents(query).lower().strip()
in remove_accents(result["title"]).lower()
)