Compare commits

...

1 Commits

Author SHA1 Message Date
Manuel Raynaud
bae30152c6 🐛(backend) manage race condition between GET and PATCH content
When a PATCH and a GET on the content endpoint are made at the same time
for different users a race condition can happen and the metadata returned
by the S3 head_object can be outdated when the object is fetched leading
to an error raised because the Content-Length header does not match the
size of the response body. To avoid this, we no longer use head_object
followed by get_object; instead we manage
everything in one call with the get_object. The get_object also accepts
as parameters an etag or last-modified header and will return a 304 if
the content has not changed, so we can use this to not return the entire
body if this one has not changed.
2026-05-05 15:42:24 +02:00
4 changed files with 171 additions and 66 deletions

View File

@@ -10,6 +10,10 @@ and this project adheres to
- 🐛(frontend) sanitize pasted and dropped content in document title #2210
### Fixed
- 🐛(backend) manage race condition between GET and PATCH content
## [v5.0.0] - 2026-04-08
### Added

View File

@@ -1,5 +1,6 @@
"""Util to generate S3 authorization headers for object storage access control"""
import datetime as dt
import time
from abc import ABC, abstractmethod
@@ -199,3 +200,31 @@ class AIUserRateThrottle(AIBaseRateThrottle):
def get_content_metadata_cache_key(document_id):
    """Build the cache key under which a document's content metadata is stored."""
    return "docs:content-metadata:%s" % document_id
def parse_http_conditional_headers(request):
    """Extract and normalize `If-None-Match` and `If-Modified-Since`.

    The `W/` weak prefix is stripped from the ETag because reverse proxies
    (e.g. nginx with gzip) rewrite strong ETags into weak ones, which would
    otherwise break a strict equality check in production.
    """
    etag = request.META.get("HTTP_IF_NONE_MATCH")
    if etag is not None and etag.startswith("W/"):
        # Drop the weak-validator marker so comparisons use the raw ETag.
        etag = etag[2:]

    modified_since = None
    raw_date = request.META.get("HTTP_IF_MODIFIED_SINCE")
    if raw_date:
        try:
            parsed = dt.datetime.strptime(raw_date, "%a, %d %b %Y %H:%M:%S %Z")
        except ValueError:
            # Unparsable dates are ignored rather than rejected, per RFC 9110.
            parsed = None
        if parsed is not None and parsed.tzinfo is None:
            # strptime's %Z does not populate tzinfo; HTTP dates are GMT/UTC.
            parsed = parsed.replace(tzinfo=dt.timezone.utc)
        modified_since = parsed
    return etag, modified_since

View File

@@ -1941,11 +1941,12 @@ class DocumentViewSet(
Retrieve the raw content file from s3 and stream it.
We implement a HTTP cache based on the ETag and LastModified headers.
We retrieve the ETag and LastModified from the S3 head operation, save them in cache to
reuse them in future requests.
The ETag and LastModified are retrieved in the S3 get_object operation to be consistent with
the content Body retrieved at the same time. These metadata are saved in cache for
future requests.
We check in the request if the ETag is present in the If-None-Match header and if it's the
same as the one from the S3 head operation, we return a 304 response.
If the ETag is not present or not the same, we do the same check based on the LastModifed
same as the one from the S3 get_object, we return a 304 response.
If the ETag is not present or not the same, we do the same check based on the LastModified
value if present in the If-Modified-Since header.
"""
document = self.get_object()
@@ -1955,73 +1956,69 @@ class DocumentViewSet(
# the web-socket re-connection burst.
connection.close()
if not (
content_metadata := cache.get(
utils.get_content_metadata_cache_key(document.id)
if_none_match, if_modified_since_dt = utils.parse_http_conditional_headers(
request
)
# First check if a cache is existing to return earlier a 304 without reaching s3
# if etag or last_modified have not changed.
cache_key = utils.get_content_metadata_cache_key(document.id)
if content_metadata := cache.get(cache_key):
if if_none_match and if_none_match == content_metadata.get("etag"):
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
if (
if_modified_since_dt
and dt.datetime.fromisoformat(content_metadata.get("last_modified"))
<= if_modified_since_dt
):
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
# Prepare get_object S3 operation. The get_object manages ETag and last_modified
# headers and will raise a 304 client error if one of them matches the value existing in
# S3.
get_kwargs = {"Bucket": default_storage.bucket_name, "Key": document.file_key}
if if_none_match:
get_kwargs["IfNoneMatch"] = if_none_match
if if_modified_since_dt:
get_kwargs["IfModifiedSince"] = if_modified_since_dt
try:
s3_response = default_storage.connection.meta.client.get_object(
**get_kwargs
)
):
except ClientError as exc:
code = exc.response["Error"]["Code"]
if code in ("304", "PreconditionFailed", "NotModified"):
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
if code in ("NoSuchKey", "404"):
return StreamingHttpResponse(b"", content_type="text/plain", status=200)
raise
last_modified = s3_response["LastModified"]
etag = s3_response["ETag"]
size = s3_response["ContentLength"]
# Refresh the metadata cache so future conditional requests can
# check them earlier
cache.set(
cache_key,
{
"last_modified": last_modified.isoformat(),
"etag": etag,
},
settings.CONTENT_METADATA_CACHE_TIMEOUT,
)
def _stream(body):
try:
file_metadata = default_storage.connection.meta.client.head_object(
Bucket=default_storage.bucket_name, Key=document.file_key
)
except ClientError:
return StreamingHttpResponse(
b"", content_type="text/plain", status=status.HTTP_200_OK
)
last_modified = file_metadata["LastModified"]
etag = file_metadata["ETag"]
size = file_metadata["ContentLength"]
cache.set(
utils.get_content_metadata_cache_key(document.id),
{
"last_modified": last_modified.isoformat(),
"etag": etag,
"size": size,
},
settings.CONTENT_METADATA_CACHE_TIMEOUT,
)
else:
last_modified = dt.datetime.fromisoformat(
content_metadata.get("last_modified")
)
etag = content_metadata.get("etag")
size = content_metadata.get("size")
# --- Check conditional headers from any client ---
if_none_match = request.META.get("HTTP_IF_NONE_MATCH") # contains ETag
if_modified_since = request.META.get("HTTP_IF_MODIFIED_SINCE")
# Strip the W/ weak prefix. Proxies (e.g. nginx with gzip) convert strong
# ETags to weak ones, so a strict equality check would fail on production
# even when unchanged.
if if_none_match and if_none_match.startswith("W/"):
if_none_match = if_none_match.removeprefix("W/")
if if_none_match and if_none_match == etag:
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
if if_modified_since:
try:
since = dt.datetime.strptime(
if_modified_since, "%a, %d %b %Y %H:%M:%S %Z"
)
except ValueError:
pass
else:
if not since.tzinfo:
since = since.replace(tzinfo=dt.timezone.utc)
if last_modified <= since:
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
def _stream(file_key):
with default_storage.open(file_key, "rb") as f:
while chunk := f.read(8192):
while chunk := body.read(8192):
yield chunk
finally:
body.close()
response = StreamingHttpResponse(
streaming_content=_stream(document.file_key),
streaming_content=_stream(s3_response["Body"]),
content_type="text/plain",
status=status.HTTP_200_OK,
)

View File

@@ -0,0 +1,75 @@
"""
Unit tests for the parse_http_conditional_headers utility function.
"""
import datetime as dt
from rest_framework.test import APIRequestFactory
from core.api.utils import parse_http_conditional_headers
def _request(**headers):
    """Build a GET request carrying the given HTTP headers."""
    factory = APIRequestFactory()
    return factory.get("/", headers=headers)
def test_api_utils_parse_http_conditional_headers_no_headers():
    """Without conditional headers, both values should be None."""
    result = parse_http_conditional_headers(_request())
    assert result == (None, None)
def test_api_utils_parse_http_conditional_headers_strong_etag():
    """A strong ETag should be returned unchanged."""
    request = _request(**{"if-none-match": '"abc123"'})
    etag, _ = parse_http_conditional_headers(request)
    assert etag == '"abc123"'
def test_api_utils_parse_http_conditional_headers_weak_etag():
    """The W/ weak prefix should be stripped from the ETag."""
    request = _request(**{"if-none-match": 'W/"abc123"'})
    etag, _ = parse_http_conditional_headers(request)
    assert etag == '"abc123"'
def test_api_utils_parse_http_conditional_headers_valid_if_modified_since():
    """A valid RFC 1123 If-Modified-Since header should be parsed as tz-aware UTC.

    Python's strptime parses ``%Z`` for "GMT"/"UTC" but does not populate
    ``tzinfo``; this test therefore also exercises the UTC fallback branch.
    """
    request = _request(**{"if-modified-since": "Wed, 21 Oct 2015 07:28:00 GMT"})
    _, parsed = parse_http_conditional_headers(request)
    expected = dt.datetime(2015, 10, 21, 7, 28, 0, tzinfo=dt.timezone.utc)
    assert parsed == expected
def test_api_utils_parse_http_conditional_headers_invalid_if_modified_since():
    """An unparsable If-Modified-Since should yield None instead of raising."""
    request = _request(**{"if-modified-since": "not-a-date"})
    _, parsed = parse_http_conditional_headers(request)
    assert parsed is None
def test_api_utils_parse_http_conditional_headers_both_headers():
    """Both If-None-Match and If-Modified-Since should be parsed independently."""
    request = _request(
        **{
            "if-none-match": 'W/"deadbeef"',
            "if-modified-since": "Wed, 21 Oct 2015 07:28:00 GMT",
        }
    )
    etag, parsed = parse_http_conditional_headers(request)
    assert etag == '"deadbeef"'
    assert parsed == dt.datetime(2015, 10, 21, 7, 28, 0, tzinfo=dt.timezone.utc)