Compare commits

...

1 Commits

Author SHA1 Message Date
Manuel Raynaud
bae30152c6 🐛(backend) manage race condition between GET and PATCH content
When a PATCH and a GET on the content endpoint are made at the same time
for different users a race condition can happen and the metadata returned
by the S3 head_object can be outdated when the object is fetched leading
to an error raised because the Content-Length header does not match the
size of the response body. To avoid this, we no longer use head_object
followed by get_object; instead we manage
everything in one call with the get_object. The get_object also accepts
as parameters an etag or last-modified header and will return a 304 if
the content has not changed, so we can use this to not return the entire
body if this one has not changed.
2026-05-05 15:42:24 +02:00
4 changed files with 171 additions and 66 deletions

View File

@@ -10,6 +10,10 @@ and this project adheres to
- 🐛(frontend) sanitize pasted and dropped content in document title #2210
### Fixed
- 🐛(backend) manage race condition between GET and PATCH content
## [v5.0.0] - 2026-04-08
### Added

View File

@@ -1,5 +1,6 @@
"""Util to generate S3 authorization headers for object storage access control"""
import datetime as dt
import time
from abc import ABC, abstractmethod
@@ -199,3 +200,31 @@ class AIUserRateThrottle(AIBaseRateThrottle):
def get_content_metadata_cache_key(document_id):
    """Build the cache key under which a document's content metadata is stored."""
    return "docs:content-metadata:%s" % document_id
def parse_http_conditional_headers(request):
    """Extract and normalize `If-None-Match` and `If-Modified-Since`.

    The `W/` weak prefix is stripped from the ETag because reverse proxies
    (e.g. nginx with gzip) rewrite strong ETags into weak ones, which would
    otherwise break a strict equality check in production.
    """
    etag = request.META.get("HTTP_IF_NONE_MATCH")
    if etag is not None and etag.startswith("W/"):
        # Drop the weak-validator marker so comparisons use the raw ETag.
        etag = etag[2:]

    modified_since = None
    raw_date = request.META.get("HTTP_IF_MODIFIED_SINCE")
    if raw_date:
        try:
            parsed = dt.datetime.strptime(raw_date, "%a, %d %b %Y %H:%M:%S %Z")
        except ValueError:
            # Unparsable dates are ignored rather than rejected, per RFC 9110.
            parsed = None
        if parsed is not None and parsed.tzinfo is None:
            # strptime's %Z does not populate tzinfo; HTTP dates are GMT/UTC.
            parsed = parsed.replace(tzinfo=dt.timezone.utc)
        modified_since = parsed
    return etag, modified_since

View File

@@ -1941,11 +1941,12 @@ class DocumentViewSet(
Retrieve the raw content file from s3 and stream it.
We implement a HTTP cache based on the ETag and LastModified headers.
We retrieve the ETag and LastModified from the S3 head operation, save them in cache to
reuse them in future requests.
The ETag and LastModified are retrieved in the S3 get_object operation to be consistent with
the content Body retrieved at the same time. These metadata are saved in cache for
future requests.
We check in the request if the ETag is present in the If-None-Match header and if it's the
same as the one from the S3 head operation, we return a 304 response.
If the ETag is not present or not the same, we do the same check based on the LastModifed
same as the one from the S3 get_object, we return a 304 response.
If the ETag is not present or not the same, we do the same check based on the LastModified
value if present in the If-Modified-Since header.
"""
document = self.get_object()
@@ -1955,73 +1956,69 @@ class DocumentViewSet(
# the web-socket re-connection burst.
connection.close()
if not (
content_metadata := cache.get(
utils.get_content_metadata_cache_key(document.id)
if_none_match, if_modified_since_dt = utils.parse_http_conditional_headers(
request
)
# First check if a cache is existing to return earlier a 304 without reaching s3
# if etag or last_modified have not changed.
cache_key = utils.get_content_metadata_cache_key(document.id)
if content_metadata := cache.get(cache_key):
if if_none_match and if_none_match == content_metadata.get("etag"):
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
if (
if_modified_since_dt
and dt.datetime.fromisoformat(content_metadata.get("last_modified"))
<= if_modified_since_dt
):
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
# Prepare get_object S3 operation. The get_object manages ETag and last_modified
# headers and will raise a 304 client error if one of them matches the value existing in
# S3.
get_kwargs = {"Bucket": default_storage.bucket_name, "Key": document.file_key}
if if_none_match:
get_kwargs["IfNoneMatch"] = if_none_match
if if_modified_since_dt:
get_kwargs["IfModifiedSince"] = if_modified_since_dt
try:
s3_response = default_storage.connection.meta.client.get_object(
**get_kwargs
)
):
except ClientError as exc:
code = exc.response["Error"]["Code"]
if code in ("304", "PreconditionFailed", "NotModified"):
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
if code in ("NoSuchKey", "404"):
return StreamingHttpResponse(b"", content_type="text/plain", status=200)
raise
last_modified = s3_response["LastModified"]
etag = s3_response["ETag"]
size = s3_response["ContentLength"]
# Refresh the metadata cache so future conditional requests can
# check them earlier
cache.set(
cache_key,
{
"last_modified": last_modified.isoformat(),
"etag": etag,
},
settings.CONTENT_METADATA_CACHE_TIMEOUT,
)
def _stream(body):
try:
file_metadata = default_storage.connection.meta.client.head_object(
Bucket=default_storage.bucket_name, Key=document.file_key
)
except ClientError:
return StreamingHttpResponse(
b"", content_type="text/plain", status=status.HTTP_200_OK
)
last_modified = file_metadata["LastModified"]
etag = file_metadata["ETag"]
size = file_metadata["ContentLength"]
cache.set(
utils.get_content_metadata_cache_key(document.id),
{
"last_modified": last_modified.isoformat(),
"etag": etag,
"size": size,
},
settings.CONTENT_METADATA_CACHE_TIMEOUT,
)
else:
last_modified = dt.datetime.fromisoformat(
content_metadata.get("last_modified")
)
etag = content_metadata.get("etag")
size = content_metadata.get("size")
# --- Check conditional headers from any client ---
if_none_match = request.META.get("HTTP_IF_NONE_MATCH") # contains ETag
if_modified_since = request.META.get("HTTP_IF_MODIFIED_SINCE")
# Strip the W/ weak prefix. Proxies (e.g. nginx with gzip) convert strong
# ETags to weak ones, so a strict equality check would fail on production
# even when unchanged.
if if_none_match and if_none_match.startswith("W/"):
if_none_match = if_none_match.removeprefix("W/")
if if_none_match and if_none_match == etag:
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
if if_modified_since:
try:
since = dt.datetime.strptime(
if_modified_since, "%a, %d %b %Y %H:%M:%S %Z"
)
except ValueError:
pass
else:
if not since.tzinfo:
since = since.replace(tzinfo=dt.timezone.utc)
if last_modified <= since:
return drf_response.Response(status=status.HTTP_304_NOT_MODIFIED)
def _stream(file_key):
with default_storage.open(file_key, "rb") as f:
while chunk := f.read(8192):
while chunk := body.read(8192):
yield chunk
finally:
body.close()
response = StreamingHttpResponse(
streaming_content=_stream(document.file_key),
streaming_content=_stream(s3_response["Body"]),
content_type="text/plain",
status=status.HTTP_200_OK,
)

View File

@@ -0,0 +1,75 @@
"""
Unit tests for the parse_http_conditional_headers utility function.
"""
import datetime as dt
from rest_framework.test import APIRequestFactory
from core.api.utils import parse_http_conditional_headers
def _request(**headers):
    """Build a GET request carrying the given HTTP headers."""
    factory = APIRequestFactory()
    return factory.get("/", headers=headers)
def test_api_utils_parse_http_conditional_headers_no_headers():
    """Without conditional headers, both values should be None."""
    result = parse_http_conditional_headers(_request())
    assert result == (None, None)
def test_api_utils_parse_http_conditional_headers_strong_etag():
    """A strong ETag should be returned unchanged."""
    request = _request(**{"if-none-match": '"abc123"'})
    etag, _ = parse_http_conditional_headers(request)
    assert etag == '"abc123"'
def test_api_utils_parse_http_conditional_headers_weak_etag():
    """The W/ weak prefix should be stripped from the ETag."""
    request = _request(**{"if-none-match": 'W/"abc123"'})
    etag, _ = parse_http_conditional_headers(request)
    assert etag == '"abc123"'
def test_api_utils_parse_http_conditional_headers_valid_if_modified_since():
    """A valid RFC 1123 If-Modified-Since header should be parsed as tz-aware UTC.

    Python's strptime parses ``%Z`` for "GMT"/"UTC" but does not populate
    ``tzinfo``; this test therefore also exercises the UTC fallback branch.
    """
    request = _request(**{"if-modified-since": "Wed, 21 Oct 2015 07:28:00 GMT"})
    _, parsed = parse_http_conditional_headers(request)
    expected = dt.datetime(2015, 10, 21, 7, 28, 0, tzinfo=dt.timezone.utc)
    assert parsed == expected
def test_api_utils_parse_http_conditional_headers_invalid_if_modified_since():
    """An unparsable If-Modified-Since should yield None instead of raising."""
    request = _request(**{"if-modified-since": "not-a-date"})
    _, parsed = parse_http_conditional_headers(request)
    assert parsed is None
def test_api_utils_parse_http_conditional_headers_both_headers():
    """Both If-None-Match and If-Modified-Since should be parsed independently."""
    request = _request(
        **{
            "if-none-match": 'W/"deadbeef"',
            "if-modified-since": "Wed, 21 Oct 2015 07:28:00 GMT",
        }
    )
    etag, parsed = parse_http_conditional_headers(request)
    assert etag == '"deadbeef"'
    assert parsed == dt.datetime(2015, 10, 21, 7, 28, 0, tzinfo=dt.timezone.utc)