From ef621e000a0c3321b13ad1162d84b6bd2240161d Mon Sep 17 00:00:00 2001 From: Alex Newman Date: Sun, 19 Apr 2026 17:08:25 -0700 Subject: [PATCH] fix: address coderabbit review on PR #2076 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Actionable (4): - Dockerfile uv install: wrap `chmod ... || true` in braces so the trailing `|| true` no longer masks failures from `curl|sh` via bash operator precedence (&& binds tighter than ||). Applied to both docker/claude-mem/ and evals/swebench/Dockerfile.agent. Added `set -eux` to the RUN lines. - docker/claude-mem/Dockerfile: drop unused `sudo` apt package (~2 MB). - run-batch.py: name each agent container (`swebench-agent---`) and force-remove via `docker rm -f ` in the TimeoutExpired handler so timed-out runs don't leave orphan containers. Nitpicks (2): - smoke-test.sh: collapse 3 python3 invocations into 1 — parse the instance JSON once, print `repo base_commit`, and write problem.txt in the same call. - run-instance.sh: shallow clone via `--depth 1 --no-single-branch` + `fetch --depth 1 origin $BASE_COMMIT`. Falls back to a full clone if the server rejects the by-commit fetch. Co-Authored-By: Claude Opus 4.7 (1M context) --- docker/claude-mem/Dockerfile | 9 ++++++--- evals/swebench/Dockerfile.agent | 8 ++++++-- evals/swebench/run-batch.py | 17 ++++++++++++++++- evals/swebench/run-instance.sh | 13 +++++++++++-- evals/swebench/smoke-test.sh | 18 ++++++++++++------ 5 files changed, 51 insertions(+), 14 deletions(-) diff --git a/docker/claude-mem/Dockerfile b/docker/claude-mem/Dockerfile index 5ef1c882..ce896dc9 100644 --- a/docker/claude-mem/Dockerfile +++ b/docker/claude-mem/Dockerfile @@ -30,7 +30,6 @@ RUN apt-get update \ ca-certificates \ unzip \ jq \ - sudo \ less \ procps \ uuid-runtime \ @@ -49,8 +48,12 @@ ENV PATH="/usr/local/bun/bin:${PATH}" # Versioned installer URL per https://docs.astral.sh/uv/getting-started/installation/. ARG UV_VERSION=0.11.7 ENV UV_INSTALL_DIR="/usr/local/bin" -RUN curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh \ - && chmod a+rX /usr/local/bin/uv /usr/local/bin/uvx 2>/dev/null || true +# `&&` binds tighter than `||` in bash, so the previous form let `curl|sh` fail +# silently via the trailing `|| true`. Group the chmod so tolerated failure is +# scoped to perms-fixing only. +RUN set -eux \ + && curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh \ + && { chmod a+rX /usr/local/bin/uv /usr/local/bin/uvx 2>/dev/null || true; } # Match the upstream devcontainer's npm-global prefix so `npm install -g` # targets a dir the `node` user owns. diff --git a/evals/swebench/Dockerfile.agent b/evals/swebench/Dockerfile.agent index b1115724..edfc16a0 100644 --- a/evals/swebench/Dockerfile.agent +++ b/evals/swebench/Dockerfile.agent @@ -35,8 +35,12 @@ ENV PATH="/usr/local/bun/bin:${PATH}" # uv (provides Python for Chroma per CLAUDE.md). Installed to a system # location, same reason. ENV UV_INSTALL_DIR="/usr/local/bin" -RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ - && chmod a+rX /usr/local/bin/uv /usr/local/bin/uvx 2>/dev/null || true +# Group the chmod so the trailing `|| true` only absorbs chmod failures; without +# this grouping, bash precedence (`&&` binds tighter than `||`) would silently +# mask a failed `curl|sh` install step. +RUN set -eux \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && { chmod a+rX /usr/local/bin/uv /usr/local/bin/uvx 2>/dev/null || true; } # Claude Code CLI — PINNED to the version whose flag surface was verified in # the plan (Phase 0). Do NOT bump without re-verifying flags. diff --git a/evals/swebench/run-batch.py b/evals/swebench/run-batch.py index e10de52e..103b3332 100755 --- a/evals/swebench/run-batch.py +++ b/evals/swebench/run-batch.py @@ -281,6 +281,10 @@ def run_one_instance( status: str = "failed" model_patch: str = "" + # Uniquely named so the TimeoutExpired handler can kill it without racing + # other instances on the host. + container_name = f"swebench-agent-{instance_id}-{os.getpid()}-{threading.get_ident()}" + try: # The orchestrator owns JSONL writes under `predictions_lock` to avoid # racy concurrent appends across containers — so we DO NOT mount the @@ -293,6 +297,8 @@ def run_one_instance( "docker", "run", "--rm", + "--name", + container_name, "-e", "CLAUDE_MEM_OUTPUT_DIR=/scratch", "-v", @@ -354,8 +360,17 @@ def run_one_instance( except subprocess.TimeoutExpired as exc: status = "timed_out" + # subprocess.run killed the docker CLI, but the container may + # still be running. Force-remove it by name so we don't leak + # containers across the batch. + subprocess.run( + ["docker", "rm", "-f", container_name], + capture_output=True, + check=False, + timeout=30, + ) stderr_log_path.write_text( - f"TIMEOUT after {timeout}s\n" + f"TIMEOUT after {timeout}s (forced docker rm -f {container_name})\n" f"=== STDOUT (partial) ===\n{exc.stdout or ''}\n" f"=== STDERR (partial) ===\n{exc.stderr or ''}\n", encoding="utf-8", diff --git a/evals/swebench/run-instance.sh b/evals/swebench/run-instance.sh index 12001a63..b87797b4 100755 --- a/evals/swebench/run-instance.sh +++ b/evals/swebench/run-instance.sh @@ -94,8 +94,17 @@ cleanup() { } trap cleanup EXIT -# Clone the repo at BASE_COMMIT. -git clone "https://github.com/${REPO_SLUG}.git" "$REPO_DIR" +# Shallow clone + fetch the exact commit. Saves minutes on large repos +# (sympy/django/scikit-learn) vs. a full-history clone. Fallback to a full +# clone if the server rejects the by-commit fetch (GitHub supports +# uploadpack.allowReachableSHA1InWant by default on public repos, but mirrors +# may not). +if ! { git clone --depth 1 --no-single-branch "https://github.com/${REPO_SLUG}.git" "$REPO_DIR" \ + && git -C "$REPO_DIR" fetch --depth 1 origin "$BASE_COMMIT"; }; then + echo "WARN: shallow fetch failed; falling back to full clone" >&2 + rm -rf "$REPO_DIR" + git clone "https://github.com/${REPO_SLUG}.git" "$REPO_DIR" +fi git -C "$REPO_DIR" reset --hard "$BASE_COMMIT" # ---------- Turn 1: Ingest (populate memory via PostToolUse hook) ---------- diff --git a/evals/swebench/smoke-test.sh b/evals/swebench/smoke-test.sh index d2d39cb9..e41ce2d2 100755 --- a/evals/swebench/smoke-test.sh +++ b/evals/swebench/smoke-test.sh @@ -64,15 +64,21 @@ else: sys.exit(1) PY -# Read JSON fields from stdin so $INSTANCE_JSON paths with spaces/special chars -# don't break shell interpolation inside `python3 -c`. -REPO=$(python3 -c 'import json,sys; print(json.load(sys.stdin)["repo"])' < "$INSTANCE_JSON") -BASE_COMMIT=$(python3 -c 'import json,sys; print(json.load(sys.stdin)["base_commit"])' < "$INSTANCE_JSON") - SCRATCH="$(mktemp -d -t claude-mem-smoke.XXXXXX)" trap 'rm -f "$CREDS_FILE" "$INSTANCE_JSON"; rm -rf "$SCRATCH"' EXIT -python3 -c 'import json,sys,os; open(os.path.join(sys.argv[1],"problem.txt"),"w").write(json.load(sys.stdin)["problem_statement"])' "$SCRATCH" < "$INSTANCE_JSON" +# Parse the instance JSON once: print repo + base_commit to stdout, write the +# problem statement directly to $SCRATCH/problem.txt. Avoids three separate +# python3 invocations and any shell-quote injection on $INSTANCE_JSON. +read -r REPO BASE_COMMIT < <( + python3 - "$SCRATCH" < "$INSTANCE_JSON" <<'PY' +import json, os, sys +d = json.load(sys.stdin) +scratch = sys.argv[1] +open(os.path.join(scratch, "problem.txt"), "w").write(d["problem_statement"]) +print(d["repo"], d["base_commit"]) +PY +) echo "=== Running $INSTANCE_ID ($REPO @ $BASE_COMMIT) ===" >&2 echo "Scratch: $SCRATCH" >&2