Files
br-acc/etl/scripts/download_pncp.py
2026-03-02 03:51:26 -03:00

530 lines
17 KiB
Python

#!/usr/bin/env python3
"""Download PNCP procurement bid publications via REST API.
Fetches data from the PNCP public API in date-range windows and saves
as JSON files for pipeline consumption. Each (window, modalidade)
combination is checkpointed to disk immediately, so progress is never
lost on crash. Use --skip-existing to resume interrupted runs.
API: https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao
Swagger: https://pncp.gov.br/api/consulta/swagger-ui/index.html
Usage:
python etl/scripts/download_pncp.py
python etl/scripts/download_pncp.py --start-date 2021-01-01 --end-date 2026-02-25
python etl/scripts/download_pncp.py --output-dir ./data/pncp --modalidades 6,8,9
"""
from __future__ import annotations

import json
import logging
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from pathlib import Path

import click
import httpx
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
API_BASE = "https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao"
# All PNCP modalidade codes (procurement types)
ALL_MODALIDADES = [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
# API constraints
MAX_PAGE_SIZE = 50
MAX_DATE_RANGE_DAYS = 10
REQUEST_DELAY_SECONDS = 1.0
MAX_RETRIES = 3
RETRY_BACKOFF_SECONDS = 5.0
def _fetch_page(
    client: httpx.Client,
    date_start: str,
    date_end: str,
    modalidade: int,
    page: int,
) -> dict | None:
    """Request one page of publication data from the PNCP API.

    Returns the parsed JSON payload as a dict, or None when the API
    signals no content (HTTP 204) or returns an empty body. Other HTTP
    error statuses raise via raise_for_status().
    """
    query = {
        "dataInicial": date_start,
        "dataFinal": date_end,
        "codigoModalidadeContratacao": modalidade,
        "pagina": page,
        "tamanhoPagina": MAX_PAGE_SIZE,
    }
    resp = client.get(API_BASE, params=query)
    # 204 No Content = no data for this combination
    if resp.status_code == 204:
        return None
    resp.raise_for_status()
    body = resp.text.strip()
    if body:
        # strict=False tolerates the invalid control characters that PNCP
        # sometimes embeds in JSON text fields.
        return json.loads(body, strict=False)  # type: ignore[no-any-return]
    return None
def _fetch_window(
    client: httpx.Client,
    date_start: str,
    date_end: str,
    modalidade: int,
    *,
    page_workers: int = 1,
    request_delay_seconds: float = REQUEST_DELAY_SECONDS,
) -> list[dict]:
    """Fetch all pages for a single date window + modalidade combination.

    Args:
        client: Shared httpx client (reuses connections across requests).
        date_start / date_end: Window bounds as YYYYMMDD strings.
        modalidade: PNCP procurement modality code.
        page_workers: When > 1, pages 2..N are fetched concurrently; no
            inter-request delay is applied in concurrent mode.
        request_delay_seconds: Sleep between sequential page requests.

    Returns:
        All records gathered across the window's pages. Failures are
        logged and skipped, never raised, so a partial window can still
        be flushed by the caller.
    """
    def fetch_with_retry(page: int) -> dict | None:
        # One page with bounded retries; None means "no data" (400/404,
        # 204/empty body) or "gave up after MAX_RETRIES attempts".
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                return _fetch_page(client, date_start, date_end, modalidade, page)
            except httpx.HTTPStatusError as e:
                if e.response.status_code in (400, 404):
                    # No data or invalid range
                    return None
                if e.response.status_code == 429:
                    # Rate limited: back off more aggressively than for
                    # generic HTTP errors before retrying.
                    wait = RETRY_BACKOFF_SECONDS * attempt * 2
                    logger.warning(
                        "Rate limited (429) for %s-%s mod=%d page=%d, "
                        "waiting %.0fs (attempt %d/%d)",
                        date_start, date_end, modalidade, page,
                        wait, attempt, MAX_RETRIES,
                    )
                    time.sleep(wait)
                    continue
                if attempt < MAX_RETRIES:
                    logger.warning(
                        "HTTP %d for %s-%s mod=%d page=%d (attempt %d/%d)",
                        e.response.status_code, date_start, date_end,
                        modalidade, page, attempt, MAX_RETRIES,
                    )
                    time.sleep(RETRY_BACKOFF_SECONDS * attempt)
                    continue
                logger.warning(
                    "Giving up on %s-%s mod=%d page=%d after %d attempts: %s",
                    date_start, date_end, modalidade, page, MAX_RETRIES, e,
                )
                return None
            except httpx.HTTPError as e:
                # Network-level failure (timeout, DNS, connection reset).
                if attempt < MAX_RETRIES:
                    logger.warning(
                        "Network error for %s-%s mod=%d page=%d "
                        "(attempt %d/%d): %s",
                        date_start, date_end, modalidade, page,
                        attempt, MAX_RETRIES, e,
                    )
                    time.sleep(RETRY_BACKOFF_SECONDS * attempt)
                    continue
                logger.warning(
                    "Giving up on %s-%s mod=%d page=%d after %d attempts: %s",
                    date_start, date_end, modalidade, page, MAX_RETRIES, e,
                )
                return None
        # Only reachable when the final attempt ended in a 429 `continue`.
        return None
    first = fetch_with_retry(1)
    if first is None:
        return []
    first_items = first.get("data", [])
    # BUG FIX: "empty" previously defaulted to True when the key was
    # absent, silently discarding a non-empty first page. Default to
    # False, matching the page>1 handling below.
    if not first_items or first.get("empty", False):
        return []
    all_records: list[dict] = list(first_items)
    total_pages = int(first.get("totalPaginas", 1) or 1)
    if total_pages <= 1:
        return all_records
    remaining_pages = range(2, total_pages + 1)
    workers = max(1, int(page_workers))
    if workers == 1:
        # Sequential mode: polite delay between page requests.
        for page in remaining_pages:
            data = fetch_with_retry(page)
            if data is None:
                continue
            items = data.get("data", [])
            if items and not data.get("empty", False):
                all_records.extend(items)
            if request_delay_seconds > 0:
                time.sleep(request_delay_seconds)
        return all_records
    # Concurrent mode: fan out the remaining pages across a thread pool.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(fetch_with_retry, page): page for page in remaining_pages}
        for future in as_completed(futures):
            data = future.result()
            if data is None:
                continue
            items = data.get("data", [])
            if items and not data.get("empty", False):
                all_records.extend(items)
    return all_records
def _date_windows(
    start: datetime, end: datetime, window_days: int,
) -> list[tuple[str, str]]:
    """Generate (start_yyyymmdd, end_yyyymmdd) tuples for date windows.

    The range is inclusive of both `start` and `end`; each window spans
    at most `window_days` calendar days.
    """
    windows: list[tuple[str, str]] = []
    current = start
    # BUG FIX: was `current < end`, which dropped the final day whenever a
    # window boundary landed exactly on `end` (e.g. an 11-day range with
    # 10-day windows) and produced no windows at all for start == end.
    while current <= end:
        window_end = min(current + timedelta(days=window_days - 1), end)
        windows.append((
            current.strftime("%Y%m%d"),
            window_end.strftime("%Y%m%d"),
        ))
        current = window_end + timedelta(days=1)
    return windows
def _month_key_for_record(rec: dict, fallback: str) -> str:
    """Extract YYYYMM month key from a record's publication date.

    Falls back to the first six characters of `fallback` (a YYYYMMDD
    window start) when the record has no ISO-dashed publication date.
    """
    raw = str(rec.get("dataPublicacaoPncp", fallback))
    if "-" not in raw:
        return fallback[:6]
    return raw[:7].replace("-", "")
def _flush_to_disk(
    out_dir: Path,
    month_key: str,
    new_records: list[dict],
) -> int:
    """Append-merge new_records into the monthly JSON file.

    Deduplicates by numeroControlePNCP. Records WITHOUT a control number
    are always appended: an empty/missing id must not act as a dedup key,
    or every keyless record after the first would be silently dropped
    (that was the previous behavior). Returns total record count in the
    file after merge.
    """
    out_file = out_dir / f"pncp_{month_key}.json"
    existing_data: list[dict] = []
    if out_file.exists():
        try:
            raw = json.loads(out_file.read_text(encoding="utf-8"), strict=False)
            if isinstance(raw, dict) and "data" in raw:
                existing_data = raw["data"]
            elif isinstance(raw, list):
                existing_data = raw
        except (json.JSONDecodeError, OSError):
            logger.warning("Could not read existing file %s, overwriting", out_file)
    # Deduplicate by control number; "" (missing id) is never a dedup key.
    seen_ids: set[str] = {
        str(r.get("numeroControlePNCP", "")) for r in existing_data
    }
    seen_ids.discard("")
    unique_new: list[dict] = []
    for r in new_records:
        rid = str(r.get("numeroControlePNCP", ""))
        if rid and rid in seen_ids:
            continue
        if rid:
            # Also dedupe repeated ids within this batch (overlapping pages).
            seen_ids.add(rid)
        unique_new.append(r)
    merged = existing_data + unique_new
    out_file.write_text(
        json.dumps(merged, ensure_ascii=False, indent=None),
        encoding="utf-8",
    )
    return len(merged)
def _load_checkpoint(checkpoint_file: Path) -> set[str]:
    """Load set of completed (window_start, window_end, modalidade) keys.

    Returns an empty set when the file is absent or unreadable — the run
    simply starts from scratch in that case.
    """
    if not checkpoint_file.exists():
        return set()
    try:
        content = checkpoint_file.read_text(encoding="utf-8")
    except OSError:
        return set()
    return set(content.strip().splitlines())
def _save_checkpoint(checkpoint_file: Path, key: str) -> None:
    """Append a completed combo key (one per line) to the checkpoint file."""
    with checkpoint_file.open("a", encoding="utf-8") as fh:
        fh.write(f"{key}\n")
def _month_range(start: datetime, end: datetime) -> list[str]:
    """Return YYYYMM keys from start month to end month inclusive."""
    keys: list[str] = []
    year, month = start.year, start.month
    # Tuple comparison walks (year, month) pairs in calendar order.
    while (year, month) <= (end.year, end.month):
        keys.append(f"{year:04d}{month:02d}")
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
    return keys
def _load_month_file(path: Path) -> list[dict]:
    """Parse a monthly PNCP JSON file into a list of record dicts.

    Accepts both the raw-list layout and the {"data": [...]} wrapper;
    non-dict entries are filtered out. Missing or unparseable files
    yield an empty list.
    """
    if not path.exists():
        return []
    try:
        parsed = json.loads(path.read_text(encoding="utf-8"), strict=False)
    except (json.JSONDecodeError, OSError):
        return []
    if isinstance(parsed, list):
        records = parsed
    elif isinstance(parsed, dict) and isinstance(parsed.get("data"), list):
        records = parsed["data"]
    else:
        return []
    return [item for item in records if isinstance(item, dict)]
def _compute_missing_months(out_dir: Path, expected_months: list[str]) -> list[str]:
    """Return the expected YYYYMM keys that have no monthly file on disk."""
    return [
        mk
        for mk in expected_months
        if not (out_dir / f"pncp_{mk}.json").exists()
    ]
def _write_manifest(
    manifest_path: Path,
    start_date: str,
    end_date: str,
    expected_months: list[str],
    month_sources: dict[str, set[str]],
    missing_months: list[str],
) -> None:
    """Write a JSON manifest summarizing per-month download status.

    Args:
        manifest_path: Output path; monthly pncp_YYYYMM.json files are
            read from its parent directory to count rows.
        start_date / end_date: Requested download range (YYYY-MM-DD).
        expected_months: All YYYYMM keys the range should cover.
        month_sources: YYYYMM -> set of combo keys that contributed
            records during this run (empty for untouched months).
        missing_months: YYYYMM keys with no monthly file on disk.
    """
    month_entries: list[dict[str, object]] = []
    totals = {"in_sync": 0, "empty": 0, "missing": 0}
    for mk in expected_months:
        # Row count is measured from disk, not from this run's records.
        rows = len(_load_month_file(manifest_path.parent / f"pncp_{mk}.json"))
        if mk in missing_months:
            status = "missing"
        elif rows == 0:
            status = "empty"
        else:
            status = "in_sync"
        totals[status] += 1
        month_entries.append({
            "month": mk,
            "rows": rows,
            "source_windows": sorted(month_sources.get(mk, set())),
            "status": status,
        })
    payload = {
        # Timezone-aware replacement for deprecated datetime.utcnow();
        # strftime output is byte-identical.
        "generated_at_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "range_start": start_date,
        "range_end": end_date,
        "expected_months": expected_months,
        "missing_months": missing_months,
        "summary": {
            "months_total": len(expected_months),
            "months_in_sync": totals["in_sync"],
            "months_empty": totals["empty"],
            "months_missing": totals["missing"],
        },
        "months": month_entries,
    }
    manifest_path.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    logger.info("Wrote PNCP manifest: %s", manifest_path)
@click.command()
@click.option(
    "--start-date",
    default="2021-01-01",
    help="Start date (YYYY-MM-DD). Default: 2021-01-01 (PNCP launch).",
)
@click.option(
    "--end-date",
    default=lambda: datetime.now().strftime("%Y-%m-%d"),
    help="End date (YYYY-MM-DD). Default: today.",
)
@click.option(
    "--modalidades",
    default=",".join(str(m) for m in ALL_MODALIDADES),
    help="Comma-separated modalidade codes. Default: all.",
)
@click.option("--output-dir", default="./data/pncp", help="Output directory")
@click.option(
    "--window-days", type=int, default=MAX_DATE_RANGE_DAYS,
    help="Days per API window",
)
@click.option(
    "--skip-existing/--no-skip-existing", default=True,
    help="Skip already-checkpointed windows",
)
@click.option("--timeout", type=int, default=90, help="HTTP request timeout in seconds")
@click.option(
    "--strict-month-continuity/--no-strict-month-continuity",
    default=False,
    help="Fail if any month in range has no monthly PNCP file after run.",
)
@click.option(
    "--request-delay",
    type=float,
    default=REQUEST_DELAY_SECONDS,
    show_default=True,
    help="Delay (seconds) between combo requests. Use 0 for max throughput.",
)
@click.option(
    "--page-workers",
    type=int,
    default=1,
    show_default=True,
    help="Parallel workers to fetch remaining pages inside each combo.",
)
@click.option(
    "--manifest-path",
    default=None,
    help="Optional manifest JSON output path (default: <output-dir>/download_manifest.json).",
)
def main(
    start_date: str,
    end_date: str,
    modalidades: str,
    output_dir: str,
    window_days: int,
    skip_existing: bool,
    timeout: int,
    strict_month_continuity: bool,
    request_delay: float,
    page_workers: int,
    manifest_path: str | None,
) -> None:
    """Download PNCP procurement bid publications.

    Orchestrates the full run: splits the date range into API-sized
    windows, fetches every (window, modalidade) combination, flushes
    records to monthly JSON files as they arrive, checkpoints completed
    combos for crash-safe resume, and finally writes a month-continuity
    manifest.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    mod_list = [int(m.strip()) for m in modalidades.split(",")]
    logger.info("=== PNCP Download ===")
    logger.info("Date range: %s to %s", start_date, end_date)
    logger.info("Modalidades: %s", mod_list)
    logger.info("Page workers: %d", max(1, page_workers))
    logger.info("Request delay: %.3fs", max(0.0, request_delay))
    # windows x modalidades defines the unit of work and of checkpointing.
    windows = _date_windows(start, end, window_days)
    total_combos = len(windows) * len(mod_list)
    expected_months = _month_range(start, end)
    missing_before = _compute_missing_months(out, expected_months)
    logger.info("Date windows: %d, total combos: %d", len(windows), total_combos)
    logger.info(
        "Month continuity (pre-run): expected=%d missing=%d",
        len(expected_months),
        len(missing_before),
    )
    if missing_before:
        logger.info("Missing months before run: %s", ",".join(missing_before))
    # Checkpoint file tracks completed (window, modalidade) combos
    checkpoint_file = out / ".checkpoint"
    completed = _load_checkpoint(checkpoint_file) if skip_existing else set()
    if completed:
        logger.info("Resuming: %d combos already completed", len(completed))
    # One shared client so connections are pooled across all requests.
    client = httpx.Client(
        timeout=timeout,
        follow_redirects=True,
        headers={"User-Agent": "BR-ACC-ETL/1.0 (public data research)"},
    )
    total_records = 0
    # Seeded from the checkpoint so the [done/total] progress log counts
    # previously-completed combos too.
    combos_done = len(completed)
    month_sources: dict[str, set[str]] = defaultdict(set)
    try:
        for win_start, win_end in windows:
            for mod in mod_list:
                combo_key = f"{win_start}_{win_end}_{mod}"
                if combo_key in completed:
                    continue
                logger.info(
                    "[%d/%d] Fetching %s-%s modalidade=%d...",
                    combos_done + 1, total_combos, win_start, win_end, mod,
                )
                records = _fetch_window(
                    client,
                    win_start,
                    win_end,
                    mod,
                    page_workers=max(1, page_workers),
                    request_delay_seconds=max(0.0, request_delay),
                )
                if records:
                    # Group by publication month and flush immediately
                    # so no more than one combo's worth of data is ever
                    # held in memory / lost on crash.
                    by_month: dict[str, list[dict]] = {}
                    for rec in records:
                        mk = _month_key_for_record(rec, win_start)
                        by_month.setdefault(mk, []).append(rec)
                        month_sources[mk].add(combo_key)
                    for mk, recs in by_month.items():
                        count = _flush_to_disk(out, mk, recs)
                        logger.info(
                            " %s: +%d records (file total: %d)",
                            mk, len(recs), count,
                        )
                    total_records += len(records)
                # Mark combo as done
                _save_checkpoint(checkpoint_file, combo_key)
                completed.add(combo_key)
                combos_done += 1
                if request_delay > 0:
                    time.sleep(request_delay)
    except KeyboardInterrupt:
        # Ctrl-C is graceful: everything flushed so far is checkpointed,
        # and execution falls through to the manifest below.
        logger.info("Interrupted. Progress saved — rerun with --skip-existing to resume.")
    finally:
        client.close()
    # Post-run continuity check + manifest run even after an interrupt.
    missing_after = _compute_missing_months(out, expected_months)
    logger.info(
        "Month continuity (post-run): expected=%d missing=%d",
        len(expected_months),
        len(missing_after),
    )
    if missing_after:
        logger.warning("Missing months after run: %s", ",".join(missing_after))
    manifest_output = Path(manifest_path) if manifest_path else out / "download_manifest.json"
    _write_manifest(
        manifest_output,
        start_date=start_date,
        end_date=end_date,
        expected_months=expected_months,
        month_sources=month_sources,
        missing_months=missing_after,
    )
    logger.info(
        "=== Done: %d new records fetched, %d/%d combos completed ===",
        total_records, combos_done, total_combos,
    )
    if strict_month_continuity and missing_after:
        raise click.ClickException(
            f"Strict month continuity failed: {len(missing_after)} missing month(s)",
        )
if __name__ == "__main__":
main()