sync: upstream convergence 2026-03-02

Co-authored-by: bruno cesar <brunoclz@brunos-MacBook-Pro.local>
This commit is contained in:
Bruno César
2026-03-02 03:51:26 -03:00
committed by GitHub
parent d00d150f93
commit add44821e8
175 changed files with 2569 additions and 4713 deletions

View File

@@ -18,29 +18,24 @@ API_PORT=8000
LOG_LEVEL=info LOG_LEVEL=info
APP_ENV=dev APP_ENV=dev
JWT_SECRET_KEY=change-me-generate-with-openssl-rand-hex-32 JWT_SECRET_KEY=change-me-generate-with-openssl-rand-hex-32
AUTH_COOKIE_NAME=bracc_session
AUTH_COOKIE_SECURE=false
AUTH_COOKIE_SAMESITE=lax
TRUST_PROXY_HEADERS=false
INVITE_CODE= INVITE_CODE=
CORS_ORIGINS=http://localhost:3000 CORS_ORIGINS=http://localhost:3000
PRODUCT_TIER=community PRODUCT_TIER=community
PATTERNS_ENABLED=false PATTERNS_ENABLED=false
PUBLIC_MODE=true PUBLIC_MODE=false
PUBLIC_ALLOW_PERSON=false PUBLIC_ALLOW_PERSON=false
PUBLIC_ALLOW_ENTITY_LOOKUP=false PUBLIC_ALLOW_ENTITY_LOOKUP=false
PUBLIC_ALLOW_INVESTIGATIONS=false PUBLIC_ALLOW_INVESTIGATIONS=false
PATTERN_SPLIT_THRESHOLD_VALUE=80000 PATTERN_SPLIT_THRESHOLD_VALUE=80000
PATTERN_SPLIT_MIN_COUNT=3 PATTERN_SPLIT_MIN_COUNT=3
PATTERN_SHARE_THRESHOLD=0.60 PATTERN_SHARE_THRESHOLD=0.6
PATTERN_SRP_MIN_ORGS=5 PATTERN_SRP_MIN_ORGS=5
PATTERN_INEXIG_MIN_RECURRENCE=3 PATTERN_INEXIG_MIN_RECURRENCE=3
PATTERN_MAX_EVIDENCE_REFS=50 PATTERN_MAX_EVIDENCE_REFS=50
SHARE_TOKEN_TTL_HOURS=168
# Frontend (dev only — production uses Caddy reverse proxy with relative paths) # Frontend (dev only — production uses Caddy reverse proxy with relative paths)
VITE_API_URL=http://localhost:8000 VITE_API_URL=http://localhost:8000
VITE_PUBLIC_MODE=true VITE_PUBLIC_MODE=false
VITE_PATTERNS_ENABLED=false VITE_PATTERNS_ENABLED=false
# Optional: Google Cloud (for Base dos Dados / TSE BigQuery) # Optional: Google Cloud (for Base dos Dados / TSE BigQuery)

View File

@@ -1,5 +1,5 @@
blank_issues_enabled: false blank_issues_enabled: false
contact_links: contact_links:
- name: Security vulnerability report - name: Security vulnerability report
url: https://github.com/World-Open-Graph/br-acc/security/advisories/new url: https://github.com/brunoclz/world-transparency-graph/security/advisories/new
about: Use GitHub Security Advisories for private vulnerability disclosure. about: Use GitHub Security Advisories for private vulnerability disclosure.

View File

@@ -10,8 +10,8 @@
"README.md", "README.md",
"CONTRIBUTING.md", "CONTRIBUTING.md",
"frontend/src/**", "frontend/src/**",
"api/src/icarus/queries/**", "api/src/bracc/queries/**",
"api/src/icarus/models/**", "api/src/bracc/models/**",
"api/tests/**", "api/tests/**",
"etl/tests/**", "etl/tests/**",
"frontend/src/**/*.test.*" "frontend/src/**/*.test.*"

View File

@@ -23,26 +23,6 @@ on:
description: "Release title (EN)" description: "Release title (EN)"
required: true required: true
type: string type: string
highlights_pt:
description: "PT highlights (separate bullets with |)"
required: true
type: string
highlights_en:
description: "EN highlights (separate bullets with |)"
required: true
type: string
patterns_included:
description: "Comma-separated pattern IDs included in this release (use 'none' if not applicable)"
required: true
type: string
technical_changes_pt:
description: "PT technical changes (separate bullets with |)"
required: true
type: string
technical_changes_en:
description: "EN technical changes (separate bullets with |)"
required: true
type: string
permissions: permissions:
contents: write contents: write
@@ -124,116 +104,63 @@ jobs:
COMPARE_URL: ${{ steps.validate.outputs.compare_url }} COMPARE_URL: ${{ steps.validate.outputs.compare_url }}
TITLE_PT: ${{ inputs.title_pt }} TITLE_PT: ${{ inputs.title_pt }}
TITLE_EN: ${{ inputs.title_en }} TITLE_EN: ${{ inputs.title_en }}
HIGHLIGHTS_PT: ${{ inputs.highlights_pt }}
HIGHLIGHTS_EN: ${{ inputs.highlights_en }}
PATTERNS_INCLUDED: ${{ inputs.patterns_included }}
TECHNICAL_CHANGES_PT: ${{ inputs.technical_changes_pt }}
TECHNICAL_CHANGES_EN: ${{ inputs.technical_changes_en }}
run: | run: |
set -euo pipefail set -euo pipefail
DATE_UTC="$(date -u +"%Y-%m-%d")" DATE_UTC="$(date -u +"%Y-%m-%d")"
export DATE_UTC export DATE_UTC
cat > release_notes.md <<NOTES
## PT-BR
${TITLE_PT}
### Escopo
- Release publicada por marco.
- Mudanças detalhadas por categorias no histórico desta versão.
### Integridade pública
Os sinais e padrões refletem coocorrências em bases públicas e não constituem prova legal.
## EN
${TITLE_EN}
### Scope
- Milestone-based release publication.
- Detailed changes grouped by category in this version history.
### Public integrity
Signals and patterns reflect co-occurrence in public records and are not legal proof.
## Compatibility
- Breaking changes: declare explicitly when applicable.
- Migration required: declare explicitly when applicable.
## Compare
${COMPARE_URL}
## Metadata
- Version: ${VERSION}
- Target SHA: ${TARGET_SHA}
- Previous tag: ${PREVIOUS_TAG}
- Date (UTC): ${DATE_UTC}
NOTES
python - <<'PY' python - <<'PY'
import json import json
import os import os
from textwrap import dedent
def split_pipe(raw: str) -> list[str]:
normalized = raw.replace("\r\n", "\n").replace("\n", "|")
return [item.strip(" -\t") for item in normalized.split("|") if item.strip()]
def split_csv(raw: str) -> list[str]:
value = raw.strip()
if value.lower() in {"none", "n/a", "na", "-"}:
return []
return [item.strip() for item in value.split(",") if item.strip()]
def bullets(items: list[str], fallback: str) -> str:
if not items:
return f"- {fallback}"
return "\n".join(f"- {item}" for item in items)
highlights_pt = split_pipe(os.environ["HIGHLIGHTS_PT"])
highlights_en = split_pipe(os.environ["HIGHLIGHTS_EN"])
technical_changes_pt = split_pipe(os.environ["TECHNICAL_CHANGES_PT"])
technical_changes_en = split_pipe(os.environ["TECHNICAL_CHANGES_EN"])
patterns = split_csv(os.environ["PATTERNS_INCLUDED"])
release_notes = dedent(
f"""
## PT-BR
{os.environ["TITLE_PT"]}
### Escopo
- Release publicada por marco.
- Mudanças listadas de forma específica para facilitar auditoria pública.
### Destaques
{bullets(highlights_pt, "Sem destaques declarados.")}
### Padrões incluídos
{bullets(patterns, "Sem novos padrões nesta release.")}
### Mudanças técnicas
{bullets(technical_changes_pt, "Sem mudanças técnicas declaradas.")}
### Integridade pública
Os sinais e padrões refletem coocorrências em bases públicas e não constituem prova legal.
## EN
{os.environ["TITLE_EN"]}
### Scope
- Milestone-based release publication.
- Changes are listed explicitly for public traceability.
### Highlights
{bullets(highlights_en, "No highlights declared.")}
### Included patterns
{bullets(patterns, "No new patterns in this release.")}
### Technical changes
{bullets(technical_changes_en, "No technical changes declared.")}
### Public integrity
Signals and patterns reflect co-occurrence in public records and are not legal proof.
## Compatibility
- Breaking changes: none declared.
- Migration required: no.
## Compare
{os.environ.get("COMPARE_URL", "")}
## Metadata
- Version: {os.environ["VERSION"]}
- Target SHA: {os.environ["TARGET_SHA"]}
- Previous tag: {os.environ["PREVIOUS_TAG"]}
- Date (UTC): {os.environ.get("DATE_UTC", "")}
"""
).strip() + "\n"
with open("release_notes.md", "w", encoding="utf-8") as fh:
fh.write(release_notes)
payload = { payload = {
"version": os.environ["VERSION"], "version": os.environ["VERSION"],
"date": os.environ.get("DATE_UTC", ""), "date": os.environ.get("DATE_UTC", ""),
"highlights_pt": highlights_pt, "highlights_pt": [os.environ["TITLE_PT"]],
"highlights_en": highlights_en, "highlights_en": [os.environ["TITLE_EN"]],
"api_changes": [], "api_changes": [],
"data_changes": [], "data_changes": [],
"privacy_compliance_changes": [], "privacy_compliance_changes": [],
"patterns_included": patterns,
"technical_changes_pt": technical_changes_pt,
"technical_changes_en": technical_changes_en,
"breaking_changes": False, "breaking_changes": False,
"migration_required": False, "migration_required": False,
"compare_url": os.environ.get("COMPARE_URL", ""), "compare_url": os.environ.get("COMPARE_URL", ""),

View File

@@ -6,18 +6,10 @@ on:
pull_request: pull_request:
branches: [main] branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions:
contents: read
jobs: jobs:
gitleaks: gitleaks:
name: Gitleaks name: Gitleaks
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 15
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
@@ -37,7 +29,6 @@ jobs:
bandit: bandit:
name: Bandit (Python) name: Bandit (Python)
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 15
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -57,7 +48,6 @@ jobs:
pip-audit: pip-audit:
name: Pip Audit (Python deps) name: Pip Audit (Python deps)
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 20
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -69,14 +59,6 @@ jobs:
with: with:
python-version: "3.12" python-version: "3.12"
- name: Cache uv
uses: actions/cache@v4
with:
path: ~/.cache/uv
key: ${{ runner.os }}-uv-security-${{ hashFiles('api/uv.lock', 'etl/uv.lock') }}
restore-keys: |
${{ runner.os }}-uv-security-
- name: Export lock-compatible requirement sets - name: Export lock-compatible requirement sets
run: | run: |
cd api cd api
@@ -93,7 +75,6 @@ jobs:
public-privacy-gate: public-privacy-gate:
name: Public Privacy Gate name: Public Privacy Gate
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 15
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -107,7 +88,6 @@ jobs:
compliance-pack-gate: compliance-pack-gate:
name: Compliance Pack Gate name: Compliance Pack Gate
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 15
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -120,9 +100,8 @@ jobs:
public-boundary-gate: public-boundary-gate:
name: Public Boundary Gate name: Public Boundary Gate
if: vars.PUBLIC_BOUNDARY_GATE_ENABLED == 'true' if: github.repository == 'brunoclz/world-transparency-graph'
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 15
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -136,7 +115,6 @@ jobs:
internal-instruction-boundary: internal-instruction-boundary:
name: Internal Instruction Boundary name: Internal Instruction Boundary
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 15
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4

3
.gitignore vendored
View File

@@ -75,7 +75,6 @@ scripts/audit-prompts/
# Local report artifacts in repository root # Local report artifacts in repository root
/*.pdf /*.pdf
/*.html /*.html
gitleaks-report*.json
# Playwright MCP cache # Playwright MCP cache
.playwright-mcp/ .playwright-mcp/
@@ -91,7 +90,7 @@ data/tse/
# Local MCP runtime config (keep example only) # Local MCP runtime config (keep example only)
.mcp.json .mcp.json
# Internal assistant instructions (must never be published) # Internal assistant instruction files (must never be published)
CLAUDE.md CLAUDE.md
AGENTS.md AGENTS.md
AGENTS*.md AGENTS*.md

142
Makefile
View File

@@ -1,14 +1,125 @@
.PHONY: dev stop seed bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics check neutrality .PHONY: dev stop api etl frontend lint type-check test test-api test-etl test-frontend test-integration-api test-integration-etl test-integration check seed clean download-cnpj download-tse download-transparencia download-sanctions download-all etl-cnpj etl-cnpj-stream etl-tse etl-transparencia etl-sanctions etl-all link-persons bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics
# ── Development ─────────────────────────────────────────
dev: dev:
docker compose -f infra/docker-compose.yml up -d docker compose up -d
stop: stop:
docker compose -f infra/docker-compose.yml down docker compose down
# ── API ─────────────────────────────────────────────────
api:
cd api && uv run uvicorn bracc.main:app --reload --host 0.0.0.0 --port 8000
# ── ETL ─────────────────────────────────────────────────
etl:
cd etl && uv run bracc-etl --help
seed: seed:
bash infra/scripts/seed-dev.sh bash infra/scripts/seed-dev.sh
# ── CNPJ Data ──────────────────────────────────────────
download-cnpj:
cd etl && uv run python scripts/download_cnpj.py --reference-only
cd etl && uv run python scripts/download_cnpj.py --files 1
download-cnpj-all:
cd etl && uv run python scripts/download_cnpj.py --files 10
etl-cnpj:
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
etl-cnpj-dev:
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
etl-cnpj-stream:
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --streaming
# ── TSE Data ──────────────────────────────────────────
download-tse:
cd etl && uv run python scripts/download_tse.py --years 2024
etl-tse:
cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
etl-tse-dev:
cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
# ── Transparencia Data ────────────────────────────────
download-transparencia:
cd etl && uv run python scripts/download_transparencia.py --year 2025
etl-transparencia:
cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
etl-transparencia-dev:
cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
# ── Sanctions Data ────────────────────────────────────
download-sanctions:
cd etl && uv run python scripts/download_sanctions.py
etl-sanctions:
cd etl && uv run bracc-etl run --source sanctions --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
# ── All Data ──────────────────────────────────────────
download-all: download-cnpj download-tse download-transparencia download-sanctions
etl-all: etl-cnpj etl-tse etl-transparencia etl-sanctions
# ── Entity Resolution ────────────────────────────────────
link-persons:
docker compose exec neo4j cypher-shell -u neo4j -p "$${NEO4J_PASSWORD}" -f /scripts/link_persons.cypher
# ── Frontend ────────────────────────────────────────────
frontend:
cd frontend && npm run dev
# ── Quality ─────────────────────────────────────────────
lint:
cd api && uv run ruff check src/ tests/
cd etl && uv run ruff check src/ tests/
cd frontend && npm run lint
type-check:
cd api && uv run mypy src/
cd etl && uv run mypy src/
cd frontend && npm run type-check
test-api:
cd api && uv run pytest
test-etl:
cd etl && uv run pytest
test-frontend:
cd frontend && npm test
test: test-api test-etl test-frontend
# ── Integration tests ─────────────────────────────────
test-integration-api:
cd api && uv run pytest -m integration
test-integration-etl:
cd etl && uv run pytest -m integration
test-integration: test-integration-api test-integration-etl
# ── Full check (run before commit) ─────────────────────
check: lint type-check test
@echo "All checks passed."
# ── Neutrality audit ───────────────────────────────────
neutrality:
@! grep -rn \
"suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty\|CRITICAL\|HIGH.*severity\|MEDIUM.*severity\|LOW.*severity" \
api/src/ etl/src/ frontend/src/ \
--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
|| (echo "NEUTRALITY VIOLATION FOUND" && exit 1)
@echo "Neutrality check passed."
# ── Bootstrap ─────────────────────────────────────────────
bootstrap-demo: bootstrap-demo:
bash scripts/bootstrap_public_demo.sh --profile demo bash scripts/bootstrap_public_demo.sh --profile demo
@@ -24,6 +135,7 @@ bootstrap-all-noninteractive:
bootstrap-all-report: bootstrap-all-report:
python3 scripts/run_bootstrap_all.py --repo-root . --report-latest python3 scripts/run_bootstrap_all.py --repo-root . --report-latest
# ── Quality checks ────────────────────────────────────────
check-public-claims: check-public-claims:
python3 scripts/check_public_claims.py --repo-root . python3 scripts/check_public_claims.py --repo-root .
@@ -36,22 +148,20 @@ check-pipeline-contracts:
check-pipeline-inputs: check-pipeline-inputs:
python3 scripts/check_pipeline_inputs.py python3 scripts/check_pipeline_inputs.py
# ── Generators ────────────────────────────────────────────
generate-pipeline-status: generate-pipeline-status:
python3 scripts/generate_pipeline_status.py --registry-path docs/source_registry_br_v1.csv --output docs/pipeline_status.md python3 scripts/generate_pipeline_status.py
generate-source-summary: generate-source-summary:
python3 scripts/generate_data_sources_summary.py --registry-path docs/source_registry_br_v1.csv --docs-path docs/data-sources.md python3 scripts/generate_data_sources_summary.py
generate-reference-metrics: generate-reference-metrics:
python3 scripts/generate_reference_metrics.py --json-output audit-results/public-trust/latest/neo4j-reference-metrics.json --doc-output docs/reference_metrics.md python3 scripts/generate_reference_metrics.py
check: # ── Cleanup ─────────────────────────────────────────────
cd api && bash ../scripts/ci/python_quality.sh clean:
cd etl && bash ../scripts/ci/python_quality.sh find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
cd frontend && bash ../scripts/ci/frontend_quality.sh find . -type d -name .pytest_cache -exec rm -rf {} + 2>/dev/null || true
find . -type d -name .mypy_cache -exec rm -rf {} + 2>/dev/null || true
neutrality: find . -type d -name .ruff_cache -exec rm -rf {} + 2>/dev/null || true
@! grep -rn "suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty" \ rm -rf frontend/dist
api/src/ etl/src/ frontend/src/ \
--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
|| (echo "NEUTRALITY VIOLATION: banned words found in source" && exit 1)

View File

@@ -1,7 +1,7 @@
[project] [project]
name = "bracc-api" name = "bracc-api"
version = "0.1.0" version = "0.1.0"
description = "BRACC API — Brazilian public data anti-corruption graph tool" description = "BR-ACC API — Brazilian public data anti-corruption graph tool"
requires-python = ">=3.12" requires-python = ">=3.12"
license = "AGPL-3.0-or-later" license = "AGPL-3.0-or-later"
dependencies = [ dependencies = [

View File

@@ -1,5 +1,6 @@
from typing import Literal from typing import Literal
from pydantic import Field
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings
@@ -17,14 +18,15 @@ class Settings(BaseSettings):
jwt_secret_key: str = "change-me-in-production" jwt_secret_key: str = "change-me-in-production"
jwt_algorithm: str = "HS256" jwt_algorithm: str = "HS256"
jwt_expire_minutes: int = 1440 jwt_expire_minutes: int = 1440
auth_cookie_name: str = "bracc_session"
auth_cookie_secure: bool = False
auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
trust_proxy_headers: bool = False
rate_limit_anon: str = "60/minute" rate_limit_anon: str = "60/minute"
rate_limit_auth: str = "300/minute" rate_limit_auth: str = "300/minute"
invite_code: str = "" invite_code: str = ""
cors_origins: str = "http://localhost:3000" cors_origins: str = "http://localhost:3000"
auth_cookie_name: str = "bracc_session"
auth_cookie_secure: bool = False
auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
trust_proxy_headers: bool = False
share_token_ttl_hours: int = 168 # 7 days
product_tier: str = "community" product_tier: str = "community"
patterns_enabled: bool = False patterns_enabled: bool = False
public_mode: bool = False public_mode: bool = False
@@ -37,7 +39,16 @@ class Settings(BaseSettings):
pattern_srp_min_orgs: int = 5 pattern_srp_min_orgs: int = 5
pattern_inexig_min_recurrence: int = 3 pattern_inexig_min_recurrence: int = 3
pattern_max_evidence_refs: int = 50 pattern_max_evidence_refs: int = 50
share_token_ttl_hours: int = 168
# Pattern hardening defaults (decision-complete contract)
pattern_temporal_window_years: int = Field(default=4, ge=1, le=20)
pattern_min_contract_value: float = Field(default=100000.0, ge=0)
pattern_min_contract_count: int = Field(default=2, ge=1)
pattern_min_debt_value: float = Field(default=50000.0, ge=0)
pattern_same_as_min_confidence: float = Field(default=0.85, ge=0, le=1)
pattern_pep_min_confidence: float = Field(default=0.85, ge=0, le=1)
pattern_min_recurrence: int = Field(default=2, ge=1)
pattern_min_discrepancy_ratio: float = Field(default=0.30, ge=0, le=1)
model_config = {"env_prefix": "", "env_file": ".env"} model_config = {"env_prefix": "", "env_file": ".env"}

View File

@@ -35,7 +35,12 @@ async def close_driver() -> None:
async def get_driver(request: Request) -> AsyncDriver: async def get_driver(request: Request) -> AsyncDriver:
driver: AsyncDriver = request.app.state.neo4j_driver driver: AsyncDriver | None = getattr(request.app.state, "neo4j_driver", None)
if driver is None:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Database connection not available",
)
return driver return driver

View File

@@ -2,7 +2,7 @@ import logging
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from fastapi import FastAPI, Request from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from slowapi import _rate_limit_exceeded_handler from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded from slowapi.errors import RateLimitExceeded
@@ -51,7 +51,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
app = FastAPI( app = FastAPI(
title="BRACC API", title="BR-ACC API",
description="Brazilian public data graph analysis tool", description="Brazilian public data graph analysis tool",
version="0.1.0", version="0.1.0",
lifespan=lifespan, lifespan=lifespan,
@@ -85,5 +85,5 @@ app.include_router(investigation.shared_router)
@app.get("/health") @app.get("/health")
async def health(request: Request) -> dict[str, str]: async def health() -> dict[str, str]:
return {"status": "ok", "version": request.app.version} return {"status": "ok"}

View File

@@ -53,7 +53,7 @@ def _is_pep_record(record: dict[str, Any]) -> bool:
for field in ("role", "cargo"): for field in ("role", "cargo"):
value = record.get(field) value = record.get(field)
if isinstance(value, str) and value.strip().lower() in PEP_ROLES: if isinstance(value, str) and any(kw in value.strip().lower() for kw in PEP_ROLES):
return True return True
return False return False

View File

@@ -6,37 +6,20 @@ from bracc.config import settings
from bracc.services.auth_service import decode_access_token from bracc.services.auth_service import decode_access_token
def _extract_token(request: Request) -> str | None: def _get_rate_limit_key(request: Request) -> str:
"""Extract user_id from JWT (Bearer or cookie) for rate limiting, fallback to IP."""
auth = request.headers.get("authorization", "") auth = request.headers.get("authorization", "")
if auth.startswith("Bearer "): if auth.startswith("Bearer "):
return auth[7:].strip() token = auth[7:]
cookie_token = request.cookies.get(settings.auth_cookie_name)
if isinstance(cookie_token, str) and cookie_token.strip():
return cookie_token.strip()
return None
def _resolve_client_ip(request: Request) -> str:
if settings.trust_proxy_headers:
forwarded = request.headers.get("x-forwarded-for", "")
if forwarded:
first_hop = forwarded.split(",", 1)[0].strip()
if first_hop:
return first_hop
real_ip = request.headers.get("x-real-ip", "").strip()
if real_ip:
return real_ip
return get_remote_address(request)
def _get_rate_limit_key(request: Request) -> str:
"""Extract user_id from JWT for rate limiting, fallback to IP."""
token = _extract_token(request)
if token:
user_id = decode_access_token(token) user_id = decode_access_token(token)
if user_id: if user_id:
return f"user:{user_id}" return f"user:{user_id}"
return _resolve_client_ip(request) cookie_token = request.cookies.get(settings.auth_cookie_name)
if isinstance(cookie_token, str) and cookie_token.strip():
user_id = decode_access_token(cookie_token.strip())
if user_id:
return f"user:{user_id}"
return get_remote_address(request)
limiter = Limiter( limiter = Limiter(

View File

@@ -1,27 +1,15 @@
MATCH (center) WHERE elementId(center) = $entity_id MATCH (center)
WHERE elementId(center) = $entity_id
AND (center:Person OR center:Partner OR center:Company OR center:Contract OR center:Sanction OR center:Election AND (center:Person OR center:Partner OR center:Company OR center:Contract OR center:Sanction OR center:Election
OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education
OR center:Convenio OR center:LaborStats OR center:PublicOffice) OR center:Convenio OR center:LaborStats OR center:PublicOffice)
WITH center, OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS*1..4]-(connected)
CASE WHERE length(p) <= $depth
WHEN coalesce($include_probable, false) THEN AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag))
"SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS" WITH center, p
ELSE UNWIND CASE WHEN p IS NULL THEN [] ELSE relationships(p) END AS r
"SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS" WITH DISTINCT center, r, startNode(r) AS src, endNode(r) AS tgt
END AS relationship_filter WHERE coalesce($include_probable, false) OR type(r) <> "POSSIBLE_SAME_AS"
CALL apoc.path.subgraphAll(center, {
relationshipFilter: relationship_filter,
labelFilter: "-User|-Investigation|-Annotation|-Tag",
maxLevel: $depth,
limit: 200
})
YIELD nodes, relationships
WITH center, nodes, relationships
UNWIND relationships AS r
WITH center,
startNode(r) AS src,
endNode(r) AS tgt,
r
RETURN center AS e, RETURN center AS e,
r, r,
CASE WHEN elementId(src) = elementId(center) THEN tgt ELSE src END AS connected, CASE WHEN elementId(src) = elementId(center) THEN tgt ELSE src END AS connected,

View File

@@ -1,14 +1,21 @@
MATCH (center) WHERE elementId(center) = $entity_id MATCH (center)
WHERE elementId(center) = $entity_id
AND (center:Person OR center:Company OR center:Contract OR center:Sanction OR center:Election AND (center:Person OR center:Company OR center:Contract OR center:Sanction OR center:Election
OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education
OR center:Convenio OR center:LaborStats OR center:PublicOffice OR center:Convenio OR center:LaborStats OR center:PublicOffice
OR center:OffshoreEntity OR center:OffshoreOfficer OR center:GlobalPEP OR center:OffshoreEntity OR center:OffshoreOfficer OR center:GlobalPEP
OR center:CVMProceeding OR center:Expense) OR center:CVMProceeding OR center:Expense)
CALL apoc.path.subgraphAll(center, { OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU*1..4]-(n)
relationshipFilter: "SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU", WHERE length(p) <= $depth
labelFilter: $label_filter, AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag))
maxLevel: $depth, WITH center, collect(p) AS paths
limit: 200 WITH center,
}) reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes,
YIELD nodes, relationships reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels
RETURN nodes, relationships, elementId(center) AS center_id UNWIND raw_nodes AS n
WITH center, collect(DISTINCT n) AS nodes, raw_rels
UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r
WITH center, nodes, collect(DISTINCT r) AS rels
RETURN nodes,
[x IN rels WHERE x IS NOT NULL] AS relationships,
elementId(center) AS center_id

View File

@@ -1,6 +1,4 @@
MATCH (i:Investigation) MATCH (i:Investigation {share_token: $token})
WHERE i.share_token = $token
AND (i.share_expires_at IS NULL OR i.share_expires_at > datetime())
OPTIONAL MATCH (i)-[:INCLUDES]->(e) OPTIONAL MATCH (i)-[:INCLUDES]->(e)
WITH i, collect(coalesce(e.cpf, e.cnpj, e.contract_id, e.sanction_id, e.amendment_id, e.cnes_code, e.finance_id, e.embargo_id, e.school_id, e.convenio_id, e.stats_id, elementId(e))) AS eids WITH i, collect(coalesce(e.cpf, e.cnpj, e.contract_id, e.sanction_id, e.amendment_id, e.cnes_code, e.finance_id, e.embargo_id, e.school_id, e.convenio_id, e.stats_id, elementId(e))) AS eids
RETURN i.id AS id, RETURN i.id AS id,
@@ -9,5 +7,4 @@ RETURN i.id AS id,
i.created_at AS created_at, i.created_at AS created_at,
i.updated_at AS updated_at, i.updated_at AS updated_at,
i.share_token AS share_token, i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids [x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -4,8 +4,7 @@ CREATE (i:Investigation {
description: $description, description: $description,
created_at: datetime(), created_at: datetime(),
updated_at: datetime(), updated_at: datetime(),
share_token: null, share_token: null
share_expires_at: null
}) })
WITH i WITH i
MATCH (u:User {id: $user_id}) MATCH (u:User {id: $user_id})
@@ -16,5 +15,4 @@ RETURN i.id AS id,
i.created_at AS created_at, i.created_at AS created_at,
i.updated_at AS updated_at, i.updated_at AS updated_at,
i.share_token AS share_token, i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[] AS entity_ids [] AS entity_ids

View File

@@ -7,5 +7,4 @@ RETURN i.id AS id,
i.created_at AS created_at, i.created_at AS created_at,
i.updated_at AS updated_at, i.updated_at AS updated_at,
i.share_token AS share_token, i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids [x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -13,5 +13,4 @@ RETURN total,
i.created_at AS created_at, i.created_at AS created_at,
i.updated_at AS updated_at, i.updated_at AS updated_at,
i.share_token AS share_token, i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids [x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -1,7 +1,5 @@
MATCH (u:User {id: $user_id})-[:OWNS]->(i:Investigation {id: $id}) MATCH (u:User {id: $user_id})-[:OWNS]->(i:Investigation {id: $id})
SET i.share_token = $share_token, SET i.share_token = $share_token,
i.share_expires_at = $share_expires_at,
i.updated_at = datetime() i.updated_at = datetime()
RETURN i.id AS id, RETURN i.id AS id,
i.share_token AS share_token, i.share_token AS share_token
i.share_expires_at AS share_expires_at

View File

@@ -11,5 +11,4 @@ RETURN i.id AS id,
i.created_at AS created_at, i.created_at AS created_at,
i.updated_at AS updated_at, i.updated_at AS updated_at,
i.share_token AS share_token, i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids [x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -1,5 +1,6 @@
MATCH (n) WHERE elementId(n) = $entity_id MATCH (n)
WHERE elementId(n) = $entity_id
AND (n:Person OR n:Company OR n:Contract OR n:Sanction OR n:Election AND (n:Person OR n:Company OR n:Contract OR n:Sanction OR n:Election
OR n:Amendment OR n:Finance OR n:Embargo OR n:Health OR n:Education OR n:Amendment OR n:Finance OR n:Embargo OR n:Health OR n:Education
OR n:Convenio OR n:LaborStats OR n:PublicOffice) OR n:Convenio OR n:LaborStats OR n:PublicOffice)
RETURN apoc.node.degree(n) AS degree RETURN COUNT { (n)--() } AS degree

View File

@@ -2,11 +2,31 @@ MATCH (center:Company)
WHERE elementId(center) = $company_id WHERE elementId(center) = $company_id
OR center.cnpj = $company_identifier OR center.cnpj = $company_identifier
OR center.cnpj = $company_identifier_formatted OR center.cnpj = $company_identifier_formatted
CALL apoc.path.subgraphAll(center, { OPTIONAL MATCH p=(center)-[:SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU*1..4]-(n)
relationshipFilter: "SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU", WHERE length(p) <= $depth
labelFilter: "+Company|+Contract|+Sanction|+Finance|+Amendment|+Convenio|+Bid|+MunicipalContract|+MunicipalBid|-Person|-Partner|-User|-Investigation|-Annotation|-Tag", AND all(
maxLevel: $depth, x IN nodes(p)
limit: 200 WHERE NOT (
}) "Person" IN labels(x)
YIELD nodes, relationships OR "Partner" IN labels(x)
RETURN nodes, relationships, elementId(center) AS center_id OR "User" IN labels(x)
OR "Investigation" IN labels(x)
OR "Annotation" IN labels(x)
OR "Tag" IN labels(x)
)
)
AND (
n:Company OR n:Contract OR n:Sanction OR n:Finance OR n:Amendment OR n:Convenio
OR n:Bid OR n:MunicipalContract OR n:MunicipalBid OR n IS NULL
)
WITH center, collect(p) AS paths
WITH center,
reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes,
reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels
UNWIND raw_nodes AS n
WITH center, collect(DISTINCT n) AS nodes, raw_rels
UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r
WITH center, nodes, collect(DISTINCT r) AS rels
RETURN nodes,
[x IN rels WHERE x IS NOT NULL] AS relationships,
elementId(center) AS center_id

View File

@@ -1,4 +1,4 @@
// BRACC Neo4j Schema — Constraints and Indexes // BR-ACC Neo4j Schema — Constraints and Indexes
// Applied on database initialization // Applied on database initialization
// ── Uniqueness Constraints ────────────────────────────── // ── Uniqueness Constraints ──────────────────────────────

View File

@@ -6,6 +6,7 @@ from neo4j import AsyncSession
from bracc.dependencies import get_session from bracc.dependencies import get_session
from bracc.models.baseline import BaselineResponse from bracc.models.baseline import BaselineResponse
from bracc.services.baseline_service import BASELINE_QUERIES, run_all_baselines, run_baseline from bracc.services.baseline_service import BASELINE_QUERIES, run_all_baselines, run_baseline
from bracc.services.public_guard import enforce_entity_lookup_enabled
router = APIRouter(prefix="/api/v1/baseline", tags=["baseline"]) router = APIRouter(prefix="/api/v1/baseline", tags=["baseline"])
@@ -16,6 +17,7 @@ async def get_baseline_for_entity(
session: Annotated[AsyncSession, Depends(get_session)], session: Annotated[AsyncSession, Depends(get_session)],
dimension: Annotated[str | None, Query()] = None, dimension: Annotated[str | None, Query()] = None,
) -> BaselineResponse: ) -> BaselineResponse:
enforce_entity_lookup_enabled()
if dimension: if dimension:
if dimension not in BASELINE_QUERIES: if dimension not in BASELINE_QUERIES:
available = list(BASELINE_QUERIES.keys()) available = list(BASELINE_QUERIES.keys())

View File

@@ -182,7 +182,7 @@ async def get_entity_timeline(
date=event_date, date=event_date,
label=str(label), label=str(label),
entity_type=entity_type, entity_type=entity_type,
properties=sanitize_props(props), properties=sanitize_public_properties(sanitize_props(props)),
sources=[SourceAttribution(database="neo4j_graph")], sources=[SourceAttribution(database="neo4j_graph")],
)) ))

View File

@@ -311,7 +311,7 @@ async def export_investigation_pdf(
cpf_val = node.get("cpf") cpf_val = node.get("cpf")
if cpf_val and isinstance(cpf_val, str): if cpf_val and isinstance(cpf_val, str):
role = str(node.get("role", node.get("cargo", ""))).lower() role = str(node.get("role", node.get("cargo", ""))).lower()
is_pep = role in PEP_ROLES is_pep = any(kw in role for kw in PEP_ROLES)
if not is_pep: if not is_pep:
if "." in document and "-" in document: if "." in document and "-" in document:
document = mask_formatted_cpf(document) document = mask_formatted_cpf(document)

View File

@@ -6,6 +6,7 @@ from neo4j import AsyncSession
from bracc.dependencies import get_session from bracc.dependencies import get_session
from bracc.services.neo4j_service import execute_query_single from bracc.services.neo4j_service import execute_query_single
from bracc.services.public_guard import should_hide_person_entities
from bracc.services.source_registry import load_source_registry, source_registry_summary from bracc.services.source_registry import load_source_registry, source_registry_summary
router = APIRouter(prefix="/api/v1/meta", tags=["meta"]) router = APIRouter(prefix="/api/v1/meta", tags=["meta"])
@@ -40,7 +41,9 @@ async def database_stats(
result = { result = {
"total_nodes": record["total_nodes"] if record else 0, "total_nodes": record["total_nodes"] if record else 0,
"total_relationships": record["total_relationships"] if record else 0, "total_relationships": record["total_relationships"] if record else 0,
"person_count": record["person_count"] if record else 0, "person_count": (
0 if should_hide_person_entities() else (record["person_count"] if record else 0)
),
"company_count": record["company_count"] if record else 0, "company_count": record["company_count"] if record else 0,
"health_count": record["health_count"] if record else 0, "health_count": record["health_count"] if record else 0,
"finance_count": record["finance_count"] if record else 0, "finance_count": record["finance_count"] if record else 0,

View File

@@ -57,12 +57,6 @@ async def public_meta(
return { return {
"product": "World Transparency Graph", "product": "World Transparency Graph",
"mode": "public_safe", "mode": "public_safe",
"dataset_scope": {
"local_default": "demo_local",
"ingestion_mode": "byo_ingestion",
"reference_metrics": "reference_production_snapshot",
},
"metrics_as_of_utc": "2026-03-01T23:05:00Z",
"total_nodes": record["total_nodes"] if record else 0, "total_nodes": record["total_nodes"] if record else 0,
"total_relationships": record["total_relationships"] if record else 0, "total_relationships": record["total_relationships"] if record else 0,
"company_count": record["company_count"] if record else 0, "company_count": record["company_count"] if record else 0,

View File

@@ -61,9 +61,9 @@ async def search_entities(
{ {
"query": _escape_lucene(q), "query": _escape_lucene(q),
"entity_type": type_filter, "entity_type": type_filter,
"hide_person_entities": hide_person_entities,
"skip": skip, "skip": skip,
"limit": size, "limit": size,
"hide_person_entities": hide_person_entities,
}, },
) )
total_record = await execute_query_single( total_record = await execute_query_single(

View File

@@ -9,6 +9,17 @@ from testcontainers.neo4j import Neo4jContainer
from bracc.main import app from bracc.main import app
def _iter_cypher_statements(path: Path) -> list[str]:
    """Split a Cypher script file into individual statements.

    Blank lines and comment-only lines (lines whose first non-blank
    characters are ``//``) are removed *before* the text is split on ``;``.
    Doing the filtering first keeps a statement that directly follows a
    section-header comment from being discarded together with that comment.

    Args:
        path: Location of the ``.cypher`` script to read.

    Returns:
        The non-empty statements in file order, each stripped of surrounding
        whitespace and without its terminating ``;``.

    Note:
        The split is purely textual. A ``;`` inside a string literal, or a
        ``//`` comment trailing a code line, is not recognised.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default:
    # the schema script's header comments contain non-ASCII characters.
    script = path.read_text(encoding="utf-8")
    code_lines = [
        line
        for line in script.splitlines()
        if line.strip() and not line.strip().startswith("//")
    ]
    return [stmt.strip() for stmt in "\n".join(code_lines).split(";") if stmt.strip()]
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def neo4j_container() -> Neo4jContainer: # type: ignore[misc] def neo4j_container() -> Neo4jContainer: # type: ignore[misc]
"""Start a Neo4j container for integration tests.""" """Start a Neo4j container for integration tests."""
@@ -25,32 +36,43 @@ def neo4j_uri(neo4j_container: Neo4jContainer) -> str:
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def neo4j_auth(neo4j_container: Neo4jContainer) -> tuple[str, str]: def neo4j_auth(neo4j_container: Neo4jContainer) -> tuple[str, str]:
return ("neo4j", neo4j_container.NEO4J_ADMIN_PASSWORD) # testcontainers.neo4j API changed: older versions exposed NEO4J_ADMIN_PASSWORD,
# newer versions expose username/password attributes.
username = getattr(neo4j_container, "username", "neo4j")
password = getattr(
neo4j_container,
"password",
getattr(neo4j_container, "NEO4J_ADMIN_PASSWORD", None),
)
if password is None:
msg = "Could not resolve Neo4j testcontainer password"
raise RuntimeError(msg)
return (username, password)
@pytest.fixture(scope="session") @pytest.fixture
async def neo4j_driver( async def neo4j_driver(
neo4j_uri: str, neo4j_auth: tuple[str, str] neo4j_uri: str, neo4j_auth: tuple[str, str]
) -> AsyncIterator[AsyncDriver]: ) -> AsyncIterator[AsyncDriver]:
# Function-scoped driver avoids loop affinity issues between async tests.
driver = AsyncGraphDatabase.driver(neo4j_uri, auth=neo4j_auth) driver = AsyncGraphDatabase.driver(neo4j_uri, auth=neo4j_auth)
async with driver.session() as session:
# Keep tests deterministic across function scope by resetting test data.
await session.run("MATCH (n) DETACH DELETE n")
# Apply schema # Apply schema
schema_path = Path(__file__).parent.parent.parent.parent / "infra" / "neo4j" / "init.cypher" schema_path = Path(__file__).parent.parent.parent.parent / "infra" / "neo4j" / "init.cypher"
if schema_path.exists(): if schema_path.exists():
async with driver.session() as session: async with driver.session() as session:
for statement in schema_path.read_text().split(";"): for stmt in _iter_cypher_statements(schema_path):
stmt = statement.strip() await session.run(stmt)
if stmt and not stmt.startswith("//"):
await session.run(stmt)
# Seed dev data # Seed dev data
seed_path = ( seed_path = (
Path(__file__).parent.parent.parent.parent / "infra" / "scripts" / "seed-dev.cypher" Path(__file__).parent.parent.parent.parent / "infra" / "scripts" / "seed-dev.cypher"
) )
if seed_path.exists(): if seed_path.exists():
async with driver.session() as session: async with driver.session() as session:
for statement in seed_path.read_text().split(";"): for stmt in _iter_cypher_statements(seed_path):
stmt = statement.strip() await session.run(stmt)
if stmt and not stmt.startswith("//"):
await session.run(stmt)
yield driver yield driver
await driver.close() await driver.close()

View File

@@ -34,7 +34,11 @@ def _setup_mock_session(driver: MagicMock, records: list[MagicMock]) -> AsyncMoc
@pytest.mark.anyio @pytest.mark.anyio
async def test_register_success(client: AsyncClient) -> None: async def test_register_success(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None:
from bracc.config import settings
monkeypatch.setattr(settings, "invite_code", "")
record = _mock_record({ record = _mock_record({
"id": "user-uuid", "id": "user-uuid",
"email": "test@example.com", "email": "test@example.com",
@@ -56,19 +60,15 @@ async def test_register_success(client: AsyncClient) -> None:
@pytest.mark.anyio @pytest.mark.anyio
async def test_register_bad_invite(client: AsyncClient) -> None: async def test_register_bad_invite(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None:
from bracc.config import settings from bracc.config import settings
original = settings.invite_code monkeypatch.setattr(settings, "invite_code", "secret-code")
try: response = await client.post(
settings.invite_code = "secret-code" "/api/v1/auth/register",
response = await client.post( json={"email": "test@example.com", "password": "password123", "invite_code": "wrong"},
"/api/v1/auth/register", )
json={"email": "test@example.com", "password": "password123", "invite_code": "wrong"}, assert response.status_code == 403
)
assert response.status_code == 403
finally:
settings.invite_code = original
@pytest.mark.anyio @pytest.mark.anyio
@@ -155,16 +155,25 @@ async def test_me_invalid_token(client: AsyncClient) -> None:
@pytest.mark.anyio @pytest.mark.anyio
async def test_register_duplicate_email(client: AsyncClient) -> None: async def test_register_duplicate_email(
client: AsyncClient,
monkeypatch: pytest.MonkeyPatch,
) -> None:
from neo4j.exceptions import ConstraintError
from bracc.config import settings
from bracc.main import app from bracc.main import app
monkeypatch.setattr(settings, "invite_code", "")
driver = app.state.neo4j_driver driver = app.state.neo4j_driver
mock_session = AsyncMock() mock_session = AsyncMock()
mock_session.run = AsyncMock(side_effect=Exception("Constraint violation")) mock_session.run = AsyncMock(side_effect=ConstraintError("Node already exists"))
driver.session.return_value.__aenter__ = AsyncMock(return_value=mock_session) driver.session.return_value.__aenter__ = AsyncMock(return_value=mock_session)
with pytest.raises(Exception, match="Constraint violation"): response = await client.post(
await client.post( "/api/v1/auth/register",
"/api/v1/auth/register", json={"email": "duplicate@example.com", "password": "password123"},
json={"email": "duplicate@example.com", "password": "password123"}, )
) assert response.status_code == 409
assert response.json()["detail"] == "Email already registered"

View File

@@ -61,7 +61,9 @@ def test_decode_access_token_invalid() -> None:
@pytest.mark.anyio @pytest.mark.anyio
async def test_register_user_success() -> None: async def test_register_user_success(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(settings, "invite_code", "")
mock_record = _mock_record({ mock_record = _mock_record({
"id": "user-uuid", "id": "user-uuid",
"email": "test@example.com", "email": "test@example.com",
@@ -80,15 +82,11 @@ async def test_register_user_success() -> None:
@pytest.mark.anyio @pytest.mark.anyio
async def test_register_user_bad_invite() -> None: async def test_register_user_bad_invite(monkeypatch: pytest.MonkeyPatch) -> None:
original = settings.invite_code monkeypatch.setattr(settings, "invite_code", "secret-code")
try: session = AsyncMock()
settings.invite_code = "secret-code" with pytest.raises(ValueError, match="Invalid invite code"):
session = AsyncMock() await register_user(session, "test@example.com", "password123", "wrong-code")
with pytest.raises(ValueError, match="Invalid invite code"):
await register_user(session, "test@example.com", "password123", "wrong-code")
finally:
settings.invite_code = original
@pytest.mark.anyio @pytest.mark.anyio

View File

@@ -68,6 +68,28 @@ class TestIsPepRecord:
def test_cargo_field(self) -> None: def test_cargo_field(self) -> None:
assert _is_pep_record({"name": "X", "cpf": "11111111111", "cargo": "Deputado"}) assert _is_pep_record({"name": "X", "cpf": "11111111111", "cargo": "Deputado"})
@pytest.mark.parametrize(
    "role",
    [
        "Deputado Federal",
        "deputado federal",
        "DEPUTADO FEDERAL",
        "Senador da Republica",
        "senadora da republica",
        "Vereador Suplente",
        "Ministro de Estado",
        "Governadora do Estado de Sao Paulo",
        "Presidente da Republica",
    ],
)
def test_compound_role_detected_as_pep(self, role: str) -> None:
    """A multi-word ``role`` value must still classify the record as PEP."""
    record = {"name": "X", "cpf": "11111111111", "role": role}
    assert _is_pep_record(record)
def test_compound_cargo_detected_as_pep(self) -> None:
    """A multi-word ``cargo`` value must still classify the record as PEP."""
    record = {"name": "X", "cpf": "11111111111", "cargo": "Deputado Federal"}
    assert _is_pep_record(record)
def test_non_pep_role(self) -> None: def test_non_pep_role(self) -> None:
assert not _is_pep_record({"name": "X", "cpf": "11111111111", "role": "assessor"}) assert not _is_pep_record({"name": "X", "cpf": "11111111111", "role": "assessor"})
@@ -99,6 +121,18 @@ class TestCollectPepCpfs:
data = {"a": {"b": {"c": [{"cpf": "33333333333", "is_pep": True}]}}} data = {"a": {"b": {"c": [{"cpf": "33333333333", "is_pep": True}]}}}
assert "33333333333" in _collect_pep_cpfs(data) assert "33333333333" in _collect_pep_cpfs(data)
def test_compound_role_collected(self) -> None:
    """Only the CPF whose record has a compound PEP role is collected."""
    pep_entry = {"cpf": "11111111111", "role": "Deputado Federal"}
    non_pep_entry = {"cpf": "22222222222", "role": "assessor parlamentar"}
    collected = _collect_pep_cpfs({"results": [pep_entry, non_pep_entry]})
    assert "11111111111" in collected
    assert "22222222222" not in collected
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Unit tests for mask_cpfs_in_json # Unit tests for mask_cpfs_in_json
@@ -205,4 +239,4 @@ async def test_health_not_masked(client: AsyncClient) -> None:
"""Non-CPF JSON responses pass through unchanged.""" """Non-CPF JSON responses pass through unchanged."""
resp = await client.get("/health") resp = await client.get("/health")
assert resp.status_code == 200 assert resp.status_code == 200
assert resp.json()["status"] == "ok" and "version" in resp.json() assert resp.json() == {"status": "ok"}

View File

@@ -8,9 +8,7 @@ from httpx import AsyncClient
async def test_health_returns_ok(client: AsyncClient) -> None: async def test_health_returns_ok(client: AsyncClient) -> None:
response = await client.get("/health") response = await client.get("/health")
assert response.status_code == 200 assert response.status_code == 200
data = response.json() assert response.json() == {"status": "ok"}
assert data["status"] == "ok"
assert "version" in data
assert response.headers["x-content-type-options"] == "nosniff" assert response.headers["x-content-type-options"] == "nosniff"
assert response.headers["x-frame-options"] == "DENY" assert response.headers["x-frame-options"] == "DENY"
assert response.headers["referrer-policy"] == "no-referrer" assert response.headers["referrer-policy"] == "no-referrer"

View File

@@ -1,120 +0,0 @@
from unittest.mock import AsyncMock, patch
import pytest
from httpx import AsyncClient
from bracc.config import settings
from bracc.models.pattern import PATTERN_METADATA
from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES
from bracc.services.neo4j_service import CypherLoader
@pytest.fixture(autouse=True)
def _enable_patterns(monkeypatch: pytest.MonkeyPatch) -> None:
    """Set ``settings.patterns_enabled`` to ``True`` for every test in this module."""
    monkeypatch.setattr(settings, "patterns_enabled", True)
def test_all_community_patterns_have_metadata() -> None:
    """Each id in ``COMMUNITY_PATTERN_IDS`` must be a key of ``PATTERN_METADATA``."""
    for pattern_id in COMMUNITY_PATTERN_IDS:
        has_metadata = pattern_id in PATTERN_METADATA
        assert has_metadata, f"Missing metadata for {pattern_id}"
def test_all_community_patterns_have_query_files() -> None:
    """Every mapped query name must be loadable through ``CypherLoader``."""
    for name in COMMUNITY_PATTERN_QUERIES.values():
        try:
            CypherLoader.load(name)
        except FileNotFoundError:
            pytest.fail(f"Missing .cypher file for query {name}.cypher")
        finally:
            # Clear the loader cache after each attempt, whether or not it succeeded.
            CypherLoader.clear_cache()
def test_pattern_metadata_has_required_fields() -> None:
    """Each metadata entry must carry the pt/en name and description keys."""
    required_fields = ("name_pt", "name_en", "desc_pt", "desc_en")
    for pid, meta in PATTERN_METADATA.items():
        for field in required_fields:
            assert field in meta, f"{pid} missing {field}"
@pytest.mark.anyio
async def test_list_patterns_endpoint(client: AsyncClient) -> None:
    """The listing endpoint returns exactly the eight community patterns."""
    resp = await client.get("/api/v1/patterns/")
    assert resp.status_code == 200

    body = resp.json()
    assert "patterns" in body
    listed = body["patterns"]
    assert len(listed) == 8
    assert {entry["id"] for entry in listed} == set(COMMUNITY_PATTERN_IDS)
@pytest.mark.anyio
async def test_patterns_endpoint_returns_503_when_disabled(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``patterns_enabled`` off, the listing endpoint answers 503."""
    monkeypatch.setattr(settings, "patterns_enabled", False)

    resp = await client.get("/api/v1/patterns/")

    assert resp.status_code == 503
    detail = resp.json()["detail"]
    assert "temporarily unavailable" in detail
@pytest.mark.anyio
async def test_invalid_pattern_returns_404(client: AsyncClient) -> None:
    """Requesting an unknown pattern name yields a 404 with an explanatory detail."""
    resp = await client.get("/api/v1/patterns/test-id/nonexistent_pattern")
    assert resp.status_code == 404
    detail = resp.json()["detail"]
    assert "Pattern not found" in detail
@pytest.mark.anyio
async def test_patterns_endpoint_forwards_include_probable(client: AsyncClient) -> None:
    """``include_probable=true`` on the URL reaches ``run_all_patterns`` as a keyword."""
    target = "bracc.routers.patterns.run_all_patterns"
    with patch(target, new_callable=AsyncMock) as run_all:
        run_all.return_value = []
        resp = await client.get("/api/v1/patterns/test-id?include_probable=true")

    assert resp.status_code == 200
    run_all.assert_awaited_once()
    call = run_all.await_args
    # Exactly three positional arguments are expected; only the entity id is checked.
    _driver, entity_id, _lang = call.args
    assert entity_id == "test-id"
    assert call.kwargs["include_probable"] is True
@pytest.mark.anyio
async def test_specific_pattern_endpoint_forwards_include_probable(client: AsyncClient) -> None:
    """``include_probable=true`` on a single-pattern URL reaches ``run_pattern``."""
    target = "bracc.routers.patterns.run_pattern"
    with patch(target, new_callable=AsyncMock) as run_one:
        run_one.return_value = []
        resp = await client.get(
            "/api/v1/patterns/test-id/debtor_contracts?include_probable=true",
        )

    assert resp.status_code == 200
    run_one.assert_awaited_once()
    call = run_one.await_args
    # Exactly four positional arguments are expected; session and language are ignored.
    _session, pattern_name, entity_id, _lang = call.args
    assert pattern_name == "debtor_contracts"
    assert entity_id == "test-id"
    assert call.kwargs["include_probable"] is True
def test_community_queries_use_bind_params() -> None:
    """Each community query references the company bind parameters and no ``${``."""
    required_params = (
        "$company_id",
        "$company_identifier",
        "$company_identifier_formatted",
    )
    for query_name in COMMUNITY_PATTERN_QUERIES.values():
        try:
            cypher = CypherLoader.load(query_name)
        finally:
            CypherLoader.clear_cache()

        for param in required_params:
            assert param in cypher, f"{query_name}.cypher missing {param}"
        assert "${" not in cypher, f"{query_name}.cypher uses unsafe string interpolation"
def test_no_banned_words_in_pattern_metadata() -> None:
    """No metadata value may contain any banned word, compared case-insensitively."""
    banned = {"suspicious", "corrupt", "criminal", "fraudulent", "illegal", "guilty"}
    for pid, meta in PATTERN_METADATA.items():
        for key, value in meta.items():
            lowered = value.lower()
            for word in banned:
                assert word not in lowered, (
                    f"Banned word '{word}' in {pid}.{key}: {value}"
                )

View File

@@ -1,79 +0,0 @@
"""Community public-safe pattern registry and query contract tests."""
import pytest
from bracc.models.pattern import PATTERN_METADATA
from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES
from bracc.services.neo4j_service import CypherLoader
def test_community_pattern_registry_exact_ids() -> None:
    """The registry holds exactly the eight expected pattern ids."""
    expected_ids = {
        "sanctioned_still_receiving",
        "amendment_beneficiary_contracts",
        "split_contracts_below_threshold",
        "contract_concentration",
        "embargoed_receiving",
        "debtor_contracts",
        "srp_multi_org_hitchhiking",
        "inexigibility_recurrence",
    }
    assert len(COMMUNITY_PATTERN_IDS) == 8
    assert set(COMMUNITY_PATTERN_IDS) == expected_ids
def test_community_pattern_query_mapping_is_complete() -> None:
    """The query mapping has one key per pattern id and ``public_pattern_`` names."""
    mapped_ids = set(COMMUNITY_PATTERN_QUERIES.keys())
    assert mapped_ids == set(COMMUNITY_PATTERN_IDS)
    for name in COMMUNITY_PATTERN_QUERIES.values():
        assert name.startswith("public_pattern_")
@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values())
def test_public_pattern_query_files_load(query_name: str) -> None:
    """Loading the query must not raise; the loader cache is cleared afterwards."""
    try:
        CypherLoader.load(query_name)
    finally:
        CypherLoader.clear_cache()
@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values())
def test_public_pattern_query_required_return_aliases(query_name: str) -> None:
    """The query text must contain every required ``AS <alias>`` fragment."""
    required_aliases = (
        " AS pattern_id",
        " AS risk_signal",
        " AS amount_total",
        " AS window_start",
        " AS window_end",
        " AS evidence_refs",
        " AS evidence_count",
    )
    try:
        cypher = CypherLoader.load(query_name)
    finally:
        CypherLoader.clear_cache()

    for alias in required_aliases:
        assert alias in cypher, f"{query_name}.cypher missing alias: {alias}"
@pytest.mark.parametrize("pattern_id", COMMUNITY_PATTERN_IDS)
def test_community_pattern_metadata_is_present(pattern_id: str) -> None:
    """The pattern has a metadata entry with truthy pt/en name and description."""
    entry = PATTERN_METADATA.get(pattern_id)
    assert entry is not None
    for field in ("name_pt", "name_en", "desc_pt", "desc_en"):
        assert entry.get(field)
def test_threshold_params_used_in_threshold_patterns() -> None:
    """Each threshold-based query must reference its own threshold bind parameter."""
    expected_param_by_query = {
        "public_pattern_split_contracts_below_threshold": "$pattern_split_threshold_value",
        "public_pattern_contract_concentration": "$pattern_share_threshold",
        "public_pattern_srp_multi_org_hitchhiking": "$pattern_srp_min_orgs",
        "public_pattern_inexigibility_recurrence": "$pattern_inexig_min_recurrence",
    }
    for name, param in expected_param_by_query.items():
        try:
            text = CypherLoader.load(name)
        finally:
            CypherLoader.clear_cache()
        assert param in text, f"{name}.cypher missing {param}"

View File

@@ -225,6 +225,135 @@ async def test_public_graph_company_filters_person_nodes(client: AsyncClient) ->
assert len(payload["edges"]) == 0 assert len(payload["edges"]) == 0
@pytest.mark.anyio
async def test_baseline_disabled_in_public_mode(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The baseline endpoint answers 403 in public mode with entity lookup off."""
    monkeypatch.setattr(settings, "public_mode", True)
    monkeypatch.setattr(settings, "public_allow_entity_lookup", False)

    resp = await client.get("/api/v1/baseline/test-id")

    assert resp.status_code == 403
    detail = resp.json()["detail"]
    assert "disabled in public mode" in detail
@pytest.mark.anyio
async def test_stats_hides_person_count_in_public_mode(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``/meta/stats`` reports ``person_count`` as 0 when persons are hidden.

    The query result and the source-registry helpers are patched, so the
    response is built only from the fake record below.
    """
    import bracc.routers.meta as meta_mod

    monkeypatch.setattr(settings, "public_mode", True)
    monkeypatch.setattr(settings, "public_allow_person", False)
    # Reset the module-level stats cache so the patched query result is used.
    monkeypatch.setattr(meta_mod, "_stats_cache", None)

    stats_row = {
        "total_nodes": 100,
        "total_relationships": 200,
        "person_count": 999,
        "company_count": 50,
        "health_count": 10,
        "finance_count": 5,
        "contract_count": 20,
        "sanction_count": 3,
        "election_count": 7,
        "amendment_count": 4,
        "embargo_count": 2,
        "education_count": 6,
        "convenio_count": 8,
        "laborstats_count": 9,
        "offshore_entity_count": 1,
        "offshore_officer_count": 2,
        "global_pep_count": 3,
        "cvm_proceeding_count": 4,
        "expense_count": 11,
        "pep_record_count": 12,
        "expulsion_count": 13,
        "leniency_count": 14,
        "international_sanction_count": 15,
        "gov_card_expense_count": 16,
        "gov_travel_count": 17,
        "bid_count": 18,
        "fund_count": 19,
        "dou_act_count": 20,
        "tax_waiver_count": 21,
        "municipal_finance_count": 22,
        "declared_asset_count": 23,
        "party_membership_count": 24,
        "barred_ngo_count": 25,
        "bcb_penalty_count": 26,
        "labor_movement_count": 27,
        "legal_case_count": 28,
        "judicial_case_count": 29,
        "source_document_count": 30,
        "ingestion_run_count": 31,
        "temporal_violation_count": 32,
        "cpi_count": 33,
        "inquiry_requirement_count": 34,
        "inquiry_session_count": 35,
        "municipal_bid_count": 36,
        "municipal_contract_count": 37,
        "municipal_gazette_act_count": 38,
    }
    zeroed_summary = {
        "universe_v1_sources": 0,
        "implemented_sources": 0,
        "loaded_sources": 0,
        "healthy_sources": 0,
        "stale_sources": 0,
        "blocked_external_sources": 0,
        "quality_fail_sources": 0,
        "discovered_uningested_sources": 0,
    }

    query_patch = patch(
        "bracc.routers.meta.execute_query_single",
        new_callable=AsyncMock,
        return_value=stats_row,
    )
    registry_patch = patch(
        "bracc.routers.meta.load_source_registry",
        return_value=[],
    )
    summary_patch = patch(
        "bracc.routers.meta.source_registry_summary",
        return_value=zeroed_summary,
    )
    with query_patch, registry_patch, summary_patch:
        resp = await client.get("/api/v1/meta/stats")

    assert resp.status_code == 200
    body = resp.json()
    assert body["person_count"] == 0
    assert body["company_count"] == 50  # non-person counts preserved
@pytest.mark.anyio
async def test_timeline_sanitizes_properties_in_public_mode(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Timeline events drop ``cpf`` from their properties in public mode."""
    monkeypatch.setattr(settings, "public_mode", True)
    monkeypatch.setattr(settings, "public_allow_entity_lookup", True)

    contract_event = {
        "lbls": ["Contract"],
        "props": {"type": "licitacao", "cpf": "12345678900", "value": 50000.0},
        "event_date": "2024-01-15",
        "id": "evt-1",
    }
    query_patch = patch(
        "bracc.routers.entity.execute_query",
        new_callable=AsyncMock,
        return_value=[contract_event],
    )
    with query_patch:
        resp = await client.get("/api/v1/entity/test-id/timeline")

    assert resp.status_code == 200
    events = resp.json()["events"]
    assert len(events) == 1
    props = events[0]["properties"]
    assert "cpf" not in props
    assert props["value"] == 50000.0
@pytest.mark.anyio @pytest.mark.anyio
async def test_investigations_disabled_in_public_mode( async def test_investigations_disabled_in_public_mode(
client: AsyncClient, client: AsyncClient,

View File

@@ -1,24 +1,15 @@
from unittest.mock import MagicMock from unittest.mock import MagicMock
from bracc.config import settings
from bracc.middleware.rate_limit import _get_rate_limit_key, limiter from bracc.middleware.rate_limit import _get_rate_limit_key, limiter
from bracc.services.auth_service import create_access_token from bracc.services.auth_service import create_access_token
def _make_request( def _make_request(auth_header: str | None = None, client_ip: str = "127.0.0.1") -> MagicMock:
auth_header: str | None = None,
client_ip: str = "127.0.0.1",
cookie_token: str | None = None,
x_forwarded_for: str | None = None,
) -> MagicMock:
request = MagicMock() request = MagicMock()
headers: dict[str, str] = {} headers: dict[str, str] = {}
if auth_header: if auth_header:
headers["authorization"] = auth_header headers["authorization"] = auth_header
if x_forwarded_for:
headers["x-forwarded-for"] = x_forwarded_for
request.headers = headers request.headers = headers
request.cookies = {settings.auth_cookie_name: cookie_token} if cookie_token else {}
request.client = MagicMock() request.client = MagicMock()
request.client.host = client_ip request.client.host = client_ip
return request return request
@@ -43,23 +34,5 @@ def test_key_func_invalid_token_fallback() -> None:
assert key == "10.0.0.1" assert key == "10.0.0.1"
def test_key_func_extracts_user_from_cookie_token() -> None:
token = create_access_token("cookie-user-1")
request = _make_request(cookie_token=token)
key = _get_rate_limit_key(request)
assert key == "user:cookie-user-1"
def test_key_func_uses_forwarded_ip_when_enabled() -> None:
original = settings.trust_proxy_headers
try:
settings.trust_proxy_headers = True
request = _make_request(client_ip="127.0.0.1", x_forwarded_for="203.0.113.9, 10.0.0.4")
key = _get_rate_limit_key(request)
assert key == "203.0.113.9"
finally:
settings.trust_proxy_headers = original
def test_limiter_instance_exists() -> None: def test_limiter_instance_exists() -> None:
assert limiter is not None assert limiter is not None

View File

@@ -1,21 +1,6 @@
import pytest import pytest
from httpx import AsyncClient from httpx import AsyncClient
from bracc.routers.search import _escape_lucene
def test_escape_lucene_cnpj() -> None:
assert _escape_lucene("00.000.000/0001-00") == "00.000.000\\/0001\\-00"
def test_escape_lucene_plain_text() -> None:
assert _escape_lucene("silva construcoes") == "silva construcoes"
def test_escape_lucene_all_special_chars() -> None:
for ch in r'+-&|!(){}[]^"~*?:\/':
assert f"\\{ch}" in _escape_lucene(ch)
@pytest.mark.anyio @pytest.mark.anyio
async def test_search_rejects_short_query(client: AsyncClient) -> None: async def test_search_rejects_short_query(client: AsyncClient) -> None:

100
api/uv.lock generated
View File

@@ -103,6 +103,56 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" }, { url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
] ]
[[package]]
name = "bracc-api"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "bcrypt" },
{ name = "fastapi" },
{ name = "jinja2" },
{ name = "neo4j" },
{ name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "pyjwt", extra = ["crypto"] },
{ name = "python-multipart" },
{ name = "slowapi" },
{ name = "uvicorn", extra = ["standard"] },
{ name = "weasyprint" },
]
[package.optional-dependencies]
dev = [
{ name = "httpx" },
{ name = "mypy" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "ruff" },
{ name = "testcontainers", extra = ["neo4j"] },
]
[package.metadata]
requires-dist = [
{ name = "bcrypt", specifier = ">=4.0.0" },
{ name = "fastapi", specifier = ">=0.115.0" },
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
{ name = "jinja2", specifier = ">=3.1.0" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" },
{ name = "neo4j", specifier = ">=5.27.0" },
{ name = "pydantic", specifier = ">=2.10.0" },
{ name = "pydantic-settings", specifier = ">=2.7.0" },
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
{ name = "python-multipart", specifier = ">=0.0.18" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
{ name = "slowapi", specifier = ">=0.1.9" },
{ name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" },
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" },
{ name = "weasyprint", specifier = ">=62.0" },
]
provides-extras = ["dev"]
[[package]] [[package]]
name = "brotli" name = "brotli"
version = "1.2.0" version = "1.2.0"
@@ -523,56 +573,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
] ]
[[package]]
name = "bracc-api"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "bcrypt" },
{ name = "fastapi" },
{ name = "jinja2" },
{ name = "neo4j" },
{ name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "pyjwt", extra = ["crypto"] },
{ name = "python-multipart" },
{ name = "slowapi" },
{ name = "uvicorn", extra = ["standard"] },
{ name = "weasyprint" },
]
[package.optional-dependencies]
dev = [
{ name = "httpx" },
{ name = "mypy" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "ruff" },
{ name = "testcontainers", extra = ["neo4j"] },
]
[package.metadata]
requires-dist = [
{ name = "bcrypt", specifier = ">=4.0.0" },
{ name = "fastapi", specifier = ">=0.115.0" },
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
{ name = "jinja2", specifier = ">=3.1.0" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" },
{ name = "neo4j", specifier = ">=5.27.0" },
{ name = "pydantic", specifier = ">=2.10.0" },
{ name = "pydantic-settings", specifier = ">=2.7.0" },
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
{ name = "python-multipart", specifier = ">=0.0.18" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
{ name = "slowapi", specifier = ">=0.1.9" },
{ name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" },
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" },
{ name = "weasyprint", specifier = ">=62.0" },
]
provides-extras = ["dev"]
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.11" version = "3.11"

0
data/.gitkeep Normal file
View File

View File

0
data/cnpj/raw/.gitkeep Normal file
View File

View File

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.3 MiB

View File

@@ -1,24 +1,13 @@
# BRACC Data Source Catalog # ICARUS Data Source Catalog
<!-- SOURCE_SUMMARY_START --> **38 loaded | 3 pipelines pending data | 60+ not yet built**
**Generated from `docs/source_registry_br_v1.csv` (as-of UTC: 2026-03-01T23:05:00Z)** Last updated: 2026-02-26
- Universe v1 sources: 108
- Implemented pipelines: 45
- Loaded sources (load_state=loaded): 36
- Partial sources (load_state=partial): 8
- Not loaded sources (load_state=not_loaded): 64
- Status counts: loaded=36, partial=5, stale=3, blocked_external=1, not_built=63
<!-- SOURCE_SUMMARY_END -->
Catalog note: counts and status labels are generated from the public registry (`docs/source_registry_br_v1.csv`).
This document includes reference production inventory context and backlog discovery; it is not a guarantee that every listed source is currently loaded in your local environment.
--- ---
## 1. Reference Production Snapshot (Loaded/Implemented Inventory) ## 1. LOADED (38 sources)
The table below is a timestamped reference snapshot and should be interpreted together with the generated summary block above. All sources below have working ETL pipelines in `etl/src/icarus_etl/pipelines/` and are loaded into production Neo4j.
| # | Source | Pipeline | Nodes Created | Rels Created | Notes | | # | Source | Pipeline | Nodes Created | Rels Created | Notes |
|---|--------|----------|---------------|--------------|-------| |---|--------|----------|---------------|--------------|-------|

View File

@@ -1,29 +0,0 @@
# Demo Dataset Contract (WTG Open)
## Objective
Provide a reproducible, public-safe demo graph with synthetic records only.
## Safety rules
- Synthetic data only. No real CPF, no real personal names, no real personal addresses.
- Company identifiers may use synthetic CNPJ-like values reserved for demonstration.
- Demo graph cannot include `Person` or `Partner` labels.
- Demo exports must never include private or operational metadata.
## Required files
- `data/demo/synthetic_graph.json`
- `data/demo/README.md`
- `scripts/generate_demo_dataset.py`
## JSON schema (minimum)
- `nodes[]`: `{id, label, type, properties}`
- `edges[]`: `{id, source, target, type, properties}`
- `meta`: `{generated_at_utc, generator_version, source: "synthetic"}`
## Acceptance checks
- No field name contains `cpf`, `doc_partial`, or `doc_raw`.
- No node label equals `Person` or `Partner`.
- CI privacy gate passes.
## Runtime target
- Dedicated demo Neo4j instance (non-production).
- Public API served with `PUBLIC_MODE=true`.

View File

@@ -14,7 +14,6 @@ Resumo:
Release notes: {release_url} Release notes: {release_url}
Observação de integridade: os sinais refletem coocorrências em bases públicas e não constituem prova legal. Observação de integridade: os sinais refletem coocorrências em bases públicas e não constituem prova legal.
Divulgação obrigatória: o repositório público entrega engine + demo + fluxo BYO-data; métricas de escala são snapshots de referência com timestamp.
## Short post (EN) ## Short post (EN)
@@ -28,7 +27,6 @@ Summary:
Release notes: {release_url} Release notes: {release_url}
Integrity note: signals reflect co-occurrence in public records and are not legal proof. Integrity note: signals reflect co-occurrence in public records and are not legal proof.
Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; production-scale metrics are timestamped reference snapshots.
## Discord/Telegram long form (PT+EN) ## Discord/Telegram long form (PT+EN)
@@ -44,11 +42,6 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p
**Compatibilidade** **Compatibilidade**
- {pt_compat} - {pt_compat}
**Reproducibility Reality Check**
- Funciona agora: {pt_works_now}
- Requer ingestão de dados: {pt_requires_ingestion}
- Não incluído por padrão: {pt_not_included}
**Link** **Link**
- {release_url} - {release_url}
@@ -64,10 +57,5 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p
**Compatibility** **Compatibility**
- {en_compat} - {en_compat}
**Reproducibility Reality Check**
- Works now: {en_works_now}
- Requires data ingestion: {en_requires_ingestion}
- Not included by default: {en_not_included}
**Link** **Link**
- {release_url} - {release_url}

View File

@@ -7,8 +7,8 @@ docs/**,PUBLIC with review,Keep public documentation and legal pack,include revi
.github/workflows/**,PUBLIC,CI and security transparency,include .github/workflows/**,PUBLIC,CI and security transparency,include
scripts/**,PUBLIC with review,Keep public utilities and gates,include reviewed subset scripts/**,PUBLIC with review,Keep public utilities and gates,include reviewed subset
data/demo/**,PUBLIC,Synthetic demo dataset only,include data/demo/**,PUBLIC,Synthetic demo dataset only,include
api/src/bracc/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude api/src/icarus/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude
api/src/bracc/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude api/src/icarus/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude
scripts/auto_finalize_pncp_backfill.sh,REMOVE_FROM_PUBLIC,Production operational script with server-specific assumptions,exclude scripts/auto_finalize_pncp_backfill.sh,REMOVE_FROM_PUBLIC,Production operational script with server-specific assumptions,exclude
docs/shadow_rollout_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude docs/shadow_rollout_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude
docs/ingestion_priority_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude docs/ingestion_priority_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude
1 path classification reason action_for_public_repo
7 .github/workflows/** PUBLIC CI and security transparency include
8 scripts/** PUBLIC with review Keep public utilities and gates include reviewed subset
9 data/demo/** PUBLIC Synthetic demo dataset only include
10 api/src/bracc/services/pattern_service.py api/src/icarus/services/pattern_service.py REMOVE_FROM_PUBLIC Pattern engine disabled pending validation exclude
11 api/src/bracc/queries/pattern_*.cypher api/src/icarus/queries/pattern_*.cypher REMOVE_FROM_PUBLIC Pattern query engine disabled pending validation exclude
12 scripts/auto_finalize_pncp_backfill.sh REMOVE_FROM_PUBLIC Production operational script with server-specific assumptions exclude
13 docs/shadow_rollout_runbook.md REMOVE_FROM_PUBLIC Production operational runbook details exclude
14 docs/ingestion_priority_runbook.md REMOVE_FROM_PUBLIC Production operational runbook details exclude

View File

@@ -1,56 +1,78 @@
# Public Repo Release Checklist — `World-Open-Graph/br-acc` # Public Repo Release Checklist — World Transparency Graph
## 1) Pre-release gate
1. Confirm target merge commit exists on `main`.
2. Confirm CI + Security + Public gates are green on that commit.
3. Confirm PR is merged with exactly one release label.
## 2) Public boundary checks
## 1) Prepare sanitized snapshot
```bash ```bash
python scripts/check_public_privacy.py --repo-root . bash scripts/prepare_public_snapshot.sh /Users/brunoclz/CORRUPTOS /tmp/world-transparency-graph-public
python scripts/check_compliance_pack.py --repo-root .
python scripts/check_open_core_boundary.py --repo-root .
``` ```
Expected: all `PASS`. ## 2) Initialize clean-history repo from snapshot
## 3) Snapshot hygiene (optional verification)
```bash ```bash
bash scripts/prepare_public_snapshot.sh . /tmp/br-acc-public cd /tmp/world-transparency-graph-public
python /tmp/br-acc-public/scripts/check_public_privacy.py --repo-root /tmp/br-acc-public git init
python /tmp/br-acc-public/scripts/check_compliance_pack.py --repo-root /tmp/br-acc-public git add .
python /tmp/br-acc-public/scripts/check_open_core_boundary.py --repo-root /tmp/br-acc-public git commit -m "Initial public release (WTG)"
``` ```
Expected in snapshot: ## 3) Create GitHub repository (manual)
- Owner: `brunoclz`
- Name: `world-transparency-graph`
- Visibility: Public
- Do not auto-add README/License (already present)
- No `CLAUDE.md`. ## 4) Push initial release
- No `AGENTS.md` or `AGENTS*.md`. ```bash
- No private operational runbooks outside public scope. git branch -M main
git remote add origin https://github.com/brunoclz/world-transparency-graph.git
git push -u origin main
```
## 4) Publish release (manual workflow) ## 5) Configure branch protection (GitHub UI)
Require all checks:
- `API (Python)`
- `ETL (Python)`
- `Frontend (TypeScript)`
- `Neutrality Audit`
- `Gitleaks`
- `Bandit (Python)`
- `Pip Audit (Python deps)`
- `Public Privacy Gate`
- `Compliance Pack Gate`
- `Public Boundary Gate`
In GitHub Actions, run **Publish Release** with: ## 6) Configure environment defaults
- Set public deployment environment vars:
- `PRODUCT_TIER=community`
- `PUBLIC_MODE=true`
- `PUBLIC_ALLOW_PERSON=false`
- `PUBLIC_ALLOW_ENTITY_LOOKUP=false`
- `PUBLIC_ALLOW_INVESTIGATIONS=false`
- `PATTERNS_ENABLED=false`
- `VITE_PUBLIC_MODE=true`
- `VITE_PATTERNS_ENABLED=false`
- `version`: SemVer tag (e.g. `v0.3.0`, `v0.3.1-rc.1`) ## 7) Final checks before launch
- `target_sha`: merge commit on `main` - `python scripts/check_public_privacy.py --repo-root .` => `PASS`
- `prerelease`: `false` (stable) or `true` (RC) - `python scripts/check_compliance_pack.py --repo-root .` => `PASS`
- `title_pt`: release title PT-BR - `python scripts/check_open_core_boundary.py --repo-root .` => `PASS`
- `title_en`: release title EN - Confirm no internal runbooks in public repo
- Confirm demo data is synthetic (`data/demo/synthetic_graph.json`)
- Confirm all legal docs exist in root:
- `ETHICS.md`
- `LGPD.md`
- `PRIVACY.md`
- `TERMS.md`
- `DISCLAIMER.md`
- `SECURITY.md`
- `ABUSE_RESPONSE.md`
## 5) Verify outputs ## 8) Launch communication split
- Publish product announcement as **WTG**
- Publish movement announcement as **BRCC**
- Mention methodology limits and non-accusatory policy
1. Tag exists in repository. ## 9) Release system bootstrap
2. Release page published under `/releases`. - Ensure `.github/release.yml` exists for auto-notes categories.
3. Notes include PT+EN and non-accusatory disclaimer. - Ensure `.github/release-drafter.yml` + workflow are active.
4. `release_manifest.json` asset is attached. - Ensure `publish-release.yml` workflow is present and dispatchable.
5. Compare link is valid (`previous_tag...new_tag`). - Ensure release label taxonomy is documented and applied to PRs.
- Publish first policy-compliant tag from this stream (`v0.3.0`).
## 6) Community communication
1. Use `docs/release/community_announcement_template.md`.
2. Publish short PT+EN summary with release URL.
3. Keep wording factual: “signals/co-occurrence”, never accusatory language.

View File

@@ -48,11 +48,10 @@ A release can only be published from a commit on `main` where all required gates
Every release must include PT-BR and EN sections with: Every release must include PT-BR and EN sections with:
1. Scope summary. 1. Scope summary.
2. Notable changes (explicit bullet points). 2. Notable changes.
3. Included pattern IDs when release contains pattern/signal changes. 3. Compatibility/breaking notes.
4. Compatibility/breaking notes. 4. Privacy/compliance notes when applicable.
5. Privacy/compliance notes when applicable. 5. Non-accusatory disclaimer.
6. Non-accusatory disclaimer.
## Artifacts ## Artifacts

View File

@@ -37,19 +37,6 @@ For validation cycles use RC:
- `prerelease`: `true` for RC, `false` for stable - `prerelease`: `true` for RC, `false` for stable
- `title_pt`: short PT-BR title - `title_pt`: short PT-BR title
- `title_en`: short EN title - `title_en`: short EN title
- `highlights_pt`: PT highlights separated by `|`
- `highlights_en`: EN highlights separated by `|`
- `patterns_included`: comma-separated pattern IDs (use `none` when not applicable)
- `technical_changes_pt`: PT technical changes separated by `|`
- `technical_changes_en`: EN technical changes separated by `|`
Example inputs for a pattern release:
- `highlights_pt`: `Port de 8 padrões públicos factuais | Padronização de payload público`
- `highlights_en`: `Port of 8 factual public-safe patterns | Public payload standardization`
- `patterns_included`: `sanctioned_still_receiving,amendment_beneficiary_contracts,split_contracts_below_threshold,contract_concentration,embargoed_receiving,debtor_contracts,srp_multi_org_hitchhiking,inexigibility_recurrence`
- `technical_changes_pt`: `Provider community de 4 para 8 padrões | ETL criou relação Contract-REFERENTE_A-Bid`
- `technical_changes_en`: `Community provider expanded from 4 to 8 patterns | ETL created Contract-REFERENTE_A-Bid linkage`
## 4) Workflow validations performed ## 4) Workflow validations performed
@@ -65,7 +52,7 @@ The workflow blocks publication when:
On success the workflow: On success the workflow:
1. Creates and pushes an annotated tag. 1. Creates and pushes an annotated tag.
2. Creates GitHub Release (PT+EN notes) with explicit highlights, patterns, and technical changes. 2. Creates GitHub Release (PT+EN notes).
3. Uploads `release_manifest.json` asset. 3. Uploads `release_manifest.json` asset.
## 6) Post-release checklist ## 6) Post-release checklist
@@ -73,7 +60,6 @@ On success the workflow:
1. Open the release page and confirm: 1. Open the release page and confirm:
- version tag is correct, - version tag is correct,
- PT+EN notes are present, - PT+EN notes are present,
- included patterns are explicitly listed (or marked as none),
- non-accusatory disclaimer line is present, - non-accusatory disclaimer line is present,
- `release_manifest.json` is attached. - `release_manifest.json` is attached.
2. Share release link in community channels. 2. Share release link in community channels.

View File

@@ -1,67 +0,0 @@
# Source Onboarding Contract (Brazil Coverage v1)
This contract is mandatory for every new source before `shadow -> promote`.
## 1. Source Identity
- `source_id`:
- `name`:
- `category`:
- `tier`:
- `owner_agent`:
- `primary_url`:
- `access_mode` (`file|api|bigquery|web`):
- `public_access_mode` (`open|open_with_rate_limit|registration|credentialed_public`):
- `discovery_status` (`discovered|discovered_uningested|monitored|unreachable`):
- `last_seen_url`:
- `cadence_expected`:
- `cadence_observed`:
- `quality_status` (`healthy|stale|quality_fail|blocked_external|not_built|partial|loaded`):
## 2. Access and Legal
- Credential required:
- Secret name/path:
- License or usage restriction:
- LGPD/privacy considerations:
- `blocked_external` criteria:
## 3. Data Contract
- Downloader script: `etl/scripts/download_<source>.py`
- Canonical output files:
- Manifest file:
- Manifest mandatory fields (`run_id`, `source_id`, `window_start`, `window_end`, `rows`, `error`, `checksum`, `retrieved_at_utc`):
- Update cadence:
- Expected row volume:
- Partition/window strategy:
## 4. Graph Contract
- Node labels introduced:
- Relationship types introduced:
- Natural key(s) per node:
- Merge key strategy:
- Relationship quality tier (`strong|probable`):
- Provenance fields (`method`, `confidence`, `source_ref`, `run_id`):
## 5. Index and Constraint Contract
- Required uniqueness constraints:
- Required date indexes:
- Required lookup indexes:
- Required fulltext indexes (if text-heavy):
## 6. Quality Gates (Hard Stop/Go)
- Identity integrity preserved (`Person.cpf` masked = 0, 14-digit = 0):
- Freshness SLA threshold:
- Temporal sanity (`<= now + 365d`):
- Null/duplicate key thresholds:
- Mandatory non-zero nodes/rels:
## 7. Operational Flow
- Shadow load command:
- Gate runner commands:
- API smoke checks:
- Promote command:
- Rollback command:
## 8. Acceptance
- Evidence bundle path in `audit-results/`:
- Final status: `resolved | resolved_full | blocked_external | quality_fail`
- Reviewer sign-off:

View File

@@ -1,109 +1,109 @@
source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status,last_verified_utc,verification_status source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status
cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy
tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy
transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy
sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy
pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy
bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy
pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy
ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy
comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,ok comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill,,monitored,,,,stale
tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy
transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy
rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy
inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy
dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy
datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy
icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy
opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy
cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy
cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy
camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy
camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial
senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy
ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy
cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy
cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy
leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy
ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy
holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/holding/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy
viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy
siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial
pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale
renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy
siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial
tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=bens,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/dataset/bens-candidato,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy
tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=filiacao,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/dataset/filiados-partidos,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy
bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/penalidades,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy
stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy
caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale
eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy
un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy
world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy
senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial
mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy
querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial
datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external
bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built
estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/estban,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built
if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/dataset/if-data,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built
bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built
stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built
cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/datajud/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/cnciai/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built
carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built
anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built
aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built
anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built
antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built
ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built
anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built
anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built
antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built
ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built
anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built
susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built
cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built
receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built
mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts,,discovered_uningested,,,,not_built
sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built
icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.gov.br/icmbio/pt-br,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built
tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built
siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built
camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built
senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built
interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built
tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://tcers.tc.br/fiscalizado/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.rs.gov.br/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built
tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
1 source_id name category tier status implementation_state load_state frequency in_universe_v1 primary_url pipeline_id owner_agent access_mode notes public_access_mode discovery_status last_seen_url cadence_expected cadence_observed quality_status last_verified_utc verification_status
2 cnpj Receita Federal CNPJ identity P0 loaded implemented loaded monthly true https://dadosabertos.rfb.gov.br/CNPJ/ https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/ cnpj Agent A file http://dadosabertos.rfb.gov.br monitored healthy 2026-03-01T23:11:31.444615+00:00 transient_error
3 tse TSE elections and donations electoral P0 loaded implemented loaded biennial true https://dadosabertos.tse.jus.br/ tse Agent E file Core electoral data loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
4 transparencia Portal da Transparencia contracts contracts P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados transparencia Agent C file Federal contracts and servants monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
5 sanctions CEIS CNEP sanctions sanctions P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/sancoes/consulta sanctions Agent C file Administrative sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
6 pep_cgu CGU PEP list integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/pep pep_cgu Agent A file PEP baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
7 bndes BNDES financings finance P1 loaded implemented loaded monthly true https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados bndes Agent G file Loan relationships monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
8 pgfn PGFN divida ativa fiscal P0 loaded implemented loaded monthly true https://www.regularize.pgfn.gov.br/dados-abertos pgfn Agent C file Debt risk core monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
9 ibama IBAMA embargos environment P1 loaded implemented loaded monthly true https://servicos.ibama.gov.br/ctf/publico/areasembargadas/ ibama Agent F file Environmental enforcement monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
10 comprasnet ComprasNet contracts contracts P0 stale implemented partial monthly true https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos comprasnet Agent C file Needs freshness backfill monitored stale 2026-03-01T23:11:31.444615+00:00 ok
11 tcu TCU sanctions audit P1 loaded implemented loaded monthly true https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS tcu Agent C file Inidoneidade sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
12 transferegov TransfereGov emendas e convenios transfers P0 loaded implemented loaded monthly true https://www.transferegov.sistema.gov.br/portal/download-de-dados transferegov Agent C file Transfer relationships monitored healthy 2026-03-01T23:11:31.444615+00:00 transient_error
13 rais RAIS aggregated labor labor P1 loaded implemented loaded annual true https://basedosdados.org/dataset/br-me-rais rais Agent H bigquery Aggregate mode only monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
14 inep INEP school census education P2 loaded implemented loaded annual true https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar inep Agent H file Education coverage monitored healthy 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
15 dou Diario Oficial da Uniao gazette P0 loaded implemented loaded daily true https://www.in.gov.br/leiturajornal dou Agent E bigquery National acts ingestion monitored healthy 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
16 datasus DATASUS CNES health P1 loaded implemented loaded monthly true https://opendatasus.saude.gov.br/ datasus Agent H file Health establishments monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
17 icij ICIJ offshore leaks offshore P1 loaded implemented loaded yearly true https://offshoreleaks.icij.org/pages/database icij Agent G file Offshore entities and officers monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
18 opensanctions OpenSanctions global PEP sanctions P1 loaded implemented loaded monthly true https://www.opensanctions.org/datasets/peps/ opensanctions Agent G file Global PEP matching monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
19 cvm CVM proceedings market P1 loaded implemented loaded monthly true https://dados.cvm.gov.br/ cvm Agent G file Proceedings loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
20 cvm_funds CVM fund registry market P1 loaded implemented loaded monthly true https://dados.cvm.gov.br/dados/FI/ cvm_funds Agent G file Fund baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
21 camara Camara CEAP expenses legislative P1 loaded implemented loaded monthly true https://dadosabertos.camara.leg.br/ camara Agent E api Expense reimbursement monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
22 camara_inquiries Camara inquiries and requirements legislative P0 partial implemented partial daily true https://dadosabertos.camara.leg.br/ camara_inquiries Agent E api Sessions still low monitored partial 2026-03-01T23:11:31.444615+00:00 ok
23 senado Senado CEAPS expenses legislative P1 loaded implemented loaded monthly true https://www12.senado.leg.br/dados-abertos senado Agent E api Expense data loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
24 ceaf CEAF expelled servants integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/ceaf ceaf Agent A file Expulsion evidence monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
25 cepim CEPIM barred NGOs integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/cepim cepim Agent A file NGO restrictions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
26 cpgf CPGF gov card expenses spending P2 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/cpgf cpgf Agent H file Masked CPF source monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
27 leniency Acordos de leniencia integrity P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia leniency Agent A file High signal low volume monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
28 ofac OFAC sanctions sanctions P1 loaded implemented loaded monthly true https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files ofac Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
29 holdings Brasil IO holdings ownership P1 loaded implemented loaded monthly true https://brasil.io/dataset/socios-brasil/ https://brasil.io/dataset/socios-brasil/holding/ holdings Agent G file Ownership enrichment monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
30 viagens Viagens a servico spending P2 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/viagens viagens Agent H file Travel spend baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
31 siop SIOP emendas budget P0 partial implemented partial annual true https://www.siop.planejamento.gov.br/ siop Agent C api Author linkage limited monitored partial 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
32 pncp PNCP bids and contracts contracts P0 stale implemented partial monthly true https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao https://pncp.gov.br/api/consulta/v1/ pncp Agent C api Freshness SLA pending monitored stale 2026-03-01T23:11:31.444615+00:00 transient_error
33 renuncias Renuncias fiscais fiscal P1 loaded implemented loaded annual true https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos renuncias Agent G file Tax waiver baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
34 siconfi SICONFI municipal finance fiscal P1 partial implemented partial annual true https://apidatalake.tesouro.gov.br/docs/siconfi/ siconfi Agent C api No CNPJ direct links monitored partial 2026-03-01T23:11:31.444615+00:00 ok
35 tse_bens TSE candidate assets electoral P1 loaded implemented loaded biennial true https://dadosabertos.tse.jus.br/api/3/action/package_search?q=bens https://dadosabertos.tse.jus.br/dataset/bens-candidato tse_bens Agent E file Patrimony baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
36 tse_filiados TSE party memberships electoral P1 loaded implemented loaded monthly true https://dadosabertos.tse.jus.br/api/3/action/package_search?q=filiacao https://dadosabertos.tse.jus.br/dataset/filiados-partidos tse_filiados Agent E file Party network monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
37 bcb BCB penalties finance P1 loaded implemented loaded monthly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/penalidades bcb Agent G file Bank penalties loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
38 stf STF court data judiciary P1 loaded implemented loaded monthly true https://basedosdados.org/dataset/br-stf-corte-aberta stf Agent D bigquery Supreme court coverage monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
39 caged CAGED labor movements labor P1 stale implemented partial monthly true https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/ caged Agent H file Aggregate-only implementation monitored stale 2026-03-01T23:11:31.444615+00:00 transient_error
40 eu_sanctions EU sanctions sanctions P1 loaded implemented loaded monthly true https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions eu_sanctions Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
41 un_sanctions UN sanctions sanctions P1 loaded implemented loaded monthly true https://scsanctions.un.org/resources/xml/en/consolidated.xml un_sanctions Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 transient_error
42 world_bank World Bank debarment sanctions P1 loaded implemented loaded monthly true https://www.worldbank.org/en/projects-operations/procurement/debarred-firms world_bank Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
43 senado_cpis Senado CPIs legislative P0 partial implemented partial yearly true https://www12.senado.leg.br/dados-abertos senado_cpis Agent E api Needs richer sessions and requirements monitored partial 2026-03-01T23:11:31.444615+00:00 ok
44 mides MiDES municipal procurement municipal P0 loaded implemented loaded daily true https://basedosdados.org/dataset/world-wb-mides mides Agent H bigquery Operational after access fix monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
45 querido_diario Querido Diario gazettes municipal P1 partial implemented partial daily true https://queridodiario.ok.org.br/api querido_diario Agent H api Text availability gap monitored partial 2026-03-01T23:11:31.444615+00:00 ok
46 datajud CNJ DataJud judiciary P0 blocked_external implemented not_loaded monthly true https://api-publica.datajud.cnj.jus.br/ datajud Agent D api Credentials not fully operational in prod monitored blocked_external 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
47 bolsa_familia_bpc Bolsa Familia and BPC social P3 not_built not_implemented not_loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos Agent H file High volume masked identities discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
48 estban BCB ESTBAN balances finance P3 not_built not_implemented not_loaded monthly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/estban Agent G file Banking aggregates discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
49 if_data BCB IF data indicators finance P3 not_built not_implemented not_loaded quarterly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/if-data Agent G file Institution KPIs discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
50 bcb_liquidacao BCB bank liquidation acts finance P2 not_built not_implemented not_loaded monthly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao Agent G file Regulatory actions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
51 stj_dados_abertos STJ open data judiciary P1 not_built not_implemented not_loaded monthly true https://dadosabertos.stj.jus.br/ Agent D api Superior court decisions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
52 cnciai_improbidade CNIAI improbidade judiciary P1 not_built not_implemented not_loaded monthly true https://www.cnj.jus.br/sistemas/datajud/ https://www.cnj.jus.br/sistemas/cnciai/ Agent D api Misconduct convictions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
53 carf_tax_appeals CARF tax appeals judiciary P2 not_built not_implemented not_loaded monthly true https://carf.economia.gov.br/dados-abertos Agent D file Tax litigation discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
54 anp_royalties ANP royalties and fuel regulatory P2 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anp Agent F api Oil and gas royalties discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
55 aneel_concessions ANEEL concessions regulatory P2 not_built not_implemented not_loaded monthly true https://dadosabertos.aneel.gov.br/ Agent F api Energy concessions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
56 anm_mining_rights ANM mining rights regulatory P1 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anm Agent F api Mining rights and permits discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
57 antt_transport_concessions ANTT concessions regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/antt Agent F api Transport concessions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
58 ans_health_plans ANS operators regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/ans Agent H api Health insurance operators discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
59 anvisa_registrations ANVISA products regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anvisa Agent H api Regulatory registrations discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
60 anac_aviation_concessions ANAC concessions regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anac Agent F api Aviation contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
61 antaq_port_contracts ANTAQ contracts regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/antaq Agent F api Port concessions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
62 ana_water_grants ANA water grants regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/ana Agent F api Water use rights discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
63 anatel_telecom_licenses ANATEL licenses regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anatel Agent G api Telecom operators discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
64 susep_insurance_market SUSEP insurance market regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/susep Agent G file Insurance entities discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
65 cvm_full_ownership_chain CVM ownership chains market P1 not_built not_implemented not_loaded monthly true https://dados.cvm.gov.br/ Agent G file Shareholder graph expansion discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
66 receita_dirbi Receita DIRBI tax P1 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi Agent G file Tax benefit declarations discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
67 mapbiomas_alertas MapBiomas Alerta environment P1 not_built not_implemented not_loaded monthly true https://alerta.mapbiomas.org/api Agent F api Deforestation alerts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
68 sicar_rural_registry SiCAR rural registry environment P1 not_built not_implemented not_loaded quarterly true https://www.car.gov.br/publico/municipios/downloads Agent F file Property boundaries and owners discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
69 icmbio_cnuc ICMBio CNUC units environment P2 not_built not_implemented not_loaded monthly true https://www.gov.br/icmbio/pt-br https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao Agent F file Protected areas discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
70 tesouro_emendas Tesouro emendas budget P0 not_built not_implemented not_loaded monthly true https://www.tesourotransparente.gov.br/ Agent C file Budget execution discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
71 siga_brasil SIGA Brasil budget P0 not_built not_implemented not_loaded monthly true https://www12.senado.leg.br/orcamento/sigabrasil Agent C file Federal budget traces discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
72 camara_votes_bills Camara votes and bills legislative P1 not_built not_implemented not_loaded daily true https://dadosabertos.camara.leg.br/api/v2 Agent E api Legislative behavior discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
73 senado_votes_bills Senado votes and bills legislative P1 not_built not_implemented not_loaded daily true https://legis.senado.leg.br/dadosabertos Agent E api Legislative behavior discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
74 interpol_red_notices Interpol red notices international P2 not_built not_implemented not_loaded weekly true https://www.interpol.int/How-we-work/Notices/Red-Notices Agent G api Requires key discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
75 tce_sp TCE Sao Paulo state P2 not_built not_implemented not_loaded monthly true https://transparencia.tce.sp.gov.br/ Agent H api State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
76 tce_pe TCE Pernambuco state P2 not_built not_implemented not_loaded monthly true https://sistemas.tce.pe.gov.br/ Agent H api State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
77 tce_rj TCE Rio de Janeiro state P2 not_built not_implemented not_loaded monthly true https://dados.tce.rj.gov.br/ Agent H api State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
78 tce_rs TCE Rio Grande do Sul state P2 not_built not_implemented not_loaded monthly true https://tcers.tc.br/fiscalizado/ https://portal.tce.rs.gov.br/ Agent H file State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
79 tce_mg TCE Minas Gerais state P2 not_built not_implemented not_loaded monthly true https://www.tce.mg.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
80 tce_ba TCE Bahia state P3 not_built not_implemented not_loaded monthly true https://www.tce.ba.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
81 tce_ce TCE Ceara state P3 not_built not_implemented not_loaded monthly true https://www.tce.ce.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
82 tce_go TCE Goias state P3 not_built not_implemented not_loaded monthly true https://portal.tce.go.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
83 tce_pr TCE Parana state P3 not_built not_implemented not_loaded monthly true https://www1.tce.pr.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
84 tce_sc TCE Santa Catarina state P3 not_built not_implemented not_loaded monthly true https://www.tcesc.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
85 tce_es TCE Espirito Santo state P3 not_built not_implemented not_loaded monthly true https://www.tcees.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
86 tce_mt TCE Mato Grosso state P3 not_built not_implemented not_loaded monthly true https://www.tce.mt.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
87 tce_ms TCE Mato Grosso do Sul state P3 not_built not_implemented not_loaded monthly true https://www.tce.ms.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
88 tce_am TCE Amazonas state P3 not_built not_implemented not_loaded monthly true https://www.tce.am.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
89 tce_pa TCE Para state P3 not_built not_implemented not_loaded monthly true https://www.tcepa.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
90 tce_ro TCE Rondonia state P3 not_built not_implemented not_loaded monthly true https://www.tce.ro.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
91 tce_rr TCE Roraima state P3 not_built not_implemented not_loaded monthly true https://www.tcerr.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
92 tce_ap TCE Amapa state P3 not_built not_implemented not_loaded monthly true https://www.tce.ap.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
93 tce_to TCE Tocantins state P3 not_built not_implemented not_loaded monthly true https://www.tceto.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
94 tce_ma TCE Maranhao state P3 not_built not_implemented not_loaded monthly true https://www.tcema.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
95 tce_pi TCE Piaui state P3 not_built not_implemented not_loaded monthly true https://www.tce.pi.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
96 tce_rn TCE Rio Grande do Norte state P3 not_built not_implemented not_loaded monthly true https://www.tce.rn.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
97 tce_pb TCE Paraiba state P3 not_built not_implemented not_loaded monthly true https://tce.pb.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
98 tce_al TCE Alagoas state P3 not_built not_implemented not_loaded monthly true https://www.tceal.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
99 tce_se TCE Sergipe state P3 not_built not_implemented not_loaded monthly true https://www.tce.se.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
100 state_portal_sp Sao Paulo transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.sp.gov.br/ Agent H api State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
101 state_portal_mg Minas Gerais transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.mg.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
102 state_portal_ba Bahia transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.ba.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
103 state_portal_ce Ceara transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.ce.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
104 state_portal_go Goias transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.go.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
105 state_portal_pr Parana transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.pr.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
106 state_portal_sc Santa Catarina transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.sc.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
107 state_portal_rs Rio Grande do Sul transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.rs.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
108 state_portal_pe Pernambuco transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.pe.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
109 state_portal_rj Rio de Janeiro transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.rj.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error

View File

@@ -1,7 +1,7 @@
[project] [project]
name = "bracc-etl" name = "bracc-etl"
version = "0.1.0" version = "0.1.0"
description = "BRACC ETL — Data ingestion pipelines for Brazilian public data" description = "BR-ACC ETL — Data ingestion pipelines for Brazilian public data"
requires-python = ">=3.12" requires-python = ">=3.12"
license = "AGPL-3.0-or-later" license = "AGPL-3.0-or-later"
dependencies = [ dependencies = [
@@ -9,10 +9,11 @@ dependencies = [
"pandas>=2.2.0", "pandas>=2.2.0",
"httpx>=0.28.0", "httpx>=0.28.0",
"click>=8.1.0", "click>=8.1.0",
"defusedxml>=0.7.1",
"pydantic>=2.10.0", "pydantic>=2.10.0",
"pydantic-settings>=2.7.0", "pydantic-settings>=2.7.0",
"pypdf>=5.2.0", "pypdf>=5.2.0",
"defusedxml>=0.7.0",
"pandera>=0.21.0",
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@@ -3,8 +3,6 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
import shutil
import stat
import zipfile import zipfile
from pathlib import Path from pathlib import Path
@@ -38,12 +36,21 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool:
response.raise_for_status() response.raise_for_status()
# If we requested a range but server returned full content (200 vs 206),
# start fresh to avoid corruption
if start_byte > 0 and response.status_code != 206:
logger.warning(
"Server ignored Range header for %s, restarting download",
dest.name,
)
start_byte = 0
total = response.headers.get("content-length") total = response.headers.get("content-length")
total_mb = f"{int(total) / 1e6:.1f} MB" if total else "unknown size" total_mb = f"{int(total) / 1e6:.1f} MB" if total else "unknown size"
logger.info("Downloading %s (%s)...", dest.name, total_mb) logger.info("Downloading %s (%s)...", dest.name, total_mb)
mode = "ab" if start_byte > 0 else "wb" mode = "ab" if start_byte > 0 and response.status_code == 206 else "wb"
downloaded = start_byte downloaded = start_byte if mode == "ab" else 0
with open(partial, mode) as f: with open(partial, mode) as f:
for chunk in response.iter_bytes(chunk_size=65_536): for chunk in response.iter_bytes(chunk_size=65_536):
f.write(chunk) f.write(chunk)
@@ -58,24 +65,49 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool:
return False return False
def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]: def safe_extract_zip(
"""Extract ZIP and return list of extracted files. zip_path: Path,
output_dir: Path,
*,
max_total_bytes: int = 50 * 1024**3, # 50GB default (CNPJ zips are huge)
) -> list[Path]:
"""Safely extract ZIP with path traversal and bomb guards.
Deletes corrupted ZIPs for re-download. Deletes corrupted ZIPs for re-download.
""" """
try: try:
with zipfile.ZipFile(zip_path, "r") as zf: with zipfile.ZipFile(zip_path, "r") as zf:
extracted = safe_extract_zip(zf, output_dir) # Check for path traversal
logger.info("Extracted %d files from %s", len(extracted), zip_path.name) resolved_output = output_dir.resolve()
return extracted for info in zf.infolist():
target = (output_dir / info.filename).resolve()
if not target.is_relative_to(resolved_output):
raise ValueError(
f"Path traversal detected in {zip_path.name}: {info.filename}"
)
# Check total uncompressed size (zip bomb guard)
total_size = sum(info.file_size for info in zf.infolist())
if total_size > max_total_bytes:
raise ValueError(
f"ZIP bomb guard: {zip_path.name} would extract to "
f"{total_size / 1e9:.1f}GB (limit: {max_total_bytes / 1e9:.1f}GB)"
)
names = zf.namelist()
zf.extractall(output_dir)
logger.info("Extracted %d files from %s", len(names), zip_path.name)
return [output_dir / n for n in names]
except zipfile.BadZipFile: except zipfile.BadZipFile:
logger.warning("Bad ZIP file: %s — deleting for re-download", zip_path.name) logger.warning("Bad ZIP file: %s — deleting for re-download", zip_path.name)
zip_path.unlink() zip_path.unlink()
return [] return []
except ValueError as exc:
logger.warning("Unsafe ZIP file %s: %s — deleting", zip_path.name, exc)
zip_path.unlink(missing_ok=True) def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]:
return [] """Extract ZIP and return list of extracted files."""
return safe_extract_zip(zip_path, output_dir)
def validate_csv( def validate_csv(
@@ -111,60 +143,3 @@ def validate_csv(
except Exception as e: except Exception as e:
logger.warning("Validation failed for %s: %s", path.name, e) logger.warning("Validation failed for %s: %s", path.name, e)
return False return False
def safe_extract_zip(
    archive: zipfile.ZipFile,
    output_dir: Path,
    *,
    max_members: int = 50_000,
    max_uncompressed_bytes: int = 5_000_000_000,
) -> list[Path]:
    """Safely extract an open ZIP archive into ``output_dir``.

    Every entry is validated *before* anything is written, so an archive that
    is rejected never leaves a partially extracted tree behind.

    Args:
        archive: An already opened ``zipfile.ZipFile``.
        output_dir: Directory that must contain every extracted path.
        max_members: Maximum number of entries the archive may list.
        max_uncompressed_bytes: Maximum sum of the declared uncompressed
            sizes of all file entries.

    Returns:
        The resolved paths of the extracted files (directories are created
        but not listed).

    Raises:
        ValueError: If the archive has too many entries, contains a symlink
            entry, contains a path that escapes ``output_dir``, or declares
            more uncompressed data than allowed.
    """
    output_root = output_dir.resolve()
    infos = archive.infolist()
    if len(infos) > max_members:
        msg = f"ZIP has too many entries ({len(infos)} > {max_members})"
        raise ValueError(msg)

    # Pass 1: validate every entry and remember where it would land.
    planned: list[tuple[zipfile.ZipInfo, Path]] = []
    uncompressed_total = 0
    for info in infos:
        member_name = info.filename.replace("\\", "/")
        if not member_name:
            continue

        # The upper 16 bits of external_attr carry the Unix mode; reject symlinks.
        mode = info.external_attr >> 16
        if stat.S_ISLNK(mode):
            msg = f"ZIP contains symlink entry: {member_name}"
            raise ValueError(msg)

        target = (output_dir / member_name).resolve()
        try:
            target.relative_to(output_root)
        except ValueError as exc:
            msg = f"Path traversal detected: {member_name}"
            raise ValueError(msg) from exc

        if not info.is_dir():
            uncompressed_total += info.file_size
            if uncompressed_total > max_uncompressed_bytes:
                msg = (
                    f"ZIP exceeds max extracted size "
                    f"({uncompressed_total} > {max_uncompressed_bytes})"
                )
                raise ValueError(msg)

        planned.append((info, target))

    # Pass 2: the archive passed every check, so it is safe to write.
    extracted: list[Path] = []
    for info, target in planned:
        if info.is_dir():
            target.mkdir(parents=True, exist_ok=True)
            continue
        target.parent.mkdir(parents=True, exist_ok=True)
        with archive.open(info, "r") as source, target.open("wb") as destination:
            shutil.copyfileobj(source, destination)
        extracted.append(target)

    return extracted

View File

@@ -5,9 +5,9 @@ Streams microdados_movimentacao year-by-year to separate CSVs for
resumability and memory management on large datasets. resumability and memory management on large datasets.
Usage: Usage:
python etl/scripts/download_caged.py --billing-project bracc-corruptos python etl/scripts/download_caged.py --billing-project icarus-corruptos
python etl/scripts/download_caged.py --billing-project bracc-corruptos --start-year 2024 python etl/scripts/download_caged.py --billing-project icarus-corruptos --start-year 2024
python etl/scripts/download_caged.py --billing-project bracc-corruptos --skip-existing python etl/scripts/download_caged.py --billing-project icarus-corruptos --skip-existing
""" """
from __future__ import annotations from __future__ import annotations

View File

@@ -413,7 +413,7 @@ def _write_manifest(
) )
@click.option( @click.option(
"--billing-project", "--billing-project",
default="bracc-corruptos", default="icarus-corruptos",
help="GCP billing project for BQ mode.", help="GCP billing project for BQ mode.",
) )
@click.option( @click.option(

View File

@@ -6,15 +6,21 @@ Usage:
python etl/scripts/download_cnpj.py --reference-only # reference tables only (tiny) python etl/scripts/download_cnpj.py --reference-only # reference tables only (tiny)
python etl/scripts/download_cnpj.py --files 1 # just first file of each type python etl/scripts/download_cnpj.py --files 1 # just first file of each type
python etl/scripts/download_cnpj.py --types Empresas # specific type only python etl/scripts/download_cnpj.py --types Empresas # specific type only
python etl/scripts/download_cnpj.py --release 2026-03 # pin to specific monthly release
""" """
from __future__ import annotations from __future__ import annotations
import hashlib
import json
import logging import logging
import os
import sys import sys
from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
import click import click
import httpx
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
from _download_utils import download_file, extract_zip, validate_csv from _download_utils import download_file, extract_zip, validate_csv
@@ -22,7 +28,13 @@ from _download_utils import download_file, extract_zip, validate_csv
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/" # Receita Federal Nextcloud (primary since Jan 2026)
NEXTCLOUD_BASE = "https://arquivos.receitafederal.gov.br/s/{token}/download?path=%2F&files="
KNOWN_TOKENS = ["gn672Ad4CF8N6TK", "YggdBLfdninEJX9"]
# Legacy URLs (dadosabertos.rfb.gov.br decommissioned Jan 2026)
LEGACY_NEW_BASE_PATTERN = "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/{year_month}/"
LEGACY_BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/"
MAIN_TYPES = ["Empresas", "Socios", "Estabelecimentos"] MAIN_TYPES = ["Empresas", "Socios", "Estabelecimentos"]
REFERENCE_FILES = [ REFERENCE_FILES = [
@@ -48,6 +60,126 @@ EXPECTED_COLS = {
} }
def _previous_month(year: int, month: int) -> tuple[int, int]:
    """Step one calendar month back, rolling January over to the prior December."""
    rolls_over = month == 1
    return (year - 1, 12) if rolls_over else (year, month - 1)
def _check_url_accessible(url: str, timeout: int = 30) -> bool:
    """Report whether an HTTP HEAD request to ``url`` ends in a non-error status.

    Redirects are followed; any final status code below 400 counts as
    accessible. An ``httpx.HTTPError`` raised by the request is reported as
    "not accessible" instead of propagating.
    """
    try:
        status = httpx.head(url, follow_redirects=True, timeout=timeout).status_code
    except httpx.HTTPError:
        return False
    return status < 400
def _check_nextcloud_token(token: str, timeout: int = 30) -> bool:
    """Verify a Nextcloud share token by probing its share page.

    Delegates to ``_check_url_accessible`` so both probes share one
    definition of "accessible" (HEAD request, redirects followed, status
    below 400, ``httpx.HTTPError`` treated as failure).
    """
    share_url = f"https://arquivos.receitafederal.gov.br/s/{token}"
    return _check_url_accessible(share_url, timeout=timeout)
def resolve_rf_release(year_month: str | None = None) -> str:
    """Resolve the Receita Federal CNPJ release URL.

    Strategy:
    1. Try the Nextcloud share (primary since Jan 2026):
       a. Check the ``CNPJ_SHARE_TOKEN`` env var first.
       b. Then try each token in ``KNOWN_TOKENS``.
    2. Fall back to the legacy dadosabertos.rfb.gov.br paths.
    3. Raise ``RuntimeError`` if nothing works (fail-closed).

    Args:
        year_month: Optional ``YYYY-MM`` pin. It only affects the legacy
            per-month path; the Nextcloud URL carries no month component, so
            a pin is ignored (with a warning) when Nextcloud resolves.

    Returns:
        The resolved base URL. Files are fetched via ``{base_url}{filename}``.

    Raises:
        RuntimeError: If neither Nextcloud nor any legacy URL is reachable.
    """
    # --- Nextcloud (primary) ---
    tokens_to_try: list[str] = []
    env_token = os.environ.get("CNPJ_SHARE_TOKEN")
    if env_token:
        tokens_to_try.append(env_token)
    for t in KNOWN_TOKENS:
        if t not in tokens_to_try:
            tokens_to_try.append(t)

    for token in tokens_to_try:
        # Only a short prefix of the token is logged.
        logger.info("Probing Nextcloud token: %s...", token[:6])
        if _check_nextcloud_token(token):
            base_url = NEXTCLOUD_BASE.format(token=token)
            logger.info("Resolved CNPJ via Nextcloud (token %s...)", token[:6])
            if year_month is not None:
                # Surface the otherwise silent loss of the caller's pin.
                logger.warning(
                    "Release pin %s is ignored: the Nextcloud URL is not release-specific",
                    year_month,
                )
            return base_url

    # --- Legacy dadosabertos.rfb.gov.br ---
    if year_month is not None:
        candidates = [year_month]
    else:
        # Without a pin, probe the current month and then the previous one.
        now = datetime.now(timezone.utc)
        current = f"{now.year:04d}-{now.month:02d}"
        prev_y, prev_m = _previous_month(now.year, now.month)
        previous = f"{prev_y:04d}-{prev_m:02d}"
        candidates = [current, previous]

    for ym in candidates:
        url = LEGACY_NEW_BASE_PATTERN.format(year_month=ym)
        logger.info("Probing legacy release URL: %s", url)
        if _check_url_accessible(url):
            logger.info("Resolved CNPJ release (legacy new path): %s", url)
            return url

    logger.info("Trying legacy flat URL: %s", LEGACY_BASE_URL)
    if _check_url_accessible(LEGACY_BASE_URL):
        logger.info("Resolved CNPJ release (legacy flat): %s", LEGACY_BASE_URL)
        return LEGACY_BASE_URL

    tried = ", ".join(candidates)
    raise RuntimeError(
        f"Could not resolve CNPJ release. Tried Nextcloud tokens, "
        f"legacy months [{tried}], and legacy flat path. "
        "Receita Federal portal may be down or the URL structure has changed."
    )
def _write_manifest(
    output_dir: Path,
    base_url: str,
    resolved_release: str,
    file_results: list[dict],
    started_at: str,
) -> Path:
    """Serialize the download run to ``download_manifest.json`` in ``output_dir``.

    The manifest records the resolved release, the base URL, the per-file
    results and the start/finish timestamps, plus a SHA-256 checksum over the
    per-file results. Returns the path of the file that was written.
    """
    finished_at = datetime.now(timezone.utc).isoformat()

    # Fold every entry (name, size and status), ordered by name, into one digest.
    digest = hashlib.sha256()
    for entry in sorted(file_results, key=lambda item: item["name"]):
        record = f"{entry['name']}:{entry['size_bytes']}:{entry['status']}"
        digest.update(record.encode())

    manifest_path = output_dir / "download_manifest.json"
    payload = json.dumps(
        {
            "source": "receita_federal_cnpj",
            "resolved_release": resolved_release,
            "base_url": base_url,
            "files": file_results,
            "started_at": started_at,
            "finished_at": finished_at,
            "checksum": f"sha256:{digest.hexdigest()}",
        },
        indent=2,
        ensure_ascii=False,
    )
    manifest_path.write_text(payload, encoding="utf-8")
    logger.info("Manifest written: %s", manifest_path)
    return manifest_path
@click.command() @click.command()
@click.option("--output-dir", default="./data/cnpj", help="Base output directory") @click.option("--output-dir", default="./data/cnpj", help="Base output directory")
@click.option("--files", type=int, default=10, help="Number of files per type (0-9)") @click.option("--files", type=int, default=10, help="Number of files per type (0-9)")
@@ -56,6 +188,7 @@ EXPECTED_COLS = {
@click.option("--skip-existing/--no-skip-existing", default=True, help="Skip already downloaded files") @click.option("--skip-existing/--no-skip-existing", default=True, help="Skip already downloaded files")
@click.option("--skip-extract", is_flag=True, help="Skip extraction after download") @click.option("--skip-extract", is_flag=True, help="Skip extraction after download")
@click.option("--timeout", type=int, default=600, help="Download timeout in seconds") @click.option("--timeout", type=int, default=600, help="Download timeout in seconds")
@click.option("--release", default=None, help="Pin to specific monthly release (YYYY-MM format)")
def main( def main(
output_dir: str, output_dir: str,
files: int, files: int,
@@ -64,8 +197,20 @@ def main(
skip_existing: bool, skip_existing: bool,
skip_extract: bool, skip_extract: bool,
timeout: int, timeout: int,
release: str | None,
) -> None: ) -> None:
"""Download and extract CNPJ data from Receita Federal.""" """Download and extract CNPJ data from Receita Federal."""
started_at = datetime.now(timezone.utc).isoformat()
base_url = resolve_rf_release(release)
# Extract the release identifier from the resolved URL
resolved_release = release or "legacy"
if "arquivos.receitafederal.gov.br" in base_url:
resolved_release = "nextcloud"
elif "/dados_abertos_cnpj/" in base_url:
# Extract YYYY-MM from URL
resolved_release = base_url.rstrip("/").rsplit("/", 1)[-1]
base = Path(output_dir) base = Path(output_dir)
raw_dir = base / "raw" raw_dir = base / "raw"
extract_dir = base / "extracted" extract_dir = base / "extracted"
@@ -73,14 +218,26 @@ def main(
for d in [raw_dir, extract_dir, ref_dir]: for d in [raw_dir, extract_dir, ref_dir]:
d.mkdir(parents=True, exist_ok=True) d.mkdir(parents=True, exist_ok=True)
file_results: list[dict] = []
# --- Reference tables (always download, they're tiny) --- # --- Reference tables (always download, they're tiny) ---
logger.info("=== Reference tables ===") logger.info("=== Reference tables ===")
for filename in REFERENCE_FILES: for filename in REFERENCE_FILES:
dest = raw_dir / filename dest = raw_dir / filename
if skip_existing and dest.exists(): if skip_existing and dest.exists():
logger.info("Skipping (exists): %s", filename) logger.info("Skipping (exists): %s", filename)
file_results.append({
"name": filename,
"status": "skipped",
"size_bytes": dest.stat().st_size,
})
else: else:
download_file(f"{BASE_URL}{filename}", dest, timeout=timeout) success = download_file(f"{base_url}{filename}", dest, timeout=timeout)
file_results.append({
"name": filename,
"status": "ok" if success else "failed",
"size_bytes": dest.stat().st_size if dest.exists() else 0,
})
if not skip_extract and dest.exists(): if not skip_extract and dest.exists():
extracted = extract_zip(dest, ref_dir) extracted = extract_zip(dest, ref_dir)
@@ -90,7 +247,8 @@ def main(
validate_csv(f, expected_cols=expected) validate_csv(f, expected_cols=expected)
if reference_only: if reference_only:
logger.info("Reference-only mode done.") logger.info("Reference-only mode -- done.")
_write_manifest(base, base_url, resolved_release, file_results, started_at)
return return
# --- Main data files --- # --- Main data files ---
@@ -102,10 +260,25 @@ def main(
dest = raw_dir / filename dest = raw_dir / filename
if skip_existing and dest.exists(): if skip_existing and dest.exists():
logger.info("Skipping (exists): %s", filename) logger.info("Skipping (exists): %s", filename)
file_results.append({
"name": filename,
"status": "skipped",
"size_bytes": dest.stat().st_size,
})
else: else:
success = download_file(f"{BASE_URL}{filename}", dest, timeout=timeout) success = download_file(f"{base_url}{filename}", dest, timeout=timeout)
if not success: if not success:
file_results.append({
"name": filename,
"status": "failed",
"size_bytes": 0,
})
continue continue
file_results.append({
"name": filename,
"status": "ok",
"size_bytes": dest.stat().st_size if dest.exists() else 0,
})
if not skip_extract and dest.exists(): if not skip_extract and dest.exists():
extracted = extract_zip(dest, extract_dir) extracted = extract_zip(dest, extract_dir)
@@ -120,6 +293,7 @@ def main(
logger.info("=== Download complete ===") logger.info("=== Download complete ===")
_print_summary(raw_dir, extract_dir, ref_dir) _print_summary(raw_dir, extract_dir, ref_dir)
_write_manifest(base, base_url, resolved_release, file_results, started_at)
def _print_summary(raw_dir: Path, extract_dir: Path, ref_dir: Path) -> None: def _print_summary(raw_dir: Path, extract_dir: Path, ref_dir: Path) -> None:

View File

@@ -10,8 +10,8 @@ And a manifest:
- download_manifest.json - download_manifest.json
Usage: Usage:
python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos
python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos --tables socios python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos --tables socios
""" """
from __future__ import annotations from __future__ import annotations
@@ -105,6 +105,44 @@ TABLES: dict[str, list[str]] = {
PAGE_SIZE = 100_000 PAGE_SIZE = 100_000
def _run_bigquery_precheck(
    *,
    billing_project: str,
    source_project: str,
    source_dataset: str,
    snapshot_start: str | None,
) -> None:
    """Run two cheap queries up front so access problems surface before downloads.

    First issues ``SELECT 1`` through a client bound to ``billing_project``,
    then counts rows of the ``socios`` table in the source dataset (filtered
    by ``snapshot_start`` when one is given). Any exception raised by the
    client propagates to the caller.
    """
    # Imported here, not at module level, so the dependency is only loaded when needed.
    from google.cloud import bigquery

    client = bigquery.Client(project=billing_project)

    logger.info("Running BigQuery precheck: SELECT 1")
    list(client.query("SELECT 1 AS ok").result())

    socios_table = f"{source_project}.{source_dataset}.socios"
    precheck_sql = f"SELECT COUNT(1) AS n FROM `{socios_table}`"
    query_params = []
    if snapshot_start:
        # The date is bound as a query parameter, never interpolated into the SQL.
        precheck_sql += " WHERE data >= @snapshot_start"
        query_params.append(
            bigquery.ScalarQueryParameter("snapshot_start", "DATE", snapshot_start),
        )

    logger.info("Running BigQuery precheck: %s", precheck_sql)
    job_config = bigquery.QueryJobConfig(query_parameters=query_params)
    rows = list(client.query(precheck_sql, job_config=job_config).result())
    check_value = rows[0].n if rows else 0
    logger.info("BigQuery precheck OK: socios_count=%s", check_value)
def _sha256_file(path: Path) -> str: def _sha256_file(path: Path) -> str:
digest = hashlib.sha256() digest = hashlib.sha256()
with path.open("rb") as f: with path.open("rb") as f:
@@ -292,6 +330,19 @@ def main(
) )
source_project, source_dataset = dataset.split(".", 1) source_project, source_dataset = dataset.split(".", 1)
try:
_run_bigquery_precheck(
billing_project=billing_project,
source_project=source_project,
source_dataset=source_dataset,
snapshot_start=snapshot_start,
)
except Exception as exc:
raise click.ClickException(
"BigQuery precheck failed. Configure a non-interactive service account "
"(GOOGLE_APPLICATION_CREDENTIALS) with dataset ACL and billing access.",
) from exc
selected = list(tables) if tables else list(TABLES.keys()) selected = list(tables) if tables else list(TABLES.keys())
run_id = f"cnpj-bq-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}-{uuid.uuid4().hex[:8]}" run_id = f"cnpj-bq-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}-{uuid.uuid4().hex[:8]}"
logger.info( logger.info(

View File

@@ -22,7 +22,6 @@ from pathlib import Path
import click import click
import httpx import httpx
from _download_utils import safe_extract_zip
logging.basicConfig( logging.basicConfig(
level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s" level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s"
@@ -90,15 +89,24 @@ def _download_zip(
xml_count = 0 xml_count = 0
try: try:
resolved_dir = section_dir.resolve()
with zipfile.ZipFile(BytesIO(resp.content)) as zf: with zipfile.ZipFile(BytesIO(resp.content)) as zf:
extracted = safe_extract_zip(zf, section_dir) for member in zf.namelist():
xml_count = sum(1 for path in extracted if path.suffix.lower() == ".xml") # Path traversal guard
target = (section_dir / member).resolve()
if not target.is_relative_to(resolved_dir):
logger.warning(
"Path traversal detected in %s: %s — skipping",
zip_name,
member,
)
continue
if member.lower().endswith(".xml"):
zf.extract(member, section_dir)
xml_count += 1
except zipfile.BadZipFile: except zipfile.BadZipFile:
logger.warning("Bad ZIP file: %s", zip_name) logger.warning("Bad ZIP file: %s", zip_name)
return 0 return 0
except ValueError as exc:
logger.warning("Unsafe ZIP file %s: %s", zip_name, exc)
return 0
if xml_count > 0: if xml_count > 0:
marker.write_text(str(xml_count)) marker.write_text(str(xml_count))

View File

@@ -71,7 +71,7 @@ def _write_manifest(out_dir: Path, tables: list[dict[str, Any]]) -> Path:
@click.command() @click.command()
@click.option("--billing-project", default="bracc-corruptos", help="GCP billing project") @click.option("--billing-project", default="icarus-corruptos", help="GCP billing project")
@click.option( @click.option(
"--dataset", "--dataset",
default=WORLD_WB_DATASET, default=WORLD_WB_DATASET,

View File

@@ -439,7 +439,7 @@ def main(
client = httpx.Client( client = httpx.Client(
timeout=timeout, timeout=timeout,
follow_redirects=True, follow_redirects=True,
headers={"User-Agent": "BRACC-ETL/1.0 (public data research)"}, headers={"User-Agent": "BR-ACC-ETL/1.0 (public data research)"},
) )
total_records = 0 total_records = 0

View File

@@ -8,10 +8,12 @@ from __future__ import annotations
import argparse import argparse
import logging import logging
import zipfile import sys
from pathlib import Path from pathlib import Path
import httpx import httpx
sys.path.insert(0, str(Path(__file__).parent))
from _download_utils import safe_extract_zip from _download_utils import safe_extract_zip
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -34,15 +36,14 @@ def download_year(output_dir: Path, year: int) -> None:
url, url,
follow_redirects=True, follow_redirects=True,
timeout=300, timeout=300,
headers={"User-Agent": "BRACC-ETL/1.0"}, headers={"User-Agent": "BR-ACC-ETL/1.0"},
) )
response.raise_for_status() response.raise_for_status()
dest_zip.write_bytes(response.content) dest_zip.write_bytes(response.content)
logger.info("Downloaded: %s (%d bytes)", dest_zip.name, len(response.content)) logger.info("Downloaded: %s (%d bytes)", dest_zip.name, len(response.content))
with zipfile.ZipFile(dest_zip, "r") as zf: extracted = safe_extract_zip(dest_zip, output_dir)
extracted = safe_extract_zip(zf, output_dir) logger.info("Extracted %d files", len(extracted))
logger.info("Extracted %d files", len(extracted))
except httpx.HTTPError: except httpx.HTTPError:
logger.warning("Failed to download renuncias for %d", year) logger.warning("Failed to download renuncias for %d", year)

View File

@@ -16,13 +16,13 @@ import hashlib
import json import json
import logging import logging
import re import re
import defusedxml.ElementTree as ET
from datetime import UTC, datetime from datetime import UTC, datetime
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
import click import click
import httpx import httpx
from defusedxml import ElementTree as ET
from download_senado_cpi_archive import fetch_archive_historical from download_senado_cpi_archive import fetch_archive_historical
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@@ -44,7 +44,7 @@ def get_all_entities() -> list[dict]:
url, url,
params={"offset": offset, "limit": limit}, params={"offset": offset, "limit": limit},
timeout=60, timeout=60,
headers={"User-Agent": "BRACC-ETL/1.0"}, headers={"User-Agent": "BR-ACC-ETL/1.0"},
) )
response.raise_for_status() response.raise_for_status()
data = response.json() data = response.json()
@@ -125,7 +125,7 @@ def download_year(
header_written = partial.exists() and partial.stat().st_size > 0 header_written = partial.exists() and partial.stat().st_size > 0
with ( with (
httpx.Client(headers={"User-Agent": "BRACC-ETL/1.0"}) as client, httpx.Client(headers={"User-Agent": "BR-ACC-ETL/1.0"}) as client,
open(partial, "a", newline="", encoding="utf-8") as f, open(partial, "a", newline="", encoding="utf-8") as f,
): ):
writer: csv.DictWriter | None = None writer: csv.DictWriter | None = None

View File

@@ -5,9 +5,9 @@ Streams from BigQuery table basedosdados.br_stf_corte_aberta.decisoes to local C
Requires `google-cloud-bigquery` and an authenticated GCP project. Requires `google-cloud-bigquery` and an authenticated GCP project.
Usage: Usage:
python etl/scripts/download_stf.py --billing-project bracc-corruptos python etl/scripts/download_stf.py --billing-project icarus-corruptos
python etl/scripts/download_stf.py --billing-project bracc-corruptos --skip-existing python etl/scripts/download_stf.py --billing-project icarus-corruptos --skip-existing
python etl/scripts/download_stf.py --billing-project bracc-corruptos --output-dir ./data/stf python etl/scripts/download_stf.py --billing-project icarus-corruptos --output-dir ./data/stf
""" """
from __future__ import annotations from __future__ import annotations

View File

@@ -5,9 +5,9 @@ Streams from BigQuery table `basedosdados.br_tse_eleicoes.bens_candidato` to a l
Requires `google-cloud-bigquery` and an authenticated GCP project. Requires `google-cloud-bigquery` and an authenticated GCP project.
Usage: Usage:
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --start-year 2018 python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --start-year 2018
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --skip-existing python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --skip-existing
""" """
from __future__ import annotations from __future__ import annotations

View File

@@ -7,9 +7,9 @@ Filters to REGULAR status only (active members) to reduce volume.
Requires `google-cloud-bigquery` and an authenticated GCP project. Requires `google-cloud-bigquery` and an authenticated GCP project.
Usage: Usage:
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --skip-existing python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --skip-existing
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --all-statuses python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --all-statuses
""" """
from __future__ import annotations from __future__ import annotations

View File

@@ -14,10 +14,10 @@ from __future__ import annotations
import json import json
import logging import logging
import sys import sys
import defusedxml.ElementTree as ET
from pathlib import Path from pathlib import Path
import click import click
from defusedxml import ElementTree as ET
# Allow imports from scripts/ directory # Allow imports from scripts/ directory
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))

View File

@@ -21,12 +21,16 @@ class Pipeline(ABC):
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
neo4j_database: str | None = None, neo4j_database: str | None = None,
history: bool = False,
) -> None: ) -> None:
self.driver = driver self.driver = driver
self.data_dir = data_dir self.data_dir = data_dir
self.limit = limit self.limit = limit
self.chunk_size = chunk_size self.chunk_size = chunk_size
self.neo4j_database = neo4j_database or os.getenv("NEO4J_DATABASE", "neo4j") self.neo4j_database = neo4j_database or os.getenv("NEO4J_DATABASE", "neo4j")
self.history = history
self.rows_in: int = 0
self.rows_loaded: int = 0
source_key = getattr(self, "source_id", getattr(self, "name", "unknown_source")) source_key = getattr(self, "source_id", getattr(self, "name", "unknown_source"))
self.run_id = f"{source_key}_{datetime.now(tz=UTC).strftime('%Y%m%d%H%M%S')}" self.run_id = f"{source_key}_{datetime.now(tz=UTC).strftime('%Y%m%d%H%M%S')}"
@@ -87,8 +91,8 @@ class Pipeline(ABC):
" r.started_at = coalesce($started_at, r.started_at), " " r.started_at = coalesce($started_at, r.started_at), "
" r.finished_at = coalesce($finished_at, r.finished_at), " " r.finished_at = coalesce($finished_at, r.finished_at), "
" r.error = coalesce($error, r.error), " " r.error = coalesce($error, r.error), "
" r.rows_in = coalesce(r.rows_in, 0), " " r.rows_in = $rows_in, "
" r.rows_loaded = coalesce(r.rows_loaded, 0)" " r.rows_loaded = $rows_loaded"
) )
run_id = getattr(self, "run_id", f"{source_id}_manual") run_id = getattr(self, "run_id", f"{source_id}_manual")
params = { params = {
@@ -98,6 +102,8 @@ class Pipeline(ABC):
"started_at": started_at, "started_at": started_at,
"finished_at": finished_at, "finished_at": finished_at,
"error": error, "error": error,
"rows_in": self.rows_in,
"rows_loaded": self.rows_loaded,
} }
try: try:
with self.driver.session(database=self.neo4j_database) as session: with self.driver.session(database=self.neo4j_database) as session:

View File

@@ -13,7 +13,7 @@ def get_person_settings() -> dict[str, Any]:
""" """
try: try:
import splink.comparison_library as cl # type: ignore[import-not-found] import splink.comparison_library as cl # type: ignore[import-not-found]
from splink import SettingsCreator from splink import SettingsCreator # type: ignore[import-not-found,unused-ignore]
except ImportError as exc: except ImportError as exc:
raise ImportError( raise ImportError(
"splink is required for entity resolution. " "splink is required for entity resolution. "

View File

@@ -51,8 +51,9 @@ class BcbPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.penalties: list[dict[str, Any]] = [] self.penalties: list[dict[str, Any]] = []
self.company_rels: list[dict[str, Any]] = [] self.company_rels: list[dict[str, Any]] = []

View File

@@ -33,8 +33,9 @@ class BndesPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.finances: list[dict[str, Any]] = [] self.finances: list[dict[str, Any]] = []
self.relationships: list[dict[str, Any]] = [] self.relationships: list[dict[str, Any]] = []
@@ -51,8 +52,15 @@ class BndesPipeline(Pipeline):
def extract(self) -> None: def extract(self) -> None:
bndes_dir = Path(self.data_dir) / "bndes" bndes_dir = Path(self.data_dir) / "bndes"
if not bndes_dir.exists():
logger.warning("[%s] Data directory not found: %s", self.name, bndes_dir)
return
csv_path = bndes_dir / "operacoes-nao-automaticas.csv"
if not csv_path.exists():
logger.warning("[%s] CSV file not found: %s", self.name, csv_path)
return
self._raw = pd.read_csv( self._raw = pd.read_csv(
bndes_dir / "operacoes-nao-automaticas.csv", csv_path,
dtype=str, dtype=str,
delimiter=";", delimiter=";",
encoding="latin-1", encoding="latin-1",

View File

@@ -88,8 +88,9 @@ class CagedPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._csv_files: list[Path] = [] self._csv_files: list[Path] = []
def extract(self) -> None: def extract(self) -> None:

View File

@@ -60,8 +60,9 @@ class CamaraPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.expenses: list[dict[str, Any]] = [] self.expenses: list[dict[str, Any]] = []
self.deputies: list[dict[str, Any]] = [] self.deputies: list[dict[str, Any]] = []

View File

@@ -66,8 +66,9 @@ class CamaraInquiriesPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_inquiries: pd.DataFrame = pd.DataFrame() self._raw_inquiries: pd.DataFrame = pd.DataFrame()
self._raw_requirements: pd.DataFrame = pd.DataFrame() self._raw_requirements: pd.DataFrame = pd.DataFrame()

View File

@@ -31,8 +31,9 @@ class CeafPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.expulsions: list[dict[str, Any]] = [] self.expulsions: list[dict[str, Any]] = []
self.person_rels: list[dict[str, Any]] = [] self.person_rels: list[dict[str, Any]] = []

View File

@@ -37,8 +37,9 @@ class CepimPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.ngos: list[dict[str, Any]] = [] self.ngos: list[dict[str, Any]] = []
self.company_rels: list[dict[str, Any]] = [] self.company_rels: list[dict[str, Any]] = []

View File

@@ -216,9 +216,11 @@ class CNPJPipeline(Pipeline):
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
history: bool = False, history: bool = False,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(
self.history = history driver, data_dir, limit=limit, chunk_size=chunk_size, history=history, **kwargs,
)
self.run_id = f"cnpj-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}" self.run_id = f"cnpj-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}"
self._raw_empresas: pd.DataFrame = pd.DataFrame() self._raw_empresas: pd.DataFrame = pd.DataFrame()
self._raw_socios: pd.DataFrame = pd.DataFrame() self._raw_socios: pd.DataFrame = pd.DataFrame()

View File

@@ -63,7 +63,7 @@ class ComprasnetPipeline(Pipeline):
"""ETL pipeline for PNCP federal procurement contracts.""" """ETL pipeline for PNCP federal procurement contracts."""
name = "comprasnet" name = "comprasnet"
source_id = "pncp" source_id = "comprasnet"
def __init__( def __init__(
self, self,
@@ -71,8 +71,9 @@ class ComprasnetPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self.contracts: list[dict[str, Any]] = [] self.contracts: list[dict[str, Any]] = []
def extract(self) -> None: def extract(self) -> None:

View File

@@ -84,8 +84,9 @@ class CpgfPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.expenses: list[dict[str, Any]] = [] self.expenses: list[dict[str, Any]] = []
self.cardholders: list[dict[str, Any]] = [] self.cardholders: list[dict[str, Any]] = []

View File

@@ -38,8 +38,9 @@ class CvmPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_processos: pd.DataFrame = pd.DataFrame() self._raw_processos: pd.DataFrame = pd.DataFrame()
self._raw_acusados: pd.DataFrame = pd.DataFrame() self._raw_acusados: pd.DataFrame = pd.DataFrame()
self.proceedings: list[dict[str, Any]] = [] self.proceedings: list[dict[str, Any]] = []

View File

@@ -43,8 +43,9 @@ class CvmFundsPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.funds: list[dict[str, Any]] = [] self.funds: list[dict[str, Any]] = []
self.admin_rels: list[dict[str, Any]] = [] self.admin_rels: list[dict[str, Any]] = []

View File

@@ -50,8 +50,9 @@ class DatajudPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_cases: pd.DataFrame = pd.DataFrame() self._raw_cases: pd.DataFrame = pd.DataFrame()
self._raw_parties: pd.DataFrame = pd.DataFrame() self._raw_parties: pd.DataFrame = pd.DataFrame()

View File

@@ -29,8 +29,9 @@ class DatasusPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.facilities: list[dict[str, Any]] = [] self.facilities: list[dict[str, Any]] = []
self.company_links: list[dict[str, Any]] = [] self.company_links: list[dict[str, Any]] = []

View File

@@ -17,7 +17,10 @@ import re
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
from defusedxml import ElementTree # type: ignore[import-untyped] from defusedxml.ElementTree import ParseError as _XmlParseError # type: ignore[import-untyped]
from defusedxml.ElementTree import (
parse as _safe_xml_parse, # type: ignore[import-untyped,unused-ignore]
)
from bracc_etl.base import Pipeline from bracc_etl.base import Pipeline
from bracc_etl.loader import Neo4jBatchLoader from bracc_etl.loader import Neo4jBatchLoader
@@ -141,8 +144,9 @@ class DouPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_acts: list[dict[str, str]] = [] self._raw_acts: list[dict[str, str]] = []
self.acts: list[dict[str, Any]] = [] self.acts: list[dict[str, Any]] = []
self.person_rels: list[dict[str, Any]] = [] self.person_rels: list[dict[str, Any]] = []
@@ -227,8 +231,8 @@ class DouPipeline(Pipeline):
"""Extract acts from Imprensa Nacional XML dumps.""" """Extract acts from Imprensa Nacional XML dumps."""
for f in xml_files: for f in xml_files:
try: try:
tree = ElementTree.parse(f) # noqa: S314 tree = _safe_xml_parse(f)
except ElementTree.ParseError: except _XmlParseError:
logger.warning("[dou] Failed to parse XML: %s", f.name) logger.warning("[dou] Failed to parse XML: %s", f.name)
continue continue

View File

@@ -76,8 +76,9 @@ class EuSanctionsPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.sanctions: list[dict[str, Any]] = [] self.sanctions: list[dict[str, Any]] = []
self.person_rels: list[dict[str, Any]] = [] self.person_rels: list[dict[str, Any]] = []

View File

@@ -36,8 +36,9 @@ class HoldingsPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.holding_rels: list[dict[str, Any]] = [] self.holding_rels: list[dict[str, Any]] = []

View File

@@ -40,8 +40,9 @@ class IbamaPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.embargoes: list[dict[str, Any]] = [] self.embargoes: list[dict[str, Any]] = []
self.companies: list[dict[str, Any]] = [] self.companies: list[dict[str, Any]] = []
@@ -65,7 +66,13 @@ class IbamaPipeline(Pipeline):
def extract(self) -> None: def extract(self) -> None:
ibama_dir = Path(self.data_dir) / "ibama" ibama_dir = Path(self.data_dir) / "ibama"
if not ibama_dir.exists():
logger.warning("[%s] Data directory not found: %s", self.name, ibama_dir)
return
csv_path = ibama_dir / "areas_embargadas.csv" csv_path = ibama_dir / "areas_embargadas.csv"
if not csv_path.exists():
logger.warning("[%s] CSV file not found: %s", self.name, csv_path)
return
logger.info("[ibama] Reading %s", csv_path) logger.info("[ibama] Reading %s", csv_path)
self._raw = pd.read_csv( self._raw = pd.read_csv(
csv_path, csv_path,

View File

@@ -42,8 +42,9 @@ class ICIJPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._entities_raw: pd.DataFrame = pd.DataFrame() self._entities_raw: pd.DataFrame = pd.DataFrame()
self._officers_raw: pd.DataFrame = pd.DataFrame() self._officers_raw: pd.DataFrame = pd.DataFrame()
self._intermediaries_raw: pd.DataFrame = pd.DataFrame() self._intermediaries_raw: pd.DataFrame = pd.DataFrame()

View File

@@ -42,8 +42,9 @@ class InepPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self.schools: list[dict[str, Any]] = [] self.schools: list[dict[str, Any]] = []
self.school_company_links: list[dict[str, Any]] = [] self.school_company_links: list[dict[str, Any]] = []

View File

@@ -31,8 +31,9 @@ class LeniencyPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.agreements: list[dict[str, Any]] = [] self.agreements: list[dict[str, Any]] = []
self.company_rels: list[dict[str, Any]] = [] self.company_rels: list[dict[str, Any]] = []

View File

@@ -74,8 +74,9 @@ class MidesPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_bids: pd.DataFrame = pd.DataFrame() self._raw_bids: pd.DataFrame = pd.DataFrame()
self._raw_contracts: pd.DataFrame = pd.DataFrame() self._raw_contracts: pd.DataFrame = pd.DataFrame()

View File

@@ -63,8 +63,9 @@ class OfacPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.sanctions: list[dict[str, Any]] = [] self.sanctions: list[dict[str, Any]] = []

View File

@@ -81,8 +81,9 @@ class OpenSanctionsPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_entities: list[dict[str, Any]] = [] self._raw_entities: list[dict[str, Any]] = []
self.global_peps: list[dict[str, Any]] = [] self.global_peps: list[dict[str, Any]] = []
self.pep_match_rels: list[dict[str, Any]] = [] self.pep_match_rels: list[dict[str, Any]] = []

View File

@@ -84,8 +84,9 @@ class PepCguPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame() self._raw: pd.DataFrame = pd.DataFrame()
self.pep_records: list[dict[str, Any]] = [] self.pep_records: list[dict[str, Any]] = []
self.person_links: list[dict[str, Any]] = [] self.person_links: list[dict[str, Any]] = []

View File

@@ -38,8 +38,9 @@ class PgfnPipeline(Pipeline):
data_dir: str = "./data", data_dir: str = "./data",
limit: int | None = None, limit: int | None = None,
chunk_size: int = 50_000, chunk_size: int = 50_000,
**kwargs: Any,
) -> None: ) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._csv_files: list[Path] = [] self._csv_files: list[Path] = []
self.finances: list[dict[str, Any]] = [] self.finances: list[dict[str, Any]] = []
self.relationships: list[dict[str, Any]] = [] self.relationships: list[dict[str, Any]] = []
@@ -56,10 +57,13 @@ class PgfnPipeline(Pipeline):
def extract(self) -> None: def extract(self) -> None:
pgfn_dir = Path(self.data_dir) / "pgfn" pgfn_dir = Path(self.data_dir) / "pgfn"
if not pgfn_dir.exists():
logger.warning("[%s] Data directory not found: %s", self.name, pgfn_dir)
return
self._csv_files = sorted(pgfn_dir.glob("arquivo_lai_SIDA_*_*.csv")) self._csv_files = sorted(pgfn_dir.glob("arquivo_lai_SIDA_*_*.csv"))
if not self._csv_files: if not self._csv_files:
msg = f"No PGFN CSV files found in {pgfn_dir}" logger.warning("[%s] No PGFN CSV files found in %s", self.name, pgfn_dir)
raise FileNotFoundError(msg) return
logger.info("[pgfn] Found %d CSV files to process", len(self._csv_files)) logger.info("[pgfn] Found %d CSV files to process", len(self._csv_files))
def transform(self) -> None: def transform(self) -> None:

Some files were not shown because too many files have changed in this diff Show More