sync: upstream convergence 2026-03-02

Co-authored-by: bruno cesar <brunoclz@brunos-MacBook-Pro.local>
This commit is contained in:
Bruno César
2026-03-02 03:51:26 -03:00
committed by GitHub
parent d00d150f93
commit add44821e8
175 changed files with 2569 additions and 4713 deletions

View File

@@ -18,29 +18,24 @@ API_PORT=8000
LOG_LEVEL=info
APP_ENV=dev
JWT_SECRET_KEY=change-me-generate-with-openssl-rand-hex-32
AUTH_COOKIE_NAME=bracc_session
AUTH_COOKIE_SECURE=false
AUTH_COOKIE_SAMESITE=lax
TRUST_PROXY_HEADERS=false
INVITE_CODE=
CORS_ORIGINS=http://localhost:3000
PRODUCT_TIER=community
PATTERNS_ENABLED=false
PUBLIC_MODE=true
PUBLIC_MODE=false
PUBLIC_ALLOW_PERSON=false
PUBLIC_ALLOW_ENTITY_LOOKUP=false
PUBLIC_ALLOW_INVESTIGATIONS=false
PATTERN_SPLIT_THRESHOLD_VALUE=80000
PATTERN_SPLIT_MIN_COUNT=3
PATTERN_SHARE_THRESHOLD=0.60
PATTERN_SHARE_THRESHOLD=0.6
PATTERN_SRP_MIN_ORGS=5
PATTERN_INEXIG_MIN_RECURRENCE=3
PATTERN_MAX_EVIDENCE_REFS=50
SHARE_TOKEN_TTL_HOURS=168
# Frontend (dev only — production uses Caddy reverse proxy with relative paths)
VITE_API_URL=http://localhost:8000
VITE_PUBLIC_MODE=true
VITE_PUBLIC_MODE=false
VITE_PATTERNS_ENABLED=false
# Optional: Google Cloud (for Base dos Dados / TSE BigQuery)

View File

@@ -1,5 +1,5 @@
blank_issues_enabled: false
contact_links:
- name: Security vulnerability report
url: https://github.com/World-Open-Graph/br-acc/security/advisories/new
url: https://github.com/brunoclz/world-transparency-graph/security/advisories/new
about: Use GitHub Security Advisories for private vulnerability disclosure.

View File

@@ -10,8 +10,8 @@
"README.md",
"CONTRIBUTING.md",
"frontend/src/**",
"api/src/icarus/queries/**",
"api/src/icarus/models/**",
"api/src/bracc/queries/**",
"api/src/bracc/models/**",
"api/tests/**",
"etl/tests/**",
"frontend/src/**/*.test.*"

View File

@@ -23,26 +23,6 @@ on:
description: "Release title (EN)"
required: true
type: string
highlights_pt:
description: "PT highlights (separate bullets with |)"
required: true
type: string
highlights_en:
description: "EN highlights (separate bullets with |)"
required: true
type: string
patterns_included:
description: "Comma-separated pattern IDs included in this release (use 'none' if not applicable)"
required: true
type: string
technical_changes_pt:
description: "PT technical changes (separate bullets with |)"
required: true
type: string
technical_changes_en:
description: "EN technical changes (separate bullets with |)"
required: true
type: string
permissions:
contents: write
@@ -124,116 +104,63 @@ jobs:
COMPARE_URL: ${{ steps.validate.outputs.compare_url }}
TITLE_PT: ${{ inputs.title_pt }}
TITLE_EN: ${{ inputs.title_en }}
HIGHLIGHTS_PT: ${{ inputs.highlights_pt }}
HIGHLIGHTS_EN: ${{ inputs.highlights_en }}
PATTERNS_INCLUDED: ${{ inputs.patterns_included }}
TECHNICAL_CHANGES_PT: ${{ inputs.technical_changes_pt }}
TECHNICAL_CHANGES_EN: ${{ inputs.technical_changes_en }}
run: |
set -euo pipefail
DATE_UTC="$(date -u +"%Y-%m-%d")"
export DATE_UTC
python - <<'PY'
import json
import os
from textwrap import dedent
def split_pipe(raw: str) -> list[str]:
    """Split a pipe- or newline-separated input into clean bullet texts.

    Newlines (``\\r\\n`` or ``\\n``) are treated as additional ``|``
    separators. Each item has surrounding spaces, tabs and ``-`` characters
    removed, so callers may paste text that already carries bullet markers.

    Items that are empty *after* that cleanup (for example ``" - "`` or
    ``"---"``) are dropped; filtering on the raw item instead would let them
    through as empty strings and render as blank bullets.

    Args:
        raw: The raw input string.

    Returns:
        The cleaned, non-empty items in their original order.
    """
    normalized = raw.replace("\r\n", "\n").replace("\n", "|")
    cleaned_items = (item.strip(" -\t") for item in normalized.split("|"))
    # Test the cleaned value (ignoring any leftover whitespace), not the raw one.
    return [item for item in cleaned_items if item.strip()]
def split_csv(raw: str) -> list[str]:
    """Parse a comma-separated value into a list of trimmed entries.

    If the whole input, once trimmed and lower-cased, is one of the
    placeholders ``none``, ``n/a``, ``na`` or ``-``, an empty list is
    returned. Otherwise the input is split on commas and empty entries are
    discarded.

    Args:
        raw: The raw comma-separated string.

    Returns:
        The trimmed, non-empty entries in input order.
    """
    text = raw.strip()
    placeholders = {"none", "n/a", "na", "-"}
    if text.lower() in placeholders:
        return []
    entries = []
    for piece in text.split(","):
        trimmed = piece.strip()
        if trimmed:
            entries.append(trimmed)
    return entries
def bullets(items: list[str], fallback: str) -> str:
    """Render ``items`` as a Markdown bullet list.

    Args:
        items: Texts to render, one bullet per entry.
        fallback: Text used for a single bullet when ``items`` is empty.

    Returns:
        Newline-joined ``- item`` lines, or ``- fallback`` when there are no
        items.
    """
    if items:
        rendered = [f"- {entry}" for entry in items]
        return "\n".join(rendered)
    return f"- {fallback}"
highlights_pt = split_pipe(os.environ["HIGHLIGHTS_PT"])
highlights_en = split_pipe(os.environ["HIGHLIGHTS_EN"])
technical_changes_pt = split_pipe(os.environ["TECHNICAL_CHANGES_PT"])
technical_changes_en = split_pipe(os.environ["TECHNICAL_CHANGES_EN"])
patterns = split_csv(os.environ["PATTERNS_INCLUDED"])
release_notes = dedent(
f"""
cat > release_notes.md <<NOTES
## PT-BR
{os.environ["TITLE_PT"]}
${TITLE_PT}
### Escopo
- Release publicada por marco.
- Mudanças listadas de forma específica para facilitar auditoria pública.
### Destaques
{bullets(highlights_pt, "Sem destaques declarados.")}
### Padrões incluídos
{bullets(patterns, "Sem novos padrões nesta release.")}
### Mudanças técnicas
{bullets(technical_changes_pt, "Sem mudanças técnicas declaradas.")}
- Mudanças detalhadas por categorias no histórico desta versão.
### Integridade pública
Os sinais e padrões refletem coocorrências em bases públicas e não constituem prova legal.
## EN
{os.environ["TITLE_EN"]}
${TITLE_EN}
### Scope
- Milestone-based release publication.
- Changes are listed explicitly for public traceability.
### Highlights
{bullets(highlights_en, "No highlights declared.")}
### Included patterns
{bullets(patterns, "No new patterns in this release.")}
### Technical changes
{bullets(technical_changes_en, "No technical changes declared.")}
- Detailed changes grouped by category in this version history.
### Public integrity
Signals and patterns reflect co-occurrence in public records and are not legal proof.
## Compatibility
- Breaking changes: none declared.
- Migration required: no.
- Breaking changes: declare explicitly when applicable.
- Migration required: declare explicitly when applicable.
## Compare
{os.environ.get("COMPARE_URL", "")}
${COMPARE_URL}
## Metadata
- Version: {os.environ["VERSION"]}
- Target SHA: {os.environ["TARGET_SHA"]}
- Previous tag: {os.environ["PREVIOUS_TAG"]}
- Date (UTC): {os.environ.get("DATE_UTC", "")}
"""
).strip() + "\n"
- Version: ${VERSION}
- Target SHA: ${TARGET_SHA}
- Previous tag: ${PREVIOUS_TAG}
- Date (UTC): ${DATE_UTC}
NOTES
with open("release_notes.md", "w", encoding="utf-8") as fh:
fh.write(release_notes)
python - <<'PY'
import json
import os
payload = {
"version": os.environ["VERSION"],
"date": os.environ.get("DATE_UTC", ""),
"highlights_pt": highlights_pt,
"highlights_en": highlights_en,
"highlights_pt": [os.environ["TITLE_PT"]],
"highlights_en": [os.environ["TITLE_EN"]],
"api_changes": [],
"data_changes": [],
"privacy_compliance_changes": [],
"patterns_included": patterns,
"technical_changes_pt": technical_changes_pt,
"technical_changes_en": technical_changes_en,
"breaking_changes": False,
"migration_required": False,
"compare_url": os.environ.get("COMPARE_URL", ""),

View File

@@ -6,18 +6,10 @@ on:
pull_request:
branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions:
contents: read
jobs:
gitleaks:
name: Gitleaks
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
with:
@@ -37,7 +29,6 @@ jobs:
bandit:
name: Bandit (Python)
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
@@ -57,7 +48,6 @@ jobs:
pip-audit:
name: Pip Audit (Python deps)
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
@@ -69,14 +59,6 @@ jobs:
with:
python-version: "3.12"
- name: Cache uv
uses: actions/cache@v4
with:
path: ~/.cache/uv
key: ${{ runner.os }}-uv-security-${{ hashFiles('api/uv.lock', 'etl/uv.lock') }}
restore-keys: |
${{ runner.os }}-uv-security-
- name: Export lock-compatible requirement sets
run: |
cd api
@@ -93,7 +75,6 @@ jobs:
public-privacy-gate:
name: Public Privacy Gate
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
@@ -107,7 +88,6 @@ jobs:
compliance-pack-gate:
name: Compliance Pack Gate
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
@@ -120,9 +100,8 @@ jobs:
public-boundary-gate:
name: Public Boundary Gate
if: vars.PUBLIC_BOUNDARY_GATE_ENABLED == 'true'
if: github.repository == 'brunoclz/world-transparency-graph'
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
@@ -136,7 +115,6 @@ jobs:
internal-instruction-boundary:
name: Internal Instruction Boundary
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v4

3
.gitignore vendored
View File

@@ -75,7 +75,6 @@ scripts/audit-prompts/
# Local report artifacts in repository root
/*.pdf
/*.html
gitleaks-report*.json
# Playwright MCP cache
.playwright-mcp/
@@ -91,7 +90,7 @@ data/tse/
# Local MCP runtime config (keep example only)
.mcp.json
# Internal assistant instructions (must never be published)
# Internal assistant instruction files (must never be published)
CLAUDE.md
AGENTS.md
AGENTS*.md

142
Makefile
View File

@@ -1,14 +1,125 @@
.PHONY: dev stop seed bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics check neutrality
.PHONY: dev stop api etl frontend lint type-check test test-api test-etl test-frontend test-integration-api test-integration-etl test-integration check seed clean download-cnpj download-tse download-transparencia download-sanctions download-all etl-cnpj etl-cnpj-stream etl-tse etl-transparencia etl-sanctions etl-all link-persons bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics
# ── Development ─────────────────────────────────────────
dev:
docker compose -f infra/docker-compose.yml up -d
docker compose up -d
stop:
docker compose -f infra/docker-compose.yml down
docker compose down
# ── API ─────────────────────────────────────────────────
api:
cd api && uv run uvicorn bracc.main:app --reload --host 0.0.0.0 --port 8000
# ── ETL ─────────────────────────────────────────────────
etl:
cd etl && uv run bracc-etl --help
seed:
bash infra/scripts/seed-dev.sh
# ── CNPJ Data ──────────────────────────────────────────
download-cnpj:
cd etl && uv run python scripts/download_cnpj.py --reference-only
cd etl && uv run python scripts/download_cnpj.py --files 1
download-cnpj-all:
cd etl && uv run python scripts/download_cnpj.py --files 10
etl-cnpj:
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
etl-cnpj-dev:
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
etl-cnpj-stream:
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --streaming
# ── TSE Data ──────────────────────────────────────────
download-tse:
cd etl && uv run python scripts/download_tse.py --years 2024
etl-tse:
cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
etl-tse-dev:
cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
# ── Transparencia Data ────────────────────────────────
download-transparencia:
cd etl && uv run python scripts/download_transparencia.py --year 2025
etl-transparencia:
cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
etl-transparencia-dev:
cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
# ── Sanctions Data ────────────────────────────────────
download-sanctions:
cd etl && uv run python scripts/download_sanctions.py
etl-sanctions:
cd etl && uv run bracc-etl run --source sanctions --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
# ── All Data ──────────────────────────────────────────
download-all: download-cnpj download-tse download-transparencia download-sanctions
etl-all: etl-cnpj etl-tse etl-transparencia etl-sanctions
# ── Entity Resolution ────────────────────────────────────
link-persons:
docker compose exec neo4j cypher-shell -u neo4j -p "$${NEO4J_PASSWORD}" -f /scripts/link_persons.cypher
# ── Frontend ────────────────────────────────────────────
frontend:
cd frontend && npm run dev
# ── Quality ─────────────────────────────────────────────
lint:
cd api && uv run ruff check src/ tests/
cd etl && uv run ruff check src/ tests/
cd frontend && npm run lint
type-check:
cd api && uv run mypy src/
cd etl && uv run mypy src/
cd frontend && npm run type-check
test-api:
cd api && uv run pytest
test-etl:
cd etl && uv run pytest
test-frontend:
cd frontend && npm test
test: test-api test-etl test-frontend
# ── Integration tests ─────────────────────────────────
test-integration-api:
cd api && uv run pytest -m integration
test-integration-etl:
cd etl && uv run pytest -m integration
test-integration: test-integration-api test-integration-etl
# ── Full check (run before commit) ─────────────────────
check: lint type-check test
@echo "All checks passed."
# ── Neutrality audit ───────────────────────────────────
neutrality:
@! grep -rn \
"suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty\|CRITICAL\|HIGH.*severity\|MEDIUM.*severity\|LOW.*severity" \
api/src/ etl/src/ frontend/src/ \
--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
|| (echo "NEUTRALITY VIOLATION FOUND" && exit 1)
@echo "Neutrality check passed."
# ── Bootstrap ─────────────────────────────────────────────
bootstrap-demo:
bash scripts/bootstrap_public_demo.sh --profile demo
@@ -24,6 +135,7 @@ bootstrap-all-noninteractive:
bootstrap-all-report:
python3 scripts/run_bootstrap_all.py --repo-root . --report-latest
# ── Quality checks ────────────────────────────────────────
check-public-claims:
python3 scripts/check_public_claims.py --repo-root .
@@ -36,22 +148,20 @@ check-pipeline-contracts:
check-pipeline-inputs:
python3 scripts/check_pipeline_inputs.py
# ── Generators ────────────────────────────────────────────
generate-pipeline-status:
python3 scripts/generate_pipeline_status.py --registry-path docs/source_registry_br_v1.csv --output docs/pipeline_status.md
python3 scripts/generate_pipeline_status.py
generate-source-summary:
python3 scripts/generate_data_sources_summary.py --registry-path docs/source_registry_br_v1.csv --docs-path docs/data-sources.md
python3 scripts/generate_data_sources_summary.py
generate-reference-metrics:
python3 scripts/generate_reference_metrics.py --json-output audit-results/public-trust/latest/neo4j-reference-metrics.json --doc-output docs/reference_metrics.md
python3 scripts/generate_reference_metrics.py
check:
cd api && bash ../scripts/ci/python_quality.sh
cd etl && bash ../scripts/ci/python_quality.sh
cd frontend && bash ../scripts/ci/frontend_quality.sh
neutrality:
@! grep -rn "suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty" \
api/src/ etl/src/ frontend/src/ \
--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
|| (echo "NEUTRALITY VIOLATION: banned words found in source" && exit 1)
# ── Cleanup ─────────────────────────────────────────────
clean:
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
find . -type d -name .pytest_cache -exec rm -rf {} + 2>/dev/null || true
find . -type d -name .mypy_cache -exec rm -rf {} + 2>/dev/null || true
find . -type d -name .ruff_cache -exec rm -rf {} + 2>/dev/null || true
rm -rf frontend/dist

View File

@@ -1,7 +1,7 @@
[project]
name = "bracc-api"
version = "0.1.0"
description = "BRACC API — Brazilian public data anti-corruption graph tool"
description = "BR-ACC API — Brazilian public data anti-corruption graph tool"
requires-python = ">=3.12"
license = "AGPL-3.0-or-later"
dependencies = [

View File

@@ -1,5 +1,6 @@
from typing import Literal
from pydantic import Field
from pydantic_settings import BaseSettings
@@ -17,14 +18,15 @@ class Settings(BaseSettings):
jwt_secret_key: str = "change-me-in-production"
jwt_algorithm: str = "HS256"
jwt_expire_minutes: int = 1440
auth_cookie_name: str = "bracc_session"
auth_cookie_secure: bool = False
auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
trust_proxy_headers: bool = False
rate_limit_anon: str = "60/minute"
rate_limit_auth: str = "300/minute"
invite_code: str = ""
cors_origins: str = "http://localhost:3000"
auth_cookie_name: str = "bracc_session"
auth_cookie_secure: bool = False
auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
trust_proxy_headers: bool = False
share_token_ttl_hours: int = 168 # 7 days
product_tier: str = "community"
patterns_enabled: bool = False
public_mode: bool = False
@@ -37,7 +39,16 @@ class Settings(BaseSettings):
pattern_srp_min_orgs: int = 5
pattern_inexig_min_recurrence: int = 3
pattern_max_evidence_refs: int = 50
share_token_ttl_hours: int = 168
# Pattern hardening defaults (decision-complete contract)
pattern_temporal_window_years: int = Field(default=4, ge=1, le=20)
pattern_min_contract_value: float = Field(default=100000.0, ge=0)
pattern_min_contract_count: int = Field(default=2, ge=1)
pattern_min_debt_value: float = Field(default=50000.0, ge=0)
pattern_same_as_min_confidence: float = Field(default=0.85, ge=0, le=1)
pattern_pep_min_confidence: float = Field(default=0.85, ge=0, le=1)
pattern_min_recurrence: int = Field(default=2, ge=1)
pattern_min_discrepancy_ratio: float = Field(default=0.30, ge=0, le=1)
model_config = {"env_prefix": "", "env_file": ".env"}

View File

@@ -35,7 +35,12 @@ async def close_driver() -> None:
async def get_driver(request: Request) -> AsyncDriver:
driver: AsyncDriver = request.app.state.neo4j_driver
driver: AsyncDriver | None = getattr(request.app.state, "neo4j_driver", None)
if driver is None:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Database connection not available",
)
return driver

View File

@@ -2,7 +2,7 @@ import logging
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
@@ -51,7 +51,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
app = FastAPI(
title="BRACC API",
title="BR-ACC API",
description="Brazilian public data graph analysis tool",
version="0.1.0",
lifespan=lifespan,
@@ -85,5 +85,5 @@ app.include_router(investigation.shared_router)
@app.get("/health")
async def health(request: Request) -> dict[str, str]:
return {"status": "ok", "version": request.app.version}
async def health() -> dict[str, str]:
return {"status": "ok"}

View File

@@ -53,7 +53,7 @@ def _is_pep_record(record: dict[str, Any]) -> bool:
for field in ("role", "cargo"):
value = record.get(field)
if isinstance(value, str) and value.strip().lower() in PEP_ROLES:
if isinstance(value, str) and any(kw in value.strip().lower() for kw in PEP_ROLES):
return True
return False

View File

@@ -6,37 +6,20 @@ from bracc.config import settings
from bracc.services.auth_service import decode_access_token
def _extract_token(request: Request) -> str | None:
def _get_rate_limit_key(request: Request) -> str:
"""Extract user_id from JWT (Bearer or cookie) for rate limiting, fallback to IP."""
auth = request.headers.get("authorization", "")
if auth.startswith("Bearer "):
return auth[7:].strip()
cookie_token = request.cookies.get(settings.auth_cookie_name)
if isinstance(cookie_token, str) and cookie_token.strip():
return cookie_token.strip()
return None
def _resolve_client_ip(request: Request) -> str:
if settings.trust_proxy_headers:
forwarded = request.headers.get("x-forwarded-for", "")
if forwarded:
first_hop = forwarded.split(",", 1)[0].strip()
if first_hop:
return first_hop
real_ip = request.headers.get("x-real-ip", "").strip()
if real_ip:
return real_ip
return get_remote_address(request)
def _get_rate_limit_key(request: Request) -> str:
"""Extract user_id from JWT for rate limiting, fallback to IP."""
token = _extract_token(request)
if token:
token = auth[7:]
user_id = decode_access_token(token)
if user_id:
return f"user:{user_id}"
return _resolve_client_ip(request)
cookie_token = request.cookies.get(settings.auth_cookie_name)
if isinstance(cookie_token, str) and cookie_token.strip():
user_id = decode_access_token(cookie_token.strip())
if user_id:
return f"user:{user_id}"
return get_remote_address(request)
limiter = Limiter(

View File

@@ -1,27 +1,15 @@
MATCH (center) WHERE elementId(center) = $entity_id
MATCH (center)
WHERE elementId(center) = $entity_id
AND (center:Person OR center:Partner OR center:Company OR center:Contract OR center:Sanction OR center:Election
OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education
OR center:Convenio OR center:LaborStats OR center:PublicOffice)
WITH center,
CASE
WHEN coalesce($include_probable, false) THEN
"SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS"
ELSE
"SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS"
END AS relationship_filter
CALL apoc.path.subgraphAll(center, {
relationshipFilter: relationship_filter,
labelFilter: "-User|-Investigation|-Annotation|-Tag",
maxLevel: $depth,
limit: 200
})
YIELD nodes, relationships
WITH center, nodes, relationships
UNWIND relationships AS r
WITH center,
startNode(r) AS src,
endNode(r) AS tgt,
r
OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS*1..4]-(connected)
WHERE length(p) <= $depth
AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag))
WITH center, p
UNWIND CASE WHEN p IS NULL THEN [] ELSE relationships(p) END AS r
WITH DISTINCT center, r, startNode(r) AS src, endNode(r) AS tgt
WHERE coalesce($include_probable, false) OR type(r) <> "POSSIBLE_SAME_AS"
RETURN center AS e,
r,
CASE WHEN elementId(src) = elementId(center) THEN tgt ELSE src END AS connected,

View File

@@ -1,14 +1,21 @@
MATCH (center) WHERE elementId(center) = $entity_id
MATCH (center)
WHERE elementId(center) = $entity_id
AND (center:Person OR center:Company OR center:Contract OR center:Sanction OR center:Election
OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education
OR center:Convenio OR center:LaborStats OR center:PublicOffice
OR center:OffshoreEntity OR center:OffshoreOfficer OR center:GlobalPEP
OR center:CVMProceeding OR center:Expense)
CALL apoc.path.subgraphAll(center, {
relationshipFilter: "SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU",
labelFilter: $label_filter,
maxLevel: $depth,
limit: 200
})
YIELD nodes, relationships
RETURN nodes, relationships, elementId(center) AS center_id
OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU*1..4]-(n)
WHERE length(p) <= $depth
AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag))
WITH center, collect(p) AS paths
WITH center,
reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes,
reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels
UNWIND raw_nodes AS n
WITH center, collect(DISTINCT n) AS nodes, raw_rels
UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r
WITH center, nodes, collect(DISTINCT r) AS rels
RETURN nodes,
[x IN rels WHERE x IS NOT NULL] AS relationships,
elementId(center) AS center_id

View File

@@ -1,6 +1,4 @@
MATCH (i:Investigation)
WHERE i.share_token = $token
AND (i.share_expires_at IS NULL OR i.share_expires_at > datetime())
MATCH (i:Investigation {share_token: $token})
OPTIONAL MATCH (i)-[:INCLUDES]->(e)
WITH i, collect(coalesce(e.cpf, e.cnpj, e.contract_id, e.sanction_id, e.amendment_id, e.cnes_code, e.finance_id, e.embargo_id, e.school_id, e.convenio_id, e.stats_id, elementId(e))) AS eids
RETURN i.id AS id,
@@ -9,5 +7,4 @@ RETURN i.id AS id,
i.created_at AS created_at,
i.updated_at AS updated_at,
i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -4,8 +4,7 @@ CREATE (i:Investigation {
description: $description,
created_at: datetime(),
updated_at: datetime(),
share_token: null,
share_expires_at: null
share_token: null
})
WITH i
MATCH (u:User {id: $user_id})
@@ -16,5 +15,4 @@ RETURN i.id AS id,
i.created_at AS created_at,
i.updated_at AS updated_at,
i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[] AS entity_ids

View File

@@ -7,5 +7,4 @@ RETURN i.id AS id,
i.created_at AS created_at,
i.updated_at AS updated_at,
i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -13,5 +13,4 @@ RETURN total,
i.created_at AS created_at,
i.updated_at AS updated_at,
i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -1,7 +1,5 @@
MATCH (u:User {id: $user_id})-[:OWNS]->(i:Investigation {id: $id})
SET i.share_token = $share_token,
i.share_expires_at = $share_expires_at,
i.updated_at = datetime()
RETURN i.id AS id,
i.share_token AS share_token,
i.share_expires_at AS share_expires_at
i.share_token AS share_token

View File

@@ -11,5 +11,4 @@ RETURN i.id AS id,
i.created_at AS created_at,
i.updated_at AS updated_at,
i.share_token AS share_token,
i.share_expires_at AS share_expires_at,
[x IN eids WHERE x IS NOT NULL] AS entity_ids

View File

@@ -1,5 +1,6 @@
MATCH (n) WHERE elementId(n) = $entity_id
MATCH (n)
WHERE elementId(n) = $entity_id
AND (n:Person OR n:Company OR n:Contract OR n:Sanction OR n:Election
OR n:Amendment OR n:Finance OR n:Embargo OR n:Health OR n:Education
OR n:Convenio OR n:LaborStats OR n:PublicOffice)
RETURN apoc.node.degree(n) AS degree
RETURN COUNT { (n)--() } AS degree

View File

@@ -2,11 +2,31 @@ MATCH (center:Company)
WHERE elementId(center) = $company_id
OR center.cnpj = $company_identifier
OR center.cnpj = $company_identifier_formatted
CALL apoc.path.subgraphAll(center, {
relationshipFilter: "SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU",
labelFilter: "+Company|+Contract|+Sanction|+Finance|+Amendment|+Convenio|+Bid|+MunicipalContract|+MunicipalBid|-Person|-Partner|-User|-Investigation|-Annotation|-Tag",
maxLevel: $depth,
limit: 200
})
YIELD nodes, relationships
RETURN nodes, relationships, elementId(center) AS center_id
OPTIONAL MATCH p=(center)-[:SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU*1..4]-(n)
WHERE length(p) <= $depth
AND all(
x IN nodes(p)
WHERE NOT (
"Person" IN labels(x)
OR "Partner" IN labels(x)
OR "User" IN labels(x)
OR "Investigation" IN labels(x)
OR "Annotation" IN labels(x)
OR "Tag" IN labels(x)
)
)
AND (
n:Company OR n:Contract OR n:Sanction OR n:Finance OR n:Amendment OR n:Convenio
OR n:Bid OR n:MunicipalContract OR n:MunicipalBid OR n IS NULL
)
WITH center, collect(p) AS paths
WITH center,
reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes,
reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels
UNWIND raw_nodes AS n
WITH center, collect(DISTINCT n) AS nodes, raw_rels
UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r
WITH center, nodes, collect(DISTINCT r) AS rels
RETURN nodes,
[x IN rels WHERE x IS NOT NULL] AS relationships,
elementId(center) AS center_id

View File

@@ -1,4 +1,4 @@
// BRACC Neo4j Schema — Constraints and Indexes
// BR-ACC Neo4j Schema — Constraints and Indexes
// Applied on database initialization
// ── Uniqueness Constraints ──────────────────────────────

View File

@@ -6,6 +6,7 @@ from neo4j import AsyncSession
from bracc.dependencies import get_session
from bracc.models.baseline import BaselineResponse
from bracc.services.baseline_service import BASELINE_QUERIES, run_all_baselines, run_baseline
from bracc.services.public_guard import enforce_entity_lookup_enabled
router = APIRouter(prefix="/api/v1/baseline", tags=["baseline"])
@@ -16,6 +17,7 @@ async def get_baseline_for_entity(
session: Annotated[AsyncSession, Depends(get_session)],
dimension: Annotated[str | None, Query()] = None,
) -> BaselineResponse:
enforce_entity_lookup_enabled()
if dimension:
if dimension not in BASELINE_QUERIES:
available = list(BASELINE_QUERIES.keys())

View File

@@ -182,7 +182,7 @@ async def get_entity_timeline(
date=event_date,
label=str(label),
entity_type=entity_type,
properties=sanitize_props(props),
properties=sanitize_public_properties(sanitize_props(props)),
sources=[SourceAttribution(database="neo4j_graph")],
))

View File

@@ -311,7 +311,7 @@ async def export_investigation_pdf(
cpf_val = node.get("cpf")
if cpf_val and isinstance(cpf_val, str):
role = str(node.get("role", node.get("cargo", ""))).lower()
is_pep = role in PEP_ROLES
is_pep = any(kw in role for kw in PEP_ROLES)
if not is_pep:
if "." in document and "-" in document:
document = mask_formatted_cpf(document)

View File

@@ -6,6 +6,7 @@ from neo4j import AsyncSession
from bracc.dependencies import get_session
from bracc.services.neo4j_service import execute_query_single
from bracc.services.public_guard import should_hide_person_entities
from bracc.services.source_registry import load_source_registry, source_registry_summary
router = APIRouter(prefix="/api/v1/meta", tags=["meta"])
@@ -40,7 +41,9 @@ async def database_stats(
result = {
"total_nodes": record["total_nodes"] if record else 0,
"total_relationships": record["total_relationships"] if record else 0,
"person_count": record["person_count"] if record else 0,
"person_count": (
0 if should_hide_person_entities() else (record["person_count"] if record else 0)
),
"company_count": record["company_count"] if record else 0,
"health_count": record["health_count"] if record else 0,
"finance_count": record["finance_count"] if record else 0,

View File

@@ -57,12 +57,6 @@ async def public_meta(
return {
"product": "World Transparency Graph",
"mode": "public_safe",
"dataset_scope": {
"local_default": "demo_local",
"ingestion_mode": "byo_ingestion",
"reference_metrics": "reference_production_snapshot",
},
"metrics_as_of_utc": "2026-03-01T23:05:00Z",
"total_nodes": record["total_nodes"] if record else 0,
"total_relationships": record["total_relationships"] if record else 0,
"company_count": record["company_count"] if record else 0,

View File

@@ -61,9 +61,9 @@ async def search_entities(
{
"query": _escape_lucene(q),
"entity_type": type_filter,
"hide_person_entities": hide_person_entities,
"skip": skip,
"limit": size,
"hide_person_entities": hide_person_entities,
},
)
total_record = await execute_query_single(

View File

@@ -9,6 +9,17 @@ from testcontainers.neo4j import Neo4jContainer
from bracc.main import app
def _iter_cypher_statements(path: Path) -> list[str]:
# Strip comment-only lines before splitting to avoid dropping statements
# that are preceded by section headers.
filtered_lines = [
line for line in path.read_text().splitlines()
if line.strip() and not line.strip().startswith("//")
]
text = "\n".join(filtered_lines)
return [stmt.strip() for stmt in text.split(";") if stmt.strip()]
@pytest.fixture(scope="session")
def neo4j_container() -> Neo4jContainer: # type: ignore[misc]
"""Start a Neo4j container for integration tests."""
@@ -25,21 +36,34 @@ def neo4j_uri(neo4j_container: Neo4jContainer) -> str:
@pytest.fixture(scope="session")
def neo4j_auth(neo4j_container: Neo4jContainer) -> tuple[str, str]:
return ("neo4j", neo4j_container.NEO4J_ADMIN_PASSWORD)
# testcontainers.neo4j API changed: older versions exposed NEO4J_ADMIN_PASSWORD,
# newer versions expose username/password attributes.
username = getattr(neo4j_container, "username", "neo4j")
password = getattr(
neo4j_container,
"password",
getattr(neo4j_container, "NEO4J_ADMIN_PASSWORD", None),
)
if password is None:
msg = "Could not resolve Neo4j testcontainer password"
raise RuntimeError(msg)
return (username, password)
@pytest.fixture(scope="session")
@pytest.fixture
async def neo4j_driver(
neo4j_uri: str, neo4j_auth: tuple[str, str]
) -> AsyncIterator[AsyncDriver]:
# Function-scoped driver avoids loop affinity issues between async tests.
driver = AsyncGraphDatabase.driver(neo4j_uri, auth=neo4j_auth)
async with driver.session() as session:
# Keep tests deterministic across function scope by resetting test data.
await session.run("MATCH (n) DETACH DELETE n")
# Apply schema
schema_path = Path(__file__).parent.parent.parent.parent / "infra" / "neo4j" / "init.cypher"
if schema_path.exists():
async with driver.session() as session:
for statement in schema_path.read_text().split(";"):
stmt = statement.strip()
if stmt and not stmt.startswith("//"):
for stmt in _iter_cypher_statements(schema_path):
await session.run(stmt)
# Seed dev data
seed_path = (
@@ -47,9 +71,7 @@ async def neo4j_driver(
)
if seed_path.exists():
async with driver.session() as session:
for statement in seed_path.read_text().split(";"):
stmt = statement.strip()
if stmt and not stmt.startswith("//"):
for stmt in _iter_cypher_statements(seed_path):
await session.run(stmt)
yield driver
await driver.close()

View File

@@ -34,7 +34,11 @@ def _setup_mock_session(driver: MagicMock, records: list[MagicMock]) -> AsyncMoc
@pytest.mark.anyio
async def test_register_success(client: AsyncClient) -> None:
async def test_register_success(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None:
from bracc.config import settings
monkeypatch.setattr(settings, "invite_code", "")
record = _mock_record({
"id": "user-uuid",
"email": "test@example.com",
@@ -56,19 +60,15 @@ async def test_register_success(client: AsyncClient) -> None:
@pytest.mark.anyio
async def test_register_bad_invite(client: AsyncClient) -> None:
async def test_register_bad_invite(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None:
from bracc.config import settings
original = settings.invite_code
try:
settings.invite_code = "secret-code"
monkeypatch.setattr(settings, "invite_code", "secret-code")
response = await client.post(
"/api/v1/auth/register",
json={"email": "test@example.com", "password": "password123", "invite_code": "wrong"},
)
assert response.status_code == 403
finally:
settings.invite_code = original
@pytest.mark.anyio
@@ -155,16 +155,25 @@ async def test_me_invalid_token(client: AsyncClient) -> None:
@pytest.mark.anyio
async def test_register_duplicate_email(client: AsyncClient) -> None:
async def test_register_duplicate_email(
client: AsyncClient,
monkeypatch: pytest.MonkeyPatch,
) -> None:
from neo4j.exceptions import ConstraintError
from bracc.config import settings
from bracc.main import app
monkeypatch.setattr(settings, "invite_code", "")
driver = app.state.neo4j_driver
mock_session = AsyncMock()
mock_session.run = AsyncMock(side_effect=Exception("Constraint violation"))
mock_session.run = AsyncMock(side_effect=ConstraintError("Node already exists"))
driver.session.return_value.__aenter__ = AsyncMock(return_value=mock_session)
with pytest.raises(Exception, match="Constraint violation"):
await client.post(
response = await client.post(
"/api/v1/auth/register",
json={"email": "duplicate@example.com", "password": "password123"},
)
assert response.status_code == 409
assert response.json()["detail"] == "Email already registered"

View File

@@ -61,7 +61,9 @@ def test_decode_access_token_invalid() -> None:
@pytest.mark.anyio
async def test_register_user_success() -> None:
async def test_register_user_success(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(settings, "invite_code", "")
mock_record = _mock_record({
"id": "user-uuid",
"email": "test@example.com",
@@ -80,15 +82,11 @@ async def test_register_user_success() -> None:
@pytest.mark.anyio
async def test_register_user_bad_invite() -> None:
original = settings.invite_code
try:
settings.invite_code = "secret-code"
async def test_register_user_bad_invite(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(settings, "invite_code", "secret-code")
session = AsyncMock()
with pytest.raises(ValueError, match="Invalid invite code"):
await register_user(session, "test@example.com", "password123", "wrong-code")
finally:
settings.invite_code = original
@pytest.mark.anyio

View File

@@ -68,6 +68,28 @@ class TestIsPepRecord:
def test_cargo_field(self) -> None:
assert _is_pep_record({"name": "X", "cpf": "11111111111", "cargo": "Deputado"})
@pytest.mark.parametrize(
    "role",
    [
        "Deputado Federal",
        "deputado federal",
        "DEPUTADO FEDERAL",
        "Senador da Republica",
        "senadora da republica",
        "Vereador Suplente",
        "Ministro de Estado",
        "Governadora do Estado de Sao Paulo",
        "Presidente da Republica",
    ],
)
def test_compound_role_detected_as_pep(self, role: str) -> None:
    """Every parametrized multi-word role, in any letter case, must be classified as PEP."""
    # Only the ``role`` field varies; name and CPF are fixed placeholders.
    candidate = {"name": "X", "cpf": "11111111111", "role": role}
    assert _is_pep_record(candidate)
def test_compound_cargo_detected_as_pep(self) -> None:
    """A multi-word title supplied in the ``cargo`` field must be classified as PEP."""
    candidate = {"name": "X", "cpf": "11111111111", "cargo": "Deputado Federal"}
    assert _is_pep_record(candidate)
def test_non_pep_role(self) -> None:
assert not _is_pep_record({"name": "X", "cpf": "11111111111", "role": "assessor"})
@@ -99,6 +121,18 @@ class TestCollectPepCpfs:
data = {"a": {"b": {"c": [{"cpf": "33333333333", "is_pep": True}]}}}
assert "33333333333" in _collect_pep_cpfs(data)
def test_compound_role_collected(self) -> None:
    """The CPF walk must pick up a compound PEP role and ignore a non-PEP one."""
    pep_entry = {"cpf": "11111111111", "role": "Deputado Federal"}
    staff_entry = {"cpf": "22222222222", "role": "assessor parlamentar"}

    collected = _collect_pep_cpfs({"results": [pep_entry, staff_entry]})

    assert "11111111111" in collected
    assert "22222222222" not in collected
# ---------------------------------------------------------------------------
# Unit tests for mask_cpfs_in_json
@@ -205,4 +239,4 @@ async def test_health_not_masked(client: AsyncClient) -> None:
"""Non-CPF JSON responses pass through unchanged."""
resp = await client.get("/health")
assert resp.status_code == 200
assert resp.json()["status"] == "ok" and "version" in resp.json()
assert resp.json() == {"status": "ok"}

View File

@@ -8,9 +8,7 @@ from httpx import AsyncClient
async def test_health_returns_ok(client: AsyncClient) -> None:
response = await client.get("/health")
assert response.status_code == 200
data = response.json()
assert data["status"] == "ok"
assert "version" in data
assert response.json() == {"status": "ok"}
assert response.headers["x-content-type-options"] == "nosniff"
assert response.headers["x-frame-options"] == "DENY"
assert response.headers["referrer-policy"] == "no-referrer"

View File

@@ -1,120 +0,0 @@
from unittest.mock import AsyncMock, patch
import pytest
from httpx import AsyncClient
from bracc.config import settings
from bracc.models.pattern import PATTERN_METADATA
from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES
from bracc.services.neo4j_service import CypherLoader
@pytest.fixture(autouse=True)
def _enable_patterns(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(settings, "patterns_enabled", True)
def test_all_community_patterns_have_metadata() -> None:
for pattern_id in COMMUNITY_PATTERN_IDS:
assert pattern_id in PATTERN_METADATA, f"Missing metadata for {pattern_id}"
def test_all_community_patterns_have_query_files() -> None:
for query_name in COMMUNITY_PATTERN_QUERIES.values():
try:
CypherLoader.load(query_name)
except FileNotFoundError:
pytest.fail(f"Missing .cypher file for query {query_name}.cypher")
finally:
CypherLoader.clear_cache()
def test_pattern_metadata_has_required_fields() -> None:
for pid, meta in PATTERN_METADATA.items():
assert "name_pt" in meta, f"{pid} missing name_pt"
assert "name_en" in meta, f"{pid} missing name_en"
assert "desc_pt" in meta, f"{pid} missing desc_pt"
assert "desc_en" in meta, f"{pid} missing desc_en"
@pytest.mark.anyio
async def test_list_patterns_endpoint(client: AsyncClient) -> None:
response = await client.get("/api/v1/patterns/")
assert response.status_code == 200
data = response.json()
assert "patterns" in data
assert len(data["patterns"]) == 8
ids = {row["id"] for row in data["patterns"]}
assert ids == set(COMMUNITY_PATTERN_IDS)
@pytest.mark.anyio
async def test_patterns_endpoint_returns_503_when_disabled(
client: AsyncClient,
monkeypatch: pytest.MonkeyPatch,
) -> None:
monkeypatch.setattr(settings, "patterns_enabled", False)
response = await client.get("/api/v1/patterns/")
assert response.status_code == 503
assert "temporarily unavailable" in response.json()["detail"]
@pytest.mark.anyio
async def test_invalid_pattern_returns_404(client: AsyncClient) -> None:
response = await client.get("/api/v1/patterns/test-id/nonexistent_pattern")
assert response.status_code == 404
assert "Pattern not found" in response.json()["detail"]
@pytest.mark.anyio
async def test_patterns_endpoint_forwards_include_probable(client: AsyncClient) -> None:
with patch("bracc.routers.patterns.run_all_patterns", new_callable=AsyncMock) as mock_run_all:
mock_run_all.return_value = []
response = await client.get("/api/v1/patterns/test-id?include_probable=true")
assert response.status_code == 200
mock_run_all.assert_awaited_once()
_driver, entity_id, _lang = mock_run_all.await_args.args
assert entity_id == "test-id"
assert mock_run_all.await_args.kwargs["include_probable"] is True
@pytest.mark.anyio
async def test_specific_pattern_endpoint_forwards_include_probable(client: AsyncClient) -> None:
with patch("bracc.routers.patterns.run_pattern", new_callable=AsyncMock) as mock_run_one:
mock_run_one.return_value = []
response = await client.get(
"/api/v1/patterns/test-id/debtor_contracts?include_probable=true",
)
assert response.status_code == 200
mock_run_one.assert_awaited_once()
_session, pattern_name, entity_id, _lang = mock_run_one.await_args.args
assert pattern_name == "debtor_contracts"
assert entity_id == "test-id"
assert mock_run_one.await_args.kwargs["include_probable"] is True
def test_community_queries_use_bind_params() -> None:
for query_name in COMMUNITY_PATTERN_QUERIES.values():
try:
cypher = CypherLoader.load(query_name)
finally:
CypherLoader.clear_cache()
assert "$company_id" in cypher, f"{query_name}.cypher missing $company_id"
assert "$company_identifier" in cypher, f"{query_name}.cypher missing $company_identifier"
assert "$company_identifier_formatted" in cypher, (
f"{query_name}.cypher missing $company_identifier_formatted"
)
assert "${" not in cypher, f"{query_name}.cypher uses unsafe string interpolation"
def test_no_banned_words_in_pattern_metadata() -> None:
banned = {"suspicious", "corrupt", "criminal", "fraudulent", "illegal", "guilty"}
for pid, meta in PATTERN_METADATA.items():
for key, value in meta.items():
for word in banned:
assert word not in value.lower(), (
f"Banned word '{word}' in {pid}.{key}: {value}"
)

View File

@@ -1,79 +0,0 @@
"""Community public-safe pattern registry and query contract tests."""
import pytest
from bracc.models.pattern import PATTERN_METADATA
from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES
from bracc.services.neo4j_service import CypherLoader
def test_community_pattern_registry_exact_ids() -> None:
assert len(COMMUNITY_PATTERN_IDS) == 8
assert set(COMMUNITY_PATTERN_IDS) == {
"sanctioned_still_receiving",
"amendment_beneficiary_contracts",
"split_contracts_below_threshold",
"contract_concentration",
"embargoed_receiving",
"debtor_contracts",
"srp_multi_org_hitchhiking",
"inexigibility_recurrence",
}
def test_community_pattern_query_mapping_is_complete() -> None:
assert set(COMMUNITY_PATTERN_QUERIES.keys()) == set(COMMUNITY_PATTERN_IDS)
for query_name in COMMUNITY_PATTERN_QUERIES.values():
assert query_name.startswith("public_pattern_")
@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values())
def test_public_pattern_query_files_load(query_name: str) -> None:
try:
CypherLoader.load(query_name)
finally:
CypherLoader.clear_cache()
@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values())
def test_public_pattern_query_required_return_aliases(query_name: str) -> None:
try:
cypher = CypherLoader.load(query_name)
finally:
CypherLoader.clear_cache()
for required_alias in (
" AS pattern_id",
" AS risk_signal",
" AS amount_total",
" AS window_start",
" AS window_end",
" AS evidence_refs",
" AS evidence_count",
):
assert required_alias in cypher, f"{query_name}.cypher missing alias: {required_alias}"
@pytest.mark.parametrize("pattern_id", COMMUNITY_PATTERN_IDS)
def test_community_pattern_metadata_is_present(pattern_id: str) -> None:
meta = PATTERN_METADATA.get(pattern_id)
assert meta is not None
assert meta.get("name_pt")
assert meta.get("name_en")
assert meta.get("desc_pt")
assert meta.get("desc_en")
def test_threshold_params_used_in_threshold_patterns() -> None:
query_params = {
"public_pattern_split_contracts_below_threshold": "$pattern_split_threshold_value",
"public_pattern_contract_concentration": "$pattern_share_threshold",
"public_pattern_srp_multi_org_hitchhiking": "$pattern_srp_min_orgs",
"public_pattern_inexigibility_recurrence": "$pattern_inexig_min_recurrence",
}
for query_name, required_param in query_params.items():
try:
cypher = CypherLoader.load(query_name)
finally:
CypherLoader.clear_cache()
assert required_param in cypher, f"{query_name}.cypher missing {required_param}"

View File

@@ -225,6 +225,135 @@ async def test_public_graph_company_filters_person_nodes(client: AsyncClient) ->
assert len(payload["edges"]) == 0
@pytest.mark.anyio
async def test_baseline_disabled_in_public_mode(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The baseline endpoint answers 403 when public mode is on and entity lookup is off."""
    public_flags = (
        ("public_mode", True),
        ("public_allow_entity_lookup", False),
    )
    for flag_name, flag_value in public_flags:
        monkeypatch.setattr(settings, flag_name, flag_value)

    response = await client.get("/api/v1/baseline/test-id")

    assert response.status_code == 403
    detail = response.json()["detail"]
    assert "disabled in public mode" in detail
@pytest.mark.anyio
async def test_stats_hides_person_count_in_public_mode(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``/meta/stats`` reports ``person_count`` as 0 in public mode with persons disallowed.

    The query layer is mocked to return a non-zero ``person_count`` (999), so a
    zero in the response shows the value was suppressed by the endpoint, while
    an unrelated counter (``company_count``) is passed through unchanged.
    """
    import bracc.routers.meta as meta_mod

    monkeypatch.setattr(settings, "public_mode", True)
    monkeypatch.setattr(settings, "public_allow_person", False)
    # Drop any cached stats so the endpoint recomputes from the mocked query.
    monkeypatch.setattr(meta_mod, "_stats_cache", None)

    fake_record = {
        "total_nodes": 100,
        "total_relationships": 200,
        "person_count": 999,
        "company_count": 50,
        "health_count": 10,
        "finance_count": 5,
        "contract_count": 20,
        "sanction_count": 3,
        "election_count": 7,
        "amendment_count": 4,
        "embargo_count": 2,
        "education_count": 6,
        "convenio_count": 8,
        "laborstats_count": 9,
        "offshore_entity_count": 1,
        "offshore_officer_count": 2,
        "global_pep_count": 3,
        "cvm_proceeding_count": 4,
        "expense_count": 11,
        "pep_record_count": 12,
        "expulsion_count": 13,
        "leniency_count": 14,
        "international_sanction_count": 15,
        "gov_card_expense_count": 16,
        "gov_travel_count": 17,
        "bid_count": 18,
        "fund_count": 19,
        "dou_act_count": 20,
        "tax_waiver_count": 21,
        "municipal_finance_count": 22,
        "declared_asset_count": 23,
        "party_membership_count": 24,
        "barred_ngo_count": 25,
        "bcb_penalty_count": 26,
        "labor_movement_count": 27,
        "legal_case_count": 28,
        "judicial_case_count": 29,
        "source_document_count": 30,
        "ingestion_run_count": 31,
        "temporal_violation_count": 32,
        "cpi_count": 33,
        "inquiry_requirement_count": 34,
        "inquiry_session_count": 35,
        "municipal_bid_count": 36,
        "municipal_contract_count": 37,
        "municipal_gazette_act_count": 38,
    }
    # Source-registry summary with every counter zeroed; only its shape matters here.
    zeroed_summary = dict.fromkeys(
        (
            "universe_v1_sources",
            "implemented_sources",
            "loaded_sources",
            "healthy_sources",
            "stale_sources",
            "blocked_external_sources",
            "quality_fail_sources",
            "discovered_uningested_sources",
        ),
        0,
    )

    query_patch = patch(
        "bracc.routers.meta.execute_query_single",
        new_callable=AsyncMock,
        return_value=fake_record,
    )
    registry_patch = patch("bracc.routers.meta.load_source_registry", return_value=[])
    summary_patch = patch(
        "bracc.routers.meta.source_registry_summary",
        return_value=zeroed_summary,
    )
    with query_patch, registry_patch, summary_patch:
        response = await client.get("/api/v1/meta/stats")

    assert response.status_code == 200
    payload = response.json()
    assert payload["person_count"] == 0
    assert payload["company_count"] == 50  # non-person counts preserved
@pytest.mark.anyio
async def test_timeline_sanitizes_properties_in_public_mode(
    client: AsyncClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Timeline events returned in public mode omit ``cpf`` but keep other properties.

    Entity lookup is explicitly allowed so the request reaches the timeline
    handler; the query layer is mocked with a single Contract event whose
    properties include a ``cpf`` key.
    """
    monkeypatch.setattr(settings, "public_mode", True)
    monkeypatch.setattr(settings, "public_allow_entity_lookup", True)

    contract_event = {
        "lbls": ["Contract"],
        "props": {"type": "licitacao", "cpf": "12345678900", "value": 50000.0},
        "event_date": "2024-01-15",
        "id": "evt-1",
    }
    query_patch = patch(
        "bracc.routers.entity.execute_query",
        new_callable=AsyncMock,
        return_value=[contract_event],
    )
    with query_patch:
        response = await client.get("/api/v1/entity/test-id/timeline")

    assert response.status_code == 200
    events = response.json()["events"]
    assert len(events) == 1
    event_props = events[0]["properties"]
    assert "cpf" not in event_props
    assert event_props["value"] == 50000.0
@pytest.mark.anyio
async def test_investigations_disabled_in_public_mode(
client: AsyncClient,

View File

@@ -1,24 +1,15 @@
from unittest.mock import MagicMock
from bracc.config import settings
from bracc.middleware.rate_limit import _get_rate_limit_key, limiter
from bracc.services.auth_service import create_access_token
def _make_request(
auth_header: str | None = None,
client_ip: str = "127.0.0.1",
cookie_token: str | None = None,
x_forwarded_for: str | None = None,
) -> MagicMock:
def _make_request(auth_header: str | None = None, client_ip: str = "127.0.0.1") -> MagicMock:
request = MagicMock()
headers: dict[str, str] = {}
if auth_header:
headers["authorization"] = auth_header
if x_forwarded_for:
headers["x-forwarded-for"] = x_forwarded_for
request.headers = headers
request.cookies = {settings.auth_cookie_name: cookie_token} if cookie_token else {}
request.client = MagicMock()
request.client.host = client_ip
return request
@@ -43,23 +34,5 @@ def test_key_func_invalid_token_fallback() -> None:
assert key == "10.0.0.1"
def test_key_func_extracts_user_from_cookie_token() -> None:
token = create_access_token("cookie-user-1")
request = _make_request(cookie_token=token)
key = _get_rate_limit_key(request)
assert key == "user:cookie-user-1"
def test_key_func_uses_forwarded_ip_when_enabled() -> None:
original = settings.trust_proxy_headers
try:
settings.trust_proxy_headers = True
request = _make_request(client_ip="127.0.0.1", x_forwarded_for="203.0.113.9, 10.0.0.4")
key = _get_rate_limit_key(request)
assert key == "203.0.113.9"
finally:
settings.trust_proxy_headers = original
def test_limiter_instance_exists() -> None:
assert limiter is not None

View File

@@ -1,21 +1,6 @@
import pytest
from httpx import AsyncClient
from bracc.routers.search import _escape_lucene
def test_escape_lucene_cnpj() -> None:
assert _escape_lucene("00.000.000/0001-00") == "00.000.000\\/0001\\-00"
def test_escape_lucene_plain_text() -> None:
assert _escape_lucene("silva construcoes") == "silva construcoes"
def test_escape_lucene_all_special_chars() -> None:
for ch in r'+-&|!(){}[]^"~*?:\/':
assert f"\\{ch}" in _escape_lucene(ch)
@pytest.mark.anyio
async def test_search_rejects_short_query(client: AsyncClient) -> None:

100
api/uv.lock generated
View File

@@ -103,6 +103,56 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
]
[[package]]
name = "bracc-api"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "bcrypt" },
{ name = "fastapi" },
{ name = "jinja2" },
{ name = "neo4j" },
{ name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "pyjwt", extra = ["crypto"] },
{ name = "python-multipart" },
{ name = "slowapi" },
{ name = "uvicorn", extra = ["standard"] },
{ name = "weasyprint" },
]
[package.optional-dependencies]
dev = [
{ name = "httpx" },
{ name = "mypy" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "ruff" },
{ name = "testcontainers", extra = ["neo4j"] },
]
[package.metadata]
requires-dist = [
{ name = "bcrypt", specifier = ">=4.0.0" },
{ name = "fastapi", specifier = ">=0.115.0" },
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
{ name = "jinja2", specifier = ">=3.1.0" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" },
{ name = "neo4j", specifier = ">=5.27.0" },
{ name = "pydantic", specifier = ">=2.10.0" },
{ name = "pydantic-settings", specifier = ">=2.7.0" },
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
{ name = "python-multipart", specifier = ">=0.0.18" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
{ name = "slowapi", specifier = ">=0.1.9" },
{ name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" },
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" },
{ name = "weasyprint", specifier = ">=62.0" },
]
provides-extras = ["dev"]
[[package]]
name = "brotli"
version = "1.2.0"
@@ -523,56 +573,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
[[package]]
name = "bracc-api"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "bcrypt" },
{ name = "fastapi" },
{ name = "jinja2" },
{ name = "neo4j" },
{ name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "pyjwt", extra = ["crypto"] },
{ name = "python-multipart" },
{ name = "slowapi" },
{ name = "uvicorn", extra = ["standard"] },
{ name = "weasyprint" },
]
[package.optional-dependencies]
dev = [
{ name = "httpx" },
{ name = "mypy" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "ruff" },
{ name = "testcontainers", extra = ["neo4j"] },
]
[package.metadata]
requires-dist = [
{ name = "bcrypt", specifier = ">=4.0.0" },
{ name = "fastapi", specifier = ">=0.115.0" },
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
{ name = "jinja2", specifier = ">=3.1.0" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" },
{ name = "neo4j", specifier = ">=5.27.0" },
{ name = "pydantic", specifier = ">=2.10.0" },
{ name = "pydantic-settings", specifier = ">=2.7.0" },
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
{ name = "python-multipart", specifier = ">=0.0.18" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
{ name = "slowapi", specifier = ">=0.1.9" },
{ name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" },
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" },
{ name = "weasyprint", specifier = ">=62.0" },
]
provides-extras = ["dev"]
[[package]]
name = "idna"
version = "3.11"

0
data/.gitkeep Normal file
View File

View File

0
data/cnpj/raw/.gitkeep Normal file
View File

View File

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.3 MiB

View File

@@ -1,24 +1,13 @@
# BRACC Data Source Catalog
# ICARUS Data Source Catalog
<!-- SOURCE_SUMMARY_START -->
**Generated from `docs/source_registry_br_v1.csv` (as-of UTC: 2026-03-01T23:05:00Z)**
- Universe v1 sources: 108
- Implemented pipelines: 45
- Loaded sources (load_state=loaded): 36
- Partial sources (load_state=partial): 8
- Not loaded sources (load_state=not_loaded): 64
- Status counts: loaded=36, partial=5, stale=3, blocked_external=1, not_built=63
<!-- SOURCE_SUMMARY_END -->
Catalog note: counts and status labels are generated from the public registry (`docs/source_registry_br_v1.csv`).
This document includes reference production inventory context and backlog discovery; it is not a guarantee that every listed source is currently loaded in your local environment.
**38 loaded | 3 pipelines pending data | 60+ not yet built**
Last updated: 2026-02-26
---
## 1. Reference Production Snapshot (Loaded/Implemented Inventory)
## 1. LOADED (38 sources)
The table below is a timestamped reference snapshot and should be interpreted together with the generated summary block above.
All sources below have working ETL pipelines in `etl/src/icarus_etl/pipelines/` and are loaded into production Neo4j.
| # | Source | Pipeline | Nodes Created | Rels Created | Notes |
|---|--------|----------|---------------|--------------|-------|

View File

@@ -1,29 +0,0 @@
# Demo Dataset Contract (WTG Open)
## Objective
Provide a reproducible, public-safe demo graph with synthetic records only.
## Safety rules
- Synthetic data only. No real CPF, no real personal names, no real personal addresses.
- Company identifiers may use synthetic CNPJ-like values reserved for demonstration.
- Demo graph cannot include `Person` or `Partner` labels.
- Demo exports must never include private or operational metadata.
## Required files
- `data/demo/synthetic_graph.json`
- `data/demo/README.md`
- `scripts/generate_demo_dataset.py`
## JSON schema (minimum)
- `nodes[]`: `{id, label, type, properties}`
- `edges[]`: `{id, source, target, type, properties}`
- `meta`: `{generated_at_utc, generator_version, source: "synthetic"}`
## Acceptance checks
- No field name contains `cpf`, `doc_partial`, or `doc_raw`.
- No node label equals `Person` or `Partner`.
- CI privacy gate passes.
## Runtime target
- Dedicated demo Neo4j instance (non-production).
- Public API served with `PUBLIC_MODE=true`.

View File

@@ -14,7 +14,6 @@ Resumo:
Release notes: {release_url}
Observação de integridade: os sinais refletem coocorrências em bases públicas e não constituem prova legal.
Divulgação obrigatória: o repositório público entrega engine + demo + fluxo BYO-data; métricas de escala são snapshots de referência com timestamp.
## Short post (EN)
@@ -28,7 +27,6 @@ Summary:
Release notes: {release_url}
Integrity note: signals reflect co-occurrence in public records and are not legal proof.
Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; production-scale metrics are timestamped reference snapshots.
## Discord/Telegram long form (PT+EN)
@@ -44,11 +42,6 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p
**Compatibilidade**
- {pt_compat}
**Reproducibility Reality Check**
- Funciona agora: {pt_works_now}
- Requer ingestão de dados: {pt_requires_ingestion}
- Não incluído por padrão: {pt_not_included}
**Link**
- {release_url}
@@ -64,10 +57,5 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p
**Compatibility**
- {en_compat}
**Reproducibility Reality Check**
- Works now: {en_works_now}
- Requires data ingestion: {en_requires_ingestion}
- Not included by default: {en_not_included}
**Link**
- {release_url}

View File

@@ -7,8 +7,8 @@ docs/**,PUBLIC with review,Keep public documentation and legal pack,include revi
.github/workflows/**,PUBLIC,CI and security transparency,include
scripts/**,PUBLIC with review,Keep public utilities and gates,include reviewed subset
data/demo/**,PUBLIC,Synthetic demo dataset only,include
api/src/bracc/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude
api/src/bracc/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude
api/src/icarus/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude
api/src/icarus/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude
scripts/auto_finalize_pncp_backfill.sh,REMOVE_FROM_PUBLIC,Production operational script with server-specific assumptions,exclude
docs/shadow_rollout_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude
docs/ingestion_priority_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude
1 path classification reason action_for_public_repo
7 .github/workflows/** PUBLIC CI and security transparency include
8 scripts/** PUBLIC with review Keep public utilities and gates include reviewed subset
9 data/demo/** PUBLIC Synthetic demo dataset only include
10 api/src/bracc/services/pattern_service.py api/src/icarus/services/pattern_service.py REMOVE_FROM_PUBLIC Pattern engine disabled pending validation exclude
11 api/src/bracc/queries/pattern_*.cypher api/src/icarus/queries/pattern_*.cypher REMOVE_FROM_PUBLIC Pattern query engine disabled pending validation exclude
12 scripts/auto_finalize_pncp_backfill.sh REMOVE_FROM_PUBLIC Production operational script with server-specific assumptions exclude
13 docs/shadow_rollout_runbook.md REMOVE_FROM_PUBLIC Production operational runbook details exclude
14 docs/ingestion_priority_runbook.md REMOVE_FROM_PUBLIC Production operational runbook details exclude

View File

@@ -1,56 +1,78 @@
# Public Repo Release Checklist — `World-Open-Graph/br-acc`
## 1) Pre-release gate
1. Confirm target merge commit exists on `main`.
2. Confirm CI + Security + Public gates are green on that commit.
3. Confirm PR is merged with exactly one release label.
## 2) Public boundary checks
# Public Repo Release Checklist — World Transparency Graph
## 1) Prepare sanitized snapshot
```bash
python scripts/check_public_privacy.py --repo-root .
python scripts/check_compliance_pack.py --repo-root .
python scripts/check_open_core_boundary.py --repo-root .
bash scripts/prepare_public_snapshot.sh /Users/brunoclz/CORRUPTOS /tmp/world-transparency-graph-public
```
Expected: all `PASS`.
## 3) Snapshot hygiene (optional verification)
## 2) Initialize clean-history repo from snapshot
```bash
bash scripts/prepare_public_snapshot.sh . /tmp/br-acc-public
python /tmp/br-acc-public/scripts/check_public_privacy.py --repo-root /tmp/br-acc-public
python /tmp/br-acc-public/scripts/check_compliance_pack.py --repo-root /tmp/br-acc-public
python /tmp/br-acc-public/scripts/check_open_core_boundary.py --repo-root /tmp/br-acc-public
cd /tmp/world-transparency-graph-public
git init
git add .
git commit -m "Initial public release (WTG)"
```
Expected in snapshot:
## 3) Create GitHub repository (manual)
- Owner: `brunoclz`
- Name: `world-transparency-graph`
- Visibility: Public
- Do not auto-add README/License (already present)
- No `CLAUDE.md`.
- No `AGENTS.md` or `AGENTS*.md`.
- No private operational runbooks outside public scope.
## 4) Push initial release
```bash
git branch -M main
git remote add origin https://github.com/brunoclz/world-transparency-graph.git
git push -u origin main
```
## 4) Publish release (manual workflow)
## 5) Configure branch protection (GitHub UI)
Require all checks:
- `API (Python)`
- `ETL (Python)`
- `Frontend (TypeScript)`
- `Neutrality Audit`
- `Gitleaks`
- `Bandit (Python)`
- `Pip Audit (Python deps)`
- `Public Privacy Gate`
- `Compliance Pack Gate`
- `Public Boundary Gate`
In GitHub Actions, run **Publish Release** with:
## 6) Configure environment defaults
- Set public deployment environment vars:
- `PRODUCT_TIER=community`
- `PUBLIC_MODE=true`
- `PUBLIC_ALLOW_PERSON=false`
- `PUBLIC_ALLOW_ENTITY_LOOKUP=false`
- `PUBLIC_ALLOW_INVESTIGATIONS=false`
- `PATTERNS_ENABLED=false`
- `VITE_PUBLIC_MODE=true`
- `VITE_PATTERNS_ENABLED=false`
- `version`: SemVer tag (e.g. `v0.3.0`, `v0.3.1-rc.1`)
- `target_sha`: merge commit on `main`
- `prerelease`: `false` (stable) or `true` (RC)
- `title_pt`: release title PT-BR
- `title_en`: release title EN
## 7) Final checks before launch
- `python scripts/check_public_privacy.py --repo-root .` => `PASS`
- `python scripts/check_compliance_pack.py --repo-root .` => `PASS`
- `python scripts/check_open_core_boundary.py --repo-root .` => `PASS`
- Confirm no internal runbooks in public repo
- Confirm demo data is synthetic (`data/demo/synthetic_graph.json`)
- Confirm all legal docs exist in root:
- `ETHICS.md`
- `LGPD.md`
- `PRIVACY.md`
- `TERMS.md`
- `DISCLAIMER.md`
- `SECURITY.md`
- `ABUSE_RESPONSE.md`
## 5) Verify outputs
## 8) Launch communication split
- Publish product announcement as **WTG**
- Publish movement announcement as **BRCC**
- Mention methodology limits and non-accusatory policy
1. Tag exists in repository.
2. Release page published under `/releases`.
3. Notes include PT+EN and non-accusatory disclaimer.
4. `release_manifest.json` asset is attached.
5. Compare link is valid (`previous_tag...new_tag`).
## 6) Community communication
1. Use `docs/release/community_announcement_template.md`.
2. Publish short PT+EN summary with release URL.
3. Keep wording factual: “signals/co-occurrence”, never accusatory language.
## 9) Release system bootstrap
- Ensure `.github/release.yml` exists for auto-notes categories.
- Ensure `.github/release-drafter.yml` + workflow are active.
- Ensure `publish-release.yml` workflow is present and dispatchable.
- Ensure release label taxonomy is documented and applied to PRs.
- Publish first policy-compliant tag from this stream (`v0.3.0`).

View File

@@ -48,11 +48,10 @@ A release can only be published from a commit on `main` where all required gates
Every release must include PT-BR and EN sections with:
1. Scope summary.
2. Notable changes (explicit bullet points).
3. Included pattern IDs when release contains pattern/signal changes.
4. Compatibility/breaking notes.
5. Privacy/compliance notes when applicable.
6. Non-accusatory disclaimer.
2. Notable changes.
3. Compatibility/breaking notes.
4. Privacy/compliance notes when applicable.
5. Non-accusatory disclaimer.
## Artifacts

View File

@@ -37,19 +37,6 @@ For validation cycles use RC:
- `prerelease`: `true` for RC, `false` for stable
- `title_pt`: short PT-BR title
- `title_en`: short EN title
- `highlights_pt`: PT highlights separated by `|`
- `highlights_en`: EN highlights separated by `|`
- `patterns_included`: comma-separated pattern IDs (use `none` when not applicable)
- `technical_changes_pt`: PT technical changes separated by `|`
- `technical_changes_en`: EN technical changes separated by `|`
Example inputs for a pattern release:
- `highlights_pt`: `Port de 8 padrões públicos factuais | Padronização de payload público`
- `highlights_en`: `Port of 8 factual public-safe patterns | Public payload standardization`
- `patterns_included`: `sanctioned_still_receiving,amendment_beneficiary_contracts,split_contracts_below_threshold,contract_concentration,embargoed_receiving,debtor_contracts,srp_multi_org_hitchhiking,inexigibility_recurrence`
- `technical_changes_pt`: `Provider community de 4 para 8 padrões | ETL criou relação Contract-REFERENTE_A-Bid`
- `technical_changes_en`: `Community provider expanded from 4 to 8 patterns | ETL created Contract-REFERENTE_A-Bid linkage`
## 4) Workflow validations performed
@@ -65,7 +52,7 @@ The workflow blocks publication when:
On success the workflow:
1. Creates and pushes an annotated tag.
2. Creates GitHub Release (PT+EN notes) with explicit highlights, patterns, and technical changes.
2. Creates GitHub Release (PT+EN notes).
3. Uploads `release_manifest.json` asset.
## 6) Post-release checklist
@@ -73,7 +60,6 @@ On success the workflow:
1. Open the release page and confirm:
- version tag is correct,
- PT+EN notes are present,
- included patterns are explicitly listed (or marked as none),
- non-accusatory disclaimer line is present,
- `release_manifest.json` is attached.
2. Share release link in community channels.

View File

@@ -1,67 +0,0 @@
# Source Onboarding Contract (Brazil Coverage v1)
This contract is mandatory for every new source before `shadow -> promote`.
## 1. Source Identity
- `source_id`:
- `name`:
- `category`:
- `tier`:
- `owner_agent`:
- `primary_url`:
- `access_mode` (`file|api|bigquery|web`):
- `public_access_mode` (`open|open_with_rate_limit|registration|credentialed_public`):
- `discovery_status` (`discovered|discovered_uningested|monitored|unreachable`):
- `last_seen_url`:
- `cadence_expected`:
- `cadence_observed`:
- `quality_status` (`healthy|stale|quality_fail|blocked_external|not_built|partial|loaded`):
## 2. Access and Legal
- Credential required:
- Secret name/path:
- License or usage restriction:
- LGPD/privacy considerations:
- `blocked_external` criteria:
## 3. Data Contract
- Downloader script: `etl/scripts/download_<source>.py`
- Canonical output files:
- Manifest file:
- Manifest mandatory fields (`run_id`, `source_id`, `window_start`, `window_end`, `rows`, `error`, `checksum`, `retrieved_at_utc`):
- Update cadence:
- Expected row volume:
- Partition/window strategy:
## 4. Graph Contract
- Node labels introduced:
- Relationship types introduced:
- Natural key(s) per node:
- Merge key strategy:
- Relationship quality tier (`strong|probable`):
- Provenance fields (`method`, `confidence`, `source_ref`, `run_id`):
## 5. Index and Constraint Contract
- Required uniqueness constraints:
- Required date indexes:
- Required lookup indexes:
- Required fulltext indexes (if text-heavy):
## 6. Quality Gates (Hard Stop/Go)
- Identity integrity preserved (`Person.cpf` masked = 0, 14-digit = 0):
- Freshness SLA threshold:
- Temporal sanity (`<= now + 365d`):
- Null/duplicate key thresholds:
- Mandatory non-zero nodes/rels:
## 7. Operational Flow
- Shadow load command:
- Gate runner commands:
- API smoke checks:
- Promote command:
- Rollback command:
## 8. Acceptance
- Evidence bundle path in `audit-results/`:
- Final status: `resolved | resolved_full | blocked_external | quality_fail`
- Reviewer sign-off:

View File

@@ -1,109 +1,109 @@
source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status,last_verified_utc,verification_status
cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error
tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,ok
tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error
rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error
renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=bens,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=filiacao,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error
eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error
world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/datajud/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.gov.br/icmbio/pt-br,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://tcers.tc.br/fiscalizado/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status
cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy
tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy
transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy
sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy
pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy
bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy
pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy
ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy
comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill,,monitored,,,,stale
tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy
transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy
rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy
inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy
dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy
datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy
icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy
opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy
cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy
cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy
camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy
camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial
senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy
ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy
cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy
cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy
leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy
ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy
holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/holding/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy
viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy
siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial
pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale
renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy
siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial
tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/dataset/bens-candidato,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy
tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/dataset/filiados-partidos,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy
bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/penalidades,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy
stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy
caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale
eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy
un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy
world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy
senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial
mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy
querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial
datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external
bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built
estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/estban,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built
if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/dataset/if-data,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built
bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built
stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built
cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/cnciai/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built
carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built
anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built
aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built
anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built
antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built
ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built
anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built
anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built
antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built
ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built
anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built
susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built
cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built
receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built
mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts,,discovered_uningested,,,,not_built
sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built
icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built
tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built
siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built
camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built
senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built
interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built
tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.rs.gov.br/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built
tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
1 source_id name category tier status implementation_state load_state frequency in_universe_v1 primary_url pipeline_id owner_agent access_mode notes public_access_mode discovery_status last_seen_url cadence_expected cadence_observed quality_status last_verified_utc verification_status
2 cnpj Receita Federal CNPJ identity P0 loaded implemented loaded monthly true https://dadosabertos.rfb.gov.br/CNPJ/ https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/ cnpj Agent A file http://dadosabertos.rfb.gov.br monitored healthy 2026-03-01T23:11:31.444615+00:00 transient_error
3 tse TSE elections and donations electoral P0 loaded implemented loaded biennial true https://dadosabertos.tse.jus.br/ tse Agent E file Core electoral data loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
4 transparencia Portal da Transparencia contracts contracts P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados transparencia Agent C file Federal contracts and servants monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
5 sanctions CEIS CNEP sanctions sanctions P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/sancoes/consulta sanctions Agent C file Administrative sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
6 pep_cgu CGU PEP list integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/pep pep_cgu Agent A file PEP baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
7 bndes BNDES financings finance P1 loaded implemented loaded monthly true https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados bndes Agent G file Loan relationships monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
8 pgfn PGFN divida ativa fiscal P0 loaded implemented loaded monthly true https://www.regularize.pgfn.gov.br/dados-abertos pgfn Agent C file Debt risk core monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
9 ibama IBAMA embargos environment P1 loaded implemented loaded monthly true https://servicos.ibama.gov.br/ctf/publico/areasembargadas/ ibama Agent F file Environmental enforcement monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
10 comprasnet ComprasNet contracts contracts P0 stale implemented partial monthly true https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos comprasnet Agent C file Needs freshness backfill monitored stale 2026-03-01T23:11:31.444615+00:00 ok
11 tcu TCU sanctions audit P1 loaded implemented loaded monthly true https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS tcu Agent C file Inidoneidade sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
12 transferegov TransfereGov emendas e convenios transfers P0 loaded implemented loaded monthly true https://www.transferegov.sistema.gov.br/portal/download-de-dados transferegov Agent C file Transfer relationships monitored healthy 2026-03-01T23:11:31.444615+00:00 transient_error
13 rais RAIS aggregated labor labor P1 loaded implemented loaded annual true https://basedosdados.org/dataset/br-me-rais rais Agent H bigquery Aggregate mode only monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
14 inep INEP school census education P2 loaded implemented loaded annual true https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar inep Agent H file Education coverage monitored healthy 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
15 dou Diario Oficial da Uniao gazette P0 loaded implemented loaded daily true https://www.in.gov.br/leiturajornal dou Agent E bigquery National acts ingestion monitored healthy 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
16 datasus DATASUS CNES health P1 loaded implemented loaded monthly true https://opendatasus.saude.gov.br/ datasus Agent H file Health establishments monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
17 icij ICIJ offshore leaks offshore P1 loaded implemented loaded yearly true https://offshoreleaks.icij.org/pages/database icij Agent G file Offshore entities and officers monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
18 opensanctions OpenSanctions global PEP sanctions P1 loaded implemented loaded monthly true https://www.opensanctions.org/datasets/peps/ opensanctions Agent G file Global PEP matching monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
19 cvm CVM proceedings market P1 loaded implemented loaded monthly true https://dados.cvm.gov.br/ cvm Agent G file Proceedings loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
20 cvm_funds CVM fund registry market P1 loaded implemented loaded monthly true https://dados.cvm.gov.br/dados/FI/ cvm_funds Agent G file Fund baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
21 camara Camara CEAP expenses legislative P1 loaded implemented loaded monthly true https://dadosabertos.camara.leg.br/ camara Agent E api Expense reimbursement monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
22 camara_inquiries Camara inquiries and requirements legislative P0 partial implemented partial daily true https://dadosabertos.camara.leg.br/ camara_inquiries Agent E api Sessions still low monitored partial 2026-03-01T23:11:31.444615+00:00 ok
23 senado Senado CEAPS expenses legislative P1 loaded implemented loaded monthly true https://www12.senado.leg.br/dados-abertos senado Agent E api Expense data loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
24 ceaf CEAF expelled servants integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/ceaf ceaf Agent A file Expulsion evidence monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
25 cepim CEPIM barred NGOs integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/cepim cepim Agent A file NGO restrictions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
26 cpgf CPGF gov card expenses spending P2 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/cpgf cpgf Agent H file Masked CPF source monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
27 leniency Acordos de leniencia integrity P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia leniency Agent A file High signal low volume monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
28 ofac OFAC sanctions sanctions P1 loaded implemented loaded monthly true https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files ofac Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
29 holdings Brasil IO holdings ownership P1 loaded implemented loaded monthly true https://brasil.io/dataset/socios-brasil/ https://brasil.io/dataset/socios-brasil/holding/ holdings Agent G file Ownership enrichment monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
30 viagens Viagens a servico spending P2 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/viagens viagens Agent H file Travel spend baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
31 siop SIOP emendas budget P0 partial implemented partial annual true https://www.siop.planejamento.gov.br/ siop Agent C api Author linkage limited monitored partial 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
32 pncp PNCP bids and contracts contracts P0 stale implemented partial monthly true https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao https://pncp.gov.br/api/consulta/v1/ pncp Agent C api Freshness SLA pending monitored stale 2026-03-01T23:11:31.444615+00:00 transient_error
33 renuncias Renuncias fiscais fiscal P1 loaded implemented loaded annual true https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos renuncias Agent G file Tax waiver baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
34 siconfi SICONFI municipal finance fiscal P1 partial implemented partial annual true https://apidatalake.tesouro.gov.br/docs/siconfi/ siconfi Agent C api No CNPJ direct links monitored partial 2026-03-01T23:11:31.444615+00:00 ok
35 tse_bens TSE candidate assets electoral P1 loaded implemented loaded biennial true https://dadosabertos.tse.jus.br/api/3/action/package_search?q=bens https://dadosabertos.tse.jus.br/dataset/bens-candidato tse_bens Agent E file Patrimony baseline monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
36 tse_filiados TSE party memberships electoral P1 loaded implemented loaded monthly true https://dadosabertos.tse.jus.br/api/3/action/package_search?q=filiacao https://dadosabertos.tse.jus.br/dataset/filiados-partidos tse_filiados Agent E file Party network monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
37 bcb BCB penalties finance P1 loaded implemented loaded monthly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/penalidades bcb Agent G file Bank penalties loaded monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
38 stf STF court data judiciary P1 loaded implemented loaded monthly true https://basedosdados.org/dataset/br-stf-corte-aberta stf Agent D bigquery Supreme court coverage monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
39 caged CAGED labor movements labor P1 stale implemented partial monthly true https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/ caged Agent H file Aggregate-only implementation monitored stale 2026-03-01T23:11:31.444615+00:00 transient_error
40 eu_sanctions EU sanctions sanctions P1 loaded implemented loaded monthly true https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions eu_sanctions Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
41 un_sanctions UN sanctions sanctions P1 loaded implemented loaded monthly true https://scsanctions.un.org/resources/xml/en/consolidated.xml un_sanctions Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 transient_error
42 world_bank World Bank debarment sanctions P1 loaded implemented loaded monthly true https://www.worldbank.org/en/projects-operations/procurement/debarred-firms world_bank Agent G file International sanctions monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
43 senado_cpis Senado CPIs legislative P0 partial implemented partial yearly true https://www12.senado.leg.br/dados-abertos senado_cpis Agent E api Needs richer sessions and requirements monitored partial 2026-03-01T23:11:31.444615+00:00 ok
44 mides MiDES municipal procurement municipal P0 loaded implemented loaded daily true https://basedosdados.org/dataset/world-wb-mides mides Agent H bigquery Operational after access fix monitored healthy 2026-03-01T23:11:31.444615+00:00 ok
45 querido_diario Querido Diario gazettes municipal P1 partial implemented partial daily true https://queridodiario.ok.org.br/api querido_diario Agent H api Text availability gap monitored partial 2026-03-01T23:11:31.444615+00:00 ok
46 datajud CNJ DataJud judiciary P0 blocked_external implemented not_loaded monthly true https://api-publica.datajud.cnj.jus.br/ datajud Agent D api Credentials not fully operational in prod monitored blocked_external 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
47 bolsa_familia_bpc Bolsa Familia and BPC social P3 not_built not_implemented not_loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos Agent H file High volume masked identities discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
48 estban BCB ESTBAN balances finance P3 not_built not_implemented not_loaded monthly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/estban Agent G file Banking aggregates discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
49 if_data BCB IF data indicators finance P3 not_built not_implemented not_loaded quarterly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/if-data Agent G file Institution KPIs discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
50 bcb_liquidacao BCB bank liquidation acts finance P2 not_built not_implemented not_loaded monthly true https://dadosabertos.bcb.gov.br/ https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao Agent G file Regulatory actions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
51 stj_dados_abertos STJ open data judiciary P1 not_built not_implemented not_loaded monthly true https://dadosabertos.stj.jus.br/ Agent D api Superior court decisions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
52 cnciai_improbidade CNCIAI improbidade judiciary P1 not_built not_implemented not_loaded monthly true https://www.cnj.jus.br/sistemas/datajud/ https://www.cnj.jus.br/sistemas/cnciai/ Agent D api Misconduct convictions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
53 carf_tax_appeals CARF tax appeals judiciary P2 not_built not_implemented not_loaded monthly true https://carf.economia.gov.br/dados-abertos Agent D file Tax litigation discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
54 anp_royalties ANP royalties and fuel regulatory P2 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anp Agent F api Oil and gas royalties discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
55 aneel_concessions ANEEL concessions regulatory P2 not_built not_implemented not_loaded monthly true https://dadosabertos.aneel.gov.br/ Agent F api Energy concessions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
56 anm_mining_rights ANM mining rights regulatory P1 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anm Agent F api Mining rights and permits discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
57 antt_transport_concessions ANTT concessions regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/antt Agent F api Transport concessions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
58 ans_health_plans ANS operators regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/ans Agent H api Health insurance operators discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
59 anvisa_registrations ANVISA products regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anvisa Agent H api Regulatory registrations discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
60 anac_aviation_concessions ANAC concessions regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anac Agent F api Aviation contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
61 antaq_port_contracts ANTAQ contracts regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/antaq Agent F api Port concessions discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
62 ana_water_grants ANA water grants regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/ana Agent F api Water use rights discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
63 anatel_telecom_licenses ANATEL licenses regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anatel Agent G api Telecom operators discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
64 susep_insurance_market SUSEP insurance market regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/susep Agent G file Insurance entities discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
65 cvm_full_ownership_chain CVM ownership chains market P1 not_built not_implemented not_loaded monthly true https://dados.cvm.gov.br/ Agent G file Shareholder graph expansion discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
66 receita_dirbi Receita DIRBI tax P1 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi Agent G file Tax benefit declarations discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
67 mapbiomas_alertas MapBiomas Alerta environment P1 not_built not_implemented not_loaded monthly true https://alerta.mapbiomas.org/api Agent F api Deforestation alerts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
68 sicar_rural_registry SiCAR rural registry environment P1 not_built not_implemented not_loaded quarterly true https://www.car.gov.br/publico/municipios/downloads Agent F file Property boundaries and owners discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
69 icmbio_cnuc ICMBio CNUC units environment P2 not_built not_implemented not_loaded monthly true https://www.gov.br/icmbio/pt-br https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao Agent F file Protected areas discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
70 tesouro_emendas Tesouro emendas budget P0 not_built not_implemented not_loaded monthly true https://www.tesourotransparente.gov.br/ Agent C file Budget execution discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
71 siga_brasil SIGA Brasil budget P0 not_built not_implemented not_loaded monthly true https://www12.senado.leg.br/orcamento/sigabrasil Agent C file Federal budget traces discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
72 camara_votes_bills Camara votes and bills legislative P1 not_built not_implemented not_loaded daily true https://dadosabertos.camara.leg.br/api/v2 Agent E api Legislative behavior discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
73 senado_votes_bills Senado votes and bills legislative P1 not_built not_implemented not_loaded daily true https://legis.senado.leg.br/dadosabertos Agent E api Legislative behavior discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
74 interpol_red_notices Interpol red notices international P2 not_built not_implemented not_loaded weekly true https://www.interpol.int/How-we-work/Notices/Red-Notices Agent G api Requires key discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
75 tce_sp TCE Sao Paulo state P2 not_built not_implemented not_loaded monthly true https://transparencia.tce.sp.gov.br/ Agent H api State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
76 tce_pe TCE Pernambuco state P2 not_built not_implemented not_loaded monthly true https://sistemas.tce.pe.gov.br/ Agent H api State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 auth_or_rate_limited
77 tce_rj TCE Rio de Janeiro state P2 not_built not_implemented not_loaded monthly true https://dados.tce.rj.gov.br/ Agent H api State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
78 tce_rs TCE Rio Grande do Sul state P2 not_built not_implemented not_loaded monthly true https://tcers.tc.br/fiscalizado/ https://portal.tce.rs.gov.br/ Agent H file State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
79 tce_mg TCE Minas Gerais state P2 not_built not_implemented not_loaded monthly true https://www.tce.mg.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
80 tce_ba TCE Bahia state P3 not_built not_implemented not_loaded monthly true https://www.tce.ba.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
81 tce_ce TCE Ceara state P3 not_built not_implemented not_loaded monthly true https://www.tce.ce.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
82 tce_go TCE Goias state P3 not_built not_implemented not_loaded monthly true https://portal.tce.go.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
83 tce_pr TCE Parana state P3 not_built not_implemented not_loaded monthly true https://www1.tce.pr.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
84 tce_sc TCE Santa Catarina state P3 not_built not_implemented not_loaded monthly true https://www.tcesc.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
85 tce_es TCE Espirito Santo state P3 not_built not_implemented not_loaded monthly true https://www.tcees.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
86 tce_mt TCE Mato Grosso state P3 not_built not_implemented not_loaded monthly true https://www.tce.mt.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
87 tce_ms TCE Mato Grosso do Sul state P3 not_built not_implemented not_loaded monthly true https://www.tce.ms.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
88 tce_am TCE Amazonas state P3 not_built not_implemented not_loaded monthly true https://www.tce.am.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
89 tce_pa TCE Para state P3 not_built not_implemented not_loaded monthly true https://www.tcepa.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
90 tce_ro TCE Rondonia state P3 not_built not_implemented not_loaded monthly true https://www.tce.ro.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
91 tce_rr TCE Roraima state P3 not_built not_implemented not_loaded monthly true https://www.tcerr.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
92 tce_ap TCE Amapa state P3 not_built not_implemented not_loaded monthly true https://www.tce.ap.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
93 tce_to TCE Tocantins state P3 not_built not_implemented not_loaded monthly true https://www.tceto.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
94 tce_ma TCE Maranhao state P3 not_built not_implemented not_loaded monthly true https://www.tcema.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
95 tce_pi TCE Piaui state P3 not_built not_implemented not_loaded monthly true https://www.tce.pi.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
96 tce_rn TCE Rio Grande do Norte state P3 not_built not_implemented not_loaded monthly true https://www.tce.rn.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
97 tce_pb TCE Paraiba state P3 not_built not_implemented not_loaded monthly true https://tce.pb.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
98 tce_al TCE Alagoas state P3 not_built not_implemented not_loaded monthly true https://www.tceal.tc.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
99 tce_se TCE Sergipe state P3 not_built not_implemented not_loaded monthly true https://www.tce.se.gov.br/ Agent H web State audit procurement discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
100 state_portal_sp Sao Paulo transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.sp.gov.br/ Agent H api State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
101 state_portal_mg Minas Gerais transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.mg.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
102 state_portal_ba Bahia transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.ba.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
103 state_portal_ce Ceara transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.ce.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error
104 state_portal_go Goias transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.go.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
105 state_portal_pr Parana transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.pr.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
106 state_portal_sc Santa Catarina transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.sc.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
107 state_portal_rs Rio Grande do Sul transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.rs.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
108 state_portal_pe Pernambuco transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.pe.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 ok
109 state_portal_rj Rio de Janeiro transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.rj.gov.br/ Agent H web State expenses and contracts discovered_uningested not_built 2026-03-01T23:11:31.444615+00:00 transient_error

View File

@@ -1,7 +1,7 @@
[project]
name = "bracc-etl"
version = "0.1.0"
description = "BRACC ETL — Data ingestion pipelines for Brazilian public data"
description = "BR-ACC ETL — Data ingestion pipelines for Brazilian public data"
requires-python = ">=3.12"
license = "AGPL-3.0-or-later"
dependencies = [
@@ -9,10 +9,11 @@ dependencies = [
"pandas>=2.2.0",
"httpx>=0.28.0",
"click>=8.1.0",
"defusedxml>=0.7.1",
"pydantic>=2.10.0",
"pydantic-settings>=2.7.0",
"pypdf>=5.2.0",
"defusedxml>=0.7.0",
"pandera>=0.21.0",
]
[project.optional-dependencies]

View File

@@ -3,8 +3,6 @@
from __future__ import annotations
import logging
import shutil
import stat
import zipfile
from pathlib import Path
@@ -38,12 +36,21 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool:
response.raise_for_status()
# If we requested a range but server returned full content (200 vs 206),
# start fresh to avoid corruption
if start_byte > 0 and response.status_code != 206:
logger.warning(
"Server ignored Range header for %s, restarting download",
dest.name,
)
start_byte = 0
total = response.headers.get("content-length")
total_mb = f"{int(total) / 1e6:.1f} MB" if total else "unknown size"
logger.info("Downloading %s (%s)...", dest.name, total_mb)
mode = "ab" if start_byte > 0 else "wb"
downloaded = start_byte
mode = "ab" if start_byte > 0 and response.status_code == 206 else "wb"
downloaded = start_byte if mode == "ab" else 0
with open(partial, mode) as f:
for chunk in response.iter_bytes(chunk_size=65_536):
f.write(chunk)
@@ -58,24 +65,49 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool:
return False
def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]:
"""Extract ZIP and return list of extracted files.
def safe_extract_zip(
zip_path: Path,
output_dir: Path,
*,
max_total_bytes: int = 50 * 1024**3, # 50GB default (CNPJ zips are huge)
) -> list[Path]:
"""Safely extract ZIP with path traversal and bomb guards.
Deletes corrupted ZIPs for re-download.
"""
try:
with zipfile.ZipFile(zip_path, "r") as zf:
extracted = safe_extract_zip(zf, output_dir)
logger.info("Extracted %d files from %s", len(extracted), zip_path.name)
return extracted
# Check for path traversal
resolved_output = output_dir.resolve()
for info in zf.infolist():
target = (output_dir / info.filename).resolve()
if not target.is_relative_to(resolved_output):
raise ValueError(
f"Path traversal detected in {zip_path.name}: {info.filename}"
)
# Check total uncompressed size (zip bomb guard)
total_size = sum(info.file_size for info in zf.infolist())
if total_size > max_total_bytes:
raise ValueError(
f"ZIP bomb guard: {zip_path.name} would extract to "
f"{total_size / 1e9:.1f}GB (limit: {max_total_bytes / 1e9:.1f}GB)"
)
names = zf.namelist()
zf.extractall(output_dir)
logger.info("Extracted %d files from %s", len(names), zip_path.name)
return [output_dir / n for n in names]
except zipfile.BadZipFile:
logger.warning("Bad ZIP file: %s — deleting for re-download", zip_path.name)
zip_path.unlink()
return []
except ValueError as exc:
logger.warning("Unsafe ZIP file %s: %s — deleting", zip_path.name, exc)
zip_path.unlink(missing_ok=True)
return []
def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]:
"""Extract ZIP and return list of extracted files."""
return safe_extract_zip(zip_path, output_dir)
def validate_csv(
@@ -111,60 +143,3 @@ def validate_csv(
except Exception as e:
logger.warning("Validation failed for %s: %s", path.name, e)
return False
def safe_extract_zip(
    archive: zipfile.ZipFile,
    output_dir: Path,
    *,
    max_members: int = 50_000,
    max_uncompressed_bytes: int = 5_000_000_000,
) -> list[Path]:
    """Extract an already-open ZIP archive into *output_dir* with safety checks.

    Args:
        archive: Open ``zipfile.ZipFile`` to read members from.
        output_dir: Directory that every extracted path must stay inside.
        max_members: Upper bound on the number of entries in the archive.
        max_uncompressed_bytes: Upper bound on the running sum of the
            declared uncompressed sizes of the file entries.

    Returns:
        The resolved paths of the regular files written, in archive order.
        Directory entries are created but not listed.

    Raises:
        ValueError: if the archive has too many entries, contains a symlink
            entry, has a member resolving outside *output_dir*, or exceeds
            the size budget. Members processed before the offending one
            have already been written at that point.
    """
    root = output_dir.resolve()
    members = archive.infolist()
    if len(members) > max_members:
        raise ValueError(f"ZIP has too many entries ({len(members)} > {max_members})")

    written: list[Path] = []
    bytes_so_far = 0

    for entry in members:
        # Normalise Windows separators before building the target path.
        name = entry.filename.replace("\\", "/")
        if not name:
            continue

        # The upper 16 bits of external_attr carry the Unix file mode.
        if stat.S_ISLNK(entry.external_attr >> 16):
            raise ValueError(f"ZIP contains symlink entry: {name}")

        destination = (output_dir / name).resolve()
        try:
            destination.relative_to(root)
        except ValueError as exc:
            raise ValueError(f"Path traversal detected: {name}") from exc

        if entry.is_dir():
            destination.mkdir(parents=True, exist_ok=True)
            continue

        bytes_so_far += entry.file_size
        if bytes_so_far > max_uncompressed_bytes:
            raise ValueError(
                f"ZIP exceeds max extracted size "
                f"({bytes_so_far} > {max_uncompressed_bytes})"
            )

        destination.parent.mkdir(parents=True, exist_ok=True)
        with archive.open(entry, "r") as src, destination.open("wb") as dst:
            shutil.copyfileobj(src, dst)
        written.append(destination)

    return written

View File

@@ -5,9 +5,9 @@ Streams microdados_movimentacao year-by-year to separate CSVs for
resumability and memory management on large datasets.
Usage:
python etl/scripts/download_caged.py --billing-project bracc-corruptos
python etl/scripts/download_caged.py --billing-project bracc-corruptos --start-year 2024
python etl/scripts/download_caged.py --billing-project bracc-corruptos --skip-existing
python etl/scripts/download_caged.py --billing-project icarus-corruptos
python etl/scripts/download_caged.py --billing-project icarus-corruptos --start-year 2024
python etl/scripts/download_caged.py --billing-project icarus-corruptos --skip-existing
"""
from __future__ import annotations

View File

@@ -413,7 +413,7 @@ def _write_manifest(
)
@click.option(
"--billing-project",
default="bracc-corruptos",
default="icarus-corruptos",
help="GCP billing project for BQ mode.",
)
@click.option(

View File

@@ -6,15 +6,21 @@ Usage:
python etl/scripts/download_cnpj.py --reference-only # reference tables only (tiny)
python etl/scripts/download_cnpj.py --files 1 # just first file of each type
python etl/scripts/download_cnpj.py --types Empresas # specific type only
python etl/scripts/download_cnpj.py --release 2026-03 # pin to specific monthly release
"""
from __future__ import annotations
import hashlib
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import click
import httpx
sys.path.insert(0, str(Path(__file__).parent))
from _download_utils import download_file, extract_zip, validate_csv
@@ -22,7 +28,13 @@ from _download_utils import download_file, extract_zip, validate_csv
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/"
# Receita Federal Nextcloud (primary since Jan 2026)
NEXTCLOUD_BASE = "https://arquivos.receitafederal.gov.br/s/{token}/download?path=%2F&files="
KNOWN_TOKENS = ["gn672Ad4CF8N6TK", "YggdBLfdninEJX9"]
# Legacy URLs (dadosabertos.rfb.gov.br decommissioned Jan 2026)
LEGACY_NEW_BASE_PATTERN = "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/{year_month}/"
LEGACY_BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/"
MAIN_TYPES = ["Empresas", "Socios", "Estabelecimentos"]
REFERENCE_FILES = [
@@ -48,6 +60,126 @@ EXPECTED_COLS = {
}
def _previous_month(year: int, month: int) -> tuple[int, int]:
"""Return (year, month) for the previous month."""
if month == 1:
return year - 1, 12
return year, month - 1
def _check_url_accessible(url: str, timeout: int = 30) -> bool:
    """Probe *url* with an HTTP HEAD request and report whether it answers.

    Redirects are followed. Any final status code below 400 counts as
    accessible. An ``httpx.HTTPError`` raised while making the request is
    reported as not accessible rather than propagated.
    """
    try:
        status = httpx.head(url, follow_redirects=True, timeout=timeout).status_code
    except httpx.HTTPError:
        return False
    return status < 400
def _check_nextcloud_token(token: str, timeout: int = 30) -> bool:
    """Report whether the Nextcloud share page for *token* answers a HEAD request.

    Redirects are followed and any final status code below 400 counts as
    valid. An ``httpx.HTTPError`` raised by the request yields ``False``.
    """
    share_url = f"https://arquivos.receitafederal.gov.br/s/{token}"
    try:
        status = httpx.head(
            share_url,
            follow_redirects=True,
            timeout=timeout,
        ).status_code
    except httpx.HTTPError:
        return False
    return status < 400
def resolve_rf_release(year_month: str | None = None) -> str:
    """Find a reachable base URL for the Receita Federal CNPJ files.

    Probing order:

    1. Nextcloud share tokens: the ``CNPJ_SHARE_TOKEN`` environment
       variable (when set and non-empty) first, then each entry of
       ``KNOWN_TOKENS`` that is not already queued.
    2. Legacy monthly folders built from ``LEGACY_NEW_BASE_PATTERN``: only
       ``year_month`` when it is given, otherwise the current and the
       previous UTC month.
    3. The legacy flat URL ``LEGACY_BASE_URL``.

    The first location that answers its probe wins. Note that
    ``year_month`` is consulted only in step 2, so a working Nextcloud
    token takes precedence over a pinned release.

    Returns:
        The resolved base URL; file names are appended directly to it.

    Raises:
        RuntimeError: if none of the candidate locations is reachable.
    """
    today = datetime.now(timezone.utc)

    # --- Nextcloud (primary) ---
    env_token = os.environ.get("CNPJ_SHARE_TOKEN")
    queued = ([env_token] if env_token else []) + list(KNOWN_TOKENS)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    for share_token in dict.fromkeys(queued):
        logger.info("Probing Nextcloud token: %s...", share_token[:6])
        if not _check_nextcloud_token(share_token):
            continue
        resolved = NEXTCLOUD_BASE.format(token=share_token)
        logger.info("Resolved CNPJ via Nextcloud (token %s...)", share_token[:6])
        return resolved

    # --- Legacy dadosabertos.rfb.gov.br ---
    if year_month is None:
        prev_year, prev_month = _previous_month(today.year, today.month)
        candidates = [
            f"{today.year:04d}-{today.month:02d}",
            f"{prev_year:04d}-{prev_month:02d}",
        ]
    else:
        candidates = [year_month]

    for month_tag in candidates:
        monthly_url = LEGACY_NEW_BASE_PATTERN.format(year_month=month_tag)
        logger.info("Probing legacy release URL: %s", monthly_url)
        if _check_url_accessible(monthly_url):
            logger.info("Resolved CNPJ release (legacy new path): %s", monthly_url)
            return monthly_url

    logger.info("Trying legacy flat URL: %s", LEGACY_BASE_URL)
    if _check_url_accessible(LEGACY_BASE_URL):
        logger.info("Resolved CNPJ release (legacy flat): %s", LEGACY_BASE_URL)
        return LEGACY_BASE_URL

    # Fail closed: nothing answered, so refuse to guess a URL.
    tried = ", ".join(candidates)
    raise RuntimeError(
        f"Could not resolve CNPJ release. Tried Nextcloud tokens, "
        f"legacy months [{tried}], and legacy flat path. "
        "Receita Federal portal may be down or the URL structure has changed."
    )
def _write_manifest(
    output_dir: Path,
    base_url: str,
    resolved_release: str,
    file_results: list[dict],
    started_at: str,
) -> Path:
    """Write ``download_manifest.json`` into *output_dir* and return its path.

    The manifest records the source id, the resolved release and base URL,
    the per-file results, the start/finish timestamps and a checksum.

    The checksum is a SHA-256 over the ``name:size_bytes:status`` triple of
    every entry in *file_results* (whatever its status), visited in name
    order. It fingerprints the run's bookkeeping, not the file contents.
    """
    finished_at = datetime.now(timezone.utc).isoformat()

    ordered = sorted(file_results, key=lambda item: item["name"])
    fingerprint = "".join(
        f"{item['name']}:{item['size_bytes']}:{item['status']}" for item in ordered
    )
    checksum = f"sha256:{hashlib.sha256(fingerprint.encode()).hexdigest()}"

    payload = json.dumps(
        {
            "source": "receita_federal_cnpj",
            "resolved_release": resolved_release,
            "base_url": base_url,
            "files": file_results,
            "started_at": started_at,
            "finished_at": finished_at,
            "checksum": checksum,
        },
        indent=2,
        ensure_ascii=False,
    )

    manifest_path = output_dir / "download_manifest.json"
    manifest_path.write_text(payload, encoding="utf-8")
    logger.info("Manifest written: %s", manifest_path)
    return manifest_path
@click.command()
@click.option("--output-dir", default="./data/cnpj", help="Base output directory")
@click.option("--files", type=int, default=10, help="Number of files per type (0-9)")
@@ -56,6 +188,7 @@ EXPECTED_COLS = {
@click.option("--skip-existing/--no-skip-existing", default=True, help="Skip already downloaded files")
@click.option("--skip-extract", is_flag=True, help="Skip extraction after download")
@click.option("--timeout", type=int, default=600, help="Download timeout in seconds")
@click.option("--release", default=None, help="Pin to specific monthly release (YYYY-MM format)")
def main(
output_dir: str,
files: int,
@@ -64,8 +197,20 @@ def main(
skip_existing: bool,
skip_extract: bool,
timeout: int,
release: str | None,
) -> None:
"""Download and extract CNPJ data from Receita Federal."""
started_at = datetime.now(timezone.utc).isoformat()
base_url = resolve_rf_release(release)
# Extract the release identifier from the resolved URL
resolved_release = release or "legacy"
if "arquivos.receitafederal.gov.br" in base_url:
resolved_release = "nextcloud"
elif "/dados_abertos_cnpj/" in base_url:
# Extract YYYY-MM from URL
resolved_release = base_url.rstrip("/").rsplit("/", 1)[-1]
base = Path(output_dir)
raw_dir = base / "raw"
extract_dir = base / "extracted"
@@ -73,14 +218,26 @@ def main(
for d in [raw_dir, extract_dir, ref_dir]:
d.mkdir(parents=True, exist_ok=True)
file_results: list[dict] = []
# --- Reference tables (always download, they're tiny) ---
logger.info("=== Reference tables ===")
for filename in REFERENCE_FILES:
dest = raw_dir / filename
if skip_existing and dest.exists():
logger.info("Skipping (exists): %s", filename)
file_results.append({
"name": filename,
"status": "skipped",
"size_bytes": dest.stat().st_size,
})
else:
download_file(f"{BASE_URL}{filename}", dest, timeout=timeout)
success = download_file(f"{base_url}{filename}", dest, timeout=timeout)
file_results.append({
"name": filename,
"status": "ok" if success else "failed",
"size_bytes": dest.stat().st_size if dest.exists() else 0,
})
if not skip_extract and dest.exists():
extracted = extract_zip(dest, ref_dir)
@@ -90,7 +247,8 @@ def main(
validate_csv(f, expected_cols=expected)
if reference_only:
logger.info("Reference-only mode done.")
logger.info("Reference-only mode -- done.")
_write_manifest(base, base_url, resolved_release, file_results, started_at)
return
# --- Main data files ---
@@ -102,10 +260,25 @@ def main(
dest = raw_dir / filename
if skip_existing and dest.exists():
logger.info("Skipping (exists): %s", filename)
file_results.append({
"name": filename,
"status": "skipped",
"size_bytes": dest.stat().st_size,
})
else:
success = download_file(f"{BASE_URL}{filename}", dest, timeout=timeout)
success = download_file(f"{base_url}{filename}", dest, timeout=timeout)
if not success:
file_results.append({
"name": filename,
"status": "failed",
"size_bytes": 0,
})
continue
file_results.append({
"name": filename,
"status": "ok",
"size_bytes": dest.stat().st_size if dest.exists() else 0,
})
if not skip_extract and dest.exists():
extracted = extract_zip(dest, extract_dir)
@@ -120,6 +293,7 @@ def main(
logger.info("=== Download complete ===")
_print_summary(raw_dir, extract_dir, ref_dir)
_write_manifest(base, base_url, resolved_release, file_results, started_at)
def _print_summary(raw_dir: Path, extract_dir: Path, ref_dir: Path) -> None:

View File

@@ -10,8 +10,8 @@ And a manifest:
- download_manifest.json
Usage:
python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos
python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos --tables socios
python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos
python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos --tables socios
"""
from __future__ import annotations
@@ -105,6 +105,44 @@ TABLES: dict[str, list[str]] = {
PAGE_SIZE = 100_000
def _run_bigquery_precheck(
    *,
    billing_project: str,
    source_project: str,
    source_dataset: str,
    snapshot_start: str | None,
) -> None:
    """Run two cheap BigQuery queries before any large table download.

    First a bare ``SELECT 1`` is issued through a client bound to
    *billing_project*. Then the ``socios`` table of
    ``source_project.source_dataset`` is counted, restricted to
    ``data >= snapshot_start`` when *snapshot_start* is truthy. Nothing is
    caught here, so any failure raised by the client propagates to the
    caller.
    """
    # Imported lazily so the module loads without google-cloud-bigquery.
    from google.cloud import bigquery

    client = bigquery.Client(project=billing_project)

    logger.info("Running BigQuery precheck: SELECT 1")
    list(client.query("SELECT 1 AS ok").result())

    socios_table = f"{source_project}.{source_dataset}.socios"
    if not snapshot_start:
        precheck_sql = f"SELECT COUNT(1) AS n FROM `{socios_table}`"
        query_params = []
    else:
        # The date is bound as a query parameter, never spliced into the SQL.
        precheck_sql = (
            f"SELECT COUNT(1) AS n FROM `{socios_table}` "
            "WHERE data >= @snapshot_start"
        )
        query_params = [
            bigquery.ScalarQueryParameter("snapshot_start", "DATE", snapshot_start),
        ]

    logger.info("Running BigQuery precheck: %s", precheck_sql)
    job_config = bigquery.QueryJobConfig(query_parameters=query_params)
    rows = list(client.query(precheck_sql, job_config=job_config).result())

    if rows:
        check_value = rows[0].n
    else:
        check_value = 0
    logger.info("BigQuery precheck OK: socios_count=%s", check_value)
def _sha256_file(path: Path) -> str:
digest = hashlib.sha256()
with path.open("rb") as f:
@@ -292,6 +330,19 @@ def main(
)
source_project, source_dataset = dataset.split(".", 1)
try:
_run_bigquery_precheck(
billing_project=billing_project,
source_project=source_project,
source_dataset=source_dataset,
snapshot_start=snapshot_start,
)
except Exception as exc:
raise click.ClickException(
"BigQuery precheck failed. Configure a non-interactive service account "
"(GOOGLE_APPLICATION_CREDENTIALS) with dataset ACL and billing access.",
) from exc
selected = list(tables) if tables else list(TABLES.keys())
run_id = f"cnpj-bq-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}-{uuid.uuid4().hex[:8]}"
logger.info(

View File

@@ -22,7 +22,6 @@ from pathlib import Path
import click
import httpx
from _download_utils import safe_extract_zip
logging.basicConfig(
level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s"
@@ -90,15 +89,24 @@ def _download_zip(
xml_count = 0
try:
resolved_dir = section_dir.resolve()
with zipfile.ZipFile(BytesIO(resp.content)) as zf:
extracted = safe_extract_zip(zf, section_dir)
xml_count = sum(1 for path in extracted if path.suffix.lower() == ".xml")
for member in zf.namelist():
# Path traversal guard
target = (section_dir / member).resolve()
if not target.is_relative_to(resolved_dir):
logger.warning(
"Path traversal detected in %s: %s — skipping",
zip_name,
member,
)
continue
if member.lower().endswith(".xml"):
zf.extract(member, section_dir)
xml_count += 1
except zipfile.BadZipFile:
logger.warning("Bad ZIP file: %s", zip_name)
return 0
except ValueError as exc:
logger.warning("Unsafe ZIP file %s: %s", zip_name, exc)
return 0
if xml_count > 0:
marker.write_text(str(xml_count))

View File

@@ -71,7 +71,7 @@ def _write_manifest(out_dir: Path, tables: list[dict[str, Any]]) -> Path:
@click.command()
@click.option("--billing-project", default="bracc-corruptos", help="GCP billing project")
@click.option("--billing-project", default="icarus-corruptos", help="GCP billing project")
@click.option(
"--dataset",
default=WORLD_WB_DATASET,

View File

@@ -439,7 +439,7 @@ def main(
client = httpx.Client(
timeout=timeout,
follow_redirects=True,
headers={"User-Agent": "BRACC-ETL/1.0 (public data research)"},
headers={"User-Agent": "BR-ACC-ETL/1.0 (public data research)"},
)
total_records = 0

View File

@@ -8,10 +8,12 @@ from __future__ import annotations
import argparse
import logging
import zipfile
import sys
from pathlib import Path
import httpx
sys.path.insert(0, str(Path(__file__).parent))
from _download_utils import safe_extract_zip
logger = logging.getLogger(__name__)
@@ -34,14 +36,13 @@ def download_year(output_dir: Path, year: int) -> None:
url,
follow_redirects=True,
timeout=300,
headers={"User-Agent": "BRACC-ETL/1.0"},
headers={"User-Agent": "BR-ACC-ETL/1.0"},
)
response.raise_for_status()
dest_zip.write_bytes(response.content)
logger.info("Downloaded: %s (%d bytes)", dest_zip.name, len(response.content))
with zipfile.ZipFile(dest_zip, "r") as zf:
extracted = safe_extract_zip(zf, output_dir)
extracted = safe_extract_zip(dest_zip, output_dir)
logger.info("Extracted %d files", len(extracted))
except httpx.HTTPError:
logger.warning("Failed to download renuncias for %d", year)

View File

@@ -16,13 +16,13 @@ import hashlib
import json
import logging
import re
import defusedxml.ElementTree as ET
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
import click
import httpx
from defusedxml import ElementTree as ET
from download_senado_cpi_archive import fetch_archive_historical
logger = logging.getLogger(__name__)

View File

@@ -44,7 +44,7 @@ def get_all_entities() -> list[dict]:
url,
params={"offset": offset, "limit": limit},
timeout=60,
headers={"User-Agent": "BRACC-ETL/1.0"},
headers={"User-Agent": "BR-ACC-ETL/1.0"},
)
response.raise_for_status()
data = response.json()
@@ -125,7 +125,7 @@ def download_year(
header_written = partial.exists() and partial.stat().st_size > 0
with (
httpx.Client(headers={"User-Agent": "BRACC-ETL/1.0"}) as client,
httpx.Client(headers={"User-Agent": "BR-ACC-ETL/1.0"}) as client,
open(partial, "a", newline="", encoding="utf-8") as f,
):
writer: csv.DictWriter | None = None

View File

@@ -5,9 +5,9 @@ Streams from BigQuery table basedosdados.br_stf_corte_aberta.decisoes to local C
Requires `google-cloud-bigquery` and an authenticated GCP project.
Usage:
python etl/scripts/download_stf.py --billing-project bracc-corruptos
python etl/scripts/download_stf.py --billing-project bracc-corruptos --skip-existing
python etl/scripts/download_stf.py --billing-project bracc-corruptos --output-dir ./data/stf
python etl/scripts/download_stf.py --billing-project icarus-corruptos
python etl/scripts/download_stf.py --billing-project icarus-corruptos --skip-existing
python etl/scripts/download_stf.py --billing-project icarus-corruptos --output-dir ./data/stf
"""
from __future__ import annotations

View File

@@ -5,9 +5,9 @@ Streams from BigQuery table `basedosdados.br_tse_eleicoes.bens_candidato` to a l
Requires `google-cloud-bigquery` and an authenticated GCP project.
Usage:
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --start-year 2018
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --skip-existing
python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos
python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --start-year 2018
python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --skip-existing
"""
from __future__ import annotations

View File

@@ -7,9 +7,9 @@ Filters to REGULAR status only (active members) to reduce volume.
Requires `google-cloud-bigquery` and an authenticated GCP project.
Usage:
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --skip-existing
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --all-statuses
python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos
python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --skip-existing
python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --all-statuses
"""
from __future__ import annotations

View File

@@ -14,10 +14,10 @@ from __future__ import annotations
import json
import logging
import sys
import defusedxml.ElementTree as ET
from pathlib import Path
import click
from defusedxml import ElementTree as ET
# Allow imports from scripts/ directory
sys.path.insert(0, str(Path(__file__).parent))

View File

@@ -21,12 +21,16 @@ class Pipeline(ABC):
limit: int | None = None,
chunk_size: int = 50_000,
neo4j_database: str | None = None,
history: bool = False,
) -> None:
self.driver = driver
self.data_dir = data_dir
self.limit = limit
self.chunk_size = chunk_size
self.neo4j_database = neo4j_database or os.getenv("NEO4J_DATABASE", "neo4j")
self.history = history
self.rows_in: int = 0
self.rows_loaded: int = 0
source_key = getattr(self, "source_id", getattr(self, "name", "unknown_source"))
self.run_id = f"{source_key}_{datetime.now(tz=UTC).strftime('%Y%m%d%H%M%S')}"
@@ -87,8 +91,8 @@ class Pipeline(ABC):
" r.started_at = coalesce($started_at, r.started_at), "
" r.finished_at = coalesce($finished_at, r.finished_at), "
" r.error = coalesce($error, r.error), "
" r.rows_in = coalesce(r.rows_in, 0), "
" r.rows_loaded = coalesce(r.rows_loaded, 0)"
" r.rows_in = $rows_in, "
" r.rows_loaded = $rows_loaded"
)
run_id = getattr(self, "run_id", f"{source_id}_manual")
params = {
@@ -98,6 +102,8 @@ class Pipeline(ABC):
"started_at": started_at,
"finished_at": finished_at,
"error": error,
"rows_in": self.rows_in,
"rows_loaded": self.rows_loaded,
}
try:
with self.driver.session(database=self.neo4j_database) as session:

View File

@@ -13,7 +13,7 @@ def get_person_settings() -> dict[str, Any]:
"""
try:
import splink.comparison_library as cl # type: ignore[import-not-found]
from splink import SettingsCreator
from splink import SettingsCreator # type: ignore[import-not-found,unused-ignore]
except ImportError as exc:
raise ImportError(
"splink is required for entity resolution. "

View File

@@ -51,8 +51,9 @@ class BcbPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.penalties: list[dict[str, Any]] = []
self.company_rels: list[dict[str, Any]] = []

View File

@@ -33,8 +33,9 @@ class BndesPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.finances: list[dict[str, Any]] = []
self.relationships: list[dict[str, Any]] = []
@@ -51,8 +52,15 @@ class BndesPipeline(Pipeline):
def extract(self) -> None:
bndes_dir = Path(self.data_dir) / "bndes"
if not bndes_dir.exists():
logger.warning("[%s] Data directory not found: %s", self.name, bndes_dir)
return
csv_path = bndes_dir / "operacoes-nao-automaticas.csv"
if not csv_path.exists():
logger.warning("[%s] CSV file not found: %s", self.name, csv_path)
return
self._raw = pd.read_csv(
bndes_dir / "operacoes-nao-automaticas.csv",
csv_path,
dtype=str,
delimiter=";",
encoding="latin-1",

View File

@@ -88,8 +88,9 @@ class CagedPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._csv_files: list[Path] = []
def extract(self) -> None:

View File

@@ -60,8 +60,9 @@ class CamaraPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.expenses: list[dict[str, Any]] = []
self.deputies: list[dict[str, Any]] = []

View File

@@ -66,8 +66,9 @@ class CamaraInquiriesPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_inquiries: pd.DataFrame = pd.DataFrame()
self._raw_requirements: pd.DataFrame = pd.DataFrame()

View File

@@ -31,8 +31,9 @@ class CeafPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.expulsions: list[dict[str, Any]] = []
self.person_rels: list[dict[str, Any]] = []

View File

@@ -37,8 +37,9 @@ class CepimPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.ngos: list[dict[str, Any]] = []
self.company_rels: list[dict[str, Any]] = []

View File

@@ -216,9 +216,11 @@ class CNPJPipeline(Pipeline):
limit: int | None = None,
chunk_size: int = 50_000,
history: bool = False,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
self.history = history
super().__init__(
driver, data_dir, limit=limit, chunk_size=chunk_size, history=history, **kwargs,
)
self.run_id = f"cnpj-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}"
self._raw_empresas: pd.DataFrame = pd.DataFrame()
self._raw_socios: pd.DataFrame = pd.DataFrame()

View File

@@ -63,7 +63,7 @@ class ComprasnetPipeline(Pipeline):
"""ETL pipeline for PNCP federal procurement contracts."""
name = "comprasnet"
source_id = "pncp"
source_id = "comprasnet"
def __init__(
self,
@@ -71,8 +71,9 @@ class ComprasnetPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self.contracts: list[dict[str, Any]] = []
def extract(self) -> None:

View File

@@ -84,8 +84,9 @@ class CpgfPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.expenses: list[dict[str, Any]] = []
self.cardholders: list[dict[str, Any]] = []

View File

@@ -38,8 +38,9 @@ class CvmPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_processos: pd.DataFrame = pd.DataFrame()
self._raw_acusados: pd.DataFrame = pd.DataFrame()
self.proceedings: list[dict[str, Any]] = []

View File

@@ -43,8 +43,9 @@ class CvmFundsPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.funds: list[dict[str, Any]] = []
self.admin_rels: list[dict[str, Any]] = []

View File

@@ -50,8 +50,9 @@ class DatajudPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_cases: pd.DataFrame = pd.DataFrame()
self._raw_parties: pd.DataFrame = pd.DataFrame()

View File

@@ -29,8 +29,9 @@ class DatasusPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.facilities: list[dict[str, Any]] = []
self.company_links: list[dict[str, Any]] = []

View File

@@ -17,7 +17,10 @@ import re
from pathlib import Path
from typing import TYPE_CHECKING, Any
from defusedxml import ElementTree # type: ignore[import-untyped]
from defusedxml.ElementTree import ParseError as _XmlParseError # type: ignore[import-untyped]
from defusedxml.ElementTree import (
parse as _safe_xml_parse, # type: ignore[import-untyped,unused-ignore]
)
from bracc_etl.base import Pipeline
from bracc_etl.loader import Neo4jBatchLoader
@@ -141,8 +144,9 @@ class DouPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_acts: list[dict[str, str]] = []
self.acts: list[dict[str, Any]] = []
self.person_rels: list[dict[str, Any]] = []
@@ -227,8 +231,8 @@ class DouPipeline(Pipeline):
"""Extract acts from Imprensa Nacional XML dumps."""
for f in xml_files:
try:
tree = ElementTree.parse(f) # noqa: S314
except ElementTree.ParseError:
tree = _safe_xml_parse(f)
except _XmlParseError:
logger.warning("[dou] Failed to parse XML: %s", f.name)
continue

View File

@@ -76,8 +76,9 @@ class EuSanctionsPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.sanctions: list[dict[str, Any]] = []
self.person_rels: list[dict[str, Any]] = []

View File

@@ -36,8 +36,9 @@ class HoldingsPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.holding_rels: list[dict[str, Any]] = []

View File

@@ -40,8 +40,9 @@ class IbamaPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.embargoes: list[dict[str, Any]] = []
self.companies: list[dict[str, Any]] = []
@@ -65,7 +66,13 @@ class IbamaPipeline(Pipeline):
def extract(self) -> None:
ibama_dir = Path(self.data_dir) / "ibama"
if not ibama_dir.exists():
logger.warning("[%s] Data directory not found: %s", self.name, ibama_dir)
return
csv_path = ibama_dir / "areas_embargadas.csv"
if not csv_path.exists():
logger.warning("[%s] CSV file not found: %s", self.name, csv_path)
return
logger.info("[ibama] Reading %s", csv_path)
self._raw = pd.read_csv(
csv_path,

View File

@@ -42,8 +42,9 @@ class ICIJPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._entities_raw: pd.DataFrame = pd.DataFrame()
self._officers_raw: pd.DataFrame = pd.DataFrame()
self._intermediaries_raw: pd.DataFrame = pd.DataFrame()

View File

@@ -42,8 +42,9 @@ class InepPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self.schools: list[dict[str, Any]] = []
self.school_company_links: list[dict[str, Any]] = []

View File

@@ -31,8 +31,9 @@ class LeniencyPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.agreements: list[dict[str, Any]] = []
self.company_rels: list[dict[str, Any]] = []

View File

@@ -74,8 +74,9 @@ class MidesPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_bids: pd.DataFrame = pd.DataFrame()
self._raw_contracts: pd.DataFrame = pd.DataFrame()

View File

@@ -63,8 +63,9 @@ class OfacPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.sanctions: list[dict[str, Any]] = []

View File

@@ -81,8 +81,9 @@ class OpenSanctionsPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw_entities: list[dict[str, Any]] = []
self.global_peps: list[dict[str, Any]] = []
self.pep_match_rels: list[dict[str, Any]] = []

View File

@@ -84,8 +84,9 @@ class PepCguPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._raw: pd.DataFrame = pd.DataFrame()
self.pep_records: list[dict[str, Any]] = []
self.person_links: list[dict[str, Any]] = []

View File

@@ -38,8 +38,9 @@ class PgfnPipeline(Pipeline):
data_dir: str = "./data",
limit: int | None = None,
chunk_size: int = 50_000,
**kwargs: Any,
) -> None:
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
self._csv_files: list[Path] = []
self.finances: list[dict[str, Any]] = []
self.relationships: list[dict[str, Any]] = []
@@ -56,10 +57,13 @@ class PgfnPipeline(Pipeline):
def extract(self) -> None:
pgfn_dir = Path(self.data_dir) / "pgfn"
if not pgfn_dir.exists():
logger.warning("[%s] Data directory not found: %s", self.name, pgfn_dir)
return
self._csv_files = sorted(pgfn_dir.glob("arquivo_lai_SIDA_*_*.csv"))
if not self._csv_files:
msg = f"No PGFN CSV files found in {pgfn_dir}"
raise FileNotFoundError(msg)
logger.warning("[%s] No PGFN CSV files found in %s", self.name, pgfn_dir)
return
logger.info("[pgfn] Found %d CSV files to process", len(self._csv_files))
def transform(self) -> None:

Some files were not shown because too many files have changed in this diff Show More