mirror of
https://github.com/kharonsec/br-acc
synced 2026-04-25 17:15:02 +02:00
sync: upstream convergence 2026-03-02
Co-authored-by: bruno cesar <brunoclz@brunos-MacBook-Pro.local>
This commit is contained in:
11
.env.example
11
.env.example
@@ -18,29 +18,24 @@ API_PORT=8000
|
||||
LOG_LEVEL=info
|
||||
APP_ENV=dev
|
||||
JWT_SECRET_KEY=change-me-generate-with-openssl-rand-hex-32
|
||||
AUTH_COOKIE_NAME=bracc_session
|
||||
AUTH_COOKIE_SECURE=false
|
||||
AUTH_COOKIE_SAMESITE=lax
|
||||
TRUST_PROXY_HEADERS=false
|
||||
INVITE_CODE=
|
||||
CORS_ORIGINS=http://localhost:3000
|
||||
PRODUCT_TIER=community
|
||||
PATTERNS_ENABLED=false
|
||||
PUBLIC_MODE=true
|
||||
PUBLIC_MODE=false
|
||||
PUBLIC_ALLOW_PERSON=false
|
||||
PUBLIC_ALLOW_ENTITY_LOOKUP=false
|
||||
PUBLIC_ALLOW_INVESTIGATIONS=false
|
||||
PATTERN_SPLIT_THRESHOLD_VALUE=80000
|
||||
PATTERN_SPLIT_MIN_COUNT=3
|
||||
PATTERN_SHARE_THRESHOLD=0.60
|
||||
PATTERN_SHARE_THRESHOLD=0.6
|
||||
PATTERN_SRP_MIN_ORGS=5
|
||||
PATTERN_INEXIG_MIN_RECURRENCE=3
|
||||
PATTERN_MAX_EVIDENCE_REFS=50
|
||||
SHARE_TOKEN_TTL_HOURS=168
|
||||
|
||||
# Frontend (dev only — production uses Caddy reverse proxy with relative paths)
|
||||
VITE_API_URL=http://localhost:8000
|
||||
VITE_PUBLIC_MODE=true
|
||||
VITE_PUBLIC_MODE=false
|
||||
VITE_PATTERNS_ENABLED=false
|
||||
|
||||
# Optional: Google Cloud (for Base dos Dados / TSE BigQuery)
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/config.yml
vendored
2
.github/ISSUE_TEMPLATE/config.yml
vendored
@@ -1,5 +1,5 @@
|
||||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: Security vulnerability report
|
||||
url: https://github.com/World-Open-Graph/br-acc/security/advisories/new
|
||||
url: https://github.com/brunoclz/world-transparency-graph/security/advisories/new
|
||||
about: Use GitHub Security Advisories for private vulnerability disclosure.
|
||||
|
||||
4
.github/claude-automerge-policy.json
vendored
4
.github/claude-automerge-policy.json
vendored
@@ -10,8 +10,8 @@
|
||||
"README.md",
|
||||
"CONTRIBUTING.md",
|
||||
"frontend/src/**",
|
||||
"api/src/icarus/queries/**",
|
||||
"api/src/icarus/models/**",
|
||||
"api/src/bracc/queries/**",
|
||||
"api/src/bracc/models/**",
|
||||
"api/tests/**",
|
||||
"etl/tests/**",
|
||||
"frontend/src/**/*.test.*"
|
||||
|
||||
109
.github/workflows/publish-release.yml
vendored
109
.github/workflows/publish-release.yml
vendored
@@ -23,26 +23,6 @@ on:
|
||||
description: "Release title (EN)"
|
||||
required: true
|
||||
type: string
|
||||
highlights_pt:
|
||||
description: "PT highlights (separate bullets with |)"
|
||||
required: true
|
||||
type: string
|
||||
highlights_en:
|
||||
description: "EN highlights (separate bullets with |)"
|
||||
required: true
|
||||
type: string
|
||||
patterns_included:
|
||||
description: "Comma-separated pattern IDs included in this release (use 'none' if not applicable)"
|
||||
required: true
|
||||
type: string
|
||||
technical_changes_pt:
|
||||
description: "PT technical changes (separate bullets with |)"
|
||||
required: true
|
||||
type: string
|
||||
technical_changes_en:
|
||||
description: "EN technical changes (separate bullets with |)"
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
@@ -124,116 +104,63 @@ jobs:
|
||||
COMPARE_URL: ${{ steps.validate.outputs.compare_url }}
|
||||
TITLE_PT: ${{ inputs.title_pt }}
|
||||
TITLE_EN: ${{ inputs.title_en }}
|
||||
HIGHLIGHTS_PT: ${{ inputs.highlights_pt }}
|
||||
HIGHLIGHTS_EN: ${{ inputs.highlights_en }}
|
||||
PATTERNS_INCLUDED: ${{ inputs.patterns_included }}
|
||||
TECHNICAL_CHANGES_PT: ${{ inputs.technical_changes_pt }}
|
||||
TECHNICAL_CHANGES_EN: ${{ inputs.technical_changes_en }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
DATE_UTC="$(date -u +"%Y-%m-%d")"
|
||||
export DATE_UTC
|
||||
|
||||
python - <<'PY'
|
||||
import json
|
||||
import os
|
||||
from textwrap import dedent
|
||||
|
||||
def split_pipe(raw: str) -> list[str]:
|
||||
normalized = raw.replace("\r\n", "\n").replace("\n", "|")
|
||||
return [item.strip(" -\t") for item in normalized.split("|") if item.strip()]
|
||||
|
||||
def split_csv(raw: str) -> list[str]:
|
||||
value = raw.strip()
|
||||
if value.lower() in {"none", "n/a", "na", "-"}:
|
||||
return []
|
||||
return [item.strip() for item in value.split(",") if item.strip()]
|
||||
|
||||
def bullets(items: list[str], fallback: str) -> str:
|
||||
if not items:
|
||||
return f"- {fallback}"
|
||||
return "\n".join(f"- {item}" for item in items)
|
||||
|
||||
highlights_pt = split_pipe(os.environ["HIGHLIGHTS_PT"])
|
||||
highlights_en = split_pipe(os.environ["HIGHLIGHTS_EN"])
|
||||
technical_changes_pt = split_pipe(os.environ["TECHNICAL_CHANGES_PT"])
|
||||
technical_changes_en = split_pipe(os.environ["TECHNICAL_CHANGES_EN"])
|
||||
patterns = split_csv(os.environ["PATTERNS_INCLUDED"])
|
||||
|
||||
release_notes = dedent(
|
||||
f"""
|
||||
cat > release_notes.md <<NOTES
|
||||
## PT-BR
|
||||
|
||||
{os.environ["TITLE_PT"]}
|
||||
${TITLE_PT}
|
||||
|
||||
### Escopo
|
||||
- Release publicada por marco.
|
||||
- Mudanças listadas de forma específica para facilitar auditoria pública.
|
||||
|
||||
### Destaques
|
||||
{bullets(highlights_pt, "Sem destaques declarados.")}
|
||||
|
||||
### Padrões incluídos
|
||||
{bullets(patterns, "Sem novos padrões nesta release.")}
|
||||
|
||||
### Mudanças técnicas
|
||||
{bullets(technical_changes_pt, "Sem mudanças técnicas declaradas.")}
|
||||
- Mudanças detalhadas por categorias no histórico desta versão.
|
||||
|
||||
### Integridade pública
|
||||
Os sinais e padrões refletem coocorrências em bases públicas e não constituem prova legal.
|
||||
|
||||
## EN
|
||||
|
||||
{os.environ["TITLE_EN"]}
|
||||
${TITLE_EN}
|
||||
|
||||
### Scope
|
||||
- Milestone-based release publication.
|
||||
- Changes are listed explicitly for public traceability.
|
||||
|
||||
### Highlights
|
||||
{bullets(highlights_en, "No highlights declared.")}
|
||||
|
||||
### Included patterns
|
||||
{bullets(patterns, "No new patterns in this release.")}
|
||||
|
||||
### Technical changes
|
||||
{bullets(technical_changes_en, "No technical changes declared.")}
|
||||
- Detailed changes grouped by category in this version history.
|
||||
|
||||
### Public integrity
|
||||
Signals and patterns reflect co-occurrence in public records and are not legal proof.
|
||||
|
||||
## Compatibility
|
||||
|
||||
- Breaking changes: none declared.
|
||||
- Migration required: no.
|
||||
- Breaking changes: declare explicitly when applicable.
|
||||
- Migration required: declare explicitly when applicable.
|
||||
|
||||
## Compare
|
||||
|
||||
{os.environ.get("COMPARE_URL", "")}
|
||||
${COMPARE_URL}
|
||||
|
||||
## Metadata
|
||||
|
||||
- Version: {os.environ["VERSION"]}
|
||||
- Target SHA: {os.environ["TARGET_SHA"]}
|
||||
- Previous tag: {os.environ["PREVIOUS_TAG"]}
|
||||
- Date (UTC): {os.environ.get("DATE_UTC", "")}
|
||||
"""
|
||||
).strip() + "\n"
|
||||
- Version: ${VERSION}
|
||||
- Target SHA: ${TARGET_SHA}
|
||||
- Previous tag: ${PREVIOUS_TAG}
|
||||
- Date (UTC): ${DATE_UTC}
|
||||
NOTES
|
||||
|
||||
with open("release_notes.md", "w", encoding="utf-8") as fh:
|
||||
fh.write(release_notes)
|
||||
python - <<'PY'
|
||||
import json
|
||||
import os
|
||||
|
||||
payload = {
|
||||
"version": os.environ["VERSION"],
|
||||
"date": os.environ.get("DATE_UTC", ""),
|
||||
"highlights_pt": highlights_pt,
|
||||
"highlights_en": highlights_en,
|
||||
"highlights_pt": [os.environ["TITLE_PT"]],
|
||||
"highlights_en": [os.environ["TITLE_EN"]],
|
||||
"api_changes": [],
|
||||
"data_changes": [],
|
||||
"privacy_compliance_changes": [],
|
||||
"patterns_included": patterns,
|
||||
"technical_changes_pt": technical_changes_pt,
|
||||
"technical_changes_en": technical_changes_en,
|
||||
"breaking_changes": False,
|
||||
"migration_required": False,
|
||||
"compare_url": os.environ.get("COMPARE_URL", ""),
|
||||
|
||||
24
.github/workflows/security.yml
vendored
24
.github/workflows/security.yml
vendored
@@ -6,18 +6,10 @@ on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
gitleaks:
|
||||
name: Gitleaks
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -37,7 +29,6 @@ jobs:
|
||||
bandit:
|
||||
name: Bandit (Python)
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -57,7 +48,6 @@ jobs:
|
||||
pip-audit:
|
||||
name: Pip Audit (Python deps)
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 20
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -69,14 +59,6 @@ jobs:
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
- name: Cache uv
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/uv
|
||||
key: ${{ runner.os }}-uv-security-${{ hashFiles('api/uv.lock', 'etl/uv.lock') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-uv-security-
|
||||
|
||||
- name: Export lock-compatible requirement sets
|
||||
run: |
|
||||
cd api
|
||||
@@ -93,7 +75,6 @@ jobs:
|
||||
public-privacy-gate:
|
||||
name: Public Privacy Gate
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -107,7 +88,6 @@ jobs:
|
||||
compliance-pack-gate:
|
||||
name: Compliance Pack Gate
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -120,9 +100,8 @@ jobs:
|
||||
|
||||
public-boundary-gate:
|
||||
name: Public Boundary Gate
|
||||
if: vars.PUBLIC_BOUNDARY_GATE_ENABLED == 'true'
|
||||
if: github.repository == 'brunoclz/world-transparency-graph'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -136,7 +115,6 @@ jobs:
|
||||
internal-instruction-boundary:
|
||||
name: Internal Instruction Boundary
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -75,7 +75,6 @@ scripts/audit-prompts/
|
||||
# Local report artifacts in repository root
|
||||
/*.pdf
|
||||
/*.html
|
||||
gitleaks-report*.json
|
||||
|
||||
# Playwright MCP cache
|
||||
.playwright-mcp/
|
||||
@@ -91,7 +90,7 @@ data/tse/
|
||||
# Local MCP runtime config (keep example only)
|
||||
.mcp.json
|
||||
|
||||
# Internal assistant instructions (must never be published)
|
||||
# Internal assistant instruction files (must never be published)
|
||||
CLAUDE.md
|
||||
AGENTS.md
|
||||
AGENTS*.md
|
||||
|
||||
142
Makefile
142
Makefile
@@ -1,14 +1,125 @@
|
||||
.PHONY: dev stop seed bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics check neutrality
|
||||
.PHONY: dev stop api etl frontend lint type-check test test-api test-etl test-frontend test-integration-api test-integration-etl test-integration check seed clean download-cnpj download-tse download-transparencia download-sanctions download-all etl-cnpj etl-cnpj-stream etl-tse etl-transparencia etl-sanctions etl-all link-persons bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics
|
||||
|
||||
# ── Development ─────────────────────────────────────────
|
||||
dev:
|
||||
docker compose -f infra/docker-compose.yml up -d
|
||||
docker compose up -d
|
||||
|
||||
stop:
|
||||
docker compose -f infra/docker-compose.yml down
|
||||
docker compose down
|
||||
|
||||
# ── API ─────────────────────────────────────────────────
|
||||
api:
|
||||
cd api && uv run uvicorn bracc.main:app --reload --host 0.0.0.0 --port 8000
|
||||
|
||||
# ── ETL ─────────────────────────────────────────────────
|
||||
etl:
|
||||
cd etl && uv run bracc-etl --help
|
||||
|
||||
seed:
|
||||
bash infra/scripts/seed-dev.sh
|
||||
|
||||
# ── CNPJ Data ──────────────────────────────────────────
|
||||
download-cnpj:
|
||||
cd etl && uv run python scripts/download_cnpj.py --reference-only
|
||||
cd etl && uv run python scripts/download_cnpj.py --files 1
|
||||
|
||||
download-cnpj-all:
|
||||
cd etl && uv run python scripts/download_cnpj.py --files 10
|
||||
|
||||
etl-cnpj:
|
||||
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
|
||||
|
||||
etl-cnpj-dev:
|
||||
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
|
||||
|
||||
etl-cnpj-stream:
|
||||
cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --streaming
|
||||
|
||||
# ── TSE Data ──────────────────────────────────────────
|
||||
download-tse:
|
||||
cd etl && uv run python scripts/download_tse.py --years 2024
|
||||
|
||||
etl-tse:
|
||||
cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
|
||||
|
||||
etl-tse-dev:
|
||||
cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
|
||||
|
||||
# ── Transparencia Data ────────────────────────────────
|
||||
download-transparencia:
|
||||
cd etl && uv run python scripts/download_transparencia.py --year 2025
|
||||
|
||||
etl-transparencia:
|
||||
cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
|
||||
|
||||
etl-transparencia-dev:
|
||||
cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
|
||||
|
||||
# ── Sanctions Data ────────────────────────────────────
|
||||
download-sanctions:
|
||||
cd etl && uv run python scripts/download_sanctions.py
|
||||
|
||||
etl-sanctions:
|
||||
cd etl && uv run bracc-etl run --source sanctions --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
|
||||
|
||||
# ── All Data ──────────────────────────────────────────
|
||||
download-all: download-cnpj download-tse download-transparencia download-sanctions
|
||||
|
||||
etl-all: etl-cnpj etl-tse etl-transparencia etl-sanctions
|
||||
|
||||
# ── Entity Resolution ────────────────────────────────────
|
||||
link-persons:
|
||||
docker compose exec neo4j cypher-shell -u neo4j -p "$${NEO4J_PASSWORD}" -f /scripts/link_persons.cypher
|
||||
|
||||
# ── Frontend ────────────────────────────────────────────
|
||||
frontend:
|
||||
cd frontend && npm run dev
|
||||
|
||||
# ── Quality ─────────────────────────────────────────────
|
||||
lint:
|
||||
cd api && uv run ruff check src/ tests/
|
||||
cd etl && uv run ruff check src/ tests/
|
||||
cd frontend && npm run lint
|
||||
|
||||
type-check:
|
||||
cd api && uv run mypy src/
|
||||
cd etl && uv run mypy src/
|
||||
cd frontend && npm run type-check
|
||||
|
||||
test-api:
|
||||
cd api && uv run pytest
|
||||
|
||||
test-etl:
|
||||
cd etl && uv run pytest
|
||||
|
||||
test-frontend:
|
||||
cd frontend && npm test
|
||||
|
||||
test: test-api test-etl test-frontend
|
||||
|
||||
# ── Integration tests ─────────────────────────────────
|
||||
test-integration-api:
|
||||
cd api && uv run pytest -m integration
|
||||
|
||||
test-integration-etl:
|
||||
cd etl && uv run pytest -m integration
|
||||
|
||||
test-integration: test-integration-api test-integration-etl
|
||||
|
||||
# ── Full check (run before commit) ─────────────────────
|
||||
check: lint type-check test
|
||||
@echo "All checks passed."
|
||||
|
||||
# ── Neutrality audit ───────────────────────────────────
|
||||
neutrality:
|
||||
@! grep -rn \
|
||||
"suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty\|CRITICAL\|HIGH.*severity\|MEDIUM.*severity\|LOW.*severity" \
|
||||
api/src/ etl/src/ frontend/src/ \
|
||||
--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
|
||||
|| (echo "NEUTRALITY VIOLATION FOUND" && exit 1)
|
||||
@echo "Neutrality check passed."
|
||||
|
||||
# ── Bootstrap ─────────────────────────────────────────────
|
||||
bootstrap-demo:
|
||||
bash scripts/bootstrap_public_demo.sh --profile demo
|
||||
|
||||
@@ -24,6 +135,7 @@ bootstrap-all-noninteractive:
|
||||
bootstrap-all-report:
|
||||
python3 scripts/run_bootstrap_all.py --repo-root . --report-latest
|
||||
|
||||
# ── Quality checks ────────────────────────────────────────
|
||||
check-public-claims:
|
||||
python3 scripts/check_public_claims.py --repo-root .
|
||||
|
||||
@@ -36,22 +148,20 @@ check-pipeline-contracts:
|
||||
check-pipeline-inputs:
|
||||
python3 scripts/check_pipeline_inputs.py
|
||||
|
||||
# ── Generators ────────────────────────────────────────────
|
||||
generate-pipeline-status:
|
||||
python3 scripts/generate_pipeline_status.py --registry-path docs/source_registry_br_v1.csv --output docs/pipeline_status.md
|
||||
python3 scripts/generate_pipeline_status.py
|
||||
|
||||
generate-source-summary:
|
||||
python3 scripts/generate_data_sources_summary.py --registry-path docs/source_registry_br_v1.csv --docs-path docs/data-sources.md
|
||||
python3 scripts/generate_data_sources_summary.py
|
||||
|
||||
generate-reference-metrics:
|
||||
python3 scripts/generate_reference_metrics.py --json-output audit-results/public-trust/latest/neo4j-reference-metrics.json --doc-output docs/reference_metrics.md
|
||||
python3 scripts/generate_reference_metrics.py
|
||||
|
||||
check:
|
||||
cd api && bash ../scripts/ci/python_quality.sh
|
||||
cd etl && bash ../scripts/ci/python_quality.sh
|
||||
cd frontend && bash ../scripts/ci/frontend_quality.sh
|
||||
|
||||
neutrality:
|
||||
@! grep -rn "suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty" \
|
||||
api/src/ etl/src/ frontend/src/ \
|
||||
--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
|
||||
|| (echo "NEUTRALITY VIOLATION: banned words found in source" && exit 1)
|
||||
# ── Cleanup ─────────────────────────────────────────────
|
||||
clean:
|
||||
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
|
||||
find . -type d -name .pytest_cache -exec rm -rf {} + 2>/dev/null || true
|
||||
find . -type d -name .mypy_cache -exec rm -rf {} + 2>/dev/null || true
|
||||
find . -type d -name .ruff_cache -exec rm -rf {} + 2>/dev/null || true
|
||||
rm -rf frontend/dist
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[project]
|
||||
name = "bracc-api"
|
||||
version = "0.1.0"
|
||||
description = "BRACC API — Brazilian public data anti-corruption graph tool"
|
||||
description = "BR-ACC API — Brazilian public data anti-corruption graph tool"
|
||||
requires-python = ">=3.12"
|
||||
license = "AGPL-3.0-or-later"
|
||||
dependencies = [
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
@@ -17,14 +18,15 @@ class Settings(BaseSettings):
|
||||
jwt_secret_key: str = "change-me-in-production"
|
||||
jwt_algorithm: str = "HS256"
|
||||
jwt_expire_minutes: int = 1440
|
||||
auth_cookie_name: str = "bracc_session"
|
||||
auth_cookie_secure: bool = False
|
||||
auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
|
||||
trust_proxy_headers: bool = False
|
||||
rate_limit_anon: str = "60/minute"
|
||||
rate_limit_auth: str = "300/minute"
|
||||
invite_code: str = ""
|
||||
cors_origins: str = "http://localhost:3000"
|
||||
auth_cookie_name: str = "bracc_session"
|
||||
auth_cookie_secure: bool = False
|
||||
auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
|
||||
trust_proxy_headers: bool = False
|
||||
share_token_ttl_hours: int = 168 # 7 days
|
||||
product_tier: str = "community"
|
||||
patterns_enabled: bool = False
|
||||
public_mode: bool = False
|
||||
@@ -37,7 +39,16 @@ class Settings(BaseSettings):
|
||||
pattern_srp_min_orgs: int = 5
|
||||
pattern_inexig_min_recurrence: int = 3
|
||||
pattern_max_evidence_refs: int = 50
|
||||
share_token_ttl_hours: int = 168
|
||||
|
||||
# Pattern hardening defaults (decision-complete contract)
|
||||
pattern_temporal_window_years: int = Field(default=4, ge=1, le=20)
|
||||
pattern_min_contract_value: float = Field(default=100000.0, ge=0)
|
||||
pattern_min_contract_count: int = Field(default=2, ge=1)
|
||||
pattern_min_debt_value: float = Field(default=50000.0, ge=0)
|
||||
pattern_same_as_min_confidence: float = Field(default=0.85, ge=0, le=1)
|
||||
pattern_pep_min_confidence: float = Field(default=0.85, ge=0, le=1)
|
||||
pattern_min_recurrence: int = Field(default=2, ge=1)
|
||||
pattern_min_discrepancy_ratio: float = Field(default=0.30, ge=0, le=1)
|
||||
|
||||
model_config = {"env_prefix": "", "env_file": ".env"}
|
||||
|
||||
|
||||
@@ -35,7 +35,12 @@ async def close_driver() -> None:
|
||||
|
||||
|
||||
async def get_driver(request: Request) -> AsyncDriver:
|
||||
driver: AsyncDriver = request.app.state.neo4j_driver
|
||||
driver: AsyncDriver | None = getattr(request.app.state, "neo4j_driver", None)
|
||||
if driver is None:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
||||
detail="Database connection not available",
|
||||
)
|
||||
return driver
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
from collections.abc import AsyncIterator
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from slowapi import _rate_limit_exceeded_handler
|
||||
from slowapi.errors import RateLimitExceeded
|
||||
@@ -51,7 +51,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="BRACC API",
|
||||
title="BR-ACC API",
|
||||
description="Brazilian public data graph analysis tool",
|
||||
version="0.1.0",
|
||||
lifespan=lifespan,
|
||||
@@ -85,5 +85,5 @@ app.include_router(investigation.shared_router)
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health(request: Request) -> dict[str, str]:
|
||||
return {"status": "ok", "version": request.app.version}
|
||||
async def health() -> dict[str, str]:
|
||||
return {"status": "ok"}
|
||||
|
||||
@@ -53,7 +53,7 @@ def _is_pep_record(record: dict[str, Any]) -> bool:
|
||||
|
||||
for field in ("role", "cargo"):
|
||||
value = record.get(field)
|
||||
if isinstance(value, str) and value.strip().lower() in PEP_ROLES:
|
||||
if isinstance(value, str) and any(kw in value.strip().lower() for kw in PEP_ROLES):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
@@ -6,37 +6,20 @@ from bracc.config import settings
|
||||
from bracc.services.auth_service import decode_access_token
|
||||
|
||||
|
||||
def _extract_token(request: Request) -> str | None:
|
||||
def _get_rate_limit_key(request: Request) -> str:
|
||||
"""Extract user_id from JWT (Bearer or cookie) for rate limiting, fallback to IP."""
|
||||
auth = request.headers.get("authorization", "")
|
||||
if auth.startswith("Bearer "):
|
||||
return auth[7:].strip()
|
||||
cookie_token = request.cookies.get(settings.auth_cookie_name)
|
||||
if isinstance(cookie_token, str) and cookie_token.strip():
|
||||
return cookie_token.strip()
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_client_ip(request: Request) -> str:
|
||||
if settings.trust_proxy_headers:
|
||||
forwarded = request.headers.get("x-forwarded-for", "")
|
||||
if forwarded:
|
||||
first_hop = forwarded.split(",", 1)[0].strip()
|
||||
if first_hop:
|
||||
return first_hop
|
||||
real_ip = request.headers.get("x-real-ip", "").strip()
|
||||
if real_ip:
|
||||
return real_ip
|
||||
return get_remote_address(request)
|
||||
|
||||
|
||||
def _get_rate_limit_key(request: Request) -> str:
|
||||
"""Extract user_id from JWT for rate limiting, fallback to IP."""
|
||||
token = _extract_token(request)
|
||||
if token:
|
||||
token = auth[7:]
|
||||
user_id = decode_access_token(token)
|
||||
if user_id:
|
||||
return f"user:{user_id}"
|
||||
return _resolve_client_ip(request)
|
||||
cookie_token = request.cookies.get(settings.auth_cookie_name)
|
||||
if isinstance(cookie_token, str) and cookie_token.strip():
|
||||
user_id = decode_access_token(cookie_token.strip())
|
||||
if user_id:
|
||||
return f"user:{user_id}"
|
||||
return get_remote_address(request)
|
||||
|
||||
|
||||
limiter = Limiter(
|
||||
|
||||
@@ -1,27 +1,15 @@
|
||||
MATCH (center) WHERE elementId(center) = $entity_id
|
||||
MATCH (center)
|
||||
WHERE elementId(center) = $entity_id
|
||||
AND (center:Person OR center:Partner OR center:Company OR center:Contract OR center:Sanction OR center:Election
|
||||
OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education
|
||||
OR center:Convenio OR center:LaborStats OR center:PublicOffice)
|
||||
WITH center,
|
||||
CASE
|
||||
WHEN coalesce($include_probable, false) THEN
|
||||
"SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS"
|
||||
ELSE
|
||||
"SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS"
|
||||
END AS relationship_filter
|
||||
CALL apoc.path.subgraphAll(center, {
|
||||
relationshipFilter: relationship_filter,
|
||||
labelFilter: "-User|-Investigation|-Annotation|-Tag",
|
||||
maxLevel: $depth,
|
||||
limit: 200
|
||||
})
|
||||
YIELD nodes, relationships
|
||||
WITH center, nodes, relationships
|
||||
UNWIND relationships AS r
|
||||
WITH center,
|
||||
startNode(r) AS src,
|
||||
endNode(r) AS tgt,
|
||||
r
|
||||
OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS*1..4]-(connected)
|
||||
WHERE length(p) <= $depth
|
||||
AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag))
|
||||
WITH center, p
|
||||
UNWIND CASE WHEN p IS NULL THEN [] ELSE relationships(p) END AS r
|
||||
WITH DISTINCT center, r, startNode(r) AS src, endNode(r) AS tgt
|
||||
WHERE coalesce($include_probable, false) OR type(r) <> "POSSIBLE_SAME_AS"
|
||||
RETURN center AS e,
|
||||
r,
|
||||
CASE WHEN elementId(src) = elementId(center) THEN tgt ELSE src END AS connected,
|
||||
|
||||
@@ -1,14 +1,21 @@
|
||||
MATCH (center) WHERE elementId(center) = $entity_id
|
||||
MATCH (center)
|
||||
WHERE elementId(center) = $entity_id
|
||||
AND (center:Person OR center:Company OR center:Contract OR center:Sanction OR center:Election
|
||||
OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education
|
||||
OR center:Convenio OR center:LaborStats OR center:PublicOffice
|
||||
OR center:OffshoreEntity OR center:OffshoreOfficer OR center:GlobalPEP
|
||||
OR center:CVMProceeding OR center:Expense)
|
||||
CALL apoc.path.subgraphAll(center, {
|
||||
relationshipFilter: "SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU",
|
||||
labelFilter: $label_filter,
|
||||
maxLevel: $depth,
|
||||
limit: 200
|
||||
})
|
||||
YIELD nodes, relationships
|
||||
RETURN nodes, relationships, elementId(center) AS center_id
|
||||
OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU*1..4]-(n)
|
||||
WHERE length(p) <= $depth
|
||||
AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag))
|
||||
WITH center, collect(p) AS paths
|
||||
WITH center,
|
||||
reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes,
|
||||
reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels
|
||||
UNWIND raw_nodes AS n
|
||||
WITH center, collect(DISTINCT n) AS nodes, raw_rels
|
||||
UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r
|
||||
WITH center, nodes, collect(DISTINCT r) AS rels
|
||||
RETURN nodes,
|
||||
[x IN rels WHERE x IS NOT NULL] AS relationships,
|
||||
elementId(center) AS center_id
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
MATCH (i:Investigation)
|
||||
WHERE i.share_token = $token
|
||||
AND (i.share_expires_at IS NULL OR i.share_expires_at > datetime())
|
||||
MATCH (i:Investigation {share_token: $token})
|
||||
OPTIONAL MATCH (i)-[:INCLUDES]->(e)
|
||||
WITH i, collect(coalesce(e.cpf, e.cnpj, e.contract_id, e.sanction_id, e.amendment_id, e.cnes_code, e.finance_id, e.embargo_id, e.school_id, e.convenio_id, e.stats_id, elementId(e))) AS eids
|
||||
RETURN i.id AS id,
|
||||
@@ -9,5 +7,4 @@ RETURN i.id AS id,
|
||||
i.created_at AS created_at,
|
||||
i.updated_at AS updated_at,
|
||||
i.share_token AS share_token,
|
||||
i.share_expires_at AS share_expires_at,
|
||||
[x IN eids WHERE x IS NOT NULL] AS entity_ids
|
||||
|
||||
@@ -4,8 +4,7 @@ CREATE (i:Investigation {
|
||||
description: $description,
|
||||
created_at: datetime(),
|
||||
updated_at: datetime(),
|
||||
share_token: null,
|
||||
share_expires_at: null
|
||||
share_token: null
|
||||
})
|
||||
WITH i
|
||||
MATCH (u:User {id: $user_id})
|
||||
@@ -16,5 +15,4 @@ RETURN i.id AS id,
|
||||
i.created_at AS created_at,
|
||||
i.updated_at AS updated_at,
|
||||
i.share_token AS share_token,
|
||||
i.share_expires_at AS share_expires_at,
|
||||
[] AS entity_ids
|
||||
|
||||
@@ -7,5 +7,4 @@ RETURN i.id AS id,
|
||||
i.created_at AS created_at,
|
||||
i.updated_at AS updated_at,
|
||||
i.share_token AS share_token,
|
||||
i.share_expires_at AS share_expires_at,
|
||||
[x IN eids WHERE x IS NOT NULL] AS entity_ids
|
||||
|
||||
@@ -13,5 +13,4 @@ RETURN total,
|
||||
i.created_at AS created_at,
|
||||
i.updated_at AS updated_at,
|
||||
i.share_token AS share_token,
|
||||
i.share_expires_at AS share_expires_at,
|
||||
[x IN eids WHERE x IS NOT NULL] AS entity_ids
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
MATCH (u:User {id: $user_id})-[:OWNS]->(i:Investigation {id: $id})
|
||||
SET i.share_token = $share_token,
|
||||
i.share_expires_at = $share_expires_at,
|
||||
i.updated_at = datetime()
|
||||
RETURN i.id AS id,
|
||||
i.share_token AS share_token,
|
||||
i.share_expires_at AS share_expires_at
|
||||
i.share_token AS share_token
|
||||
|
||||
@@ -11,5 +11,4 @@ RETURN i.id AS id,
|
||||
i.created_at AS created_at,
|
||||
i.updated_at AS updated_at,
|
||||
i.share_token AS share_token,
|
||||
i.share_expires_at AS share_expires_at,
|
||||
[x IN eids WHERE x IS NOT NULL] AS entity_ids
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
MATCH (n) WHERE elementId(n) = $entity_id
|
||||
MATCH (n)
|
||||
WHERE elementId(n) = $entity_id
|
||||
AND (n:Person OR n:Company OR n:Contract OR n:Sanction OR n:Election
|
||||
OR n:Amendment OR n:Finance OR n:Embargo OR n:Health OR n:Education
|
||||
OR n:Convenio OR n:LaborStats OR n:PublicOffice)
|
||||
RETURN apoc.node.degree(n) AS degree
|
||||
RETURN COUNT { (n)--() } AS degree
|
||||
|
||||
@@ -2,11 +2,31 @@ MATCH (center:Company)
|
||||
WHERE elementId(center) = $company_id
|
||||
OR center.cnpj = $company_identifier
|
||||
OR center.cnpj = $company_identifier_formatted
|
||||
CALL apoc.path.subgraphAll(center, {
|
||||
relationshipFilter: "SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU",
|
||||
labelFilter: "+Company|+Contract|+Sanction|+Finance|+Amendment|+Convenio|+Bid|+MunicipalContract|+MunicipalBid|-Person|-Partner|-User|-Investigation|-Annotation|-Tag",
|
||||
maxLevel: $depth,
|
||||
limit: 200
|
||||
})
|
||||
YIELD nodes, relationships
|
||||
RETURN nodes, relationships, elementId(center) AS center_id
|
||||
OPTIONAL MATCH p=(center)-[:SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU*1..4]-(n)
|
||||
WHERE length(p) <= $depth
|
||||
AND all(
|
||||
x IN nodes(p)
|
||||
WHERE NOT (
|
||||
"Person" IN labels(x)
|
||||
OR "Partner" IN labels(x)
|
||||
OR "User" IN labels(x)
|
||||
OR "Investigation" IN labels(x)
|
||||
OR "Annotation" IN labels(x)
|
||||
OR "Tag" IN labels(x)
|
||||
)
|
||||
)
|
||||
AND (
|
||||
n:Company OR n:Contract OR n:Sanction OR n:Finance OR n:Amendment OR n:Convenio
|
||||
OR n:Bid OR n:MunicipalContract OR n:MunicipalBid OR n IS NULL
|
||||
)
|
||||
WITH center, collect(p) AS paths
|
||||
WITH center,
|
||||
reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes,
|
||||
reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels
|
||||
UNWIND raw_nodes AS n
|
||||
WITH center, collect(DISTINCT n) AS nodes, raw_rels
|
||||
UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r
|
||||
WITH center, nodes, collect(DISTINCT r) AS rels
|
||||
RETURN nodes,
|
||||
[x IN rels WHERE x IS NOT NULL] AS relationships,
|
||||
elementId(center) AS center_id
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// BRACC Neo4j Schema — Constraints and Indexes
|
||||
// BR-ACC Neo4j Schema — Constraints and Indexes
|
||||
// Applied on database initialization
|
||||
|
||||
// ── Uniqueness Constraints ──────────────────────────────
|
||||
|
||||
@@ -6,6 +6,7 @@ from neo4j import AsyncSession
|
||||
from bracc.dependencies import get_session
|
||||
from bracc.models.baseline import BaselineResponse
|
||||
from bracc.services.baseline_service import BASELINE_QUERIES, run_all_baselines, run_baseline
|
||||
from bracc.services.public_guard import enforce_entity_lookup_enabled
|
||||
|
||||
router = APIRouter(prefix="/api/v1/baseline", tags=["baseline"])
|
||||
|
||||
@@ -16,6 +17,7 @@ async def get_baseline_for_entity(
|
||||
session: Annotated[AsyncSession, Depends(get_session)],
|
||||
dimension: Annotated[str | None, Query()] = None,
|
||||
) -> BaselineResponse:
|
||||
enforce_entity_lookup_enabled()
|
||||
if dimension:
|
||||
if dimension not in BASELINE_QUERIES:
|
||||
available = list(BASELINE_QUERIES.keys())
|
||||
|
||||
@@ -182,7 +182,7 @@ async def get_entity_timeline(
|
||||
date=event_date,
|
||||
label=str(label),
|
||||
entity_type=entity_type,
|
||||
properties=sanitize_props(props),
|
||||
properties=sanitize_public_properties(sanitize_props(props)),
|
||||
sources=[SourceAttribution(database="neo4j_graph")],
|
||||
))
|
||||
|
||||
|
||||
@@ -311,7 +311,7 @@ async def export_investigation_pdf(
|
||||
cpf_val = node.get("cpf")
|
||||
if cpf_val and isinstance(cpf_val, str):
|
||||
role = str(node.get("role", node.get("cargo", ""))).lower()
|
||||
is_pep = role in PEP_ROLES
|
||||
is_pep = any(kw in role for kw in PEP_ROLES)
|
||||
if not is_pep:
|
||||
if "." in document and "-" in document:
|
||||
document = mask_formatted_cpf(document)
|
||||
|
||||
@@ -6,6 +6,7 @@ from neo4j import AsyncSession
|
||||
|
||||
from bracc.dependencies import get_session
|
||||
from bracc.services.neo4j_service import execute_query_single
|
||||
from bracc.services.public_guard import should_hide_person_entities
|
||||
from bracc.services.source_registry import load_source_registry, source_registry_summary
|
||||
|
||||
router = APIRouter(prefix="/api/v1/meta", tags=["meta"])
|
||||
@@ -40,7 +41,9 @@ async def database_stats(
|
||||
result = {
|
||||
"total_nodes": record["total_nodes"] if record else 0,
|
||||
"total_relationships": record["total_relationships"] if record else 0,
|
||||
"person_count": record["person_count"] if record else 0,
|
||||
"person_count": (
|
||||
0 if should_hide_person_entities() else (record["person_count"] if record else 0)
|
||||
),
|
||||
"company_count": record["company_count"] if record else 0,
|
||||
"health_count": record["health_count"] if record else 0,
|
||||
"finance_count": record["finance_count"] if record else 0,
|
||||
|
||||
@@ -57,12 +57,6 @@ async def public_meta(
|
||||
return {
|
||||
"product": "World Transparency Graph",
|
||||
"mode": "public_safe",
|
||||
"dataset_scope": {
|
||||
"local_default": "demo_local",
|
||||
"ingestion_mode": "byo_ingestion",
|
||||
"reference_metrics": "reference_production_snapshot",
|
||||
},
|
||||
"metrics_as_of_utc": "2026-03-01T23:05:00Z",
|
||||
"total_nodes": record["total_nodes"] if record else 0,
|
||||
"total_relationships": record["total_relationships"] if record else 0,
|
||||
"company_count": record["company_count"] if record else 0,
|
||||
|
||||
@@ -61,9 +61,9 @@ async def search_entities(
|
||||
{
|
||||
"query": _escape_lucene(q),
|
||||
"entity_type": type_filter,
|
||||
"hide_person_entities": hide_person_entities,
|
||||
"skip": skip,
|
||||
"limit": size,
|
||||
"hide_person_entities": hide_person_entities,
|
||||
},
|
||||
)
|
||||
total_record = await execute_query_single(
|
||||
|
||||
@@ -9,6 +9,17 @@ from testcontainers.neo4j import Neo4jContainer
|
||||
from bracc.main import app
|
||||
|
||||
|
||||
def _iter_cypher_statements(path: Path) -> list[str]:
|
||||
# Strip comment-only lines before splitting to avoid dropping statements
|
||||
# that are preceded by section headers.
|
||||
filtered_lines = [
|
||||
line for line in path.read_text().splitlines()
|
||||
if line.strip() and not line.strip().startswith("//")
|
||||
]
|
||||
text = "\n".join(filtered_lines)
|
||||
return [stmt.strip() for stmt in text.split(";") if stmt.strip()]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def neo4j_container() -> Neo4jContainer: # type: ignore[misc]
|
||||
"""Start a Neo4j container for integration tests."""
|
||||
@@ -25,21 +36,34 @@ def neo4j_uri(neo4j_container: Neo4jContainer) -> str:
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def neo4j_auth(neo4j_container: Neo4jContainer) -> tuple[str, str]:
|
||||
return ("neo4j", neo4j_container.NEO4J_ADMIN_PASSWORD)
|
||||
# testcontainers.neo4j API changed: older versions exposed NEO4J_ADMIN_PASSWORD,
|
||||
# newer versions expose username/password attributes.
|
||||
username = getattr(neo4j_container, "username", "neo4j")
|
||||
password = getattr(
|
||||
neo4j_container,
|
||||
"password",
|
||||
getattr(neo4j_container, "NEO4J_ADMIN_PASSWORD", None),
|
||||
)
|
||||
if password is None:
|
||||
msg = "Could not resolve Neo4j testcontainer password"
|
||||
raise RuntimeError(msg)
|
||||
return (username, password)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@pytest.fixture
|
||||
async def neo4j_driver(
|
||||
neo4j_uri: str, neo4j_auth: tuple[str, str]
|
||||
) -> AsyncIterator[AsyncDriver]:
|
||||
# Function-scoped driver avoids loop affinity issues between async tests.
|
||||
driver = AsyncGraphDatabase.driver(neo4j_uri, auth=neo4j_auth)
|
||||
async with driver.session() as session:
|
||||
# Keep tests deterministic across function scope by resetting test data.
|
||||
await session.run("MATCH (n) DETACH DELETE n")
|
||||
# Apply schema
|
||||
schema_path = Path(__file__).parent.parent.parent.parent / "infra" / "neo4j" / "init.cypher"
|
||||
if schema_path.exists():
|
||||
async with driver.session() as session:
|
||||
for statement in schema_path.read_text().split(";"):
|
||||
stmt = statement.strip()
|
||||
if stmt and not stmt.startswith("//"):
|
||||
for stmt in _iter_cypher_statements(schema_path):
|
||||
await session.run(stmt)
|
||||
# Seed dev data
|
||||
seed_path = (
|
||||
@@ -47,9 +71,7 @@ async def neo4j_driver(
|
||||
)
|
||||
if seed_path.exists():
|
||||
async with driver.session() as session:
|
||||
for statement in seed_path.read_text().split(";"):
|
||||
stmt = statement.strip()
|
||||
if stmt and not stmt.startswith("//"):
|
||||
for stmt in _iter_cypher_statements(seed_path):
|
||||
await session.run(stmt)
|
||||
yield driver
|
||||
await driver.close()
|
||||
|
||||
@@ -34,7 +34,11 @@ def _setup_mock_session(driver: MagicMock, records: list[MagicMock]) -> AsyncMoc
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_register_success(client: AsyncClient) -> None:
|
||||
async def test_register_success(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
from bracc.config import settings
|
||||
|
||||
monkeypatch.setattr(settings, "invite_code", "")
|
||||
|
||||
record = _mock_record({
|
||||
"id": "user-uuid",
|
||||
"email": "test@example.com",
|
||||
@@ -56,19 +60,15 @@ async def test_register_success(client: AsyncClient) -> None:
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_register_bad_invite(client: AsyncClient) -> None:
|
||||
async def test_register_bad_invite(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
from bracc.config import settings
|
||||
|
||||
original = settings.invite_code
|
||||
try:
|
||||
settings.invite_code = "secret-code"
|
||||
monkeypatch.setattr(settings, "invite_code", "secret-code")
|
||||
response = await client.post(
|
||||
"/api/v1/auth/register",
|
||||
json={"email": "test@example.com", "password": "password123", "invite_code": "wrong"},
|
||||
)
|
||||
assert response.status_code == 403
|
||||
finally:
|
||||
settings.invite_code = original
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
@@ -155,16 +155,25 @@ async def test_me_invalid_token(client: AsyncClient) -> None:
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_register_duplicate_email(client: AsyncClient) -> None:
|
||||
async def test_register_duplicate_email(
|
||||
client: AsyncClient,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
from neo4j.exceptions import ConstraintError
|
||||
|
||||
from bracc.config import settings
|
||||
from bracc.main import app
|
||||
|
||||
monkeypatch.setattr(settings, "invite_code", "")
|
||||
|
||||
driver = app.state.neo4j_driver
|
||||
mock_session = AsyncMock()
|
||||
mock_session.run = AsyncMock(side_effect=Exception("Constraint violation"))
|
||||
mock_session.run = AsyncMock(side_effect=ConstraintError("Node already exists"))
|
||||
driver.session.return_value.__aenter__ = AsyncMock(return_value=mock_session)
|
||||
|
||||
with pytest.raises(Exception, match="Constraint violation"):
|
||||
await client.post(
|
||||
response = await client.post(
|
||||
"/api/v1/auth/register",
|
||||
json={"email": "duplicate@example.com", "password": "password123"},
|
||||
)
|
||||
assert response.status_code == 409
|
||||
assert response.json()["detail"] == "Email already registered"
|
||||
|
||||
@@ -61,7 +61,9 @@ def test_decode_access_token_invalid() -> None:
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_register_user_success() -> None:
|
||||
async def test_register_user_success(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
monkeypatch.setattr(settings, "invite_code", "")
|
||||
|
||||
mock_record = _mock_record({
|
||||
"id": "user-uuid",
|
||||
"email": "test@example.com",
|
||||
@@ -80,15 +82,11 @@ async def test_register_user_success() -> None:
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_register_user_bad_invite() -> None:
|
||||
original = settings.invite_code
|
||||
try:
|
||||
settings.invite_code = "secret-code"
|
||||
async def test_register_user_bad_invite(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
monkeypatch.setattr(settings, "invite_code", "secret-code")
|
||||
session = AsyncMock()
|
||||
with pytest.raises(ValueError, match="Invalid invite code"):
|
||||
await register_user(session, "test@example.com", "password123", "wrong-code")
|
||||
finally:
|
||||
settings.invite_code = original
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
|
||||
@@ -68,6 +68,28 @@ class TestIsPepRecord:
|
||||
def test_cargo_field(self) -> None:
|
||||
assert _is_pep_record({"name": "X", "cpf": "11111111111", "cargo": "Deputado"})
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"role",
|
||||
[
|
||||
"Deputado Federal",
|
||||
"deputado federal",
|
||||
"DEPUTADO FEDERAL",
|
||||
"Senador da Republica",
|
||||
"senadora da republica",
|
||||
"Vereador Suplente",
|
||||
"Ministro de Estado",
|
||||
"Governadora do Estado de Sao Paulo",
|
||||
"Presidente da Republica",
|
||||
],
|
||||
)
|
||||
def test_compound_role_detected_as_pep(self, role: str) -> None:
|
||||
"""Compound PEP roles like 'deputado federal' must be detected via substring match."""
|
||||
assert _is_pep_record({"name": "X", "cpf": "11111111111", "role": role})
|
||||
|
||||
def test_compound_cargo_detected_as_pep(self) -> None:
|
||||
"""Compound PEP cargo like 'Deputado Federal' must be detected via substring match."""
|
||||
assert _is_pep_record({"name": "X", "cpf": "11111111111", "cargo": "Deputado Federal"})
|
||||
|
||||
def test_non_pep_role(self) -> None:
|
||||
assert not _is_pep_record({"name": "X", "cpf": "11111111111", "role": "assessor"})
|
||||
|
||||
@@ -99,6 +121,18 @@ class TestCollectPepCpfs:
|
||||
data = {"a": {"b": {"c": [{"cpf": "33333333333", "is_pep": True}]}}}
|
||||
assert "33333333333" in _collect_pep_cpfs(data)
|
||||
|
||||
def test_compound_role_collected(self) -> None:
|
||||
"""Compound roles like 'Deputado Federal' must be recognized in the walk."""
|
||||
data = {
|
||||
"results": [
|
||||
{"cpf": "11111111111", "role": "Deputado Federal"},
|
||||
{"cpf": "22222222222", "role": "assessor parlamentar"},
|
||||
]
|
||||
}
|
||||
peps = _collect_pep_cpfs(data)
|
||||
assert "11111111111" in peps
|
||||
assert "22222222222" not in peps
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unit tests for mask_cpfs_in_json
|
||||
@@ -205,4 +239,4 @@ async def test_health_not_masked(client: AsyncClient) -> None:
|
||||
"""Non-CPF JSON responses pass through unchanged."""
|
||||
resp = await client.get("/health")
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["status"] == "ok" and "version" in resp.json()
|
||||
assert resp.json() == {"status": "ok"}
|
||||
|
||||
@@ -8,9 +8,7 @@ from httpx import AsyncClient
|
||||
async def test_health_returns_ok(client: AsyncClient) -> None:
|
||||
response = await client.get("/health")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["status"] == "ok"
|
||||
assert "version" in data
|
||||
assert response.json() == {"status": "ok"}
|
||||
assert response.headers["x-content-type-options"] == "nosniff"
|
||||
assert response.headers["x-frame-options"] == "DENY"
|
||||
assert response.headers["referrer-policy"] == "no-referrer"
|
||||
|
||||
@@ -1,120 +0,0 @@
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import pytest
|
||||
from httpx import AsyncClient
|
||||
|
||||
from bracc.config import settings
|
||||
from bracc.models.pattern import PATTERN_METADATA
|
||||
from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES
|
||||
from bracc.services.neo4j_service import CypherLoader
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _enable_patterns(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
monkeypatch.setattr(settings, "patterns_enabled", True)
|
||||
|
||||
|
||||
def test_all_community_patterns_have_metadata() -> None:
|
||||
for pattern_id in COMMUNITY_PATTERN_IDS:
|
||||
assert pattern_id in PATTERN_METADATA, f"Missing metadata for {pattern_id}"
|
||||
|
||||
|
||||
def test_all_community_patterns_have_query_files() -> None:
|
||||
for query_name in COMMUNITY_PATTERN_QUERIES.values():
|
||||
try:
|
||||
CypherLoader.load(query_name)
|
||||
except FileNotFoundError:
|
||||
pytest.fail(f"Missing .cypher file for query {query_name}.cypher")
|
||||
finally:
|
||||
CypherLoader.clear_cache()
|
||||
|
||||
|
||||
def test_pattern_metadata_has_required_fields() -> None:
|
||||
for pid, meta in PATTERN_METADATA.items():
|
||||
assert "name_pt" in meta, f"{pid} missing name_pt"
|
||||
assert "name_en" in meta, f"{pid} missing name_en"
|
||||
assert "desc_pt" in meta, f"{pid} missing desc_pt"
|
||||
assert "desc_en" in meta, f"{pid} missing desc_en"
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_list_patterns_endpoint(client: AsyncClient) -> None:
|
||||
response = await client.get("/api/v1/patterns/")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert "patterns" in data
|
||||
assert len(data["patterns"]) == 8
|
||||
|
||||
ids = {row["id"] for row in data["patterns"]}
|
||||
assert ids == set(COMMUNITY_PATTERN_IDS)
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_patterns_endpoint_returns_503_when_disabled(
|
||||
client: AsyncClient,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
monkeypatch.setattr(settings, "patterns_enabled", False)
|
||||
response = await client.get("/api/v1/patterns/")
|
||||
assert response.status_code == 503
|
||||
assert "temporarily unavailable" in response.json()["detail"]
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_invalid_pattern_returns_404(client: AsyncClient) -> None:
|
||||
response = await client.get("/api/v1/patterns/test-id/nonexistent_pattern")
|
||||
assert response.status_code == 404
|
||||
assert "Pattern not found" in response.json()["detail"]
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_patterns_endpoint_forwards_include_probable(client: AsyncClient) -> None:
|
||||
with patch("bracc.routers.patterns.run_all_patterns", new_callable=AsyncMock) as mock_run_all:
|
||||
mock_run_all.return_value = []
|
||||
response = await client.get("/api/v1/patterns/test-id?include_probable=true")
|
||||
|
||||
assert response.status_code == 200
|
||||
mock_run_all.assert_awaited_once()
|
||||
_driver, entity_id, _lang = mock_run_all.await_args.args
|
||||
assert entity_id == "test-id"
|
||||
assert mock_run_all.await_args.kwargs["include_probable"] is True
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_specific_pattern_endpoint_forwards_include_probable(client: AsyncClient) -> None:
|
||||
with patch("bracc.routers.patterns.run_pattern", new_callable=AsyncMock) as mock_run_one:
|
||||
mock_run_one.return_value = []
|
||||
response = await client.get(
|
||||
"/api/v1/patterns/test-id/debtor_contracts?include_probable=true",
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
mock_run_one.assert_awaited_once()
|
||||
_session, pattern_name, entity_id, _lang = mock_run_one.await_args.args
|
||||
assert pattern_name == "debtor_contracts"
|
||||
assert entity_id == "test-id"
|
||||
assert mock_run_one.await_args.kwargs["include_probable"] is True
|
||||
|
||||
|
||||
def test_community_queries_use_bind_params() -> None:
|
||||
for query_name in COMMUNITY_PATTERN_QUERIES.values():
|
||||
try:
|
||||
cypher = CypherLoader.load(query_name)
|
||||
finally:
|
||||
CypherLoader.clear_cache()
|
||||
assert "$company_id" in cypher, f"{query_name}.cypher missing $company_id"
|
||||
assert "$company_identifier" in cypher, f"{query_name}.cypher missing $company_identifier"
|
||||
assert "$company_identifier_formatted" in cypher, (
|
||||
f"{query_name}.cypher missing $company_identifier_formatted"
|
||||
)
|
||||
assert "${" not in cypher, f"{query_name}.cypher uses unsafe string interpolation"
|
||||
|
||||
|
||||
def test_no_banned_words_in_pattern_metadata() -> None:
|
||||
banned = {"suspicious", "corrupt", "criminal", "fraudulent", "illegal", "guilty"}
|
||||
for pid, meta in PATTERN_METADATA.items():
|
||||
for key, value in meta.items():
|
||||
for word in banned:
|
||||
assert word not in value.lower(), (
|
||||
f"Banned word '{word}' in {pid}.{key}: {value}"
|
||||
)
|
||||
@@ -1,79 +0,0 @@
|
||||
"""Community public-safe pattern registry and query contract tests."""
|
||||
|
||||
import pytest
|
||||
|
||||
from bracc.models.pattern import PATTERN_METADATA
|
||||
from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES
|
||||
from bracc.services.neo4j_service import CypherLoader
|
||||
|
||||
|
||||
def test_community_pattern_registry_exact_ids() -> None:
|
||||
assert len(COMMUNITY_PATTERN_IDS) == 8
|
||||
assert set(COMMUNITY_PATTERN_IDS) == {
|
||||
"sanctioned_still_receiving",
|
||||
"amendment_beneficiary_contracts",
|
||||
"split_contracts_below_threshold",
|
||||
"contract_concentration",
|
||||
"embargoed_receiving",
|
||||
"debtor_contracts",
|
||||
"srp_multi_org_hitchhiking",
|
||||
"inexigibility_recurrence",
|
||||
}
|
||||
|
||||
|
||||
def test_community_pattern_query_mapping_is_complete() -> None:
|
||||
assert set(COMMUNITY_PATTERN_QUERIES.keys()) == set(COMMUNITY_PATTERN_IDS)
|
||||
for query_name in COMMUNITY_PATTERN_QUERIES.values():
|
||||
assert query_name.startswith("public_pattern_")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values())
|
||||
def test_public_pattern_query_files_load(query_name: str) -> None:
|
||||
try:
|
||||
CypherLoader.load(query_name)
|
||||
finally:
|
||||
CypherLoader.clear_cache()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values())
|
||||
def test_public_pattern_query_required_return_aliases(query_name: str) -> None:
|
||||
try:
|
||||
cypher = CypherLoader.load(query_name)
|
||||
finally:
|
||||
CypherLoader.clear_cache()
|
||||
|
||||
for required_alias in (
|
||||
" AS pattern_id",
|
||||
" AS risk_signal",
|
||||
" AS amount_total",
|
||||
" AS window_start",
|
||||
" AS window_end",
|
||||
" AS evidence_refs",
|
||||
" AS evidence_count",
|
||||
):
|
||||
assert required_alias in cypher, f"{query_name}.cypher missing alias: {required_alias}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pattern_id", COMMUNITY_PATTERN_IDS)
|
||||
def test_community_pattern_metadata_is_present(pattern_id: str) -> None:
|
||||
meta = PATTERN_METADATA.get(pattern_id)
|
||||
assert meta is not None
|
||||
assert meta.get("name_pt")
|
||||
assert meta.get("name_en")
|
||||
assert meta.get("desc_pt")
|
||||
assert meta.get("desc_en")
|
||||
|
||||
|
||||
def test_threshold_params_used_in_threshold_patterns() -> None:
|
||||
query_params = {
|
||||
"public_pattern_split_contracts_below_threshold": "$pattern_split_threshold_value",
|
||||
"public_pattern_contract_concentration": "$pattern_share_threshold",
|
||||
"public_pattern_srp_multi_org_hitchhiking": "$pattern_srp_min_orgs",
|
||||
"public_pattern_inexigibility_recurrence": "$pattern_inexig_min_recurrence",
|
||||
}
|
||||
for query_name, required_param in query_params.items():
|
||||
try:
|
||||
cypher = CypherLoader.load(query_name)
|
||||
finally:
|
||||
CypherLoader.clear_cache()
|
||||
assert required_param in cypher, f"{query_name}.cypher missing {required_param}"
|
||||
@@ -225,6 +225,135 @@ async def test_public_graph_company_filters_person_nodes(client: AsyncClient) ->
|
||||
assert len(payload["edges"]) == 0
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_baseline_disabled_in_public_mode(
|
||||
client: AsyncClient,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
monkeypatch.setattr(settings, "public_mode", True)
|
||||
monkeypatch.setattr(settings, "public_allow_entity_lookup", False)
|
||||
response = await client.get("/api/v1/baseline/test-id")
|
||||
assert response.status_code == 403
|
||||
assert "disabled in public mode" in response.json()["detail"]
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_stats_hides_person_count_in_public_mode(
|
||||
client: AsyncClient,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
monkeypatch.setattr(settings, "public_mode", True)
|
||||
monkeypatch.setattr(settings, "public_allow_person", False)
|
||||
# Clear stats cache to ensure fresh computation
|
||||
import bracc.routers.meta as meta_mod
|
||||
monkeypatch.setattr(meta_mod, "_stats_cache", None)
|
||||
|
||||
fake_record = {
|
||||
"total_nodes": 100,
|
||||
"total_relationships": 200,
|
||||
"person_count": 999,
|
||||
"company_count": 50,
|
||||
"health_count": 10,
|
||||
"finance_count": 5,
|
||||
"contract_count": 20,
|
||||
"sanction_count": 3,
|
||||
"election_count": 7,
|
||||
"amendment_count": 4,
|
||||
"embargo_count": 2,
|
||||
"education_count": 6,
|
||||
"convenio_count": 8,
|
||||
"laborstats_count": 9,
|
||||
"offshore_entity_count": 1,
|
||||
"offshore_officer_count": 2,
|
||||
"global_pep_count": 3,
|
||||
"cvm_proceeding_count": 4,
|
||||
"expense_count": 11,
|
||||
"pep_record_count": 12,
|
||||
"expulsion_count": 13,
|
||||
"leniency_count": 14,
|
||||
"international_sanction_count": 15,
|
||||
"gov_card_expense_count": 16,
|
||||
"gov_travel_count": 17,
|
||||
"bid_count": 18,
|
||||
"fund_count": 19,
|
||||
"dou_act_count": 20,
|
||||
"tax_waiver_count": 21,
|
||||
"municipal_finance_count": 22,
|
||||
"declared_asset_count": 23,
|
||||
"party_membership_count": 24,
|
||||
"barred_ngo_count": 25,
|
||||
"bcb_penalty_count": 26,
|
||||
"labor_movement_count": 27,
|
||||
"legal_case_count": 28,
|
||||
"judicial_case_count": 29,
|
||||
"source_document_count": 30,
|
||||
"ingestion_run_count": 31,
|
||||
"temporal_violation_count": 32,
|
||||
"cpi_count": 33,
|
||||
"inquiry_requirement_count": 34,
|
||||
"inquiry_session_count": 35,
|
||||
"municipal_bid_count": 36,
|
||||
"municipal_contract_count": 37,
|
||||
"municipal_gazette_act_count": 38,
|
||||
}
|
||||
with patch(
|
||||
"bracc.routers.meta.execute_query_single",
|
||||
new_callable=AsyncMock,
|
||||
return_value=fake_record,
|
||||
), patch(
|
||||
"bracc.routers.meta.load_source_registry",
|
||||
return_value=[],
|
||||
), patch(
|
||||
"bracc.routers.meta.source_registry_summary",
|
||||
return_value={
|
||||
"universe_v1_sources": 0,
|
||||
"implemented_sources": 0,
|
||||
"loaded_sources": 0,
|
||||
"healthy_sources": 0,
|
||||
"stale_sources": 0,
|
||||
"blocked_external_sources": 0,
|
||||
"quality_fail_sources": 0,
|
||||
"discovered_uningested_sources": 0,
|
||||
},
|
||||
):
|
||||
response = await client.get("/api/v1/meta/stats")
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert payload["person_count"] == 0
|
||||
assert payload["company_count"] == 50 # non-person counts preserved
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_timeline_sanitizes_properties_in_public_mode(
|
||||
client: AsyncClient,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
monkeypatch.setattr(settings, "public_mode", True)
|
||||
monkeypatch.setattr(settings, "public_allow_entity_lookup", True)
|
||||
mock_records = [
|
||||
{
|
||||
"lbls": ["Contract"],
|
||||
"props": {"type": "licitacao", "cpf": "12345678900", "value": 50000.0},
|
||||
"event_date": "2024-01-15",
|
||||
"id": "evt-1",
|
||||
},
|
||||
]
|
||||
with patch(
|
||||
"bracc.routers.entity.execute_query",
|
||||
new_callable=AsyncMock,
|
||||
return_value=mock_records,
|
||||
):
|
||||
response = await client.get("/api/v1/entity/test-id/timeline")
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert len(payload["events"]) == 1
|
||||
event_props = payload["events"][0]["properties"]
|
||||
assert "cpf" not in event_props
|
||||
assert event_props["value"] == 50000.0
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_investigations_disabled_in_public_mode(
|
||||
client: AsyncClient,
|
||||
|
||||
@@ -1,24 +1,15 @@
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from bracc.config import settings
|
||||
from bracc.middleware.rate_limit import _get_rate_limit_key, limiter
|
||||
from bracc.services.auth_service import create_access_token
|
||||
|
||||
|
||||
def _make_request(
|
||||
auth_header: str | None = None,
|
||||
client_ip: str = "127.0.0.1",
|
||||
cookie_token: str | None = None,
|
||||
x_forwarded_for: str | None = None,
|
||||
) -> MagicMock:
|
||||
def _make_request(auth_header: str | None = None, client_ip: str = "127.0.0.1") -> MagicMock:
|
||||
request = MagicMock()
|
||||
headers: dict[str, str] = {}
|
||||
if auth_header:
|
||||
headers["authorization"] = auth_header
|
||||
if x_forwarded_for:
|
||||
headers["x-forwarded-for"] = x_forwarded_for
|
||||
request.headers = headers
|
||||
request.cookies = {settings.auth_cookie_name: cookie_token} if cookie_token else {}
|
||||
request.client = MagicMock()
|
||||
request.client.host = client_ip
|
||||
return request
|
||||
@@ -43,23 +34,5 @@ def test_key_func_invalid_token_fallback() -> None:
|
||||
assert key == "10.0.0.1"
|
||||
|
||||
|
||||
def test_key_func_extracts_user_from_cookie_token() -> None:
|
||||
token = create_access_token("cookie-user-1")
|
||||
request = _make_request(cookie_token=token)
|
||||
key = _get_rate_limit_key(request)
|
||||
assert key == "user:cookie-user-1"
|
||||
|
||||
|
||||
def test_key_func_uses_forwarded_ip_when_enabled() -> None:
|
||||
original = settings.trust_proxy_headers
|
||||
try:
|
||||
settings.trust_proxy_headers = True
|
||||
request = _make_request(client_ip="127.0.0.1", x_forwarded_for="203.0.113.9, 10.0.0.4")
|
||||
key = _get_rate_limit_key(request)
|
||||
assert key == "203.0.113.9"
|
||||
finally:
|
||||
settings.trust_proxy_headers = original
|
||||
|
||||
|
||||
def test_limiter_instance_exists() -> None:
|
||||
assert limiter is not None
|
||||
|
||||
@@ -1,21 +1,6 @@
|
||||
import pytest
|
||||
from httpx import AsyncClient
|
||||
|
||||
from bracc.routers.search import _escape_lucene
|
||||
|
||||
|
||||
def test_escape_lucene_cnpj() -> None:
|
||||
assert _escape_lucene("00.000.000/0001-00") == "00.000.000\\/0001\\-00"
|
||||
|
||||
|
||||
def test_escape_lucene_plain_text() -> None:
|
||||
assert _escape_lucene("silva construcoes") == "silva construcoes"
|
||||
|
||||
|
||||
def test_escape_lucene_all_special_chars() -> None:
|
||||
for ch in r'+-&|!(){}[]^"~*?:\/':
|
||||
assert f"\\{ch}" in _escape_lucene(ch)
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_search_rejects_short_query(client: AsyncClient) -> None:
|
||||
|
||||
100
api/uv.lock
generated
100
api/uv.lock
generated
@@ -103,6 +103,56 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bracc-api"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "bcrypt" },
|
||||
{ name = "fastapi" },
|
||||
{ name = "jinja2" },
|
||||
{ name = "neo4j" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pydantic-settings" },
|
||||
{ name = "pyjwt", extra = ["crypto"] },
|
||||
{ name = "python-multipart" },
|
||||
{ name = "slowapi" },
|
||||
{ name = "uvicorn", extra = ["standard"] },
|
||||
{ name = "weasyprint" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
dev = [
|
||||
{ name = "httpx" },
|
||||
{ name = "mypy" },
|
||||
{ name = "pytest" },
|
||||
{ name = "pytest-asyncio" },
|
||||
{ name = "ruff" },
|
||||
{ name = "testcontainers", extra = ["neo4j"] },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "bcrypt", specifier = ">=4.0.0" },
|
||||
{ name = "fastapi", specifier = ">=0.115.0" },
|
||||
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
|
||||
{ name = "jinja2", specifier = ">=3.1.0" },
|
||||
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" },
|
||||
{ name = "neo4j", specifier = ">=5.27.0" },
|
||||
{ name = "pydantic", specifier = ">=2.10.0" },
|
||||
{ name = "pydantic-settings", specifier = ">=2.7.0" },
|
||||
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
|
||||
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
|
||||
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
|
||||
{ name = "python-multipart", specifier = ">=0.0.18" },
|
||||
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
|
||||
{ name = "slowapi", specifier = ">=0.1.9" },
|
||||
{ name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" },
|
||||
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" },
|
||||
{ name = "weasyprint", specifier = ">=62.0" },
|
||||
]
|
||||
provides-extras = ["dev"]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "1.2.0"
|
||||
@@ -523,56 +573,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bracc-api"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "bcrypt" },
|
||||
{ name = "fastapi" },
|
||||
{ name = "jinja2" },
|
||||
{ name = "neo4j" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pydantic-settings" },
|
||||
{ name = "pyjwt", extra = ["crypto"] },
|
||||
{ name = "python-multipart" },
|
||||
{ name = "slowapi" },
|
||||
{ name = "uvicorn", extra = ["standard"] },
|
||||
{ name = "weasyprint" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
dev = [
|
||||
{ name = "httpx" },
|
||||
{ name = "mypy" },
|
||||
{ name = "pytest" },
|
||||
{ name = "pytest-asyncio" },
|
||||
{ name = "ruff" },
|
||||
{ name = "testcontainers", extra = ["neo4j"] },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "bcrypt", specifier = ">=4.0.0" },
|
||||
{ name = "fastapi", specifier = ">=0.115.0" },
|
||||
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
|
||||
{ name = "jinja2", specifier = ">=3.1.0" },
|
||||
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" },
|
||||
{ name = "neo4j", specifier = ">=5.27.0" },
|
||||
{ name = "pydantic", specifier = ">=2.10.0" },
|
||||
{ name = "pydantic-settings", specifier = ">=2.7.0" },
|
||||
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
|
||||
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
|
||||
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
|
||||
{ name = "python-multipart", specifier = ">=0.0.18" },
|
||||
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
|
||||
{ name = "slowapi", specifier = ">=0.1.9" },
|
||||
{ name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" },
|
||||
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" },
|
||||
{ name = "weasyprint", specifier = ">=62.0" },
|
||||
]
|
||||
provides-extras = ["dev"]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.11"
|
||||
|
||||
0
data/.gitkeep
Normal file
0
data/.gitkeep
Normal file
0
data/cnpj/extracted/.gitkeep
Normal file
0
data/cnpj/extracted/.gitkeep
Normal file
0
data/cnpj/raw/.gitkeep
Normal file
0
data/cnpj/raw/.gitkeep
Normal file
0
data/cnpj/reference/.gitkeep
Normal file
0
data/cnpj/reference/.gitkeep
Normal file
Binary file not shown.
|
Before Width: | Height: | Size: 1.1 MiB |
Binary file not shown.
|
Before Width: | Height: | Size: 2.3 MiB |
@@ -1,24 +1,13 @@
|
||||
# BRACC Data Source Catalog
|
||||
# ICARUS Data Source Catalog
|
||||
|
||||
<!-- SOURCE_SUMMARY_START -->
|
||||
**Generated from `docs/source_registry_br_v1.csv` (as-of UTC: 2026-03-01T23:05:00Z)**
|
||||
|
||||
- Universe v1 sources: 108
|
||||
- Implemented pipelines: 45
|
||||
- Loaded sources (load_state=loaded): 36
|
||||
- Partial sources (load_state=partial): 8
|
||||
- Not loaded sources (load_state=not_loaded): 64
|
||||
- Status counts: loaded=36, partial=5, stale=3, blocked_external=1, not_built=63
|
||||
<!-- SOURCE_SUMMARY_END -->
|
||||
|
||||
Catalog note: counts and status labels are generated from the public registry (`docs/source_registry_br_v1.csv`).
|
||||
This document includes reference production inventory context and backlog discovery; it is not a guarantee that every listed source is currently loaded in your local environment.
|
||||
**38 loaded | 3 pipelines pending data | 60+ not yet built**
|
||||
Last updated: 2026-02-26
|
||||
|
||||
---
|
||||
|
||||
## 1. Reference Production Snapshot (Loaded/Implemented Inventory)
|
||||
## 1. LOADED (38 sources)
|
||||
|
||||
The table below is a timestamped reference snapshot and should be interpreted together with the generated summary block above.
|
||||
All sources below have working ETL pipelines in `etl/src/icarus_etl/pipelines/` and are loaded into production Neo4j.
|
||||
|
||||
| # | Source | Pipeline | Nodes Created | Rels Created | Notes |
|
||||
|---|--------|----------|---------------|--------------|-------|
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
# Demo Dataset Contract (WTG Open)
|
||||
|
||||
## Objective
|
||||
Provide a reproducible, public-safe demo graph with synthetic records only.
|
||||
|
||||
## Safety rules
|
||||
- Synthetic data only. No real CPF, no real personal names, no real personal addresses.
|
||||
- Company identifiers may use synthetic CNPJ-like values reserved for demonstration.
|
||||
- Demo graph cannot include `Person` or `Partner` labels.
|
||||
- Demo exports must never include private or operational metadata.
|
||||
|
||||
## Required files
|
||||
- `data/demo/synthetic_graph.json`
|
||||
- `data/demo/README.md`
|
||||
- `scripts/generate_demo_dataset.py`
|
||||
|
||||
## JSON schema (minimum)
|
||||
- `nodes[]`: `{id, label, type, properties}`
|
||||
- `edges[]`: `{id, source, target, type, properties}`
|
||||
- `meta`: `{generated_at_utc, generator_version, source: "synthetic"}`
|
||||
|
||||
## Acceptance checks
|
||||
- No field name contains `cpf`, `doc_partial`, or `doc_raw`.
|
||||
- No node label equals `Person` or `Partner`.
|
||||
- CI privacy gate passes.
|
||||
|
||||
## Runtime target
|
||||
- Dedicated demo Neo4j instance (non-production).
|
||||
- Public API served with `PUBLIC_MODE=true`.
|
||||
@@ -14,7 +14,6 @@ Resumo:
|
||||
Release notes: {release_url}
|
||||
|
||||
Observação de integridade: os sinais refletem coocorrências em bases públicas e não constituem prova legal.
|
||||
Divulgação obrigatória: o repositório público entrega engine + demo + fluxo BYO-data; métricas de escala são snapshots de referência com timestamp.
|
||||
|
||||
## Short post (EN)
|
||||
|
||||
@@ -28,7 +27,6 @@ Summary:
|
||||
Release notes: {release_url}
|
||||
|
||||
Integrity note: signals reflect co-occurrence in public records and are not legal proof.
|
||||
Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; production-scale metrics are timestamped reference snapshots.
|
||||
|
||||
## Discord/Telegram long form (PT+EN)
|
||||
|
||||
@@ -44,11 +42,6 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p
|
||||
**Compatibilidade**
|
||||
- {pt_compat}
|
||||
|
||||
**Reproducibility Reality Check**
|
||||
- Funciona agora: {pt_works_now}
|
||||
- Requer ingestão de dados: {pt_requires_ingestion}
|
||||
- Não incluído por padrão: {pt_not_included}
|
||||
|
||||
**Link**
|
||||
- {release_url}
|
||||
|
||||
@@ -64,10 +57,5 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p
|
||||
**Compatibility**
|
||||
- {en_compat}
|
||||
|
||||
**Reproducibility Reality Check**
|
||||
- Works now: {en_works_now}
|
||||
- Requires data ingestion: {en_requires_ingestion}
|
||||
- Not included by default: {en_not_included}
|
||||
|
||||
**Link**
|
||||
- {release_url}
|
||||
|
||||
@@ -7,8 +7,8 @@ docs/**,PUBLIC with review,Keep public documentation and legal pack,include revi
|
||||
.github/workflows/**,PUBLIC,CI and security transparency,include
|
||||
scripts/**,PUBLIC with review,Keep public utilities and gates,include reviewed subset
|
||||
data/demo/**,PUBLIC,Synthetic demo dataset only,include
|
||||
api/src/bracc/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude
|
||||
api/src/bracc/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude
|
||||
api/src/icarus/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude
|
||||
api/src/icarus/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude
|
||||
scripts/auto_finalize_pncp_backfill.sh,REMOVE_FROM_PUBLIC,Production operational script with server-specific assumptions,exclude
|
||||
docs/shadow_rollout_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude
|
||||
docs/ingestion_priority_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude
|
||||
|
||||
|
@@ -1,56 +1,78 @@
|
||||
# Public Repo Release Checklist — `World-Open-Graph/br-acc`
|
||||
|
||||
## 1) Pre-release gate
|
||||
|
||||
1. Confirm target merge commit exists on `main`.
|
||||
2. Confirm CI + Security + Public gates are green on that commit.
|
||||
3. Confirm PR is merged with exactly one release label.
|
||||
|
||||
## 2) Public boundary checks
|
||||
# Public Repo Release Checklist — World Transparency Graph
|
||||
|
||||
## 1) Prepare sanitized snapshot
|
||||
```bash
|
||||
python scripts/check_public_privacy.py --repo-root .
|
||||
python scripts/check_compliance_pack.py --repo-root .
|
||||
python scripts/check_open_core_boundary.py --repo-root .
|
||||
bash scripts/prepare_public_snapshot.sh /path/to/private-repo /tmp/world-transparency-graph-public
|
||||
```
|
||||
|
||||
Expected: all `PASS`.
|
||||
|
||||
## 3) Snapshot hygiene (optional verification)
|
||||
|
||||
## 2) Initialize clean-history repo from snapshot
|
||||
```bash
|
||||
bash scripts/prepare_public_snapshot.sh . /tmp/br-acc-public
|
||||
python /tmp/br-acc-public/scripts/check_public_privacy.py --repo-root /tmp/br-acc-public
|
||||
python /tmp/br-acc-public/scripts/check_compliance_pack.py --repo-root /tmp/br-acc-public
|
||||
python /tmp/br-acc-public/scripts/check_open_core_boundary.py --repo-root /tmp/br-acc-public
|
||||
cd /tmp/world-transparency-graph-public
|
||||
git init
|
||||
git add .
|
||||
git commit -m "Initial public release (WTG)"
|
||||
```
|
||||
|
||||
Expected in snapshot:
|
||||
## 3) Create GitHub repository (manual)
|
||||
- Owner: `brunoclz`
|
||||
- Name: `world-transparency-graph`
|
||||
- Visibility: Public
|
||||
- Do not auto-add README/License (already present)
|
||||
|
||||
- No `CLAUDE.md`.
|
||||
- No `AGENTS.md` or `AGENTS*.md`.
|
||||
- No private operational runbooks outside public scope.
|
||||
## 4) Push initial release
|
||||
```bash
|
||||
git branch -M main
|
||||
git remote add origin https://github.com/brunoclz/world-transparency-graph.git
|
||||
git push -u origin main
|
||||
```
|
||||
|
||||
## 4) Publish release (manual workflow)
|
||||
## 5) Configure branch protection (GitHub UI)
|
||||
Require all checks:
|
||||
- `API (Python)`
|
||||
- `ETL (Python)`
|
||||
- `Frontend (TypeScript)`
|
||||
- `Neutrality Audit`
|
||||
- `Gitleaks`
|
||||
- `Bandit (Python)`
|
||||
- `Pip Audit (Python deps)`
|
||||
- `Public Privacy Gate`
|
||||
- `Compliance Pack Gate`
|
||||
- `Public Boundary Gate`
|
||||
|
||||
In GitHub Actions, run **Publish Release** with:
|
||||
## 6) Configure environment defaults
|
||||
- Set public deployment environment vars:
|
||||
- `PRODUCT_TIER=community`
|
||||
- `PUBLIC_MODE=true`
|
||||
- `PUBLIC_ALLOW_PERSON=false`
|
||||
- `PUBLIC_ALLOW_ENTITY_LOOKUP=false`
|
||||
- `PUBLIC_ALLOW_INVESTIGATIONS=false`
|
||||
- `PATTERNS_ENABLED=false`
|
||||
- `VITE_PUBLIC_MODE=true`
|
||||
- `VITE_PATTERNS_ENABLED=false`
|
||||
|
||||
- `version`: SemVer tag (e.g. `v0.3.0`, `v0.3.1-rc.1`)
|
||||
- `target_sha`: merge commit on `main`
|
||||
- `prerelease`: `false` (stable) or `true` (RC)
|
||||
- `title_pt`: release title PT-BR
|
||||
- `title_en`: release title EN
|
||||
## 7) Final checks before launch
|
||||
- `python scripts/check_public_privacy.py --repo-root .` => `PASS`
|
||||
- `python scripts/check_compliance_pack.py --repo-root .` => `PASS`
|
||||
- `python scripts/check_open_core_boundary.py --repo-root .` => `PASS`
|
||||
- Confirm no internal runbooks in public repo
|
||||
- Confirm demo data is synthetic (`data/demo/synthetic_graph.json`)
|
||||
- Confirm all legal docs exist in root:
|
||||
- `ETHICS.md`
|
||||
- `LGPD.md`
|
||||
- `PRIVACY.md`
|
||||
- `TERMS.md`
|
||||
- `DISCLAIMER.md`
|
||||
- `SECURITY.md`
|
||||
- `ABUSE_RESPONSE.md`
|
||||
|
||||
## 5) Verify outputs
|
||||
## 8) Launch communication split
|
||||
- Publish product announcement as **WTG**
|
||||
- Publish movement announcement as **BRCC**
|
||||
- Mention methodology limits and non-accusatory policy
|
||||
|
||||
1. Tag exists in repository.
|
||||
2. Release page published under `/releases`.
|
||||
3. Notes include PT+EN and non-accusatory disclaimer.
|
||||
4. `release_manifest.json` asset is attached.
|
||||
5. Compare link is valid (`previous_tag...new_tag`).
|
||||
|
||||
## 6) Community communication
|
||||
|
||||
1. Use `docs/release/community_announcement_template.md`.
|
||||
2. Publish short PT+EN summary with release URL.
|
||||
3. Keep wording factual: “signals/co-occurrence”, never accusatory language.
|
||||
## 9) Release system bootstrap
|
||||
- Ensure `.github/release.yml` exists for auto-notes categories.
|
||||
- Ensure `.github/release-drafter.yml` + workflow are active.
|
||||
- Ensure `publish-release.yml` workflow is present and dispatchable.
|
||||
- Ensure release label taxonomy is documented and applied to PRs.
|
||||
- Publish first policy-compliant tag from this stream (`v0.3.0`).
|
||||
|
||||
@@ -48,11 +48,10 @@ A release can only be published from a commit on `main` where all required gates
|
||||
Every release must include PT-BR and EN sections with:
|
||||
|
||||
1. Scope summary.
|
||||
2. Notable changes (explicit bullet points).
|
||||
3. Included pattern IDs when release contains pattern/signal changes.
|
||||
4. Compatibility/breaking notes.
|
||||
5. Privacy/compliance notes when applicable.
|
||||
6. Non-accusatory disclaimer.
|
||||
2. Notable changes.
|
||||
3. Compatibility/breaking notes.
|
||||
4. Privacy/compliance notes when applicable.
|
||||
5. Non-accusatory disclaimer.
|
||||
|
||||
## Artifacts
|
||||
|
||||
|
||||
@@ -37,19 +37,6 @@ For validation cycles use RC:
|
||||
- `prerelease`: `true` for RC, `false` for stable
|
||||
- `title_pt`: short PT-BR title
|
||||
- `title_en`: short EN title
|
||||
- `highlights_pt`: PT highlights separated by `|`
|
||||
- `highlights_en`: EN highlights separated by `|`
|
||||
- `patterns_included`: comma-separated pattern IDs (use `none` when not applicable)
|
||||
- `technical_changes_pt`: PT technical changes separated by `|`
|
||||
- `technical_changes_en`: EN technical changes separated by `|`
|
||||
|
||||
Example inputs for a pattern release:
|
||||
|
||||
- `highlights_pt`: `Port de 8 padrões públicos factuais | Padronização de payload público`
|
||||
- `highlights_en`: `Port of 8 factual public-safe patterns | Public payload standardization`
|
||||
- `patterns_included`: `sanctioned_still_receiving,amendment_beneficiary_contracts,split_contracts_below_threshold,contract_concentration,embargoed_receiving,debtor_contracts,srp_multi_org_hitchhiking,inexigibility_recurrence`
|
||||
- `technical_changes_pt`: `Provider community de 4 para 8 padrões | ETL criou relação Contract-REFERENTE_A-Bid`
|
||||
- `technical_changes_en`: `Community provider expanded from 4 to 8 patterns | ETL created Contract-REFERENTE_A-Bid linkage`
|
||||
|
||||
## 4) Workflow validations performed
|
||||
|
||||
@@ -65,7 +52,7 @@ The workflow blocks publication when:
|
||||
On success the workflow:
|
||||
|
||||
1. Creates and pushes an annotated tag.
|
||||
2. Creates GitHub Release (PT+EN notes) with explicit highlights, patterns, and technical changes.
|
||||
2. Creates GitHub Release (PT+EN notes).
|
||||
3. Uploads `release_manifest.json` asset.
|
||||
|
||||
## 6) Post-release checklist
|
||||
@@ -73,7 +60,6 @@ On success the workflow:
|
||||
1. Open the release page and confirm:
|
||||
- version tag is correct,
|
||||
- PT+EN notes are present,
|
||||
- included patterns are explicitly listed (or marked as none),
|
||||
- non-accusatory disclaimer line is present,
|
||||
- `release_manifest.json` is attached.
|
||||
2. Share release link in community channels.
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
# Source Onboarding Contract (Brazil Coverage v1)
|
||||
|
||||
This contract is mandatory for every new source before `shadow -> promote`.
|
||||
|
||||
## 1. Source Identity
|
||||
- `source_id`:
|
||||
- `name`:
|
||||
- `category`:
|
||||
- `tier`:
|
||||
- `owner_agent`:
|
||||
- `primary_url`:
|
||||
- `access_mode` (`file|api|bigquery|web`):
|
||||
- `public_access_mode` (`open|open_with_rate_limit|registration|credentialed_public`):
|
||||
- `discovery_status` (`discovered|discovered_uningested|monitored|unreachable`):
|
||||
- `last_seen_url`:
|
||||
- `cadence_expected`:
|
||||
- `cadence_observed`:
|
||||
- `quality_status` (`healthy|stale|quality_fail|blocked_external|not_built|partial|loaded`):
|
||||
|
||||
## 2. Access and Legal
|
||||
- Credential required:
|
||||
- Secret name/path:
|
||||
- License or usage restriction:
|
||||
- LGPD/privacy considerations:
|
||||
- `blocked_external` criteria:
|
||||
|
||||
## 3. Data Contract
|
||||
- Downloader script: `etl/scripts/download_<source>.py`
|
||||
- Canonical output files:
|
||||
- Manifest file:
|
||||
- Manifest mandatory fields (`run_id`, `source_id`, `window_start`, `window_end`, `rows`, `error`, `checksum`, `retrieved_at_utc`):
|
||||
- Update cadence:
|
||||
- Expected row volume:
|
||||
- Partition/window strategy:
|
||||
|
||||
## 4. Graph Contract
|
||||
- Node labels introduced:
|
||||
- Relationship types introduced:
|
||||
- Natural key(s) per node:
|
||||
- Merge key strategy:
|
||||
- Relationship quality tier (`strong|probable`):
|
||||
- Provenance fields (`method`, `confidence`, `source_ref`, `run_id`):
|
||||
|
||||
## 5. Index and Constraint Contract
|
||||
- Required uniqueness constraints:
|
||||
- Required date indexes:
|
||||
- Required lookup indexes:
|
||||
- Required fulltext indexes (if text-heavy):
|
||||
|
||||
## 6. Quality Gates (Hard Stop/Go)
|
||||
- Identity integrity preserved (`Person.cpf` masked = 0, 14-digit = 0):
|
||||
- Freshness SLA threshold:
|
||||
- Temporal sanity (`<= now + 365d`):
|
||||
- Null/duplicate key thresholds:
|
||||
- Mandatory non-zero nodes/rels:
|
||||
|
||||
## 7. Operational Flow
|
||||
- Shadow load command:
|
||||
- Gate runner commands:
|
||||
- API smoke checks:
|
||||
- Promote command:
|
||||
- Rollback command:
|
||||
|
||||
## 8. Acceptance
|
||||
- Evidence bundle path in `audit-results/`:
|
||||
- Final status: `resolved | resolved_full | blocked_external | quality_fail`
|
||||
- Reviewer sign-off:
|
||||
@@ -1,109 +1,109 @@
|
||||
source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status,last_verified_utc,verification_status
|
||||
cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
|
||||
dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
|
||||
datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
|
||||
senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
|
||||
pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
|
||||
siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=bens,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=filiacao,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
|
||||
mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok
|
||||
querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok
|
||||
datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
|
||||
bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/datajud/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.gov.br/icmbio/pt-br,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
|
||||
tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited
|
||||
tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://tcers.tc.br/fiscalizado/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok
|
||||
state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error
|
||||
source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status
|
||||
cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy
|
||||
tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy
|
||||
transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy
|
||||
sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy
|
||||
pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy
|
||||
bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy
|
||||
pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy
|
||||
ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy
|
||||
comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill,,monitored,,,,stale
|
||||
tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy
|
||||
transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy
|
||||
rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy
|
||||
inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy
|
||||
dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy
|
||||
datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy
|
||||
icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy
|
||||
opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy
|
||||
cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy
|
||||
cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy
|
||||
camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy
|
||||
camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial
|
||||
senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy
|
||||
ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy
|
||||
cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy
|
||||
cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy
|
||||
leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy
|
||||
ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy
|
||||
holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/holding/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy
|
||||
viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy
|
||||
siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial
|
||||
pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale
|
||||
renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy
|
||||
siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial
|
||||
tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/dataset/bens-candidato,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy
|
||||
tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/dataset/filiados-partidos,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy
|
||||
bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/penalidades,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy
|
||||
stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy
|
||||
caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale
|
||||
eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy
|
||||
un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy
|
||||
world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy
|
||||
senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial
|
||||
mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy
|
||||
querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial
|
||||
datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external
|
||||
bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built
|
||||
estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/estban,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built
|
||||
if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/dataset/if-data,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built
|
||||
bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built
|
||||
stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built
|
||||
cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/cnciai/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built
|
||||
carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built
|
||||
anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built
|
||||
aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built
|
||||
anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built
|
||||
antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built
|
||||
ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built
|
||||
anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built
|
||||
anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built
|
||||
antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built
|
||||
ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built
|
||||
anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built
|
||||
susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built
|
||||
cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built
|
||||
receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built
|
||||
mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts,,discovered_uningested,,,,not_built
|
||||
sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built
|
||||
icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built
|
||||
tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built
|
||||
siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built
|
||||
camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built
|
||||
senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built
|
||||
interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built
|
||||
tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.rs.gov.br/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built
|
||||
state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built
|
||||
|
||||
|
@@ -1,7 +1,7 @@
|
||||
[project]
|
||||
name = "bracc-etl"
|
||||
version = "0.1.0"
|
||||
description = "BRACC ETL — Data ingestion pipelines for Brazilian public data"
|
||||
description = "BR-ACC ETL — Data ingestion pipelines for Brazilian public data"
|
||||
requires-python = ">=3.12"
|
||||
license = "AGPL-3.0-or-later"
|
||||
dependencies = [
|
||||
@@ -9,10 +9,11 @@ dependencies = [
|
||||
"pandas>=2.2.0",
|
||||
"httpx>=0.28.0",
|
||||
"click>=8.1.0",
|
||||
"defusedxml>=0.7.1",
|
||||
"pydantic>=2.10.0",
|
||||
"pydantic-settings>=2.7.0",
|
||||
"pypdf>=5.2.0",
|
||||
"defusedxml>=0.7.0",
|
||||
"pandera>=0.21.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import stat
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
@@ -38,12 +36,21 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool:
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
# If we requested a range but server returned full content (200 vs 206),
|
||||
# start fresh to avoid corruption
|
||||
if start_byte > 0 and response.status_code != 206:
|
||||
logger.warning(
|
||||
"Server ignored Range header for %s, restarting download",
|
||||
dest.name,
|
||||
)
|
||||
start_byte = 0
|
||||
|
||||
total = response.headers.get("content-length")
|
||||
total_mb = f"{int(total) / 1e6:.1f} MB" if total else "unknown size"
|
||||
logger.info("Downloading %s (%s)...", dest.name, total_mb)
|
||||
|
||||
mode = "ab" if start_byte > 0 else "wb"
|
||||
downloaded = start_byte
|
||||
mode = "ab" if start_byte > 0 and response.status_code == 206 else "wb"
|
||||
downloaded = start_byte if mode == "ab" else 0
|
||||
with open(partial, mode) as f:
|
||||
for chunk in response.iter_bytes(chunk_size=65_536):
|
||||
f.write(chunk)
|
||||
@@ -58,24 +65,49 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]:
|
||||
"""Extract ZIP and return list of extracted files.
|
||||
def safe_extract_zip(
    zip_path: Path,
    output_dir: Path,
    *,
    max_total_bytes: int = 50 * 1024**3,  # 50GB default (CNPJ zips are huge)
) -> list[Path]:
    """Safely extract ZIP with path traversal and bomb guards.

    Every member is checked before anything is written:

    * each member path, resolved against ``output_dir``, must stay inside
      the resolved ``output_dir``;
    * the sum of the declared uncompressed sizes must not exceed
      ``max_total_bytes``.

    Args:
        zip_path: Archive to extract.
        output_dir: Directory that receives the archive contents.
        max_total_bytes: Upper bound for the total declared uncompressed size.

    Returns:
        ``output_dir / name`` for every name in the archive, or an empty
        list when the archive is corrupted or fails a guard. In both failure
        cases the archive file is deleted (corrupted ZIPs so they can be
        re-downloaded).
    """
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            members = zf.infolist()

            # Check for path traversal
            resolved_output = output_dir.resolve()
            for info in members:
                target = (output_dir / info.filename).resolve()
                if not target.is_relative_to(resolved_output):
                    raise ValueError(
                        f"Path traversal detected in {zip_path.name}: {info.filename}"
                    )

            # Check total uncompressed size (zip bomb guard)
            total_size = sum(info.file_size for info in members)
            if total_size > max_total_bytes:
                raise ValueError(
                    f"ZIP bomb guard: {zip_path.name} would extract to "
                    f"{total_size / 1e9:.1f}GB (limit: {max_total_bytes / 1e9:.1f}GB)"
                )

            names = zf.namelist()
            zf.extractall(output_dir)

        logger.info("Extracted %d files from %s", len(names), zip_path.name)
        return [output_dir / n for n in names]
    except zipfile.BadZipFile:
        logger.warning("Bad ZIP file: %s — deleting for re-download", zip_path.name)
        # missing_ok keeps the cleanup from raising if the file is already gone,
        # matching the unsafe-archive handler below.
        zip_path.unlink(missing_ok=True)
        return []
    except ValueError as exc:
        logger.warning("Unsafe ZIP file %s: %s — deleting", zip_path.name, exc)
        zip_path.unlink(missing_ok=True)
        return []
|
||||
|
||||
|
||||
def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]:
    """Extract ``zip_path`` into ``output_dir`` and return the extracted paths.

    Thin wrapper kept for existing callers: it forwards both arguments to
    ``safe_extract_zip`` and returns that function's result unchanged.
    """
    extracted_paths = safe_extract_zip(zip_path, output_dir)
    return extracted_paths
|
||||
|
||||
|
||||
def validate_csv(
|
||||
@@ -111,60 +143,3 @@ def validate_csv(
|
||||
except Exception as e:
|
||||
logger.warning("Validation failed for %s: %s", path.name, e)
|
||||
return False
|
||||
|
||||
|
||||
def safe_extract_zip(
    archive: zipfile.ZipFile,
    output_dir: Path,
    *,
    max_members: int = 50_000,
    max_uncompressed_bytes: int = 5_000_000_000,
) -> list[Path]:
    """Safely extract a ZIP archive.

    Blocks path traversal, symlinks, and oversized archives. The whole
    archive is validated before the first byte is written, so a rejected
    archive leaves ``output_dir`` untouched instead of partially populated.

    Args:
        archive: An already opened ZIP archive.
        output_dir: Directory that receives the extracted members.
        max_members: Maximum number of entries the archive may contain.
        max_uncompressed_bytes: Maximum total declared size of all file
            entries.

    Returns:
        Resolved paths of the extracted files (directory entries are created
        but not listed).

    Raises:
        ValueError: If the archive has too many entries, contains a symlink
            entry, contains a member that resolves outside ``output_dir``,
            or exceeds ``max_uncompressed_bytes``.
    """
    output_root = output_dir.resolve()
    infos = archive.infolist()
    if len(infos) > max_members:
        msg = f"ZIP has too many entries ({len(infos)} > {max_members})"
        raise ValueError(msg)

    # Pass 1: validate every entry and remember where it will be written.
    planned: list[tuple[zipfile.ZipInfo, Path]] = []
    uncompressed_total = 0
    for info in infos:
        member_name = info.filename.replace("\\", "/")
        if not member_name:
            continue

        # Reject symlink entries (Unix mode lives in the high 16 bits).
        mode = info.external_attr >> 16
        if stat.S_ISLNK(mode):
            msg = f"ZIP contains symlink entry: {member_name}"
            raise ValueError(msg)

        target = (output_dir / member_name).resolve()
        try:
            target.relative_to(output_root)
        except ValueError as exc:
            msg = f"Path traversal detected: {member_name}"
            raise ValueError(msg) from exc

        if not info.is_dir():
            uncompressed_total += info.file_size
            if uncompressed_total > max_uncompressed_bytes:
                msg = (
                    f"ZIP exceeds max extracted size "
                    f"({uncompressed_total} > {max_uncompressed_bytes})"
                )
                raise ValueError(msg)

        planned.append((info, target))

    # Pass 2: the archive is known to be acceptable, write it out.
    extracted: list[Path] = []
    for info, target in planned:
        if info.is_dir():
            target.mkdir(parents=True, exist_ok=True)
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        with archive.open(info, "r") as source, target.open("wb") as destination:
            shutil.copyfileobj(source, destination)
        extracted.append(target)

    return extracted
|
||||
|
||||
@@ -5,9 +5,9 @@ Streams microdados_movimentacao year-by-year to separate CSVs for
|
||||
resumability and memory management on large datasets.
|
||||
|
||||
Usage:
|
||||
python etl/scripts/download_caged.py --billing-project bracc-corruptos
|
||||
python etl/scripts/download_caged.py --billing-project bracc-corruptos --start-year 2024
|
||||
python etl/scripts/download_caged.py --billing-project bracc-corruptos --skip-existing
|
||||
python etl/scripts/download_caged.py --billing-project icarus-corruptos
|
||||
python etl/scripts/download_caged.py --billing-project icarus-corruptos --start-year 2024
|
||||
python etl/scripts/download_caged.py --billing-project icarus-corruptos --skip-existing
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -413,7 +413,7 @@ def _write_manifest(
|
||||
)
|
||||
@click.option(
|
||||
"--billing-project",
|
||||
default="bracc-corruptos",
|
||||
default="icarus-corruptos",
|
||||
help="GCP billing project for BQ mode.",
|
||||
)
|
||||
@click.option(
|
||||
|
||||
@@ -6,15 +6,21 @@ Usage:
|
||||
python etl/scripts/download_cnpj.py --reference-only # reference tables only (tiny)
|
||||
python etl/scripts/download_cnpj.py --files 1 # just first file of each type
|
||||
python etl/scripts/download_cnpj.py --types Empresas # specific type only
|
||||
python etl/scripts/download_cnpj.py --release 2026-03 # pin to specific monthly release
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import httpx
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from _download_utils import download_file, extract_zip, validate_csv
|
||||
@@ -22,7 +28,13 @@ from _download_utils import download_file, extract_zip, validate_csv
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/"
|
||||
# Receita Federal Nextcloud (primary since Jan 2026)
|
||||
NEXTCLOUD_BASE = "https://arquivos.receitafederal.gov.br/s/{token}/download?path=%2F&files="
|
||||
KNOWN_TOKENS = ["gn672Ad4CF8N6TK", "YggdBLfdninEJX9"]
|
||||
|
||||
# Legacy URLs (dadosabertos.rfb.gov.br decommissioned Jan 2026)
|
||||
LEGACY_NEW_BASE_PATTERN = "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/{year_month}/"
|
||||
LEGACY_BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/"
|
||||
|
||||
MAIN_TYPES = ["Empresas", "Socios", "Estabelecimentos"]
|
||||
REFERENCE_FILES = [
|
||||
@@ -48,6 +60,126 @@ EXPECTED_COLS = {
|
||||
}
|
||||
|
||||
|
||||
def _previous_month(year: int, month: int) -> tuple[int, int]:
|
||||
"""Return (year, month) for the previous month."""
|
||||
if month == 1:
|
||||
return year - 1, 12
|
||||
return year, month - 1
|
||||
|
||||
|
||||
def _check_url_accessible(url: str, timeout: int = 30) -> bool:
    """Probe ``url`` with an HTTP HEAD request, following redirects.

    Returns ``True`` when the final response status is below 400 and
    ``False`` when the status is 400 or above or the request raises
    ``httpx.HTTPError``.
    """
    try:
        response = httpx.head(url, follow_redirects=True, timeout=timeout)
    except httpx.HTTPError:
        return False
    else:
        return response.status_code < 400
|
||||
|
||||
|
||||
def _check_nextcloud_token(token: str, timeout: int = 30) -> bool:
    """Report whether the Nextcloud share identified by ``token`` responds.

    Builds the share landing URL on ``arquivos.receitafederal.gov.br`` and
    runs the same HEAD probe used for every other candidate URL.
    """
    landing_page = f"https://arquivos.receitafederal.gov.br/s/{token}"
    return _check_url_accessible(landing_page, timeout=timeout)
|
||||
|
||||
|
||||
def resolve_rf_release(year_month: str | None = None) -> str:
    """Find a reachable base URL for the Receita Federal CNPJ files.

    Probing order:

    1. Nextcloud share tokens: the ``CNPJ_SHARE_TOKEN`` environment variable
       (when set) first, then each entry of ``KNOWN_TOKENS``.
    2. Legacy month-scoped paths built from ``LEGACY_NEW_BASE_PATTERN``:
       only ``year_month`` when it is given, otherwise the current and the
       previous UTC month.
    3. The legacy flat path ``LEGACY_BASE_URL``.

    ``year_month`` only affects step 2; a responding Nextcloud token is
    returned regardless of the requested month.

    Returns:
        The base URL; callers append the file name to it.

    Raises:
        RuntimeError: When none of the candidates responds (fail-closed).
    """
    now = datetime.now(timezone.utc)

    # --- Nextcloud (primary) ---
    env_token = os.environ.get("CNPJ_SHARE_TOKEN")
    preferred = [env_token] if env_token else []
    # dict.fromkeys drops duplicates while keeping first-seen order.
    share_tokens = list(dict.fromkeys([*preferred, *KNOWN_TOKENS]))

    for share_token in share_tokens:
        shown = share_token[:6]
        logger.info("Probing Nextcloud token: %s...", shown)
        if not _check_nextcloud_token(share_token):
            continue
        nextcloud_url = NEXTCLOUD_BASE.format(token=share_token)
        logger.info("Resolved CNPJ via Nextcloud (token %s...)", shown)
        return nextcloud_url

    # --- Legacy dadosabertos.rfb.gov.br ---
    if year_month is None:
        prev_year, prev_month = _previous_month(now.year, now.month)
        candidates = [
            f"{now.year:04d}-{now.month:02d}",
            f"{prev_year:04d}-{prev_month:02d}",
        ]
    else:
        candidates = [year_month]

    for candidate in candidates:
        month_url = LEGACY_NEW_BASE_PATTERN.format(year_month=candidate)
        logger.info("Probing legacy release URL: %s", month_url)
        if _check_url_accessible(month_url):
            logger.info("Resolved CNPJ release (legacy new path): %s", month_url)
            return month_url

    logger.info("Trying legacy flat URL: %s", LEGACY_BASE_URL)
    if _check_url_accessible(LEGACY_BASE_URL):
        logger.info("Resolved CNPJ release (legacy flat): %s", LEGACY_BASE_URL)
        return LEGACY_BASE_URL

    tried = ", ".join(candidates)
    raise RuntimeError(
        f"Could not resolve CNPJ release. Tried Nextcloud tokens, "
        f"legacy months [{tried}], and legacy flat path. "
        "Receita Federal portal may be down or the URL structure has changed."
    )
|
||||
|
||||
|
||||
def _write_manifest(
    output_dir: Path,
    base_url: str,
    resolved_release: str,
    file_results: list[dict],
    started_at: str,
) -> Path:
    """Persist ``download_manifest.json`` describing a finished download run.

    The manifest records the source identifier, the resolved release, the
    base URL, the per-file results, the start/finish timestamps and an
    aggregate checksum.

    Returns:
        Path of the manifest file written inside ``output_dir``.
    """
    finished_at = datetime.now(timezone.utc).isoformat()

    # The checksum covers name, size and status of every entry (in name
    # order), so it changes whenever any file result changes.
    ordered = sorted(file_results, key=lambda entry: entry["name"])
    fingerprint = "".join(
        f"{entry['name']}:{entry['size_bytes']}:{entry['status']}" for entry in ordered
    )
    digest = hashlib.sha256(fingerprint.encode()).hexdigest()

    payload = {
        "source": "receita_federal_cnpj",
        "resolved_release": resolved_release,
        "base_url": base_url,
        "files": file_results,
        "started_at": started_at,
        "finished_at": finished_at,
        "checksum": f"sha256:{digest}",
    }

    manifest_path = output_dir / "download_manifest.json"
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    manifest_path.write_text(serialized, encoding="utf-8")
    logger.info("Manifest written: %s", manifest_path)
    return manifest_path
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("--output-dir", default="./data/cnpj", help="Base output directory")
|
||||
@click.option("--files", type=int, default=10, help="Number of files per type (0-9)")
|
||||
@@ -56,6 +188,7 @@ EXPECTED_COLS = {
|
||||
@click.option("--skip-existing/--no-skip-existing", default=True, help="Skip already downloaded files")
|
||||
@click.option("--skip-extract", is_flag=True, help="Skip extraction after download")
|
||||
@click.option("--timeout", type=int, default=600, help="Download timeout in seconds")
|
||||
@click.option("--release", default=None, help="Pin to specific monthly release (YYYY-MM format)")
|
||||
def main(
|
||||
output_dir: str,
|
||||
files: int,
|
||||
@@ -64,8 +197,20 @@ def main(
|
||||
skip_existing: bool,
|
||||
skip_extract: bool,
|
||||
timeout: int,
|
||||
release: str | None,
|
||||
) -> None:
|
||||
"""Download and extract CNPJ data from Receita Federal."""
|
||||
started_at = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
base_url = resolve_rf_release(release)
|
||||
# Extract the release identifier from the resolved URL
|
||||
resolved_release = release or "legacy"
|
||||
if "arquivos.receitafederal.gov.br" in base_url:
|
||||
resolved_release = "nextcloud"
|
||||
elif "/dados_abertos_cnpj/" in base_url:
|
||||
# Extract YYYY-MM from URL
|
||||
resolved_release = base_url.rstrip("/").rsplit("/", 1)[-1]
|
||||
|
||||
base = Path(output_dir)
|
||||
raw_dir = base / "raw"
|
||||
extract_dir = base / "extracted"
|
||||
@@ -73,14 +218,26 @@ def main(
|
||||
for d in [raw_dir, extract_dir, ref_dir]:
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
file_results: list[dict] = []
|
||||
|
||||
# --- Reference tables (always download, they're tiny) ---
|
||||
logger.info("=== Reference tables ===")
|
||||
for filename in REFERENCE_FILES:
|
||||
dest = raw_dir / filename
|
||||
if skip_existing and dest.exists():
|
||||
logger.info("Skipping (exists): %s", filename)
|
||||
file_results.append({
|
||||
"name": filename,
|
||||
"status": "skipped",
|
||||
"size_bytes": dest.stat().st_size,
|
||||
})
|
||||
else:
|
||||
download_file(f"{BASE_URL}{filename}", dest, timeout=timeout)
|
||||
success = download_file(f"{base_url}{filename}", dest, timeout=timeout)
|
||||
file_results.append({
|
||||
"name": filename,
|
||||
"status": "ok" if success else "failed",
|
||||
"size_bytes": dest.stat().st_size if dest.exists() else 0,
|
||||
})
|
||||
|
||||
if not skip_extract and dest.exists():
|
||||
extracted = extract_zip(dest, ref_dir)
|
||||
@@ -90,7 +247,8 @@ def main(
|
||||
validate_csv(f, expected_cols=expected)
|
||||
|
||||
if reference_only:
|
||||
logger.info("Reference-only mode — done.")
|
||||
logger.info("Reference-only mode -- done.")
|
||||
_write_manifest(base, base_url, resolved_release, file_results, started_at)
|
||||
return
|
||||
|
||||
# --- Main data files ---
|
||||
@@ -102,10 +260,25 @@ def main(
|
||||
dest = raw_dir / filename
|
||||
if skip_existing and dest.exists():
|
||||
logger.info("Skipping (exists): %s", filename)
|
||||
file_results.append({
|
||||
"name": filename,
|
||||
"status": "skipped",
|
||||
"size_bytes": dest.stat().st_size,
|
||||
})
|
||||
else:
|
||||
success = download_file(f"{BASE_URL}{filename}", dest, timeout=timeout)
|
||||
success = download_file(f"{base_url}{filename}", dest, timeout=timeout)
|
||||
if not success:
|
||||
file_results.append({
|
||||
"name": filename,
|
||||
"status": "failed",
|
||||
"size_bytes": 0,
|
||||
})
|
||||
continue
|
||||
file_results.append({
|
||||
"name": filename,
|
||||
"status": "ok",
|
||||
"size_bytes": dest.stat().st_size if dest.exists() else 0,
|
||||
})
|
||||
|
||||
if not skip_extract and dest.exists():
|
||||
extracted = extract_zip(dest, extract_dir)
|
||||
@@ -120,6 +293,7 @@ def main(
|
||||
|
||||
logger.info("=== Download complete ===")
|
||||
_print_summary(raw_dir, extract_dir, ref_dir)
|
||||
_write_manifest(base, base_url, resolved_release, file_results, started_at)
|
||||
|
||||
|
||||
def _print_summary(raw_dir: Path, extract_dir: Path, ref_dir: Path) -> None:
|
||||
|
||||
@@ -10,8 +10,8 @@ And a manifest:
|
||||
- download_manifest.json
|
||||
|
||||
Usage:
|
||||
python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos
|
||||
python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos --tables socios
|
||||
python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos
|
||||
python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos --tables socios
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -105,6 +105,44 @@ TABLES: dict[str, list[str]] = {
|
||||
PAGE_SIZE = 100_000
|
||||
|
||||
|
||||
def _run_bigquery_precheck(
    *,
    billing_project: str,
    source_project: str,
    source_dataset: str,
    snapshot_start: str | None,
) -> None:
    """Issue two small BigQuery queries before any large table download starts.

    First runs ``SELECT 1`` through a client bound to ``billing_project``,
    then counts the rows of ``<source_project>.<source_dataset>.socios``.
    When ``snapshot_start`` is truthy the count is restricted to rows with
    ``data >= @snapshot_start``, with the value bound as a ``DATE`` query
    parameter. Exceptions raised by the client are not caught here; they
    propagate to the caller.

    Args:
        billing_project: Project passed to ``bigquery.Client``.
        source_project: Project part of the ``socios`` table reference.
        source_dataset: Dataset part of the ``socios`` table reference.
        snapshot_start: Optional lower bound for the ``data`` column.
    """
    # Imported lazily so the dependency is only needed when this check runs.
    from google.cloud import bigquery

    bq = bigquery.Client(project=billing_project)

    # Step 1: trivial query that touches no table.
    logger.info("Running BigQuery precheck: SELECT 1")
    list(bq.query("SELECT 1 AS ok").result())

    # Step 2: row count against the socios table, optionally date-filtered.
    table_ref = f"{source_project}.{source_dataset}.socios"
    count_sql = f"SELECT COUNT(1) AS n FROM `{table_ref}`"
    bound_params = []
    if snapshot_start:
        count_sql += " WHERE data >= @snapshot_start"
        bound_params.append(
            bigquery.ScalarQueryParameter("snapshot_start", "DATE", snapshot_start),
        )

    logger.info("Running BigQuery precheck: %s", count_sql)
    job_config = bigquery.QueryJobConfig(query_parameters=bound_params)
    result_rows = list(bq.query(count_sql, job_config=job_config).result())

    # Fall back to 0 if the query unexpectedly yields no row at all.
    socios_count = result_rows[0].n if result_rows else 0
    logger.info("BigQuery precheck OK: socios_count=%s", socios_count)
|
||||
|
||||
|
||||
def _sha256_file(path: Path) -> str:
|
||||
digest = hashlib.sha256()
|
||||
with path.open("rb") as f:
|
||||
@@ -292,6 +330,19 @@ def main(
|
||||
)
|
||||
source_project, source_dataset = dataset.split(".", 1)
|
||||
|
||||
try:
|
||||
_run_bigquery_precheck(
|
||||
billing_project=billing_project,
|
||||
source_project=source_project,
|
||||
source_dataset=source_dataset,
|
||||
snapshot_start=snapshot_start,
|
||||
)
|
||||
except Exception as exc:
|
||||
raise click.ClickException(
|
||||
"BigQuery precheck failed. Configure a non-interactive service account "
|
||||
"(GOOGLE_APPLICATION_CREDENTIALS) with dataset ACL and billing access.",
|
||||
) from exc
|
||||
|
||||
selected = list(tables) if tables else list(TABLES.keys())
|
||||
run_id = f"cnpj-bq-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}-{uuid.uuid4().hex[:8]}"
|
||||
logger.info(
|
||||
|
||||
@@ -22,7 +22,6 @@ from pathlib import Path
|
||||
|
||||
import click
|
||||
import httpx
|
||||
from _download_utils import safe_extract_zip
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s"
|
||||
@@ -90,15 +89,24 @@ def _download_zip(
|
||||
xml_count = 0
|
||||
|
||||
try:
|
||||
resolved_dir = section_dir.resolve()
|
||||
with zipfile.ZipFile(BytesIO(resp.content)) as zf:
|
||||
extracted = safe_extract_zip(zf, section_dir)
|
||||
xml_count = sum(1 for path in extracted if path.suffix.lower() == ".xml")
|
||||
for member in zf.namelist():
|
||||
# Path traversal guard
|
||||
target = (section_dir / member).resolve()
|
||||
if not target.is_relative_to(resolved_dir):
|
||||
logger.warning(
|
||||
"Path traversal detected in %s: %s — skipping",
|
||||
zip_name,
|
||||
member,
|
||||
)
|
||||
continue
|
||||
if member.lower().endswith(".xml"):
|
||||
zf.extract(member, section_dir)
|
||||
xml_count += 1
|
||||
except zipfile.BadZipFile:
|
||||
logger.warning("Bad ZIP file: %s", zip_name)
|
||||
return 0
|
||||
except ValueError as exc:
|
||||
logger.warning("Unsafe ZIP file %s: %s", zip_name, exc)
|
||||
return 0
|
||||
|
||||
if xml_count > 0:
|
||||
marker.write_text(str(xml_count))
|
||||
|
||||
@@ -71,7 +71,7 @@ def _write_manifest(out_dir: Path, tables: list[dict[str, Any]]) -> Path:
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("--billing-project", default="bracc-corruptos", help="GCP billing project")
|
||||
@click.option("--billing-project", default="icarus-corruptos", help="GCP billing project")
|
||||
@click.option(
|
||||
"--dataset",
|
||||
default=WORLD_WB_DATASET,
|
||||
|
||||
@@ -439,7 +439,7 @@ def main(
|
||||
client = httpx.Client(
|
||||
timeout=timeout,
|
||||
follow_redirects=True,
|
||||
headers={"User-Agent": "BRACC-ETL/1.0 (public data research)"},
|
||||
headers={"User-Agent": "BR-ACC-ETL/1.0 (public data research)"},
|
||||
)
|
||||
|
||||
total_records = 0
|
||||
|
||||
@@ -8,10 +8,12 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import zipfile
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from _download_utils import safe_extract_zip
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -34,14 +36,13 @@ def download_year(output_dir: Path, year: int) -> None:
|
||||
url,
|
||||
follow_redirects=True,
|
||||
timeout=300,
|
||||
headers={"User-Agent": "BRACC-ETL/1.0"},
|
||||
headers={"User-Agent": "BR-ACC-ETL/1.0"},
|
||||
)
|
||||
response.raise_for_status()
|
||||
dest_zip.write_bytes(response.content)
|
||||
logger.info("Downloaded: %s (%d bytes)", dest_zip.name, len(response.content))
|
||||
|
||||
with zipfile.ZipFile(dest_zip, "r") as zf:
|
||||
extracted = safe_extract_zip(zf, output_dir)
|
||||
extracted = safe_extract_zip(dest_zip, output_dir)
|
||||
logger.info("Extracted %d files", len(extracted))
|
||||
except httpx.HTTPError:
|
||||
logger.warning("Failed to download renuncias for %d", year)
|
||||
|
||||
@@ -16,13 +16,13 @@ import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import defusedxml.ElementTree as ET
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import click
|
||||
import httpx
|
||||
from defusedxml import ElementTree as ET
|
||||
from download_senado_cpi_archive import fetch_archive_historical
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -44,7 +44,7 @@ def get_all_entities() -> list[dict]:
|
||||
url,
|
||||
params={"offset": offset, "limit": limit},
|
||||
timeout=60,
|
||||
headers={"User-Agent": "BRACC-ETL/1.0"},
|
||||
headers={"User-Agent": "BR-ACC-ETL/1.0"},
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
@@ -125,7 +125,7 @@ def download_year(
|
||||
header_written = partial.exists() and partial.stat().st_size > 0
|
||||
|
||||
with (
|
||||
httpx.Client(headers={"User-Agent": "BRACC-ETL/1.0"}) as client,
|
||||
httpx.Client(headers={"User-Agent": "BR-ACC-ETL/1.0"}) as client,
|
||||
open(partial, "a", newline="", encoding="utf-8") as f,
|
||||
):
|
||||
writer: csv.DictWriter | None = None
|
||||
|
||||
@@ -5,9 +5,9 @@ Streams from BigQuery table basedosdados.br_stf_corte_aberta.decisoes to local C
|
||||
Requires `google-cloud-bigquery` and an authenticated GCP project.
|
||||
|
||||
Usage:
|
||||
python etl/scripts/download_stf.py --billing-project bracc-corruptos
|
||||
python etl/scripts/download_stf.py --billing-project bracc-corruptos --skip-existing
|
||||
python etl/scripts/download_stf.py --billing-project bracc-corruptos --output-dir ./data/stf
|
||||
python etl/scripts/download_stf.py --billing-project icarus-corruptos
|
||||
python etl/scripts/download_stf.py --billing-project icarus-corruptos --skip-existing
|
||||
python etl/scripts/download_stf.py --billing-project icarus-corruptos --output-dir ./data/stf
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -5,9 +5,9 @@ Streams from BigQuery table `basedosdados.br_tse_eleicoes.bens_candidato` to a l
|
||||
Requires `google-cloud-bigquery` and an authenticated GCP project.
|
||||
|
||||
Usage:
|
||||
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos
|
||||
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --start-year 2018
|
||||
python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --skip-existing
|
||||
python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos
|
||||
python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --start-year 2018
|
||||
python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --skip-existing
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -7,9 +7,9 @@ Filters to REGULAR status only (active members) to reduce volume.
|
||||
Requires `google-cloud-bigquery` and an authenticated GCP project.
|
||||
|
||||
Usage:
|
||||
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos
|
||||
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --skip-existing
|
||||
python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --all-statuses
|
||||
python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos
|
||||
python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --skip-existing
|
||||
python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --all-statuses
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -14,10 +14,10 @@ from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import defusedxml.ElementTree as ET
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
from defusedxml import ElementTree as ET
|
||||
|
||||
# Allow imports from scripts/ directory
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
@@ -21,12 +21,16 @@ class Pipeline(ABC):
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
neo4j_database: str | None = None,
|
||||
history: bool = False,
|
||||
) -> None:
|
||||
self.driver = driver
|
||||
self.data_dir = data_dir
|
||||
self.limit = limit
|
||||
self.chunk_size = chunk_size
|
||||
self.neo4j_database = neo4j_database or os.getenv("NEO4J_DATABASE", "neo4j")
|
||||
self.history = history
|
||||
self.rows_in: int = 0
|
||||
self.rows_loaded: int = 0
|
||||
source_key = getattr(self, "source_id", getattr(self, "name", "unknown_source"))
|
||||
self.run_id = f"{source_key}_{datetime.now(tz=UTC).strftime('%Y%m%d%H%M%S')}"
|
||||
|
||||
@@ -87,8 +91,8 @@ class Pipeline(ABC):
|
||||
" r.started_at = coalesce($started_at, r.started_at), "
|
||||
" r.finished_at = coalesce($finished_at, r.finished_at), "
|
||||
" r.error = coalesce($error, r.error), "
|
||||
" r.rows_in = coalesce(r.rows_in, 0), "
|
||||
" r.rows_loaded = coalesce(r.rows_loaded, 0)"
|
||||
" r.rows_in = $rows_in, "
|
||||
" r.rows_loaded = $rows_loaded"
|
||||
)
|
||||
run_id = getattr(self, "run_id", f"{source_id}_manual")
|
||||
params = {
|
||||
@@ -98,6 +102,8 @@ class Pipeline(ABC):
|
||||
"started_at": started_at,
|
||||
"finished_at": finished_at,
|
||||
"error": error,
|
||||
"rows_in": self.rows_in,
|
||||
"rows_loaded": self.rows_loaded,
|
||||
}
|
||||
try:
|
||||
with self.driver.session(database=self.neo4j_database) as session:
|
||||
|
||||
@@ -13,7 +13,7 @@ def get_person_settings() -> dict[str, Any]:
|
||||
"""
|
||||
try:
|
||||
import splink.comparison_library as cl # type: ignore[import-not-found]
|
||||
from splink import SettingsCreator
|
||||
from splink import SettingsCreator # type: ignore[import-not-found,unused-ignore]
|
||||
except ImportError as exc:
|
||||
raise ImportError(
|
||||
"splink is required for entity resolution. "
|
||||
|
||||
@@ -51,8 +51,9 @@ class BcbPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.penalties: list[dict[str, Any]] = []
|
||||
self.company_rels: list[dict[str, Any]] = []
|
||||
|
||||
@@ -33,8 +33,9 @@ class BndesPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.finances: list[dict[str, Any]] = []
|
||||
self.relationships: list[dict[str, Any]] = []
|
||||
@@ -51,8 +52,15 @@ class BndesPipeline(Pipeline):
|
||||
|
||||
def extract(self) -> None:
|
||||
bndes_dir = Path(self.data_dir) / "bndes"
|
||||
if not bndes_dir.exists():
|
||||
logger.warning("[%s] Data directory not found: %s", self.name, bndes_dir)
|
||||
return
|
||||
csv_path = bndes_dir / "operacoes-nao-automaticas.csv"
|
||||
if not csv_path.exists():
|
||||
logger.warning("[%s] CSV file not found: %s", self.name, csv_path)
|
||||
return
|
||||
self._raw = pd.read_csv(
|
||||
bndes_dir / "operacoes-nao-automaticas.csv",
|
||||
csv_path,
|
||||
dtype=str,
|
||||
delimiter=";",
|
||||
encoding="latin-1",
|
||||
|
||||
@@ -88,8 +88,9 @@ class CagedPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._csv_files: list[Path] = []
|
||||
|
||||
def extract(self) -> None:
|
||||
|
||||
@@ -60,8 +60,9 @@ class CamaraPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.expenses: list[dict[str, Any]] = []
|
||||
self.deputies: list[dict[str, Any]] = []
|
||||
|
||||
@@ -66,8 +66,9 @@ class CamaraInquiriesPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
|
||||
self._raw_inquiries: pd.DataFrame = pd.DataFrame()
|
||||
self._raw_requirements: pd.DataFrame = pd.DataFrame()
|
||||
|
||||
@@ -31,8 +31,9 @@ class CeafPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.expulsions: list[dict[str, Any]] = []
|
||||
self.person_rels: list[dict[str, Any]] = []
|
||||
|
||||
@@ -37,8 +37,9 @@ class CepimPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.ngos: list[dict[str, Any]] = []
|
||||
self.company_rels: list[dict[str, Any]] = []
|
||||
|
||||
@@ -216,9 +216,11 @@ class CNPJPipeline(Pipeline):
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
history: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
self.history = history
|
||||
super().__init__(
|
||||
driver, data_dir, limit=limit, chunk_size=chunk_size, history=history, **kwargs,
|
||||
)
|
||||
self.run_id = f"cnpj-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}"
|
||||
self._raw_empresas: pd.DataFrame = pd.DataFrame()
|
||||
self._raw_socios: pd.DataFrame = pd.DataFrame()
|
||||
|
||||
@@ -63,7 +63,7 @@ class ComprasnetPipeline(Pipeline):
|
||||
"""ETL pipeline for PNCP federal procurement contracts."""
|
||||
|
||||
name = "comprasnet"
|
||||
source_id = "pncp"
|
||||
source_id = "comprasnet"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -71,8 +71,9 @@ class ComprasnetPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self.contracts: list[dict[str, Any]] = []
|
||||
|
||||
def extract(self) -> None:
|
||||
|
||||
@@ -84,8 +84,9 @@ class CpgfPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.expenses: list[dict[str, Any]] = []
|
||||
self.cardholders: list[dict[str, Any]] = []
|
||||
|
||||
@@ -38,8 +38,9 @@ class CvmPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw_processos: pd.DataFrame = pd.DataFrame()
|
||||
self._raw_acusados: pd.DataFrame = pd.DataFrame()
|
||||
self.proceedings: list[dict[str, Any]] = []
|
||||
|
||||
@@ -43,8 +43,9 @@ class CvmFundsPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.funds: list[dict[str, Any]] = []
|
||||
self.admin_rels: list[dict[str, Any]] = []
|
||||
|
||||
@@ -50,8 +50,9 @@ class DatajudPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
|
||||
self._raw_cases: pd.DataFrame = pd.DataFrame()
|
||||
self._raw_parties: pd.DataFrame = pd.DataFrame()
|
||||
|
||||
@@ -29,8 +29,9 @@ class DatasusPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.facilities: list[dict[str, Any]] = []
|
||||
self.company_links: list[dict[str, Any]] = []
|
||||
|
||||
@@ -17,7 +17,10 @@ import re
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from defusedxml import ElementTree # type: ignore[import-untyped]
|
||||
from defusedxml.ElementTree import ParseError as _XmlParseError # type: ignore[import-untyped]
|
||||
from defusedxml.ElementTree import (
|
||||
parse as _safe_xml_parse, # type: ignore[import-untyped,unused-ignore]
|
||||
)
|
||||
|
||||
from bracc_etl.base import Pipeline
|
||||
from bracc_etl.loader import Neo4jBatchLoader
|
||||
@@ -141,8 +144,9 @@ class DouPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw_acts: list[dict[str, str]] = []
|
||||
self.acts: list[dict[str, Any]] = []
|
||||
self.person_rels: list[dict[str, Any]] = []
|
||||
@@ -227,8 +231,8 @@ class DouPipeline(Pipeline):
|
||||
"""Extract acts from Imprensa Nacional XML dumps."""
|
||||
for f in xml_files:
|
||||
try:
|
||||
tree = ElementTree.parse(f) # noqa: S314
|
||||
except ElementTree.ParseError:
|
||||
tree = _safe_xml_parse(f)
|
||||
except _XmlParseError:
|
||||
logger.warning("[dou] Failed to parse XML: %s", f.name)
|
||||
continue
|
||||
|
||||
|
||||
@@ -76,8 +76,9 @@ class EuSanctionsPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.sanctions: list[dict[str, Any]] = []
|
||||
self.person_rels: list[dict[str, Any]] = []
|
||||
|
||||
@@ -36,8 +36,9 @@ class HoldingsPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.holding_rels: list[dict[str, Any]] = []
|
||||
|
||||
|
||||
@@ -40,8 +40,9 @@ class IbamaPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.embargoes: list[dict[str, Any]] = []
|
||||
self.companies: list[dict[str, Any]] = []
|
||||
@@ -65,7 +66,13 @@ class IbamaPipeline(Pipeline):
|
||||
|
||||
def extract(self) -> None:
|
||||
ibama_dir = Path(self.data_dir) / "ibama"
|
||||
if not ibama_dir.exists():
|
||||
logger.warning("[%s] Data directory not found: %s", self.name, ibama_dir)
|
||||
return
|
||||
csv_path = ibama_dir / "areas_embargadas.csv"
|
||||
if not csv_path.exists():
|
||||
logger.warning("[%s] CSV file not found: %s", self.name, csv_path)
|
||||
return
|
||||
logger.info("[ibama] Reading %s", csv_path)
|
||||
self._raw = pd.read_csv(
|
||||
csv_path,
|
||||
|
||||
@@ -42,8 +42,9 @@ class ICIJPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._entities_raw: pd.DataFrame = pd.DataFrame()
|
||||
self._officers_raw: pd.DataFrame = pd.DataFrame()
|
||||
self._intermediaries_raw: pd.DataFrame = pd.DataFrame()
|
||||
|
||||
@@ -42,8 +42,9 @@ class InepPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self.schools: list[dict[str, Any]] = []
|
||||
self.school_company_links: list[dict[str, Any]] = []
|
||||
|
||||
|
||||
@@ -31,8 +31,9 @@ class LeniencyPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.agreements: list[dict[str, Any]] = []
|
||||
self.company_rels: list[dict[str, Any]] = []
|
||||
|
||||
@@ -74,8 +74,9 @@ class MidesPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
|
||||
self._raw_bids: pd.DataFrame = pd.DataFrame()
|
||||
self._raw_contracts: pd.DataFrame = pd.DataFrame()
|
||||
|
||||
@@ -63,8 +63,9 @@ class OfacPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.sanctions: list[dict[str, Any]] = []
|
||||
|
||||
|
||||
@@ -81,8 +81,9 @@ class OpenSanctionsPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw_entities: list[dict[str, Any]] = []
|
||||
self.global_peps: list[dict[str, Any]] = []
|
||||
self.pep_match_rels: list[dict[str, Any]] = []
|
||||
|
||||
@@ -84,8 +84,9 @@ class PepCguPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._raw: pd.DataFrame = pd.DataFrame()
|
||||
self.pep_records: list[dict[str, Any]] = []
|
||||
self.person_links: list[dict[str, Any]] = []
|
||||
|
||||
@@ -38,8 +38,9 @@ class PgfnPipeline(Pipeline):
|
||||
data_dir: str = "./data",
|
||||
limit: int | None = None,
|
||||
chunk_size: int = 50_000,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
|
||||
super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
|
||||
self._csv_files: list[Path] = []
|
||||
self.finances: list[dict[str, Any]] = []
|
||||
self.relationships: list[dict[str, Any]] = []
|
||||
@@ -56,10 +57,13 @@ class PgfnPipeline(Pipeline):
|
||||
|
||||
def extract(self) -> None:
|
||||
pgfn_dir = Path(self.data_dir) / "pgfn"
|
||||
if not pgfn_dir.exists():
|
||||
logger.warning("[%s] Data directory not found: %s", self.name, pgfn_dir)
|
||||
return
|
||||
self._csv_files = sorted(pgfn_dir.glob("arquivo_lai_SIDA_*_*.csv"))
|
||||
if not self._csv_files:
|
||||
msg = f"No PGFN CSV files found in {pgfn_dir}"
|
||||
raise FileNotFoundError(msg)
|
||||
logger.warning("[%s] No PGFN CSV files found in %s", self.name, pgfn_dir)
|
||||
return
|
||||
logger.info("[pgfn] Found %d CSV files to process", len(self._csv_files))
|
||||
|
||||
def transform(self) -> None:
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user