Add source registry governance and dynamic meta source stats (#6)

Co-authored-by: bruno cesar <brunoclz@brunos-MacBook-Pro.local>
This commit is contained in:
Bruno César
2026-02-27 20:34:00 -03:00
committed by GitHub
parent edf9c95bac
commit 02466bb9d6
7 changed files with 609 additions and 77 deletions

1
.gitignore vendored
View File

@@ -49,6 +49,7 @@ htmlcov/
# Data files (too large for git)
*.csv
!docs/source_registry_br_v1.csv
*.tsv
*.parquet
*.dbc

View File

@@ -6,6 +6,7 @@ from neo4j import AsyncSession
from icarus.dependencies import get_session
from icarus.services.neo4j_service import execute_query_single
from icarus.services.source_registry import load_source_registry, source_registry_summary
router = APIRouter(prefix="/api/v1/meta", tags=["meta"])
@@ -33,6 +34,9 @@ async def database_stats(
return _stats_cache
record = await execute_query_single(session, "meta_stats", {})
source_entries = load_source_registry()
source_summary = source_registry_summary(source_entries)
result = {
"total_nodes": record["total_nodes"] if record else 0,
"total_relationships": record["total_relationships"] if record else 0,
@@ -77,7 +81,12 @@ async def database_stats(
"municipal_bid_count": record["municipal_bid_count"] if record else 0,
"municipal_contract_count": record["municipal_contract_count"] if record else 0,
"municipal_gazette_act_count": record["municipal_gazette_act_count"] if record else 0,
"data_sources": 45,
"data_sources": source_summary["universe_v1_sources"],
"implemented_sources": source_summary["implemented_sources"],
"loaded_sources": source_summary["loaded_sources"],
"stale_sources": source_summary["stale_sources"],
"blocked_external_sources": source_summary["blocked_external_sources"],
"quality_fail_sources": source_summary["quality_fail_sources"],
}
_stats_cache = result
@@ -86,77 +95,6 @@ async def database_stats(
@router.get("/sources")
async def list_sources() -> dict[str, list[dict[str, str]]]:
return {
"sources": [
{"id": "cnpj", "name": "Receita Federal (CNPJ)", "frequency": "monthly"},
{"id": "tse", "name": "Tribunal Superior Eleitoral", "frequency": "biennial"},
{"id": "transparencia", "name": "Portal da Transparência", "frequency": "monthly"},
{"id": "ceis", "name": "CEIS/CNEP/CEPIM/CEAF", "frequency": "monthly"},
{"id": "cnes", "name": "CNES/DATASUS", "frequency": "monthly"},
{"id": "bndes", "name": "BNDES (Empréstimos)", "frequency": "monthly"},
{"id": "pgfn", "name": "PGFN (Dívida Ativa)", "frequency": "monthly"},
{"id": "ibama", "name": "IBAMA (Embargos)", "frequency": "monthly"},
{"id": "comprasnet", "name": "ComprasNet/PNCP", "frequency": "monthly"},
{"id": "tcu", "name": "TCU (Sanções)", "frequency": "monthly"},
{"id": "transferegov", "name": "TransfereGov (Convênios)", "frequency": "monthly"},
{"id": "rais", "name": "RAIS (Estatísticas Trabalhistas)", "frequency": "annual"},
{"id": "inep", "name": "INEP (Censo Educação)", "frequency": "annual"},
{"id": "dou", "name": "Diário Oficial da União", "frequency": "daily"},
{"id": "icij", "name": "ICIJ Offshore Leaks", "frequency": "yearly"},
{"id": "opensanctions", "name": "OpenSanctions (PEPs globais)", "frequency": "monthly"},
{"id": "cvm", "name": "CVM (Processos Sancionadores)", "frequency": "monthly"},
{"id": "camara", "name": "Câmara dos Deputados (CEAP)", "frequency": "monthly"},
{"id": "senado", "name": "Senado Federal (CEAPS)", "frequency": "monthly"},
{"id": "pep_cgu", "name": "CGU PEP (Pessoas Expostas)", "frequency": "monthly"},
{"id": "ceaf", "name": "CEAF (Servidores Expulsos)", "frequency": "monthly"},
{"id": "leniency", "name": "Acordos de Leniência", "frequency": "monthly"},
{"id": "ofac", "name": "OFAC SDN (Sanções Internacionais)", "frequency": "monthly"},
{"id": "holdings", "name": "Brasil.IO (Holdings Empresariais)", "frequency": "monthly"},
{"id": "cpgf", "name": "CPGF (Cartão de Pagamento)", "frequency": "monthly"},
{"id": "viagens", "name": "Viagens a Serviço", "frequency": "monthly"},
{"id": "siop", "name": "SIOP (Emendas Parlamentares)", "frequency": "annual"},
{"id": "pncp", "name": "PNCP (Licitações)", "frequency": "monthly"},
{"id": "cvm_funds", "name": "CVM (Fundos de Investimento)", "frequency": "monthly"},
{"id": "renuncias", "name": "Renúncias Fiscais", "frequency": "annual"},
{"id": "siconfi", "name": "SICONFI (Finanças Municipais)", "frequency": "annual"},
{"id": "tse_bens", "name": "TSE Bens Declarados", "frequency": "biennial"},
{"id": "tse_filiados", "name": "TSE Filiação Partidária", "frequency": "monthly"},
{"id": "cepim", "name": "CEPIM (ONGs Impedidas)", "frequency": "monthly"},
{"id": "bcb", "name": "BCB (Penalidades Bancárias)", "frequency": "monthly"},
{"id": "caged", "name": "CAGED (Movimentações Trabalhistas)", "frequency": "monthly"},
{"id": "stf", "name": "STF (Decisões Corte Aberta)", "frequency": "monthly"},
{"id": "eu_sanctions", "name": "EU (Sanções Europeias)", "frequency": "monthly"},
{
"id": "un_sanctions",
"name": "ONU (Sanções do Conselho de Segurança)",
"frequency": "monthly",
},
{
"id": "world_bank",
"name": "Banco Mundial (Firmas Impedidas)",
"frequency": "monthly",
},
{"id": "senado_cpis", "name": "Senado CPIs", "frequency": "yearly"},
{
"id": "camara_inquiries",
"name": "Câmara (CPIs/CPMIs e Requerimentos)",
"frequency": "daily",
},
{
"id": "mides",
"name": "MiDES (Licitações/Contratos Municipais)",
"frequency": "daily",
},
{
"id": "querido_diario",
"name": "Querido Diário (Atos Municipais)",
"frequency": "daily",
},
{
"id": "datajud",
"name": "CNJ DataJud (Processos Judiciais)",
"frequency": "monthly",
},
]
}
async def list_sources() -> dict[str, list[dict[str, Any]]]:
    """List the registry sources flagged ``in_universe_v1`` in their public form.

    Returns:
        A mapping with a single ``"sources"`` key holding one
        ``to_public_dict()`` payload per matching registry entry, in the
        order produced by ``load_source_registry()``.
    """
    # Filter first, then project, so each step reads on its own.
    universe_entries = (
        registry_entry
        for registry_entry in load_source_registry()
        if registry_entry.in_universe_v1
    )
    return {"sources": [registry_entry.to_public_dict() for registry_entry in universe_entries]}

View File

@@ -0,0 +1,107 @@
import csv
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any
@dataclass(frozen=True)
class SourceRegistryEntry:
    """Immutable record describing one row of the source registry."""

    id: str
    name: str
    category: str
    tier: str
    status: str
    implementation_state: str
    load_state: str
    frequency: str
    in_universe_v1: bool
    primary_url: str
    pipeline_id: str
    owner_agent: str
    access_mode: str
    notes: str

    def to_public_dict(self) -> dict[str, Any]:
        """Return the entry as a plain dictionary keyed by field name.

        Returns:
            A new ``dict`` containing every field of the entry, in
            declaration order.
        """
        # Spelled out explicitly so the exposed payload stays a deliberate
        # whitelist rather than tracking whatever fields the class has.
        exposed_fields = (
            "id",
            "name",
            "category",
            "tier",
            "status",
            "implementation_state",
            "load_state",
            "frequency",
            "in_universe_v1",
            "primary_url",
            "pipeline_id",
            "owner_agent",
            "access_mode",
            "notes",
        )
        return {field_name: getattr(self, field_name) for field_name in exposed_fields}
def _str_to_bool(value: str) -> bool:
    """Interpret a text cell as a boolean flag.

    Surrounding whitespace and letter case are ignored. Only ``1``,
    ``true``, ``yes`` and ``y`` count as true; anything else, including the
    empty string, is false.
    """
    truthy_tokens = ("1", "true", "yes", "y")
    normalized = value.strip().lower()
    return normalized in truthy_tokens
def _default_registry_path() -> Path:
    """Return the registry CSV location derived from this module's own path.

    The result is ``docs/source_registry_br_v1.csv`` under the fifth
    ancestor (``parents[4]``) of this file's resolved path.
    """
    # Layout: <repo>/api/src/icarus/services/source_registry.py, so the
    # repository root is parents[4] of this file.
    repo_root = Path(__file__).resolve().parents[4]
    return repo_root.joinpath("docs", "source_registry_br_v1.csv")
def get_registry_path() -> Path:
    """Return the path of the registry CSV to load.

    A non-blank ``ICARUS_SOURCE_REGISTRY_PATH`` environment variable takes
    precedence (surrounding whitespace is stripped); otherwise the result of
    ``_default_registry_path()`` is returned.
    """
    override = os.environ.get("ICARUS_SOURCE_REGISTRY_PATH", "").strip()
    if override:
        return Path(override)
    return _default_registry_path()
def load_source_registry() -> list[SourceRegistryEntry]:
    """Read the registry CSV and return its rows as entries sorted by ``id``.

    The file location comes from ``get_registry_path()``. If that path does
    not exist, an empty list is returned instead of raising.

    Returns:
        One ``SourceRegistryEntry`` per CSV data row, ordered by ``id``.
        Missing or empty cells become empty strings; ``in_universe_v1`` is
        parsed with ``_str_to_bool``.
    """
    registry_path = get_registry_path()
    if not registry_path.exists():
        return []

    def cell(row: dict[str, Any], column: str) -> str:
        # DictReader yields None for cells missing from short rows, and
        # .get() yields None for absent columns; both collapse to "".
        return (row.get(column) or "").strip()

    # "utf-8-sig" transparently drops a leading byte-order mark (as written
    # by some spreadsheet tools). With plain "utf-8" the first header would
    # be read as "\ufeffsource_id" and every entry would get an empty id.
    with registry_path.open(encoding="utf-8-sig", newline="") as csv_file:
        entries = [
            SourceRegistryEntry(
                id=cell(row, "source_id"),
                name=cell(row, "name"),
                category=cell(row, "category"),
                tier=cell(row, "tier"),
                status=cell(row, "status"),
                implementation_state=cell(row, "implementation_state"),
                load_state=cell(row, "load_state"),
                frequency=cell(row, "frequency"),
                in_universe_v1=_str_to_bool(row.get("in_universe_v1") or ""),
                primary_url=cell(row, "primary_url"),
                pipeline_id=cell(row, "pipeline_id"),
                owner_agent=cell(row, "owner_agent"),
                access_mode=cell(row, "access_mode"),
                notes=cell(row, "notes"),
            )
            for row in csv.DictReader(csv_file)
        ]

    entries.sort(key=lambda entry: entry.id)
    return entries
def source_registry_summary(entries: list[SourceRegistryEntry]) -> dict[str, int]:
    """Count registry entries by state, considering only ``in_universe_v1`` ones.

    Args:
        entries: Registry entries to summarise.

    Returns:
        A mapping with the number of in-universe entries overall, plus how
        many of them have ``implementation_state == "implemented"``,
        ``load_state == "loaded"``, and ``status`` equal to ``"stale"``,
        ``"blocked_external"`` or ``"quality_fail"``.
    """
    in_scope = [item for item in entries if item.in_universe_v1]

    def tally(attribute: str, expected: str) -> int:
        # Number of in-scope entries whose attribute equals the expected value.
        return sum(1 for item in in_scope if getattr(item, attribute) == expected)

    return {
        "universe_v1_sources": len(in_scope),
        "implemented_sources": tally("implementation_state", "implemented"),
        "loaded_sources": tally("load_state", "loaded"),
        "stale_sources": tally("status", "stale"),
        "blocked_external_sources": tally("status", "blocked_external"),
        "quality_fail_sources": tally("status", "quality_fail"),
    }

View File

@@ -17,7 +17,7 @@ async def test_meta_sources(client: AsyncClient) -> None:
assert response.status_code == 200
data = response.json()
assert "sources" in data
assert len(data["sources"]) == 45
assert len(data["sources"]) == 108
source_ids = [s["id"] for s in data["sources"]]
assert "cnpj" in source_ids
assert "tse" in source_ids
@@ -61,6 +61,11 @@ async def test_meta_sources(client: AsyncClient) -> None:
assert "mides" in source_ids
assert "querido_diario" in source_ids
assert "datajud" in source_ids
first = data["sources"][0]
assert "status" in first
assert "implementation_state" in first
assert "load_state" in first
assert "in_universe_v1" in first
@pytest.mark.anyio
@@ -169,4 +174,9 @@ async def test_meta_stats(client: AsyncClient) -> None:
assert data["municipal_bid_count"] == 8_000_000
assert data["municipal_contract_count"] == 6_000_000
assert data["municipal_gazette_act_count"] == 4_000_000
assert data["data_sources"] == 45
assert data["data_sources"] == 108
assert data["implemented_sources"] == 45
assert data["loaded_sources"] >= 1
assert data["stale_sources"] >= 0
assert data["blocked_external_sources"] >= 0
assert data["quality_fail_sources"] >= 0

View File

@@ -0,0 +1,60 @@
# Source Onboarding Contract (Brazil Coverage v1)
This contract is mandatory for every new source before `shadow -> promote`.
## 1. Source Identity
- `source_id`:
- `name`:
- `category`:
- `tier`:
- `owner_agent`:
- `primary_url`:
- `access_mode` (`file|api|bigquery|web`):
## 2. Access and Legal
- Credential required:
- Secret name/path:
- License or usage restriction:
- LGPD/privacy considerations:
- `blocked_external` criteria:
## 3. Data Contract
- Downloader script: `etl/scripts/download_<source>.py`
- Canonical output files:
- Manifest file:
- Update cadence:
- Expected row volume:
- Partition/window strategy:
## 4. Graph Contract
- Node labels introduced:
- Relationship types introduced:
- Natural key(s) per node:
- Merge key strategy:
- Relationship quality tier (`strong|probable`):
- Provenance fields (`method`, `confidence`, `source_ref`, `run_id`):
## 5. Index and Constraint Contract
- Required uniqueness constraints:
- Required date indexes:
- Required lookup indexes:
- Required fulltext indexes (if text-heavy):
## 6. Quality Gates (Hard Stop/Go)
- Identity integrity preserved (`Person.cpf` masked = 0, 14-digit = 0):
- Freshness SLA threshold:
- Temporal sanity (`<= now + 365d`):
- Null/duplicate key thresholds:
- Mandatory non-zero nodes/rels:
## 7. Operational Flow
- Shadow load command:
- Gate runner commands:
- API smoke checks:
- Promote command:
- Rollback command:
## 8. Acceptance
- Evidence bundle path in `audit-results/`:
- Final status: `resolved | resolved_full | blocked_external | quality_fail`
- Reviewer sign-off:

View File

@@ -0,0 +1,109 @@
source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes
cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br
tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded
transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants
sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions
pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline
bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships
pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core
ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement
comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill
tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions
transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships
rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only
inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage
dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion
datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments
icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers
opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching
cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded
cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline
camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement
camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low
senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded
ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence
cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions
cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source
leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume
ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions
holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/holding/,holdings,Agent G,file,Ownership enrichment
viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline
siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited
pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/,pncp,Agent C,api,Freshness SLA pending
renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline
siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links
tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/dataset/bens-candidato,tse_bens,Agent E,file,Patrimony baseline
tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/dataset/filiados-partidos,tse_filiados,Agent E,file,Party network
bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/penalidades,bcb,Agent G,file,Bank penalties loaded
stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage
caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation
eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions
un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions
world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions
senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements
mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix
querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap
datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod
bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities
estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/estban,,Agent G,file,Banking aggregates
if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/dataset/if-data,,Agent G,file,Institution KPIs
bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao,,Agent G,file,Regulatory actions
stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions
cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/cnciai/,,Agent D,api,Misconduct convictions
carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation
anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties
aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions
anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits
antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions
ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators
anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations
anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts
antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions
ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights
anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators
susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities
cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion
receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations
mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts
sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners
icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao,,Agent F,file,Protected areas
tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution
siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces
camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior
senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior
interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key
tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement
tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement
tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement
tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.rs.gov.br/,,Agent H,file,State audit procurement
tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement
tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement
tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement
tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement
tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement
tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement
tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement
tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement
tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement
tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement
tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement
tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement
tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement
tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement
tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement
tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement
tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement
tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement
tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement
tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement
tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement
state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts
state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts
state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts
state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts
state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts
state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts
state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts
state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts
state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts
state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts
1 source_id name category tier status implementation_state load_state frequency in_universe_v1 primary_url pipeline_id owner_agent access_mode notes
2 cnpj Receita Federal CNPJ identity P0 loaded implemented loaded monthly true https://dadosabertos.rfb.gov.br/CNPJ/ cnpj Agent A file http://dadosabertos.rfb.gov.br
3 tse TSE elections and donations electoral P0 loaded implemented loaded biennial true https://dadosabertos.tse.jus.br/ tse Agent E file Core electoral data loaded
4 transparencia Portal da Transparencia contracts contracts P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados transparencia Agent C file Federal contracts and servants
5 sanctions CEIS CNEP sanctions sanctions P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/sancoes/consulta sanctions Agent C file Administrative sanctions
6 pep_cgu CGU PEP list integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/pep pep_cgu Agent A file PEP baseline
7 bndes BNDES financings finance P1 loaded implemented loaded monthly true https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados bndes Agent G file Loan relationships
8 pgfn PGFN divida ativa fiscal P0 loaded implemented loaded monthly true https://www.regularize.pgfn.gov.br/dados-abertos pgfn Agent C file Debt risk core
9 ibama IBAMA embargos environment P1 loaded implemented loaded monthly true https://servicos.ibama.gov.br/ctf/publico/areasembargadas/ ibama Agent F file Environmental enforcement
10 comprasnet ComprasNet contracts contracts P0 stale implemented partial monthly true https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos comprasnet Agent C file Needs freshness backfill
11 tcu TCU sanctions audit P1 loaded implemented loaded monthly true https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS tcu Agent C file Inidoneidade sanctions
12 transferegov TransfereGov emendas e convenios transfers P0 loaded implemented loaded monthly true https://www.transferegov.sistema.gov.br/portal/download-de-dados transferegov Agent C file Transfer relationships
13 rais RAIS aggregated labor labor P1 loaded implemented loaded annual true https://basedosdados.org/dataset/br-me-rais rais Agent H bigquery Aggregate mode only
14 inep INEP school census education P2 loaded implemented loaded annual true https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar inep Agent H file Education coverage
15 dou Diario Oficial da Uniao gazette P0 loaded implemented loaded daily true https://www.in.gov.br/leiturajornal dou Agent E bigquery National acts ingestion
16 datasus DATASUS CNES health P1 loaded implemented loaded monthly true https://opendatasus.saude.gov.br/ datasus Agent H file Health establishments
17 icij ICIJ offshore leaks offshore P1 loaded implemented loaded yearly true https://offshoreleaks.icij.org/pages/database icij Agent G file Offshore entities and officers
18 opensanctions OpenSanctions global PEP sanctions P1 loaded implemented loaded monthly true https://www.opensanctions.org/datasets/peps/ opensanctions Agent G file Global PEP matching
19 cvm CVM proceedings market P1 loaded implemented loaded monthly true https://dados.cvm.gov.br/ cvm Agent G file Proceedings loaded
20 cvm_funds CVM fund registry market P1 loaded implemented loaded monthly true https://dados.cvm.gov.br/dados/FI/ cvm_funds Agent G file Fund baseline
21 camara Camara CEAP expenses legislative P1 loaded implemented loaded monthly true https://dadosabertos.camara.leg.br/ camara Agent E api Expense reimbursement
22 camara_inquiries Camara inquiries and requirements legislative P0 partial implemented partial daily true https://dadosabertos.camara.leg.br/ camara_inquiries Agent E api Sessions still low
23 senado Senado CEAPS expenses legislative P1 loaded implemented loaded monthly true https://www12.senado.leg.br/dados-abertos senado Agent E api Expense data loaded
24 ceaf CEAF expelled servants integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/ceaf ceaf Agent A file Expulsion evidence
25 cepim CEPIM barred NGOs integrity P1 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/cepim cepim Agent A file NGO restrictions
26 cpgf CPGF gov card expenses spending P2 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/cpgf cpgf Agent H file Masked CPF source
27 leniency Acordos de leniencia integrity P0 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia leniency Agent A file High signal low volume
28 ofac OFAC sanctions sanctions P1 loaded implemented loaded monthly true https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files ofac Agent G file International sanctions
29 holdings Brasil IO holdings ownership P1 loaded implemented loaded monthly true https://brasil.io/dataset/socios-brasil/holding/ holdings Agent G file Ownership enrichment
30 viagens Viagens a servico spending P2 loaded implemented loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/viagens viagens Agent H file Travel spend baseline
31 siop SIOP emendas budget P0 partial implemented partial annual true https://www.siop.planejamento.gov.br/ siop Agent C api Author linkage limited
32 pncp PNCP bids and contracts contracts P0 stale implemented partial monthly true https://pncp.gov.br/api/consulta/v1/ pncp Agent C api Freshness SLA pending
33 renuncias Renuncias fiscais fiscal P1 loaded implemented loaded annual true https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos renuncias Agent G file Tax waiver baseline
34 siconfi SICONFI municipal finance fiscal P1 partial implemented partial annual true https://apidatalake.tesouro.gov.br/docs/siconfi/ siconfi Agent C api No CNPJ direct links
35 tse_bens TSE candidate assets electoral P1 loaded implemented loaded biennial true https://dadosabertos.tse.jus.br/dataset/bens-candidato tse_bens Agent E file Patrimony baseline
36 tse_filiados TSE party memberships electoral P1 loaded implemented loaded monthly true https://dadosabertos.tse.jus.br/dataset/filiados-partidos tse_filiados Agent E file Party network
37 bcb BCB penalties finance P1 loaded implemented loaded monthly true https://dadosabertos.bcb.gov.br/dataset/penalidades bcb Agent G file Bank penalties loaded
38 stf STF court data judiciary P1 loaded implemented loaded monthly true https://basedosdados.org/dataset/br-stf-corte-aberta stf Agent D bigquery Supreme court coverage
39 caged CAGED labor movements labor P1 stale implemented partial monthly true https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/ caged Agent H file Aggregate-only implementation
40 eu_sanctions EU sanctions sanctions P1 loaded implemented loaded monthly true https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions eu_sanctions Agent G file International sanctions
41 un_sanctions UN sanctions sanctions P1 loaded implemented loaded monthly true https://scsanctions.un.org/resources/xml/en/consolidated.xml un_sanctions Agent G file International sanctions
42 world_bank World Bank debarment sanctions P1 loaded implemented loaded monthly true https://www.worldbank.org/en/projects-operations/procurement/debarred-firms world_bank Agent G file International sanctions
43 senado_cpis Senado CPIs legislative P0 partial implemented partial yearly true https://www12.senado.leg.br/dados-abertos senado_cpis Agent E api Needs richer sessions and requirements
44 mides MiDES municipal procurement municipal P0 loaded implemented loaded daily true https://basedosdados.org/dataset/world-wb-mides mides Agent H bigquery Operational after access fix
45 querido_diario Querido Diario gazettes municipal P1 partial implemented partial daily true https://queridodiario.ok.org.br/api querido_diario Agent H api Text availability gap
46 datajud CNJ DataJud judiciary P0 blocked_external implemented not_loaded monthly true https://api-publica.datajud.cnj.jus.br/ datajud Agent D api Credentials not fully operational in prod
47 bolsa_familia_bpc Bolsa Familia and BPC social P3 not_built not_implemented not_loaded monthly true https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos Agent H file High volume masked identities
48 estban BCB ESTBAN balances finance P3 not_built not_implemented not_loaded monthly true https://dadosabertos.bcb.gov.br/dataset/estban Agent G file Banking aggregates
49 if_data BCB IF data indicators finance P3 not_built not_implemented not_loaded quarterly true https://dadosabertos.bcb.gov.br/dataset/if-data Agent G file Institution KPIs
50 bcb_liquidacao BCB bank liquidation acts finance P2 not_built not_implemented not_loaded monthly true https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao Agent G file Regulatory actions
51 stj_dados_abertos STJ open data judiciary P1 not_built not_implemented not_loaded monthly true https://dadosabertos.stj.jus.br/ Agent D api Superior court decisions
52 cnciai_improbidade CNCIAI improbidade judiciary P1 not_built not_implemented not_loaded monthly true https://www.cnj.jus.br/sistemas/cnciai/ Agent D api Misconduct convictions
53 carf_tax_appeals CARF tax appeals judiciary P2 not_built not_implemented not_loaded monthly true https://carf.economia.gov.br/dados-abertos Agent D file Tax litigation
54 anp_royalties ANP royalties and fuel regulatory P2 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anp Agent F api Oil and gas royalties
55 aneel_concessions ANEEL concessions regulatory P2 not_built not_implemented not_loaded monthly true https://dadosabertos.aneel.gov.br/ Agent F api Energy concessions
56 anm_mining_rights ANM mining rights regulatory P1 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anm Agent F api Mining rights and permits
57 antt_transport_concessions ANTT concessions regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/antt Agent F api Transport concessions
58 ans_health_plans ANS operators regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/ans Agent H api Health insurance operators
59 anvisa_registrations ANVISA products regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anvisa Agent H api Regulatory registrations
60 anac_aviation_concessions ANAC concessions regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anac Agent F api Aviation contracts
61 antaq_port_contracts ANTAQ contracts regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/antaq Agent F api Port concessions
62 ana_water_grants ANA water grants regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/ana Agent F api Water use rights
63 anatel_telecom_licenses ANATEL licenses regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/anatel Agent G api Telecom operators
64 susep_insurance_market SUSEP insurance market regulatory P3 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/susep Agent G file Insurance entities
65 cvm_full_ownership_chain CVM ownership chains market P1 not_built not_implemented not_loaded monthly true https://dados.cvm.gov.br/ Agent G file Shareholder graph expansion
66 receita_dirbi Receita DIRBI tax P1 not_built not_implemented not_loaded monthly true https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi Agent G file Tax benefit declarations
67 mapbiomas_alertas MapBiomas Alerta environment P1 not_built not_implemented not_loaded monthly true https://alerta.mapbiomas.org/api Agent F api Deforestation alerts
68 sicar_rural_registry SiCAR rural registry environment P1 not_built not_implemented not_loaded quarterly true https://www.car.gov.br/publico/municipios/downloads Agent F file Property boundaries and owners
69 icmbio_cnuc ICMBio CNUC units environment P2 not_built not_implemented not_loaded monthly true https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao Agent F file Protected areas
70 tesouro_emendas Tesouro emendas budget P0 not_built not_implemented not_loaded monthly true https://www.tesourotransparente.gov.br/ Agent C file Budget execution
71 siga_brasil SIGA Brasil budget P0 not_built not_implemented not_loaded monthly true https://www12.senado.leg.br/orcamento/sigabrasil Agent C file Federal budget traces
72 camara_votes_bills Camara votes and bills legislative P1 not_built not_implemented not_loaded daily true https://dadosabertos.camara.leg.br/api/v2 Agent E api Legislative behavior
73 senado_votes_bills Senado votes and bills legislative P1 not_built not_implemented not_loaded daily true https://legis.senado.leg.br/dadosabertos Agent E api Legislative behavior
74 interpol_red_notices Interpol red notices international P2 not_built not_implemented not_loaded weekly true https://www.interpol.int/How-we-work/Notices/Red-Notices Agent G api Requires key
75 tce_sp TCE Sao Paulo state P2 not_built not_implemented not_loaded monthly true https://transparencia.tce.sp.gov.br/ Agent H api State audit procurement
76 tce_pe TCE Pernambuco state P2 not_built not_implemented not_loaded monthly true https://sistemas.tce.pe.gov.br/ Agent H api State audit procurement
77 tce_rj TCE Rio de Janeiro state P2 not_built not_implemented not_loaded monthly true https://dados.tce.rj.gov.br/ Agent H api State audit procurement
78 tce_rs TCE Rio Grande do Sul state P2 not_built not_implemented not_loaded monthly true https://portal.tce.rs.gov.br/ Agent H file State audit procurement
79 tce_mg TCE Minas Gerais state P2 not_built not_implemented not_loaded monthly true https://www.tce.mg.gov.br/ Agent H web State audit procurement
80 tce_ba TCE Bahia state P3 not_built not_implemented not_loaded monthly true https://www.tce.ba.gov.br/ Agent H web State audit procurement
81 tce_ce TCE Ceara state P3 not_built not_implemented not_loaded monthly true https://www.tce.ce.gov.br/ Agent H web State audit procurement
82 tce_go TCE Goias state P3 not_built not_implemented not_loaded monthly true https://portal.tce.go.gov.br/ Agent H web State audit procurement
83 tce_pr TCE Parana state P3 not_built not_implemented not_loaded monthly true https://www1.tce.pr.gov.br/ Agent H web State audit procurement
84 tce_sc TCE Santa Catarina state P3 not_built not_implemented not_loaded monthly true https://www.tcesc.tc.br/ Agent H web State audit procurement
85 tce_es TCE Espirito Santo state P3 not_built not_implemented not_loaded monthly true https://www.tcees.tc.br/ Agent H web State audit procurement
86 tce_mt TCE Mato Grosso state P3 not_built not_implemented not_loaded monthly true https://www.tce.mt.gov.br/ Agent H web State audit procurement
87 tce_ms TCE Mato Grosso do Sul state P3 not_built not_implemented not_loaded monthly true https://www.tce.ms.gov.br/ Agent H web State audit procurement
88 tce_am TCE Amazonas state P3 not_built not_implemented not_loaded monthly true https://www.tce.am.gov.br/ Agent H web State audit procurement
89 tce_pa TCE Para state P3 not_built not_implemented not_loaded monthly true https://www.tcepa.tc.br/ Agent H web State audit procurement
90 tce_ro TCE Rondonia state P3 not_built not_implemented not_loaded monthly true https://www.tce.ro.gov.br/ Agent H web State audit procurement
91 tce_rr TCE Roraima state P3 not_built not_implemented not_loaded monthly true https://www.tcerr.tc.br/ Agent H web State audit procurement
92 tce_ap TCE Amapa state P3 not_built not_implemented not_loaded monthly true https://www.tce.ap.gov.br/ Agent H web State audit procurement
93 tce_to TCE Tocantins state P3 not_built not_implemented not_loaded monthly true https://www.tceto.tc.br/ Agent H web State audit procurement
94 tce_ma TCE Maranhao state P3 not_built not_implemented not_loaded monthly true https://www.tcema.tc.br/ Agent H web State audit procurement
95 tce_pi TCE Piaui state P3 not_built not_implemented not_loaded monthly true https://www.tce.pi.gov.br/ Agent H web State audit procurement
96 tce_rn TCE Rio Grande do Norte state P3 not_built not_implemented not_loaded monthly true https://www.tce.rn.gov.br/ Agent H web State audit procurement
97 tce_pb TCE Paraiba state P3 not_built not_implemented not_loaded monthly true https://tce.pb.gov.br/ Agent H web State audit procurement
98 tce_al TCE Alagoas state P3 not_built not_implemented not_loaded monthly true https://www.tceal.tc.br/ Agent H web State audit procurement
99 tce_se TCE Sergipe state P3 not_built not_implemented not_loaded monthly true https://www.tce.se.gov.br/ Agent H web State audit procurement
100 state_portal_sp Sao Paulo transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.sp.gov.br/ Agent H api State expenses and contracts
101 state_portal_mg Minas Gerais transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.mg.gov.br/ Agent H web State expenses and contracts
102 state_portal_ba Bahia transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.ba.gov.br/ Agent H web State expenses and contracts
103 state_portal_ce Ceara transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.ce.gov.br/ Agent H web State expenses and contracts
104 state_portal_go Goias transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.go.gov.br/ Agent H web State expenses and contracts
105 state_portal_pr Parana transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.pr.gov.br/ Agent H web State expenses and contracts
106 state_portal_sc Santa Catarina transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.sc.gov.br/ Agent H web State expenses and contracts
107 state_portal_rs Rio Grande do Sul transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.rs.gov.br/ Agent H web State expenses and contracts
108 state_portal_pe Pernambuco transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.pe.gov.br/ Agent H web State expenses and contracts
109 state_portal_rj Rio de Janeiro transparency portal state P2 not_built not_implemented not_loaded monthly true https://www.transparencia.rj.gov.br/ Agent H web State expenses and contracts

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""Validate Brazil source registry completeness and code alignment."""
from __future__ import annotations
import argparse
import csv
import json
import re
from collections import Counter
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
# Header names the registry CSV must provide; read_registry reports any that
# are absent instead of returning rows.
REQUIRED_COLUMNS = {
    "source_id", "name", "category", "tier",
    "status", "implementation_state", "load_state", "frequency",
    "in_universe_v1", "primary_url", "pipeline_id", "owner_agent",
    "access_mode", "notes",
}

# Accepted values for the `status` column.
VALID_STATUS = {"loaded", "partial", "stale", "blocked_external", "quality_fail", "not_built"}

# Accepted values for the `implementation_state` column.
VALID_IMPLEMENTATION = {"implemented", "not_implemented"}

# Accepted values for the `load_state` column.
VALID_LOAD_STATE = {"loaded", "partial", "not_loaded"}

# Matches a single `"pipeline_id": SomeName,` line (trailing comma required)
# and captures the lowercase pipeline id in group 1.
PIPELINE_ENTRY_RE = re.compile(r'^\s*"([a-z0-9_]+)":\s*[A-Za-z_][A-Za-z0-9_]*,\s*$')
@dataclass(frozen=True)
class GateResult:
    """Immutable outcome of a single validation gate."""

    # Identifier of the gate, used as its label in the generated reports.
    name: str
    # True when the gate's check succeeded.
    passed: bool
    # Human-readable explanation: "ok" or the offending values/counts.
    details: str
def parse_bool(value: str) -> bool:
    """Interpret a registry cell as a boolean flag.

    Surrounding whitespace and letter case are ignored. ``1``, ``true``,
    ``yes`` and ``y`` count as true; every other value counts as false.
    """
    truthy = ("1", "true", "yes", "y")
    normalized = value.strip().lower()
    return normalized in truthy
def read_registry(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Load the source registry CSV into a list of row dicts.

    Args:
        path: Location of the registry CSV file.

    Returns:
        A ``(rows, errors)`` pair. On success ``rows`` holds one dict per data
        row keyed by header name and ``errors`` is empty. When the file does
        not exist, or its header lacks any of ``REQUIRED_COLUMNS``, ``rows`` is
        empty and ``errors`` carries a single message describing the problem.
    """
    if not path.exists():
        return [], [f"registry file not found: {path}"]

    with path.open(encoding="utf-8", newline="") as csv_file:
        # restval="" pads rows that have fewer cells than the header with empty
        # strings. The DictReader default (None) would make the later
        # row[...].strip() calls in the gate builder raise AttributeError on a
        # truncated line instead of reporting a gate failure.
        reader = csv.DictReader(csv_file, restval="")
        missing_cols = REQUIRED_COLUMNS.difference(reader.fieldnames or [])
        if missing_cols:
            return [], [f"missing required columns: {sorted(missing_cols)}"]
        rows = list(reader)
    return rows, []
def parse_runner_pipelines(path: Path) -> tuple[set[str], list[str]]:
    """Extract the pipeline ids declared in the ETL runner's PIPELINES map.

    The file is scanned line by line. Collection starts after a line that
    begins with ``PIPELINES: dict[str, type] = {`` and stops at the first
    following line whose stripped content is ``}``. In between, every line
    matching ``PIPELINE_ENTRY_RE`` contributes its captured id.

    Args:
        path: Location of the runner source file.

    Returns:
        A ``(pipeline_ids, errors)`` pair. ``errors`` is empty on success;
        when the file is missing or no id could be extracted, the id set is
        empty and ``errors`` holds one message.
    """
    if not path.exists():
        return set(), [f"runner file not found: {path}"]

    map_header = "PIPELINES: dict[str, type] = {"
    found: set[str] = set()
    in_map = False
    with path.open(encoding="utf-8") as runner_file:
        for raw_line in runner_file:
            text = raw_line.rstrip("\n")
            if text.startswith(map_header):
                in_map = True
            elif in_map:
                # A lone closing brace ends the map literal.
                if text.strip() == "}":
                    break
                entry = PIPELINE_ENTRY_RE.match(text)
                if entry is not None:
                    found.add(entry.group(1))

    if found:
        return found, []
    return set(), ["could not parse pipeline ids from runner"]
def build_gate_results(
    rows: list[dict[str, str]],
    runner_pipelines: set[str],
    expected_universe: int,
    expected_implemented: int,
) -> tuple[list[GateResult], dict[str, int], dict[str, int]]:
    """Evaluate every completeness gate against the registry rows.

    Args:
        rows: Registry rows as returned by ``read_registry``.
        runner_pipelines: Pipeline ids parsed from the ETL runner.
        expected_universe: Required number of rows with a true
            ``in_universe_v1`` flag.
        expected_implemented: Required number of those rows whose
            ``implementation_state`` is ``implemented``.

    Returns:
        ``(gates, status_counter, implementation_counter)``: the gate results
        in evaluation order, plus per-value counts of ``status`` and
        ``implementation_state`` over the in-universe rows.
    """

    def unknown_values(column: str, allowed: set[str]) -> list[str]:
        # Distinct, non-empty, stripped values of `column` outside `allowed`.
        distinct = {row[column].strip() for row in rows}
        return sorted(value for value in distinct if value and value not in allowed)

    def describe(label: str, offenders: list[str]) -> str:
        # "ok" for a clean gate, otherwise "<label>=<offending ids>".
        return f"{label}={offenders}" if offenders else "ok"

    id_counts = Counter(row["source_id"].strip() for row in rows)
    duplicate_ids = [source_id for source_id, seen in id_counts.items() if seen > 1]

    invalid_status = unknown_values("status", VALID_STATUS)
    invalid_implementation = unknown_values("implementation_state", VALID_IMPLEMENTATION)
    invalid_load_state = unknown_values("load_state", VALID_LOAD_STATE)

    # Counts and runner alignment only consider rows flagged as in the v1 universe.
    universe_rows = [row for row in rows if parse_bool(row["in_universe_v1"])]
    implemented_rows = [
        row for row in universe_rows if row["implementation_state"].strip() == "implemented"
    ]
    implemented_ids = {row["source_id"].strip() for row in implemented_rows}

    status_counter = dict(Counter(row["status"].strip() for row in universe_rows))
    implementation_counter = dict(
        Counter(row["implementation_state"].strip() for row in universe_rows)
    )

    # Both directions of the registry <-> runner comparison, by source id.
    missing_from_registry = sorted(runner_pipelines - implemented_ids)
    not_in_runner = sorted(implemented_ids - runner_pipelines)

    values_are_valid = not (invalid_status or invalid_implementation or invalid_load_state)
    universe_total = len(universe_rows)
    implemented_total = len(implemented_rows)

    gates = [
        GateResult(
            "registry_has_no_duplicate_source_ids",
            not duplicate_ids,
            describe("duplicates", duplicate_ids),
        ),
        GateResult(
            "registry_values_are_valid",
            values_are_valid,
            f"invalid_status={invalid_status}; "
            f"invalid_implementation={invalid_implementation}; "
            f"invalid_load_state={invalid_load_state}",
        ),
        GateResult(
            "universe_v1_count_matches_expected",
            universe_total == expected_universe,
            f"actual={universe_total} expected={expected_universe}",
        ),
        GateResult(
            "implemented_count_matches_expected",
            implemented_total == expected_implemented,
            f"actual={implemented_total} expected={expected_implemented}",
        ),
        GateResult(
            "runner_pipelines_are_all_marked_implemented",
            not missing_from_registry,
            describe("missing", missing_from_registry),
        ),
        GateResult(
            "implemented_registry_ids_exist_in_runner",
            not not_in_runner,
            describe("extra", not_in_runner),
        ),
    ]
    return gates, status_counter, implementation_counter
def write_outputs(
output_dir: Path,
registry_path: Path,
runner_path: Path,
expected_universe: int,
expected_implemented: int,
gates: list[GateResult],
status_counter: dict[str, int],
implementation_counter: dict[str, int],
) -> None:
output_dir.mkdir(parents=True, exist_ok=True)
payload = {
"timestamp_utc": datetime.now(UTC).isoformat(),
"registry_path": str(registry_path),
"runner_path": str(runner_path),
"expected_universe_v1": expected_universe,
"expected_implemented": expected_implemented,
"status_counter": status_counter,
"implementation_counter": implementation_counter,
"gates": [gate.__dict__ for gate in gates],
"all_passed": all(gate.passed for gate in gates),
}
(output_dir / "source_completeness_report.json").write_text(
json.dumps(payload, indent=2, ensure_ascii=True) + "\n",
encoding="utf-8",
)
lines = [
"# Source Completeness Gate Report",
"",
f"- Timestamp (UTC): `{payload['timestamp_utc']}`",
f"- Registry: `{registry_path}`",
f"- Runner: `{runner_path}`",
f"- Expected universe_v1: `{expected_universe}`",
f"- Expected implemented: `{expected_implemented}`",
"",
"## Counters",
"",
f"- status_counter: `{status_counter}`",
f"- implementation_counter: `{implementation_counter}`",
"",
"## Gate Results",
"",
]
for gate in gates:
mark = "PASS" if gate.passed else "FAIL"
lines.append(f"- `{mark}` `{gate.name}`: {gate.details}")
lines.append("")
lines.append(f"## Final: `{'PASS' if payload['all_passed'] else 'FAIL'}`")
lines.append("")
(output_dir / "source_completeness_report.md").write_text(
"\n".join(lines), encoding="utf-8"
)
def main(argv: list[str] | None = None) -> int:
    """Run the source registry completeness gates from the command line.

    Reads the registry CSV and the ETL runner, evaluates the gates, writes the
    JSON/Markdown reports and prints ``PASS`` or ``FAIL``. If either input
    cannot be read or parsed, only a Markdown report listing the errors is
    written and the errors are printed.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, so the script
            behaves exactly as before when called without arguments.

    Returns:
        ``0`` when every gate passes, ``1`` when any gate fails or an input
        could not be loaded.
    """
    parser = argparse.ArgumentParser(
        description="Validate source registry completeness and code alignment."
    )
    parser.add_argument(
        "--registry-path",
        default="docs/source_registry_br_v1.csv",
        help="Path to source registry CSV",
    )
    parser.add_argument(
        "--runner-path",
        default="etl/src/icarus_etl/runner.py",
        help="Path to ETL runner with PIPELINES map",
    )
    parser.add_argument(
        "--expected-universe-v1",
        type=int,
        default=108,
        help="Expected count for in_universe_v1=true rows",
    )
    parser.add_argument(
        "--expected-implemented",
        type=int,
        default=45,
        help="Expected count for implementation_state=implemented rows",
    )
    parser.add_argument(
        "--output-dir",
        # Timestamped default so repeated runs do not overwrite earlier reports.
        default=f"audit-results/brazil-coverage-{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}",
        help="Directory for gate reports",
    )
    args = parser.parse_args(argv)

    registry_path = Path(args.registry_path)
    runner_path = Path(args.runner_path)
    output_dir = Path(args.output_dir)

    rows, registry_errors = read_registry(registry_path)
    runner_pipelines, runner_errors = parse_runner_pipelines(runner_path)

    if registry_errors or runner_errors:
        # Inputs are unusable: record why and fail without evaluating gates.
        output_dir.mkdir(parents=True, exist_ok=True)
        combined = registry_errors + runner_errors
        (output_dir / "source_completeness_report.md").write_text(
            "# Source Completeness Gate Report\n\n"
            + "\n".join(f"- FAIL: {msg}" for msg in combined)
            + "\n",
            encoding="utf-8",
        )
        print("\n".join(combined))
        return 1

    gates, status_counter, implementation_counter = build_gate_results(
        rows=rows,
        runner_pipelines=runner_pipelines,
        expected_universe=args.expected_universe_v1,
        expected_implemented=args.expected_implemented,
    )
    write_outputs(
        output_dir=output_dir,
        registry_path=registry_path,
        runner_path=runner_path,
        expected_universe=args.expected_universe_v1,
        expected_implemented=args.expected_implemented,
        gates=gates,
        status_counter=status_counter,
        implementation_counter=implementation_counter,
    )

    all_passed = all(gate.passed for gate in gates)
    print("PASS" if all_passed else "FAIL")
    return 0 if all_passed else 1
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)