mirror of
https://github.com/kharonsec/br-acc
synced 2026-04-25 17:15:02 +02:00
Add source registry governance and dynamic meta source stats (#6)
Co-authored-by: bruno cesar <brunoclz@brunos-MacBook-Pro.local>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -49,6 +49,7 @@ htmlcov/
|
||||
|
||||
# Data files (too large for git)
|
||||
*.csv
|
||||
!docs/source_registry_br_v1.csv
|
||||
*.tsv
|
||||
*.parquet
|
||||
*.dbc
|
||||
|
||||
@@ -6,6 +6,7 @@ from neo4j import AsyncSession
|
||||
|
||||
from icarus.dependencies import get_session
|
||||
from icarus.services.neo4j_service import execute_query_single
|
||||
from icarus.services.source_registry import load_source_registry, source_registry_summary
|
||||
|
||||
router = APIRouter(prefix="/api/v1/meta", tags=["meta"])
|
||||
|
||||
@@ -33,6 +34,9 @@ async def database_stats(
|
||||
return _stats_cache
|
||||
|
||||
record = await execute_query_single(session, "meta_stats", {})
|
||||
source_entries = load_source_registry()
|
||||
source_summary = source_registry_summary(source_entries)
|
||||
|
||||
result = {
|
||||
"total_nodes": record["total_nodes"] if record else 0,
|
||||
"total_relationships": record["total_relationships"] if record else 0,
|
||||
@@ -77,7 +81,12 @@ async def database_stats(
|
||||
"municipal_bid_count": record["municipal_bid_count"] if record else 0,
|
||||
"municipal_contract_count": record["municipal_contract_count"] if record else 0,
|
||||
"municipal_gazette_act_count": record["municipal_gazette_act_count"] if record else 0,
|
||||
"data_sources": 45,
|
||||
"data_sources": source_summary["universe_v1_sources"],
|
||||
"implemented_sources": source_summary["implemented_sources"],
|
||||
"loaded_sources": source_summary["loaded_sources"],
|
||||
"stale_sources": source_summary["stale_sources"],
|
||||
"blocked_external_sources": source_summary["blocked_external_sources"],
|
||||
"quality_fail_sources": source_summary["quality_fail_sources"],
|
||||
}
|
||||
|
||||
_stats_cache = result
|
||||
@@ -86,77 +95,6 @@ async def database_stats(
|
||||
|
||||
|
||||
@router.get("/sources")
|
||||
async def list_sources() -> dict[str, list[dict[str, str]]]:
|
||||
return {
|
||||
"sources": [
|
||||
{"id": "cnpj", "name": "Receita Federal (CNPJ)", "frequency": "monthly"},
|
||||
{"id": "tse", "name": "Tribunal Superior Eleitoral", "frequency": "biennial"},
|
||||
{"id": "transparencia", "name": "Portal da Transparência", "frequency": "monthly"},
|
||||
{"id": "ceis", "name": "CEIS/CNEP/CEPIM/CEAF", "frequency": "monthly"},
|
||||
{"id": "cnes", "name": "CNES/DATASUS", "frequency": "monthly"},
|
||||
{"id": "bndes", "name": "BNDES (Empréstimos)", "frequency": "monthly"},
|
||||
{"id": "pgfn", "name": "PGFN (Dívida Ativa)", "frequency": "monthly"},
|
||||
{"id": "ibama", "name": "IBAMA (Embargos)", "frequency": "monthly"},
|
||||
{"id": "comprasnet", "name": "ComprasNet/PNCP", "frequency": "monthly"},
|
||||
{"id": "tcu", "name": "TCU (Sanções)", "frequency": "monthly"},
|
||||
{"id": "transferegov", "name": "TransfereGov (Convênios)", "frequency": "monthly"},
|
||||
{"id": "rais", "name": "RAIS (Estatísticas Trabalhistas)", "frequency": "annual"},
|
||||
{"id": "inep", "name": "INEP (Censo Educação)", "frequency": "annual"},
|
||||
{"id": "dou", "name": "Diário Oficial da União", "frequency": "daily"},
|
||||
{"id": "icij", "name": "ICIJ Offshore Leaks", "frequency": "yearly"},
|
||||
{"id": "opensanctions", "name": "OpenSanctions (PEPs globais)", "frequency": "monthly"},
|
||||
{"id": "cvm", "name": "CVM (Processos Sancionadores)", "frequency": "monthly"},
|
||||
{"id": "camara", "name": "Câmara dos Deputados (CEAP)", "frequency": "monthly"},
|
||||
{"id": "senado", "name": "Senado Federal (CEAPS)", "frequency": "monthly"},
|
||||
{"id": "pep_cgu", "name": "CGU PEP (Pessoas Expostas)", "frequency": "monthly"},
|
||||
{"id": "ceaf", "name": "CEAF (Servidores Expulsos)", "frequency": "monthly"},
|
||||
{"id": "leniency", "name": "Acordos de Leniência", "frequency": "monthly"},
|
||||
{"id": "ofac", "name": "OFAC SDN (Sanções Internacionais)", "frequency": "monthly"},
|
||||
{"id": "holdings", "name": "Brasil.IO (Holdings Empresariais)", "frequency": "monthly"},
|
||||
{"id": "cpgf", "name": "CPGF (Cartão de Pagamento)", "frequency": "monthly"},
|
||||
{"id": "viagens", "name": "Viagens a Serviço", "frequency": "monthly"},
|
||||
{"id": "siop", "name": "SIOP (Emendas Parlamentares)", "frequency": "annual"},
|
||||
{"id": "pncp", "name": "PNCP (Licitações)", "frequency": "monthly"},
|
||||
{"id": "cvm_funds", "name": "CVM (Fundos de Investimento)", "frequency": "monthly"},
|
||||
{"id": "renuncias", "name": "Renúncias Fiscais", "frequency": "annual"},
|
||||
{"id": "siconfi", "name": "SICONFI (Finanças Municipais)", "frequency": "annual"},
|
||||
{"id": "tse_bens", "name": "TSE Bens Declarados", "frequency": "biennial"},
|
||||
{"id": "tse_filiados", "name": "TSE Filiação Partidária", "frequency": "monthly"},
|
||||
{"id": "cepim", "name": "CEPIM (ONGs Impedidas)", "frequency": "monthly"},
|
||||
{"id": "bcb", "name": "BCB (Penalidades Bancárias)", "frequency": "monthly"},
|
||||
{"id": "caged", "name": "CAGED (Movimentações Trabalhistas)", "frequency": "monthly"},
|
||||
{"id": "stf", "name": "STF (Decisões Corte Aberta)", "frequency": "monthly"},
|
||||
{"id": "eu_sanctions", "name": "EU (Sanções Europeias)", "frequency": "monthly"},
|
||||
{
|
||||
"id": "un_sanctions",
|
||||
"name": "ONU (Sanções do Conselho de Segurança)",
|
||||
"frequency": "monthly",
|
||||
},
|
||||
{
|
||||
"id": "world_bank",
|
||||
"name": "Banco Mundial (Firmas Impedidas)",
|
||||
"frequency": "monthly",
|
||||
},
|
||||
{"id": "senado_cpis", "name": "Senado CPIs", "frequency": "yearly"},
|
||||
{
|
||||
"id": "camara_inquiries",
|
||||
"name": "Câmara (CPIs/CPMIs e Requerimentos)",
|
||||
"frequency": "daily",
|
||||
},
|
||||
{
|
||||
"id": "mides",
|
||||
"name": "MiDES (Licitações/Contratos Municipais)",
|
||||
"frequency": "daily",
|
||||
},
|
||||
{
|
||||
"id": "querido_diario",
|
||||
"name": "Querido Diário (Atos Municipais)",
|
||||
"frequency": "daily",
|
||||
},
|
||||
{
|
||||
"id": "datajud",
|
||||
"name": "CNJ DataJud (Processos Judiciais)",
|
||||
"frequency": "monthly",
|
||||
},
|
||||
]
|
||||
}
|
||||
async def list_sources() -> dict[str, list[dict[str, Any]]]:
    """Return the public view of every registry source flagged as part of universe v1.

    The registry is re-read on each call and entries whose ``in_universe_v1``
    flag is false are left out of the response.
    """
    public_entries: list[dict[str, Any]] = []
    for entry in load_source_registry():
        if entry.in_universe_v1:
            public_entries.append(entry.to_public_dict())
    return {"sources": public_entries}
|
||||
|
||||
107
api/src/icarus/services/source_registry.py
Normal file
107
api/src/icarus/services/source_registry.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import csv
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourceRegistryEntry:
    """Immutable record describing one data source listed in the source registry."""

    id: str
    name: str
    category: str
    tier: str
    status: str
    implementation_state: str
    load_state: str
    frequency: str
    in_universe_v1: bool
    primary_url: str
    pipeline_id: str
    owner_agent: str
    access_mode: str
    notes: str

    def to_public_dict(self) -> dict[str, Any]:
        """Return the entry as a plain dictionary keyed by field name.

        The exported keys are listed explicitly so the key order of the
        payload stays fixed and a future field is not exposed by accident.
        """
        exported_fields = (
            "id",
            "name",
            "category",
            "tier",
            "status",
            "implementation_state",
            "load_state",
            "frequency",
            "in_universe_v1",
            "primary_url",
            "pipeline_id",
            "owner_agent",
            "access_mode",
            "notes",
        )
        return {field_name: getattr(self, field_name) for field_name in exported_fields}
|
||||
|
||||
|
||||
def _str_to_bool(value: str) -> bool:
|
||||
return value.strip().lower() in {"1", "true", "yes", "y"}
|
||||
|
||||
|
||||
def _default_registry_path() -> Path:
    """Return the path of the registry CSV kept under the repository's ``docs`` folder."""
    # This module lives at .../api/src/icarus/services/source_registry.py, so the
    # fifth ancestor (parents[4]) of the resolved file path is the repository root.
    module_file = Path(__file__).resolve()
    repo_root = module_file.parents[4]
    return repo_root.joinpath("docs", "source_registry_br_v1.csv")
|
||||
|
||||
|
||||
def get_registry_path() -> Path:
    """Decide which registry CSV file should be read.

    A non-blank ``ICARUS_SOURCE_REGISTRY_PATH`` environment variable takes
    precedence; otherwise the default path is returned.
    """
    override = os.getenv("ICARUS_SOURCE_REGISTRY_PATH", "").strip()
    if override:
        return Path(override)
    return _default_registry_path()
|
||||
|
||||
|
||||
def load_source_registry() -> list[SourceRegistryEntry]:
    """Load the source registry CSV into a list of entries sorted by id.

    Returns:
        One ``SourceRegistryEntry`` per CSV row that carries a ``source_id``,
        ordered by ``id``. An empty list is returned when the registry file
        does not exist.
    """
    registry_path = get_registry_path()
    if not registry_path.exists():
        return []

    def cell(row: dict[str, Any], column: str) -> str:
        # DictReader yields None for columns missing from a short row.
        return (row.get(column) or "").strip()

    entries: list[SourceRegistryEntry] = []
    # "utf-8-sig" strips a leading byte-order mark when one is present and reads
    # BOM-less files unchanged. With plain "utf-8" a BOM would turn the first
    # header into "\ufeffsource_id" and every entry would get an empty id.
    with registry_path.open(encoding="utf-8-sig", newline="") as csv_file:
        for row in csv.DictReader(csv_file):
            source_id = cell(row, "source_id")
            if not source_id:
                # A row without an id cannot be referenced by consumers; skip it.
                continue
            entries.append(
                SourceRegistryEntry(
                    id=source_id,
                    name=cell(row, "name"),
                    category=cell(row, "category"),
                    tier=cell(row, "tier"),
                    status=cell(row, "status"),
                    implementation_state=cell(row, "implementation_state"),
                    load_state=cell(row, "load_state"),
                    frequency=cell(row, "frequency"),
                    in_universe_v1=_str_to_bool(cell(row, "in_universe_v1")),
                    primary_url=cell(row, "primary_url"),
                    pipeline_id=cell(row, "pipeline_id"),
                    owner_agent=cell(row, "owner_agent"),
                    access_mode=cell(row, "access_mode"),
                    notes=cell(row, "notes"),
                )
            )

    entries.sort(key=lambda entry: entry.id)
    return entries
|
||||
|
||||
|
||||
def source_registry_summary(entries: list[SourceRegistryEntry]) -> dict[str, int]:
    """Count registry entries per governance bucket.

    Only entries flagged ``in_universe_v1`` are considered. The result reports
    how many of them there are in total, and how many are implemented, loaded,
    stale, blocked by an external dependency, or failing quality checks.
    """
    in_scope = [entry for entry in entries if entry.in_universe_v1]

    def tally(attribute: str, expected: str) -> int:
        # Number of in-scope entries whose attribute equals the expected value.
        return sum(1 for entry in in_scope if getattr(entry, attribute) == expected)

    return {
        "universe_v1_sources": len(in_scope),
        "implemented_sources": tally("implementation_state", "implemented"),
        "loaded_sources": tally("load_state", "loaded"),
        "stale_sources": tally("status", "stale"),
        "blocked_external_sources": tally("status", "blocked_external"),
        "quality_fail_sources": tally("status", "quality_fail"),
    }
|
||||
@@ -17,7 +17,7 @@ async def test_meta_sources(client: AsyncClient) -> None:
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert "sources" in data
|
||||
assert len(data["sources"]) == 45
|
||||
assert len(data["sources"]) == 108
|
||||
source_ids = [s["id"] for s in data["sources"]]
|
||||
assert "cnpj" in source_ids
|
||||
assert "tse" in source_ids
|
||||
@@ -61,6 +61,11 @@ async def test_meta_sources(client: AsyncClient) -> None:
|
||||
assert "mides" in source_ids
|
||||
assert "querido_diario" in source_ids
|
||||
assert "datajud" in source_ids
|
||||
first = data["sources"][0]
|
||||
assert "status" in first
|
||||
assert "implementation_state" in first
|
||||
assert "load_state" in first
|
||||
assert "in_universe_v1" in first
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
@@ -169,4 +174,9 @@ async def test_meta_stats(client: AsyncClient) -> None:
|
||||
assert data["municipal_bid_count"] == 8_000_000
|
||||
assert data["municipal_contract_count"] == 6_000_000
|
||||
assert data["municipal_gazette_act_count"] == 4_000_000
|
||||
assert data["data_sources"] == 45
|
||||
assert data["data_sources"] == 108
|
||||
assert data["implemented_sources"] == 45
|
||||
assert data["loaded_sources"] >= 1
|
||||
assert data["stale_sources"] >= 0
|
||||
assert data["blocked_external_sources"] >= 0
|
||||
assert data["quality_fail_sources"] >= 0
|
||||
|
||||
60
docs/source_onboarding_contract.md
Normal file
60
docs/source_onboarding_contract.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Source Onboarding Contract (Brazil Coverage v1)
|
||||
|
||||
This contract is mandatory for every new source before `shadow -> promote`.
|
||||
|
||||
## 1. Source Identity
|
||||
- `source_id`:
|
||||
- `name`:
|
||||
- `category`:
|
||||
- `tier`:
|
||||
- `owner_agent`:
|
||||
- `primary_url`:
|
||||
- `access_mode` (`file|api|bigquery|web`):
|
||||
|
||||
## 2. Access and Legal
|
||||
- Credential required:
|
||||
- Secret name/path:
|
||||
- License or usage restriction:
|
||||
- LGPD/privacy considerations:
|
||||
- `blocked_external` criteria:
|
||||
|
||||
## 3. Data Contract
|
||||
- Downloader script: `etl/scripts/download_<source>.py`
|
||||
- Canonical output files:
|
||||
- Manifest file:
|
||||
- Update cadence:
|
||||
- Expected row volume:
|
||||
- Partition/window strategy:
|
||||
|
||||
## 4. Graph Contract
|
||||
- Node labels introduced:
|
||||
- Relationship types introduced:
|
||||
- Natural key(s) per node:
|
||||
- Merge key strategy:
|
||||
- Relationship quality tier (`strong|probable`):
|
||||
- Provenance fields (`method`, `confidence`, `source_ref`, `run_id`):
|
||||
|
||||
## 5. Index and Constraint Contract
|
||||
- Required uniqueness constraints:
|
||||
- Required date indexes:
|
||||
- Required lookup indexes:
|
||||
- Required fulltext indexes (if text-heavy):
|
||||
|
||||
## 6. Quality Gates (Hard Stop/Go)
|
||||
- Identity integrity preserved (`Person.cpf` masked = 0, 14-digit = 0):
|
||||
- Freshness SLA threshold:
|
||||
- Temporal sanity (`<= now + 365d`):
|
||||
- Null/duplicate key thresholds:
|
||||
- Mandatory non-zero nodes/rels:
|
||||
|
||||
## 7. Operational Flow
|
||||
- Shadow load command:
|
||||
- Gate runner commands:
|
||||
- API smoke checks:
|
||||
- Promote command:
|
||||
- Rollback command:
|
||||
|
||||
## 8. Acceptance
|
||||
- Evidence bundle path in `audit-results/`:
|
||||
- Final status: `resolved | resolved_full | blocked_external | quality_fail`
|
||||
- Reviewer sign-off:
|
||||
109
docs/source_registry_br_v1.csv
Normal file
109
docs/source_registry_br_v1.csv
Normal file
@@ -0,0 +1,109 @@
|
||||
source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes
|
||||
cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br
|
||||
tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded
|
||||
transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants
|
||||
sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions
|
||||
pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline
|
||||
bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships
|
||||
pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core
|
||||
ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement
|
||||
comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill
|
||||
tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions
|
||||
transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships
|
||||
rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only
|
||||
inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage
|
||||
dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion
|
||||
datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments
|
||||
icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers
|
||||
opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching
|
||||
cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded
|
||||
cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline
|
||||
camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement
|
||||
camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low
|
||||
senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded
|
||||
ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence
|
||||
cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions
|
||||
cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source
|
||||
leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume
|
||||
ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions
|
||||
holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/holding/,holdings,Agent G,file,Ownership enrichment
|
||||
viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline
|
||||
siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited
|
||||
pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/,pncp,Agent C,api,Freshness SLA pending
|
||||
renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline
|
||||
siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links
|
||||
tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/dataset/bens-candidato,tse_bens,Agent E,file,Patrimony baseline
|
||||
tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/dataset/filiados-partidos,tse_filiados,Agent E,file,Party network
|
||||
bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/penalidades,bcb,Agent G,file,Bank penalties loaded
|
||||
stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage
|
||||
caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation
|
||||
eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions
|
||||
un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions
|
||||
world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions
|
||||
senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements
|
||||
mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix
|
||||
querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap
|
||||
datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod
|
||||
bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities
|
||||
estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/estban,,Agent G,file,Banking aggregates
|
||||
if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/dataset/if-data,,Agent G,file,Institution KPIs
|
||||
bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao,,Agent G,file,Regulatory actions
|
||||
stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions
|
||||
cnciai_improbidade,CNCIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/cnciai/,,Agent D,api,Misconduct convictions
|
||||
carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation
|
||||
anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties
|
||||
aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions
|
||||
anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits
|
||||
antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions
|
||||
ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators
|
||||
anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations
|
||||
anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts
|
||||
antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions
|
||||
ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights
|
||||
anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators
|
||||
susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities
|
||||
cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion
|
||||
receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations
|
||||
mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts
|
||||
sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners
|
||||
icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao,,Agent F,file,Protected areas
|
||||
tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution
|
||||
siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces
|
||||
camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior
|
||||
senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior
|
||||
interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key
|
||||
tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement
|
||||
tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement
|
||||
tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement
|
||||
tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.rs.gov.br/,,Agent H,file,State audit procurement
|
||||
tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement
|
||||
tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement
|
||||
tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement
|
||||
tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement
|
||||
tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement
|
||||
tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement
|
||||
tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement
|
||||
tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement
|
||||
tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement
|
||||
state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts
|
||||
state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts
|
||||
state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts
|
||||
|
307
scripts/run_source_completeness_gates.py
Normal file
307
scripts/run_source_completeness_gates.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate Brazil source registry completeness and code alignment."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Header names that read_registry() insists on finding in the registry CSV.
REQUIRED_COLUMNS = {
    # identity
    "source_id",
    "name",
    "category",
    "tier",
    # lifecycle
    "status",
    "implementation_state",
    "load_state",
    "frequency",
    "in_universe_v1",
    # provenance and ownership
    "primary_url",
    "pipeline_id",
    "owner_agent",
    "access_mode",
    "notes",
}

# Accepted non-empty values for the `status` column.
VALID_STATUS = {
    "loaded",
    "partial",
    "stale",
    "blocked_external",
    "quality_fail",
    "not_built",
}

# Accepted non-empty values for the `implementation_state` column.
VALID_IMPLEMENTATION = {"implemented", "not_implemented"}

# Accepted non-empty values for the `load_state` column.
VALID_LOAD_STATE = {"loaded", "partial", "not_loaded"}

# Matches a single `"pipeline_id": ClassName,` line; group 1 is the quoted key.
# The trailing comma is mandatory and nothing but whitespace may follow it.
PIPELINE_ENTRY_RE = re.compile(r'^\s*"([a-z0-9_]+)":\s*[A-Za-z_][A-Za-z0-9_]*,\s*$')
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class GateResult:
    """Immutable outcome of one completeness gate."""

    # Gate identifier, e.g. "registry_has_no_duplicate_source_ids".
    name: str
    # True when the gate's condition held.
    passed: bool
    # Human-readable diagnostics for the report; "ok" or the offending values/counts.
    details: str
|
||||
|
||||
|
||||
def parse_bool(value: str) -> bool:
    """Interpret a registry cell as a boolean flag.

    Surrounding whitespace and letter case are ignored. Only "1", "true",
    "yes" and "y" are treated as true; anything else, including an empty
    string, is false.
    """
    normalized = value.strip().lower()
    truthy_tokens = ("1", "true", "yes", "y")
    return normalized in truthy_tokens
|
||||
|
||||
|
||||
def read_registry(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Load the source registry CSV.

    Args:
        path: Location of the registry CSV file.

    Returns:
        A ``(rows, errors)`` pair. On success ``rows`` holds one dict per data
        row keyed by header name and ``errors`` is empty. When the file does
        not exist or lacks any of ``REQUIRED_COLUMNS``, ``rows`` is empty and
        ``errors`` carries a single explanatory message.
    """
    if not path.exists():
        return [], [f"registry file not found: {path}"]

    # "utf-8-sig" decodes plain UTF-8 as before but also drops a leading BOM,
    # which would otherwise turn the first header into "\ufeffsource_id" and
    # trigger a bogus "missing required columns" error.
    with path.open(encoding="utf-8-sig", newline="") as csv_file:
        # restval="" makes rows with too few cells yield empty strings rather
        # than None, so the declared dict[str, str] shape holds and later
        # `.strip()` calls on cells cannot fail with AttributeError.
        reader = csv.DictReader(csv_file, restval="")
        missing_cols = REQUIRED_COLUMNS.difference(reader.fieldnames or [])
        if missing_cols:
            return [], [f"missing required columns: {sorted(missing_cols)}"]
        rows = list(reader)

    return rows, []
|
||||
|
||||
|
||||
def parse_runner_pipelines(path: Path) -> tuple[set[str], list[str]]:
    """Extract the pipeline ids registered in the ETL runner source file.

    The file is scanned as text, not imported. Collection starts after the
    line that begins with ``PIPELINES: dict[str, type] = {`` and stops at the
    first subsequent line consisting solely of ``}``. In between, only lines
    matching ``PIPELINE_ENTRY_RE`` contribute an id; other lines are ignored.

    Args:
        path: Location of the runner module.

    Returns:
        A ``(pipeline_ids, errors)`` pair. ``errors`` is empty on success;
        when the file is missing or no id could be collected, the id set is
        empty and ``errors`` holds one message.
    """
    if not path.exists():
        return set(), [f"runner file not found: {path}"]

    opening = "PIPELINES: dict[str, type] = {"
    found: set[str] = set()

    with path.open(encoding="utf-8") as runner_file:
        # One shared iterator: the second loop resumes where the first stopped.
        lines = (raw_line.rstrip("\n") for raw_line in runner_file)

        # Phase 1: skip everything up to and including the map's opening line.
        for text in lines:
            if text.startswith(opening):
                break

        # Phase 2: harvest entries until the closing brace.
        for text in lines:
            if text.startswith(opening):
                continue
            if text.strip() == "}":
                break
            hit = PIPELINE_ENTRY_RE.match(text)
            if hit:
                found.add(hit.group(1))

    if found:
        return found, []
    return set(), ["could not parse pipeline ids from runner"]
|
||||
|
||||
|
||||
def build_gate_results(
    rows: list[dict[str, str]],
    runner_pipelines: set[str],
    expected_universe: int,
    expected_implemented: int,
) -> tuple[list[GateResult], dict[str, int], dict[str, int]]:
    """Evaluate every completeness gate against the registry rows.

    Args:
        rows: Registry rows keyed by column name.
        runner_pipelines: Pipeline ids found in the ETL runner.
        expected_universe: Required number of rows flagged ``in_universe_v1``.
        expected_implemented: Required number of universe rows whose
            ``implementation_state`` is ``implemented``.

    Returns:
        ``(gates, status_counter, implementation_counter)``. The two counters
        tally ``status`` and ``implementation_state`` over universe rows only.
    """

    def unexpected_values(column: str, allowed: set[str]) -> list[str]:
        # Sorted distinct values of `column` that are non-empty and not allowed.
        cells = (row[column].strip() for row in rows)
        return sorted({cell for cell in cells if cell and cell not in allowed})

    def listing_gate(name: str, label: str, offenders: list[str]) -> GateResult:
        # Gate that passes only when there are no offenders to list.
        return GateResult(
            name=name,
            passed=not offenders,
            details=f"{label}={offenders}" if offenders else "ok",
        )

    # Duplicates and value validation look at every row, universe or not.
    id_counts = Counter(row["source_id"].strip() for row in rows)
    duplicate_ids = [sid for sid, seen in id_counts.items() if seen > 1]

    invalid_status = unexpected_values("status", VALID_STATUS)
    invalid_implementation = unexpected_values("implementation_state", VALID_IMPLEMENTATION)
    invalid_load_state = unexpected_values("load_state", VALID_LOAD_STATE)
    values_ok = not (invalid_status or invalid_implementation or invalid_load_state)

    # Everything below is restricted to rows flagged in_universe_v1.
    universe_rows = [row for row in rows if parse_bool(row["in_universe_v1"])]
    implemented_rows = [
        row for row in universe_rows if row["implementation_state"].strip() == "implemented"
    ]
    implemented_ids = {row["source_id"].strip() for row in implemented_rows}

    status_counter = Counter(row["status"].strip() for row in universe_rows)
    implementation_counter = Counter(
        row["implementation_state"].strip() for row in universe_rows
    )

    # NOTE(review): runner keys are compared against source_id, not against the
    # registry's pipeline_id column - confirm this is intended.
    missing_from_registry = sorted(runner_pipelines - implemented_ids)
    not_in_runner = sorted(implemented_ids - runner_pipelines)

    universe_total = len(universe_rows)
    implemented_total = len(implemented_rows)

    gates = [
        listing_gate("registry_has_no_duplicate_source_ids", "duplicates", duplicate_ids),
        GateResult(
            name="registry_values_are_valid",
            passed=values_ok,
            details=(
                f"invalid_status={invalid_status}; "
                f"invalid_implementation={invalid_implementation}; "
                f"invalid_load_state={invalid_load_state}"
            ),
        ),
        GateResult(
            name="universe_v1_count_matches_expected",
            passed=universe_total == expected_universe,
            details=f"actual={universe_total} expected={expected_universe}",
        ),
        GateResult(
            name="implemented_count_matches_expected",
            passed=implemented_total == expected_implemented,
            details=f"actual={implemented_total} expected={expected_implemented}",
        ),
        listing_gate(
            "runner_pipelines_are_all_marked_implemented", "missing", missing_from_registry
        ),
        listing_gate("implemented_registry_ids_exist_in_runner", "extra", not_in_runner),
    ]
    return gates, dict(status_counter), dict(implementation_counter)
|
||||
|
||||
|
||||
def write_outputs(
|
||||
output_dir: Path,
|
||||
registry_path: Path,
|
||||
runner_path: Path,
|
||||
expected_universe: int,
|
||||
expected_implemented: int,
|
||||
gates: list[GateResult],
|
||||
status_counter: dict[str, int],
|
||||
implementation_counter: dict[str, int],
|
||||
) -> None:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
payload = {
|
||||
"timestamp_utc": datetime.now(UTC).isoformat(),
|
||||
"registry_path": str(registry_path),
|
||||
"runner_path": str(runner_path),
|
||||
"expected_universe_v1": expected_universe,
|
||||
"expected_implemented": expected_implemented,
|
||||
"status_counter": status_counter,
|
||||
"implementation_counter": implementation_counter,
|
||||
"gates": [gate.__dict__ for gate in gates],
|
||||
"all_passed": all(gate.passed for gate in gates),
|
||||
}
|
||||
(output_dir / "source_completeness_report.json").write_text(
|
||||
json.dumps(payload, indent=2, ensure_ascii=True) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
lines = [
|
||||
"# Source Completeness Gate Report",
|
||||
"",
|
||||
f"- Timestamp (UTC): `{payload['timestamp_utc']}`",
|
||||
f"- Registry: `{registry_path}`",
|
||||
f"- Runner: `{runner_path}`",
|
||||
f"- Expected universe_v1: `{expected_universe}`",
|
||||
f"- Expected implemented: `{expected_implemented}`",
|
||||
"",
|
||||
"## Counters",
|
||||
"",
|
||||
f"- status_counter: `{status_counter}`",
|
||||
f"- implementation_counter: `{implementation_counter}`",
|
||||
"",
|
||||
"## Gate Results",
|
||||
"",
|
||||
]
|
||||
for gate in gates:
|
||||
mark = "PASS" if gate.passed else "FAIL"
|
||||
lines.append(f"- `{mark}` `{gate.name}`: {gate.details}")
|
||||
lines.append("")
|
||||
lines.append(f"## Final: `{'PASS' if payload['all_passed'] else 'FAIL'}`")
|
||||
lines.append("")
|
||||
(output_dir / "source_completeness_report.md").write_text(
|
||||
"\n".join(lines), encoding="utf-8"
|
||||
)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Validate source registry completeness and code alignment."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--registry-path",
|
||||
default="docs/source_registry_br_v1.csv",
|
||||
help="Path to source registry CSV",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--runner-path",
|
||||
default="etl/src/icarus_etl/runner.py",
|
||||
help="Path to ETL runner with PIPELINES map",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--expected-universe-v1",
|
||||
type=int,
|
||||
default=108,
|
||||
help="Expected count for in_universe_v1=true rows",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--expected-implemented",
|
||||
type=int,
|
||||
default=45,
|
||||
help="Expected count for implementation_state=implemented rows",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
default=f"audit-results/brazil-coverage-{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}",
|
||||
help="Directory for gate reports",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
registry_path = Path(args.registry_path)
|
||||
runner_path = Path(args.runner_path)
|
||||
output_dir = Path(args.output_dir)
|
||||
|
||||
rows, registry_errors = read_registry(registry_path)
|
||||
runner_pipelines, runner_errors = parse_runner_pipelines(runner_path)
|
||||
|
||||
if registry_errors or runner_errors:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
combined = registry_errors + runner_errors
|
||||
(output_dir / "source_completeness_report.md").write_text(
|
||||
"# Source Completeness Gate Report\n\n"
|
||||
+ "\n".join(f"- FAIL: {msg}" for msg in combined)
|
||||
+ "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
print("\n".join(combined))
|
||||
return 1
|
||||
|
||||
gates, status_counter, implementation_counter = build_gate_results(
|
||||
rows=rows,
|
||||
runner_pipelines=runner_pipelines,
|
||||
expected_universe=args.expected_universe_v1,
|
||||
expected_implemented=args.expected_implemented,
|
||||
)
|
||||
write_outputs(
|
||||
output_dir=output_dir,
|
||||
registry_path=registry_path,
|
||||
runner_path=runner_path,
|
||||
expected_universe=args.expected_universe_v1,
|
||||
expected_implemented=args.expected_implemented,
|
||||
gates=gates,
|
||||
status_counter=status_counter,
|
||||
implementation_counter=implementation_counter,
|
||||
)
|
||||
|
||||
all_passed = all(gate.passed for gate in gates)
|
||||
print("PASS" if all_passed else "FAIL")
|
||||
return 0 if all_passed else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user