# Mirror of https://github.com/kharonsec/br-acc (synced 2026-04-25 17:15:02 +02:00).
#!/usr/bin/env python3
|
|
"""Download Câmara CPI/CPMI metadata, requirements and sessions.
|
|
|
|
Outputs canonical CSV files consumed by CamaraInquiriesPipeline:
|
|
- data/camara_inquiries/inquiries.csv
|
|
- data/camara_inquiries/requirements.csv
|
|
- data/camara_inquiries/sessions.csv
|
|
|
|
Default strategy is BigQuery-first (historical coverage).
|
|
API-only fallback is preserved as non-default mode.
|
|
"""
|
|
|
|
from __future__ import annotations

import csv
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import click
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
CAMARA_BASE_URL = "https://dadosabertos.camara.leg.br/api/v2"
|
|
CAMARA_BQ_DATASET = "basedosdados.br_camara_dados_abertos"
|
|
INQUIRY_TYPES = (
|
|
"Comissão Parlamentar de Inquérito",
|
|
"Comissão Parlamentar Mista de Inquérito",
|
|
)
|
|
|
|
|
|
def _request_json(
|
|
client: httpx.Client,
|
|
url: str,
|
|
params: dict[str, Any] | None = None,
|
|
tolerated_statuses: set[int] | None = None,
|
|
) -> dict[str, Any]:
|
|
response = client.get(url, params=params, timeout=60)
|
|
if tolerated_statuses and response.status_code in tolerated_statuses:
|
|
logger.warning("Endpoint returned %d for %s", response.status_code, response.url)
|
|
return {}
|
|
response.raise_for_status()
|
|
payload = response.json()
|
|
if isinstance(payload, dict):
|
|
return payload
|
|
return {}
|
|
|
|
|
|
def _extract_items(payload: dict[str, Any]) -> list[dict[str, Any]]:
|
|
dados = payload.get("dados")
|
|
if isinstance(dados, list):
|
|
return [x for x in dados if isinstance(x, dict)]
|
|
return []
|
|
|
|
|
|
def _write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
|
|
if not rows:
|
|
logger.warning("No rows for %s", path.name)
|
|
path.write_text("", encoding="utf-8")
|
|
return
|
|
|
|
fieldnames: list[str] = []
|
|
seen: set[str] = set()
|
|
for row in rows:
|
|
for key in row:
|
|
if key not in seen:
|
|
seen.add(key)
|
|
fieldnames.append(key)
|
|
|
|
with path.open("w", newline="", encoding="utf-8") as f:
|
|
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
|
writer.writeheader()
|
|
for row in rows:
|
|
writer.writerow(row)
|
|
|
|
logger.info("Wrote %d rows to %s", len(rows), path)
|
|
|
|
|
|
def _kind_from_tipo(tipo_orgao: str) -> str:
|
|
return "CPMI" if "MISTA" in tipo_orgao.upper() else "CPI"
|
|
|
|
|
|
def _parse_date(value: Any) -> str:
|
|
if value is None:
|
|
return ""
|
|
raw = str(value).strip()
|
|
if len(raw) >= 10 and raw[4] == "-" and raw[7] == "-":
|
|
return raw[:10]
|
|
return ""
|
|
|
|
|
|
def _dedupe(rows: list[dict[str, Any]], key: str) -> list[dict[str, Any]]:
|
|
seen: set[str] = set()
|
|
output: list[dict[str, Any]] = []
|
|
for row in rows:
|
|
value = str(row.get(key, "")).strip()
|
|
if not value or value in seen:
|
|
continue
|
|
seen.add(value)
|
|
output.append(row)
|
|
return output
|
|
|
|
|
|
def _fetch_from_bigquery(
    billing_project: str,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
    """Extract inquiries, requirements and sessions from BigQuery.

    Runs three parameterized queries against the Base dos Dados mirror of
    Câmara open data (billed to *billing_project*) and maps the results onto
    the canonical CSV row schemas.

    Returns:
        ``(inquiries, requirements, sessions)`` — each list deduplicated on
        its primary id column.

    Raises:
        RuntimeError: if the optional BigQuery dependencies are not installed.
    """
    # BigQuery deps are optional extras; fail fast with an actionable message.
    try:
        import google.auth
        from google.cloud import bigquery
    except ImportError as exc:
        raise RuntimeError("Install optional deps: pip install '.[bigquery]'") from exc

    credentials, _ = google.auth.default()
    client = bigquery.Client(project=billing_project, credentials=credentials)

    # 1) Inquiries: every orgao whose tipo_orgao matches a CPI/CPMI label.
    inquiries_query = f"""
        SELECT
            id_orgao,
            nome,
            apelido,
            sigla,
            tipo_orgao,
            data_inicio,
            data_final,
            situacao
        FROM `{CAMARA_BQ_DATASET}.orgao`
        WHERE tipo_orgao IN UNNEST(@types)
    """

    inquiry_job = client.query(
        inquiries_query,
        job_config=bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ArrayQueryParameter("types", "STRING", list(INQUIRY_TYPES)),
            ],
        ),
    )
    inquiry_rows = list(inquiry_job.result())

    inquiries: list[dict[str, Any]] = []
    for row in inquiry_rows:
        orgao_id = str(row["id_orgao"]).strip()
        if not orgao_id:
            continue
        tipo = str(row["tipo_orgao"] or "").strip()
        kind = _kind_from_tipo(tipo)
        inquiries.append({
            "inquiry_id": f"camara-{orgao_id}",
            "inquiry_code": str(row["sigla"] or "").strip(),
            "name": str(row["nome"] or row["apelido"] or "").strip(),
            "kind": kind,
            # CPMIs are joint committees, hence attributed to Congress.
            "house": "congresso" if kind == "CPMI" else "camara",
            "status": str(row["situacao"] or "").strip(),
            "subject": tipo,
            "date_start": _parse_date(row["data_inicio"]),
            "date_end": _parse_date(row["data_final"]),
            "source_url": f"{CAMARA_BASE_URL}/orgaos/{orgao_id}",
            "source_system": "camara_bq",
            "extraction_method": "orgao_tipo_inquerito",
        })

    # 2) Sessions: distinct events linked to inquiry orgaos via evento_orgao.
    sessions_query = f"""
        WITH inq AS (
            SELECT id_orgao
            FROM `{CAMARA_BQ_DATASET}.orgao`
            WHERE tipo_orgao IN UNNEST(@types)
        )
        SELECT DISTINCT
            eo.id_orgao,
            e.id_evento,
            e.data_inicio,
            e.descricao,
            e.tipo
        FROM `{CAMARA_BQ_DATASET}.evento_orgao` eo
        JOIN inq i ON i.id_orgao = eo.id_orgao
        JOIN `{CAMARA_BQ_DATASET}.evento` e ON e.id_evento = eo.id_evento
    """
    sessions_job = client.query(
        sessions_query,
        job_config=bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ArrayQueryParameter("types", "STRING", list(INQUIRY_TYPES)),
            ],
        ),
    )
    sessions_rows = list(sessions_job.result())

    sessions: list[dict[str, Any]] = []
    for row in sessions_rows:
        orgao_id = str(row["id_orgao"]).strip()
        event_id = str(row["id_evento"]).strip()
        if not orgao_id or not event_id:
            continue
        sessions.append({
            "session_id": f"camara-event-{event_id}",
            "inquiry_id": f"camara-{orgao_id}",
            "date": _parse_date(row["data_inicio"]),
            "topic": str(row["descricao"] or row["tipo"] or "").strip(),
            "source_url": f"{CAMARA_BASE_URL}/eventos/{event_id}",
            "source_system": "camara_bq",
            "extraction_method": "evento_orgao_join",
        })

    # 3) Requirements: requerimentos attached to inquiry events, enriched with
    #    proposition metadata and a single representative author per proposition
    #    (the flagged proponente first, then lowest signature order).
    requirements_query = f"""
        WITH inq AS (
            SELECT id_orgao
            FROM `{CAMARA_BQ_DATASET}.orgao`
            WHERE tipo_orgao IN UNNEST(@types)
        ),
        ev AS (
            SELECT DISTINCT eo.id_orgao, eo.id_evento
            FROM `{CAMARA_BQ_DATASET}.evento_orgao` eo
            JOIN inq i ON i.id_orgao = eo.id_orgao
        ),
        pa AS (
            SELECT
                id_proposicao,
                ARRAY_AGG(
                    STRUCT(
                        nome_autor,
                        SAFE_CAST(proponente AS INT64) AS proponente_rank,
                        SAFE_CAST(ordem_assinatura AS INT64) AS assinatura_rank
                    )
                    ORDER BY SAFE_CAST(proponente AS INT64) DESC, SAFE_CAST(ordem_assinatura AS INT64) ASC
                    LIMIT 1
                )[OFFSET(0)] AS picked
            FROM `{CAMARA_BQ_DATASET}.proposicao_autor`
            GROUP BY id_proposicao
        )
        SELECT
            ev.id_orgao,
            er.id_evento,
            er.id_proposicao,
            er.titulo_requerimento,
            pm.sigla AS prop_sigla,
            pm.data AS prop_data,
            pm.ementa,
            pm.situacao_ultimo_status,
            pa.picked.nome_autor AS author_name
        FROM `{CAMARA_BQ_DATASET}.evento_requerimento` er
        JOIN ev ON ev.id_evento = er.id_evento
        LEFT JOIN `{CAMARA_BQ_DATASET}.proposicao_microdados` pm ON pm.id_proposicao = er.id_proposicao
        LEFT JOIN pa ON pa.id_proposicao = er.id_proposicao
    """
    req_job = client.query(
        requirements_query,
        job_config=bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ArrayQueryParameter("types", "STRING", list(INQUIRY_TYPES)),
            ],
        ),
    )
    req_rows = list(req_job.result())

    requirements: list[dict[str, Any]] = []
    for row in req_rows:
        orgao_id = str(row["id_orgao"]).strip()
        event_id = str(row["id_evento"]).strip()
        prop_id = str(row["id_proposicao"] or "").strip()
        if not orgao_id or not event_id:
            continue
        # Prefer a proposition-scoped id; fall back to an event-scoped id when
        # the requerimento has no linked proposition.
        if prop_id:
            requirement_id = f"camara-req-{prop_id}-ev-{event_id}"
        else:
            requirement_id = f"camara-req-event-{event_id}"
        if prop_id:
            source_url = f"{CAMARA_BASE_URL}/proposicoes/{prop_id}"
        else:
            source_url = f"{CAMARA_BASE_URL}/eventos/{event_id}"
        requirements.append({
            "requirement_id": requirement_id,
            "inquiry_id": f"camara-{orgao_id}",
            "type": str(row["prop_sigla"] or "REQ").strip(),
            "date": _parse_date(row["prop_data"]),
            "text": str(row["ementa"] or row["titulo_requerimento"] or "").strip(),
            "status": str(row["situacao_ultimo_status"] or "").strip(),
            "author_name": str(row["author_name"] or "").strip(),
            # CPF is not available in this dataset; left blank deliberately.
            "author_cpf": "",
            "source_url": source_url,
            "source_system": "camara_bq",
            "extraction_method": "evento_requerimento_join",
        })

    return (
        _dedupe(inquiries, "inquiry_id"),
        _dedupe(requirements, "requirement_id"),
        _dedupe(sessions, "session_id"),
    )
|
|
|
|
|
|
def _fetch_from_api_only(
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
    """Extract inquiries and sessions from the live Câmara REST API only.

    Fallback mode with weaker coverage than BigQuery: orgaos are discovered
    by sigla ("CPI"/"CPMI") rather than by tipo_orgao, and no requirements
    are built — the returned ``requirements`` list is always empty.

    Returns:
        ``(inquiries, requirements, sessions)``.
    """
    inquiries: list[dict[str, Any]] = []
    sessions: list[dict[str, Any]] = []
    requirements: list[dict[str, Any]] = []

    with httpx.Client(headers={"Accept": "application/json"}, follow_redirects=True) as client:
        # Candidate committees: one query per sigla prefix.
        payload_cpi = _request_json(
            client,
            f"{CAMARA_BASE_URL}/orgaos",
            {"sigla": "CPI", "itens": 100},
        )
        payload_cpmi = _request_json(
            client,
            f"{CAMARA_BASE_URL}/orgaos",
            {"sigla": "CPMI", "itens": 100},
        )
        orgaos = _extract_items(payload_cpi) + _extract_items(payload_cpmi)
        logger.info("API-only mode found %d candidate orgaos", len(orgaos))

        seen_orgaos: set[str] = set()
        for orgao in orgaos:
            orgao_id = str(orgao.get("id", "")).strip()
            if not orgao_id or orgao_id in seen_orgaos:
                continue
            seen_orgaos.add(orgao_id)

            sigla = str(orgao.get("sigla", "")).strip()
            nome = str(orgao.get("nomePublicacao") or orgao.get("nome", "")).strip()
            # Keep only orgaos that actually look like inquiry committees.
            if "CPI" not in sigla.upper() and "CPI" not in nome.upper():
                continue

            inquiry_id = f"camara-{orgao_id}"
            inquiry_url = f"{CAMARA_BASE_URL}/orgaos/{orgao_id}"
            kind = "CPMI" if "CPMI" in (sigla or nome).upper() else "CPI"

            # Per-orgao detail fetch for status/subject/date fields.
            details = _request_json(client, inquiry_url)
            dado = details.get("dados") if isinstance(details.get("dados"), dict) else {}
            inquiries.append({
                "inquiry_id": inquiry_id,
                "inquiry_code": sigla,
                "name": nome,
                "kind": kind,
                "house": "congresso" if kind == "CPMI" else "camara",
                "status": str(dado.get("situacao") or "").strip(),
                "subject": str(dado.get("descricao") or "").strip(),
                # [:10] truncates an ISO timestamp to its date part.
                "date_start": str(dado.get("dataInicio") or "").strip()[:10],
                "date_end": str(dado.get("dataFim") or "").strip()[:10],
                "source_url": inquiry_url,
                "source_system": "camara_api",
                "extraction_method": "orgaos_sigla",
            })

            # Sessions: events listed under this orgao (single page of up to
            # 200 items — NOTE(review): no pagination beyond that here).
            eventos_payload = _request_json(
                client,
                f"{CAMARA_BASE_URL}/orgaos/{orgao_id}/eventos",
                {"itens": 200},
            )
            for event in _extract_items(eventos_payload):
                event_id = str(event.get("id", "")).strip()
                if not event_id:
                    continue
                sessions.append({
                    "session_id": f"camara-event-{event_id}",
                    "inquiry_id": inquiry_id,
                    "date": str(event.get("dataHoraInicio") or "").strip()[:10],
                    "topic": str(event.get("descricaoTipo") or event.get("titulo") or "").strip(),
                    "source_url": str(event.get("uri") or inquiry_url),
                    "source_system": "camara_api",
                    "extraction_method": "orgaos_eventos",
                })

    logger.warning(
        "API-only mode does not build full historical requirements; "
        "use mode=bq_first for complete extraction.",
    )
    return (
        _dedupe(inquiries, "inquiry_id"),
        requirements,
        _dedupe(sessions, "session_id"),
    )
|
|
|
|
|
|
def _write_manifest(
|
|
manifest_path: Path,
|
|
mode: str,
|
|
inquiries: int,
|
|
requirements: int,
|
|
sessions: int,
|
|
status: str,
|
|
error: str | None = None,
|
|
) -> None:
|
|
payload = {
|
|
"generated_at_utc": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
|
|
"mode": mode,
|
|
"status": status,
|
|
"counts": {
|
|
"inquiries": inquiries,
|
|
"requirements": requirements,
|
|
"sessions": sessions,
|
|
},
|
|
"error": error,
|
|
}
|
|
manifest_path.write_text(
|
|
json.dumps(payload, ensure_ascii=False, indent=2),
|
|
encoding="utf-8",
|
|
)
|
|
logger.info("Wrote Camara inquiries manifest: %s", manifest_path)
|
|
|
|
|
|
@click.command()
@click.option("--output-dir", default="./data/camara_inquiries", help="Output directory")
@click.option("--skip-existing/--no-skip-existing", default=True)
@click.option(
    "--mode",
    type=click.Choice(["bq_first", "api_only"], case_sensitive=False),
    default="bq_first",
    show_default=True,
    help="Extraction mode.",
)
@click.option(
    "--billing-project",
    default="icarus-corruptos",
    help="GCP billing project for BQ mode.",
)
@click.option(
    "--manifest-path",
    default=None,
    help="Optional manifest JSON output path (default: <output-dir>/download_manifest.json).",
)
def main(
    output_dir: str,
    skip_existing: bool,
    mode: str,
    billing_project: str,
    manifest_path: str | None,
) -> None:
    """CLI entry point: fetch CPI/CPMI data and write the canonical CSVs.

    Downloads via BigQuery (default) or the REST API, writes
    inquiries/requirements/sessions CSVs under *output_dir*, and records a
    run manifest — including on failure.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    inquiries_csv = out / "inquiries.csv"
    req_csv = out / "requirements.csv"
    sessions_csv = out / "sessions.csv"
    manifest_file = Path(manifest_path) if manifest_path else out / "download_manifest.json"

    # Idempotency guard: only skip when every canonical output already exists.
    # NOTE(review): no manifest is written on the skip path.
    if skip_existing and inquiries_csv.exists() and req_csv.exists() and sessions_csv.exists():
        logger.info("Skipping download (all outputs exist).")
        return

    try:
        if mode.lower() == "bq_first":
            inquiries, requirements, sessions = _fetch_from_bigquery(billing_project)
        else:
            inquiries, requirements, sessions = _fetch_from_api_only()
    except Exception as exc:  # noqa: BLE001
        # Record the failure in the manifest before propagating, so a
        # scheduler can distinguish "never ran" from "ran and failed".
        _write_manifest(
            manifest_file,
            mode=mode,
            inquiries=0,
            requirements=0,
            sessions=0,
            status="failed",
            error=str(exc),
        )
        raise

    _write_csv(inquiries_csv, inquiries)
    _write_csv(req_csv, requirements)
    _write_csv(sessions_csv, sessions)

    status = "ok"
    if mode.lower() == "api_only":
        # API-only mode never produces requirements, so mark it partial.
        status = "partial"
    _write_manifest(
        manifest_file,
        mode=mode,
        inquiries=len(inquiries),
        requirements=len(requirements),
        sessions=len(sessions),
        status=status,
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Click parses CLI arguments from sys.argv when invoked as a script.
    main()
|