Files
br-acc/etl/scripts/download_dou.py
2026-03-02 03:51:26 -03:00

172 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""Download DOU (Diario Oficial da Uniao) from Imprensa Nacional XML dumps.
Uses the official open data XML distribution from dadosabertos-download.cgu.gov.br.
Each month has 3 ZIP files (one per DOU section), each containing XML articles.
URL pattern: https://dadosabertos-download.cgu.gov.br/inlabs/{AAMM}/S0{section}{AAMM}.zip
Usage:
python etl/scripts/download_dou.py
python etl/scripts/download_dou.py --start-month 2024-01 --end-month 2024-12
python etl/scripts/download_dou.py --skip-existing --output-dir ./data/dou
"""
from __future__ import annotations
import logging
import sys
import zipfile
from io import BytesIO
from pathlib import Path
import click
import httpx
logging.basicConfig(
level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s"
)
logger = logging.getLogger(__name__)
BASE_URL = "https://dadosabertos-download.cgu.gov.br/inlabs"
SECTIONS = [1, 2, 3]
TIMEOUT = 120
def _month_range(start: str, end: str) -> list[str]:
"""Generate AAMM strings from YYYY-MM to YYYY-MM inclusive."""
sy, sm = start.split("-")
ey, em = end.split("-")
year, month = int(sy), int(sm)
end_year, end_month = int(ey), int(em)
months: list[str] = []
while (year, month) <= (end_year, end_month):
# AAMM format: last 2 digits of year + 2-digit month
aamm = f"{year % 100:02d}{month:02d}"
months.append(aamm)
month += 1
if month > 12:
month = 1
year += 1
return months
def _download_zip(
client: httpx.Client,
aamm: str,
section: int,
output_dir: Path,
*,
skip_existing: bool,
) -> int:
"""Download and extract one section ZIP. Returns number of XML files extracted."""
zip_name = f"S0{section}{aamm}.zip"
url = f"{BASE_URL}/{aamm}/{zip_name}"
# Check if already extracted
section_dir = output_dir / f"S{section}" / aamm
marker = section_dir / ".done"
if skip_existing and marker.exists():
logger.info("Skipping %s (already extracted)", zip_name)
return 0
logger.info("Downloading %s", url)
try:
resp = client.get(url, timeout=TIMEOUT)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
if e.response.status_code == 404:
logger.warning("Not found: %s (month may not be available yet)", zip_name)
else:
logger.warning("HTTP %d for %s", e.response.status_code, zip_name)
return 0
except httpx.RequestError as e:
logger.warning("Request failed for %s: %s", zip_name, e)
return 0
section_dir.mkdir(parents=True, exist_ok=True)
xml_count = 0
try:
resolved_dir = section_dir.resolve()
with zipfile.ZipFile(BytesIO(resp.content)) as zf:
for member in zf.namelist():
# Path traversal guard
target = (section_dir / member).resolve()
if not target.is_relative_to(resolved_dir):
logger.warning(
"Path traversal detected in %s: %s — skipping",
zip_name,
member,
)
continue
if member.lower().endswith(".xml"):
zf.extract(member, section_dir)
xml_count += 1
except zipfile.BadZipFile:
logger.warning("Bad ZIP file: %s", zip_name)
return 0
if xml_count > 0:
marker.write_text(str(xml_count))
logger.info("Extracted %d XML files from %s", xml_count, zip_name)
return xml_count
@click.command()
@click.option(
"--start-month",
default="2024-01",
help="Start month (YYYY-MM)",
)
@click.option(
"--end-month",
default="2025-02",
help="End month (YYYY-MM)",
)
@click.option("--output-dir", default="./data/dou", help="Output directory")
@click.option("--skip-existing/--no-skip-existing", default=True)
def main(
start_month: str,
end_month: str,
output_dir: str,
skip_existing: bool,
) -> None:
"""Download DOU XML dumps from Imprensa Nacional open data."""
out = Path(output_dir)
out.mkdir(parents=True, exist_ok=True)
months = _month_range(start_month, end_month)
logger.info("=== DOU XML Download ===")
logger.info("Months: %s to %s (%d months)", start_month, end_month, len(months))
logger.info("Output: %s", out.resolve())
total_xml = 0
total_zips = 0
with httpx.Client(follow_redirects=True) as client:
for aamm in months:
for section in SECTIONS:
count = _download_zip(
client, aamm, section, out, skip_existing=skip_existing,
)
total_xml += count
if count > 0:
total_zips += 1
logger.info("=== Done ===")
logger.info("Downloaded %d ZIPs, extracted %d XML files", total_zips, total_xml)
if total_xml == 0:
logger.warning(
"No XML files extracted. The URL pattern may have changed. "
"Check https://dadosabertos-download.cgu.gov.br/inlabs/ manually."
)
sys.exit(1)
if __name__ == "__main__":
main()