BHI layer v1: docs, schema, Phase A ingestion stubs

This commit is contained in:
BHI Staging Agent
2026-04-05 20:15:36 +00:00
commit 3dfd9ea3c6
21 changed files with 2399 additions and 0 deletions

146
jobs/ingestion/_common.py Normal file
View File

@@ -0,0 +1,146 @@
"""
Shared helpers for BHI ingestion jobs.
READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
Base Brain is expected to expose:
- env DATABASE_URL pointing at the `brain` Postgres
- a `job_runs` table (the base Brain maintains this)
- optional Vault at http://localhost:8200 for API keys
Every BHI job imports from this module to keep behavior consistent.
"""
from __future__ import annotations
import logging
import os
import time
from contextlib import contextmanager
from datetime import datetime
from typing import Any, Callable, Iterable
import requests
try:
import psycopg2
import psycopg2.extras
except ImportError:
psycopg2 = None # type: ignore
LOG_FMT = "%(asctime)s %(levelname)s %(name)s | %(message)s"
logging.basicConfig(level=os.environ.get("BHI_LOG_LEVEL", "INFO"), format=LOG_FMT)
# --- HTTP session with retries + rate limiting ------------------------------
class RateLimitedSession(requests.Session):
def __init__(self, min_interval: float = 0.2, max_retries: int = 5):
super().__init__()
self.headers.update({"User-Agent": "EconomicBrain-BHI/1.0 (+research)"})
self.min_interval = min_interval
self.max_retries = max_retries
self._last = 0.0
def request(self, method, url, **kw): # type: ignore[override]
kw.setdefault("timeout", 60)
backoff = 1.0
for attempt in range(self.max_retries):
dt = time.monotonic() - self._last
if dt < self.min_interval:
time.sleep(self.min_interval - dt)
self._last = time.monotonic()
try:
resp = super().request(method, url, **kw)
if resp.status_code in (429, 500, 502, 503, 504):
logging.warning("HTTP %s on %s, retrying in %.1fs", resp.status_code, url, backoff)
time.sleep(backoff)
backoff *= 2
continue
resp.raise_for_status()
return resp
except requests.RequestException as e:
logging.warning("Request error: %s (attempt %d)", e, attempt + 1)
time.sleep(backoff)
backoff *= 2
raise RuntimeError(f"Exceeded retries for {url}")
# --- DB helpers -------------------------------------------------------------
def get_conn():
if psycopg2 is None:
raise RuntimeError("psycopg2 not installed. pip install psycopg2-binary")
dsn = os.environ.get("DATABASE_URL") or os.environ.get("BRAIN_DATABASE_URL")
if not dsn:
raise RuntimeError("DATABASE_URL env var not set")
return psycopg2.connect(dsn)
@contextmanager
def job_run(job_name: str):
"""Context manager that logs a row in the base Brain's job_runs table."""
conn = get_conn()
run_id = None
started = datetime.utcnow()
try:
with conn.cursor() as c:
c.execute(
"""
INSERT INTO job_runs (job_name, started_at, status)
VALUES (%s, %s, 'running') RETURNING id
""",
(job_name, started),
)
run_id = c.fetchone()[0]
conn.commit()
yield conn, run_id
with conn.cursor() as c:
c.execute(
"UPDATE job_runs SET status='success', finished_at=%s WHERE id=%s",
(datetime.utcnow(), run_id),
)
conn.commit()
except Exception as e:
if run_id is not None:
try:
with conn.cursor() as c:
c.execute(
"UPDATE job_runs SET status='error', finished_at=%s, error=%s WHERE id=%s",
(datetime.utcnow(), str(e)[:2000], run_id),
)
conn.commit()
except Exception:
pass
raise
finally:
conn.close()
def bulk_insert(conn, table: str, columns: list[str], rows: Iterable[tuple]):
with conn.cursor() as c:
psycopg2.extras.execute_values(
c,
f"INSERT INTO {table} ({', '.join(columns)}) VALUES %s",
list(rows),
page_size=500,
)
conn.commit()
# --- Vault (optional) -------------------------------------------------------
def vault_secret(path: str, key: str) -> str | None:
token = os.environ.get("VAULT_TOKEN")
addr = os.environ.get("VAULT_ADDR", "http://localhost:8200")
if not token:
return os.environ.get(key.upper())
try:
r = requests.get(
f"{addr}/v1/{path}",
headers={"X-Vault-Token": token},
timeout=5,
)
return r.json()["data"]["data"].get(key)
except Exception as e:
logging.warning("vault fetch failed: %s", e)
return os.environ.get(key.upper())