BHI layer v1: docs, schema, Phase A ingestion stubs

2026-04-05 20:15:36 +00:00
commit 3dfd9ea3c6
21 changed files with 2399 additions and 0 deletions
--- a/jobs/ingestion/__pycache__/_common.cpython-312.pyc
+++ b/jobs/ingestion/__pycache__/_common.cpython-312.pyc
--- a/jobs/ingestion/_common.py
+++ b/jobs/ingestion/_common.py
@@ -0,0 +1,146 @@
+"""
+Shared helpers for BHI ingestion jobs.
+
+READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+
+Base Brain is expected to expose:
+  - env DATABASE_URL pointing at the `brain` Postgres
+  - a `job_runs` table (the base Brain maintains this)
+  - optional Vault at http://localhost:8200 for API keys
+
+Every BHI job imports from this module to keep behavior consistent.
+"""
+from __future__ import annotations
+
+import logging
+import os
+import time
+from contextlib import contextmanager
+from datetime import datetime
+from typing import Any, Callable, Iterable
+
+import requests
+
+try:
+    import psycopg2
+    import psycopg2.extras
+except ImportError:
+    psycopg2 = None  # type: ignore
+
+# Log line layout passed to logging.basicConfig() below.
+LOG_FMT = "%(asctime)s %(levelname)s %(name)s | %(message)s"
+# NOTE: this configures the root logger as an import-time side effect. The level
+# comes from the BHI_LOG_LEVEL env var and defaults to INFO.
+logging.basicConfig(level=os.environ.get("BHI_LOG_LEVEL", "INFO"), format=LOG_FMT)
+
+
+# --- HTTP session with retries + rate limiting ------------------------------
+
+class RateLimitedSession(requests.Session):
+    """``requests.Session`` with throttling, retries and raise-on-error.
+
+    Each attempt waits until at least ``min_interval`` seconds have elapsed
+    since the previous attempt made through this session. Transport errors and
+    the status codes in ``RETRY_STATUSES`` are retried with exponential backoff
+    (1 s, 2 s, 4 s, ...) up to ``max_retries`` attempts in total.
+    """
+
+    # Throttling and server-side failures: the only statuses worth retrying.
+    RETRY_STATUSES = frozenset({429, 500, 502, 503, 504})
+
+    def __init__(self, min_interval: float = 0.2, max_retries: int = 5):
+        super().__init__()
+        self.headers.update({"User-Agent": "EconomicBrain-BHI/1.0 (+research)"})
+        self.min_interval = min_interval
+        self.max_retries = max_retries
+        self._last = 0.0  # time.monotonic() reading taken just before the last attempt
+
+    def request(self, method, url, **kw):  # type: ignore[override]
+        """Send one request, retrying transient failures.
+
+        A 60 s timeout is applied unless the caller passes ``timeout``.
+
+        Returns:
+            The response, once its status is outside ``RETRY_STATUSES`` and
+            ``raise_for_status()`` accepts it.
+
+        Raises:
+            requests.HTTPError: immediately, for an error status that is not in
+                ``RETRY_STATUSES`` (e.g. 404). Retrying cannot fix those.
+            RuntimeError: when every attempt hit a transport error or a
+                retryable status; the last failure is chained as ``__cause__``.
+        """
+        kw.setdefault("timeout", 60)
+        backoff = 1.0
+        last_error = None
+        for attempt in range(1, self.max_retries + 1):
+            wait = self.min_interval - (time.monotonic() - self._last)
+            if wait > 0:
+                time.sleep(wait)
+            self._last = time.monotonic()
+            try:
+                resp = super().request(method, url, **kw)
+            except requests.RequestException as e:
+                last_error = e
+                logging.warning("Request error: %s (attempt %d)", e, attempt)
+            else:
+                if resp.status_code not in self.RETRY_STATUSES:
+                    # Kept outside the try: a permanent HTTP error must reach
+                    # the caller instead of being swallowed and retried.
+                    resp.raise_for_status()
+                    return resp
+                last_error = requests.HTTPError(
+                    f"HTTP {resp.status_code} for {url}", response=resp
+                )
+                logging.warning(
+                    "HTTP %s on %s (attempt %d)", resp.status_code, url, attempt
+                )
+            # No point sleeping after the final attempt.
+            if attempt < self.max_retries:
+                logging.warning("retrying %s in %.1fs", url, backoff)
+                time.sleep(backoff)
+                backoff *= 2
+        raise RuntimeError(f"Exceeded retries for {url}") from last_error
+
+
+# --- DB helpers -------------------------------------------------------------
+
+def get_conn():
+    """Open a new psycopg2 connection to the Brain Postgres.
+
+    The DSN is the first non-empty value of DATABASE_URL, then
+    BRAIN_DATABASE_URL.
+
+    Raises:
+        RuntimeError: if psycopg2 could not be imported or neither env var
+            holds a value.
+    """
+    if psycopg2 is None:
+        raise RuntimeError("psycopg2 not installed. pip install psycopg2-binary")
+    for var_name in ("DATABASE_URL", "BRAIN_DATABASE_URL"):
+        dsn = os.environ.get(var_name)
+        if dsn:
+            return psycopg2.connect(dsn)
+    raise RuntimeError("DATABASE_URL env var not set")
+
+
+@contextmanager
+def job_run(job_name: str):
+    """Context manager that logs a row in the base Brain's job_runs table.
+
+    On entry a ``job_runs`` row with status 'running' is inserted and
+    committed. The body receives ``(conn, run_id)``. On normal exit the row is
+    set to 'success'; if the body raises, the row is set to 'error' with the
+    exception text truncated to 2000 characters, and the exception is
+    re-raised. The connection is always closed.
+
+    Args:
+        job_name: value stored in ``job_runs.job_name``.
+
+    Yields:
+        Tuple of the open connection and the id of the inserted row.
+    """
+    conn = get_conn()
+    run_id = None
+    # NOTE(review): naive UTC timestamps; confirm the column types of
+    # started_at / finished_at before switching to timezone-aware values.
+    started = datetime.utcnow()
+    try:
+        with conn.cursor() as c:
+            c.execute(
+                """
+                INSERT INTO job_runs (job_name, started_at, status)
+                VALUES (%s, %s, 'running') RETURNING id
+                """,
+                (job_name, started),
+            )
+            run_id = c.fetchone()[0]
+        conn.commit()
+        yield conn, run_id
+        with conn.cursor() as c:
+            c.execute(
+                "UPDATE job_runs SET status='success', finished_at=%s WHERE id=%s",
+                (datetime.utcnow(), run_id),
+            )
+        conn.commit()
+    except Exception as e:
+        if run_id is not None:
+            try:
+                # If the job died on a SQL error the transaction is aborted and
+                # Postgres rejects every further statement until a rollback, so
+                # without this the row would stay 'running' forever. It also
+                # discards whatever the failed job left uncommitted.
+                conn.rollback()
+                with conn.cursor() as c:
+                    c.execute(
+                        "UPDATE job_runs SET status='error', finished_at=%s, error=%s WHERE id=%s",
+                        (datetime.utcnow(), str(e)[:2000], run_id),
+                    )
+                conn.commit()
+            except Exception:
+                # Still best-effort (the original error below wins), but no
+                # longer silent.
+                logging.exception("could not mark job_runs id=%s as error", run_id)
+        raise
+    finally:
+        conn.close()
+
+
+def bulk_insert(conn, table: str, columns: list[str], rows: Iterable[tuple]):
+    """Insert ``rows`` into ``table`` in pages of 500 and commit.
+
+    ``table`` and ``columns`` are interpolated into the SQL text verbatim, so
+    they must come from trusted code. Row values are passed as parameters
+    through ``psycopg2.extras.execute_values``.
+    """
+    column_list = ", ".join(columns)
+    with conn.cursor() as cur:
+        statement = f"INSERT INTO {table} ({column_list}) VALUES %s"
+        payload = list(rows)
+        psycopg2.extras.execute_values(cur, statement, payload, page_size=500)
+    conn.commit()
+
+
+# --- Vault (optional) -------------------------------------------------------
+
+def vault_secret(path: str, key: str) -> str | None:
+    """Look up ``key`` in Vault, falling back to the environment.
+
+    Without VAULT_TOKEN no request is made and the env var named
+    ``key.upper()`` is returned. With a token, ``{VAULT_ADDR}/v1/{path}`` is
+    fetched and ``key`` is read from the response's ``data.data`` mapping. The
+    env var is also used when the request fails, Vault answers with an error
+    status, or the secret does not contain ``key``.
+
+    Args:
+        path: Vault API path appended after ``/v1/``.
+        key: field name inside the secret; upper-cased for the env fallback.
+
+    Returns:
+        The secret value, or None when neither Vault nor the environment
+        provides one.
+    """
+    env_value = os.environ.get(key.upper())
+    token = os.environ.get("VAULT_TOKEN")
+    if not token:
+        return env_value
+    # Tolerate a trailing slash on VAULT_ADDR and a leading one on path.
+    addr = os.environ.get("VAULT_ADDR", "http://localhost:8200").rstrip("/")
+    try:
+        r = requests.get(
+            f"{addr}/v1/{path.lstrip('/')}",
+            headers={"X-Vault-Token": token},
+            timeout=5,
+        )
+        # Without this a 403/404 shows up as an unhelpful KeyError('data').
+        r.raise_for_status()
+        value = r.json()["data"]["data"].get(key)
+    except Exception as e:
+        logging.warning("vault fetch failed: %s", e)
+        return env_value
+    return env_value if value is None else value
--- a/jobs/ingestion/bls_oes.py
+++ b/jobs/ingestion/bls_oes.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+BLS OES (Occupational Employment and Wage Statistics) — behavioral health
+workforce by MSA.
+
+Primary approach: annual bulk download (no auth, simplest):
+  https://www.bls.gov/oes/special-requests/oesmYYma.zip
+
+Fallback / enrichment: BLS public API (optional free key via vault).
+"""
+import csv
+import io
+import logging
+import sys
+import zipfile
+from _common import RateLimitedSession, bulk_insert, job_run, vault_secret
+
+# Module logger for this job.
+LOG = logging.getLogger("bhi.bls_oes")
+
+# Pinned to the "oesm23ma" archive. write_rows() labels every row "May2023",
+# so the two must be changed together when the year is bumped.
+BULK_URL = "https://www.bls.gov/oes/special-requests/oesm23ma.zip"  # update year annually
+# Occupation code -> display title. fetch_rows() keeps only rows whose
+# OCC_CODE is a key here; write_rows() stores the title from this map.
+BH_SOC_CODES = {
+    "29-1223": "Psychiatrists",
+    "29-1229": "Physicians, All Other",
+    "21-1014": "Mental Health Counselors",
+    "21-1015": "Rehabilitation Counselors",
+    "21-1018": "SUD / Behavioral Disorder Counselors",
+    "21-1023": "Mental Health & Substance Abuse Social Workers",
+    "19-3033": "Clinical & Counseling Psychologists",
+}
+
+
+def test_endpoint():
+    """Smoke test (no DB): HEAD the bulk zip and report status and size.
+
+    Returns True when the final response status is 200.
+    """
+    session = RateLimitedSession()
+    resp = session.head(BULK_URL, allow_redirects=True)
+    size = resp.headers.get('content-length')
+    print(f"OK: status={resp.status_code}, content-length={size}")
+    return resp.status_code == 200
+
+
+def fetch_rows():
+    """Download the bulk zip and return the CSV rows for BH occupations.
+
+    Returns:
+        A list of ``csv.DictReader`` row dicts whose OCC_CODE (or occ_code) is
+        a key of BH_SOC_CODES; an empty list when the archive has no ``.csv``
+        member.
+    """
+    s = RateLimitedSession(min_interval=1.0)
+    resp = s.get(BULK_URL)
+    # Context manager so the archive handle is released on every path.
+    with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
+        # Bulk zip contains one CSV/XLSX with MSA rows
+        # NOTE(review): if the archive ships only an XLSX this returns [] and
+        # the job still finishes as 'success'; confirm the archive contents.
+        csv_name = next((n for n in z.namelist() if n.lower().endswith(".csv")), None)
+        if not csv_name:
+            LOG.error("no CSV in BLS zip")
+            return []
+        with z.open(csv_name) as f:
+            # newline="" lets the csv module do its own newline handling, so
+            # quoted fields with embedded line breaks survive intact.
+            text = io.TextIOWrapper(f, encoding="latin-1", newline="")
+            rows = [
+                row for row in csv.DictReader(text)
+                if (row.get("OCC_CODE") or row.get("occ_code")) in BH_SOC_CODES
+            ]
+    LOG.info("BLS OES BH rows: %d", len(rows))
+    return rows
+
+
+def _num(v):
+    """Convert a cell to float, or None when it is blank or not numeric.
+
+    None, "", "*" and "#" map to None. Anything else is stringified, stripped
+    of commas and parsed; a parse failure also yields None.
+    """
+    try:
+        if v in (None, "", "*", "#"):
+            return None
+        cleaned = str(v).replace(",", "")
+        return float(cleaned)
+    except (TypeError, ValueError):
+        return None
+
+
+def write_rows(conn, raw):
+    """Map OES CSV rows to bhi_workforce tuples and insert them.
+
+    Each field is read under its upper-case header first, then lower-case.
+    An employment of 0 or a non-numeric employment is stored as None. The
+    period is the fixed label "May2023".
+
+    Returns the number of rows handed to bulk_insert().
+    """
+    def pick(rec, header):
+        # Upper-case header wins; the lower-case spelling is the fallback.
+        return rec.get(header) or rec.get(header.lower())
+
+    cols = ["msa_code","msa_name","occupation_code","occupation_title",
+            "employment","annual_wage_median","annual_wage_mean","period","source"]
+    rows = []
+    for rec in raw:
+        code = pick(rec, "OCC_CODE")
+        msa_code = pick(rec, "AREA")
+        msa_name = pick(rec, "AREA_TITLE")
+        title = BH_SOC_CODES.get(code, pick(rec, "OCC_TITLE"))
+        employment = int(_num(pick(rec, "TOT_EMP")) or 0) or None
+        median_wage = _num(pick(rec, "A_MEDIAN"))
+        mean_wage = _num(pick(rec, "A_MEAN"))
+        rows.append((msa_code, msa_name, code, title, employment,
+                     median_wage, mean_wage, "May2023", "bls_oes"))
+    bulk_insert(conn, "bhi_workforce", cols, rows)
+    return len(rows)
+
+
+def main():
+    """Run the job: fetch, insert, and record the run as "bhi_bls_oes"."""
+    with job_run("bhi_bls_oes") as (conn, _run_id):
+        inserted = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", inserted)
+
+
+if __name__ == "__main__":
+    # `<script> test` runs only the endpoint smoke test (exit 0 on success).
+    if sys.argv[1:2] == ["test"]:
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/cdc_brfss.py
+++ b/jobs/ingestion/cdc_brfss.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CDC BRFSS Prevalence Data (Socrata).
+
+Source: https://data.cdc.gov/resource/dttw-5yxu.json
+Pulls depression + mental-health-not-good items by state, with
+young-adult (18-24) breakouts where available.
+"""
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+# Module logger for this job.
+LOG = logging.getLogger("bhi.cdc_brfss")
+# Socrata JSON endpoint queried by test_endpoint() and fetch_rows().
+BASE = "https://data.cdc.gov/resource/dttw-5yxu.json"
+
+# BRFSS topics of interest for BHI
+# Each entry is compared for equality with the dataset's `topic` field in a
+# $where filter, so spelling and case must match the dataset exactly.
+TOPICS = [
+    "Depression",
+    "Mental Health Status",
+    "Poor Mental Health",
+]
+
+
+def test_endpoint():
+    """Smoke test (no DB): request two rows and print the first topic.
+
+    Returns True when at least one row came back.
+    """
+    session = RateLimitedSession()
+    sample = session.get(BASE, params={"$limit": 2}).json()
+    print(f"OK: returned {len(sample)} rows")
+    if sample:
+        first = sample[0]
+        print("sample topic:", first.get("topic"))
+    return bool(sample)
+
+
+def fetch_rows():
+    """Page through every row of each topic in TOPICS.
+
+    Returns:
+        All rows of all topics concatenated, as decoded JSON dicts.
+    """
+    page_size = 5000
+    s = RateLimitedSession(min_interval=0.2)
+    out = []
+    for topic in TOPICS:
+        offset = 0
+        while True:
+            batch = s.get(BASE, params={
+                "$where": f"topic='{topic}'",
+                # Socrata does not guarantee a stable row order between
+                # requests unless one is asked for; without it, $offset paging
+                # can repeat or skip rows.
+                "$order": ":id",
+                "$limit": page_size,
+                "$offset": offset,
+            }).json()
+            if not batch:
+                break
+            out.extend(batch)
+            if len(batch) < page_size:
+                break
+            offset += page_size
+        # `total` is the running count across all topics fetched so far.
+        LOG.info("topic=%s total=%d", topic, len(out))
+    return out
+
+
+def write_rows(conn, raw):
+    """Map BRFSS rows to bhi_demand_indicators tuples and insert them.
+
+    Rows with no ``data_value`` or a non-numeric one are skipped: storing them
+    as 0.0 would record a missing estimate as a real 0 % prevalence.
+
+    The ``break_out`` field selects the age bracket: a value mentioning both
+    "18" and "24" becomes "18-25", "overall" becomes "all", anything else is
+    kept lower-cased.
+
+    Returns the number of rows handed to bulk_insert().
+    """
+    cols = ["geo_type","geo_code","measure","age_bracket","period","value","source"]
+    rows = []
+    for r in raw:
+        raw_value = r.get("data_value")
+        if raw_value is None or raw_value == "":
+            continue  # estimate not reported
+        try:
+            val = float(raw_value)
+        except (TypeError, ValueError):
+            continue
+        breakout = (r.get("break_out") or "Overall").lower()
+        if "18" in breakout and "24" in breakout:
+            # Survey bracket 18-24 is filed under the BHI "18-25" label.
+            bracket = "18-25"
+        elif "overall" in breakout:
+            bracket = "all"
+        else:
+            bracket = breakout
+        rows.append((
+            "state",
+            r.get("locationabbr"),
+            (r.get("question") or r.get("topic") or "").strip()[:120],
+            bracket,
+            str(r.get("year") or ""),
+            val,
+            "cdc_brfss",
+        ))
+    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
+    return len(rows)
+
+
+def main():
+    """Run the job: fetch, insert, and record the run as "bhi_cdc_brfss"."""
+    with job_run("bhi_cdc_brfss") as (conn, _run_id):
+        inserted = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", inserted)
+
+
+if __name__ == "__main__":
+    # `<script> test` runs only the endpoint smoke test (exit 0 on success).
+    if sys.argv[1:2] == ["test"]:
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/cdc_wonder_mortality.py
+++ b/jobs/ingestion/cdc_wonder_mortality.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CDC WONDER — Underlying Cause of Death by county, age bracket, ICD-10.
+
+Posts XML request body to https://wonder.cdc.gov/controller/datarequest/D76
+(Underlying Cause of Death 1999-2020) or D77 (2018+). The public non-restricted
+datasets return XML tables; county-level cells with <10 deaths are suppressed.
+
+We request two slices:
+  1. Suicide (X60-X84) for ages 13-17 and 18-25, by county
+  2. Drug poisoning (X40-X44, Y10-Y14) for 13-17 and 18-25, by county
+"""
+import logging
+import sys
+import xml.etree.ElementTree as ET
+from _common import RateLimitedSession, bulk_insert, job_run
+
+# Module logger for this job.
+LOG = logging.getLogger("bhi.cdc_wonder")
+# Every request is POSTed here. The "D76" suffix matches the D76.* parameter
+# names hard-coded in _build_xml(), so the two must change together.
+ENDPOINT = "https://wonder.cdc.gov/controller/datarequest/D76"
+
+
+def _build_xml(icd_codes: list[str], age_bracket: str) -> str:
+    """Assemble WONDER POST XML. Structure is value-order dependent.
+
+    Args:
+        icd_codes: ICD-10 codes or ranges, sent as the values of F_D76.V22.
+        age_bracket: "13-17" or "18-25"; any other value raises KeyError.
+
+    Returns:
+        The request document as a string.
+    """
+    from xml.sax.saxutils import escape
+
+    # Age groups in WONDER: 15-19, 20-24, 25-29 etc. Adolescent and young-adult
+    # brackets don't align perfectly with 5-year WONDER bins — closest fit:
+    ages = {
+        "13-17": ["15-19"],   # approximate
+        "18-25": ["20-24", "25-29"],
+    }[age_bracket]
+    # A multi-valued parameter repeats <value>, the same element every
+    # single-valued parameter in the template uses (the original emitted <v>).
+    # Values are escaped so the document stays well-formed.
+    icd_vals = "".join(f"<value>{escape(c)}</value>" for c in icd_codes)
+    age_vals = "".join(f"<value>{escape(a)}</value>" for a in ages)
+    # NOTE(review): the parameter codes (B_1, F_D76.V1, F_D76.V22, ...) are
+    # carried over unchanged; verify them against WONDER's D76 parameter list.
+    return f"""<?xml version="1.0" encoding="utf-8"?>
+<request-parameters>
+  <parameter><name>accept_datause_restrictions</name><value>true</value></parameter>
+  <parameter><name>B_1</name><value>D76.V2-level1</value></parameter>
+  <parameter><name>B_2</name><value>D76.V51</value></parameter>
+  <parameter><name>F_D76.V1</name>{age_vals}</parameter>
+  <parameter><name>F_D76.V2</name><value>*All*</value></parameter>
+  <parameter><name>F_D76.V22</name>{icd_vals}</parameter>
+  <parameter><name>O_age</name><value>D76.V51</value></parameter>
+  <parameter><name>O_location</name><value>D76.V9</value></parameter>
+  <parameter><name>VM_D76.M6_D76.V10</name><value/></parameter>
+</request-parameters>"""
+
+
+def test_endpoint():
+    """Smoke test (no DB): post one suicide / 13-17 request.
+
+    Returns True when the status is 200 and the body contains "<response".
+    """
+    session = RateLimitedSession(min_interval=1.0)
+    payload = {
+        "request_xml": _build_xml(["X60-X84"], "13-17"),
+        "accept_datause_restrictions": "true",
+    }
+    resp = session.post(ENDPOINT, data=payload)
+    ok = resp.status_code == 200 and b"<response" in resp.content
+    print(f"OK={ok}, status={resp.status_code}, len={len(resp.content)}")
+    return ok
+
+
+def fetch_rows():
+    """Request every measure x age-bracket slice and parse the responses.
+
+    Four requests are made (suicide and overdose, each for "13-17" and
+    "18-25"), at least one second apart.
+
+    Returns the parsed row dicts of all slices, concatenated.
+    """
+    slices = [
+        ("suicide_rate", ["X60-X84"]),
+        ("overdose_rate", ["X40-X44", "Y10-Y14"]),
+    ]
+    session = RateLimitedSession(min_interval=1.0)
+    collected = []
+    for measure, icd in slices:
+        for bracket in ("13-17", "18-25"):
+            payload = {
+                "request_xml": _build_xml(icd, bracket),
+                "accept_datause_restrictions": "true",
+            }
+            resp = session.post(ENDPOINT, data=payload)
+            parsed = _parse_wonder_xml(resp.text, measure, bracket)
+            collected.extend(parsed)
+            LOG.info("%s %s -> %d rows", measure, bracket, len(parsed))
+    return collected
+
+
+def _parse_wonder_xml(xml_text: str, measure: str, bracket: str):
+    """Turn a WONDER XML response into bhi_demand_indicators row dicts.
+
+    Every ``<r>`` element with at least three ``<c>`` cells yields one row: the
+    first cell is the geography, the last cell is the value. Rows whose last
+    cell is not numeric (e.g. a suppression marker) are skipped.
+
+    Args:
+        xml_text: response body.
+        measure: stored as-is in the ``measure`` field.
+        bracket: stored as-is in the ``age_bracket`` field.
+
+    Returns:
+        A list of dicts; empty when the body is not well-formed XML.
+    """
+    out = []
+    try:
+        root = ET.fromstring(xml_text)
+    except ET.ParseError:
+        LOG.error("WONDER XML parse failed")
+        return out
+    # WONDER returns <data-table> with <r> rows containing <c l="label"/>;
+    # numeric cells carry their content in a `v` attribute instead, so read
+    # l, then v, then fall back to element text.
+    # NOTE(review): a label cell that spans several rows is present only on the
+    # first of them; such rows would then take the wrong cell as geography.
+    # Confirm against a real county-level response.
+    for r in root.iter("r"):
+        cells = [c.get("l") or c.get("v") or c.text for c in r.findall("c")]
+        if len(cells) < 3:
+            continue
+        county = cells[0]
+        try:
+            # Values may carry thousands separators ("1,234.5").
+            rate = float(str(cells[-1]).replace(",", ""))
+        except (TypeError, ValueError):
+            continue
+        out.append({
+            "geo_type": "county",
+            # NOTE(review): this is the label text as returned, not a FIPS code.
+            "geo_code": county,
+            "measure": measure,
+            "age_bracket": bracket,
+            # NOTE(review): fixed label; nothing in the request restricts the
+            # years to this window. Confirm it matches what is queried.
+            "period": "2018-2022",  # WONDER typical 5-year window
+            "value": rate,
+            "source": "cdc_wonder",
+        })
+    return out
+
+
+def write_rows(conn, raw):
+    """Insert parsed WONDER row dicts into bhi_demand_indicators.
+
+    Each dict must contain every name in ``cols`` (KeyError otherwise).
+    Returns the number of rows handed to bulk_insert().
+    """
+    cols = ["geo_type","geo_code","measure","age_bracket","period","value","source"]
+    # The tuple order is the column order, so build each tuple from `cols`.
+    rows = [tuple(rec[name] for name in cols) for rec in raw]
+    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
+    return len(rows)
+
+
+def main():
+    """Run the job: fetch, insert, and record the run as "bhi_cdc_wonder"."""
+    with job_run("bhi_cdc_wonder") as (conn, _run_id):
+        inserted = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", inserted)
+
+
+if __name__ == "__main__":
+    # `<script> test` runs only the endpoint smoke test (exit 0 on success).
+    if sys.argv[1:2] == ["test"]:
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/cdc_yrbss.py
+++ b/jobs/ingestion/cdc_yrbss.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CDC YRBSS — Youth Risk Behavior Survey (high and middle school).
+
+Sources (Socrata):
+  - High school: https://data.cdc.gov/resource/3qty-g4aq.json
+  - Middle school: https://data.cdc.gov/resource/uqmk-4y2w.json
+
+Key items: "considered suicide", "attempted suicide", "persistent sadness",
+substance use — all adolescent (13-17) bracket.
+"""
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+# Module logger for this job.
+LOG = logging.getLogger("bhi.cdc_yrbss")
+# Dataset key -> Socrata JSON endpoint. The key is stamped on every fetched row
+# as "_dataset" and ends up in the stored source ("cdc_yrbss_hs" / "cdc_yrbss_ms").
+DATASETS = {
+    "hs": "https://data.cdc.gov/resource/3qty-g4aq.json",
+    "ms": "https://data.cdc.gov/resource/uqmk-4y2w.json",
+}
+
+# Lower-case substrings; a row is kept only if its question contains at least
+# one of them. Plain substring matching, so "sad" also matches inside longer words.
+KEYWORDS = ["suicide", "sad", "hopeless", "mental health", "electronic"]
+
+
+def test_endpoint():
+    """Smoke test (no DB): request one row from each dataset in DATASETS.
+
+    Returns True when every dataset answered with status 200.
+    """
+    session = RateLimitedSession()
+    results = []
+    for label, url in DATASETS.items():
+        resp = session.get(url, params={"$limit": 1})
+        print(f"{label}: status={resp.status_code}, rows={len(resp.json())}")
+        results.append(resp.status_code == 200)
+    return all(results)
+
+
+def fetch_rows():
+    """Page through both YRBSS datasets in full.
+
+    Every row is tagged with ``_dataset`` (its key in DATASETS) so write_rows()
+    can record which survey it came from.
+
+    Returns all rows of all datasets concatenated, as decoded JSON dicts.
+    """
+    page_size = 5000
+    s = RateLimitedSession(min_interval=0.2)
+    out = []
+    for key, url in DATASETS.items():
+        offset = 0
+        while True:
+            batch = s.get(url, params={
+                # Socrata does not guarantee a stable row order between
+                # requests unless one is asked for; without it, $offset paging
+                # can repeat or skip rows.
+                "$order": ":id",
+                "$limit": page_size,
+                "$offset": offset,
+            }).json()
+            if not batch:
+                break
+            for row in batch:
+                row["_dataset"] = key
+            out.extend(batch)
+            if len(batch) < page_size:
+                break
+            offset += page_size
+        # The count is the running total across datasets fetched so far.
+        LOG.info("yrbss %s -> %d", key, len(out))
+    return out
+
+
+def _question_is_relevant(q: str) -> bool:
+    """Return True if ``q`` contains any KEYWORDS entry, ignoring case.
+
+    None or an empty string is never relevant.
+    """
+    lowered = (q or "").lower()
+    for keyword in KEYWORDS:
+        if keyword in lowered:
+            return True
+    return False
+
+
+def write_rows(conn, raw):
+    """Filter YRBSS rows to BH-relevant questions and insert them.
+
+    A row is kept when any of its question fields matches KEYWORDS and it has
+    a non-zero numeric value (``data_value``, else
+    ``greater_risk_data_value``). All rows are filed under age bracket
+    "13-17".
+
+    Returns the number of rows handed to bulk_insert().
+    """
+    cols = ["geo_type","geo_code","measure","age_bracket","period","value","source"]
+    text_fields = ("questioncode", "shortquestiontext", "question")
+    rows = []
+    for r in raw:
+        # Stored label: first non-empty of code, short text, full text.
+        question = r.get("questioncode") or r.get("shortquestiontext") or r.get("question") or ""
+        # Relevance is judged on all question fields together. Judging only
+        # the stored label meant that whenever a questioncode was present, the
+        # text keywords were tested against the code and the descriptive text
+        # was never consulted.
+        searchable = " ".join(str(r.get(f) or "") for f in text_fields)
+        if not _question_is_relevant(searchable):
+            continue
+        try:
+            val = float(r.get("data_value") or r.get("greater_risk_data_value") or 0)
+        except (TypeError, ValueError):
+            continue
+        if val == 0:
+            continue  # missing values were coerced to 0 above
+        rows.append((
+            # NOTE(review): any row with a locationdesc is labelled "state";
+            # confirm district-level rows really lack that field.
+            "state" if r.get("locationdesc") else "district",
+            r.get("locationabbr") or r.get("sitecode"),
+            question[:120],
+            "13-17",
+            str(r.get("year") or ""),
+            val,
+            f"cdc_yrbss_{r.get('_dataset','hs')}",
+        ))
+    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
+    return len(rows)
+
+
+def main():
+    """Run the job: fetch, insert, and record the run as "bhi_cdc_yrbss"."""
+    with job_run("bhi_cdc_yrbss") as (conn, _run_id):
+        inserted = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", inserted)
+
+
+if __name__ == "__main__":
+    # `<script> test` runs only the endpoint smoke test (exit 0 on success).
+    if sys.argv[1:2] == ["test"]:
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/cms_hospital_compare.py
+++ b/jobs/ingestion/cms_hospital_compare.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CMS Hospital General Information (Care Compare) — used to cross-reference
+which acute hospitals host behavioral health units and to capture CCN-level
+facility metadata.
+
+Source: https://data.cms.gov/provider-data/api/1/datastore/query/xubh-q36u/0
+"""
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+# Module logger for this job.
+LOG = logging.getLogger("bhi.cms_hospital_compare")
+# Datastore query endpoint used by test_endpoint() and fetch_rows().
+BASE = "https://data.cms.gov/provider-data/api/1/datastore/query/xubh-q36u/0"
+# Rows requested per page; a page shorter than this ends the fetch loop.
+PAGE = 500
+
+
+def test_endpoint():
+    """Smoke test (no DB): request two rows and print the first facility name.
+
+    Returns True when the "results" list is non-empty.
+    """
+    session = RateLimitedSession()
+    payload = session.get(BASE, params={"limit": 2}).json()
+    rows = payload.get("results", [])
+    sample = rows[0].get("facility_name") if rows else None
+    print(f"OK: {len(rows)} rows, sample:", sample)
+    return bool(rows)
+
+
+def fetch_rows():
+    """Page through the dataset PAGE rows at a time.
+
+    Stops at the first empty or short page. Returns every "results" entry.
+    """
+    session = RateLimitedSession(min_interval=0.25)
+    hospitals = []
+    offset = 0
+    more = True
+    while more:
+        payload = session.get(BASE, params={"limit": PAGE, "offset": offset}).json()
+        page = payload.get("results", [])
+        if page:
+            hospitals.extend(page)
+        # Continue only after a full page.
+        more = bool(page) and len(page) >= PAGE
+        offset += PAGE
+    LOG.info("fetched %d hospitals", len(hospitals))
+    return hospitals
+
+
+def write_rows(conn, raw):
+    """Map hospital rows to bhi_facilities tuples and insert them.
+
+    Only identity, address, type and ownership are filled; the three array
+    columns get empty lists and every other column is None. A missing
+    ``hospital_type`` is stored as "hospital".
+
+    Returns the number of rows handed to bulk_insert().
+    """
+    cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+    rows = []
+    for rec in raw:
+        # Start with every column None, then fill what this source provides.
+        values = dict.fromkeys(cols)
+        values.update(
+            ccn=rec.get("facility_id"),
+            name=rec.get("facility_name"),
+            address=rec.get("address"),
+            city=rec.get("citytown"),
+            state=rec.get("state"),
+            zip=rec.get("zip_code"),
+            facility_type=(rec.get("hospital_type") or "hospital"),
+            ownership=rec.get("hospital_ownership"),
+            services_offered=[],
+            populations_served=[],
+            payment_accepted=[],
+            source="cms_hospital_compare",
+        )
+        rows.append(tuple(values[col] for col in cols))
+    bulk_insert(conn, "bhi_facilities", cols, rows)
+    return len(rows)
+
+
+def main():
+    """Run the job and record the run as "bhi_cms_hospital_compare"."""
+    with job_run("bhi_cms_hospital_compare") as (conn, _run_id):
+        inserted = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", inserted)
+
+
+if __name__ == "__main__":
+    # `<script> test` runs only the endpoint smoke test (exit 0 on success).
+    if sys.argv[1:2] == ["test"]:
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/cms_ipfqr.py
+++ b/jobs/ingestion/cms_ipfqr.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CMS Inpatient Psychiatric Facility Quality Reporting (IPFQR) ingestion.
+
+Source: https://data.cms.gov/provider-data/api/1/datastore/query/q9vs-r7wp/0
+Writes facilities to bhi_facilities and measures to bhi_facility_quality.
+"""
+import logging
+import sys
+from typing import Any
+
+from _common import RateLimitedSession, bulk_insert, job_run
+
+# Module logger for this job.
+LOG = logging.getLogger("bhi.cms_ipfqr")
+
+DATASET_ID = "q9vs-r7wp"  # IPFQR by Facility
+# Datastore query endpoint for DATASET_ID.
+BASE = f"https://data.cms.gov/provider-data/api/1/datastore/query/{DATASET_ID}/0"
+# Rows requested per page; a page shorter than this ends the fetch loop.
+PAGE_SIZE = 500
+
+# (response field, measure_id, measure_name). write_rows() reads each measure
+# from `<field>` and falls back to `<field>_overall_rate_per_1000`.
+MEASURE_FIELDS = [
+    ("hbips2", "HBIPS-2", "Hours of physical-restraint use"),
+    ("hbips3", "HBIPS-3", "Hours of seclusion use"),
+    ("smd",    "SMD",     "Screening for metabolic disorders"),
+    ("sub2",   "SUB-2",   "Alcohol use brief intervention"),
+    ("sub3",   "SUB-3",   "Alcohol/other drug use treatment at discharge"),
+    ("tob3",   "TOB-3",   "Tobacco use treatment at discharge"),
+]
+
+
+# --- TEST function (no DB) --------------------------------------------------
+
+def test_endpoint():
+    """Run standalone to verify the endpoint works.
+
+    Requests three rows and prints a sample. Returns True when at least one
+    row came back.
+    """
+    session = RateLimitedSession()
+    payload = session.get(BASE, params={"limit": 3}).json()
+    rows = payload.get("results", [])
+    print(f"OK: fetched {len(rows)} rows from {BASE}")
+    if rows:
+        first = rows[0]
+        print("Sample keys:", list(first.keys())[:12])
+        print("Sample facility:", first.get("facility_name"), first.get("state"))
+    return len(rows) > 0
+
+
+# --- Fetch ------------------------------------------------------------------
+
+def fetch_rows() -> list[dict[str, Any]]:
+    """Page through the dataset PAGE_SIZE rows at a time.
+
+    Stops at the first empty or short page. Returns every "results" entry.
+    """
+    session = RateLimitedSession(min_interval=0.25)
+    collected: list[dict[str, Any]] = []
+    offset = 0
+    more = True
+    while more:
+        resp = session.get(BASE, params={"limit": PAGE_SIZE, "offset": offset})
+        page = resp.json().get("results", [])
+        if page:
+            collected.extend(page)
+            LOG.info("fetched %d (total %d)", len(page), len(collected))
+        # Continue only after a full page.
+        more = bool(page) and len(page) >= PAGE_SIZE
+        offset += PAGE_SIZE
+    return collected
+
+
+# --- Write ------------------------------------------------------------------
+
+def write_rows(conn, raw_rows: list[dict[str, Any]]) -> tuple[int, int]:
+    """Insert IPFQR facilities, then their quality measures.
+
+    Facilities go to bhi_facilities (committed by bulk_insert). The
+    ccn -> facility_id mapping is then read back for source 'cms_ipfqr' and
+    used to attach each numeric measure in MEASURE_FIELDS to its facility in
+    bhi_facility_quality. Measures that are blank, "Not Available" or not
+    numeric are skipped, as are rows whose ccn has no mapping.
+
+    Returns:
+        (number of facility rows, number of measure rows) handed to
+        bulk_insert().
+    """
+    facility_cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+
+    facility_rows = []
+    for rec in raw_rows:
+        # Start with every column None, then fill what this source provides.
+        # county_fips stays None here (join later via zip->fips).
+        values = dict.fromkeys(facility_cols)
+        values.update(
+            ccn=rec.get("facility_id"),
+            name=rec.get("facility_name"),
+            address=rec.get("address"),
+            city=rec.get("citytown"),
+            state=rec.get("state"),
+            zip=rec.get("zip_code"),
+            facility_type="IPF",
+            services_offered=[],
+            populations_served=[],
+            payment_accepted=[],
+            source="cms_ipfqr",
+        )
+        facility_rows.append(tuple(values[col] for col in facility_cols))
+    bulk_insert(conn, "bhi_facilities", facility_cols, facility_rows)
+
+    # Map ccn -> facility_id for measures
+    with conn.cursor() as cur:
+        cur.execute(
+            "SELECT ccn, facility_id FROM bhi_facilities WHERE source='cms_ipfqr'"
+        )
+        ccn_map = dict(cur.fetchall())
+
+    def _as_float(cell):
+        # None for the blank / "Not Available" markers and unparsable cells.
+        try:
+            if cell in (None, "", "Not Available"):
+                return None
+            return float(cell)
+        except (TypeError, ValueError):
+            return None
+
+    measure_rows = []
+    for rec in raw_rows:
+        fid = ccn_map.get(rec.get("facility_id"))
+        if not fid:
+            continue
+        for field, mid, mname in MEASURE_FIELDS:
+            number = _as_float(rec.get(field) or rec.get(f"{field}_overall_rate_per_1000"))
+            if number is not None:
+                measure_rows.append((fid, mid, mname, number, None, None, None, "cms_ipfqr"))
+
+    cols = ["facility_id","measure_id","measure_name","value","benchmark","period","reported_at","source"]
+    bulk_insert(conn, "bhi_facility_quality", cols, measure_rows)
+    return len(facility_rows), len(measure_rows)
+
+
+def main():
+    """Run the job and record the run as "bhi_cms_ipfqr"."""
+    with job_run("bhi_cms_ipfqr") as (conn, run_id):
+        fetched = fetch_rows()
+        facilities, measures = write_rows(conn, fetched)
+        LOG.info("inserted %d facilities, %d measures (run %s)", facilities, measures, run_id)
+
+
+if __name__ == "__main__":
+    # `<script> test` runs only the endpoint smoke test (exit 0 on success).
+    if sys.argv[1:2] == ["test"]:
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/cms_nursing_home.py
+++ b/jobs/ingestion/cms_nursing_home.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CMS Nursing Home Provider Information — captures SNFs that house behavioral
+health residents (SNF-IMD dynamic) for later filtering on chain + ownership.
+
+Source: https://data.cms.gov/provider-data/api/1/datastore/query/4pq5-n9py/0
+"""
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+# Module logger for this job.
+LOG = logging.getLogger("bhi.cms_nursing_home")
+# Datastore query endpoint used by test_endpoint() and fetch_rows().
+BASE = "https://data.cms.gov/provider-data/api/1/datastore/query/4pq5-n9py/0"
+# Rows requested per page; a page shorter than this ends the fetch loop.
+PAGE = 1000
+
+
+def test_endpoint():
+    """Smoke test (no DB): request two rows and print the first provider name.
+
+    Returns True when the "results" list is non-empty.
+    """
+    session = RateLimitedSession()
+    payload = session.get(BASE, params={"limit": 2}).json()
+    rows = payload.get("results", [])
+    sample = rows[0].get("provider_name") if rows else None
+    print(f"OK: {len(rows)} rows, sample:", sample)
+    return bool(rows)
+
+
+def fetch_rows():
+    """Page through the dataset PAGE rows at a time.
+
+    Stops at the first empty or short page. Returns every "results" entry.
+    """
+    session = RateLimitedSession(min_interval=0.25)
+    homes = []
+    offset = 0
+    more = True
+    while more:
+        payload = session.get(BASE, params={"limit": PAGE, "offset": offset}).json()
+        page = payload.get("results", [])
+        if page:
+            homes.extend(page)
+        # Continue only after a full page.
+        more = bool(page) and len(page) >= PAGE
+        offset += PAGE
+    LOG.info("fetched %d nursing homes", len(homes))
+    return homes
+
+
+def write_rows(conn, raw):
+    """Map nursing-home rows to bhi_facilities tuples and insert them.
+
+    Fills identity, address, ownership, certified bed count and first-approval
+    date. A bed count of 0 or a non-numeric one is stored as None, as is an
+    empty approval date. The three array columns get empty lists and every
+    other column is None.
+
+    Returns the number of rows handed to bulk_insert().
+    """
+    cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+    approval_key = "date_first_approved_to_provide_medicare_and_medicaid_services"
+    rows = []
+    for rec in raw:
+        try:
+            beds = int(rec.get("number_of_certified_beds") or 0) or None
+        except (TypeError, ValueError):
+            beds = None
+        opened = rec.get(approval_key)
+        # Start with every column None, then fill what this source provides.
+        values = dict.fromkeys(cols)
+        values.update(
+            ccn=rec.get("cms_certification_number_ccn"),
+            name=rec.get("provider_name"),
+            address=rec.get("provider_address"),
+            city=rec.get("citytown"),
+            state=rec.get("state"),
+            zip=rec.get("zip_code"),
+            facility_type="nursing_home",
+            ownership=rec.get("ownership_type"),
+            bed_count=beds,
+            services_offered=[],
+            populations_served=[],
+            payment_accepted=[],
+            opened_date=opened if opened else None,
+            source="cms_nursing_home",
+        )
+        rows.append(tuple(values[col] for col in cols))
+    bulk_insert(conn, "bhi_facilities", cols, rows)
+    return len(rows)
+
+
+def main():
+    """Run the job and record the run as "bhi_cms_nursing_home"."""
+    with job_run("bhi_cms_nursing_home") as (conn, _run_id):
+        inserted = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", inserted)
+
+
+if __name__ == "__main__":
+    # `<script> test` runs only the endpoint smoke test (exit 0 on success).
+    if sys.argv[1:2] == ["test"]:
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/cms_pos.py
+++ b/jobs/ingestion/cms_pos.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CMS Provider of Services (POS) file — quarterly bulk CSV with every
+Medicare-certified facility including provider category (IPFs, PRTFs, etc.),
+bed counts, certification date, and termination date. Critical for
+closure/opening tracking used in composite_score.capacity_trend.
+"""
+import csv
+import io
+import logging
+import sys
+import zipfile
+from datetime import datetime
+from _common import RateLimitedSession, bulk_insert, job_run
+
+LOG = logging.getLogger("bhi.cms_pos")
+# JSON catalog whose "dataset" entries are searched by title for
+# "provider of services" (see test_endpoint and _latest_pos_distribution).
+CATALOG_URL = "https://data.cms.gov/data.json"
+
+
+def test_endpoint():
+    s = RateLimitedSession()
+    r = s.get(CATALOG_URL).json()
+    pos = [d for d in r.get("dataset", []) if "provider of services" in d.get("title", "").lower()]
+    print(f"OK: {len(pos)} POS datasets in catalog")
+    for d in pos[:3]:
+        print(" -", d.get("title"))
+    return len(pos) > 0
+
+
+def _latest_pos_distribution():
+    s = RateLimitedSession(min_interval=0.3)
+    r = s.get(CATALOG_URL).json()
+    pos = [d for d in r.get("dataset", [])
+           if "provider of services" in d.get("title", "").lower()
+           and "hospital" in d.get("title", "").lower()]
+    if not pos:
+        return None
+    latest = max(pos, key=lambda d: d.get("modified", ""))
+    for dist in latest.get("distribution", []):
+        url = dist.get("downloadURL") or dist.get("accessURL", "")
+        if url.endswith((".zip", ".csv")):
+            return url
+    return None
+
+
+def fetch_rows():
+    url = _latest_pos_distribution()
+    if not url:
+        LOG.error("Could not resolve POS download URL")
+        return []
+    LOG.info("fetching POS: %s", url)
+    s = RateLimitedSession(min_interval=0.5)
+    r = s.get(url)
+    content = r.content
+    if url.endswith(".zip"):
+        z = zipfile.ZipFile(io.BytesIO(content))
+        csvname = next((n for n in z.namelist() if n.lower().endswith(".csv")), None)
+        with z.open(csvname) as f:
+            text = io.TextIOWrapper(f, encoding="latin-1").read()
+    else:
+        text = content.decode("latin-1", errors="replace")
+    reader = csv.DictReader(io.StringIO(text))
+    # Filter to psychiatric + BH provider categories
+    # CMS PRVDR_CTGRY_CD: 04 = psych hospital, sub-category variations
+    keep = []
+    for row in reader:
+        cat = row.get("PRVDR_CTGRY_CD") or row.get("prvdr_ctgry_cd") or ""
+        subcat = row.get("PRVDR_CTGRY_SBTYP_CD") or row.get("prvdr_ctgry_sbtyp_cd") or ""
+        if cat in ("04",) or "psych" in (row.get("FAC_NAME", "") + row.get("fac_name", "")).lower():
+            keep.append(row)
+    LOG.info("filtered POS to %d BH-relevant rows", len(keep))
+    return keep
+
+
+def _parse_date(s):
+    if not s:
+        return None
+    for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%Y%m%d"):
+        try:
+            return datetime.strptime(s, fmt).date()
+        except ValueError:
+            continue
+    return None
+
+
+def _num(v):
+    try:
+        return int(float(v)) if v not in (None, "") else None
+    except (TypeError, ValueError):
+        return None
+
+
+def write_rows(conn, raw):
+    cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+    rows = []
+    for r in raw:
+        def g(*keys):
+            for k in keys:
+                v = r.get(k) or r.get(k.lower())
+                if v:
+                    return v
+            return None
+        rows.append((
+            g("PRVDR_NUM", "prvdr_num"), None,
+            g("FAC_NAME", "fac_name"),
+            g("ST_ADR", "st_adr"),
+            g("CITY_NAME", "city_name"),
+            g("STATE_CD", "state_cd"),
+            g("ZIP_CD", "zip_cd"),
+            None, None, None,
+            "IPF",
+            g("GNRL_CNTL_TYPE_CD", "gnrl_cntl_type_cd"),
+            _num(g("BED_CNT", "bed_cnt")),
+            _num(g("CRTFD_BED_CNT", "crtfd_bed_cnt")),
+            None, None, None,
+            [], [], [], None, None,
+            _parse_date(g("ORGNL_PRTCPTN_DT", "orgnl_prtcptn_dt")),
+            _parse_date(g("TRMNTN_EXPRTN_DT", "trmntn_exprtn_dt")),
+            None,
+            "cms_pos", None,
+        ))
+    bulk_insert(conn, "bhi_facilities", cols, rows)
+    return len(rows)
+
+
+def main():
+    with job_run("bhi_cms_pos") as (conn, _):
+        n = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", n)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "test":
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/hrsa_hpsa.py
+++ b/jobs/ingestion/hrsa_hpsa.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+HRSA Mental Health HPSA (Health Professional Shortage Areas) bulk CSV.
+
+Source: https://data.hrsa.gov/DataDownload/DD_Files/BCD_HPSA_FCT_DET_MH.csv
+Confirmed: ~23 MB CSV, all active + historical MH HPSAs.
+"""
+import csv
+import io
+import logging
+import sys
+from datetime import datetime
+from _common import RateLimitedSession, bulk_insert, job_run
+
+LOG = logging.getLogger("bhi.hrsa_hpsa")
+# Bulk CSV fetched in full by fetch_rows(); test_endpoint() streams only its
+# first line.
+URL = "https://data.hrsa.gov/DataDownload/DD_Files/BCD_HPSA_FCT_DET_MH.csv"
+
+
+def test_endpoint():
+    s = RateLimitedSession()
+    r = s.get(URL, stream=True)
+    first = next(r.iter_lines())
+    print(f"OK: content-length={r.headers.get('content-length')}")
+    print("header:", first.decode("utf-8", errors="replace")[:200])
+    return True
+
+
+def fetch_rows():
+    s = RateLimitedSession(min_interval=0.5)
+    r = s.get(URL)
+    r.encoding = "utf-8"
+    reader = csv.DictReader(io.StringIO(r.text))
+    rows = list(reader)
+    LOG.info("fetched %d HPSA rows", len(rows))
+    return rows
+
+
+def _parse_date(s):
+    if not s:
+        return None
+    for fmt in ("%Y-%m-%d", "%m/%d/%Y"):
+        try:
+            return datetime.strptime(s, fmt).date()
+        except ValueError:
+            continue
+    return None
+
+
+def _parse_int(s):
+    try:
+        return int(float(s)) if s not in (None, "") else None
+    except (TypeError, ValueError):
+        return None
+
+
+def write_rows(conn, raw):
+    cols = ["hpsa_id","state","county_fips","score","population_served",
+            "designated_date","withdrawn_date","source"]
+    rows = []
+    for r in raw:
+        rows.append((
+            r.get("HPSA ID"),
+            r.get("Primary State Abbreviation"),
+            r.get("Common County FIPS Code") or r.get("HPSA Geography Identification Number"),
+            _parse_int(r.get("HPSA Score")),
+            _parse_int(r.get("HPSA Designation Population")),
+            _parse_date(r.get("HPSA Designation Date")),
+            _parse_date(r.get("Withdrawn Date")),
+            "hrsa_hpsa_mh",
+        ))
+    bulk_insert(conn, "bhi_shortages", cols, rows)
+    return len(rows)
+
+
+def main():
+    with job_run("bhi_hrsa_hpsa") as (conn, _):
+        n = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", n)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "test":
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/idea_part_b.py
+++ b/jobs/ingestion/idea_part_b.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+IDEA Part B child count — specifically "Emotional Disturbance" (ED)
+classification by state and local education agency (LEA).
+
+Static CSVs hosted by US Department of Education / OSEP. No API. This job
+pulls the most recent static tables. Update MANIFEST when new year drops.
+"""
+import csv
+import io
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+LOG = logging.getLogger("bhi.idea_part_b")
+
+# Static CSV links. The URL below follows a placeholder pattern, so verify it
+# before relying on it. The index page the user confirmed is:
+# https://www2.ed.gov/programs/osepidea/618-data/static-tables/index.html
+MANIFEST = [
+    # (year, scope, url) — year and scope are attached to every parsed row
+    # as "_year" and "_scope" by fetch_rows().
+    ("2022-23", "state", "https://www2.ed.gov/programs/osepidea/618-data/static-tables/part-b/child-count-and-educational-environment/bchildcountandedenvironments2022-23.csv"),
+]
+
+
+def test_endpoint():
+    s = RateLimitedSession()
+    ok = True
+    for year, scope, url in MANIFEST:
+        r = s.head(url, allow_redirects=True)
+        print(f"{year} {scope}: {r.status_code}")
+        ok = ok and r.status_code in (200, 302)
+    return ok
+
+
+def fetch_rows():
+    s = RateLimitedSession(min_interval=0.5)
+    out = []
+    for year, scope, url in MANIFEST:
+        try:
+            r = s.get(url)
+            r.encoding = "utf-8"
+            reader = csv.DictReader(io.StringIO(r.text))
+            for row in reader:
+                row["_year"] = year
+                row["_scope"] = scope
+                out.append(row)
+        except Exception as e:
+            LOG.warning("failed %s: %s", url, e)
+    LOG.info("IDEA rows: %d", len(out))
+    return out
+
+
+def _int(v):
+    try:
+        return int(str(v).replace(",", "")) if v not in (None, "", "-") else None
+    except (TypeError, ValueError):
+        return None
+
+
+def write_rows(conn, raw):
+    cols = ["geo_type","geo_code","measure","age_bracket","period","value","source"]
+    rows = []
+    for r in raw:
+        disability = (r.get("Disability Category") or r.get("SEA Disability Category") or "").lower()
+        if "emotional" not in disability:
+            continue
+        val = _int(r.get("Students Served") or r.get("Total") or r.get("ED"))
+        if val is None:
+            continue
+        rows.append((
+            "state",
+            r.get("State") or r.get("SEA State"),
+            "idea_emotional_disturbance_count",
+            "13-17",  # ED classification predominantly school-age; approximate
+            r["_year"],
+            float(val),
+            "idea_part_b",
+        ))
+    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
+    return len(rows)
+
+
+def main():
+    with job_run("bhi_idea_part_b") as (conn, _):
+        n = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", n)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "test":
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/nppes.py
+++ b/jobs/ingestion/nppes.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CMS NPPES (National Plan & Provider Enumeration System) — behavioral health
+providers by taxonomy + state.
+
+API: https://npiregistry.cms.hhs.gov/api/?version=2.1
+Filter: taxonomy codes for psychiatry, psychology, counseling, SUD.
+"""
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+LOG = logging.getLogger("bhi.nppes")
+# NPI Registry endpoint; every request in this module sends version=2.1.
+BASE = "https://npiregistry.cms.hhs.gov/api/"
+
+# Taxonomy codes queried once per state by fetch_rows().
+# NOTE(review): they are sent as the `taxonomy_description` parameter —
+# confirm the API matches codes there rather than description text.
+BH_TAXONOMY_CODES = [
+    "2084P0800X",  # Psychiatry
+    "2084P0802X",  # Addiction Psychiatry
+    "2084P0804X",  # Child & Adolescent Psychiatry
+    "103T00000X",  # Psychologist
+    "103TC2200X",  # Clinical Child & Adolescent Psychologist
+    "101YM0800X",  # Mental Health Counselor
+    "1041C0700X",  # Clinical Social Worker
+    "324500000X",  # Substance Abuse Rehabilitation Facility
+    "283Q00000X",  # Psychiatric Hospital
+    "323P00000X",  # Psychiatric Residential Treatment Facility
+]
+# Postal abbreviations iterated by fetch_rows(): the 50 states plus DC.
+STATES = ["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN",
+          "IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV",
+          "NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN",
+          "TX","UT","VT","VA","WA","WV","WI","WY","DC"]
+
+
+def test_endpoint():
+    s = RateLimitedSession()
+    r = s.get(BASE, params={
+        "version": "2.1", "taxonomy_description": "psychiatric",
+        "state": "NY", "limit": 2,
+    }).json()
+    print(f"OK: result_count={r.get('result_count')}")
+    return r.get("result_count", 0) > 0
+
+
+def fetch_rows():
+    s = RateLimitedSession(min_interval=0.1)
+    all_rows = []
+    for state in STATES:
+        for taxonomy in BH_TAXONOMY_CODES:
+            skip = 0
+            while True:
+                r = s.get(BASE, params={
+                    "version": "2.1",
+                    "taxonomy_description": taxonomy,
+                    "state": state,
+                    "limit": 200,
+                    "skip": skip,
+                }).json()
+                results = r.get("results", [])
+                if not results:
+                    break
+                for row in results:
+                    row["_state"] = state
+                    row["_taxonomy"] = taxonomy
+                all_rows.extend(results)
+                if len(results) < 200:
+                    break
+                skip += 200
+                if skip > 1200:  # NPPES caps paging
+                    break
+            LOG.info("state=%s tax=%s total=%d", state, taxonomy, len(all_rows))
+    return all_rows
+
+
+def write_rows(conn, raw):
+    cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+    rows = []
+    for r in raw:
+        addresses = r.get("addresses") or []
+        location = next((a for a in addresses if a.get("address_purpose") == "LOCATION"), addresses[0] if addresses else {})
+        basic = r.get("basic") or {}
+        name = basic.get("organization_name") or " ".join(filter(None, [basic.get("first_name"), basic.get("last_name")]))
+        rows.append((
+            None, str(r.get("number", "")),
+            name,
+            location.get("address_1"), location.get("city"),
+            location.get("state"), location.get("postal_code"), None,
+            None, None,
+            "provider" if basic.get("name_prefix") is None else "org",
+            None, None, None, None, None, None,
+            [r.get("_taxonomy", "")], [], [], None, None, None, None, None,
+            "nppes", None,
+        ))
+    bulk_insert(conn, "bhi_facilities", cols, rows)
+    return len(rows)
+
+
+def main():
+    with job_run("bhi_nppes") as (conn, _):
+        n = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", n)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "test":
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/nsch.py
+++ b/jobs/ingestion/nsch.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+NSCH — National Survey of Children's Health (HRSA/MCHB).
+
+Source: https://mchb.hrsa.gov/data-research/national-survey-childrens-health
+Bulk files by year; we parse state-level indicator tables. Manifest below.
+"""
+import csv
+import io
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+LOG = logging.getLogger("bhi.nsch")
+
+MANIFEST = [
+    # (year, url_to_indicator_csv) — the year is attached to each parsed row
+    # as "_year" by fetch_rows().
+    ("2022", "https://mchb.hrsa.gov/sites/default/files/mchb/data-research/nsch/2022/nsch-2022-state-level-indicators.csv"),
+]
+
+# Lower-case substring of the "Indicator" column -> stored measure name.
+# write_rows() checks keys in insertion order and uses the first match.
+INDICATORS_OF_INTEREST = {
+    "anxiety": "anxiety_pct",
+    "depression": "depression_pct",
+    "behavioral": "behavioral_pct",
+    "mental health treatment": "unmet_mh_treatment_pct",
+    "unmet": "unmet_mh_treatment_pct",
+}
+
+
+def test_endpoint():
+    s = RateLimitedSession()
+    ok = True
+    for year, url in MANIFEST:
+        r = s.head(url, allow_redirects=True)
+        print(f"{year}: {r.status_code}")
+        ok = ok and r.status_code in (200, 302)
+    return ok
+
+
+def fetch_rows():
+    s = RateLimitedSession(min_interval=0.5)
+    out = []
+    for year, url in MANIFEST:
+        try:
+            r = s.get(url)
+            r.encoding = "utf-8"
+            reader = csv.DictReader(io.StringIO(r.text))
+            for row in reader:
+                row["_year"] = year
+                out.append(row)
+        except Exception as e:
+            LOG.warning("failed %s: %s", url, e)
+    LOG.info("NSCH rows: %d", len(out))
+    return out
+
+
+def write_rows(conn, raw):
+    cols = ["geo_type","geo_code","measure","age_bracket","period","value","source"]
+    rows = []
+    for r in raw:
+        indicator = (r.get("Indicator") or "").lower()
+        measure = None
+        for k, v in INDICATORS_OF_INTEREST.items():
+            if k in indicator:
+                measure = v
+                break
+        if not measure:
+            continue
+        try:
+            val = float((r.get("Estimate") or r.get("Value") or "0").replace("%", ""))
+        except (TypeError, ValueError):
+            continue
+        rows.append((
+            "state",
+            r.get("State"),
+            measure,
+            "13-17",
+            r["_year"],
+            val,
+            "nsch",
+        ))
+    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
+    return len(rows)
+
+
+def main():
+    with job_run("bhi_nsch") as (conn, _):
+        n = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", n)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "test":
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/samhsa_locator.py
+++ b/jobs/ingestion/samhsa_locator.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+SAMHSA findtreatment.gov behavioral health facility locator.
+
+Source: https://findtreatment.gov/locator/exportsAsJson/v2
+Confirmed: 96,009 facilities across 3,201 pages (sType=BH).
+"""
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+LOG = logging.getLogger("bhi.samhsa_locator")
+# Export endpoint queried page by page with sType=BH.
+BASE = "https://findtreatment.gov/locator/exportsAsJson/v2"
+ZIP_SEED = "10001"  # any valid zip works; results are national in the 'BH' sType
+PAGE_SIZE = 30      # sent as the pageSize query parameter; matches the server default
+
+
+def test_endpoint():
+    s = RateLimitedSession()
+    r = s.get(BASE, params={"sType": "BH", "sAddr": ZIP_SEED, "page": 1}).json()
+    print(f"OK: recordCount={r.get('recordCount')}, totalPages={r.get('totalPages')}")
+    rows = r.get("rows", [])
+    if rows:
+        print("sample:", rows[0].get("name1"), rows[0].get("state"))
+    return bool(rows)
+
+
+def fetch_rows(max_pages: int | None = None):
+    s = RateLimitedSession(min_interval=0.3)
+    out = []
+    page = 1
+    total = None
+    while True:
+        r = s.get(BASE, params={"sType": "BH", "sAddr": ZIP_SEED, "pageSize": PAGE_SIZE, "page": page}).json()
+        total = total or r.get("totalPages", 1)
+        out.extend(r.get("rows", []))
+        if page % 50 == 0:
+            LOG.info("page %d/%d (total rows %d)", page, total, len(out))
+        if page >= total or (max_pages and page >= max_pages):
+            break
+        page += 1
+    LOG.info("fetched %d facilities", len(out))
+    return out
+
+
+def _parse_float(v):
+    try:
+        return float(v) if v not in (None, "") else None
+    except (TypeError, ValueError):
+        return None
+
+
+def write_rows(conn, raw):
+    cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+    rows = []
+    for r in raw:
+        name = " ".join(filter(None, [r.get("name1"), (r.get("name2") or "").strip()])).strip()
+        services = (r.get("services") or "").split(",") if r.get("services") else []
+        # SAMHSA flags adolescent/young-adult services in the services string
+        services_lc = [s.lower() for s in services]
+        adolescent = any("adolescent" in s or "youth" in s or "teen" in s for s in services_lc) or None
+        young_adult = any("young adult" in s or "transitional age" in s for s in services_lc) or None
+        rows.append((
+            None, None,  # ccn/npi unknown from this source
+            name, r.get("street1"),
+            r.get("city"), r.get("state"), r.get("zip"), None,
+            _parse_float(r.get("latitude")), _parse_float(r.get("longitude")),
+            r.get("typeFacility") or "bh_facility",
+            None, None, None, None,
+            adolescent, young_adult,
+            services, [], [], None, None, None, None, None,
+            "samhsa_locator", None,
+        ))
+    bulk_insert(conn, "bhi_facilities", cols, rows)
+    return len(rows)
+
+
+def main():
+    with job_run("bhi_samhsa_locator") as (conn, _):
+        n = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", n)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "test":
+        sys.exit(0 if test_endpoint() else 1)
+    main()
--- a/jobs/ingestion/samhsa_nssats_nmhss.py
+++ b/jobs/ingestion/samhsa_nssats_nmhss.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+SAMHSA N-SSATS + N-MHSS bulk downloads.
+
+SAMHSA Data Archive hosts annual CSV/SAS files. The landing pages do not
+expose a machine-listing API, so we maintain a manifest of known direct URLs
+and parse whichever are present. Update the MANIFEST when new years drop.
+"""
+import csv
+import io
+import logging
+import sys
+import zipfile
+from _common import RateLimitedSession, bulk_insert, job_run
+
+LOG = logging.getLogger("bhi.samhsa_surveys")
+
+# Known bulk files. Confirmed on samhsa.gov/data as of 2026. Update as needed.
+# fetch_rows() downloads each archive and parses its first .csv member.
+MANIFEST = [
+    # (year, survey, url) — year and survey are attached to every parsed row
+    # as "_year" and "_survey".
+    ("2022", "N-MHSS", "https://www.samhsa.gov/data/sites/default/files/reports/rpt42936/2022-nmhss-datafile-csv.zip"),
+    ("2022", "N-SSATS", "https://www.samhsa.gov/data/sites/default/files/reports/rpt42725/2022-nssats-datafile-csv.zip"),
+]
+
+
+def test_endpoint():
+    s = RateLimitedSession()
+    ok = True
+    for year, survey, url in MANIFEST:
+        r = s.head(url, allow_redirects=True)
+        print(f"{survey} {year}: {r.status_code}")
+        ok = ok and r.status_code == 200
+    return ok
+
+
+def fetch_rows():
+    s = RateLimitedSession(min_interval=0.5)
+    out = []
+    for year, survey, url in MANIFEST:
+        LOG.info("fetching %s %s", survey, year)
+        try:
+            r = s.get(url)
+            z = zipfile.ZipFile(io.BytesIO(r.content))
+            csvname = next((n for n in z.namelist() if n.lower().endswith(".csv")), None)
+            if not csvname:
+                continue
+            with z.open(csvname) as f:
+                reader = csv.DictReader(io.TextIOWrapper(f, encoding="latin-1"))
+                for row in reader:
+                    row["_survey"] = survey
+                    row["_year"] = year
+                    out.append(row)
+        except Exception as e:
+            LOG.warning("failed %s %s: %s", survey, year, e)
+    LOG.info("total rows: %d", len(out))
+    return out
+
+
+def write_rows(conn, raw):
+    cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+    rows = []
+    for r in raw:
+        def y(field):
+            v = r.get(field) or r.get(field.upper()) or r.get(field.lower())
+            return v == "1" or str(v).lower() == "yes"
+        name = r.get("NAME") or r.get("name") or r.get("FACNAME") or ""
+        rows.append((
+            None, None, name,
+            r.get("STREET1") or r.get("street1"),
+            r.get("CITY") or r.get("city"),
+            r.get("STATE") or r.get("state"),
+            r.get("ZIP") or r.get("zip"),
+            None, None, None,
+            "sud" if r["_survey"] == "N-SSATS" else "mh",
+            None, None, None, None,
+            y("YOUTH") or y("ADOLESCENT"),
+            y("YAD") or y("YOUNGADULT"),
+            [], [], [], None, None, None, None, None,
+            f"samhsa_{r['_survey'].lower()}_{r['_year']}", None,
+        ))
+    bulk_insert(conn, "bhi_facilities", cols, rows)
+    return len(rows)
+
+
+def main():
+    with job_run("bhi_samhsa_surveys") as (conn, _):
+        n = write_rows(conn, fetch_rows())
+        LOG.info("inserted %d", n)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "test":
+        sys.exit(0 if test_endpoint() else 1)
+    main()