# economic-brain-bhi/jobs/ingestion/cdc_yrbss.py

#!/usr/bin/env python3
# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
"""
CDC YRBSS — Youth Risk Behavior Survey (high and middle school).

Sources (Socrata):
  - High school: https://data.cdc.gov/resource/3qty-g4aq.json
  - Middle school: https://data.cdc.gov/resource/uqmk-4y2w.json

Key items: "considered suicide", "attempted suicide", "persistent sadness",
substance use — all adolescent (13-17) bracket.
"""
import logging
import sys
from _common import RateLimitedSession, bulk_insert, job_run

# Module logger; handlers/levels are configured elsewhere.
LOG = logging.getLogger("bhi.cdc_yrbss")

# Socrata JSON endpoints keyed by a short dataset tag
# ("hs" = high school, "ms" = middle school).
DATASETS = dict(
    hs="https://data.cdc.gov/resource/3qty-g4aq.json",
    ms="https://data.cdc.gov/resource/uqmk-4y2w.json",
)

# Lower-case substrings; a question is considered relevant when it contains
# at least one of them.
KEYWORDS = [
    "suicide",
    "sad",
    "hopeless",
    "mental health",
    "electronic",
]


def test_endpoint():
    """Probe each dataset endpoint with a one-row query and print the outcome.

    One status line is printed per entry in DATASETS. The row count is shown
    only when the body decodes to a JSON list; otherwise ``rows=n/a`` is
    printed so that an error payload is not mistaken for data.

    Returns:
        True if every endpoint answered with HTTP 200, otherwise False.
    """
    s = RateLimitedSession()
    ok = True
    for k, url in DATASETS.items():
        r = s.get(url, params={"$limit": 1})
        try:
            payload = r.json()
        except ValueError:
            # NOTE(review): assumes a requests-style response whose json()
            # raises a ValueError subclass on a non-JSON body (e.g. an HTML
            # gateway error page). The probe should report failure, not crash.
            payload = None
        # An error response may decode to a JSON object; only a list is rows.
        rows = len(payload) if isinstance(payload, list) else "n/a"
        print(f"{k}: status={r.status_code}, rows={rows}")
        ok = ok and r.status_code == 200
    return ok


def fetch_rows():
    """Download every record from each dataset in DATASETS, page by page.

    Each dataset is read with ``$limit``/``$offset`` paging until an empty or
    short page is returned.

    Returns:
        A list of record dicts from all datasets combined. Every record gains
        a ``_dataset`` key holding the DATASETS tag it was fetched from.
    """
    page_size = 5000
    s = RateLimitedSession(min_interval=0.2)
    out = []
    for key, url in DATASETS.items():
        fetched = 0
        offset = 0
        while True:
            # Socrata does not guarantee a stable row order unless $order is
            # given, so offset paging without it can repeat or skip rows.
            # Ordering by the internal :id keeps pages consistent.
            params = {"$limit": page_size, "$offset": offset, "$order": ":id"}
            batch = s.get(url, params=params).json()
            if not batch:
                break
            for row in batch:
                row["_dataset"] = key
            out.extend(batch)
            fetched += len(batch)
            if len(batch) < page_size:
                # A short page means the dataset is exhausted.
                break
            offset += page_size
        # Log this dataset's own count, not the running total across datasets.
        LOG.info("yrbss %s -> %d", key, fetched)
    return out


def _question_is_relevant(q: str) -> bool:
    """Return True when *q* contains any KEYWORDS entry, ignoring case.

    A falsy *q* (``None`` or empty string) is treated as an empty string.
    """
    lowered = q.lower() if q else ""
    for keyword in KEYWORDS:
        if keyword in lowered:
            return True
    return False


def write_rows(conn, raw):
    """Filter raw YRBSS records to the relevant questions and insert them.

    A record is kept when any of its question fields (``questioncode``,
    ``shortquestiontext``, ``question``) matches ``_question_is_relevant``
    and it has a non-zero numeric ``data_value`` (falling back to
    ``greater_risk_data_value``). Kept records are written to
    ``bhi_demand_indicators`` through ``bulk_insert``.

    Args:
        conn: Database connection handed through to ``bulk_insert``.
        raw: Iterable of record dicts, e.g. the output of ``fetch_rows``.

    Returns:
        The number of rows passed to ``bulk_insert``.
    """
    cols = ["geo_type", "geo_code", "measure", "age_bracket", "period", "value", "source"]
    question_fields = ("questioncode", "shortquestiontext", "question")
    rows = []
    for r in raw:
        candidates = [c for c in (r.get(f) for f in question_fields) if c]
        # The stored measure is still the first non-empty field, as before.
        question = candidates[0] if candidates else ""
        # Test every question field, not just the first one: when a question
        # code is present it would otherwise hide the descriptive text that
        # actually carries the keywords.
        if not any(_question_is_relevant(c) for c in candidates if isinstance(c, str)):
            continue
        try:
            val = float(r.get("data_value") or r.get("greater_risk_data_value") or 0)
        except (TypeError, ValueError):
            # Non-numeric value: skip the record.
            continue
        # Missing values fall through as 0 above and are dropped here.
        # NOTE(review): this also drops a genuinely reported 0 - confirm that
        # is intended.
        if val == 0:
            continue
        rows.append((
            # NOTE(review): geo_type depends only on whether locationdesc is
            # populated - verify that this separates states from districts.
            "state" if r.get("locationdesc") else "district",
            r.get("locationabbr") or r.get("sitecode"),
            question[:120],
            "13-17",
            str(r.get("year") or ""),
            val,
            f"cdc_yrbss_{r.get('_dataset','hs')}",
        ))
    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
    return len(rows)


def main():
    """Run the ingestion job: fetch all rows, write them, log the count.

    Everything runs inside the ``job_run("bhi_cdc_yrbss")`` context, which
    supplies the connection used for the insert.
    """
    with job_run("bhi_cdc_yrbss") as (conn, _):
        raw_records = fetch_rows()
        inserted = write_rows(conn, raw_records)
        LOG.info("inserted %d", inserted)


if __name__ == "__main__":
    # `test` as the first CLI argument runs only the endpoint probe and exits
    # 0 on success, 1 on failure; anything else runs the full ingestion job.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "test":
        exit_code = 0 if test_endpoint() else 1
        sys.exit(exit_code)
    main()