Files
2026-04-05 20:15:36 +00:00

96 lines
2.8 KiB
Python

#!/usr/bin/env python3
# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
"""
CDC YRBSS — Youth Risk Behavior Survey (high and middle school).
Sources (Socrata):
- High school: https://data.cdc.gov/resource/3qty-g4aq.json
- Middle school: https://data.cdc.gov/resource/uqmk-4y2w.json
Key items: "considered suicide", "attempted suicide", "persistent sadness",
and substance use. Every stored row is labelled with the adolescent (13-17)
age bracket.
"""
import logging
import sys
from _common import RateLimitedSession, bulk_insert, job_run
# Module logger, registered under a fixed dotted name.
LOG = logging.getLogger("bhi.cdc_yrbss")

# Socrata JSON endpoints keyed by a short dataset tag ("hs" = high school,
# "ms" = middle school). The tag is copied onto every fetched row as
# "_dataset" and becomes the suffix of the stored "source" value.
DATASETS = {
    "hs": "https://data.cdc.gov/resource/3qty-g4aq.json",
    "ms": "https://data.cdc.gov/resource/uqmk-4y2w.json",
}

# Substrings that mark a survey question as relevant; matched against the
# lower-cased question text.
KEYWORDS = [
    "suicide",
    "sad",
    "hopeless",
    "mental health",
    "electronic",
]
def test_endpoint():
    """Smoke-test every endpoint in DATASETS with a one-row request.

    Prints one status line per dataset and returns True only when every
    endpoint answered HTTP 200. The script's ``test`` command-line mode turns
    the result into the process exit code.

    The response body is decoded only after a 200 status, and a body that
    cannot be decoded or counted is reported as a failure for that endpoint
    rather than aborting the whole check, so one broken endpoint does not hide
    the status of the others.
    """
    session = RateLimitedSession()
    all_ok = True
    for tag, url in DATASETS.items():
        resp = session.get(url, params={"$limit": 1})
        endpoint_ok = resp.status_code == 200
        row_count = "n/a"
        if endpoint_ok:
            try:
                row_count = len(resp.json())
            except (TypeError, ValueError):
                # NOTE(review): assumes .json() signals an undecodable body
                # with a ValueError subclass (as the json module does) —
                # confirm against RateLimitedSession's response type.
                endpoint_ok = False
        print(f"{tag}: status={resp.status_code}, rows={row_count}")
        all_ok = all_ok and endpoint_ok
    return all_ok
def fetch_rows():
    """Download all rows of every dataset in DATASETS, one page at a time.

    Each returned row is the decoded JSON object from the endpoint with an
    extra ``"_dataset"`` key holding the DATASETS tag it came from.

    Returns:
        A single list containing the rows of all datasets, in DATASETS order.

    Raises:
        RuntimeError: if an endpoint returns a non-empty payload that is not a
            JSON array (for example an error object), instead of failing
            later with an unrelated TypeError while tagging rows.
    """
    page_size = 5000
    session = RateLimitedSession(min_interval=0.2)
    collected = []
    for key, url in DATASETS.items():
        offset = 0
        while True:
            params = {
                "$limit": page_size,
                "$offset": offset,
                # Offset paging is only reliable with a fixed sort order;
                # without one, consecutive pages may overlap or skip rows.
                "$order": ":id",
            }
            batch = session.get(url, params=params).json()
            if not batch:
                break
            if not isinstance(batch, list):
                raise RuntimeError(
                    f"yrbss {key}: unexpected response at offset {offset}: {batch!r}"
                )
            for row in batch:
                row["_dataset"] = key
            collected.extend(batch)
            # A short page means the dataset is exhausted.
            if len(batch) < page_size:
                break
            offset += page_size
        # The count is the running total across the datasets fetched so far.
        LOG.info("yrbss %s -> %d", key, len(collected))
    return collected
def _question_is_relevant(q: str) -> bool:
    """Return True when *q*, lower-cased, contains any entry of KEYWORDS.

    A falsy *q* (``None`` or an empty string) is treated as the empty string
    and therefore never matches.
    """
    lowered = q.lower() if q else ""
    for keyword in KEYWORDS:
        if keyword in lowered:
            return True
    return False
def write_rows(conn, raw):
    """Filter raw YRBSS rows and insert the relevant ones.

    A row is kept when any of its question fields (``questioncode``,
    ``shortquestiontext``, ``question``) matches KEYWORDS and it carries a
    non-zero numeric value. Kept rows are written to ``bhi_demand_indicators``
    through ``bulk_insert``.

    Args:
        conn: database connection handed through to ``bulk_insert``.
        raw: iterable of row dicts as produced by ``fetch_rows``.

    Returns:
        The number of rows passed to ``bulk_insert``.
    """
    cols = ["geo_type", "geo_code", "measure", "age_bracket", "period", "value", "source"]
    question_fields = ("questioncode", "shortquestiontext", "question")
    rows = []
    for r in raw:
        labels = [r.get(field) for field in question_fields]
        labels = [label for label in labels if label]
        # Test every available field, not just the first one: when
        # "questioncode" is present it shadows the descriptive text, and a
        # keyword search over the code alone would discard rows whose text
        # does match. (Presumably the code is a short identifier — confirm
        # against the dataset schema.)
        if not any(_question_is_relevant(label) for label in labels):
            continue
        # The stored measure keeps the original precedence: code first, then
        # the short text, then the full question.
        question = labels[0]
        try:
            val = float(r.get("data_value") or r.get("greater_risk_data_value") or 0)
        except (TypeError, ValueError):
            continue
        # Zero doubles as the "no value" marker above, so zero rows are skipped.
        if val == 0:
            continue
        rows.append((
            "state" if r.get("locationdesc") else "district",
            r.get("locationabbr") or r.get("sitecode"),
            question[:120],
            "13-17",
            str(r.get("year") or ""),
            val,
            f"cdc_yrbss_{r.get('_dataset','hs')}",
        ))
    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
    return len(rows)
def main():
    """Run the full job: fetch all YRBSS rows, store the relevant ones, log the count.

    The work happens inside the ``job_run("bhi_cdc_yrbss")`` context, which
    supplies the database connection.
    """
    with job_run("bhi_cdc_yrbss") as (conn, _):
        fetched = fetch_rows()
        inserted = write_rows(conn, fetched)
        LOG.info("inserted %d", inserted)
# Script entry point: "test" as the first argument runs the endpoint smoke
# test and exits 0 on success / 1 on failure; otherwise the full job runs.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "test":
        exit_code = 0 if test_endpoint() else 1
        sys.exit(exit_code)
    main()