BHI layer v1: docs, schema, Phase A ingestion stubs
This commit is contained in:
95
jobs/ingestion/cdc_yrbss.py
Normal file
95
jobs/ingestion/cdc_yrbss.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
|
||||
"""
|
||||
CDC YRBSS — Youth Risk Behavior Survey (high school and middle school).

Sources (Socrata):
- High school: https://data.cdc.gov/resource/3qty-g4aq.json
- Middle school: https://data.cdc.gov/resource/uqmk-4y2w.json

Key items: "considered suicide", "attempted suicide", "persistent sadness",
and substance use. All rows are stored under the adolescent (13-17) age bracket.
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
from _common import RateLimitedSession, bulk_insert, job_run
|
||||
|
||||
# Module-level logger for this ingestion job.
LOG = logging.getLogger("bhi.cdc_yrbss")

# Socrata JSON endpoints, keyed by a short dataset tag
# ("hs" = high school, "ms" = middle school).
DATASETS = dict(
    hs="https://data.cdc.gov/resource/3qty-g4aq.json",
    ms="https://data.cdc.gov/resource/uqmk-4y2w.json",
)

# Lower-case substrings; a question mentioning any of them is kept.
KEYWORDS = [
    "suicide",
    "sad",
    "hopeless",
    "mental health",
    "electronic",
]
|
||||
|
||||
|
||||
def test_endpoint():
    """Smoke-test every endpoint in DATASETS by requesting a single row.

    Prints one status line per dataset and keeps going after a failure so
    that every endpoint is reported.

    Returns:
        bool: True only if every endpoint answered with HTTP 200.
    """
    s = RateLimitedSession()
    ok = True
    for k, url in DATASETS.items():
        r = s.get(url, params={"$limit": 1})
        try:
            payload = r.json()
        except ValueError:
            # An error response need not carry a JSON body; report it as a
            # failed endpoint instead of aborting the whole check.
            # Assumes requests-style responses whose .json() raises a
            # ValueError subclass on undecodable bodies — TODO confirm
            # against _common.RateLimitedSession.
            payload = None
        # Only a JSON array is row data; an error object is not counted.
        rows = len(payload) if isinstance(payload, list) else 0
        print(f"{k}: status={r.status_code}, rows={rows}")
        ok = ok and r.status_code == 200
    return ok
|
||||
|
||||
|
||||
def fetch_rows(page_size=5000):
    """Download every row of every dataset listed in DATASETS.

    Each endpoint is paged with $limit/$offset until an empty or short page
    is returned. Every row is tagged with a "_dataset" key holding the
    DATASETS key it came from.

    Args:
        page_size: Rows requested per call. Defaults to 5000, the value that
            was previously hard-coded.

    Returns:
        list: The decoded rows from all datasets, in fetch order.

    Raises:
        RuntimeError: If an endpoint returns a non-empty payload that is not
            a JSON array (for example an API error object).
    """
    s = RateLimitedSession(min_interval=0.2)
    out = []
    for key, url in DATASETS.items():
        offset = 0
        while True:
            params = {
                "$limit": page_size,
                "$offset": offset,
                # Offset paging is only stable with an explicit sort order;
                # without it pages may overlap or skip rows.
                "$order": ":id",
            }
            batch = s.get(url, params=params).json()
            if not batch:
                break
            if not isinstance(batch, list):
                # Fail with a readable message instead of the TypeError the
                # row-tagging loop below would raise on a non-list payload.
                raise RuntimeError(
                    f"yrbss {key}: unexpected response at offset {offset}: "
                    f"{str(batch)[:200]}"
                )
            for row in batch:
                row["_dataset"] = key
            out.extend(batch)
            if len(batch) < page_size:
                # A short page means the dataset is exhausted.
                break
            offset += page_size
        # The count is the running total across datasets fetched so far.
        LOG.info("yrbss %s -> %d", key, len(out))
    return out
|
||||
|
||||
|
||||
def _question_is_relevant(q: str) -> bool:
    """Return True if *q* contains any entry of KEYWORDS, ignoring case.

    None or an empty string is treated as not relevant.
    """
    text = q.lower() if q else ""
    for keyword in KEYWORDS:
        if keyword in text:
            return True
    return False
|
||||
|
||||
|
||||
def write_rows(conn, raw):
    """Filter raw YRBSS rows and insert the relevant ones into bhi_demand_indicators.

    A row is kept when any of its question fields matches KEYWORDS and it
    carries a non-zero numeric value.

    Args:
        conn: Database connection, handed straight to bulk_insert.
        raw: Iterable of row dicts, as produced by fetch_rows().

    Returns:
        int: Number of rows passed to bulk_insert.
    """
    cols = ["geo_type", "geo_code", "measure", "age_bracket", "period", "value", "source"]
    rows = []
    for r in raw:
        candidates = (
            r.get("questioncode"),
            r.get("shortquestiontext"),
            r.get("question"),
        )
        # The stored measure keeps the original precedence: first non-empty field.
        question = next((c for c in candidates if c), "")
        # Check relevance against every question field, not just the first
        # non-empty one: a question code is an identifier rather than wording,
        # so filtering on it alone would reject rows whose text does match.
        if not any(_question_is_relevant(c) for c in candidates):
            continue
        try:
            val = float(r.get("data_value") or r.get("greater_risk_data_value") or 0)
        except (TypeError, ValueError):
            # Non-numeric value: skip the row.
            continue
        if val == 0:
            # Zero is also the fallback for a missing value, so zero rows are dropped.
            continue
        rows.append((
            # NOTE(review): geo_type depends only on locationdesc being
            # present — confirm that district rows really lack it.
            "state" if r.get("locationdesc") else "district",
            r.get("locationabbr") or r.get("sitecode"),
            question[:120],  # measure is capped at 120 characters
            "13-17",
            str(r.get("year") or ""),
            val,
            f"cdc_yrbss_{r.get('_dataset','hs')}",
        ))
    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
    return len(rows)
|
||||
|
||||
|
||||
def main():
    """Run the ingestion job: fetch all rows, insert the relevant ones, log the count."""
    with job_run("bhi_cdc_yrbss") as (conn, _):
        fetched = fetch_rows()
        inserted = write_rows(conn, fetched)
        LOG.info("inserted %d", inserted)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # "test" as the first CLI argument runs only the endpoint smoke test and
    # exits 0 on success, 1 on failure; otherwise the full job runs.
    wants_test = sys.argv[1:2] == ["test"]
    if wants_test:
        exit_code = 0 if test_endpoint() else 1
        sys.exit(exit_code)
    main()
|
||||
Reference in New Issue
Block a user