#!/usr/bin/env python3
|
|
# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
|
|
"""
CDC YRBSS — Youth Risk Behavior Surveillance System (high school and
middle school surveys).

Sources (Socrata):
- High school: https://data.cdc.gov/resource/3qty-g4aq.json
- Middle school: https://data.cdc.gov/resource/uqmk-4y2w.json

Key items: "considered suicide", "attempted suicide", "persistent sadness",
substance use — all adolescent (13-17) bracket.
"""
|
|
import logging
|
|
import sys
|
|
from _common import RateLimitedSession, bulk_insert, job_run
|
|
|
|
# Module-level logger used by the fetch and load steps below.
LOG = logging.getLogger("bhi.cdc_yrbss")

# Socrata JSON endpoints on data.cdc.gov, keyed by a short dataset tag
# ("hs" = high school, "ms" = middle school). The tag is later attached to
# every fetched row and becomes part of the stored `source` value.
DATASETS = dict(
    hs="https://data.cdc.gov/resource/3qty-g4aq.json",
    ms="https://data.cdc.gov/resource/uqmk-4y2w.json",
)

# Lower-case substrings that mark a survey question as relevant; they are
# compared against the lower-cased question text in _question_is_relevant().
KEYWORDS = [
    "suicide",
    "sad",
    "hopeless",
    "mental health",
    "electronic",
]
|
|
|
|
|
|
def test_endpoint():
    """Smoke-test every DATASETS endpoint by requesting a single row.

    Prints one ``<key>: status=<code>, rows=<n>`` line per dataset and keeps
    going after a failure so that every endpoint is reported.

    Returns:
        bool: True only if every endpoint answered with HTTP status 200.
    """
    s = RateLimitedSession()
    ok = True
    for k, url in DATASETS.items():
        r = s.get(url, params={"$limit": 1})
        # An error response may carry a non-JSON body, or a JSON object
        # instead of a row array. Neither should crash the health check or
        # be counted as rows.
        # NOTE(review): assumes r.json() signals a bad body with a ValueError
        # subclass, as requests.Response.json() does; confirm against
        # RateLimitedSession.
        try:
            payload = r.json()
        except ValueError:
            payload = None
        rows = len(payload) if isinstance(payload, list) else 0
        print(f"{k}: status={r.status_code}, rows={rows}")
        ok = ok and r.status_code == 200
    return ok
|
|
|
|
|
|
def fetch_rows():
    """Download every row of every DATASETS endpoint using offset paging.

    Each endpoint is read in pages of 5000 rows until an empty or short page
    is returned. Every row dict is tagged in place with a ``"_dataset"`` key
    holding the DATASETS key it came from.

    Returns:
        list[dict]: rows from all datasets, in DATASETS order.

    Raises:
        RuntimeError: if an endpoint returns a non-empty payload that is not
            a JSON array (for example an error object).
    """
    page_size = 5000
    s = RateLimitedSession(min_interval=0.2)
    out = []
    for key, url in DATASETS.items():
        offset = 0
        fetched = 0
        while True:
            # Socrata does not guarantee a stable row order between requests
            # unless $order is given; without it, offset paging can repeat or
            # skip rows. ":id" is the system row identifier.
            params = {"$limit": page_size, "$offset": offset, "$order": ":id"}
            batch = s.get(url, params=params).json()
            if not batch:
                break
            if not isinstance(batch, list):
                # Fail with a readable message instead of the TypeError the
                # tagging loop below would raise on a non-list payload.
                raise RuntimeError(
                    f"yrbss {key}: unexpected response at offset {offset}: {batch!r}"
                )
            for row in batch:
                row["_dataset"] = key
            out.extend(batch)
            fetched += len(batch)
            if len(batch) < page_size:
                break
            offset += page_size
        # Report this dataset's own row count, not the running total.
        LOG.info("yrbss %s -> %d", key, fetched)
    return out
|
|
|
|
|
|
def _question_is_relevant(q: str) -> bool:
    """Return True when *q* contains at least one KEYWORDS substring.

    The question text is lower-cased before the substring test. A falsy
    ``q`` (``None`` or ``""``) is treated as empty text.
    """
    text = q.lower() if q else ""
    for keyword in KEYWORDS:
        if keyword in text:
            return True
    return False
|
|
|
|
|
|
def write_rows(conn, raw):
    """Filter raw YRBSS rows to the relevant measures and bulk-insert them.

    A row is kept when one of its question fields matches KEYWORDS and it
    carries a non-zero numeric value. Kept rows are written to the
    ``bhi_demand_indicators`` table through ``bulk_insert``.

    Args:
        conn: database connection passed through to ``bulk_insert``.
        raw: iterable of row dicts, as produced by ``fetch_rows``.

    Returns:
        int: number of rows handed to ``bulk_insert``.
    """
    cols = ["geo_type", "geo_code", "measure", "age_bracket", "period", "value", "source"]
    rows = []
    for r in raw:
        # Try each question field in priority order and keep the first one
        # that matches a keyword. Taking only the first non-empty field would
        # test "questioncode" alone whenever it is present; it presumably
        # holds a short identifier rather than question text (TODO confirm),
        # so such rows would never match.
        candidates = (
            r.get("questioncode"),
            r.get("shortquestiontext"),
            r.get("question"),
        )
        question = next(
            (c for c in candidates if c and _question_is_relevant(c)),
            "",
        )
        if not question:
            continue

        try:
            val = float(r.get("data_value") or r.get("greater_risk_data_value") or 0)
        except (TypeError, ValueError):
            # Non-numeric value: skip the row.
            continue
        if val == 0:
            # Missing values fall back to 0 above, so zero means "no data".
            continue

        rows.append((
            # Rows with a location description are labelled "state", the
            # rest "district".
            "state" if r.get("locationdesc") else "district",
            r.get("locationabbr") or r.get("sitecode"),
            question[:120],
            # NOTE(review): the same bracket is written for middle-school
            # rows too; confirm that is intended.
            "13-17",
            str(r.get("year") or ""),
            val,
            f"cdc_yrbss_{r.get('_dataset', 'hs')}",
        ))
    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
    return len(rows)
|
|
|
|
|
|
def main():
    """Run the full load: fetch all rows, insert the relevant ones, log the count.

    All work happens inside the ``job_run("bhi_cdc_yrbss")`` context, which
    yields a 2-tuple; only its first element (the connection) is used here.
    """
    with job_run("bhi_cdc_yrbss") as (conn, _):
        raw = fetch_rows()
        inserted = write_rows(conn, raw)
        LOG.info("inserted %d", inserted)
|
|
|
|
|
|
if __name__ == "__main__":
    # With "test" as the first argument, probe the endpoints and exit 0 on
    # success or 1 on failure. Any other invocation runs the full load.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "test":
        exit_code = 0 if test_endpoint() else 1
        sys.exit(exit_code)
    main()
|