BHI layer v1: docs, schema, Phase A ingestion stubs
This commit is contained in:
92
jobs/ingestion/cdc_brfss.py
Normal file
92
jobs/ingestion/cdc_brfss.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
|
||||
"""
|
||||
CDC BRFSS Prevalence Data (Socrata).
|
||||
|
||||
Source: https://data.cdc.gov/resource/dttw-5yxu.json
|
||||
Pulls depression + mental-health-not-good items by state, with
|
||||
young-adult (18-24) breakouts where available.
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
from _common import RateLimitedSession, bulk_insert, job_run
|
||||
|
||||
# Module-level logger for this ingestion job.
LOG = logging.getLogger("bhi.cdc_brfss")
# Socrata JSON endpoint queried by test_endpoint() and fetch_rows().
BASE = "https://data.cdc.gov/resource/dttw-5yxu.json"

# BRFSS topics of interest for BHI.
# Each entry is compared for exact equality against the dataset's `topic`
# column in the `$where` filter built by fetch_rows().
TOPICS = [
    "Depression",
    "Mental Health Status",
    "Poor Mental Health",
]
|
||||
|
||||
|
||||
def test_endpoint():
    """Smoke-test the BRFSS endpoint by requesting two rows.

    Prints the number of rows returned and, when at least one row came
    back, the ``topic`` field of the first row.

    Returns:
        True if the endpoint returned a non-empty payload, otherwise False.
    """
    session = RateLimitedSession()
    response = session.get(BASE, params={"$limit": 2})
    payload = response.json()
    print(f"OK: returned {len(payload)} rows")
    has_rows = bool(payload)
    if has_rows:
        print("sample topic:", payload[0].get("topic"))
    return has_rows
|
||||
|
||||
|
||||
def fetch_rows():
    """Download every BRFSS row whose topic is listed in ``TOPICS``.

    Each topic is paged through with ``$limit``/``$offset`` in pages of
    5000 rows. Pages are requested with ``$order=:id`` because
    offset-based paging without an explicit order does not guarantee a
    stable row order, which can duplicate or drop rows across pages.

    Returns:
        A list of row dicts as decoded from the endpoint's JSON, for all
        topics concatenated in ``TOPICS`` order.

    Raises:
        ValueError: if a response body decodes to something other than a
            JSON array (for example an error object). Extending the result
            with a dict would silently add its keys as bogus "rows".
    """
    page_size = 5000
    session = RateLimitedSession(min_interval=0.2)
    out = []
    for topic in TOPICS:
        # SoQL string literals escape a single quote by doubling it.
        soql_topic = topic.replace("'", "''")
        offset = 0
        topic_rows = 0
        while True:
            batch = session.get(BASE, params={
                "$where": f"topic='{soql_topic}'",
                "$order": ":id",
                "$limit": page_size,
                "$offset": offset,
            }).json()
            if not batch:
                break
            if not isinstance(batch, list):
                raise ValueError(
                    f"unexpected response for topic {topic!r} at offset "
                    f"{offset}: {str(batch)[:200]}"
                )
            out.extend(batch)
            topic_rows += len(batch)
            # A short page means this topic is exhausted.
            if len(batch) < page_size:
                break
            offset += page_size
        # Report the per-topic count separately from the running total.
        LOG.info("topic=%s rows=%d total=%d", topic, topic_rows, len(out))
    return out
|
||||
|
||||
|
||||
def _brfss_age_bracket(break_out):
    """Map a BRFSS ``break_out`` label to the age-bracket value stored."""
    label = (break_out or "Overall").lower()
    if "18" in label and "24" in label:
        # NOTE(review): the source bracket is 18-24 but it is stored as
        # "18-25", presumably to match an existing BHI bracket name; confirm.
        return "18-25"
    if "overall" in label:
        return "all"
    return label


def write_rows(conn, raw):
    """Convert raw BRFSS rows and insert them into ``bhi_demand_indicators``.

    Rows whose ``data_value`` is missing, blank, or not parseable as a
    float are skipped. A missing value must not be stored as ``0.0``,
    because that would be indistinguishable from a genuine zero
    measurement; real zeros (``0`` / ``"0"``) are kept.

    Args:
        conn: database connection handed through to ``bulk_insert``.
        raw: iterable of row dicts as returned by ``fetch_rows``.

    Returns:
        The number of rows passed to ``bulk_insert``.
    """
    cols = ["geo_type", "geo_code", "measure", "age_bracket", "period", "value", "source"]
    rows = []
    for r in raw:
        raw_value = r.get("data_value")
        if raw_value is None or raw_value == "":
            continue
        try:
            val = float(raw_value)
        except (TypeError, ValueError):
            continue
        # Prefer the question text as the measure name; fall back to topic.
        measure = (r.get("question") or r.get("topic") or "").strip()[:120]
        rows.append((
            "state",
            r.get("locationabbr"),
            measure,
            _brfss_age_bracket(r.get("break_out")),
            str(r.get("year") or ""),
            val,
            "cdc_brfss",
        ))
    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
    return len(rows)
|
||||
|
||||
|
||||
def main():
    """Run the BRFSS ingestion: fetch all rows, insert them, log the count.

    Both the fetch and the insert happen inside the ``job_run`` context
    registered under the name ``bhi_cdc_brfss``.
    """
    with job_run("bhi_cdc_brfss") as (conn, _):
        fetched = fetch_rows()
        inserted = write_rows(conn, fetched)
        LOG.info("inserted %d", inserted)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # With `test` as the first CLI argument, only run the endpoint smoke
    # test and exit 0 on success / 1 on failure; otherwise run the full job.
    if sys.argv[1:2] == ["test"]:
        endpoint_ok = test_endpoint()
        sys.exit(0 if endpoint_ok else 1)
    main()
|
||||
Reference in New Issue
Block a user