BHI layer v1: docs, schema, Phase A ingestion stubs
This commit is contained in:
119
jobs/ingestion/cdc_wonder_mortality.py
Normal file
119
jobs/ingestion/cdc_wonder_mortality.py
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/usr/bin/env python3
|
||||
# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
|
||||
"""
|
||||
CDC WONDER — Underlying Cause of Death by county, age bracket, ICD-10.
|
||||
|
||||
Posts XML request body to https://wonder.cdc.gov/controller/datarequest/D76
|
||||
(Underlying Cause of Death 1999-2020) or D77 (2018+). The public non-restricted
|
||||
datasets return XML tables; county-level cells with <10 deaths are suppressed.
|
||||
|
||||
We request two slices:
|
||||
1. Suicide (X60-X84) for ages 13-17 and 18-25, by county
|
||||
2. Drug poisoning (X40-X44, Y10-Y14) for 13-17 and 18-25, by county
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from _common import RateLimitedSession, bulk_insert, job_run
|
||||
|
||||
# Module-level logger shared by every function in this ingestion job.
LOG = logging.getLogger("bhi.cdc_wonder")
|
||||
# WONDER data-request URL. The trailing "D76" selects the dataset; per the
# module docstring that is Underlying Cause of Death 1999-2020.
# NOTE(review): rows are labelled with period "2018-2022" further down, which
# does not fall inside the 1999-2020 range described for D76 — confirm which
# dataset/period is intended.
ENDPOINT = "https://wonder.cdc.gov/controller/datarequest/D76"
|
||||
|
||||
|
||||
def _build_xml(icd_codes: list[str], age_bracket: str) -> str:
    """Assemble the WONDER POST XML for one cause-of-death / age slice.

    Args:
        icd_codes: ICD-10 code ranges to filter on, e.g. ``["X60-X84"]``.
        age_bracket: Internal age bracket, either ``"13-17"`` or ``"18-25"``.

    Returns:
        The ``<request-parameters>`` document as a string. Parameters are
        emitted in a fixed order because the structure is value-order
        dependent.

    Raises:
        KeyError: If ``age_bracket`` is not one of the supported brackets.
    """
    # Function-scope import so this block does not depend on file-level imports.
    from xml.sax.saxutils import escape

    # Age groups in WONDER: 15-19, 20-24, 25-29 etc. Adolescent and young-adult
    # brackets don't align perfectly with 5-year WONDER bins — closest fit:
    wonder_ages = {
        "13-17": ["15-19"],  # approximate
        "18-25": ["20-24", "25-29"],
    }[age_bracket]

    def _values(items) -> str:
        # Multi-valued parameters repeat the same <value> element every other
        # parameter in this document uses; escaping keeps the XML well-formed
        # even if a value ever contains &, < or >.
        return "".join(f"<value>{escape(str(item))}</value>" for item in items)

    # NOTE(review): the dimension codes below (B_1/B_2 group-bys, F_D76.V1 for
    # ages, F_D76.V22 for ICD codes) are carried over unchanged — confirm them
    # against WONDER's D76 parameter list, and confirm the API allows grouping
    # by county at all.
    lines = [
        '<?xml version="1.0" encoding="utf-8"?>',
        "<request-parameters>",
        "<parameter><name>accept_datause_restrictions</name><value>true</value></parameter>",
        "<parameter><name>B_1</name><value>D76.V2-level1</value></parameter>",
        "<parameter><name>B_2</name><value>D76.V51</value></parameter>",
        f"<parameter><name>F_D76.V1</name>{_values(wonder_ages)}</parameter>",
        "<parameter><name>F_D76.V2</name><value>*All*</value></parameter>",
        f"<parameter><name>F_D76.V22</name>{_values(icd_codes)}</parameter>",
        "<parameter><name>O_age</name><value>D76.V51</value></parameter>",
        "<parameter><name>O_location</name><value>D76.V9</value></parameter>",
        "<parameter><name>VM_D76.M6_D76.V10</name><value/></parameter>",
        "</request-parameters>",
    ]
    return "\n".join(lines)
|
||||
|
||||
|
||||
def test_endpoint():
    """Smoke-test the WONDER endpoint with a single suicide / 13-17 request.

    Prints a one-line summary (status code and body length) and returns True
    only when the response is HTTP 200 and its body contains ``<response``.
    """
    session = RateLimitedSession(min_interval=1.0)
    form = {
        "request_xml": _build_xml(["X60-X84"], "13-17"),
        "accept_datause_restrictions": "true",
    }
    resp = session.post(ENDPOINT, data=form)
    status = resp.status_code
    payload = resp.content
    # Both conditions must hold: a 200 alone does not prove WONDER answered
    # with a result document.
    ok = status == 200 and b"<response" in payload
    print(f"OK={ok}, status={status}, len={len(payload)}")
    return ok
|
||||
|
||||
|
||||
def fetch_rows():
    """Request every (measure, age bracket) slice from WONDER and parse it.

    Two measures are fetched — suicide (X60-X84) and drug poisoning
    (X40-X44, Y10-Y14) — each for the 13-17 and 18-25 brackets, through a
    rate-limited session.

    Returns:
        A flat list of row dicts as produced by ``_parse_wonder_xml``. Slices
        whose request did not come back with HTTP 200 contribute no rows and
        are reported at ERROR level.
    """
    session = RateLimitedSession(min_interval=1.0)
    slices = [
        ("suicide_rate", ["X60-X84"]),
        ("overdose_rate", ["X40-X44", "Y10-Y14"]),
    ]
    collected = []
    for measure, icd in slices:
        for bracket in ("13-17", "18-25"):
            resp = session.post(ENDPOINT, data={
                "request_xml": _build_xml(icd, bracket),
                "accept_datause_restrictions": "true",
            })
            # Same success criterion as test_endpoint(): anything but 200 is
            # a failed request, not an empty result set, so say so explicitly
            # instead of handing an error page to the parser.
            if resp.status_code != 200:
                LOG.error("%s %s -> HTTP %s from WONDER, slice skipped",
                          measure, bracket, resp.status_code)
                continue
            rows = _parse_wonder_xml(resp.text, measure, bracket)
            collected.extend(rows)
            LOG.info("%s %s -> %d rows", measure, bracket, len(rows))
    return collected
|
||||
|
||||
|
||||
def _parse_wonder_xml(xml_text: str, measure: str, bracket: str,
                      period: str = "2018-2022") -> list[dict]:
    """Turn a WONDER XML response into ``bhi_demand_indicators`` row dicts.

    Args:
        xml_text: Raw XML body returned by WONDER.
        measure: Measure name stored on every row (e.g. ``"suicide_rate"``).
        bracket: Age bracket stored on every row (e.g. ``"13-17"``).
        period: Period label stored on every row. Defaults to the previously
            hard-coded ``"2018-2022"``.
            NOTE(review): the request does not filter by year — confirm this
            label matches what the dataset actually returns.

    Returns:
        One dict per ``<r>`` row that has at least three cells and whose last
        cell parses as a number. Rows whose last cell is non-numeric (e.g. a
        suppressed value) are skipped. An unparseable document yields ``[]``.
    """
    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError:
        # Keep the best-effort contract (empty result), but record the
        # traceback so the position of the XML error is visible in the logs.
        LOG.exception("WONDER XML parse failed")
        return []

    parsed = []
    # WONDER returns <data-table> with <r> rows of <c> cells: group-by labels
    # sit in the "l" attribute and measure values in the "v" attribute. Cell
    # text is kept as a last-resort fallback.
    for row in root.iter("r"):
        cells = [c.get("l") or c.get("v") or c.text for c in row.findall("c")]
        if len(cells) < 3:
            continue
        # NOTE(review): the first cell is stored as geo_code. With two
        # group-by columns a row-spanning first cell may be absent on
        # continuation rows — confirm against a real response.
        county = cells[0]
        raw_rate = cells[-1]
        if raw_rate is None:
            continue
        try:
            # Values can carry thousands separators ("1,234.5").
            rate = float(raw_rate.replace(",", ""))
        except ValueError:
            # Non-numeric markers such as suppressed cells are dropped.
            continue
        parsed.append({
            "geo_type": "county",
            "geo_code": county,
            "measure": measure,
            "age_bracket": bracket,
            "period": period,
            "value": rate,
            "source": "cdc_wonder",
        })
    return parsed
|
||||
|
||||
|
||||
def write_rows(conn, raw):
    """Insert parsed indicator rows into ``bhi_demand_indicators``.

    Each dict in ``raw`` is flattened to a tuple in column order and the whole
    batch is handed to ``bulk_insert`` on ``conn``. Returns the number of
    tuples passed to the insert.
    """
    cols = ["geo_type","geo_code","measure","age_bracket","period","value","source"]
    # One tuple per record, fields in exactly the order of `cols`.
    rows = [tuple(record[col] for col in cols) for record in raw]
    bulk_insert(conn, "bhi_demand_indicators", cols, rows)
    return len(rows)
|
||||
|
||||
|
||||
def main():
    """Run the ingestion job: fetch all WONDER slices and store them.

    Everything happens inside the ``job_run("bhi_cdc_wonder")`` context, which
    supplies the connection used for the insert.
    """
    with job_run("bhi_cdc_wonder") as (conn, _):
        fetched = fetch_rows()
        inserted = write_rows(conn, fetched)
        LOG.info("inserted %d", inserted)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # `<script> test` runs only the endpoint smoke test and exits 0 on
    # success, 1 on failure; any other invocation runs the full job.
    if sys.argv[1:2] == ["test"]:
        sys.exit(int(not test_endpoint()))
    main()
|
||||
Reference in New Issue
Block a user