BHI layer v1: docs, schema, Phase A ingestion stubs
This commit is contained in:
85
jobs/ingestion/hrsa_hpsa.py
Normal file
85
jobs/ingestion/hrsa_hpsa.py
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
|
||||
# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
|
||||
"""
|
||||
HRSA Mental Health HPSA (Health Professional Shortage Areas) bulk CSV.
|
||||
|
||||
Source: https://data.hrsa.gov/DataDownload/DD_Files/BCD_HPSA_FCT_DET_MH.csv
|
||||
Confirmed: ~23 MB CSV, all active + historical MH HPSAs.
|
||||
"""
|
||||
import csv
|
||||
import io
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from _common import RateLimitedSession, bulk_insert, job_run
|
||||
|
||||
# Module-level logger; records from this job are emitted under "bhi.hrsa_hpsa".
LOG = logging.getLogger("bhi.hrsa_hpsa")
|
||||
# Download location of the HRSA mental-health HPSA bulk CSV fetched by this job.
URL = "https://data.hrsa.gov/DataDownload/DD_Files/BCD_HPSA_FCT_DET_MH.csv"
|
||||
|
||||
|
||||
def test_endpoint():
    """Smoke-test the HRSA download URL and print the first line of the CSV.

    The request is made with ``stream=True`` so only the first line is
    pulled rather than the whole file.

    Returns:
        True when a non-empty first line was read and printed, False when
        the response body was empty.
    """
    s = RateLimitedSession()
    r = s.get(URL, stream=True)
    try:
        # Supply a default so an empty body yields b"" instead of letting
        # StopIteration escape from next().
        first = next(r.iter_lines(), b"")
        if not first:
            print("FAIL: empty response body")
            return False
        print(f"OK: content-length={r.headers.get('content-length')}")
        print("header:", first.decode("utf-8", errors="replace")[:200])
        return True
    finally:
        # NOTE(review): assumes a requests-style response, where a streamed
        # body holds its connection until closed — confirm against
        # _common.RateLimitedSession.
        r.close()
|
||||
|
||||
|
||||
def fetch_rows():
    """Download the HRSA MH HPSA CSV and parse it into a list of row dicts.

    The whole response body is decoded and parsed in memory.

    Returns:
        list of dicts, one per CSV data row, keyed by the CSV header names.
    """
    s = RateLimitedSession(min_interval=0.5)
    r = s.get(URL)
    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte-order
    # mark; with plain "utf-8" a BOM would stay attached to the first header
    # name and silently break lookups of that column.
    # NOTE(review): assumes a requests-style response whose .text honours
    # .encoding — confirm against _common.RateLimitedSession.
    r.encoding = "utf-8-sig"
    reader = csv.DictReader(io.StringIO(r.text))
    rows = list(reader)
    LOG.info("fetched %d HPSA rows", len(rows))
    return rows
|
||||
|
||||
|
||||
def _parse_date(s):
|
||||
if not s:
|
||||
return None
|
||||
for fmt in ("%Y-%m-%d", "%m/%d/%Y"):
|
||||
try:
|
||||
return datetime.strptime(s, fmt).date()
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def _parse_int(s):
|
||||
try:
|
||||
return int(float(s)) if s not in (None, "") else None
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def write_rows(conn, raw):
    """Map raw HRSA CSV records onto ``bhi_shortages`` columns and insert them.

    Args:
        conn: Database connection, passed through to ``bulk_insert``.
        raw: Iterable of dict-like CSV records keyed by HRSA header names.

    Returns:
        The number of rows handed to ``bulk_insert``.
    """
    columns = [
        "hpsa_id",
        "state",
        "county_fips",
        "score",
        "population_served",
        "designated_date",
        "withdrawn_date",
        "source",
    ]

    def to_record(rec):
        # Tuple order must line up with `columns` above.
        return (
            rec.get("HPSA ID"),
            rec.get("Primary State Abbreviation"),
            # Fall back to the geography identifier when no county FIPS is given.
            rec.get("Common County FIPS Code")
            or rec.get("HPSA Geography Identification Number"),
            _parse_int(rec.get("HPSA Score")),
            _parse_int(rec.get("HPSA Designation Population")),
            _parse_date(rec.get("HPSA Designation Date")),
            _parse_date(rec.get("Withdrawn Date")),
            "hrsa_hpsa_mh",
        )

    records = [to_record(rec) for rec in raw]
    bulk_insert(conn, "bhi_shortages", columns, records)
    return len(records)
|
||||
|
||||
|
||||
def main():
    """Run the ingestion job: fetch the HRSA rows and load them under ``job_run``."""
    with job_run("bhi_hrsa_hpsa") as (conn, _unused):
        # Download first, then write using the connection supplied by job_run.
        fetched = fetch_rows()
        inserted = write_rows(conn, fetched)
        LOG.info("inserted %d", inserted)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # `test` as the first CLI argument runs only the endpoint smoke test and
    # exits with its result; any other invocation runs the full job.
    if sys.argv[1:2] == ["test"]:
        endpoint_ok = test_endpoint()
        sys.exit(0 if endpoint_ok else 1)
    main()
|
||||
Reference in New Issue
Block a user