BHI layer v1: docs, schema, Phase A ingestion stubs
This commit is contained in:
95
jobs/ingestion/samhsa_locator.py
Normal file
95
jobs/ingestion/samhsa_locator.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
# READY TO DEPLOY — requires the base Brain Postgres schema, plus running schemas/bhi_tables.sql
|
||||
"""
|
||||
SAMHSA findtreatment.gov behavioral health facility locator.
|
||||
|
||||
Source: https://findtreatment.gov/locator/exportsAsJson/v2
|
||||
Confirmed: 96,009 facilities across 3,201 pages (sType=BH).
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
from _common import RateLimitedSession, bulk_insert, job_run
|
||||
|
||||
# Module logger used for all progress messages emitted by this job.
LOG = logging.getLogger("bhi.samhsa_locator")

# JSON export endpoint of the findtreatment.gov locator.
BASE = "https://findtreatment.gov/locator/exportsAsJson/v2"

# Seed ZIP sent as ``sAddr``. Per the original author's note, any valid ZIP
# works because results for the 'BH' sType are national.
ZIP_SEED = "10001"

# Rows requested per page; noted by the original author as the server default.
PAGE_SIZE = 30
|
||||
|
||||
|
||||
def test_endpoint():
    """Smoke-test the locator endpoint with a single page-1 request.

    Prints the record and page counts reported by the response and, when at
    least one row came back, the first row's ``name1`` and ``state``.

    Returns:
        True when the response contained at least one row, otherwise False.
    """
    session = RateLimitedSession()
    query = {"sType": "BH", "sAddr": ZIP_SEED, "page": 1}
    payload = session.get(BASE, params=query).json()

    record_count = payload.get("recordCount")
    total_pages = payload.get("totalPages")
    print(f"OK: recordCount={record_count}, totalPages={total_pages}")

    rows = payload.get("rows", [])
    if not rows:
        return False

    first = rows[0]
    print("sample:", first.get("name1"), first.get("state"))
    return True
|
||||
|
||||
|
||||
def fetch_rows(max_pages: int | None = None):
    """Download facility records from the locator endpoint, page by page.

    Args:
        max_pages: optional cap on the number of pages requested. A falsy
            value (``None`` or ``0``) means no cap.

    Returns:
        A list holding the ``rows`` entries of every fetched page, in page
        order.
    """
    s = RateLimitedSession(min_interval=0.3)
    out = []
    page = 1
    total = None
    while True:
        r = s.get(
            BASE,
            params={"sType": "BH", "sAddr": ZIP_SEED, "pageSize": PAGE_SIZE, "page": page},
        ).json()

        if total is None:
            # Latch the page count from the first response. A missing, null
            # or zero ``totalPages`` falls back to 1 so the comparisons and
            # the %d log below never receive None.
            reported = r.get("totalPages")
            if not reported:
                LOG.warning("totalPages missing from response; assuming a single page")
            total = reported or 1

        # ``rows`` may be absent or null; treat both as an empty page.
        out.extend(r.get("rows") or [])

        if page % 50 == 0:
            LOG.info("page %d/%d (total rows %d)", page, total, len(out))
        if page >= total or (max_pages and page >= max_pages):
            break
        page += 1

    LOG.info("fetched %d facilities", len(out))
    return out
|
||||
|
||||
|
||||
def _parse_float(v):
|
||||
try:
|
||||
return float(v) if v not in (None, "") else None
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def write_rows(conn, raw):
    """Convert raw locator records into ``bhi_facilities`` rows and insert them.

    Args:
        conn: connection object passed through to ``bulk_insert``.
        raw: iterable of facility dicts (the ``rows`` entries of the API
            response, as collected by ``fetch_rows``).

    Returns:
        The number of rows handed to ``bulk_insert``.
    """
    cols = [
        "ccn", "npi", "name", "address", "city", "state", "zip", "county_fips",
        "lat", "lon", "facility_type", "ownership", "bed_count", "psych_bed_count",
        "pediatric_psych_bed_count", "adolescent_unit", "young_adult_unit",
        "services_offered", "populations_served", "payment_accepted",
        "medicaid_accepted", "accreditation", "opened_date", "closed_date",
        "last_verified", "source", "source_raw_id",
    ]
    # Substrings that flag adolescent / young-adult services in a service entry.
    adolescent_terms = ("adolescent", "youth", "teen")
    young_adult_terms = ("young adult", "transitional age")

    rows = []
    for r in raw:
        name = " ".join(filter(None, [r.get("name1"), (r.get("name2") or "").strip()])).strip()

        # NOTE(review): assumes ``services`` is a comma-separated string —
        # confirm against a live payload.
        # Trim every entry and drop blanks so the stored list carries no
        # leading-space or empty items from ", " separators or stray commas.
        services = [
            part.strip()
            for part in (r.get("services") or "").split(",")
            if part.strip()
        ]

        # SAMHSA flags adolescent/young-adult services in the services string.
        services_lc = [s.lower() for s in services]
        # ``or None``: a record without a matching keyword is stored as NULL
        # (unknown) rather than False.
        adolescent = any(term in s for s in services_lc for term in adolescent_terms) or None
        young_adult = any(term in s for s in services_lc for term in young_adult_terms) or None

        rows.append((
            None, None,  # ccn/npi unknown from this source
            name, r.get("street1"),
            r.get("city"), r.get("state"), r.get("zip"), None,  # county_fips not provided
            _parse_float(r.get("latitude")), _parse_float(r.get("longitude")),
            r.get("typeFacility") or "bh_facility",
            None, None, None, None,  # ownership and the three bed counts
            adolescent, young_adult,
            services, [], [],  # populations_served / payment_accepted left empty
            None, None, None, None, None,  # medicaid_accepted .. last_verified
            "samhsa_locator", None,  # source, source_raw_id
        ))

    bulk_insert(conn, "bhi_facilities", cols, rows)
    return len(rows)
|
||||
|
||||
|
||||
def main():
    """Run the full ingestion: fetch all facilities, insert them, log the count.

    The work happens inside the ``job_run("bhi_samhsa_locator")`` context,
    which supplies the connection used for the insert.
    """
    with job_run("bhi_samhsa_locator") as (conn, _):
        facilities = fetch_rows()
        inserted = write_rows(conn, facilities)
        LOG.info("inserted %d", inserted)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # ``test`` as the first CLI argument runs only the endpoint smoke check
    # and exits with 0 on success, 1 on failure; otherwise run the ingestion.
    if sys.argv[1:2] == ["test"]:
        endpoint_ok = test_endpoint()
        sys.exit(0 if endpoint_ok else 1)
    main()
|
||||
Reference in New Issue
Block a user