BHI layer v1: docs, schema, Phase A ingestion stubs

2026-04-05 20:15:36 +00:00
commit 3dfd9ea3c6
21 changed files with 2399 additions and 0 deletions
--- a/jobs/ingestion/nppes.py
+++ b/jobs/ingestion/nppes.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# READY TO DEPLOY — requires base Brain Postgres schema + run schemas/bhi_tables.sql
+"""
+CMS NPPES (National Plan & Provider Enumeration System) — behavioral health
+providers by taxonomy + state.
+
+API: https://npiregistry.cms.hhs.gov/api/?version=2.1
+Filter: taxonomy codes for psychiatry, psychology, counseling, clinical social
+work, and substance use disorder (SUD) treatment.
+"""
+import logging
+import sys
+from _common import RateLimitedSession, bulk_insert, job_run
+
+# Module logger and the NPI Registry endpoint every request in this job targets.
+LOG = logging.getLogger("bhi.nppes")
+BASE = "https://npiregistry.cms.hhs.gov/api/"
+
+# Behavioral-health taxonomy codes queried by fetch_rows, one request series
+# per (state, code) pair.
+BH_TAXONOMY_CODES = [
+    # Psychiatry
+    "2084P0800X",
+    # Addiction Psychiatry
+    "2084P0802X",
+    # Child & Adolescent Psychiatry
+    "2084P0804X",
+    # Psychologist
+    "103T00000X",
+    # Clinical Child & Adolescent Psychologist
+    "103TC2200X",
+    # Mental Health Counselor
+    "101YM0800X",
+    # Clinical Social Worker
+    "1041C0700X",
+    # Substance Abuse Rehabilitation Facility
+    "324500000X",
+    # Psychiatric Hospital
+    "283Q00000X",
+    # Psychiatric Residential Treatment Facility
+    "323P00000X",
+]
+
+# Two-letter postal codes: the 50 states followed by DC.
+STATES = (
+    "AL AK AZ AR CA CO CT DE FL GA HI ID IL IN "
+    "IA KS KY LA ME MD MA MI MN MS MO MT NE NV "
+    "NH NJ NM NY NC ND OH OK OR PA RI SC SD TN "
+    "TX UT VT VA WA WV WI WY DC"
+).split()
+
+
+def test_endpoint():
+    """Smoke-test the NPPES API with one small query.
+
+    Requests at most two "psychiatric" providers in NY, prints the reported
+    result count, and returns True when that count is greater than zero.
+    """
+    query = {
+        "version": "2.1",
+        "taxonomy_description": "psychiatric",
+        "state": "NY",
+        "limit": 2,
+    }
+    session = RateLimitedSession()
+    payload = session.get(BASE, params=query).json()
+    print(f"OK: result_count={payload.get('result_count')}")
+    found = payload.get("result_count", 0)
+    return found > 0
+
+
+def fetch_rows(states=None, taxonomy_codes=None):
+    """Fetch NPPES provider records for every state/taxonomy combination.
+
+    Each (state, taxonomy) pair is paged through in blocks of 200 records
+    until a short page, an empty page, an API error, or the paging cap is hit.
+
+    Args:
+        states: Optional iterable of state codes to query. Defaults to STATES.
+        taxonomy_codes: Optional iterable of taxonomy codes to query.
+            Defaults to BH_TAXONOMY_CODES.
+
+    Returns:
+        A list of the raw result dicts returned by the API. Each dict is
+        annotated in place with "_state" and "_taxonomy" keys naming the
+        query that produced it, so a provider matching several queries
+        appears once per match.
+    """
+    # NOTE(review): the code is sent as `taxonomy_description`; the NPPES API
+    # docs describe that parameter as a match on the taxonomy's description
+    # text -- confirm that code values actually return results.
+    page_size = 200  # largest `limit` the API accepts (per NPPES API docs)
+    max_skip = 1000  # largest `skip` the API accepts (per NPPES API docs)
+    if states is None:
+        states = STATES
+    if taxonomy_codes is None:
+        taxonomy_codes = BH_TAXONOMY_CODES
+
+    s = RateLimitedSession(min_interval=0.1)
+    all_rows = []
+    for state in states:
+        for taxonomy in taxonomy_codes:
+            fetched = 0
+            # Stop at max_skip: a request beyond it cannot return data, so the
+            # old extra call at skip=1200 was a wasted round trip.
+            for skip in range(0, max_skip + 1, page_size):
+                payload = s.get(BASE, params={
+                    "version": "2.1",
+                    "taxonomy_description": taxonomy,
+                    "state": state,
+                    "limit": page_size,
+                    "skip": skip,
+                }).json()
+                # Surface API-reported problems instead of silently treating
+                # them as "no more results".
+                errors = payload.get("Errors")
+                if errors:
+                    LOG.warning("state=%s tax=%s skip=%d API errors: %s",
+                                state, taxonomy, skip, errors)
+                    break
+                results = payload.get("results") or []
+                for row in results:
+                    row["_state"] = state
+                    row["_taxonomy"] = taxonomy
+                all_rows.extend(results)
+                fetched += len(results)
+                if len(results) < page_size:
+                    break  # short (or empty) page: this query is exhausted
+            else:
+                # Every page up to the cap came back full, so more matching
+                # records may exist than the API lets us page through.
+                LOG.warning("state=%s tax=%s hit the NPPES paging cap; "
+                            "results may be truncated", state, taxonomy)
+            # `fetched` is this query's count; `total` is the running sum.
+            LOG.info("state=%s tax=%s fetched=%d total=%d",
+                     state, taxonomy, fetched, len(all_rows))
+    return all_rows
+
+
+def _nppes_row_fields(r):
+    """Map one raw NPPES result onto the bhi_facilities columns it populates."""
+    addresses = r.get("addresses") or []
+    # Prefer the practice LOCATION address; otherwise fall back to whichever
+    # address is listed first, or to an empty mapping when there are none.
+    location = next(
+        (a for a in addresses if a.get("address_purpose") == "LOCATION"),
+        addresses[0] if addresses else {},
+    )
+    basic = r.get("basic") or {}
+    org_name = basic.get("organization_name")
+    person_name = " ".join(
+        filter(None, [basic.get("first_name"), basic.get("last_name")])
+    )
+    # Organizations are identified by enumeration type "NPI-2" (per the NPPES
+    # API docs) or by carrying an organization name. The previous check on
+    # `name_prefix` was inverted: organizations have no name prefix, and
+    # individuals with one (e.g. "Dr.") were labelled "org".
+    is_org = r.get("enumeration_type") == "NPI-2" or bool(org_name)
+    return {
+        "npi": str(r.get("number", "")),
+        "name": org_name or person_name,
+        "address": location.get("address_1"),
+        "city": location.get("city"),
+        "state": location.get("state"),
+        "zip": location.get("postal_code"),
+        "facility_type": "org" if is_org else "provider",
+        "services_offered": [r.get("_taxonomy", "")],
+        "populations_served": [],
+        "payment_accepted": [],
+        "source": "nppes",
+    }
+
+
+def write_rows(conn, raw):
+    """Insert raw NPPES results into the bhi_facilities table.
+
+    Args:
+        conn: Database connection handed through to bulk_insert.
+        raw: Iterable of raw NPPES result dicts as produced by fetch_rows.
+
+    Returns:
+        The number of rows passed to bulk_insert.
+    """
+    cols = [
+        "ccn","npi","name","address","city","state","zip","county_fips",
+        "lat","lon","facility_type","ownership","bed_count","psych_bed_count",
+        "pediatric_psych_bed_count","adolescent_unit","young_adult_unit",
+        "services_offered","populations_served","payment_accepted",
+        "medicaid_accepted","accreditation","opened_date","closed_date",
+        "last_verified","source","source_raw_id",
+    ]
+    # Build each tuple by column name so values cannot drift out of position;
+    # columns NPPES does not supply are inserted as None.
+    rows = [
+        tuple(fields.get(col) for col in cols)
+        for fields in map(_nppes_row_fields, raw)
+    ]
+    bulk_insert(conn, "bhi_facilities", cols, rows)
+    return len(rows)
+
+
+def main():
+    """Run the NPPES ingestion job: fetch all rows, insert them, log the count."""
+    with job_run("bhi_nppes") as (conn, _):
+        fetched = fetch_rows()
+        inserted = write_rows(conn, fetched)
+        LOG.info("inserted %d", inserted)
+
+
+# Script entry point: `test` as the first argument runs only the endpoint
+# smoke test and exits 0 on success or 1 on failure; otherwise run the job.
+if __name__ == "__main__":
+    cli_args = sys.argv[1:]
+    if cli_args and cli_args[0] == "test":
+        exit_code = 0 if test_endpoint() else 1
+        sys.exit(exit_code)
+    main()