diff --git a/obbba_district_impacts/Congressional-Hackathon-2025 b/obbba_district_impacts/Congressional-Hackathon-2025 new file mode 160000 index 0000000..3f6d05e --- /dev/null +++ b/obbba_district_impacts/Congressional-Hackathon-2025 @@ -0,0 +1 @@ +Subproject commit 3f6d05e76400c6e396a3a4eddd34a7b3f6919fc3 diff --git a/ri_dataset_exploration.ipynb b/ri_dataset_exploration.ipynb new file mode 100644 index 0000000..daa7ea0 --- /dev/null +++ b/ri_dataset_exploration.ipynb @@ -0,0 +1,518 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "be1cea7a", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us import Microsimulation\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0d21b774", + "metadata": {}, + "outputs": [], + "source": [ + "# Load RI dataset\n", + "sim = Microsimulation(dataset=\"hf://policyengine/policyengine-us-data/states/RI.h5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1870e7ac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of households in dataset: 11,735\n", + "Household count (mapped): 390,872\n", + "Person count (mapped): 1,120,502\n" + ] + } + ], + "source": [ + "# Check dataset size\n", + "household_weight = sim.calculate(\"household_weight\", period=2025)\n", + "household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n", + "person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n", + "\n", + "print(f\"Number of households in dataset: {len(household_weight):,}\")\n", + "print(f\"Household count (mapped): {household_count.sum():,.0f}\")\n", + "print(f\"Person count (mapped): {person_count.sum():,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f0c79a50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Income distribution:\n", + " Median AGI: $41,974\n", + " 75th percentile: $95,029\n", + " 90th percentile: $150,242\n", + " 95th percentile: $240,597\n", + " Max AGI: $681,759,168\n", + "\n", + "Households by income threshold:\n", + " Households over $80k: 122,227\n", + " Households over $120k: 63,053\n", + " Households over $160k: 32,026\n", + " Households over $240k: 19,712\n" + ] + } + ], + "source": [ + "# Check household income distribution (AGI aggregated to household level via map_to; .sum() on the result is a weighted, fractional count)\n", + "agi = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "print(f\"Income distribution:\")\n", + "print(f\" Median AGI: ${agi.median():,.0f}\")\n", + "print(f\" 75th percentile: ${agi.quantile(0.75):,.0f}\")\n", + "print(f\" 90th percentile: ${agi.quantile(0.90):,.0f}\")\n", + "print(f\" 95th percentile: ${agi.quantile(0.95):,.0f}\")\n", + "print(f\" Max AGI: ${agi.max():,.0f}\")\n", + "print(f\"\\nHouseholds by income threshold:\")\n", + "print(f\" Households over $80k: {(agi > 80_000).sum():,.0f}\")\n", + "print(f\" Households over $120k: {(agi > 120_000).sum():,.0f}\")\n", + "print(f\" Households over $160k: {(agi > 160_000).sum():,.0f}\")\n", + "print(f\" Households over $240k: {(agi > 240_000).sum():,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "71b548db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Households with children (weighted):\n", + " Total households with children: 115,262\n", + " Households with 1 child: 52,115\n", + " Households with 2 children: 35,227\n", + " Households with 3+ children: 27,919\n" + ] + } + ], + "source": [ + "# Check households with children (count at person level, aggregate to household)\n", + "is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n", + "household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n", + "household_weight = 
sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n", + "\n", + "# Create DataFrame for easier manipulation\n", + "df_households = pd.DataFrame({\n", + " 'household_id': household_id,\n", + " 'is_child': is_child,\n", + " 'household_weight': household_weight\n", + "})\n", + "\n", + "# Count children per household\n", + "children_per_household = df_households.groupby('household_id').agg({\n", + " 'is_child': 'sum',\n", + " 'household_weight': 'first' # household_weight is same for all members\n", + "}).reset_index()\n", + "\n", + "# Calculate weighted household counts\n", + "total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()\n", + "households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()\n", + "households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()\n", + "households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()\n", + "\n", + "print(f\"\\nHouseholds with children (weighted):\")\n", + "print(f\" Total households with children: {total_households_with_children:,.0f}\")\n", + "print(f\" Households with 1 child: {households_with_1_child:,.0f}\")\n", + "print(f\" Households with 2 children: {households_with_2_children:,.0f}\")\n", + "print(f\" Households with 3+ children: {households_with_3plus_children:,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a215302f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Children by age:\n", + " Total children under 18: 220,202\n", + " Children under 4: 41,849\n", + " Children under 6: 62,791\n", + " Children ages 6-17: 157,410\n", + "\n", + "Verification: under 6 + ages 6-17 = 220,202\n", + "\n", + "Sample of children under 4:\n", + " household_id tax_unit_id person_id 
age\n", + "33 14 20 33 2.0\n", + "40 15 23 40 0.0\n", + "51 17 28 51 2.0\n", + "100 37 54 100 2.0\n", + "117 44 63 117 2.0\n", + "139 50 71 139 1.0\n", + "165 62 84 165 3.0\n", + "166 62 84 166 1.0\n", + "188 73 96 188 3.0\n", + "228 93 119 228 2.0\n" + ] + } + ], + "source": [ + "# Check children by age groups\n", + "df = pd.DataFrame({\n", + " \"household_id\": sim.calculate(\"household_id\", map_to=\"person\"),\n", + " \"tax_unit_id\": sim.calculate(\"tax_unit_id\", map_to=\"person\"),\n", + " \"person_id\": sim.calculate(\"person_id\", map_to=\"person\"),\n", + " \"age\": sim.calculate(\"age\", map_to=\"person\"),\n", + " \"person_weight\": sim.calculate(\"person_weight\", map_to=\"person\")\n", + "})\n", + "\n", + "# Filter for children by age and apply weights\n", + "children_under_4_df = df[df['age'] < 4]\n", + "children_under_6_df = df[df['age'] < 6]\n", + "children_under_18_df = df[df['age'] < 18]\n", + "children_6_17_df = df[(df['age'] >= 6) & (df['age'] < 18)]\n", + "\n", + "# Calculate weighted totals using consistent age < 18 definition\n", + "total_children = children_under_18_df['person_weight'].sum()\n", + "children_under_4 = children_under_4_df['person_weight'].sum()\n", + "children_under_6 = children_under_6_df['person_weight'].sum()\n", + "children_6_17 = children_6_17_df['person_weight'].sum()\n", + "\n", + "print(f\"\\nChildren by age:\")\n", + "print(f\" Total children under 18: {total_children:,.0f}\")\n", + "print(f\" Children under 4: {children_under_4:,.0f}\")\n", + "print(f\" Children under 6: {children_under_6:,.0f}\")\n", + "print(f\" Children ages 6-17: {children_6_17:,.0f}\")\n", + "\n", + "# Verify counts add up\n", + "print(f\"\\nVerification: under 6 + ages 6-17 = {children_under_6 + children_6_17:,.0f}\")\n", + "\n", + "print(f\"\\nSample of children under 4:\")\n", + "print(children_under_4_df[['household_id', 'tax_unit_id', 'person_id', 'age']].head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"9468033e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "RI DATASET SUMMARY - WEIGHTED (Population Estimates)\n", + "============================================================\n", + " Metric Value\n", + " Household count (weighted) 390,872\n", + " Person count (weighted) 1,120,502\n", + " Median AGI $41,974\n", + " 75th percentile AGI $95,029\n", + " 90th percentile AGI $150,242\n", + " 95th percentile AGI $240,597\n", + " Max AGI $681,759,168\n", + " Households over $80k 122,227\n", + " Households over $120k 63,053\n", + " Households over $160k 32,026\n", + " Households over $240k 19,712\n", + "Total households with children 115,262\n", + " Households with 1 child 52,115\n", + " Households with 2 children 35,227\n", + " Households with 3+ children 27,919\n", + " Total children (age < 18) 220,202\n", + " Children under 4 41,849\n", + " Children under 6 62,791\n", + " Children ages 6-17 157,410\n", + "============================================================\n", + "\n", + "============================================================\n", + "RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\n", + "============================================================\n", + " Metric Value\n", + " Number of households in dataset 11,735\n", + " Number of persons in dataset 29,623\n", + " Households with children (unweighted) 3,836\n", + " Households with 1 child (unweighted) 1,684\n", + " Households with 2 children (unweighted) 1,324\n", + "Households with 3+ children (unweighted) 828\n", + " Children under 18 (unweighted) 7,248\n", + " Children under 4 (unweighted) 1,291\n", + " Children under 6 (unweighted) 1,999\n", + " Children ages 6-17 (unweighted) 5,249\n", + "============================================================\n", + "\n", + "Summaries saved to:\n", + " - ri_dataset_summary_weighted.csv\n", + " - ri_dataset_summary_unweighted.csv\n" + 
] + } + ], + "source": [ + "# Create weighted summary table\n", + "weighted_summary_data = {\n", + " 'Metric': [\n", + " 'Household count (weighted)',\n", + " 'Person count (weighted)',\n", + " 'Median AGI',\n", + " '75th percentile AGI',\n", + " '90th percentile AGI',\n", + " '95th percentile AGI',\n", + " 'Max AGI',\n", + " 'Households over $80k',\n", + " 'Households over $120k',\n", + " 'Households over $160k',\n", + " 'Households over $240k',\n", + " 'Total households with children',\n", + " 'Households with 1 child',\n", + " 'Households with 2 children',\n", + " 'Households with 3+ children',\n", + " 'Total children (age < 18)',\n", + " 'Children under 4',\n", + " 'Children under 6',\n", + " 'Children ages 6-17'\n", + " ],\n", + " 'Value': [\n", + " f\"{household_count.sum():,.0f}\",\n", + " f\"{person_count.sum():,.0f}\",\n", + " f\"${agi.median():,.0f}\",\n", + " f\"${agi.quantile(0.75):,.0f}\",\n", + " f\"${agi.quantile(0.90):,.0f}\",\n", + " f\"${agi.quantile(0.95):,.0f}\",\n", + " f\"${agi.max():,.0f}\",\n", + " f\"{(agi > 80_000).sum():,.0f}\",\n", + " f\"{(agi > 120_000).sum():,.0f}\",\n", + " f\"{(agi > 160_000).sum():,.0f}\",\n", + " f\"{(agi > 240_000).sum():,.0f}\",\n", + " f\"{total_households_with_children:,.0f}\",\n", + " f\"{households_with_1_child:,.0f}\",\n", + " f\"{households_with_2_children:,.0f}\",\n", + " f\"{households_with_3plus_children:,.0f}\",\n", + " f\"{total_children:,.0f}\",\n", + " f\"{children_under_4:,.0f}\",\n", + " f\"{children_under_6:,.0f}\",\n", + " f\"{children_6_17:,.0f}\"\n", + " ]\n", + "}\n", + "\n", + "# Get unique counts for unweighted table\n", + "unique_households = df['household_id'].nunique()\n", + "unique_persons = len(df)\n", + "\n", + "# Create unweighted summary table\n", + "unweighted_summary_data = {\n", + " 'Metric': [\n", + " 'Number of households in dataset',\n", + " 'Number of persons in dataset',\n", + " 'Households with children (unweighted)',\n", + " 'Households with 1 child (unweighted)',\n", + " 
'Households with 2 children (unweighted)',\n", + " 'Households with 3+ children (unweighted)',\n", + " 'Children under 18 (unweighted)',\n", + " 'Children under 4 (unweighted)',\n", + " 'Children under 6 (unweighted)',\n", + " 'Children ages 6-17 (unweighted)'\n", + " ],\n", + " 'Value': [\n", + " f\"{unique_households:,}\",\n", + " f\"{unique_persons:,}\",\n", + " f\"{(children_per_household['is_child'] > 0).sum():,}\",\n", + " f\"{(children_per_household['is_child'] == 1).sum():,}\",\n", + " f\"{(children_per_household['is_child'] == 2).sum():,}\",\n", + " f\"{(children_per_household['is_child'] >= 3).sum():,}\",\n", + " f\"{len(children_under_18_df):,}\",\n", + " f\"{len(children_under_4_df):,}\",\n", + " f\"{len(children_under_6_df):,}\",\n", + " f\"{len(children_6_17_df):,}\"\n", + " ]\n", + "}\n", + "\n", + "weighted_df = pd.DataFrame(weighted_summary_data)\n", + "unweighted_df = pd.DataFrame(unweighted_summary_data)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n", + "print(\"=\"*60)\n", + "print(weighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\")\n", + "print(\"=\"*60)\n", + "print(unweighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "# Save both tables\n", + "weighted_df.to_csv('ri_dataset_summary_weighted.csv', index=False)\n", + "unweighted_df.to_csv('ri_dataset_summary_unweighted.csv', index=False)\n", + "print(\"\\nSummaries saved to:\")\n", + "print(\" - ri_dataset_summary_weighted.csv\")\n", + "print(\" - ri_dataset_summary_unweighted.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dzvou2zqia4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AGI Distribution:\n", + " Tax unit median: $22,998\n", + " Household median: $41,974\n", + "\n", + "Total AGI for Rhode Island: 
$40,095,429,101\n", + "Number of tax units: 65,515,466\n" + ] + } + ], + "source": [ + "# AGI analysis at the correct aggregation level\n", + "# AGI is a tax_unit variable - always use tax_unit or household level for totals\n", + "\n", + "agi_tax_unit = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "agi_household = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "\n", + "print(\"AGI Distribution:\")\n", + "print(f\" Tax unit median: ${agi_tax_unit.median():,.0f}\")\n", + "print(f\" Household median: ${agi_household.median():,.0f}\")\n", + "\n", + "# Total AGI - tax_unit and household levels give same correct result\n", + "# (household aggregates tax units within the same household)\n", + "total_agi = agi_tax_unit.sum()\n", + "print(f\"\\nTotal AGI for Rhode Island: ${total_agi:,.0f}\")\n", + "\n", + "# Count tax units: total the raw weights (.sum() on the series is already weighted, so it would add up weight**2)\n", + "tax_unit_weight = sim.calculate(\"tax_unit_weight\", period=2025)\n", + "print(f\"Number of tax units: {tax_unit_weight.values.sum():,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "gispfkxpnph", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AGI Component Breakdown (Tax Unit Level)\n", + "============================================================\n", + "\n", + "Total Income (Statewide):\n", + " Employment Income: $ 29,923,667,093\n", + " Self-Employment Income: $ 683,648,282\n", + " Capital Gains: $ -3,858,537,645\n", + " Qualified Dividends: $ 921,313,179\n", + " Interest Income: $ 600,727,409\n", + " Taxable Social Security: $ 586,089,199\n", + " Pension Income: $ 8,103,077,487\n", + " Adjusted Gross Income (AGI): $ 40,095,429,101\n", + "\n", + "Median Values:\n", + " Employment Income: $ 16,790\n", + " Self-Employment Income: $ 0\n", + " Capital Gains: $ 0\n", + " Qualified Dividends: $ 0\n", + " Interest Income: $ 0\n", + " Taxable Social Security: $ 0\n", + " Pension Income: $ 0\n", + " Adjusted 
Gross Income (AGI): $ 22,998\n", + "\n", + "Sum of income components: $ 36,959,985,006\n", + "AGI (for comparison): $ 40,095,429,101\n", + "Difference (potential missing income or deductions): $ -3,135,444,095\n" + ] + } + ], + "source": [ + "# Break down AGI components at tax unit level\n", + "print(\"AGI Component Breakdown (Tax Unit Level)\")\n", + "print(\"=\"*60)\n", + "\n", + "# Calculate key income components\n", + "employment_income = sim.calculate(\"employment_income\", period=2025, map_to=\"tax_unit\")\n", + "self_employment_income = sim.calculate(\"self_employment_income\", period=2025, map_to=\"tax_unit\")\n", + "capital_gains = sim.calculate(\"capital_gains\", period=2025, map_to=\"tax_unit\")\n", + "qualified_dividend_income = sim.calculate(\"qualified_dividend_income\", period=2025, map_to=\"tax_unit\")\n", + "interest_income = sim.calculate(\"interest_income\", period=2025, map_to=\"tax_unit\")\n", + "taxable_social_security = sim.calculate(\"taxable_social_security\", period=2025, map_to=\"tax_unit\")\n", + "pension_income = sim.calculate(\"pension_income\", period=2025, map_to=\"tax_unit\")\n", + "adjusted_gross_income = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "\n", + "print(\"\\nTotal Income (Statewide):\")\n", + "print(f\" Employment Income: ${employment_income.sum():>15,.0f}\")\n", + "print(f\" Self-Employment Income: ${self_employment_income.sum():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.sum():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.sum():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.sum():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.sum():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.sum():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "\n", + "print(\"\\nMedian Values:\")\n", + "print(f\" Employment Income: 
${employment_income.median():>15,.0f}\")\n", + "print(f\" Self-Employment Income: ${self_employment_income.median():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.median():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.median():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.median():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.median():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.median():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.median():>15,.0f}\")\n", + "\n", + "# Calculate sum of components to compare with AGI\n", + "total_components = (employment_income + self_employment_income + capital_gains + \n", + " qualified_dividend_income + interest_income + taxable_social_security + pension_income)\n", + "print(f\"\\nSum of income components: ${total_components.sum():>15,.0f}\")\n", + "print(f\"AGI (for comparison): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "print(f\"Difference (potential missing income or deductions): ${(total_components.sum() - adjusted_gross_income.sum()):>15,.0f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ri_dataset_summary_unweighted.csv b/ri_dataset_summary_unweighted.csv new file mode 100644 index 0000000..65552d1 --- /dev/null +++ b/ri_dataset_summary_unweighted.csv @@ -0,0 +1,11 @@ +Metric,Value +Number of households in dataset,"11,735" +Number of persons in dataset,"29,623" +Households with children (unweighted),"3,836" +Households with 1 child (unweighted),"1,684" +Households with 2 
children (unweighted),"1,324" +Households with 3+ children (unweighted),828 +Children under 18 (unweighted),"7,248" +Children under 4 (unweighted),"1,291" +Children under 6 (unweighted),"1,999" +Children ages 6-17 (unweighted),"5,249" diff --git a/ri_dataset_summary_weighted.csv b/ri_dataset_summary_weighted.csv new file mode 100644 index 0000000..feedcf4 --- /dev/null +++ b/ri_dataset_summary_weighted.csv @@ -0,0 +1,20 @@ +Metric,Value +Household count (weighted),"390,872" +Person count (weighted),"1,120,502" +Median AGI,"$41,974" +75th percentile AGI,"$95,029" +90th percentile AGI,"$150,242" +95th percentile AGI,"$240,597" +Max AGI,"$681,759,168" +Households over $80k,"122,227" +Households over $120k,"63,053" +Households over $160k,"32,026" +Households over $240k,"19,712" +Total households with children,"115,262" +Households with 1 child,"52,115" +Households with 2 children,"35,227" +Households with 3+ children,"27,919" +Total children (age < 18),"220,202" +Children under 4,"41,849" +Children under 6,"62,791" +Children ages 6-17,"157,410"