THIS IS A DRAFT! Please talk to me before you use this.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from great_tables import GT
from pathlib import Path
import gc
import matplotlib.colors as mcolors
# Show full, untruncated DataFrames when printed
for _display_opt in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_opt, None)
# Constants
EMPLOYMENT_PARQUET_DIR = "../../employment_cube/parquet/"  # FedScope employment-cube snapshot files
SEPARATIONS_PARQUET_DIR = "../../separations_accessions/parquet/"  # FedScope separations/accessions files
AMS_CODE = 'AG02'  # agency sub-element code filtered for throughout (Agricultural Marketing Service)
# Color configuration (matplotlib "tab" palette hexes)
HIGH_SEPARATIONS_COLOR = "#d62728"  # red: high separation rates in the annual table
NET_POSITIVE_COLOR = "#2ca02c"      # green: months with net headcount gain
NET_NEGATIVE_COLOR = "#d62728"      # red: months with net headcount loss
# Utility Functions
def clean_employment(x):
    """Convert an employment cell to an int.

    FedScope suppresses small cells as 'REDACTED' or '*****'; those,
    empty strings, NaN/None, and unparseable values all map to 0.
    """
    if pd.isna(x) or x in ['REDACTED', '*****', '']:
        return 0
    try:
        return int(x)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` so only conversion failures
        # map to 0 — other exceptions now surface instead of hiding bugs.
        return 0
def calculate_fiscal_year(efdate):
    """Map an effective date in YYYYMM form to its federal fiscal year.

    The federal fiscal year begins in October, so dates in
    October-December belong to the next calendar year's fiscal year.
    """
    text = str(efdate)
    year = int(text[:4])
    month = int(text[4:6])
    if month >= 10:
        return year + 1
    return year
Data Loading
Show code
# Load Employment Data
#
# For each fiscal year the staffing baseline is the September snapshot of the
# PRIOR fiscal year, filtered to the AMS agency sub-element. Missing snapshot
# files are skipped (best-effort load); processing errors now surface instead
# of being silently swallowed by a blanket `except Exception: pass`.
fiscal_years = [2016, 2017, 2018, 2019, 2020, 2021, 2022]
employment_data = []          # one row per fiscal year: total AMS employment
tenure_employment_data = []   # one row per (fiscal year, tenure band)
for fy in fiscal_years:
    baseline_year = fy - 1
    file_path = f"{EMPLOYMENT_PARQUET_DIR}fedscope_employment_September_{baseline_year}.parquet"
    try:
        df = pd.read_parquet(file_path)
    except (FileNotFoundError, OSError):
        # Snapshot not available for this year -- skip rather than abort.
        continue
    # Filter for AMS
    ams_data = df[df['agysub'] == AMS_CODE].copy()
    if not ams_data.empty:
        ams_data['employment_num'] = ams_data['employment'].apply(clean_employment)
        # Overall employment
        total_emp = ams_data['employment_num'].sum()
        agency_name = ams_data['agysubt'].iloc[0]
        employment_data.append({
            'fiscal_year': fy,
            'employment': total_emp,
            'agysub': AMS_CODE,
            'agysubt': agency_name
        })
        # Tenure-specific employment ('A' and 'B' are the short-tenure bands)
        for tenure in ['A', 'B']:
            tenure_emp = ams_data[ams_data['loslvl'] == tenure]['employment_num'].sum()
            tenure_employment_data.append({
                'fiscal_year': fy,
                'loslvl': tenure,
                'employment': tenure_emp,
                'agysub': AMS_CODE,
                'agysubt': agency_name
            })
        # Everything outside 'A'/'B' is grouped as 2+ years
        tenure_2plus = ams_data[~ams_data['loslvl'].isin(['A', 'B'])]['employment_num'].sum()
        tenure_employment_data.append({
            'fiscal_year': fy,
            'loslvl': 'C+',
            'employment': tenure_2plus,
            'agysub': AMS_CODE,
            'agysubt': agency_name
        })
    # Release the large frame before the next read to cap peak memory
    del df
    gc.collect()
employment_df = pd.DataFrame(employment_data)
tenure_employment_df = pd.DataFrame(tenure_employment_data)
# Load Separations Data
#
# Separation records carry an efdate (YYYYMM); each file's records are mapped
# to fiscal years and counted. Missing files are skipped; the final concats
# are guarded because pd.concat([]) raises ValueError.
separations_files = [
    "fedscope_separations_FY2015-2019.parquet",
    "fedscope_separations_FY2020-2024.parquet",
    "fedscope_separations_March_2025.parquet"
]
separations_data = []         # per-file counts by fiscal year
tenure_separations_data = []  # per-file counts by (fiscal year, tenure band)
for file in separations_files:
    file_path = f"{SEPARATIONS_PARQUET_DIR}{file}"
    try:
        df = pd.read_parquet(file_path)
    except (FileNotFoundError, OSError):
        continue  # missing file -- best-effort load
    # Filter for AMS
    ams_data = df[df['agysub'] == AMS_CODE].copy()
    if not ams_data.empty:
        ams_data['fiscal_year'] = ams_data['efdate'].apply(calculate_fiscal_year)
        ams_data = ams_data[ams_data['fiscal_year'].isin(fiscal_years)]
        # Overall separations by fiscal year (one record = one separation)
        fy_seps = ams_data.groupby('fiscal_year').size().reset_index(name='separations')
        fy_seps['agysub'] = AMS_CODE
        separations_data.append(fy_seps)
        # Tenure-specific separations, only when the column is present
        if 'loslvl' in ams_data.columns:
            for tenure in ['A', 'B']:
                tenure_seps = ams_data[ams_data['loslvl'] == tenure].groupby('fiscal_year').size().reset_index(name='separations')
                tenure_seps['loslvl'] = tenure
                tenure_seps['agysub'] = AMS_CODE
                tenure_separations_data.append(tenure_seps)
            # Everything outside 'A'/'B' is grouped as 2+ years
            tenure_2plus_seps = ams_data[~ams_data['loslvl'].isin(['A', 'B'])].groupby('fiscal_year').size().reset_index(name='separations')
            tenure_2plus_seps['loslvl'] = 'C+'
            tenure_2plus_seps['agysub'] = AMS_CODE
            tenure_separations_data.append(tenure_2plus_seps)
    del df
    gc.collect()
# Aggregate across files; fall back to empty frames with the expected schema
# when no file could be read (pd.concat of an empty list raises).
if separations_data:
    separations_df = pd.concat(separations_data, ignore_index=True)
    separations_df = separations_df.groupby(['fiscal_year', 'agysub'])['separations'].sum().reset_index()
else:
    separations_df = pd.DataFrame(columns=['fiscal_year', 'agysub', 'separations'])
if tenure_separations_data:
    tenure_separations_df = pd.concat(tenure_separations_data, ignore_index=True)
    tenure_separations_df = tenure_separations_df.groupby(['fiscal_year', 'agysub', 'loslvl'])['separations'].sum().reset_index()
else:
    tenure_separations_df = pd.DataFrame(columns=['fiscal_year', 'agysub', 'loslvl', 'separations'])
# Get AMS name (fallback string used when no employment data was loaded)
AMS_NAME = employment_df['agysubt'].iloc[0] if not employment_df.empty else 'AGRICULTURAL MARKETING SERVICE'
Calculate Separation Rates
Show code
def _fill_and_rate(frame):
    """Zero-fill outer-merge gaps and add a percentage separation_rate column."""
    frame['employment'] = frame['employment'].fillna(0)
    frame['separations'] = frame['separations'].fillna(0)
    frame['separation_rate'] = np.where(
        frame['employment'] > 0,
        (frame['separations'] / frame['employment']) * 100,
        0
    )
    return frame

# Annual rates: baseline employment joined with separation counts per FY
annual_data = _fill_and_rate(pd.merge(
    employment_df,
    separations_df,
    on=['fiscal_year', 'agysub'],
    how='outer'
))
# Tenure-specific rates: same join, additionally keyed on tenure band
tenure_data = _fill_and_rate(pd.merge(
    tenure_employment_df,
    tenure_separations_df,
    on=['fiscal_year', 'agysub', 'loslvl'],
    how='outer'
))
# Period averages: unweighted means across the fiscal years present
avg_employment = annual_data['employment'].mean()
avg_separations = annual_data['separations'].mean()
avg_separation_rate = annual_data['separation_rate'].mean()
AMS Annual Separation Rates by Fiscal Year
Show code
# Build the annual-rates display table for great_tables
display_cols = ['fiscal_year', 'employment', 'separations', 'separation_rate']
annual_display = annual_data[display_cols].copy()
annual_display['fiscal_year'] = annual_display['fiscal_year'].astype(int)
annual_display.columns = ['FY', 'Employment', 'Separations', 'Rate (%)']
# Append a summary row holding the period averages
summary_row = {
    'FY': 'Average',
    'Employment': int(avg_employment),
    'Separations': int(avg_separations),
    'Rate (%)': round(avg_separation_rate, 1),
}
annual_display = pd.concat([annual_display, pd.DataFrame([summary_row])], ignore_index=True)
# Assemble the table step by step: header, number formats, then a
# white-to-red gradient on the rate column scaled to the observed maximum.
gt_annual = GT(annual_display)
gt_annual = gt_annual.tab_header(
    title="Agricultural Marketing Service",
    subtitle="Annual separation rates by fiscal year"
)
gt_annual = gt_annual.fmt_number(columns=['Employment', 'Separations'], decimals=0, use_seps=True)
gt_annual = gt_annual.fmt_number(columns=['Rate (%)'], decimals=1)
gt_annual = gt_annual.data_color(
    columns=['Rate (%)'],
    palette=["white", HIGH_SEPARATIONS_COLOR],
    domain=[0, annual_display['Rate (%)'].max()],
    na_color="lightgray"
)
gt_annual.show()
Annual separation rates by fiscal year
2016
3,547
585
16.5
2017
3,524
482
13.7
2018
3,512
428
12.2
2019
4,413
482
10.9
2020
4,375
596
13.6
2021
4,355
469
10.8
2022
4,318
484
11.2
Average
4,006
503
12.7
AMS Separations by Employee Tenure
Show code
# Calculate tenure summary: average annual employment/separations/rate per band
tenure_summary = tenure_data.groupby('loslvl').agg({
    'employment': 'mean',
    'separations': 'mean',
    'separation_rate': 'mean'
}).reset_index()
# One-shot lookup of mean rate by tenure code -- replaces re-filtering the
# frame twice per band with boolean masks, which was both slow and unreadable.
rate_by_tenure = tenure_summary.set_index('loslvl')['separation_rate'].to_dict()
# Create bar chart visualization
fig, ax = plt.subplots(figsize=(10, 6))
tenure_labels_list = ['<1 Year', '1-2 Years', '2+ Years', 'Overall']
# Missing tenure bands fall back to 0, matching the original behavior
tenure_rates = [rate_by_tenure.get(code, 0) for code in ('A', 'B', 'C+')]
tenure_rates.append(avg_separation_rate)
# Create bar chart (red / orange / green / blue per band)
bars = ax.bar(tenure_labels_list, tenure_rates, color=['#d62728', '#ff7f0e', '#2ca02c', '#1f77b4'])
# Add value labels just above each bar
for bar, rate in zip(bars, tenure_rates):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
            f'{rate:.1f}%', ha='center', va='bottom', fontsize=11)
ax.set_ylabel('Separation Rate (%)', fontsize=12)
ax.set_title('Agricultural Marketing Service Separations by Tenure\nAverage Annual Separation Rates (FY2016-2022)',
             fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim(0, max(tenure_rates) * 1.1)
# Add average employment as subtitle below the axes
plt.text(0.5, -0.15, f'Average Employment: {int(avg_employment):,}',
         ha='center', transform=ax.transAxes, fontsize=11, style='italic')
plt.tight_layout()
plt.show()
AMS Monthly Net Change Heatmap
Show code
# Load Monthly Data for Heatmap
# Accessions: count records per (year, month) for 2016-2025.
accessions_files = [
    "fedscope_accessions_FY2015-2019.parquet",
    "fedscope_accessions_FY2020-2024.parquet",
    "fedscope_accessions_March_2025.parquet"
]
monthly_accessions = []
for file in accessions_files:
    file_path = f"{SEPARATIONS_PARQUET_DIR}{file}"
    try:
        df = pd.read_parquet(file_path)
    except (FileNotFoundError, OSError):
        continue  # missing file -- best-effort load (was a bare except)
    # .copy() so the derived year/month columns below are written to an
    # independent frame, not a view of df (SettingWithCopy hazard before).
    ams_data = df[df['agysub'] == AMS_CODE].copy()
    if not ams_data.empty:
        efdate_str = ams_data['efdate'].astype(str)
        ams_data['year'] = efdate_str.str[:4].astype(int)
        ams_data['month'] = efdate_str.str[4:6].astype(int)
        ams_data = ams_data[ams_data['year'].between(2016, 2025)]
        monthly = ams_data.groupby(['year', 'month']).size().reset_index(name='count')
        monthly_accessions.append(monthly)
    del df
    gc.collect()
# Separations: same per-(year, month) counting over the separations files
# (the files were already read above for annual totals; re-read here for the
# monthly grain).
monthly_separations = []
for file in separations_files:
    file_path = f"{SEPARATIONS_PARQUET_DIR}{file}"
    try:
        df = pd.read_parquet(file_path)
    except (FileNotFoundError, OSError):
        continue  # missing file -- best-effort load (was a bare except)
    # .copy() so the derived year/month columns are not written into a view
    ams_data = df[df['agysub'] == AMS_CODE].copy()
    if not ams_data.empty:
        efdate_str = ams_data['efdate'].astype(str)
        ams_data['year'] = efdate_str.str[:4].astype(int)
        ams_data['month'] = efdate_str.str[4:6].astype(int)
        ams_data = ams_data[ams_data['year'].between(2016, 2025)]
        monthly = ams_data.groupby(['year', 'month']).size().reset_index(name='count')
        monthly_separations.append(monthly)
    del df
    gc.collect()
if monthly_accessions and monthly_separations:
    # Collapse the per-file counts into one (year, month) series each
    acc_df = pd.concat(monthly_accessions).groupby(['year', 'month'])['count'].sum().reset_index()
    acc_df.rename(columns={'count': 'accessions'}, inplace=True)
    sep_df = pd.concat(monthly_separations).groupby(['year', 'month'])['count'].sum().reset_index()
    sep_df.rename(columns={'count': 'separations'}, inplace=True)
    # Merge and calculate net change. Plain subtraction already propagates
    # NaN when either side is missing, so net_change is defined only for
    # months with BOTH counts -- this replaces the previous fillna(0)
    # subtraction followed by re-masking with NaN (same result, one step).
    monthly_combined = pd.merge(acc_df, sep_df, on=['year', 'month'], how='outer')
    monthly_combined['net_change'] = monthly_combined['accessions'] - monthly_combined['separations']
    # Pivot to month rows x year columns
    net_pivot = monthly_combined.pivot(index='month', columns='year', values='net_change')
    # Table version: month numbers -> month names, year columns -> strings
    net_df = net_pivot.reset_index()
    net_df['month'] = pd.to_datetime(net_df['month'], format='%m').dt.strftime('%B')
    net_df.columns = ['Month'] + [str(col) for col in net_df.columns[1:]]
    # Symmetric color domain so zero sits on white between red and green
    max_abs_val = np.nanmax(np.abs(net_pivot.values)) if not net_pivot.empty else 100
    # Create GT table
    year_columns = [col for col in net_df.columns if col != 'Month']
    gt_net = (
        GT(net_df)
        .tab_header(
            title=f"{AMS_NAME}",
            subtitle=f"Monthly Net Change (2016-2025) - Average Separation Rate: {avg_separation_rate:.0f}%"
        )
        .fmt_number(columns=year_columns, decimals=0, use_seps=True)
        .data_color(
            columns=year_columns,
            palette=[NET_NEGATIVE_COLOR, "white", NET_POSITIVE_COLOR],
            domain=[-max_abs_val, max_abs_val],
            na_color="lightgray"
        )
        .cols_width({
            'Month': '100px'
        })
    )
    gt_net.show()
Monthly Net Change (2016-2025) - Average Separation Rate: 13%
January
−21
−16
−1
−21
−3
−17
17
−16
3
February
−31
−8
−11
6
17
2
0
2
March
−55
−9
−13
−17
23
−4
−17
−12
April
−10
−45
5
−52
−23
−9
−3
−62
−6
May
46
−33
12
30
−7
14
4
−75
June
−46
−18
−1
8
7
−15
−10
−5
−50
July
−15
−12
−25
−29
−24
−15
−22
16
−18
August
−4
10
−3
−4
−26
48
14
3
6
September
20
8
−20
46
24
−8
13
57
16
October
112
145
87
202
33
98
14
60
November
123
158
55
51
52
106
62
33
December
44
37
−4
−42
−2
16
−49
−19
AMS Employment Over Time
Show code
# Employment Timeline: total AMS employment from each March and September
# snapshot, 2013-2025. The March and September branches were copy-paste
# duplicates; they are folded into one inner loop (same iteration order:
# March before September within each year).
employment_timeline = []
for year in range(2013, 2026):
    for month_name in ('March', 'September'):
        snap_path = f"{EMPLOYMENT_PARQUET_DIR}fedscope_employment_{month_name}_{year}.parquet"
        if not Path(snap_path).exists():
            continue
        try:
            df = pd.read_parquet(snap_path)
        except (OSError, ValueError):
            continue  # unreadable snapshot -- skip (was a bare except)
        # .copy() so the employment_num assignment is not made on a view
        ams_data = df[df['agysub'] == AMS_CODE].copy()
        if not ams_data.empty:
            ams_data['employment_num'] = ams_data['employment'].apply(clean_employment)
            employment_timeline.append({
                'period': f"{month_name} {year}",
                'year': year,
                'month': month_name,
                'employment': ams_data['employment_num'].sum()
            })
        del df
        gc.collect()
if employment_timeline:
    timeline_df = pd.DataFrame(employment_timeline)
    # Chronological order: March (1) sorts before September (2) within a year
    timeline_df['sort_key'] = timeline_df['year'] * 10 + timeline_df['month'].map({'March': 1, 'September': 2})
    timeline_df = timeline_df.sort_values('sort_key')
    # Create employment timeline graph
    plt.figure(figsize=(12, 6))
    # Separate March and September data for differently-colored markers
    march_data = timeline_df[timeline_df['month'] == 'March']
    sept_data = timeline_df[timeline_df['month'] == 'September']
    # Faint connector line through all points
    plt.plot(timeline_df['period'], timeline_df['employment'],
             'k-', alpha=0.3, linewidth=1)
    # Plot points
    if not march_data.empty:
        plt.scatter(march_data['period'], march_data['employment'],
                    color='blue', s=100, label='March', zorder=5)
    if not sept_data.empty:
        plt.scatter(sept_data['period'], sept_data['employment'],
                    color='orange', s=100, label='September', zorder=5)
    plt.xlabel('Period', fontsize=12)
    plt.ylabel('Total Employment', fontsize=12)
    # Use the agency name resolved from the data (AMS_NAME), consistent with
    # the GT tables above, instead of the previous hard-coded title string.
    plt.title(f'{AMS_NAME} Employment Over Time',
              fontsize=14, fontweight='bold')
    # Rotate x-axis labels so period names don't overlap
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3)
    # Start y-axis at 0 so level changes aren't exaggerated
    plt.ylim(bottom=0)
    # Thousands separators on the y-axis
    ax = plt.gca()
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x):,}'))
    plt.legend()
    plt.tight_layout()
    plt.show()