Load and Prepare Data
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.6
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.1 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.2
✔ purrr 1.2.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (haven)
library (ggplot2)
library (MetBrewer)
library (oaxaca)
Please cite as:
Hlavac, Marek (2022). oaxaca: Blinder-Oaxaca Decomposition in R.
R package version 0.1.5. https://CRAN.R-project.org/package=oaxaca
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
Full dataset loaded as df_acs using read_dta().
Sample Restrictions
# sample restrictions
df_acs = df_acs %>%
filter (age > 24 & age < 64 ) %>% # Focus on working age
filter (incwage > 0 & incwage < 999998 ) %>% # Exclude non-earners/missing
filter (uhrswork >= 30 & uhrswork < 99 ) %>% # Exclude people who didn't work full-time
filter (classwkrd != 10 ) %>% # Exclude self-employed
filter (wkswork2 >= 3 ) # Exclude people who worked less than 27 weeks last year
dim (df_acs)
Add/Modify Variables
# make doubles into integers for better memory space
df_acs$ year = as.integer (df_acs$ year)
df_acs$ age = as.integer (df_acs$ age)
df_acs$ educ = as.integer (df_acs$ educ)
df_acs$ uhrswork = as.integer (df_acs$ uhrswork)
df_acs$ wkswork2 = as.integer (df_acs$ wkswork2)
# make doubles into factors
df_acs$ ind1990 = as.factor (df_acs$ ind1990)
df_acs$ statefip = as.factor (df_acs$ statefip)
# binary gender variable
df_acs$ female = ifelse (df_acs$ sex == 2 , 1 L, 0 L)
# move column to after sex
df_acs = df_acs %>%
relocate (female, .after = sex)
# log wages
df_acs$ log_wage = log (df_acs$ incwage)
# move column to after incwage
df_acs = df_acs %>%
relocate (log_wage, .after = incwage)
# recode disability variables to 0 and 1
df_acs$ diffrem = df_acs$ diffrem - 1
df_acs$ diffphys = df_acs$ diffphys - 1
df_acs$ diffmob = df_acs$ diffmob - 1
df_acs$ diffcare = df_acs$ diffcare - 1
df_acs$ diffsens = df_acs$ diffsens - 1
df_acs$ diffeye = df_acs$ diffeye - 1
df_acs$ diffhear = df_acs$ diffhear - 1
# any disability binary
df_acs$ disability = ifelse (
df_acs$ diffrem == 1 |
df_acs$ diffphys == 1 |
df_acs$ diffmob == 1 |
df_acs$ diffcare == 1 |
df_acs$ diffsens == 1 |
df_acs$ diffeye == 1 |
df_acs$ diffhear == 1 ,
1 L, 0 L
)
# number of disabilities
df_acs$ n_disabilities = df_acs$ diffrem + df_acs$ diffphys + df_acs$ diffmob + df_acs$ diffcare + df_acs$ diffeye + df_acs$ diffhear
# disability type
df_acs$ disability_type = case_when (
df_acs$ diffrem == 1 ~ "cognitive" ,
df_acs$ diffphys == 1 ~ "ambulatory" ,
df_acs$ diffmob == 1 ~ "independent living" ,
df_acs$ diffcare == 1 ~ "self-care" ,
# df_acs$diffsens == 1 ~ "",
df_acs$ diffeye == 1 & df_acs$ diffhear == 0 ~ "vision only" ,
df_acs$ diffhear == 1 & df_acs$ diffeye == 0 ~ "hearing only" ,
df_acs$ n_disabilities > 1 ~ "multiple" ,
df_acs$ n_disabilities == 0 ~ "none"
)
# education in years
# midpoint used for ranges
educ_map = c (
'0' = 0 , # N/A or no schooling
'1' = 2 , # Nursery school to grade 4
'2' = 6.5 , # Grade 5, 6, 7, or 8
'3' = 9 , # Grade 9
'4' = 10 , # Grade 10
'5' = 11 , # Grade 11
'6' = 12 , # Grade 12
'7' = 13 , # 1 year of college
'8' = 14 , # 2 years of college
'9' = 15 , # 3 years of college
'10' = 16 , # 4 years of college
'11' = 18 # 5+ years of college
)
df_acs$ educ_years = educ_map[as.character (df_acs$ educ)]
# move column to after educ
df_acs = df_acs %>%
relocate (educ_years, .after = educ)
# binary married
df_acs$ married = ifelse (df_acs$ marst == 1 , # married and spouse present
1 L, 0 L)
# work from home
df_acs$ wfh = ifelse (df_acs$ tranwork == 80 , 1 L, 0 L)
Descriptive Statistics
# summary stats
df_acs %>%
group_by (disability) %>%
summarise (
n = n (),
mean_logwage = mean (log_wage),
sd_logwage = sd (log_wage),
mean_educyears = mean (educ_years),
sd_educyears = sd (educ_years),
mean_age = mean (age),
sd_age = sd (age),
.groups = "drop"
) %>%
round (2 )
# A tibble: 2 × 8
disability n mean_logwage sd_logwage mean_educyears sd_educyears mean_age
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 9.30e6 10.9 0.77 14.2 2.91 43.8
2 1 5.22e5 10.7 0.77 13.4 2.93 47.1
# ℹ 1 more variable: sd_age <dbl>
df_acs %>%
group_by (disability) %>%
summarise (
mean_hours = mean (uhrswork),
sd_hours = sd (uhrswork),
mean_married = mean (married),
sd_married = sd (married),
mean_wfh = mean (wfh),
sd_wfh = sd (wfh),
mean_female = mean (female),
sd_female = sd (female),
.groups = "drop"
) %>%
round (2 )
# A tibble: 2 × 9
disability mean_hours sd_hours mean_married sd_married mean_wfh sd_wfh
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 43.1 8.03 0.61 0.49 0.1 0.3
2 1 42.9 8.54 0.52 0.5 0.09 0.29
# ℹ 2 more variables: mean_female <dbl>, sd_female <dbl>
# year distribution
df_acs %>%
group_by (year, disability) %>%
summarise (
count = n (),
.groups = "drop"
) %>%
ggplot (aes (x = factor (year), y = count, fill = factor (disability))) +
geom_col (position = "dodge" ) +
scale_fill_manual (values = c ("#447861" , "#b1a1cc" )) +
labs (
title = "Distribution of Disabled and Non-Disabled" ,
y = "Count" ,
x = "Year" ,
fill = "Disability"
) +
theme_linedraw () +
theme (text = element_text (family = "serif" ))
df_acs %>%
group_by (disability_type) %>%
summarise (n = n ())
# A tibble: 8 × 2
disability_type n
<chr> <int>
1 ambulatory 130210
2 cognitive 154126
3 hearing only 120969
4 independent living 20586
5 multiple 8908
6 none 9300072
7 self-care 4253
8 vision only 82599
# work from home
df_acs %>%
group_by (disability, year) %>%
summarise (
wfh = sum (wfh),
prop = (sum (wfh) / n ()) * 100
) %>%
pivot_wider (
names_from = disability,
names_glue = "{.value}_{disability}" ,
values_from = c (wfh, prop)
) %>%
round (2 )
`summarise()` has grouped output by 'disability'. You can override using the
`.groups` argument.
# A tibble: 10 × 5
year wfh_0 wfh_1 prop_0 prop_1
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2015 32464 1535 3.53 3.3
2 2016 36793 1727 3.96 3.54
3 2017 39334 1921 4.14 3.9
4 2018 41285 1916 4.28 3.91
5 2019 45691 2131 4.76 4.28
6 2020 112074 4592 15.2 11.8
7 2021 178756 8256 19.8 15.9
8 2022 159697 8435 16.4 14.1
9 2023 145987 8331 14.9 13.2
10 2024 139975 8427 14.2 13.1
df_acs %>%
group_by (year) %>%
summarise (
n = sum (wfh),
prop = (sum (wfh) / n ()) * 100
) %>%
round (2 )
# A tibble: 10 × 3
year n prop
<dbl> <dbl> <dbl>
1 2015 33999 3.52
2 2016 38520 3.94
3 2017 41255 4.12
4 2018 43201 4.26
5 2019 47822 4.73
6 2020 116666 15.0
7 2021 187012 19.6
8 2022 168132 16.3
9 2023 154318 14.8
10 2024 148402 14.2
# remote work over time
df_acs %>%
ggplot (aes (x = factor (year),
fill = factor (ifelse (wfh == 0 , "Non-Remote" , ifelse (disability == 1 , "Disabled Remote" , "Non-Disabled Remote" )))
)) +
geom_bar () +
scale_fill_manual (values = c ("#c44d76" , "#b1a1cc" , "#447861" )) +
labs (
title = "Distribution of Remote Work" ,
y = "Count" ,
x = "Year" ,
fill = "Category"
) +
theme_linedraw () +
theme (text = element_text (family = "serif" ))
# gender summary stats
# female
df_acs %>%
filter (female == 1 ) %>%
group_by (disability) %>%
summarise (
n = n (),
mean_logwage = mean (log_wage),
sd_logwage = sd (log_wage),
mean_educyears = mean (educ_years),
sd_educyears = sd (educ_years),
mean_age = mean (age),
sd_age = sd (age),
mean_hours = mean (uhrswork),
sd_hours = sd (uhrswork),
mean_married = mean (married),
sd_married = sd (married),
mean_wfh = mean (wfh),
sd_wfh = sd (wfh),
mean_female = mean (female),
sd_female = sd (female),
.groups = "drop"
) %>%
round (2 )
# A tibble: 2 × 16
disability n mean_logwage sd_logwage mean_educyears sd_educyears mean_age
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 4.28e6 10.8 0.73 14.5 2.78 43.9
2 1 2.41e5 10.5 0.73 13.7 2.86 46.9
# ℹ 9 more variables: sd_age <dbl>, mean_hours <dbl>, sd_hours <dbl>,
# mean_married <dbl>, sd_married <dbl>, mean_wfh <dbl>, sd_wfh <dbl>,
# mean_female <dbl>, sd_female <dbl>
# male
df_acs %>%
filter (female == 0 ) %>%
group_by (disability) %>%
summarise (
n = n (),
mean_logwage = mean (log_wage),
sd_logwage = sd (log_wage),
mean_educyears = mean (educ_years),
sd_educyears = sd (educ_years),
mean_age = mean (age),
sd_age = sd (age),
mean_hours = mean (uhrswork),
sd_hours = sd (uhrswork),
mean_married = mean (married),
sd_married = sd (married),
mean_wfh = mean (wfh),
sd_wfh = sd (wfh),
mean_female = mean (female),
sd_female = sd (female),
.groups = "drop"
) %>%
round (2 )
# A tibble: 2 × 16
disability n mean_logwage sd_logwage mean_educyears sd_educyears mean_age
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 5.02e6 11.0 0.78 13.9 3 43.7
2 1 2.81e5 10.8 0.78 13.2 2.97 47.2
# ℹ 9 more variables: sd_age <dbl>, mean_hours <dbl>, sd_hours <dbl>,
# mean_married <dbl>, sd_married <dbl>, mean_wfh <dbl>, sd_wfh <dbl>,
# mean_female <dbl>, sd_female <dbl>
Linear Regresion
mod_female = lm (log_wage ~ disability + age + educ_years + uhrswork + married + statefip + wfh + factor (year), df_acs[df_acs$ female == 1 , ])
summary (mod_female)
Call:
lm(formula = log_wage ~ disability + age + educ_years + uhrswork +
married + statefip + wfh + factor(year), data = df_acs[df_acs$female ==
1, ])
Residuals:
Min 1Q Median 3Q Max
-10.1072 -0.3184 0.0221 0.3589 4.2936
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.664e+00 3.591e-03 2134.050 < 2e-16 ***
disability -1.428e-01 1.293e-03 -110.396 < 2e-16 ***
age 7.768e-03 2.670e-05 290.896 < 2e-16 ***
educ_years 9.818e-02 1.068e-04 918.854 < 2e-16 ***
uhrswork 2.273e-02 4.107e-05 553.449 < 2e-16 ***
married 1.094e-01 5.962e-04 183.521 < 2e-16 ***
statefip2 2.219e-01 7.256e-03 30.582 < 2e-16 ***
statefip4 1.468e-01 3.205e-03 45.799 < 2e-16 ***
statefip5 -9.357e-03 3.970e-03 -2.357 0.018426 *
statefip6 3.154e-01 2.601e-03 121.293 < 2e-16 ***
statefip8 1.923e-01 3.230e-03 59.546 < 2e-16 ***
statefip9 3.117e-01 3.604e-03 86.491 < 2e-16 ***
statefip10 2.060e-01 5.838e-03 35.284 < 2e-16 ***
statefip11 5.163e-01 5.778e-03 89.363 < 2e-16 ***
statefip12 8.107e-02 2.720e-03 29.802 < 2e-16 ***
statefip13 9.022e-02 2.941e-03 30.682 < 2e-16 ***
statefip15 1.661e-01 4.914e-03 33.806 < 2e-16 ***
statefip16 5.851e-03 4.920e-03 1.189 0.234395
statefip17 1.695e-01 2.833e-03 59.832 < 2e-16 ***
statefip18 5.388e-02 3.172e-03 16.989 < 2e-16 ***
statefip19 4.286e-02 3.724e-03 11.510 < 2e-16 ***
statefip20 1.131e-02 3.886e-03 2.910 0.003616 **
statefip21 3.339e-02 3.506e-03 9.526 < 2e-16 ***
statefip22 6.146e-03 3.539e-03 1.736 0.082483 .
statefip23 5.735e-02 5.155e-03 11.125 < 2e-16 ***
statefip24 3.243e-01 3.138e-03 103.344 < 2e-16 ***
statefip25 3.418e-01 3.077e-03 111.065 < 2e-16 ***
statefip26 8.562e-02 2.982e-03 28.712 < 2e-16 ***
statefip27 1.585e-01 3.227e-03 49.112 < 2e-16 ***
statefip28 -7.821e-02 3.999e-03 -19.558 < 2e-16 ***
statefip29 4.191e-02 3.207e-03 13.068 < 2e-16 ***
statefip30 -2.673e-02 5.867e-03 -4.556 5.22e-06 ***
statefip31 2.987e-02 4.347e-03 6.871 6.37e-12 ***
statefip32 1.733e-01 3.931e-03 44.094 < 2e-16 ***
statefip33 1.917e-01 4.884e-03 39.256 < 2e-16 ***
statefip34 3.188e-01 2.964e-03 107.552 < 2e-16 ***
statefip35 4.933e-02 4.675e-03 10.551 < 2e-16 ***
statefip36 2.654e-01 2.706e-03 98.093 < 2e-16 ***
statefip37 7.288e-02 2.939e-03 24.794 < 2e-16 ***
statefip38 6.900e-02 6.306e-03 10.942 < 2e-16 ***
statefip39 9.968e-02 2.878e-03 34.631 < 2e-16 ***
statefip40 -1.339e-02 3.724e-03 -3.596 0.000323 ***
statefip41 1.811e-01 3.571e-03 50.719 < 2e-16 ***
statefip42 1.246e-01 2.840e-03 43.880 < 2e-16 ***
statefip44 2.484e-01 5.482e-03 45.319 < 2e-16 ***
statefip45 2.444e-02 3.389e-03 7.213 5.47e-13 ***
statefip46 -1.849e-03 5.929e-03 -0.312 0.755193
statefip47 4.268e-02 3.171e-03 13.460 < 2e-16 ***
statefip48 1.123e-01 2.656e-03 42.273 < 2e-16 ***
statefip49 1.046e-01 4.044e-03 25.871 < 2e-16 ***
statefip50 9.662e-02 6.696e-03 14.431 < 2e-16 ***
statefip51 2.177e-01 2.985e-03 72.918 < 2e-16 ***
statefip53 2.829e-01 3.099e-03 91.291 < 2e-16 ***
statefip54 -6.566e-03 4.854e-03 -1.353 0.176103
statefip55 9.727e-02 3.201e-03 30.382 < 2e-16 ***
statefip56 1.547e-02 7.386e-03 2.095 0.036152 *
wfh 1.901e-01 9.566e-04 198.757 < 2e-16 ***
factor(year)2016 2.293e-02 1.301e-03 17.623 < 2e-16 ***
factor(year)2017 4.999e-02 1.293e-03 38.660 < 2e-16 ***
factor(year)2018 7.226e-02 1.289e-03 56.046 < 2e-16 ***
factor(year)2019 1.105e-01 1.290e-03 85.668 < 2e-16 ***
factor(year)2020 1.292e-01 1.387e-03 93.170 < 2e-16 ***
factor(year)2021 1.511e-01 1.318e-03 114.573 < 2e-16 ***
factor(year)2022 2.023e-01 1.289e-03 156.931 < 2e-16 ***
factor(year)2023 2.633e-01 1.283e-03 205.341 < 2e-16 ***
factor(year)2024 3.157e-01 1.281e-03 246.532 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6134 on 4519432 degrees of freedom
Multiple R-squared: 0.3007, Adjusted R-squared: 0.3007
F-statistic: 2.99e+04 on 65 and 4519432 DF, p-value: < 2.2e-16
mod_male = lm (log_wage ~ disability + age + educ_years + uhrswork + married + statefip + wfh + factor (year), df_acs[df_acs$ female == 0 , ])
summary (mod_male)
Call:
lm(formula = log_wage ~ disability + age + educ_years + uhrswork +
married + statefip + wfh + factor(year), data = df_acs[df_acs$female ==
0, ])
Residuals:
Min 1Q Median 3Q Max
-10.6314 -0.3494 0.0176 0.3743 4.0132
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.077e+00 3.440e-03 2348.174 < 2e-16 ***
disability -1.636e-01 1.287e-03 -127.067 < 2e-16 ***
age 9.914e-03 2.705e-05 366.536 < 2e-16 ***
educ_years 9.130e-02 9.817e-05 930.014 < 2e-16 ***
uhrswork 1.797e-02 3.372e-05 533.027 < 2e-16 ***
married 2.675e-01 6.217e-04 430.328 < 2e-16 ***
statefip2 8.954e-02 7.025e-03 12.746 < 2e-16 ***
statefip4 7.322e-02 3.175e-03 23.059 < 2e-16 ***
statefip5 -5.544e-02 4.000e-03 -13.860 < 2e-16 ***
statefip6 2.231e-01 2.582e-03 86.407 < 2e-16 ***
statefip8 1.385e-01 3.177e-03 43.607 < 2e-16 ***
statefip9 2.626e-01 3.627e-03 72.400 < 2e-16 ***
statefip10 9.465e-02 6.005e-03 15.763 < 2e-16 ***
statefip11 3.733e-01 6.226e-03 59.965 < 2e-16 ***
statefip12 1.993e-03 2.719e-03 0.733 0.463545
statefip13 4.868e-02 2.939e-03 16.566 < 2e-16 ***
statefip15 8.680e-02 4.897e-03 17.726 < 2e-16 ***
statefip16 -1.154e-02 4.634e-03 -2.491 0.012742 *
statefip17 1.247e-01 2.819e-03 44.233 < 2e-16 ***
statefip18 3.314e-02 3.129e-03 10.592 < 2e-16 ***
statefip19 -2.455e-03 3.723e-03 -0.659 0.509712
statefip20 -3.799e-03 3.845e-03 -0.988 0.323064
statefip21 -1.258e-02 3.498e-03 -3.597 0.000322 ***
statefip22 4.410e-02 3.597e-03 12.262 < 2e-16 ***
statefip23 -3.624e-02 5.302e-03 -6.836 8.15e-12 ***
statefip24 2.288e-01 3.172e-03 72.133 < 2e-16 ***
statefip25 2.657e-01 3.094e-03 85.869 < 2e-16 ***
statefip26 3.967e-02 2.946e-03 13.466 < 2e-16 ***
statefip27 8.261e-02 3.220e-03 25.653 < 2e-16 ***
statefip28 -8.505e-02 4.146e-03 -20.514 < 2e-16 ***
statefip29 -2.778e-03 3.210e-03 -0.865 0.386832
statefip30 -8.354e-02 5.770e-03 -14.478 < 2e-16 ***
statefip31 -2.746e-02 4.331e-03 -6.339 2.31e-10 ***
statefip32 8.563e-02 3.864e-03 22.160 < 2e-16 ***
statefip33 1.597e-01 4.896e-03 32.620 < 2e-16 ***
statefip34 2.652e-01 2.953e-03 89.817 < 2e-16 ***
statefip35 -1.606e-02 4.742e-03 -3.387 0.000706 ***
statefip36 1.664e-01 2.708e-03 61.440 < 2e-16 ***
statefip37 1.139e-02 2.936e-03 3.879 0.000105 ***
statefip38 4.584e-02 6.155e-03 7.447 9.53e-14 ***
statefip39 4.842e-02 2.862e-03 16.916 < 2e-16 ***
statefip40 -2.769e-02 3.682e-03 -7.520 5.47e-14 ***
statefip41 9.146e-02 3.524e-03 25.950 < 2e-16 ***
statefip42 7.131e-02 2.825e-03 25.237 < 2e-16 ***
statefip44 1.491e-01 5.611e-03 26.569 < 2e-16 ***
statefip45 -1.085e-02 3.415e-03 -3.178 0.001482 **
statefip46 -7.769e-02 5.988e-03 -12.973 < 2e-16 ***
statefip47 -5.404e-03 3.165e-03 -1.708 0.087699 .
statefip48 9.167e-02 2.636e-03 34.781 < 2e-16 ***
statefip49 1.072e-01 3.685e-03 29.076 < 2e-16 ***
statefip50 -2.532e-02 6.933e-03 -3.652 0.000260 ***
statefip51 1.694e-01 2.976e-03 56.927 < 2e-16 ***
statefip53 2.464e-01 3.035e-03 81.182 < 2e-16 ***
statefip54 -3.512e-02 4.802e-03 -7.314 2.60e-13 ***
statefip55 4.354e-02 3.183e-03 13.677 < 2e-16 ***
statefip56 3.908e-02 7.070e-03 5.527 3.25e-08 ***
wfh 2.127e-01 1.026e-03 207.231 < 2e-16 ***
factor(year)2016 2.460e-02 1.285e-03 19.139 < 2e-16 ***
factor(year)2017 4.811e-02 1.278e-03 37.632 < 2e-16 ***
factor(year)2018 7.544e-02 1.274e-03 59.190 < 2e-16 ***
factor(year)2019 1.167e-01 1.276e-03 91.464 < 2e-16 ***
factor(year)2020 1.342e-01 1.371e-03 97.860 < 2e-16 ***
factor(year)2021 1.522e-01 1.303e-03 116.756 < 2e-16 ***
factor(year)2022 2.011e-01 1.277e-03 157.454 < 2e-16 ***
factor(year)2023 2.610e-01 1.272e-03 205.141 < 2e-16 ***
factor(year)2024 3.099e-01 1.272e-03 243.561 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.66 on 5302159 degrees of freedom
Multiple R-squared: 0.295, Adjusted R-squared: 0.295
F-statistic: 3.413e+04 on 65 and 5302159 DF, p-value: < 2.2e-16
mod_int_female = lm (log_wage ~ disability* (educ_years + age + uhrswork + wfh) + married + statefip + factor (year), df_acs[df_acs$ female == 1 , ])
summary (mod_int_female)
Call:
lm(formula = log_wage ~ disability * (educ_years + age + uhrswork +
wfh) + married + statefip + factor(year), data = df_acs[df_acs$female ==
1, ])
Residuals:
Min 1Q Median 3Q Max
-10.1092 -0.3183 0.0222 0.3589 4.1621
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.644e+00 3.642e-03 2098.710 < 2e-16 ***
disability 1.955e-01 1.069e-02 18.294 < 2e-16 ***
educ_years 9.932e-02 1.100e-04 903.181 < 2e-16 ***
age 7.783e-03 2.745e-05 283.533 < 2e-16 ***
uhrswork 2.281e-02 4.237e-05 538.326 < 2e-16 ***
wfh 1.905e-01 9.808e-04 194.248 < 2e-16 ***
married 1.092e-01 5.961e-04 183.201 < 2e-16 ***
statefip2 2.221e-01 7.255e-03 30.621 < 2e-16 ***
statefip4 1.469e-01 3.205e-03 45.833 < 2e-16 ***
statefip5 -9.452e-03 3.969e-03 -2.381 0.01725 *
statefip6 3.154e-01 2.600e-03 121.308 < 2e-16 ***
statefip8 1.923e-01 3.229e-03 59.544 < 2e-16 ***
statefip9 3.113e-01 3.603e-03 86.401 < 2e-16 ***
statefip10 2.059e-01 5.836e-03 35.280 < 2e-16 ***
statefip11 5.153e-01 5.776e-03 89.199 < 2e-16 ***
statefip12 8.121e-02 2.720e-03 29.862 < 2e-16 ***
statefip13 9.027e-02 2.940e-03 30.707 < 2e-16 ***
statefip15 1.663e-01 4.913e-03 33.843 < 2e-16 ***
statefip16 6.010e-03 4.919e-03 1.222 0.22176
statefip17 1.693e-01 2.832e-03 59.779 < 2e-16 ***
statefip18 5.379e-02 3.171e-03 16.964 < 2e-16 ***
statefip19 4.301e-02 3.723e-03 11.552 < 2e-16 ***
statefip20 1.117e-02 3.885e-03 2.875 0.00404 **
statefip21 3.327e-02 3.505e-03 9.492 < 2e-16 ***
statefip22 6.211e-03 3.538e-03 1.755 0.07921 .
statefip23 5.730e-02 5.154e-03 11.116 < 2e-16 ***
statefip24 3.240e-01 3.137e-03 103.289 < 2e-16 ***
statefip25 3.413e-01 3.076e-03 110.931 < 2e-16 ***
statefip26 8.558e-02 2.981e-03 28.705 < 2e-16 ***
statefip27 1.584e-01 3.226e-03 49.097 < 2e-16 ***
statefip28 -7.814e-02 3.998e-03 -19.544 < 2e-16 ***
statefip29 4.171e-02 3.206e-03 13.008 < 2e-16 ***
statefip30 -2.686e-02 5.866e-03 -4.580 4.66e-06 ***
statefip31 2.980e-02 4.346e-03 6.855 7.11e-12 ***
statefip32 1.736e-01 3.930e-03 44.163 < 2e-16 ***
statefip33 1.918e-01 4.883e-03 39.267 < 2e-16 ***
statefip34 3.185e-01 2.963e-03 107.484 < 2e-16 ***
statefip35 4.957e-02 4.674e-03 10.605 < 2e-16 ***
statefip36 2.652e-01 2.705e-03 98.036 < 2e-16 ***
statefip37 7.284e-02 2.939e-03 24.789 < 2e-16 ***
statefip38 6.923e-02 6.304e-03 10.981 < 2e-16 ***
statefip39 9.949e-02 2.878e-03 34.574 < 2e-16 ***
statefip40 -1.343e-02 3.723e-03 -3.607 0.00031 ***
statefip41 1.810e-01 3.570e-03 50.705 < 2e-16 ***
statefip42 1.245e-01 2.840e-03 43.835 < 2e-16 ***
statefip44 2.482e-01 5.481e-03 45.292 < 2e-16 ***
statefip45 2.452e-02 3.388e-03 7.237 4.59e-13 ***
statefip46 -1.731e-03 5.927e-03 -0.292 0.77023
statefip47 4.259e-02 3.170e-03 13.434 < 2e-16 ***
statefip48 1.122e-01 2.655e-03 42.275 < 2e-16 ***
statefip49 1.047e-01 4.043e-03 25.901 < 2e-16 ***
statefip50 9.650e-02 6.694e-03 14.417 < 2e-16 ***
statefip51 2.175e-01 2.985e-03 72.864 < 2e-16 ***
statefip53 2.829e-01 3.098e-03 91.319 < 2e-16 ***
statefip54 -6.500e-03 4.853e-03 -1.339 0.18042
statefip55 9.732e-02 3.201e-03 30.408 < 2e-16 ***
statefip56 1.586e-02 7.384e-03 2.148 0.03172 *
factor(year)2016 2.290e-02 1.301e-03 17.606 < 2e-16 ***
factor(year)2017 5.000e-02 1.293e-03 38.676 < 2e-16 ***
factor(year)2018 7.225e-02 1.289e-03 56.051 < 2e-16 ***
factor(year)2019 1.104e-01 1.290e-03 85.647 < 2e-16 ***
factor(year)2020 1.291e-01 1.387e-03 93.128 < 2e-16 ***
factor(year)2021 1.510e-01 1.318e-03 114.566 < 2e-16 ***
factor(year)2022 2.022e-01 1.289e-03 156.938 < 2e-16 ***
factor(year)2023 2.634e-01 1.282e-03 205.391 < 2e-16 ***
factor(year)2024 3.158e-01 1.280e-03 246.612 < 2e-16 ***
disability:educ_years -2.003e-02 4.569e-04 -43.845 < 2e-16 ***
disability:age -1.542e-04 1.151e-04 -1.339 0.18047
disability:uhrswork -1.346e-03 1.715e-04 -7.847 4.25e-15 ***
disability:wfh -8.656e-03 4.223e-03 -2.050 0.04037 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6133 on 4519428 degrees of freedom
Multiple R-squared: 0.301, Adjusted R-squared: 0.301
F-statistic: 2.821e+04 on 69 and 4519428 DF, p-value: < 2.2e-16
mod_int_male = lm (log_wage ~ disability* (educ_years + age + uhrswork + wfh) + married + statefip + factor (year), df_acs[df_acs$ female == 0 , ])
summary (mod_int_male)
Call:
lm(formula = log_wage ~ disability * (educ_years + age + uhrswork +
wfh) + married + statefip + factor(year), data = df_acs[df_acs$female ==
0, ])
Residuals:
Min 1Q Median 3Q Max
-10.6371 -0.3493 0.0177 0.3743 4.0010
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.059e+00 3.481e-03 2315.143 < 2e-16 ***
disability 1.726e-01 9.905e-03 17.423 < 2e-16 ***
educ_years 9.251e-02 1.008e-04 917.766 < 2e-16 ***
age 9.978e-03 2.778e-05 359.175 < 2e-16 ***
uhrswork 1.794e-02 3.475e-05 516.311 < 2e-16 ***
wfh 2.124e-01 1.050e-03 202.259 < 2e-16 ***
married 2.674e-01 6.215e-04 430.273 < 2e-16 ***
statefip2 8.968e-02 7.023e-03 12.770 < 2e-16 ***
statefip4 7.338e-02 3.175e-03 23.116 < 2e-16 ***
statefip5 -5.555e-02 3.999e-03 -13.890 < 2e-16 ***
statefip6 2.231e-01 2.581e-03 86.428 < 2e-16 ***
statefip8 1.385e-01 3.176e-03 43.595 < 2e-16 ***
statefip9 2.621e-01 3.626e-03 72.294 < 2e-16 ***
statefip10 9.435e-02 6.003e-03 15.717 < 2e-16 ***
statefip11 3.722e-01 6.224e-03 59.796 < 2e-16 ***
statefip12 2.155e-03 2.718e-03 0.793 0.427826
statefip13 4.865e-02 2.938e-03 16.561 < 2e-16 ***
statefip15 8.703e-02 4.896e-03 17.778 < 2e-16 ***
statefip16 -1.140e-02 4.633e-03 -2.461 0.013865 *
statefip17 1.246e-01 2.818e-03 44.216 < 2e-16 ***
statefip18 3.301e-02 3.128e-03 10.555 < 2e-16 ***
statefip19 -2.353e-03 3.722e-03 -0.632 0.527251
statefip20 -3.746e-03 3.844e-03 -0.975 0.329765
statefip21 -1.280e-02 3.497e-03 -3.660 0.000252 ***
statefip22 4.406e-02 3.596e-03 12.253 < 2e-16 ***
statefip23 -3.621e-02 5.300e-03 -6.832 8.35e-12 ***
statefip24 2.287e-01 3.171e-03 72.109 < 2e-16 ***
statefip25 2.650e-01 3.093e-03 85.682 < 2e-16 ***
statefip26 3.950e-02 2.945e-03 13.412 < 2e-16 ***
statefip27 8.244e-02 3.220e-03 25.606 < 2e-16 ***
statefip28 -8.506e-02 4.145e-03 -20.522 < 2e-16 ***
statefip29 -2.991e-03 3.209e-03 -0.932 0.351306
statefip30 -8.330e-02 5.769e-03 -14.441 < 2e-16 ***
statefip31 -2.748e-02 4.330e-03 -6.346 2.21e-10 ***
statefip32 8.594e-02 3.863e-03 22.246 < 2e-16 ***
statefip33 1.596e-01 4.895e-03 32.600 < 2e-16 ***
statefip34 2.647e-01 2.952e-03 89.680 < 2e-16 ***
statefip35 -1.598e-02 4.741e-03 -3.370 0.000752 ***
statefip36 1.662e-01 2.708e-03 61.389 < 2e-16 ***
statefip37 1.132e-02 2.935e-03 3.856 0.000115 ***
statefip38 4.613e-02 6.153e-03 7.496 6.56e-14 ***
statefip39 4.818e-02 2.862e-03 16.836 < 2e-16 ***
statefip40 -2.775e-02 3.681e-03 -7.538 4.76e-14 ***
statefip41 9.134e-02 3.523e-03 25.923 < 2e-16 ***
statefip42 7.117e-02 2.825e-03 25.196 < 2e-16 ***
statefip44 1.488e-01 5.610e-03 26.519 < 2e-16 ***
statefip45 -1.088e-02 3.414e-03 -3.187 0.001438 **
statefip46 -7.753e-02 5.987e-03 -12.951 < 2e-16 ***
statefip47 -5.564e-03 3.164e-03 -1.759 0.078636 .
statefip48 9.174e-02 2.635e-03 34.818 < 2e-16 ***
statefip49 1.072e-01 3.684e-03 29.089 < 2e-16 ***
statefip50 -2.525e-02 6.931e-03 -3.644 0.000269 ***
statefip51 1.694e-01 2.976e-03 56.927 < 2e-16 ***
statefip53 2.463e-01 3.035e-03 81.155 < 2e-16 ***
statefip54 -3.519e-02 4.801e-03 -7.329 2.32e-13 ***
statefip55 4.370e-02 3.183e-03 13.729 < 2e-16 ***
statefip56 3.938e-02 7.068e-03 5.571 2.53e-08 ***
factor(year)2016 2.457e-02 1.285e-03 19.123 < 2e-16 ***
factor(year)2017 4.804e-02 1.278e-03 37.589 < 2e-16 ***
factor(year)2018 7.543e-02 1.274e-03 59.198 < 2e-16 ***
factor(year)2019 1.166e-01 1.275e-03 91.449 < 2e-16 ***
factor(year)2020 1.341e-01 1.371e-03 97.824 < 2e-16 ***
factor(year)2021 1.522e-01 1.303e-03 116.799 < 2e-16 ***
factor(year)2022 2.011e-01 1.277e-03 157.537 < 2e-16 ***
factor(year)2023 2.609e-01 1.272e-03 205.175 < 2e-16 ***
factor(year)2024 3.099e-01 1.272e-03 243.641 < 2e-16 ***
disability:educ_years -2.287e-02 4.358e-04 -52.473 < 2e-16 ***
disability:age -1.202e-03 1.137e-04 -10.571 < 2e-16 ***
disability:uhrswork 5.183e-04 1.416e-04 3.660 0.000253 ***
disability:wfh -1.053e-03 4.782e-03 -0.220 0.825695
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6598 on 5302155 degrees of freedom
Multiple R-squared: 0.2954, Adjusted R-squared: 0.2953
F-statistic: 3.221e+04 on 69 and 5302155 DF, p-value: < 2.2e-16
Oaxaca-Blinder Decomposition
# conserving computer memory
# delete linear regression models
rm (mod_female, mod_male, mod_int_female, mod_int_male)
# keep only necessary columns for oaxaca decomposition
cols_keep = c ("year" , "log_wage" , "age" , "educ_years" , "uhrswork" , "female" , "married" , "wfh" , "ind1990" , "disability" , "statefip" , "n_disabilities" , "disability_type" )
df_acs = df_acs[, cols_keep]
# make sure non-disabled are reference group A
df_acs$ disability = factor (df_acs$ disability, levels = c (0 , 1 ))
years = unique (df_acs$ year)
detailed_results = list ()
for (yr in years) {
# subset to the specific year
data_year = df_acs[df_acs$ year == yr, ]
# oaxaca decomposition
res = oaxaca (log_wage ~ age + educ_years + uhrswork + female + married + statefip + wfh | disability,
data = data_year, R = 10 )
# pull variables from the reference group (Index 1 = Group A)
vars_w = res$ twofold$ variables[[1 ]]
# extract specific explained components
explained_vals = vars_w[, "coef(explained)" ]
# sum the unexplained components into one total
unexplained_total = sum (vars_w[, "coef(unexplained)" ])
# use grepl to find and sum all dummy variables for a category
sum_category = function (name, vec) {
relevant_rows = vec[grepl (paste0 ("^" , name), names (vec))]
return (sum (relevant_rows, na.rm = TRUE ))
}
# build the data frame for this specific year
df_year_stack = data.frame (
year = yr,
component = c ("Age" , "Education" , "Hours" , "Female" , "Marriage Status" , "State" , "Work From Home" ,"Unexplained" ),
log_points = c (
explained_vals["age" ],
explained_vals["educ_years" ],
explained_vals["uhrswork" ],
explained_vals["female" ],
explained_vals["married" ],
sum_category ("statefip" , explained_vals),
explained_vals["wfh" ],
unexplained_total
)
)
# add results to master list
detailed_results[[as.character (yr)]] = df_year_stack
# delete used variables for better RAM performance
rm (res, data_year, vars_w, explained_vals, unexplained_total, df_year_stack)
gc ()
}
# df for ggplot visualization
plot_df = do.call (rbind, detailed_results)
# convert log points to percent difference
plot_df$ percent_diff = 100 * plot_df$ log_points
female_results = list ()
for (yr in years) {
# subset to the specific year
data_year = df_acs[df_acs$ year == yr & df_acs$ female == 1 , ]
# oaxaca decomposition
res = oaxaca (log_wage ~ age + educ_years + uhrswork + married + statefip + wfh | disability,
data = data_year, R = 10 )
# pull variables from the reference group (Index 1 = Group A)
vars_w = res$ twofold$ variables[[1 ]]
# extract specific explained components
explained_vals = vars_w[, "coef(explained)" ]
# sum the unexplained components into one total
unexplained_total = sum (vars_w[, "coef(unexplained)" ])
# use grepl to find and sum all dummy variables for a category
sum_category = function (name, vec) {
relevant_rows = vec[grepl (paste0 ("^" , name), names (vec))]
return (sum (relevant_rows, na.rm = TRUE ))
}
# build the data frame for this specific year
df_year_stack = data.frame (
year = yr,
component = c ("Age" , "Education" , "Hours" , "Marriage Status" , "State" , "Work From Home" , "Unexplained" ),
log_points = c (
explained_vals["age" ],
explained_vals["educ_years" ],
explained_vals["uhrswork" ],
explained_vals["married" ],
sum_category ("statefip" , explained_vals),
explained_vals["wfh" ],
unexplained_total
)
)
# add results to master list
female_results[[as.character (yr)]] = df_year_stack
# delete used variables for better RAM performance
rm (res, data_year, vars_w, explained_vals, unexplained_total, df_year_stack)
gc ()
}
# df for ggplot visualization
female_plot_df = do.call (rbind, female_results)
# convert log points to percent difference
female_plot_df$ percent_diff = 100 * female_plot_df$ log_points
male_results = list ()
for (yr in years) {
# subset to the specific year
data_year = df_acs[df_acs$ year == yr & df_acs$ female == 0 , ]
# oaxaca decomposition
res = oaxaca (log_wage ~ age + educ_years + uhrswork + married + statefip + wfh | disability,
data = data_year, R = 10 )
# pull variables from the reference group (Index 1 = Group A)
vars_w = res$ twofold$ variables[[1 ]]
# extract specific explained components
explained_vals = vars_w[, "coef(explained)" ]
# sum the unexplained components into one total
unexplained_total = sum (vars_w[, "coef(unexplained)" ])
# use grepl to find and sum all dummy variables for a category
sum_category = function (name, vec) {
relevant_rows = vec[grepl (paste0 ("^" , name), names (vec))]
return (sum (relevant_rows, na.rm = TRUE ))
}
# build the data frame for this specific year
df_year_stack = data.frame (
year = yr,
component = c ("Age" , "Education" , "Hours" , "Marriage Status" , "State" , "Work From Home" , "Unexplained" ),
log_points = c (
explained_vals["age" ],
explained_vals["educ_years" ],
explained_vals["uhrswork" ],
explained_vals["married" ],
sum_category ("statefip" , explained_vals),
explained_vals["wfh" ],
unexplained_total
)
)
# add results to master list
male_results[[as.character (yr)]] = df_year_stack
# delete used variables for better RAM performance
rm (res, data_year, vars_w, explained_vals, unexplained_total, df_year_stack)
gc ()
}
# df for ggplot visualization
male_plot_df = do.call (rbind, male_results)
# convert log points to percent difference
male_plot_df$ percent_diff = 100 * male_plot_df$ log_points
# total wage gap each year
plot_df %>%
group_by (year) %>%
summarise (
total_gap = sum (percent_diff)
) %>%
round (2 )
# A tibble: 10 × 2
year total_gap
<dbl> <dbl>
1 2015 22.0
2 2016 22.2
3 2017 23.2
4 2018 23.1
5 2019 24.3
6 2020 25.2
7 2021 23.8
8 2022 24.1
9 2023 24.5
10 2024 24.2
female_plot_df %>%
group_by (year) %>%
summarise (
total_gap = sum (percent_diff)
) %>%
round (2 )
# A tibble: 10 × 2
year total_gap
<dbl> <dbl>
1 2015 21.6
2 2016 21.9
3 2017 23.2
4 2018 22.8
5 2019 24.6
6 2020 25.6
7 2021 24.3
8 2022 23.8
9 2023 23.9
10 2024 24.1
male_plot_df %>%
group_by (year) %>%
summarise (
total_gap = sum (percent_diff)
) %>%
round (2 )
# A tibble: 10 × 2
year total_gap
<dbl> <dbl>
1 2015 23.0
2 2016 22.8
3 2017 23.7
4 2018 23.4
5 2019 24.5
6 2020 25.0
7 2021 23.0
8 2022 23.9
9 2023 24.7
10 2024 23.6
Visualizations
# Daly et al. (2017) style decomposition plot
plot_df %>%
ggplot (aes (x = factor (year), y = percent_diff, fill = component)) +
geom_bar (stat = "identity" , width = 0.6 ) +
# geom_hline(yintercept = 0, color = "black", linewidth = 0.3) + # x-axis line
# line for total wage gap
stat_summary (fun = sum, aes (group = 1 ), geom = "line" ,
color = "black" , linewidth = 0.8 ) +
stat_summary (fun = sum, aes (group = 1 ), geom = "point" ,
color = "black" , size = 1.3 ) +
labs (title = "Disability Wage Gap" ,
y = "Percent difference in wages" , x = "Year" ,
fill = "Component" ) +
scale_fill_manual (values = met.brewer ("Thomas" , 8 )) + # Palette Options: Thomas, Tara, Signac, Homer2, Hiroshige, Derain
theme_linedraw () +
theme (text = element_text (family = "serif" ))
# Daly et al. (2017) style decomposition plot
female_plot_df %>%
ggplot (aes (x = factor (year), y = percent_diff, fill = component)) +
geom_bar (stat = "identity" , width = 0.6 ) +
# geom_hline(yintercept = 0, color = "black", linewidth = 0.3) + # x-axis line
# line for total wage gap
stat_summary (fun = sum, aes (group = 1 ), geom = "line" ,
color = "black" , linewidth = 0.8 ) +
stat_summary (fun = sum, aes (group = 1 ), geom = "point" ,
color = "black" , size = 1.3 ) +
labs (title = "Female Disability Wage Gap" ,
y = "Percent difference in wages" , x = "Year" ,
fill = "Component" ) +
scale_fill_manual (values = met.brewer ("Thomas" , 7 )) + # Palette Options: Thomas, Tara, Signac, Homer2, Hiroshige, Derain
theme_linedraw () +
theme (text = element_text (family = "serif" ))
# Daly et al. (2017) style decomposition plot
male_plot_df %>%
ggplot (aes (x = factor (year), y = percent_diff, fill = component)) +
geom_bar (stat = "identity" , width = 0.6 ) +
# geom_hline(yintercept = 0, color = "black", linewidth = 0.3) + # x-axis line
# line for total wage gap
stat_summary (fun = sum, aes (group = 1 ), geom = "line" ,
color = "black" , linewidth = 0.8 ) +
stat_summary (fun = sum, aes (group = 1 ), geom = "point" ,
color = "black" , size = 1.3 ) +
labs (title = "Male Disability Wage Gap" ,
y = "Percent difference in wages" , x = "Year" ,
fill = "Component" ) +
scale_fill_manual (values = met.brewer ("Thomas" , 7 )) + # Palette Options: Thomas, Tara, Signac, Homer2, Hiroshige, Derain
theme_linedraw () +
theme (text = element_text (family = "serif" ))