knitr::opts_chunk$set(comment = "#>",
echo = T,
results = 'show',
message = FALSE,
warning = FALSE,
cache = F)
# Load necessary libraries
library(readxl)
library(tidyverse)
library(lubridate)
library(stringr)
library(tools)
# Read list of country codes used by WB
wb_codes <- read_excel("./bases/ILO/Country_codes_wb.xlsx")
## AUTOMATION
autom <- read_csv("./bases/ILO/Microdata_ILO/EMP_SEX_ISCO08_2D_Automation.csv") %>%
mutate(obs_value = round(obs_value * 1000)) %>% # Convert to thousands
# Move totals to a column
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(total_empl = max(obs_value)) %>%
ungroup %>%
filter(classif2.label != "TOTAL") %>%
# Calculate total per ISCO 2-digit
group_by(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>%
mutate(total_empl_by_isco = sum(obs_value)) %>%
ungroup() %>%
# Calculate share of each isco in total empl
mutate(isco_share_in_empl = round(total_empl_by_isco / total_empl, 4)) %>%
# Calculate automation "yes" as share of each isco
mutate(yes_share_of_isco = round(obs_value / total_empl_by_isco, 3),
yes_share_of_isco = ifelse(classif2.label == "NO", NA, yes_share_of_isco)) %>%
# Select last survey available per country
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
# Filter out LFS for Ecuador
filter(ACRO != "ECV")
autom_check <- autom %>% distinct(ref_area.label, source.label, ACRO, time)
# By isco 2-digit
automation_by_isco <- autom %>%
filter(classif2.label == "YES") %>%
select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, yes_share_of_isco) %>%
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
rename(automation_isco = yes_share_of_isco) %>%
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
ungroup() %>%
filter(ACRO != "ECV")
# by country
automation_by_country <- autom %>%
filter(classif2.label == "YES") %>%
select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, classif2.label, obs_value, total_empl) %>%
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(automation_total = sum(obs_value)) %>%
ungroup() %>%
mutate(automation_country = round(automation_total / total_empl, 3)) %>%
distinct(ref_area.label, source.label, ACRO, sex.label, time, automation_country) %>%
ungroup()
# Final Automation DF
automation_df <- autom %>%
distinct(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, total_empl, total_empl_by_isco, isco_share_in_empl) %>%
ungroup() %>%
left_join(.,automation_by_isco) %>%
left_join(., automation_by_country)
############## ############## ############## ##############
## AUGMENTATION
augment <- read_csv("./bases/ILO/Microdata_ILO/EMP_SEX_ISCO08_2D_Augmentation.csv") %>%
mutate(obs_value = round(obs_value * 1000)) %>% # Convert to thousands
# Move totals to a column
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(total_empl = max(obs_value)) %>%
ungroup() %>%
filter(classif2.label != "TOTAL") %>%
# Calculate total per ISCO 2-digit
group_by(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>%
mutate(total_empl_by_isco = sum(obs_value)) %>%
ungroup() %>%
# Calculate share of each isco in total empl
mutate(isco_share_in_empl = round(total_empl_by_isco / total_empl, 4)) %>%
# Calculate automation "yes" as share of each isco
mutate(yes_share_of_isco = round(obs_value / total_empl_by_isco, 3),
yes_share_of_isco = ifelse(classif2.label == "NO", NA, yes_share_of_isco)) %>%
# Select last survey available per country
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
# Filter LFS for Ecuador
filter(ACRO != "ECV")
# By isco 2-digit
augmentation_by_isco <- augment %>%
filter(classif2.label == "YES") %>%
select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, yes_share_of_isco) %>%
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
rename(augmentation_isco = yes_share_of_isco) %>%
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
ungroup() %>%
filter(ACRO != "ECV")
# by country
augmentation_by_country <- augment %>%
filter(classif2.label == "YES") %>%
select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, classif2.label, obs_value, total_empl) %>%
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(augmentation_total = sum(obs_value)) %>%
ungroup() %>%
mutate(augmentation_country = round(augmentation_total / total_empl, 3)) %>%
distinct(ref_area.label, source.label, ACRO, sex.label, time, augmentation_country)
# Final Augmentation DF
augmentation_df <- augment %>%
distinct(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>%
ungroup() %>%
left_join(.,augmentation_by_isco) %>%
left_join(., augmentation_by_country)
########### ########### ########### ########### ###########
## BIG UNKNOWN
unknown <- read_csv("./bases/ILO/Microdata_ILO/EMP_SEX_ISCO08_2D_Bigunknown.csv") %>%
mutate(obs_value = round(obs_value * 1000)) %>% # Convert to thousands
# Move totals to a column
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(total_empl = max(obs_value)) %>%
ungroup() %>%
filter(classif2.label != "TOTAL") %>%
# Calculate total per ISCO 2-digit
group_by(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>%
mutate(total_empl_by_isco = sum(obs_value)) %>%
ungroup() %>%
# Calculate share of each isco in total empl
mutate(isco_share_in_empl = round(total_empl_by_isco / total_empl, 4)) %>%
# Calculate automation "yes" as share of each isco
mutate(yes_share_of_isco = round(obs_value / total_empl_by_isco, 3),
yes_share_of_isco = ifelse(classif2.label == "NO", NA, yes_share_of_isco)) %>%
# Select last survey available per country
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
# Filter LFS for Ecuador
filter(ACRO != "ECV")
# By isco 2-digit
unknown_by_isco <- unknown %>%
filter(classif2.label == "YES") %>%
select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, yes_share_of_isco) %>%
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
rename(unknown_isco = yes_share_of_isco) %>%
group_by(ref_area.label, source.label, ACRO) %>%
filter(time == max(time)) %>%
ungroup() %>%
filter(ACRO != "ECV")
# by country
unknown_by_country <- unknown %>%
filter(classif2.label == "YES") %>%
select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, classif2.label, obs_value, total_empl) %>%
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(automation_total = sum(obs_value)) %>%
ungroup() %>%
mutate(unknown_country = round(automation_total / total_empl, 3)) %>%
distinct(ref_area.label, source.label, ACRO, sex.label, time, unknown_country)
# Final Unknown DF
unknown_df <- unknown %>%
distinct(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>%
ungroup() %>%
left_join(.,unknown_by_isco) %>%
left_join(., unknown_by_country)
########### ########### ########### ########### ###########
# FINAL DF with ISCO and country level scores
final_df <- left_join(automation_df, augmentation_df) %>%
left_join(., unknown_df)
test <- final_df %>%
mutate(check1 = automation_isco + augmentation_isco + unknown_isco) %>%
filter(check1> 1)
test2 <- final_df %>%
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(check2 = sum(total_empl_by_isco))
test3 <- final_df %>%
group_by(ref_area.label, source.label, ACRO, sex.label, time) %>%
mutate(check2 = sum(isco_share_in_empl))
final_df1 <- final_df %>%
rename(country = ref_area.label,
survey = source.label,
acro = ACRO,
sex = sex.label,
year = time,
isco2d = classif1.label) %>%
left_join(.,wb_codes) %>%
select(country, code, everything())
final_df1 %>% write_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_data.csv")
# Combine data by country
by_country_sex <- final_df1 %>%
distinct(country, code, survey, acro, sex, year, automation_country, augmentation_country, unknown_country) %>%
ungroup() %>%
pivot_longer(cols = ends_with("_country"),
names_to = "potential",
values_to = "value") %>%
mutate(potential = str_replace(potential,"_country", ""),
potential = toTitleCase(potential),
potential = ifelse(potential == "Unknown", "Big Unknown", potential)) %>%
mutate(potential = factor(potential,
levels = c("Automation",
"Augmentation",
"Big Unknown")))
# Create ILO small df
ilo_estimates <- by_country_sex %>%
filter(sex == "Total") %>%
select(country, code, potential, value) %>%
rename(distribution_percentage = value) %>%
mutate(source = "ILO") %>%
arrange(country) %>%
mutate(distribution_percentage = distribution_percentage * 100)
ilo_estimates %>% write_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_totals_small.csv")