knitr::opts_chunk$set(comment = "#>",
                      echo = T,
                      results = 'show',
                      message = FALSE, 
                      warning = FALSE,
                      cache = F)
library(readxl)
library(tidyverse)
library(wbstats)
library(gghighlight)
library(lubridate)
library(ggridges)
library(sjPlot)
library(webshot)
library(DT)
library(tidytext)
library(plotly)
library(stringr)
library(stringdist)
library(fuzzyjoin)
library(wbstats)
library(countrycode)
library(maps)
library(Rilostat)
library(geomtextpath)
library(kableExtra)
library(flextable)
library(officer)
library(htmltools)
library(scales)
library(forcats)
library(stringr)
library(tools)
library(countrycode)
library(wbstats)
library(GGally)
library(WDI)
library(ggrepel)
library(flextable)
library(ggpubr)

BASIC DATA TO LOAD: Run this top be able to execute other parts of the code in chunks

# Read the main results file with exposure levels (no computer data)
data <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_LAC") %>% 
  select(-1) %>% 
  rename(
    code = 1,
    `AI Categories` = 2,
    Total = 3
  ) %>% 
  fill(code, .direction = "down") %>% 
  rename(potential = "AI Categories") %>% 
  mutate(potential = ifelse(potential == "missing", "Other", potential)) 

# Read list of country codes used by WB
wb_codes <- read_excel("./bases/ILO/Country_codes_wb.xlsx")

# Attach WB codes to project data
data <- left_join(data, wb_codes) %>% 
  select(country, code, everything())

# Create a long version of project data
data1 <- data %>% 
  select(country, 1:6) %>% 
  pivot_longer(cols = c(Total:Male),
               names_to = "sex",
               values_to = "value") %>%
  mutate(potential = factor(potential, levels = c("Automation Potential",
                                                  "Augmentation Potential",
                                                  "The Big Uknown",
                                                  "Other"))) 


# Create a list of all countries (from WB and ILO results)
countries_wb <- data %>% distinct(code) %>% drop_na()
  
countries_ilo <- read_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_totals_small.csv") %>%  distinct(code)

all_countries <- rbind(countries_ilo, countries_wb) %>% 
  unique() %>% 
  left_join(., wb_codes)

# Create a df with LAC names and codes
LAC_countries <- all_countries %>% 
  select(country, code) %>% 
  distinct()

# Create a list of LAC codes
LAC_codes <- LAC_countries %>% 
  select(code) %>% 
  pull()

# Get ISCO 1-digit data from ILO modeled estimates for year 2022 through API:
Z <- get_ilostat("EMP_2EMP_SEX_OCU_NB_A", cache = F) 
Z <- Z %>% 
  filter(time == 2022,
         ref_area %in% LAC_codes) 
Z <- Z %>% label_ilostat()
toc <- get_ilostat_toc(segment = "ref_area") %>%
                       select(ref_area,
                       ref_area.label,
                       wb_income_group.label,
                       ilo_region.label,
                       ilo_subregion_broad.label,
                       ilo_subregion_detailed.label) %>%
       distinct() %>%
       drop_na() %>%
       # Create income-subregion categories for each country
       mutate(region_income = paste0(ilo_region.label, ": ", gsub("World: ", "", wb_income_group.label)),
              subregion_income = paste0(ilo_subregion_broad.label, ": ", gsub("World: ", "", wb_income_group.label)))

# Merge with labels
Z1 <- left_join(Z, toc) %>%
  # Create easier labels
  rename(country = ref_area.label,
         countrycode = ref_area,
         region = ilo_region.label,
         income = wb_income_group.label,
         subregion = ilo_subregion_broad.label,
         subregion2 = ilo_subregion_detailed.label,
         year = time,
         sex = sex.label)

# Format
lac1d <- Z1 %>% 
  rename(code = countrycode) %>% 
  select(country, code, year, sex, classif1.label, obs_value, income) %>% 
  # Copy the totals by gender from the main column into a new column
  mutate(total_by_gender = ifelse(classif1.label == "Occupation (Detailed): Total", obs_value, NA)) %>%
  # Fill in totals by country, year, sex, source in the new column
  fill(total_by_gender, .direction = "down") %>% 
  rename(job = classif1.label,
         value = obs_value,
         total = total_by_gender) %>% 
  mutate(share = round(value/total, 2)) %>% 
  group_by(country, code, year, sex) %>% 
  mutate(check = sum(share)) %>% 
  filter(job != "Occupation (Detailed): Total") %>% 
  mutate(check = sum(share)) %>% 
  #Adjust gender labels
  mutate(sex = case_when(sex == "SEX_M" ~ "Men",
                         sex == "SEX_F" ~ "Women",
                         sex == "SEX_T" ~ "Total",
                         TRUE ~ sex)) %>%
  # Create smaller ISCO labels
  mutate(job = str_replace(job, "Occupation \\(Detailed\\): ", "")) %>% 
  mutate(isco1 = substr(job, start = 1, stop = 1)) %>% 
  mutate(isco1 = ifelse(isco1 == "9", "96", isco1)) %>% 
  mutate(sex = str_replace(sex, "Sex: ", "")) %>% 
  mutate(income = str_replace(income, "World: ", "")) %>% 
  ungroup() %>% 
  mutate(job = factor(job, levels = c(
    "1. Managers",
    "2. Professionals",
    "3. Technicians and associate professionals",
    "4. Clerical support workers",
    "5. Service and sales workers",
    "7. Craft and related trades workers",
    "8. Plant and machine operators, and assemblers",
    "96. Elementary occupations and skilled agricultural, forestry and fishery workers"
  ))) %>% 
  mutate(income = factor(income, levels = c(
    "Lower-middle income",
    "Upper-middle income",
    "High income"
  )))

# Create list of LAC countries with income status from ILO API download
income_ILO1d <- lac1d %>% 
  select(country, income) %>% 
  distinct()

# Create the final df with 22 country names, codes and income status, adding manually the missing ones
income2 <- left_join(all_countries, income_ILO1d) %>% 
  select(country, code, income) %>% 
  mutate(income = as.character(income))%>% 
  mutate(income = ifelse(code %in% c("GRD","MSR"), "Upper-middle income", income),
         income = ifelse(code %in% c("BOL"), "Lower-middle income", income )) %>% 
  mutate(income = factor(income, levels = c(
    "Lower-middle income",
    "Upper-middle income",
    "High income"
  ))) %>% 
  arrange(income, country) 

# Get ILO Employment by sex and ISCO2: R download
X <- get_ilostat("EMP_2EMP_SEX_AGE_NB_A", cache = F)

modelled_totals_2023 <- X %>% filter(sex == "SEX_T") %>% 
  mutate(obs_value = obs_value * 1000) %>% 
  filter(classif1 == "AGE_YTHADULT_YGE15") %>% 
  rename(code = ref_area) %>% 
  rename(Total_EMPL_modelled_2022_R = obs_value) %>% 
  select(code, time, Total_EMPL_modelled_2022_R) %>% 
  rename(year = time) %>% 
  filter(year == "2023")

modelled_totals_2023_by_sex <- X %>% 
  mutate(sex = case_when(sex == "SEX_M" ~ "Male",
                         sex == "SEX_F" ~ "Female",
                         sex == "SEX_T" ~ "Total",
                         TRUE ~ sex)) %>% 
  filter(sex != "Total") %>% 
  mutate(obs_value = obs_value * 1000) %>% 
  filter(classif1 == "AGE_YTHADULT_YGE15") %>% 
  rename(code = ref_area) %>% 
  rename(Total_EMPL_modelled_2022_R_by_sex = obs_value) %>% 
  filter(time == "2023") %>% 
  select(code, sex, Total_EMPL_modelled_2022_R_by_sex) %>% 
  mutate(sex = as.character(sex))

Figure 1. GDP per capita, population and income status of LAC countries in the sample

# GET POPULATION AND GDP FROM THE WORLD BANK for 2022
variables <- c("SP.POP.TOTL", "NY.GDP.PCAP.CD")
popul_gdp_data <- WDI(country = LAC_codes, indicator = variables, start = 2022, end = 2022) %>% 
  rename(population = "SP.POP.TOTL",
         gdp_percap = "NY.GDP.PCAP.CD") %>% 
  # Merge to df with income status
  left_join(., income2)

# Generate Figure 1
ggplot(popul_gdp_data, aes(x = population / 1000000, y = gdp_percap, label = country)) +
  geom_point(aes(color = factor(income)), size = 3) +  
  geom_text_repel(size = 4, angle = 0, max.overlaps = Inf) + 
  scale_color_brewer(palette = "Set1") +  
  theme_minimal() + 
  labs(x = "Population (millions)", y = "GDP per Capita (USD)",
       color = "") +  
  theme(legend.position = "bottom",
        legend.text = element_text(size = 11))  

# Save 
ggsave(filename = "Figure1.jpeg",   
       path = "Figures", 
       width = 8, height = 6,        
       units = "in",                 
       dpi = 300) 

Figure 2. Automation and augmentation potential: LAC vs other regions

gbb_data <- read_csv("./bases/ILO/GBB_subregion_sex.csv") %>% 
  filter(sex == "Total") %>% 
  select(region, Distribution_Percentage, impact) %>% 
  rename(total = Distribution_Percentage) %>% 
  mutate(lac = ifelse(region == "Latin America and the Caribbean", "LAC", "nonLAC"))

a <- ggplot(gbb_data %>% filter(impact == "Automation Potential"),
       aes(x = reorder(region, total), y = total, fill = lac)) +
  geom_col(alpha = 1) +
  coord_flip() +
  facet_wrap(~impact) +
  scale_fill_manual(values = c("orange3", "skyblue3"), guide = guide_legend(reverse = TRUE)) +
  theme(legend.position = "none",
        axis.text.x = element_text(size =11),
        axis.text.y = element_text(size =13),
        axis.title = element_text(size = 10),
        strip.text = element_text(size = 12)) +
  labs(x = "", y = "Share of total employment (%)")+ 
  ylim(0, 35) +
  geom_text(aes(label = paste(total, "%")), hjust = -0.2, size = 4)

b <- ggplot(gbb_data %>% filter(impact == "Augmentation Potential"),
       aes(x = reorder(region, total), y = total, fill = lac)) +
  geom_col(alpha = 1) +
  coord_flip() +
  facet_wrap(~impact) +
  scale_fill_manual(values = c("orange3", "skyblue3"), guide = guide_legend(reverse = TRUE)) +
  theme(legend.position = "none",
        axis.text = element_text(size =11),
        axis.text.y = element_text(size =13),
        axis.title = element_text(size = 10),
        strip.text = element_text(size = 12)) +
  labs(x = "", y = "Share of total employment (%)") + 
  ylim(0, 35) +
  geom_text(aes(label = paste(total, "%")), hjust = -0.2, size = 4)


ggarrange(a, b)

# Save 
ggsave(filename = "Figure2.jpeg",   
       path = "Figures", 
       width = 11, height = 4,        
       units = "in",                 
       dpi = 230)

Figure 3. Internet coverage vs per capita income: global and LAC

# WB data
indicators <- c("NY.GDP.PCAP.PP.KD", "IT.NET.USER.ZS")
data <- WDI(country = "all", indicator = indicators, start = 2019, end = 2021, extra = TRUE) 

data1 <- data %>%  
  # Drop rows with missing values in the selected indicators
  filter(!is.na(NY.GDP.PCAP.PP.KD) & !is.na(IT.NET.USER.ZS)) %>% 
  # Drop aggregates and regional NAs
  filter(!is.na(region),
         region != "Aggregates") %>% 
  # Select the latest year for each country
  group_by(country) %>%
  filter(year == max(year)) %>%
  ungroup() %>% 
  # Drop rows with missing year
  filter(!is.na(year)) %>% 
  # Filter data for the years 2019 or later
  filter(year >= 2019) %>% 
  # Create new variable lgdp (log of GDP per capita) and LAC
  mutate(lgdp = log(NY.GDP.PCAP.PP.KD)) %>% 
  # Create LAC identifier
  mutate(lac = ifelse(region =="Latin America & Caribbean", "LAC", "non-LAC"))

# Plot
a <- ggplot(data1, aes(x = lgdp, y = IT.NET.USER.ZS, color = lac)) +
  geom_point(size = 1) +
  geom_smooth(method = "loess", formula = y ~ log(x), se = FALSE, aes(color = "Fitted line (loess)"), linewidth = 1) +
  scale_color_manual(values = c("Fitted line (loess)" = "green", "LAC" = "red", "non-LAC" = "blue"),
                     labels = c("Fitted line (loess)", "LAC", "non-LAC"),
                     guide = guide_legend(reverse = TRUE)) +
  labs(x = "GDP per capita, USD 2017 PPP (log) (WDI, circa 2021)", 
       y = "Internet Users per 100 people (ITU, circa 2021)", 
       color = "") +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 10),
        axis.title = element_text(size = 11))

# Plot LAC only
data_lac_net <- data1 %>%
  filter(lac == "LAC") %>%
  mutate(
    country = ifelse(country == "Bahamas, The", "The Bahamas", country),
    income = factor(income, levels = c("High income", "Upper middle income", "Lower middle income"))
  )

b <- ggplot(data_lac_net,
       aes(x = lgdp, y = IT.NET.USER.ZS, color = income)) +
  geom_point(size = 1) +
  geom_smooth(method = "glm", formula = y ~ log(x), se = FALSE, color = "green2") +
  scale_y_continuous(limits = c(0, 100)) +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        legend.text = element_text(size = 10),
        axis.title = element_text(size = 11)) +
  labs(x = "Per capita income in USD (log) (WDI, circa 2021)",
       y = "% population with internet access (ITU, 2021)") +
  geom_text_repel(aes(label = country), size = 3, angle = 0, max.overlaps = Inf, color = "black") +
  scale_color_manual(
    values = c(
      "High income" = "blue", 
      "Upper middle income" = "orange2", 
      "Lower middle income" = "red"
    ),
    labels = c("High income", "Upper middle income", "Lower middle income"),
    guide = guide_legend(reverse = TRUE)
  )

# Save as one figure
ggarrange(a, b, ncol =2)

ggsave(filename = "Figure3.jpeg",   
       path = "Figures", 
       width = 12, height = 5,        
       units = "in",                 
       dpi = 300) 

Figure 4. Occupations in the LAC region, by ISCO 1-digit and gender

# Calculate mean shares based on ILO modelled data at 1-digit
means <- lac1d %>% 
  group_by(job, sex, income) %>% 
  summarise(mean = mean(share)) %>% 
  ungroup()

# Plot
ggplot(means %>% filter(sex != "Total"),
       aes(x = fct_rev(job), y = mean, fill = sex)) +
  geom_col(position = "dodge") +
  facet_wrap(~income) +
  coord_flip() +
  theme(legend.position = "right",
        legend.title =element_blank(),
        axis.text.y = element_text()) +
  labs(x = "",
       y = "Mean share of employment within each income group") +
  geom_text(aes(label = paste(round(mean *100, 1), "%")), 
            position = position_dodge(width = 0.9), 
            hjust = -0.1, 
            size = 3) +
  scale_fill_manual(values = c("#ff7f0e", "#1f77b4"), guide = guide_legend(reverse = TRUE)) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) +
  ylim(0,0.56)

# Save 
ggsave(filename = "Figure4.jpeg",   
       path = "Figures", 
       width = 9, height = 4.5,        
       units = "in",                 
       dpi = 300) 

Figure 5. Coverage of ISCO-08 4-digit microdata in SEDLAC (WB) and ILO harmonized microdata collection

# ILO dataset based on micro data extraction
data_5_ilo <- read_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_totals_small.csv") %>% 
  rename(ilo_total = distribution_percentage) %>% 
  select(-source, -country) %>% 
  mutate(potential = case_when(potential == "Automation" ~ "Automation Potential",
                               potential == "Augmentation" ~"Augmentation Potential",
                               potential == "Big Unknown" ~ "The Big Unknown"))

# WB main results
data_5_wb <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_LAC") %>%
    select(-1) %>% 
  rename(code = 1, `AI Categories` = 2, Total = 3) %>% 
  select(1:3) %>% 
  fill(code, .direction = "down") %>% 
  rename(potential = "AI Categories") %>% 
  mutate(potential = ifelse(potential == "missing", "Other", potential)) %>% 
  filter(potential != "Other") %>% 
  rename(wb_total = Total) %>% 
  mutate(potential = ifelse(potential == "The Big Uknown", "The Big Unknown", potential))

data_5_compare <- full_join(data_5_ilo, data_5_wb)%>% # check 2d vs 4d data labels, then remove
  left_join(., wb_codes) %>% 
  filter(!is.na(country)) %>% 
  select(1:5) %>% 
  rename(WB = wb_total,
         ILO = ilo_total) %>%
  pivot_longer(cols = ILO:WB,
               names_to = "source",
               values_to = "value") %>% 
  drop_na() %>% 
  mutate(country = factor(country),
         potential = case_when(potential == "Automation Potential" ~ "Automation",
                               potential == "Augmentation Potential" ~ "Augmentation",
                               TRUE ~potential),
         potential = factor(potential, levels = c("Automation", "Augmentation", "The Big Unknown"))) %>% 
  filter(country != "Argentina")

# Plot
ggplot(data_5_compare, aes(x = fct_rev(country), y = value, fill = source)) + 
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~ potential, ncol = 3) +
  geom_text(aes(label = paste0(round(value, 2), "%", " (", source, ")")), 
            position = position_dodge(width = 1),
            vjust = 0.4,
            hjust = -0.1,
            size = 2.5) +
  labs(x = "", 
       y = "Share of total employment (%)",
       title = "") +
  scale_fill_manual(values = c("skyblue3", "orange2"), guide = guide_legend(reverse = TRUE)) +
  theme(legend.title = element_blank(),
        axis.title = element_text(size = 10),
        strip.text = element_text(size = 10),
        axis.text.y = element_text(size = 10)) +
  ylim(0, 50)

# Save 
ggsave(filename = "Figure5.jpeg",   
       path = "Figures", 
       width = 9, height = 6,        
       units = "in",                 
       dpi = 300)

Figure 6. Hierarchical clustering based on ISCO 2-digit shares, GDP(PPP) and total population

# Load the data
data <- read_excel("bases/ILO/Clustering_data.xlsx", sheet = "clustering")

# Exclude non-relevant columns and add log transformation to GDP_percap_ppp and popul
data_for_clustering <- data %>% 
  select(-Name, -Country, -wb_only2d, -GDP_percap) %>%
  mutate(across(GDP_percap_ppp:popul, ~log(.))) %>% 
  as.data.frame()

# Set the row names to the country names before scaling
row.names(data_for_clustering) <- data$Name

# Scale the entire dataset, including the log-transformed variables
data_for_clustering_scaled_complete <- scale(data_for_clustering)

# Scale the data based on ISCO only (excluding the newly transformed columns)
data_for_clustering_scaled_isco <- data_for_clustering %>%
  select(-GDP_percap_ppp, -popul) %>% 
  scale()

# Calculate the distance matrix for ISCO only scaled data
dist_matrix2 <- dist(data_for_clustering_scaled_complete, method = "euclidean")

# Hierarchical Clustering
hc2 <- hclust(dist_matrix2, method = "ward.D2")

# Create the plot and save it as a PNG file
png("Figures/Figure6.jpeg", width = 800, height = 600, res = 150)

plot(hc2, 
     hang = -1, 
     labels = row.names(data_for_clustering_scaled_isco), 
     cex = 0.7, 
     main = "Hierarchical Clustering based only on ISCO 2d, GDP (PPP) and Population", 
     cex.main = 0.8,  # Adjust the size of the title text
     xlab = "",       # Set x-axis label to empty
     ylab = "Height")

# Turn off the plotting device
dev.off()
#> png 
#>   2

Figure 7. Total exposure to GenAI by country

# ILO dataset based on micro data extraction
data_7_ilo <- read_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_totals_small.csv") %>% 
  rename(ilo_total = distribution_percentage) %>% 
  select(-source, -country) %>% 
  mutate(potential = case_when(potential == "Automation" ~ "Automation Potential",
                               potential == "Augmentation" ~"Augmentation Potential",
                               potential == "Big Unknown" ~ "The Big Unknown"))


data_7_wb <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_LAC") %>%
    select(-1) %>% 
  rename(code = 1, `AI Categories` = 2, Total = 3) %>% 
  select(1:3) %>% 
  fill(code, .direction = "down") %>% 
  rename(potential = "AI Categories") %>% 
  mutate(potential = ifelse(potential == "missing", "Other", potential)) %>% 
  filter(potential != "Other") %>% 
  rename(wb_total = Total) %>% 
  mutate(potential = ifelse(potential == "The Big Uknown", "The Big Unknown", potential))


data_7_combined <- full_join(data_7_ilo, data_7_wb)%>% 
  mutate(source = ifelse(is.na(wb_total), "ILO", "WB"),
         exposure = ifelse(is.na(wb_total), ilo_total, wb_total)) %>% 
  mutate(exposure = round(exposure, 2))%>% 
  left_join(., wb_codes) %>%
  left_join(., modelled_totals_2023) %>% # Add ILO estimates
  # Add manually employment in Grenada (33800, 2020) and Montserrat (2489, 2020)
  mutate(Total_EMPL_modelled_2022_R = case_when(country == "Montserrat" ~ 2489,
                                                country == "Grenada" ~ 33800,
                                                TRUE ~ Total_EMPL_modelled_2022_R)) %>% 
  mutate(exposed_jobs_country = round(exposure/100 * Total_EMPL_modelled_2022_R, 0)) %>% 
  # Calculate total exposed jobs in 3 categories for each country
  group_by(code) %>% 
  mutate(total_jobs_exposed_country = sum(exposed_jobs_country),
         exposure_total = sum(exposure)) %>% 
  ungroup() %>% 
  select(country, potential, exposure, exposure_total, total_jobs_exposed_country, Total_EMPL_modelled_2022_R) 

# Add results for HIC from Gmyrek, Berg and Bescond 2023 and format to append to the main df
hic_ilo <- read_csv("./bases/ILO/GBB_2023_HIC_results_formatted.csv") %>% 
  rename(exposure = value,
         total_jobs_exposed_country = exposed_total) %>% 
  select(country, potential, exposure, exposure_total, total_jobs_exposed_country, Total_EMPL_modelled_2022_R)
  
final_exposures_LAC <- rbind(data_7_combined, hic_ilo) %>% 
  filter(country != "Argentina") %>% 
  mutate(potential = factor(potential, levels = c("Automation Potential", "Augmentation Potential", "The Big Unknown")))

# Plot totals by country with both datasets
ggplot(final_exposures_LAC, 
       aes(x = reorder(country, exposure_total), y = exposure, fill = fct_rev(potential))) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("Augmentation Potential" = "orange",
                               "Automation Potential" = "red3",
                               "The Big Unknown" = "skyblue"),
                    guide = guide_legend(reverse = TRUE)) +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1),
        legend.position = "bottom") +
  labs(x = "",
       y = "Share of total employment (%)") +
  geom_text(aes(label = paste(round(exposure, 0), "%")), position = position_stack(vjust = 0.6), size = 3) +
  ylim(0,60) +
  geom_text(aes(label = paste(round(exposure_total, 0), "%", "(", round(total_jobs_exposed_country / 1000000, 2), "M jobs", ")"), y = exposure_total), size = 3, hjust = -0.1)

# Save 
ggsave(filename = "Figure7.jpeg",   
       path = "Figures", 
       width = 9, height = 5,        
       units = "in",                 
       dpi = 300)

Figure 8

Figure 9

# Box plots by category
category = "Augmentation Potential"
box <- data1 %>% 
    filter(potential == category) %>% 
    select(-potential) %>% 
    pivot_longer(cols = c(2:last_col()),
                 values_to = "value",
                 names_to = "var") 

varnames <- box %>% distinct(var) %>% pull()

box <- box %>% 
  mutate(var = factor(var, levels = varnames)) %>% 
  mutate(class = case_when(var %in% c("Female", "Male") ~ "Gender",
                           var %in% c("Rural", "Urban") ~ "Location",
                           var %in% c("15-24", "25-34", "35-44", "45-54", "55-64") ~ "Age",
                           var %in% c("Low education", "Medium education", "High education") ~ "Education",
                           var %in% c("Poor", "Non-Poor") ~ "Poverty",
                           var %in% c("Q1", "Q2", "Q3", "Q4", "Q5") ~ "Household Income (Q)",
                           var %in% c("Formal (legal)", "Informal (legal)") ~ "Informality (legal)",
                           var %in% c("Formal (prod)", "Informal (prod)") ~ "Informality (prod)",
                           var %in% c("Employer", "Salaried employee", 
                                      "Self-employed", "Family worker (without salary)") ~ "Employment",
                           var %in% c("Primary Activities", "Low Technology Industries", "Other Industries",
                                       "Construction", "Retail, Wholesale Trade, Restaurants, Hotels, etc.",
                                       "Electricity, Gas, Water, Transportation, Communication",
                                       "Banks, Finance, Insurance, Professional Services",
                                       "Public Administration, Defense", "Education, Health, Personal Services",
                                       "Domestic Services") ~ "Sector"
                           )) %>% 
  filter(var != "Total") %>% 
  mutate(class = factor(class, levels = c("Gender", 
                                          "Location", 
                                          "Age", 
                                          "Education", 
                                          "Poverty", 
                                          "Household Income (Q)", 
                                          "Informality (legal)", 
                                          "Informality (prod)", 
                                          "Employment",
                                          "Sector")))

# Plot
ggplot(box, aes(x = var, y = value)) +
  geom_boxplot() +
  geom_point(alpha = 0.5, shape = "diamond", size = 2) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
  stat_summary(fun = mean, geom = "line", color = "red", group = 1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0, size = 11),
        legend.title = element_blank(),
        strip.text = element_text(size = 11)) +
  labs(
       # title = category,
       # subtitle = "Distribution of country-level correlates",
       y = "Share of employment exposed to augmentation (%)",
       x = "") +
  facet_wrap(~class, scales = "free", nrow = 2)

# Save 
ggsave(filename = "Figure9.jpeg",   
       path = "Figures", 
       width = 10, height = 8,        
       units = "in",                 
       dpi = 300)  

Figure 10. Jobs with augmentation potential and access to computer at work, based on PIAAC data

oecd_countries <- c("AUS", "AUT", "BEL", "CAN", "CHL", "COL", "CZE", "DNK", "EST", "FIN", "FRA", "DEU", "GRC", 
                    "HUN", "ISL", "IRL", "ISR", "ITA", "JPN", "KOR", "LVA", "LTU", "LUX", "MEX", "NLD", "NZL", 
                    "NOR", "POL", "PRT", "SVK", "SVN", "ESP", "SWE", "CHE", "TUR", "GBR", "USA")

# Selection of countries from 2015 and 2017 PIAAC 
piaac_2015 <- c("Chile", "New Zealand", "Slovenia")
piaac_2017 <- c("Ecuador", "Mexico", "Peru")
# Country names df
country_names <- c("Chile", "Ecuador", "Mexico", "New Zealand", "Peru", "Slovenia")
country_codes <- c("CHL", "ECU", "MEX", "NZL", "PER", "SVN")
country_tibble <- tibble(code = country_codes, country = country_names)

# PIACC dataset
data_piaac <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_PIAAC_ONLY_CPU") %>% 
  select(-1, -4) %>% 
  rename(code = 1,
         potential = 2,
         exposed = 3) %>% 
  fill(code, .direction = "down") %>% 
  filter(code %in% c("SVN", "NZL", "CHL", "ECU", "MEX", "PER")) %>% 
  mutate(computer = str_split_fixed(potential, ", ", 2)[,2],
         potential = str_split_fixed(potential, ", ", 2)[,1]) %>% 
  select(code, potential, computer, everything()) %>% 
  mutate(potential = str_replace_all(potential, c("^Pr\\(" = "", "\\)$" = "")))%>% 
  mutate(potential = str_replace_all(potential, "=1", ""))  %>% 
  mutate(potential = ifelse(potential == "Missing", "Other", potential)) %>% 
  mutate(computer = str_replace_all(computer, "Computer at work=", "")) %>% 
  mutate(computer = str_replace_all(computer, "\\)$", "")) %>% 
  mutate(computer = ifelse(computer == 1, "Computer at work", "No computer at work")) %>% 
  left_join(., wb_codes) %>% 
  filter(potential == "Augmentation") %>% 
  select(code, potential, computer, exposed) %>% 
  group_by(code) %>% 
  mutate(total_augmentation = sum(exposed)) %>% 
  ungroup() %>% 
  # ADD LAC and OECD labels
  mutate(OECD = if_else(code %in% oecd_countries, "OECD", "Non-OECD")) %>% 
  left_join(., LAC_countries) %>% 
  rename(LAC = country) %>% 
  mutate(LAC = ifelse(is.na(LAC), "Other", "LAC")) %>%  
  left_join(., country_tibble) %>% 
  select(country, code, everything()) %>% 
  # Add a dummy variable for legend purpose
  mutate(dummy_legend = ifelse(LAC == "LAC", "LAC countries", NA)) %>% 
  # Create a sorting variable
  mutate(comp = ifelse(computer == "Computer at work", exposed, NA)) %>% 
  fill(comp, .direction = "down") 

# Plot
ggplot(data_piaac, 
       aes(x = reorder(country, comp), y = exposed, fill = fct_rev(computer))) +
  geom_col(alpha = 0.8) +
  scale_fill_manual(values = c("skyblue", "orange2"), 
                    guide = guide_legend(reverse = T, override.aes = list(color = NA))) +
  theme(legend.position = "top",
        axis.text.x = element_text(angle = 0, size = 12, hjust = 1, vjust = 1),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size=10),
        legend.title = element_blank(),
        legend.text = element_text(size = 9)) +
  labs(x = "",
       y = "Share of total employment (%)"
       ) +
  coord_flip() +
  ylim(0,23) +
  geom_text(aes(label = paste(round(exposed, 1), "%")), position = position_stack(vjust = 0.5), angle = 0, size = 3.5) +
  geom_text(aes(label = paste(round(total_augmentation, 1), "%"), y = total_augmentation + 1.5), angle = 0, size = 3.5) +
  # Add LAC markers
  geom_point(data = subset(data_piaac, !is.na(dummy_legend)), aes(x = country, y = 30, color = dummy_legend), size = 3, show.legend = TRUE) +
  geom_col(data = subset(data_piaac, LAC == "LAC"), aes(x = reorder(country, comp), y = total_augmentation), 
           position = "identity", color = "brown", size = 0.7, alpha = 0.1, show.legend = FALSE) +
  scale_color_manual(values = c("LAC countries" = "brown"), guide = guide_legend(override.aes = list(shape = 16))) 

# Save 
ggsave(filename = "Figure10.jpeg",   
       path = "Figures", 
       width = 10, height = 3,        
       units = "in",                 
       dpi = 300)  

Figure 11. Exposure by country, exposure type and access to digital infrastructure

data_11 <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_SEDLAC_PIAAC_CPU") %>% 
  select(-1, -4) %>% 
  rename(code = 1,
         potential = 2,
         Total = 3) %>% 
  fill(code, .direction = "down") %>% 
  mutate(potential = ifelse(potential == "missing", "Other", potential)) %>%
  mutate(computer = str_split_fixed(potential, ", ", 2)[,2],
         potential = str_split_fixed(potential, ", ", 2)[,1]) %>% 
  select(code, potential, computer, everything()) %>% 
  mutate(potential = str_replace_all(potential, c("^Pr\\(" = "", "\\)$" = ""))) %>% 
  mutate(potential = str_replace_all(potential, "=1", "")) %>% 
  mutate(computer = str_replace_all(computer, "Computer at work=", "")) %>% 
  mutate(computer = str_replace_all(computer, "\\)$", "")) %>% 
  mutate(computer = ifelse(computer == 1, "Computer", "No computer")) %>% 
  left_join(., wb_codes) %>% 
  select(country, 1:6) %>%
  select(-Total) %>% 
  pivot_longer(cols = c(Female:Male),
               names_to = "sex",
               values_to = "value") %>%  
  group_by(country, potential, sex) %>% 
  mutate(total = sum(value)) %>% 
  ungroup() %>% 
  mutate(computer = factor(computer, levels= c("No computer", "Computer"))) %>% 
  # Create twoway data
  mutate(male = ifelse(sex == "Male", -value, NA),
         female = ifelse(sex == "Female", value, NA)) %>% 
  mutate(total_men = -total,
         total_women = total) %>%
  mutate(total_men = round(total_men, 1),
         total_women = round(total_women, 1)) %>% 
  mutate(total_men = ifelse(sex == "Male", total_men, NA_character_)) %>% 
  mutate(total_women = ifelse(sex == "Female", total_women, NA_character_)) %>% 
  filter(potential %in% c("Automation", "Augmentation", "Big Unknown")) %>% 
  mutate(potential = factor(potential, levels =c("Automation", "Augmentation", "Big Unknown")))


# Plot 
ggplot(data_11,
       aes(x = country,  y = female, fill = computer, label = paste0(round(female, 1)))) +
  geom_col(position = "stack") +
  coord_flip() + 
  facet_wrap(~potential, ncol = 1) +
  geom_text(position = position_stack(vjust = 0.5), size = 2) +
  labs(x = "",
       y = "Share of employment (%)") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(), 
        legend.position = "bottom",
        legend.title = element_blank()) +
  scale_fill_manual(values = c("skyblue", "orange2"), guide = guide_legend(reverse = TRUE)) +
  geom_hline(yintercept = 0, linetype = "solid", color = "black") +
  geom_col(aes(x = country,  y = male, fill = computer, position = "stack")) +
  geom_text(aes(x = reorder(country, value),  y = male, label = paste0(round(-male, 1))),  position = position_stack(vjust = 0.5), size = 2) + 
  annotate(geom = "label", x = max(data_11$male, na.rm = TRUE) * 1.05, 
           y = length(unique(data_11$country)) + 1, label = "Men  ", 
           hjust = 9.5, vjust = -0.5, size = 4, color = "red", fill = "grey", label.size = 0.4) +
  annotate(geom = "label", x = min(data_11$female, na.rm = TRUE) * 1.05, 
           y = length(unique(data_11$country)) + 1, label = "Women", hjust = -1, vjust = 0, size = 4, color = "blue", fill = "grey", label.size = 0.4) 

# Save 
ggsave(filename = "Figure11.jpeg",   
       path = "Figures", 
       width = 9, height = 8,        
       units = "in",                 
       dpi = 300) 

data_11 %>% write_csv("Figure11_data.csv")

Calculation of jobs in augmentation category that do not have a computer

### Take data on exposure: select only no computer and total augmentation, by sex (total is total % of empl in augmentation by sex, value is % of empl with no computer)
jobs_data <- data_11 %>% 
    filter(computer == "No computer",
         potential == "Augmentation") %>% 
  select(country, code, potential, computer, sex, value, total) 

    
# Get ILO Employment by sex for 2023 and ISCO2 to create country weights for LAC countries: R download
X <- get_ilostat("EMP_2EMP_SEX_AGE_NB_A", cache = F) 

modelled_totals_2023_by_sex <- X %>% 
  mutate(sex = case_when(sex == "SEX_M" ~ "Male",
                         sex == "SEX_F" ~ "Female",
                         sex == "SEX_T" ~ "Total",
                         TRUE ~ sex)) %>% 
  filter(sex != "Total") %>% 
  mutate(obs_value = obs_value * 1000) %>% 
  filter(classif1 == "AGE_YTHADULT_YGE15") %>% # Take all workers 15+
  rename(code = ref_area) %>% 
  rename(EMPL_modelled_2023_R_by_sex = obs_value) %>% 
  filter(time == "2023") %>% 
  select(code, sex, EMPL_modelled_2023_R_by_sex) %>% 
  mutate(sex = as.character(sex)) %>% 
  filter(code %in% LAC_codes) 

# Create a final dataset with empl totals and empl weights by sex 
jobs_data2 <- left_join(jobs_data, modelled_totals_2023_by_sex) %>% 
  arrange(sex, country) %>% 
  group_by(sex) %>%
  mutate(total_LAC_empl_by_sex = sum(EMPL_modelled_2023_R_by_sex),
         country_weight_by_sex = EMPL_modelled_2023_R_by_sex / total_LAC_empl_by_sex,
         check =sum(country_weight_by_sex)) %>% # check that weights grouped by sex add to 1
  ungroup() %>% 
  # Calculate the nr of jobs in augmentation, based on ILO 2023 estimates
  mutate(country_augment_jobs_no_comp_sex = round(value/100 * EMPL_modelled_2023_R_by_sex, 0),
         total_jobs_augmentation_country_sex = round(total/100 * EMPL_modelled_2023_R_by_sex, 0),
         share_no_comp_in_augmentation_country_sex = round(country_augment_jobs_no_comp_sex/total_jobs_augmentation_country_sex,2)) %>% 
  group_by(sex) %>% 
  mutate(total_LAC_jobs_aug_no_comp_sex = sum(country_augment_jobs_no_comp_sex),
         total_LAC_jobs_augmentation = sum(total_jobs_augmentation_country_sex)) %>% 
  ungroup() %>% 
  mutate(LAC_share_no_comp_augmentation_in_total_empl_sex = total_LAC_jobs_aug_no_comp_sex / total_LAC_empl_by_sex * 100,
         LAC_share_no_comp_in_augmentation_sex = total_LAC_jobs_aug_no_comp_sex / total_LAC_jobs_augmentation * 100) %>% 
  # Alternative calculation: mean of country "value" wieghted by employment
  group_by(sex) %>% 
  mutate(LAC_share_no_comp_augmentation_in_total_empl_sex2 = sum(value * country_weight_by_sex)) %>% # Same as going through totals
  ungroup()

Conclusion:

1. Share of augmentation & no computer in total employment in these LAC countries: 6.15% for women and 6.13 % for men (empl weighted mean). This corresponds to 15.7 million female jobs and 19.8 million male jobs in these countries.

2. Share of no computer in the jobs exposed to augmentation: 44.1% for women 49.3% for men. This corresponds to 6.9 million jobs for women and 9.7 million men jobs in these countries

Figure 12: Exposure by country, type and detailed country-level characteristics

data_12 <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_SEDLAC_PIAAC_CPU") %>% 
  select(-1, -4) %>% 
  rename(code = 1,
         potential = 2,
         Total = 3) %>% 
  rename("Formal (legal)" = "Formal...25",
         "Informal (legal)" = "Informal...26",
         "Formal (prod)" ="Formal...27",
         "Informal (prod)" = "Informal...28") %>%
  fill(code, .direction = "down") %>% 
  mutate(computer = str_split_fixed(potential, ", ", 2)[,2],
         potential = str_split_fixed(potential, ", ", 2)[,1]) %>% 
  select(code, potential, computer, everything()) %>% 
  mutate(potential = str_replace_all(potential, c("^Pr\\(" = "", "\\)$" = ""))) %>% 
  mutate(potential = str_replace_all(potential, "=1", "")) %>% 
  mutate(computer = str_replace_all(computer, "Computer at work=", "")) %>% 
  mutate(computer = str_replace_all(computer, "\\)$", "")) %>% 
  mutate(computer = ifelse(computer == 1, "Computer", "No computer")) %>% 
  mutate(potential = ifelse(potential == "Missing", "Other", potential)) %>% 
  left_join(., wb_codes) %>% 
  select(country, code, everything()) %>% 
  select(-code, -country_label)

all_comp <- data_12 %>% 
  pivot_longer(cols = Total:last_col(),
               names_to = "variable",
               values_to = "value") %>% 
  mutate(potential = factor(potential, levels = c("Automation",
                                                  "Augmentation",
                                                  "Big Unknown",
                                                  "Other"))) %>% 
  group_by(country, potential, variable) %>% 
  mutate(total = sum(value)) %>% 
  ungroup() %>% 
  mutate(variable = factor(variable, levels = c(
  "Total", "Female", "Male", 
  "Rural", "Urban", 
  "15-24", "25-34", "35-44", "45-54", "55-64", 
  "Low education", "Medium education", "High education", 
  "Non-Poor", "Poor", 
  "Q1", "Q2", "Q3", "Q4", "Q5", 
  "Formal (legal)", "Informal (legal)", "Formal (prod)", "Informal (prod)", 
  "Employer", "Salaried employee", "Self-employed", "Family worker (without salary)"))) %>% 
  drop_na()

computer <- all_comp %>% 
  filter(computer == "Computer",
         potential != "Other",
         country %in% c("Colombia", "Costa Rica"))

no_computer <- all_comp %>% 
  filter(computer == "No computer",
         potential != "Other",
         country %in% c("Colombia", "Costa Rica"))

ggplot(computer,
       aes(x = fct_rev(variable), y = value, fill = fct_rev(potential))) +
  geom_col() +
  geom_area() +
  facet_wrap(country~potential, ncol = 3) +
  coord_flip() +
  theme(legend.position = "none",
        axis.title = element_blank()) +
  geom_col(data = no_computer, aes(x = variable, y = total, fill = fct_rev(potential)), alpha = 0.5) +
  scale_fill_manual(values = c("#1f77b4", "#2ca02c", "#d62728"), guide = guide_legend(reverse = TRUE)) +
  geom_text(aes(label = paste(round(total, 1), "%"), y = total), hjust = -0.1, size = 2.5)

# Save 
ggsave(filename = "Figure12.jpeg",   
       path = "Figures", 
       width = 9, height = 8,        
       units = "in",                 
       dpi = 300) 

Figure 13. ISCO 2-digit occupations by type of exposure and country (share of exposure > 25%)

heatmap_data <- read_excel("./Results/Results_AI_LAC.xlsx", sheet = "AI_CPU_ISCO2d", skip = 2) %>% 
  slice(2:nrow(.)) %>% 
  fill(Country, .direction = "down") %>% 
  select(-1, -TOTAL) %>% 
  rename(country = Country,
         isco = "ISCO 2d",
         `Other & Computer` = "Missing & Computer",
         `Other & No Computer` = "Missing & No Computer") %>% 
  mutate(across(`Augmentation & Computer`:`Other & No Computer`, as.numeric)) %>% 
  pivot_longer(cols = `Augmentation & Computer`: `Other & No Computer`,
               names_to = "variable",
               values_to = "Share of exposure") %>% 
  filter(isco != "TOTAL") %>% 
  rename(isco2d_code = isco) %>% 
  mutate(isco2d_code = as.character(isco2d_code)) %>% 
  arrange(isco2d_code)

# Define and apply function to change case of countries
to_title_case <- function(string) {
  sapply(strsplit(string, " "), function(x) {
    paste(toupper(substring(x, 1, 1)), tolower(substring(x, 2)), sep = "", collapse = " ")
  }, USE.NAMES = FALSE)
}
heatmap_data <- heatmap_data %>% mutate(country = to_title_case(as.character(country)))

# Read file with ISCO-08 mapping and create labels
isco_labels <- read_csv("./bases/ILO/mapping.csv") %>% 
  select(isco08_2d, description_2d) %>% 
  distinct() %>% 
  rename(isco2d_code = isco08_2d) %>% 
  arrange(isco2d_code)

heatmap_data2 <- left_join(heatmap_data, isco_labels) %>% 
  mutate(label = paste(isco2d_code, "-", description_2d)) %>% 
  mutate(label = factor(label)) %>% 
  mutate(country = toTitleCase(as.character(country))) %>% 
  mutate("Share of employment" = `Share of exposure` * `Share of Total Employment` /100)

# Verify if employment shares of isco add up to 100
check <- heatmap_data2 %>% 
  select(country, label, variable, "Share of employment") %>% 
  arrange(country) %>% 
  group_by(country) %>% 
  mutate(check_sums = sum(`Share of employment`))

check2 <- heatmap_data2 %>% 
  select(country, label, variable, "Share of employment") %>% 
  filter(country == "Chile") %>% 
  mutate(check_sums = sum(`Share of employment`))


# Share of exposure > 25%
exposure_shares <- heatmap_data2 %>% 
  filter(`Share of exposure` > 25) %>% 
  filter(variable %in% c("Automation & Computer",
                         "Augmentation & Computer", 
                         "Augmentation & No Computer",
                         "Big Unknown & Computer",
                         "Big Unknown & No Computer")) %>% 
  mutate(country = ifelse(country == "Dominican Republic", "Dominican Rep.", country))


ggplot(exposure_shares, aes(x = country, y = fct_rev(label), fill = `Share of exposure`)) +
  geom_tile() + 
  scale_fill_gradient(low = "white", high = "red") +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 11, hjust = 1),
        axis.text.y = element_text(size = 9),
        legend.position = "none",
        strip.text = element_text(size = 14)) +
  labs(#title = paste("Share of ISCO 2-digit job exposed to", filtered),
       x = "",
       y = "",
       fill = "% of exposure by ISCO 2-digit") +
  geom_text(aes(label = paste(round(`Share of exposure`, 1), "%")), size = 2) +
  facet_wrap(~variable, ncol = 1, scales = "free_y")

# Save 
ggsave(filename = "Figure13.jpeg",   
       path = "Figures", 
       width = 10, height = 10,        
       units = "in",                 
       dpi = 300) 

Figure 14. Earnings of occupations exposed to GenAI, by employment status (exposure above 25%)

# Data from ILO STATS with employment related income for LAC countries at 2 digit level, local currency
data <- read_excel("./bases/ILO/Microdata_ILO/EAR_OCU2Digits_LAC.xlsx") %>% 
  filter(sample_ees > 5,# Filter out too small samples
         sample_slf > 5) %>% 
  rename(country = ref_area,
         isco = classif1,
         year = time) %>% 
  select(country, ACRO, year, isco, median2d_SLF, median2d_EES, sample_ees, sample_slf) %>%
  group_by(country, ACRO) %>%
  # Determine the latest year available for country and survey 
  mutate(latest_year_for_total = ifelse(all(is.na(ifelse(isco == "OC2_ISCO08_TOTAL" & !is.na(median2d_EES), year, NA))),
                                        NA,
                                        max(ifelse(isco == "OC2_ISCO08_TOTAL" & !is.na(median2d_EES), year, NA), na.rm = TRUE))) %>%
  filter(year == latest_year_for_total) %>%
  ungroup() %>% 
  mutate(small_sample_ees = ifelse(sample_ees < 15, "U", NA),
         small_sample_slf = ifelse(sample_slf < 15, "U", NA))


overview <- data %>% distinct(country, ACRO, year, latest_year_for_total) # In several cases more than 1 survey per country but always in different years

data1 <- data %>% 
  select(-latest_year_for_total) %>%
  group_by(country) %>%
  # Determine the latest survey available for country and filter only these obs
  mutate(latest_year_for_total = max(ifelse(isco == "OC2_ISCO08_TOTAL" & !is.na(median2d_EES), year, NA), na.rm = TRUE)) %>% 
  filter(year == latest_year_for_total) %>%
  select(-latest_year_for_total) 

overview2 <- data1 %>% distinct(country, ACRO, year) # 1 survey per country, latest year available

data2 <- data1 %>% 
  # Move the overall median to a column for employees and self-employed
  mutate(median2d_SLF_total = ifelse(isco =="OC2_ISCO08_TOTAL", median2d_SLF, NA)) %>% 
  fill(median2d_SLF_total, .direction = "up") %>% 
  mutate(median2d_EES_total = ifelse(isco =="OC2_ISCO08_TOTAL", median2d_EES, NA)) %>% 
  fill(median2d_EES_total, .direction = "up") %>% 
  filter(isco != "OC2_ISCO08_TOTAL") %>% 
  ungroup() %>% 
  select(country, year, isco, median2d_SLF, median2d_SLF_total, median2d_EES, median2d_EES_total, sample_ees, sample_slf, small_sample_ees, small_sample_slf) %>% 
  # Calculate the distance of individual ISCO 2-digts occupations from the median of Employees
  mutate(difference_from_overall_median_EES_perc = round((median2d_EES - median2d_EES_total) / median2d_EES_total * 100, 1)) %>%
  mutate(difference_from_overall_median_SLF_perc = round((median2d_SLF - median2d_SLF_total) / median2d_EES_total * 100, 1)) %>%
  filter(difference_from_overall_median_SLF_perc < 500) %>%
  # Extract last two characters from 'isco'
  mutate(isco2d = substr(isco, nchar(isco)-1, nchar(isco))) %>%  
  filter(!isco %in% c("OC2_ISCO08_01", # Take out the army occupations and non-classified
                      "OC2_ISCO08_02",
                      "OC2_ISCO08_03",
                      "OC2_ISCO08_96")) %>% 
  rename(code = country)
  
# Data from the final results - sheet with exposure type and computer use by ISCO 2-digit and 2-digit shares of employment by country
heatmap_data <- read_excel("./Results/Results_AI_LAC.xlsx", sheet = "AI_CPU_ISCO2d", skip = 2) %>% 
  fill(Country, .direction = "down") %>% 
  select(-1, -TOTAL) %>% 
  rename(country_name = Country,
         isco2d = "ISCO 2d") %>% 
  mutate(country_name = toTitleCase(tolower(country_name)))

# File with country codes and labels (from basic data on top)
wb_codes1 <- wb_codes %>% 
  rename(country_name = country)

# Merge datasets
heatmap_data2 <- left_join(heatmap_data, wb_codes1) %>% 
  select(country_name, code, everything()) 

data3 <- left_join(data2, heatmap_data2) %>% 
  rename(share_empl = 'Share of Total Employment') %>% 
  mutate(across(`Augmentation & Computer`:`Missing & No Computer`, as.numeric)) %>%
  pivot_longer(cols = `Augmentation & Computer`: `Missing & No Computer`,
               names_to = "variable",
               values_to = "Share_of_exposure") %>% 
  mutate(isco2d = as.character(isco2d)) %>% 
  arrange(isco2d) %>% 
  # Select only exposed with computer the not affected occupations
  filter(variable %in% c("Augmentation & Computer", 
                         "Automation & Computer",
                         "Big Unknown & Computer"))

# Read isco labels from a file with official mapping of ISCO levels
isco_labels <- read_csv("./bases/ILO/mapping.csv") %>% 
  select(isco08_2d, description_2d) %>% 
  distinct() %>% 
  rename(isco2d_code = isco08_2d) %>% 
  arrange(isco2d_code) %>% 
  rename(isco2d = isco2d_code)


# Add labels to the main dataset
data4 <- left_join(data3, isco_labels) %>% 
  mutate(isco_labels = paste(isco2d, "-", description_2d)) %>% 
  mutate(isco_labels = factor(isco_labels)) %>% 
  # Pivot long to plot
  pivot_longer(cols= difference_from_overall_median_EES_perc: difference_from_overall_median_SLF_perc,
               names_to = "difference",
               values_to = "difference_value") %>% 
  mutate(income_difference_from_EES_median_filtered = ifelse(Share_of_exposure > 25, difference_value, NA)) %>% 
  select(code, year, isco_labels, median2d_SLF, median2d_EES, difference, difference_value, income_difference_from_EES_median_filtered, 
         share_empl, variable, Share_of_exposure, sample_ees, sample_slf, small_sample_ees, small_sample_slf) %>% 
  mutate(currency = "local currency") %>% 
  rename(potential = variable) %>% 
  mutate(small_sample_ees_marker = ifelse(small_sample_ees == "U", difference_value, NA))

data4 %>% write_csv("./bases/ILO/Microdata_ILO/EAR_OCU2Digits_LAC_Processed.csv")
data4 <- read_csv("./bases/ILO/Microdata_ILO/EAR_OCU2Digits_LAC_Processed.csv")

# Define custom labels for x-axis
custom_labels <- setNames(
  c("Self-employed", "Employees"),
  c("difference_from_overall_median_SLF_perc", "difference_from_overall_median_EES_perc"))

# Define custom legend colors
custom_Legend_colors <- setNames(
  c("red", "green", "blue3"),
  c("Automation & Computer", "Augmentation & Computer", "Big Unknown & Computer")
)

# Plot
ggplot(data4, aes(x = fct_rev(isco_labels), y = difference_value)) +
  geom_jitter(aes(size = share_empl), alpha = 0.3, color = "grey") + # Plot all 
  geom_jitter(aes(x = fct_rev(isco_labels), y = income_difference_from_EES_median_filtered, color = potential, size = share_empl, alpha = Share_of_exposure)) + # Ovelay exposed jobs
  geom_smooth(method = "loess", se = FALSE, color = "black", linewidth = 0.3, linetype ="dashed", aes(group = potential)) +
  geom_hline(yintercept = 0, color = "brown", size = 0.7) +
  facet_wrap(~difference, ncol = 1, labeller = labeller(difference = custom_labels)) +
  scale_color_manual(values = custom_Legend_colors) +  # Apply custom color scale
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        legend.position = "top",  
        legend.box = "horizontal", 
        legend.box.just = "left",  
        legend.direction = "horizontal",
        legend.title = element_text(size = 10),
        axis.title.y = element_text(size = 10)) +  
  labs(x = "",
       y = "Income: distance from the group's median income as % of the median income of employees",
       size = "Size: share of total employment",
       alpha = "Shading: Exposed share of ISCO-08 2-digit occupations") +
  guides(
    color = guide_legend(order = 3, title.position = "top", nrow = 3, byrow = TRUE, title = NULL),
    size = guide_legend(order = 1, title.position = "top", nrow = 1, byrow = TRUE),
    alpha = "none")

# Save 
ggsave(filename = "Figure14.jpeg",   
       path = "Figures", 
       width = 10, height = 10,        
       units = "in",                 
       dpi = 300) 

APPENDIX

Figure A1. Comparison of TechXposure scores vs GBB scores (mean by occupation, z-scores)

data_tech <- read_csv("./bases/ILO/techXposure_isco4d_by_tech.csv") %>% 
  rename(ISCO_08 = isco4d) %>% 
  mutate(ISCO_08 = as.character(ISCO_08)) %>% 
  group_by(ISCO_08, isco4d_label) %>% 
  summarise(x_mean = mean(xposure4d, na.rm = T)) # calculate the mean score for each 4-d isco08

data_gbb <- read_csv("./bases/ILO/Gmyrek_Berg_Bescond_scores_2023.csv") %>% 
  rename(mean_score_ilo = mean_score)

comparison_gbb_techxposure <- left_join(data_tech, data_gbb)

# Standardize the data
comparison_gbb_techxposure$mean_score_ilo_centered <- scale(comparison_gbb_techxposure$mean_score_ilo)[, 1]
comparison_gbb_techxposure$xposure4d_centered <- scale(comparison_gbb_techxposure$x_mean)[, 1]

# Plot - all values
ggplot(comparison_gbb_techxposure,
       aes(x = mean_score_ilo_centered, y = xposure4d_centered, colour = description_1d)) +
  geom_point(alpha = .5) +
  facet_wrap(~description_1d, ncol = 3) +
  theme(legend.position = "none",
        strip.text = element_text(size = 10)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "blue") +
  labs(y = "TechXposure (mean by occupation), z-scores",
       x = "ILO scores (mean by occupation), z-scores") +
  ylim(-3,3) +
  xlim(-3,3)

ggsave(filename = "FigureA1.jpeg",   
       path = "Figures", 
       width = 10, height = 6,        
       units = "in",                 
       dpi = 300) 

Figure A2. Comparison of TechXposure scores vs GBB scores (mean by occupation, z-scores)

data_imf <- read_excel("./bases/ILO/IMF_AIOE_CAIOE_theta_for_sharing.xlsx", sheet = "data_ISCO08") %>% 
  mutate(exposure = ifelse(aioe_all > median_aioe_all, "High Exposure", "Low exposure"),
         complementarity = ifelse(complementarity_theta > median_theta, "High Complementarity", "Low Complementarity"),
         imf = paste(exposure, "-", complementarity)) %>% 
  rename(ISCO_08 = isco08)

data_gbb <- read_csv("./bases/ILO/Gmyrek_Berg_Bescond_scores_2023.csv") %>% 
  rename(mean_score_ilo = mean_score)

gbb_imf_compare <- left_join(data_imf, data_gbb) %>% 
  mutate(potential = ifelse(potential =="missing", "Other", potential)) %>% 
  filter(!is.na(Title), # IMF data contains also ISCO 3d jobs, so these titles appear as NA after merger
         description_1d != "Armed forces occupations")
# Standardize the ilo and imf scores
gbb_imf_compare$mean_score_ilo_centered <- scale(gbb_imf_compare$mean_score_ilo)[, 1]
gbb_imf_compare$aioe_all_centered <- scale(gbb_imf_compare$aioe_lm)[, 1]


# Plot the standardized scores with a 45 degree line
ggplot(gbb_imf_compare,
       aes(x = mean_score_ilo_centered, y = aioe_all_centered, colour = description_1d)) +
  geom_point(alpha = .5) +
  facet_wrap(~description_1d, ncol = 3) +
  theme(legend.position = "none",
        strip.text = element_text(size = 10)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "blue") + # 45-degree line
  scale_x_continuous(name = "Standardized Mean ILO Score") +
  scale_y_continuous(name = "Standardized AIOE (LM Scores)")

# Save 
ggsave(filename = "FigureA2.jpeg",   
       path = "Figures", 
       width = 10, height = 6,        
       units = "in",                 
       dpi = 300) 

Figure A3. Breakdown of occupations by country, ISCO-08 2-digits and sex

# PLOT 2: 2-digit employment
data_2d <-  read_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_data.csv") %>% 
  select(country, acro, sex, year, isco2d, isco_share_in_empl) %>% 
  group_by(country) %>% 
  filter(year == max(year)) %>% 
  ungroup() %>% 
  drop_na() %>% 
  filter(isco2d != "Total", 
         country != "Nicaragua") %>% # 2014 survey in ISCO-88 format
  arrange(country, isco2d) %>% 
  mutate(isco2d = factor(isco2d)) 

ggplot(data_2d %>% filter(sex != "Total"), aes(x = country, y = fct_rev(isco2d), fill = isco_share_in_empl)) +
      geom_tile() +
      scale_fill_gradient(low = "white", high = "red", labels = percent_format(accuracy = 1)) +
      labs(x = "", y = "", fill = "Employment Share") +
      theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1, vjust = 0.5),
            axis.text.y = element_text(size = 9),
            legend.position = "top",
            legend.title = element_text(size = 9),
            plot.background = element_rect(fill = "white", color = NA),
            panel.background = element_rect(fill = "white", color = NA),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank()) +
      geom_text(aes(label = paste(round(isco_share_in_empl * 100, 1), "%")), size = 2) +
  facet_wrap(~sex, ncol = 1)

# Save 
ggsave(filename = "FigureA3.jpeg",   
       path = "Figures", 
       width = 10, height = 14,        
       units = "in",                 
       dpi = 300) 

Figure A4

ggplot(final_exposures_LAC,
       aes(x = reorder_within(country, exposure, potential), y = exposure, fill = fct_rev(potential))) +
  geom_col() +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        legend.position = "none") +
  labs(x = "",
       y = "Percentage of employment") +
  facet_wrap(~potential, scales = "free_x", ncol = 3) +
  scale_fill_manual(values = c("Augmentation Potential" = "orange",
                               "Automation Potential" = "red3",
                               "The Big Unknown" = "skyblue")) +
  scale_x_reordered() +
  geom_text(aes(label = paste(round(exposure, 0), "%"), y = exposure + 3), size = 2.5, angle = 90) +
  ylim(0,31)

# Save 
ggsave(filename = "FigureA4.jpeg",   
       path = "Figures", 
       width = 10, height = 4,        
       units = "in",                 
       dpi = 300) 

Figure A5. Comparison of results on computer use between PIAAC (at work) and SEDLAC (at home) - augmentation category

# SEDLAC with imputed computer at work
sedlac_imputed_computer_at_work <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_SEDLAC_PIAAC_CPU") %>% 
  select(-1, -4) %>% 
  rename(code = 1,
         potential = 2,
         Total = 3) %>% 
  fill(code, .direction = "down") %>% 
  mutate(potential = ifelse(potential == "missing", "Other", potential)) %>%
  mutate(computer = str_split_fixed(potential, ", ", 2)[,2],
         potential = str_split_fixed(potential, ", ", 2)[,1]) %>% 
  select(code, potential, computer, everything()) %>% 
  mutate(potential = str_replace_all(potential, c("^Pr\\(" = "", "\\)$" = ""))) %>% 
  mutate(potential = str_replace_all(potential, "=1", "")) %>% 
  mutate(computer = str_replace_all(computer, "Computer at work=", "")) %>% 
  mutate(computer = str_replace_all(computer, "\\)$", "")) %>% 
  mutate(computer = ifelse(computer == 1, "Computer", "No computer")) %>% 
  left_join(., wb_codes) %>% 
  select(country, code, everything()) %>% 
  filter(potential == "Augmentation", 
         computer == "Computer") %>% 
  select(-country_label) %>% 
  rename(SEDLAC_computer_at_work_imputed = Total) %>% 
  select(1:5)

# SEDLAC with imputed computer and internet at work
sedlac_imputed_computer_internet_at_work <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_SEDLAC_PIAAC_CPUINT") %>% 
  select(-1, -4) %>% 
  rename(code = 1,
         potential = 2,
         Total = 3) %>% 
  fill(code, .direction = "down") %>%
  mutate(computer = str_split_fixed(potential, ", ", 2)[,2],
         potential = str_split_fixed(potential, ", ", 2)[,1]) %>% 
  select(code, potential, computer, everything()) %>% 
  mutate(potential = str_replace_all(potential, c("^Pr\\(" = "", "\\)$" = ""))) %>% 
  mutate(potential = str_replace_all(potential, "=1", "")) %>% 
  mutate(potential = ifelse(potential == "Missing", "Other", potential)) %>% 
  mutate(computer = str_replace_all(computer, "Computer & Internet at work=", "")) %>% 
  mutate(computer = str_replace_all(computer, "\\)$", "")) %>% 
  mutate(computer = ifelse(computer == 1, "Computer", "No computer")) %>% 
  left_join(., wb_codes) %>% 
  select(country, code, everything()) %>% 
  filter(potential == "Augmentation", 
         computer == "Computer") %>% 
  select(-country_label) %>% 
  rename(SEDLAC_computer_internet_at_work_imputed = Total) %>% 
  select(1:5)


# Read final results sheet for PIAAC
sedlac_computer_at_home <- read_excel("./Results/Results_AI_LAC.xlsx", skip = 3, sheet = "AI_COMPUHH") %>% 
  select(-1, -4) %>% 
  rename(code = 1,
         potential = 2,
         Total = 3) %>% 
  fill(code, .direction = "down") %>% 
  mutate(potential = ifelse(potential == "missing", "Other", potential)) %>%
  mutate(computer = str_split_fixed(potential, ", ", 2)[,2],
         potential = str_split_fixed(potential, ", ", 2)[,1]) %>% 
  select(code, potential, computer, everything()) %>% 
  mutate(potential = str_replace_all(potential, c("^Pr\\(" = "", "\\)$" = ""))) %>% 
  mutate(potential = str_replace_all(potential, "=1", "")) %>% 
  mutate(computer = str_replace_all(computer, "Computer at home=", "")) %>% 
  mutate(computer = str_replace_all(computer, "\\)$", "")) %>% 
  mutate(computer = ifelse(computer == 1, "Computer", "No computer")) %>% 
  #Append sedlac codes
  left_join(., wb_codes) %>% 
  select(country, code, everything()) %>% 
  filter(potential == "Augmentation", 
         computer == "Computer") %>% 
  select(-country_label) %>% 
  rename(SEDLAC_reported_computer_at_home = Total) %>% 
  select(1:5)


# Merge the three datasets
compare <- left_join(sedlac_imputed_computer_at_work, sedlac_imputed_computer_internet_at_work) %>% 
  left_join(.,sedlac_computer_at_home) %>% 
  pivot_longer(cols = SEDLAC_computer_at_work_imputed: SEDLAC_reported_computer_at_home,
               names_to = "source",
               values_to = "value") %>% 
  filter(value != 0) %>% 
  mutate(country = factor(country))

ggplot(compare,
       aes(x = country, y = value, fill = source)) +
  geom_col(position = position_dodge(width = 0.8), alpha = 0.9) +
  geom_text(aes(label = paste(round(value, 1), "%")),
            position = position_dodge(width = 1),   
            size = 3.5,
            angle = 90,
            vjust = 0.5,
            hjust = -0.2) +
  facet_wrap(~potential, ncol = 1) +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 11, vjust = 1),
        legend.position = "top") +
  scale_fill_manual(values = c("SEDLAC_computer_at_work_imputed" = "skyblue3",  
                               "SEDLAC_computer_internet_at_work_imputed" = "brown",
                               "SEDLAC_reported_computer_at_home" = "yellow3"),
                    labels = c("SEDLAC_computer_at_work_imputed" = "Computer at work (imputed from PIACC)", 
                               "SEDLAC_computer_internet_at_work_imputed" = "Computer & Internet at work (imputed from PIACC)",
                               "SEDLAC_reported_computer_at_home" = "Computer at home (reported in SEDLAC)"
                               )) +
  ylim(0,13) +
  guides(fill = guide_legend(reverse = F)) +
  labs(x = "",
       y = "Share of employment (%)")

# Save 
ggsave(filename = "FigureA5.jpeg",   
       path = "Figures", 
       width = 10, height = 4,        
       units = "in",                 
       dpi = 300) 

Figure A6. Jobs in augmentation category that do not use a computed at work: totals by country

# Data from calculations in line 980
total_jobs_aug_no_computer <- jobs_data2 %>% 
  select(1:5, country_augment_jobs_no_comp_sex) %>% 
  mutate(plot = ifelse(sex == "Male", -round(country_augment_jobs_no_comp_sex / 1000000, 2), round(country_augment_jobs_no_comp_sex/ 1000000, 2))) %>% 
  mutate(label_men = ifelse(sex == "Male", plot, NA),
         label_women = ifelse(sex == "Female", plot, NA))

ggplot(total_jobs_aug_no_computer,
       aes(x = reorder(country, country_augment_jobs_no_comp_sex), y = plot, fill = sex)) +
  geom_bar(stat = "identity", position = "identity") +
  coord_flip() +
  scale_y_continuous(labels = abs, limits = c(-3.7,3.7)) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(x = "", y = "Millions of jobs") +
  geom_text(aes(x = reorder(country, country_augment_jobs_no_comp_sex), y = plot,
              label = ifelse(!is.na(label_men), paste(abs(label_men), "M"), "")),
          size = 3,
          hjust = 1.1) +
  geom_text(aes(x = reorder(country, country_augment_jobs_no_comp_sex), y = plot,
              label = ifelse(!is.na(label_women), paste(abs(label_women), "M"), "")),
          size = 3,
          hjust = -0.2) +
  scale_fill_manual(values = c("#ff7f0e", "#1f77b4"), guide = guide_legend(reverse = TRUE)) 

# Save 
ggsave(filename = "FigureA6.jpeg",   
       path = "Figures", 
       width = 10, height = 4,        
       units = "in",                 
       dpi = 300) 

version