knitr::opts_chunk$set(comment = "#>",
                      echo = T,
                      results = 'show',
                      message = FALSE, 
                      warning = FALSE,
                      cache = F)
# Load necessary libraries
library(readxl)
library(tidyverse)
library(lubridate)
library(stringr)
library(tools)
# Read list of country codes used by WB
wb_codes <- read_excel("./bases/ILO/Country_codes_wb.xlsx")

## AUTOMATION
autom <- read_csv("./bases/ILO/Microdata_ILO/EMP_SEX_ISCO08_2D_Automation.csv") %>% 
mutate(obs_value = round(obs_value * 1000)) %>% # Convert to thousands
  # Move totals to a column
  group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
  mutate(total_empl = max(obs_value)) %>% 
  ungroup %>% 
  filter(classif2.label != "TOTAL") %>% 
  # Calculate total per ISCO 2-digit
  group_by(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>% 
  mutate(total_empl_by_isco = sum(obs_value)) %>% 
  ungroup() %>% 
  # Calculate share of each isco in total empl
  mutate(isco_share_in_empl = round(total_empl_by_isco / total_empl, 4)) %>% 
  # Calculate automation "yes" as share of each isco
  mutate(yes_share_of_isco = round(obs_value / total_empl_by_isco, 3),
         yes_share_of_isco = ifelse(classif2.label == "NO", NA, yes_share_of_isco)) %>% 
  # Select last survey available per country
  group_by(ref_area.label, source.label, ACRO) %>% 
  filter(time == max(time)) %>% 
  # Filter out LFS for Ecuador
  filter(ACRO != "ECV")

autom_check <- autom %>% distinct(ref_area.label, source.label, ACRO, time) 
    
  # By isco 2-digit
  automation_by_isco <- autom %>% 
    filter(classif2.label == "YES") %>% 
    select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, yes_share_of_isco) %>% 
    group_by(ref_area.label, source.label, ACRO) %>% 
    filter(time == max(time)) %>% 
    rename(automation_isco = yes_share_of_isco) %>% 
    group_by(ref_area.label, source.label, ACRO) %>% 
    filter(time == max(time)) %>% 
    ungroup() %>% 
    filter(ACRO != "ECV") 
    
  
  # by country
  automation_by_country <- autom %>% 
    filter(classif2.label == "YES")   %>% 
    select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, classif2.label, obs_value, total_empl) %>% 
    group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
    mutate(automation_total = sum(obs_value)) %>% 
    ungroup() %>% 
    mutate(automation_country = round(automation_total / total_empl, 3)) %>% 
    distinct(ref_area.label, source.label, ACRO, sex.label, time, automation_country) %>% 
    ungroup() 

# Final Automation DF
  automation_df <- autom %>% 
    distinct(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, total_empl, total_empl_by_isco, isco_share_in_empl) %>% 
    ungroup() %>% 
    left_join(.,automation_by_isco) %>% 
    left_join(., automation_by_country)
  
  
############## ############## ############## ##############
  
  
## AUGMENTATION
augment <- read_csv("./bases/ILO/Microdata_ILO/EMP_SEX_ISCO08_2D_Augmentation.csv") %>% 
  mutate(obs_value = round(obs_value * 1000)) %>% # Convert to thousands
  # Move totals to a column
  group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
  mutate(total_empl = max(obs_value)) %>% 
  ungroup() %>% 
  filter(classif2.label != "TOTAL") %>% 
  # Calculate total per ISCO 2-digit
  group_by(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>% 
  mutate(total_empl_by_isco = sum(obs_value)) %>% 
  ungroup() %>% 
  # Calculate share of each isco in total empl
  mutate(isco_share_in_empl = round(total_empl_by_isco / total_empl, 4)) %>% 
  # Calculate automation "yes" as share of each isco
  mutate(yes_share_of_isco = round(obs_value / total_empl_by_isco, 3),
         yes_share_of_isco = ifelse(classif2.label == "NO", NA, yes_share_of_isco)) %>% 
  # Select last survey available per country
  group_by(ref_area.label, source.label, ACRO) %>% 
  filter(time == max(time)) %>% 
  # Filter LFS for Ecuador
  filter(ACRO != "ECV")
  
  # By isco 2-digit
  augmentation_by_isco <- augment %>% 
    filter(classif2.label == "YES") %>% 
    select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, yes_share_of_isco) %>% 
    group_by(ref_area.label, source.label, ACRO) %>% 
    filter(time == max(time)) %>% 
    rename(augmentation_isco = yes_share_of_isco) %>% 
    group_by(ref_area.label, source.label, ACRO) %>% 
    filter(time == max(time)) %>% 
    ungroup() %>% 
    filter(ACRO != "ECV") 
  
  # by country
  augmentation_by_country <- augment %>% 
    filter(classif2.label == "YES") %>% 
    select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, classif2.label, obs_value, total_empl) %>% 
    group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
    mutate(augmentation_total = sum(obs_value)) %>% 
    ungroup() %>% 
    mutate(augmentation_country = round(augmentation_total / total_empl, 3)) %>% 
    distinct(ref_area.label, source.label, ACRO, sex.label, time, augmentation_country) 

# Final Augmentation DF
  augmentation_df <- augment %>% 
    distinct(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>% 
    ungroup() %>% 
    left_join(.,augmentation_by_isco) %>% 
    left_join(., augmentation_by_country)
    
  
###########  ###########  ###########  ###########  ###########    
  
  
## BIG UNKNOWN
unknown <- read_csv("./bases/ILO/Microdata_ILO/EMP_SEX_ISCO08_2D_Bigunknown.csv") %>% 
   mutate(obs_value = round(obs_value * 1000)) %>% # Convert to thousands
  # Move totals to a column
  group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
  mutate(total_empl = max(obs_value)) %>% 
  ungroup() %>% 
  filter(classif2.label != "TOTAL") %>% 
  # Calculate total per ISCO 2-digit
  group_by(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>% 
  mutate(total_empl_by_isco = sum(obs_value)) %>% 
  ungroup() %>% 
  # Calculate share of each isco in total empl
  mutate(isco_share_in_empl = round(total_empl_by_isco / total_empl, 4)) %>% 
  # Calculate automation "yes" as share of each isco
  mutate(yes_share_of_isco = round(obs_value / total_empl_by_isco, 3),
         yes_share_of_isco = ifelse(classif2.label == "NO", NA, yes_share_of_isco)) %>% 
  # Select last survey available per country
  group_by(ref_area.label, source.label, ACRO) %>% 
  filter(time == max(time)) %>% 
  # Filter LFS for Ecuador
  filter(ACRO != "ECV")

  
  # By isco 2-digit
  unknown_by_isco <- unknown %>% 
    filter(classif2.label == "YES") %>% 
    select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, yes_share_of_isco) %>% 
    group_by(ref_area.label, source.label, ACRO) %>% 
    filter(time == max(time)) %>% 
    rename(unknown_isco = yes_share_of_isco) %>% 
    group_by(ref_area.label, source.label, ACRO) %>% 
    filter(time == max(time)) %>% 
    ungroup() %>% 
    filter(ACRO != "ECV")

  
  # by country
  unknown_by_country <- unknown %>% 
    filter(classif2.label == "YES") %>% 
    select(ref_area.label, source.label, ACRO, sex.label, time, classif1.label, classif2.label, obs_value, total_empl) %>% 
    group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
    mutate(automation_total = sum(obs_value)) %>% 
    ungroup() %>% 
    mutate(unknown_country = round(automation_total / total_empl, 3)) %>% 
    distinct(ref_area.label, source.label, ACRO, sex.label, time, unknown_country) 


# Final Unknown DF
  unknown_df <- unknown %>% 
    distinct(ref_area.label, source.label, ACRO, sex.label, time, classif1.label) %>% 
    ungroup() %>% 
    left_join(.,unknown_by_isco) %>% 
    left_join(., unknown_by_country)
    


###########  ###########  ###########  ###########  ###########    
  
# FINAL DF with ISCO and country level scores  
final_df <- left_join(automation_df, augmentation_df) %>% 
    left_join(., unknown_df) 
  
test <- final_df %>% 
    mutate(check1 = automation_isco + augmentation_isco + unknown_isco) %>% 
  filter(check1> 1)
  
test2 <- final_df %>% 
  group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
  mutate(check2 = sum(total_empl_by_isco))
 
test3 <- final_df %>% 
  group_by(ref_area.label, source.label, ACRO, sex.label, time) %>% 
  mutate(check2 = sum(isco_share_in_empl))

final_df1 <- final_df %>% 
  rename(country = ref_area.label,
         survey = source.label,
         acro = ACRO,
         sex = sex.label,
         year = time,
         isco2d = classif1.label) %>% 
  left_join(.,wb_codes) %>% 
  select(country, code, everything())


final_df1 %>% write_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_data.csv")

# Combine data by country
by_country_sex <- final_df1 %>% 
  distinct(country, code, survey, acro, sex, year, automation_country, augmentation_country, unknown_country) %>% 
  ungroup() %>% 
  pivot_longer(cols = ends_with("_country"),
               names_to = "potential",
               values_to = "value") %>% 
  mutate(potential = str_replace(potential,"_country", ""),
         potential = toTitleCase(potential),
         potential = ifelse(potential == "Unknown", "Big Unknown", potential)) %>% 
  mutate(potential = factor(potential, 
                            levels = c("Automation",
                                      "Augmentation",
                                      "Big Unknown"))) 

# Create ILO small df
ilo_estimates <- by_country_sex %>% 
  filter(sex == "Total") %>% 
  select(country, code, potential, value) %>% 
  rename(distribution_percentage = value) %>% 
  mutate(source = "ILO") %>% 
  arrange(country) %>% 
  mutate(distribution_percentage = distribution_percentage * 100)

ilo_estimates %>% write_csv("./bases/ILO/Microdata_ILO/LATAM_FINAL_ILO_totals_small.csv")