1 getting started

# Empty the global environment, print the current working directory, and
# load the objects stored in the names_df_v20220106 file.
rm(list = ls())
getwd()
load(file = "./data/names_df_v20220106.RData")

2 packages

library(data.table) 
library(tidyverse) 
require(xml2)
require(rvest)
require(devtools)
require(scholar)
require(stringi)

2.1 adding gender to dataset

2.1.1 gender to names_df

# Add a `gender` column to names_df by scraping, for every first name, the
# name page on www.meertens.knaw.nl and comparing two counts from the first
# table on that page.
names_df <- names_df %>% mutate(firstname = first_name)
names_df$gender <- NA_character_

# Build one URL per first name.
# - `[[` always returns a plain vector; `names_df[, "firstname"]` returns a
#   one-column table for tibbles/data.tables, which paste0() would collapse
#   into a single string.
# - URLencode() escapes spaces and non-ASCII characters so that such names
#   still form a valid request path.
first_names <- enc2utf8(as.character(names_df[["firstname"]]))
encoded_names <- vapply(
  first_names,
  utils::URLencode,
  character(1),
  reserved = TRUE,
  USE.NAMES = FALSE
)
names_df$name_url <- paste0(
  "https://www.meertens.knaw.nl/nvb/naam/is/",
  encoded_names
)

# Raw pages and parsed tables are kept per row for later inspection.
n_names <- nrow(names_df)
name_list <- vector("list", n_names)
table_list <- vector("list", n_names)

for (i in seq_len(n_names)) {
  print(i)
  # Rows that already have a gender are not fetched again. Because `gender`
  # is reset above, this only matters when the loop is re-run on its own.
  if (!is.na(names_df$gender[i])) next

  # A failed request or an unexpected page layout is logged and leaves the
  # row at NA instead of aborting the whole run.
  tryCatch(
    {
      name_list[[i]] <- read_html(names_df$name_url[i])
      tables <- html_table(name_list[[i]])

      if (length(tables) > 0) {
        # "--" marks an empty count; treat it as zero.
        freq <- tables[[1]]
        freq[freq == "--"] <- "0"
        tables[[1]] <- freq
        table_list[[i]] <- tables

        # NOTE(review): rows 2 and 6 of column X3 are presumably the counts
        # for men and women respectively -- confirm against the page layout.
        n_male <- as.numeric(freq$X3[2])
        n_female <- as.numeric(freq$X3[6])
        male_more <- n_male > n_female

        # isTRUE()/isFALSE() keep a missing or non-numeric count from
        # crashing `if`; in that case gender stays NA.
        # NOTE(review): ties (including 0 vs 0) are labelled "female", as in
        # the original logic.
        if (isTRUE(male_more)) {
          names_df$gender[i] <- "male"
        } else if (isFALSE(male_more)) {
          names_df$gender[i] <- "female"
        }
      }
    },
    error = function(e) {
      cat("Error:", conditionMessage(e), "\n")
    }
  )

  # Checkpoint after every name so an interrupted run loses no progress.
  save(names_df, file = "./data/names_df_temp.RData")
}
# Add a second gender estimate (`gender2`) by querying genderizeAPI() from
# the genderizeR package for every first name.
# devtools::install_github("kalimu/genderizeR")
# library() fails immediately when the package is missing; require() would
# only return FALSE and every iteration below would then log an error.
library(genderizeR)
names_df$gender2 <- NA_character_

for (i in seq_len(nrow(names_df))) {
  print(i)
  # Rows that already have a value are skipped (relevant when the loop is
  # re-run on its own, since `gender2` is reset above).
  if (!is.na(names_df$gender2[i])) next

  tryCatch(
    withCallingHandlers(
      {
        # The first element of the API result is read as a table with
        # `gender` and `probability` fields.
        response <- genderizeAPI(names_df$first_name[i])[[1]]
        best <- which.max(response$probability)
        # No usable probability (empty or all-NA response): leave NA
        # rather than triggering a zero-length replacement error.
        if (length(best) == 1) {
          names_df$gender2[i] <- as.character(response$gender[best])
        }
      },
      # Log warnings but continue: an exiting handler would throw away the
      # result of the current name whenever any warning is raised.
      warning = function(w) {
        cat("WARNING:", conditionMessage(w), "\n") # WARNING message
        invokeRestart("muffleWarning")
      }
    ),
    error = function(e) {
      cat("Error:", conditionMessage(e), "\n") # ERROR message
    }
  )

  # Checkpoint after every name so an interrupted run loses no progress.
  save(names_df, file = "./data/names_df_temp.RData")
}
# Write the final names_df (now including the gender columns) to disk.
save(list = "names_df", file = "./data/names_df_v20221005.RData")
LS0tDQp0aXRsZTogIjIuIEdlbmRlciINCmF1dGhvcjogImJ5OiBOaW5hIEJyYW50ZW4iDQpiaWJsaW9ncmFwaHk6IHJlZmVyZW5jZXMuYmliDQotLS0NCg0KDQoNCmBgYHtyLCBpbnN0YWxsIHJlbW90ZWx5LCBnbG9iYWxzZXR0aW5ncywgZWNobz1GQUxTRSwgd2FybmluZz1GQUxTRSwgcmVzdWx0cz0naGlkZScsIGV2YWw9RkFMU0V9DQppbnN0YWxsLnBhY2thZ2VzKCJyZW1vdGVzIikNCnJlbW90ZXM6Omluc3RhbGxfZ2l0aHViKCJybGVzdXIva2xpcHB5IikNCmBgYCANCg0KYGBge3IsIGdsb2JhbHNldHRpbmdzLCBlY2hvPUZBTFNFLCB3YXJuaW5nPUZBTFNFLCByZXN1bHRzPSdoaWRlJ30NCmxpYnJhcnkoa25pdHIpDQpsaWJyYXJ5KHJnbCkNCg0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQ0Kb3B0c19jaHVuayRzZXQodGlkeS5vcHRzPWxpc3Qod2lkdGguY3V0b2ZmPTEwMCksdGlkeT1UUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSxjb21tZW50ID0gIiM+IiwgY2FjaGU9VFJVRSwgY2xhc3Muc291cmNlPWMoInRlc3QiKSwgY2xhc3Mub3V0cHV0PWMoInRlc3QyIikpDQpvcHRpb25zKHdpZHRoID0gMTAwKQ0KcmdsOjpzZXR1cEtuaXRyKCkNCg0KDQoNCmNvbG9yaXplIDwtIGZ1bmN0aW9uKHgsIGNvbG9yKSB7c3ByaW50ZigiPHNwYW4gc3R5bGU9J2NvbG9yOiAlczsnPiVzPC9zcGFuPiIsIGNvbG9yLCB4KSB9DQoNCmBgYA0KDQpgYGB7ciBrbGlwcHksIGVjaG89RkFMU0UsIGluY2x1ZGU9VFJVRX0NCmtsaXBweTo6a2xpcHB5KHBvc2l0aW9uID0gYygndG9wJywgJ3JpZ2h0JykpDQoja2xpcHB5OjprbGlwcHkoY29sb3IgPSAnZGFya3JlZCcpDQoja2xpcHB5OjprbGlwcHkodG9vbHRpcF9tZXNzYWdlID0gJ0NsaWNrIHRvIGNvcHknLCB0b29sdGlwX3N1Y2Nlc3MgPSAnRG9uZScpDQpgYGANCg0KDQojIGdldHRpbmcgc3RhcnRlZA0KDQpgYGB7ciwgZXZhbD1GQUxTRX0NCiNzdGFydCB3aXRoIGNsZWFuIHdvcmtzcGFjZSANCnJtKGxpc3Q9bHMoKSkNCmdldHdkKCkNCmxvYWQoIi4vZGF0YS9uYW1lc19kZl92MjAyMjAxMDYuUkRhdGEiKQ0KYGBgDQoNCiMgcGFja2FnZXMNCg0KYGBge3J9DQpsaWJyYXJ5KGRhdGEudGFibGUpIA0KbGlicmFyeSh0aWR5dmVyc2UpIA0KcmVxdWlyZSh4bWwyKQ0KcmVxdWlyZShydmVzdCkNCnJlcXVpcmUoZGV2dG9vbHMpDQpyZXF1aXJlKHNjaG9sYXIpDQpyZXF1aXJlKHN0cmluZ2kpDQoNCmBgYA0KDQoNCg0KIyMgYWRkaW5nIGdlbmRlciB0byBkYXRhc2V0DQoNCg0KDQoNCiMjIyBnZW5kZXIgdG8gY3NfZGYNCmBgYHtyLCBldmFsPUZBTFNFfQ0KbmFtZXNfZGYgJT4lIG11dGF0ZShmaXJzdG5hbWU9Zmlyc3RfbmFtZSkgLT4gbmFtZXNfZGYNCm5hbWVzX2RmJGdlbmRlciA8LSBOQQ0KDQojIG1ha2UgbGlua3MgdG8gc2NyYXBlDQpuYW1lc19kZiRuYW1lX3VybCA8LSBwYXN0ZTAoImh0dHBzOi8vd3d3Lm1lZXJ0ZW5zLmtuYXcubmwvbnZiL25hYW0vaXMvIiwgbmFtZXNf
ZGZbLCBjKCJmaXJzdG5hbWUiKV0pDQoNCm5hbWVfbGlzdCA8LSBsaXN0KCkNCnRhYmxlX2xpc3QgPC0gbGlzdCgpDQoNCmZvciAoaSBpbiAxOm5yb3cobmFtZXNfZGYpKSB7DQogICAgcHJpbnQoaSkNCiAgICBpZiAoIShpcy5uYShuYW1lc19kZiRnZW5kZXJbaV0pKSkgbmV4dA0KICAgIG5hbWVfbGlzdFtbaV1dIDwtIHJlYWRfaHRtbChuYW1lc19kZiRuYW1lX3VybFtpXSkNCiAgICAjIGV4dHJhY3QgbmFtZSBmcmVxdWVuY3kgdGFibGUgYW5kIGdlbmRlciBpbmZvDQogICAgaWYgKGxlbmd0aChuYW1lX2xpc3RbW2ldXSAlPiUgaHRtbF90YWJsZSgpKT4wKSB7DQogICAgICANCiAgICAgIHRhYmxlX2xpc3RbW2ldXSA8LSBuYW1lX2xpc3RbW2ldXSAlPiUgaHRtbF90YWJsZSgpDQogICAgICB0YWJsZV9saXN0W1tpXV1bWzFdXVt0YWJsZV9saXN0W1tpXV1bWzFdXT09Ii0tIl0gPC0gIjAiDQogICAgICBpZiAoYXMubnVtZXJpYyh0YWJsZV9saXN0W1tpXV1bWzFdXSRYM1syXSkgPiBhcy5udW1lcmljKHRhYmxlX2xpc3RbW2ldXVtbMV1dJFgzWzZdKSkgew0KICAgICAgICBuYW1lc19kZiRnZW5kZXJbaV0gPC0gIm1hbGUiIH0gZWxzZSB7DQogICAgICAgICAgbmFtZXNfZGYkZ2VuZGVyW2ldIDwtICJmZW1hbGUiDQogICAgICAgIH0NCiAgICB9DQogICAgc2F2ZShuYW1lc19kZiwgZmlsZT0iLi9kYXRhL25hbWVzX2RmX3RlbXAuUkRhdGEiKQ0KfQ0KDQoNCmBgYA0KDQpgYGB7ciwgZXZhbD1GQUxTRX0NCiNkZXZ0b29sczo6aW5zdGFsbF9naXRodWIoImthbGltdS9nZW5kZXJpemVSIikNCnJlcXVpcmUoZ2VuZGVyaXplUikNCm5hbWVzX2RmJGdlbmRlcjIgPC0gTkENCg0KZm9yIChpIGluIDE6bnJvdyhuYW1lc19kZikpIHsNCiAgICBwcmludChpKQ0KICAgIHRyeUNhdGNoKHsNCiAgICAgIGlmICghKGlzLm5hKG5hbWVzX2RmJGdlbmRlcjJbaV0pKSkgbmV4dA0KICAgICAgZ2VuZGVyIDwtIGdlbmRlcml6ZUFQSShuYW1lc19kZiRmaXJzdF9uYW1lW2ldKQ0KICAgICAgZ2VuZGVyIDwtIGdlbmRlcltbMV1dDQogICAgICBuYW1lc19kZiRnZW5kZXIyW2ldIDwtIGdlbmRlciRnZW5kZXJbd2hpY2gubWF4KGdlbmRlciRwcm9iYWJpbGl0eSldDQogICAgfSwgDQogICAgd2FybmluZyA9IGZ1bmN0aW9uKHcpIHsNCiAgICAgICAgY2F0KCJXQVJOSU5HOiIsIGNvbmRpdGlvbk1lc3NhZ2UodyksICJcbiIpICNXQVJOSU5HIG1lc3NhZ2UNCiAgICB9LA0KICAgIGVycm9yPWZ1bmN0aW9uKGUpew0KICAgICAgY2F0KCJFcnJvcjoiLCBjb25kaXRpb25NZXNzYWdlKGUpLCAiXG4iKSAjRVJST1IgbWVzc2FnZQ0KICAgIH0pICANCiAgICBzYXZlKG5hbWVzX2RmLCBmaWxlPSIuL2RhdGEvbmFtZXNfZGZfdGVtcC5SRGF0YSIpDQp9DQoNCg0KDQoNCmBgYA0KDQoNCmBgYHtyLCBldmFsPUZBTFNFfQ0Kc2F2ZShuYW1lc19kZiwgZmlsZT0iLi9kYXRhL25hbWVzX2RmX3YyMDIyMTAwNS5SRGF0YSIpDQpgYGANCg0K