getting started
# Start with a clean workspace: remove every object from the current environment.
# NOTE(review): rm(list = ls()) also deletes objects of anyone who sources this
# script into an existing session - consider dropping it.
rm(list=ls())
# Load the saved objects from the .RData file; presumably this provides
# `names_df`, which is used further down - TODO confirm.
load("./data/names_df_v20221005.RData")
packages
library(data.table)
library(tidyverse)
require(xml2)
require(rvest)
require(devtools)
require(scholar)
require(stringi)
andere strategie. alle nobiliary particles er af. dan zonder deze
door databank. dan indien geen info, via particles NL-identiteit (of
spaans of Duits of arabisch)
# Work on a copy of the input so names_df itself stays untouched.
lastname_df <- names_df
# Split every last name once at its first comma (at most two pieces):
# column 1 is the part before the comma, column 2 the part after it.
name_parts <- str_split(
  lastname_df$lastname,
  pattern = ",",
  n = 2,
  simplify = TRUE
)
lastname_df$lastname2 <- as.character(name_parts[, 1])
lastname_df$np <- as.character(name_parts[, 2])
# creating URLs: origin
#' Capitalise the first letter of a string and lower-case the rest.
#'
#' @param x Character vector.
#' @return Character vector of the same length as `x`, each element with an
#'   upper-case first character followed by the lower-cased remainder.
flaname <- function(x) {
  first_letter <- toupper(substring(x, 1, 1))
  remainder <- tolower(substring(x, 2, nchar(x)))
  paste0(first_letter, remainder)
}
# Normalise capitalisation of the bare last name (first letter upper case,
# remainder lower case).
lastname_df$lastname2 <- flaname(lastname_df$lastname2)

# Percent-encode the name before putting it into the query string, so that
# names containing spaces, apostrophes or non-ASCII letters still yield a
# valid URL. Plain ASCII names are left unchanged by URLencode().
# vapply() is used because URLencode() is not guaranteed to be vectorised in
# every R version.
name_query <- vapply(
  enc2utf8(lastname_df$lastname2),
  utils::URLencode,
  character(1),
  reserved = TRUE,
  USE.NAMES = FALSE
)

# URL of the "analyse en verklaring" page for each name; the same name is
# passed as both the gba_naam and the nfd_naam parameter.
lastname_df$name_origin <- paste0(
  "https://www.cbgfamilienamen.nl/nfb/detail_naam.php?gba_naam=",
  name_query,
  "&nfd_naam=",
  name_query,
  "&info=analyse+en+verklaring&operator=eq&taal="
)
https://www.cbgfamilienamen.nl/nfb/detail_naam.php?gba_naam=tolsma&gba_naam=Tolsma&nfd_naam=&info=analyse+en+verklaring&operator=eq&taal=
#lastname_df$name_origin[14] https://www.cbgfamilienamen.nl/nfb/detail_naam.php?gba_naam=Tolsma&nfd_naam=Tolsma&info=analyse+en+verklaring&operator=eq&taal=
#https://www.cbgfamilienamen.nl/nfb/detail_naam.php?gba_lcnaam=kraaykamp&gba_naam=Kraaykamp&nfd_naam=Kraaijkamp%20(y)&operator=eq&taal=
hier slaan we alles op
# Empty containers that the scraping loop fills: one parsed HTML page per
# name, and the tables extracted from that page.
name_originl <- vector("list", 0L)
table_originl <- vector("list", 0L)
# Pause in seconds passed to Sys.sleep() between two requests.
time <- 0.1
crucial scrape
loop
# Download and parse the origin page of every last name.
# seq_len() makes the loop a no-op for an empty data frame (1:0 would iterate
# over 1 and 0).
for (i in seq_len(nrow(lastname_df))) {
  print(i) # progress indicator
  Sys.sleep(time) # pause between requests

  # Reserve slot i in both lists before trying, so that a failed download
  # leaves a NULL placeholder and the lists keep one position per row of
  # lastname_df (also when the last rows fail).
  name_originl[i] <- list(NULL)
  table_originl[i] <- list(NULL)

  tryCatch(
    withCallingHandlers(
      {
        # `$col[[i]]` yields a plain character scalar for data.frames,
        # tibbles and data.tables alike.
        name_originl[[i]] <- read_html(lastname_df$name_origin[[i]])
        table_originl[[i]] <- name_originl[[i]] %>% html_table()
      },
      warning = function(w) {
        # Report the warning but keep going: a calling handler does not
        # unwind, so a page that merely warns is still stored.
        cat("WARNING:", conditionMessage(w), "\n") #WARNING message
        invokeRestart("muffleWarning")
      }
    ),
    error = function(e) {
      # Report and move on to the next name; slot i stays NULL.
      cat("Error:", conditionMessage(e), "\n") #ERROR message
    }
  )
}
en vanaf hier is het eigenlijk alleen maar opschonen.
# Plain text of every scraped page, one list element per entry of
# name_originl. Entries that are NULL (page could not be downloaded) become
# NA instead of stopping the script, and an empty name_originl simply gives
# an empty list.
origin_txt <- lapply(name_originl, function(page) {
  if (is.null(page)) {
    return(NA_character_)
  }
  as.character(html_text(page))
})
# Get out the relevant origin information from the xml lists:
# the text of the third <div> of each page, with tabs and newlines replaced
# by spaces.
# Pages that are NULL (download failed) or that contain fewer than three
# <div> nodes yield NA, so origin_ln keeps exactly one element per entry of
# name_originl and stays aligned with the rows the pages were requested for.
origin_ln <- lapply(name_originl, function(page) {
  if (is.null(page)) {
    return(NA_character_)
  }
  div_text <- page %>% html_nodes("div") %>% rvest::html_text()
  if (length(div_text) < 3) {
    return(NA_character_)
  }
  # Remove mess
  cleaned <- gsub("\\t", " ", div_text[[3]])
  gsub("\\n", " ", cleaned)
})
# Flatten nested structure of the origin information
#origin_ln <- rbind(flatten(origin_ln))
# Detaching the names and origin info for easier data handling
origin <- unlist(origin_ln)
# Keep the text between "varianten" and the copyright sign.
# str_extract_all() returns a list with one element per input string.
# NOTE(review): the calls below receive that list; the later comparison
# against the literal "character(0)" suggests the code relies on the list
# being coerced to character - confirm before changing this pipeline.
origin <- str_extract_all(origin, "varianten(.*?)©")
# Origin information is usually mentioned after "verklaring" or "kenmerken"
# Strip the boilerplate words surrounding the actual content.
origin <- str_remove_all(origin, "varianten")
origin <- str_remove_all(origin, "CBG Bronnen")
origin <- str_remove_all(origin, "catalogus")
origin <- str_remove_all(origin, "©")
# verklaring: everything before "kenmerken:"
verklaring <- str_remove_all(origin, "kenmerken:(.*?)$")
# kenmerken: from "kenmerken:" up to (not including) "specifieke componenten:"
kenmerken <- str_extract_all(origin, "kenmerken:(.*?)$")
kenmerken <- str_remove_all(kenmerken, "specifieke componenten:(.*?)$")
sc <- str_extract_all(origin, "specifieke componenten:(.*?)$") # Not directly relevant to us, but does mean that the name has a webpage
# Make into a neat dataframe with the names attached
verklaring <- trimws(verklaring, which = "both")
kenmerken <- trimws(kenmerken, which = "both")
sc <- trimws(sc, which = "both")
# Guard against misalignment: data.frame() recycles a shorter vector when the
# row count is a multiple of its length, which would silently attach origin
# text to the wrong names if some pages are missing from origin_ln.
scraped_lengths <- c(length(verklaring), length(kenmerken), length(sc))
if (any(scraped_lengths != nrow(lastname_df))) {
  stop(
    "Scraped origin information (", length(verklaring),
    " entries) does not match the number of names (", nrow(lastname_df), ").",
    call. = FALSE
  )
}
vk <- data.frame(lastname_df$id,lastname_df$lastname, verklaring, kenmerken, sc)
Separating names with
Dutch & unknown origin
Next, we identify those names for which no additional information was
found. This is important to distinguish Dutch names from names with
unknown origins.
- Dutch names: no label indicating that the name is Dutch, but some
other information available on name origin
- Unknown names: web page cannot be found, so origin information is
empty.
# Identify last names that could not be found.
# A field counts as missing when it is NA, or holds the literal
# "character(0)" that an empty match is turned into upstream; verklaring is
# also missing when it is the empty string.
# no_info is 1 only when verklaring, kenmerken and sc are all missing, and 0
# otherwise. The flags are computed explicitly instead of comparing a sum of
# string lengths with 3, which misfires on one-character text and turns NA
# input into an NA flag.
vk <- vk %>%
  mutate(
    .no_verklaring = is.na(verklaring) | verklaring %in% c("", "character(0)"),
    .no_kenmerken = is.na(kenmerken) | kenmerken %in% "character(0)",
    .no_sc = is.na(sc) | sc %in% "character(0)",
    no_info = as.numeric(.no_verklaring & .no_kenmerken & .no_sc),
    verklaring = ifelse(.no_verklaring, NA, verklaring),
    kenmerken = ifelse(.no_kenmerken, NA, kenmerken),
    # sc keeps its 0 placeholder for "no match" (it is not turned into NA),
    # exactly as before.
    sc = ifelse(sc == "character(0)", 0, sc)
  ) %>%
  select(-.no_verklaring, -.no_kenmerken, -.no_sc)
# If there is no text in verklaring or kenmerken, the name could not be found in the databases.
