1 getting started

# Start from a clean workspace: remove every object in the global environment.
rm(list=ls())
# Print the working directory; the relative "./data/..." paths used when
# saving results resolve against it.
getwd()
#> [1] "C:/Users/ninab/OneDrive/Documenten/GitHub/labjournal"

2 packages

library(data.table) 
library(tidyverse) 
require(xml2)
require(rvest)
require(devtools)
require(scholar)
require(stringi)

3 collect names

3.1 Computer Science!

# Download the computer science staff page (read_html() parses the HTML into
# an xml document) and keep the text of every <a> element found on it.
cs_staff <- read_html("https://www.cs.ru.nl/staff/index.html") %>%
  rvest::html_nodes("body") %>%
  xml2::xml_find_all("//a") %>%
  rvest::html_text()

3.1.1 Cleaning

# Declare the scraped strings as UTF-8, convert them to Latin-1 and then
# transliterate accented characters to plain ASCII.
Encoding(cs_staff) <- "UTF-8"
cs_staff <- iconv(cs_staff, from = "UTF-8", to = "LATIN1")
cs_staff <- stri_trans_general(cs_staff, id = "Latin-ASCII")

# Keep only link texts longer than one character. The test is vectorised, so
# an empty scrape stays empty instead of gaining a spurious NA entry. Strings
# that iconv() could not convert come back as NA and are dropped as well,
# because the later name handling cannot work with missing names.
cs_staff <- cs_staff[!is.na(cs_staff) & str_length(cs_staff) > 1]
# One row per scraped link text; the raw text is kept in column `cs_staff`.
cs_df <- data.frame(cs_staff)

# The last name is everything after the first ")".
cs_df$last_name <- str_split(cs_df$cs_staff, pattern = "\\)", n = 2, simplify = TRUE)[, 2]

# The first name is the text inside the first pair of brackets. str_extract()
# returns exactly one value per row, so an entry with several bracket groups
# (or a page without any brackets) can no longer break the column assignment.
# Rows without brackets get "" as before.
cs_df$first_name <- str_extract(cs_df$cs_staff, "(?<=\\().+?(?=\\))")
cs_df$first_name[is.na(cs_df$first_name) & !is.na(cs_df$cs_staff)] <- ""

# Everything to lower case, without surrounding whitespace.
cs_df$first_name <- trimws(tolower(cs_df$first_name))
cs_df$last_name <- trimws(tolower(cs_df$last_name))

# Keep only the first given name: cut at the first space, then at the first
# hyphen.
cs_df$first_name <- str_split(cs_df$first_name, pattern = " ", n = 2, simplify = TRUE)[, 1]
cs_df$first_name <- str_split(cs_df$first_name, pattern = "-", n = 2, simplify = TRUE)[, 1]

# Print the last names for visual inspection.
cs_df$last_name

# `lastname` starts as a copy of `last_name`; it is reformatted for the
# scraper in the next step, which works on `lastname_df`.
cs_df$lastname <- cs_df$last_name
lastname_df <- cs_df
# Put surname prefixes (voorvoegsels) in the form the scraper expects.
# A prefix is matched anywhere in the name, not only at its start.
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op de ", "op den ", "ten ",
  "ter ", "tes ", "van ", "van 't ", "van de ", "van der ", "van den ",
  "von der ", "ul "
)

surnames <- lastname_df$lastname

# TRUE where at least one prefix occurs; grepl() treats a missing name as
# "no prefix" instead of failing on it.
has_prefix <- grepl(paste(voorvoegsels, collapse = "|"), surnames)

# Names with a prefix: move the final word to the front, e.g.
# "van der berg" -> "berg, van der ". The leading part is cut off by
# position, so a final word that also occurs earlier in the name (e.g.
# "van der a") no longer splits the name into several pieces.
final_word <- sub("^.* ", "", surnames)
leading_part <- substr(surnames, 1L, nchar(surnames) - nchar(final_word))
surnames[has_prefix] <- paste0(final_word, ", ", leading_part)[has_prefix]

# Names without a prefix: strip double names by keeping only what follows the
# first space and then what follows the first hyphen.
# Note: double names that contain a prefix are not cleaned yet. TODO
no_prefix <- !has_prefix
surnames[no_prefix] <- sub("^[^ ]* ", "", surnames[no_prefix])
surnames[no_prefix] <- sub("^[^-]*-", "", surnames[no_prefix])

lastname_df$lastname <- surnames

# Drop trailing whitespace left over from the prefix reordering.
lastname_df$lastname <- trimws(lastname_df$lastname, which = "right")

# Label the table with university and field, then store it on disk.
cs_df <- lastname_df %>%
  mutate(affiliation = "RU", field = "computer science")
save(cs_df, file = "./data/cs_df_v20221005.RData")

3.2 Sociology-RU

# Start the sociology section with an empty workspace.
rm(list = ls())

# Download the sociology staff page and keep the text of every table cell.
soc_staff <- read_html("https://www.ru.nl/sociology/research/staff/") %>%
  rvest::html_nodes("body") %>%
  xml2::xml_find_all("//td") %>%
  rvest::html_text()

3.3 selecting only the odd rows with the names

# Flag the odd values of a numeric vector.
#
# x: a numeric vector.
# Returns a logical vector of the same length, TRUE where the value is not
# divisible by 2.
fodd <- function(x) {
  remainder <- x %% 2
  !(remainder == 0)
}

# The names sit in the odd-numbered table cells. seq_along() keeps the
# selection empty when nothing was scraped.
soc_names <- soc_staff[fodd(seq_along(soc_staff))]
soc_names <- stri_trans_general(soc_names, id = "Latin-ASCII")

# Drop the section headings that are mixed in with the names. A logical
# filter is used instead of negative row indices: `df[-integer(0), ]` would
# throw away every row when no heading is present.
section_headers <- c(
  "Staff:", "PhD:", "External PhD:", "Guest researchers:",
  "Other researchers:"
)
soc_df <- data.frame(soc_names = soc_names[!(soc_names %in% section_headers)])

3.3.1 cleaning

# The last name is everything before the first comma.
soc_df$last_name <- gsub(",.*$", "", soc_df$soc_names)

# The first name is the text inside the first pair of brackets. str_extract()
# returns exactly one value per row, so an entry with several bracket groups
# (or a page without any brackets) cannot break the column assignment. Rows
# without brackets get "" as before.
soc_df$first_name <- str_extract(soc_df$soc_names, "(?<=\\().+?(?=\\))")
soc_df$first_name[is.na(soc_df$first_name) & !is.na(soc_df$soc_names)] <- ""

# The surname prefix (tussenvoegsel) is everything between the last "." and
# the first "(". The substring has to run to the end of each string, i.e. to
# nchar(test); length(test) is the number of names and would cut the prefix
# short whenever a string is longer than that.
# NOTE(review): when an entry contains no "." at all, regexpr() returns -1
# and the whole text before "(" is used as prefix - confirm whether such
# entries exist.
test <- gsub("\\(.*$", "", soc_df$soc_names)
test <- substr(test, start = regexpr("\\.[^\\.]*$", test) + 2, stop = nchar(test))

# Hand-made fix for one entry that does not follow the usual layout.
soc_df$last_name <- gsub(" J. \\(Jansje\\) van MSc", "", soc_df$last_name)

# Everything to lower case, without surrounding whitespace.
soc_df$first_name <- trimws(tolower(soc_df$first_name))
soc_df$last_name <- trimws(tolower(soc_df$last_name))

# Keep only the first given name: cut at the first space, then at the first
# hyphen.
soc_df$first_name <- str_split(soc_df$first_name, pattern = " ", n = 2, simplify = TRUE)[, 1]
soc_df$first_name <- str_split(soc_df$first_name, pattern = "-", n = 2, simplify = TRUE)[, 1]

# Print the last names (still without prefix) for visual inspection.
soc_df$last_name

# Put the prefix in front of the last name. `lastname` starts as a copy of
# that full last name; it is reformatted for the scraper in the next step,
# which works on `lastname_df`.
soc_df$last_name <- paste0(test, soc_df$last_name)
soc_df$lastname <- soc_df$last_name
lastname_df <- soc_df
# Put surname prefixes (voorvoegsels) in the form the scraper expects.
# A prefix is matched anywhere in the name, not only at its start.
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op de ", "op den ", "ten ",
  "ter ", "tes ", "van ", "van 't ", "van de ", "van der ", "van den ",
  "von der ", "ul "
)

surnames <- lastname_df$lastname

# TRUE where at least one prefix occurs; grepl() treats a missing name as
# "no prefix" instead of failing on it.
has_prefix <- grepl(paste(voorvoegsels, collapse = "|"), surnames)

# Names with a prefix: move the final word to the front, e.g.
# "van der berg" -> "berg, van der ". The leading part is cut off by
# position, so a final word that also occurs earlier in the name (e.g.
# "van der a") no longer splits the name into several pieces.
final_word <- sub("^.* ", "", surnames)
leading_part <- substr(surnames, 1L, nchar(surnames) - nchar(final_word))
surnames[has_prefix] <- paste0(final_word, ", ", leading_part)[has_prefix]

# Names without a prefix: strip double names by keeping only what follows the
# first space and then what follows the first hyphen.
# Note: double names that contain a prefix are not cleaned yet. TODO
no_prefix <- !has_prefix
surnames[no_prefix] <- sub("^[^ ]* ", "", surnames[no_prefix])
surnames[no_prefix] <- sub("^[^-]*-", "", surnames[no_prefix])

lastname_df$lastname <- surnames

# Drop trailing whitespace left over from the prefix reordering.
lastname_df$lastname <- trimws(lastname_df$lastname, which = "right")

# Label the table with university and field, give the first column the
# shared name "names", then store the table on disk.
soc_df <- lastname_df %>%
  mutate(affiliation = "RU", field = "sociology")
colnames(soc_df)[1] <- "names"
save(soc_df, file = "./data/soc_df_v20221005.RData")

3.4 Data Science

# Start the data science section with an empty workspace.
rm(list = ls())
library(V8)

# Download the data science staff page and read the text of its <script>
# elements; the fourth one is taken as the script that holds the staff
# entries.
link <- "https://www.cs.ru.nl/das/staff/index.html"
namesjs <- read_html(link) %>%
  html_nodes("script") %>%
  html_text()
names <- as.character(namesjs[4])

# Pull out every "[...]" group from the script text.
names <- as.character(str_extract_all(names, "\\[(.*?)\\]", simplify = TRUE))

# Within those groups, pull out every single-quoted string and strip the
# quotes. The match matrix is flattened column by column.
names <- as.character(str_extract_all(names, "\\'(.*?)\\'", simplify = TRUE))
names <- gsub("'", "", names)

# Keep the first 95 values. head() simply returns fewer values when fewer
# exist, whereas names[1:95] would pad the vector with NA.
# NOTE(review): 95 is presumably the number of staff entries at the time of
# scraping - confirm when the page changes.
names <- head(names, 95)

3.4.1 cleaning

# Transliterate accented characters to plain ASCII.
names <- stri_trans_general(names, id = "Latin-ASCII")

# Split every name at its first space: the part before it is the first name,
# the rest is the last name. Both are lower-cased and stripped of surrounding
# whitespace. `lastname` starts as a copy of `last_name`.
cs_df <- data.frame(names) %>%
  mutate(
    first_name = trimws(tolower(str_split(names, pattern = " ", n = 2, simplify = TRUE)[, 1])),
    last_name = trimws(tolower(str_split(names, pattern = " ", n = 2, simplify = TRUE)[, 2])),
    lastname = last_name
  )

# The next step works on `lastname_df`; print its names for visual inspection.
lastname_df <- cs_df
lastname_df$lastname
# Put surname prefixes (voorvoegsels) in the form the scraper expects.
# A prefix is matched anywhere in the name, not only at its start.
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op de ", "op den ", "ten ",
  "ter ", "tes ", "van ", "van 't ", "van de ", "van der ", "van den ",
  "von der ", "ul "
)

surnames <- lastname_df$lastname

# TRUE where at least one prefix occurs; grepl() treats a missing name as
# "no prefix" instead of failing on it.
has_prefix <- grepl(paste(voorvoegsels, collapse = "|"), surnames)

# Names with a prefix: move the final word to the front, e.g.
# "van der berg" -> "berg, van der ". The leading part is cut off by
# position, so a final word that also occurs earlier in the name (e.g.
# "van der a") no longer splits the name into several pieces.
final_word <- sub("^.* ", "", surnames)
leading_part <- substr(surnames, 1L, nchar(surnames) - nchar(final_word))
surnames[has_prefix] <- paste0(final_word, ", ", leading_part)[has_prefix]

# Names without a prefix: strip double names by keeping only what follows the
# first space and then what follows the first hyphen.
# Note: double names that contain a prefix are not cleaned yet. TODO
no_prefix <- !has_prefix
surnames[no_prefix] <- sub("^[^ ]* ", "", surnames[no_prefix])
surnames[no_prefix] <- sub("^[^-]*-", "", surnames[no_prefix])

lastname_df$lastname <- surnames

# Drop trailing whitespace left over from the prefix reordering.
lastname_df$lastname <- trimws(lastname_df$lastname, which = "right")

# Label the table with university and field, then store it on disk.
ds_df <- lastname_df %>%
  mutate(affiliation = "RU", field = "data science")
save(ds_df, file = "./data/ds_df_v20221005.RData")

3.5 Sociology - UU

# Start the Utrecht sociology section with an empty workspace.
rm(list = ls())
library(RSelenium) # needs to be installed separately before first use
pjs <- wdman::phantomjs()

# Start a Selenium server with a phantomjs browser and take its client,
# which is the object that drives the browser.
dr <- rsDriver(browser = "phantomjs")
remdr <- dr[["client"]]

# Open the staff page; its tables are filled in by javascript.
remdr$navigate("https://www.uu.nl/organisatie/sociologie/medewerkers")

# Every element whose class attribute contains "profile" is one person.
tables <- remdr$findElements(using = "xpath", '//*[contains(concat( " ", @class, " " ), concat( " ", "profile", " " ))]')

# Turn each profile into a one-row data frame with one column per line of
# text. lapply() returns an empty list when no profile was found, where a
# 1:length() loop would try to read a non-existent first element.
persons <- lapply(tables, function(profile) {
  profile_text <- profile$getElementText()[[1]]
  text_lines <- strsplit(profile_text, split = "\\\n") # split on line breaks
  data.frame(t(data.frame(text_lines)))
})
persons <- bind_rows(persons) # stack the one-row data frames
rownames(persons) <- seq_len(nrow(persons)) # plain running row names

# Everything needed is now in `persons`: close the browser session and stop
# the Selenium server and the phantomjs process so they do not stay behind.
remdr$close()
dr[["server"]]$stop()
pjs$stop()

# The first text line of every profile holds the name.
names <- persons[, 1]

# Remove initials such as "A.B." (a capital letter followed by a dot).
names <- gsub("([A-Z]\\.)*", "", names)

names <- tolower(names)

# Remove brackets and academic titles. The entries are matched literally
# (fixed = TRUE): as regular expressions the dots were wildcards, so that
# "ir. " also deleted "irk " from "dirk jansen" and "dr. " deleted "dra "
# from "sandra ...". gsub() works on the whole vector, so one pass per entry
# suffices and an empty `names` stays empty.
removes <- c("(", ")", "prof. ", "dr. ", "drs. ", "msc", "bsc", "ir. ")
for (token in removes) {
  names <- gsub(token, "", names, fixed = TRUE)
}

# Transliterate accented characters to plain ASCII and trim whitespace.
names <- stri_trans_general(names, id = "Latin-ASCII")
names <- trimws(names)
# Split every name at its first space: the part before it is the first name,
# the rest is the last name. Both are lower-cased and stripped of surrounding
# whitespace. `lastname` starts as a copy of `last_name`.
cs_df <- data.frame(names) %>%
  mutate(
    first_name = trimws(tolower(str_split(names, pattern = " ", n = 2, simplify = TRUE)[, 1])),
    last_name = trimws(tolower(str_split(names, pattern = " ", n = 2, simplify = TRUE)[, 2])),
    lastname = last_name
  )

# The next step works on `lastname_df`; print its names for visual inspection.
lastname_df <- cs_df
lastname_df$lastname
# Put surname prefixes (voorvoegsels) in the form the scraper expects.
# A prefix is matched anywhere in the name, not only at its start.
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op de ", "op den ", "ten ",
  "ter ", "tes ", "van ", "van 't ", "van de ", "van der ", "van den ",
  "von der ", "ul "
)

surnames <- lastname_df$lastname

# TRUE where at least one prefix occurs; grepl() treats a missing name as
# "no prefix" instead of failing on it.
has_prefix <- grepl(paste(voorvoegsels, collapse = "|"), surnames)

# Names with a prefix: move the final word to the front, e.g.
# "van der berg" -> "berg, van der ". The leading part is cut off by
# position, so a final word that also occurs earlier in the name (e.g.
# "van der a") no longer splits the name into several pieces.
final_word <- sub("^.* ", "", surnames)
leading_part <- substr(surnames, 1L, nchar(surnames) - nchar(final_word))
surnames[has_prefix] <- paste0(final_word, ", ", leading_part)[has_prefix]

# Names without a prefix: strip double names by keeping only what follows the
# first space and then what follows the first hyphen.
# Note: double names that contain a prefix are not cleaned yet. TODO
no_prefix <- !has_prefix
surnames[no_prefix] <- sub("^[^ ]* ", "", surnames[no_prefix])
surnames[no_prefix] <- sub("^[^-]*-", "", surnames[no_prefix])

lastname_df$lastname <- surnames

# Drop trailing whitespace left over from the prefix reordering.
lastname_df$lastname <- trimws(lastname_df$lastname, which = "right")

# Keep only the first given name: cut at the first space, then at the first
# hyphen.
lastname_df$first_name <- lastname_df$first_name %>%
  str_split(pattern = " ", n = 2, simplify = TRUE) %>%
  .[, 1] %>%
  str_split(pattern = "-", n = 2, simplify = TRUE) %>%
  .[, 1]

# Label the table with university and field, then store it on disk.
socuu_df <- lastname_df %>%
  mutate(affiliation = "UU", field = "sociology")
save(socuu_df, file = "./data/socuu_df_v20221005.RData")

4 combine

# Start the combine step with an empty workspace and load the four tables
# saved by the sections above.
rm(list = ls())
load("./data/soc_df_v20221005.RData")
load("./data/cs_df_v20221005.RData")
load("./data/ds_df_v20221005.RData")
load("./data/socuu_df_v20221005.RData")

# The first column of the computer science table still carries its own name;
# rename it so that all four tables have the same columns.
names(cs_df)[1] <- "names"

# Stack the tables and put a running id in front. seq_len() gives an empty
# id for an empty table, where 1:nrow() would give c(1, 0).
names_df <- rbind(soc_df, cs_df, ds_df, socuu_df)
id <- seq_len(nrow(names_df))
names_df <- cbind(id, names_df)
save(names_df, file = "./data/names_df_v20221005.RData")
