Computer
Science!
# Fetch the Computer Science staff page. read_html() downloads the page and
# parses it into an XML document.
cs_page <- read_html("https://www.cs.ru.nl/staff/index.html")
# Starting from the <body> node, select the nodes matching the XPath "//a"
# (the links) and keep only their text.
cs_staff <- rvest::html_text(
  xml2::xml_find_all(rvest::html_nodes(cs_page, "body"), "//a")
)
Cleaning
# Bring the scraped link texts to plain ASCII: declare them as UTF-8, convert
# to Latin-1, then transliterate with the "Latin-ASCII" rule.
Encoding(cs_staff) <- "UTF-8"
cs_staff <- iconv(cs_staff, from = "UTF-8", to = "LATIN1")
cs_staff <- stri_trans_general(cs_staff, id = "Latin-ASCII")
# iconv() returns NA for input it cannot convert. Such entries would abort the
# later `if (sum(str_detect(...)) > 0)` tests, so they are dropped here, with a
# warning so the loss is visible.
if (anyNA(cs_staff)) {
  warning(
    sum(is.na(cs_staff)),
    " scraped entries could not be converted and were dropped",
    call. = FALSE
  )
}
# Keep only entries longer than one character. The test is vectorised, so an
# empty scrape yields an empty result instead of a spurious NA entry.
check <- !is.na(cs_staff) & str_length(cs_staff) > 1
cs_staff <- cs_staff[check]
cs_df <- data.frame(cs_staff)
# The last name seems to be everything after the first ")".
cs_df$last_name <- as.character(
  str_split(cs_df$cs_staff, pattern = "\\)", n = 2, simplify = TRUE)[, 2]
)
# The first name is the text inside the first pair of brackets. str_extract()
# returns exactly one value per entry; flattening the str_extract_all() matrix
# breaks the column assignment as soon as an entry has several bracketed parts
# or no entry has any.
first_name <- str_extract(cs_df$cs_staff, "(?<=\\().+?(?=\\))")
# Entries without brackets get an empty first name.
first_name[is.na(first_name) & !is.na(cs_df$cs_staff)] <- ""
# Lower-case and trim, then keep only the first given name: cut at the first
# space and then at the first hyphen.
first_name <- trimws(tolower(first_name), which = "both", whitespace = "[ \t\r\n]")
first_name <- str_split(first_name, pattern = " ", n = 2, simplify = TRUE)[, 1]
first_name <- str_split(first_name, pattern = "-", n = 2, simplify = TRUE)[, 1]
cs_df$first_name <- as.character(first_name)
# Lower-case and trim the last name (once is enough).
cs_df$last_name <- trimws(tolower(cs_df$last_name), which = "both", whitespace = "[ \t\r\n]")
# Show the parsed last names for a visual check.
cs_df$last_name
# `lastname` is the working copy that the following steps rewrite;
# `last_name` keeps the value as parsed.
cs_df$lastname <- cs_df$last_name
lastname_df <- cs_df
# Put surname prefixes (voorvoegsels) in the form the scraper expects:
# "van de ven" becomes "ven, van de".
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ",
  "van ", "van 't ", "van de ", "van der ", "van den ", "von der ", "op de ",
  "ul "
)
lastname <- lastname_df$lastname
# TRUE where a name contains at least one prefix. The prefixes contain no regex
# metacharacters, so a fixed-string search selects the same names as a regex
# search. grepl() returns FALSE for NA, so a missing name no longer aborts the
# script, and a zero-row table is handled as well.
has_prefix <- Reduce(`|`, lapply(voorvoegsels, grepl, x = lastname, fixed = TRUE))
# Move the final word to the front. The cut is made at the last space rather
# than by splitting on the text of the final word, which goes wrong when that
# text also occurs earlier in the name.
lastname[has_prefix] <- sub("^(.* )([^ ]*)$", "\\2, \\1", lastname[has_prefix])
# Remove double names: for names without a prefix keep what follows the first
# space, then what follows the first hyphen. Note: double names that carry a
# prefix are not cleaned yet. TODO
plain <- !has_prefix
lastname[plain] <- sub("^[^ ]* ", "", lastname[plain])
lastname[plain] <- sub("^[^-]*-", "", lastname[plain])
# The prefix rewrite leaves a trailing space ("ven, van de "); strip it.
lastname_df$lastname <- trimws(lastname, which = "right", whitespace = "[ \t\r\n]")
# Final Computer Science table: the cleaned names plus two constant label
# columns, written to an .RData file.
cs_df <- lastname_df
cs_df[["affiliation"]] <- "RU"
cs_df[["field"]] <- "computer science"
save(list = "cs_df", file = "./data/cs_df_v20221005.RData")
Selecting only the odd rows, which contain the names
# Flag the odd values of a numeric vector.
#   x: a numeric vector, e.g. a sequence of positions.
# Returns a logical vector as long as x: TRUE where x is not divisible by 2.
fodd <- function(x) {
  remainder <- x %% 2
  !(remainder == 0)
}
# Keep the entries at odd positions of soc_staff (the ones holding the names)
# and transliterate them to ASCII. seq_len() keeps this correct when soc_staff
# is empty.
nstaf <- length(soc_staff)
soc_names <- soc_staff[fodd(seq_len(nstaf))]
soc_names <- stri_trans_general(soc_names, id = "Latin-ASCII")
# Section headings on the page are not people, so remove them. A logical mask
# is used because indexing with -which(...) selects no rows at all when no
# heading is present (x[-integer(0)] is empty).
section_headers <- c(
  "Staff:", "PhD:", "External PhD:", "Guest researchers:", "Other researchers:"
)
soc_df <- data.frame(soc_names = soc_names[!(soc_names %in% section_headers)])
cleaning
# The last name seems to be everything before the comma.
soc_df$last_name <- gsub(",.*$", "", soc_df$soc_names)
# The first name is the text inside the first pair of brackets. str_extract()
# returns exactly one value per entry; flattening the str_extract_all() matrix
# breaks the column assignment as soon as an entry has several bracketed parts
# or no entry has any.
first_name <- str_extract(soc_df$soc_names, "(?<=\\().+?(?=\\))")
# Entries without brackets get an empty first name.
first_name[is.na(first_name) & !is.na(soc_df$soc_names)] <- ""
# The surname prefix (tussenvoegsel) is everything between the last "." and
# the first "(".
test <- gsub("\\(.*$", "", soc_df$soc_names)
# The end position must be the string length in characters: length(test) is
# the number of names, which truncates the prefix whenever it is smaller than
# the string length.
test <- substr(test, start = regexpr("\\.[^\\.]*$", test) + 2, stop = nchar(test))
# One-off repair of a single entry that does not follow the usual layout.
soc_df$last_name <- gsub(" J. \\(Jansje\\) van MSc", "", soc_df$last_name)
# Lower-case and trim, then keep only the first given name: cut at the first
# space and then at the first hyphen.
first_name <- trimws(tolower(first_name), which = "both", whitespace = "[ \t\r\n]")
first_name <- str_split(first_name, pattern = " ", n = 2, simplify = TRUE)[, 1]
first_name <- str_split(first_name, pattern = "-", n = 2, simplify = TRUE)[, 1]
soc_df$first_name <- as.character(first_name)
soc_df$last_name <- trimws(tolower(soc_df$last_name), which = "both", whitespace = "[ \t\r\n]")
# Show the parsed last names for a visual check.
soc_df$last_name
# Put the prefix in front of the last name ("van den " + "berg").
soc_df$last_name <- paste0(test, soc_df$last_name)
# `lastname` is the working copy that the following steps rewrite;
# `last_name` keeps the value as parsed.
soc_df$lastname <- soc_df$last_name
lastname_df <- soc_df
# Put surname prefixes (voorvoegsels) in the form the scraper expects:
# "van de ven" becomes "ven, van de".
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ",
  "van ", "van 't ", "van de ", "van der ", "van den ", "von der ", "op de ",
  "ul "
)
lastname <- lastname_df$lastname
# TRUE where a name contains at least one prefix. The prefixes contain no regex
# metacharacters, so a fixed-string search selects the same names as a regex
# search. grepl() returns FALSE for NA, so a missing name no longer aborts the
# script, and a zero-row table is handled as well.
has_prefix <- Reduce(`|`, lapply(voorvoegsels, grepl, x = lastname, fixed = TRUE))
# Move the final word to the front. The cut is made at the last space rather
# than by splitting on the text of the final word, which goes wrong when that
# text also occurs earlier in the name.
lastname[has_prefix] <- sub("^(.* )([^ ]*)$", "\\2, \\1", lastname[has_prefix])
# Remove double names: for names without a prefix keep what follows the first
# space, then what follows the first hyphen. Note: double names that carry a
# prefix are not cleaned yet. TODO
plain <- !has_prefix
lastname[plain] <- sub("^[^ ]* ", "", lastname[plain])
lastname[plain] <- sub("^[^-]*-", "", lastname[plain])
# The prefix rewrite leaves a trailing space ("ven, van de "); strip it.
lastname_df$lastname <- trimws(lastname, which = "right", whitespace = "[ \t\r\n]")
# Final sociology (RU) table: the cleaned names plus two constant label
# columns; the first column is renamed to "names" before saving.
soc_df <- lastname_df
soc_df[["affiliation"]] <- "RU"
soc_df[["field"]] <- "sociology"
colnames(soc_df)[1L] <- "names"
save(list = "soc_df", file = "./data/soc_df_v20221005.RData")
Data Science
# NOTE(review): this wipes the whole workspace, including objects created by
# earlier sections. Kept as in the original; consider removing it.
rm(list = ls())
library(V8)
link <- "https://www.cs.ru.nl/das/staff/index.html"
# Collect the text of every <script> element on the Data Science staff page.
namesjs <- read_html(link) %>% html_nodes("script") %>% html_text()
# The names are read from the 4th script. Fail with a clear message when the
# page has fewer scripts, instead of continuing with NA.
if (length(namesjs) < 4) {
  stop(
    "expected at least 4 <script> elements on ", link, ", found ",
    length(namesjs),
    call. = FALSE
  )
}
names <- as.character(namesjs[4])
# Pull every [...] group out of the script, then every '...' quoted string out
# of those groups, and drop the quote characters.
names <- as.character(str_extract_all(names, "\\[(.*?)\\]", simplify = TRUE))
names <- as.character(str_extract_all(names, "\\'(.*?)\\'", simplify = TRUE))
names <- gsub("'", "", names)
# Keep the first 95 entries. head() does not pad with NA when fewer than 95
# were found, unlike names[1:95].
names <- head(names, 95)
cleaning
# Transliterate the names to ASCII and start the table.
names <- stri_trans_general(names, id = "Latin-ASCII")
cs_df <- data.frame(names)
# Split each name once at its first space: column 1 holds the first name,
# column 2 everything after it.
name_parts <- str_split(cs_df$names, pattern = " ", n = 2, simplify = TRUE)
# Lower-case and trim both parts.
cs_df$first_name <- trimws(tolower(name_parts[, 1]), which = "both", whitespace = "[ \t\r\n]")
cs_df$last_name <- trimws(tolower(name_parts[, 2]), which = "both", whitespace = "[ \t\r\n]")
# `lastname` is the working copy that the following steps rewrite.
cs_df <- mutate(cs_df, lastname = last_name)
lastname_df <- cs_df
# Show the last names for a visual check.
lastname_df$lastname
# Put surname prefixes (voorvoegsels) in the form the scraper expects:
# "van de ven" becomes "ven, van de".
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ",
  "van ", "van 't ", "van de ", "van der ", "van den ", "von der ", "op de ",
  "ul "
)
lastname <- lastname_df$lastname
# TRUE where a name contains at least one prefix. The prefixes contain no regex
# metacharacters, so a fixed-string search selects the same names as a regex
# search. grepl() returns FALSE for NA, so a missing name no longer aborts the
# script, and a zero-row table is handled as well.
has_prefix <- Reduce(`|`, lapply(voorvoegsels, grepl, x = lastname, fixed = TRUE))
# Move the final word to the front. The cut is made at the last space rather
# than by splitting on the text of the final word, which goes wrong when that
# text also occurs earlier in the name.
lastname[has_prefix] <- sub("^(.* )([^ ]*)$", "\\2, \\1", lastname[has_prefix])
# Remove double names: for names without a prefix keep what follows the first
# space, then what follows the first hyphen. Note: double names that carry a
# prefix are not cleaned yet. TODO
plain <- !has_prefix
lastname[plain] <- sub("^[^ ]* ", "", lastname[plain])
lastname[plain] <- sub("^[^-]*-", "", lastname[plain])
# The prefix rewrite leaves a trailing space ("ven, van de "); strip it.
lastname_df$lastname <- trimws(lastname, which = "right", whitespace = "[ \t\r\n]")
# Final Data Science table: the cleaned names plus two constant label columns,
# written to an .RData file.
ds_df <- lastname_df
ds_df[["affiliation"]] <- "RU"
ds_df[["field"]] <- "data science"
save(list = "ds_df", file = "./data/ds_df_v20221005.RData")
Sociology - UU
# NOTE(review): this wipes the whole workspace, including objects created by
# earlier sections. Kept as in the original; consider removing it.
rm(list = ls())
library(RSelenium) # note that this is something new you need to install yourself first
pjs <- wdman::phantomjs() # never mind this
# Open a connection to browse
dr <- rsDriver(browser = "phantomjs")
# get a browser ready
remdr <- dr[["client"]]
# Scrape inside tryCatch() so that the browser session, the Selenium server and
# the phantomjs process are shut down even when scraping fails; the original
# left all three running.
persons <- tryCatch(
  {
    # Go to the site we want to scrape that uses javascript tables
    remdr$navigate("https://www.uu.nl/organisatie/sociologie/medewerkers")
    tables <- remdr$findElements(using = "xpath", '//*[contains(concat( " ", @class, " " ), concat( " ", "profile", " " ))]')
    if (length(tables) == 0) {
      stop("no elements with class 'profile' found on the staff page", call. = FALSE)
    }
    # Turn every profile element into a one-row data frame holding one text
    # line per column.
    lapply(tables, function(profile) {
      profile_text <- profile$getElementText()[[1]] # the element's text
      profile_lines <- strsplit(profile_text, split = "\\\n") # split on newlines, note the escape
      data.frame(t(data.frame(profile_lines)))
    })
  },
  finally = {
    remdr$close()
    dr[["server"]]$stop()
    pjs$stop()
  }
)
persons <- bind_rows(persons) # bind the rows out of that list
rownames(persons) <- seq_len(nrow(persons)) # rename rows
# The first column holds the names.
names <- persons[, 1]
# Strip initials: runs of a capital letter followed by a dot, such as "J.A.".
names <- gsub("([A-Z]\\.)*", "", names)
names <- tolower(names)
# Remove the brackets.
names <- gsub("[()]", "", names)
# Remove academic titles as literal text. Used as regular expressions, the "."
# in "dr. " or "ir. " matches any character, so "dra " inside "sandra de wit"
# would be deleted as well.
removes <- c("prof. ", "dr. ", "drs. ", "msc", "bsc", "ir. ")
for (title in removes) {
  names <- gsub(title, "", names, fixed = TRUE)
}
names <- stri_trans_general(names, id = "Latin-ASCII")
names <- trimws(names, which = "both", whitespace = "[ \t\r\n]")
# Start the table from the cleaned names.
cs_df <- data.frame(names)
# Split each name once at its first space: column 1 holds the first name,
# column 2 everything after it.
name_parts <- str_split(cs_df$names, pattern = " ", n = 2, simplify = TRUE)
# Lower-case and trim both parts.
cs_df$first_name <- trimws(tolower(name_parts[, 1]), which = "both", whitespace = "[ \t\r\n]")
cs_df$last_name <- trimws(tolower(name_parts[, 2]), which = "both", whitespace = "[ \t\r\n]")
# `lastname` is the working copy that the following steps rewrite.
cs_df <- mutate(cs_df, lastname = last_name)
lastname_df <- cs_df
# Show the last names for a visual check.
lastname_df$lastname
# Put surname prefixes (voorvoegsels) in the form the scraper expects:
# "van de ven" becomes "ven, van de".
voorvoegsels <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ",
  "el- ", "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ",
  "van ", "van 't ", "van de ", "van der ", "van den ", "von der ", "ul ",
  "op de "
)
lastname <- lastname_df$lastname
# TRUE where a name contains at least one prefix. The prefixes contain no regex
# metacharacters, so a fixed-string search selects the same names as a regex
# search. grepl() returns FALSE for NA, so a missing name no longer aborts the
# script, and a zero-row table is handled as well.
has_prefix <- Reduce(`|`, lapply(voorvoegsels, grepl, x = lastname, fixed = TRUE))
# Move the final word to the front. The cut is made at the last space rather
# than by splitting on the text of the final word, which goes wrong when that
# text also occurs earlier in the name.
lastname[has_prefix] <- sub("^(.* )([^ ]*)$", "\\2, \\1", lastname[has_prefix])
# Remove double names: for names without a prefix keep what follows the first
# space, then what follows the first hyphen. Note: double names that carry a
# prefix are not cleaned yet. TODO
plain <- !has_prefix
lastname[plain] <- sub("^[^ ]* ", "", lastname[plain])
lastname[plain] <- sub("^[^-]*-", "", lastname[plain])
# The prefix rewrite leaves a trailing space ("ven, van de "); strip it.
lastname_df$lastname <- trimws(lastname, which = "right", whitespace = "[ \t\r\n]")
# Keep only the first given name: cut at the first space, then at the first
# hyphen.
lastname_df$first_name <- str_split(lastname_df$first_name, pattern = " ", n = 2, simplify = TRUE)[, 1]
lastname_df$first_name <- str_split(lastname_df$first_name, pattern = "-", n = 2, simplify = TRUE)[, 1]
# Final sociology (UU) table: the cleaned names plus two constant label
# columns, written to an .RData file.
socuu_df <- lastname_df
socuu_df[["affiliation"]] <- "UU"
socuu_df[["field"]] <- "sociology"
save(list = "socuu_df", file = "./data/socuu_df_v20221005.RData")