library(knitr)

knitr::opts_chunk$set(echo = TRUE)
opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE, warning = FALSE, message = FALSE,comment = "#>", cache=TRUE, class.source=c("test"), class.output=c("test2"))
options(width = 100)
rgl::setupKnitr()



# Wrap `x` in an HTML <span> whose inline style sets the text colour to `color`.
# Both arguments are passed to sprintf(), so vectors are handled element-wise.
colorize <- function(x, color) {
  span_template <- "<span style='color: %s;'>%s</span>"
  sprintf(span_template, color, x)
}
klippy::klippy(position = c('top', 'right'))
#klippy::klippy(color = 'darkred')
#klippy::klippy(tooltip_message = 'Click to copy', tooltip_success = 'Done')
#start with clean workspace 
rm(list=ls())
getwd()
#> [1] "C:/Users/ninab/OneDrive/Documenten/GitHub/labjournal"

1 packages

#install.packages("data.table") 
library(data.table) # mainly for faster data handling
library(tidyverse) # I assume you already installed this one!
# install.packages("httr") # we don't need this for now
# require(httr)
#install.packages("xml2")
require(xml2)
#install.packages("rvest")
require(rvest)
#install.packages("devtools")
require(devtools)
# Note we're doing something different here. We're installing a *latest* version directly from GitHub
# This is because the released version of this packages contains some errors!
#devtools::install_github("jkeirstead/scholar") 


require(scholar)

#define workdirectory, note the double *backslashes* if you're on windows
# setwd("/yourpathhere)"

2 saving webpage

# Let's first get the staff page read_html is a function that simply extracts html webpages and
# puts them in xml format
soc_staff <- read_html("https://www.ru.nl/sociology/research/staff/")

#head(soc_staff)
#class(soc_staff)

3 making data with names

3.1 selecting the table with names

soc_staff <- soc_staff %>%
    rvest::html_nodes("body") %>%
    xml2::xml_find_all("//td") %>%
    rvest::html_text()

3.2 selecting only the odd rows with the names

# Flag the odd elements of a numeric vector.
# x: numeric vector. Returns a logical vector of the same length (TRUE = odd).
fodd <- function(x) {
  remainder <- x %% 2
  !(remainder == 0)
}

nstaf <- length(soc_staff)

soc_names <- soc_staff[fodd(1:nstaf)] 
head(soc_names)

3.3 selecting only the even rows with the expertise

soc_experts <- soc_staff[!fodd(1:nstaf)]
head(soc_experts)

3.4 combining names and expertise in soc_df

soc_df <- data.frame(cbind(soc_names, soc_experts)) 
save(soc_df, file = "names_experts.RData")
load("names_experts.RData")

4 Cleaning the dataset

4.1 delete rows without name info

# inspect again, and remove the rows we don't need (check for yourself to be certain!)

# Section headers that were scraped together with the names; these rows hold no person.
header_labels <- c("Staff:", "PhD:", "External PhD:", "Guest researchers:", "Other researchers:")

# Row numbers of the header rows (kept for inspection).
delrows <- which(soc_df$soc_names %in% header_labels)

# Filter with a logical mask rather than `soc_df[-delrows, ]`: when no header row is
# present, `delrows` is integer(0) and negative indexing with it would drop EVERY row.
soc_df <- soc_df[!(soc_df$soc_names %in% header_labels), ]

4.2 cleaning the names

# Last name seems to be everything before the comma
soc_df$last_name <- gsub(",.*$", "", soc_df$soc_names)

# first name is everything between brackets. Only the FIRST bracketed part of each name is
# taken: str_extract_all(..., simplify = TRUE) returns a matrix with one column per match,
# and flattening that matrix gives more (or fewer) values than rows as soon as one name has
# two bracketed parts (or no name has any), which makes the column assignment fail.
first_in_brackets <- str_extract(soc_df$soc_names, "(?<=\\().+?(?=\\))")
# names without brackets become "" (not NA), as before; a missing name stays NA
first_in_brackets[is.na(first_in_brackets) & !is.na(soc_df$soc_names)] <- ""
soc_df$first_name <- first_in_brackets
soc_df$last_name <- gsub(" J. \\(Jansje\\) van MSc", "", soc_df$last_name)
soc_df$first_name <- tolower(soc_df$first_name)  # everything to lower!
soc_df$last_name <- tolower(soc_df$last_name)

4.3 Lose double names

# trimws removes all whitespace before and after (if you specify 'both') a character string
soc_df$last_name <- trimws(soc_df$last_name, which = c("both"), whitespace = "[ \t\r\n]")
soc_df$first_name <- trimws(soc_df$first_name, which = c("both"), whitespace = "[ \t\r\n]")

soc_df$first_name <- as.character(str_split(soc_df$first_name, pattern=" ", n = 2, simplify = TRUE)[,1])
soc_df$first_name <- as.character(str_split(soc_df$first_name, pattern="-", n = 2, simplify = TRUE)[,1])

#removing strange characters
soc_df$first_name <- iconv(soc_df$first_name, from = 'UTF-8', to = 'ASCII//TRANSLIT')

soc_df$soc_experts <- trimws(soc_df$soc_experts, which = c("both"), whitespace = "[ \t\r\n]")
soc_df$soc_names <- trimws(soc_df$soc_names, which = c("both"), whitespace = "[ \t\r\n]")
soc_df$first_name
#>  [1] "ronald"     "katia"      "hidde"      "lonneke"    "lieselotte" "rob"        "maurice"   
#>  [8] "nella"      "saskia"     "margriet"   "remco"      "bas"        "judith"     "gerbert"   
#> [15] "roza"       "michael"    "peer"       "niels"      "jochem"     "ellen"      "mark"      
#> [22] "maarten"    "carlijn"    "rob"        "mustafa"    "aysegul"    "inge"       "thijmen"   
#> [29] "rachel"     "nik"        "renae"      "maikel"     "carly"      "anne"       "katrin"    
#> [36] "klara"      "marlou"     "sara"       "janos"      "jansje"     "elize"      "tijmen"    
#> [43] "elissa"     "carl"       "paul"       "malou"
save(soc_df, file="soc_df_s1.RData")

5 adding gender to dataset

5.1 fgender

fgender <- function(soc_df, me, file=NULL) {

####################################
# Author: Bas Hofstra, Anne Maaike Mulders, Jochem Tolsma
# Date:   13-10-2021, last edit: 22-09-2022
# Tasks:  - assign gender based on name
#         - Adapted from Rense Corten code April 2021
####################################


#Input:
#  - soc_df: a data.frame with a column named firstname
#  - me: a character vector introducing yourself: e.g. "J Tolsma, Radboud University"
#  - file: optional location and name of a file; when given, soc_df is saved there
#          after every processed name.
#Output:
#  - soc_df with two extra columns: name_url (the page that was scraped) and
#    gender ("male", "female", or NA when the frequency table could not be read).

#------------------------------------------------------------------------------------
# Load required packages (install them first when they are missing).
# The package name must be passed as a string when character.only = TRUE;
# an unquoted name would be evaluated as a variable and fail.
  for (pkg in c("tidyverse", "rvest", "polite")) {
    if (!require(pkg, character.only = TRUE)) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }

# make links to scrape
  soc_df$name_url <- paste0("https://www.meertens.knaw.nl/nvb/naam/is/", soc_df[, c("firstname")])

#------------------------------------------------------------------------------------
### 2: introduce to server ###

# Introduce myself to the server
  session <- bow("https://www.meertens.knaw.nl/nvb/naam/is", user_agent = me , delay = 1)

#------------------------------------------------------------------------------------
### 3: make function to get table from ###
  fnames <- function(link){
    name_session <- nod(session, path = link)
    name_page <- scrape(name_session)
    return(name_page)
  }

  name_list <- list()
  table_list <- list()
  soc_df$gender <- NA

  # seq_len() so that a data frame without rows skips the loop instead of running for 1 and 0
  for (i in seq_len(nrow(soc_df))) {
    print(i)
    name_list[[i]] <- fnames(soc_df[i, c("name_url")])
    # extract name frequency table and gender info
    table_list[[i]] <- name_list[[i]] %>% html_table()

    more_male <- NA
    if (length(table_list[[i]]) > 0) {
      # "--" marks an empty cell; treat it as a count of zero
      table_list[[i]][[1]][table_list[[i]][[1]]=="--"] <- "0"
      # Rows 2 and 6 of column X3 are compared as the male and the female count.
      # NOTE(review): these positions are taken over from the original code; confirm
      # them against the current page layout.
      more_male <- as.numeric(table_list[[i]][[1]]$X3[2]) > as.numeric(table_list[[i]][[1]]$X3[6])
    }

    if (isTRUE(more_male)) {
      soc_df$gender[i] <- "male"
    } else if (isFALSE(more_male)) {
      soc_df$gender[i] <- "female"
    } else {
      # no table, or no usable counts: keep NA instead of aborting the whole scrape
      message("No usable name frequency table for row ", i, "; gender left NA.")
    }

    # checkpoint after every name so an interrupted run keeps its results
    if (!is.null(file)) (save(soc_df, file=file))

  }
  return(soc_df)
}

5.2 gender to soc_df

soc_df %>% mutate(firstname=first_name) -> soc_df

soc_df$firstname
soc_df <- fgender(soc_df, me="Jochem Tolsma, RU/RUG")
save(soc_df, file="soc_df_s2.RData")
load("soc_df_s2.RData")

5.3 affiliation

# set affiliation to radboud, comes in handy for querying google scholar
soc_df$affiliation <- "radboud university"

6 harvesting data from google scholar

#require(scholar)

# Look up the Google Scholar ID of a researcher by name.
#
# Arguments:
#   last_name, first_name: search terms; at least one must be non-empty.
#   affiliation: optional string used to choose between several candidate profiles.
# Returns the `id` field of the selected profile, or NA when no ID is found.
# Callers should treat NA as "not found".
# Uses getOption("scholar_site") and get_scholar_resp(), so the scholar package
# must be loaded.
get_scholar_id_fix <- function (last_name = "", first_name = "", affiliation = NA)
{
  if (!any(nzchar(c(first_name, last_name))))
    stop("At least one of first and last name must be specified!")
  site <- getOption("scholar_site")
  url <- paste0(site, "/citations?view_op=search_authors&mauthors=",
                first_name, "+", last_name, "&hl=en&oi=ao")
  page <- get_scholar_resp(url)
  if (is.null(page))
    return(NA)
  aa <- httr::content(page, as = "text")

  # added by Bas Hofstra: bugfix for IDs that have a dash ("-")
  # regexpr() returns -1 when the marker is absent. Without this check substring()
  # would return the whole page and its first 19 characters would be used as an ID.
  marker_pos <- as.integer(regexpr(";user=", aa))
  if (length(marker_pos) != 1 || is.na(marker_pos) || marker_pos == -1) {
    message("No Scholar ID found.")
    return(NA)
  }
  ids <- substring(aa, marker_pos)
  ids <- substr(ids, 1, 19) # error prone, but unsure how to solve otherwise
  # strip the marker itself and one trailing punctuation character
  ids <- unique(gsub(";user=|[[:punct:]]$", "", ids))

  if (length(ids) > 1) {
    # Only the first marker is extracted above, so this branch is taken only if the
    # extraction is ever extended to several candidates.
    profiles <- lapply(ids, scholar::get_profile)
    if (is.na(affiliation)) {
      x_profile <- profiles[[1]]
      warning("Selecting first out of ", length(profiles),
              " candidate matches.")
    }
    else {
      which_profile <- sapply(profiles, function(x) {
        stringr::str_count(string = x$affiliation,
                           pattern = stringr::coll(affiliation, ignore_case = TRUE))
      })
      if (all(which_profile == 0)) {
        warning("No researcher found at the indicated affiliation.")
        return(NA)
      }
      else {
        # take the first match; `[[` with several indices would fail
        x_profile <- profiles[[which(which_profile != 0)[1]]]
      }
    }
  }
  else {
    x_profile <- scholar::get_profile(id = ids)
  }
  # `$` is only valid on a list-like profile; anything else means no usable profile
  if (!is.list(x_profile))
    return(NA)
  return(x_profile$id)
}

6.1 scholars id.

Remove staff members without scholar ids.

soc_df$gs_id <- ""

# seq_len() so that an empty data frame skips the loop instead of running for 1 and 0
for (i in seq_len(nrow(soc_df))) {
  print(i)
  # random pause of up to one second between requests
  time <- runif(1, 0, 1)
  Sys.sleep(time)

  tryCatch({
    soc_df[i, "gs_id"] <- get_scholar_id_fix(last_name = soc_df[i, "last_name"],
                                             first_name = soc_df[i, "first_name"],
                                             affiliation = soc_df[i, "affiliation"])
  }, error = function(e) {
    # continue on error, but print the error; gs_id keeps its "" placeholder
    cat("ERROR :", conditionMessage(e), "\n")
  })
}

# remove those without a scholar id from the df
# seems we're left with about 34 sociology staff members!
soc_df_copy <- soc_df #just to also have the data with staff without scholar_id
# get_scholar_id_fix() can return NA. `!NA == ""` is NA, and an NA row index would add
# an all-NA row, so missing ids are excluded explicitly.
soc_df <- soc_df[!is.na(soc_df$gs_id) & soc_df$gs_id != "", ]
save(soc_df, file="soc_df_s3.RData")
save(soc_df_copy, file="soc_df_copy.RData")
load("soc_df_s3.RData")

7 publications and profiles

We save the publications and info of the scholar profiles in new objects.

# One list element per staff member, filled inside the loop below.
soc_list_profiles <- list()
soc_list_publications <- list()

time <- 1 # waiting time before each request; grows after an error
i <- 1 # row of soc_df being processed; a while loop lets us decide when to move on

while (i <= nrow(soc_df)) {
  print(i)
  Sys.sleep(time)

  # tryCatch reports what happened: "advance" (done or skipped) or "retry".
  outcome <- tryCatch(
    {
      # fetch profile and publications for the scholar id in row i
      soc_list_profiles[[i]] <- get_profile(soc_df[i, "gs_id"])
      soc_list_publications[[i]] <- get_publications(soc_df[i, "gs_id"])
      # attach the id so the publications can be linked back to the staff member
      soc_list_publications[[i]][, c("gs_id")] <- soc_df[i, "gs_id"]
      "advance"
    },
    warning = function(w) {
      # a warning is reported, but we do want to continue with the next row
      cat("WARNING:", conditionMessage(w), "\n")
      "advance"
    },
    error = function(e) {
      # report the error together with the waiting time used for the next attempt
      cat("Error:", conditionMessage(e), "\n")
      cat("sleep time:", min(time * 10, 3600 * 2), "\n")
      cat("ik zit in loop", i)
      "retry"
    }
  )

  if (identical(outcome, "retry")) {
    # same row again after a ten times longer pause (at most two hours).
    # Ideally you also want some break option, and to save your data on a time out.
    time <- min(time * 10, 3600 * 2)
  } else {
    i <- i + 1
  }
}
# The 36 RU sociology scholars publish ~3000 papers
soc_df_publications <- bind_rows(soc_list_publications)
save(soc_df_publications, file="soc_df_publications.RData")
save(soc_list_profiles, file= "soc_list_profiles.RData")
load("soc_df_publications.RData")
load("soc_list_profiles.RData")

7.1 put the info of the profiles in our data set of staff members soc_df

soc_profiles_df <- list()

# seq_along() rather than 1:length(): with an empty profile list the loop must not run
for (i in seq_along(soc_list_profiles)) {
  # rows whose profile was never stored are NULL and are skipped
  if (!is.null(soc_list_profiles[[i]])) {
    # flatten the first eight profile fields into a named vector and turn it into a
    # one-row data frame (one column per field)
    profile_fields <- unlist(soc_list_profiles[[i]][1:8])
    soc_profiles_df[[i]] <- data.frame(t(profile_fields))
  }
}

#soc_profiles_df

soc_profiles_df2 <- bind_rows(soc_profiles_df)
soc_df <- left_join(soc_df, soc_profiles_df2, by = c(gs_id = "id"))  # merge data with soc_df
soc_df  # notice all the new information we were able to get from the scholar profiles!
save(soc_df, file="soc_df_s4.RData")
load("soc_df_s4.RData")

8 citation history

# get citation history of a scholar
# One citation-history data frame per staff member, filled inside the loop below.
soc_staff_cit <- list()

time <- 1 # waiting time before each request; grows after an error
i <- 1 # row of soc_df being processed; a while loop lets us decide when to move on

while (i <= nrow(soc_df)) {
  print(i)
  Sys.sleep(time)

  # tryCatch reports what happened: "advance" (done or skipped) or "retry".
  outcome <- tryCatch(
    {
      soc_staff_cit[[i]] <- get_citation_history(soc_df[i, "gs_id"])
      has_history <- nrow(soc_staff_cit[[i]]) > 0
      if (has_history) {
        # again attach the gs_id as an extra column
        soc_staff_cit[[i]][, c("gs_id")] <- soc_df[i, "gs_id"]
      }
      "advance"
    },
    warning = function(w) {
      # a warning is reported, but we do want to continue with the next row
      cat("WARNING:", conditionMessage(w), "\n")
      "advance"
    },
    error = function(e) {
      # report the error together with the waiting time used for the next attempt
      cat("Error:", conditionMessage(e), "\n")
      cat("sleep time:", min(time * 10, 3600 * 2), "\n")
      cat("ik zit in loop", i)
      "retry"
    }
  )

  if (identical(outcome, "retry")) {
    # same row again after a ten times longer pause (at most two hours).
    # Ideally you also want some break option, and to save your data on a time out.
    time <- min(time * 10, 3600 * 2)
  } else {
    i <- i + 1
  }
}
soc_staff_cit <- bind_rows(soc_staff_cit)
save(soc_staff_cit, file="soc_staff_cit.RData")
load("soc_staff_cit.RData")

9 collaborators

require(rvest)
require(xml2)
require(tidyverse)

# function to get collaborators and names from GS profiles
# Get the profile name and, optionally, the listed collaborators of a Google Scholar profile.
#
# Arguments:
#   gsid: Google Scholar id of the focal profile.
#   lookforcollabs: 1 to also scrape the list of colleagues, anything else for the name only.
# Returns a data frame with columns coauth, coauth_id, gs_id, name (one row per
# collaborator) when collaborators were found, otherwise one row with gs_id and name.
fcollabs <- function(gsid, lookforcollabs) {

  # profile page: we extract the profile name of that google scholar page
  htmlpage1 <- read_html(paste0("https://scholar.google.nl/citations?user=", gsid, "&hl=en"))
  profilename <- htmlpage1 %>% html_nodes(xpath = "//*/div[@id='gsc_prf_in']") %>% html_text()

  # Empty vectors mean "no collaborators". The earlier one-row placeholder data frame
  # could not be told apart from a profile with exactly one collaborator, whose
  # collaborator was therefore dropped.
  coauth_names <- character(0)
  coauth_ids <- character(0)

  if (lookforcollabs == 1) { # so if you want to look for collabs, set function to 1

    htmlpage2 <- read_html(paste0("https://scholar.google.com/citations?view_op=list_colleagues&hl=en&user=", gsid))
    coauth_names <- htmlpage2 %>% html_nodes(css="h3") %>% html_text() # get names

    coauth_links <- htmlpage2 %>% html_nodes("a") %>% html_attr("href") # get the link
    # keep every other link, starting with the first
    coauth_links <- coauth_links[seq_along(coauth_links) %% 2 > 0]
    # drop the first 22 characters of each link
    # NOTE(review): presumably this leaves only the collaborator's scholar id; confirm
    # against the current page markup
    coauth_ids <- substring(coauth_links, 23)

  }

  if (length(coauth_names) > 0) { # if there ARE collabs

    profilecollabs1 <- data.frame(coauth = coauth_names, stringsAsFactors = FALSE)
    profilecollabs1[, c("coauth_id")] <- coauth_ids
    profilecollabs1[, c("gs_id")] <- gsid #... add gs_ids of focal GS profile
    profilecollabs1[, c("name")] <- profilename #...and the profile name of GS profile attached

  } else {
    # if NOT looking for collabs, or none found, we only attach gs_id and profilename
    profilecollabs1 <- as.data.frame(cbind(gsid, profilename))
    names(profilecollabs1) <- c("gs_id", "name")

  }
  return(profilecollabs1)

}
# input a google scholar id and a 1 (if you want to find collabs) or 0 (only extracting names)

soc_collabs <- list()
# seq_len() so that an empty data frame skips the loop instead of running for 1 and 0
for (i in seq_len(nrow(soc_df))) {
    print(i)
    # random pause of up to one second between requests
    time <- runif(1, 0, 1)
    Sys.sleep(time)

    soc_collabs[[i]] <- fcollabs(soc_df[i, c("gs_id")], 1)

}

soc_collabs <- bind_rows(soc_collabs)  # bind rows, get the unique ones!
# NOTE(review): the column is selected by position; which column is third depends on the
# column order bind_rows() produces. Confirm this is the intended column (coauth_id?).
soc_collabs_unique <- unique(soc_collabs[, 3])
soc_collabs_unique <- soc_collabs_unique[!is.na(soc_collabs_unique)]
save(soc_collabs, file = "soc_df_collabs1.RData")  # you notice this takes a while, so we save the data here

10 network based on publications.

Deze tijdsintervallen zijn waarschijnlijk te groot!

load("soc_df_collabs1.RData")
library(stringr)

# Build an ego x alter matrix from last names and publication author strings.
#   last_names: one last name per staff member (used as a pattern in str_detect()).
#   authors:    author string of every selected publication.
# Cell [ego, alter] is TRUE when alter's last name occurs in more than one of the
# publications whose author string contains ego's last name.
fill_coauthor_network <- function(last_names, authors) {
  n_staff <- length(last_names)
  net <- matrix(NA, nrow = n_staff, ncol = n_staff)
  # seq_len() so that zero staff members give an empty matrix instead of an error
  for (ego in seq_len(n_staff)) {
    ego_authors <- authors[str_detect(authors, last_names[ego])] #publications of ego
    for (alter in seq_len(n_staff)) {
      net[ego, alter] <- sum(str_detect(ego_authors, last_names[alter])) > 1 #did alter publish with ego
    }
  }
  net
}

#adjacency matrix for the years 2001-2007
#select publications of the corresponding time era
pubs_sel <- soc_df_publications %>%
              mutate(author = tolower(author)) %>%
              filter(year>2000 & year<2008)

save(pubs_sel, file = "soc_pubs_sel_2001_2007.RData")

#fill the matrix
network2001_2007 <- fill_coauthor_network(soc_df$last_name, pubs_sel$author)

save(network2001_2007, file = "soc_network2001_2007.RData")
library(stringr)

#adjacency matrix for the years 2008-2014
#select publications of the corresponding time era
pubs_sel <- soc_df_publications %>%
              mutate(author = tolower(author)) %>%
              filter(year>2007 & year<2015)

save(pubs_sel, file = "soc_pubs_sel_2008_2014.RData")

#fill the matrix
network2008_2014 <- fill_coauthor_network(soc_df$last_name, pubs_sel$author)

save(network2008_2014, file = "soc_network2008_2014.RData")
library(stringr)

#adjacency matrix for the years 2015-2021
#select publications of the corresponding time era
pubs_sel <- soc_df_publications %>%
              mutate(author = tolower(author)) %>%
              filter(year>2014 & year<2022)

save(pubs_sel, file = "soc_pubs_sel_2015_2021.RData")

#fill the matrix
network2015_2021 <- fill_coauthor_network(soc_df$last_name, pubs_sel$author)

save(network2015_2021, file = "soc_network2015_2021.RData")
---
title: "Webscraping data sociology Radboud University"
#bibliography: references.bib
author: "Nina Branten"
---

```{r, echo=TRUE, warning=FALSE, results='hide'}
library(knitr)

knitr::opts_chunk$set(echo = TRUE)
opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE, warning = FALSE, message = FALSE,comment = "#>", cache=TRUE, class.source=c("test"), class.output=c("test2"))
options(width = 100)
rgl::setupKnitr()



# Wrap `x` in an HTML <span> whose inline style sets the text colour to `color`.
# Both arguments are passed to sprintf(), so vectors are handled element-wise.
colorize <- function(x, color) {
  span_template <- "<span style='color: %s;'>%s</span>"
  sprintf(span_template, color, x)
}

```

```{r klippy, echo=TRUE, include=TRUE}
klippy::klippy(position = c('top', 'right'))
#klippy::klippy(color = 'darkred')
#klippy::klippy(tooltip_message = 'Click to copy', tooltip_success = 'Done')
```



```{r}
#start with clean workspace 
rm(list=ls())
getwd()
```

# packages
```{r}
#install.packages("data.table") 
library(data.table) # mainly for faster data handling
library(tidyverse) # I assume you already installed this one!
# install.packages("httr") # we don't need this for now
# require(httr)
#install.packages("xml2")
require(xml2)
#install.packages("rvest")
require(rvest)
#install.packages("devtools")
require(devtools)
# Note we're doing something different here. We're installing a *latest* version directly from GitHub
# This is because the released version of this packages contains some errors!
#devtools::install_github("jkeirstead/scholar") 


require(scholar)

#define workdirectory, note the double *backslashes* if you're on windows
# setwd("/yourpathhere)"
```

# saving webpage

```{r, eval=FALSE}
# Let's first get the staff page read_html is a function that simply extracts html webpages and
# puts them in xml format
soc_staff <- read_html("https://www.ru.nl/sociology/research/staff/")

#head(soc_staff)
#class(soc_staff)
```

# making data with names

## selecting the table with names

```{r, eval=FALSE}
soc_staff <- soc_staff %>%
    rvest::html_nodes("body") %>%
    xml2::xml_find_all("//td") %>%
    rvest::html_text()
```

## selecting only the odd rows with the names
```{r, eval=FALSE}
# Flag the odd elements of a numeric vector.
# x: numeric vector. Returns a logical vector of the same length (TRUE = odd).
fodd <- function(x) {
  remainder <- x %% 2
  !(remainder == 0)
}

nstaf <- length(soc_staff)

soc_names <- soc_staff[fodd(1:nstaf)] 
head(soc_names)

```

## selecting only the even rows with the expertise
```{r, eval=FALSE}
soc_experts <- soc_staff[!fodd(1:nstaf)]
head(soc_experts)
```

## combining names and expertise in soc_df 
```{r, eval=FALSE}
soc_df <- data.frame(cbind(soc_names, soc_experts)) 
save(soc_df, file = "names_experts.RData")
```


```{r}
load("names_experts.RData")
```

# Cleaning the dataset  

## delete rows without name info
```{r}
# inspect again, and remove the rows we don't need (check for yourself to be certain!)

# Section headers that were scraped together with the names; these rows hold no person.
header_labels <- c("Staff:", "PhD:", "External PhD:", "Guest researchers:", "Other researchers:")

# Row numbers of the header rows (kept for inspection).
delrows <- which(soc_df$soc_names %in% header_labels)

# Filter with a logical mask rather than `soc_df[-delrows, ]`: when no header row is
# present, `delrows` is integer(0) and negative indexing with it would drop EVERY row.
soc_df <- soc_df[!(soc_df$soc_names %in% header_labels), ]
```

## cleaning the names  
```{r}
# Last name seems to be everything before the comma
soc_df$last_name <- gsub(",.*$", "", soc_df$soc_names)

# first name is everything between brackets. Only the FIRST bracketed part of each name is
# taken: str_extract_all(..., simplify = TRUE) returns a matrix with one column per match,
# and flattening that matrix gives more (or fewer) values than rows as soon as one name has
# two bracketed parts (or no name has any), which makes the column assignment fail.
first_in_brackets <- str_extract(soc_df$soc_names, "(?<=\\().+?(?=\\))")
# names without brackets become "" (not NA), as before; a missing name stays NA
first_in_brackets[is.na(first_in_brackets) & !is.na(soc_df$soc_names)] <- ""
soc_df$first_name <- first_in_brackets
```

```{r}
soc_df$last_name <- gsub(" J. \\(Jansje\\) van MSc", "", soc_df$last_name)
soc_df$first_name <- tolower(soc_df$first_name)  # everything to lower!
soc_df$last_name <- tolower(soc_df$last_name)
```

## Lose double names
```{r}
# trimws removes all whitespace before and after (if you specify 'both') a character string
soc_df$last_name <- trimws(soc_df$last_name, which = c("both"), whitespace = "[ \t\r\n]")
soc_df$first_name <- trimws(soc_df$first_name, which = c("both"), whitespace = "[ \t\r\n]")

soc_df$first_name <- as.character(str_split(soc_df$first_name, pattern=" ", n = 2, simplify = TRUE)[,1])
soc_df$first_name <- as.character(str_split(soc_df$first_name, pattern="-", n = 2, simplify = TRUE)[,1])

#removing strange characters
soc_df$first_name <- iconv(soc_df$first_name, from = 'UTF-8', to = 'ASCII//TRANSLIT')

soc_df$soc_experts <- trimws(soc_df$soc_experts, which = c("both"), whitespace = "[ \t\r\n]")
soc_df$soc_names <- trimws(soc_df$soc_names, which = c("both"), whitespace = "[ \t\r\n]")
soc_df$first_name
```
```{r}
save(soc_df, file="soc_df_s1.RData")
```


# adding gender to dataset

## fgender

```{r, eval=FALSE}
fgender <- function(soc_df, me, file=NULL) {

####################################
# Author: Bas Hofstra, Anne Maaike Mulders, Jochem Tolsma
# Date:   13-10-2021, last edit: 22-09-2022
# Tasks:  - assign gender based on name
#         - Adapted from Rense Corten code April 2021
####################################


#Input:
#  - soc_df: a data.frame with a column named firstname
#  - me: a character vector introducing yourself: e.g. "J Tolsma, Radboud University"
#  - file: optional location and name of a file; when given, soc_df is saved there
#          after every processed name.
#Output:
#  - soc_df with two extra columns: name_url (the page that was scraped) and
#    gender ("male", "female", or NA when the frequency table could not be read).

#------------------------------------------------------------------------------------
# Load required packages (install them first when they are missing).
# The package name must be passed as a string when character.only = TRUE;
# an unquoted name would be evaluated as a variable and fail.
  for (pkg in c("tidyverse", "rvest", "polite")) {
    if (!require(pkg, character.only = TRUE)) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }

# make links to scrape
  soc_df$name_url <- paste0("https://www.meertens.knaw.nl/nvb/naam/is/", soc_df[, c("firstname")])

#------------------------------------------------------------------------------------
### 2: introduce to server ###

# Introduce myself to the server
  session <- bow("https://www.meertens.knaw.nl/nvb/naam/is", user_agent = me , delay = 1)

#------------------------------------------------------------------------------------
### 3: make function to get table from ###
  fnames <- function(link){
    name_session <- nod(session, path = link)
    name_page <- scrape(name_session)
    return(name_page)
  }

  name_list <- list()
  table_list <- list()
  soc_df$gender <- NA

  # seq_len() so that a data frame without rows skips the loop instead of running for 1 and 0
  for (i in seq_len(nrow(soc_df))) {
    print(i)
    name_list[[i]] <- fnames(soc_df[i, c("name_url")])
    # extract name frequency table and gender info
    table_list[[i]] <- name_list[[i]] %>% html_table()

    more_male <- NA
    if (length(table_list[[i]]) > 0) {
      # "--" marks an empty cell; treat it as a count of zero
      table_list[[i]][[1]][table_list[[i]][[1]]=="--"] <- "0"
      # Rows 2 and 6 of column X3 are compared as the male and the female count.
      # NOTE(review): these positions are taken over from the original code; confirm
      # them against the current page layout.
      more_male <- as.numeric(table_list[[i]][[1]]$X3[2]) > as.numeric(table_list[[i]][[1]]$X3[6])
    }

    if (isTRUE(more_male)) {
      soc_df$gender[i] <- "male"
    } else if (isFALSE(more_male)) {
      soc_df$gender[i] <- "female"
    } else {
      # no table, or no usable counts: keep NA instead of aborting the whole scrape
      message("No usable name frequency table for row ", i, "; gender left NA.")
    }

    # checkpoint after every name so an interrupted run keeps its results
    if (!is.null(file)) (save(soc_df, file=file))

  }
  return(soc_df)
}
```

## gender to soc_df
```{r, eval=FALSE}
soc_df %>% mutate(firstname=first_name) -> soc_df

soc_df$firstname
soc_df <- fgender(soc_df, me="Jochem Tolsma, RU/RUG")
```


```{r, eval=FALSE}
save(soc_df, file="soc_df_s2.RData")
```

```{r}
load("soc_df_s2.RData")
```


## affiliation
```{r}
# set affiliation to radboud, comes in handy for querying google scholar
soc_df$affiliation <- "radboud university"
```

# harvesting data from google scholar

```{r, eval=FALSE}
#require(scholar)

# Look up the Google Scholar ID of a researcher by name.
#
# Arguments:
#   last_name, first_name: search terms; at least one must be non-empty.
#   affiliation: optional string used to choose between several candidate profiles.
# Returns the `id` field of the selected profile, or NA when no ID is found.
# Callers should treat NA as "not found".
# Uses getOption("scholar_site") and get_scholar_resp(), so the scholar package
# must be loaded.
get_scholar_id_fix <- function (last_name = "", first_name = "", affiliation = NA)
{
  if (!any(nzchar(c(first_name, last_name))))
    stop("At least one of first and last name must be specified!")
  site <- getOption("scholar_site")
  url <- paste0(site, "/citations?view_op=search_authors&mauthors=",
                first_name, "+", last_name, "&hl=en&oi=ao")
  page <- get_scholar_resp(url)
  if (is.null(page))
    return(NA)
  aa <- httr::content(page, as = "text")

  # added by Bas Hofstra: bugfix for IDs that have a dash ("-")
  # regexpr() returns -1 when the marker is absent. Without this check substring()
  # would return the whole page and its first 19 characters would be used as an ID.
  marker_pos <- as.integer(regexpr(";user=", aa))
  if (length(marker_pos) != 1 || is.na(marker_pos) || marker_pos == -1) {
    message("No Scholar ID found.")
    return(NA)
  }
  ids <- substring(aa, marker_pos)
  ids <- substr(ids, 1, 19) # error prone, but unsure how to solve otherwise
  # strip the marker itself and one trailing punctuation character
  ids <- unique(gsub(";user=|[[:punct:]]$", "", ids))

  if (length(ids) > 1) {
    # Only the first marker is extracted above, so this branch is taken only if the
    # extraction is ever extended to several candidates.
    profiles <- lapply(ids, scholar::get_profile)
    if (is.na(affiliation)) {
      x_profile <- profiles[[1]]
      warning("Selecting first out of ", length(profiles),
              " candidate matches.")
    }
    else {
      which_profile <- sapply(profiles, function(x) {
        stringr::str_count(string = x$affiliation,
                           pattern = stringr::coll(affiliation, ignore_case = TRUE))
      })
      if (all(which_profile == 0)) {
        warning("No researcher found at the indicated affiliation.")
        return(NA)
      }
      else {
        # take the first match; `[[` with several indices would fail
        x_profile <- profiles[[which(which_profile != 0)[1]]]
      }
    }
  }
  else {
    x_profile <- scholar::get_profile(id = ids)
  }
  # `$` is only valid on a list-like profile; anything else means no usable profile
  if (!is.list(x_profile))
    return(NA)
  return(x_profile$id)
}
```

## scholars id. 

Remove staff members without scholar ids. 

```{r, eval=FALSE}

soc_df$gs_id <- ""

# seq_len() so that an empty data frame skips the loop instead of running for 1 and 0
for (i in seq_len(nrow(soc_df))) {
  print(i)
  # random pause of up to one second between requests
  time <- runif(1, 0, 1)
  Sys.sleep(time)

  tryCatch({
    soc_df[i, "gs_id"] <- get_scholar_id_fix(last_name = soc_df[i, "last_name"],
                                             first_name = soc_df[i, "first_name"],
                                             affiliation = soc_df[i, "affiliation"])
  }, error = function(e) {
    # continue on error, but print the error; gs_id keeps its "" placeholder
    cat("ERROR :", conditionMessage(e), "\n")
  })
}

# remove those without a scholar id from the df
# seems we're left with about 34 sociology staff members!
soc_df_copy <- soc_df #just to also have the data with staff without scholar_id
# get_scholar_id_fix() can return NA. `!NA == ""` is NA, and an NA row index would add
# an all-NA row, so missing ids are excluded explicitly.
soc_df <- soc_df[!is.na(soc_df$gs_id) & soc_df$gs_id != "", ]

```


```{r, eval=FALSE}
save(soc_df, file="soc_df_s3.RData")
save(soc_df_copy, file="soc_df_copy.RData")

```

```{r}
load("soc_df_s3.RData")
```


# publications and profiles

We save the publications and info of the scholar profiles in new objects. 


```{r, eval=FALSE}
# One list element per staff member, filled inside the loop below.
soc_list_profiles <- list()
soc_list_publications <- list()

time <- 1 # waiting time before each request; grows after an error
i <- 1 # row of soc_df being processed; a while loop lets us decide when to move on

while (i <= nrow(soc_df)) {
  print(i)
  Sys.sleep(time)

  # tryCatch reports what happened: "advance" (done or skipped) or "retry".
  outcome <- tryCatch(
    {
      # fetch profile and publications for the scholar id in row i
      soc_list_profiles[[i]] <- get_profile(soc_df[i, "gs_id"])
      soc_list_publications[[i]] <- get_publications(soc_df[i, "gs_id"])
      # attach the id so the publications can be linked back to the staff member
      soc_list_publications[[i]][, c("gs_id")] <- soc_df[i, "gs_id"]
      "advance"
    },
    warning = function(w) {
      # a warning is reported, but we do want to continue with the next row
      cat("WARNING:", conditionMessage(w), "\n")
      "advance"
    },
    error = function(e) {
      # report the error together with the waiting time used for the next attempt
      cat("Error:", conditionMessage(e), "\n")
      cat("sleep time:", min(time * 10, 3600 * 2), "\n")
      cat("ik zit in loop", i)
      "retry"
    }
  )

  if (identical(outcome, "retry")) {
    # same row again after a ten times longer pause (at most two hours).
    # Ideally you also want some break option, and to save your data on a time out.
    time <- min(time * 10, 3600 * 2)
  } else {
    i <- i + 1
  }
}
```


```{r, eval=FALSE}
# The 36 RU sociology scholars publish ~3000 papers
soc_df_publications <- bind_rows(soc_list_publications)
```


```{r, eval=FALSE}
save(soc_df_publications, file="soc_df_publications.RData")
save(soc_list_profiles, file= "soc_list_profiles.RData")
```

```{r}
load("soc_df_publications.RData")
load("soc_list_profiles.RData")
```


## put the info of the profiles in our data set of staff members soc_df

```{r, eval=FALSE}
soc_profiles_df <- list()

# seq_along() rather than 1:length(): with an empty profile list the loop must not run
for (i in seq_along(soc_list_profiles)) {
  # rows whose profile was never stored are NULL and are skipped
  if (!is.null(soc_list_profiles[[i]])) {
    # flatten the first eight profile fields into a named vector and turn it into a
    # one-row data frame (one column per field)
    profile_fields <- unlist(soc_list_profiles[[i]][1:8])
    soc_profiles_df[[i]] <- data.frame(t(profile_fields))
  }
}

#soc_profiles_df

soc_profiles_df2 <- bind_rows(soc_profiles_df)
soc_df <- left_join(soc_df, soc_profiles_df2, by = c(gs_id = "id"))  # merge data with soc_df
soc_df  # notice all the new information we were able to get from the scholar profiles!

```


```{r, eval=FALSE}
# Store soc_df as an .RData file in the current working directory.
save(soc_df, file="soc_df_s4.RData")

```

```{r}
# Restore soc_df from the .RData file written by the save() call above.
load("soc_df_s4.RData")
```


# citation history

```{r, eval=FALSE}
# Collect the citation history of every scholar in soc_df, one request per
# scholar, and stack the results into a single data frame.
soc_staff_cit <- list()

# The waiting time and the loop iterator are ordinary variables defined outside
# the loop. That way the condition handlers below can change them; in a for
# loop the iterator cannot be changed from inside the loop.
time <- 1
i <- 1

while (i <= nrow(soc_df)) {
  print(i)
  Sys.sleep(time)

  tryCatch(
    expr = {
      soc_staff_cit[[i]] <- get_citation_history(soc_df[i, "gs_id"])
      if (nrow(soc_staff_cit[[i]]) > 0) {
        # attach the gs_id as an extra column so rows can be traced to a scholar
        soc_staff_cit[[i]][, "gs_id"] <- soc_df[i, "gs_id"]
      }
      i <- i + 1
    },
    # A warning is reported and we move on to the next scholar. `<<-` is needed
    # because i lives outside the handler function.
    warning = function(w) {
      cat("WARNING:", conditionMessage(w), "\n")
      i <<- i + 1
    },
    # An error multiplies the waiting time by 10 (capped at two hours) and
    # leaves i unchanged, so the same scholar is tried again after the longer
    # sleep. There is no retry limit and nothing is saved in between; ideally
    # you would add a break option and save the data on a time-out error.
    error = function(e) {
      time <<- min(time * 10, 3600 * 2)
      cat("Error:", conditionMessage(e), "\n")
      cat("sleep time:", time, "\n")
      cat("ik zit in loop", i)
    }
  )
}
soc_staff_cit <- bind_rows(soc_staff_cit)
```


```{r, eval=FALSE}
# Store soc_staff_cit as an .RData file in the current working directory.
save(soc_staff_cit, file="soc_staff_cit.RData")

```

```{r}
# Restore soc_staff_cit from the .RData file written by the save() call above.
load("soc_staff_cit.RData")
```


# collaborators
```{r, eval=FALSE}
require(rvest)
require(xml2)
require(tidyverse)

# function to get collaborators and names from GS profiles
#
# Arguments:
#   gsid            Google Scholar id; it is pasted into the profile URL and
#                   into the co-author list URL.
#   lookforcollabs  1 to also read the co-author list of the profile; any other
#                   value only extracts the profile name.
#
# Value:
#   A data frame. If co-authors were requested and found: one row per
#   co-author with columns coauth (name), coauth_id (id taken from the link),
#   gs_id (the focal id) and name (the focal profile name). Otherwise: a data
#   frame with only gs_id and name.
fcollabs <- function(gsid, lookforcollabs) {

  # profile page of the focal scholar; the profile name sits in div#gsc_prf_in
  profile_page <- read_html(paste0("https://scholar.google.nl/citations?user=", gsid, "&hl=en"))
  profilename <- profile_page %>%
    html_nodes(xpath = "//*/div[@id='gsc_prf_in']") %>%
    html_text()

  # Empty until co-authors are actually found. The former one-row placeholder
  # data frames could not be told apart from a profile with exactly one
  # co-author, so that co-author was silently dropped.
  coauth_names <- character(0)
  coauth_ids <- character(0)

  if (lookforcollabs == 1) { # so if you want to look for collabs, set function to 1

    colleagues_page <- read_html(paste0("https://scholar.google.com/citations?view_op=list_colleagues&hl=en&user=", gsid))
    coauth_names <- colleagues_page %>% html_nodes(css = "h3") %>% html_text() # get names

    coauth_links <- colleagues_page %>% html_nodes("a") %>% html_attr("href") # get the links
    # keep every other link, starting with the first one
    # NOTE(review): presumably each co-author has two links on the page - confirm
    coauth_links <- coauth_links[seq_along(coauth_links) %% 2 > 0]
    # drop the first 22 characters of the link, what remains is used as id
    # NOTE(review): assumes the link starts with "/citations?hl=en&user=" - confirm
    coauth_ids <- substring(coauth_links, 23)

  }

  if (length(coauth_names) > 0) { # if there ARE collabs

    profilecollabs <- as.data.frame(coauth_names)
    names(profilecollabs)[1] <- "coauth"
    profilecollabs[, c("coauth_id")] <- coauth_ids
    profilecollabs[, c("gs_id")] <- gsid # add gs_id of the focal GS profile...
    profilecollabs[, c("name")] <- profilename # ...and its profile name
    profilecollabs <- profilecollabs[, c("coauth", "coauth_id", "gs_id", "name")]

  } else {
    # not looking for collabs, or none found: only gs_id and profile name
    profilecollabs <- as.data.frame(cbind(gsid, profilename))
    names(profilecollabs) <- c("gs_id", "name")

  }
  profilecollabs

}
```

```{r, eval=FALSE}
# input a google scholar id and a 1 (if you want to find collabs) or 0 (only extracting names)

# Collect the co-authors of every scholar in soc_df.
soc_collabs <- list()
# seq_len() instead of 1:nrow(): nothing is requested when soc_df has no rows
for (i in seq_len(nrow(soc_df))) {
  print(i)
  time <- runif(1, 0, 1) # random pause of at most one second between requests
  Sys.sleep(time)

  soc_collabs[[i]] <- fcollabs(soc_df[i, c("gs_id")], 1)

}

soc_collabs <- bind_rows(soc_collabs)  # bind rows, get the unique ones!
# Select the co-author ids by name. bind_rows() orders columns by first
# appearance, so the third column was gs_id or coauth depending on whether the
# first scholar had co-authors.
# NOTE(review): coauth_id is presumably the column meant by the former
# positional index 3 - confirm.
soc_collabs_unique <- unique(soc_collabs[["coauth_id"]])
soc_collabs_unique <- soc_collabs_unique[!is.na(soc_collabs_unique)]
save(soc_collabs, file = "soc_df_collabs1.RData")  # you notice this takes a while, so we save the data here
```
# network based on publications. 

These time intervals are probably too large!

```{r}
# Restore soc_collabs from the .RData file in the current working directory.
load("soc_df_collabs1.RData")
```


```{r}
library(stringr)

# Build a staff-by-staff matrix for the years 2001-2007: cell [ego, alter]
# tells whether alter's last name shows up in the author strings of ego's
# publications of that period.

# empty adjacency matrix for the years 2001-2007
network2001_2007 <- matrix(NA, nrow = nrow(soc_df), ncol = nrow(soc_df))

# select publications of the corresponding time era
# (author strings are lowercased before matching)
pubs_sel <- soc_df_publications %>%
  mutate(author = tolower(author)) %>%
  filter(year > 2000 & year < 2008)

save(pubs_sel, file = "soc_pubs_sel_2001_2007.RData")

# fill the matrix
# seq_len() instead of 1:nrow(): with an empty soc_df the loops are skipped
# instead of writing outside the 0 x 0 matrix.
for (ego in seq_len(nrow(soc_df))) {
  name_ego <- soc_df$last_name[ego] # which ego?
  # publications of ego; last_name is used as a regular expression
  pubs_sel2 <- pubs_sel[str_detect(pubs_sel$author, name_ego), ]
  for (alter in seq_len(nrow(soc_df))) {
    name_alter <- soc_df$last_name[alter] # which alter?
    # did alter publish with ego
    # NOTE(review): "> 1" only yields TRUE from two matching publications
    # onwards; use "> 0" if a single joint publication should count - confirm.
    # The diagonal (ego == alter) is filled by the same rule.
    network2001_2007[ego, alter] <- sum(str_detect(pubs_sel2$author, name_alter)) > 1
  }
}

save(network2001_2007, file = "soc_network2001_2007.RData")

```

```{r}
library(stringr)

# Build a staff-by-staff matrix for the years 2008-2014: cell [ego, alter]
# tells whether alter's last name shows up in the author strings of ego's
# publications of that period.

# empty adjacency matrix for the years 2008-2014
network2008_2014 <- matrix(NA, nrow = nrow(soc_df), ncol = nrow(soc_df))

# select publications of the corresponding time era
# (author strings are lowercased before matching)
pubs_sel <- soc_df_publications %>%
  mutate(author = tolower(author)) %>%
  filter(year > 2007 & year < 2015)

save(pubs_sel, file = "soc_pubs_sel_2008_2014.RData")

# fill the matrix
# seq_len() instead of 1:nrow(): with an empty soc_df the loops are skipped
# instead of writing outside the 0 x 0 matrix.
for (ego in seq_len(nrow(soc_df))) {
  name_ego <- soc_df$last_name[ego] # which ego?
  # publications of ego; last_name is used as a regular expression
  pubs_sel2 <- pubs_sel[str_detect(pubs_sel$author, name_ego), ]
  for (alter in seq_len(nrow(soc_df))) {
    name_alter <- soc_df$last_name[alter] # which alter?
    # did alter publish with ego
    # NOTE(review): "> 1" only yields TRUE from two matching publications
    # onwards; use "> 0" if a single joint publication should count - confirm.
    # The diagonal (ego == alter) is filled by the same rule.
    network2008_2014[ego, alter] <- sum(str_detect(pubs_sel2$author, name_alter)) > 1
  }
}

save(network2008_2014, file = "soc_network2008_2014.RData")

```

```{r}
library(stringr)

# Build a staff-by-staff matrix for the years 2015-2021: cell [ego, alter]
# tells whether alter's last name shows up in the author strings of ego's
# publications of that period.

# empty adjacency matrix for the years 2015-2021
network2015_2021 <- matrix(NA, nrow = nrow(soc_df), ncol = nrow(soc_df))

# select publications of the corresponding time era
# (author strings are lowercased before matching)
pubs_sel <- soc_df_publications %>%
  mutate(author = tolower(author)) %>%
  filter(year > 2014 & year < 2022)

save(pubs_sel, file = "soc_pubs_sel_2015_2021.RData")

# fill the matrix
# seq_len() instead of 1:nrow(): with an empty soc_df the loops are skipped
# instead of writing outside the 0 x 0 matrix.
for (ego in seq_len(nrow(soc_df))) {
  name_ego <- soc_df$last_name[ego] # which ego?
  # publications of ego; last_name is used as a regular expression
  pubs_sel2 <- pubs_sel[str_detect(pubs_sel$author, name_ego), ]
  for (alter in seq_len(nrow(soc_df))) {
    name_alter <- soc_df$last_name[alter] # which alter?
    # did alter publish with ego
    # NOTE(review): "> 1" only yields TRUE from two matching publications
    # onwards; use "> 0" if a single joint publication should count - confirm.
    # The diagonal (ego == alter) is filled by the same rule.
    network2015_2021[ego, alter] <- sum(str_detect(pubs_sel2$author, name_alter)) > 1
  }
}

save(network2015_2021, file = "soc_network2015_2021.RData")
```

