# File-Name: GScholarScraper_2.R
# Date: 2011-11-12
# Author: Kay Cichini
# Email: kay.cichini@gmail.com
# Purpose: Extract and examine Google Scholar search results (publication titles)
# Packages used: RCurl, Rcpp, stringr, tm, wordcloud
# Licence: CC BY-SA-NC
#
# Arguments:
#
# (1) search.str:
# A search string as used in the Google Scholar search dialog.
# (!) Important: use "+" between the elements of the search string.
#
# (2) write.table:
# Logical, defining whether a table holding category (book, article, pdf),
# full titles & links to publications should be saved to the default system folder.
#
# Output: a data frame with word frequencies (publication titles), optionally a
# CSV file of the results, and a word cloud.
#
# Error reported: Error in substring(string, start, end) :
# invalid multibyte string at ' * Wi<6c>dlife
# may be resolved by: Sys.setlocale(locale = "C")
#
# Recent edits: 6-12-2011, resolved bug with no. of search results.

GScholarScraper <- function(search.str, write.table = FALSE){

  require(Rcpp)
  require(RCurl)
  require(stringr)
  require(tm)
  require(wordcloud)

  # Some explanations regarding the search string parameterization:
  # "&lr=lang_en" restricts the search to publications in English.
  # "&num=100" returns 100 results per page; strangely, one gets different
  # numbers of results when changing this parameter, so I use num = 100,
  # which gives the largest number of results.
  # "&as_vis=1" excludes citations. In this version of the function I
  # exclude them because they may bias the final word frequencies,
  # since citations often occur multiple times.
  # "&hl=en" defines the language used on the site.
  # "&as_sdt=1" returns only articles, excluding patents.

  # Get the number of search results by making a first request to Google Scholar,
  # retrieving results 1 to 100 from the first result page, which contains the
  # total no. of results somewhere:
  url <- paste("http://scholar.google.com/scholar?start=0&q=", search.str,
               "&hl=en&lr=lang_en&num=100&as_sdt=1&as_vis=1", sep = "")

  # ...I'm using urls like:
  # http://scholar.google.com/scholar?start=0&q=allintitle:+amphibians+richness+OR+diversity&hl=en&lr=lang_en&num=100&as_sdt=1&as_vis=1

  webpage <- getURL(url)
  html_str <- paste(webpage, collapse = "\n")

  # Find the html placeholders (2 alternatives!) for the number of results,
  # and pull the number; the trailing capture group is restricted to digits,
  # commas and periods so that it actually grabs the total count.
  # (!) Strangely, Google Scholar gives different numbers of results
  # depending on the start value, i.e., a change from 900 to 980 results
  # when changing start = 0 to start = 800.
  match_no.res <- str_match(html_str, "Results 1 - (.*?) of ([0-9,.]+)")
  no.res <- match_no.res[1, max(dim(match_no.res))]

  if (nchar(no.res) == 0 || is.na(no.res) ||
      nchar(gsub("[[:digit:][:punct:]]", "", no.res)) > 0) {
    match_no.res <- str_match(html_str, "Results 1 - (.*?) of about ([0-9,.]+)")
    no.res <- match_no.res[1, max(dim(match_no.res))]
  }

  # Remove punctuation (Google uses decimal commas):
  no.res <- as.integer(gsub("[[:punct:]]", "", no.res))

  # If there are no results, stop and throw an error message:
  if (is.na(no.res)) {
    stop("\n\n...There is no result for the submitted search string!")
  }
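  # Worked example (illustrative only, assuming the results header reads
  # something like "Results 1 - 100 of about 1,230"): the fallback pattern
  # above captures "1,230", gsub() strips the decimal comma, and as.integer()
  # yields no.res = 1230; the code below would then request
  # ceiling(1230/100) + 1 = 14 result pages (start = 0, 100, ..., 1300).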
  # Define the number of result pages to be requested subsequently.
  # pages.max = maximum number of pages (chunks of 100 results each)
  # to be submitted.
  # As noted above, no.res varies depending on the start value.
  # However, we use ceiling() and the change will very unlikely be greater
  # than 100, so we also add one extra page to be safe:
  pages.max <- ceiling(no.res/100) + 1

  # "start" as used in the url:
  start <- 100 * (1:pages.max) - 100

  # Collect the webpages as a list:
  urls <- paste("http://scholar.google.com/scholar?start=", start,
                "&q=", search.str,
                "&hl=en&lr=lang_en&num=100&as_sdt=1&as_vis=1", sep = "")
  webpages <- lapply(urls, getURL)

  # Paste all content together:
  html_str <- paste(unlist(webpages), collapse = "\n")

  # Pull titles between h3 tags:
  match_h3 <- str_match_all(html_str, "