From 6cc1f6c7994d5ce7103123e545b764d44ff092fa Mon Sep 17 00:00:00 2001 From: trangdata Date: Fri, 20 Oct 2023 13:49:05 -0400 Subject: [PATCH] improve paging control, resolves #166 --- R/oa_fetch.R | 38 ++++++++++++++++++------- R/oa_snowball.R | 2 +- man/oa_fetch.Rd | 12 ++++++++ man/oa_request.Rd | 10 +++++-- tests/testthat/test-oa_fetch.R | 51 ++++++++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 13 deletions(-) diff --git a/R/oa_fetch.R b/R/oa_fetch.R index 1489ec4..40ceaec 100644 --- a/R/oa_fetch.R +++ b/R/oa_fetch.R @@ -67,6 +67,8 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten abstract = TRUE, endpoint = "https://api.openalex.org", per_page = 200, + paging = NULL, + pages = NULL, count_only = FALSE, mailto = oa_email(), api_key = oa_apikey(), @@ -96,7 +98,9 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten if (!is.null(options$sample) && (options$sample > per_page)) { paging <- "page" - } else { + } else if (!is.null(options$page)){ + paging <- "page" + } else if (is.null(paging)){ paging <- "cursor" } @@ -122,6 +126,7 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten ), per_page = per_page, paging = paging, + pages = pages, count_only = count_only, mailto = mailto, api_key = api_key, @@ -130,7 +135,6 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten } if (length(final_res[[1]]) == 0) { # || is.null(final_res[[1]][[1]]$id) - warning("No collection found!") return(NULL) } @@ -161,8 +165,12 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten #' Defaults to 200. #' @param paging Character. #' Either "cursor" for cursor paging or "page" for basic paging. -#' When used with options$sample, please set `paging = "page"` -#' to avoid duplicates. 
+#' When used with `options$sample` and/or `pages`, +#' paging is also automatically set to basic paging: `paging = "page"` +#' to avoid duplicates and get the right page. +#' See https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging. +#' @param pages Integer vector. +#' The range of pages to return. If NULL, return all pages. #' @param count_only Logical. #' If TRUE, the function returns only the number of item matching the query. #' Defaults to FALSE. @@ -303,6 +311,7 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten oa_request <- function(query_url, per_page = 200, paging = "cursor", + pages = NULL, count_only = FALSE, mailto = oa_email(), api_key = oa_apikey(), @@ -337,13 +346,22 @@ oa_request <- function(query_url, } else { return(res) } + n_items <- res$meta$count + n_pages <- ceiling(n_items / per_page) ## number of pages - n_items <- res$meta$count - n_pages <- ceiling(res$meta$count / per_page) - pages <- seq.int(n_pages) + if (is.null(pages)){ + pages <- seq.int(n_pages) + } else { + pages <- pages[pages <= n_pages] + n_pages <- length(pages) + n_items <- min(n_items - per_page * (utils::tail(pages, 1) - n_pages), per_page * n_pages) + message("Using basic paging...") + paging <- "page" + } - if (n_items <= 0) { + if (n_items <= 0 || n_pages <= 0) { + warning("No records found!") return(list()) } @@ -362,14 +380,14 @@ oa_request <- function(query_url, query_ls[["per-page"]] <- per_page # Activation of cursor pagination - next_page <- get_next_page(paging, 1) data <- vector("list", length = n_pages) + res <- NULL for (i in pages) { if (verbose) pb$tick() Sys.sleep(1 / 100) + next_page <- get_next_page(paging, i, res) query_ls[[paging]] <- next_page res <- api_request(query_url, ua, query = query_ls) - next_page <- get_next_page(paging, i + 1, res) if (!is.null(res$results)) data[[i]] <- res$results } diff --git a/R/oa_snowball.R b/R/oa_snowball.R index 759eff5..8fc4855 100644 --- a/R/oa_snowball.R +++ 
b/R/oa_snowball.R @@ -90,7 +90,7 @@ oa_snowball <- function(identifier = NULL, citing$oa_input <- FALSE cited$oa_input <- FALSE paper$oa_input <- TRUE - nodes <- rbind(paper, citing, cited) + nodes <- rbind_oa_ls(list(paper, citing, cited)) nodes <- nodes[!duplicated(nodes$id), ] # relationships/edges diff --git a/man/oa_fetch.Rd b/man/oa_fetch.Rd index 3b7dcc0..f3dfe5d 100644 --- a/man/oa_fetch.Rd +++ b/man/oa_fetch.Rd @@ -16,6 +16,8 @@ oa_fetch( abstract = TRUE, endpoint = "https://api.openalex.org", per_page = 200, + paging = NULL, + pages = NULL, count_only = FALSE, mailto = oa_email(), api_key = oa_apikey(), @@ -77,6 +79,16 @@ Defaults to endpoint = "https://api.openalex.org".} The per-page argument can assume any number between 1 and 200. Defaults to 200.} +\item{paging}{Character. +Either "cursor" for cursor paging or "page" for basic paging. +When used with `options$sample` and/or `pages`, +paging is also automatically set to basic paging: `paging = "page"` +to avoid duplicates and get the right page. +See https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging.} + +\item{pages}{Integer vector. +The range of pages to return. If NULL, return all pages.} + \item{count_only}{Logical. If TRUE, the function returns only the number of item matching the query. Defaults to FALSE.} diff --git a/man/oa_request.Rd b/man/oa_request.Rd index 77c334c..3a7c5fc 100644 --- a/man/oa_request.Rd +++ b/man/oa_request.Rd @@ -8,6 +8,7 @@ oa_request( query_url, per_page = 200, paging = "cursor", + pages = NULL, count_only = FALSE, mailto = oa_email(), api_key = oa_apikey(), @@ -25,8 +26,13 @@ Defaults to 200.} \item{paging}{Character. Either "cursor" for cursor paging or "page" for basic paging. -When used with options$sample, please set `paging = "page"` -to avoid duplicates.} +When used with `options$sample` and/or `pages`, +paging is also automatically set to basic paging: `paging = "page"` +to avoid duplicates and get the right page. 
+See https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging.} + +\item{pages}{Integer vector. +The range of pages to return. If NULL, return all pages.} \item{count_only}{Logical. If TRUE, the function returns only the number of item matching the query. diff --git a/tests/testthat/test-oa_fetch.R b/tests/testthat/test-oa_fetch.R index 099c49c..5c80ff4 100644 --- a/tests/testthat/test-oa_fetch.R +++ b/tests/testthat/test-oa_fetch.R @@ -376,3 +376,54 @@ test_that("oa_fetch for identifiers works with options", { expect_equal(dim(i), c(1, 2)) expect_equal(dim(a), c(1, 3)) }) + +test_that("different paging methods yield the same result", { + w0 <- oa_fetch( + entity = "works", + title.search = c("bibliometric analysis", "science mapping"), + cited_by_count = ">50", + options = list(select = "id"), + from_publication_date = "2021-01-01", + to_publication_date = "2021-12-31", + verbose = TRUE + ) + + w24 <- oa_fetch( + entity = "works", + title.search = c("bibliometric analysis", "science mapping"), + cited_by_count = ">50", + from_publication_date = "2021-01-01", + to_publication_date = "2021-12-31", + options = list(select = "id"), + pages = c(2, 4:5), + per_page = 10, + verbose = TRUE + ) + expect_equal( + w0[c(11:20, 31:min(50, nrow(w0))), ], + w24 + ) + + + +}) + +test_that("pages works", { + # The last 10 records of page 1 when per_page = 20 + # should be the same as the 10 records of page 2 when per_page = 10 + w1 <- oa_fetch( + search = "transformative change", + options = list(select = c("id", "display_name", "publication_date")), + pages = 1, + per_page = 20, + verbose = TRUE + ) + w2 <- oa_fetch( + search = "transformative change", + options = list(select = c("id", "display_name", "publication_date")), + pages = 2, + per_page = 10, + verbose = TRUE + ) + expect_equal(w1[11:20,], w2) +})