Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to specify which github stats to return #91

Merged
merged 2 commits into from
Oct 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 97 additions & 22 deletions R/github.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@ get_user_repo_list <- function(owner, count = 100000, data_format = "dataframe",
#' @param token You can provide the Personal Access Token key directly or this function will attempt to grab a PAT that was stored using the `authorize("github")` function
#' @param repo The repository name. So for `https://github.com/fhdsl/metricminer`, it would be `fhdsl/metricminer`
#' @param count How many items would you like to receive? default is 100000
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names depend on whether `time_course` is TRUE or FALSE.
#' If time_course = FALSE, the vector can include: "repo_activity", "stars", "forks", "contributors", and "community".
#' If time_course = TRUE, the vector can include: "clones" and "views".
#' By default ("all"), every stat available for the chosen mode is collected.
#' @param data_format Default is to return a curated data frame. However if you'd like to see the raw information returned from GitHub set format to "raw".
#' @param time_course Should the time course data be collected or only the summary metrics?
#' @return Repository summary or time course metrics for a particular GitHub repository as a dataframe
Expand All @@ -155,17 +160,40 @@ get_user_repo_list <- function(owner, count = 100000, data_format = "dataframe",
#' authorize("github")
#' metrics <- get_github_metrics(repo = "fhdsl/metricminer")
#'
#' # If you only want some of the stats you can choose which ones with the github_stats argument
#' metrics <- get_github_metrics(repo = "fhdsl/metricminer", github_stats = c("repo_activity", "stars"))
#'
#' summary_metrics <- get_github_repo_summary(repo = "fhdsl/metricminer")
#' timecourse_metrics <- get_github_repo_timecourse(repo = "fhdsl/metricminer")
#' }
get_github_metrics <- function(repo, token = NULL, count = 100000, data_format = "dataframe", time_course = FALSE) {
get_github_metrics <- function(repo,
token = NULL,
count = 100000,
data_format = "dataframe",
github_stats = "all",
time_course = FALSE) {

if (is.null(token)) {
# Get auth token
token <- get_token(app_name = "github")
if (is.null(token)) warning("No token found. Only public repositories will be retrieved.")
}

# The set of available stats depends on whether time_course data is being collected
if (time_course) {
github_stats_full_list <- c("clones", "views")
} else {
github_stats_full_list <- c("repo_activity", "stars", "forks", "contributors", "community")
}

# Need to make sure what the user specified is what we collect
if (github_stats[1] != "all") {
if (!all(github_stats %in% github_stats_full_list)) stop("The stats specified in the github_stats argument are not supported stats. Please see documentation.")
} else {
github_stats <- github_stats_full_list
}


# Split it up
split_it <- strsplit(repo, split = "\\/")
owner <- split_it[[1]][1]
Expand All @@ -185,6 +213,9 @@ get_github_metrics <- function(repo, token = NULL, count = 100000, data_format =
community = "GET /repos/{owner}/{repo}/community/profile"
)
}

api_calls <- api_calls[github_stats]

# Put gh_repo_wrapper inside function
gh_repo_wrapper_fn <- function(api_call) {
gh_repo_wrapper(
Expand Down Expand Up @@ -242,6 +273,11 @@ get_github_metrics <- function(repo, token = NULL, count = 100000, data_format =
#' @param repo The repository name. So for `https://github.com/fhdsl/metricminer`, it would be `fhdsl/metricminer`
#' @param count How many items would you like to receive? default is 100000
#' @param data_format Default is to return a curated data frame. However if you'd like to see the raw information returned from GitHub set format to "raw".
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names depend on whether time course or summary data is being collected.
#' If time_course = FALSE, the vector can include: "repo_activity", "stars", "forks", "contributors", and "community".
#' If time_course = TRUE, the vector can include: "clones" and "views".
#' This function always collects time course data (time_course = TRUE), so only "clones" and "views" apply here.
#' By default ("all"), every stat available for the chosen mode is collected.
#' @return GitHub repository timecourse metrics for views and clones
#' @export
#' @examples \dontrun{
Expand All @@ -250,12 +286,13 @@ get_github_metrics <- function(repo, token = NULL, count = 100000, data_format =
#'
#' timecourse_metrics <- get_github_repo_timecourse(repo = "fhdsl/metricminer")
#' }
get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_format = "dataframe") {
get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_format = "dataframe", github_stats = "all") {
result <- get_github_metrics(
repo = repo,
token = token,
count = count,
data_format = data_format,
github_stats = github_stats,
time_course = TRUE
)
return(result)
Expand All @@ -268,6 +305,11 @@ get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_
#' @param repo The repository name. So for `https://github.com/fhdsl/metricminer`, it would be `fhdsl/metricminer`
#' @param count How many items would you like to receive? default is 100000
#' @param data_format Default is to return a curated data frame. However if you'd like to see the raw information returned from GitHub set format to "raw".
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names depend on whether time course or summary data is being collected.
#' If time_course = FALSE, the vector can include: "repo_activity", "stars", "forks", "contributors", and "community".
#' If time_course = TRUE, the vector can include: "clones" and "views".
#' This function always collects summary data (time_course = FALSE), so only the summary stats apply here.
#' By default ("all"), every stat available for the chosen mode is collected.
#' @return GitHub repository summary metrics
#' @export
#' @examples \dontrun{
Expand All @@ -276,12 +318,13 @@ get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_
#'
#' summary_metrics <- get_github_repo_summary(repo = "fhdsl/metricminer")
#' }
get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_format = "dataframe") {
get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_format = "dataframe", github_stats = "all") {
result <- get_github_metrics(
repo = repo,
token = token,
count = count,
data_format = data_format,
github_stats = github_stats,
time_course = FALSE
)

Expand All @@ -295,7 +338,12 @@ get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_for
#' @param repo_names a character vector of repositories you'd like to collect metrics from.
#' @param data_format Default is to return a curated data frame. However if you'd like to see the raw information returned from GitHub set format to "raw".
#' @param time_course Should the time course data be collected or only the summary metrics?
#' @return Information regarding a Github account
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names depend on whether `time_course` is TRUE or FALSE.
#' If time_course = FALSE, the vector can include: "repo_activity", "stars", "forks", "contributors", and "community".
#' If time_course = TRUE, the vector can include: "clones" and "views".
#' By default ("all"), every stat available for the chosen mode is collected.
#' @return A list of metrics for a list of repos in a dataframe format
#' @importFrom gh gh
#' @importFrom purrr map
#' @importFrom dplyr bind_rows
Expand All @@ -307,10 +355,12 @@ get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_for
#' repo_names <- c("fhdsl/metricminer", "jhudsl/OTTR_Template")
#' some_repos_metrics <- get_multiple_repos_metrics(repo_names = repo_names)
#'
#' stars_and_forks <- get_multiple_repos_metrics(repo_names = repo_names, github_stats = c("stars", "forks"))
#'
#' some_repos_metrics <- get_multiple_repos_metrics(repo_names = repo_names, time_course = TRUE)
#' }
#'
get_multiple_repos_metrics <- function(repo_names = NULL, token = NULL, data_format = "dataframe", time_course = FALSE) {
get_multiple_repos_metrics <- function(repo_names = NULL, token = NULL, data_format = "dataframe", time_course = FALSE, github_stats = "all") {
if (is.null(token)) {
# Get auth token
token <- get_token(app_name = "github", try = TRUE)
Expand All @@ -326,7 +376,8 @@ get_multiple_repos_metrics <- function(repo_names = NULL, token = NULL, data_for
token = token,
repo = repo,
data_format = data_format,
time_course = time_course
time_course = time_course,
github_stats = github_stats
)
})

Expand Down Expand Up @@ -388,8 +439,15 @@ gh_repo_wrapper <- function(api_call, owner, repo, token = NULL, count = 100000)
#' @export
#'
clean_repo_metrics <- function(repo_name, repo_metric_list) {
### Summarize the rest
if (repo_metric_list$contributors[1] != "No results") {

stats_collected <- names(repo_metric_list)
cleaned_metrics <- list()

if (is.list(repo_metric_list$repo_activity)) {
cleaned_metrics$num_repo_activities <- length(repo_metric_list$repo_activity)
}
# Only summarize contributors when the $contributors field exists and holds a list of results (not missing and not "No results").
if (is.list(repo_metric_list$contributors)) {
contributors <-
lapply(repo_metric_list$contributors, function(contributor) {
data.frame(
Expand All @@ -400,29 +458,46 @@ clean_repo_metrics <- function(repo_name, repo_metric_list) {
dplyr::bind_rows() %>%
dplyr::distinct()

num_contributors <- length(unique(contributors$contributor))
total_contributors <- sum(contributors$num_contributors)
cleaned_metrics$num_contributors <- length(unique(contributors$contributor))
cleaned_metrics$total_contributors <- sum(contributors$num_contributors)
} else {
num_contributors <- NA
total_contributors <- NA
cleaned_metrics$num_contributors <- NA
cleaned_metrics$total_contributors <- NA
}

if (repo_metric_list$forks[1] != "No results") {
if (is.list(repo_metric_list$forks)) {
forks <- unlist(purrr::map(repo_metric_list$forks, "full_name"))
num_forks <- length(forks)
cleaned_metrics$num_forks <- length(forks)
} else {
cleaned_metrics$num_forks <- NA
}

if (is.list(repo_metric_list$stars)) {
# The GitHub API returns a list of who has given a star; we just want the count
cleaned_metrics$num_stars <- length(unlist(purrr::map(repo_metric_list$stars, "login")))
} else {
num_forks <- NA
cleaned_metrics$num_stars <- NA
}

if (is.list(repo_metric_list$community)) {
cleaned_metrics$health_percentage <- as.numeric(repo_metric_list$community$health_percentage)
} else {
cleaned_metrics$health_percentage <- NA
}

clean_stats_names <- list(
repo_activity = "num_repo_activities",
contributors = c("num_contributors","total_contributors"),
forks = "num_forks",
stars = "num_stars",
community = "health_percentage")

# We're going to only put the stats we collected in the data.frame
clean_stats_names <- unlist(clean_stats_names[stats_collected], use.names = FALSE)

metrics <- data.frame(
repo_name,
num_forks = num_forks,
num_contributors = num_contributors,
total_contributions = total_contributors,
num_stars = length(unlist(purrr::map(repo_metric_list$stars, "login"))),
health_percentage = ifelse(repo_metric_list$community[1] != "No results", as.numeric(repo_metric_list$community$health_percentage), NA)
cleaned_metrics[clean_stats_names]
)

rownames(metrics) <- repo_name

return(metrics)
Expand Down
Loading