-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add current implementation for metadata report * Update README * Remove hard-coded fileviews list, to be replaced with new implementation
- Loading branch information
Showing
6 changed files
with
2,435 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Compile and render a report for metadata (annotations) across projects

# Synapse auth token, taken from the environment of whoever invokes make.
AUTHTOKEN = ${SYNAPSE_AUTH_TOKEN}

# None of these targets produce a file named after themselves; declaring them
# phony keeps them runnable even if a file called `report` or `diff` exists.
.PHONY: updated_report report diff

# Recompile the data before rendering
# The recipe is prefixed with `@` so make does not echo the command line,
# which contains the expanded auth token, to the terminal or CI logs.
updated_report:
	@Rscript -e "rmarkdown::render('build_report.Rmd', output_file='index.html',params=list(update = TRUE, authtoken = '${AUTHTOKEN}'))"

# Render report with saved data
report:
	Rscript -e "rmarkdown::render('build_report.Rmd', output_file='index.html')"

# (TO DO) diff generates a diff using the current version vs last commit in git history;
# This facilitates tracking what has been corrected
diff:
	@echo "Not yet implemented"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
## Metadata Report (WIP) | ||
|
||
This job/service is for regularly scanning file metadata to review the state of annotations and create a report. | ||
There are several important ideas/questions that this will try to tackle: | ||
|
||
- The main portal fileview gives a view of only the core subset of annotations, i.e. what's minimally required.
It DOES NOT provide a comprehensive view of annotations, including a lot of past "legacy" annotations. | ||
For the present, we may also want to see what is "average" vs. "above average" when people go above and beyond to add what's _not_ required. | ||
This requires examining all annotations via an improved crawler implementation. | ||
This helps to better understand the state of metadata in the past, present, and potentially the future evolution (e.g. if we see new annotations used outside of the data model). | ||
|
||
- What percent of metadata are complete / correct? Can we use this to see how the quality of metadata has changed over time? Though imperfect, these report records might provide an additional reference. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
--- | ||
title: "Report" | ||
author: "NF Service" | ||
date: "`r Sys.Date()`" | ||
output: | ||
html_document: | ||
theme: lumen | ||
css: custom.css | ||
params: | ||
schema_csv: "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.csv" | ||
report_data_bin: report_data.rds # compiled summary over all fileviews | ||
update: FALSE # rerun compilation of fileviews to update `report_data_bin` | ||
fileviews: fileviews # file containing fileview ids on separated lines | ||
authtoken: authtoken # only needed if update = TRUE | ||
--- | ||
|
||
```{r setup, include=FALSE}
# Chunk defaults and quieter console output
knitr::opts_chunk$set(echo = TRUE)
options(readr.show_col_types = FALSE)

# Packages needed by the helper functions and the rendered table
library(data.table)
library(reactable)
library(htmltools)

# Helper functions used by the chunks below
source("qc_helpers.R")

# Pull schema as reference lookup; `schema` itself is reused by the compile chunk
schema <- fread(input = params$schema_csv)
lookup <- asLookup(schema = schema)
```
|
||
|
||
```{r compile, include=FALSE, eval=params$update}
library(nfportalutils) # required only for compile
# reticulate::use_condaenv("/r-reticulate", required = T)
nfportalutils::syn_login(authtoken = params$authtoken)

# The fileviews file lists one fileview id per line
fileview_ids <- readLines(params$fileviews)

# Keys left out of the compilation: a fixed list of column names plus every
# schema attribute whose Parent is dccProperty or synapseProperty
fixed_keys <- c(
  "name", "fileName", "studyId", "studyName", "modifiedOn", "entityId",
  "eTag", "dataFileHandleId", "dataFileMD5Hex", "dataFileSizeBytes"
)
property_keys <- schema[Parent %in% c("dccProperty", "synapseProperty"), Attribute]
exclude <- c(fixed_keys, property_keys)

# Pull everything into one long table, then check each key-value pair
metadata <- compileFileviews(fileview_ids, exclude)
metadata <- metadata[, .(validMeta(key, value, lookup), .id, .r)]

# assign CSS classes
metadata[, valueCSS := fifelse(valid, yes = "valid", no = "invalid", na = "neutral")]

# comma-sep values need to be re-listed
metadata <- metadata[
  ,
  .(value = list(value), valid = list(valid), valueCSS = list(valueCSS)),
  by = .(key, validkey, .r, .id)
]

# Save as RDS
saveRDS(metadata, file = params$report_data_bin)
```
|
||
|
||
```{r render, echo=FALSE}
# Load the saved compilation (written by the compile chunk when update = TRUE)
report_data <- readRDS(file = params$report_data_bin)

# Render only problematic keys/values
qc_report <- report_data_subset(report_data = report_data)
reportTable(dt = qc_report)
```
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
/* Box metrics shared by all three annotation-value badges */
.invalid,
.neutral,
.valid {
  padding: 2px 3px;
  margin-right: 5px;
}

/* White text on red */
.invalid {
  color: #fff;
  background-color: red;
}

/* Black text on light gray */
.neutral {
  color: #000;
  background-color: lightgray;
}

/* Black text on green */
.valid {
  color: #000;
  background-color: mediumspringgreen;
}
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#' Compile all individual fileviews
#'
#' This pulls indicated fileviews into a single long-format table with:
#' `key`, `value`, `.r`, `.id`,
#' where `.id` is the fileview that contains that key-value pair.
#' The result can be passed to the check fun `validMeta`.
#'
#' Fileviews that fail to download, or that come back with no rows, are
#' reported and skipped. If nothing usable is left, an error is raised.
#'
#' @param fileview_ids IDs of relevant fileviews.
#' @param exclude Annotation keys that will not be included in the compilation.
#' @param verbose Output details along the way.
#' @return The row-bound output of `compileFormat` over all usable fileviews,
#'   with the originating fileview ID in the `.id` column.
compileFileviews <- function(fileview_ids, exclude, verbose = TRUE) {

  result <- list()
  for (i in fileview_ids) {
    # `try()` so that one bad fileview does not abort the whole compilation
    result[[i]] <- try(table_query(i))
    if (verbose) cat("Queried", i, "\n")
  }

  # `inherits()` instead of `class(x) == "try-error"`: a successful pull may
  # carry more than one class, which makes the `==` comparison ragged
  failed_pull <- vapply(result, inherits, logical(1), what = "try-error")
  if (any(failed_pull)) {
    message("Failed to pull, ignoring: ", toString(names(result)[failed_pull]))
  }
  fvs <- result[!failed_pull]

  # If return a fileview with no annotations (no rows), also ignore
  empty <- vapply(fvs, function(fv) NROW(fv) == 0, logical(1))
  if (verbose && any(empty)) {
    cat("Empty:", names(fvs)[empty], "\n")
  }
  fvs <- fvs[!empty]

  if (length(fvs) == 0) {
    stop("No fileviews to process!", call. = FALSE)
  }

  fvs <- lapply(fvs, compileFormat, exclude)
  # concatenation; list names (fileview IDs) become the `.id` column
  rbindlist(fvs, idcol = TRUE)
}
|
||
#' Helper to transform data to long format for straightforward concatenation
#'
#' This also filters out any annotations not supposed to be in final compilation.
#' Remaining columns are coerced to character and melted into `key`/`value`
#' pairs; missing or empty-looking values are dropped, pairs are de-duplicated,
#' and comma-separated values are split so that each row holds a single value.
#'
#' @param dt A single fileview table (anything `as.data.table` accepts).
#' @param exclude Column names (annotation keys) to drop before reshaping.
#' @return A `data.table` with columns `key`, `.r` and `value`, where `.r`
#'   numbers the unique key-value pairs before splitting, so values split from
#'   the same original entry share one `.r`. `NULL` if no columns or no values
#'   remain after filtering.
compileFormat <- function(dt, exclude) {
  dt <- as.data.table(dt)
  dt <- dt[, !names(dt) %in% exclude, with = FALSE]
  if (length(dt) == 0) {
    message("Ignoring view with no user annotations")
    return(NULL)
  }

  # Uniform character type so that all columns can share one `value` column
  for (col in names(dt)) {
    set(dt, j = col, value = as.character(dt[[col]]))
  }
  dt <- melt(dt, measure.vars = names(dt), variable.name = "key", na.rm = TRUE)

  # Aside from <NA>, remove rows where value %in% c("", "nan", "NA", "NaN")
  dt <- dt[!value %in% c("", "nan", "NA", "NaN")]
  # Unique key-value pairs
  dt <- unique(dt)
  if (nrow(dt) == 0) {
    # Every value was missing/empty; treat like a view with nothing to report
    message("Ignoring view with no user annotations")
    return(NULL)
  }

  # Add some bookkeeping before expanding any list values.
  # `seq_len()` rather than `1:.N`, which is c(1, 0) for an empty table.
  dt[, .r := seq_len(.N)]
  dt <- dt[, .(value = unlist(strsplit(value, split = ", ?"))), by = .(key, .r)]
  dt
}
|
||
#' Build a keyed lookup table from a .csv schema
#'
#' Only the first row per `Attribute` is used. The `Valid Values` string of
#' each attribute is split on commas into one row per allowed value.
#'
#' @param schema A `data.table` with columns `Attribute` and `Valid Values`.
#' @return A `data.table` keyed on `key`, `value`, with additional columns
#'   `constrained` (whether the attribute lists any valid values) and `valid`
#'   (always `TRUE`).
asLookup <- function(schema) {
  first_per_attr <- schema[!duplicated(Attribute)]
  lut <- first_per_attr[
    ,
    .(key = Attribute, value = strsplit(`Valid Values`, split = ", ?"))
  ]
  lut[, constrained := lengths(value) > 0]
  # One row per allowed value
  lut <- lut[, .(value = unlist(value), valid = TRUE), by = .(key, constrained)]
  # keys with length-0 valid values have <NA> value
  setkeyv(lut, c("key", "value"))
  lut
}
|
||
#' Check keys and values against the schema lookup
#'
#' `.key` and `.value` should be vectors of equal length so that the whole
#' check runs as a single keyed `data.table` lookup.
#' Each key-value pair is annotated with `validkey` and `valid`:
#' `validkey` indicates whether the key exists in the schema;
#' `valid` indicates whether the key-value combination exists in the schema.
#' Where the combination is not found, `valid` is `FALSE` if the key has
#' constrained values and <NA> otherwise, e.g. if historically someone
#' annotated a data file using key-value pair "RandomAnnotation" = "meh",
#' and "RandomAnnotation" exists outside our definitions,
#' we don't pass judgement on whether "meh" is valid.
#'
#' @param .key Annotation keys.
#' @param .value Annotation values, parallel to `.key`.
#' @param lookup Keyed lookup table with columns `key`, `value`, `valid` and
#'   `constrained`, as built by `asLookup`.
#' @return A `data.table` with columns `key`, `value`, `valid`, `validkey`.
validMeta <- function(.key, .value, lookup) {
  # Exact key-value matches get valid = TRUE from the lookup; the rest are NA
  checked <- lookup[.(.key, .value), .(key, value, valid)]

  # key can be valid without key-value combo being valid
  key_in_schema <- .key %in% lookup$key
  checked[, validkey := key_in_schema]

  # Unmatched pairs: FALSE when the key has constrained values, otherwise
  # left as NA (free-text fields)
  checked[
    is.na(valid),
    valid := fifelse(lookup$constrained[match(key, lookup$key)], FALSE, NA)
  ]
  return(checked)
}
|
||
#' Helper to subset to only problematic keys/values
#'
#' Keeps rows whose key is not in the schema (`validkey` is `FALSE`) or whose
#' `valid` entry contains at least one `FALSE`.
#'
#' @param report_data A `data.table` with a `validkey` column and a `valid`
#'   column holding one logical vector per row.
#' @return The matching rows of `report_data`; a zero-row table when
#'   `report_data` itself has no rows.
report_data_subset <- function(report_data) {
  # `vapply()` keeps the mask logical even for zero rows, where `sapply()`
  # would return an empty list. `na.rm = TRUE` makes an all-NA/TRUE entry
  # count as "no invalid value" instead of yielding NA.
  report_data[
    validkey == FALSE |
      vapply(valid, function(x) any(x == FALSE, na.rm = TRUE), logical(1))
  ]
}
|
||
#' Render pretty HTML report table
#'
#' Shows the records grouped by `key`, with every value wrapped in a `span`
#' whose CSS class is taken from the matching entry of `valueCSS`.
#'
#' @param dt The table; must have columns `key`, `value`, `valid`, `validkey`,
#'   `valueCSS`, `.r` and `.id`.
#' @param limit Max unique values by key shown (mostly matters for free-text fields).
#' @return A `reactable` widget.
reportTable <- function(dt, limit = 20) {
  dt <- dt[, head(.SD, limit), by = key]
  # Collapse each `.id` entry to one string; `vapply()` guarantees a plain,
  # unnamed character column whatever the input looks like
  dt[, .id := vapply(.id, toString, character(1), USE.NAMES = FALSE)]

  # Cell renderer: pair each value of the row with its CSS class
  value_cell <- function(value, index) {
    css <- dt[index, valueCSS][[1]]
    Map(function(value, css) span(class = css, value), value, css)
  }

  reactable(
    dt,
    groupBy = c("key"),
    filterable = TRUE,
    columns = list(
      key = colDef(), # aggregate = "count"
      value = colDef(cell = value_cell),
      valid = colDef(maxWidth = 50),
      validkey = colDef(aggregate = "unique", maxWidth = 50),
      valueCSS = colDef(show = FALSE),
      .r = colDef(show = FALSE)
    ),
    bordered = TRUE
  )
}
|