Feat/metadata report renovate (#69)
* Add current implementation for metadata report
* Update README
* Remove hard-coded fileviews list, to be replaced with new implementation
anngvu authored Jul 14, 2023
1 parent de9db2a commit 1531ec0
Showing 6 changed files with 2,435 additions and 0 deletions.
15 changes: 15 additions & 0 deletions metadata-report/Makefile
@@ -0,0 +1,15 @@
# Compile and render a report for metadata (annotations) across projects
AUTHTOKEN = ${SYNAPSE_AUTH_TOKEN}

# Recompile the data before rendering
updated_report:
Rscript -e "rmarkdown::render('build_report.Rmd', output_file='index.html',params=list(update = TRUE, authtoken = '${AUTHTOKEN}'))"

# Render report with saved data
report:
Rscript -e "rmarkdown::render('build_report.Rmd', output_file='index.html')"

# (TO DO) diff generates a diff using the current version vs last commit in git history;
# This facilitates tracking what has been corrected
diff:
@echo "Not yet implemented"
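For reference, a typical invocation of these recipes (assuming `make`, R, and the rmarkdown package are installed, and that `SYNAPSE_AUTH_TOKEN` is exported in the environment as the Makefile expects) might look like:

```shell
# Hypothetical usage sketch; token value is a placeholder
export SYNAPSE_AUTH_TOKEN="<your-synapse-token>"
make updated_report   # recompile fileview data, then render index.html
make report           # render index.html from previously saved report_data.rds
```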
13 changes: 13 additions & 0 deletions metadata-report/README.md
@@ -0,0 +1,13 @@
## Metadata Report (WIP)

This job/service is for regularly scanning file metadata to review the state of annotations and create a report.
There are several important ideas/questions that this will try to tackle:

- The main portal fileview gives a view of only the core subset of annotations, i.e. what's minimally required.
It DOES NOT provide a comprehensive view of annotations, including many past "legacy" annotations.
For the present, we may also want to see what is "average" vs. "above average" when people go above and beyond to add what's _not_ required.
This requires examining all annotations via an improved crawler implementation.
It helps us better understand the state of metadata in the past and present, and potentially its future evolution (e.g. if new annotations appear outside the data model).

- What percent of metadata is complete / correct? Can we use this to track how the quality of metadata has changed over time? Though imperfect, these report records might provide an additional reference.
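The core check described above (is an annotation key defined in the data model, and is its value among the valid values?) can be sketched roughly as follows. This is a simplified illustration, not the repository's implementation; it assumes a schema with `Attribute` and `Valid Values` columns as in the nf-metadata-dictionary CSV, and the toy schema here is made up:

```r
library(data.table)

# Toy schema standing in for NF.csv (Attribute / Valid Values columns)
schema <- data.table(
  Attribute = c("assay", "fileFormat"),
  `Valid Values` = c("rnaSeq, wholeGenomeSeq", "fastq, bam")
)

# Check one annotation: is the key defined, and is the value allowed?
checkAnnotation <- function(key, value, schema) {
  row <- schema[Attribute == key]
  if (!nrow(row)) return(NA)  # key outside the data model: pass no judgement
  value %in% strsplit(row$`Valid Values`, ", ?")[[1]]
}

checkAnnotation("assay", "rnaSeq", schema)          # TRUE
checkAnnotation("assay", "meh", schema)             # FALSE
checkAnnotation("RandomAnnotation", "meh", schema)  # NA
```

The actual helpers (`asLookup` and `validMeta` in `qc_helpers.R` below) vectorize this idea with a keyed `data.table` lookup instead of per-row filtering.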

65 changes: 65 additions & 0 deletions metadata-report/build_report.Rmd
@@ -0,0 +1,65 @@
---
title: "Report"
author: "NF Service"
date: "`r Sys.Date()`"
output:
html_document:
theme: lumen
css: custom.css
params:
schema_csv: "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.csv"
report_data_bin: report_data.rds # compiled summary over all fileviews
update: FALSE # rerun compilation of fileviews to update `report_data_bin`
fileviews: fileviews # file containing fileview ids on separated lines
authtoken: authtoken # only needed if update = TRUE
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
options(readr.show_col_types = FALSE)
library(data.table)
library(reactable)
library(htmltools)
source("qc_helpers.R")
# Pull schema as reference lookup
schema <- fread(params$schema_csv)
lookup <- asLookup(schema)
```


```{r compile, include=FALSE, eval=params$update}
library(nfportalutils) # required only for compile
# reticulate::use_condaenv("/r-reticulate", required = T)
nfportalutils::syn_login(authtoken = params$authtoken)
fileview_ids <- readLines(params$fileviews)
exclude <- c(c("name", "fileName", "studyId", "studyName", "modifiedOn", "entityId",
"eTag", "dataFileHandleId", "dataFileMD5Hex", "dataFileSizeBytes"),
schema[Parent %in% c("dccProperty", "synapseProperty"), Attribute])
metadata <- compileFileviews(fileview_ids, exclude)
metadata <- metadata[, .(validMeta(key, value, lookup), .id, .r)]
# assign CSS classes
metadata[, valueCSS := fifelse(valid, yes = "valid", no = "invalid", na = "neutral") ]
# comma-sep values need to be re-listed
metadata <- metadata[, .(value = list(value), valid = list(valid), valueCSS = list(valueCSS)),
by = .(key, validkey, .r, .id)]
# Save as RDS
saveRDS(metadata, file = params$report_data_bin)
```


```{r render, echo=FALSE}
report_data <- readRDS(params$report_data_bin)
# Render only problematic keys/values
qc_report <- report_data_subset(report_data)
reportTable(qc_report)
```
20 changes: 20 additions & 0 deletions metadata-report/custom.css
@@ -0,0 +1,20 @@
.invalid {
color: #fff;
background-color: red;
padding: 2px 3px 2px 3px;
margin-right: 5px;
}

.neutral {
color: #000;
background-color: lightgray;
padding: 2px 3px 2px 3px;
margin-right: 5px;
}

.valid {
color: #000;
background-color: mediumspringgreen;
padding: 2px 3px 2px 3px;
margin-right: 5px;
}
2,205 changes: 2,205 additions & 0 deletions metadata-report/index.html

Large diffs are not rendered by default.

117 changes: 117 additions & 0 deletions metadata-report/qc_helpers.R
@@ -0,0 +1,117 @@
#' Compile all individual fileviews
#'
#' This pulls indicated fileviews into a single long-format table with:
#' `key`, `value`, `.r`, `.id`,
#' where `.id` is the fileview that contains that key-value pair.
#' The result can be passed to the check fun `validMeta`.
#' @param fileview_ids IDs of relevant fileviews
#' @param exclude Annotation keys that will not be included in the compilation.
#' @param verbose Output details along the way.
compileFileviews <- function(fileview_ids, exclude, verbose = TRUE) {

result <- list()
for(i in fileview_ids) {
result[[i]] <- try(table_query(i))
if(verbose) cat("Queried", i, "\n")
}
failed_pull <- sapply(result, class) == "try-error"
if(any(failed_pull)) {
message("Failed to pull, ignoring: ", toString(fileview_ids[which(failed_pull)]))
}
fvs <- result[!failed_pull]
# If return a fileview with no annotations (no rows), also ignore
empty <- sapply(fvs, nrow) == 0
if(verbose && any(empty)) cat("Empty:", names(fvs)[which(empty)], "\n")
fvs <- fvs[!empty]
if(!length(fvs)) stop("No fileviews to process!")
fvs <- lapply(fvs, compileFormat, exclude)
fvs <- rbindlist(fvs, idcol = TRUE) # concatenation
return(fvs)
}

#' Helper to transform data to long format for straightforward concatenation
#'
#' This also filters out any annotations not supposed to be in final compilation
compileFormat <- function(dt, exclude) {
dt <- as.data.table(dt)
dt <- dt[, !names(dt) %in% exclude, with = FALSE]
for(col in names(dt)) {
set(dt, j = col, value = as.character(dt[[col]]))
}
if(!length(dt)) {
message("Ignoring view with no user annotations")
return(NULL)
}
dt <- melt(dt, measure.vars = names(dt), variable.name = "key", na.rm = TRUE)
# Aside from <NA>, remove rows where value %in% c("", "nan", "NA", "NaN")
dt <- dt[!value %in% c("", "nan", "NA", "NaN")]
# Unique key-value pairs
dt <- unique(dt)
# Add some bookkeeping before expanding any list values
dt[, .r := 1:.N]
dt <- dt[, .(value = unlist(strsplit(value, split = ", ?"))), by = .(key, .r)]
dt
}

#' Convert a .csv schema to a keyed lookup
asLookup <- function(schema) {
lookup <- schema[!duplicated(Attribute), .(key = Attribute, value = strsplit(`Valid Values`, split = ", ?"))]
lookup[, constrained := lengths(value) > 0]
lookup <- lookup[, .(value = unlist(value), valid = TRUE), by = .(key, constrained)]
setkey(lookup, key, value) # keys with length-0 valid values have <NA> value
return(lookup)
}

#' Check whether valid keys and values
#'
#' Note that `.key` and `.value` should be vectors to take advantage of `data.table`'s lookup.
#' This annotates data with `validkey` and `valid` according to the schema:
#' `validkey` indicates whether the concept exists in schema;
#' `valid` is more combinatorial and indicates whether value is valid, dependent on `validkey`.
#' If an annotation key is not in the schema, then `valid` is <NA>,
#' e.g. if historically someone annotated a data file using key-value pair
#' "RandomAnnotation" = "meh",
#' and "RandomAnnotation" exists outside our definitions,
#' we don't pass judgement on whether "meh" is valid.
validMeta <- function(.key, .value, lookup) {
# valid indicates if key-value combination is valid, i.e. present in lookup
result <- lookup[.(.key, .value), .(key, value, valid)]
# key can be valid without key-value combo being valid
result[, validkey := .key %in% lookup$key]
# for valid = NA, truly NA for unconstrained free-text fields, otherwise not valid
result[is.na(valid), valid := fifelse(lookup$constrained[match(key, lookup$key)], FALSE, NA)]
return(result)
}

#' Helper to subset to only problematic keys/values
report_data_subset <- function(report_data) {
report_data[validkey == FALSE | sapply(valid, function(x) any(x == FALSE))]
}

#' Render pretty HTML report table
#'
#' @param dt The table of records to render.
#' @param limit Max unique values by key shown (mostly matters for free-text fields).
reportTable <- function(dt, limit = 20) {
dt <- dt[, head(.SD, limit), by = key]
dt[, .id := sapply(.id, toString)]
reactable(
dt,
groupBy = c("key"),
filterable = TRUE,
columns = list(
key = colDef(), # aggregate = "count"
value = colDef(cell = function(value, index) {
css <- dt[index, valueCSS][[1]]
Map(function(value, css) span(class = css, value), value, css)
}),
valid = colDef(maxWidth = 50),
validkey = colDef(aggregate = "unique",
maxWidth = 50),
valueCSS = colDef(show = FALSE),
.r = colDef(show = FALSE)
),
bordered = TRUE
)
}
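A quick local smoke test of `reportTable` with toy data (no Synapse access needed; assumes the data.table, reactable, and htmltools packages are installed, and the `syn123` ID and values below are made up):

```r
library(data.table)
library(reactable)
library(htmltools)

# One key with a valid and an invalid value, shaped like the compiled report data
qc <- data.table(
  key = "assay",
  value = list(list("rnaSeq", "meh")),
  valid = list(list(TRUE, FALSE)),
  validkey = TRUE,
  valueCSS = list(list("valid", "invalid")),
  .r = 1L,
  .id = list("syn123")
)
reportTable(qc)  # returns an htmlwidget; view interactively or render in the Rmd
```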
