From 6f69c0073c6bbd6a4f78169c3b5ea0db3ceb8180 Mon Sep 17 00:00:00 2001 From: Andrew Jackson <Andrew.Jackson@bl.uk> Date: Mon, 2 Mar 2020 14:23:02 +0000 Subject: [PATCH 1/7] Simplify dev docker setup. --- docker-compose.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1721a7c..e421354 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,8 +12,3 @@ services: ports: - "1001:80" - logviewer: - image: ukwa/crawl-log-viewer - ports: - - "8000:8000" - From bdd567ed89ed96f251e2c67dcd991967caae492a Mon Sep 17 00:00:00 2001 From: Andrew Jackson <Andrew.Jackson@bl.uk> Date: Mon, 2 Mar 2020 14:53:40 +0000 Subject: [PATCH 2/7] Add badges. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..cb76787 --- /dev/null +++ b/README.md @@ -0,0 +1,6 @@ +UKWA Intranet +============= + +[![Build Status](https://travis-ci.org/ukwa/ukwa-reports.svg?branch=master)](https://travis-ci.org/ukwa/ukwa-reports) +[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://registry.hub.docker.com/r/ukwa/ukwa-intranet/) + From a594b3a84b93209db6332022d0cfc4255fd285cb Mon Sep 17 00:00:00 2001 From: Andrew Jackson <Andrew.Jackson@bl.uk> Date: Fri, 20 Oct 2023 14:59:43 +0100 Subject: [PATCH 3/7] Switch to jupyter-book build. --- .gitignore | 6 +- .gitmodules | 3 - Dockerfile | 13 +- README.md | 3 + archetypes/default.md | 6 - build.sh | 12 + config.toml | 80 ------ content/_config.yml | 38 +++ content/_toc.yml | 15 ++ .../assets/logos/ukwa-2018-onwhite-close.svg | 33 +++ content/intro.md | 11 + content/markdown-notebooks.md | 53 ++++ content/markdown.md | 55 ++++ content/notebooks.ipynb | 122 +++++++++ content/references.bib | 56 ++++ content/reports/hdfs/index.md | 47 ---- ...ld-total-file-size-by-stream-per-month.csv | 135 ---------- ...pld-total-file-size-by-stream-per-year.csv | 9 - .../npld-total-file-size-by-stream-totals.csv | 5 - .../hdfs/total-file-count-by-stream.csv | 100 ------- .../hdfs/total-file-size-by-stream.csv | 100 ------- content/reports/storage/humanbytes.py | 56 ++++ content/reports/storage/scratch.md | 221 ++++++++++++++++ content/reports/storage/summary.md | 243 ++++++++++++++++++ content/reports/storage/test.md | 20 ++ docker-compose.yml | 30 ++- index.html | 0 layouts/_default/list.html | 13 - layouts/_default/terms.html | 13 - layouts/crawls/single.html | 43 ---- layouts/partials/list-item.html | 25 -- layouts/reports/single.html | 46 ---- layouts/shortcodes/csv-table.html | 27 -- layouts/shortcodes/date-bar-chart.html | 22 -- setup.py | 13 + static/css/main.css | 109 -------- themes/minimal | 1 - 37 files changed, 983 insertions(+), 801 deletions(-) delete mode 100644 archetypes/default.md create mode 100644 build.sh delete mode 100644 config.toml create mode 100644 content/_config.yml create mode 100644 content/_toc.yml create mode 100644 content/assets/logos/ukwa-2018-onwhite-close.svg create mode 100644 content/intro.md create mode 100644 content/markdown-notebooks.md create mode 100644 content/markdown.md create mode 100644 content/notebooks.ipynb create mode 100644 content/references.bib delete mode 100644 content/reports/hdfs/index.md delete mode 100644 content/reports/hdfs/npld-total-file-size-by-stream-per-month.csv delete mode 100644 content/reports/hdfs/npld-total-file-size-by-stream-per-year.csv delete mode 100644 content/reports/hdfs/npld-total-file-size-by-stream-totals.csv delete mode 100644 
content/reports/hdfs/total-file-count-by-stream.csv delete mode 100644 content/reports/hdfs/total-file-size-by-stream.csv create mode 100644 content/reports/storage/humanbytes.py create mode 100644 content/reports/storage/scratch.md create mode 100644 content/reports/storage/summary.md create mode 100644 content/reports/storage/test.md delete mode 100644 index.html delete mode 100644 layouts/_default/list.html delete mode 100644 layouts/_default/terms.html delete mode 100644 layouts/crawls/single.html delete mode 100644 layouts/partials/list-item.html delete mode 100644 layouts/reports/single.html delete mode 100644 layouts/shortcodes/csv-table.html delete mode 100644 layouts/shortcodes/date-bar-chart.html create mode 100644 setup.py delete mode 100644 static/css/main.css delete mode 160000 themes/minimal diff --git a/.gitignore b/.gitignore index 17982b6..a5508db 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -public -content/crawls -data/crawls +_build +.ipynb_checkpoints +__pycache__ diff --git a/.gitmodules b/.gitmodules index 7b30435..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "themes/minimal"] - path = themes/minimal - url = https://github.com/calintat/minimal.git diff --git a/Dockerfile b/Dockerfile index 573cb6b..6f26c55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,8 @@ -FROM klakegg/hugo:0.65.3 AS hugo +FROM python:3.11 -COPY . /src +WORKDIR /ukwa-reports -WORKDIR /src +COPY setup.py . -ENV HUGO_DESTINATION=/onbuild - -RUN hugo - -FROM nginx -COPY --from=hugo /onbuild /usr/share/nginx/html/intranet +RUN pip install --no-cache-dir -v . diff --git a/README.md b/README.md index cb76787..50b9a02 100644 --- a/README.md +++ b/README.md @@ -4,3 +4,6 @@ UKWA Intranet [![Build Status](https://travis-ci.org/ukwa/ukwa-reports.svg?branch=master)](https://travis-ci.org/ukwa/ukwa-reports) [![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://registry.hub.docker.com/r/ukwa/ukwa-intranet/) +This static website acts as the gateway for our 'intranet' services. It's a static site built using [Jupyter Book](https://jupyterbook.org/), and is deployed by being embedded in the [`ukwa-services/manage/intranet`](https://github.com/ukwa/ukwa-services/tree/master/manage/intranet) stack. + +See that project for more details. \ No newline at end of file diff --git a/archetypes/default.md b/archetypes/default.md deleted file mode 100644 index f5a9e45..0000000 --- a/archetypes/default.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "{{ replace .TranslationBaseName "-" " " | title }}" -date: {{ .Date }} -draft: true ---- - diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..ff975bb --- /dev/null +++ b/build.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +# https://discourse.jupyter.org/t/debugger-warning-it-seems-that-frozen-modules-are-being-used-python-3-11-0/16544 +export PYDEVD_DISABLE_FILE_VALIDATION=1 + +# Build the book part: +jb build --path-output .
content/ + +# Copy over CSV files, retaining the paths: +cd content +find reports -name "*.csv" -exec cp -v {} ../_build/html/{} \; +cd - \ No newline at end of file diff --git a/config.toml b/config.toml deleted file mode 100644 index 1fbc119..0000000 --- a/config.toml +++ /dev/null @@ -1,80 +0,0 @@ -baseURL = "/intranet" -languageCode = "en-gb" -title = "UKWA Reports" -theme = "minimal" -#disqusShortname = "username" # delete this to disable disqus comments -googleAnalytics = "" -copyright = "" - -[params] - author = "UK Web Archive" - description = "Technical reports" - githubUsername = "#" - accent = "#2e6dd9" - showBorder = true - backgroundColor = "white" - font = "Raleway" # should match the name on Google Fonts! - highlight = true - highlightStyle = "solarized-dark" - highlightLanguages = ["go", "haskell", "kotlin", "scala", "swift"] - css = ["https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css"] - -[[menu.main]] - url = "/" - name = "Home" - weight = 1 - -[[menu.main]] - url = "/logs/" - name = "Logs" - weight = 2 - -[[menu.main]] - url = "/api/" - name = "API" - weight = 3 - -[[menu.main]] - url = "/nbapps/" - name = "Tools" - weight = 4 - -[[menu.main]] - url = "/trackdb" - name = "TrackDB" - weight = 5 - -[[menu.main]] - url = "/reports/" - name = "Reports" - weight = 6 - -[[menu.main]] - url = "/categories/" - name = "Categories" - weight = 7 - -[[menu.main]] - url = "/tags/" - name = "Tags" - weight = 8 - -# Social icons to be shown on the right-hand side of the navigation bar -# The "name" field should match the name of the icon to be used -# The list of available icons can be found at http://fontawesome.io/icons/ - -[[menu.icon]] - url = "mailto:web-archivist@bl.uk" - name = "envelope-o" - weight = 1 - -[[menu.icon]] - url = "https://github.com/ukwa/" - name = "github" - weight = 2 - -[[menu.icon]] - url = "https://twitter.com/UKWebArchive/" - name = "twitter" - weight = 3 - diff --git a/content/_config.yml b/content/_config.yml new file mode 100644 index 0000000..33bd657 --- /dev/null +++ b/content/_config.yml @@ -0,0 +1,38 @@ +# Book settings +# Learn more at https://jupyterbook.org/customize/config.html + +title: UKWA Technical Documentation +author: The UK Web Archive +logo: "assets/logos/ukwa-2018-onwhite-close.svg" + +# Don't include these: +exclude_patterns: ['_build', '**.ipynb_checkpoints'] + +# Auto-exclude files not in the toc +only_build_toc_files: true + +# Force re-execution of notebooks on each build? 
+# See https://jupyterbook.org/content/execute.html +execute: + execute_notebooks: auto + # Long timeout because some analyses take a while: + timeout: 1000 + +# Add a bibtex file so that we can create citations +bibtex_bibfiles: + - references.bib + +# Information about where the book exists on the web +repository: + url: https://github.com/ukwa/ukwa-reports # Online location of your book + path_to_book: content # Optional path to your book, relative to the repository root + branch: master # Which branch of the repository should be used when creating links (optional) + +# Add GitHub buttons to your book +# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository +html: + use_issues_button: true + use_repository_button: true + use_edit_page_button: true + home_page_in_navbar: false + diff --git a/content/_toc.yml b/content/_toc.yml new file mode 100644 index 0000000..27ab8b8 --- /dev/null +++ b/content/_toc.yml @@ -0,0 +1,15 @@ +# Table of contents +# Learn more at https://jupyterbook.org/customize/toc.html + +format: jb-book +root: intro +parts: +- caption: Storage + chapters: + - file: reports/storage/summary + - file: reports/storage/test +- caption: Examples + chapters: + - file: markdown + - file: notebooks + - file: markdown-notebooks diff --git a/content/assets/logos/ukwa-2018-onwhite-close.svg b/content/assets/logos/ukwa-2018-onwhite-close.svg new file mode 100644 index 0000000..5281f63 --- /dev/null +++ b/content/assets/logos/ukwa-2018-onwhite-close.svg @@ -0,0 +1,33 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" version="1.0" width="2028.7008pt" height="830.44391pt" viewBox="0 0 2028.7008 830.44391" preserveAspectRatio="xMidYMid meet" id="svg5493" sodipodi:docname="ukwa-2018-onwhite-close.svg" inkscape:version="0.92.3 (0612fd7ea4, 2018-10-23)"> + <defs id="defs5497"/> + <sodipodi:namedview pagecolor="#ffffff" bordercolor="#666666" borderopacity="1" objecttolerance="10" gridtolerance="10" guidetolerance="10" inkscape:pageopacity="0" inkscape:pageshadow="2" inkscape:window-width="1389" inkscape:window-height="1004" id="namedview5495" showgrid="false" fit-margin-top="0" fit-margin-left="0" fit-margin-right="0" fit-margin-bottom="0" inkscape:zoom="0.46618506" inkscape:cx="1352.4672" inkscape:cy="553.62927" inkscape:window-x="122" inkscape:window-y="23" inkscape:window-maximized="0" inkscape:current-layer="svg5493"/> + <metadata id="metadata5457"> +Created by potrace 1.15, written by Peter Selinger 2001-2017 +<rdf:RDF> + <cc:Work rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/> + <dc:title/> + </cc:Work> +</rdf:RDF> +</metadata> + <g transform="matrix(0.1,0,0,-0.1,-786,1724)" id="g5491" style="fill:#2e6ddf;fill-opacity:1;stroke:none"> + <path d="m 7860,15363 c 0,-1968 1,-2012 44,-2258 155,-890 688,-1471 1553,-1695 473,-123 1148,-135 1652,-30 719,150 1236,535 1519,1129 101,212 173,465 219,771 16,106 17,273 20,2038 l 4,1922 h -791 -790 v -1828 c 0,-1552 -2,-1845 -15,-1940 -67,-512 -256,-750 -660,-833 -85,-18 -126,-20 -275,-16 -206,5 -283,21 -430,93 -81,40 -108,60 -185,138 -123,125 -193,263 -240,475 -44,200 -45,251 -45,2129 v 1782 h -790 
-790 z" id="path5459" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 14845,17188 c -145,-211 -217,-315 -500,-728 -175,-256 -420,-612 -544,-792 -334,-484 -479,-701 -549,-818 l -62,-105 v -687 c 0,-379 2,-688 4,-688 2,0 102,59 221,130 120,72 219,129 220,128 1,-2 44,-84 95,-183 51,-99 123,-238 160,-310 130,-250 681,-1316 790,-1527 l 109,-213 891,-3 891,-2 -12,22 c -11,21 -301,537 -574,1023 -121,215 -599,1065 -865,1540 -89,160 -207,370 -262,467 l -100,177 44,63 c 37,53 407,578 1146,1628 80,113 218,309 307,435 89,127 200,284 246,350 47,66 88,126 93,133 6,9 -167,12 -853,12 h -860 z" id="path5461" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 16940,16933 c 43,-170 150,-600 238,-958 89,-357 247,-996 352,-1420 105,-423 229,-923 275,-1110 46,-187 163,-659 260,-1050 97,-390 192,-779 213,-862 l 37,-153 h 935 935 l 303,1368 c 166,752 316,1432 332,1512 77,381 150,792 180,1008 7,50 16,92 19,92 4,0 15,-60 25,-132 53,-394 155,-948 275,-1488 33,-151 70,-315 81,-365 11,-49 91,-416 179,-815 88,-399 182,-826 208,-950 l 49,-225 935,-3 935,-2 58,237 c 57,235 219,891 546,2218 92,374 243,986 335,1360 311,1264 496,2010 501,2028 5,16 -35,17 -755,17 -715,0 -760,-1 -765,-17 -2,-10 -43,-193 -90,-408 -47,-214 -190,-860 -317,-1435 -341,-1545 -446,-2065 -505,-2492 -9,-71 -20,-128 -24,-128 -4,0 -12,39 -19,88 -7,48 -26,165 -42,261 -95,561 -208,1144 -314,1611 -31,135 -90,396 -131,580 -42,184 -102,450 -134,590 -32,140 -106,469 -165,730 -59,261 -115,508 -123,548 l -16,72 h -732 -732 l -5,-27 c -2,-16 -14,-68 -26,-118 -37,-159 -58,-247 -95,-410 -33,-140 -183,-787 -236,-1010 -10,-44 -30,-129 -45,-190 -14,-60 -35,-148 -45,-195 -11,-47 -54,-227 -94,-400 -78,-332 -99,-429 -171,-785 -83,-418 -143,-750 -189,-1050 -46,-294 -48,-306 -52,-302 -3,3 -18,87 -35,188 -45,280 -130,727 -189,999 -16,74 -61,279 -99,455 -38,176 -91,417 -116,535 -49,227 -133,615 -215,990 -27,124 -79,362 -115,530 -36,168 -83,386 -105,485 -22,99 -45,208 -52,243 l -12,62 h -761 -762 z" id="path5463" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 25411,17183 c -6,-21 -40,-164 -76,-318 -88,-372 -122,-518 -139,-587 -16,-63 -13,-81 54,-369 33,-140 226,-913 259,-1034 48,-178 93,-346 140,-525 28,-107 76,-289 106,-404 30,-114 55,-210 55,-212 0,-2 -274,-4 -608,-4 h -608 l -23,-97 c -38,-163 -270,-1146 -276,-1170 l -5,-23 h 924 923 l 13,-47 c 7,-27 20,-77 30,-113 10,-36 32,-119 49,-185 17,-66 67,-257 111,-425 44,-168 80,-311 80,-317 0,-10 180,-13 866,-13 476,0 863,3 861,8 -2,4 -14,39 -26,77 -12,39 -75,232 -141,430 -340,1033 -446,1354 -640,1945 -256,781 -431,1311 -595,1810 -178,537 -341,1034 -442,1343 l -87,267 h -398 -397 z" id="path5465" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 21963,10510 c -157,-22 -326,-109 -423,-218 -61,-68 -133,-208 -161,-312 -26,-101 -36,-293 -19,-412 45,-331 212,-537 500,-615 120,-32 429,-18 543,26 23,9 27,16 27,51 0,46 0,46 -85,24 -93,-23 -344,-29 -433,-10 -214,45 -360,194 -423,431 -32,116 -32,375 -1,488 35,128 77,203 162,288 129,130 256,175 475,166 114,-4 204,-25 306,-71 13,-5 20,2 33,34 9,22 16,44 16,49 0,11 -97,47 -180,66 -82,19 -256,27 -337,15 z" id="path5467" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 18420,9744 c -168,-422 -306,-773 -308,-780 -3,-11 10,-14 51,-14 h 54 l 32,83 c 18,45 66,169 108,275 l 75,192 h 318 318 l 72,-182 c 39,-101 88,-225 107,-275 l 36,-93 h 58 59 l -21,53 c -12,28 -147,374 -301,767 -154,393 -282,721 -285,728 -3,6 -19,12 -37,12 h 
-31 z m 360,531 c 7,-22 66,-181 131,-354 66,-173 118,-316 116,-318 -2,-2 -128,-2 -279,-1 l -275,3 89,225 c 82,209 175,456 185,489 5,20 18,3 33,-44 z" id="path5469" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 8053,9943 c 3,-484 6,-567 21,-622 49,-186 181,-318 367,-368 87,-23 281,-23 367,0 105,28 172,66 243,136 69,70 107,135 136,236 15,53 17,128 21,618 l 3,557 h -55 -55 l -3,-572 c -3,-561 -3,-574 -25,-631 -44,-115 -131,-201 -248,-242 -116,-41 -341,-28 -447,26 -58,29 -130,100 -161,157 -54,100 -57,129 -57,720 v 542 h -56 -55 z" id="path5471" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 9890,9725 v -775 h 50 50 v 318 317 l 105,93 c 57,50 106,92 109,92 2,0 137,-184 301,-410 l 297,-410 h 59 c 33,0 59,2 59,5 0,2 -139,195 -309,429 -170,233 -315,433 -322,444 -12,19 13,46 302,344 l 314,323 -60,3 -60,3 -215,-223 c -118,-123 -296,-306 -397,-408 l -183,-185 v 408 407 h -50 -50 z" id="path5473" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 12070,10495 c 0,-3 95,-349 210,-770 116,-420 210,-766 210,-769 0,-4 20,-6 45,-6 25,0 45,2 45,4 0,3 68,247 151,543 83,296 170,614 194,706 23,92 45,170 47,173 3,2 10,-20 17,-48 6,-29 96,-350 200,-713 l 188,-660 40,-3 c 39,-3 42,-1 52,30 5,18 101,362 211,763 110,402 203,736 205,743 3,8 -10,12 -48,12 h -52 l -168,-628 c -92,-345 -173,-653 -179,-684 -7,-32 -15,-58 -19,-58 -3,0 -9,18 -13,40 -5,33 -288,1032 -358,1263 l -20,67 h -56 -57 l -157,-563 c -154,-547 -208,-754 -208,-789 0,-10 -3,-18 -7,-18 -5,0 -18,44 -29,98 -12,54 -93,363 -180,685 l -159,587 h -52 c -29,0 -53,-2 -53,-5 z" id="path5475" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 14420,9725 v -775 h 420 420 v 50 50 h -370 -370 v 330 330 h 350 350 v 50 50 h -350 -350 v 295 295 h 370 370 v 50 50 h -420 -420 z" id="path5477" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 15880,9725 v -777 l 308,5 c 328,4 363,9 480,62 75,34 154,113 187,186 52,116 54,264 5,365 -42,87 -155,162 -270,180 -57,9 -57,20 -2,34 65,16 158,67 189,104 14,17 38,56 52,86 21,46 26,69 25,140 0,110 -22,174 -81,240 -53,59 -115,93 -221,122 -67,18 -115,21 -374,25 l -298,5 z m 611,661 c 125,-27 211,-88 240,-173 23,-65 17,-179 -13,-244 -30,-65 -71,-101 -154,-131 -54,-20 -82,-23 -321,-26 l -263,-4 v 296 296 h 223 c 152,0 243,-5 288,-14 z m 20,-690 c 136,-29 223,-94 255,-188 12,-37 15,-73 12,-142 -7,-142 -54,-218 -169,-270 -85,-39 -142,-46 -395,-46 h -234 v 330 330 h 233 c 159,0 252,-5 298,-14 z" id="path5479" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 19880,9726 v -776 h 50 50 v 335 335 h 198 198 l 199,-335 200,-335 h 58 c 31,0 57,3 57,6 0,3 -90,156 -200,339 -110,183 -202,339 -205,347 -4,9 9,18 40,27 66,19 157,82 198,136 125,163 110,437 -31,566 -24,23 -69,52 -100,66 -104,46 -169,55 -449,60 l -263,5 z m 611,646 c 84,-28 148,-83 180,-154 19,-44 23,-70 23,-153 0,-121 -23,-183 -90,-245 -89,-82 -168,-100 -429,-100 h -195 v 341 341 l 223,-4 c 193,-4 231,-7 288,-26 z" id="path5481" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 23030,9725 v -775 h 50 50 v 380 380 h 455 455 v -380 -380 h 55 55 v 775 775 h -55 -55 v -345 -345 h -455 -455 v 345 345 h -50 -50 z" id="path5483" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 24850,9725 v -775 h 50 50 v 775 775 h -50 -50 z" id="path5485" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 
25450,10492 c 0,-7 525,-1424 565,-1524 5,-13 19,-18 50,-18 h 44 l 21,58 c 12,31 138,368 280,747 142,380 263,703 269,718 l 11,28 -57,-3 -56,-3 -204,-555 c -111,-305 -226,-621 -254,-703 -29,-81 -53,-146 -54,-145 -1,2 -28,77 -59,168 -31,91 -145,406 -254,700 l -198,535 -52,3 c -29,2 -52,-1 -52,-6 z" id="path5487" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + <path d="m 27170,9725 v -775 h 420 420 v 50 50 h -370 -370 v 330 330 h 350 350 v 50 50 h -350 -350 v 295 295 h 370 370 v 50 50 h -420 -420 z" id="path5489" style="fill:#2e6ddf;fill-opacity:1" inkscape:connector-curvature="0"/> + </g> +</svg> \ No newline at end of file diff --git a/content/intro.md b/content/intro.md new file mode 100644 index 0000000..f8cdc73 --- /dev/null +++ b/content/intro.md @@ -0,0 +1,11 @@ +# Welcome to your Jupyter Book + +This is a small sample book to give you a feel for how book content is +structured. +It shows off a few of the major file types, as well as some sample content. +It does not go in-depth into any particular topic - check out [the Jupyter Book documentation](https://jupyterbook.org) for more information. + +Check out the content pages bundled with this sample book to see more. + +```{tableofcontents} +``` diff --git a/content/markdown-notebooks.md b/content/markdown-notebooks.md new file mode 100644 index 0000000..a057a32 --- /dev/null +++ b/content/markdown-notebooks.md @@ -0,0 +1,53 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.11.5 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Notebooks with MyST Markdown + +Jupyter Book also lets you write text-based notebooks using MyST Markdown. +See [the Notebooks with MyST Markdown documentation](https://jupyterbook.org/file-types/myst-notebooks.html) for more detailed instructions. +This page shows off a notebook written in MyST Markdown. + +## An example cell + +With MyST Markdown, you can define code cells with a directive like so: + +```{code-cell} +print(2 + 2) +``` + +When your book is built, the contents of any `{code-cell}` blocks will be +executed with your default Jupyter kernel, and their outputs will be displayed +in-line with the rest of your content. + +```{seealso} +Jupyter Book uses [Jupytext](https://jupytext.readthedocs.io/en/latest/) to convert text-based files to notebooks, and can support [many other text-based notebook files](https://jupyterbook.org/file-types/jupytext.html). +``` + +## Create a notebook with MyST Markdown + +MyST Markdown notebooks are defined by two things: + +1. YAML metadata that is needed to understand if / how it should convert text files to notebooks (including information about the kernel needed). + See the YAML at the top of this page for example. +2. The presence of `{code-cell}` directives, which will be executed with your book. + +That's all that is needed to get started! 
+ +## Quickly add YAML metadata for MyST Notebooks + +If you have a markdown file and you'd like to quickly add YAML metadata to it, so that Jupyter Book will treat it as a MyST Markdown Notebook, run the following command: + +``` +jupyter-book myst init path/to/markdownfile.md +``` diff --git a/content/markdown.md b/content/markdown.md new file mode 100644 index 0000000..0ddaab3 --- /dev/null +++ b/content/markdown.md @@ -0,0 +1,55 @@ +# Markdown Files + +Whether you write your book's content in Jupyter Notebooks (`.ipynb`) or +in regular markdown files (`.md`), you'll write in the same flavor of markdown +called **MyST Markdown**. +This is a simple file to help you get started and show off some syntax. + +## What is MyST? + +MyST stands for "Markedly Structured Text". It +is a slight variation on a flavor of markdown called "CommonMark" markdown, +with small syntax extensions to allow you to write **roles** and **directives** +in the Sphinx ecosystem. + +For more about MyST, see [the MyST Markdown Overview](https://jupyterbook.org/content/myst.html). + +## Sample Roles and Directives + +Roles and directives are two of the most powerful tools in Jupyter Book. They +are kind of like functions, but written in a markup language. They both +serve a similar purpose, but **roles are written in one line**, whereas +**directives span many lines**. They both accept different kinds of inputs, +and what they do with those inputs depends on the specific role or directive +that is being called. + +Here is a "note" directive: + +```{note} +Here is a note +``` + +It will be rendered in a special box when you build your book. + +Here is an inline directive to refer to a document: {doc}`markdown-notebooks`. + + +## Citations + +You can also cite references that are stored in a `bibtex` file. For example, +the following syntax: `` {cite}`holdgraf_evidence_2014` `` will render like +this: {cite}`holdgraf_evidence_2014`. + +Moreover, you can insert a bibliography into your page with this syntax: +The `{bibliography}` directive must be used for all the `{cite}` roles to +render properly. +For example, if the references for your book are stored in `references.bib`, +then the bibliography is inserted with: + +```{bibliography} +``` + +## Learn more + +This is just a simple starter to get you started. +You can learn a lot more at [jupyterbook.org](https://jupyterbook.org). diff --git a/content/notebooks.ipynb b/content/notebooks.ipynb new file mode 100644 index 0000000..fdb7176 --- /dev/null +++ b/content/notebooks.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Content with notebooks\n", + "\n", + "You can also create content with Jupyter Notebooks. This means that you can include\n", + "code blocks and their outputs in your book.\n", + "\n", + "## Markdown + notebooks\n", + "\n", + "As it is markdown, you can embed images, HTML, etc into your posts!\n", + "\n", + "![](https://myst-parser.readthedocs.io/en/latest/_static/logo-wide.svg)\n", + "\n", + "You can also $add_{math}$ and\n", + "\n", + "$$\n", + "math^{blocks}\n", + "$$\n", + "\n", + "or\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + "\\mbox{mean} la_{tex} \\\\ \\\\\n", + "math blocks\n", + "\\end{aligned}\n", + "$$\n", + "\n", + "But make sure you \\$Escape \\$your \\$dollar signs \\$you want to keep!\n", + "\n", + "## MyST markdown\n", + "\n", + "MyST markdown works in Jupyter Notebooks as well. 
For more information about MyST markdown, check\n", + "out [the MyST guide in Jupyter Book](https://jupyterbook.org/content/myst.html),\n", + "or see [the MyST markdown documentation](https://myst-parser.readthedocs.io/en/latest/).\n", + "\n", + "## Code blocks and outputs\n", + "\n", + "Jupyter Book will also embed your code blocks and output in your book.\n", + "For example, here's some sample Matplotlib code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib import rcParams, cycler\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "plt.ion()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fixing random state for reproducibility\n", + "np.random.seed(19680801)\n", + "\n", + "N = 10\n", + "data = [np.logspace(0, 1, 100) + np.random.randn(100) + ii for ii in range(N)]\n", + "data = np.array(data).T\n", + "cmap = plt.cm.coolwarm\n", + "rcParams['axes.prop_cycle'] = cycler(color=cmap(np.linspace(0, 1, N)))\n", + "\n", + "\n", + "from matplotlib.lines import Line2D\n", + "custom_lines = [Line2D([0], [0], color=cmap(0.), lw=4),\n", + " Line2D([0], [0], color=cmap(.5), lw=4),\n", + " Line2D([0], [0], color=cmap(1.), lw=4)]\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 5))\n", + "lines = ax.plot(data)\n", + "ax.legend(custom_lines, ['Cold', 'Medium', 'Hot']);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a lot more that you can do with outputs (such as including interactive outputs)\n", + "with your book. For more information about this, see [the Jupyter Book documentation](https://jupyterbook.org)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/content/references.bib b/content/references.bib new file mode 100644 index 0000000..783ec6a --- /dev/null +++ b/content/references.bib @@ -0,0 +1,56 @@ +--- +--- + +@inproceedings{holdgraf_evidence_2014, + address = {Brisbane, Australia, Australia}, + title = {Evidence for {Predictive} {Coding} in {Human} {Auditory} {Cortex}}, + booktitle = {International {Conference} on {Cognitive} {Neuroscience}}, + publisher = {Frontiers in Neuroscience}, + author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. and Knight, Robert T.}, + year = {2014} +} + +@article{holdgraf_rapid_2016, + title = {Rapid tuning shifts in human auditory cortex enhance speech intelligibility}, + volume = {7}, + issn = {2041-1723}, + url = {http://www.nature.com/doifinder/10.1038/ncomms13654}, + doi = {10.1038/ncomms13654}, + number = {May}, + journal = {Nature Communications}, + author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. and Rieger, Jochem W. and Crone, Nathan and Lin, Jack J. and Knight, Robert T. and Theunissen, Frédéric E.}, + year = {2016}, + pages = {13654}, + file = {Holdgraf et al. 
- 2016 - Rapid tuning shifts in human auditory cortex enhance speech intelligibility.pdf:C\:\\Users\\chold\\Zotero\\storage\\MDQP3JWE\\Holdgraf et al. - 2016 - Rapid tuning shifts in human auditory cortex enhance speech intelligibility.pdf:application/pdf} +} + +@inproceedings{holdgraf_portable_2017, + title = {Portable learning environments for hands-on computational instruction using container-and cloud-based technology to teach data science}, + volume = {Part F1287}, + isbn = {978-1-4503-5272-7}, + doi = {10.1145/3093338.3093370}, + abstract = {© 2017 ACM. There is an increasing interest in learning outside of the traditional classroom setting. This is especially true for topics covering computational tools and data science, as both are challenging to incorporate in the standard curriculum. These atypical learning environments offer new opportunities for teaching, particularly when it comes to combining conceptual knowledge with hands-on experience/expertise with methods and skills. Advances in cloud computing and containerized environments provide an attractive opportunity to improve the effciency and ease with which students can learn. This manuscript details recent advances towards using commonly-Available cloud computing services and advanced cyberinfrastructure support for improving the learning experience in bootcamp-style events. We cover the benets (and challenges) of using a server hosted remotely instead of relying on student laptops, discuss the technology that was used in order to make this possible, and give suggestions for how others could implement and improve upon this model for pedagogy and reproducibility.}, + booktitle = {{ACM} {International} {Conference} {Proceeding} {Series}}, + author = {Holdgraf, Christopher Ramsay and Culich, A. and Rokem, A. and Deniz, F. and Alegro, M. and Ushizima, D.}, + year = {2017}, + keywords = {Teaching, Bootcamps, Cloud computing, Data science, Docker, Pedagogy} +} + +@article{holdgraf_encoding_2017, + title = {Encoding and decoding models in cognitive electrophysiology}, + volume = {11}, + issn = {16625137}, + doi = {10.3389/fnsys.2017.00061}, + abstract = {© 2017 Holdgraf, Rieger, Micheli, Martin, Knight and Theunissen. Cognitive neuroscience has seen rapid growth in the size and complexity of data recorded from the human brain as well as in the computational tools available to analyze this data. This data explosion has resulted in an increased use of multivariate, model-based methods for asking neuroscience questions, allowing scientists to investigate multiple hypotheses with a single dataset, to use complex, time-varying stimuli, and to study the human brain under more naturalistic conditions. These tools come in the form of “Encoding” models, in which stimulus features are used to model brain activity, and “Decoding” models, in which neural features are used to generated a stimulus output. Here we review the current state of encoding and decoding models in cognitive electrophysiology and provide a practical guide toward conducting experiments and analyses in this emerging field. Our examples focus on using linear models in the study of human language and audition. We show how to calculate auditory receptive fields from natural sounds as well as how to decode neural recordings to predict speech. The paper aims to be a useful tutorial to these approaches, and a practical introduction to using machine learning and applied statistics to build models of neural activity. 
The data analytic approaches we discuss may also be applied to other sensory modalities, motor systems, and cognitive systems, and we cover some examples in these areas. In addition, a collection of Jupyter notebooks is publicly available as a complement to the material covered in this paper, providing code examples and tutorials for predictive modeling in python. The aimis to provide a practical understanding of predictivemodeling of human brain data and to propose best-practices in conducting these analyses.}, + journal = {Frontiers in Systems Neuroscience}, + author = {Holdgraf, Christopher Ramsay and Rieger, J.W. and Micheli, C. and Martin, S. and Knight, R.T. and Theunissen, F.E.}, + year = {2017}, + keywords = {Decoding models, Encoding models, Electrocorticography (ECoG), Electrophysiology/evoked potentials, Machine learning applied to neuroscience, Natural stimuli, Predictive modeling, Tutorials} +} + +@book{ruby, + title = {The Ruby Programming Language}, + author = {Flanagan, David and Matsumoto, Yukihiro}, + year = {2008}, + publisher = {O'Reilly Media} +} diff --git a/content/reports/hdfs/index.md b/content/reports/hdfs/index.md deleted file mode 100644 index 091207d..0000000 --- a/content/reports/hdfs/index.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: HDFS Storage Reports -description: Summaries generated from the data stored on HDFS. ---- - -# Introduction - -The following statistics are generated based on the contents of the HDFS file system we use to store our data. We regularly scan the store and classify each item based on where it is stored, it's file type, and so on. Where possible, we also extract date information, e.g. using the date stamp within the filename of our WARC files to estimate the time that WARC data was collected (although strictly speaking we are using the date the file was created). This means the dates are reliable for all but the earliest years of selective web archiving, i.e. before we started putting the dates in the filenames. - -All figures are in bytes unless otherwise stated. - -Deeper analysis can be performed using Python notebooks, e.g. [hdfs-reports-full.ipynb](http://intranet.wa.bl.uk/ukwa/jupyter/notebooks/ukwa-manage/notebooks/hdfs-reports-full.ipynb). - -# Non-Print Legal Deposit Content - -This section only includes archival content, i.e. WARCs (either normal content or 'viral WARCs' containing material that appears to contain computer viruses), crawl logs and any additional archival package material. - -## NPLD Totals - -{{< csv-table src="reports/hdfs/npld-total-file-size-by-stream-totals.csv" >}} - -## NPLD Total By Year - -{{< csv-table src="reports/hdfs/npld-total-file-size-by-stream-per-year.csv" >}} - -## NPLD Total By Month - -{{< date-bar-chart src="reports/hdfs/npld-total-file-size-by-stream-per-month.csv" >}} - -# All Holdings - -These section includes all material on the cluster. If the files appear to be associated with a crawl stream, then the collection (e.g. Non-Print Legal Deposit) and stream (e.g. Domain crawl) will be set. If not, the collection and stream will both be 'None'. - -## Total bytes of files on HDFS, by collection, stream and type - -This report breaks down the total size of files (in bytes) stored on HDFS by the collection, stream, and type of file. - -{{< csv-table src="reports/hdfs/total-file-size-by-stream.csv" >}} - -## Total numbers of files on HDFS, by collection, stream and type - -This report breaks down the number of files stored on HDFS by the collection, stream, and type of file. 
- -{{< csv-table src="reports/hdfs/total-file-count-by-stream.csv" >}} - - - diff --git a/content/reports/hdfs/npld-total-file-size-by-stream-per-month.csv b/content/reports/hdfs/npld-total-file-size-by-stream-per-month.csv deleted file mode 100644 index 3a45410..0000000 --- a/content/reports/hdfs/npld-total-file-size-by-stream-per-month.csv +++ /dev/null @@ -1,135 +0,0 @@ -timestamp,stream,file_size -2013-04,domain,21496645180530 -2013-04,frequent,1233494513066 -2013-05,domain,10646577013628 -2013-05,frequent,503687665781 -2013-06,domain,1763992820313 -2013-06,frequent,399955364390 -2013-07,domain,497880630017 -2013-07,frequent,60835781997 -2013-08,frequent,585453340666 -2013-09,frequent,212254954288 -2013-10,frequent,691888255783 -2013-11,frequent,744359661269 -2013-12,frequent,814236551583 -2014-01,frequent,713736844362 -2014-02,frequent,897239626634 -2014-03,frequent,713895516622 -2014-04,frequent,1005532161253 -2014-05,frequent,857723734233 -2014-06,domain,4990292019947 -2014-06,frequent,1011213873805 -2014-07,domain,7997786988830 -2014-07,frequent,850225024059 -2014-08,domain,4228763495435 -2014-08,frequent,836258087252 -2014-09,domain,1887002283200 -2014-09,frequent,589813630251 -2014-10,domain,12901366343469 -2014-10,frequent,1152274513168 -2014-11,domain,19704030330247 -2014-11,frequent,544292623756 -2014-12,domain,10428775788758 -2014-12,frequent,515455415080 -2015-01,frequent,784513489409 -2015-02,frequent,1054532023006 -2015-03,frequent,937694648538 -2015-04,frequent,1628677164184 -2015-05,domain,235613102166 -2015-05,frequent,655365232861 -2015-06,frequent,996598324470 -2015-07,domain,158765321 -2015-07,frequent,1511157141370 -2015-08,domain,4572882297168 -2015-08,frequent,1307160334076 -2015-09,domain,21517695604196 -2015-09,frequent,1125054533737 -2015-10,domain,10428473325631 -2015-10,frequent,1621492476057 -2015-11,domain,24232100394090 -2015-11,frequent,1298818389994 -2015-12,domain,14741495433783 -2015-12,frequent,1452828183176 -2016-01,frequent,5090601554703 -2016-02,frequent,1596757583284 -2016-03,frequent,1537087882554 -2016-04,frequent,2606971226735 -2016-05,frequent,2333724868781 -2016-06,domain,112437071 -2016-06,frequent,1783338096350 -2016-07,frequent,1820459951534 -2016-08,domain,4137229470967 -2016-08,frequent,1394484649025 -2016-09,domain,21530388796510 -2016-09,frequent,2351233461774 -2016-10,domain,15181982950820 -2016-10,frequent,3306292773345 -2016-11,domain,36239249417877 -2016-11,frequent,1416921135036 -2016-12,domain,27739226101697 -2016-12,frequent,1495647147271 -2016-12,webrecorder,491399057 -2017-01,domain,52119907584 -2017-01,frequent,2771308110362 -2017-02,frequent,4827898598837 -2017-03,frequent,1335312593832 -2017-04,frequent,2653267245899 -2017-05,frequent,1721199155315 -2017-06,domain,30883890274313 -2017-06,frequent,1950827074054 -2017-07,domain,22315390740390 -2017-07,frequent,3813423988275 -2017-08,domain,15199739069784 -2017-08,frequent,1210113588184 -2017-09,domain,9341478791455 -2017-09,frequent,1442995071443 -2017-10,domain,8375778101 -2017-10,frequent,1913313636381 -2017-11,domain,4127772894 -2017-11,frequent,1268306765297 -2017-12,domain,0 -2017-12,frequent,1802498617427 -2018-01,frequent,1798915526045 -2018-02,frequent,1699718044903 -2018-03,frequent,1696905474489 -2018-04,frequent,2711542892486 -2018-05,frequent,4145278849571 -2018-06,frequent,10696122110533 -2018-07,domain,16726081849085 -2018-07,frequent,6991581069333 -2018-08,domain,1378165058697 -2018-08,frequent,5499116104767 -2018-09,domain,20518602522171 
-2018-09,frequent,7312429865198 -2018-10,domain,8586705094806 -2018-10,frequent,6702959190875 -2018-11,domain,8379383778088 -2018-11,frequent,7521458564640 -2018-12,domain,3414467033155 -2018-12,frequent,5176282768651 -2019-01,domain,139067 -2019-01,frequent,6317749831118 -2019-02,frequent,5436992410931 -2019-03,frequent,8290189122855 -2019-04,domain,65160862590 -2019-04,frequent,9895183469900 -2019-05,frequent,10784644761351 -2019-05,webrecorder,1138465383 -2019-06,domain,6836933688282 -2019-06,frequent,6832035740538 -2019-07,domain,11973478622062 -2019-07,frequent,14330210460019 -2019-08,domain,11689079851921 -2019-08,frequent,7234706643136 -2019-09,domain,4976564393961 -2019-09,frequent,8251140337157 -2019-10,domain,8427751607269 -2019-10,frequent,11613138687572 -2019-11,domain,10113950588123 -2019-11,frequent,8615401186359 -2019-12,domain,9621765218631 -2019-12,frequent,8037741598606 -2020-01,domain,28670272746 -2020-01,frequent,5429618611087 -2020-02,frequent,3211085834107 -2020-02,webrecorder,1653237053 diff --git a/content/reports/hdfs/npld-total-file-size-by-stream-per-year.csv b/content/reports/hdfs/npld-total-file-size-by-stream-per-year.csv deleted file mode 100644 index cd756e7..0000000 --- a/content/reports/hdfs/npld-total-file-size-by-stream-per-year.csv +++ /dev/null @@ -1,9 +0,0 @@ -timestamp,domain,frequent,webrecorder -2013,34.41 TB,5.25 TB, -2014,62.14 TB,9.69 TB, -2015,75.73 TB,14.37 TB, -2016,104.83 TB,26.73 TB,491.40 MB -2017,77.81 TB,26.71 TB, -2018,59.00 TB,61.95 TB, -2019,63.70 TB,105.64 TB,1.14 GB -2020,28.67 GB,8.64 TB,1.65 GB diff --git a/content/reports/hdfs/npld-total-file-size-by-stream-totals.csv b/content/reports/hdfs/npld-total-file-size-by-stream-totals.csv deleted file mode 100644 index 0cf8fb2..0000000 --- a/content/reports/hdfs/npld-total-file-size-by-stream-totals.csv +++ /dev/null @@ -1,5 +0,0 @@ -stream,file_size,file_count -domain,477.64 TB,481846 -frequent,258.98 TB,329881 -webrecorder,3.28 GB,104 -total,736.63 TB,811831 diff --git a/content/reports/hdfs/total-file-count-by-stream.csv b/content/reports/hdfs/total-file-count-by-stream.csv deleted file mode 100644 index 39aae76..0000000 --- a/content/reports/hdfs/total-file-count-by-stream.csv +++ /dev/null @@ -1,100 +0,0 @@ -collection,stream,timestamp,cdx,crawl-logs,dlx,logs,unknown,viral,warcs,warcs-invalid -0_original,None,2016,,,,,146514,,, -0_original,None,2017,,,,,1028,,, -1_data,None,2014,,,,,4,,, -1_data,None,2015,,,,,25,,, -1_data,None,2016,,,,,349,,, -1_data,None,2017,,,,,9,,, -1_data,None,2018,,,,,313,,, -1_data,None,2019,,,,,10706,,, -1_data,None,2020,,,,,420,,, -2_backups,None,2017,,,,,162,,, -2_backups,None,2018,,,,,413,,, -2_backups,None,2019,,,,,866259,,, -2_backups,None,2020,,,,,418893,,, -9_processing,None,2016,,,,,1738,,, -9_processing,None,2017,,,,,24,,, -9_processing,None,2018,,,,,12579,,, -9_processing,None,2019,,,,,13110,,, -9_processing,None,2020,,,,,984,,, -blit,None,2016,,,,,230,,, -blit,None,2017,,,,,405,,, -blit,None,2018,,,,,86,,, -blit,None,2019,,,,,51,,, -crawls,None,2012,,,,,16,,, -data,selective,2011,,,,,125,,, -data,selective,2012,,,,,62,,, -data,selective,2014,,,,,4173,,, -data,selective,2017,,,,,1,,, -data,selective,2019,,,,,2,,, -datasets,None,2014,,,,,18,,, -datasets,None,2017,,,,,2219,,, -heritrix,None,2013,,,,,261,,, -heritrix,None,2014,,,,,8506,,, -heritrix,None,2015,,,,,7481,,, -heritrix,None,2016,,,,,871,,, -heritrix,None,2017,,,,,2841,,, -heritrix,None,2019,,,,,4872,,, -ia,None,2011,,,,,884220,,, -ia,None,2012,,,,,56714,,, -ia,None,2014,,,,,203502,,, 
-ia,None,2017,,,,,11,,, -logs,None,2014,,,,,2735,,, -logs,None,2015,,,,,1939,,, -logs,None,2016,,,,,1171,,, -logs,None,2017,,,,,1405,,, -logs,None,2018,,,,,1493,,, -logs,None,2019,,,,,8868,,, -logs,None,2020,,,,,863,,, -lvdata,None,2011,,,,,18,,, -lvdata,None,2015,,,,,12,,, -lvdata,None,2016,,,,,2,,, -lvdata,None,2017,,,,,43,,, -lvdata,None,2018,,,,,25,,, -lvdata,None,2019,,,,,72,,, -npld,domain,2013,,9,,4,,10,33102, -npld,domain,2014,,,,50,,228,61196, -npld,domain,2015,,109,,9,,199,74237, -npld,domain,2016,,587,,2725,,237,102141, -npld,domain,2017,,254,,1516,,154,76079, -npld,domain,2018,,2109,,,,445,59420, -npld,domain,2019,,451,,,,139,70727, -npld,domain,2020,,9,,,,4,, -npld,frequent,2013,,,,173,,17,6581, -npld,frequent,2014,,,,4830,,11,15787, -npld,frequent,2015,,,,1806,,36,18635, -npld,frequent,2016,,92,,780,,125,27212, -npld,frequent,2017,,18035,,102704,,346,40397,70 -npld,frequent,2018,,5768,,34948,,707,71285,99 -npld,frequent,2019,,4924,,25492,,1152,109693, -npld,frequent,2020,,409,,2062,,14,8655, -npld,webrecorder,2016,,,,,,,6, -npld,webrecorder,2019,,,,,,,97, -npld,webrecorder,2020,,,,,,,1, -selective,selective,2010,,50,5076,42,5032,,5078, -selective,selective,2011,,30896,141666,30718,65920,,121927,33 -selective,selective,2012,14581,11337,53229,12150,22681,,51356, -selective,selective,2013,49866,9853,49891,11370,20212,,49994, -selective,selective,2014,29058,5065,29067,6108,10139,,29058, -selective,selective,2015,27650,7945,29820,9537,16147,,30081, -selective,selective,2016,5227,1372,4946,1931,2780,,5224, -tmp,None,2010,,,,,1815,,, -tmp,None,2011,,,,,13929,,, -tmp,None,2012,,,,,1164,,, -tmp,None,2013,,,,,3291,,, -tmp,None,2014,,,,,5679,,, -tmp,None,2015,,,,,6348,,, -tmp,None,2016,,,,,3230,,, -tmp,None,2017,,,,,61,,, -tmp,None,2018,,,,,156,,, -tmp,None,2019,,,,,327,,, -tmp,None,2020,,,,,48,,, -user,None,2015,,,,,1456,,, -user,None,2016,,,,,19338,,, -user,None,2017,,,,,34525,,, -user,None,2018,,,,,8170,,, -user,None,2019,,,,,12971,,, -user,None,2020,,,,,1456,,, -wap,None,2010,,,,,34,,, -wayback,None,2015,,,,,789,,, -wayback,None,2016,,,,,104,,, diff --git a/content/reports/hdfs/total-file-size-by-stream.csv b/content/reports/hdfs/total-file-size-by-stream.csv deleted file mode 100644 index 99d1110..0000000 --- a/content/reports/hdfs/total-file-size-by-stream.csv +++ /dev/null @@ -1,100 +0,0 @@ -collection,stream,timestamp,cdx,crawl-logs,dlx,logs,unknown,viral,warcs,warcs-invalid -0_original,None,2016,,,,,22356608006374,,, -0_original,None,2017,,,,,33802480975,,, -1_data,None,2014,,,,,1137405109,,, -1_data,None,2015,,,,,20119609024,,, -1_data,None,2016,,,,,161862536422,,, -1_data,None,2017,,,,,1717439291,,, -1_data,None,2018,,,,,15902128015,,, -1_data,None,2019,,,,,322733997749,,, -1_data,None,2020,,,,,7526480384,,, -2_backups,None,2017,,,,,11214754536,,, -2_backups,None,2018,,,,,47633278863,,, -2_backups,None,2019,,,,,9088959223560,,, -2_backups,None,2020,,,,,941915847706,,, -9_processing,None,2016,,,,,165581265235,,, -9_processing,None,2017,,,,,837868177,,, -9_processing,None,2018,,,,,16122448923,,, -9_processing,None,2019,,,,,17086617481,,, -9_processing,None,2020,,,,,3118411218,,, -blit,None,2016,,,,,8217960771,,, -blit,None,2017,,,,,106802817414,,, -blit,None,2018,,,,,5502164646,,, -blit,None,2019,,,,,399982567,,, -crawls,None,2012,,,,,12800085491,,, -data,selective,2011,,,,,2297602445,,, -data,selective,2012,,,,,1998893419,,, -data,selective,2014,,,,,1107799922839,,, -data,selective,2017,,,,,115055397,,, -data,selective,2019,,,,,807614932,,, 
-datasets,None,2014,,,,,308058048005,,, -datasets,None,2017,,,,,200036276729,,, -heritrix,None,2013,,,,,170254006349,,, -heritrix,None,2014,,,,,3681747033289,,, -heritrix,None,2015,,,,,6084460400898,,, -heritrix,None,2016,,,,,558186402532,,, -heritrix,None,2017,,,,,2674517692954,,, -heritrix,None,2019,,,,,2434853095435,,, -ia,None,2011,,,,,31273038350790,,, -ia,None,2012,,,,,4056519069254,,, -ia,None,2014,,,,,30036052322986,,, -ia,None,2017,,,,,23163443,,, -logs,None,2014,,,,,10326703264,,, -logs,None,2015,,,,,967435220,,, -logs,None,2016,,,,,216110700,,, -logs,None,2017,,,,,269406885,,, -logs,None,2018,,,,,468741857,,, -logs,None,2019,,,,,1519682542,,, -logs,None,2020,,,,,1885568599,,, -lvdata,None,2011,,,,,14758580,,, -lvdata,None,2015,,,,,450484624,,, -lvdata,None,2016,,,,,215438,,, -lvdata,None,2017,,,,,550013821,,, -lvdata,None,2018,,,,,743900179,,, -lvdata,None,2019,,,,,21142482,,, -npld,domain,2013,,497880630017,,225807897946,,4378472582,33902836541889, -npld,domain,2014,,,,1397836972235,,4882231936,62133135017950, -npld,domain,2015,,235613102166,,337291291481,,4570058043,75488235762146, -npld,domain,2016,,1021393765281,,889369950291,,8012152161,103798783257500, -npld,domain,2017,,827631587106,,650096581292,,4444901826,76973045845589, -npld,domain,2018,,1417051338354,,,,2075196719,57584278800929, -npld,domain,2019,,998109935920,,,,1771940785,62704803095201, -npld,domain,2020,,28648779575,,,,21493171,, -npld,frequent,2013,,,,784344449,,99041071,5246067047752, -npld,frequent,2014,,,,117389643041,,34269049,9687626781426, -npld,frequent,2015,,,,74768951781,,271945467,14373619995411, -npld,frequent,2016,,40202387198,,119957450800,,2170071165,26691147872029, -npld,frequent,2017,,933573121920,,1036051730769,,119883019,25776771440367,13143678154 -npld,frequent,2018,,1273256843460,,1590177149560,,255134402,60678798483629,17960316500 -npld,frequent,2019,,3306036343595,,2919995202567,,244227036,102332853678911, -npld,frequent,2020,,326451164873,,127397676420,,8261211,8314245019110, -npld,webrecorder,2016,,,,,,,491399057, -npld,webrecorder,2019,,,,,,,1138465383, -npld,webrecorder,2020,,,,,,,1653237053, -selective,selective,2010,,16174178,14852340758,1466992,87516346638,,889691193222, -selective,selective,2011,,70177720850,114203699462,7627959672,55165225240,,9682572380858,96736608 -selective,selective,2012,8597134205,29180359746,42420273550,5782744821,960121658,,5126476552554, -selective,selective,2013,32234715695,30828782244,39798000486,7371246183,343710799,,4944394041969, -selective,selective,2014,20844001981,17389874737,24449401822,4919485664,181445191,,3122414823039, -selective,selective,2015,14243250050,15662303216,20358418164,5676141494,255820890,,2259050467116, -selective,selective,2016,2430183563,2211062877,3181261947,1110765359,44371755,,391490414932, -tmp,None,2010,,,,,640716644,,, -tmp,None,2011,,,,,12887736803,,, -tmp,None,2012,,,,,1045726679,,, -tmp,None,2013,,,,,175177527,,, -tmp,None,2014,,,,,24530609,,, -tmp,None,2015,,,,,91108259,,, -tmp,None,2016,,,,,25889003,,, -tmp,None,2017,,,,,180527337,,, -tmp,None,2018,,,,,185392360,,, -tmp,None,2019,,,,,2089575884,,, -tmp,None,2020,,,,,1624290646,,, -user,None,2015,,,,,26882901161,,, -user,None,2016,,,,,26603665515,,, -user,None,2017,,,,,111546570706,,, -user,None,2018,,,,,317525167912,,, -user,None,2019,,,,,66679231335,,, -user,None,2020,,,,,8888634521,,, -wap,None,2010,,,,,108129896030,,, -wayback,None,2015,,,,,239024157440,,, -wayback,None,2016,,,,,75861089075,,, diff --git a/content/reports/storage/humanbytes.py 
b/content/reports/storage/humanbytes.py new file mode 100644 index 0000000..020f027 --- /dev/null +++ b/content/reports/storage/humanbytes.py @@ -0,0 +1,56 @@ +from typing import List, Union + +# From https://stackoverflow.com/a/63839503/6689 + +class HumanBytes: + METRIC_LABELS: List[str] = ["B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] + BINARY_LABELS: List[str] = ["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"] + PRECISION_OFFSETS: List[float] = [0.5, 0.05, 0.005, 0.0005] # PREDEFINED FOR SPEED. + PRECISION_FORMATS: List[str] = ["{}{:.0f} {}", "{}{:.1f} {}", "{}{:.2f} {}", "{}{:.3f} {}"] # PREDEFINED FOR SPEED. + + @staticmethod + def format(num: Union[int, float], metric: bool=False, precision: int=1) -> str: + """ + Human-readable formatting of bytes, using binary (powers of 1024) + or metric (powers of 1000) representation. + """ + + assert isinstance(num, (int, float)), "num must be an int or float" + assert isinstance(metric, bool), "metric must be a bool" + assert isinstance(precision, int) and precision >= 0 and precision <= 3, "precision must be an int (range 0-3)" + + unit_labels = HumanBytes.METRIC_LABELS if metric else HumanBytes.BINARY_LABELS + last_label = unit_labels[-1] + unit_step = 1000 if metric else 1024 + unit_step_thresh = unit_step - HumanBytes.PRECISION_OFFSETS[precision] + + is_negative = num < 0 + if is_negative: # Faster than ternary assignment or always running abs(). + num = abs(num) + + for unit in unit_labels: + if num < unit_step_thresh: + # VERY IMPORTANT: + # Only accepts the CURRENT unit if we're BELOW the threshold where + # float rounding behavior would place us into the NEXT unit: F.ex. + # when rounding a float to 1 decimal, any number ">= 1023.95" will + # be rounded to "1024.0". Obviously we don't want ugly output such + # as "1024.0 KiB", since the proper term for that is "1.0 MiB". + break + if unit != last_label: + # We only shrink the number if we HAVEN'T reached the last unit. + # NOTE: These looped divisions accumulate floating point rounding + # errors, but each new division pushes the rounding errors further + # and further down in the decimals, so it doesn't matter at all. 
+ num /= unit_step + + return HumanBytes.PRECISION_FORMATS[precision].format("-" if is_negative else "", num, unit) + +#print(HumanBytes.format(2251799813685247)) # 2 pebibytes +#print(HumanBytes.format(2000000000000000, True)) # 2 petabytes +#print(HumanBytes.format(1099511627776)) # 1 tebibyte +#print(HumanBytes.format(1000000000000, True)) # 1 terabyte +#print(HumanBytes.format(1000000000, True)) # 1 gigabyte +#print(HumanBytes.format(4318498233, precision=3)) # 4.022 gibibytes +#print(HumanBytes.format(4318498233, True, 3)) # 4.318 gigabytes +#print(HumanBytes.format(-4318498233, precision=2)) # -4.02 gibibytes \ No newline at end of file diff --git a/content/reports/storage/scratch.md b/content/reports/storage/scratch.md new file mode 100644 index 0000000..a70ae6b --- /dev/null +++ b/content/reports/storage/scratch.md @@ -0,0 +1,221 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# Scratch space + +A place to experiment with other analyses + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-cell] +--- +import os +import requests +import pandas as pd +from humanbytes import HumanBytes +from IPython.display import display, HTML, FileLink, FileLinks + +pd.set_option('display.max_rows', 100) + +# Pick up source locations: +trackdb_jsonl = os.environ.get('TRACKDB_LIST_JSONL','trackdb_list.jsonl') +aws_jsonl = os.environ.get('AWS_S3_LIST_JSONL','aws_s3_list.jsonl') + +# Load TrackDB records: +df = pd.read_json(trackdb_jsonl, lines=True) + +# Also load AWS records: +aws_df = pd.read_json(aws_jsonl, lines=True) +# Filter out non-content files: +aws_df = aws_df[aws_df['kind_s'] != 'unknown'] +df = pd.concat([df,aws_df], sort=True) + +# Set up timestamp: +df['timestamp_dt']= pd.to_datetime(df['timestamp_dt']) +total_records = len(df) + +# Force integers: +df['file_size_l'] = df['file_size_l'].fillna(0) +df['file_size_l'] = df['file_size_l'].apply(int) + +display(HTML(f"Found a total of {total_records:,} WARC and crawl log files.")) +``` + +```{code-cell} ipython3 +:tags: [hide-input] + +# Dataframe of all unique paths (drop others for paths appearing in more than one 'fresh' TrackDB record): +dfu = df.drop_duplicates(subset=['file_path_s']).drop(columns=['file_path_s']) + +unique_records = len(dfu) + +display(HTML(f"Found {unique_records:,} unique files (based on file path). This means there are {(total_records-unique_records):,} files duplicated across storage systems.")) +``` + +The following table shows the most recent WARCs for each data store, along with the associated timestamp. This can be used to check the source data for this report is up to date. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- + +``` + +```{code-cell} ipython3 + +``` + +## Radial Visualization + +This is a work in progress and is not working yet. 
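+
+For reference, the `stratify` transform in the Vega spec further down expects flat node records like those in its flare.json example. A minimal sketch of that shape follows (the `example_tree` name and values are hypothetical, not real report data):
+
+```{code-cell} ipython3
+# Illustrative only: the flat {id, name, parent, size} records that Vega's
+# `stratify` transform turns into a hierarchy. The root node has no parent;
+# leaf nodes carry a `size`, which the `partition` transform uses for areas.
+example_tree = [
+    {'id': 1, 'name': 'total'},
+    {'id': 2, 'name': '2019', 'parent': 1},
+    {'id': 3, 'name': 'npld', 'parent': 2, 'size': 12345},
+]
+```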
+ +```{code-cell} ipython3 +#for gn, g in dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']): +dfuu = dfu.filter(['timestamp_dt', 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s', 'file_size_l']).rename( + columns={ + 'file_size_l': 'size_bytes', + 'kind_s': 'kind', + 'stream_s': 'stream', + 'count': 'file_count', + 'timestamp_dt': 'year', + 'collection_s': 'collection', + 'hdfs_service_id_s': 'store' + } +) +dfuu +``` + +```{code-cell} ipython3 +# Build up items for the tree: +# { +# "id": 246, +# "name": "TreeMapLayout", +# "parent": 231, +# "size": 9191 +# }, + + +entries = [] +entry_id = 0 + +entries.append({ + 'id': entry_id, + 'name': "total", + 'size': dfuu['size_bytes'].sum(), + 'count': dfuu['size_bytes'].count() +}) +parent_id = entry_id +entry_id += 1 + +for ts, ts_g in dfuu.groupby(pd.Grouper(key='year', freq="A")): + print(ts.year) + for col, col_g in ts_g.groupby('collection'): + print(ts.year, col, col_g['size_bytes'].count(), col_g['size_bytes'].sum()) + for stream, stream_g in col_g.groupby('stream'): + print(ts.year, col, stream) + for kind, kind_g in stream_g.groupby('kind'): + print(ts.year, col, stream, kind) + for store, store_g in kind_g.groupby('store'): + print(ts.year, col, stream, kind, store, store_g['size_bytes'].count(), store_g['size_bytes'].sum()) +``` + +```{code-cell} ipython3 +from altair.vega import vega + +vega({ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "description": "An example of a space-fulling radial layout for hierarchical data.", + "width": 600, + "height": 600, + "padding": 5, + "autosize": "none", + + "data": [ + { + "name": "tree", + "url": "https://vega.github.io/vega/data/flare.json", + "transform": [ + { + "type": "stratify", + "key": "id", + "parentKey": "parent" + }, + { + "type": "partition", + "field": "size", + "sort": {"field": "value"}, + "size": [{"signal": "2 * PI"}, {"signal": "width / 2"}], + "as": ["a0", "r0", "a1", "r1", "depth", "children"] + } + ] + } + ], + + "scales": [ + { + "name": "color", + "type": "ordinal", + "domain": {"data": "tree", "field": "depth"}, + "range": {"scheme": "tableau10"} + } + ], + + "marks": [ + { + "type": "arc", + "from": {"data": "tree"}, + "encode": { + "enter": { + "x": {"signal": "width / 2"}, + "y": {"signal": "height / 2"}, + "fill": {"scale": "color", "field": "depth"}, + "tooltip": {"signal": "datum.name + (datum.size ? ', ' + datum.size + ' bytes' : '')"} + }, + "update": { + "startAngle": {"field": "a0"}, + "endAngle": {"field": "a1"}, + "innerRadius": {"field": "r0"}, + "outerRadius": {"field": "r1"}, + "stroke": {"value": "white"}, + "strokeWidth": {"value": 0.75}, + "zindex": {"value": 0} + }, + "hover": { + "stroke": {"value": "red"}, + "strokeWidth": {"value": 1.5}, + "zindex": {"value": 1} + } + } + } + ] +}) +``` + +```{code-cell} ipython3 + +``` + +```{code-cell} ipython3 + +``` + +```{code-cell} ipython3 + +``` diff --git a/content/reports/storage/summary.md b/content/reports/storage/summary.md new file mode 100644 index 0000000..0fa3e04 --- /dev/null +++ b/content/reports/storage/summary.md @@ -0,0 +1,243 @@ +--- +jupytext: + cell_metadata_filter: -all + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Summary Report + +First we load the data and look for duplicate data across the different storage services. 
+ +```{code-cell} ipython3 +import os +import requests +import pandas as pd +from humanbytes import HumanBytes +from IPython.display import display, HTML, FileLink, FileLinks +import pathlib + +dir_path = pathlib.Path().absolute() + +pd.set_option('display.max_rows', 100) + +# Pick up source locations: +trackdb_jsonl = os.environ.get('TRACKDB_LIST_JSONL', dir_path.joinpath('trackdb_list.jsonl')) +aws_jsonl = os.environ.get('AWS_S3_LIST_JSONL', dir_path.joinpath('aws_s3_list.jsonl')) + +# Load TrackDB records: +df = pd.read_json(trackdb_jsonl, lines=True) + +# Also load AWS records: +aws_df = pd.read_json(aws_jsonl, lines=True) +# Filter out non-content files: +aws_df = aws_df[aws_df['kind_s'] != 'unknown'] +df = pd.concat([df,aws_df], sort=True) + +# Set up timestamp: +df['timestamp_dt']= pd.to_datetime(df['timestamp_dt']) +total_records = len(df) + +# Force integers: +df['file_size_l'] = df['file_size_l'].fillna(0) +df['file_size_l'] = df['file_size_l'].apply(int) + +display(HTML(f"Found a total of {total_records:,} WARC and crawl log files.")) +``` + +```{code-cell} ipython3 +# Dataframe of all unique paths (drop others for paths appearing in more than one 'fresh' TrackDB record): +dfu = df.drop_duplicates(subset=['file_path_s']).drop(columns=['file_path_s']) + +unique_records = len(dfu) + +display(HTML(f"Found {unique_records:,} unique files (based on file path). This means there are {(total_records-unique_records):,} files duplicated across storage systems.")) +``` + +The following table shows the most recent WARCs for each data store, along with the associated timestamp. This can be used to check the source data for this report is up to date. + +```{code-cell} ipython3 +pd.set_option('display.max_colwidth', 1024) +# Now we look for the most recent WARC files: +dflw = df.filter(items=['hdfs_service_id_s', 'file_path_s', 'kind_s', 'timestamp_dt'], axis=1) +dflw = dflw.loc[dflw['kind_s'] == 'warcs'].sort_values(['timestamp_dt'],ascending=False).groupby('hdfs_service_id_s').first() +dflw = dflw.reset_index().rename(columns={ + 'hdfs_service_id_s': 'store', +}) +dflw +``` + +## Statistics by Year + +This table summarises our overall totals for the different kinds of data we hold. 
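+
+The yearly tables that follow all use the same pattern: group on the timestamp with a calendar-year `pd.Grouper`, then aggregate the file count and the total bytes. A minimal, self-contained sketch of that pattern on made-up data:
+
+```python
+import pandas as pd
+
+# Toy stand-in for the deduplicated inventory `dfu`:
+toy = pd.DataFrame({
+    "timestamp_dt": pd.to_datetime(["2021-06-01", "2021-07-15", "2022-01-03"]),
+    "file_size_l": [100, 200, 300],
+})
+
+# Group into calendar years (freq="A" = annual) and aggregate count and total size:
+per_year = toy.groupby(pd.Grouper(key="timestamp_dt", freq="A")).agg(
+    count=("file_size_l", "count"),
+    size=("file_size_l", "sum"),
+)
+print(per_year)  # 2021-12-31 -> count=2, size=300; 2022-12-31 -> count=1, size=300
+```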
+ +### Overall totals by year + +```{code-cell} ipython3 +from IPython.display import display + +def show_table_and_dl(df, slug): + # Shift to standard Column Names + df = df.rename(columns={ + 'timestamp_dt': 'year', + 'collection_s': 'collection', + 'stream_s': 'stream', + 'kind_s': 'kind', + 'size': 'size_bytes', + 'count': 'file_count', + 'hdfs_service_id_s': 'store' + }) + + # Add a Total: + df.loc['Total']= df.sum(numeric_only=True) + + # Replace NaNs + df = df.fillna('') + + # Clean up size formatting: + df['size'] = df['size_bytes'].apply(lambda x: HumanBytes.format(x, True)) + df['size_bytes'] = df['size_bytes'].apply(int) + df['file_count'] = df['file_count'].apply(int) + + # Also make the data available for download: + csv_file = f'{slug}.csv' + df.to_csv(csv_file, index=False) + dl = FileLink(csv_file, result_html_prefix='Download the data from this table here: ') + display(df,dl) + +#tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A")]).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() + +# Clip year: +tots['timestamp_dt'] = tots['timestamp_dt'].dt.year.apply(lambda x: str(x)) + +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_year') +``` + +### Totals by Year & Collection + +```{code-cell} ipython3 +#tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() + +# Clip year: +tots['timestamp_dt'] = tots['timestamp_dt'].dt.year.apply(lambda x: str(x)) + +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_year_collection') +``` + +### Totals by Year, Collection, Stream, Store & Kind + +```{code-cell} ipython3 +tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() + +# Clip year: +tots['timestamp_dt'] = tots['timestamp_dt'].dt.year.apply(lambda x: str(x)) + +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_year_collection_stream_store_kind') +``` + +## Statistics by Financial Year + +The same data, but aggregating by financial year. 
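+
+Here `freq="A-MAR"` gives annual bins that end in March, approximating the April-to-March UK financial year; the tables below then label each bin as e.g. `2021-2022`. A quick illustration on made-up dates:
+
+```python
+import pandas as pd
+
+dates = pd.Series(pd.to_datetime(["2021-05-01", "2022-02-01", "2022-04-01"]))
+
+# "A-MAR" = annual periods ending in March, so May 2021 and Feb 2022 share a bin:
+fy = dates.dt.to_period("A-MAR")
+print([str(p) for p in fy])  # ['2022', '2022', '2023'] - labelled by the March year-end
+```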
+
+### Totals by Financial Year
+
+```{code-cell} ipython3
+by_fy = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR")]).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum'))
+
+# Remove the hierarchical index so we can plot:
+by_fy = by_fy.reset_index()
+
+# Transform how FY is presented:
+by_fy['timestamp_dt'] = by_fy['timestamp_dt'].dt.year.apply(lambda x: str(x-1) + "-" + str(x))
+
+# Show table and downloader:
+show_table_and_dl(by_fy, 'totals_by_fy')
+```
+
+### Totals by Financial Year, Collection, Stream, Store & Kind
+
+```{code-cell} ipython3
+by_fy = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum'))
+
+# Remove the hierarchical index so we can plot:
+by_fy = by_fy.reset_index()
+
+# Transform how FY is presented:
+by_fy['timestamp_dt'] = by_fy['timestamp_dt'].dt.year.apply(lambda x: str(x-1) + "-" + str(x))
+
+# Show table and downloader:
+show_table_and_dl(by_fy, 'totals_by_fy_collection_stream_store_kind')
+```
+
+### Graphs of Totals by Stream & Kind, over Time
+
+```{code-cell} ipython3
+by_fy_s = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR"), 'stream_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum'))
+
+# Remove the hierarchical index so we can plot:
+by_fy_s = by_fy_s.reset_index()
+
+# Transform how FY is presented:
+by_fy_s['fy'] = by_fy_s['timestamp_dt'].dt.year.apply(lambda x: str(x-1) + "-" + str(x))
+
+# Refactor/renaming:
+by_fy_s = by_fy_s.filter(['fy', 'stream_s', 'kind_s', 'count', 'size'])
+
+# Present sizes in a readable way
+by_fy_s['readable_size'] = by_fy_s['size'].apply(lambda x: HumanBytes.format(x, True))
+```
+
+```{code-cell} ipython3
+import altair as alt
+
+selection = alt.selection_point(fields=['stream_s'])
+color = alt.condition(
+    selection,
+    alt.Color('stream_s:N').legend(None),
+    alt.value('lightgray')
+)
+
+scatter = alt.Chart(by_fy_s).mark_bar().encode(
+    x=alt.X('fy', axis = alt.Axis(title = 'Financial year')),
+    y=alt.Y('size', axis = alt.Axis(title = 'Total bytes', format='s')),
+    color=color,
+    row=alt.Row('kind_s', title='Kind'),
+    tooltip=[
+        alt.Tooltip('fy', title='Financial year'),
+        alt.Tooltip('stream_s', title='Content stream'),
+        alt.Tooltip('count', title='Number of files'),
+        alt.Tooltip('readable_size', title='Total bytes')
+    ]
+).properties(
+    width=800,height=200
+).resolve_scale(y='independent')
+
+legend = alt.Chart(by_fy_s).mark_point().encode(
+    alt.Y('stream_s').axis(orient='right'),
+    color=color
+).add_params(
+    selection
+)
+
+scatter | legend
+```
+
+```{code-cell} ipython3
+
+```
diff --git a/content/reports/storage/test.md b/content/reports/storage/test.md
new file mode 100644
index 0000000..b0f3f9e
--- /dev/null
+++ b/content/reports/storage/test.md
@@ -0,0 +1,20 @@
+---
+jupytext:
+  cell_metadata_filter: -all
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.15.2
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+# Hello
+
+```{code-cell} ipython3
+print(2+2)
+```
diff --git a/docker-compose.yml b/docker-compose.yml
index e421354..20c050c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,14 +1,34 @@
 # This Compose file is intended for local development, not production deployment.
 #
-# TODO change this file back to v.2 and set it up for local dev.
-#
 version: "3.2"
 services:
-  hugo:
-    build: .
+ lab: + # NOTE this will need to be changed for each user - for some reason username strings don't work: + user: "1001" + build: + context: . + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + ports: + - "8888:8888" + command: "jupyter-lab --ip=0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.password=''" + volumes: + - /etc/passwd:/etc/passwd:ro + - /etc/group:/etc/group:ro + - /etc/shadow:/etc/shadow:ro + - /home/${USER}:/home/${USER} + - ${PWD}:/host + working_dir: /host + + site: + # Make results of build.sh available at http://<host>:8889/act/static/reports/ + image: nginx ports: - - "1001:80" + - "8889:80" + volumes: + - ./_build/html:/usr/share/nginx/html/act/static/reports:ro diff --git a/index.html b/index.html deleted file mode 100644 index e69de29..0000000 diff --git a/layouts/_default/list.html b/layouts/_default/list.html deleted file mode 100644 index 62fae05..0000000 --- a/layouts/_default/list.html +++ /dev/null @@ -1,13 +0,0 @@ -{{ partial "header" . }} - -<main> - - <h2>{{ .Title }}</h2> - - {{ range (.Paginator 100).Pages }} {{ partial "list-item" . }} {{ end }} - -</main> - -{{ partial "paginator" . }} - -{{ partial "footer" . }} diff --git a/layouts/_default/terms.html b/layouts/_default/terms.html deleted file mode 100644 index 62fae05..0000000 --- a/layouts/_default/terms.html +++ /dev/null @@ -1,13 +0,0 @@ -{{ partial "header" . }} - -<main> - - <h2>{{ .Title }}</h2> - - {{ range (.Paginator 100).Pages }} {{ partial "list-item" . }} {{ end }} - -</main> - -{{ partial "paginator" . }} - -{{ partial "footer" . }} diff --git a/layouts/crawls/single.html b/layouts/crawls/single.html deleted file mode 100644 index db16518..0000000 --- a/layouts/crawls/single.html +++ /dev/null @@ -1,43 +0,0 @@ -{{ partial "header" . }} - -<main> - - {{ partial "list-item" . }} - - <br> <div class="text-justify">{{ .Content }}</div> - - <!-- related posts with the same tags --> - {{ $related := first 3 (where (where (where .Site.Pages.ByDate.Reverse ".Type" "==" "post") ".Params.tags" "intersect" .Params.tags) "Permalink" "!=" .Permalink) }} - - {{ if $related }} - - <h4 class="page-header">Related</h4> - - {{ range $related }} {{ partial "list-item" . }} {{ end }} - - {{ end }} - - <h2>Files</h2> - <table> - <tr><th>File</th><th>Kind</th><th>Date</th><th>Size [bytes]</th></tr> - {{ range .Params.files }} - {{ $downloadUrl := print "http://hdfs.gtw.wa.bl.uk:14000/webhdfs/v1" .path "?user.name=access&op=OPEN" }} - <tr> - <td> - {{ if eq .kind "crawl-logs" }} - <a href="http://intranet.wa.bl.uk:8000/?log_url={{ $downloadUrl }}">{{ index (last 1 (split .path "/")) 0 }}</a> - {{ else }} - {{ index (last 1 (split .path "/")) 0 }} - {{ end }} - <a href="{{ $downloadUrl }}">[download]</a> - </td> - <td>{{ .kind }}</td> - <td>{{ .timestamp }}</td> - <td>{{ .filesize }}</td> - </tr> - {{ end }} - </table> - -</main> - -{{ partial "footer.html" . }} diff --git a/layouts/partials/list-item.html b/layouts/partials/list-item.html deleted file mode 100644 index bf88530..0000000 --- a/layouts/partials/list-item.html +++ /dev/null @@ -1,25 +0,0 @@ -<div class="item"> - - {{ $.Scratch.Set "link" .RelPermalink }} - {{ with .Params.repo }} - {{ $repoHost := default "github" $.Params.repoHost }} - {{ if eq "github" $repoHost }} - {{ printf "https://github.com/%s/%s/" $.Site.Params.githubUsername . | $.Scratch.Set "link" }} - {{ else if eq "gitlab" $repoHost }} - {{ printf "https://gitlab.com/%s/%s/" $.Site.Params.gitlabUsername . 
| $.Scratch.Set "link" }} - {{ else if eq "bitbucket" $repoHost }} - {{ printf "https://bitbucket.org/%s/%s/" $.Site.Params.bitbucketUsername . | $.Scratch.Set "link" }} - {{ end }} - {{ end }} - {{ with .Params.link }} {{ $.Scratch.Set "link" . }} {{ end }} - - {{ .Date.Format (.Site.Params.dateFormat | default "2 January 2006") | $.Scratch.Set "subtitle" }} - {{ with .Description }} {{ $.Scratch.Set "subtitle" . }} {{ end }} - - <h4><a href="{{ .Scratch.Get "link" }}">{{ .Title }}</a></h4> - <h5>{{ $.Scratch.Get "subtitle" }}</h5> - {{ range .Params.tags }} - <a href="{{ $.Site.BaseURL }}/tags/{{ . | urlize }}"><kbd class="item-tag">{{ . }}</kbd></a> - {{ end }} - -</div> diff --git a/layouts/reports/single.html b/layouts/reports/single.html deleted file mode 100644 index c0dbd10..0000000 --- a/layouts/reports/single.html +++ /dev/null @@ -1,46 +0,0 @@ -{{ partial "header" . }} - -<script src="//cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js"></script> - -<script type="text/javascript" charset="utf-8"> -$(document).ready( function () { - $('.data-table') - .addClass( 'nowrap' ) - .dataTable( { - "paging": false, - "lengthMenu": [ 25, 50, 100, 200 ] - } ); - } ); -</script> - -<style> -.dataTable { - font-family: sans-serif; -} -</style> - -<main> - - {{ partial "list-item" . }} - - <br> <div class="text-justify"> - <h1>Table of contents</h1> - {{.TableOfContents}} - </div> - - <br> <div class="text-justify">{{ .Content }}</div> - - <!-- related posts with the same tags --> - {{ $related := first 3 (where (where (where .Site.Pages.ByDate.Reverse ".Type" "==" "post") ".Params.tags" "intersect" .Params.tags) "Permalink" "!=" .Permalink) }} - - {{ if $related }} - - <h4 class="page-header">Related</h4> - - {{ range $related }} {{ partial "list-item" . }} {{ end }} - - {{ end }} - -</main> - -{{ partial "footer.html" . }} diff --git a/layouts/shortcodes/csv-table.html b/layouts/shortcodes/csv-table.html deleted file mode 100644 index eeca347..0000000 --- a/layouts/shortcodes/csv-table.html +++ /dev/null @@ -1,27 +0,0 @@ -<table class="data-table compact stripe"> - {{ $url := print "content/" (.Get "src") }} - {{ $sep := "," }} - {{ range $i, $r := getCSV $sep $url }} - {{ if eq 0 $i }} - <thead> - <tr> - {{ range $c := $r }} - <th>{{ $c }}</th> - {{ end }} - </tr> - </thead> - <tbody> - {{ else }} - <tr> - {{ range $c := $r }} - <td>{{ $c }}</td> - {{ end }} - </tr> - {{ end }} - {{ end }} - </tbody> -</table> - -<p> - You can download the underlying dataset for the above table from <a href="{{ $.Site.BaseURL }}{{ .Get "src" }}">here</a>. 
-</p> diff --git a/layouts/shortcodes/date-bar-chart.html b/layouts/shortcodes/date-bar-chart.html deleted file mode 100644 index 90f72fc..0000000 --- a/layouts/shortcodes/date-bar-chart.html +++ /dev/null @@ -1,22 +0,0 @@ -{{ $id := base64Encode (.Get "src") }} -<div id="chartContainer-{{ $id }}" class="chart" style="text-align: center;"> - <script src="https://d3js.org/d3.v4.min.js"></script> - <script src="http://dimplejs.org/dist/dimple.v2.3.0.min.js"></script> - <script type="text/javascript"> - var svg = dimple.newSvg("#chartContainer-{{ $id }}", 800, 400); - d3.csv("{{ $.Site.BaseURL }}{{ .Get "src" }}", function (data) { - var myChart = new dimple.chart(svg, data); - var x = myChart.addTimeAxis("x", "timestamp", "%Y-%m", "%Y-%m"); - x.addOrderRule("Date"); - x.title = "Date" - y = myChart.addMeasureAxis("y", "file_size"); - y.title = "Total size in bytes" - myChart.addSeries("stream", dimple.plot.bar); - myChart.addLegend(65, 10, 710, 20, "right"); - myChart.draw(); - }); - </script> -</div> -<p> - You can download the underlying dataset for the above graph from <a href="{{ $.Site.BaseURL }}{{ .Get "src" }}">here</a>. -</p> diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8ee1b19 --- /dev/null +++ b/setup.py @@ -0,0 +1,13 @@ +from setuptools import setup, find_packages + +setup( + name='ukwa-reports', + version='0.1.0', + packages=find_packages(include=['ukwa_reports', 'ukwa_reports.*']), + install_requires=[ + 'jupyterlab', + 'jupyter-book', + 'altair', + 'jupytext', + ] +) diff --git a/static/css/main.css b/static/css/main.css deleted file mode 100644 index cafb2dc..0000000 --- a/static/css/main.css +++ /dev/null @@ -1,109 +0,0 @@ -html, body { - height: 100%; -} - -body { - padding-top: 55px; - display: flex; - text-align: center; - flex-direction: column; -} - -main { - margin: auto; - padding: 25px; - flex: 1 0 auto; -} - -main table { - text-align: left; -} - -main table th{ - border-bottom: 1px solid #333; -} -main table td { - padding: 1px 5px; -} -/*footer*/ - -.copyright { - margin: 15px 0; -} - -/*home page*/ - -.intro { - transform: translateY(22vh); -} - -.intro > h1 { - color: #212121; - font-size: 12vh; -} - -.intro > h2 { - color: #757575; - font-size: 3vmin; -} - -.intro > .profile { - width: 10vh; - height: 10vh; - border-radius: 50%; -} - -/*apply accent colour to links*/ - -a:link, a:visited { - color: var(--accent); -} - -a.icon:hover { - text-decoration: none; -} - -a:hover { - color: var(--accent) !important; -} - -/*paginator at bottom of list view*/ - -.pages { - padding: 15px 0; -} - -.pages-icon { - padding: 0 15px; -} - -/*list item for posts and projects*/ - -.item { - padding: 10px 0; -} - -.item-tag { - background-color: var(--accent); -} - -/*navigation bar icons*/ - -.navbar-icon { - font-size: 125%; - display: inline-block !important; -} - -/*coloured borders at top and bottom of the page*/ - -.navbar.navbar-default { - border-top: var(--border-width) solid var(--accent); -} - -footer { - border-bottom: var(--border-width) solid var(--accent); -} - -img { - max-width: 100%; -} diff --git a/themes/minimal b/themes/minimal deleted file mode 160000 index 7d92985..0000000 --- a/themes/minimal +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7d929851ffdd5a0752d8b1f05596cf7cbf907982 From 7e3a26e2a605bad81088c7eea5cc46fc453d52bb Mon Sep 17 00:00:00 2001 From: Andrew Jackson <Andrew.Jackson@bl.uk> Date: Fri, 20 Oct 2023 15:01:12 +0100 Subject: [PATCH 4/7] Add some ignores. 
--- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index a5508db..2fff441 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ _build .ipynb_checkpoints __pycache__ +content/reports/storage/*.jsonl +content/reports/storage/*.csv +.Trash-* +.python-version From 47ed67c46ddcee919561abb384d16c78829ebf68 Mon Sep 17 00:00:00 2001 From: Andrew Jackson <Andrew.Jackson@bl.uk> Date: Fri, 20 Oct 2023 23:40:07 +0100 Subject: [PATCH 5/7] Update so jupyter-book build can be run from the container. --- .dockerignore | 2 ++ Dockerfile | 6 ++++++ build.sh | 1 + content/reports/storage/summary.md | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 .dockerignore mode change 100644 => 100755 build.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7ce0a84 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +*.jsonl +*.csv diff --git a/Dockerfile b/Dockerfile index 6f26c55..57b001d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,3 +6,9 @@ COPY setup.py . RUN pip install --no-cache -v . +COPY content . +COPY build.sh . + +# Default action is to run the full build script to generate output at ./_build +# Use volumes to map input (content) and/or output (_build) +CMD ./build.sh diff --git a/build.sh b/build.sh old mode 100644 new mode 100755 index ff975bb..496d83a --- a/build.sh +++ b/build.sh @@ -7,6 +7,7 @@ export PYDEVD_DISABLE_FILE_VALIDATION=1 jb build --path-output . content/ # Copy over CSV files, retaining the paths: +echo Copying CSV files from the content folder to the _build: cd content find reports -name "*.csv" -exec cp -v {} ../_build/html/{} \; cd - \ No newline at end of file diff --git a/content/reports/storage/summary.md b/content/reports/storage/summary.md index 0fa3e04..e1b2f3f 100644 --- a/content/reports/storage/summary.md +++ b/content/reports/storage/summary.md @@ -225,7 +225,7 @@ scatter = alt.Chart(by_fy_s).mark_bar().encode( alt.Tooltip('readable_size', title='Total bytes') ] ).properties( - width=800,height=200 + width=600,height=200 ).resolve_scale(y='independent') legend = alt.Chart(by_fy_s).mark_point().encode( From 10497cb36c43431b94c1ed61ec8cb601e9dfee1f Mon Sep 17 00:00:00 2001 From: Andrew Jackson <Andrew.Jackson@bl.uk> Date: Tue, 24 Oct 2023 11:09:47 +0100 Subject: [PATCH 6/7] Cleaned up, adding in other reports. 
---
 .gitignore                                  |   4 +-
 Dockerfile                                  |   7 +-
 build.sh                                    |   4 +-
 content/_toc.yml                            |  18 +-
 content/intro.md                            |   9 +-
 content/storage/dls.md                      | 205 ++++++++++++++++++++
 content/{reports => }/storage/humanbytes.py |   0
 content/storage/indexed.md                  |  96 +++++++++
 content/{reports => }/storage/scratch.md    |   0
 content/{reports => }/storage/summary.md    | 173 +++++++++++++----
 content/{reports => }/storage/test.md       |   0
 content/storage/timeline.md                 | 149 ++++++++++++++
 setup.py                                    |   1 +
 ukwa_reports/__init__.py                    |   0
 ukwa_reports/solr_facet_helper.py           |  38 ++++
 15 files changed, 648 insertions(+), 56 deletions(-)
 create mode 100644 content/storage/dls.md
 rename content/{reports => }/storage/humanbytes.py (100%)
 create mode 100644 content/storage/indexed.md
 rename content/{reports => }/storage/scratch.md (100%)
 rename content/{reports => }/storage/summary.md (73%)
 rename content/{reports => }/storage/test.md (100%)
 create mode 100644 content/storage/timeline.md
 create mode 100644 ukwa_reports/__init__.py
 create mode 100644 ukwa_reports/solr_facet_helper.py

diff --git a/.gitignore b/.gitignore
index 2fff441..7b13414 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 _build
 .ipynb_checkpoints
 __pycache__
-content/reports/storage/*.jsonl
-content/reports/storage/*.csv
+content/storage/*.jsonl
+content/storage/*.csv
 .Trash-*
 .python-version
diff --git a/Dockerfile b/Dockerfile
index 57b001d..e802b6c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,14 +1,17 @@
 FROM python:3.11
 
+RUN apt-get update && apt-get install -y libffi-dev
+
 WORKDIR /ukwa-reports
 
+# Python dependencies and shared code:
 COPY setup.py .
-
+COPY ukwa_reports ./ukwa_reports
 RUN pip install --no-cache -v .
 
+# Jupyter Book work:
 COPY content .
 COPY build.sh .
-
 # Default action is to run the full build script to generate output at ./_build
 # Use volumes to map input (content) and/or output (_build)
 CMD ./build.sh
diff --git a/build.sh b/build.sh
index 496d83a..2bc9cae 100755
--- a/build.sh
+++ b/build.sh
@@ -6,8 +6,8 @@ export PYDEVD_DISABLE_FILE_VALIDATION=1
 # Build the book part:
 jb build --path-output . content/
 
-# Copy over CSV files, retaining the paths:
+# Copy over CSV files, retaining the full paths:
 echo Copying CSV files from the content folder to the _build:
 cd content
-find reports -name "*.csv" -exec cp -v {} ../_build/html/{} \;
+find . -name "*.csv" -exec cp -v {} ../_build/html/{} \;
 cd - \ No newline at end of file
diff --git a/content/_toc.yml b/content/_toc.yml
index 27ab8b8..44400f8 100644
--- a/content/_toc.yml
+++ b/content/_toc.yml
@@ -4,12 +4,14 @@ format: jb-book
 root: intro
 
 parts:
-- caption: Storage
+- caption: Reports
   chapters:
-  - file: reports/storage/summary
-  - file: reports/storage/test
-- caption: Examples
-  chapters:
-  - file: markdown
-  - file: notebooks
-  - file: markdown-notebooks
+  - file: storage/summary
+  - file: storage/timeline
+  - file: storage/indexed
+  - file: storage/dls
+#- caption: Examples
+#  chapters:
+#  - file: markdown
+#  - file: notebooks
+#  - file: markdown-notebooks
diff --git a/content/intro.md b/content/intro.md
index f8cdc73..a023993 100644
--- a/content/intro.md
+++ b/content/intro.md
@@ -1,11 +1,6 @@
-# Welcome to your Jupyter Book
+# UKWA Reports
 
-This is a small sample book to give you a feel for how book content is
-structured.
-It shows off a few of the major file types, as well as some sample content.
-It does not go in-depth into any particular topic - check out [the Jupyter Book documentation](https://jupyterbook.org) for more information.
- -Check out the content pages bundled with this sample book to see more. +This sub-section of the UKWA internal web site contains regularly re-generated reports. ```{tableofcontents} ``` diff --git a/content/storage/dls.md b/content/storage/dls.md new file mode 100644 index 0000000..e480e6b --- /dev/null +++ b/content/storage/dls.md @@ -0,0 +1,205 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# DLS Comparison + +Comparing holdings on HDFS with what's in DLS, based on the status information stored in the tracking database. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import json +import requests +import pandas as pd +from ukwa_reports.solr_facet_helper import flatten_solr_buckets + +headers = {'content-type': "application/json" } + +json_facet = { + # Primary facet is by date - here we break down the last month(s) into days + 'facet': { + 'dates' : { + 'type' : 'range', + 'field' : 'timestamp_dt', + 'start' : "NOW/YEAR-10YEAR", + 'end' : "NOW/YEAR+1YEAR", + 'gap' : "+1MONTH", + # For each day, we facet based on the CDX Index field, and make sure items with no value get recorded: + 'facet': { + 'stream': { + 'type': 'terms', + "field": "stream_s", + 'missing': True, + 'facet': { + 'index_status': { + 'type': 'terms', + "field": "dls_status_i", + 'missing': True, + 'facet' : { + 'bytes': 'sum(file_size_l)' + } + } + } + } + } + } + } +} + + +params = { + 'q': '(kind_s:"warcs" OR kind_s:"logs") AND collection_s:"npld"', + 'rows': 0 +} + +r = requests.post("http://trackdb.dapi.wa.bl.uk/solr/tracking/select", params=params, data=json.dumps(json_facet), headers=headers) + +if r.status_code != 200: + print(r.text) + +df = pd.DataFrame(flatten_solr_buckets(r.json()['facets'])) +# Filter empty rows: +df=df[df['count'] != 0] + +# Add compound column: +df['status'] = df.apply(lambda row: "%s, status %s" % (row.stream, row.index_status), axis=1) + +# And CHART +import altair as alt + +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('bytes'), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600).interactive() +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import json +import requests +import pandas as pd +from ukwa_reports.solr_facet_helper import flatten_solr_buckets + +headers = {'content-type': "application/json" } + +json_facet = { + # Primary facet is by date - here we break down the last month(s) into days + 'facet': { + 'dates' : { + 'type' : 'range', + 'field' : 'timestamp_dt', + 'start' : "NOW/YEAR-10YEAR", + 'end' : "NOW/YEAR+1YEAR", + 'gap' : "+1MONTH", + # For each day, we facet based on the CDX Index field, and make sure items with no value get recorded: + 'facet': { + 'stream': { + 'type': 'terms', + "field": "stream_s", + 'missing': True, + 'facet': { + 'index_status': { + 'type': 'terms', + "field": "dls_status_i", + 'missing': True, + 'facet' : { + 'bytes': 'sum(file_size_l)' + } + } + } + } + } + } + } +} + + +params = { + 'q': '(kind_s:"warcs" OR kind_s:"logs") AND collection_s:"npld"', + 'rows': 0 +} + +r = requests.post("http://trackdb.dapi.wa.bl.uk/solr/tracking/select", 
params=params, data=json.dumps(json_facet), headers=headers) + +if r.status_code != 200: + print(r.text) + +df = pd.DataFrame(flatten_solr_buckets(r.json()['facets'])) +# Filter empty rows: +df=df[df['count'] != 0] + +# Add compound column: +df['status'] = df.apply(lambda row: "%s, status %s" % (row.stream, row.index_status), axis=1) + +# And CHART +import altair as alt + +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('bytes'), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600).interactive() +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +And the same data, shown as percentage of bytes rather than total bytes. + +```{code-cell} ipython3 +--- +editable: true +jupyter: + source_hidden: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('count', stack="normalize", axis=alt.Axis(format='%')), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600).interactive() +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- + +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- + +``` diff --git a/content/reports/storage/humanbytes.py b/content/storage/humanbytes.py similarity index 100% rename from content/reports/storage/humanbytes.py rename to content/storage/humanbytes.py diff --git a/content/storage/indexed.md b/content/storage/indexed.md new file mode 100644 index 0000000..d467a71 --- /dev/null +++ b/content/storage/indexed.md @@ -0,0 +1,96 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# Recent CDX Indexed WARCs + +This page shows recent WARCs and their CDX-indexing status. The last month's worth of data is shown, and any WARCs that are known to the tracking database, but not yet CDX indexed, will be marked as `missing`. 
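+
+The cell below asks Solr's JSON Facet API for nested buckets (date, then stream, then CDX status) and flattens them into one row per leaf bucket so Pandas can chart them. A rough illustration of that flattening, using an invented miniature response (real bucket values will differ):
+
+```python
+from ukwa_reports.solr_facet_helper import flatten_solr_buckets
+
+# An invented miniature of a Solr JSON-facet response; real responses follow
+# the same nesting, with a 'buckets' list at each facet level:
+facets = {
+    "count": 3,
+    "dates": {"buckets": [
+        {"val": "2023-10-01T00:00:00Z", "count": 3,
+         "stream": {"buckets": [
+             {"val": "frequent", "count": 3,
+              "cdx_status": {"buckets": [
+                  {"val": "example-cdx", "count": 2, "bytes": 2048},
+              ]}},
+         ]}},
+    ]},
+}
+
+# One flat row per leaf bucket, ready for pd.DataFrame():
+print(flatten_solr_buckets(facets))
+# [{'dates': '2023-10-01T00:00:00Z', 'stream': 'frequent',
+#   'cdx_status': 'example-cdx', 'count': 2, 'bytes': 2048}]
+```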
+
+```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
+import json
+import requests
+import pandas as pd
+from ukwa_reports.solr_facet_helper import flatten_solr_buckets
+
+headers = {'content-type': "application/json" }
+
+json_facet = {
+    # Primary facet is by date - here we break down the last month(s) into days
+    'facet': {
+        'dates' : {
+            'type' : 'range',
+            'field' : 'timestamp_dt',
+            'start' : "NOW/MONTH-1MONTH",
+            'end' : "NOW/MONTH+32DAY",
+#            'start' : "NOW/MONTH-10YEAR",
+#            'end' : "NOW/MONTH+1MONTH",
+            'gap' : "+1DAY",
+#            'gap' : "+1MONTH",
+            # For each day, we facet:
+            'facet': {
+                'stream': {
+                    'type': 'terms',
+                    "field": "stream_s",
+                    'missing': True,
+                    'facet': {
+                        'cdx_status': {
+                            'type': 'terms',
+                            "field": "cdx_index_ss",
+                            'missing': True,
+                            'facet' : {
+                                'bytes': 'sum(file_size_l)'
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+params = {
+    'q': 'kind_s:"warcs"',
+    'rows': 0
+}
+
+r = requests.post("http://solr8.api.wa.bl.uk/solr/tracking/select", params=params, data=json.dumps(json_facet), headers=headers)
+
+if r.status_code != 200:
+    print(r.text)
+
+df = pd.DataFrame(flatten_solr_buckets(r.json()['facets']))
+# Filter empty rows:
+df=df[df['count'] != 0]
+
+# Add compound column:
+df['status'] = df.apply(lambda row: "%s, %s" % (row.stream, row.cdx_status), axis=1)
+
+
+# And CHART it:
+import altair as alt
+
+alt.Chart(df).mark_bar(size=6).encode(
+    x='dates:T',
+    y='count',
+    color='status',
+    tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'), 'stream', 'cdx_status', 'count', 'bytes']
+).properties(width=600).interactive()
+```
diff --git a/content/reports/storage/scratch.md b/content/storage/scratch.md
similarity index 100%
rename from content/reports/storage/scratch.md
rename to content/storage/scratch.md
diff --git a/content/reports/storage/summary.md b/content/storage/summary.md
similarity index 73%
rename from content/reports/storage/summary.md
rename to content/storage/summary.md
index e1b2f3f..559eabc 100644
--- a/content/reports/storage/summary.md
+++ b/content/storage/summary.md
@@ -1,6 +1,5 @@
 ---
 jupytext:
-  cell_metadata_filter: -all
   formats: md:myst
   text_representation:
     extension: .md
@@ -13,11 +12,32 @@ kernelspec:
   name: python3
 ---
 
-# Summary Report
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
-First we load the data and look for duplicate data across the different storage services.
+# Storage Summary Report
+
+## Introduction
+Every day, all active data stores are scanned and their inventories taken. This report uses that data to summarise our current holdings in terms of volumes of data and numbers of files. We classify files in various ways, and provide summary statistics based on those groupings. They include:
+
+| Classification | Description |
+| -------------- | ----------- |
+| **Date** | File creation date, grouped by e.g. year, financial year, or month |
+| **Collection** | The legal framework we collect under. One of `selective`, `npld` or `bypm` (by permission) |
+| **Stream** | The capture process, e.g. `selective` (WCT), `frequent`, `domain`, `webrecorder`, `warcit` |
+| **Kind** | The kind of file, e.g. `warcs`, `viral` (meaning WARCs of nullified viral records), `crawl-logs` etc. |
+| **Store** | The storage system the files reside on, e.g. `h020` (old Hadoop), `h3` (new Hadoop), `aws_s3` (AWS S3 Glacier) |
+
++++ {"editable": true, "slideshow": {"slide_type": ""}}
+
+## Current Totals
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
 import os
 import requests
 import pandas as pd
@@ -25,8 +45,40 @@ from humanbytes import HumanBytes
 from IPython.display import display, HTML, FileLink, FileLinks
 import pathlib
 
+# Helper function to show a table and offer a downloadable version:
+def show_table_and_dl(df, slug):
+    # Shift to standard Column Names
+    df = df.rename(columns={
+        'timestamp_dt': 'year',
+        'collection_s': 'collection',
+        'stream_s': 'stream',
+        'kind_s': 'kind',
+        'size': 'size_bytes',
+        'count': 'file_count',
+        'hdfs_service_id_s': 'store'
+    })
+
+    # Add a Total:
+    df.loc['Total']= df.sum(numeric_only=True)
+
+    # Replace NaNs
+    df = df.fillna('')
+
+    # Clean up size formatting:
+    df['size'] = df['size_bytes'].apply(lambda x: HumanBytes.format(x, True))
+    df['size_bytes'] = df['size_bytes'].apply(int)
+    df['file_count'] = df['file_count'].apply(int)
+
+    # Also make the data available for download:
+    csv_file = f'{slug}.csv'
+    df.to_csv(csv_file, index=False)
+    dl = FileLink(csv_file, result_html_prefix='Download the data from this table here: ')
+    display(df,dl)
+
+# Establish the current folder:
 dir_path = pathlib.Path().absolute()
 
+# Display more rows by default:
 pd.set_option('display.max_rows', 100)
 
 # Pick up source locations:
@@ -54,6 +106,12 @@ display(HTML(f"Found a total of {total_records:,} WARC and crawl log files."))
 ```
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
 # Dataframe of all unique paths (drop others for paths appearing in more than one 'fresh' TrackDB record):
 dfu = df.drop_duplicates(subset=['file_path_s']).drop(columns=['file_path_s'])
 
@@ -62,9 +120,18 @@ unique_records = len(dfu)
 display(HTML(f"Found {unique_records:,} unique files (based on file path). This means there are {(total_records-unique_records):,} files duplicated across storage systems."))
 ```
 
-The following table shows the most recent WARCs for each data store, along with the associated timestamp. This can be used to check the source data for this report is up to date.
++++ {"editable": true, "slideshow": {"slide_type": ""}}
+
+## Most Recent Files
+The following table shows the most recent WARCs for each data store, along with the associated timestamp. This can be used to check the source data for this report is up to date, as the date for the `h3` store should be within the last day or two.
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
 pd.set_option('display.max_colwidth', 1024)
 # Now we look for the most recent WARC files:
 dflw = df.filter(items=['hdfs_service_id_s', 'file_path_s', 'kind_s', 'timestamp_dt'], axis=1)
@@ -75,44 +142,37 @@ dflw = dflw.reset_index().rename(columns={
 dflw
 ```
 
-## Statistics by Year
-
-This table summarises our overall totals for the different kinds of data we hold.
++++ {"editable": true, "slideshow": {"slide_type": ""}} -### Overall totals by year +## Totals by Collection & Stream ```{code-cell} ipython3 -from IPython.display import display +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +tots = dfu.groupby(['collection_s','stream_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() -def show_table_and_dl(df, slug): - # Shift to standard Column Names - df = df.rename(columns={ - 'timestamp_dt': 'year', - 'collection_s': 'collection', - 'stream_s': 'stream', - 'kind_s': 'kind', - 'size': 'size_bytes', - 'count': 'file_count', - 'hdfs_service_id_s': 'store' - }) +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_collection_and_stream') +``` - # Add a Total: - df.loc['Total']= df.sum(numeric_only=True) - - # Replace NaNs - df = df.fillna('') ++++ {"editable": true, "slideshow": {"slide_type": ""}} - # Clean up size formatting: - df['size'] = df['size_bytes'].apply(lambda x: HumanBytes.format(x, True)) - df['size_bytes'] = df['size_bytes'].apply(int) - df['file_count'] = df['file_count'].apply(int) - - # Also make the data available for download: - csv_file = f'{slug}.csv' - df.to_csv(csv_file, index=False) - dl = FileLink(csv_file, result_html_prefix='Download the data from this table here: ') - display(df,dl) +## Statistics by Year +### Overall totals by year + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- #tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A")]).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) tots = tots.reset_index() @@ -124,9 +184,17 @@ tots['timestamp_dt'] = tots['timestamp_dt'].dt.year.apply(lambda x: str(x)) show_table_and_dl(tots, 'totals_by_year') ``` ++++ {"editable": true, "slideshow": {"slide_type": ""}} + ### Totals by Year & Collection ```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- #tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) tots = tots.reset_index() @@ -141,6 +209,12 @@ show_table_and_dl(tots, 'totals_by_year_collection') ### Totals by Year, Collection, Stream, Store & Kind ```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) tots = tots.reset_index() @@ -158,6 +232,12 @@ The same data, but aggregating by financial year. 
 ### Totals by Financial Year
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
 by_fy = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR")]).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum'))
 
 # Remove the hierarchical index so we can plot:
@@ -173,6 +253,12 @@ show_table_and_dl(by_fy, 'totals_by_fy')
 ### Totals by Financial Year, Collection, Stream, Store & Kind
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
 by_fy = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum'))
 
 # Remove the hierarchical index so we can plot:
@@ -188,6 +274,12 @@ show_table_and_dl(by_fy, 'totals_by_fy_collection_stream_store_kind')
 ### Graphs of Totals by Stream & Kind, over Time
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
 by_fy_s = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR"), 'stream_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum'))
 
 # Remove the hierarchical index so we can plot:
@@ -204,6 +296,12 @@ by_fy_s['readable_size'] = by_fy_s['size'].apply(lambda x: HumanBytes.format(x,
 ```
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
 import altair as alt
 
 selection = alt.selection_point(fields=['stream_s'])
@@ -239,5 +337,10 @@ scatter | legend
 ```
 
 ```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+---
 
 ```
diff --git a/content/reports/storage/test.md b/content/storage/test.md
similarity index 100%
rename from content/reports/storage/test.md
rename to content/storage/test.md
diff --git a/content/storage/timeline.md b/content/storage/timeline.md
new file mode 100644
index 0000000..c1a33c8
--- /dev/null
+++ b/content/storage/timeline.md
@@ -0,0 +1,149 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.15.2
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
++++ {"editable": true, "slideshow": {"slide_type": ""}}
+
+# HDFS Timeline
+
+Breaking down what's stored on HDFS onto a timeline. Note that these totals do not include data held only on AWS Glacier.
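+
+Note that the charts below use decimal terabytes (10^12 bytes), matching the metric formatting used elsewhere in these reports, rather than binary tebibytes. For example:
+
+```python
+# Decimal terabytes, as computed in the next cell (illustrative arithmetic):
+size_bytes = 2_000_000_000_000
+print(size_bytes / (1000 * 1000 * 1000 * 1000))  # 2.0 TB (decimal)
+print(size_bytes / (1024 ** 4))                  # ~1.82 TiB (binary), for comparison
+```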
+ +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import json +import requests +import pandas as pd +from ukwa_reports.solr_facet_helper import flatten_solr_buckets + +headers = {'content-type': "application/json" } + +json_facet = { + # Primary facet is by date - here we break down the last month(s) into days + 'facet': { + 'dates' : { + 'type' : 'range', + 'field' : 'timestamp_dt', + 'start' : "NOW/YEAR-20YEAR", + 'end' : "NOW/YEAR+1YEAR", + 'gap' : "+1MONTH", + # For each day, we facet based on the CDX Index field, and make sure items with no value get recorded: + 'facet': { + 'collection': { + 'type': 'terms', + "field": "collection_s", + 'missing': True, + 'facet': { + 'stream': { + 'type': 'terms', + "field": "stream_s", + 'missing': True, + 'facet' : { + 'bytes': 'sum(file_size_l)' + } + } + } + } + } + } + } +} + + +params = { + 'q': '(kind_s:"warcs" OR kind_s:"logs")', + 'rows': 0 +} + +r = requests.post("http://solr8.api.wa.bl.uk/solr/tracking/select", params=params, data=json.dumps(json_facet), headers=headers) + +if r.status_code != 200: + print(r.text) + +df = pd.DataFrame(flatten_solr_buckets(r.json()['facets'])) +# Filter empty rows: +df=df[df['count'] != 0] + +# Add compound column: +df['status'] = df.apply(lambda row: "%s, %s" % (row.collection, row.stream), axis=1) +df['terabytes'] = df.apply(lambda row: row.bytes / (1000*1000*1000*1000), axis=1) + +# CHART +import altair as alt + +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('terabytes', axis=alt.Axis(title='Data volume (TB)')), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'terabytes'] +).properties(width=600) +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +And the same data as a percentage per time period. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title='Date', format=("%b %Y"))), + y=alt.Y('count', stack="normalize", axis=alt.Axis(title='Percentage of files', format='%')), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600) +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +And as a cumulative graph. 
+
+```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
+import altair as alt
+
+alt.Chart(df).transform_window(
+    cumulative_terabytes="sum(terabytes)",
+).mark_line().encode(
+    x=alt.X('dates:T', axis=alt.Axis(title='Date', format=("%b %Y"))),
+    y=alt.Y('cumulative_terabytes:Q', axis=alt.Axis(title='Cumulative total data volume (TB)')),
+    tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'), 'cumulative_terabytes:Q']
+).properties(width=600)
+```
+
++++ {"editable": true, "slideshow": {"slide_type": ""}}
+
+And as cumulative totals (calculated directly rather than using the graph library):
+
+```{code-cell} ipython3
+---
+editable: true
+slideshow:
+  slide_type: ''
+tags: [remove-input]
+---
+df2 = df.groupby(['status'])['terabytes'].sum().groupby(level=0).cumsum().reset_index()
+df2
+```
diff --git a/setup.py b/setup.py
index 8ee1b19..37dba74 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 from setuptools import setup, find_packages
 
+# This package contains a few helper functions
 setup(
     name='ukwa-reports',
     version='0.1.0',
diff --git a/ukwa_reports/__init__.py b/ukwa_reports/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ukwa_reports/solr_facet_helper.py b/ukwa_reports/solr_facet_helper.py
new file mode 100644
index 0000000..4be44c7
--- /dev/null
+++ b/ukwa_reports/solr_facet_helper.py
@@ -0,0 +1,38 @@
+#
+# This is a bit gnarly but it helpfully flattens the Solr JSON API response
+# (which is a kind of tree shape) into a flat table that Pandas can work with.
+#
+# See [Solr's JSON Facet API](https://lucene.apache.org/solr/guide/8_4/json-facet-api.html)
+#
+
+def flatten_solr_buckets(solr_facets):
+    flat = []
+    for key in solr_facets:
+        if isinstance(solr_facets[key], dict):
+            for vals in _flatten_facet_buckets(key, solr_facets):
+                flat.append(vals.copy())
+    return flat
+
+def _flatten_facet_buckets(facet_name, bucket, values={}):
+    subfacets = []
+    for bucket_name in bucket:
+        if isinstance(bucket[bucket_name],dict):
+            subfacets.append(bucket_name)
+    if len(subfacets) > 0:
+        for bucket_name in subfacets:
+            for sub_bucket in bucket[bucket_name]['buckets']:
+                values[bucket_name] = sub_bucket['val']
+                for sub_values in _flatten_facet_buckets(bucket_name, sub_bucket, values.copy()):
+                    yield sub_values
+            # Also deal with the special 'missing' bucket:
+            if 'missing' in bucket[bucket_name]:
+                values[bucket_name] = "missing"
+                for sub_values in _flatten_facet_buckets(bucket_name, bucket[bucket_name]['missing'], values.copy()):
+                    yield sub_values
+    else:
+        for bucket_name in bucket:
+            if bucket_name != 'val':
+                values[bucket_name] = bucket[bucket_name]
+        yield values
+
+
From dd251769bd126969722ea70abc7dce5ff92463e6 Mon Sep 17 00:00:00 2001
From: Andrew Jackson <Andrew.Jackson@bl.uk>
Date: Tue, 24 Oct 2023 11:12:20 +0100
Subject: [PATCH 7/7] Add Docker image build action.
--- .github/workflows/push-to-docker-hub.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/workflows/push-to-docker-hub.yml diff --git a/.github/workflows/push-to-docker-hub.yml b/.github/workflows/push-to-docker-hub.yml new file mode 100644 index 0000000..43142e7 --- /dev/null +++ b/.github/workflows/push-to-docker-hub.yml @@ -0,0 +1,19 @@ +name: Build, scan and push to Docker Hub + +on: + push: + tags: + - '*' + branches: + - master + + +jobs: + run_docker_build_workflow: + uses: ukwa/ukwa-services/.github/workflows/push-to-docker-hub.yml@master + secrets: + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_ACCESS_TOKEN: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} + + +