diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7ce0a84 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +*.jsonl +*.csv diff --git a/.github/workflows/push-to-docker-hub.yml b/.github/workflows/push-to-docker-hub.yml new file mode 100644 index 0000000..43142e7 --- /dev/null +++ b/.github/workflows/push-to-docker-hub.yml @@ -0,0 +1,19 @@ +name: Build, scan and push to Docker Hub + +on: + push: + tags: + - '*' + branches: + - master + + +jobs: + run_docker_build_workflow: + uses: ukwa/ukwa-services/.github/workflows/push-to-docker-hub.yml@master + secrets: + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_ACCESS_TOKEN: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} + + + diff --git a/.gitignore b/.gitignore index 17982b6..7b13414 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ -public -content/crawls -data/crawls +_build +.ipynb_checkpoints +__pycache__ +content/storage/*.jsonl +content/storage/*.csv +.Trash-* +.python-version diff --git a/.gitmodules b/.gitmodules index 7b30435..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "themes/minimal"] - path = themes/minimal - url = https://github.com/calintat/minimal.git diff --git a/Dockerfile b/Dockerfile index 573cb6b..e802b6c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,17 @@ -FROM klakegg/hugo:0.65.3 AS hugo +FROM python:3.11 -COPY . /src +RUN apt-get update && apt-get install -y libffi-dev -WORKDIR /src +WORKDIR /ukwa-reports -ENV HUGO_DESTINATION=/onbuild - -RUN hugo - -FROM nginx -COPY --from=hugo /onbuild /usr/share/nginx/html/intranet +# Python dependencies and shared code: +COPY setup.py . +COPY ukwa_reports ./ukwa_reports +RUN pip install --no-cache-dir -v . +# Jupyter Book work: +COPY content . +COPY build.sh . +# Default action is to run the full build script to generate output at ./_build +# Use volumes to map input (content) and/or output (_build) +CMD ./build.sh diff --git a/README.md b/README.md new file mode 100644 index 0000000..50b9a02 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +UKWA Intranet +============= + +[![Build Status](https://travis-ci.org/ukwa/ukwa-reports.svg?branch=master)](https://travis-ci.org/ukwa/ukwa-reports) +[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://registry.hub.docker.com/r/ukwa/ukwa-intranet/) + +This static website acts as the gateway for our 'intranet' services. It's a static site built using [Hugo](https://gohugo.io/), and is deployed by being embedded in the [`ukwa-services/manage/intranet`](https://github.com/ukwa/ukwa-services/tree/master/manage/intranet) stack. + +See that project for more details. \ No newline at end of file diff --git a/archetypes/default.md b/archetypes/default.md deleted file mode 100644 index f5a9e45..0000000 --- a/archetypes/default.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "{{ replace .TranslationBaseName "-" " " | title }}" -date: {{ .Date }} -draft: true ---- - diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..2bc9cae --- /dev/null +++ b/build.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +# https://discourse.jupyter.org/t/debugger-warning-it-seems-that-frozen-modules-are-being-used-python-3-11-0/16544 +export PYDEVD_DISABLE_FILE_VALIDATION=1 + +# Build the book part: +jb build --path-output . content/ + +# Copy over CSV files, retaining the full paths: +echo Copying CSV files from the content folder to the _build: +cd content +find . 
-name "*.csv" -exec cp -v {} ../_build/html/{} \; +cd - \ No newline at end of file diff --git a/config.toml b/config.toml deleted file mode 100644 index 1fbc119..0000000 --- a/config.toml +++ /dev/null @@ -1,80 +0,0 @@ -baseURL = "/intranet" -languageCode = "en-gb" -title = "UKWA Reports" -theme = "minimal" -#disqusShortname = "username" # delete this to disable disqus comments -googleAnalytics = "" -copyright = "" - -[params] - author = "UK Web Archive" - description = "Technical reports" - githubUsername = "#" - accent = "#2e6dd9" - showBorder = true - backgroundColor = "white" - font = "Raleway" # should match the name on Google Fonts! - highlight = true - highlightStyle = "solarized-dark" - highlightLanguages = ["go", "haskell", "kotlin", "scala", "swift"] - css = ["https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css"] - -[[menu.main]] - url = "/" - name = "Home" - weight = 1 - -[[menu.main]] - url = "/logs/" - name = "Logs" - weight = 2 - -[[menu.main]] - url = "/api/" - name = "API" - weight = 3 - -[[menu.main]] - url = "/nbapps/" - name = "Tools" - weight = 4 - -[[menu.main]] - url = "/trackdb" - name = "TrackDB" - weight = 5 - -[[menu.main]] - url = "/reports/" - name = "Reports" - weight = 6 - -[[menu.main]] - url = "/categories/" - name = "Categories" - weight = 7 - -[[menu.main]] - url = "/tags/" - name = "Tags" - weight = 8 - -# Social icons to be shown on the right-hand side of the navigation bar -# The "name" field should match the name of the icon to be used -# The list of available icons can be found at http://fontawesome.io/icons/ - -[[menu.icon]] - url = "mailto:web-archivist@bl.uk" - name = "envelope-o" - weight = 1 - -[[menu.icon]] - url = "https://github.com/ukwa/" - name = "github" - weight = 2 - -[[menu.icon]] - url = "https://twitter.com/UKWebArchive/" - name = "twitter" - weight = 3 - diff --git a/content/_config.yml b/content/_config.yml new file mode 100644 index 0000000..33bd657 --- /dev/null +++ b/content/_config.yml @@ -0,0 +1,38 @@ +# Book settings +# Learn more at https://jupyterbook.org/customize/config.html + +title: UKWA Technical Documentation +author: The UK Web Archive +logo: "assets/logos/ukwa-2018-onwhite-close.svg" + +# Don't include these: +exclude_patterns: ['_build', '**.ipynb_checkpoints'] + +# Auto-exclude files not in the toc +only_build_toc_files: true + +# Force re-execution of notebooks on each build? 
+# See https://jupyterbook.org/content/execute.html +execute: + execute_notebooks: auto + # Long timeout because some analyses take a while: + timeout: 1000 + +# Add a bibtex file so that we can create citations +bibtex_bibfiles: + - references.bib + +# Information about where the book exists on the web +repository: + url: https://github.com/ukwa/ukwa-reports # Online location of your book + path_to_book: content # Optional path to your book, relative to the repository root + branch: master # Which branch of the repository should be used when creating links (optional) + +# Add GitHub buttons to your book +# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository +html: + use_issues_button: true + use_repository_button: true + use_edit_page_button: true + home_page_in_navbar: false + diff --git a/content/_toc.yml b/content/_toc.yml new file mode 100644 index 0000000..44400f8 --- /dev/null +++ b/content/_toc.yml @@ -0,0 +1,17 @@ +# Table of contents +# Learn more at https://jupyterbook.org/customize/toc.html + +format: jb-book +root: intro +parts: +- caption: Reports + chapters: + - file: storage/summary + - file: storage/timeline + - file: storage/indexed + - file: storage/dls +#- caption: Examples +# chapters: +# - file: markdown +# - file: notebooks +# - file: markdown-notebooks diff --git a/content/assets/logos/ukwa-2018-onwhite-close.svg b/content/assets/logos/ukwa-2018-onwhite-close.svg new file mode 100644 index 0000000..5281f63 --- /dev/null +++ b/content/assets/logos/ukwa-2018-onwhite-close.svg @@ -0,0 +1,33 @@ + + + + + +Created by potrace 1.15, written by Peter Selinger 2001-2017 + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/content/intro.md b/content/intro.md new file mode 100644 index 0000000..a023993 --- /dev/null +++ b/content/intro.md @@ -0,0 +1,6 @@ +# UKWA Reports + +This sub-section of the UKWA internal web site contains regularly re-generated reports. + +```{tableofcontents} +``` diff --git a/content/markdown-notebooks.md b/content/markdown-notebooks.md new file mode 100644 index 0000000..a057a32 --- /dev/null +++ b/content/markdown-notebooks.md @@ -0,0 +1,53 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.11.5 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Notebooks with MyST Markdown + +Jupyter Book also lets you write text-based notebooks using MyST Markdown. +See [the Notebooks with MyST Markdown documentation](https://jupyterbook.org/file-types/myst-notebooks.html) for more detailed instructions. +This page shows off a notebook written in MyST Markdown. + +## An example cell + +With MyST Markdown, you can define code cells with a directive like so: + +```{code-cell} +print(2 + 2) +``` + +When your book is built, the contents of any `{code-cell}` blocks will be +executed with your default Jupyter kernel, and their outputs will be displayed +in-line with the rest of your content. + +```{seealso} +Jupyter Book uses [Jupytext](https://jupytext.readthedocs.io/en/latest/) to convert text-based files to notebooks, and can support [many other text-based notebook files](https://jupyterbook.org/file-types/jupytext.html). +``` + +## Create a notebook with MyST Markdown + +MyST Markdown notebooks are defined by two things: + +1. 
YAML metadata that is needed to understand if / how it should convert text files to notebooks (including information about the kernel needed). + See the YAML at the top of this page for example. +2. The presence of `{code-cell}` directives, which will be executed with your book. + +That's all that is needed to get started! + +## Quickly add YAML metadata for MyST Notebooks + +If you have a markdown file and you'd like to quickly add YAML metadata to it, so that Jupyter Book will treat it as a MyST Markdown Notebook, run the following command: + +``` +jupyter-book myst init path/to/markdownfile.md +``` diff --git a/content/markdown.md b/content/markdown.md new file mode 100644 index 0000000..0ddaab3 --- /dev/null +++ b/content/markdown.md @@ -0,0 +1,55 @@ +# Markdown Files + +Whether you write your book's content in Jupyter Notebooks (`.ipynb`) or +in regular markdown files (`.md`), you'll write in the same flavor of markdown +called **MyST Markdown**. +This is a simple file to help you get started and show off some syntax. + +## What is MyST? + +MyST stands for "Markedly Structured Text". It +is a slight variation on a flavor of markdown called "CommonMark" markdown, +with small syntax extensions to allow you to write **roles** and **directives** +in the Sphinx ecosystem. + +For more about MyST, see [the MyST Markdown Overview](https://jupyterbook.org/content/myst.html). + +## Sample Roles and Directives + +Roles and directives are two of the most powerful tools in Jupyter Book. They +are kind of like functions, but written in a markup language. They both +serve a similar purpose, but **roles are written in one line**, whereas +**directives span many lines**. They both accept different kinds of inputs, +and what they do with those inputs depends on the specific role or directive +that is being called. + +Here is a "note" directive: + +```{note} +Here is a note +``` + +It will be rendered in a special box when you build your book. + +Here is an inline directive to refer to a document: {doc}`markdown-notebooks`. + + +## Citations + +You can also cite references that are stored in a `bibtex` file. For example, +the following syntax: `` {cite}`holdgraf_evidence_2014` `` will render like +this: {cite}`holdgraf_evidence_2014`. + +Moreover, you can insert a bibliography into your page with this syntax: +The `{bibliography}` directive must be used for all the `{cite}` roles to +render properly. +For example, if the references for your book are stored in `references.bib`, +then the bibliography is inserted with: + +```{bibliography} +``` + +## Learn more + +This is just a simple starter to get you started. +You can learn a lot more at [jupyterbook.org](https://jupyterbook.org). diff --git a/content/notebooks.ipynb b/content/notebooks.ipynb new file mode 100644 index 0000000..fdb7176 --- /dev/null +++ b/content/notebooks.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Content with notebooks\n", + "\n", + "You can also create content with Jupyter Notebooks. 
This means that you can include\n", + "code blocks and their outputs in your book.\n", + "\n", + "## Markdown + notebooks\n", + "\n", + "As it is markdown, you can embed images, HTML, etc into your posts!\n", + "\n", + "![](https://myst-parser.readthedocs.io/en/latest/_static/logo-wide.svg)\n", + "\n", + "You can also $add_{math}$ and\n", + "\n", + "$$\n", + "math^{blocks}\n", + "$$\n", + "\n", + "or\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + "\\mbox{mean} la_{tex} \\\\ \\\\\n", + "math blocks\n", + "\\end{aligned}\n", + "$$\n", + "\n", + "But make sure you \\$Escape \\$your \\$dollar signs \\$you want to keep!\n", + "\n", + "## MyST markdown\n", + "\n", + "MyST markdown works in Jupyter Notebooks as well. For more information about MyST markdown, check\n", + "out [the MyST guide in Jupyter Book](https://jupyterbook.org/content/myst.html),\n", + "or see [the MyST markdown documentation](https://myst-parser.readthedocs.io/en/latest/).\n", + "\n", + "## Code blocks and outputs\n", + "\n", + "Jupyter Book will also embed your code blocks and output in your book.\n", + "For example, here's some sample Matplotlib code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib import rcParams, cycler\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "plt.ion()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fixing random state for reproducibility\n", + "np.random.seed(19680801)\n", + "\n", + "N = 10\n", + "data = [np.logspace(0, 1, 100) + np.random.randn(100) + ii for ii in range(N)]\n", + "data = np.array(data).T\n", + "cmap = plt.cm.coolwarm\n", + "rcParams['axes.prop_cycle'] = cycler(color=cmap(np.linspace(0, 1, N)))\n", + "\n", + "\n", + "from matplotlib.lines import Line2D\n", + "custom_lines = [Line2D([0], [0], color=cmap(0.), lw=4),\n", + " Line2D([0], [0], color=cmap(.5), lw=4),\n", + " Line2D([0], [0], color=cmap(1.), lw=4)]\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 5))\n", + "lines = ax.plot(data)\n", + "ax.legend(custom_lines, ['Cold', 'Medium', 'Hot']);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a lot more that you can do with outputs (such as including interactive outputs)\n", + "with your book. For more information about this, see [the Jupyter Book documentation](https://jupyterbook.org)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/content/references.bib b/content/references.bib new file mode 100644 index 0000000..783ec6a --- /dev/null +++ b/content/references.bib @@ -0,0 +1,56 @@ +--- +--- + +@inproceedings{holdgraf_evidence_2014, + address = {Brisbane, Australia, Australia}, + title = {Evidence for {Predictive} {Coding} in {Human} {Auditory} {Cortex}}, + booktitle = {International {Conference} on {Cognitive} {Neuroscience}}, + publisher = {Frontiers in Neuroscience}, + author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. 
and Knight, Robert T.}, + year = {2014} +} + +@article{holdgraf_rapid_2016, + title = {Rapid tuning shifts in human auditory cortex enhance speech intelligibility}, + volume = {7}, + issn = {2041-1723}, + url = {http://www.nature.com/doifinder/10.1038/ncomms13654}, + doi = {10.1038/ncomms13654}, + number = {May}, + journal = {Nature Communications}, + author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. and Rieger, Jochem W. and Crone, Nathan and Lin, Jack J. and Knight, Robert T. and Theunissen, Frédéric E.}, + year = {2016}, + pages = {13654}, + file = {Holdgraf et al. - 2016 - Rapid tuning shifts in human auditory cortex enhance speech intelligibility.pdf:C\:\\Users\\chold\\Zotero\\storage\\MDQP3JWE\\Holdgraf et al. - 2016 - Rapid tuning shifts in human auditory cortex enhance speech intelligibility.pdf:application/pdf} +} + +@inproceedings{holdgraf_portable_2017, + title = {Portable learning environments for hands-on computational instruction using container-and cloud-based technology to teach data science}, + volume = {Part F1287}, + isbn = {978-1-4503-5272-7}, + doi = {10.1145/3093338.3093370}, + abstract = {© 2017 ACM. There is an increasing interest in learning outside of the traditional classroom setting. This is especially true for topics covering computational tools and data science, as both are challenging to incorporate in the standard curriculum. These atypical learning environments offer new opportunities for teaching, particularly when it comes to combining conceptual knowledge with hands-on experience/expertise with methods and skills. Advances in cloud computing and containerized environments provide an attractive opportunity to improve the effciency and ease with which students can learn. This manuscript details recent advances towards using commonly-Available cloud computing services and advanced cyberinfrastructure support for improving the learning experience in bootcamp-style events. We cover the benets (and challenges) of using a server hosted remotely instead of relying on student laptops, discuss the technology that was used in order to make this possible, and give suggestions for how others could implement and improve upon this model for pedagogy and reproducibility.}, + booktitle = {{ACM} {International} {Conference} {Proceeding} {Series}}, + author = {Holdgraf, Christopher Ramsay and Culich, A. and Rokem, A. and Deniz, F. and Alegro, M. and Ushizima, D.}, + year = {2017}, + keywords = {Teaching, Bootcamps, Cloud computing, Data science, Docker, Pedagogy} +} + +@article{holdgraf_encoding_2017, + title = {Encoding and decoding models in cognitive electrophysiology}, + volume = {11}, + issn = {16625137}, + doi = {10.3389/fnsys.2017.00061}, + abstract = {© 2017 Holdgraf, Rieger, Micheli, Martin, Knight and Theunissen. Cognitive neuroscience has seen rapid growth in the size and complexity of data recorded from the human brain as well as in the computational tools available to analyze this data. This data explosion has resulted in an increased use of multivariate, model-based methods for asking neuroscience questions, allowing scientists to investigate multiple hypotheses with a single dataset, to use complex, time-varying stimuli, and to study the human brain under more naturalistic conditions. These tools come in the form of “Encoding” models, in which stimulus features are used to model brain activity, and “Decoding” models, in which neural features are used to generated a stimulus output. 
Here we review the current state of encoding and decoding models in cognitive electrophysiology and provide a practical guide toward conducting experiments and analyses in this emerging field. Our examples focus on using linear models in the study of human language and audition. We show how to calculate auditory receptive fields from natural sounds as well as how to decode neural recordings to predict speech. The paper aims to be a useful tutorial to these approaches, and a practical introduction to using machine learning and applied statistics to build models of neural activity. The data analytic approaches we discuss may also be applied to other sensory modalities, motor systems, and cognitive systems, and we cover some examples in these areas. In addition, a collection of Jupyter notebooks is publicly available as a complement to the material covered in this paper, providing code examples and tutorials for predictive modeling in python. The aimis to provide a practical understanding of predictivemodeling of human brain data and to propose best-practices in conducting these analyses.}, + journal = {Frontiers in Systems Neuroscience}, + author = {Holdgraf, Christopher Ramsay and Rieger, J.W. and Micheli, C. and Martin, S. and Knight, R.T. and Theunissen, F.E.}, + year = {2017}, + keywords = {Decoding models, Encoding models, Electrocorticography (ECoG), Electrophysiology/evoked potentials, Machine learning applied to neuroscience, Natural stimuli, Predictive modeling, Tutorials} +} + +@book{ruby, + title = {The Ruby Programming Language}, + author = {Flanagan, David and Matsumoto, Yukihiro}, + year = {2008}, + publisher = {O'Reilly Media} +} diff --git a/content/reports/hdfs/index.md b/content/reports/hdfs/index.md deleted file mode 100644 index 091207d..0000000 --- a/content/reports/hdfs/index.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: HDFS Storage Reports -description: Summaries generated from the data stored on HDFS. ---- - -# Introduction - -The following statistics are generated based on the contents of the HDFS file system we use to store our data. We regularly scan the store and classify each item based on where it is stored, it's file type, and so on. Where possible, we also extract date information, e.g. using the date stamp within the filename of our WARC files to estimate the time that WARC data was collected (although strictly speaking we are using the date the file was created). This means the dates are reliable for all but the earliest years of selective web archiving, i.e. before we started putting the dates in the filenames. - -All figures are in bytes unless otherwise stated. - -Deeper analysis can be performed using Python notebooks, e.g. [hdfs-reports-full.ipynb](http://intranet.wa.bl.uk/ukwa/jupyter/notebooks/ukwa-manage/notebooks/hdfs-reports-full.ipynb). - -# Non-Print Legal Deposit Content - -This section only includes archival content, i.e. WARCs (either normal content or 'viral WARCs' containing material that appears to contain computer viruses), crawl logs and any additional archival package material. - -## NPLD Totals - -{{< csv-table src="reports/hdfs/npld-total-file-size-by-stream-totals.csv" >}} - -## NPLD Total By Year - -{{< csv-table src="reports/hdfs/npld-total-file-size-by-stream-per-year.csv" >}} - -## NPLD Total By Month - -{{< date-bar-chart src="reports/hdfs/npld-total-file-size-by-stream-per-month.csv" >}} - -# All Holdings - -These section includes all material on the cluster. 
If the files appear to be associated with a crawl stream, then the collection (e.g. Non-Print Legal Deposit) and stream (e.g. Domain crawl) will be set. If not, the collection and stream will both be 'None'. - -## Total bytes of files on HDFS, by collection, stream and type - -This report breaks down the total size of files (in bytes) stored on HDFS by the collection, stream, and type of file. - -{{< csv-table src="reports/hdfs/total-file-size-by-stream.csv" >}} - -## Total numbers of files on HDFS, by collection, stream and type - -This report breaks down the number of files stored on HDFS by the collection, stream, and type of file. - -{{< csv-table src="reports/hdfs/total-file-count-by-stream.csv" >}} - - - diff --git a/content/reports/hdfs/npld-total-file-size-by-stream-per-month.csv b/content/reports/hdfs/npld-total-file-size-by-stream-per-month.csv deleted file mode 100644 index 3a45410..0000000 --- a/content/reports/hdfs/npld-total-file-size-by-stream-per-month.csv +++ /dev/null @@ -1,135 +0,0 @@ -timestamp,stream,file_size -2013-04,domain,21496645180530 -2013-04,frequent,1233494513066 -2013-05,domain,10646577013628 -2013-05,frequent,503687665781 -2013-06,domain,1763992820313 -2013-06,frequent,399955364390 -2013-07,domain,497880630017 -2013-07,frequent,60835781997 -2013-08,frequent,585453340666 -2013-09,frequent,212254954288 -2013-10,frequent,691888255783 -2013-11,frequent,744359661269 -2013-12,frequent,814236551583 -2014-01,frequent,713736844362 -2014-02,frequent,897239626634 -2014-03,frequent,713895516622 -2014-04,frequent,1005532161253 -2014-05,frequent,857723734233 -2014-06,domain,4990292019947 -2014-06,frequent,1011213873805 -2014-07,domain,7997786988830 -2014-07,frequent,850225024059 -2014-08,domain,4228763495435 -2014-08,frequent,836258087252 -2014-09,domain,1887002283200 -2014-09,frequent,589813630251 -2014-10,domain,12901366343469 -2014-10,frequent,1152274513168 -2014-11,domain,19704030330247 -2014-11,frequent,544292623756 -2014-12,domain,10428775788758 -2014-12,frequent,515455415080 -2015-01,frequent,784513489409 -2015-02,frequent,1054532023006 -2015-03,frequent,937694648538 -2015-04,frequent,1628677164184 -2015-05,domain,235613102166 -2015-05,frequent,655365232861 -2015-06,frequent,996598324470 -2015-07,domain,158765321 -2015-07,frequent,1511157141370 -2015-08,domain,4572882297168 -2015-08,frequent,1307160334076 -2015-09,domain,21517695604196 -2015-09,frequent,1125054533737 -2015-10,domain,10428473325631 -2015-10,frequent,1621492476057 -2015-11,domain,24232100394090 -2015-11,frequent,1298818389994 -2015-12,domain,14741495433783 -2015-12,frequent,1452828183176 -2016-01,frequent,5090601554703 -2016-02,frequent,1596757583284 -2016-03,frequent,1537087882554 -2016-04,frequent,2606971226735 -2016-05,frequent,2333724868781 -2016-06,domain,112437071 -2016-06,frequent,1783338096350 -2016-07,frequent,1820459951534 -2016-08,domain,4137229470967 -2016-08,frequent,1394484649025 -2016-09,domain,21530388796510 -2016-09,frequent,2351233461774 -2016-10,domain,15181982950820 -2016-10,frequent,3306292773345 -2016-11,domain,36239249417877 -2016-11,frequent,1416921135036 -2016-12,domain,27739226101697 -2016-12,frequent,1495647147271 -2016-12,webrecorder,491399057 -2017-01,domain,52119907584 -2017-01,frequent,2771308110362 -2017-02,frequent,4827898598837 -2017-03,frequent,1335312593832 -2017-04,frequent,2653267245899 -2017-05,frequent,1721199155315 -2017-06,domain,30883890274313 -2017-06,frequent,1950827074054 -2017-07,domain,22315390740390 -2017-07,frequent,3813423988275 
-2017-08,domain,15199739069784 -2017-08,frequent,1210113588184 -2017-09,domain,9341478791455 -2017-09,frequent,1442995071443 -2017-10,domain,8375778101 -2017-10,frequent,1913313636381 -2017-11,domain,4127772894 -2017-11,frequent,1268306765297 -2017-12,domain,0 -2017-12,frequent,1802498617427 -2018-01,frequent,1798915526045 -2018-02,frequent,1699718044903 -2018-03,frequent,1696905474489 -2018-04,frequent,2711542892486 -2018-05,frequent,4145278849571 -2018-06,frequent,10696122110533 -2018-07,domain,16726081849085 -2018-07,frequent,6991581069333 -2018-08,domain,1378165058697 -2018-08,frequent,5499116104767 -2018-09,domain,20518602522171 -2018-09,frequent,7312429865198 -2018-10,domain,8586705094806 -2018-10,frequent,6702959190875 -2018-11,domain,8379383778088 -2018-11,frequent,7521458564640 -2018-12,domain,3414467033155 -2018-12,frequent,5176282768651 -2019-01,domain,139067 -2019-01,frequent,6317749831118 -2019-02,frequent,5436992410931 -2019-03,frequent,8290189122855 -2019-04,domain,65160862590 -2019-04,frequent,9895183469900 -2019-05,frequent,10784644761351 -2019-05,webrecorder,1138465383 -2019-06,domain,6836933688282 -2019-06,frequent,6832035740538 -2019-07,domain,11973478622062 -2019-07,frequent,14330210460019 -2019-08,domain,11689079851921 -2019-08,frequent,7234706643136 -2019-09,domain,4976564393961 -2019-09,frequent,8251140337157 -2019-10,domain,8427751607269 -2019-10,frequent,11613138687572 -2019-11,domain,10113950588123 -2019-11,frequent,8615401186359 -2019-12,domain,9621765218631 -2019-12,frequent,8037741598606 -2020-01,domain,28670272746 -2020-01,frequent,5429618611087 -2020-02,frequent,3211085834107 -2020-02,webrecorder,1653237053 diff --git a/content/reports/hdfs/npld-total-file-size-by-stream-per-year.csv b/content/reports/hdfs/npld-total-file-size-by-stream-per-year.csv deleted file mode 100644 index cd756e7..0000000 --- a/content/reports/hdfs/npld-total-file-size-by-stream-per-year.csv +++ /dev/null @@ -1,9 +0,0 @@ -timestamp,domain,frequent,webrecorder -2013,34.41 TB,5.25 TB, -2014,62.14 TB,9.69 TB, -2015,75.73 TB,14.37 TB, -2016,104.83 TB,26.73 TB,491.40 MB -2017,77.81 TB,26.71 TB, -2018,59.00 TB,61.95 TB, -2019,63.70 TB,105.64 TB,1.14 GB -2020,28.67 GB,8.64 TB,1.65 GB diff --git a/content/reports/hdfs/npld-total-file-size-by-stream-totals.csv b/content/reports/hdfs/npld-total-file-size-by-stream-totals.csv deleted file mode 100644 index 0cf8fb2..0000000 --- a/content/reports/hdfs/npld-total-file-size-by-stream-totals.csv +++ /dev/null @@ -1,5 +0,0 @@ -stream,file_size,file_count -domain,477.64 TB,481846 -frequent,258.98 TB,329881 -webrecorder,3.28 GB,104 -total,736.63 TB,811831 diff --git a/content/reports/hdfs/total-file-count-by-stream.csv b/content/reports/hdfs/total-file-count-by-stream.csv deleted file mode 100644 index 39aae76..0000000 --- a/content/reports/hdfs/total-file-count-by-stream.csv +++ /dev/null @@ -1,100 +0,0 @@ -collection,stream,timestamp,cdx,crawl-logs,dlx,logs,unknown,viral,warcs,warcs-invalid -0_original,None,2016,,,,,146514,,, -0_original,None,2017,,,,,1028,,, -1_data,None,2014,,,,,4,,, -1_data,None,2015,,,,,25,,, -1_data,None,2016,,,,,349,,, -1_data,None,2017,,,,,9,,, -1_data,None,2018,,,,,313,,, -1_data,None,2019,,,,,10706,,, -1_data,None,2020,,,,,420,,, -2_backups,None,2017,,,,,162,,, -2_backups,None,2018,,,,,413,,, -2_backups,None,2019,,,,,866259,,, -2_backups,None,2020,,,,,418893,,, -9_processing,None,2016,,,,,1738,,, -9_processing,None,2017,,,,,24,,, -9_processing,None,2018,,,,,12579,,, -9_processing,None,2019,,,,,13110,,, 
-9_processing,None,2020,,,,,984,,, -blit,None,2016,,,,,230,,, -blit,None,2017,,,,,405,,, -blit,None,2018,,,,,86,,, -blit,None,2019,,,,,51,,, -crawls,None,2012,,,,,16,,, -data,selective,2011,,,,,125,,, -data,selective,2012,,,,,62,,, -data,selective,2014,,,,,4173,,, -data,selective,2017,,,,,1,,, -data,selective,2019,,,,,2,,, -datasets,None,2014,,,,,18,,, -datasets,None,2017,,,,,2219,,, -heritrix,None,2013,,,,,261,,, -heritrix,None,2014,,,,,8506,,, -heritrix,None,2015,,,,,7481,,, -heritrix,None,2016,,,,,871,,, -heritrix,None,2017,,,,,2841,,, -heritrix,None,2019,,,,,4872,,, -ia,None,2011,,,,,884220,,, -ia,None,2012,,,,,56714,,, -ia,None,2014,,,,,203502,,, -ia,None,2017,,,,,11,,, -logs,None,2014,,,,,2735,,, -logs,None,2015,,,,,1939,,, -logs,None,2016,,,,,1171,,, -logs,None,2017,,,,,1405,,, -logs,None,2018,,,,,1493,,, -logs,None,2019,,,,,8868,,, -logs,None,2020,,,,,863,,, -lvdata,None,2011,,,,,18,,, -lvdata,None,2015,,,,,12,,, -lvdata,None,2016,,,,,2,,, -lvdata,None,2017,,,,,43,,, -lvdata,None,2018,,,,,25,,, -lvdata,None,2019,,,,,72,,, -npld,domain,2013,,9,,4,,10,33102, -npld,domain,2014,,,,50,,228,61196, -npld,domain,2015,,109,,9,,199,74237, -npld,domain,2016,,587,,2725,,237,102141, -npld,domain,2017,,254,,1516,,154,76079, -npld,domain,2018,,2109,,,,445,59420, -npld,domain,2019,,451,,,,139,70727, -npld,domain,2020,,9,,,,4,, -npld,frequent,2013,,,,173,,17,6581, -npld,frequent,2014,,,,4830,,11,15787, -npld,frequent,2015,,,,1806,,36,18635, -npld,frequent,2016,,92,,780,,125,27212, -npld,frequent,2017,,18035,,102704,,346,40397,70 -npld,frequent,2018,,5768,,34948,,707,71285,99 -npld,frequent,2019,,4924,,25492,,1152,109693, -npld,frequent,2020,,409,,2062,,14,8655, -npld,webrecorder,2016,,,,,,,6, -npld,webrecorder,2019,,,,,,,97, -npld,webrecorder,2020,,,,,,,1, -selective,selective,2010,,50,5076,42,5032,,5078, -selective,selective,2011,,30896,141666,30718,65920,,121927,33 -selective,selective,2012,14581,11337,53229,12150,22681,,51356, -selective,selective,2013,49866,9853,49891,11370,20212,,49994, -selective,selective,2014,29058,5065,29067,6108,10139,,29058, -selective,selective,2015,27650,7945,29820,9537,16147,,30081, -selective,selective,2016,5227,1372,4946,1931,2780,,5224, -tmp,None,2010,,,,,1815,,, -tmp,None,2011,,,,,13929,,, -tmp,None,2012,,,,,1164,,, -tmp,None,2013,,,,,3291,,, -tmp,None,2014,,,,,5679,,, -tmp,None,2015,,,,,6348,,, -tmp,None,2016,,,,,3230,,, -tmp,None,2017,,,,,61,,, -tmp,None,2018,,,,,156,,, -tmp,None,2019,,,,,327,,, -tmp,None,2020,,,,,48,,, -user,None,2015,,,,,1456,,, -user,None,2016,,,,,19338,,, -user,None,2017,,,,,34525,,, -user,None,2018,,,,,8170,,, -user,None,2019,,,,,12971,,, -user,None,2020,,,,,1456,,, -wap,None,2010,,,,,34,,, -wayback,None,2015,,,,,789,,, -wayback,None,2016,,,,,104,,, diff --git a/content/reports/hdfs/total-file-size-by-stream.csv b/content/reports/hdfs/total-file-size-by-stream.csv deleted file mode 100644 index 99d1110..0000000 --- a/content/reports/hdfs/total-file-size-by-stream.csv +++ /dev/null @@ -1,100 +0,0 @@ -collection,stream,timestamp,cdx,crawl-logs,dlx,logs,unknown,viral,warcs,warcs-invalid -0_original,None,2016,,,,,22356608006374,,, -0_original,None,2017,,,,,33802480975,,, -1_data,None,2014,,,,,1137405109,,, -1_data,None,2015,,,,,20119609024,,, -1_data,None,2016,,,,,161862536422,,, -1_data,None,2017,,,,,1717439291,,, -1_data,None,2018,,,,,15902128015,,, -1_data,None,2019,,,,,322733997749,,, -1_data,None,2020,,,,,7526480384,,, -2_backups,None,2017,,,,,11214754536,,, -2_backups,None,2018,,,,,47633278863,,, -2_backups,None,2019,,,,,9088959223560,,, 
-2_backups,None,2020,,,,,941915847706,,, -9_processing,None,2016,,,,,165581265235,,, -9_processing,None,2017,,,,,837868177,,, -9_processing,None,2018,,,,,16122448923,,, -9_processing,None,2019,,,,,17086617481,,, -9_processing,None,2020,,,,,3118411218,,, -blit,None,2016,,,,,8217960771,,, -blit,None,2017,,,,,106802817414,,, -blit,None,2018,,,,,5502164646,,, -blit,None,2019,,,,,399982567,,, -crawls,None,2012,,,,,12800085491,,, -data,selective,2011,,,,,2297602445,,, -data,selective,2012,,,,,1998893419,,, -data,selective,2014,,,,,1107799922839,,, -data,selective,2017,,,,,115055397,,, -data,selective,2019,,,,,807614932,,, -datasets,None,2014,,,,,308058048005,,, -datasets,None,2017,,,,,200036276729,,, -heritrix,None,2013,,,,,170254006349,,, -heritrix,None,2014,,,,,3681747033289,,, -heritrix,None,2015,,,,,6084460400898,,, -heritrix,None,2016,,,,,558186402532,,, -heritrix,None,2017,,,,,2674517692954,,, -heritrix,None,2019,,,,,2434853095435,,, -ia,None,2011,,,,,31273038350790,,, -ia,None,2012,,,,,4056519069254,,, -ia,None,2014,,,,,30036052322986,,, -ia,None,2017,,,,,23163443,,, -logs,None,2014,,,,,10326703264,,, -logs,None,2015,,,,,967435220,,, -logs,None,2016,,,,,216110700,,, -logs,None,2017,,,,,269406885,,, -logs,None,2018,,,,,468741857,,, -logs,None,2019,,,,,1519682542,,, -logs,None,2020,,,,,1885568599,,, -lvdata,None,2011,,,,,14758580,,, -lvdata,None,2015,,,,,450484624,,, -lvdata,None,2016,,,,,215438,,, -lvdata,None,2017,,,,,550013821,,, -lvdata,None,2018,,,,,743900179,,, -lvdata,None,2019,,,,,21142482,,, -npld,domain,2013,,497880630017,,225807897946,,4378472582,33902836541889, -npld,domain,2014,,,,1397836972235,,4882231936,62133135017950, -npld,domain,2015,,235613102166,,337291291481,,4570058043,75488235762146, -npld,domain,2016,,1021393765281,,889369950291,,8012152161,103798783257500, -npld,domain,2017,,827631587106,,650096581292,,4444901826,76973045845589, -npld,domain,2018,,1417051338354,,,,2075196719,57584278800929, -npld,domain,2019,,998109935920,,,,1771940785,62704803095201, -npld,domain,2020,,28648779575,,,,21493171,, -npld,frequent,2013,,,,784344449,,99041071,5246067047752, -npld,frequent,2014,,,,117389643041,,34269049,9687626781426, -npld,frequent,2015,,,,74768951781,,271945467,14373619995411, -npld,frequent,2016,,40202387198,,119957450800,,2170071165,26691147872029, -npld,frequent,2017,,933573121920,,1036051730769,,119883019,25776771440367,13143678154 -npld,frequent,2018,,1273256843460,,1590177149560,,255134402,60678798483629,17960316500 -npld,frequent,2019,,3306036343595,,2919995202567,,244227036,102332853678911, -npld,frequent,2020,,326451164873,,127397676420,,8261211,8314245019110, -npld,webrecorder,2016,,,,,,,491399057, -npld,webrecorder,2019,,,,,,,1138465383, -npld,webrecorder,2020,,,,,,,1653237053, -selective,selective,2010,,16174178,14852340758,1466992,87516346638,,889691193222, -selective,selective,2011,,70177720850,114203699462,7627959672,55165225240,,9682572380858,96736608 -selective,selective,2012,8597134205,29180359746,42420273550,5782744821,960121658,,5126476552554, -selective,selective,2013,32234715695,30828782244,39798000486,7371246183,343710799,,4944394041969, -selective,selective,2014,20844001981,17389874737,24449401822,4919485664,181445191,,3122414823039, -selective,selective,2015,14243250050,15662303216,20358418164,5676141494,255820890,,2259050467116, -selective,selective,2016,2430183563,2211062877,3181261947,1110765359,44371755,,391490414932, -tmp,None,2010,,,,,640716644,,, -tmp,None,2011,,,,,12887736803,,, -tmp,None,2012,,,,,1045726679,,, 
-tmp,None,2013,,,,,175177527,,, -tmp,None,2014,,,,,24530609,,, -tmp,None,2015,,,,,91108259,,, -tmp,None,2016,,,,,25889003,,, -tmp,None,2017,,,,,180527337,,, -tmp,None,2018,,,,,185392360,,, -tmp,None,2019,,,,,2089575884,,, -tmp,None,2020,,,,,1624290646,,, -user,None,2015,,,,,26882901161,,, -user,None,2016,,,,,26603665515,,, -user,None,2017,,,,,111546570706,,, -user,None,2018,,,,,317525167912,,, -user,None,2019,,,,,66679231335,,, -user,None,2020,,,,,8888634521,,, -wap,None,2010,,,,,108129896030,,, -wayback,None,2015,,,,,239024157440,,, -wayback,None,2016,,,,,75861089075,,, diff --git a/content/storage/dls.md b/content/storage/dls.md new file mode 100644 index 0000000..e480e6b --- /dev/null +++ b/content/storage/dls.md @@ -0,0 +1,205 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# DLS Comparison + +Comparing holdings on HDFS with what's in DLS, based on the status information stored in the tracking database. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import json +import requests +import pandas as pd +from ukwa_reports.solr_facet_helper import flatten_solr_buckets + +headers = {'content-type': "application/json" } + +json_facet = { + # Primary facet is by date - here we break down the last month(s) into days + 'facet': { + 'dates' : { + 'type' : 'range', + 'field' : 'timestamp_dt', + 'start' : "NOW/YEAR-10YEAR", + 'end' : "NOW/YEAR+1YEAR", + 'gap' : "+1MONTH", + # For each day, we facet based on the CDX Index field, and make sure items with no value get recorded: + 'facet': { + 'stream': { + 'type': 'terms', + "field": "stream_s", + 'missing': True, + 'facet': { + 'index_status': { + 'type': 'terms', + "field": "dls_status_i", + 'missing': True, + 'facet' : { + 'bytes': 'sum(file_size_l)' + } + } + } + } + } + } + } +} + + +params = { + 'q': '(kind_s:"warcs" OR kind_s:"logs") AND collection_s:"npld"', + 'rows': 0 +} + +r = requests.post("http://trackdb.dapi.wa.bl.uk/solr/tracking/select", params=params, data=json.dumps(json_facet), headers=headers) + +if r.status_code != 200: + print(r.text) + +df = pd.DataFrame(flatten_solr_buckets(r.json()['facets'])) +# Filter empty rows: +df=df[df['count'] != 0] + +# Add compound column: +df['status'] = df.apply(lambda row: "%s, status %s" % (row.stream, row.index_status), axis=1) + +# And CHART +import altair as alt + +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('bytes'), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600).interactive() +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import json +import requests +import pandas as pd +from ukwa_reports.solr_facet_helper import flatten_solr_buckets + +headers = {'content-type': "application/json" } + +json_facet = { + # Primary facet is by date - here we break down the last month(s) into days + 'facet': { + 'dates' : { + 'type' : 'range', + 'field' : 'timestamp_dt', + 'start' : "NOW/YEAR-10YEAR", + 'end' : "NOW/YEAR+1YEAR", + 'gap' : "+1MONTH", + # For each day, we facet based on the CDX Index field, and make sure items with no value get recorded: + 'facet': { + 'stream': { + 'type': 'terms', + "field": 
"stream_s", + 'missing': True, + 'facet': { + 'index_status': { + 'type': 'terms', + "field": "dls_status_i", + 'missing': True, + 'facet' : { + 'bytes': 'sum(file_size_l)' + } + } + } + } + } + } + } +} + + +params = { + 'q': '(kind_s:"warcs" OR kind_s:"logs") AND collection_s:"npld"', + 'rows': 0 +} + +r = requests.post("http://trackdb.dapi.wa.bl.uk/solr/tracking/select", params=params, data=json.dumps(json_facet), headers=headers) + +if r.status_code != 200: + print(r.text) + +df = pd.DataFrame(flatten_solr_buckets(r.json()['facets'])) +# Filter empty rows: +df=df[df['count'] != 0] + +# Add compound column: +df['status'] = df.apply(lambda row: "%s, status %s" % (row.stream, row.index_status), axis=1) + +# And CHART +import altair as alt + +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('bytes'), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600).interactive() +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +And the same data, shown as percentage of bytes rather than total bytes. + +```{code-cell} ipython3 +--- +editable: true +jupyter: + source_hidden: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('count', stack="normalize", axis=alt.Axis(format='%')), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600).interactive() +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- + +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- + +``` diff --git a/content/storage/humanbytes.py b/content/storage/humanbytes.py new file mode 100644 index 0000000..020f027 --- /dev/null +++ b/content/storage/humanbytes.py @@ -0,0 +1,56 @@ +from typing import List, Union + +# From https://stackoverflow.com/a/63839503/6689 + +class HumanBytes: + METRIC_LABELS: List[str] = ["B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] + BINARY_LABELS: List[str] = ["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"] + PRECISION_OFFSETS: List[float] = [0.5, 0.05, 0.005, 0.0005] # PREDEFINED FOR SPEED. + PRECISION_FORMATS: List[str] = ["{}{:.0f} {}", "{}{:.1f} {}", "{}{:.2f} {}", "{}{:.3f} {}"] # PREDEFINED FOR SPEED. + + @staticmethod + def format(num: Union[int, float], metric: bool=False, precision: int=1) -> str: + """ + Human-readable formatting of bytes, using binary (powers of 1024) + or metric (powers of 1000) representation. + """ + + assert isinstance(num, (int, float)), "num must be an int or float" + assert isinstance(metric, bool), "metric must be a bool" + assert isinstance(precision, int) and precision >= 0 and precision <= 3, "precision must be an int (range 0-3)" + + unit_labels = HumanBytes.METRIC_LABELS if metric else HumanBytes.BINARY_LABELS + last_label = unit_labels[-1] + unit_step = 1000 if metric else 1024 + unit_step_thresh = unit_step - HumanBytes.PRECISION_OFFSETS[precision] + + is_negative = num < 0 + if is_negative: # Faster than ternary assignment or always running abs(). + num = abs(num) + + for unit in unit_labels: + if num < unit_step_thresh: + # VERY IMPORTANT: + # Only accepts the CURRENT unit if we're BELOW the threshold where + # float rounding behavior would place us into the NEXT unit: F.ex. 
+ # when rounding a float to 1 decimal, any number ">= 1023.95" will + # be rounded to "1024.0". Obviously we don't want ugly output such + # as "1024.0 KiB", since the proper term for that is "1.0 MiB". + break + if unit != last_label: + # We only shrink the number if we HAVEN'T reached the last unit. + # NOTE: These looped divisions accumulate floating point rounding + # errors, but each new division pushes the rounding errors further + # and further down in the decimals, so it doesn't matter at all. + num /= unit_step + + return HumanBytes.PRECISION_FORMATS[precision].format("-" if is_negative else "", num, unit) + +#print(HumanBytes.format(2251799813685247)) # 2 pebibytes +#print(HumanBytes.format(2000000000000000, True)) # 2 petabytes +#print(HumanBytes.format(1099511627776)) # 1 tebibyte +#print(HumanBytes.format(1000000000000, True)) # 1 terabyte +#print(HumanBytes.format(1000000000, True)) # 1 gigabyte +#print(HumanBytes.format(4318498233, precision=3)) # 4.022 gibibytes +#print(HumanBytes.format(4318498233, True, 3)) # 4.318 gigabytes +#print(HumanBytes.format(-4318498233, precision=2)) # -4.02 gibibytes \ No newline at end of file diff --git a/content/storage/indexed.md b/content/storage/indexed.md new file mode 100644 index 0000000..d467a71 --- /dev/null +++ b/content/storage/indexed.md @@ -0,0 +1,96 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# Recent CDX Indexed WARCs + +This page shows recent WARCs and their CDX-indexing status. The last month's worth of data is shown, and any WARCs that are known to the tracking database, but not yet CDX indexed, will be marked as `missing`. 
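+
+The charts on this page use `flatten_solr_buckets` from the shared `ukwa_reports.solr_facet_helper` module, which is not included in this changeset. The sketch below is only an illustration, under the assumption that the helper walks the nested Solr JSON facet response and emits one flat record per date/stream/status bucket, producing the `dates`, `stream`, `cdx_status`, `count` and `bytes` columns used by the chart code that follows:
+
+```python
+# Illustrative sketch only -- the real implementation lives in ukwa_reports.solr_facet_helper.
+# It is assumed to walk the nested facet buckets (dates -> stream -> cdx_status), including the
+# 'missing' buckets requested in the facet definition, and emit one flat record per leaf bucket.
+def flatten_solr_buckets_sketch(facets):
+    rows = []
+    for date_bucket in facets['dates']['buckets']:
+        for stream_bucket in date_bucket.get('stream', {}).get('buckets', []):
+            for status_bucket in stream_bucket.get('cdx_status', {}).get('buckets', []):
+                rows.append({
+                    'dates': date_bucket['val'],
+                    'stream': stream_bucket['val'],
+                    'cdx_status': status_bucket['val'],
+                    'count': status_bucket['count'],
+                    'bytes': status_bucket.get('bytes', 0),
+                })
+    return rows
+```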
+ +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import json +import requests +import pandas as pd +from ukwa_reports.solr_facet_helper import flatten_solr_buckets + +headers = {'content-type': "application/json" } + +json_facet = { + # Primary facet is by date - here we break down the last month(s) into days + 'facet': { + 'dates' : { + 'type' : 'range', + 'field' : 'timestamp_dt', + 'start' : "NOW/MONTH-1MONTH", + 'end' : "NOW/MONTH+32DAY", +# 'start' : "NOW/MONTH-10YEAR", +# 'end' : "NOW/MONTH+1MONTH", + 'gap' : "+1DAY", +# 'gap' : "+1MONTH", + # For each day, we facet: + 'facet': { + 'stream': { + 'type': 'terms', + "field": "stream_s", + 'missing': True, + 'facet': { + 'cdx_status': { + 'type': 'terms', + "field": "cdx_index_ss", + 'missing': True, + 'facet' : { + 'bytes': 'sum(file_size_l)' + } + } + } + } + } + } + } +} + + +params = { + 'q': 'kind_s:"warcs"', + 'rows': 0 +} + +r = requests.post("http://solr8.api.wa.bl.uk/solr/tracking/select", params=params, data=json.dumps(json_facet), headers=headers) + +if r.status_code != 200: + print(r.text) + +df = pd.DataFrame(flatten_solr_buckets(r.json()['facets'])) +# Filter empty rows: +df=df[df['count'] != 0] + +# Add compound column: +df['status'] = df.apply(lambda row: "%s, %s" % (row.stream, row.cdx_status), axis=1) + + +# And CHART it: +import altair as alt + +alt.Chart(df).mark_bar(size=6).encode( + x='dates:T', + y='count', + color='status', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'), 'stream', 'cdx_status', 'count', 'bytes'] +).properties(width=600).interactive() +``` diff --git a/content/storage/scratch.md b/content/storage/scratch.md new file mode 100644 index 0000000..a70ae6b --- /dev/null +++ b/content/storage/scratch.md @@ -0,0 +1,221 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# Scratch space + +A place to experiment with other analyses + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-cell] +--- +import os +import requests +import pandas as pd +from humanbytes import HumanBytes +from IPython.display import display, HTML, FileLink, FileLinks + +pd.set_option('display.max_rows', 100) + +# Pick up source locations: +trackdb_jsonl = os.environ.get('TRACKDB_LIST_JSONL','trackdb_list.jsonl') +aws_jsonl = os.environ.get('AWS_S3_LIST_JSONL','aws_s3_list.jsonl') + +# Load TrackDB records: +df = pd.read_json(trackdb_jsonl, lines=True) + +# Also load AWS records: +aws_df = pd.read_json(aws_jsonl, lines=True) +# Filter out non-content files: +aws_df = aws_df[aws_df['kind_s'] != 'unknown'] +df = pd.concat([df,aws_df], sort=True) + +# Set up timestamp: +df['timestamp_dt']= pd.to_datetime(df['timestamp_dt']) +total_records = len(df) + +# Force integers: +df['file_size_l'] = df['file_size_l'].fillna(0) +df['file_size_l'] = df['file_size_l'].apply(int) + +display(HTML(f"Found a total of {total_records:,} WARC and crawl log files.")) +``` + +```{code-cell} ipython3 +:tags: [hide-input] + +# Dataframe of all unique paths (drop others for paths appearing in more than one 'fresh' TrackDB record): +dfu = df.drop_duplicates(subset=['file_path_s']).drop(columns=['file_path_s']) + +unique_records = len(dfu) + +display(HTML(f"Found {unique_records:,} unique files (based on file path). 
This means there are {(total_records-unique_records):,} files duplicated across storage systems.")) +``` + +The following table shows the most recent WARCs for each data store, along with the associated timestamp. This can be used to check the source data for this report is up to date. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- + +``` + +```{code-cell} ipython3 + +``` + +## Radial Visualization + +This is a work in progress and is not working yet. + +```{code-cell} ipython3 +#for gn, g in dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']): +dfuu = dfu.filter(['timestamp_dt', 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s', 'file_size_l']).rename( + columns={ + 'file_size_l': 'size_bytes', + 'kind_s': 'kind', + 'stream_s': 'stream', + 'count': 'file_count', + 'timestamp_dt': 'year', + 'collection_s': 'collection', + 'hdfs_service_id_s': 'store' + } +) +dfuu +``` + +```{code-cell} ipython3 +# Build up items for the tree: +# { +# "id": 246, +# "name": "TreeMapLayout", +# "parent": 231, +# "size": 9191 +# }, + + +entries = [] +entry_id = 0 + +entries.append({ + 'id': entry_id, + 'name': "total", + 'size': dfuu['size_bytes'].sum(), + 'count': dfuu['size_bytes'].count() +}) +parent_id = entry_id +entry_id += 1 + +for ts, ts_g in dfuu.groupby(pd.Grouper(key='year', freq="A")): + print(ts.year) + for col, col_g in ts_g.groupby('collection'): + print(ts.year, col, col_g['size_bytes'].count(), col_g['size_bytes'].sum()) + for stream, stream_g in col_g.groupby('stream'): + print(ts.year, col, stream) + for kind, kind_g in stream_g.groupby('kind'): + print(ts.year, col, stream, kind) + for store, store_g in kind_g.groupby('store'): + print(ts.year, col, stream, kind, store, store_g['size_bytes'].count(), store_g['size_bytes'].sum()) +``` + +```{code-cell} ipython3 +from altair.vega import vega + +vega({ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "description": "An example of a space-fulling radial layout for hierarchical data.", + "width": 600, + "height": 600, + "padding": 5, + "autosize": "none", + + "data": [ + { + "name": "tree", + "url": "https://vega.github.io/vega/data/flare.json", + "transform": [ + { + "type": "stratify", + "key": "id", + "parentKey": "parent" + }, + { + "type": "partition", + "field": "size", + "sort": {"field": "value"}, + "size": [{"signal": "2 * PI"}, {"signal": "width / 2"}], + "as": ["a0", "r0", "a1", "r1", "depth", "children"] + } + ] + } + ], + + "scales": [ + { + "name": "color", + "type": "ordinal", + "domain": {"data": "tree", "field": "depth"}, + "range": {"scheme": "tableau10"} + } + ], + + "marks": [ + { + "type": "arc", + "from": {"data": "tree"}, + "encode": { + "enter": { + "x": {"signal": "width / 2"}, + "y": {"signal": "height / 2"}, + "fill": {"scale": "color", "field": "depth"}, + "tooltip": {"signal": "datum.name + (datum.size ? 
', ' + datum.size + ' bytes' : '')"} + }, + "update": { + "startAngle": {"field": "a0"}, + "endAngle": {"field": "a1"}, + "innerRadius": {"field": "r0"}, + "outerRadius": {"field": "r1"}, + "stroke": {"value": "white"}, + "strokeWidth": {"value": 0.75}, + "zindex": {"value": 0} + }, + "hover": { + "stroke": {"value": "red"}, + "strokeWidth": {"value": 1.5}, + "zindex": {"value": 1} + } + } + } + ] +}) +``` + +```{code-cell} ipython3 + +``` + +```{code-cell} ipython3 + +``` + +```{code-cell} ipython3 + +``` diff --git a/content/storage/summary.md b/content/storage/summary.md new file mode 100644 index 0000000..559eabc --- /dev/null +++ b/content/storage/summary.md @@ -0,0 +1,346 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# Storage Summary Report + +## Introduction +Every day, all active data stores are scanned and their inventories taken. This report uses that data to summarise our current holdings in terms of volumes of data and numbers of files. We classify files in various ways, and provide summary statistics based on those groupings. They include: + +| Classification | Description | +| -------------- | ----------- | +**Date** | File creation date by e.g. year, or financial year, or month, etc. | +**Collection** | The legal framework we collect under. One of `selective`, `npld` or `bypm` (by permission) +| **Stream** | The capture process, e.g. `selective` (WCT), `frequent`, `domain`, `webrecorder`, `warcit` | +| **Kind** | The kind of file, e.g. `warcs`, `viral` (meaning WARCs of nullified viral records), `crawl-logs` etc. | +| **Store** | The storage system the files reside on, e.g. 
`h020` (old Hadoop), `h3` (new Hadoop), `aws_s3` (AWS S3 Glacier) | | + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +## Current Totals + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import os +import requests +import pandas as pd +from humanbytes import HumanBytes +from IPython.display import display, HTML, FileLink, FileLinks +import pathlib + +# Helper function to show a table and offer a downloadable version: +def show_table_and_dl(df, slug): + # Shift to standard Column Names + df = df.rename(columns={ + 'timestamp_dt': 'year', + 'collection_s': 'collection', + 'stream_s': 'stream', + 'kind_s': 'kind', + 'size': 'size_bytes', + 'count': 'file_count', + 'hdfs_service_id_s': 'store' + }) + + # Add a Total: + df.loc['Total']= df.sum(numeric_only=True) + + # Replace NaNs + df = df.fillna('') + + # Clean up size formatting: + df['size'] = df['size_bytes'].apply(lambda x: HumanBytes.format(x, True)) + df['size_bytes'] = df['size_bytes'].apply(int) + df['file_count'] = df['file_count'].apply(int) + + # Also make the data available for download: + csv_file = f'{slug}.csv' + df.to_csv(csv_file, index=False) + dl = FileLink(csv_file, result_html_prefix='Download the data from this table here: ') + display(df,dl) + +# Establish the current folder: +dir_path = pathlib.Path().absolute() + +# Display more rows by default: +pd.set_option('display.max_rows', 100) + +# Pick up source locations: +trackdb_jsonl = os.environ.get('TRACKDB_LIST_JSONL', dir_path.joinpath('trackdb_list.jsonl')) +aws_jsonl = os.environ.get('AWS_S3_LIST_JSONL', dir_path.joinpath('aws_s3_list.jsonl')) + +# Load TrackDB records: +df = pd.read_json(trackdb_jsonl, lines=True) + +# Also load AWS records: +aws_df = pd.read_json(aws_jsonl, lines=True) +# Filter out non-content files: +aws_df = aws_df[aws_df['kind_s'] != 'unknown'] +df = pd.concat([df,aws_df], sort=True) + +# Set up timestamp: +df['timestamp_dt']= pd.to_datetime(df['timestamp_dt']) +total_records = len(df) + +# Force integers: +df['file_size_l'] = df['file_size_l'].fillna(0) +df['file_size_l'] = df['file_size_l'].apply(int) + +display(HTML(f"Found a total of {total_records:,} WARC and crawl log files.")) +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +# Dataframe of all unique paths (drop others for paths appearing in more than one 'fresh' TrackDB record): +dfu = df.drop_duplicates(subset=['file_path_s']).drop(columns=['file_path_s']) + +unique_records = len(dfu) + +display(HTML(f"Found {unique_records:,} unique files (based on file path). This means there are {(total_records-unique_records):,} files duplicated across storage systems.")) +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +## Most Recent Files +The following table shows the most recent WARCs for each data store, along with the associated timestamp. This can be used to check the source data for this report is up to date, as the date for the `h3` store should be within the last day or two. 
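+
+If an automated freshness check were wanted (it is not currently part of this report), something along these lines could be derived from the same `df` loaded above; this is only a sketch and is not executed as part of the build:
+
+```python
+# Sketch only: flag stores whose newest WARC is older than roughly two days.
+# Assumes `df` as loaded above, with 'kind_s', 'hdfs_service_id_s' and 'timestamp_dt' columns;
+# timezone handling may need adjusting depending on how the timestamps are stored.
+latest = df[df['kind_s'] == 'warcs'].groupby('hdfs_service_id_s')['timestamp_dt'].max()
+stale = latest[latest < (pd.Timestamp.now() - pd.Timedelta(days=2))]
+if not stale.empty:
+    print("Stores with no new WARCs in the last two days:")
+    print(stale)
+```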
+ +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +pd.set_option('display.max_colwidth', 1024) +# Now we look for the most recent WARC files: +dflw = df.filter(items=['hdfs_service_id_s', 'file_path_s', 'kind_s', 'timestamp_dt'], axis=1) +dflw = dflw.loc[dflw['kind_s'] == 'warcs'].sort_values(['timestamp_dt'],ascending=False).groupby('hdfs_service_id_s').first() +dflw = dflw.reset_index().rename(columns={ + 'hdfs_service_id_s': 'store', +}) +dflw +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +## Totals by Collection & Stream + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +tots = dfu.groupby(['collection_s','stream_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() + +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_collection_and_stream') +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +## Statistics by Year + +### Overall totals by year + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +#tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A")]).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() + +# Clip year: +tots['timestamp_dt'] = tots['timestamp_dt'].dt.year.apply(lambda x: str(x)) + +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_year') +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +### Totals by Year & Collection + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +#tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() + +# Clip year: +tots['timestamp_dt'] = tots['timestamp_dt'].dt.year.apply(lambda x: str(x)) + +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_year_collection') +``` + +### Totals by Year, Collection, Stream, Store & Kind + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +tots = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) +tots = tots.reset_index() + +# Clip year: +tots['timestamp_dt'] = tots['timestamp_dt'].dt.year.apply(lambda x: str(x)) + +# Show table and downloader: +show_table_and_dl(tots, 'totals_by_year_collection_stream_store_kind') +``` + +## Statistics by Financial Year + +The same data, but aggregating by financial year. 
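The financial-year tables that follow rely on pandas' annual frequency anchored to the end of March (`freq="A-MAR"`), so each bucket runs from April to the following March and is labelled by its start and end years. A minimal standalone sketch of that behaviour, using made-up dates:

```python
import pandas as pd

# Two made-up files either side of the April year boundary:
demo = pd.DataFrame({
    'timestamp_dt': pd.to_datetime(['2021-03-31', '2021-04-01']),
    'file_size_l': [100, 200],
})

fy = demo.groupby(pd.Grouper(key='timestamp_dt', freq="A-MAR")).agg(
    count=('file_size_l', 'count'), size=('file_size_l', 'sum')).reset_index()

# Label each bucket as e.g. '2020-2021', as the tables below do:
fy['fy'] = fy['timestamp_dt'].dt.year.apply(lambda x: f"{x-1}-{x}")
print(fy[['fy', 'count', 'size']])
# The 31 March 2021 file lands in '2020-2021'; the 1 April 2021 file in '2021-2022'.
```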
+ +### Totals by Financial Year + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +by_fy = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR")]).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) + +# Removed heirarchical index so we can plot: +by_fy = by_fy.reset_index() + +# Transform how FY is presented: +by_fy['timestamp_dt'] = by_fy['timestamp_dt'].dt.year.apply(lambda x: str(x-1) + "-" + str(x)) + +# Show table and downloader: +show_table_and_dl(by_fy, 'totals_by_fy') +``` + +### Totals by Financial Year, Collection, Stream, Store & Kind + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +by_fy = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR"), 'collection_s', 'stream_s', 'hdfs_service_id_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) + +# Removed heirarchical index so we can plot: +by_fy = by_fy.reset_index() + +# Transform how FY is presented: +by_fy['timestamp_dt'] = by_fy['timestamp_dt'].dt.year.apply(lambda x: str(x-1) + "-" + str(x)) + +# Show table and downloader: +show_table_and_dl(by_fy, 'totals_by_fy_collection_stream_store_kind') +``` + +### Graphs of Totals by Stream & Kind, over Time + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +by_fy_s = dfu.groupby([pd.Grouper(key='timestamp_dt', freq="A-MAR"), 'stream_s', 'kind_s']).agg(count=('file_size_l', 'count'), size=('file_size_l', 'sum')) + +# Removed heirarchical index so we can plot: +by_fy_s = by_fy_s.reset_index() + +# Transform how FY is presented: +by_fy_s['fy'] = by_fy_s['timestamp_dt'].dt.year.apply(lambda x: str(x-1) + "-" + str(x)) + +# Refactor/renaming: +by_fy_s = by_fy_s.filter(['fy', 'stream_s', 'kind_s', 'count', 'size']) + +# Present sizes in a readable way +by_fy_s['readable_size'] = by_fy_s['size'].apply(lambda x: HumanBytes.format(x, True)) +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import altair as alt + +selection = alt.selection_point(fields=['stream_s']) +color = alt.condition( + selection, + alt.Color('stream_s:N').legend(None), + alt.value('lightgray') +) + +scatter = alt.Chart(by_fy_s).mark_bar().encode( + x=alt.X('fy', axis = alt.Axis(title = 'Financial year')), + y=alt.Y('size', axis = alt.Axis(title = 'Total bytes', format='s')), + color=color, + row=alt.Row('kind_s', title='Kind'), + tooltip=[ + alt.Tooltip('fy', title='Financial year'), + alt.Tooltip('stream_s', title='Content stream'), + alt.Tooltip('count', title='Number of files'), + alt.Tooltip('readable_size', title='Total bytes') + ] +).properties( + width=600,height=200 +).resolve_scale(y='independent') + +legend = alt.Chart(by_fy_s).mark_point().encode( + alt.Y('stream_s').axis(orient='right'), + color=color +).add_params( + selection +) + +scatter | legend +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- + +``` diff --git a/content/storage/test.md b/content/storage/test.md new file mode 100644 index 0000000..b0f3f9e --- /dev/null +++ b/content/storage/test.md @@ -0,0 +1,20 @@ +--- +jupytext: + cell_metadata_filter: -all + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Hello + +```{code-cell} ipython3 +print(2+2) +``` diff --git 
a/content/storage/timeline.md b/content/storage/timeline.md new file mode 100644 index 0000000..c1a33c8 --- /dev/null +++ b/content/storage/timeline.md @@ -0,0 +1,149 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +# HDFS Timeline + +Breaking down what's stored on HDFS onto a timeline, i.e. totals do not include data held only on AWS Glacier. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import json +import requests +import pandas as pd +from ukwa_reports.solr_facet_helper import flatten_solr_buckets + +headers = {'content-type': "application/json" } + +json_facet = { + # Primary facet is by date - here we break down the last month(s) into days + 'facet': { + 'dates' : { + 'type' : 'range', + 'field' : 'timestamp_dt', + 'start' : "NOW/YEAR-20YEAR", + 'end' : "NOW/YEAR+1YEAR", + 'gap' : "+1MONTH", + # For each day, we facet based on the CDX Index field, and make sure items with no value get recorded: + 'facet': { + 'collection': { + 'type': 'terms', + "field": "collection_s", + 'missing': True, + 'facet': { + 'stream': { + 'type': 'terms', + "field": "stream_s", + 'missing': True, + 'facet' : { + 'bytes': 'sum(file_size_l)' + } + } + } + } + } + } + } +} + + +params = { + 'q': '(kind_s:"warcs" OR kind_s:"logs")', + 'rows': 0 +} + +r = requests.post("http://solr8.api.wa.bl.uk/solr/tracking/select", params=params, data=json.dumps(json_facet), headers=headers) + +if r.status_code != 200: + print(r.text) + +df = pd.DataFrame(flatten_solr_buckets(r.json()['facets'])) +# Filter empty rows: +df=df[df['count'] != 0] + +# Add compound column: +df['status'] = df.apply(lambda row: "%s, %s" % (row.collection, row.stream), axis=1) +df['terabytes'] = df.apply(lambda row: row.bytes / (1000*1000*1000*1000), axis=1) + +# CHART +import altair as alt + +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))), + y=alt.Y('terabytes', axis=alt.Axis(title='Data volume (TB)')), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'terabytes'] +).properties(width=600) +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +And the same data as a percentage per time period. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +alt.Chart(df).mark_bar().encode( + x=alt.X('dates:T', axis = alt.Axis(title='Date', format=("%b %Y"))), + y=alt.Y('count', stack="normalize", axis=alt.Axis(title='Percentage of files', format='%')), + color='status:N', + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'),'status:N', 'count', 'bytes'] +).properties(width=600) +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +And as a cumulative graph. 
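As a quick sanity check on the windowed sum used in the chart below, the same overall running total can be computed directly in pandas. This is just a sketch: it assumes the `df` built above, and it combines all collections and streams into a single series:

```python
# Rough pandas equivalent of the chart's running total, summed across all streams:
overall = (
    df.groupby('dates')['terabytes'].sum()
      .sort_index()          # ISO date strings sort chronologically
      .cumsum()
      .reset_index(name='cumulative_terabytes')
)
overall.tail()
```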
+ +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +import altair as alt + +alt.Chart(df).transform_window( + cumulative_terabytes="sum(terabytes)", +).mark_line().encode( + x=alt.X('dates:T', axis=alt.Axis(title='Date', format=("%b %Y"))), + y=alt.Y('cumulative_terabytes:Q', axis=alt.Axis(title='Cumulative total data volume (TB)')), + tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'), 'cumulative_terabytes:Q'] +).properties(width=600) +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +And as cumulative totals (calculated directly rather than using the graph library): + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [remove-input] +--- +df2 = df.groupby(['status'])['terabytes'].sum().groupby(level=0).cumsum().reset_index() +df2 +``` diff --git a/docker-compose.yml b/docker-compose.yml index 1721a7c..20c050c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,19 +1,34 @@ # This Compose file is intended for local development, not production deployment. # -# TODO change this file back to v.2 and set it up for local dev. -# version: "3.2" services: - hugo: - build: . + lab: + # NOTE this will need to be changed for each user - for some reason username strings don't work: + user: "1001" + build: + context: . + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} ports: - - "1001:80" + - "8888:8888" + command: "jupyter-lab --ip=0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.password=''" + volumes: + - /etc/passwd:/etc/passwd:ro + - /etc/group:/etc/group:ro + - /etc/shadow:/etc/shadow:ro + - /home/${USER}:/home/${USER} + - ${PWD}:/host + working_dir: /host - logviewer: - image: ukwa/crawl-log-viewer + site: + # Make results of build.sh available at http://:8889/act/static/reports/ + image: nginx ports: - - "8000:8000" + - "8889:80" + volumes: + - ./_build/html:/usr/share/nginx/html/act/static/reports:ro diff --git a/layouts/_default/list.html b/layouts/_default/list.html deleted file mode 100644 index 62fae05..0000000 --- a/layouts/_default/list.html +++ /dev/null @@ -1,13 +0,0 @@ -{{ partial "header" . }} - -
- -

{{ .Title }}

- - {{ range (.Paginator 100).Pages }} {{ partial "list-item" . }} {{ end }} - -
- -{{ partial "paginator" . }} - -{{ partial "footer" . }} diff --git a/layouts/_default/terms.html b/layouts/_default/terms.html deleted file mode 100644 index 62fae05..0000000 --- a/layouts/_default/terms.html +++ /dev/null @@ -1,13 +0,0 @@ -{{ partial "header" . }} - -
- -

{{ .Title }}

- - {{ range (.Paginator 100).Pages }} {{ partial "list-item" . }} {{ end }} - -
- -{{ partial "paginator" . }} - -{{ partial "footer" . }} diff --git a/layouts/crawls/single.html b/layouts/crawls/single.html deleted file mode 100644 index db16518..0000000 --- a/layouts/crawls/single.html +++ /dev/null @@ -1,43 +0,0 @@ -{{ partial "header" . }} - -
- - {{ partial "list-item" . }} - -
{{ .Content }}
- - - {{ $related := first 3 (where (where (where .Site.Pages.ByDate.Reverse ".Type" "==" "post") ".Params.tags" "intersect" .Params.tags) "Permalink" "!=" .Permalink) }} - - {{ if $related }} - - - - {{ range $related }} {{ partial "list-item" . }} {{ end }} - - {{ end }} - -

Files

- - - {{ range .Params.files }} - {{ $downloadUrl := print "http://hdfs.gtw.wa.bl.uk:14000/webhdfs/v1" .path "?user.name=access&op=OPEN" }} - - - - - - - {{ end }} -
FileKindDateSize [bytes]
- {{ if eq .kind "crawl-logs" }} - {{ index (last 1 (split .path "/")) 0 }} - {{ else }} - {{ index (last 1 (split .path "/")) 0 }} - {{ end }} -  [download] - {{ .kind }}{{ .timestamp }}{{ .filesize }}
- -
- -{{ partial "footer.html" . }} diff --git a/layouts/partials/list-item.html b/layouts/partials/list-item.html deleted file mode 100644 index bf88530..0000000 --- a/layouts/partials/list-item.html +++ /dev/null @@ -1,25 +0,0 @@ -
- - {{ $.Scratch.Set "link" .RelPermalink }} - {{ with .Params.repo }} - {{ $repoHost := default "github" $.Params.repoHost }} - {{ if eq "github" $repoHost }} - {{ printf "https://github.com/%s/%s/" $.Site.Params.githubUsername . | $.Scratch.Set "link" }} - {{ else if eq "gitlab" $repoHost }} - {{ printf "https://gitlab.com/%s/%s/" $.Site.Params.gitlabUsername . | $.Scratch.Set "link" }} - {{ else if eq "bitbucket" $repoHost }} - {{ printf "https://bitbucket.org/%s/%s/" $.Site.Params.bitbucketUsername . | $.Scratch.Set "link" }} - {{ end }} - {{ end }} - {{ with .Params.link }} {{ $.Scratch.Set "link" . }} {{ end }} - - {{ .Date.Format (.Site.Params.dateFormat | default "2 January 2006") | $.Scratch.Set "subtitle" }} - {{ with .Description }} {{ $.Scratch.Set "subtitle" . }} {{ end }} - -

{{ .Title }}

-
{{ $.Scratch.Get "subtitle" }}
- {{ range .Params.tags }} - {{ . }} - {{ end }} - -
diff --git a/layouts/reports/single.html b/layouts/reports/single.html deleted file mode 100644 index c0dbd10..0000000 --- a/layouts/reports/single.html +++ /dev/null @@ -1,46 +0,0 @@ -{{ partial "header" . }} - - - - - - - -
- - {{ partial "list-item" . }} - -
-

Table of contents

- {{.TableOfContents}} -
- -
{{ .Content }}
- - - {{ $related := first 3 (where (where (where .Site.Pages.ByDate.Reverse ".Type" "==" "post") ".Params.tags" "intersect" .Params.tags) "Permalink" "!=" .Permalink) }} - - {{ if $related }} - - - - {{ range $related }} {{ partial "list-item" . }} {{ end }} - - {{ end }} - -
- -{{ partial "footer.html" . }} diff --git a/layouts/shortcodes/csv-table.html b/layouts/shortcodes/csv-table.html deleted file mode 100644 index eeca347..0000000 --- a/layouts/shortcodes/csv-table.html +++ /dev/null @@ -1,27 +0,0 @@ - - {{ $url := print "content/" (.Get "src") }} - {{ $sep := "," }} - {{ range $i, $r := getCSV $sep $url }} - {{ if eq 0 $i }} - - - {{ range $c := $r }} - - {{ end }} - - - - {{ else }} - - {{ range $c := $r }} - - {{ end }} - - {{ end }} - {{ end }} - -
{{ $c }}
{{ $c }}
- -

- You can download the underlying dataset for the above table from here. -

diff --git a/layouts/shortcodes/date-bar-chart.html b/layouts/shortcodes/date-bar-chart.html deleted file mode 100644 index 90f72fc..0000000 --- a/layouts/shortcodes/date-bar-chart.html +++ /dev/null @@ -1,22 +0,0 @@ -{{ $id := base64Encode (.Get "src") }} -
- - - -
-

- You can download the underlying dataset for the above graph from here. -

diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..37dba74 --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup, find_packages + +# This package contains a few helper functions +setup( + name='ukwa-reports', + version='0.1.0', + packages=find_packages(include=['ukwa_reports', 'ukwa_reports.*']), + install_requires=[ + 'jupyterlab', + 'jupyter-book', + 'altair', + 'jupytext', + ] +) diff --git a/static/css/main.css b/static/css/main.css deleted file mode 100644 index cafb2dc..0000000 --- a/static/css/main.css +++ /dev/null @@ -1,109 +0,0 @@ -html, body { - height: 100%; -} - -body { - padding-top: 55px; - display: flex; - text-align: center; - flex-direction: column; -} - -main { - margin: auto; - padding: 25px; - flex: 1 0 auto; -} - -main table { - text-align: left; -} - -main table th{ - border-bottom: 1px solid #333; -} -main table td { - padding: 1px 5px; -} -/*footer*/ - -.copyright { - margin: 15px 0; -} - -/*home page*/ - -.intro { - transform: translateY(22vh); -} - -.intro > h1 { - color: #212121; - font-size: 12vh; -} - -.intro > h2 { - color: #757575; - font-size: 3vmin; -} - -.intro > .profile { - width: 10vh; - height: 10vh; - border-radius: 50%; -} - -/*apply accent colour to links*/ - -a:link, a:visited { - color: var(--accent); -} - -a.icon:hover { - text-decoration: none; -} - -a:hover { - color: var(--accent) !important; -} - -/*paginator at bottom of list view*/ - -.pages { - padding: 15px 0; -} - -.pages-icon { - padding: 0 15px; -} - -/*list item for posts and projects*/ - -.item { - padding: 10px 0; -} - -.item-tag { - background-color: var(--accent); -} - -/*navigation bar icons*/ - -.navbar-icon { - font-size: 125%; - display: inline-block !important; -} - -/*coloured borders at top and bottom of the page*/ - -.navbar.navbar-default { - border-top: var(--border-width) solid var(--accent); -} - -footer { - border-bottom: var(--border-width) solid var(--accent); -} - -img { - max-width: 100%; -} diff --git a/themes/minimal b/themes/minimal deleted file mode 160000 index 7d92985..0000000 --- a/themes/minimal +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7d929851ffdd5a0752d8b1f05596cf7cbf907982 diff --git a/index.html b/ukwa_reports/__init__.py similarity index 100% rename from index.html rename to ukwa_reports/__init__.py diff --git a/ukwa_reports/solr_facet_helper.py b/ukwa_reports/solr_facet_helper.py new file mode 100644 index 0000000..4be44c7 --- /dev/null +++ b/ukwa_reports/solr_facet_helper.py @@ -0,0 +1,38 @@ +# +# This is a bit knarly but it helpfully flattens the Solr JSON API reponse +# (which is a kind of tree shape) into a flat table that Pandas can work with. 
+# +# See [Solr's JSON Facet API](https://lucene.apache.org/solr/guide/8_4/json-facet-api.html) +# + +def flatten_solr_buckets(solr_facets): + flat = [] + for key in solr_facets: + if isinstance(solr_facets[key], dict): + for vals in _flatten_facet_buckets(key, solr_facets): + flat.append(vals.copy()) + return flat + +def _flatten_facet_buckets(facet_name, bucket, values={}): + subfacets = [] + for bucket_name in bucket: + if isinstance(bucket[bucket_name],dict): + subfacets.append(bucket_name) + if len(subfacets) > 0: + for bucket_name in subfacets: + for sub_bucket in bucket[bucket_name]['buckets']: + values[bucket_name] = sub_bucket['val'] + for sub_values in _flatten_facet_buckets(bucket_name, sub_bucket, values.copy()): + yield sub_values + # Also deal with the special 'missing' bucket: + if 'missing' in bucket[bucket_name]: + values[bucket_name] = "missing" + for sub_values in _flatten_facet_buckets(bucket_name, bucket[bucket_name]['missing'], values.copy()): + yield sub_values + else: + for bucket_name in bucket: + if bucket_name != 'val': + values[bucket_name] = bucket[bucket_name] + yield values + +
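For illustration, here is roughly how the helper behaves on a small hand-made facet response, shaped like the one requested in the HDFS timeline report (all values below are invented):

```python
from ukwa_reports.solr_facet_helper import flatten_solr_buckets

# A made-up, cut-down Solr JSON Facet response: dates -> collection -> stream -> bytes.
facets = {
    "count": 2,
    "dates": {
        "buckets": [
            {"val": "2020-01-01T00:00:00Z", "count": 2,
             "collection": {"buckets": [
                 {"val": "npld", "count": 2,
                  "stream": {"buckets": [
                      {"val": "domain", "count": 2, "bytes": 1024.0}
                  ]}}
             ]}}
        ]
    }
}

rows = flatten_solr_buckets(facets)
print(rows)
# [{'dates': '2020-01-01T00:00:00Z', 'collection': 'npld', 'stream': 'domain',
#   'count': 2, 'bytes': 1024.0}]
```

Each nested bucket becomes one flat row keyed by the facet names, which is what lets `timeline.md` build a `pandas.DataFrame` straight from the result.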