diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..c3a02dd --- /dev/null +++ b/404.html @@ -0,0 +1,597 @@ + + + + + + + + + + + + + + + + + + + + alto2txt2fixture + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/assets/_mkdocstrings.css b/assets/_mkdocstrings.css new file mode 100644 index 0000000..049a254 --- /dev/null +++ b/assets/_mkdocstrings.css @@ -0,0 +1,64 @@ + +/* Avoid breaking parameter names, etc. in table cells. */ +.doc-contents td code { + word-break: normal !important; +} + +/* No line break before first paragraph of descriptions. */ +.doc-md-description, +.doc-md-description>p:first-child { + display: inline; +} + +/* Max width for docstring sections tables. */ +.doc .md-typeset__table, +.doc .md-typeset__table table { + display: table !important; + width: 100%; +} + +.doc .md-typeset__table tr { + display: table-row; +} + +/* Defaults in Spacy table style. */ +.doc-param-default { + float: right; +} + +/* Keep headings consistent. */ +h1.doc-heading, +h2.doc-heading, +h3.doc-heading, +h4.doc-heading, +h5.doc-heading, +h6.doc-heading { + font-weight: 400; + line-height: 1.5; + color: inherit; + text-transform: none; +} + +h1.doc-heading { + font-size: 1.6rem; +} + +h2.doc-heading { + font-size: 1.2rem; +} + +h3.doc-heading { + font-size: 1.15rem; +} + +h4.doc-heading { + font-size: 1.10rem; +} + +h5.doc-heading { + font-size: 1.05rem; +} + +h6.doc-heading { + font-size: 1rem; +} \ No newline at end of file diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 0000000..1cf13b9 Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/javascripts/bundle.220ee61c.min.js b/assets/javascripts/bundle.220ee61c.min.js new file mode 100644 index 0000000..116072a --- /dev/null +++ b/assets/javascripts/bundle.220ee61c.min.js @@ -0,0 +1,29 @@ +"use strict";(()=>{var Ci=Object.create;var gr=Object.defineProperty;var Ri=Object.getOwnPropertyDescriptor;var ki=Object.getOwnPropertyNames,Ht=Object.getOwnPropertySymbols,Hi=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,nn=Object.prototype.propertyIsEnumerable;var rn=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,P=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&rn(e,r,t[r]);if(Ht)for(var r of Ht(t))nn.call(t,r)&&rn(e,r,t[r]);return e};var on=(e,t)=>{var r={};for(var n in e)yr.call(e,n)&&t.indexOf(n)<0&&(r[n]=e[n]);if(e!=null&&Ht)for(var n of Ht(e))t.indexOf(n)<0&&nn.call(e,n)&&(r[n]=e[n]);return r};var Pt=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Pi=(e,t,r,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of ki(t))!yr.call(e,o)&&o!==r&&gr(e,o,{get:()=>t[o],enumerable:!(n=Ri(t,o))||n.enumerable});return e};var yt=(e,t,r)=>(r=e!=null?Ci(Hi(e)):{},Pi(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var sn=Pt((xr,an)=>{(function(e,t){typeof xr=="object"&&typeof an!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(xr,function(){"use strict";function e(r){var n=!0,o=!1,i=null,s={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function a(O){return!!(O&&O!==document&&O.nodeName!=="HTML"&&O.nodeName!=="BODY"&&"classList"in O&&"contains"in O.classList)}function f(O){var Qe=O.type,De=O.tagName;return!!(De==="INPUT"&&s[Qe]&&!O.readOnly||De==="TEXTAREA"&&!O.readOnly||O.isContentEditable)}function c(O){O.classList.contains("focus-visible")||(O.classList.add("focus-visible"),O.setAttribute("data-focus-visible-added",""))}function 
u(O){O.hasAttribute("data-focus-visible-added")&&(O.classList.remove("focus-visible"),O.removeAttribute("data-focus-visible-added"))}function p(O){O.metaKey||O.altKey||O.ctrlKey||(a(r.activeElement)&&c(r.activeElement),n=!0)}function m(O){n=!1}function d(O){a(O.target)&&(n||f(O.target))&&c(O.target)}function h(O){a(O.target)&&(O.target.classList.contains("focus-visible")||O.target.hasAttribute("data-focus-visible-added"))&&(o=!0,window.clearTimeout(i),i=window.setTimeout(function(){o=!1},100),u(O.target))}function v(O){document.visibilityState==="hidden"&&(o&&(n=!0),Y())}function Y(){document.addEventListener("mousemove",N),document.addEventListener("mousedown",N),document.addEventListener("mouseup",N),document.addEventListener("pointermove",N),document.addEventListener("pointerdown",N),document.addEventListener("pointerup",N),document.addEventListener("touchmove",N),document.addEventListener("touchstart",N),document.addEventListener("touchend",N)}function B(){document.removeEventListener("mousemove",N),document.removeEventListener("mousedown",N),document.removeEventListener("mouseup",N),document.removeEventListener("pointermove",N),document.removeEventListener("pointerdown",N),document.removeEventListener("pointerup",N),document.removeEventListener("touchmove",N),document.removeEventListener("touchstart",N),document.removeEventListener("touchend",N)}function N(O){O.target.nodeName&&O.target.nodeName.toLowerCase()==="html"||(n=!1,B())}document.addEventListener("keydown",p,!0),document.addEventListener("mousedown",m,!0),document.addEventListener("pointerdown",m,!0),document.addEventListener("touchstart",m,!0),document.addEventListener("visibilitychange",v,!0),Y(),r.addEventListener("focus",d,!0),r.addEventListener("blur",h,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var cn=Pt(Er=>{(function(e){var t=function(){try{return!!Symbol.iterator}catch(c){return!1}},r=t(),n=function(c){var u={next:function(){var p=c.shift();return{done:p===void 0,value:p}}};return r&&(u[Symbol.iterator]=function(){return u}),u},o=function(c){return encodeURIComponent(c).replace(/%20/g,"+")},i=function(c){return decodeURIComponent(String(c).replace(/\+/g," "))},s=function(){var c=function(p){Object.defineProperty(this,"_entries",{writable:!0,value:{}});var m=typeof p;if(m!=="undefined")if(m==="string")p!==""&&this._fromString(p);else if(p instanceof c){var d=this;p.forEach(function(B,N){d.append(N,B)})}else if(p!==null&&m==="object")if(Object.prototype.toString.call(p)==="[object Array]")for(var h=0;hd[0]?1:0}),c._entries&&(c._entries={});for(var p=0;p1?i(d[1]):"")}})})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er);(function(e){var t=function(){try{var o=new e.URL("b","http://a");return o.pathname="c d",o.href==="http://a/c%20d"&&o.searchParams}catch(i){return!1}},r=function(){var o=e.URL,i=function(f,c){typeof f!="string"&&(f=String(f)),c&&typeof c!="string"&&(c=String(c));var u=document,p;if(c&&(e.location===void 
0||c!==e.location.href)){c=c.toLowerCase(),u=document.implementation.createHTMLDocument(""),p=u.createElement("base"),p.href=c,u.head.appendChild(p);try{if(p.href.indexOf(c)!==0)throw new Error(p.href)}catch(O){throw new Error("URL unable to set base "+c+" due to "+O)}}var m=u.createElement("a");m.href=f,p&&(u.body.appendChild(m),m.href=m.href);var d=u.createElement("input");if(d.type="url",d.value=f,m.protocol===":"||!/:/.test(m.href)||!d.checkValidity()&&!c)throw new TypeError("Invalid URL");Object.defineProperty(this,"_anchorElement",{value:m});var h=new e.URLSearchParams(this.search),v=!0,Y=!0,B=this;["append","delete","set"].forEach(function(O){var Qe=h[O];h[O]=function(){Qe.apply(h,arguments),v&&(Y=!1,B.search=h.toString(),Y=!0)}}),Object.defineProperty(this,"searchParams",{value:h,enumerable:!0});var N=void 0;Object.defineProperty(this,"_updateSearchParams",{enumerable:!1,configurable:!1,writable:!1,value:function(){this.search!==N&&(N=this.search,Y&&(v=!1,this.searchParams._fromString(this.search),v=!0))}})},s=i.prototype,a=function(f){Object.defineProperty(s,f,{get:function(){return this._anchorElement[f]},set:function(c){this._anchorElement[f]=c},enumerable:!0})};["hash","host","hostname","port","protocol"].forEach(function(f){a(f)}),Object.defineProperty(s,"search",{get:function(){return this._anchorElement.search},set:function(f){this._anchorElement.search=f,this._updateSearchParams()},enumerable:!0}),Object.defineProperties(s,{toString:{get:function(){var f=this;return function(){return f.href}}},href:{get:function(){return this._anchorElement.href.replace(/\?$/,"")},set:function(f){this._anchorElement.href=f,this._updateSearchParams()},enumerable:!0},pathname:{get:function(){return this._anchorElement.pathname.replace(/(^\/?)/,"/")},set:function(f){this._anchorElement.pathname=f},enumerable:!0},origin:{get:function(){var f={"http:":80,"https:":443,"ftp:":21}[this._anchorElement.protocol],c=this._anchorElement.port!=f&&this._anchorElement.port!=="";return this._anchorElement.protocol+"//"+this._anchorElement.hostname+(c?":"+this._anchorElement.port:"")},enumerable:!0},password:{get:function(){return""},set:function(f){},enumerable:!0},username:{get:function(){return""},set:function(f){},enumerable:!0}}),i.createObjectURL=function(f){return o.createObjectURL.apply(o,arguments)},i.revokeObjectURL=function(f){return o.revokeObjectURL.apply(o,arguments)},e.URL=i};if(t()||r(),e.location!==void 0&&!("origin"in e.location)){var n=function(){return e.location.protocol+"//"+e.location.hostname+(e.location.port?":"+e.location.port:"")};try{Object.defineProperty(e.location,"origin",{get:n,enumerable:!0})}catch(o){setInterval(function(){e.location.origin=n()},100)}}})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er)});var qr=Pt((Mt,Nr)=>{/*! 
+ * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof Mt=="object"&&typeof Nr=="object"?Nr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Mt=="object"?Mt.ClipboardJS=r():t.ClipboardJS=r()})(Mt,function(){return function(){var e={686:function(n,o,i){"use strict";i.d(o,{default:function(){return Ai}});var s=i(279),a=i.n(s),f=i(370),c=i.n(f),u=i(817),p=i.n(u);function m(j){try{return document.execCommand(j)}catch(T){return!1}}var d=function(T){var E=p()(T);return m("cut"),E},h=d;function v(j){var T=document.documentElement.getAttribute("dir")==="rtl",E=document.createElement("textarea");E.style.fontSize="12pt",E.style.border="0",E.style.padding="0",E.style.margin="0",E.style.position="absolute",E.style[T?"right":"left"]="-9999px";var H=window.pageYOffset||document.documentElement.scrollTop;return E.style.top="".concat(H,"px"),E.setAttribute("readonly",""),E.value=j,E}var Y=function(T,E){var H=v(T);E.container.appendChild(H);var I=p()(H);return m("copy"),H.remove(),I},B=function(T){var E=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},H="";return typeof T=="string"?H=Y(T,E):T instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(T==null?void 0:T.type)?H=Y(T.value,E):(H=p()(T),m("copy")),H},N=B;function O(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?O=function(E){return typeof E}:O=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},O(j)}var Qe=function(){var T=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},E=T.action,H=E===void 0?"copy":E,I=T.container,q=T.target,Me=T.text;if(H!=="copy"&&H!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(q!==void 0)if(q&&O(q)==="object"&&q.nodeType===1){if(H==="copy"&&q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if(H==="cut"&&(q.hasAttribute("readonly")||q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. 
You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(Me)return N(Me,{container:I});if(q)return H==="cut"?h(q):N(q,{container:I})},De=Qe;function $e(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?$e=function(E){return typeof E}:$e=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},$e(j)}function Ei(j,T){if(!(j instanceof T))throw new TypeError("Cannot call a class as a function")}function tn(j,T){for(var E=0;E0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof I.action=="function"?I.action:this.defaultAction,this.target=typeof I.target=="function"?I.target:this.defaultTarget,this.text=typeof I.text=="function"?I.text:this.defaultText,this.container=$e(I.container)==="object"?I.container:document.body}},{key:"listenClick",value:function(I){var q=this;this.listener=c()(I,"click",function(Me){return q.onClick(Me)})}},{key:"onClick",value:function(I){var q=I.delegateTarget||I.currentTarget,Me=this.action(q)||"copy",kt=De({action:Me,container:this.container,target:this.target(q),text:this.text(q)});this.emit(kt?"success":"error",{action:Me,text:kt,trigger:q,clearSelection:function(){q&&q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(I){return vr("action",I)}},{key:"defaultTarget",value:function(I){var q=vr("target",I);if(q)return document.querySelector(q)}},{key:"defaultText",value:function(I){return vr("text",I)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(I){var q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return N(I,q)}},{key:"cut",value:function(I){return h(I)}},{key:"isSupported",value:function(){var I=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],q=typeof I=="string"?[I]:I,Me=!!document.queryCommandSupported;return q.forEach(function(kt){Me=Me&&!!document.queryCommandSupported(kt)}),Me}}]),E}(a()),Ai=Li},828:function(n){var o=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function s(a,f){for(;a&&a.nodeType!==o;){if(typeof a.matches=="function"&&a.matches(f))return a;a=a.parentNode}}n.exports=s},438:function(n,o,i){var s=i(828);function a(u,p,m,d,h){var v=c.apply(this,arguments);return u.addEventListener(m,v,h),{destroy:function(){u.removeEventListener(m,v,h)}}}function f(u,p,m,d,h){return typeof u.addEventListener=="function"?a.apply(null,arguments):typeof m=="function"?a.bind(null,document).apply(null,arguments):(typeof u=="string"&&(u=document.querySelectorAll(u)),Array.prototype.map.call(u,function(v){return a(v,p,m,d,h)}))}function c(u,p,m,d){return function(h){h.delegateTarget=s(h.target,p),h.delegateTarget&&d.call(u,h)}}n.exports=f},879:function(n,o){o.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},o.nodeList=function(i){var s=Object.prototype.toString.call(i);return i!==void 0&&(s==="[object NodeList]"||s==="[object HTMLCollection]")&&"length"in i&&(i.length===0||o.node(i[0]))},o.string=function(i){return typeof i=="string"||i instanceof String},o.fn=function(i){var s=Object.prototype.toString.call(i);return s==="[object Function]"}},370:function(n,o,i){var s=i(879),a=i(438);function f(m,d,h){if(!m&&!d&&!h)throw new Error("Missing required 
arguments");if(!s.string(d))throw new TypeError("Second argument must be a String");if(!s.fn(h))throw new TypeError("Third argument must be a Function");if(s.node(m))return c(m,d,h);if(s.nodeList(m))return u(m,d,h);if(s.string(m))return p(m,d,h);throw new TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(m,d,h){return m.addEventListener(d,h),{destroy:function(){m.removeEventListener(d,h)}}}function u(m,d,h){return Array.prototype.forEach.call(m,function(v){v.addEventListener(d,h)}),{destroy:function(){Array.prototype.forEach.call(m,function(v){v.removeEventListener(d,h)})}}}function p(m,d,h){return a(document.body,m,d,h)}n.exports=f},817:function(n){function o(i){var s;if(i.nodeName==="SELECT")i.focus(),s=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var a=i.hasAttribute("readonly");a||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),a||i.removeAttribute("readonly"),s=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var f=window.getSelection(),c=document.createRange();c.selectNodeContents(i),f.removeAllRanges(),f.addRange(c),s=f.toString()}return s}n.exports=o},279:function(n){function o(){}o.prototype={on:function(i,s,a){var f=this.e||(this.e={});return(f[i]||(f[i]=[])).push({fn:s,ctx:a}),this},once:function(i,s,a){var f=this;function c(){f.off(i,c),s.apply(a,arguments)}return c._=s,this.on(i,c,a)},emit:function(i){var s=[].slice.call(arguments,1),a=((this.e||(this.e={}))[i]||[]).slice(),f=0,c=a.length;for(f;f{"use strict";/*! + * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var rs=/["'&<>]/;Yo.exports=ns;function ns(e){var t=""+e,r=rs.exec(t);if(!r)return t;var n,o="",i=0,s=0;for(i=r.index;i0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[n++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function W(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var n=r.call(e),o,i=[],s;try{for(;(t===void 0||t-- >0)&&!(o=n.next()).done;)i.push(o.value)}catch(a){s={error:a}}finally{try{o&&!o.done&&(r=n.return)&&r.call(n)}finally{if(s)throw s.error}}return i}function D(e,t,r){if(r||arguments.length===2)for(var n=0,o=t.length,i;n1||a(m,d)})})}function a(m,d){try{f(n[m](d))}catch(h){p(i[0][3],h)}}function f(m){m.value instanceof et?Promise.resolve(m.value.v).then(c,u):p(i[0][2],m)}function c(m){a("next",m)}function u(m){a("throw",m)}function p(m,d){m(d),i.shift(),i.length&&a(i[0][0],i[0][1])}}function pn(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof Ee=="function"?Ee(e):e[Symbol.iterator](),r={},n("next"),n("throw"),n("return"),r[Symbol.asyncIterator]=function(){return this},r);function n(i){r[i]=e[i]&&function(s){return new Promise(function(a,f){s=e[i](s),o(a,f,s.done,s.value)})}}function o(i,s,a,f){Promise.resolve(f).then(function(c){i({value:c,done:a})},s)}}function C(e){return typeof e=="function"}function at(e){var t=function(n){Error.call(n),n.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var It=at(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(n,o){return o+1+") "+n.toString()}).join(` + 
`):"",this.name="UnsubscriptionError",this.errors=r}});function Ve(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ie=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,n,o,i;if(!this.closed){this.closed=!0;var s=this._parentage;if(s)if(this._parentage=null,Array.isArray(s))try{for(var a=Ee(s),f=a.next();!f.done;f=a.next()){var c=f.value;c.remove(this)}}catch(v){t={error:v}}finally{try{f&&!f.done&&(r=a.return)&&r.call(a)}finally{if(t)throw t.error}}else s.remove(this);var u=this.initialTeardown;if(C(u))try{u()}catch(v){i=v instanceof It?v.errors:[v]}var p=this._finalizers;if(p){this._finalizers=null;try{for(var m=Ee(p),d=m.next();!d.done;d=m.next()){var h=d.value;try{ln(h)}catch(v){i=i!=null?i:[],v instanceof It?i=D(D([],W(i)),W(v.errors)):i.push(v)}}}catch(v){n={error:v}}finally{try{d&&!d.done&&(o=m.return)&&o.call(m)}finally{if(n)throw n.error}}}if(i)throw new It(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ln(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Ve(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Ve(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Sr=Ie.EMPTY;function jt(e){return e instanceof Ie||e&&"closed"in e&&C(e.remove)&&C(e.add)&&C(e.unsubscribe)}function ln(e){C(e)?e():e.unsubscribe()}var Le={onUnhandledError:null,onStoppedNotification:null,Promise:void 0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var st={setTimeout:function(e,t){for(var r=[],n=2;n0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var n=this,o=this,i=o.hasError,s=o.isStopped,a=o.observers;return i||s?Sr:(this.currentObservers=null,a.push(r),new Ie(function(){n.currentObservers=null,Ve(a,r)}))},t.prototype._checkFinalizedStatuses=function(r){var n=this,o=n.hasError,i=n.thrownError,s=n.isStopped;o?r.error(i):s&&r.complete()},t.prototype.asObservable=function(){var r=new F;return r.source=this,r},t.create=function(r,n){return new xn(r,n)},t}(F);var xn=function(e){ie(t,e);function t(r,n){var o=e.call(this)||this;return o.destination=r,o.source=n,o}return t.prototype.next=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.next)===null||o===void 0||o.call(n,r)},t.prototype.error=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.error)===null||o===void 0||o.call(n,r)},t.prototype.complete=function(){var r,n;(n=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||n===void 0||n.call(r)},t.prototype._subscribe=function(r){var n,o;return(o=(n=this.source)===null||n===void 0?void 0:n.subscribe(r))!==null&&o!==void 0?o:Sr},t}(x);var Et={now:function(){return(Et.delegate||Date).now()},delegate:void 0};var wt=function(e){ie(t,e);function t(r,n,o){r===void 
0&&(r=1/0),n===void 0&&(n=1/0),o===void 0&&(o=Et);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=n,i._timestampProvider=o,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=n===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,n),i}return t.prototype.next=function(r){var n=this,o=n.isStopped,i=n._buffer,s=n._infiniteTimeWindow,a=n._timestampProvider,f=n._windowTime;o||(i.push(r),!s&&i.push(a.now()+f)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var n=this._innerSubscribe(r),o=this,i=o._infiniteTimeWindow,s=o._buffer,a=s.slice(),f=0;f0?e.prototype.requestAsyncId.call(this,r,n,o):(r.actions.push(this),r._scheduled||(r._scheduled=ut.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,n,o){var i;if(o===void 0&&(o=0),o!=null?o>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,n,o);var s=r.actions;n!=null&&((i=s[s.length-1])===null||i===void 0?void 0:i.id)!==n&&(ut.cancelAnimationFrame(n),r._scheduled=void 0)},t}(Wt);var Sn=function(e){ie(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var n=this._scheduled;this._scheduled=void 0;var o=this.actions,i;r=r||o.shift();do if(i=r.execute(r.state,r.delay))break;while((r=o[0])&&r.id===n&&o.shift());if(this._active=!1,i){for(;(r=o[0])&&r.id===n&&o.shift();)r.unsubscribe();throw i}},t}(Dt);var Oe=new Sn(wn);var M=new F(function(e){return e.complete()});function Vt(e){return e&&C(e.schedule)}function Cr(e){return e[e.length-1]}function Ye(e){return C(Cr(e))?e.pop():void 0}function Te(e){return Vt(Cr(e))?e.pop():void 0}function zt(e,t){return typeof Cr(e)=="number"?e.pop():t}var pt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Nt(e){return C(e==null?void 0:e.then)}function qt(e){return C(e[ft])}function Kt(e){return Symbol.asyncIterator&&C(e==null?void 0:e[Symbol.asyncIterator])}function Qt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var Yt=zi();function Gt(e){return C(e==null?void 0:e[Yt])}function Bt(e){return un(this,arguments,function(){var r,n,o,i;return $t(this,function(s){switch(s.label){case 0:r=e.getReader(),s.label=1;case 1:s.trys.push([1,,9,10]),s.label=2;case 2:return[4,et(r.read())];case 3:return n=s.sent(),o=n.value,i=n.done,i?[4,et(void 0)]:[3,5];case 4:return[2,s.sent()];case 5:return[4,et(o)];case 6:return[4,s.sent()];case 7:return s.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function Jt(e){return C(e==null?void 0:e.getReader)}function U(e){if(e instanceof F)return e;if(e!=null){if(qt(e))return Ni(e);if(pt(e))return qi(e);if(Nt(e))return Ki(e);if(Kt(e))return On(e);if(Gt(e))return Qi(e);if(Jt(e))return Yi(e)}throw Qt(e)}function Ni(e){return new F(function(t){var r=e[ft]();if(C(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function qi(e){return new F(function(t){for(var r=0;r=2;return function(n){return n.pipe(e?A(function(o,i){return e(o,i,n)}):de,ge(1),r?He(t):Dn(function(){return new Zt}))}}function Vn(){for(var e=[],t=0;t=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new x}:t,n=e.resetOnError,o=n===void 0?!0:n,i=e.resetOnComplete,s=i===void 0?!0:i,a=e.resetOnRefCountZero,f=a===void 0?!0:a;return function(c){var u,p,m,d=0,h=!1,v=!1,Y=function(){p==null||p.unsubscribe(),p=void 0},B=function(){Y(),u=m=void 0,h=v=!1},N=function(){var O=u;B(),O==null||O.unsubscribe()};return y(function(O,Qe){d++,!v&&!h&&Y();var De=m=m!=null?m:r();Qe.add(function(){d--,d===0&&!v&&!h&&(p=$r(N,f))}),De.subscribe(Qe),!u&&d>0&&(u=new rt({next:function($e){return De.next($e)},error:function($e){v=!0,Y(),p=$r(B,o,$e),De.error($e)},complete:function(){h=!0,Y(),p=$r(B,s),De.complete()}}),U(O).subscribe(u))})(c)}}function $r(e,t){for(var r=[],n=2;ne.next(document)),e}function K(e,t=document){return Array.from(t.querySelectorAll(e))}function z(e,t=document){let r=ce(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function ce(e,t=document){return t.querySelector(e)||void 0}function _e(){return document.activeElement instanceof HTMLElement&&document.activeElement||void 0}function tr(e){return L(b(document.body,"focusin"),b(document.body,"focusout")).pipe(ke(1),l(()=>{let t=_e();return typeof t!="undefined"?e.contains(t):!1}),V(e===_e()),J())}function Xe(e){return{x:e.offsetLeft,y:e.offsetTop}}function Kn(e){return L(b(window,"load"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>Xe(e)),V(Xe(e)))}function rr(e){return{x:e.scrollLeft,y:e.scrollTop}}function dt(e){return L(b(e,"scroll"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>rr(e)),V(rr(e)))}var Yn=function(){if(typeof Map!="undefined")return Map;function e(t,r){var n=-1;return t.some(function(o,i){return o[0]===r?(n=i,!0):!1}),n}return function(){function t(){this.__entries__=[]}return Object.defineProperty(t.prototype,"size",{get:function(){return this.__entries__.length},enumerable:!0,configurable:!0}),t.prototype.get=function(r){var n=e(this.__entries__,r),o=this.__entries__[n];return o&&o[1]},t.prototype.set=function(r,n){var o=e(this.__entries__,r);~o?this.__entries__[o][1]=n:this.__entries__.push([r,n])},t.prototype.delete=function(r){var 
n=this.__entries__,o=e(n,r);~o&&n.splice(o,1)},t.prototype.has=function(r){return!!~e(this.__entries__,r)},t.prototype.clear=function(){this.__entries__.splice(0)},t.prototype.forEach=function(r,n){n===void 0&&(n=null);for(var o=0,i=this.__entries__;o0},e.prototype.connect_=function(){!Wr||this.connected_||(document.addEventListener("transitionend",this.onTransitionEnd_),window.addEventListener("resize",this.refresh),va?(this.mutationsObserver_=new MutationObserver(this.refresh),this.mutationsObserver_.observe(document,{attributes:!0,childList:!0,characterData:!0,subtree:!0})):(document.addEventListener("DOMSubtreeModified",this.refresh),this.mutationEventsAdded_=!0),this.connected_=!0)},e.prototype.disconnect_=function(){!Wr||!this.connected_||(document.removeEventListener("transitionend",this.onTransitionEnd_),window.removeEventListener("resize",this.refresh),this.mutationsObserver_&&this.mutationsObserver_.disconnect(),this.mutationEventsAdded_&&document.removeEventListener("DOMSubtreeModified",this.refresh),this.mutationsObserver_=null,this.mutationEventsAdded_=!1,this.connected_=!1)},e.prototype.onTransitionEnd_=function(t){var r=t.propertyName,n=r===void 0?"":r,o=ba.some(function(i){return!!~n.indexOf(i)});o&&this.refresh()},e.getInstance=function(){return this.instance_||(this.instance_=new e),this.instance_},e.instance_=null,e}(),Gn=function(e,t){for(var r=0,n=Object.keys(t);r0},e}(),Jn=typeof WeakMap!="undefined"?new WeakMap:new Yn,Xn=function(){function e(t){if(!(this instanceof e))throw new TypeError("Cannot call a class as a function.");if(!arguments.length)throw new TypeError("1 argument required, but only 0 present.");var r=ga.getInstance(),n=new La(t,r,this);Jn.set(this,n)}return e}();["observe","unobserve","disconnect"].forEach(function(e){Xn.prototype[e]=function(){var t;return(t=Jn.get(this))[e].apply(t,arguments)}});var Aa=function(){return typeof nr.ResizeObserver!="undefined"?nr.ResizeObserver:Xn}(),Zn=Aa;var eo=new x,Ca=$(()=>k(new Zn(e=>{for(let t of e)eo.next(t)}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function he(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ye(e){return Ca.pipe(S(t=>t.observe(e)),g(t=>eo.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(()=>he(e)))),V(he(e)))}function bt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function ar(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}var to=new x,Ra=$(()=>k(new IntersectionObserver(e=>{for(let t of e)to.next(t)},{threshold:0}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function sr(e){return Ra.pipe(S(t=>t.observe(e)),g(t=>to.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(({isIntersecting:r})=>r))))}function ro(e,t=16){return dt(e).pipe(l(({y:r})=>{let n=he(e),o=bt(e);return r>=o.height-n.height-t}),J())}var cr={drawer:z("[data-md-toggle=drawer]"),search:z("[data-md-toggle=search]")};function no(e){return cr[e].checked}function Ke(e,t){cr[e].checked!==t&&cr[e].click()}function Ue(e){let t=cr[e];return b(t,"change").pipe(l(()=>t.checked),V(t.checked))}function ka(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Ha(){return L(b(window,"compositionstart").pipe(l(()=>!0)),b(window,"compositionend").pipe(l(()=>!1))).pipe(V(!1))}function oo(){let 
e=b(window,"keydown").pipe(A(t=>!(t.metaKey||t.ctrlKey)),l(t=>({mode:no("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),A(({mode:t,type:r})=>{if(t==="global"){let n=_e();if(typeof n!="undefined")return!ka(n,r)}return!0}),pe());return Ha().pipe(g(t=>t?M:e))}function le(){return new URL(location.href)}function ot(e){location.href=e.href}function io(){return new x}function ao(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)ao(e,r)}function _(e,t,...r){let n=document.createElement(e);if(t)for(let o of Object.keys(t))typeof t[o]!="undefined"&&(typeof t[o]!="boolean"?n.setAttribute(o,t[o]):n.setAttribute(o,""));for(let o of r)ao(n,o);return n}function fr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function so(){return location.hash.substring(1)}function Dr(e){let t=_("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Pa(e){return L(b(window,"hashchange"),e).pipe(l(so),V(so()),A(t=>t.length>0),X(1))}function co(e){return Pa(e).pipe(l(t=>ce(`[id="${t}"]`)),A(t=>typeof t!="undefined"))}function Vr(e){let t=matchMedia(e);return er(r=>t.addListener(()=>r(t.matches))).pipe(V(t.matches))}function fo(){let e=matchMedia("print");return L(b(window,"beforeprint").pipe(l(()=>!0)),b(window,"afterprint").pipe(l(()=>!1))).pipe(V(e.matches))}function zr(e,t){return e.pipe(g(r=>r?t():M))}function ur(e,t={credentials:"same-origin"}){return ue(fetch(`${e}`,t)).pipe(fe(()=>M),g(r=>r.status!==200?Ot(()=>new Error(r.statusText)):k(r)))}function We(e,t){return ur(e,t).pipe(g(r=>r.json()),X(1))}function uo(e,t){let r=new DOMParser;return ur(e,t).pipe(g(n=>n.text()),l(n=>r.parseFromString(n,"text/xml")),X(1))}function pr(e){let t=_("script",{src:e});return $(()=>(document.head.appendChild(t),L(b(t,"load"),b(t,"error").pipe(g(()=>Ot(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(l(()=>{}),R(()=>document.head.removeChild(t)),ge(1))))}function po(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function lo(){return L(b(window,"scroll",{passive:!0}),b(window,"resize",{passive:!0})).pipe(l(po),V(po()))}function mo(){return{width:innerWidth,height:innerHeight}}function ho(){return b(window,"resize",{passive:!0}).pipe(l(mo),V(mo()))}function bo(){return G([lo(),ho()]).pipe(l(([e,t])=>({offset:e,size:t})),X(1))}function lr(e,{viewport$:t,header$:r}){let n=t.pipe(ee("size")),o=G([n,r]).pipe(l(()=>Xe(e)));return G([r,t,o]).pipe(l(([{height:i},{offset:s,size:a},{x:f,y:c}])=>({offset:{x:s.x-f,y:s.y-c+i},size:a})))}(()=>{function e(n,o){parent.postMessage(n,o||"*")}function t(...n){return n.reduce((o,i)=>o.then(()=>new Promise(s=>{let a=document.createElement("script");a.src=i,a.onload=s,document.body.appendChild(a)})),Promise.resolve())}var r=class extends EventTarget{constructor(n){super(),this.url=n,this.m=i=>{i.source===this.w&&(this.dispatchEvent(new MessageEvent("message",{data:i.data})),this.onmessage&&this.onmessage(i))},this.e=(i,s,a,f,c)=>{if(s===`${this.url}`){let u=new ErrorEvent("error",{message:i,filename:s,lineno:a,colno:f,error:c});this.dispatchEvent(u),this.onerror&&this.onerror(u)}};let o=document.createElement("iframe");o.hidden=!0,document.body.appendChild(this.iframe=o),this.w.document.open(),this.w.document.write(` + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

alto2txt2fixture

+

img/header.png

+ + + +

mit-license +CI +coverage +pre-commit.ci status +Code style: black +doc

+ + +

alto2txt2fixture is a standalone tool to convert alto2txt XML output and other related datasets into JSON (and, where feasible, CSV) data with corresponding relational IDs, easing general use and ingestion into a relational database.

+

We target the JSON structure produced for importing into lwmdb: a database built using the Django Python web framework, which relies on the Django database fixture format.

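For illustration, each record in a Django fixture follows a pk/model/fields layout; a minimal sketch (the model and field names here are hypothetical):

>>> example_fixture_record = {
...     "pk": 1,
...     "model": "newspapers.newspaper",
...     "fields": {"publication_code": "0002647", "title": "An Example Gazette"},
... }

The pk values supply the relational IDs that link records across tables.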
+

Installation and simple use

+

We provide a command line interface to process alto2txt XML files stored locally (or mounted via azure blobfuse), and for additional public data we provide a means of downloading it automatically.

+

Installation

+

We recommend downloading a copy of the repository or using git clone. From a local copy use poetry to install dependencies:

+
$ cd alto2txt2fixture
+$ poetry install
+
+

If you would like to run tests, render documentation and/or contribute to the code, include the dev dependencies in a local install:

+
$ poetry install --with dev
+
+

Simple use

+

To process newspaper metadata with a local copy of alto2txt XML results, it's easiest to have that data in the same folder as your alto2txt2fixture checkout and poetry-installed folder. Once arranged, you should be able to begin the JSON conversion with

+
$ poetry run a2t2f-news
+
+

To generate related data in JSON and CSV form, assuming you have an internet connection and access to a living-with-machines azure account, the following will download related data into JSON and CSV files. The JSON results should be consistent with lwmdb tables for ease of import.

+
$ poetry run a2t2f-adj
+
+ + + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/objects.inv b/objects.inv new file mode 100644 index 0000000..e1da392 Binary files /dev/null and b/objects.inv differ diff --git a/reference/SUMMARY.html b/reference/SUMMARY.html new file mode 100644 index 0000000..7a1764f --- /dev/null +++ b/reference/SUMMARY.html @@ -0,0 +1,622 @@ + + + + + + + + + + + + + + + + + + + + SUMMARY - alto2txt2fixture + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+ +
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/reference/alto2txt2fixture/__main__.html b/reference/alto2txt2fixture/__main__.html new file mode 100644 index 0000000..2d4e165 --- /dev/null +++ b/reference/alto2txt2fixture/__main__.html @@ -0,0 +1,1213 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + __main__ - alto2txt2fixture + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

__main__

+ +
+ + + +
+ +

Entry point for alto2txt2fixture.parse to convert alto2txt XML -> JSON.

+

This module defines the run function which is the main driver for the entire +process.

+

It imports various functions from other modules and uses them to route and +parse XML data generated by alto2txt.

+

The following steps are performed in the run function:

+
  1. Parses command line arguments using the parse_args function. If no arguments are provided, the default values are taken from the settings module.
  2. Prints a setup report to the console, showing the values of the relevant parameters.
  3. Calls the route function to route alto2txt data into subdirectories with structured files.
  4. Calls the parse function to parse the resulting JSON files.
  5. Calls the clear_cache function to clear the cache.

If the script is run as a main program (i.e. if the module's __name__ is __main__), the run() function is executed.

+

Note: at present this does not include any functionality in create_adjacent_tables.py

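As a rough sketch, run() can also be invoked from Python with the same options the a2t2f-news command accepts (the collection name and output path below are hypothetical):

>>> from alto2txt2fixture.__main__ import run
>>> run(["--collections", "hmd",
...      "--output", "./output",
...      "--test-config"])  # doctest: +SKIP

Passing --test-config only prints the resulting configuration, making this a safe way to check settings before a full conversion.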
+ + + +
+ + + + + + + + + + +
+ + + +

+ parse_args + + +

+
parse_args(argv: list[str] | None = None) -> Namespace
+
+ +
+ +

Manage command line arguments for run()

+

This constructs an ArgumentParser instance to configure calls of run() for newspaper XML to JSON conversion.

+ + + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
argv + list[str] | None + +
+

If None, treat as equivalent of ['--help']; if a list of str, pass those options to ArgumentParser

+
+
+ None +
+ + + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Namespace + +
+

A Namespace dict-like configuration for run()

+
+
+ +
+ Source code in alto2txt2fixture/__main__.py +
def parse_args(argv: list[str] | None = None) -> Namespace:
+    """Manage command line arguments for `run()`
+
+    This constructs an `ArgumentParser` instance to configure
+    calls of `run()` for `newspaper` `XML` to `JSON` conversion.
+
+    Arguments:
+        argv:
+            If `None` treat as equivalent of `['--help']`,
+            if a `list` of `str` pass those options to `ArgumentParser`
+
+    Returns:
+        A `Namespace` `dict`-like configuration for `run()`
+    """
+    argv = None if not argv else argv
+    parser = ArgumentParser(
+        prog="a2t2f-news",
+        description="Process alto2txt XML into and Django JSON Fixture files",
+        epilog=(
+            "Note: this is still in beta mode and contributions welcome\n\n" + __doc__
+        ),
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "-c",
+        "--collections",
+        nargs="+",
+        help="<Optional> Set collections",
+        required=False,
+    )
+    parser.add_argument(
+        "-m",
+        "--mountpoint",
+        type=str,
+        help="<Optional> Mountpoint",
+        required=False,
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        help="<Optional> Set an output directory",
+        required=False,
+    )
+    parser.add_argument(
+        "-t",
+        "--test-config",
+        default=False,
+        help="Only print the configuration",
+        action=BooleanOptionalAction,
+    )
+    parser.add_argument(
+        "-f",
+        "--show-fixture-tables",
+        default=True,
+        help="Print included fixture table configurations",
+        action=BooleanOptionalAction,
+    )
+    parser.add_argument(
+        "--export-fixture-tables",
+        default=True,
+        help="Experimental: export fixture tables prior to data processing",
+        action=BooleanOptionalAction,
+    )
+    parser.add_argument(
+        "--data-provider-field",
+        type=str,
+        default=DATA_PROVIDER_INDEX,
+        help="Key for indexing DataProvider records",
+    )
+    return parser.parse_args(argv)
+
+
+
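As a quick sketch of the Namespace this returns (option values are illustrative):

>>> args = parse_args(["--collections", "hmd", "--output", "./output"])
>>> args.collections
['hmd']
>>> args.output
'./output'
>>> args.test_config
False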
+ +
+ + +
+ + + +

+ run + + +

+
run(local_args: list[str] | None = None) -> None
+
+ +
+ +

Manage running newspaper XML to JSON conversion.

+

First parse_args is called for command line arguments including:

+
    +
  • collections
  • output
  • mountpoint
+

If any of these arguments are specified, they will be used, otherwise they +will default to the values in the settings module.

+

The show_setup function is then called to display the configurations +being used.

+

The route function is then called to route the alto2txt files into +subdirectories with structured files.

+

The parse function is then called to parse the resulting JSON files.

+

Finally, the clear_cache function is called to clear the cache +(pending the user's confirmation).

+ + + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
local_args + list[str] | None + +
+

Options passed to parse_args()

+
+
+ None +
+ +
+ Source code in alto2txt2fixture/__main__.py +
def run(local_args: list[str] | None = None) -> None:
+    """Manage running newspaper `XML` to `JSON` conversion.
+
+    First `parse_args` is called for command line arguments including:
+
+    - `collections`
+    - `output`
+    - `mountpoint`
+
+    If any of these arguments are specified, they will be used, otherwise they
+    will default to the values in the `settings` module.
+
+    The `show_setup` function is then called to display the configurations
+    being used.
+
+    The `route` function is then called to route the alto2txt files into
+    subdirectories with structured files.
+
+    The `parse` function is then called to parse the resulting JSON files.
+
+    Finally, the `clear_cache` function is called to clear the cache
+    (pending the user's confirmation).
+
+    Arguments:
+        local_args:
+            Options passed to `parse_args()`
+    """
+    args: Namespace = parse_args(argv=local_args)
+
+    if args.collections:
+        COLLECTIONS = [x.lower() for x in args.collections]
+    else:
+        COLLECTIONS = settings.COLLECTIONS
+
+    if args.output:
+        OUTPUT = args.output.rstrip("/")
+    else:
+        OUTPUT = settings.OUTPUT
+
+    if args.mountpoint:
+        MOUNTPOINT = args.mountpoint.rstrip("/")
+    else:
+        MOUNTPOINT = settings.MOUNTPOINT
+
+    show_setup(
+        COLLECTIONS=COLLECTIONS,
+        OUTPUT=OUTPUT,
+        CACHE_HOME=settings.CACHE_HOME,
+        MOUNTPOINT=MOUNTPOINT,
+        JISC_PAPERS_CSV=settings.JISC_PAPERS_CSV,
+        REPORT_DIR=settings.REPORT_DIR,
+        MAX_ELEMENTS_PER_FILE=settings.MAX_ELEMENTS_PER_FILE,
+    )
+
+    if args.show_fixture_tables:
+        # Show a table of fixtures used, defaults to DataProvider Table
+        show_fixture_tables(settings, data_provider_index=args.data_provider_field)
+
+    if args.export_fixture_tables:
+        export_fixtures(
+            fixture_tables=settings.FIXTURE_TABLES,
+            path=OUTPUT,
+            formats=settings.FIXTURE_TABLES_FORMATS,
+        )
+
+    if not args.test_config:
+        # Routing alto2txt into subdirectories with structured files
+        route(
+            COLLECTIONS,
+            settings.CACHE_HOME,
+            MOUNTPOINT,
+            settings.JISC_PAPERS_CSV,
+            settings.REPORT_DIR,
+        )
+
+        # Parsing the resulting JSON files
+        parse(
+            COLLECTIONS,
+            settings.CACHE_HOME,
+            OUTPUT,
+            settings.MAX_ELEMENTS_PER_FILE,
+        )
+
+        clear_cache(settings.CACHE_HOME)
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/reference/alto2txt2fixture/cli.html b/reference/alto2txt2fixture/cli.html new file mode 100644 index 0000000..92ac251 --- /dev/null +++ b/reference/alto2txt2fixture/cli.html @@ -0,0 +1,1028 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + cli - alto2txt2fixture + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

cli

+ +
+ + + +
+ + + +
+ + + + + + + + + + +
+ + + +

+ show_fixture_tables + + +

+
show_fixture_tables(
+    run_settings: dotdict = settings,
+    print_in_call: bool = True,
+    data_provider_index: str = DATA_PROVIDER_INDEX,
+) -> list[Table]
+
+ +
+ +

Print fixture tables specified in settings.fixture_tables in rich.Table format.

+ + + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
run_settings + dotdict + +
+

alto2txt2fixture run configuration

+
+
+ settings +
print_in_call + bool + +
+

whether to print to console (will use console variable if so)

+
+
+ True +
data_provider_index + str + +
+

key to index dataprovider from NEWSPAPER_COLLECTION_METADATA

+
+
+ DATA_PROVIDER_INDEX +
+ + + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ list[Table] + +
+

A list of rich.Table renders from configurations in run_settings.FIXTURE_TABLES

+
+
+ +
+ Example +
>>> fixture_tables: list[Table] = show_fixture_tables(
+...     settings,
+...     print_in_call=False)
+>>> len(fixture_tables)
+1
+>>> fixture_tables[0].title
+'dataprovider'
+>>> [column.header for column in fixture_tables[0].columns]
+['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']
+>>> fixture_tables = show_fixture_tables(settings)
+... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+<BLANKLINE>
+...dataprovider...Heritage...│ bl-hmd...│ hmd...
+
+
+
+ Note +

It is possible for the example test to fail at different screen sizes. Try increasing the window or screen width of the terminal used before raising an issue.

+
+
+ Source code in alto2txt2fixture/cli.py +
def show_fixture_tables(
+    run_settings: dotdict = settings,
+    print_in_call: bool = True,
+    data_provider_index: str = DATA_PROVIDER_INDEX,
+) -> list[Table]:
+    """Print fixture tables specified in ``settings.fixture_tables`` in `rich.Table` format.
+
+    Arguments:
+        run_settings: `alto2txt2fixture` run configuration
+        print_in_call: whether to print to console (will use ``console`` variable if so)
+        data_provider_index: key to index `dataprovider` from ``NEWSPAPER_COLLECTION_METADATA``
+
+    Returns:
+        A `list` of `rich.Table` renders from configurations in ``run_settings.FIXTURE_TABLES``
+
+    Example:
+        ```pycon
+        >>> fixture_tables: list[Table] = show_fixture_tables(
+        ...     settings,
+        ...     print_in_call=False)
+        >>> len(fixture_tables)
+        1
+        >>> fixture_tables[0].title
+        'dataprovider'
+        >>> [column.header for column in fixture_tables[0].columns]
+        ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']
+        >>> fixture_tables = show_fixture_tables(settings)
+        ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+        <BLANKLINE>
+        ...dataprovider...Heritage...│ bl-hmd...│ hmd...
+
+        ```
+
+    Note:
+        It is possible for the example test to fail at different screen sizes. Try
+        increasing the window or screen width of the terminal used before
+        raising an issue.
+    """
+    if run_settings.FIXTURE_TABLES:
+        if "dataprovider" in run_settings.FIXTURE_TABLES:
+            check_newspaper_collection_configuration(
+                run_settings.COLLECTIONS,
+                run_settings.FIXTURE_TABLES["dataprovider"],
+                data_provider_index=data_provider_index,
+            )
+        console_tables: list[Table] = list(
+            gen_fixture_tables(run_settings.FIXTURE_TABLES)
+        )
+        if print_in_call:
+            for console_table in console_tables:
+                console.print(console_table)
+        return console_tables
+    else:
+        return []
+
+
+
+ +
+ + +
+ + + +

+ show_setup + + +

+
show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs: str) -> None
+
+ +
+ +

Generate a rich.table.Table for printing configuration to console.

+ +
+ Source code in alto2txt2fixture/cli.py +
def show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs) -> None:
+    """Generate a `rich.table.Table` for printing configuration to console."""
+    if clear and os.name == "posix":
+        os.system("clear")
+    elif clear:
+        os.system("cls")
+
+    table = Table(title=title)
+
+    table.add_column("Setting", justify="right", style="cyan", no_wrap=True)
+    table.add_column("Value", style="magenta")
+
+    for key, value in kwargs.items():
+        table.add_row(str(key), str(value))
+
+    console.print(table)
+    return
+
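A minimal sketch of calling show_setup directly; the keyword names are arbitrary examples, and the rendered rich table is not asserted here:

>>> show_setup(clear=False,
...            COLLECTIONS="['hmd']",
...            OUTPUT="./output")  # doctest: +SKIP

Each keyword argument becomes a Setting/Value row in the printed table.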
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/reference/alto2txt2fixture/create_adjacent_tables.html b/reference/alto2txt2fixture/create_adjacent_tables.html new file mode 100644 index 0000000..d8dabe0 --- /dev/null +++ b/reference/alto2txt2fixture/create_adjacent_tables.html @@ -0,0 +1,2334 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + create_adjacent_tables - alto2txt2fixture + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

create_adjacent_tables

+ +
+ + + +
+ + + +
+ + + + + + + + + + +
+ + + +

+ correct_dict + + +

+
correct_dict(o: dict) -> list
+
+ +
+ +

Return a list of 3-tuples from a provided dictionary, swapping the two values where needed so that any value beginning with Q (a Wikidata identifier) comes last in each tuple.

+ +
+ Source code in alto2txt2fixture/create_adjacent_tables.py +
def correct_dict(o: dict) -> list:
+    """Returns a list with corrected data from a provided dictionary."""
+    return [(k, v[0], v[1]) for k, v in o.items() if not v[0].startswith("Q")] + [
+        (k, v[1], v[0]) for k, v in o.items() if v[0].startswith("Q")
+    ]
+
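For example, assuming each value pairs a place name with its Wikidata Q identifier in either order (sample data hypothetical):

>>> correct_dict({"a": ("Q84", "London"), "b": ("Paris", "Q90")})
[('b', 'Paris', 'Q90'), ('a', 'London', 'Q84')]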
+
+
+ +
+ + +
+ + + +

+ csv2json_list + + +

+
csv2json_list(
+    csv_path: PathLike,
+    output_path: Path = OUTPUT,
+    saved: list[Path] | None = None,
+    indent: int = JSON_INDENT,
+) -> list
+
+ +
+ +

Save csv_path as a json fixture file under output_path and return the records as a list.

+ +
+ Source code in alto2txt2fixture/create_adjacent_tables.py +
def csv2json_list(
+    csv_path: PathLike,
+    output_path: Path = OUTPUT,
+    saved: list[Path] | None = None,
+    indent: int = JSON_INDENT,
+) -> list:
+    """Save `csv_path` as a `json` file and return as a `list`."""
+    json_data = []
+    # See this suggestion for `nan` values: https://stackoverflow.com/a/62691803/678486
+    df = (
+        pd.read_csv(csv_path, index_col=0).fillna(np.nan).replace([np.nan], [None])
+    )  # fillna(None)
+
+    if "political_leanings" in df.columns:
+        df["political_leanings"] = df["political_leanings"].apply(json.loads)
+    if "prices" in df.columns:
+        df["prices"] = df["prices"].apply(json.loads)
+
+    model = Path(csv_path).stem.lower()
+
+    for pk, row in df.iterrows():
+        fields = row.to_dict()
+        json_data.append({"pk": pk, "model": model, "fields": fields})
+
+    (Path(output_path) / csv_path).parent.mkdir(parents=True, exist_ok=True)
+    Path(output_path / f"{Path(csv_path).stem}.json").write_text(
+        json.dumps(json_data, indent=indent)
+    )
+    if not saved is None:
+        saved.append(output_path / f"{Path(csv_path).stem}.json")
+    return json_data
+
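A minimal sketch with a hypothetical one-row CSV (the Place table, pk index and label column are invented for illustration):

>>> import pandas as pd
>>> tmp: Path = getfixture('tmp_path')
>>> pd.DataFrame(
...     {"label": ["London"]}, index=pd.Index(["Q84"], name="pk")
... ).to_csv(tmp / "Place.csv")
>>> csv2json_list(tmp / "Place.csv", output_path=tmp)
[{'pk': 'Q84', 'model': 'place', 'fields': {'label': 'London'}}]

The model name is the lower-cased file stem, and a matching Place.json is written under output_path alongside the returned records.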
+
+
+ +
+ + +
+ + + +

+ download_data + + +

+
download_data(
+    files_dict: RemoteDataFilesType = {},
+    overwrite: bool = OVERWRITE,
+    exclude: list[str] = [],
+) -> None
+
+ +
+ +

Download files in files_dict, overwrite if specified.

+ + + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
files_dict + RemoteDataFilesType + +
+

dict of related files to download

+
+
+ {} +
overwrite + bool + +
+

bool to overwrite LOCAL_CACHE files or not

+
+
+ OVERWRITE +
exclude + list[str] + +
+

list of files to exclude from files_dict

+
+
+ [] +
+ +
+ Example +
>>> tmp: Path = getfixture('tmpdir')
+>>> set_path: Path = tmp.chdir()
+>>> download_data(exclude=[
+...     "mitchells", "Newspaper-1", "linking"
+... ])  # doctest: +ELLIPSIS
+Excluding mitchells...
+Excluding Newspaper-1...
+Excluding linking...
+Downloading cache...dict_admin_counties.json
+100% ... 37/37 bytes
+Downloading cache...dict_countries.json
+100% ... 33.2/33.2 kB
+Downloading cache...dict_historic_counties.json
+100% ... 41.4/41.4 kB
+Downloading cache...nlp_loc_wikidata_concat.csv
+100% ... 59.8/59.8 kB
+Downloading cache...wikidata_gazetteer_selected_columns.csv
+100% ... 47.8/47.8 MB
+
+
+
+ Source code in alto2txt2fixture/create_adjacent_tables.py +
def download_data(
+    files_dict: RemoteDataFilesType = {},
+    overwrite: bool = OVERWRITE,
+    exclude: list[str] = [],
+) -> None:
+    """Download files in ``files_dict``, overwrite if specified.
+
+    Args:
+        files_dict: `dict` of related files to download
+        overwrite: `bool` to overwrite ``LOCAL_CACHE`` files or not
+        exclude: `list` of files to exclude from ``files_dict``
+
+    Example:
+        ```pycon
+        >>> tmp: Path = getfixture('tmpdir')
+        >>> set_path: Path = tmp.chdir()
+        >>> download_data(exclude=[
+        ...     "mitchells", "Newspaper-1", "linking"
+        ... ])  # doctest: +ELLIPSIS
+        Excluding mitchells...
+        Excluding Newspaper-1...
+        Excluding linking...
+        Downloading cache...dict_admin_counties.json
+        100% ... 37/37 bytes
+        Downloading cache...dict_countries.json
+        100% ... 33.2/33.2 kB
+        Downloading cache...dict_historic_counties.json
+        100% ... 41.4/41.4 kB
+        Downloading cache...nlp_loc_wikidata_concat.csv
+        100% ... 59.8/59.8 kB
+        Downloading cache...wikidata_gazetteer_selected_columns.csv
+        100% ... 47.8/47.8 MB
+
+        ```
+    """
+    if not files_dict:
+        files_dict = deepcopy(FILES)
+    for data_source in exclude:
+        if data_source in files_dict:
+            print(f"Excluding {data_source}...")
+            files_dict.pop(data_source, 0)
+        else:
+            logger.warning(
+                f'"{data_source}" not an option to exclude from {files_dict}'
+            )
+
+    # Describe whether local file exists
+    for k in files_dict.keys():
+        files_dict[k]["exists"] = files_dict[k]["local"].exists()
+
+    files_to_download = [
+        (v["remote"], v["local"], v["exists"])
+        for v in files_dict.values()
+        if ("exists" in v and not v["exists"]) or overwrite
+    ]
+    for url, out, exists in files_to_download:
+        if exists:
+            rmtree(Path(out), ignore_errors=True)
+        print(f"Downloading {out}")
+        Path(out).parent.mkdir(parents=True, exist_ok=True)
+        assert isinstance(url, str)
+        with urlopen(url) as response, open(out, "wb") as out_file:
+            total: int = int(response.info()["Content-length"])
+            with Progress(
+                "[progress.percentage]{task.percentage:>3.0f}%",
+                BarColumn(),  # removed bar_width=None to avoid too long when resized
+                DownloadColumn(),
+            ) as progress:
+                download_task = progress.add_task("Download", total=total)
+                for chunk in response:
+                    out_file.write(chunk)
+                    progress.update(download_task, advance=len(chunk))
+

get_list

get_list(x)

Get a list from a string that contains <SEP> as a separator. If the input is not a string, an empty list is returned.

Source code in alto2txt2fixture/create_adjacent_tables.py
def get_list(x):
+    """Get a list from a string, which contains <SEP> as separator. If no
+    string is encountered, the function returns an empty list."""
+    return x.split("<SEP>") if isinstance(x, str) else []
+

get_outpaths_dict

get_outpaths_dict(
    names: Sequence[str], module_name: str
) -> TableOutputConfigType

Return a dict of csv and json paths for each module_name table.

The csv and json paths are generated from each name in names, prefixed with module_name.

Parameters:

Name         Type           Description                                                                                  Default
----         ----           -----------                                                                                  -------
names        Sequence[str]  iterable of names of each module_name's component; main target is csv and json table names   required
module_name  str            name of module each name is part of, added as a prefix                                       required

Returns:

Type                   Description
----                   -----------
TableOutputConfigType  A dict of table names and output csv and json filenames.

Example

>>> from pprint import pprint
>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, "mitchells"))
{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},
 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},
 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',
                      'json': 'mitchells.PoliticalLeaning.json'},
 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}

Source code in alto2txt2fixture/create_adjacent_tables.py
def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConfigType:
+    """Return a `dict` of `csv` and `json` paths for each `module_name` table.
+
+    The `csv` and `json` paths are generated from each name in ``names``,
+    prefixed with ``module_name``.
+
+    Args:
+        names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names
+        module_name: name of module each name is part of, that is added as a prefix
+
+    Returns:
+        A ``TableOutputConfigType``: a `dict` of table ``names`` and output
+            `csv` and `json` filenames.
+
+    Example:
+        ```pycon
+        >>> from pprint import pprint
+        >>> pprint(get_outpaths_dict(MITCHELLS_TABELS, "mitchells"))
+        {'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},
+         'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},
+         'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',
+                              'json': 'mitchells.PoliticalLeaning.json'},
+         'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}
+
+        ```
+    """
+    return {
+        name: OutputPathDict(
+            csv=f"{module_name}.{name}.csv",
+            json=f"{module_name}.{name}.json",
+        )
+        for name in names
+    }
+

run

run(
    files_dict: dict = {},
    files_to_download_overwrite: bool = OVERWRITE,
    saved: list[PathLike] = SAVED,
    time_stamp: str = "",
    output_path: Path = OUTPUT,
) -> None

Download, process and link files_dict to json and csv.

Note

This will require access to https://zooniversedata.blob.core.windows.net/downloads/.
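
Example

A minimal invocation sketch (assumes network access to the blob container above; it downloads roughly 50 MB of inputs and writes the CSV and JSON fixtures under OUTPUT):

from alto2txt2fixture.create_adjacent_tables import run

run()  # download inputs, build gazetteer + Mitchells tables, save CSV/JSON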

Source code in alto2txt2fixture/create_adjacent_tables.py
def run(
+    files_dict: dict = {},
+    files_to_download_overwrite: bool = OVERWRITE,
+    saved: list[PathLike] = SAVED,
+    time_stamp: str = "",
+    output_path: Path = OUTPUT,
+) -> None:
+    """Download, process and link ``files_dict`` to `json` and `csv`.
+
+    Note:
+        This will require access to `https://zooniversedata.blob.core.windows.net/downloads/`.
+    """
+
+    # Ensure time_stamp from the point of calling `run`
+    if not time_stamp:
+        time_stamp = get_now(as_str=False).strftime(TIME_FORMAT)
+
+    # Ensure an independent deepcopy of FILES to avoid modifying subsequent runs
+    if not files_dict:
+        files_dict = deepcopy(FILES)
+
+    # Download non-existing files
+    download_data(files_dict=files_dict, overwrite=files_to_download_overwrite)
+
+    # Create the output directory (defined in output_path)
+    output_path.mkdir(exist_ok=True, parents=True)
+
+    # Read all the Wikidata Q values from Mitchells
+    mitchells_df = pd.read_csv(files_dict["mitchells"]["local"], index_col=0)
+    mitchell_wikidata_mentions = sorted(
+        list(mitchells_df.PLACE_PUB_WIKI.unique()),
+        key=lambda x: int(x.replace("Q", "")),
+    )
+
+    # Set up wikidata_gazetteer
+    gaz_cols = ["wikidata_id", "english_label", "latitude", "longitude", "geonamesIDs"]
+    wikidata_gazetteer = pd.read_csv(
+        files_dict["wikidata_gazetteer_selected_columns"]["local"], usecols=gaz_cols
+    )
+    wikidata_gazetteer.rename(
+        {
+            "wikidata_id": "place_wikidata_id",
+            "english_label": "place_label",
+            "geonamesIDs": "geonames_ids",
+        },
+        axis=1,
+        inplace=True,
+    )
+
+    # Read in + fix all dictionaries
+    dict_historic_counties = json.loads(
+        Path(files_dict["dict_historic_counties"]["local"]).read_text()
+    )
+    dict_admin_counties = json.loads(
+        Path(files_dict["dict_admin_counties"]["local"]).read_text()
+    )
+    dict_countries = json.loads(Path(files_dict["dict_countries"]["local"]).read_text())
+    dict_historic_counties = correct_dict(dict_historic_counties)
+    dict_admin_counties = correct_dict(dict_admin_counties)
+    dict_countries = correct_dict(dict_countries)
+
+    # Create assisting frames
+    historical_counties_df = pd.DataFrame(
+        dict_historic_counties,
+        columns=["place_wikidata_id", "hcounty_label", "hcounty_wikidata_id"],
+    )
+    admin_county_df = pd.DataFrame(
+        dict_admin_counties,
+        columns=[
+            "place_wikidata_id",
+            "admin_county_label",
+            "admin_county_wikidata_id",
+        ],
+    )
+    countries_df = pd.DataFrame(
+        dict_countries,
+        columns=["place_wikidata_id", "country_label", "country_wikidata_id"],
+    )
+
+    wikidata_gazetteer = wikidata_gazetteer[
+        wikidata_gazetteer.place_wikidata_id.isin(mitchell_wikidata_mentions)
+    ].sort_values("place_wikidata_id")
+    wikidata_gazetteer["place_pk"] = np.arange(1, len(wikidata_gazetteer) + 1)
+    wikidata_gazetteer = wikidata_gazetteer[
+        ["place_pk"] + [x for x in wikidata_gazetteer.columns if not x == "place_pk"]
+    ]
+
+    # Merge wikidata_gazetteer with all the assisting frames (and rename the
+    # resulting columns)
+    wikidata_gazetteer = pd.merge(
+        wikidata_gazetteer, historical_counties_df, on="place_wikidata_id", how="left"
+    )
+    wikidata_gazetteer = pd.merge(
+        wikidata_gazetteer, admin_county_df, on="place_wikidata_id", how="left"
+    )
+    wikidata_gazetteer = pd.merge(
+        wikidata_gazetteer, countries_df, on="place_wikidata_id", how="left"
+    )
+
+    wikidata_gazetteer.rename(
+        {
+            "admin_county_label": "admin_county__label",
+            "admin_county_wikidata_id": "admin_county__wikidata_id",
+            "hcounty_label": "historic_county__label",
+            "hcounty_wikidata_id": "historic_county__wikidata_id",
+            "country_label": "country__label",
+            "country_wikidata_id": "country__wikidata_id",
+        },
+        axis=1,
+        inplace=True,
+    )
+
+    # Split back up into dataframes specific for the tables
+    historic_county_table = (
+        wikidata_gazetteer[["historic_county__label", "historic_county__wikidata_id"]]
+        .drop_duplicates()
+        .copy()
+    )
+    historic_county_table = historic_county_table.replace({"": np.nan}).dropna()
+    historic_county_table["historic_county__pk"] = np.arange(
+        1, len(historic_county_table) + 1
+    )
+
+    admin_county_table = (
+        wikidata_gazetteer[["admin_county__label", "admin_county__wikidata_id"]]
+        .drop_duplicates()
+        .copy()
+    )
+    admin_county_table = admin_county_table.replace({"": np.nan}).dropna()
+    admin_county_table["admin_county__pk"] = np.arange(1, len(admin_county_table) + 1)
+
+    country_table = (
+        wikidata_gazetteer[["country__label", "country__wikidata_id"]]
+        .drop_duplicates()
+        .copy()
+    )
+    country_table = country_table.replace({"": np.nan}).dropna()
+    country_table["country__pk"] = np.arange(1, len(country_table) + 1)
+
+    # Set up place_table from wikidata_gazetteer
+    place_table = wikidata_gazetteer.copy()
+
+    place_table = (
+        pd.merge(
+            place_table,
+            historic_county_table,
+            on=["historic_county__label", "historic_county__wikidata_id"],
+            how="left",
+        )
+        .drop(["historic_county__label", "historic_county__wikidata_id"], axis=1)
+        .rename({"historic_county__pk": "historic_county_id"}, axis=1)
+    )
+
+    place_table = (
+        pd.merge(
+            place_table,
+            admin_county_table,
+            on=["admin_county__label", "admin_county__wikidata_id"],
+            how="left",
+        )
+        .drop(["admin_county__label", "admin_county__wikidata_id"], axis=1)
+        .rename({"admin_county__pk": "admin_county_id"}, axis=1)
+    )
+
+    place_table = (
+        pd.merge(
+            place_table,
+            country_table,
+            on=["country__label", "country__wikidata_id"],
+            how="left",
+        )
+        .drop(["country__label", "country__wikidata_id"], axis=1)
+        .rename({"country__pk": "country_id"}, axis=1)
+    )
+
+    place_table.fillna("", inplace=True)
+    place_table.set_index("place_pk", inplace=True)
+    place_table.rename(
+        {"place_label": "label", "place_wikidata_id": "wikidata_id"},
+        axis=1,
+        inplace=True,
+    )
+    place_table["historic_county_id"] = (
+        place_table["historic_county_id"]
+        .replace(r"^\s*$", 0, regex=True)
+        .astype(int)
+        .replace(0, "")
+    )
+    place_table["admin_county_id"] = (
+        place_table["admin_county_id"]
+        .replace(r"^\s*$", 0, regex=True)
+        .astype(int)
+        .replace(0, "")
+    )
+    place_table["country_id"] = (
+        place_table["country_id"]
+        .replace(r"^\s*$", 0, regex=True)
+        .astype(int)
+        .replace(0, "")
+    )
+    place_table.index.rename("pk", inplace=True)
+    place_table.rename(
+        {
+            "historic_county_id": "historic_county",
+            "admin_county_id": "admin_county",
+            "country_id": "country",
+        },
+        axis=1,
+        inplace=True,
+    )
+
+    historic_county_table.set_index("historic_county__pk", inplace=True)
+    historic_county_table.rename(
+        {x: x.split("__")[1] for x in historic_county_table.columns},
+        axis=1,
+        inplace=True,
+    )
+    historic_county_table.index.rename("pk", inplace=True)
+
+    admin_county_table.set_index("admin_county__pk", inplace=True)
+    admin_county_table.rename(
+        {x: x.split("__")[1] for x in admin_county_table.columns}, axis=1, inplace=True
+    )
+    admin_county_table.index.rename("pk", inplace=True)
+
+    country_table.set_index("country__pk", inplace=True)
+    country_table.rename(
+        {x: x.split("__")[1] for x in country_table.columns}, axis=1, inplace=True
+    )
+    country_table.index.rename("pk", inplace=True)
+
+    # Adding created_at, updated_at to all the gazetteer tables
+    place_table["created_at"] = time_stamp
+    place_table["updated_at"] = time_stamp
+    admin_county_table["created_at"] = time_stamp
+    admin_county_table["updated_at"] = time_stamp
+    historic_county_table["created_at"] = time_stamp
+    historic_county_table["updated_at"] = time_stamp
+    country_table["created_at"] = time_stamp
+    country_table["updated_at"] = time_stamp
+
+    # Save CSV files for gazetteer tables
+    place_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[PLACE]["csv"])
+    admin_county_table.to_csv(
+        output_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY]["csv"]
+    )
+    historic_county_table.to_csv(
+        output_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY]["csv"]
+    )
+    country_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[COUNTRY]["csv"])
+    saved.extend(
+        [
+            output_path / GAZETTEER_OUT_FILENAMES[PLACE]["csv"],
+            output_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY]["csv"],
+            output_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY]["csv"],
+            output_path / GAZETTEER_OUT_FILENAMES[COUNTRY]["csv"],
+        ]
+    )
+
+    # Fix up Mitchells (already loaded)
+    mitchells_df["politics"] = mitchells_df.POLITICS.apply(get_list)
+    mitchells_df["persons"] = mitchells_df.PERSONS.apply(get_list)
+    mitchells_df["organisations"] = mitchells_df.ORGANIZATIONS.apply(get_list)
+    mitchells_df["price"] = mitchells_df.PRICE.apply(get_list)
+
+    mitchells_df.rename(
+        {
+            "ID": "mpd_id",
+            "TITLE": "title",
+            "politics": "political_leaning_raw",
+            "price": "price_raw",
+            "YEAR": "year",
+            "PLACE_PUB_WIKI": "place_of_publication_id",
+            "ESTABLISHED_DATE": "date_established_raw",
+            "PUBLISED_DATE": "day_of_publication_raw",
+        },
+        axis=1,
+        inplace=True,
+    )
+
+    drop_cols = [
+        "CHAIN_ID",
+        "POLITICS",
+        "PERSONS",
+        "ORGANIZATIONS",
+        "PRICE",
+        "PLACE_PUB",
+        "PLACE_PUB_COORD",
+        "PLACES",
+        "PLACES_TRES",
+        "TEXT",
+    ]
+    mitchells_df.drop(columns=drop_cols, inplace=True)
+
+    # Create derivative tables (from Mitchells) = political_leanings, prices,
+    # issues
+    political_leanings = sorted(
+        list(set([y.strip() for x in mitchells_df.political_leaning_raw for y in x]))
+    )
+    political_leanings_table = pd.DataFrame()
+    political_leanings_table["political_leaning__pk"] = np.arange(
+        1, len(political_leanings) + 1
+    )
+    political_leanings_table["political_leaning__label"] = political_leanings
+    export = political_leanings_table.copy()
+    export["created_at"] = time_stamp
+    export["updated_at"] = time_stamp
+    export.set_index("political_leaning__pk", inplace=True)
+    export.index.rename("pk", inplace=True)
+    export.rename(
+        {x: x.split("__")[1] if len(x.split("__")) > 1 else x for x in export.columns},
+        axis=1,
+        inplace=True,
+    )
+    export.to_csv(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING]["csv"])
+    saved.append(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING]["csv"])
+
+    prices = sorted(list(set([y.strip() for x in mitchells_df.price_raw for y in x])))
+    prices_table = pd.DataFrame()
+    prices_table["price__pk"] = np.arange(1, len(prices) + 1)
+    prices_table["price__label"] = prices
+    export = prices_table.copy()
+    export["created_at"] = time_stamp
+    export["updated_at"] = time_stamp
+    export.set_index("price__pk", inplace=True)
+    export.index.rename("pk", inplace=True)
+    export.rename(
+        {x: x.split("__")[1] if len(x.split("__")) > 1 else x for x in export.columns},
+        axis=1,
+        inplace=True,
+    )
+    export.to_csv(output_path / MITCHELLS_OUT_FILENAMES[PRICE]["csv"])
+    saved.append(output_path / MITCHELLS_OUT_FILENAMES[PRICE]["csv"])
+
+    issues = sorted(list(mitchells_df.year.unique()))
+    issues_table = pd.DataFrame()
+    issues_table["issue__pk"] = np.arange(1, len(issues) + 1)
+    issues_table["issue__year"] = issues
+    export = issues_table.copy()
+    export["created_at"] = time_stamp
+    export["updated_at"] = time_stamp
+    export.set_index("issue__pk", inplace=True)
+    export.index.rename("pk", inplace=True)
+    export.rename(
+        {x: x.split("__")[1] if len(x.split("__")) > 1 else x for x in export.columns},
+        axis=1,
+        inplace=True,
+    )
+    export.to_csv(output_path / MITCHELLS_OUT_FILENAMES[ISSUE]["csv"])
+    saved.append(output_path / MITCHELLS_OUT_FILENAMES[ISSUE]["csv"])
+
+    # Set up linking on Mitchells dataframe
+    linking_df = pd.read_csv(
+        files_dict["linking"]["local"],
+        index_col=0,
+        dtype={"NLP": str},
+        usecols=[
+            "NLP",
+            "Title",
+            "AcquiredYears",
+            "Editions",
+            "EditionTitles",
+            "City",
+            "Publisher",
+            "UnavailableYears",
+            "Collection",
+            "UK",
+            "Complete",
+            "Notes",
+            "County",
+            "HistoricCounty",
+            "First date held",
+            "Publication title",
+            "link_to_mpd",
+        ],
+    )
+    linking_df["NLP"] = linking_df.index
+
+    linking_df.rename(
+        {"link_to_mpd": "mpd_id", "NLP": "newspaper"}, axis=1, inplace=True
+    )
+
+    # Link Mitchells with all the other data
+    mitchells_df = pd.merge(mitchells_df, linking_df, on="mpd_id", how="inner")
+
+    # Create entry_table
+    entry_table = mitchells_df.copy()
+    entry_table["place_of_circulation_raw"] = ""
+    entry_table["publication_district_raw"] = ""
+    entry_table["publication_county_raw"] = ""
+    # TODO: What happened to the three columns above? (Check w Kaspar?)
+
+    # Only keep relevant columns
+    entry_table = entry_table[
+        [
+            "title",
+            "political_leaning_raw",
+            "price_raw",
+            "year",
+            "date_established_raw",
+            "day_of_publication_raw",
+            "place_of_circulation_raw",
+            "publication_district_raw",
+            "publication_county_raw",
+            "organisations",
+            "persons",
+            "place_of_publication_id",
+            "newspaper",
+        ]
+    ]
+
+    # Fix refs to political_leanings_table
+    rev = political_leanings_table.set_index("political_leaning__label")
+    entry_table["political_leanings"] = entry_table.political_leaning_raw.apply(
+        lambda x: [rev.at[y, "political_leaning__pk"] for y in x]
+    )
+
+    # Fix refs to prices_table
+    rev = prices_table.set_index("price__label")
+    entry_table["prices"] = entry_table.price_raw.apply(
+        lambda x: [rev.at[y.strip(), "price__pk"] for y in x]
+    )
+
+    # Fix refs to issues_table
+    rev = issues_table.set_index("issue__year")
+    entry_table["issue"] = entry_table.year.apply(lambda x: rev.at[x, "issue__pk"])
+
+    # Fix refs to place_table
+    rev = place_table.copy()
+    rev["place__pk"] = rev.index
+    rev.set_index("wikidata_id", inplace=True)
+    entry_table["place_of_publication"] = entry_table.place_of_publication_id.apply(
+        test_place, rev=rev
+    )
+    entry_table.drop(columns=["place_of_publication_id"], inplace=True)
+
+    # Set up ref to newspapers
+    rev = json.loads(files_dict["Newspaper-1"]["local"].read_text())
+    rev = [dict(pk=v["pk"], **v["fields"]) for v in rev]
+    rev = pd.DataFrame(rev)
+    rev.set_index("publication_code", inplace=True)
+    entry_table["newspaper"] = entry_table.newspaper.str.zfill(7)
+    entry_table["newspaper"] = entry_table.newspaper.apply(test_paper, rev=rev)
+
+    # Create PK for entries
+    entry_table["pk"] = np.arange(1, len(entry_table) + 1)
+
+    # Sort columns in entries file
+    entry_table = entry_table[
+        ["pk"] + [col for col in entry_table.columns if not col == "pk"]
+    ]
+
+    # Add created_at, modified_at to entry_table
+    entry_table["created_at"] = time_stamp
+    entry_table["updated_at"] = time_stamp
+
+    # Export entry_table
+    entry_table.set_index("pk").to_csv(
+        output_path / MITCHELLS_OUT_FILENAMES[ENTRY]["csv"]
+    )
+    saved.append(output_path / MITCHELLS_OUT_FILENAMES[ENTRY]["csv"])
+
+    # Now create JSON versions of all the CSV files saved above
+    for csv_file_path in output_path.glob("*.csv"):
+        csv2json_list(csv_file_path)
+
+    print("Finished - saved files:")
+    print("- " + "\n- ".join([str(x) for x in saved]))
+

diff --git a/reference/alto2txt2fixture/index.html b/reference/alto2txt2fixture/index.html
new file mode 100644
alto2txt2fixture

diff --git a/reference/alto2txt2fixture/jisc.html b/reference/alto2txt2fixture/jisc.html
new file mode 100644

jisc

get_jisc_title

get_jisc_title(
    title: str,
    issue_date: str,
    jisc_papers: pd.DataFrame,
    input_sub_path: str,
    publication_code: str,
    abbr: str | None = None,
) -> str

Match a newspaper title with jisc_papers records.

Takes an input_sub_path, a publication_code, and an (optional) abbreviation for any newspaper to locate the title in the jisc_papers DataFrame. jisc_papers is usually loaded via the setup_jisc_papers function.

Parameters:

Name              Type          Description                                       Default
----              ----          -----------                                       -------
title             str           target newspaper title                            required
issue_date        str           target newspaper issue_date                       required
jisc_papers       pd.DataFrame  DataFrame of jisc_papers to match                 required
input_sub_path    str           path of files to narrow down the query            required
publication_code  str           unique codes to match newspaper records           required
abbr              str | None    an optional abbreviation of the newspaper title   None

Returns:

Type  Description
----  -----------
str   Matched title str or abbr: a string estimating the JISC equivalent newspaper title.

Source code in alto2txt2fixture/jisc.py
def get_jisc_title(
+    title: str,
+    issue_date: str,
+    jisc_papers: pd.DataFrame,
+    input_sub_path: str,
+    publication_code: str,
+    abbr: str | None = None,
+) -> str:
+    """
+    Match a newspaper ``title`` with ``jisc_papers`` records.
+
+    Takes an ``input_sub_path``, a ``publication_code``, and an (optional)
+    abbreviation for any newspaper to locate the ``title`` in the
+    ``jisc_papers`` `DataFrame`. ``jisc_papers`` is usually loaded via the
+    ``setup_jisc_papers`` function.
+
+    Args:
+        title: target newspaper title
+        issue_date: target newspaper issue_date
+        jisc_papers: `DataFrame` of `jisc_papers` to match
+        input_sub_path: path of files to narrow down query input_sub_path
+        publication_code: unique codes to match newspaper records
+        abbr: an optional abbreviation of the newspaper title
+
+    Returns:
+        Matched ``title`` `str` or ``abbr``: a string estimating the JISC
+            equivalent newspaper title.
+    """
+
+    # First option, search the input_sub_path for a valid-looking publication_code
+    g = PUBLICATION_CODE.findall(input_sub_path)
+
+    if len(g) == 1:
+        publication_code = g[0]
+        # Let's see if we can find title:
+        title = (
+            jisc_papers[
+                jisc_papers.publication_code == publication_code
+            ].title.to_list()[0]
+            if jisc_papers[
+                jisc_papers.publication_code == publication_code
+            ].title.count()
+            == 1
+            else title
+        )
+        return title
+
+    # Second option, look through JISC papers for best match (on publication_code if we have it, but abbr more importantly if we have it)
+    if abbr:
+        _publication_code = publication_code
+        publication_code = abbr
+
+    if jisc_papers.abbr[jisc_papers.abbr == publication_code].count():
+        date = datetime.strptime(issue_date, "%Y-%m-%d")
+        mask = (
+            (jisc_papers.abbr == publication_code)
+            & (date >= jisc_papers.start_date)
+            & (date <= jisc_papers.end_date)
+        )
+        filtered = jisc_papers.loc[mask]
+        if filtered.publication_code.count() == 1:
+            publication_code = filtered.publication_code.to_list()[0]
+            title = filtered.title.to_list()[0]
+            return title
+
+    # Last option: let's find all the possible titles in the jisc_papers for the abbreviation, and if it's just one unique title, let's pick it!
+    if abbr:
+        test = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})
+        if len(test) == 1:
+            return test[0]
+        else:
+            mask1 = (jisc_papers.abbr == publication_code) & (
+                jisc_papers.publication_code == _publication_code
+            )
+            test1 = list({x for x in jisc_papers.loc[mask1].title})
+            if len(test1) == 1:
+                return test1[0]
+
+    # Fallback: if abbreviation is set, we'll return that:
+    if abbr:
+        # For these exceptions, see issue comment:
+        # https://github.com/alan-turing-institute/Living-with-Machines/issues/2453#issuecomment-1050652587
+        if abbr == "IPJL":
+            return "Ipswich Journal"
+        elif abbr == "BHCH":
+            return "Bath Chronicle"
+        elif abbr == "LSIR":
+            return "Leeds Intelligencer"
+        elif abbr == "AGER":
+            return "Lancaster Gazetter, And General Advertiser For Lancashire West"
+
+        return abbr
+
+    raise RuntimeError(f"Title {title} could not be found.")
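
Example

A hypothetical call illustrating the hard-coded abbreviation fallback at the end of the function, assuming jisc_papers was loaded via setup_jisc_papers and contains no "IPJL" row:

title = get_jisc_title(
    title="", issue_date="1855-03-01", jisc_papers=jisc_papers,
    input_sub_path="", publication_code="", abbr="IPJL",
)
assert title == "Ipswich Journal"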

setup_jisc_papers

setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame

Create a DataFrame with information in JISC_PAPERS_CSV in settings.

Returns:

Type          Description
----          -----------
pd.DataFrame  DataFrame with all JISC titles.

Source code in alto2txt2fixture/jisc.py
def setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame:
+    """
+    Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.
+
+    Returns:
+        `DataFrame` with all JISC titles.
+    """
+
+    if not Path(path).exists():
+        raise RuntimeError(
+            f"Could not find required JISC papers file. Put {Path(path).name} in {Path(path).parent} or correct the settings with a different path."
+        )
+
+    months = {
+        "Jan": 1,
+        "Feb": 2,
+        "Mar": 3,
+        "Apr": 4,
+        "May": 5,
+        "Jun": 6,
+        "June": 6,
+        "Jul": 7,
+        "July": 7,
+        "Aug": 8,
+        "Sep": 9,
+        "Sept": 9,
+        "Oct": 10,
+        "Nov": 11,
+        "Dec": 12,
+        "Dec.": 12,
+    }
+
+    jisc_papers = pd.read_csv(
+        path,
+        usecols=[
+            "Newspaper Title",
+            "NLP",
+            "Abbr",
+            "StartD",
+            "StartM",
+            "StartY",
+            "EndD",
+            "EndM",
+            "EndY",
+        ],
+    )
+    jisc_papers["start_date"] = jisc_papers.apply(
+        lambda x: datetime(
+            year=int(x.StartY),
+            month=months[x.StartM.strip(".").strip()],
+            day=int(x.StartD),
+        ),
+        axis=1,
+    )
+    jisc_papers["end_date"] = jisc_papers.apply(
+        lambda x: datetime(
+            year=int(x.EndY), month=months[x.EndM.strip(".").strip()], day=int(x.EndD)
+        ),
+        axis=1,
+    )
+    jisc_papers.drop(
+        ["StartD", "StartM", "StartY", "EndD", "EndM", "EndY"],
+        axis="columns",
+        inplace=True,
+    )
+    jisc_papers.rename(
+        {"Newspaper Title": "title", "NLP": "publication_code", "Abbr": "abbr"},
+        axis=1,
+        inplace=True,
+    )
+    jisc_papers["title"] = jisc_papers["title"].apply(
+        lambda x: "The " + x[:-5] if x.strip()[-5:].lower() == ", the" else x
+    )
+    jisc_papers["publication_code"] = jisc_papers["publication_code"].apply(
+        lambda x: str(x).zfill(7)
+    )
+
+    return jisc_papers
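
Example

A usage sketch (the path is a placeholder for a local copy of the JISC titles CSV):

jisc_papers = setup_jisc_papers("metadata/JISC_papers.csv")
# Columns after processing: title, publication_code (zero-padded to 7
# digits), abbr, plus parsed start_date and end_date datetimes.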

diff --git a/reference/alto2txt2fixture/log.html b/reference/alto2txt2fixture/log.html
new file mode 100644

log

error

error(msg: str, crash: bool = True, silent: bool = True) -> None

Print msg in colorama Fore.RED and exit().

If crash=True and silent, exit() after printing; if crash=True and not silent, raise a RuntimeError; otherwise just print msg and return.

Source code in alto2txt2fixture/log.py
def error(msg: str, crash: bool = True, silent: bool = True) -> None:
+    """Print ``msg`` in `colorama` `Force.RED` and `exit()`
+
+    If `silent` `exit()` after call, else `raise` `RuntimeError` if ``crash=True``."""
+    if crash and silent:
+        print(f"{Fore.RED}{msg}{Style.RESET_ALL}")
+        exit()
+    elif crash:
+        raise RuntimeError(msg) from None
+    print(f"{Fore.RED}{msg}{Style.RESET_ALL}")
+
+    return
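
Example

A short sketch of the non-fatal paths (ANSI colour codes omitted; see also info, success and warning below):

from alto2txt2fixture.log import error, warning

error("Could not parse fixture", crash=False)  # red text, returns instead of exiting
warning("Falling back to defaults")            # yellow text, prefixed "Warning: "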

info

info(msg: str) -> None

Print msg in colorama Fore.CYAN colour.

Source code in alto2txt2fixture/log.py
def info(msg: str) -> None:
+    """Print ``msg`` in `colorama` `Force.CYAN` colour."""
+    print(f"{Fore.CYAN}{msg}{Style.RESET_ALL}")
+    return

success

success(msg: str) -> None

Print msg in colorama Fore.GREEN colour.

Source code in alto2txt2fixture/log.py
def success(msg: str) -> None:
+    """Print ``msg`` in `colorama` `Force.GREEN` colour."""
+    print(f"{Fore.GREEN}{msg}{Style.RESET_ALL}")
+    return

warning

warning(msg: str) -> None

Print msg in colorama Fore.YELLOW colour, prefixed with "Warning: ".

Source code in alto2txt2fixture/log.py
def warning(msg: str) -> None:
+    """Print ``msg`` in `colorama` `Force.YELLOW` colour."""
+    print(f"{Fore.YELLOW}Warning: {msg}{Style.RESET_ALL}")
+    return

diff --git a/reference/alto2txt2fixture/parser.html b/reference/alto2txt2fixture/parser.html
new file mode 100644

parser

fixtures

fixtures(
    filelist: list = [],
    model: str = "",
    translate: dict = {},
    rename: dict = {},
    uniq_keys: list = [],
) -> Generator[FixtureDict, None, None]

Generates fixtures for a specified model using a list of files.

This function takes a list of files and generates fixtures for a specified model. The fixtures can be used to populate a database or perform other data-related operations.

Parameters:

filelist (list, default []):
    A list of files to process and generate fixtures from.

model (str, default ''):
    The name of the model for which fixtures are generated.

translate (dict, default {}):
    A nested dictionary representing the translation mapping for fields.
    The structure of the translator follows the format:

        {
            'part1': {
                'part2': {
                    'translated_field': 'pk'
                }
            }
        }

    The translated fields will be used as keys, and their corresponding
    primary keys (obtained from the provided files) will be used as values
    in the generated fixtures.

rename (dict, default {}):
    A nested dictionary representing the field renaming mapping. The
    structure of the dictionary follows the format:

        {
            'part1': {
                'part2': 'new_field_name'
            }
        }

    The fields specified in the dictionary will be renamed to the provided
    new field names in the generated fixtures.

uniq_keys (list, default []):
    A list of fields that need to be considered for uniqueness in the
    fixtures. If specified, the fixtures will yield only unique items based
    on the combination of these fields.

Yields:

Type         Description
----         -----------
FixtureDict  FixtureDict from model, pk and dict of fields.

Returns:

Type                                Description
----                                -----------
Generator[FixtureDict, None, None]  This function generates fixtures but does not return any value.

Source code in alto2txt2fixture/parser.py
def fixtures(
+    filelist: list = [],
+    model: str = "",
+    translate: dict = {},
+    rename: dict = {},
+    uniq_keys: list = [],
+) -> Generator[FixtureDict, None, None]:
+    """
+    Generates fixtures for a specified model using a list of files.
+
+    This function takes a list of files and generates fixtures for a specified
+    model. The fixtures can be used to populate a database or perform other
+    data-related operations.
+
+    Args:
+        filelist: A list of files to process and generate fixtures from.
+        model: The name of the model for which fixtures are generated.
+        translate: A nested dictionary representing the translation mapping
+            for fields. The structure of the translator follows the format:
+            ```python
+            {
+                'part1': {
+                    'part2': {
+                        'translated_field': 'pk'
+                    }
+                }
+            }
+            ```
+            The translated fields will be used as keys, and their
+            corresponding primary keys (obtained from the provided files) will
+            be used as values in the generated fixtures.
+        rename: A nested dictionary representing the field renaming
+            mapping. The structure of the dictionary follows the format:
+            ```python
+            {
+                'part1': {
+                    'part2': 'new_field_name'
+                }
+            }
+            ```
+            The fields specified in the dictionary will be renamed to the
+            provided new field names in the generated fixtures.
+        uniq_keys: A list of fields that need to be considered for
+            uniqueness in the fixtures. If specified, the fixtures will yield
+            only unique items based on the combination of these fields.
+
+    Yields:
+        `FixtureDict` from ``model``, ``pk`` and `dict` of ``fields``.
+
+    Returns:
+        This function generates fixtures but does not return any value.
+    """
+
+    filelist = sorted(filelist, key=lambda x: str(x).split("/")[:-1])
+    count = len(filelist)
+
+    # Process JSONL
+    if [x for x in filelist if ".jsonl" in x.name]:
+        pk = 0
+        # In the future, we might want to show progress here (tqdm or suchlike)
+        for file in filelist:
+            for line in file.read_text().splitlines():
+                pk += 1
+                line = json.loads(line)
+                yield FixtureDict(
+                    pk=pk,
+                    model=model,
+                    fields=dict(**get_fields(line, translate=translate, rename=rename)),
+                )
+
+        return
+    else:
+        # Process JSON
+        pks = [x for x in range(1, count + 1)]
+
+        if len(uniq_keys):
+            uniq_files = list(uniq(filelist, uniq_keys))
+            count = len(uniq_files)
+            zipped = zip(uniq_files, pks)
+        else:
+            zipped = zip(filelist, pks)
+
+        for x in tqdm(
+            zipped, total=count, desc=f"{model} ({count:,} objs)", leave=False
+        ):
+            yield FixtureDict(
+                pk=x[1],
+                model=model,
+                fields=dict(**get_fields(x[0], translate=translate, rename=rename)),
+            )
+
+        return
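
Example

A minimal JSONL sketch (assuming FixtureDict behaves like the plain dict structure shown in the Yields section, and run in a scratch directory):

>>> from pathlib import Path
>>> _ = Path("items.jsonl").write_text('{"title": "a"}\n{"title": "b"}\n')
>>> recs = list(fixtures(filelist=[Path("items.jsonl")], model="demo.item"))
>>> [(r["pk"], r["fields"]["title"]) for r in recs]
[(1, 'a'), (2, 'b')]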

get_fields

get_fields(
    file: Union[Path, str, dict],
    translate: dict = {},
    rename: dict = {},
    allow_null: bool = False,
) -> dict

Retrieves fields from a file and performs modifications and checks.

This function takes a file (in various formats: Path, str, or dict) and processes its fields. It retrieves the fields from the file and performs modifications, translations, and checks on the fields.

Parameters:

file (Union[Path, str, dict], required):
    The file from which the fields are retrieved.

translate (dict, default {}):
    A nested dictionary representing the translation mapping for fields.
    The structure of the translator follows the format:

        {
            'part1': {
                'part2': {
                    'translated_field': 'pk'
                }
            }
        }

    The translated fields will be used to replace the original fields in
    the retrieved fields.

rename (dict, default {}):
    A nested dictionary representing the field renaming mapping. The
    structure of the dictionary follows the format:

        {
            'part1': {
                'part2': 'new_field_name'
            }
        }

    The fields specified in the dictionary will be renamed to the provided
    new field names in the retrieved fields.

allow_null (bool, default False):
    Determines whether to allow None values for relational fields. If set
    to True, relational fields with missing values will be assigned None.
    If set to False, an error will be raised.

Returns:

Type  Description
----  -----------
dict  A dictionary representing the retrieved fields from the file, with modifications and checks applied.

Raises:

Type          Description
----          -----------
RuntimeError  If the file type is unsupported or if an error occurs during field retrieval or processing.

Source code in alto2txt2fixture/parser.py
def get_fields(
+    file: Union[Path, str, dict],
+    translate: dict = {},
+    rename: dict = {},
+    allow_null: bool = False,
+) -> dict:
+    """
+    Retrieves fields from a file and performs modifications and checks.
+
+    This function takes a file (in various formats: `Path`, `str`, or `dict`)
+    and processes its fields. It retrieves the fields from the file and
+    performs modifications, translations, and checks on the fields.
+
+    Args:
+        file: The file from which the fields are retrieved.
+        translate: A nested dictionary representing the translation mapping
+            for fields. The structure of the translator follows the format:
+            ```python
+            {
+                'part1': {
+                    'part2': {
+                        'translated_field': 'pk'
+                    }
+                }
+            }
+            ```
+            The translated fields will be used to replace the original fields
+            in the retrieved fields.
+        rename: A nested dictionary representing the field renaming
+            mapping. The structure of the dictionary follows the format:
+            ```python
+            {
+                'part1': {
+                    'part2': 'new_field_name'
+                }
+            }
+            ```
+            The fields specified in the dictionary will be renamed to the
+            provided new field names in the retrieved fields.
+        allow_null: Determines whether to allow ``None`` values for
+            relational fields. If set to ``True``, relational fields with
+            missing values will be assigned ``None``. If set to ``False``, an
+            error will be raised.
+
+    Returns:
+        A dictionary representing the retrieved fields from the file,
+            with modifications and checks applied.
+
+    Raises:
+        RuntimeError: If the file type is unsupported or if an error occurs
+            during field retrieval or processing.
+    """
+    if isinstance(file, Path):
+        try:
+            fields = json.loads(file.read_text())
+        except Exception as e:
+            raise RuntimeError(f"Cannot interpret JSON ({e}): {file}")
+    elif isinstance(file, str):
+        if "\n" in file:
+            raise RuntimeError("File has multiple lines.")
+        try:
+            fields = json.loads(file)
+        except json.decoder.JSONDecodeError as e:
+            raise RuntimeError(f"Cannot interpret JSON ({e}): {file}")
+    elif isinstance(file, dict):
+        fields = file
+    else:
+        raise RuntimeError(f"Cannot process type {type(file)}.")
+
+    # Fix relational fields for any file
+    for key in [key for key in fields.keys() if "__" in key]:
+        parts = key.split("__")
+
+        try:
+            before = fields[key]
+            if before:
+                before = before.replace("---", "/")
+                loc = translate.get(parts[0], {}).get(parts[1], {})
+                fields[key] = loc.get(before)
+                if fields[key] is None:
+                    raise RuntimeError(
+                        f"Cannot translate fields.{key} from {before}: {loc}"
+                    )
+
+        except AttributeError:
+            if allow_null:
+                fields[key] = None
+            else:
+                print(
+                    "Content had relational fields, but something went wrong in parsing the data:"
+                )
+                print("file", file)
+                print("fields", fields)
+                print("KEY:", key)
+                raise RuntimeError()
+
+        new_name = rename.get(parts[0], {}).get(parts[1], None)
+        if new_name:
+            fields[new_name] = fields[key]
+            del fields[key]
+
+    fields["created_at"] = NOW_str
+    fields["updated_at"] = NOW_str
+
+    try:
+        fields["item_type"] = str(fields["item_type"]).upper()
+    except KeyError:
+        pass
+
+    try:
+        if fields["ocr_quality_mean"] == "":
+            fields["ocr_quality_mean"] = 0
+    except KeyError:
+        pass
+
+    try:
+        if fields["ocr_quality_sd"] == "":
+            fields["ocr_quality_sd"] = 0
+    except KeyError:
+        pass
+
+    return fields
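
Example

A sketch of the relational-field translation on a plain dict (created_at and updated_at are also stamped from the module-level NOW_str):

>>> translate = {"publication": {"publication_code": {"0002648": 12}}}
>>> fields = get_fields(
...     {"publication__publication_code": "0002648"}, translate=translate
... )
>>> fields["publication__publication_code"]
12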

get_key_from

get_key_from(item: Path, x: str) -> str

Retrieves a specific key from a file and returns its value.

This function reads a file and extracts the value of a specified key. If the key is not found or an error occurs while processing the file, a warning is printed, and an empty string is returned.

Parameters:

Name  Type  Description                                 Default
----  ----  -----------                                 -------
item  Path  The file from which the key is extracted.   required
x     str   The key to be retrieved from the file.      required

Returns:

Type  Description
----  -----------
str   The value of the specified key from the file.

Source code in alto2txt2fixture/parser.py
def get_key_from(item: Path, x: str) -> str:
+    """
+    Retrieves a specific key from a file and returns its value.
+
+    This function reads a file and extracts the value of a specified
+    key. If the key is not found or an error occurs while processing
+    the file, a warning is printed, and an empty string is returned.
+
+    Args:
+        item: The file from which the key is extracted.
+        x: The key to be retrieved from the file.
+
+    Returns:
+        The value of the specified key from the file.
+    """
+    result = json.loads(item.read_text()).get(x, None)
+    if not result:
+        print(f"[WARN] Could not find key {x} in {item}")
+        result = ""
+    return result
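
Example

A sketch with a throwaway file (run in a scratch directory):

>>> from pathlib import Path
>>> _ = Path("item.json").write_text('{"item_type": "article"}')
>>> get_key_from(Path("item.json"), "item_type")
'article'
>>> get_key_from(Path("item.json"), "missing")
[WARN] Could not find key missing in item.json
''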

get_translator

get_translator(
    fields: list[TranslatorTuple] = [TranslatorTuple("", "", [])]
) -> dict

Converts a list of fields into a nested dictionary representing a translator.

Parameters:

Name    Type                   Description                                              Default
----    ----                   -----------                                              -------
fields  list[TranslatorTuple]  A list of tuples representing fields to be translated.   [TranslatorTuple('', '', [])]

Returns:

dict: A nested dictionary representing the translator. The structure of
the dictionary follows the format:

    {
        'part1': {
              'part2': {
                  'translated_field': 'pk'
              }
        }
    }

Example

>>> fields = [
...     TranslatorTuple(
...         start='start__field1',
...         finish='field1',
...         lst=[{
...             'fields': {'field1': 'translation1'},
...             'pk': 1}],
...      )]
>>> get_translator(fields)
{'start': {'field1': {'translation1': 1}}}

Source code in alto2txt2fixture/parser.py
def get_translator(
+    fields: list[TranslatorTuple] = [TranslatorTuple("", "", [])]
+) -> dict:
+    """
+    Converts a list of fields into a nested dictionary representing a
+    translator.
+
+    Args:
+        fields: A list of tuples representing fields to be translated.
+
+    Returns:
+        A nested dictionary representing the translator. The structure of
+            the dictionary follows the format:
+            ```python
+            {
+                'part1': {
+                      'part2': {
+                          'translated_field': 'pk'
+                      }
+                }
+            }
+            ```
+
+    Example:
+        ```pycon
+        >>> fields = [
+        ...     TranslatorTuple(
+        ...         start='start__field1',
+        ...         finish='field1',
+        ...         lst=[{
+        ...             'fields': {'field1': 'translation1'},
+        ...             'pk': 1}],
+        ...      )]
+        >>> get_translator(fields)
+        {'start': {'field1': {'translation1': 1}}}
+
+        ```
+    """
+    _ = dict()
+    for field in fields:
+        start, finish, lst = field
+        part1, part2 = start.split("__")
+        if part1 not in _:
+            _[part1] = {}
+        if part2 not in _[part1]:
+            _[part1][part2] = {}
+        if isinstance(finish, str):
+            _[part1][part2] = {o["fields"][finish]: o["pk"] for o in lst}
+        elif isinstance(finish, list):
+            _[part1][part2] = {
+                "-".join([o["fields"][x] for x in finish]): o["pk"] for o in lst
+            }
+
+    return _

parse

parse(
    collections: list, cache_home: str, output: str, max_elements_per_file: int
) -> None

Parses files from collections and generates fixtures for various models.

This function processes files from the specified collections and generates fixtures for different models, such as newspapers.dataprovider, newspapers.ingest, newspapers.digitisation, newspapers.newspaper, newspapers.issue, and newspapers.item.

It performs various steps, such as file listing, fixture generation, translation mapping, renaming fields, and saving fixtures to files.

Parameters:

Name                   Type  Description                                                                         Default
----                   ----  -----------                                                                         -------
collections            list  A list of collections from which files are processed and fixtures are generated.   required
cache_home             str   The directory path where the collections are located.                              required
output                 str   The directory path where the fixtures will be saved.                               required
max_elements_per_file  int   The maximum number of elements per file when saving fixtures.                      required

Returns:

Type  Description
----  -----------
None  This function generates fixtures but does not return any value.

Source code in alto2txt2fixture/parser.py
def parse(
+    collections: list, cache_home: str, output: str, max_elements_per_file: int
+) -> None:
+    """
+    Parses files from collections and generates fixtures for various models.
+
+    This function processes files from the specified collections and generates
+    fixtures for different models, such as `newspapers.dataprovider`,
+    `newspapers.ingest`, `newspapers.digitisation`, `newspapers.newspaper`,
+    `newspapers.issue`, and `newspapers.item`.
+
+    It performs various steps, such as file listing, fixture generation,
+    translation mapping, renaming fields, and saving fixtures to files.
+
+    Args:
+        collections: A list of collections from which files are
+            processed and fixtures are generated.
+        cache_home: The directory path where the collections are located.
+        output: The directory path where the fixtures will be saved.
+        max_elements_per_file: The maximum number of elements per file
+            when saving fixtures.
+
+    Returns:
+        This function generates fixtures but does not return any value.
+    """
+    global CACHE_HOME
+    global OUTPUT
+    global MAX_ELEMENTS_PER_FILE
+
+    CACHE_HOME = cache_home
+    OUTPUT = output
+    MAX_ELEMENTS_PER_FILE = max_elements_per_file
+
+    # Set up output directory
+    reset_fixture_dir(OUTPUT)
+
+    # Get file lists
+    print("\nGetting file lists...")
+
+    def issues_in_x(x):
+        return "issues" in str(x.parent).split("/")
+
+    def newspapers_in_x(x):
+        return not any(
+            [
+                condition
+                for y in str(x.parent).split("/")
+                for condition in [
+                    "issues" in y,
+                    "ingest" in y,
+                    "digitisation" in y,
+                    "data-provider" in y,
+                ]
+            ]
+        )
+
+    all_json = [
+        x for y in collections for x in (Path(CACHE_HOME) / y).glob("**/*.json")
+    ]
+    all_jsonl = [
+        x for y in collections for x in (Path(CACHE_HOME) / y).glob("**/*.jsonl")
+    ]
+    print(f"--> {len(all_json):,} JSON files altogether")
+    print(f"--> {len(all_jsonl):,} JSONL files altogether")
+
+    print("\nSetting up fixtures...")
+
+    # Process data providers
+    def data_provider_in_x(x):
+        return "data-provider" in str(x.parent).split("/")
+
+    data_provider_json = list(
+        fixtures(
+            model="newspapers.dataprovider",
+            filelist=[x for x in all_json if data_provider_in_x(x)],
+            uniq_keys=["name"],
+        )
+    )
+    print(f"--> {len(data_provider_json):,} DataProvider fixtures")
+
+    # Process ingest
+    def ingest_in_x(x):
+        return "ingest" in str(x.parent).split("/")
+
+    ingest_json = list(
+        fixtures(
+            model="newspapers.ingest",
+            filelist=[x for x in all_json if ingest_in_x(x)],
+            uniq_keys=["lwm_tool_name", "lwm_tool_version"],
+        )
+    )
+    print(f"--> {len(ingest_json):,} Ingest fixtures")
+
+    # Process digitisation
+    def digitisation_in_x(x):
+        return "digitisation" in str(x.parent).split("/")
+
+    digitisation_json = list(
+        fixtures(
+            model="newspapers.digitisation",
+            filelist=[x for x in all_json if digitisation_in_x(x)],
+            uniq_keys=["software"],
+        )
+    )
+    print(f"--> {len(digitisation_json):,} Digitisation fixtures")
+
+    # Process newspapers
+    newspaper_json = list(
+        fixtures(
+            model="newspapers.newspaper",
+            filelist=[file for file in all_json if newspapers_in_x(file)],
+        )
+    )
+    print(f"--> {len(newspaper_json):,} Newspaper fixtures")
+
+    # Process issue
+    translate = get_translator(
+        [
+            TranslatorTuple(
+                "publication__publication_code", "publication_code", newspaper_json
+            )
+        ]
+    )
+    rename = {"publication": {"publication_code": "newspaper_id"}}
+
+    issue_json = list(
+        fixtures(
+            model="newspapers.issue",
+            filelist=[file for file in all_json if issues_in_x(file)],
+            translate=translate,
+            rename=rename,
+        )
+    )
+    print(f"--> {len(issue_json):,} Issue fixtures")
+
+    # Create translator/clear up memory before processing items
+    translate = get_translator(
+        [
+            ("issue__issue_identifier", "issue_code", issue_json),
+            ("digitisation__software", "software", digitisation_json),
+            ("data_provider__name", "name", data_provider_json),
+            (
+                "ingest__lwm_tool_identifier",
+                ["lwm_tool_name", "lwm_tool_version"],
+                ingest_json,
+            ),
+        ]
+    )
+
+    rename = {
+        "issue": {"issue_identifier": "issue_id"},
+        "digitisation": {"software": "digitisation_id"},
+        "data_provider": {"name": "data_provider_id"},
+        "ingest": {"lwm_tool_identifier": "ingest_id"},
+    }
+
+    save_fixture(newspaper_json, "Newspaper")
+    save_fixture(issue_json, "Issue")
+
+    del newspaper_json
+    del issue_json
+    gc.collect()
+
+    print("\nSaving...")
+
+    save_fixture(digitisation_json, "Digitisation")
+    save_fixture(ingest_json, "Ingest")
+    save_fixture(data_provider_json, "DataProvider")
+
+    # Process items
+    item_json = fixtures(
+        model="newspapers.item",
+        filelist=all_jsonl,
+        translate=translate,
+        rename=rename,
+    )
+    save_fixture(item_json, "Item")
+
+    return
reset_fixture_dir

    reset_fixture_dir(output: str | Path) -> None

Resets the fixture directory by removing all JSON files inside it.

This function takes a directory path (output) as input and removes all JSON files within the directory.

Prior to removal, it prompts the user for confirmation to proceed. If the user confirms, the function clears the fixture directory by deleting the JSON files.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `output` | `str \| Path` | The directory path of the fixture directory to be reset. | required |

Raises:

| Type | Description |
| ---- | ----------- |
| `RuntimeError` | If the `output` directory is not specified as a string. |
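A sketch of a call (the function prompts for confirmation before deleting anything; the path is illustrative):

```python
from alto2txt2fixture.parser import reset_fixture_dir

# Asks "Do you want to proceed? [y/N]" before emptying ./fixtures.
reset_fixture_dir("./fixtures")
```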
+ Source code in alto2txt2fixture/parser.py +
def reset_fixture_dir(output: str | Path) -> None:
+    """
+    Resets the fixture directory by removing all JSON files inside it.
+
+    This function takes a directory path (``output``) as input and removes all
+    JSON files within the directory.
+
+    Prior to removal, it prompts the user for confirmation to proceed. If the
+    user confirms, the function clears the fixture directory by deleting the
+    JSON files.
+
+    Args:
+        output: The directory path of the fixture directory to be reset.
+
+    Raises:
+        RuntimeError: If the ``output`` directory is not specified as a string.
+    """
+
+    if not isinstance(output, str):
+        raise RuntimeError("`output` directory needs to be specified as a string.")
+
+    output = Path(output)
+
+    y = input(
+        f"This command will automatically empty the fixture directory ({output.absolute()}). "
+        "Do you want to proceed? [y/N]"
+    )
+
+    if not y.lower() == "y":
+        output.mkdir(parents=True, exist_ok=True)
+        return
+
+    print("\nClearing up the fixture directory")
+
+    # Ensure directory exists
+    output.mkdir(parents=True, exist_ok=True)
+
+    # Drop all JSON files
+    [x.unlink() for x in Path(output).glob("*.json")]
+
+    return

uniq

    uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]

Generates unique items from a list of files based on specified keys.

This function takes a list of files and yields unique items based on a combination of keys. The keys are extracted from each file using the get_key_from function, and duplicate items are ignored.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `filelist` | `list` | A list of files from which unique items are generated. | required |
| `keys` | `list` | A list of keys used for uniqueness. Each key specifies a field to be used for uniqueness checking in the generated items. | `[]` |

Yields:

| Type | Description |
| ---- | ----------- |
| `Any` | A unique item from `filelist`. |
+ Source code in alto2txt2fixture/parser.py +
def uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]:
+    """
+    Generates unique items from a list of files based on specified keys.
+
+    This function takes a list of files and yields unique items based on a
+    combination of keys. The keys are extracted from each file using the
+    ``get_key_from`` function, and duplicate items are ignored.
+
+    Args:
+        filelist: A list of files from which unique items are
+            generated.
+        keys: A list of keys used for uniqueness. Each key specifies
+            a field to be used for uniqueness checking in the generated
+            items.
+
+    Yields:
+        A unique item from `filelist`.
+    """
+
+    seen = set()
+    for item in filelist:
+        key = "-".join([get_key_from(item, x) for x in keys])
+
+        if key not in seen:
+            seen.add(key)
+            yield item
+        else:
+            # Drop it if duplicate
+            pass
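As an illustration of the deduplication behaviour, assuming `get_key_from(item, key)` simply returns the value stored under `key` for each item (a simplification of the real helper in this module):

```pycon
>>> files = [
...     {"software": "ABBYY", "name": "a"},
...     {"software": "ABBYY", "name": "b"},
...     {"software": "Tesseract", "name": "c"},
... ]
>>> [item["name"] for item in uniq(files, keys=["software"])]
['a', 'c']
```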
\ No newline at end of file
diff --git a/reference/alto2txt2fixture/patterns.html b/reference/alto2txt2fixture/patterns.html
new file mode 100644
index 0000000..554fad9
--- /dev/null
+++ b/reference/alto2txt2fixture/patterns.html

patterns

Useful regular expressions, initially just PUBLICATION_CODE.
\ No newline at end of file
diff --git a/reference/alto2txt2fixture/router.html b/reference/alto2txt2fixture/router.html
new file mode 100644
index 0000000..a5a8f60
--- /dev/null
+++ b/reference/alto2txt2fixture/router.html

router


Archive

    Archive(
        path: str | Path,
        collection: str = "",
        report_id: str | None = None,
        jisc_papers: pd.DataFrame | None = None,
        json_indent: int = JSON_INDENT,
    )

Manage extracting information from a ZIP archive.

The Archive class represents a zip archive of XML files. The class is used to extract information from a ZIP archive, and it contains several methods to process the data contained in the archive.

open(Archive) context manager

    Archive can be opened with a context manager, which creates a meta object, with timings for the object. When closed, it will save the meta JSON to the correct paths.
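For instance, a usage sketch (paths are placeholders, and `jisc_papers` stands in for a pandas DataFrame such as the one produced by the JISC setup used in route() below):

```python
from alto2txt2fixture.router import Archive

archive = Archive(
    "mounts/hmd-alto2txt/metadata/0002083_metadata.zip",  # hypothetical zip
    collection="hmd",
    jisc_papers=jisc_papers,  # a pandas DataFrame of JISC papers
)
with archive as _:
    for doc in archive.documents:
        doc.newspaper.write_to_cache()
        doc.issue.write_to_cache()
# Exiting the context manager saves the archive's meta JSON report.
```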

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `path` | `Path` | The path to the zip archive. |
| `collection` | `str` | The collection of the XML files in the archive. Default is `""`. |
| `report` | `Path` | The file path of the report file for the archive. |
| `report_id` | `str` | The report ID for the archive. If not provided, a random UUID is generated. |
| `report_parent` | `Path` | The parent directory of the report file for the archive. |
| `jisc_papers` | `pd.DataFrame` | A DataFrame of JISC papers. |
| `size` | `str \| float` | The size of the archive, in human-readable format. |
| `size_raw` | `str \| float` | The raw size of the archive, in bytes. |
| `roots` | `Generator[ET.Element, None, None]` | The root elements of the XML documents contained in the archive. |
| `meta` | `dotdict` | Metadata about the archive, such as its path, size, and number of contents. |
| `json_indent` | `int` | Indentation formatting of JSON output. |

Raises:

| Type | Description |
| ---- | ----------- |
| `RuntimeError` | If the path does not exist. |
Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(
+    self,
+    path: str | Path,
+    collection: str = "",
+    report_id: str | None = None,
+    jisc_papers: pd.DataFrame | None = None,
+    json_indent: int = JSON_INDENT,
+):
+    """Constructor method."""
+
+    self.path: Path = Path(path)
+
+    if not self.path.exists():
+        raise RuntimeError("Path does not exist.")
+
+    self.size: str | float = get_size_from_path(self.path)
+    self.size_raw: str | float = get_size_from_path(self.path, raw=True)
+    self.zip_file: zipfile.ZipFile = zipfile.ZipFile(self.path)
+    self.collection: str = collection
+    self.roots: Generator[ET.Element, None, None] = self.get_roots()
+
+    self.meta: dotdict = dotdict(
+        path=str(self.path),
+        bytes=self.size_raw,
+        size=self.size,
+        contents=len(self.filelist),
+    )
+
+    if not report_id:
+        self.report_id: str = str(uuid.uuid4())
+    else:
+        self.report_id = report_id
+
+    self.jisc_papers: pd.DataFrame = jisc_papers
+    self.report_parent: Path = Path(f"{REPORT_DIR}/{self.report_id}")
+    self.report: Path = (
+        self.report_parent / f"{self.path.stem.replace('_metadata', '')}.json"
+    )
+    self.json_indent: int = json_indent
+
+
documents (property)

    documents

Property that calls the get_documents method.

filelist (property)

    filelist

Returns the list of files in the zip file.
__len__

    __len__()

The number of files inside the zip archive.
+ Source code in alto2txt2fixture/router.py +
def __len__(self):
+    """The number of files inside the zip archive."""
+    return len(self.filelist)

get_documents

    get_documents() -> Generator[Document, None, None]

A generator that yields instances of the Document class for each XML file in the ZIP archive.

It uses the tqdm library to display a progress bar in the terminal while it is running.

If the contents of the ZIP file are not empty, the method creates an instance of the Document class by passing the root element of the XML file, the collection name, meta information about the archive, and the JISC papers data frame (if provided) to the constructor of the Document class. The instance of the Document class is then returned by the generator.

Yields:

| Type | Description |
| ---- | ----------- |
| `Document` | Document class instance for each unzipped XML file. |
+ Source code in alto2txt2fixture/router.py +
def get_documents(self) -> Generator[Document, None, None]:
+    """
+    A generator that yields instances of the Document class for each XML
+    file in the ZIP archive.
+
+    It uses the `tqdm` library to display a progress bar in the terminal
+    while it is running.
+
+    If the contents of the ZIP file are not empty, the method creates an
+    instance of the ``Document`` class by passing the root element of the XML
+    file, the collection name, meta information about the archive, and the
+    JISC papers data frame (if provided) to the constructor of the
+    ``Document`` class. The instance of the ``Document`` class is then
+    returned by the generator.
+
+    Yields:
+        ``Document`` class instance for each unzipped `XML` file.
+    """
+    for xml_file in tqdm(
+        self.filelist,
+        desc=f"{Path(self.zip_file.filename).stem} ({self.meta.size})",
+        leave=False,
+        colour="green",
+    ):
+        with self.zip_file.open(xml_file) as f:
+            xml = f.read()
+            if xml:
+                yield Document(
+                    root=ET.fromstring(xml),
+                    collection=self.collection,
+                    meta=self.meta,
+                    jisc_papers=self.jisc_papers,
+                )
+

get_roots

    get_roots() -> Generator[ET.Element, None, None]

Yields the root elements of the XML documents contained in the archive.
+ Source code in alto2txt2fixture/router.py +
def get_roots(self) -> Generator[ET.Element, None, None]:
+    """
+    Yields the root elements of the XML documents contained in the archive.
+    """
+    for xml_file in tqdm(self.filelist, leave=False, colour="blue"):
+        with self.zip_file.open(xml_file) as f:
+            xml = f.read()
+            if xml:
+                yield ET.fromstring(xml)

Cache

    Cache()

The Cache class provides a blueprint for creating and managing cache data. The class has several methods that help in getting the cache path, converting the data to a dictionary, and writing the cache data to a file.

It is inherited by many other classes in this document.

Initializes the Cache class object.
+ Source code in alto2txt2fixture/router.py +
def __init__(self):
+    """
+    Initializes the Cache class object.
+    """
+    pass
__str__

    __str__() -> str

Returns the string representation of the cache data as a dictionary.
+ Source code in alto2txt2fixture/router.py +
def __str__(self) -> str:
+    """
+    Returns the string representation of the cache data as a dictionary.
+    """
+    return str(self.as_dict())
as_dict

    as_dict() -> dict

Converts the cache data to a dictionary and returns it.
+ Source code in alto2txt2fixture/router.py +
def as_dict(self) -> dict:
+    """
+    Converts the cache data to a dictionary and returns it.
+    """
+    return {}
get_cache_path

    get_cache_path() -> Path

Returns the cache path, which is used to store the cache data. The path is normally constructed using some of the object's properties (collection, kind, and id) but can be changed when inherited.
+ Source code in alto2txt2fixture/router.py +
def get_cache_path(self) -> Path:
+    """
+    Returns the cache path, which is used to store the cache data.
+    The path is normally constructed using some of the object's
+    properties (collection, kind, and id) but can be changed when
+    inherited.
+    """
+    return Path(f"{CACHE_HOME}/{self.collection}/{self.kind}/{self.id}.json")
write_to_cache

    write_to_cache(json_indent: int = JSON_INDENT) -> Optional[bool]

Writes the cache data to a file at the specified cache path. The cache data is first converted to a dictionary using the as_dict method. If the cache path already exists, the function returns True.
+ Source code in alto2txt2fixture/router.py +
def write_to_cache(self, json_indent: int = JSON_INDENT) -> Optional[bool]:
+    """
+    Writes the cache data to a file at the specified cache path. The cache
+    data is first converted to a dictionary using the as_dict method. If
+    the cache path already exists, the function returns True.
+    """
+
+    path = self.get_cache_path()
+
+    try:
+        if path.exists():
+            return True
+    except AttributeError:
+        error(
+            f"Error occurred when getting cache path for "
+            f"{self.kind}: {path}. It was not of expected "
+            f"type Path but of type {type(path)}:",
+        )
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(path, "w+") as f:
+        f.write(json.dumps(self.as_dict(), indent=json_indent))
+
+    return
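To illustrate the contract subclasses follow (the class below is purely illustrative, not part of the library): define `collection`, `kind` and `id`, override `as_dict`, and `write_to_cache` composes the path and serialises the data.

```python
class ExampleRecord(Cache):
    """Illustrative Cache subclass, not part of alto2txt2fixture."""

    kind = "example"  # used by get_cache_path()

    def __init__(self, collection: str, id: str, payload: dict):
        self.collection = collection
        self.id = id
        self.payload = payload

    def as_dict(self) -> dict:
        return self.payload

# Writes {CACHE_HOME}/hmd/example/demo.json unless it already exists.
ExampleRecord("hmd", "demo", {"software": "ABBYY"}).write_to_cache()
```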

Collection

    Collection(name: str = 'hmd', jisc_papers: Optional[pd.DataFrame] = None)

A Collection represents a group of newspaper archives from any passed alto2txt metadata output.

A Collection is initialised with a name and an optional pandas DataFrame of JISC papers. The archives property returns an iterable of the Archive objects within the collection.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `name` | `str` | Name of the collection (default "hmd") |
| `jisc_papers` | `pandas.DataFrame` | DataFrame of JISC papers, optional |

Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(self, name: str = "hmd", jisc_papers: Optional[pd.DataFrame] = None):
+    """Constructor method."""
+
+    self.name: str = name
+    self.jisc_papers: pd.DataFrame | None = jisc_papers
+    self.dir: Path = Path(f"{MNT}/{self.name}-alto2txt/metadata")
+    self.zip_files: list[Path] = sorted(
+        list(self.dir.glob("*.zip")), key=lambda x: x.stat().st_size
+    )
+    self.zip_file_count: int = sum([1 for _ in self.dir.glob("*.zip")])
+    self.report_id: str = str(uuid.uuid4())
+    self.empty: bool = self.zip_file_count == 0

DataProvider

    DataProvider(collection: str)

Bases: Cache

The DataProvider class extends the Cache class and represents a newspaper data provider. The class has several properties and methods that allow creation of a data provider object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `collection` | `str` | A string representing publication collection |
| `kind` | `str` | Indication of object type, defaults to data-provider |
| `providers_meta_data` | `list[FixtureDict]` | structured dict of metadata for known collection sources |
| `collection_type` | `str` | related data sources and potential linkage source |
| `index_field` | `str` | field name for querying existing records |
+ Example +
>>> from pprint import pprint
+>>> hmd = DataProvider("hmd")
+>>> hmd.pk
+2
+>>> pprint(hmd.as_dict())
+{'code': 'bl-hmd',
+ 'collection': 'newspapers',
+ 'legacy_code': 'hmd',
+ 'name': 'Heritage Made Digital',
+ 'source_note': 'British Library-funded digitised newspapers provided by the '
+                'British Newspaper Archive'}
Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(self, collection: str):
+    """Constructor method."""
+    self.collection: str = collection
+
+
meta_data (property)

    meta_data: FixtureDict | dict

Return self.providers_meta_data[self.collection] or {}.

meta_data_fields (property)

    meta_data_fields: FixtureDict | dict

Return self.providers_meta_data[self.collection] or {}.

pk (property)

    pk: int | None

Return pk if provided via providers_meta_data, else None.

providers_index_dict (property)

    providers_index_dict: dict[str, FixtureDict]

Return all self.index_field values from providers_meta_data.
as_dict

    as_dict() -> dict

Return a dict of the data provider object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the DataProvider object |
+ Source code in alto2txt2fixture/router.py +
def as_dict(self) -> dict:
+    """
+    Return a `dict` of the data provider object.
+
+    Returns:
+        Dictionary representation of the DataProvider object
+    """
+    if self.meta_data:
+        return {
+            "name": self.meta_data_fields["name"],
+            "code": self.meta_data_fields["code"],
+            "legacy_code": self.collection,
+            "source_note": self.meta_data_fields["source_note"],
+            "collection": self.collection_type,
+        }
+    else:
+        return {
+            "name": self.collection,
+            "code": slugify(self.collection),
+            "source_note": "",
+            "legacy_code": None,
+            "collection": self.collection_type,
+        }
+

Digitisation

    Digitisation(root: ET.Element, collection: str = '')

Bases: Cache

The Digitisation class extends the Cache class and represents a newspaper digitisation. The class has several properties and methods that allow creation of a digitisation object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `collection` | `str` | A string that represents the collection of the publication |

Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(self, root: ET.Element, collection: str = ""):
+    """Constructor method."""
+
+    if not isinstance(root, ET.Element):
+        raise RuntimeError(f"Expected root to be xml.etree.Element: {type(root)}")
+
+    self.root: ET.Element = root
+    self.collection: str = collection
+
+
kind (class-attribute, instance-attribute)

    kind = 'digitisation'

A string that represents the type of the object, set to "digitisation".

as_dict

    as_dict() -> dict

A method that returns a dictionary representation of the digitisation object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Digitisation object |
+ Source code in alto2txt2fixture/router.py +
def as_dict(self) -> dict:
+    """
+    A method that returns a dictionary representation of the digitisation
+    object.
+
+    Returns:
+        Dictionary representation of the Digitisation object
+    """
+    dic = {
+        x.tag: x.text or ""
+        for x in self.root.findall("./process/*")
+        if x.tag
+        in [
+            "xml_flavour",
+            "software",
+            "mets_namespace",
+            "alto_namespace",
+        ]
+    }
+    if not dic.get("software"):
+        return {}
+
+    return dic

Document

    Document(*args, **kwargs)

The Document class is a representation of a document that contains information about a publication, newspaper, item, digitisation, and ingest. This class holds all the relevant information about a document in a structured manner and provides properties that can be used to access different aspects of the document.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `collection` | `str \| None` | A string that represents the collection of the publication |
| `root` | `ET.Element \| None` | An XML element that represents the root of the publication |
| `zip_file` | `str \| None` | A path to a valid zip file |
| `jisc_papers` | `pd.DataFrame \| None` | A pandas DataFrame object that holds information about the JISC papers |
| `meta` | `dotdict \| None` | TODO |

Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(self, *args, **kwargs):
+    """Constructor method."""
+
+    self.collection: str | None = kwargs.get("collection")
+    if not self.collection or not isinstance(self.collection, str):
+        raise RuntimeError("A valid collection must be passed")
+
+    self.root: ET.Element | None = kwargs.get("root")
+    if not self.root or not isinstance(self.root, ET.Element):
+        raise RuntimeError("A valid XML root must be passed")
+
+    self.zip_file: str | None = kwargs.get("zip_file")
+    if self.zip_file and not isinstance(self.zip_file, str):
+        raise RuntimeError("A valid zip file must be passed")
+
+    self.jisc_papers: pd.DataFrame | None = kwargs.get("jisc_papers")
+    if not isinstance(self.jisc_papers, pd.DataFrame):
+        raise RuntimeError(
+            "A valid DataFrame containing JISC papers must be passed"
+        )
+
+    self.meta: dotdict | None = kwargs.get("meta")
+
+    self._publication_elem = None
+    self._input_sub_path = None
+    self._ingest = None
+    self._digitisation = None
+    self._item = None
+    self._issue = None
+    self._newspaper = None
+    self._data_provider = None
+
publication (property)

    publication: ET.Element

This property returns an ElementTree object representing the publication information in the XML document.

Ingest

    Ingest(root: ET.Element, collection: str = '')

Bases: Cache

The Ingest class extends the Cache class and represents a newspaper ingest. The class has several properties and methods that allow the creation of an ingest object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `collection` | `str` | A string that represents the collection of the publication |

Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(self, root: ET.Element, collection: str = ""):
+    """Constructor method."""
+
+    if not isinstance(root, ET.Element):
+        raise RuntimeError(f"Expected root to be xml.etree.Element: {type(root)}")
+
+    self.root: ET.Element = root
+    self.collection: str = collection
+
+
kind (class-attribute, instance-attribute)

    kind = 'ingest'

A string that represents the type of the object, set to "ingest".

as_dict

    as_dict() -> dict

A method that returns a dictionary representation of the ingest object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Ingest object |
+ Source code in alto2txt2fixture/router.py +
def as_dict(self) -> dict:
+    """
+    A method that returns a dictionary representation of the ingest
+    object.
+
+    Returns:
+        Dictionary representation of the Ingest object
+    """
+    return {
+        f"lwm_tool_{x.tag}": x.text or ""
+        for x in self.root.findall("./process/lwm_tool/*")
+    }

Issue

    Issue(
        publication: ET.Element,
        newspaper: Optional[Newspaper] = None,
        collection: str = "",
        input_sub_path: str = "",
        meta: dotdict = dotdict(),
    )

Bases: Cache

The Issue class extends the Cache class and represents a newspaper issue. The class has several properties and methods that allow the creation of an issue object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | | An xml element that represents the root of the publication |
| `newspaper` | `Newspaper \| None` | The parent newspaper |
| `collection` | `str` | A string that represents the collection of the publication |
| `input_sub_path` | `str` | TODO |
| `meta` | `dotdict` | TODO |

Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(
+    self,
+    publication: ET.Element,
+    newspaper: Optional[Newspaper] = None,
+    collection: str = "",
+    input_sub_path: str = "",
+    meta: dotdict = dotdict(),
+):
+    """Constructor method."""
+
+    self.publication: ET.Element = publication
+    self.newspaper: Newspaper | None = newspaper
+    self.collection: str = collection
+    self.input_sub_path: str = input_sub_path
+    self.meta: dotdict = meta
+
+    self._issue = None
+    self._issue_date = None
+
+    path: str = str(self.get_cache_path())
+    if not self.meta.issue_paths:
+        self.meta.issue_paths = [path]
+    elif path not in self.meta.issue_paths:
+        self.meta.issue_paths.append(path)
+
+
issue_code (property)

    issue_code: str

Sets up and saves the issue code for easy access as property.

issue_date (property)

    issue_date: str

Sets up and saves the issue date for easy access as property.

kind (class-attribute, instance-attribute)

    kind = 'issue'

A string that represents the type of the object, set to "issue".

as_dict

    as_dict() -> dict

A method that returns a dictionary representation of the issue object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Issue object |
+ Source code in alto2txt2fixture/router.py +
def as_dict(self) -> dict:
+    """
+    A method that returns a dictionary representation of the issue
+    object.
+
+    Returns:
+        Dictionary representation of the Issue object
+    """
+
+    if not self._issue:
+        self._issue = dict(
+            issue_code=self.issue_code,
+            issue_date=self.issue_date,
+            publication__publication_code=self.newspaper.publication_code,
+            input_sub_path=self.input_sub_path,
+        )
+
+    return self._issue
+
+
get_cache_path

    get_cache_path() -> Path

Returns the path to the cache file for the issue object.

Returns:

| Type | Description |
| ---- | ----------- |
| `Path` | Path to the cache file for the issue object |
+ Source code in alto2txt2fixture/router.py +
def get_cache_path(self) -> Path:
+    """
+    Returns the path to the cache file for the issue object.
+
+    Returns:
+        Path to the cache file for the issue object
+    """
+
+    json_file = f"/{self.newspaper.publication_code}/issues/{self.issue_code}.json"
+
+    return Path(
+        f"{CACHE_HOME}/{self.collection}/"
+        + "/".join(self.newspaper.number_paths)
+        + json_file
+    )

Item

    Item(
        root: ET.Element,
        issue_code: str = "",
        digitisation: dict = {},
        ingest: dict = {},
        collection: str = "",
        newspaper: Optional[Newspaper] = None,
        meta: dotdict = dotdict(),
    )

Bases: Cache

The Item class extends the Cache class and represents a newspaper item, i.e. an article. The class has several properties and methods that allow the creation of an article object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `issue_code` | `str` | A string that represents the issue code |
| `digitisation` | `dict` | TODO |
| `ingest` | `dict` | TODO |
| `collection` | `str` | A string that represents the collection of the publication |
| `newspaper` | `Newspaper \| None` | The parent newspaper |
| `meta` | `dotdict` | TODO |

Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(
+    self,
+    root: ET.Element,
+    issue_code: str = "",
+    digitisation: dict = {},
+    ingest: dict = {},
+    collection: str = "",
+    newspaper: Optional[Newspaper] = None,
+    meta: dotdict = dotdict(),
+):
+    """Constructor method."""
+
+    if not isinstance(root, ET.Element):
+        raise RuntimeError(f"Expected root to be xml.etree.Element: {type(root)}")
+
+    if not isinstance(newspaper, Newspaper):
+        raise RuntimeError("Expected newspaper to be of type router.Newspaper")
+
+    self.root: ET.Element = root
+    self.issue_code: str = issue_code
+    self.digitisation: dict = digitisation
+    self.ingest: dict = ingest
+    self.collection: str = collection
+    self.newspaper: Newspaper | None = newspaper
+    self.meta: dotdict = meta
+
+    self._item_elem = None
+    self._item_code = None
+    self._item = None
+
+    path: str = str(self.get_cache_path())
+    if not self.meta.item_paths:
+        self.meta.item_paths = [path]
+    elif path not in self.meta.item_paths:
+        self.meta.item_paths.append(path)
+
+
item_code (property)

    item_code: str

Sets up and saves the item code for easy access as property.

item_elem (property)

    item_elem

Sets up and saves the issue XML item for easy access as a property.

kind (class-attribute, instance-attribute)

    kind = 'item'

A string that represents the type of the object, set to "item".

as_dict

    as_dict() -> dict

A method that returns a dictionary representation of the item object (i.e. article).

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Item object |
+ Source code in alto2txt2fixture/router.py +
def as_dict(self) -> dict:
+    """
+    A method that returns a dictionary representation of the item object
+    (i.e. article).
+
+    Returns:
+        Dictionary representation of the Item object
+    """
+
+    if not self._item:
+        self._item = {
+            f"{x.tag}": x.text or ""
+            for x in self.item_elem.findall("*")
+            if x.tag
+            in [
+                "title",
+                "word_count",
+                "ocr_quality_mean",
+                "ocr_quality_sd",
+                "plain_text_file",
+                "item_type",
+            ]
+        }
+
+        self._item["title"] = self._item.get("title", "")[:2097151]
+
+        self._item = {
+            "item_code": self.item_code,
+            "word_count": self._item.get("word_count", 0),
+            "title": self._item.get("title"),
+            "item_type": self._item.get("item_type"),
+            "input_filename": self._item.get("plain_text_file", ""),
+            "ocr_quality_mean": self._item.get("ocr_quality_mean", 0),
+            "ocr_quality_sd": self._item.get("ocr_quality_sd", 0),
+            "digitisation__software": self.digitisation.id,
+            "ingest__lwm_tool_identifier": self.ingest.id,
+            "issue__issue_identifier": self.issue_code,
+            "data_provider__name": self.collection,
+        }
+
+    return self._item
+
+
+
get_cache_path

    get_cache_path() -> Path

Returns the path to the cache file for the item (article) object.

Returns:

| Type | Description |
| ---- | ----------- |
| `Path` | Path to the cache file for the article object |
+ Source code in alto2txt2fixture/router.py +
def get_cache_path(self) -> Path:
+    """
+    Returns the path to the cache file for the item (article) object.
+
+    Returns:
+        Path to the cache file for the article object
+    """
+    return Path(
+        f"{CACHE_HOME}/{self.collection}/"
+        + "/".join(self.newspaper.number_paths)
+        + f"/{self.newspaper.publication_code}/items.jsonl"
+    )
+
+
write_to_cache

    write_to_cache(json_indent = JSON_INDENT) -> None

Special cache-write function that appends rather than writes at the end of the process.

Returns:

| Type | Description |
| ---- | ----------- |
| `None` | None. |
+ Source code in alto2txt2fixture/router.py +
def write_to_cache(self, json_indent=JSON_INDENT) -> None:
+    """
+    Special cache-write function that appends rather than writes at the
+    end of the process.
+
+    Returns:
+        None.
+    """
+    path = self.get_cache_path()
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(path, "a+") as f:
+        f.write(json.dumps(self.as_dict(), indent=json_indent) + "\n")
+
+    return

Newspaper

    Newspaper(
        root: ET.Element,
        collection: str = "",
        meta: dotdict = dotdict(),
        jisc_papers: Optional[pd.DataFrame] = None,
    )

Bases: Cache

The Newspaper class extends the Cache class and represents a newspaper.

The class has several properties and methods that allow the creation of a newspaper object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | | An xml element that represents the root of the publication. |
| `collection` | | A string that represents the collection of the publication. |
| `meta` | | A dotdict object that holds metadata about the publication. |
| `jisc_papers` | | A pandas DataFrame object for JISC paper information. |

Constructor method.
+ Source code in alto2txt2fixture/router.py +
def __init__(
+    self,
+    root: ET.Element,
+    collection: str = "",
+    meta: dotdict = dotdict(),
+    jisc_papers: Optional[pd.DataFrame] = None,
+):
+    """Constructor method."""
+
+    if not isinstance(root, ET.Element):
+        raise RuntimeError(f"Expected root to be xml.etree.Element: {type(root)}")
+
+    self.publication = root.find("./publication")
+    self.input_sub_path = root.find("./process/input_sub_path").text
+    self.issue_date = self.publication.find("./issue/date").text
+    self.collection = collection
+    self.meta = meta
+    self.jisc_papers = jisc_papers
+
+    self._newspaper = None
+    self._title = None
+    self._publication_code = None
+
+    path = str(self.get_cache_path())
+    if not self.meta.newspaper_paths:
+        self.meta.newspaper_paths = []
+    elif path not in self.meta.newspaper_paths:
+        self.meta.newspaper_paths.append(path)
+
+    if not self.meta.publication_codes:
+        self.meta.publication_codes = [self.publication_code]
+    elif self.publication_code not in self.meta.publication_codes:
+        self.meta.publication_codes.append(self.publication_code)
+
+    self.zip_file = Path(meta.path).name
+
+
+ + + +
kind (class-attribute, instance-attribute)

    kind = 'newspaper'

A string that represents the type of the object, set to "newspaper".

number_paths (property)

    number_paths: list

Returns the nested directories in which we want to save the cache file.

Returns:

| Type | Description |
| ---- | ----------- |
| `list` | List of the desired directories in descending order |

publication_code (property)

    publication_code: str

A property that returns the code of the publication.

Returns:

| Type | Description |
| ---- | ----------- |
| `str` | The code of the publication |

title (property)

    title: str

A property that returns the title of the newspaper.

Returns:

| Type | Description |
| ---- | ----------- |
| `str` | The title of the newspaper |

as_dict

    as_dict() -> dict

A method that returns a dictionary representation of the newspaper object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Newspaper object |
+ Source code in alto2txt2fixture/router.py +
def as_dict(self) -> dict:
+    """
+    A method that returns a dictionary representation of the newspaper
+    object.
+
+    Returns:
+        Dictionary representation of the Newspaper object
+    """
+
+    if not self._newspaper:
+        self._newspaper = dict(
+            **dict(publication_code=self.publication_code, title=self.title),
+            **{
+                x.tag: x.text or ""
+                for x in self.publication.findall("*")
+                if x.tag in ["location"]
+            },
+        )
+    return self._newspaper
+
+
+
get_cache_path

    get_cache_path() -> Path

Returns the path to the cache file for the newspaper object.

Returns:

| Type | Description |
| ---- | ----------- |
| `Path` | Path to the cache file for the newspaper object |
+ Source code in alto2txt2fixture/router.py +
def get_cache_path(self) -> Path:
+    """
+    Returns the path to the cache file for the newspaper object.
+
+    Returns:
+        Path to the cache file for the newspaper object
+    """
+    json_file = f"/{self.publication_code}/{self.publication_code}.json"
+
+    return Path(
+        f"{CACHE_HOME}/{self.collection}/" + "/".join(self.number_paths) + json_file
+    )
+
+
+
publication_code_from_input_sub_path

    publication_code_from_input_sub_path() -> str | None

A method that returns the publication code from the input sub-path of the publication process.

Returns:

| Type | Description |
| ---- | ----------- |
| `str \| None` | The code of the publication |
+ Source code in alto2txt2fixture/router.py +
def publication_code_from_input_sub_path(self) -> str | None:
+    """
+    A method that returns the publication code from the input sub-path of
+    the publication process.
+
+    Returns:
+        The code of the publication
+    """
+
+    g = PUBLICATION_CODE.findall(self.input_sub_path)
+    if len(g) == 1:
+        return g[0]
+    return None
+

route

    route(
        collections: list,
        cache_home: str,
        mountpoint: str,
        jisc_papers_path: str,
        report_dir: str,
    ) -> None

This function is responsible for setting up the path for the alto2txt mountpoint, setting up the JISC papers and routing the collections for processing.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `collections` | `list` | List of collection names | required |
| `cache_home` | `str` | Directory path for the cache | required |
| `mountpoint` | `str` | Directory path for the alto2txt mountpoint | required |
| `jisc_papers_path` | `str` | Path to the JISC papers | required |
| `report_dir` | `str` | Path to the report directory | required |

Returns:

| Type | Description |
| ---- | ----------- |
| `None` | None |
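A sketch of a typical call (every path below is an illustrative placeholder):

```python
from alto2txt2fixture.router import route

route(
    collections=["hmd"],
    cache_home="./cache",
    mountpoint="./mounts",
    jisc_papers_path="./JISC_papers.csv",
    report_dir="./reports",
)
```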
+ Source code in alto2txt2fixture/router.py +
def route(
+    collections: list,
+    cache_home: str,
+    mountpoint: str,
+    jisc_papers_path: str,
+    report_dir: str,
+) -> None:
+    """
+    This function is responsible for setting up the path for the alto2txt
+    mountpoint, setting up the JISC papers and routing the collections for
+    processing.
+
+    Args:
+        collections: List of collection names
+        cache_home: Directory path for the cache
+        mountpoint: Directory path for the alto2txt mountpoint
+        jisc_papers_path: Path to the JISC papers
+        report_dir: Path to the report directory
+
+    Returns:
+        None
+    """
+
+    global CACHE_HOME
+    global MNT
+    global REPORT_DIR
+
+    CACHE_HOME = cache_home
+    REPORT_DIR = report_dir
+
+    MNT = Path(mountpoint) if isinstance(mountpoint, str) else mountpoint
+    if not MNT.exists():
+        error(
+            f"The mountpoint provided for alto2txt does not exist. "
+            f"Either create a local copy or blobfuse it to "
+            f"`{MNT.absolute()}`."
+        )
+
+    jisc_papers = setup_jisc_papers(path=jisc_papers_path)
+
+    for collection_name in collections:
+        collection = Collection(name=collection_name, jisc_papers=jisc_papers)
+
+        if collection.empty:
+            error(
+                f"It looks like {collection_name} is empty in the "
+                f"alto2txt mountpoint: `{collection.dir.absolute()}`."
+            )
+
+        for archive in collection.archives:
+            with archive as _:
+                [
+                    (
+                        doc.item.write_to_cache(),
+                        doc.newspaper.write_to_cache(),
+                        doc.issue.write_to_cache(),
+                        doc.data_provider.write_to_cache(),
+                        doc.ingest.write_to_cache(),
+                        doc.digitisation.write_to_cache(),
+                    )
+                    for doc in archive.documents
+                ]
+
+    return
\ No newline at end of file
diff --git a/reference/alto2txt2fixture/settings.html b/reference/alto2txt2fixture/settings.html
new file mode 100644
index 0000000..19d1f64
--- /dev/null
+++ b/reference/alto2txt2fixture/settings.html

settings

The settings module provides configuration for running alto2txt2fixture.

Most of these are managed within the settings variable within this module.

Note: See the command line interface parameters documentation for means of modifying settings when run.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `JSON_INDENT` | | Amount of indentation to include in output JSON files |
| `DATA_PROVIDER_INDEX` | `Final[str]` | The field used to index DataProvider records |
| `NEWSPAPER_COLLECTION_METADATA` | `Final[list[FixtureDict]]` | A list of FixtureDicts specifying specific newspaper data providers |
| `SETUP_TITLE` | `str` | the title printed at the command line via the cli.show_setup() function |
| `settings` | `dotdict` | a dotdict configuration for running the newspaper portions of alto2txt2fixture |
\ No newline at end of file
diff --git a/reference/alto2txt2fixture/types.html b/reference/alto2txt2fixture/types.html
new file mode 100644
index 0000000..e905bbe
--- /dev/null
+++ b/reference/alto2txt2fixture/types.html

types

FixtureDict

Bases: TypedDict

A dict structure to ease use as a json database fixture.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `pk` | `int` | an id to uniquely define and query each entry |
| `model` | `str` | what model a given record is for |
| `fields` | `dict[str, Any]` | a dict of record information conforming to model table |
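A sketch of a conforming literal (field values are illustrative):

```python
from alto2txt2fixture.types import FixtureDict

record: FixtureDict = {
    "pk": 1,
    "model": "newspapers.dataprovider",
    "fields": {"name": "Example Provider", "code": "example-provider"},
}
```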
TranslatorTuple

Bases: NamedTuple

A named tuple of fields for translation.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `start` | `str` | A string representing the starting field name. |
| `finish` | `str \| list` | A string or list specifying the field(s) to be translated. If it is a string, the translated field will be a direct mapping of the specified field in each item of the input list. If it is a list, the translated field will be a hyphen-separated concatenation of the specified fields in each item of the input list. |
| `lst` | `list[dict]` | A list of dictionaries representing the items to be translated. Each dictionary should contain the necessary fields for translation, with the field names specified in the start parameter. |
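A construction sketch mirroring the translator set up in parse() above (`newspaper_json` stands in for a list of newspaper fixture dicts):

```python
from alto2txt2fixture.types import TranslatorTuple

translator = TranslatorTuple(
    start="publication__publication_code",
    finish="publication_code",
    lst=newspaper_json,  # e.g. the newspaper fixtures generated by parse()
)
```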
dotdict

Bases: dict

dot.notation access to dictionary attributes.
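A minimal sketch of the usual implementation of this idiom (the library's exact class body may differ):

```python
class dotdict(dict):
    """dot.notation access to dictionary attributes."""

    __getattr__ = dict.get          # d.key  ->  d.get("key")
    __setattr__ = dict.__setitem__  # d.key = v  ->  d["key"] = v
    __delattr__ = dict.__delitem__  # del d.key  ->  del d["key"]

meta = dotdict(path="archive.zip", size="1.2 MB")
assert meta.path == "archive.zip"
assert meta.missing is None  # unknown keys fall back to dict.get's None
```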
+ + + + + + + + + + + +
\ No newline at end of file
diff --git a/reference/alto2txt2fixture/utils.html b/reference/alto2txt2fixture/utils.html
new file mode 100644
index 0000000..f158b6d
--- /dev/null
+++ b/reference/alto2txt2fixture/utils.html

utils


check_newspaper_collection_configuration

    check_newspaper_collection_configuration(
        collections: Iterable[str] = settings.COLLECTIONS,
        newspaper_collections: Iterable[
            FixtureDict
        ] = NEWSPAPER_COLLECTION_METADATA,
        data_provider_index: str = DATA_PROVIDER_INDEX,
    ) -> set[str]

Check the names in collections match the names in newspaper_collections.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `collections` | `Iterable[str]` | Names of newspaper collections, defaults to settings.COLLECTIONS | `settings.COLLECTIONS` |
| `newspaper_collections` | `Iterable[FixtureDict]` | Newspaper collections in a list of FixtureDict format. Defaults to settings.FIXTURE_TABLE['dataprovider'] | `NEWSPAPER_COLLECTION_METADATA` |
| `data_provider_index` | `str` | dict fields key used to check matching collections name | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| ---- | ----------- |
| `set[str]` | A set of collections without a matching newspaper_collections record. |
+ Example +
>>> check_newspaper_collection_configuration()
+set()
+
+
+
+ Source code in alto2txt2fixture/utils.py +
def check_newspaper_collection_configuration(
+    collections: Iterable[str] = settings.COLLECTIONS,
+    newspaper_collections: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,
+    data_provider_index: str = DATA_PROVIDER_INDEX,
+) -> set[str]:
+    """Check the names in `collections` match the names in `newspaper_collections`.
+
+    Arguments:
+        collections:
+            Names of newspaper collections, defaults to ``settings.COLLECTIONS``
+        newspaper_collections:
+            Newspaper collections in a list of `FixtureDict` format. Defaults
+                to ``settings.FIXTURE_TABLE['dataprovider']``
+        data_provider_index:
+            `dict` `fields` `key` used to check matching `collections` name
+
+    Returns:
+        A set of ``collections`` without a matching `newspaper_collections` record.
+
+    Example:
+        ```pycon
+        >>> check_newspaper_collection_configuration()
+        set()
+
+        ```
+
+    """
+    newspaper_collection_names: tuple[str, ...] = tuple(
+        dict_from_list_fixture_fields(
+            newspaper_collections, field_name=data_provider_index
+        ).keys()
+    )
+    collection_diff: set[str] = set(collections) - set(newspaper_collection_names)
+    if collection_diff:
+        warning(
+            f"{len(collection_diff)} `collections` "
+            f"not in `newspaper_collections`: {collection_diff}"
+        )
+    return collection_diff
+

clear_cache

    clear_cache(dir: str | Path) -> None

Clears the cache directory by removing all .json files in it.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `dir` | `str \| Path` | The path of the directory to be cleared. | required |
+ Source code in alto2txt2fixture/utils.py +
def clear_cache(dir: str | Path) -> None:
+    """
+    Clears the cache directory by removing all `.json` files in it.
+
+    Args:
+        dir: The path of the directory to be cleared.
+    """
+
+    dir = get_path_from(dir)
+
+    y = input(
+        f"Do you want to erase the cache path now that the "
+        f"files have been generated ({dir.absolute()})? [y/N]"
+    )
+
+    if y.lower() == "y":
+        info("Clearing up the cache directory")
+        for x in dir.glob("*.json"):
+            x.unlink()
+

create_lookup

    create_lookup(lst: list = [], on: list = []) -> dict

Create a lookup dictionary from a list of dictionaries.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `lst` | `list` | A list of dictionaries that should be used to generate the lookup. | `[]` |
| `on` | `list` | A list of keys from the dictionaries in the list that should be used as the keys in the lookup. | `[]` |

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | The generated lookup dictionary. |
+ Source code in alto2txt2fixture/utils.py +
def create_lookup(lst: list = [], on: list = []) -> dict:
+    """
+    Create a lookup dictionary from a list of dictionaries.
+
+    Args:
+        lst: A list of dictionaries that should be used to generate the lookup.
+        on: A list of keys from the dictionaries in the list that should be used as the keys in the lookup.
+
+    Returns:
+        The generated lookup dictionary.
+    """
+    return {get_key(x, on): x["pk"] for x in lst}
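An illustrative call, assuming `get_key(x, on)` (defined elsewhere in this module) joins the values of the `on` fields into a single key:

```python
records = [
    {"pk": 1, "publication_code": "0002083"},
    {"pk": 2, "publication_code": "0002194"},
]
lookup = create_lookup(records, on=["publication_code"])
# e.g. {"0002083": 1, "0002194": 2}
```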

dict_from_list_fixture_fields

    dict_from_list_fixture_fields(
        fixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,
        field_name: str = DATA_PROVIDER_INDEX,
    ) -> dict[str, FixtureDict]

Create a dict from fixture_list with field_name as key.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `fixture_list` | `Iterable[FixtureDict]` | list of FixtureDict with field_name key fields. | `NEWSPAPER_COLLECTION_METADATA` |
| `field_name` | `str` | key for values within fixture_list fields. | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| ---- | ----------- |
| `dict[str, FixtureDict]` | A dict where extracted field_name is key for related FixtureDict values. |
+ Example +
>>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()
+>>> fixture_dict['hmd']['pk']
+2
+>>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]
+'hmd'
+>>> fixture_dict['hmd']['fields']['code']
+'bl-hmd'

Source code in `alto2txt2fixture/utils.py`:
def dict_from_list_fixture_fields(
+    fixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,
+    field_name: str = DATA_PROVIDER_INDEX,
+) -> dict[str, FixtureDict]:
+    """Create a `dict` from ``fixture_list`` with ``attr_name`` as `key`.
+
+    Args:
+        fixture_list: `list` of `FixtureDict` whose `fields` include the ``field_name`` key.
+        field_name: key for values within ``fixture_list`` `fields`.
+
+    Returns:
+        A `dict` where extracted `field_name` is key for related `FixtureDict` values.
+
+    Example:
+        ```pycon
+        >>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()
+        >>> fixture_dict['hmd']['pk']
+        2
+        >>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]
+        'hmd'
+        >>> fixture_dict['hmd']['fields']['code']
+        'bl-hmd'
+
+        ```
+    """
+    return {record["fields"][field_name]: record for record in fixture_list}

export_fixtures

    export_fixtures(
        fixture_tables: dict[str, Sequence[FixtureDict]],
        path: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,
        prefix: str = "test-",
        add_created: bool = True,
        formats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,
    ) -> None

Export `fixture_tables` in `formats`.

+ +
+ Note +

This is still in experimental phase of development and not recommended +for production.

+
+ + +

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `fixture_tables` | `dict[str, Sequence[FixtureDict]]` | `dict` of table name (eg: `dataprovider`) and `FixtureDict` | required |
| `path` | `str \| PathLike` | Path to save exports in | `settings.FIXTURE_TABLES_OUTPUT` |
| `prefix` | `str` | `str` to prefix export filenames with | `'test-'` |
| `add_created` | `bool` | Whether to add `created_at` and `updated_at` timestamps | `True` |
| `formats` | `Sequence[EXPORT_FORMATS]` | list of `EXPORT_FORMATS` to export | `settings.FIXTURE_TABLES_FORMATS` |

Example:
>>> test_fixture_tables: dict[str, FixtureDict] = {
+...     'test0': NEWSPAPER_COLLECTION_METADATA,
+...     'test1': NEWSPAPER_COLLECTION_METADATA}
+>>> export_fixtures(test_fixture_tables, path='tests/')
+...     # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+<BLANKLINE>
+...Warning: Saving test0...
+...Warning: Saving test1...
+>>> from pandas import read_csv
+>>> fixture0_json = load_json('tests/test-test0-1.json')
+>>> fixture0_df = read_csv('tests/test-test0-1.csv')
+>>> fixture1_json = load_json('tests/test-test1-1.json')
+>>> fixture1_df = read_csv('tests/test-test1-1.csv')
+>>> fixture0_json == fixture1_json
+True
+>>> all(fixture0_df == fixture1_df)
+True
+>>> all(field in fixture0_json[0]['fields']
+...     for field in ['created_at', 'updated_at'])
+True
+>>> fixture0_json[1]['pk']
+2
+>>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]
+'hmd'
+>>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()
+[2, 'hmd']

Source code in `alto2txt2fixture/utils.py`:
def export_fixtures(
+    fixture_tables: dict[str, Sequence[FixtureDict]],
+    path: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,
+    prefix: str = "test-",
+    add_created: bool = True,
+    formats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,
+) -> None:
+    """Export ``fixture_tables`` in ``formats``.
+
+    Note:
+        This is still in an experimental phase of development and not recommended
+        for production.
+
+    Args:
+        fixture_tables: `dict` of table name (eg: `dataprovider`) and `FixtureDict`
+        path: Path to save exports in
+        prefix: `str` to prefix export filenames with
+        add_created: Whether to add `created_at` and `updated_at` timestamps
+        formats: list of `EXPORT_FORMATS` to export
+
+    Example:
+        ```pycon
+        >>> test_fixture_tables: dict[str, FixtureDict] = {
+        ...     'test0': NEWSPAPER_COLLECTION_METADATA,
+        ...     'test1': NEWSPAPER_COLLECTION_METADATA}
+        >>> export_fixtures(test_fixture_tables, path='tests/')
+        ...     # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+        <BLANKLINE>
+        ...Warning: Saving test0...
+        ...Warning: Saving test1...
+        >>> from pandas import read_csv
+        >>> fixture0_json = load_json('tests/test-test0-1.json')
+        >>> fixture0_df = read_csv('tests/test-test0-1.csv')
+        >>> fixture1_json = load_json('tests/test-test1-1.json')
+        >>> fixture1_df = read_csv('tests/test-test1-1.csv')
+        >>> fixture0_json == fixture1_json
+        True
+        >>> all(fixture0_df == fixture1_df)
+        True
+        >>> all(field in fixture0_json[0]['fields']
+        ...     for field in ['created_at', 'updated_at'])
+        True
+        >>> fixture0_json[1]['pk']
+        2
+        >>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]
+        'hmd'
+        >>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()
+        [2, 'hmd']
+
+        ```
+    """
+    for table_name, records in fixture_tables.items():
+        warning(
+            f"Saving {table_name} fixture in {formats} formats "
+            f"to {path} *without* checks..."
+        )
+        if "json" in formats:
+            save_fixture(
+                records,
+                prefix=f"{prefix}{table_name}",
+                output_path=path,
+                add_created=add_created,
+            )
+        if "csv" in formats:
+            fixtures_dict2csv(records, prefix=f"{prefix}{table_name}", output_path=path)

filter_json_fields

    filter_json_fields(
        json_results: list | dict | None = None,
        file_path: PathLike | None = None,
        fields: Sequence[str] = [],
        value: Hashable = "",
        **kwargs: Hashable
    ) -> dict | list

Return keys and values from `json_results` where any `fields` equal `value`.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `json_results` | `list \| dict \| None` | Optional records to filter; if `None`, records are loaded from `file_path` | `None` |
| `file_path` | `PathLike \| None` | The file `path` to load based on extension and filter | `None` |
| `fields` | `Sequence[str]` | Which fields to check equal `value` | `[]` |
| `value` | `Hashable` | Value to filter by | `''` |

Returns:

| Type | Description |
|------|-------------|
| `dict \| list` | A `dict` of records indexed by `pk` which fit filter criteria |

Raises:

| Type | Description |
|------|-------------|
| `ValueError` | `file_path` must have a `.json` `suffix` |

Example:
>>> from pprint import pprint
+>>> entry_fixture: dict = [
+...     {"pk": 4889, "model": "mitchells.entry",
+...      "fields": {"title": "BIRMINGHAM POST .",
+...                 "price_raw": ['2d'],
+...                 "year": 1920,
+...                 "date_established_raw": "1857",
+...                 "persons": [], "newspaper": ""}},
+...      {"pk": 9207, "model": "mitchells.entry",
+...       "fields": {"title": "ULVERSTONE ADVERTISER .",
+...                  "price_raw": ['2 ½ d', '3 ½ d'],
+...                  "year": 1856,
+...                  "date_established_raw": "1848",
+...                  "persons": ['Stephen Soulby'],
+...                  "newspaper": "",}},
+...     {"pk": 15, "model": "mitchells.entry",
+...      "fields": {"title": "LLOYD'S WEEKLY LONDON NEWSPAPER .",
+...                 "price_raw": ['2d', '3d'],
+...                 "year": 1857,
+...                 "date_established_raw": "November , 1842",
+...                 "persons": ['Mr. Douglas Jerrold', 'Edward Lloyd'],
+...                 "newspaper": 1187}}
+...     ]
+>>> pprint(filter_json_fields(entry_fixture,
+...                           fields=("newspaper", "persons"),
+...                           value=""))
+[{'fields': {'date_established_raw': '1857',
+             'newspaper': '',
+             'persons': [],
+             'price_raw': ['2d'],
+             'title': 'BIRMINGHAM POST .',
+             'year': 1920},
+  'model': 'mitchells.entry',
+  'pk': 4889},
+ {'fields': {'date_established_raw': '1848',
+             'newspaper': '',
+             'persons': ['Stephen Soulby'],
+             'price_raw': ['2 ½ d', '3 ½ d'],
+             'title': 'ULVERSTONE ADVERTISER .',
+             'year': 1856},
+  'model': 'mitchells.entry',
+  'pk': 9207}]
+

Source code in `alto2txt2fixture/utils.py`:
def filter_json_fields(
+    json_results: list | dict | None = None,
+    file_path: PathLike | None = None,
+    fields: Sequence[str] = [],
+    value: Hashable = "",
+    **kwargs,
+) -> dict | list:
+    """Return `keys` and `values` from `json_dict` where any `fields` equal `value`.
+
+    Args:
+        json_results: Optional `list` or `dict` of records to filter;
+            if `None`, loaded from ``file_path``
+        file_path: The file `path` to load based on extension and filter
+        fields: Which fields to check equal `value`
+        value: Value to filter by
+
+    Returns:
+        A `dict` of records indexed by `pk` which fit filter criteria
+
+    Raises:
+        ValueError: ``file_path`` must have a `.json` `suffix`
+
+    Example:
+        ```pycon
+        >>> from pprint import pprint
+        >>> entry_fixture: dict = [
+        ...     {"pk": 4889, "model": "mitchells.entry",
+        ...      "fields": {"title": "BIRMINGHAM POST .",
+        ...                 "price_raw": ['2d'],
+        ...                 "year": 1920,
+        ...                 "date_established_raw": "1857",
+        ...                 "persons": [], "newspaper": ""}},
+        ...      {"pk": 9207, "model": "mitchells.entry",
+        ...       "fields": {"title": "ULVERSTONE ADVERTISER .",
+        ...                  "price_raw": ['2 \u00bd d', '3 \u00bd d'],
+        ...                  "year": 1856,
+        ...                  "date_established_raw": "1848",
+        ...                  "persons": ['Stephen Soulby'],
+        ...                  "newspaper": "",}},
+        ...     {"pk": 15, "model": "mitchells.entry",
+        ...      "fields": {"title": "LLOYD'S WEEKLY LONDON NEWSPAPER .",
+        ...                 "price_raw": ['2d', '3d'],
+        ...                 "year": 1857,
+        ...                 "date_established_raw": "November , 1842",
+        ...                 "persons": ['Mr. Douglas Jerrold', 'Edward Lloyd'],
+        ...                 "newspaper": 1187}}
+        ...     ]
+        >>> pprint(filter_json_fields(entry_fixture,
+        ...                           fields=("newspaper", "persons"),
+        ...                           value=""))
+        [{'fields': {'date_established_raw': '1857',
+                     'newspaper': '',
+                     'persons': [],
+                     'price_raw': ['2d'],
+                     'title': 'BIRMINGHAM POST .',
+                     'year': 1920},
+          'model': 'mitchells.entry',
+          'pk': 4889},
+         {'fields': {'date_established_raw': '1848',
+                     'newspaper': '',
+                     'persons': ['Stephen Soulby'],
+                     'price_raw': ['2 \u00bd d', '3 \u00bd d'],
+                     'title': 'ULVERSTONE ADVERTISER .',
+                     'year': 1856},
+          'model': 'mitchells.entry',
+          'pk': 9207}]
+
+        ```
+    """
+    if not json_results:
+        assert file_path
+        try:
+            assert Path(file_path).suffix == ".json"
+        except AssertionError:
+            raise ValueError(f"{file_path} must be `json` format.")
+        json_results = load_json(Path(file_path), **kwargs)
+    assert json_results
+    if isinstance(json_results, dict):
+        return {
+            k: v
+            for k, v in json_results.items()
+            if any(v["fields"][field] == value for field in fields)
+        }
+    else:
+        return [
+            v
+            for v in json_results
+            if any(v["fields"][field] == value for field in fields)
+        ]

fixture_fields

    fixture_fields(
        fixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False
    ) -> tuple[str, ...] | dict[str, Any]

Generate a tuple of `FixtureDict` field names.

+ +
Note: This is not in the `utils` module to avoid a circular import.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `fixture_dict` | `FixtureDict` | A `FixtureDict` instance to extract names from `fields` | required |
| `include_pk` | `bool` | Whether to include the `pk` (primary key) column | `True` |
| `as_dict` | `bool` | Whether to return a `dict` of field names to values rather than a `tuple` of names | `False` |

Example:
>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])
+('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')
+>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)
+('name', 'code', 'legacy_code', 'collection', 'source_note')
+>>> hmd_dict: dict[str, Any] = fixture_fields(
+...     NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)
+>>> hmd_dict['code']
+'bl-hmd'
+>>> hmd_dict['pk']
+2
+>>> hmd_dict = fixture_fields(
+...     NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)
+>>> 'pk' in hmd_dict
+False
+

Source code in `alto2txt2fixture/utils.py`:
def fixture_fields(
+    fixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False
+) -> tuple[str, ...] | dict[str, Any]:
+    """Generate a tuple of `FixtureDict` `field` names.
+
+    Note:
+        This is not in the `utils` module to avoid a circular import.
+
+    Args:
+        fixture_dict: A `FixtureDict` instance to extract names from `fields`
+        include_pk: Whether to include the `pk` (primary key) column
+        as_dict: Whether to return a `dict` of field names to values
+            rather than a `tuple` of field names
+
+    Example:
+        ```pycon
+        >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])
+        ('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')
+        >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)
+        ('name', 'code', 'legacy_code', 'collection', 'source_note')
+        >>> hmd_dict: dict[str, Any] = fixture_fields(
+        ...     NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)
+        >>> hmd_dict['code']
+        'bl-hmd'
+        >>> hmd_dict['pk']
+        2
+        >>> hmd_dict = fixture_fields(
+        ...     NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)
+        >>> 'pk' in hmd_dict
+        False
+
+        ```
+    """
+    fields: OrderedDict[str, Any] = OrderedDict(fixture_dict["fields"])
+    if include_pk:
+        fields["pk"] = fixture_dict["pk"]
+        fields.move_to_end("pk", last=False)
+    if as_dict:
+        return fields
+    else:
+        return tuple(fields.keys())

fixture_or_default_dict

    fixture_or_default_dict(
        key: str,
        fixture_dict: dict[str, FixtureDict],
        default_dict: FixtureDict | dict = {},
    ) -> FixtureDict | dict

Return a `FixtureDict` from `fixture_dict` via `key` index, else `default_dict`.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `key` | `str` | a `str` to query `fixture_dict` with | required |
| `fixture_dict` | `dict[str, FixtureDict]` | a `dict` of `str` to `FixtureDict`, often generated by `dict_from_list_fixture_fields` | required |
| `default_dict` | `FixtureDict \| dict` | a `dict` to return if `key` is not in the `fixture_dict` index | `{}` |

Example:
>>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(
+...     NEWSPAPER_COLLECTION_METADATA)
+>>> hmd_dict: FixtureDict = fixture_or_default_dict(
+...     'hmd', newspaper_dict
+... )
+>>> fixture_or_default_dict(
+...     'hmd', NEWSPAPER_COLLECTION_METADATA
+... )
+{}
+>>> fixture_or_default_dict(
+...     'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}
+... )
+{'a': 'default'}
+

Source code in `alto2txt2fixture/utils.py`:
def fixture_or_default_dict(
+    key: str,
+    fixture_dict: dict[str, FixtureDict],
+    default_dict: FixtureDict | dict = {},
+) -> FixtureDict | dict:
+    """Return a `FixtureDict` from ``fixture_list`` via ``key`` index, else ``default_dict``.
+
+    Args:
+        key:
+            a `str` to query ``fixture_dict`` with
+        fixture_dict: a `dict` of `str` to `FixtureDict`, often generated by
+             ``dict_from_list_fixture_fields``
+        default_dict: a `dict` to return if ``key`` is not in
+            ``fixture_dict`` index
+
+    Example:
+        ```pycon
+        >>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(
+        ...     NEWSPAPER_COLLECTION_METADATA)
+        >>> hmd_dict: FixtureDict = fixture_or_default_dict(
+        ...     'hmd', newspaper_dict
+        ... )
+        >>> fixture_or_default_dict(
+        ...     'hmd', NEWSPAPER_COLLECTION_METADATA
+        ... )
+        {}
+        >>> fixture_or_default_dict(
+        ...     'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}
+        ... )
+        {'a': 'default'}
+
+        ```
+    """
+    if key in fixture_dict:
+        return fixture_dict[key]
+    else:
+        return default_dict

fixtures_dict2csv

    fixtures_dict2csv(
        fixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],
        prefix: str = "",
        output_path: PathLike | str = settings.OUTPUT,
        index: bool = False,
        max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,
    ) -> None

Saves fixtures generated by a generator to separate CSV files.


This function takes an `Iterable` or `Generator` of fixtures and saves them to
separate CSV files. The fixtures are saved in batches, where each batch
is determined by the `max_elements_per_file` parameter.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `fixtures` | `Iterable[FixtureDict] \| Generator[FixtureDict, None, None]` | An `Iterable` or `Generator` of the fixtures to be saved. | required |
| `prefix` | `str` | A string prefix to be added to the file names of the saved fixtures. | `''` |
| `output_path` | `PathLike \| str` | Path to folder fixtures are saved to | `settings.OUTPUT` |
| `index` | `bool` | Whether to include the `DataFrame` index column in the saved CSV files | `False` |
| `max_elements_per_file` | `int` | Maximum records saved in each file | `settings.MAX_ELEMENTS_PER_FILE` |

Returns:

| Type | Description |
|------|-------------|
| `None` | This function saves fixtures to files and does not return a value. |

Example:
>>> from pandas import read_csv
+>>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,
+...                   prefix='test', output_path='tests/')
+>>> imported_fixture = read_csv('tests/test-1.csv')
+>>> imported_fixture.iloc[1]['pk']
+2
+>>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]
+'hmd'
+

Source code in `alto2txt2fixture/utils.py`:
def fixtures_dict2csv(
+    fixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],
+    prefix: str = "",
+    output_path: PathLike | str = settings.OUTPUT,
+    index: bool = False,
+    max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,
+) -> None:
+    """Saves fixtures generated by a generator to separate separate `CSV` files.
+
+    This function takes an `Iterable` or `Generator` of fixtures and saves them to
+    separate `CSV` files. The fixtures are saved in batches, where each batch
+    is determined by the ``max_elements_per_file`` parameter.
+
+    Args:
+        fixtures: An `Iterable` or `Generator` of the fixtures to be saved.
+        prefix: A string prefix to be added to the file names of the
+            saved fixtures.
+        output_path: Path to folder fixtures are saved to
+        index: Whether to include the `DataFrame` index column in saved `CSV` files
+        max_elements_per_file: Maximum `JSON` records saved in each file
+
+    Returns:
+        This function saves fixtures to files and does not return a value.
+
+    Example:
+        ```pycon
+        >>> from pandas import read_csv
+        >>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,
+        ...                   prefix='test', output_path='tests/')
+        >>> imported_fixture = read_csv('tests/test-1.csv')
+        >>> imported_fixture.iloc[1]['pk']
+        2
+        >>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]
+        'hmd'
+
+        ```
+
+    """
+    internal_counter: int = 1
+    counter: int = 1
+    lst: list = []
+    Path(output_path).mkdir(parents=True, exist_ok=True)
+    for item in fixtures:
+        lst.append(fixture_fields(item, as_dict=True))
+        internal_counter += 1
+        if internal_counter > max_elements_per_file:
+            df: DataFrame = DataFrame.from_records(lst)
+            df.to_csv(Path(f"{output_path}/{prefix}-{counter}.csv"), index=index)
+            # Save up some memory
+            del lst
+            gc.collect()
+
+            # Re-instantiate
+            lst: list = []
+            internal_counter = 1
+            counter += 1
+    else:
+        df: DataFrame = DataFrame.from_records(lst)
+        df.to_csv(Path(f"{output_path}/{prefix}-{counter}.csv"), index=index)
+
+    return

gen_fixture_tables

    gen_fixture_tables(
        fixture_tables: dict[str, list[FixtureDict]] = {},
        include_fixture_pk_column: bool = True,
    ) -> Generator[Table, None, None]

Generator of `rich.Table` instances from `FixtureDict` configuration tables.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `fixture_tables` | `dict[str, list[FixtureDict]]` | `dict` where `key` is for `Table` title and `value` is a `FixtureDict` | `{}` |
| `include_fixture_pk_column` | `bool` | whether to include the `pk` field from `FixtureDict` | `True` |

Example:
>>> table_name: str = "data_provider"
+>>> tables = tuple(
+...     gen_fixture_tables(
+...         {table_name: NEWSPAPER_COLLECTION_METADATA}
+...     ))
+>>> len(tables)
+1
+>>> assert tables[0].title == table_name
+>>> [column.header for column in tables[0].columns]
+['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']

Source code in `alto2txt2fixture/utils.py`:
def gen_fixture_tables(
+    fixture_tables: dict[str, list[FixtureDict]] = {},
+    include_fixture_pk_column: bool = True,
+) -> Generator[Table, None, None]:
+    """Generator of `rich.Table` instances from `FixtureDict` configuration tables.
+
+    Args:
+        fixture_tables: `dict` where `key` is for `Table` title and `value` is a `FixtureDict`
+        include_fixture_pk_column: whether to include the `pk` field from `FixtureDict`
+
+    Example:
+        ```pycon
+        >>> table_name: str = "data_provider"
+        >>> tables = tuple(
+        ...     gen_fixture_tables(
+        ...         {table_name: NEWSPAPER_COLLECTION_METADATA}
+        ...     ))
+        >>> len(tables)
+        1
+        >>> assert tables[0].title == table_name
+        >>> [column.header for column in tables[0].columns]
+        ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']
+
+        ```
+    """
+    for name, fixture_records in fixture_tables.items():
+        fixture_table: Table = Table(title=name)
+        for i, fixture_dict in enumerate(fixture_records):
+            if i == 0:
+                [
+                    fixture_table.add_column(name)
+                    for name in fixture_fields(fixture_dict, include_fixture_pk_column)
+                ]
+            row_values: tuple[str, ...] = tuple(
+                str(x) for x in (fixture_dict["pk"], *fixture_dict["fields"].values())
+            )
+            fixture_table.add_row(*row_values)
+        yield fixture_table
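
A short usage sketch (assuming `NEWSPAPER_COLLECTION_METADATA` is importable from the package): render each generated table with `rich`:

```python
from rich.console import Console

console = Console()
for table in gen_fixture_tables({"dataprovider": NEWSPAPER_COLLECTION_METADATA}):
    console.print(table)  # pretty-prints the fixture rows as a rich Table
```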

get_chunked_zipfiles

    get_chunked_zipfiles(path: Path) -> list

This function takes in a `Path` object `path` and returns a list of lists
of zipfiles sorted and chunked according to certain conditions defined
in the `settings` object (see `settings.CHUNK_THRESHOLD`).


Note: the function will also skip zip files over a certain file size, which
can be specified in the `settings` object (see `settings.SKIP_FILE_SIZE`).
Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `path` | `Path` | The input path where the zipfiles are located | required |

Returns:

| Type | Description |
|------|-------------|
| `list` | A list of lists of zipfiles; each inner list represents a chunk of zipfiles. |

Source code in `alto2txt2fixture/utils.py`:
def get_chunked_zipfiles(path: Path) -> list:
+    """This function takes in a `Path` object `path` and returns a list of lists
+    of `zipfiles` sorted and chunked according to certain conditions defined
+    in the `settings` object (see `settings.CHUNK_THRESHOLD`).
+
+    Note: the function will also skip zip files over a certain file size, which
+    can be specified in the `settings` object (see `settings.SKIP_FILE_SIZE`).
+
+    Args:
+        path: The input path where the zipfiles are located
+
+    Returns:
+        A list of lists of `zipfiles`, each inner list represents a chunk of
+            zipfiles.
+    """
+
+    zipfiles = sorted(
+        path.glob("*.zip"),
+        key=lambda x: x.stat().st_size,
+        reverse=settings.START_WITH_LARGEST,
+    )
+
+    zipfiles = [x for x in zipfiles if x.stat().st_size <= settings.SKIP_FILE_SIZE]
+
+    if len(zipfiles) > settings.CHUNK_THRESHOLD:
+        chunks = array_split(zipfiles, len(zipfiles) / settings.CHUNK_THRESHOLD)
+    else:
+        chunks = [zipfiles]
+
+    return chunks
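
A sketch of the chunking arithmetic, using stand-in values rather than real zipfile paths and a hypothetical `CHUNK_THRESHOLD` of 115:

```python
from numpy import array_split

files = list(range(300))  # stand-ins for the sorted, size-filtered zipfiles
chunks = array_split(files, len(files) / 115)  # roughly len/threshold chunks
print(len(chunks), [len(chunk) for chunk in chunks])
```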

get_key

    get_key(x: dict = dict(), on: list = []) -> str

Get a string key from a dictionary using values from specified keys.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `x` | `dict` | A dictionary from which the key is generated. | `dict()` |
| `on` | `list` | A list of keys from the dictionary that should be used to generate the key. | `[]` |

Returns:

| Type | Description |
|------|-------------|
| `str` | The generated string key. |

Source code in `alto2txt2fixture/utils.py`:
def get_key(x: dict = dict(), on: list = []) -> str:
+    """
+    Get a string key from a dictionary using values from specified keys.
+
+    Args:
+        x: A dictionary from which the key is generated.
+        on: A list of keys from the dictionary that should be used to
+            generate the key.
+
+    Returns:
+        The generated string key.
+    """
+
+    return f"{'-'.join([str(x['fields'][y]) for y in on])}"

get_lockfile

    get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path

Provides the path to any given lockfile, which controls whether any
existing files should be overwritten or not.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `collection` | `str` | Collection folder name | required |
| `kind` | `NewspaperElements` | Either `newspaper` or `issue` or `item` | required |
| `dic` | `dict` | A dictionary with required information for either `kind` passed | required |

Returns:

| Type | Description |
|------|-------------|
| `Path` | Path to the resulting lockfile |

Source code in `alto2txt2fixture/utils.py`:
def get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path:
+    """
+    Provides the path to any given lockfile, which controls whether any
+    existing files should be overwritten or not.
+
+    Args:
+        collection: Collection folder name
+        kind: Either `newspaper` or `issue` or `item`
+        dic: A dictionary with required information for either `kind` passed
+
+    Returns:
+        Path to the resulting lockfile
+    """
+
+    p: Path
+    base = Path(f"cache-lockfiles/{collection}")
+
+    if kind == "newspaper":
+        p = base / f"newspapers/{dic['publication_code']}"
+    elif kind == "issue":
+        p = base / f"issues/{dic['publication__publication_code']}/{dic['issue_code']}"
+    elif kind == "item":
+        try:
+            if dic.get("issue_code"):
+                p = base / f"items/{dic['issue_code']}/{dic['item_code']}"
+            elif dic.get("issue__issue_identifier"):
+                p = base / f"items/{dic['issue__issue_identifier']}/{dic['item_code']}"
+        except KeyError:
+            error("An unknown error occurred (in get_lockfile)")
+    else:
+        p = base / "lockfile"
+
+    if settings.WRITE_LOCKFILES:
+        p.parent.mkdir(parents=True, exist_ok=True)
+
+    return p

get_now

    get_now(as_str: bool = False) -> datetime.datetime | str

Return `datetime.now()` as either a string or `datetime` object.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `as_str` | `bool` | Whether to return the `now` time as a `str` or not | `False` |

Returns:

| Type | Description |
|------|-------------|
| `datetime.datetime \| str` | `datetime.now()` in `pytz.UTC` time zone, as a string if `as_str`, else as a `datetime.datetime` object. |

Source code in `alto2txt2fixture/utils.py`:
def get_now(as_str: bool = False) -> datetime.datetime | str:
+    """
+    Return `datetime.now()` as either a string or `datetime` object.
+
+    Args:
+        as_str: Whether to return `now` `time` as a `str` or not, default: `False`
+
+    Returns:
+        `datetime.now()` in `pytz.UTC` time zone as a string if `as_str`, else
+            as a `datetime.datetime` object.
+    """
+    now = datetime.datetime.now(tz=pytz.UTC)
+
+    if as_str:
+        return str(now)
+    else:
+        assert isinstance(now, datetime.datetime)
+        return now
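
A quick sketch: both calls represent the same kind of UTC-aware moment:

```python
moment = get_now()            # datetime.datetime with tzinfo=pytz.UTC
stamp = get_now(as_str=True)  # the same kind of moment serialised via str()
print(moment.tzinfo, stamp)
```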

get_path_from

    get_path_from(p: str | Path) -> Path

Converts an input value into a `Path` object if it's not already one.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str \| Path` | The input value, which can be a string or a `Path` object. | required |

Returns:

| Type | Description |
|------|-------------|
| `Path` | The input value as a `Path` object. |

Source code in `alto2txt2fixture/utils.py`:
def get_path_from(p: str | Path) -> Path:
+    """
+    Converts an input value into a Path object if it's not already one.
+
+    Args:
+        p: The input value, which can be a string or a Path object.
+
+    Returns:
+        The input value as a Path object.
+    """
+    if isinstance(p, str):
+        p = Path(p)
+
+    if not isinstance(p, Path):
+        raise RuntimeError(f"Unable to handle type: {type(p)}")
+
+    return p

get_size_from_path

    get_size_from_path(p: str | Path, raw: bool = False) -> str | float

Returns a human-readable string (or the raw byte count) for any given file's size.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str \| Path` | Path to read the size from | required |
| `raw` | `bool` | Whether to return the file size as total number of bytes or a human-readable MB/GB amount | `False` |

Returns:

| Type | Description |
|------|-------------|
| `str \| float` | `str` followed by `MB` or `GB` for size if not `raw`, otherwise the byte count. |

Source code in `alto2txt2fixture/utils.py`:
def get_size_from_path(p: str | Path, raw: bool = False) -> str | float:
+    """
+    Returns a human-readable string (or the raw byte count) for any given file's size.
+
+    Args:
+        p: Path to read the size from
+        raw: Whether to return the file size as total number of bytes or
+            a human-readable MB/GB amount
+
+    Returns:
+        Return `str` followed by `MB` or `GB` for size if not `raw` otherwise `float`.
+    """
+
+    p = get_path_from(p)
+
+    bytes = p.stat().st_size
+
+    if raw:
+        return bytes
+
+    rel_size: float | int | str = round(bytes / 1000 / 1000 / 1000, 1)
+
+    assert not isinstance(rel_size, str)
+
+    if rel_size < 0.5:
+        rel_size = round(bytes / 1000 / 1000, 1)
+        rel_size = f"{rel_size}MB"
+    else:
+        rel_size = f"{rel_size}GB"
+
+    return rel_size
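
A usage sketch (hypothetical file): sizes under roughly 0.5 GB are reported in MB, larger ones in GB:

```python
print(get_size_from_path("output/fixtures/test-1.json"))            # e.g. '1.2MB'
print(get_size_from_path("output/fixtures/test-1.json", raw=True))  # raw byte count
```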

glob_filter

    glob_filter(p: str) -> list

Return an ordered glob, filtering out unwanted hidden files (e.g. the pesky macOS `.DS_Store`).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str` | Path to a directory to filter | required |

Returns:

| Type | Description |
|------|-------------|
| `list` | Sorted list of files contained in the provided path without the ones whose names start with a `.` |

Source code in `alto2txt2fixture/utils.py`:
def glob_filter(p: str) -> list:
+    """
+    Return an ordered glob, filtering out unwanted hidden files (e.g. the pesky macOS `.DS_Store`).
+
+    Args:
+        p: Path to a directory to filter
+
+    Returns:
+        Sorted list of files contained in the provided path without the ones
+        whose names start with a `.`
+    """
+    return sorted([x for x in get_path_from(p).glob("*") if not x.name.startswith(".")])

list_json_files

    list_json_files(
        p: str | Path,
        drill: bool = False,
        exclude_names: list = [],
        include_names: list = [],
    ) -> Generator[Path, None, None] | list[Path]

List `json` files under the path specified in `p`.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str \| Path` | The path to search for `json` files | required |
| `drill` | `bool` | A flag indicating whether to drill down the subdirectories or not. Default is `False` | `False` |
| `exclude_names` | `list` | A list of file names to exclude from the search result. Default is an empty list | `[]` |
| `include_names` | `list` | A list of file names to include in the search result. Ignored if `exclude_names` is provided. Default is an empty list | `[]` |

Returns:

| Type | Description |
|------|-------------|
| `Generator[Path, None, None] \| list[Path]` | A list of `Path` objects pointing to the found `json` files |

Source code in `alto2txt2fixture/utils.py`:
def list_json_files(
+    p: str | Path,
+    drill: bool = False,
+    exclude_names: list = [],
+    include_names: list = [],
+) -> Generator[Path, None, None] | list[Path]:
+    """
+    List `json` files under the path specified in ``p``.
+
+    Args:
+        p: The path to search for `json` files
+        drill: A flag indicating whether to drill down the subdirectories
+            or not. Default is ``False``
+        exclude_names: A list of file names to exclude from the search
+            result. Default is an empty list
+        include_names: A list of file names to include in the search result.
+            Ignored if ``exclude_names`` is provided.
+            Default is an empty list
+
+    Returns:
+        A list of `Path` objects pointing to the found `json` files
+    """
+
+    q: str = "**/*.json" if drill else "*.json"
+    files = get_path_from(p).glob(q)
+
+    if exclude_names:
+        files = list({x for x in files if x.name not in exclude_names})
+    elif include_names:
+        files = list({x for x in files if x.name in include_names})
+
+    return sorted(files)
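
A short usage sketch (hypothetical cache layout):

```python
# List every JSON file below ./cache, skipping any files named 'index.json'.
for json_path in list_json_files("./cache", drill=True, exclude_names=["index.json"]):
    print(json_path)
```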

load_json

    load_json(p: str | Path, crash: bool = False) -> dict | list

Easier access to reading `json` files.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str \| Path` | Path to read `json` from | required |
| `crash` | `bool` | Whether the program should crash if there is a `json` decode error | `False` |

Returns:

| Type | Description |
|------|-------------|
| `dict \| list` | The decoded `json` contents from the path, or an empty dictionary if the file cannot be decoded and `crash` is set to `False` |

Source code in `alto2txt2fixture/utils.py`:
def load_json(p: str | Path, crash: bool = False) -> dict | list:
+    """
+    Easier access to reading `json` files.
+
+    Args:
+        p: Path to read `json` from
+        crash: Whether the program should crash if there is a `json` decode
+            error, default: ``False``
+
+    Returns:
+        The decoded `json` contents from the path, but an empty dictionary
+        if the file cannot be decoded and ``crash`` is set to ``False``
+    """
+
+    p = get_path_from(p)
+
+    try:
+        return json.loads(p.read_text())
+    except json.JSONDecodeError:
+        msg = f"Error: {p.read_text()}"
+        error(msg, crash=crash)
+
+    return {}

load_multiple_json

    load_multiple_json(
        p: str | Path,
        drill: bool = False,
        filter_na: bool = True,
        crash: bool = False,
    ) -> list

Load multiple `json` files and return a list of their content.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str \| Path` | The path to search for `json` files | required |
| `drill` | `bool` | A flag indicating whether to drill down the subdirectories or not. Default is `False` | `False` |
| `filter_na` | `bool` | A flag indicating whether to filter out content that is `None`. Default is `True`. | `True` |
| `crash` | `bool` | A flag indicating whether to raise an exception when an error occurs while loading a `json` file. Default is `False`. | `False` |

Returns:

| Type | Description |
|------|-------------|
| `list` | A `list` of the content of the loaded `json` files. |

Source code in `alto2txt2fixture/utils.py`:
def load_multiple_json(
+    p: str | Path,
+    drill: bool = False,
+    filter_na: bool = True,
+    crash: bool = False,
+) -> list:
+    """
+    Load multiple `json` files and return a list of their content.
+
+    Args:
+        p: The path to search for `json` files
+        drill: A flag indicating whether to drill down the subdirectories
+            or not. Default is `False`
+        filter_na: A flag indicating whether to filter out the content that
+            is `None`. Default is `True`.
+        crash: A flag indicating whether to raise an exception when an
+            error occurs while loading a `json` file. Default is `False`.
+
+    Returns:
+        A `list` of the content of the loaded `json` files.
+    """
+
+    files = list_json_files(p, drill=drill)
+
+    content = [load_json(x, crash=crash) for x in files]
+
+    return [x for x in content if x] if filter_na else content
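
A usage sketch (hypothetical directory):

```python
# Gather every parsed JSON payload under ./cache, dropping files that
# failed to decode (their content is falsy and filtered out by default).
payloads = load_multiple_json("./cache", drill=True)
print(len(payloads))
```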

lock

    lock(lockfile: Path) -> None

Writes an empty lockfile, after making sure the parent directory exists.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `lockfile` | `Path` | The path to the lock file to be created | required |

Returns:

| Type | Description |
|------|-------------|
| `None` | None |

Source code in `alto2txt2fixture/utils.py`:
def lock(lockfile: Path) -> None:
+    """
+    Writes an empty lockfile, after making sure the parent directory exists.
+
+    Args:
+        lockfile: The path to the lock file to be created
+
+    Returns:
+        None
+    """
+    lockfile.parent.mkdir(parents=True, exist_ok=True)
+
+    lockfile.write_text("")
+
+    return
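
A sketch of the intended workflow with `get_lockfile` (hypothetical record, assuming `settings.WRITE_LOCKFILES` is enabled):

```python
lockfile = get_lockfile("hmd", "newspaper", {"publication_code": "0002647"})
if not lockfile.exists():
    lock(lockfile)  # marks this newspaper as already processed
```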

save_fixture

    save_fixture(
        generator: Sequence | Generator = [],
        prefix: str = "",
        output_path: PathLike | str = settings.OUTPUT,
        max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,
        add_created: bool = True,
        json_indent: int = JSON_INDENT,
    ) -> None

Saves fixtures generated by a generator to separate JSON files.

+

This function takes a generator and saves the generated fixtures to +separate JSON files. The fixtures are saved in batches, where each batch +is determined by the max_elements_per_file parameter.

+ + + +

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `generator` | `Sequence \| Generator` | A generator that yields the fixtures to be saved. | `[]` |
| `prefix` | `str` | A string prefix to be added to the file names of the saved fixtures. | `''` |
| `output_path` | `PathLike \| str` | Path to folder fixtures are saved to | `settings.OUTPUT` |
| `max_elements_per_file` | `int` | Maximum `JSON` records saved in each file | `settings.MAX_ELEMENTS_PER_FILE` |
| `add_created` | `bool` | Whether to add `created_at` and `updated_at` timestamps | `True` |
| `json_indent` | `int` | Number of indent spaces per line in saved `JSON` | `JSON_INDENT` |

Returns:

| Type | Description |
|------|-------------|
| `None` | This function saves the fixtures to files but does not return any value. |

Example:
>>> save_fixture(NEWSPAPER_COLLECTION_METADATA,
+...              prefix='test', output_path='tests/')
+>>> imported_fixture = load_json('tests/test-1.json')
+>>> imported_fixture[1]['pk']
+2
+>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]
+'hmd'
+>>> 'created_at' in imported_fixture[1]['fields']
+True
+

Source code in `alto2txt2fixture/utils.py`:
def save_fixture(
+    generator: Sequence | Generator = [],
+    prefix: str = "",
+    output_path: PathLike | str = settings.OUTPUT,
+    max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,
+    add_created: bool = True,
+    json_indent: int = JSON_INDENT,
+) -> None:
+    """Saves fixtures generated by a generator to separate JSON files.
+
+    This function takes a generator and saves the generated fixtures to
+    separate JSON files. The fixtures are saved in batches, where each batch
+    is determined by the ``max_elements_per_file`` parameter.
+
+    Args:
+        generator: A generator that yields the fixtures to be saved.
+        prefix: A string prefix to be added to the file names of the
+            saved fixtures.
+        output_path: Path to folder fixtures are saved to
+        max_elements_per_file: Maximum `JSON` records saved in each file
+        add_created: Whether to add `created_at` and `updated_at` `timestamps`
+        json_indent: Number of indent spaces per line in saved `JSON`
+
+
+    Returns:
+        This function saves the fixtures to files but does not return
+            any value.
+
+    Example:
+        ```pycon
+        >>> save_fixture(NEWSPAPER_COLLECTION_METADATA,
+        ...              prefix='test', output_path='tests/')
+        >>> imported_fixture = load_json('tests/test-1.json')
+        >>> imported_fixture[1]['pk']
+        2
+        >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]
+        'hmd'
+        >>> 'created_at' in imported_fixture[1]['fields']
+        True
+
+        ```
+
+    """
+    internal_counter = 1
+    counter = 1
+    lst = []
+    Path(output_path).mkdir(parents=True, exist_ok=True)
+    for item in generator:
+        lst.append(item)
+        internal_counter += 1
+        if internal_counter > max_elements_per_file:
+            write_json(
+                p=Path(f"{output_path}/{prefix}-{counter}.json"),
+                o=lst,
+                add_created=add_created,
+                json_indent=json_indent,
+            )
+
+            # Save up some memory
+            del lst
+            gc.collect()
+
+            # Re-instantiate
+            lst = []
+            internal_counter = 1
+            counter += 1
+    else:
+        write_json(
+            p=Path(f"{output_path}/{prefix}-{counter}.json"),
+            o=lst,
+            add_created=add_created,
+            json_indent=json_indent,
+        )
+
+    return
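
A batching sketch (hypothetical records and model name): with `max_elements_per_file=2`, five records produce `test-1.json`, `test-2.json` and `test-3.json`:

```python
records = ({"pk": i, "model": "example.model", "fields": {}} for i in range(1, 6))
save_fixture(records, prefix="test", output_path="tests/", max_elements_per_file=2)
```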

write_json

    write_json(
        p: str | Path,
        o: dict,
        add_created: bool = True,
        json_indent: int = JSON_INDENT,
    ) -> None

Easier access to writing `json` files. Checks whether the parent directory exists.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str \| Path` | Path to write `json` to | required |
| `o` | `dict` | Object to write to the `json` file | required |
| `add_created` | `bool` | If set to `True` will add `created_at` and `updated_at` to the dictionary's fields; if they already exist, they will be forcefully updated. | `True` |
| `json_indent` | `int` | What indentation to write out the `JSON` file with | `JSON_INDENT` |

Returns:

| Type | Description |
|------|-------------|
| `None` | None |

Example:

>>> path = 'test-write-json/example.json'
+>>> write_json(p=path,
+...            o=NEWSPAPER_COLLECTION_METADATA,
+...            add_created=True)
+>>> imported_fixture = load_json(path)
+>>> imported_fixture[1]['pk']
+2
+>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]
+'hmd'
+

Source code in `alto2txt2fixture/utils.py`:
def write_json(
+    p: str | Path, o: dict, add_created: bool = True, json_indent: int = JSON_INDENT
+) -> None:
+    """
+    Easier access to writing `json` files. Checks whether parent exists.
+
+    Args:
+        p: Path to write `json` to
+        o: Object to write to `json` file
+        add_created:
+            If set to True will add `created_at` and `updated_at`
+            to the dictionary's fields. If `created_at` and `updated_at`
+            already exist in the fields, they will be forcefully updated.
+        json_indent:
+            What indentation to write out the `JSON` file with
+
+    Returns:
+        None
+
+    Example:
+        ```pycon
+        >>> path = 'test-write-json/example.json'
+        >>> write_json(p=path,
+        ...            o=NEWSPAPER_COLLECTION_METADATA,
+        ...            add_created=True)
+        >>> imported_fixture = load_json(path)
+        >>> imported_fixture[1]['pk']
+        2
+        >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]
+        'hmd'
+
+        ```
+    """
+
+    p = get_path_from(p)
+
+    if not (isinstance(o, dict) or isinstance(o, list)):
+        raise RuntimeError(f"Unable to handle data of type: {type(o)}")
+
+    def _append_created_fields(o: dict):
+        """Add `created_at` and `updated_at` fields to a `dict` with `FixtureDict` values."""
+        return dict(
+            **{k: v for k, v in o.items() if not k == "fields"},
+            fields=dict(
+                **{
+                    k: v
+                    for k, v in o["fields"].items()
+                    if not k == "created_at" and not k == "updated_at"
+                },
+                **{"created_at": NOW_str, "updated_at": NOW_str},
+            ),
+        )
+
+    try:
+        if add_created and isinstance(o, dict):
+            o = _append_created_fields(o)
+        elif add_created and isinstance(o, list):
+            o = [_append_created_fields(x) for x in o]
+    except KeyError:
+        error("An unknown error occurred (in write_json)")
+
+    p.parent.mkdir(parents=True, exist_ok=True)
+
+    p.write_text(json.dumps(o, indent=json_indent))
+
+    return

Running the Program

Using poetry to run

The program should run automatically with the following command:


    $ poetry run a2t2f-news

Alternatively, if you want to add optional parameters and don’t want to use the standard poetry script to run, you can use the (somewhat convoluted) poetry run alto2txt2fixture/run.py and provide any optional parameters. You can see a list of all the “Optional parameters” below. For example, if you want to only include the hmd collection:


    $ poetry run alto2txt2fixture/run.py --collections hmd

Alternative: Run the script without poetry

+

If you find yourself in trouble with poetry, the program should run perfectly fine on its own, assuming the dependencies are installed. The same command, then, would be:


    $ python alto2txt2fixture/run.py --collections hmd

Note

+

See the list under [tool.poetry.dependencies] in pyproject.toml for a list of dependencies that would need to be installed for alto2txt2fixture to work outside a python poetry environment.

+
+

Optional parameters

+

The program has a number of optional parameters that you can choose to include or not. The table below describes each parameter, how to pass it to the program, and what its defaults are.


| Flag | Description | Default value |
|------|-------------|---------------|
| `-c`, `--collections` | Which collections to process in the mounted alto2txt directory | `hmd`, `lwm`, `jisc`, `bna` |
| `-o`, `--output` | Into which directory should the processed files be put? | `./output/fixtures/` |
| `-m`, `--mountpoint` | Where are the alto2txt directories mounted? | `./input/alto2txt/` |
| `-t`, `--test-config` | Print the config table but do not run | `False` |
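
The flags can be combined; for example (hypothetical values, with the default paths shown explicitly):

    $ poetry run alto2txt2fixture/run.py --collections hmd lwm --output ./output/fixtures/ --mountpoint ./input/alto2txt/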

Successfully running the program: An example


[Screenshot: img/successfully-running.png]

alto2txt2fixture

alto2txt2fixture is a standalone tool to convert alto2txt XML output and other related datasets into JSON (and where feasible CSV) data with corresponding relational IDs to ease general use and ingestion into a relational database.

We target the JSON produced for importing into lwmdb: a database built using the Django Python web framework's database fixture structure.

"},{"location":"index.html#installation-and-simple-use","title":"Installation and simple use","text":"

We provide a command line interface to process alto2txt XML files stored locally (or mounted via azure blobfuse), and for additional public data we automate a means of downloading those automatically.

"},{"location":"index.html#installation","title":"Installation","text":"

We recommend downloading a copy of the reposity or using git clone. From a local copy use poetry to install dependencies:

$ cd alto2txt2fixture\n$ poetry install\n

If you would like to test, render documentation and/or contribute to the code included dev dependencies in a local install:

$ poetry install --with dev\n
"},{"location":"index.html#simple-use","title":"Simple use","text":"

To processing newspaper metadata with a local copy of alto2txt XML results, it's easiest to have that data in the same folder as your alto2txt2fixture checkout and poetry installed folder. One arranged, you should be able to begin the JSON converstion with

$ poetry run a2t2f-news\n

To generate related data in JSON and CSV form, assuming you have an internet collection and access to a living-with-machines azure account, the following will download related data into JSON and CSV files. The JSON results should be consistent with lwmdb tables for ease of import.

$ poetry run a2t2f-adj\n
"},{"location":"running.html","title":"Running the Program","text":""},{"location":"running.html#using-poetry-to-run","title":"Using poetry to run","text":"

The program should run automatically with the following command:

$ poetry run a2t2f-news\n

Alternatively, if you want to add optional parameters and don\u2019t want to use the standard poetry script to run, you can use the (somewhat convoluted) poetry run alto2txt2fixture/run.py and provide any optional parameters. You can see a list of all the \u201cOptional parameters\u201d below. For example, if you want to only include the hmd collection:

$ poetry run alto2txt2fixture/run.py --collections hmd\n
"},{"location":"running.html#alternative-run-the-script-without-poetry","title":"Alternative: Run the script without poetry","text":"

If you find yourself in trouble with poetry, the program should run perfectly fine on its own, assuming the dependencies are installed. The same command, then, would be:

$ python alto2txt2fixture/run.py --collections hmd\n

Note

See the list under [tool.poetry.dependencies] in pyproject.toml for a list of dependencies that would need to be installed for alto2txt2fixture to work outside a python poetry environment.

"},{"location":"running.html#optional-parameters","title":"Optional parameters","text":"

The program has a number of optional parameters that you can choose to include or not. The table below describes each parameter, how to pass it to the program, and what its defaults are.

Flag Description Default value -c, --collections Which collections to process in the mounted alto2txt directory hmd, lwm, jisc, bna -o, --output Into which directory should the processed files be put? ./output/fixtures/ -m, --mountpoint Where is the alto2txt directories mounted? ./input/alto2txt/ -t, --test-config Print the config table but do not run False"},{"location":"running.html#successfully-running-the-program-an-example","title":"Successfully running the program: An example","text":""},{"location":"understanding-results.html","title":"Understanding the Results","text":""},{"location":"understanding-results.html#the-resulting-file-structure","title":"The resulting file structure","text":"

The examples below follow standard settings

If you choose other settings for when you run the program, your output directory may look different from the information on this page.

"},{"location":"understanding-results.html#reports","title":"Reports","text":"

Reports are automatically generated with a unique hash as the overarching folder structure. Inside the reports directory, you\u2019ll find a JSON file for each alto2txt directory (organised by NLP identifier).

The report structure, thus, looks like this:

The JSON file has some good troubleshooting information. You\u2019ll find that the contents are structured as a Python dictionary (or JavaScript Object). Here is an example:

Here is an explanation of each of the keys in the dictionary:

| Key | Explanation | Data type |
|-----|-------------|-----------|
| `path` | The input path for the zip file that is being converted. | string |
| `bytes` | The size of the input zip file represented in bytes. | integer |
| `size` | The size of the input zip file represented in a human-readable string. | string |
| `contents` | #TODO #3 | integer |
| `start` | Date and time when processing started (see also `end` below). | datestring |
| `newspaper_paths` | #TODO #3 | list (string) |
| `publication_codes` | A list of the NLPs that are contained in the input zip file. | list (string) |
| `issue_paths` | A list of all the issue paths that are contained in the cache directory. | list (string) |
| `item_paths` | A list of all the item paths that are contained in the cache directory. | list (string) |
| `end` | Date and time when processing ended (see also `start` above). | datestring |
| `seconds` | Seconds that the script spent interpreting the zip file (should be added to the `microseconds` below). | integer |
| `microseconds` | Microseconds that the script spent interpreting the zip file (should be added to the `seconds` above). | integer |

Fixtures

The most important output of the script is contained in the fixtures directory. This directory contains JSON files for all the different tables in the corresponding Django metadata database (i.e. DataProvider, Digitisation, Ingest, Issue, Newspaper, and Item). The numbering at the end of each file indicates the order of the files as they are divided into a maximum of 2e6 elements*:

Each JSON file contains a Python-like list (JavaScript Array) of dictionaries (JavaScript Objects), which have a primary key (pk), the related database model (in the example below the Django newspapers app\u2019s newspaper table), and a nested dictionary/Object which contains all the values for the database\u2019s table entry:

* The maximum elements per file can be adjusted in the settings.py file\u2019s settings object\u2019s MAX_ELEMENTS_PER_FILE value.

"},{"location":"reference/SUMMARY.html","title":"SUMMARY","text":"
  • alto2txt2fixture
    • __main__
    • cli
    • create_adjacent_tables
    • jisc
    • log
    • parser
    • patterns
    • router
    • settings
    • types
    • utils
"},{"location":"reference/alto2txt2fixture/index.html","title":"alto2txt2fixture","text":""},{"location":"reference/alto2txt2fixture/__main__.html","title":"__main__","text":"

Entry point for alto2txt2fixture.parse to convert alto2txt XML -> JSON.

This module defines the run function which is the main driver for the entire process.

It imports various functions from other modules and uses them to route and parse XML data generated by alto2txt.

The following steps are performed in the run function:

  1. Parses command line arguments using the parse_args function. If no arguments are provided, the default values are taken from the settings module.
  2. Prints a setup report to the console, showing the values of the relevant parameters.
  3. Calls the route function to route alto2txt data into subdirectories with structured files.
  4. Calls the parse function to parse the resulting JSON files.
  5. Calls the clear_cache function to clear the cache.

If the script is run as a main program (i.e. if the name of the script is __main__), the run() function is executed.

Note: at present this does not include any functionality in create_adjacent_tables.py

"},{"location":"reference/alto2txt2fixture/__main__.html#alto2txt2fixture.__main__.parse_args","title":"parse_args","text":"
parse_args(argv: list[str] | None = None) -> Namespace\n

Manage command line arguments for run()

This constructs an ArgumentParser instance to configure calls of run() for newspaper XML to JSON conversion.

Parameters:

Name Type Description Default argv list[str] | None

If `None`, treat as equivalent of `['--help']`; if a `list` of `str`, pass those options to `ArgumentParser`

None

Returns:

Type Description Namespace

A Namespace dict-like configuration for run()
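A minimal usage sketch (assuming the module import path shown under Source code below):

```python
from alto2txt2fixture.__main__ import parse_args

# Parse a hypothetical set of command line options.
args = parse_args(["--collections", "hmd", "lwm", "--test-config"])
assert args.collections == ["hmd", "lwm"]
assert args.test_config is True
```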

Source code in alto2txt2fixture/__main__.py
def parse_args(argv: list[str] | None = None) -> Namespace:\n\"\"\"Manage command line arguments for `run()`\n    This constructs an `ArgumentParser` instance to manage\n    configurating calls of `run()` to manage `newspaper`\n    `XML` to `JSON` converstion.\n    Arguments:\n        argv:\n            If `None` treat as equivalent of ['--help`],\n            if a `list` of `str` pass those options to `ArgumentParser`\n    Returns:\n        A `Namespace` `dict`-like configuration for `run()`\n    \"\"\"\nargv = None if not argv else argv\nparser = ArgumentParser(\nprog=\"a2t2f-news\",\ndescription=\"Process alto2txt XML into and Django JSON Fixture files\",\nepilog=(\n\"Note: this is still in beta mode and contributions welcome\\n\\n\" + __doc__\n),\nformatter_class=RawTextHelpFormatter,\n)\nparser.add_argument(\n\"-c\",\n\"--collections\",\nnargs=\"+\",\nhelp=\"<Optional> Set collections\",\nrequired=False,\n)\nparser.add_argument(\n\"-m\",\n\"--mountpoint\",\ntype=str,\nhelp=\"<Optional> Mountpoint\",\nrequired=False,\n)\nparser.add_argument(\n\"-o\",\n\"--output\",\ntype=str,\nhelp=\"<Optional> Set an output directory\",\nrequired=False,\n)\nparser.add_argument(\n\"-t\",\n\"--test-config\",\ndefault=False,\nhelp=\"Only print the configuration\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"-f\",\n\"--show-fixture-tables\",\ndefault=True,\nhelp=\"Print included fixture table configurations\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"--export-fixture-tables\",\ndefault=True,\nhelp=\"Experimental: export fixture tables prior to data processing\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"--data-provider-field\",\ntype=str,\ndefault=DATA_PROVIDER_INDEX,\nhelp=\"Key for indexing DataProvider records\",\n)\nreturn parser.parse_args(argv)\n
"},{"location":"reference/alto2txt2fixture/__main__.html#alto2txt2fixture.__main__.run","title":"run","text":"
run(local_args: list[str] | None = None) -> None\n

Manage running newspaper XML to JSON conversion.

First parse_args is called for command line arguments including:

  • collections
  • output
  • mountpoint

If any of these arguments are specified, they will be used, otherwise they will default to the values in the settings module.

The show_setup function is then called to display the configurations being used.

The route function is then called to route the alto2txt files into subdirectories with structured files.

The parse function is then called to parse the resulting JSON files.

Finally, the clear_cache function is called to clear the cache (pending the user's confirmation).
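For example, a configuration-only dry run might look like this (a sketch; the paths are hypothetical):

```python
from alto2txt2fixture.__main__ import run

# Show the setup table for two collections without routing or parsing.
run(["-c", "hmd", "lwm", "-o", "./output/fixtures", "--test-config"])
```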

Parameters:

Name Type Description Default local_args list[str] | None

Options passed to parse_args()

None Source code in alto2txt2fixture/__main__.py
def run(local_args: list[str] | None = None) -> None:\n\"\"\"Manage running newspaper `XML` to `JSON` conversion.\n    First `parse_args` is called for command line arguments including:\n    - `collections`\n    - `output`\n    - `mountpoint`\n    If any of these arguments are specified, they will be used, otherwise they\n    will default to the values in the `settings` module.\n    The `show_setup` function is then called to display the configurations\n    being used.\n    The `route` function is then called to route the alto2txt files into\n    subdirectories with structured files.\n    The `parse` function is then called to parse the resulting JSON files.\n    Finally, the `clear_cache` function is called to clear the cache\n    (pending the user's confirmation).\n    Arguments:\n        local_args:\n            Options passed to `parse_args()`\n    \"\"\"\nargs: Namespace = parse_args(argv=local_args)\nif args.collections:\nCOLLECTIONS = [x.lower() for x in args.collections]\nelse:\nCOLLECTIONS = settings.COLLECTIONS\nif args.output:\nOUTPUT = args.output.rstrip(\"/\")\nelse:\nOUTPUT = settings.OUTPUT\nif args.mountpoint:\nMOUNTPOINT = args.mountpoint.rstrip(\"/\")\nelse:\nMOUNTPOINT = settings.MOUNTPOINT\nshow_setup(\nCOLLECTIONS=COLLECTIONS,\nOUTPUT=OUTPUT,\nCACHE_HOME=settings.CACHE_HOME,\nMOUNTPOINT=MOUNTPOINT,\nJISC_PAPERS_CSV=settings.JISC_PAPERS_CSV,\nREPORT_DIR=settings.REPORT_DIR,\nMAX_ELEMENTS_PER_FILE=settings.MAX_ELEMENTS_PER_FILE,\n)\nif args.show_fixture_tables:\n# Show a table of fixtures used, defaults to DataProvider Table\nshow_fixture_tables(settings, data_provider_index=args.data_provider_field)\nif args.export_fixture_tables:\nexport_fixtures(\nfixture_tables=settings.FIXTURE_TABLES,\npath=OUTPUT,\nformats=settings.FIXTURE_TABLES_FORMATS,\n)\nif not args.test_config:\n# Routing alto2txt into subdirectories with structured files\nroute(\nCOLLECTIONS,\nsettings.CACHE_HOME,\nMOUNTPOINT,\nsettings.JISC_PAPERS_CSV,\nsettings.REPORT_DIR,\n)\n# Parsing the resulting JSON files\nparse(\nCOLLECTIONS,\nsettings.CACHE_HOME,\nOUTPUT,\nsettings.MAX_ELEMENTS_PER_FILE,\n)\nclear_cache(settings.CACHE_HOME)\n
"},{"location":"reference/alto2txt2fixture/cli.html","title":"cli","text":""},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_fixture_tables","title":"show_fixture_tables","text":"
show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]\n

Print fixture tables specified in settings.fixture_tables in rich.Table format.

Parameters:

Name Type Description Default run_settings dotdict

alto2txt2fixture run configuration

settings print_in_call bool

whether to print to console (will use console variable if so)

True data_provider_index str

key to index dataprovider from NEWSPAPER_COLLECTION_METADATA

DATA_PROVIDER_INDEX

Returns:

Type Description list[Table]

A list of rich.Table renders from configurations in run_settings.FIXTURE_TABLES

Example
>>> fixture_tables: list[Table] = show_fixture_tables(\n...     settings,\n...     print_in_call=False)\n>>> len(fixture_tables)\n1\n>>> fixture_tables[0].title\n'dataprovider'\n>>> [column.header for column in fixture_tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n>>> fixture_tables = show_fixture_tables(settings)\n... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n
Note

It is possible for the example test to fail at different screen sizes. Try increasing the window or screen width of the terminal used before raising an issue.

Source code in alto2txt2fixture/cli.py
def show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]:\n\"\"\"Print fixture tables specified in ``settings.fixture_tables`` in `rich.Table` format.\n    Arguments:\n        run_settings: `alto2txt2fixture` run configuration\n        print_in_call: whether to print to console (will use ``console`` variable if so)\n        data_provider_index: key to index `dataprovider` from ``NEWSPAPER_COLLECTION_METADATA``\n    Returns:\n        A `list` of `rich.Table` renders from configurations in ``run_settings.FIXTURE_TABLES``\n    Example:\n        ```pycon\n        >>> fixture_tables: list[Table] = show_fixture_tables(\n        ...     settings,\n        ...     print_in_call=False)\n        >>> len(fixture_tables)\n        1\n        >>> fixture_tables[0].title\n        'dataprovider'\n        >>> [column.header for column in fixture_tables[0].columns]\n        ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n        >>> fixture_tables = show_fixture_tables(settings)\n        ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n        <BLANKLINE>\n        ...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n        ```\n    Note:\n        It is possible for the example test to fail in different screen sizes. Try\n        increasing the window or screen width of terminal used to check before\n        raising an issue.\n    \"\"\"\nif run_settings.FIXTURE_TABLES:\nif \"dataprovider\" in run_settings.FIXTURE_TABLES:\ncheck_newspaper_collection_configuration(\nrun_settings.COLLECTIONS,\nrun_settings.FIXTURE_TABLES[\"dataprovider\"],\ndata_provider_index=data_provider_index,\n)\nconsole_tables: list[Table] = list(\ngen_fixture_tables(run_settings.FIXTURE_TABLES)\n)\nif print_in_call:\nfor console_table in console_tables:\nconsole.print(console_table)\nreturn console_tables\nelse:\nreturn []\n
"},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_setup","title":"show_setup","text":"
show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs: str) -> None\n

Generate a rich.table.Table for printing configuration to console.
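Each keyword argument becomes a Setting/Value row in the printed table, for example (a sketch with hypothetical values):

```python
from alto2txt2fixture.cli import show_setup

# Print a two-row configuration table without clearing the screen.
show_setup(clear=False, COLLECTIONS="['hmd', 'lwm']", OUTPUT="./output/fixtures")
```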

Source code in alto2txt2fixture/cli.py
def show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs) -> None:\n\"\"\"Generate a `rich.table.Table` for printing configuration to console.\"\"\"\nif clear and os.name == \"posix\":\nos.system(\"clear\")\nelif clear:\nos.system(\"cls\")\ntable = Table(title=title)\ntable.add_column(\"Setting\", justify=\"right\", style=\"cyan\", no_wrap=True)\ntable.add_column(\"Value\", style=\"magenta\")\nfor key, value in kwargs.items():\ntable.add_row(str(key), str(value))\nconsole.print(table)\nreturn\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html","title":"create_adjacent_tables","text":""},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.correct_dict","title":"correct_dict","text":"
correct_dict(o: dict) -> list\n

Returns a list of (key, label, Wikidata ID) tuples from a provided dictionary, swapping any value pair whose first element is a Wikidata Q identifier so that the label always comes first.
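For example (hypothetical entries):

```pycon
>>> correct_dict({
...     "Q84": ("London", "Q21"),   # label first: kept in order
...     "Q25": ("Q55", "Cardiff"),  # Q identifier first: swapped
... })
[('Q84', 'London', 'Q21'), ('Q25', 'Cardiff', 'Q55')]
```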

Source code in alto2txt2fixture/create_adjacent_tables.py
def correct_dict(o: dict) -> list:\n\"\"\"Returns a list with corrected data from a provided dictionary.\"\"\"\nreturn [(k, v[0], v[1]) for k, v in o.items() if not v[0].startswith(\"Q\")] + [\n(k, v[1], v[0]) for k, v in o.items() if v[0].startswith(\"Q\")\n]\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.csv2json_list","title":"csv2json_list","text":"
csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list\n

Save csv_path as a json file and return as a list.
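A sketch, assuming a small CSV whose first column is the primary key (the filename and contents are hypothetical):

```python
from pathlib import Path

# Hypothetical input CSV; the first column is read as the index (pk).
Path("Newspaper.csv").write_text("pk,title\n1,The Example Gazette\n")

data = csv2json_list("Newspaper.csv", output_path=Path("."))
# data -> [{'pk': 1, 'model': 'newspaper',
#           'fields': {'title': 'The Example Gazette'}}]
# A Newspaper.json file is also written to the output path.
```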

Source code in alto2txt2fixture/create_adjacent_tables.py
def csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list:\n\"\"\"Save `csv_path` as a `json` file and return as a `list`.\"\"\"\njson_data = []\n# See this suggestion for `nan` values: https://stackoverflow.com/a/62691803/678486\ndf = (\npd.read_csv(csv_path, index_col=0).fillna(np.nan).replace([np.nan], [None])\n)  # fillna(None)\nif \"political_leanings\" in df.columns:\ndf[\"political_leanings\"] = df[\"political_leanings\"].apply(json.loads)\nif \"prices\" in df.columns:\ndf[\"prices\"] = df[\"prices\"].apply(json.loads)\nmodel = Path(csv_path).stem.lower()\nfor pk, row in df.iterrows():\nfields = row.to_dict()\njson_data.append({\"pk\": pk, \"model\": model, \"fields\": fields})\n(Path(output_path) / csv_path).parent.mkdir(parents=True, exist_ok=True)\nPath(output_path / f\"{Path(csv_path).stem}.json\").write_text(\njson.dumps(json_data, indent=indent)\n)\nif not saved is None:\nsaved.append(output_path / f\"{Path(csv_path).stem}.json\")\nreturn json_data\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.download_data","title":"download_data","text":"
download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None\n

Download files in files_dict, overwrite if specified.

Parameters:

Name Type Description Default files_dict RemoteDataFilesType

dict of related files to download

{} overwrite bool

bool to overwrite LOCAL_CACHE files or not

OVERWRITE exclude list[str]

list of files to exclude from files_dict

[] Example
>>> tmp: Path = getfixture('tmpdir')\n>>> set_path: Path = tmp.chdir()\n>>> download_data(exclude=[\n...     \"mitchells\", \"Newspaper-1\", \"linking\"\n... ])  # doctest: +ELLIPSIS\nExcluding mitchells...\nExcluding Newspaper-1...\nExcluding linking...\nDownloading cache...dict_admin_counties.json\n100% ... 37/37 bytes\nDownloading cache...dict_countries.json\n100% ... 33.2/33.2 kB\nDownloading cache...dict_historic_counties.json\n100% ... 41.4/41.4 kB\nDownloading cache...nlp_loc_wikidata_concat.csv\n100% ... 59.8/59.8 kB\nDownloading cache...wikidata_gazetteer_selected_columns.csv\n100% ... 47.8/47.8 MB\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None:\n\"\"\"Download files in ``files_dict``, overwrite if specified.\n    Args:\n        files_dict: `dict` of related files to download\n        overwrite: `bool` to overwrite ``LOCAL_CACHE`` files or not\n        exclude: `list` of files to exclude from ``files_dict``\n    Example:\n        ```pycon\n        >>> tmp: Path = getfixture('tmpdir')\n        >>> set_path: Path = tmp.chdir()\n        >>> download_data(exclude=[\n        ...     \"mitchells\", \"Newspaper-1\", \"linking\"\n        ... ])  # doctest: +ELLIPSIS\n        Excluding mitchells...\n        Excluding Newspaper-1...\n        Excluding linking...\n        Downloading cache...dict_admin_counties.json\n        100% ... 37/37 bytes\n        Downloading cache...dict_countries.json\n        100% ... 33.2/33.2 kB\n        Downloading cache...dict_historic_counties.json\n        100% ... 41.4/41.4 kB\n        Downloading cache...nlp_loc_wikidata_concat.csv\n        100% ... 59.8/59.8 kB\n        Downloading cache...wikidata_gazetteer_selected_columns.csv\n        100% ... 47.8/47.8 MB\n        ```\n    \"\"\"\nif not files_dict:\nfiles_dict = deepcopy(FILES)\nfor data_source in exclude:\nif data_source in files_dict:\nprint(f\"Excluding {data_source}...\")\nfiles_dict.pop(data_source, 0)\nelse:\nlogger.warning(\nf'\"{data_source}\" not an option to exclude from {files_dict}'\n)\n# Describe whether local file exists\nfor k in files_dict.keys():\nfiles_dict[k][\"exists\"] = files_dict[k][\"local\"].exists()\nfiles_to_download = [\n(v[\"remote\"], v[\"local\"], v[\"exists\"])\nfor v in files_dict.values()\nif \"exists\" in v and not v[\"exists\"] or overwrite\n]\nfor url, out, exists in files_to_download:\nrmtree(Path(out), ignore_errors=True) if exists else None\nprint(f\"Downloading {out}\")\nPath(out).parent.mkdir(parents=True, exist_ok=True)\nassert isinstance(url, str)\nwith urlopen(url) as response, open(out, \"wb\") as out_file:\ntotal: int = int(response.info()[\"Content-length\"])\nwith Progress(\n\"[progress.percentage]{task.percentage:>3.0f}%\",\nBarColumn(),  # removed bar_width=None to avoid too long when resized\nDownloadColumn(),\n) as progress:\ndownload_task = progress.add_task(\"Download\", total=total)\nfor chunk in response:\nout_file.write(chunk)\nprogress.update(download_task, advance=len(chunk))\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_list","title":"get_list","text":"
get_list(x)\n

Get a list from a string, which contains `<SEP>` as separator. If no string is encountered, the function returns an empty list.

Source code in alto2txt2fixture/create_adjacent_tables.py

def get_list(x):\n\"\"\"Get a list from a string, which contains <SEP> as separator. If no\n    string is encountered, the function returns an empty list.\"\"\"\nreturn x.split(\"<SEP>\") if isinstance(x, str) else []\n
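For example:

```pycon
>>> get_list("Liberal<SEP>Conservative")
['Liberal', 'Conservative']
>>> get_list(None)
[]
```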
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_outpaths_dict","title":"get_outpaths_dict","text":"
get_outpaths_dict(\nnames: Sequence[str], module_name: str\n) -> TableOutputConfigType\n

Return a dict of csv and json paths for each module_name table.

The csv and json filenames are generated by prefixing each name with module_name.

Parameters:

Name Type Description Default names Sequence[str]

iterable of names of each module_name's component. Main target is csv and json table names

required module_name str

name of module each name is part of, that is added as a prefix

required

Returns:

Type Description TableOutputConfigType

A TableOutputConfigType: a dict of table names and output csv and json filenames.

Example
>>> from pprint import pprint\n>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n                      'json': 'mitchells.PoliticalLeaning.json'},\n 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConfigType:\n\"\"\"Return a `dict` of `csv` and `json` paths for each `module_name` table.\n    The `csv` and `json` paths\n    Args:\n        names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names\n        module_name: name of module each name is part of, that is added as a prefix\n    Returns:\n        A ``TableOutputConfigType``: a `dict` of table ``names`` and output\n            `csv` and `json` filenames.\n    Example:\n        ```pycon\n        >>> from pprint import pprint\n        >>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n        {'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n         'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n         'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n                              'json': 'mitchells.PoliticalLeaning.json'},\n         'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n        ```\n    \"\"\"\nreturn {\nname: OutputPathDict(\ncsv=f\"{module_name}.{name}.csv\",\njson=f\"{module_name}.{name}.json\",\n)\nfor name in names\n}\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.run","title":"run","text":"
run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None\n

Download, process and link files_dict to json and csv.

Note

This will require access to https://zooniversedata.blob.core.windows.net/downloads/.

Source code in alto2txt2fixture/create_adjacent_tables.py
def run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None:\n\"\"\"Download, process and link ``files_dict`` to `json` and `csv`.\n    Note:\n        This will require access to `https://zooniversedata.blob.core.windows.net/downloads/`.\n    \"\"\"\n# Ensure time_stamp from the point of calling `run`\nif not time_stamp:\ntime_stamp = get_now(as_str=False).strftime(TIME_FORMAT)\n# Ensure an independent deepcopy of FILES to avoid modifying subsequent runs\nif not files_dict:\nfiles_dict = deepcopy(FILES)\n# Download non-existing files\ndownload_data(files_dict=files_dict, overwrite=files_to_download_overwrite)\n# Create the output directory (defined in output_path)\noutput_path.mkdir(exist_ok=True, parents=True)\n# Read all the Wikidata Q values from Mitchells\nmitchells_df = pd.read_csv(files_dict[\"mitchells\"][\"local\"], index_col=0)\nmitchell_wikidata_mentions = sorted(\nlist(mitchells_df.PLACE_PUB_WIKI.unique()),\nkey=lambda x: int(x.replace(\"Q\", \"\")),\n)\n# Set up wikidata_gazetteer\ngaz_cols = [\"wikidata_id\", \"english_label\", \"latitude\", \"longitude\", \"geonamesIDs\"]\nwikidata_gazetteer = pd.read_csv(\nfiles_dict[\"wikidata_gazetteer_selected_columns\"][\"local\"], usecols=gaz_cols\n)\nwikidata_gazetteer.rename(\n{\n\"wikidata_id\": \"place_wikidata_id\",\n\"english_label\": \"place_label\",\n\"geonamesIDs\": \"geonames_ids\",\n},\naxis=1,\ninplace=True,\n)\n# Read in + fix all dictionaries\ndict_historic_counties = json.loads(\nPath(files_dict[\"dict_historic_counties\"][\"local\"]).read_text()\n)\ndict_admin_counties = json.loads(\nPath(files_dict[\"dict_admin_counties\"][\"local\"]).read_text()\n)\ndict_countries = json.loads(Path(files_dict[\"dict_countries\"][\"local\"]).read_text())\ndict_historic_counties = correct_dict(dict_historic_counties)\ndict_admin_counties = correct_dict(dict_admin_counties)\ndict_countries = correct_dict(dict_countries)\n# Create assisting frames\nhistorical_counties_df = pd.DataFrame(\ndict_historic_counties,\ncolumns=[\"place_wikidata_id\", \"hcounty_label\", \"hcounty_wikidata_id\"],\n)\nadmin_county_df = pd.DataFrame(\ndict_admin_counties,\ncolumns=[\n\"place_wikidata_id\",\n\"admin_county_label\",\n\"admin_county_wikidata_id\",\n],\n)\ncountries_df = pd.DataFrame(\ndict_countries,\ncolumns=[\"place_wikidata_id\", \"country_label\", \"country_wikidata_id\"],\n)\nwikidata_gazetteer = wikidata_gazetteer[\nwikidata_gazetteer.place_wikidata_id.isin(mitchell_wikidata_mentions)\n].sort_values(\"place_wikidata_id\")\nwikidata_gazetteer[\"place_pk\"] = np.arange(1, len(wikidata_gazetteer) + 1)\nwikidata_gazetteer = wikidata_gazetteer[\n[\"place_pk\"] + [x for x in wikidata_gazetteer.columns if not x == \"place_pk\"]\n]\n# Merge wikidata_gazetteer with all the assisting frames (and rename the\n# resulting columns)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, historical_counties_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, admin_county_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, countries_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer.rename(\n{\n\"admin_county_label\": \"admin_county__label\",\n\"admin_county_wikidata_id\": \"admin_county__wikidata_id\",\n\"hcounty_label\": \"historic_county__label\",\n\"hcounty_wikidata_id\": \"historic_county__wikidata_id\",\n\"country_label\": 
\"country__label\",\n\"country_wikidata_id\": \"country__wikidata_id\",\n},\naxis=1,\ninplace=True,\n)\n# Split back up into dataframes specific for the tables\nhistoric_county_table = (\nwikidata_gazetteer[[\"historic_county__label\", \"historic_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nhistoric_county_table = historic_county_table.replace({\"\": np.nan}).dropna()\nhistoric_county_table[\"historic_county__pk\"] = np.arange(\n1, len(historic_county_table) + 1\n)\nadmin_county_table = (\nwikidata_gazetteer[[\"admin_county__label\", \"admin_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nadmin_county_table = admin_county_table.replace({\"\": np.nan}).dropna()\nadmin_county_table[\"admin_county__pk\"] = np.arange(1, len(admin_county_table) + 1)\ncountry_table = (\nwikidata_gazetteer[[\"country__label\", \"country__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\ncountry_table = country_table.replace({\"\": np.nan}).dropna()\ncountry_table[\"country__pk\"] = np.arange(1, len(country_table) + 1)\n# Set up place_table from wikidata_gazetteer\nplace_table = wikidata_gazetteer.copy()\nplace_table = (\npd.merge(\nplace_table,\nhistoric_county_table,\non=[\"historic_county__label\", \"historic_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"historic_county__label\", \"historic_county__wikidata_id\"], axis=1)\n.rename({\"historic_county__pk\": \"historic_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\nadmin_county_table,\non=[\"admin_county__label\", \"admin_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"admin_county__label\", \"admin_county__wikidata_id\"], axis=1)\n.rename({\"admin_county__pk\": \"admin_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\ncountry_table,\non=[\"country__label\", \"country__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"country__label\", \"country__wikidata_id\"], axis=1)\n.rename({\"country__pk\": \"country_id\"}, axis=1)\n)\nplace_table.fillna(\"\", inplace=True)\nplace_table.set_index(\"place_pk\", inplace=True)\nplace_table.rename(\n{\"place_label\": \"label\", \"place_wikidata_id\": \"wikidata_id\"},\naxis=1,\ninplace=True,\n)\nplace_table[\"historic_county_id\"] = (\nplace_table[\"historic_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"admin_county_id\"] = (\nplace_table[\"admin_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"country_id\"] = (\nplace_table[\"country_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table.index.rename(\"pk\", inplace=True)\nplace_table.rename(\n{\n\"historic_county_id\": \"historic_county\",\n\"admin_county_id\": \"admin_county\",\n\"country_id\": \"country\",\n},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.set_index(\"historic_county__pk\", inplace=True)\nhistoric_county_table.rename(\n{x: x.split(\"__\")[1] for x in historic_county_table.columns},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.index.rename(\"pk\", inplace=True)\nadmin_county_table.set_index(\"admin_county__pk\", inplace=True)\nadmin_county_table.rename(\n{x: x.split(\"__\")[1] for x in admin_county_table.columns}, axis=1, inplace=True\n)\nadmin_county_table.index.rename(\"pk\", inplace=True)\ncountry_table.set_index(\"country__pk\", inplace=True)\ncountry_table.rename(\n{x: x.split(\"__\")[1] for x in country_table.columns}, axis=1, inplace=True\n)\ncountry_table.index.rename(\"pk\", inplace=True)\n# Adding created_at, updated_at to all the 
gazetteer tables\nplace_table[\"created_at\"] = time_stamp\nplace_table[\"updated_at\"] = time_stamp\nadmin_county_table[\"created_at\"] = time_stamp\nadmin_county_table[\"updated_at\"] = time_stamp\nhistoric_county_table[\"created_at\"] = time_stamp\nhistoric_county_table[\"updated_at\"] = time_stamp\ncountry_table[\"created_at\"] = time_stamp\ncountry_table[\"updated_at\"] = time_stamp\n# Save CSV files for gazetteer tables\nplace_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"])\nadmin_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"]\n)\nhistoric_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"]\n)\ncountry_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"])\nsaved.extend(\n[\noutput_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"],\n]\n)\n# Fix up Mitchells (already loaded)\nmitchells_df[\"politics\"] = mitchells_df.POLITICS.apply(get_list)\nmitchells_df[\"persons\"] = mitchells_df.PERSONS.apply(get_list)\nmitchells_df[\"organisations\"] = mitchells_df.ORGANIZATIONS.apply(get_list)\nmitchells_df[\"price\"] = mitchells_df.PRICE.apply(get_list)\nmitchells_df.rename(\n{\n\"ID\": \"mpd_id\",\n\"TITLE\": \"title\",\n\"politics\": \"political_leaning_raw\",\n\"price\": \"price_raw\",\n\"YEAR\": \"year\",\n\"PLACE_PUB_WIKI\": \"place_of_publication_id\",\n\"ESTABLISHED_DATE\": \"date_established_raw\",\n\"PUBLISED_DATE\": \"day_of_publication_raw\",\n},\naxis=1,\ninplace=True,\n)\ndrop_cols = [\n\"CHAIN_ID\",\n\"POLITICS\",\n\"PERSONS\",\n\"ORGANIZATIONS\",\n\"PRICE\",\n\"PLACE_PUB\",\n\"PLACE_PUB_COORD\",\n\"PLACES\",\n\"PLACES_TRES\",\n\"TEXT\",\n]\nmitchells_df.drop(columns=drop_cols, inplace=True)\n# Create derivative tables (from Mitchells) = political_leanings, prices,\n# issues\npolitical_leanings = sorted(\nlist(set([y.strip() for x in mitchells_df.political_leaning_raw for y in x]))\n)\npolitical_leanings_table = pd.DataFrame()\npolitical_leanings_table[\"political_leaning__pk\"] = np.arange(\n1, len(political_leanings) + 1\n)\npolitical_leanings_table[\"political_leaning__label\"] = political_leanings\nexport = political_leanings_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"political_leaning__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nprices = sorted(list(set([y.strip() for x in mitchells_df.price_raw for y in x])))\nprices_table = pd.DataFrame()\nprices_table[\"price__pk\"] = np.arange(1, len(prices) + 1)\nprices_table[\"price__label\"] = prices\nexport = prices_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"price__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nissues = 
sorted(list(mitchells_df.year.unique()))\nissues_table = pd.DataFrame()\nissues_table[\"issue__pk\"] = np.arange(1, len(issues) + 1)\nissues_table[\"issue__year\"] = issues\nexport = issues_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"issue__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\n# Set up linking on Mitchells dataframe\nlinking_df = pd.read_csv(\nfiles_dict[\"linking\"][\"local\"],\nindex_col=0,\ndtype={\"NLP\": str},\nusecols=[\n\"NLP\",\n\"Title\",\n\"AcquiredYears\",\n\"Editions\",\n\"EditionTitles\",\n\"City\",\n\"Publisher\",\n\"UnavailableYears\",\n\"Collection\",\n\"UK\",\n\"Complete\",\n\"Notes\",\n\"County\",\n\"HistoricCounty\",\n\"First date held\",\n\"Publication title\",\n\"link_to_mpd\",\n],\n)\nlinking_df[\"NLP\"] = linking_df.index\nlinking_df.rename(\n{\"link_to_mpd\": \"mpd_id\", \"NLP\": \"newspaper\"}, axis=1, inplace=True\n)\n# Link Mitchells with all the other data\nmitchells_df = pd.merge(mitchells_df, linking_df, on=\"mpd_id\", how=\"inner\")\n# Create entry_table\nentry_table = mitchells_df.copy()\nentry_table[\"place_of_circulation_raw\"] = \"\"\nentry_table[\"publication_district_raw\"] = \"\"\nentry_table[\"publication_county_raw\"] = \"\"\n# TODO: What happened to the three columns above? (Check w Kaspar?)\n# Only keep relevant columns\nentry_table = entry_table[\n[\n\"title\",\n\"political_leaning_raw\",\n\"price_raw\",\n\"year\",\n\"date_established_raw\",\n\"day_of_publication_raw\",\n\"place_of_circulation_raw\",\n\"publication_district_raw\",\n\"publication_county_raw\",\n\"organisations\",\n\"persons\",\n\"place_of_publication_id\",\n\"newspaper\",\n]\n]\n# Fix refs to political_leanings_table\nrev = political_leanings_table.set_index(\"political_leaning__label\")\nentry_table[\"political_leanings\"] = entry_table.political_leaning_raw.apply(\nlambda x: [rev.at[y, \"political_leaning__pk\"] for y in x]\n)\n# Fix refs to prices_table\nrev = prices_table.set_index(\"price__label\")\nentry_table[\"prices\"] = entry_table.price_raw.apply(\nlambda x: [rev.at[y.strip(), \"price__pk\"] for y in x]\n)\n# Fix refs to issues_table\nrev = issues_table.set_index(\"issue__year\")\nentry_table[\"issue\"] = entry_table.year.apply(lambda x: rev.at[x, \"issue__pk\"])\n# Fix refs to place_table\nrev = place_table.copy()\nrev[\"place__pk\"] = rev.index\nrev.set_index(\"wikidata_id\", inplace=True)\nentry_table[\"place_of_publication\"] = entry_table.place_of_publication_id.apply(\ntest_place, rev=rev\n)\nentry_table.drop(columns=[\"place_of_publication_id\"], inplace=True)\n# Set up ref to newspapers\nrev = json.loads(files_dict[\"Newspaper-1\"][\"local\"].read_text())\nrev = [dict(pk=v[\"pk\"], **v[\"fields\"]) for v in rev]\nrev = pd.DataFrame(rev)\nrev.set_index(\"publication_code\", inplace=True)\nentry_table[\"newspaper\"] = entry_table.newspaper.str.zfill(7)\nentry_table[\"newspaper\"] = entry_table.newspaper.apply(test_paper, rev=rev)\n# Create PK for entries\nentry_table[\"pk\"] = np.arange(1, len(entry_table) + 1)\n# Sort columns in entries file\nentry_table = entry_table[\n[\"pk\"] + [col for col in entry_table.columns if not col == \"pk\"]\n]\n# Add created_at, modified_at to entry_table\nentry_table[\"created_at\"] 
= time_stamp\nentry_table[\"updated_at\"] = time_stamp\n# Export entry_table\nentry_table.set_index(\"pk\").to_csv(\noutput_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"]\n)\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"])\n# ######\u00a0NOW WE CAN EASILY CREATE JSON files_dict\nfor csv_file_path in output_path.glob(\"*.csv\"):\ncsv2json_list(csv_file_path)\nprint(\"Finished - saved files:\")\nprint(\"- \" + \"\\n- \".join([str(x) for x in saved]))\n
"},{"location":"reference/alto2txt2fixture/jisc.html","title":"jisc","text":""},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.get_jisc_title","title":"get_jisc_title","text":"
get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str\n

Match a newspaper title with jisc_papers records.

Takes an input_sub_path, a publication_code, and an (optional) abbreviation for any newspaper to locate the title in the jisc_papers DataFrame. jisc_papers is usually loaded via the setup_jisc_papers function.

Parameters:

Name Type Description Default title str

target newspaper title

required issue_date str

target newspaper issue_date

required jisc_papers pd.DataFrame

DataFrame of jisc_papers to match

required input_sub_path str

path of files used to narrow down the query

required publication_code str

unique codes to match newspaper records

required abbr str | None

an optional abbreviation of the newspaper title

None

Returns:

Type Description str

Matched title str or abbr: a string estimating the JISC equivalent newspaper title.

Source code in alto2txt2fixture/jisc.py
def get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str:\n\"\"\"\n    Match a newspaper ``title`` with ``jisc_papers`` records.\n    Takes an ``input_sub_path``, a ``publication_code``, and an (optional)\n    abbreviation for any newspaper to locate the ``title`` in the\n    ``jisc_papers`` `DataFrame`. ``jisc_papers`` is usually loaded via the\n    ``setup_jisc_papers`` function.\n    Args:\n        title: target newspaper title\n        issue_date: target newspaper issue_date\n        jisc_papers: `DataFrame` of `jisc_papers` to match\n        input_sub_path: path of files to narrow down query input_sub_path\n        publication_code: unique codes to match newspaper records\n        abbr: an optional abbreviation of the newspaper title\n    Returns:\n        Matched ``title`` `str` or ``abbr``.\n    Returns:\n        A string estimating the JISC equivalent newspaper title\n    \"\"\"\n# First option, search the input_sub_path for a valid-looking publication_code\ng = PUBLICATION_CODE.findall(input_sub_path)\nif len(g) == 1:\npublication_code = g[0]\n# Let's see if we can find title:\ntitle = (\njisc_papers[\njisc_papers.publication_code == publication_code\n].title.to_list()[0]\nif jisc_papers[\njisc_papers.publication_code == publication_code\n].title.count()\n== 1\nelse title\n)\nreturn title\n# Second option, look through JISC papers for best match (on publication_code if we have it, but abbr more importantly if we have it)\nif abbr:\n_publication_code = publication_code\npublication_code = abbr\nif jisc_papers.abbr[jisc_papers.abbr == publication_code].count():\ndate = datetime.strptime(issue_date, \"%Y-%m-%d\")\nmask = (\n(jisc_papers.abbr == publication_code)\n& (date >= jisc_papers.start_date)\n& (date <= jisc_papers.end_date)\n)\nfiltered = jisc_papers.loc[mask]\nif filtered.publication_code.count() == 1:\npublication_code = filtered.publication_code.to_list()[0]\ntitle = filtered.title.to_list()[0]\nreturn title\n# Last option: let's find all the possible titles in the jisc_papers for the abbreviation, and if it's just one unique title, let's pick it!\nif abbr:\ntest = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test[0]\nelse:\nmask1 = (jisc_papers.abbr == publication_code) & (\njisc_papers.publication_code == _publication_code\n)\ntest1 = jisc_papers.loc[mask1]\ntest1 = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test1[0]\n# Fallback: if abbreviation is set, we'll return that:\nif abbr:\n# For these exceptions, see issue comment:\n# https://github.com/alan-turing-institute/Living-with-Machines/issues/2453#issuecomment-1050652587\nif abbr == \"IPJL\":\nreturn \"Ipswich Journal\"\nelif abbr == \"BHCH\":\nreturn \"Bath Chronicle\"\nelif abbr == \"LSIR\":\nreturn \"Leeds Intelligencer\"\nelif abbr == \"AGER\":\nreturn \"Lancaster Gazetter, And General Advertiser For Lancashire West\"\nreturn abbr\nraise RuntimeError(f\"Title {title} could not be found.\")\n
"},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.setup_jisc_papers","title":"setup_jisc_papers","text":"
setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame\n

Create a DataFrame with information in JISC_PAPERS_CSV in settings.

Returns:

Type Description pd.DataFrame

DataFrame with all JISC titles.

Source code in alto2txt2fixture/jisc.py
def setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame:\n\"\"\"\n    Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.\n    Returns:\n        `DataFrame` with all JISC titles.\n    \"\"\"\nif not Path(path).exists():\nraise RuntimeError(\nf\"Could not find required JISC papers file. Put {Path(path).name} in {Path(path).parent} or correct the settings with a different path.\"\n)\nmonths = {\n\"Jan\": 1,\n\"Feb\": 2,\n\"Mar\": 3,\n\"Apr\": 4,\n\"May\": 5,\n\"Jun\": 6,\n\"June\": 6,\n\"Jul\": 7,\n\"July\": 7,\n\"Aug\": 8,\n\"Sep\": 9,\n\"Sept\": 9,\n\"Oct\": 10,\n\"Nov\": 11,\n\"Dec\": 12,\n\"Dec.\": 12,\n}\njisc_papers = pd.read_csv(\npath,\nusecols=[\n\"Newspaper Title\",\n\"NLP\",\n\"Abbr\",\n\"StartD\",\n\"StartM\",\n\"StartY\",\n\"EndD\",\n\"EndM\",\n\"EndY\",\n],\n)\njisc_papers[\"start_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.StartY),\nmonth=months[x.StartM.strip(\".\").strip()],\nday=int(x.StartD),\n),\naxis=1,\n)\njisc_papers[\"end_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.EndY), month=months[x.EndM.strip(\".\").strip()], day=int(x.EndD)\n),\naxis=1,\n)\njisc_papers.drop(\n[\"StartD\", \"StartM\", \"StartY\", \"EndD\", \"EndM\", \"EndY\"],\naxis=\"columns\",\ninplace=True,\n)\njisc_papers.rename(\n{\"Newspaper Title\": \"title\", \"NLP\": \"publication_code\", \"Abbr\": \"abbr\"},\naxis=1,\ninplace=True,\n)\njisc_papers[\"title\"] = jisc_papers[\"title\"].apply(\nlambda x: \"The \" + x[:-5] if x.strip()[-5:].lower() == \", the\" else x\n)\njisc_papers[\"publication_code\"] = jisc_papers[\"publication_code\"].apply(\nlambda x: str(x).zfill(7)\n)\nreturn jisc_papers\n
"},{"location":"reference/alto2txt2fixture/log.html","title":"log","text":""},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.error","title":"error","text":"
error(msg: str, crash: bool = True, silent: bool = True) -> None\n

Print msg in colorama Fore.RED and exit()

If crash=True and silent=True, exit() after printing; if crash=True and silent=False, raise a RuntimeError; otherwise just print msg.
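A usage sketch (the message is hypothetical):

```python
from alto2txt2fixture.log import error

# Print in red and exit() -- the default behaviour:
# error("Could not find the mountpoint.")

# Print in red and raise a RuntimeError instead of exiting:
error("Could not find the mountpoint.", crash=True, silent=False)
```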

Source code in alto2txt2fixture/log.py
def error(msg: str, crash: bool = True, silent: bool = True) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.RED` and `exit()`\n    If `silent` `exit()` after call, else `raise` `RuntimeError` if ``crash=True``.\"\"\"\nif crash and silent:\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nexit()\nelif crash:\nraise RuntimeError(msg) from None\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.info","title":"info","text":"
info(msg: str) -> None\n

Print msg in colorama Fore.CYAN colour.

Source code in alto2txt2fixture/log.py
def info(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.CYAN` colour.\"\"\"\nprint(f\"{Fore.CYAN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.success","title":"success","text":"
success(msg: str) -> None\n

Print msg in colorama Fore.GREEN colour.

Source code in alto2txt2fixture/log.py
def success(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.GREEN` colour.\"\"\"\nprint(f\"{Fore.GREEN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.warning","title":"warning","text":"
warning(msg: str) -> None\n

Print msg in colorama Fore.YELLOW colour.

Source code in alto2txt2fixture/log.py
def warning(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.YELLOW` colour.\"\"\"\nprint(f\"{Fore.YELLOW}Warning: {msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/parser.html","title":"parser","text":""},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.fixtures","title":"fixtures","text":"
fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]\n

Generates fixtures for a specified model using a list of files.

This function takes a list of files and generates fixtures for a specified model. The fixtures can be used to populate a database or perform other data-related operations.

Parameters:

Name Type Description Default filelist list

A list of files to process and generate fixtures from.

[] model str

The name of the model for which fixtures are generated.

'' translate dict

A nested dictionary representing the translation mapping for fields. The structure of the translator follows the format:

{\n'part1': {\n'part2': {\n'translated_field': 'pk'\n}\n}\n}\n
The translated fields will be used as keys, and their corresponding primary keys (obtained from the provided files) will be used as values in the generated fixtures.

{} rename dict

A nested dictionary representing the field renaming mapping. The structure of the dictionary follows the format:

{\n'part1': {\n'part2': 'new_field_name'\n}\n}\n
The fields specified in the dictionary will be renamed to the provided new field names in the generated fixtures.

{} uniq_keys list

A list of fields that need to be considered for uniqueness in the fixtures. If specified, the fixtures will yield only unique items based on the combination of these fields.

[]

Yields:

Type Description FixtureDict

FixtureDict from model, pk and dict of fields.

Returns:

Type Description Generator[FixtureDict, None, None]

This function generates fixtures but does not return any value.
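A usage sketch mirroring how parse() (documented further below) calls this function; the file list and translation mapping here are hypothetical:

```python
from pathlib import Path

issue_files: list[Path] = []  # hypothetical: paths to issue JSON files

issue_fixtures = list(
    fixtures(
        filelist=issue_files,
        model="newspapers.issue",
        # Hypothetical mapping: publication code "0002647" -> newspaper pk 1.
        translate={"publication": {"publication_code": {"0002647": 1}}},
        rename={"publication": {"publication_code": "newspaper_id"}},
    )
)
```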

Source code in alto2txt2fixture/parser.py
def fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]:\n\"\"\"\n    Generates fixtures for a specified model using a list of files.\n    This function takes a list of files and generates fixtures for a specified\n    model. The fixtures can be used to populate a database or perform other\n    data-related operations.\n    Args:\n        filelist: A list of files to process and generate fixtures from.\n        model: The name of the model for which fixtures are generated.\n            translate: A nested dictionary representing the translation mapping\n            for fields. The structure of the translator follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': {\n                        'translated_field': 'pk'\n                    }\n                }\n            }\n            ```\n            The translated fields will be used as keys, and their\n            corresponding primary keys (obtained from the provided files) will\n            be used as values in the generated fixtures.\n        rename: A nested dictionary representing the field renaming\n            mapping. The structure of the dictionary follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': 'new_field_name'\n                }\n            }\n            ```\n            The fields specified in the dictionary will be renamed to the\n            provided new field names in the generated fixtures.\n        uniq_keys: A list of fields that need to be considered for\n            uniqueness in the fixtures. If specified, the fixtures will yield\n            only unique items based on the combination of these fields.\n    Yields:\n        `FixtureDict` from ``model``, ``pk`` and `dict` of ``fields``.\n    Returns:\n        This function generates fixtures but does not return any value.\n    \"\"\"\nfilelist = sorted(filelist, key=lambda x: str(x).split(\"/\")[:-1])\ncount = len(filelist)\n# Process JSONL\nif [x for x in filelist if \".jsonl\" in x.name]:\npk = 0\n# In the future, we might want to show progress here (tqdm or suchlike)\nfor file in filelist:\nfor line in file.read_text().splitlines():\npk += 1\nline = json.loads(line)\nyield FixtureDict(\npk=pk,\nmodel=model,\nfields=dict(**get_fields(line, translate=translate, rename=rename)),\n)\nreturn\nelse:\n# Process JSON\npks = [x for x in range(1, count + 1)]\nif len(uniq_keys):\nuniq_files = list(uniq(filelist, uniq_keys))\ncount = len(uniq_files)\nzipped = zip(uniq_files, pks)\nelse:\nzipped = zip(filelist, pks)\nfor x in tqdm(\nzipped, total=count, desc=f\"{model} ({count:,} objs)\", leave=False\n):\nyield FixtureDict(\npk=x[1],\nmodel=model,\nfields=dict(**get_fields(x[0], translate=translate, rename=rename)),\n)\nreturn\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_fields","title":"get_fields","text":"
get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict\n

Retrieves fields from a file and performs modifications and checks.

This function takes a file (in various formats: Path, str, or dict) and processes its fields. It retrieves the fields from the file and performs modifications, translations, and checks on the fields.

Parameters:

Name Type Description Default file Union[Path, str, dict]

The file from which the fields are retrieved.

required translate dict

A nested dictionary representing the translation mapping for fields. The structure of the translator follows the format:

{\n'part1': {\n'part2': {\n'translated_field': 'pk'\n}\n}\n}\n
The translated fields will be used to replace the original fields in the retrieved fields.

{} rename dict

A nested dictionary representing the field renaming mapping. The structure of the dictionary follows the format:

{\n'part1': {\n'part2': 'new_field_name'\n}\n}\n
The fields specified in the dictionary will be renamed to the provided new field names in the retrieved fields.

{} allow_null bool

Determines whether to allow None values for relational fields. If set to True, relational fields with missing values will be assigned None. If set to False, an error will be raised.

False

Returns:

Type Description dict

A dictionary representing the retrieved fields from the file, with modifications and checks applied.

Raises:

Type Description RuntimeError

If the file type is unsupported or if an error occurs during field retrieval or processing.
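A sketch of the translation and renaming in action (the mapping values are hypothetical):

```python
fields = get_fields(
    {"publication__publication_code": "0002647", "item_type": "article"},
    translate={"publication": {"publication_code": {"0002647": 1}}},
    rename={"publication": {"publication_code": "newspaper_id"}},
)
assert fields["newspaper_id"] == 1       # translated, then renamed
assert fields["item_type"] == "ARTICLE"  # upper-cased by get_fields
```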

Source code in alto2txt2fixture/parser.py
def get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict:\n\"\"\"\n    Retrieves fields from a file and performs modifications and checks.\n    This function takes a file (in various formats: `Path`, `str`, or `dict`)\n    and processes its fields. It retrieves the fields from the file and\n    performs modifications, translations, and checks on the fields.\n    Args:\n        file: The file from which the fields are retrieved.\n        translate: A nested dictionary representing the translation mapping\n            for fields. The structure of the translator follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': {\n                        'translated_field': 'pk'\n                    }\n                }\n            }\n            ```\n            The translated fields will be used to replace the original fields\n            in the retrieved fields.\n        rename: A nested dictionary representing the field renaming\n            mapping. The structure of the dictionary follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': 'new_field_name'\n                }\n            }\n            ```\n            The fields specified in the dictionary will be renamed to the\n            provided new field names in the retrieved fields.\n        allow_null: Determines whether to allow ``None`` values for\n            relational fields. If set to ``True``, relational fields with\n            missing values will be assigned ``None``. If set to ``False``, an\n            error will be raised.\n    Returns:\n        A dictionary representing the retrieved fields from the file,\n            with modifications and checks applied.\n    Raises:\n        RuntimeError: If the file type is unsupported or if an error occurs\n            during field retrieval or processing.\n    \"\"\"\nif isinstance(file, Path):\ntry:\nfields = json.loads(file.read_text())\nexcept Exception as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, str):\nif \"\\n\" in file:\nraise RuntimeError(\"File has multiple lines.\")\ntry:\nfields = json.loads(file)\nexcept json.decoder.JSONDecodeError as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, dict):\nfields = file\nelse:\nraise RuntimeError(f\"Cannot process type {type(file)}.\")\n# Fix relational fields for any file\nfor key in [key for key in fields.keys() if \"__\" in key]:\nparts = key.split(\"__\")\ntry:\nbefore = fields[key]\nif before:\nbefore = before.replace(\"---\", \"/\")\nloc = translate.get(parts[0], {}).get(parts[1], {})\nfields[key] = loc.get(before)\nif fields[key] is None:\nraise RuntimeError(\nf\"Cannot translate fields.{key} from {before}: {loc}\"\n)\nexcept AttributeError:\nif allow_null:\nfields[key] = None\nelse:\nprint(\n\"Content had relational fields, but something went wrong in parsing the data:\"\n)\nprint(\"file\", file)\nprint(\"fields\", fields)\nprint(\"KEY:\", key)\nraise RuntimeError()\nnew_name = rename.get(parts[0], {}).get(parts[1], None)\nif new_name:\nfields[new_name] = fields[key]\ndel fields[key]\nfields[\"created_at\"] = NOW_str\nfields[\"updated_at\"] = NOW_str\ntry:\nfields[\"item_type\"] = str(fields[\"item_type\"]).upper()\nexcept KeyError:\npass\ntry:\nif fields[\"ocr_quality_mean\"] == \"\":\nfields[\"ocr_quality_mean\"] = 0\nexcept KeyError:\npass\ntry:\nif 
fields[\"ocr_quality_sd\"] == \"\":\nfields[\"ocr_quality_sd\"] = 0\nexcept KeyError:\npass\nreturn fields\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_key_from","title":"get_key_from","text":"
get_key_from(item: Path, x: str) -> str\n

Retrieves a specific key from a file and returns its value.

This function reads a file and extracts the value of a specified key. If the key is not found or an error occurs while processing the file, a warning is printed, and an empty string is returned.

Parameters:

Name Type Description Default item Path

The file from which the key is extracted.

required x str

The key to be retrieved from the file.

required

Returns:

Type Description str

The value of the specified key from the file.

Source code in alto2txt2fixture/parser.py
def get_key_from(item: Path, x: str) -> str:\n\"\"\"\n    Retrieves a specific key from a file and returns its value.\n    This function reads a file and extracts the value of a specified\n    key. If the key is not found or an error occurs while processing\n    the file, a warning is printed, and an empty string is returned.\n    Args:\n        item: The file from which the key is extracted.\n        x: The key to be retrieved from the file.\n    Returns:\n        The value of the specified key from the file.\n    \"\"\"\nresult = json.loads(item.read_text()).get(x, None)\nif not result:\nprint(f\"[WARN] Could not find key {x} in {item}\")\nresult = \"\"\nreturn result\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_translator","title":"get_translator","text":"
get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict\n

Converts a list of fields into a nested dictionary representing a translator.

Parameters:

Name Type Description Default fields list[TranslatorTuple]

A list of tuples representing fields to be translated.

[TranslatorTuple('', '', [])]

Returns:

Type Description dict

A nested dictionary representing the translator. The structure of the dictionary follows the format:

{\n'part1': {\n'part2': {\n'translated_field': 'pk'\n}\n}\n}\n

Example
>>> fields = [\n...     TranslatorTuple(\n...         start='start__field1',\n...         finish='field1',\n...         lst=[{\n...             'fields': {'field1': 'translation1'},\n...             'pk': 1}],\n...      )]\n>>> get_translator(fields)\n{'start': {'field1': {'translation1': 1}}}\n
Source code in alto2txt2fixture/parser.py
def get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict:\n\"\"\"\n    Converts a list of fields into a nested dictionary representing a\n    translator.\n    Args:\n        fields: A list of tuples representing fields to be translated.\n    Returns:\n        A nested dictionary representing the translator. The structure of\n            the dictionary follows the format:\n            ```python\n            {\n                'part1': {\n                      'part2': {\n                          'translated_field': 'pk'\n                      }\n                }\n            }\n            ```\n    Example:\n        ```pycon\n        >>> fields = [\n        ...     TranslatorTuple(\n        ...         start='start__field1',\n        ...         finish='field1',\n        ...         lst=[{\n        ...             'fields': {'field1': 'translation1'},\n        ...             'pk': 1}],\n        ...      )]\n        >>> get_translator(fields)\n        {'start': {'field1': {'translation1': 1}}}\n        ```\n    \"\"\"\n_ = dict()\nfor field in fields:\nstart, finish, lst = field\npart1, part2 = start.split(\"__\")\nif part1 not in _:\n_[part1] = {}\nif part2 not in _[part1]:\n_[part1][part2] = {}\nif isinstance(finish, str):\n_[part1][part2] = {o[\"fields\"][finish]: o[\"pk\"] for o in lst}\nelif isinstance(finish, list):\n_[part1][part2] = {\n\"-\".join([o[\"fields\"][x] for x in finish]): o[\"pk\"] for o in lst\n}\nreturn _\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.parse","title":"parse","text":"
parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None\n

Parses files from collections and generates fixtures for various models.

This function processes files from the specified collections and generates fixtures for different models, such as newspapers.dataprovider, newspapers.ingest, newspapers.digitisation, newspapers.newspaper, newspapers.issue, and newspapers.item.

It performs various steps, such as file listing, fixture generation, translation mapping, renaming fields, and saving fixtures to files.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `collections` | `list` | A list of collections from which files are processed and fixtures are generated. | *required* |
| `cache_home` | `str` | The directory path where the collections are located. | *required* |
| `output` | `str` | The directory path where the fixtures will be saved. | *required* |
| `max_elements_per_file` | `int` | The maximum number of elements per file when saving fixtures. | *required* |

Returns:

| Type | Description |
| ---- | ----------- |
| `None` | This function generates fixtures but does not return any value. |

Source code in alto2txt2fixture/parser.py
def parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None:\n\"\"\"\n    Parses files from collections and generates fixtures for various models.\n    This function processes files from the specified collections and generates\n    fixtures for different models, such as `newspapers.dataprovider`,\n    `newspapers.ingest`, `newspapers.digitisation`, `newspapers.newspaper`,\n    `newspapers.issue`, and `newspapers.item`.\n    It performs various steps, such as file listing, fixture generation,\n    translation mapping, renaming fields, and saving fixtures to files.\n    Args:\n        collections: A list of collections from which files are\n            processed and fixtures are generated.\n        cache_home: The directory path where the collections are located.\n        output: The directory path where the fixtures will be saved.\n        max_elements_per_file: The maximum number of elements per file\n            when saving fixtures.\n    Returns:\n        This function generates fixtures but does not return any value.\n    \"\"\"\nglobal CACHE_HOME\nglobal OUTPUT\nglobal MAX_ELEMENTS_PER_FILE\nCACHE_HOME = cache_home\nOUTPUT = output\nMAX_ELEMENTS_PER_FILE = max_elements_per_file\n# Set up output directory\nreset_fixture_dir(OUTPUT)\n# Get file lists\nprint(\"\\nGetting file lists...\")\ndef issues_in_x(x):\nreturn \"issues\" in str(x.parent).split(\"/\")\ndef newspapers_in_x(x):\nreturn not any(\n[\ncondition\nfor y in str(x.parent).split(\"/\")\nfor condition in [\n\"issues\" in y,\n\"ingest\" in y,\n\"digitisation\" in y,\n\"data-provider\" in y,\n]\n]\n)\nall_json = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.json\")\n]\nall_jsonl = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.jsonl\")\n]\nprint(f\"--> {len(all_json):,} JSON files altogether\")\nprint(f\"--> {len(all_jsonl):,} JSONL files altogether\")\nprint(\"\\nSetting up fixtures...\")\n# Process data providers\ndef data_provider_in_x(x):\nreturn \"data-provider\" in str(x.parent).split(\"/\")\ndata_provider_json = list(\nfixtures(\nmodel=\"newspapers.dataprovider\",\nfilelist=[x for x in all_json if data_provider_in_x(x)],\nuniq_keys=[\"name\"],\n)\n)\nprint(f\"--> {len(data_provider_json):,} DataProvider fixtures\")\n# Process ingest\ndef ingest_in_x(x):\nreturn \"ingest\" in str(x.parent).split(\"/\")\ningest_json = list(\nfixtures(\nmodel=\"newspapers.ingest\",\nfilelist=[x for x in all_json if ingest_in_x(x)],\nuniq_keys=[\"lwm_tool_name\", \"lwm_tool_version\"],\n)\n)\nprint(f\"--> {len(ingest_json):,} Ingest fixtures\")\n# Process digitisation\ndef digitisation_in_x(x):\nreturn \"digitisation\" in str(x.parent).split(\"/\")\ndigitisation_json = list(\nfixtures(\nmodel=\"newspapers.digitisation\",\nfilelist=[x for x in all_json if digitisation_in_x(x)],\nuniq_keys=[\"software\"],\n)\n)\nprint(f\"--> {len(digitisation_json):,} Digitisation fixtures\")\n# Process newspapers\nnewspaper_json = list(\nfixtures(\nmodel=\"newspapers.newspaper\",\nfilelist=[file for file in all_json if newspapers_in_x(file)],\n)\n)\nprint(f\"--> {len(newspaper_json):,} Newspaper fixtures\")\n# Process issue\ntranslate = get_translator(\n[\nTranslatorTuple(\n\"publication__publication_code\", \"publication_code\", newspaper_json\n)\n]\n)\nrename = {\"publication\": {\"publication_code\": \"newspaper_id\"}}\nissue_json = list(\nfixtures(\nmodel=\"newspapers.issue\",\nfilelist=[file for file in all_json if 
issues_in_x(file)],\ntranslate=translate,\nrename=rename,\n)\n)\nprint(f\"--> {len(issue_json):,} Issue fixtures\")\n# Create translator/clear up memory before processing items\ntranslate = get_translator(\n[\n(\"issue__issue_identifier\", \"issue_code\", issue_json),\n(\"digitisation__software\", \"software\", digitisation_json),\n(\"data_provider__name\", \"name\", data_provider_json),\n(\n\"ingest__lwm_tool_identifier\",\n[\"lwm_tool_name\", \"lwm_tool_version\"],\ningest_json,\n),\n]\n)\nrename = {\n\"issue\": {\"issue_identifier\": \"issue_id\"},\n\"digitisation\": {\"software\": \"digitisation_id\"},\n\"data_provider\": {\"name\": \"data_provider_id\"},\n\"ingest\": {\"lwm_tool_identifier\": \"ingest_id\"},\n}\nsave_fixture(newspaper_json, \"Newspaper\")\nsave_fixture(issue_json, \"Issue\")\ndel newspaper_json\ndel issue_json\ngc.collect()\nprint(\"\\nSaving...\")\nsave_fixture(digitisation_json, \"Digitisation\")\nsave_fixture(ingest_json, \"Ingest\")\nsave_fixture(data_provider_json, \"DataProvider\")\n# Process items\nitem_json = fixtures(\nmodel=\"newspapers.item\",\nfilelist=all_jsonl,\ntranslate=translate,\nrename=rename,\n)\nsave_fixture(item_json, \"Item\")\nreturn\n
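A hedged invocation sketch (all paths and the element cap below are hypothetical; `cache_home` is expected to hold the per-collection JSON/JSONL cache written by the router):

```python
from alto2txt2fixture.parser import parse

parse(
    collections=["hmd"],              # hypothetical collection name
    cache_home="./cache",             # cache produced by the router
    output="./fixtures",              # fixtures are written here
    max_elements_per_file=2_000_000,  # hypothetical cap per fixture file
)
```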
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.reset_fixture_dir","title":"reset_fixture_dir","text":"
reset_fixture_dir(output: str | Path) -> None\n

Resets the fixture directory by removing all JSON files inside it.

This function takes a directory path (output) as input and removes all JSON files within the directory.

Prior to removal, it prompts the user for confirmation to proceed. If the user confirms, the function clears the fixture directory by deleting the JSON files.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `output` | `str \| Path` | The directory path of the fixture directory to be reset. | *required* |

Raises:

| Type | Description |
| ---- | ----------- |
| `RuntimeError` | If the `output` directory is not specified as a string. |

Source code in alto2txt2fixture/parser.py
def reset_fixture_dir(output: str | Path) -> None:\n\"\"\"\n    Resets the fixture directory by removing all JSON files inside it.\n    This function takes a directory path (``output``) as input and removes all\n    JSON files within the directory.\n    Prior to removal, it prompts the user for confirmation to proceed. If the\n    user confirms, the function clears the fixture directory by deleting the\n    JSON files.\n    Args:\n        output: The directory path of the fixture directory to be reset.\n    Raises:\n        RuntimeError: If the ``output`` directory is not specified as a string.\n    \"\"\"\nif not isinstance(output, str):\nraise RuntimeError(\"`output` directory needs to be specified as a string.\")\noutput = Path(output)\ny = input(\nf\"This command will automatically empty the fixture directory ({output.absolute()}). \"\n\"Do you want to proceed? [y/N]\"\n)\nif not y.lower() == \"y\":\noutput.mkdir(parents=True, exist_ok=True)\nreturn\nprint(\"\\nClearing up the fixture directory\")\n# Ensure directory exists\noutput.mkdir(parents=True, exist_ok=True)\n# Drop all JSON files\n[x.unlink() for x in Path(output).glob(\"*.json\")]\nreturn\n
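A small usage sketch (the path is hypothetical). As the source above shows, the path must be passed as a `str`, not a `Path`, and the user is prompted before anything is deleted:

```python
from alto2txt2fixture.parser import reset_fixture_dir

reset_fixture_dir("./fixtures")  # prompts [y/N] before removing *.json files
# reset_fixture_dir(Path("./fixtures")) would raise RuntimeError
```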
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.uniq","title":"uniq","text":"
uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]\n

Generates unique items from a list of files based on specified keys.

This function takes a list of files and yields unique items based on a combination of keys. The keys are extracted from each file using the get_key_from function, and duplicate items are ignored.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `filelist` | `list` | A list of files from which unique items are generated. | *required* |
| `keys` | `list` | A list of keys used for uniqueness. Each key specifies a field to be used for uniqueness checking in the generated items. | `[]` |

Yields:

| Type | Description |
| ---- | ----------- |
| `Any` | A unique item from `filelist`. |

Source code in alto2txt2fixture/parser.py
def uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]:\n\"\"\"\n    Generates unique items from a list of files based on specified keys.\n    This function takes a list of files and yields unique items based on a\n    combination of keys. The keys are extracted from each file using the\n    ``get_key_from`` function, and duplicate items are ignored.\n    Args:\n        filelist: A list of files from which unique items are\n            generated.\n        keys: A list of keys used for uniqueness. Each key specifies\n            a field to be used for uniqueness checking in the generated\n            items.\n    Yields:\n        A unique item from `filelist`.\n    \"\"\"\nseen = set()\nfor item in filelist:\nkey = \"-\".join([get_key_from(item, x) for x in keys])\nif key not in seen:\nseen.add(key)\nyield item\nelse:\n# Drop it if duplicate\npass\n
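A self-contained sketch of the deduplication behaviour (files and keys are hypothetical; each file must hold JSON readable by `get_key_from`):

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory

from alto2txt2fixture.parser import uniq

with TemporaryDirectory() as tmp:
    files = []
    for stem, name in [("a", "dup"), ("b", "dup"), ("c", "other")]:
        path = Path(tmp) / f"{stem}.json"
        path.write_text(json.dumps({"name": name}))
        files.append(path)
    # "a" and "b" share the same "name" key, so one of them is dropped
    assert len(list(uniq(files, keys=["name"]))) == 2
```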
"},{"location":"reference/alto2txt2fixture/patterns.html","title":"patterns","text":"

Useful regular expressions, initially just PUBLICATION_CODE.

"},{"location":"reference/alto2txt2fixture/router.html","title":"router","text":""},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive","title":"Archive","text":"
Archive(\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n)\n

Manage extracting information from a ZIP archive.

The Archive class represents a zip archive of XML files. The class is used to extract information from a ZIP archive, and it contains several methods to process the data contained in the archive.

open(Archive) context manager

Archive can be opened with a context manager, which creates a meta object that records timings for the archive. When closed, it will save the meta JSON to the correct paths.

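A sketch of the context-manager pattern, mirroring its use in the route function below (the archive path is hypothetical):

```python
from alto2txt2fixture.router import Archive

archive = Archive(
    "mnt/hmd-alto2txt/metadata/example_metadata.zip",  # hypothetical path
    collection="hmd",
)
with archive as _:  # opening starts the timed meta object
    for document in archive.documents:
        document.newspaper.write_to_cache()
# On close, the meta JSON report is saved under archive.report
```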
Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `path` | `Path` | The path to the zip archive. |
| `collection` | `str` | The collection of the XML files in the archive. Default is `""`. |
| `report` | `Path` | The file path of the report file for the archive. |
| `report_id` | `str` | The report ID for the archive. If not provided, a random UUID is generated. |
| `report_parent` | `Path` | The parent directory of the report file for the archive. |
| `jisc_papers` | `pd.DataFrame` | A DataFrame of JISC papers. |
| `size` | `str \| float` | The size of the archive, in human-readable format. |
| `size_raw` | `str \| float` | The raw size of the archive, in bytes. |
| `roots` | `Generator[ET.Element, None, None]` | The root elements of the XML documents contained in the archive. |
| `meta` | `dotdict` | Metadata about the archive, such as its path, size, and number of contents. |
| `json_indent` | `int` | Indentation formatting of json output |

Raises:

| Type | Description |
| ---- | ----------- |
| `RuntimeError` | If the path does not exist. |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n):\n\"\"\"Constructor method.\"\"\"\nself.path: Path = Path(path)\nif not self.path.exists():\nraise RuntimeError(\"Path does not exist.\")\nself.size: str | float = get_size_from_path(self.path)\nself.size_raw: str | float = get_size_from_path(self.path, raw=True)\nself.zip_file: zipfile.ZipFile = zipfile.ZipFile(self.path)\nself.collection: str = collection\nself.roots: Generator[ET.Element, None, None] = self.get_roots()\nself.meta: dotdict = dotdict(\npath=str(self.path),\nbytes=self.size_raw,\nsize=self.size,\ncontents=len(self.filelist),\n)\nif not report_id:\nself.report_id: str = str(uuid.uuid4())\nelse:\nself.report_id = report_id\nself.jisc_papers: pd.DataFrame = jisc_papers\nself.report_parent: Path = Path(f\"{REPORT_DIR}/{self.report_id}\")\nself.report: Path = (\nself.report_parent / f\"{self.path.stem.replace('_metadata', '')}.json\"\n)\nself.json_indent: int = json_indent\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.documents","title":"documents property","text":"
documents\n

Property that calls the get_documents method

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.filelist","title":"filelist property","text":"
filelist\n

Returns the list of files in the zip file

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.__len__","title":"__len__","text":"
__len__()\n

The number of files inside the zip archive.

Source code in alto2txt2fixture/router.py
def __len__(self):\n\"\"\"The number of files inside the zip archive.\"\"\"\nreturn len(self.filelist)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_documents","title":"get_documents","text":"
get_documents() -> Generator[Document, None, None]\n

A generator that yields instances of the Document class for each XML file in the ZIP archive.

It uses the tqdm library to display a progress bar in the terminal while it is running.

If the contents of the ZIP file are not empty, the method creates an instance of the Document class by passing the root element of the XML file, the collection name, meta information about the archive, and the JISC papers data frame (if provided) to the constructor of the Document class. The instance of the Document class is then returned by the generator.

Yields:

| Type | Description |
| ---- | ----------- |
| `Document` | `Document` class instance for each unzipped XML file. |

Source code in alto2txt2fixture/router.py
def get_documents(self) -> Generator[Document, None, None]:\n\"\"\"\n    A generator that yields instances of the Document class for each XML\n    file in the ZIP archive.\n    It uses the `tqdm` library to display a progress bar in the terminal\n    while it is running.\n    If the contents of the ZIP file are not empty, the method creates an\n    instance of the ``Document`` class by passing the root element of the XML\n    file, the collection name, meta information about the archive, and the\n    JISC papers data frame (if provided) to the constructor of the\n    ``Document`` class. The instance of the ``Document`` class is then\n    returned by the generator.\n    Yields:\n        ``Document`` class instance for each unzipped `XML` file.\n    \"\"\"\nfor xml_file in tqdm(\nself.filelist,\ndesc=f\"{Path(self.zip_file.filename).stem} ({self.meta.size})\",\nleave=False,\ncolour=\"green\",\n):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield Document(\nroot=ET.fromstring(xml),\ncollection=self.collection,\nmeta=self.meta,\njisc_papers=self.jisc_papers,\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_roots","title":"get_roots","text":"
get_roots() -> Generator[ET.Element, None, None]\n

Yields the root elements of the XML documents contained in the archive.

Source code in alto2txt2fixture/router.py
def get_roots(self) -> Generator[ET.Element, None, None]:\n\"\"\"\n    Yields the root elements of the XML documents contained in the archive.\n    \"\"\"\nfor xml_file in tqdm(self.filelist, leave=False, colour=\"blue\"):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield ET.fromstring(xml)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache","title":"Cache","text":"
Cache()\n

The Cache class provides a blueprint for creating and managing cache data. The class has several methods that help in getting the cache path, converting the data to a dictionary, and writing the cache data to a file.

It is inherited by many other classes in this module.

Initializes the Cache class object.

Source code in alto2txt2fixture/router.py
def __init__(self):\n\"\"\"\n    Initializes the Cache class object.\n    \"\"\"\npass\n
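A minimal sketch of the subclassing pattern the concrete classes below follow; `ExampleRecord` and its fields are hypothetical:

```python
from pathlib import Path

from alto2txt2fixture.router import Cache


class ExampleRecord(Cache):
    """Hypothetical Cache subclass illustrating the usual overrides."""

    kind = "example"

    def __init__(self, collection: str, id: str):
        self.collection = collection
        self.id = id

    def as_dict(self) -> dict:
        # Payload written by the inherited write_to_cache()
        return {"id": self.id}

    def get_cache_path(self) -> Path:
        # Mirrors the default collection/kind/id layout
        return Path(f"cache/{self.collection}/{self.kind}/{self.id}.json")


ExampleRecord("hmd", "0001").write_to_cache()
```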
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.__str__","title":"__str__","text":"
__str__() -> str\n

Returns the string representation of the cache data as a dictionary.

Source code in alto2txt2fixture/router.py
def __str__(self) -> str:\n\"\"\"\n    Returns the string representation of the cache data as a dictionary.\n    \"\"\"\nreturn str(self.as_dict())\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

Converts the cache data to a dictionary and returns it.

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    Converts the cache data to a dictionary and returns it.\n    \"\"\"\nreturn {}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the cache path, which is used to store the cache data. The path is normally constructed using some of the object's properties (collection, kind, and id) but can be changed when inherited.

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the cache path, which is used to store the cache data.\n    The path is normally constructed using some of the object's\n    properties (collection, kind, and id) but can be changed when\n    inherited.\n    \"\"\"\nreturn Path(f\"{CACHE_HOME}/{self.collection}/{self.kind}/{self.id}.json\")\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.write_to_cache","title":"write_to_cache","text":"
write_to_cache(json_indent: int = JSON_INDENT) -> Optional[bool]\n

Writes the cache data to a file at the specified cache path. The cache data is first converted to a dictionary using the as_dict method. If the cache path already exists, the function returns True.

Source code in alto2txt2fixture/router.py
def write_to_cache(self, json_indent: int = JSON_INDENT) -> Optional[bool]:\n\"\"\"\n    Writes the cache data to a file at the specified cache path. The cache\n    data is first converted to a dictionary using the as_dict method. If\n    the cache path already exists, the function returns True.\n    \"\"\"\npath = self.get_cache_path()\ntry:\nif path.exists():\nreturn True\nexcept AttributeError:\nerror(\nf\"Error occurred when getting cache path for \"\nf\"{self.kind}: {path}. It was not of expected \"\nf\"type Path but of type {type(path)}:\",\n)\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"w+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent))\nreturn\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Collection","title":"Collection","text":"
Collection(name: str = 'hmd', jisc_papers: Optional[pd.DataFrame] = None)\n

A Collection represents a group of newspaper archives from any passed alto2txt metadata output.

A Collection is initialised with a name and an optional pandas DataFrame of JISC papers. The archives property returns an iterable of the Archive objects within the collection.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `name` | `str` | Name of the collection (default `"hmd"`) |
| `jisc_papers` | `pandas.DataFrame` | DataFrame of JISC papers, optional |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, name: str = \"hmd\", jisc_papers: Optional[pd.DataFrame] = None):\n\"\"\"Constructor method.\"\"\"\nself.name: str = name\nself.jisc_papers: pd.DataFrame | None = jisc_papers\nself.dir: Path = Path(f\"{MNT}/{self.name}-alto2txt/metadata\")\nself.zip_files: list[Path] = sorted(\nlist(self.dir.glob(\"*.zip\")), key=lambda x: x.stat().st_size\n)\nself.zip_file_count: int = sum([1 for _ in self.dir.glob(\"*.zip\")])\nself.report_id: str = str(uuid.uuid4())\nself.empty: bool = self.zip_file_count == 0\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider","title":"DataProvider","text":"
DataProvider(collection: str)\n

Bases: Cache

The DataProvider class extends the Cache class and represents a newspaper data provider. The class has several properties and methods that allow creation of a data provider object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `collection` | `str` | A string representing publication collection |
| `kind` | `str` | Indication of object type, defaults to `data-provider` |
| `providers_meta_data` | `list[FixtureDict]` | Structured dict of metadata for known collection sources |
| `collection_type` | `str` | Related data sources and potential linkage source |
| `index_field` | `str` | Field name for querying existing records |

Example
>>> from pprint import pprint\n>>> hmd = DataProvider(\"hmd\")\n>>> hmd.pk\n2\n>>> pprint(hmd.as_dict())\n{'code': 'bl-hmd',\n 'collection': 'newspapers',\n 'legacy_code': 'hmd',\n 'name': 'Heritage Made Digital',\n 'source_note': 'British Library-funded digitised newspapers provided by the '\n                'British Newspaper Archive'}\n

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, collection: str):\n\"\"\"Constructor method.\"\"\"\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.meta_data","title":"meta_data property","text":"
meta_data: FixtureDict | dict\n

Return self.providers_meta_data[self.collection] or {}.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.meta_data_fields","title":"meta_data_fields property","text":"
meta_data_fields: FixtureDict | dict\n

Return the fields from self.providers_meta_data[self.collection] or {}.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.pk","title":"pk property","text":"
pk: int | None\n

Return pk if provided via providers_meta_data, else None.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.providers_index_dict","title":"providers_index_dict property","text":"
providers_index_dict: dict[str, FixtureDict]\n

Return all self.index_field values from providers_meta_data.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

Return a dict of the data provider object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the DataProvider object |

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    Return a `dict` of the data provider object.\n    Returns:\n        Dictionary representation of the DataProvider object\n    \"\"\"\nif self.meta_data:\nreturn {\n\"name\": self.meta_data_fields[\"name\"],\n\"code\": self.meta_data_fields[\"code\"],\n\"legacy_code\": self.collection,\n\"source_note\": self.meta_data_fields[\"source_note\"],\n\"collection\": self.collection_type,\n}\nelse:\nreturn {\n\"name\": self.collection,\n\"code\": slugify(self.collection),\n\"source_note\": \"\",\n\"legacy_code\": None,\n\"collection\": self.collection_type,\n}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation","title":"Digitisation","text":"
Digitisation(root: ET.Element, collection: str = '')\n

Bases: Cache

The Digitisation class extends the Cache class and represents a newspaper digitisation. The class has several properties and methods that allow creation of a digitisation object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `collection` | `str` | A string that represents the collection of the publication |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'digitisation'\n

A string that represents the type of the object, set to \"digitisation\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the digitisation object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Digitisation object |

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the digitisation\n    object.\n    Returns:\n        Dictionary representation of the Digitising object\n    \"\"\"\ndic = {\nx.tag: x.text or \"\"\nfor x in self.root.findall(\"./process/*\")\nif x.tag\nin [\n\"xml_flavour\",\n\"software\",\n\"mets_namespace\",\n\"alto_namespace\",\n]\n}\nif not dic.get(\"software\"):\nreturn {}\nreturn dic\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document","title":"Document","text":"
Document(*args, **kwargs)\n

The Document class is a representation of a document that contains information about a publication, newspaper, item, digitisation, and ingest. This class holds all the relevant information about a document in a structured manner and provides properties that can be used to access different aspects of the document.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `collection` | `str \| None` | A string that represents the collection of the publication |
| `root` | `ET.Element \| None` | An XML element that represents the root of the publication |
| `zip_file` | `str \| None` | A path to a valid zip file |
| `jisc_papers` | `pd.DataFrame \| None` | A pandas DataFrame object that holds information about the JISC papers |
| `meta` | `dotdict \| None` | TODO |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, *args, **kwargs):\n\"\"\"Constructor method.\"\"\"\nself.collection: str | None = kwargs.get(\"collection\")\nif not self.collection or not isinstance(self.collection, str):\nraise RuntimeError(\"A valid collection must be passed\")\nself.root: ET.Element | None = kwargs.get(\"root\")\nif not self.root or not isinstance(self.root, ET.Element):\nraise RuntimeError(\"A valid XML root must be passed\")\nself.zip_file: str | None = kwargs.get(\"zip_file\")\nif self.zip_file and not isinstance(self.zip_file, str):\nraise RuntimeError(\"A valid zip file must be passed\")\nself.jisc_papers: pd.DataFrame | None = kwargs.get(\"jisc_papers\")\nif not isinstance(self.jisc_papers, pd.DataFrame):\nraise RuntimeError(\n\"A valid DataFrame containing JISC papers must be passed\"\n)\nself.meta: dotdict | None = kwargs.get(\"meta\")\nself._publication_elem = None\nself._input_sub_path = None\nself._ingest = None\nself._digitisation = None\nself._item = None\nself._issue = None\nself._newspaper = None\nself._data_provider = None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document.publication","title":"publication property","text":"
publication: ET.Element\n

This property returns an ET.Element object representing the publication information in the XML document.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest","title":"Ingest","text":"
Ingest(root: ET.Element, collection: str = '')\n

Bases: Cache

The Ingest class extends the Cache class and represents a newspaper ingest. The class has several properties and methods that allow the creation of an ingest object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `collection` | `str` | A string that represents the collection of the publication |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'ingest'\n

A string that represents the type of the object, set to \"ingest\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the ingest object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Ingest object |

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the ingest\n    object.\n    Returns:\n        Dictionary representation of the Ingest object\n    \"\"\"\nreturn {\nf\"lwm_tool_{x.tag}\": x.text or \"\"\nfor x in self.root.findall(\"./process/lwm_tool/*\")\n}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue","title":"Issue","text":"
Issue(\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n)\n

Bases: Cache

The Issue class extends the Cache class and represents a newspaper issue. The class has several properties and methods that allow the creation of an issue object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | | An xml element that represents the root of the publication |
| `newspaper` | `Newspaper \| None` | The parent newspaper |
| `collection` | `str` | A string that represents the collection of the publication |
| `input_sub_path` | `str` | TODO |
| `meta` | `dotdict` | TODO |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nself.publication: ET.Element = publication\nself.newspaper: Newspaper | None = newspaper\nself.collection: str = collection\nself.input_sub_path: str = input_sub_path\nself.meta: dotdict = meta\nself._issue = None\nself._issue_date = None\npath: str = str(self.get_cache_path())\nif not self.meta.issue_paths:\nself.meta.issue_paths = [path]\nelif path not in self.meta.issue_paths:\nself.meta.issue_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_code","title":"issue_code property","text":"
issue_code: str\n

Sets up and saves the issue code for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_date","title":"issue_date property","text":"
issue_date: str\n

Sets up and saves the issue date for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'issue'\n

A string that represents the type of the object, set to \"issue\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the issue object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Issue object |

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the issue\n    object.\n    Returns:\n        Dictionary representation of the Issue object\n    \"\"\"\nif not self._issue:\nself._issue = dict(\nissue_code=self.issue_code,\nissue_date=self.issue_date,\npublication__publication_code=self.newspaper.publication_code,\ninput_sub_path=self.input_sub_path,\n)\nreturn self._issue\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the path to the cache file for the issue object.

Returns:

| Type | Description |
| ---- | ----------- |
| `Path` | Path to the cache file for the issue object |

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the path to the cache file for the issue object.\n    Returns:\n        Path to the cache file for the issue object\n    \"\"\"\njson_file = f\"/{self.newspaper.publication_code}/issues/{self.issue_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item","title":"Item","text":"
Item(\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n)\n

Bases: Cache

The Item class extends the Cache class and represents a newspaper item, i.e. an article. The class has several properties and methods that allow the creation of an article object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `issue_code` | `str` | A string that represents the issue code |
| `digitisation` | `dict` | TODO |
| `ingest` | `dict` | TODO |
| `collection` | `str` | A string that represents the collection of the publication |
| `newspaper` | `Newspaper \| None` | The parent newspaper |
| `meta` | `dotdict` | TODO |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nif not isinstance(newspaper, Newspaper):\nraise RuntimeError(\"Expected newspaper to be of type router.Newspaper\")\nself.root: ET.Element = root\nself.issue_code: str = issue_code\nself.digitisation: dict = digitisation\nself.ingest: dict = ingest\nself.collection: str = collection\nself.newspaper: Newspaper | None = newspaper\nself.meta: dotdict = meta\nself._item_elem = None\nself._item_code = None\nself._item = None\npath: str = str(self.get_cache_path())\nif not self.meta.item_paths:\nself.meta.item_paths = [path]\nelif path not in self.meta.item_paths:\nself.meta.item_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_code","title":"item_code property","text":"
item_code: str\n

Sets up and saves the item code for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_elem","title":"item_elem property","text":"
item_elem\n

Sets up and saves the item's XML element for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'item'\n

A string that represents the type of the object, set to \"item\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the item object (i.e. article).

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Item object |

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the item object\n    (i.e. article).\n    Returns:\n        Dictionary representation of the Item object\n    \"\"\"\nif not self._item:\nself._item = {\nf\"{x.tag}\": x.text or \"\"\nfor x in self.item_elem.findall(\"*\")\nif x.tag\nin [\n\"title\",\n\"word_count\",\n\"ocr_quality_mean\",\n\"ocr_quality_sd\",\n\"plain_text_file\",\n\"item_type\",\n]\n}\nself._item[\"title\"] = self._item.get(\"title\", \"\")[:2097151]\nself._item = {\n\"item_code\": self.item_code,\n\"word_count\": self._item.get(\"word_count\", 0),\n\"title\": self._item.get(\"title\"),\n\"item_type\": self._item.get(\"item_type\"),\n\"input_filename\": self._item.get(\"plain_text_file\", \"\"),\n\"ocr_quality_mean\": self._item.get(\"ocr_quality_mean\", 0),\n\"ocr_quality_sd\": self._item.get(\"ocr_quality_sd\", 0),\n\"digitisation__software\": self.digitisation.id,\n\"ingest__lwm_tool_identifier\": self.ingest.id,\n\"issue__issue_identifier\": self.issue_code,\n\"data_provider__name\": self.collection,\n}\nreturn self._item\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the path to the cache file for the item (article) object.

Returns:

| Type | Description |
| ---- | ----------- |
| `Path` | Path to the cache file for the article object |

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the path to the cache file for the item (article) object.\n    Returns:\n        Path to the cache file for the article object\n    \"\"\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ f\"/{self.newspaper.publication_code}/items.jsonl\"\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.write_to_cache","title":"write_to_cache","text":"
write_to_cache(json_indent = JSON_INDENT) -> None\n

Special cache-write function that appends to the cache file rather than writing it in one pass at the end of the process.

Returns:

| Type | Description |
| ---- | ----------- |
| `None` | None. |

Source code in alto2txt2fixture/router.py
def write_to_cache(self, json_indent=JSON_INDENT) -> None:\n\"\"\"\n    Special cache-write function that appends rather than writes at the\n    end of the process.\n    Returns:\n        None.\n    \"\"\"\npath = self.get_cache_path()\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"a+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent) + \"\\n\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper","title":"Newspaper","text":"
Newspaper(\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n)\n

Bases: Cache

The Newspaper class extends the Cache class and represents a newspaper.

The class has several properties and methods that allow the creation of a newspaper object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `root` | | An xml element that represents the root of the publication. |
| `collection` | | A string that represents the collection of the publication. |
| `meta` | | A dotdict object that holds metadata about the publication. |
| `jisc_papers` | | A pandas DataFrame object for JISC paper information. |

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.publication = root.find(\"./publication\")\nself.input_sub_path = root.find(\"./process/input_sub_path\").text\nself.issue_date = self.publication.find(\"./issue/date\").text\nself.collection = collection\nself.meta = meta\nself.jisc_papers = jisc_papers\nself._newspaper = None\nself._title = None\nself._publication_code = None\npath = str(self.get_cache_path())\nif not self.meta.newspaper_paths:\nself.meta.newspaper_paths = []\nelif path not in self.meta.newspaper_paths:\nself.meta.newspaper_paths.append(path)\nif not self.meta.publication_codes:\nself.meta.publication_codes = [self.publication_code]\nelif self.publication_code not in self.meta.publication_codes:\nself.meta.publication_codes.append(self.publication_code)\nself.zip_file = Path(meta.path).name\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'newspaper'\n

A string that represents the type of the object, set to \"newspaper\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.number_paths","title":"number_paths property","text":"
number_paths: list\n

Returns the nested directories in which we want to save the cache file.

Returns:

| Type | Description |
| ---- | ----------- |
| `list` | List of the desired directories in descending order |

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code","title":"publication_code property","text":"
publication_code: str\n

A property that returns the code of the publication.

Returns:

| Type | Description |
| ---- | ----------- |
| `str` | The code of the publication |

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.title","title":"title property","text":"
title: str\n

A property that returns the title of the newspaper.

Returns:

| Type | Description |
| ---- | ----------- |
| `str` | The title of the newspaper |

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the newspaper object.

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | Dictionary representation of the Newspaper object |

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the newspaper\n    object.\n    Returns:\n        Dictionary representation of the Newspaper object\n    \"\"\"\nif not self._newspaper:\nself._newspaper = dict(\n**dict(publication_code=self.publication_code, title=self.title),\n**{\nx.tag: x.text or \"\"\nfor x in self.publication.findall(\"*\")\nif x.tag in [\"location\"]\n},\n)\nreturn self._newspaper\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the path to the cache file for the newspaper object.

Returns:

| Type | Description |
| ---- | ----------- |
| `Path` | Path to the cache file for the newspaper object |

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the path to the cache file for the newspaper object.\n    Returns:\n        Path to the cache file for the newspaper object\n    \"\"\"\njson_file = f\"/{self.publication_code}/{self.publication_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\" + \"/\".join(self.number_paths) + json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code_from_input_sub_path","title":"publication_code_from_input_sub_path","text":"
publication_code_from_input_sub_path() -> str | None\n

A method that returns the publication code from the input sub-path of the publication process.

Returns:

| Type | Description |
| ---- | ----------- |
| `str \| None` | The code of the publication |

Source code in alto2txt2fixture/router.py
def publication_code_from_input_sub_path(self) -> str | None:\n\"\"\"\n    A method that returns the publication code from the input sub-path of\n    the publication process.\n    Returns:\n        The code of the publication\n    \"\"\"\ng = PUBLICATION_CODE.findall(self.input_sub_path)\nif len(g) == 1:\nreturn g[0]\nreturn None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.route","title":"route","text":"
route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None\n

This function is responsible for setting up the path for the alto2txt mountpoint, setting up the JISC papers and routing the collections for processing.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `collections` | `list` | List of collection names | *required* |
| `cache_home` | `str` | Directory path for the cache | *required* |
| `mountpoint` | `str` | Directory path for the alto2txt mountpoint | *required* |
| `jisc_papers_path` | `str` | Path to the JISC papers | *required* |
| `report_dir` | `str` | Path to the report directory | *required* |

Returns:

| Type | Description |
| ---- | ----------- |
| `None` | None |

Source code in alto2txt2fixture/router.py
def route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None:\n\"\"\"\n    This function is responsible for setting up the path for the alto2txt\n    mountpoint, setting up the JISC papers and routing the collections for\n    processing.\n    Args:\n        collections: List of collection names\n        cache_home: Directory path for the cache\n        mountpoint: Directory path for the alto2txt mountpoint\n        jisc_papers_path: Path to the JISC papers\n        report_dir: Path to the report directory\n    Returns:\n        None\n    \"\"\"\nglobal CACHE_HOME\nglobal MNT\nglobal REPORT_DIR\nCACHE_HOME = cache_home\nREPORT_DIR = report_dir\nMNT = Path(mountpoint) if isinstance(mountpoint, str) else mountpoint\nif not MNT.exists():\nerror(\nf\"The mountpoint provided for alto2txt does not exist. \"\nf\"Either create a local copy or blobfuse it to \"\nf\"`{MNT.absolute()}`.\"\n)\njisc_papers = setup_jisc_papers(path=jisc_papers_path)\nfor collection_name in collections:\ncollection = Collection(name=collection_name, jisc_papers=jisc_papers)\nif collection.empty:\nerror(\nf\"It looks like {collection_name} is empty in the \"\nf\"alto2txt mountpoint: `{collection.dir.absolute()}`.\"\n)\nfor archive in collection.archives:\nwith archive as _:\n[\n(\ndoc.item.write_to_cache(),\ndoc.newspaper.write_to_cache(),\ndoc.issue.write_to_cache(),\ndoc.data_provider.write_to_cache(),\ndoc.ingest.write_to_cache(),\ndoc.digitisation.write_to_cache(),\n)\nfor doc in archive.documents\n]\nreturn\n
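A hedged invocation sketch (all paths are hypothetical; as the Collection source above shows, `mountpoint` must contain `<collection>-alto2txt/metadata/*.zip` archives):

```python
from alto2txt2fixture.router import route

route(
    collections=["hmd"],                 # hypothetical collection name
    cache_home="./cache",                # per-collection cache is written here
    mountpoint="./mnt",                  # holds hmd-alto2txt/metadata/*.zip
    jisc_papers_path="./jisc_papers.csv",
    report_dir="./reports",
)
```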
"},{"location":"reference/alto2txt2fixture/settings.html","title":"settings","text":"

The settings module provides configuration for running alto2txt2fixture.

Most of these are managed via the settings variable in this module.

Note

See the command line interface parameters documentation for ways of modifying settings at run time.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `JSON_INDENT` | | Amount of indentation to include in output JSON files |
| `DATA_PROVIDER_INDEX` | `Final[str]` | The field used to index DataProvider records |
| `NEWSPAPER_COLLECTION_METADATA` | `Final[list[FixtureDict]]` | A list of FixtureDicts specifying specific newspaper data providers |
| `SETUP_TITLE` | `str` | The title printed at the command line via the cli.show_setup() function |
| `settings` | `dotdict` | A dotdict configuration for running newspaper portions of alto2txt2fixture |

"},{"location":"reference/alto2txt2fixture/types.html","title":"types","text":""},{"location":"reference/alto2txt2fixture/types.html#alto2txt2fixture.types.FixtureDict","title":"FixtureDict","text":"

Bases: TypedDict

A dict structure to ease use as a json database fixture.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `pk` | `int` | an id to uniquely define and query each entry |
| `model` | `str` | what model a given record is for |
| `fields` | `dict[str, Any]` | a dict of record information conforming to `model` table |

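For illustration, a hypothetical record conforming to this structure:

```python
from alto2txt2fixture.types import FixtureDict

record: FixtureDict = {
    "pk": 1,                             # unique id for the entry
    "model": "newspapers.dataprovider",  # target model/table
    "fields": {"name": "Example Provider", "code": "example"},
}
```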
"},{"location":"reference/alto2txt2fixture/types.html#alto2txt2fixture.types.TranslatorTuple","title":"TranslatorTuple","text":"

Bases: NamedTuple

A named tuple of fields for translation.

Attributes:

| Name | Type | Description |
| ---- | ---- | ----------- |
| `start` | `str` | A string representing the starting field name. |
| `finish` | `str \| list` | A string or list specifying the field(s) to be translated. If it is a string, the translated field will be a direct mapping of the specified field in each item of the input list. If it is a list, the translated field will be a hyphen-separated concatenation of the specified fields in each item of the input list (illustrated in the sketch below). |
| `lst` | `list[dict]` | A list of dictionaries representing the items to be translated. Each dictionary should contain the necessary fields for translation, with the field names specified in the `start` parameter. |

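A sketch of the `finish`-as-list behaviour (records and field names are hypothetical):

```python
from alto2txt2fixture.parser import get_translator
from alto2txt2fixture.types import TranslatorTuple

records = [
    {"pk": 1, "fields": {"lwm_tool_name": "alto2txt", "lwm_tool_version": "1.0"}},
]
fields = [
    TranslatorTuple(
        start="ingest__lwm_tool_identifier",
        finish=["lwm_tool_name", "lwm_tool_version"],
        lst=records,
    )
]
# Keys are hyphen-joined when `finish` is a list:
assert get_translator(fields) == {
    "ingest": {"lwm_tool_identifier": {"alto2txt-1.0": 1}}
}
```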
"},{"location":"reference/alto2txt2fixture/types.html#alto2txt2fixture.types.dotdict","title":"dotdict","text":"

Bases: dict

dot.notation access to dictionary attributes

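A short sketch, assuming the usual dict-subclass recipe this class name implies:

```python
from alto2txt2fixture.types import dotdict

meta = dotdict(path="a.zip", size="1.2MB")
assert meta.size == meta["size"] == "1.2MB"  # attribute access mirrors key access
meta.contents = 3                            # attribute writes set keys
```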
"},{"location":"reference/alto2txt2fixture/utils.html","title":"utils","text":""},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.check_newspaper_collection_configuration","title":"check_newspaper_collection_configuration","text":"
check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[\nFixtureDict\n] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]\n

Check the names in collections match the names in newspaper_collections.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `collections` | `Iterable[str]` | Names of newspaper collections, defaults to `settings.COLLECTIONS` | `settings.COLLECTIONS` |
| `newspaper_collections` | `Iterable[FixtureDict]` | Newspaper collections in a list of FixtureDict format. Defaults to `settings.FIXTURE_TABLE['dataprovider']` | `NEWSPAPER_COLLECTION_METADATA` |
| `data_provider_index` | `str` | `dict` `fields` key used to check matching `collections` names | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| ---- | ----------- |
| `set[str]` | A set of `collections` without a matching `newspaper_collections` record. |

Example
>>> check_newspaper_collection_configuration()\nset()\n
Source code in alto2txt2fixture/utils.py
def check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]:\n\"\"\"Check the names in `collections` match the names in `newspaper_collections`.\n    Arguments:\n        collections:\n            Names of newspaper collections, defaults to ``settings.COLLECTIONS``\n        newspaper_collections:\n            Newspaper collections in a list of `FixtureDict` format. Defaults\n                to ``settings.FIXTURE_TABLE['dataprovider]``\n        data_provider_index:\n            `dict` `fields` `key` used to check matchiching `collections` name\n    Returns:\n        A set of ``collections`` without a matching `newspaper_collections` record.\n    Example:\n        ```pycon\n        >>> check_newspaper_collection_configuration()\n        set()\n        ```\n    \"\"\"\nnewspaper_collection_names: tuple[str, ...] = tuple(\ndict_from_list_fixture_fields(\nnewspaper_collections, field_name=data_provider_index\n).keys()\n)\ncollection_diff: set[str] = set(collections) - set(newspaper_collection_names)\nif collection_diff:\nwarning(\nf\"{len(collection_diff)} `collections` \"\nf\"not in `newspaper_collections`: {collection_diff}\"\n)\nreturn collection_diff\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.clear_cache","title":"clear_cache","text":"
clear_cache(dir: str | Path) -> None\n

Clears the cache directory by removing all .json files in it.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `dir` | `str \| Path` | The path of the directory to be cleared. | *required* |

Source code in alto2txt2fixture/utils.py
def clear_cache(dir: str | Path) -> None:\n\"\"\"\n    Clears the cache directory by removing all `.json` files in it.\n    Args:\n        dir: The path of the directory to be cleared.\n    \"\"\"\ndir = get_path_from(dir)\ny = input(\nf\"Do you want to erase the cache path now that the \"\nf\"files have been generated ({dir.absolute()})? [y/N]\"\n)\nif y.lower() == \"y\":\ninfo(\"Clearing up the cache directory\")\nfor x in dir.glob(\"*.json\"):\nx.unlink()\n
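A one-line usage sketch (the path is hypothetical; the user is prompted before anything is removed):

```python
from alto2txt2fixture.utils import clear_cache

clear_cache("./cache")  # prompts [y/N] before removing *.json files
```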
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.create_lookup","title":"create_lookup","text":"
create_lookup(lst: list = [], on: list = []) -> dict\n

Create a lookup dictionary from a list of dictionaries.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `lst` | `list` | A list of dictionaries that should be used to generate the lookup. | `[]` |
| `on` | `list` | A list of keys from the dictionaries in the list that should be used as the keys in the lookup. | `[]` |

Returns:

| Type | Description |
| ---- | ----------- |
| `dict` | The generated lookup dictionary. |

Source code in alto2txt2fixture/utils.py
def create_lookup(lst: list = [], on: list = []) -> dict:\n\"\"\"\n    Create a lookup dictionary from a list of dictionaries.\n    Args:\n        lst: A list of dictionaries that should be used to generate the lookup.\n        on: A list of keys from the dictionaries in the list that should be used as the keys in the lookup.\n    Returns:\n        The generated lookup dictionary.\n    \"\"\"\nreturn {get_key(x, on): x[\"pk\"] for x in lst}\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.dict_from_list_fixture_fields","title":"dict_from_list_fixture_fields","text":"
dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]\n

Create a dict from fixture_list with field_name as key.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `fixture_list` | `Iterable[FixtureDict]` | list of FixtureDict with `field_name` key fields. | `NEWSPAPER_COLLECTION_METADATA` |
| `field_name` | `str` | key for values within `fixture_list` fields. | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| ---- | ----------- |
| `dict[str, FixtureDict]` | A dict where extracted `field_name` is key for related FixtureDict values. |

Example
>>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n>>> fixture_dict['hmd']['pk']\n2\n>>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture_dict['hmd']['fields']['code']\n'bl-hmd'\n
Source code in alto2txt2fixture/utils.py
def dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]:\n\"\"\"Create a `dict` from ``fixture_list`` with ``attr_name`` as `key`.\n    Args:\n        fixture_list: `list` of `FixtureDict` with ``attr_name`` key `fields`.\n        field_name: key for values within ``fixture_list`` `fields`.\n    Returns:\n        A `dict` where extracted `field_name` is key for related `FixtureDict` values.\n    Example:\n        ```pycon\n        >>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n        >>> fixture_dict['hmd']['pk']\n        2\n        >>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        >>> fixture_dict['hmd']['fields']['code']\n        'bl-hmd'\n        ```\n    \"\"\"\nreturn {record[\"fields\"][field_name]: record for record in fixture_list}\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.export_fixtures","title":"export_fixtures","text":"
export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None\n

Export fixture_tables in formats.

Note

This is still in an experimental phase of development and is not recommended for production.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| `fixture_tables` | `dict[str, Sequence[FixtureDict]]` | dict of table name (eg: dataprovider) and FixtureDict | *required* |
| `path` | `str \| PathLike` | Path to save exports in | `settings.FIXTURE_TABLES_OUTPUT` |
| `prefix` | `str` | str to prefix export filenames with | `'test-'` |
| `formats` | `Sequence[EXPORT_FORMATS]` | list of EXPORT_FORMATS to export | `settings.FIXTURE_TABLES_FORMATS` |

Example
>>> test_fixture_tables: dict[str, FixtureDict] = {\n...     'test0': NEWSPAPER_COLLECTION_METADATA,\n...     'test1': NEWSPAPER_COLLECTION_METADATA}\n>>> export_fixtures(test_fixture_tables, path='tests/')\n...     # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...Warning: Saving test0...\n...Warning: Saving test1...\n>>> from pandas import read_csv\n>>> fixture0_json = load_json('tests/test-test0-1.json')\n>>> fixture0_df = read_csv('tests/test-test0-1.csv')\n>>> fixture1_json = load_json('tests/test-test1-1.json')\n>>> fixture1_df = read_csv('tests/test-test1-1.csv')\n>>> fixture0_json == fixture1_json\nTrue\n>>> all(fixture0_df == fixture1_df)\nTrue\n>>> all(field in fixture0_json[0]['fields']\n...     for field in ['created_at', 'updated_at'])\nTrue\n>>> fixture0_json[1]['pk']\n2\n>>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n[2, 'hmd']\n
Source code in alto2txt2fixture/utils.py
def export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None:\n\"\"\"Export ``fixture_tables`` in ``formats``.\n    Note:\n        This is still in experimental phase of development and not recommended\n        for production.\n    Args:\n        fixture_tables: `dict` of table name (eg: `dataprovider`) and `FixtureDict`\n        path: Path to save exports in\n        prefix: `str` to prefix export filenames with\n        formats: list of `EXPORT_FORMATS` to export\n    Example:\n        ```pycon\n        >>> test_fixture_tables: dict[str, FixtureDict] = {\n        ...     'test0': NEWSPAPER_COLLECTION_METADATA,\n        ...     'test1': NEWSPAPER_COLLECTION_METADATA}\n        >>> export_fixtures(test_fixture_tables, path='tests/')\n        ...     # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n        <BLANKLINE>\n        ...Warning: Saving test0...\n        ...Warning: Saving test1...\n        >>> from pandas import read_csv\n        >>> fixture0_json = load_json('tests/test-test0-1.json')\n        >>> fixture0_df = read_csv('tests/test-test0-1.csv')\n        >>> fixture1_json = load_json('tests/test-test1-1.json')\n        >>> fixture1_df = read_csv('tests/test-test1-1.csv')\n        >>> fixture0_json == fixture1_json\n        True\n        >>> all(fixture0_df == fixture1_df)\n        True\n        >>> all(field in fixture0_json[0]['fields']\n        ...     for field in ['created_at', 'updated_at'])\n        True\n        >>> fixture0_json[1]['pk']\n        2\n        >>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        >>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n        [2, 'hmd']\n        ```\n    \"\"\"\nfor table_name, records in fixture_tables.items():\nwarning(\nf\"Saving {table_name} fixture in {formats} formats \"\nf\"to {path} *without* checks...\"\n)\nif \"json\" in formats:\nsave_fixture(\nrecords,\nprefix=f\"{prefix}{table_name}\",\noutput_path=path,\nadd_created=add_created,\n)\nif \"csv\" in formats:\nfixtures_dict2csv(records, prefix=f\"{prefix}{table_name}\", output_path=path)\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.filter_json_fields","title":"filter_json_fields","text":"
filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs: Hashable\n) -> dict | list\n

Return keys and values from json_results where any fields equal value.

Parameters:

Name Type Description Default file_path PathLike | None

The file path to load based on extension and filter

None fields Sequence[str]

Which fields to check equal value

[] value Hashable

Value to filter by

''

Returns:

Type Description dict | list

A dict of records indexed by pk which fit filter criteria

Raises:

Type Description ValueError

file_path must have a .json suffix

Example
>>> from pprint import pprint\n>>> entry_fixture: dict = [\n...     {\"pk\": 4889, \"model\": \"mitchells.entry\",\n...      \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n...                 \"price_raw\": ['2d'],\n...                 \"year\": 1920,\n...                 \"date_established_raw\": \"1857\",\n...                 \"persons\": [], \"newspaper\": \"\"}},\n...      {\"pk\": 9207, \"model\": \"mitchells.entry\",\n...       \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n...                  \"price_raw\": ['2 \u00bd d', '3 \u00bd d'],\n...                  \"year\": 1856,\n...                  \"date_established_raw\": \"1848\",\n...                  \"persons\": ['Stephen Soulby'],\n...                  \"newspaper\": \"\",}},\n...     {\"pk\": 15, \"model\": \"mitchells.entry\",\n...      \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n...                 \"price_raw\": ['2d', '3d'],\n...                 \"year\": 1857,\n...                 \"date_established_raw\": \"November , 1842\",\n...                 \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n...                 \"newspaper\": 1187}}\n...     ]\n>>> pprint(filter_json_fields(entry_fixture,\n...                           fields=(\"newspaper\", \"persons\"),\n...                           value=\"\"))\n[{'fields': {'date_established_raw': '1857',\n             'newspaper': '',\n             'persons': [],\n             'price_raw': ['2d'],\n             'title': 'BIRMINGHAM POST .',\n             'year': 1920},\n  'model': 'mitchells.entry',\n  'pk': 4889},\n {'fields': {'date_established_raw': '1848',\n             'newspaper': '',\n             'persons': ['Stephen Soulby'],\n             'price_raw': ['2 \u00bd d', '3 \u00bd d'],\n             'title': 'ULVERSTONE ADVERTISER .',\n             'year': 1856},\n  'model': 'mitchells.entry',\n  'pk': 9207}]\n
Source code in alto2txt2fixture/utils.py
def filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs,\n) -> dict | list:\n\"\"\"Return `keys` and `values` from `json_dict` where any `fields` equal `value`.\n    Args:\n        file_path: The file `path` to load based on extension and filter\n        fields: Which fields to check equal `value`\n        value: Value to filter by\n    Returns:\n        A `dict` of records indexed by `pk` which fit filter criteria\n    Raises:\n        ValueError: ``file_path`` must have a `.json` `suffix`\n    Example:\n        ```pycon\n        >>> from pprint import pprint\n        >>> entry_fixture: dict = [\n        ...     {\"pk\": 4889, \"model\": \"mitchells.entry\",\n        ...      \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n        ...                 \"price_raw\": ['2d'],\n        ...                 \"year\": 1920,\n        ...                 \"date_established_raw\": \"1857\",\n        ...                 \"persons\": [], \"newspaper\": \"\"}},\n        ...      {\"pk\": 9207, \"model\": \"mitchells.entry\",\n        ...       \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n        ...                  \"price_raw\": ['2 \\u00bd d', '3 \\u00bd d'],\n        ...                  \"year\": 1856,\n        ...                  \"date_established_raw\": \"1848\",\n        ...                  \"persons\": ['Stephen Soulby'],\n        ...                  \"newspaper\": \"\",}},\n        ...     {\"pk\": 15, \"model\": \"mitchells.entry\",\n        ...      \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n        ...                 \"price_raw\": ['2d', '3d'],\n        ...                 \"year\": 1857,\n        ...                 \"date_established_raw\": \"November , 1842\",\n        ...                 \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n        ...                 \"newspaper\": 1187}}\n        ...     ]\n        >>> pprint(filter_json_fields(entry_fixture,\n        ...                           fields=(\"newspaper\", \"persons\"),\n        ...                           value=\"\"))\n        [{'fields': {'date_established_raw': '1857',\n                     'newspaper': '',\n                     'persons': [],\n                     'price_raw': ['2d'],\n                     'title': 'BIRMINGHAM POST .',\n                     'year': 1920},\n          'model': 'mitchells.entry',\n          'pk': 4889},\n         {'fields': {'date_established_raw': '1848',\n                     'newspaper': '',\n                     'persons': ['Stephen Soulby'],\n                     'price_raw': ['2 \\u00bd d', '3 \\u00bd d'],\n                     'title': 'ULVERSTONE ADVERTISER .',\n                     'year': 1856},\n          'model': 'mitchells.entry',\n          'pk': 9207}]\n        ```\n    \"\"\"\nif not json_results:\nassert file_path\ntry:\nassert Path(file_path).suffix == \".json\"\nexcept AssertionError:\nraise ValueError(f\"{file_path} must be `json` format.\")\njson_results = load_json(Path(file_path), **kwargs)\nassert json_results\nif isinstance(json_results, dict):\nreturn {\nk: v\nfor k, v in json_results.items()\nif any(v[\"fields\"][field] == value for field in fields)\n}\nelse:\nreturn [\nv\nfor v in json_results\nif any(v[\"fields\"][field] == value for field in fields)\n]\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_fields","title":"fixture_fields","text":"
fixture_fields(\nfixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]\n

Generate a tuple of FixtureDict field names.

Note

This is not in the utils module to avoid a circular import.

Parameters:

Name Type Description Default fixture_dict FixtureDict

A FixtureDict instance to extract names from fields

required include_pk bool

Whether to include the pk (primary key) column

True Example
>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n('name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> hmd_dict: dict[str, Any] = fixture_fields(\n...     NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n>>> hmd_dict['code']\n'bl-hmd'\n>>> hmd_dict['pk']\n2\n>>> hmd_dict = fixture_fields(\n...     NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n>>> 'pk' in hmd_dict\nFalse\n
Source code in alto2txt2fixture/utils.py
def fixture_fields(\nfixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]:\n\"\"\"Generate a tuple of `FixtureDict` `field` names.\n    Note:\n        This is not in the `utils` module to avoid a circular import.\n    Args:\n        fixture_dict: A `FixtureDict` instance to extract names from `fields`\n        include_pk: Whether to include the `pk` (primary key) column\n    Example:\n        ```pycon\n        >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n        ('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n        >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n        ('name', 'code', 'legacy_code', 'collection', 'source_note')\n        >>> hmd_dict: dict[str, Any] = fixture_fields(\n        ...     NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n        >>> hmd_dict['code']\n        'bl-hmd'\n        >>> hmd_dict['pk']\n        2\n        >>> hmd_dict = fixture_fields(\n        ...     NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n        >>> 'pk' in hmd_dict\n        False\n        ```\n    \"\"\"\nfields: OrderedDict[str, Any] = OrderedDict(fixture_dict[\"fields\"])\nif include_pk:\nfields[\"pk\"] = fixture_dict[\"pk\"]\nfields.move_to_end(\"pk\", last=False)\nif as_dict:\nreturn fields\nelse:\nreturn tuple(fields.keys())\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_or_default_dict","title":"fixture_or_default_dict","text":"
fixture_or_default_dict(\nkey: str,\nfixture_dict: dict[str, FixtureDict],\ndefault_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict\n

Return a FixtureDict from fixture_dict via key index, else default_dict.

Parameters:

Name Type Description Default key str

a str to query fixture_dict with

required fixture_dict dict[str, FixtureDict]

a dict of str to FixtureDict, often generated by dict_from_list_fixture_fields

required default_dict FixtureDict | dict

a dict to return if key is not in fixture_dict index

{} Example
>>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n...     NEWSPAPER_COLLECTION_METADATA)\n>>> hmd_dict: FixtureDict = fixture_or_default_dict(\n...     'hmd', newspaper_dict\n... )\n>>> fixture_or_default_dict(\n...     'hmd', NEWSPAPER_COLLECTION_METADATA\n... )\n{}\n>>> fixture_or_default_dict(\n...     'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n... )\n{'a': 'default'}\n
Source code in alto2txt2fixture/utils.py
def fixture_or_default_dict(\nkey: str,\nfixture_dict: dict[str, FixtureDict],\ndefault_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict:\n\"\"\"Return a `FixtureDict` from ``fixture_list`` via ``key`` index, else ``default_dict``.\n    Args:\n        key:\n            a `str` to query ``fixture_dict`` with\n        fixture_dict: a `dict` of `str` to `FixtureDict`, often generated by\n             ``dict_from_list_fixture_fields``\n        default_dict: a `dict` to return if ``key`` is not in\n            ``fixture_dict`` index\n    Example:\n        ```pycon\n        >>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n        ...     NEWSPAPER_COLLECTION_METADATA)\n        >>> hmd_dict: FixtureDict = fixture_or_default_dict(\n        ...     'hmd', newspaper_dict\n        ... )\n        >>> fixture_or_default_dict(\n        ...     'hmd', NEWSPAPER_COLLECTION_METADATA\n        ... )\n        {}\n        >>> fixture_or_default_dict(\n        ...     'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n        ... )\n        {'a': 'default'}\n        ```\n    \"\"\"\nif key in fixture_dict:\nreturn fixture_dict[key]\nelse:\nreturn default_dict\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixtures_dict2csv","title":"fixtures_dict2csv","text":"
fixtures_dict2csv(\nfixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nindex: bool = False,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None\n

Saves fixtures generated by a generator to separate CSV files.

This function takes an Iterable or Generator of fixtures and saves them to separate CSV files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file parameter.

Parameters:

Name Type Description Default fixtures Iterable[FixtureDict] | Generator[FixtureDict, None, None]

An Iterable or Generator of the fixtures to be saved.

required prefix str

A string prefix to be added to the file names of the saved fixtures.

'' output_path PathLike | str

Path to folder fixtures are saved to

settings.OUTPUT max_elements_per_file int

Maximum JSON records saved in each file

settings.MAX_ELEMENTS_PER_FILE

Returns:

Type Description None

This function saves fixtures to files and does not return a value.

Example
>>> from pandas import read_csv\n>>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n...                   prefix='test', output_path='tests/')\n>>> imported_fixture = read_csv('tests/test-1.csv')\n>>> imported_fixture.iloc[1]['pk']\n2\n>>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n'hmd'\n
Source code in alto2txt2fixture/utils.py
def fixtures_dict2csv(\nfixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nindex: bool = False,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None:\n\"\"\"Saves fixtures generated by a generator to separate `CSV` files.\n    This function takes an `Iterable` or `Generator` of fixtures and saves them to\n    separate `CSV` files. The fixtures are saved in batches, where each batch\n    is determined by the ``max_elements_per_file`` parameter.\n    Args:\n        fixtures: An `Iterable` or `Generator` of the fixtures to be saved.\n        prefix: A string prefix to be added to the file names of the\n            saved fixtures.\n        output_path: Path to folder fixtures are saved to\n        max_elements_per_file: Maximum `JSON` records saved in each file\n    Returns:\n        This function saves fixtures to files and does not return a value.\n    Example:\n        ```pycon\n        >>> from pandas import read_csv\n        >>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n        ...                   prefix='test', output_path='tests/')\n        >>> imported_fixture = read_csv('tests/test-1.csv')\n        >>> imported_fixture.iloc[1]['pk']\n        2\n        >>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n        'hmd'\n        ```\n    \"\"\"\ninternal_counter: int = 1\ncounter: int = 1\nlst: list = []\nPath(output_path).mkdir(parents=True, exist_ok=True)\nfor item in fixtures:\nlst.append(fixture_fields(item, as_dict=True))\ninternal_counter += 1\nif internal_counter > max_elements_per_file:\ndf: DataFrame = DataFrame.from_records(lst)\ndf.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\n# Save up some memory\ndel lst\ngc.collect()\n# Re-instantiate\nlst: list = []\ninternal_counter = 1\ncounter += 1\nelse:\ndf: DataFrame = DataFrame.from_records(lst)\ndf.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\nreturn\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.gen_fixture_tables","title":"gen_fixture_tables","text":"
gen_fixture_tables(\nfixture_tables: dict[str, list[FixtureDict]] = {},\ninclude_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]\n

Generator of rich.Table instances from FixtureDict configuration tables.

Parameters:

Name Type Description Default fixture_tables dict[str, list[FixtureDict]]

dict where key is for Table title and value is a FixtureDict

{} include_fixture_pk_column bool

whether to include the pk field from FixtureDict

True Example
>>> table_name: str = \"data_provider\"\n>>> tables = tuple(\n...     gen_fixture_tables(\n...         {table_name: NEWSPAPER_COLLECTION_METADATA}\n...     ))\n>>> len(tables)\n1\n>>> assert tables[0].title == table_name\n>>> [column.header for column in tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n
Source code in alto2txt2fixture/utils.py
def gen_fixture_tables(\nfixture_tables: dict[str, list[FixtureDict]] = {},\ninclude_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]:\n\"\"\"Generator of `rich.Table` instances from `FixtureDict` configuration tables.\n    Args:\n        fixture_tables: `dict` where `key` is for `Table` title and `value` is a `FixtureDict`\n        include_fixture_pk_column: whether to include the `pk` field from `FixtureDict`\n    Example:\n        ```pycon\n        >>> table_name: str = \"data_provider\"\n        >>> tables = tuple(\n        ...     gen_fixture_tables(\n        ...         {table_name: NEWSPAPER_COLLECTION_METADATA}\n        ...     ))\n        >>> len(tables)\n        1\n        >>> assert tables[0].title == table_name\n        >>> [column.header for column in tables[0].columns]\n        ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n        ```\n    \"\"\"\nfor name, fixture_records in fixture_tables.items():\nfixture_table: Table = Table(title=name)\nfor i, fixture_dict in enumerate(fixture_records):\nif i == 0:\n[\nfixture_table.add_column(name)\nfor name in fixture_fields(fixture_dict, include_fixture_pk_column)\n]\nrow_values: tuple[str, ...] = tuple(\nstr(x) for x in (fixture_dict[\"pk\"], *fixture_dict[\"fields\"].values())\n)\nfixture_table.add_row(*row_values)\nyield fixture_table\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_chunked_zipfiles","title":"get_chunked_zipfiles","text":"
get_chunked_zipfiles(path: Path) -> list\n

This function takes in a Path object path and returns a list of lists of zipfiles sorted and chunked according to certain conditions defined in the settings object (see settings.CHUNK_THRESHOLD).

Note: the function will also skip zip files over a certain size, which can be specified in the settings object (see settings.SKIP_FILE_SIZE).

Parameters:

Name Type Description Default path Path

The input path where the zipfiles are located

required

Returns:

Type Description list

A list of lists of zipfiles; each inner list represents a chunk of zipfiles.

Source code in alto2txt2fixture/utils.py
def get_chunked_zipfiles(path: Path) -> list:\n\"\"\"This function takes in a `Path` object `path` and returns a list of lists\n    of `zipfiles` sorted and chunked according to certain conditions defined\n    in the `settings` object (see `settings.CHUNK_THRESHOLD`).\n    Note: the function will also skip zip files of a certain file size, which\n    can be specified in the `settings` object (see `settings.SKIP_FILE_SIZE`).\n    Args:\n        path: The input path where the zipfiles are located\n    Returns:\n        A list of lists of `zipfiles`, each inner list represents a chunk of\n            zipfiles.\n    \"\"\"\nzipfiles = sorted(\npath.glob(\"*.zip\"),\nkey=lambda x: x.stat().st_size,\nreverse=settings.START_WITH_LARGEST,\n)\nzipfiles = [x for x in zipfiles if x.stat().st_size <= settings.SKIP_FILE_SIZE]\nif len(zipfiles) > settings.CHUNK_THRESHOLD:\nchunks = array_split(zipfiles, len(zipfiles) / settings.CHUNK_THRESHOLD)\nelse:\nchunks = [zipfiles]\nreturn chunks\n
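Example (a minimal sketch of the chunking arithmetic only, assuming numpy's array_split as used above; the file names and CHUNK_THRESHOLD value are hypothetical stand-ins for the settings object):
>>> from numpy import array_split
>>> zipfiles = [f"{n}.zip" for n in range(5)]  # hypothetical pre-sorted names
>>> CHUNK_THRESHOLD = 2  # stand-in for settings.CHUNK_THRESHOLD
>>> chunks = (array_split(zipfiles, len(zipfiles) / CHUNK_THRESHOLD)
...           if len(zipfiles) > CHUNK_THRESHOLD else [zipfiles])
>>> [chunk.tolist() for chunk in chunks]
[['0.zip', '1.zip', '2.zip'], ['3.zip', '4.zip']]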
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_key","title":"get_key","text":"
get_key(x: dict = dict(), on: list = []) -> str\n

Get a string key from a dictionary using values from specified keys.

Parameters:

Name Type Description Default x dict

A dictionary from which the key is generated.

dict() on list

A list of keys from the dictionary that should be used to generate the key.

[]

Returns:

Type Description str

The generated string key.

Source code in alto2txt2fixture/utils.py
def get_key(x: dict = dict(), on: list = []) -> str:\n\"\"\"\n    Get a string key from a dictionary using values from specified keys.\n    Args:\n        x: A dictionary from which the key is generated.\n        on: A list of keys from the dictionary that should be used to\n            generate the key.\n    Returns:\n        The generated string key.\n    \"\"\"\nreturn f\"{'-'.join([str(x['fields'][y]) for y in on])}\"\n
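Example (hypothetical record values, following the fields layout of the fixtures above):
>>> get_key({'fields': {'publication_code': '0002646', 'year': 1920}},
...         on=['publication_code', 'year'])
'0002646-1920'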
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_lockfile","title":"get_lockfile","text":"
get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path\n

Provides the path to any given lockfile, which controls whether any existing files should be overwritten or not.

Parameters:

Name Type Description Default collection str

Collection folder name

required kind NewspaperElements

Either newspaper or issue or item

required dic dict

A dictionary with required information for either kind passed

required

Returns:

Type Description Path

Path to the resulting lockfile

Source code in alto2txt2fixture/utils.py
def get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path:\n\"\"\"\n    Provides the path to any given lockfile, which controls whether any\n    existing files should be overwritten or not.\n    Args:\n        collection: Collection folder name\n        kind: Either `newspaper` or `issue` or `item`\n        dic: A dictionary with required information for either `kind` passed\n    Returns:\n        Path to the resulting lockfile\n    \"\"\"\np: Path\nbase = Path(f\"cache-lockfiles/{collection}\")\nif kind == \"newspaper\":\np = base / f\"newspapers/{dic['publication_code']}\"\nelif kind == \"issue\":\np = base / f\"issues/{dic['publication__publication_code']}/{dic['issue_code']}\"\nelif kind == \"item\":\ntry:\nif dic.get(\"issue_code\"):\np = base / f\"items/{dic['issue_code']}/{dic['item_code']}\"\nelif dic.get(\"issue__issue_identifier\"):\np = base / f\"items/{dic['issue__issue_identifier']}/{dic['item_code']}\"\nexcept KeyError:\nerror(\"An unknown error occurred (in get_lockfile)\")\nelse:\np = base / \"lockfile\"\np.parent.mkdir(parents=True, exist_ok=True) if settings.WRITE_LOCKFILES else None\nreturn p\n
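Example (a sketch with a hypothetical publication code; printed to avoid platform-specific Path reprs, and note the parent directory is only created when settings.WRITE_LOCKFILES is set):
>>> print(get_lockfile('hmd', 'newspaper', {'publication_code': '0002646'}))
cache-lockfiles/hmd/newspapers/0002646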
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_now","title":"get_now","text":"
get_now(as_str: bool = False) -> datetime.datetime | str\n

Return datetime.now() as either a string or datetime object.

Parameters:

Name Type Description Default as_str bool

Whether to return now time as a str or not, default: False

False

Returns:

Type Description datetime.datetime | str

datetime.now() in pytz.UTC time zone as a string if as_str, else as a datetime.datetime object.

Source code in alto2txt2fixture/utils.py
def get_now(as_str: bool = False) -> datetime.datetime | str:\n\"\"\"\n    Return `datetime.now()` as either a string or `datetime` object.\n    Args:\n        as_str: Whether to return `now` `time` as a `str` or not, default: `False`\n    Returns:\n        `datetime.now()` in `pytz.UTC` time zone as a string if `as_str`, else\n            as a `datetime.datetime` object.\n    \"\"\"\nnow = datetime.datetime.now(tz=pytz.UTC)\nif as_str:\nreturn str(now)\nelse:\nassert isinstance(now, datetime.datetime)\nreturn now\n
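Example:
>>> import datetime
>>> isinstance(get_now(), datetime.datetime)
True
>>> isinstance(get_now(as_str=True), str)
True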
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_path_from","title":"get_path_from","text":"
get_path_from(p: str | Path) -> Path\n

Converts an input value into a Path object if it's not already one.

Parameters:

Name Type Description Default p str | Path

The input value, which can be a string or a Path object.

required

Returns:

Type Description Path

The input value as a Path object.

Source code in alto2txt2fixture/utils.py
def get_path_from(p: str | Path) -> Path:\n\"\"\"\n    Converts an input value into a Path object if it's not already one.\n    Args:\n        p: The input value, which can be a string or a Path object.\n    Returns:\n        The input value as a Path object.\n    \"\"\"\nif isinstance(p, str):\np = Path(p)\nif not isinstance(p, Path):\nraise RuntimeError(f\"Unable to handle type: {type(p)}\")\nreturn p\n
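Example (with a hypothetical file path):
>>> from pathlib import Path
>>> get_path_from('some/file.txt') == Path('some/file.txt')
True
>>> get_path_from(Path('some/file.txt')) == Path('some/file.txt')
True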
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_size_from_path","title":"get_size_from_path","text":"
get_size_from_path(p: str | Path, raw: bool = False) -> str | float\n

Returns a nice string for any given file size.

Parameters:

Name Type Description Default p str | Path

Path to read the size from

required raw bool

Whether to return the file size as total number of bytes or a human-readable MB/GB amount

False

Returns:

Type Description str | float

Return str followed by MB or GB for size if not raw otherwise float.

Source code in alto2txt2fixture/utils.py
def get_size_from_path(p: str | Path, raw: bool = False) -> str | float:\n\"\"\"\n    Returns a nice string for any given file size.\n    Args:\n        p: Path to read the size from\n        raw: Whether to return the file size as total number of bytes or\n            a human-readable MB/GB amount\n    Returns:\n        Return `str` followed by `MB` or `GB` for size if not `raw` otherwise `float`.\n    \"\"\"\np = get_path_from(p)\nbytes = p.stat().st_size\nif raw:\nreturn bytes\nrel_size: float | int | str = round(bytes / 1000 / 1000 / 1000, 1)\nassert not isinstance(rel_size, str)\nif rel_size < 0.5:\nrel_size = round(bytes / 1000 / 1000, 1)\nrel_size = f\"{rel_size}MB\"\nelse:\nrel_size = f\"{rel_size}GB\"\nreturn rel_size\n
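Example (a minimal sketch writing a small scratch file under the tests/ folder used by other examples):
>>> from pathlib import Path
>>> Path('tests').mkdir(exist_ok=True)
>>> p = Path('tests/size-example.bin')
>>> p.write_bytes(b'0123456789')
10
>>> get_size_from_path(p, raw=True)
10
>>> get_size_from_path(p)
'0.0MB'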
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.glob_filter","title":"glob_filter","text":"
glob_filter(p: str) -> list\n

Return an ordered glob, filtering out any pesky, unwanted .DS_Store files from macOS.

Parameters:

Name Type Description Default p str

Path to a directory to filter

required

Returns:

Type Description list

Sorted list of files contained in the provided path without the ones

list

whose names start with a .

Source code in alto2txt2fixture/utils.py
def glob_filter(p: str) -> list:\n\"\"\"\n    Return ordered glob, filtered out any pesky, unwanted .DS_Store from macOS.\n    Args:\n        p: Path to a directory to filter\n    Returns:\n        Sorted list of files contained in the provided path without the ones\n        whose names start with a `.`\n    \"\"\"\nreturn sorted([x for x in get_path_from(p).glob(\"*\") if not x.name.startswith(\".\")])\n
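Example (a sketch using a hypothetical scratch directory):
>>> from pathlib import Path
>>> d = Path('tests/glob-example')
>>> d.mkdir(parents=True, exist_ok=True)
>>> for name in ('.DS_Store', 'b.txt', 'a.txt'):
...     _ = (d / name).write_text('')
>>> [p.name for p in glob_filter('tests/glob-example')]
['a.txt', 'b.txt']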
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.list_json_files","title":"list_json_files","text":"
list_json_files(\np: str | Path,\ndrill: bool = False,\nexclude_names: list = [],\ninclude_names: list = [],\n) -> Generator[Path, None, None] | list[Path]\n

List json files under the path specified in p.

Parameters:

Name Type Description Default p str | Path

The path to search for json files

required drill bool

A flag indicating whether to drill down the subdirectories or not. Default is False

False exclude_names list

A list of file names to exclude from the search result. Default is an empty list

[] include_names list

A list of file names to include in search result. If provided, the exclude_names argument will be ignored. Default is an empty list

[]

Returns:

Type Description Generator[Path, None, None] | list[Path]

A list of Path objects pointing to the found json files

Source code in alto2txt2fixture/utils.py
def list_json_files(\np: str | Path,\ndrill: bool = False,\nexclude_names: list = [],\ninclude_names: list = [],\n) -> Generator[Path, None, None] | list[Path]:\n\"\"\"\n    List `json` files under the path specified in ``p``.\n    Args:\n        p: The path to search for `json` files\n        drill: A flag indicating whether to drill down the subdirectories\n            or not. Default is ``False``\n        exclude_names: A list of file names to exclude from the search\n            result. Default is an empty list\n        include_names: A list of file names to include in search result.\n            If provided, the ``exclude_names`` argument will be ignored.\n            Default is an empty list\n    Returns:\n        A list of `Path` objects pointing to the found `json` files\n    \"\"\"\nq: str = \"**/*.json\" if drill else \"*.json\"\nfiles = get_path_from(p).glob(q)\nif exclude_names:\nfiles = list({x for x in files if x.name not in exclude_names})\nelif include_names:\nfiles = list({x for x in files if x.name in include_names})\nreturn sorted(files)\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_json","title":"load_json","text":"
load_json(p: str | Path, crash: bool = False) -> dict | list\n

Easier access to reading json files.

Parameters:

Name Type Description Default p str | Path

Path to read json from

required crash bool

Whether the program should crash if there is a json decode error, default: False

False

Returns:

Type Description dict | list

The decoded json contents from the path, but an empty dictionary

dict | list

if the file cannot be decoded and crash is set to False

Source code in alto2txt2fixture/utils.py
def load_json(p: str | Path, crash: bool = False) -> dict | list:\n\"\"\"\n    Easier access to reading `json` files.\n    Args:\n        p: Path to read `json` from\n        crash: Whether the program should crash if there is a `json` decode\n            error, default: ``False``\n    Returns:\n        The decoded `json` contents from the path, but an empty dictionary\n        if the file cannot be decoded and ``crash`` is set to ``False``\n    \"\"\"\np = get_path_from(p)\ntry:\nreturn json.loads(p.read_text())\nexcept json.JSONDecodeError:\nmsg = f\"Error: {p.read_text()}\"\nerror(msg, crash=crash)\nreturn {}\n
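Example (a sketch with a hypothetical scratch file, assuming the tests/ folder exists):
>>> from pathlib import Path
>>> _ = Path('tests/load-json-example.json').write_text('{"a": 1}')
>>> load_json('tests/load-json-example.json')
{'a': 1}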
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_multiple_json","title":"load_multiple_json","text":"
load_multiple_json(\np: str | Path,\ndrill: bool = False,\nfilter_na: bool = True,\ncrash: bool = False,\n) -> list\n

Load multiple json files and return a list of their content.

Parameters:

Name Type Description Default p str | Path

The path to search for json files

required drill bool

A flag indicating whether to drill down the subdirectories or not. Default is False

False filter_na bool

A flag indicating whether to filter out the content that is None. Default is True.

True crash bool

A flag indicating whether to raise an exception when an error occurs while loading a json file. Default is False.

False

Returns:

Type Description list

A list of the content of the loaded json files.

Source code in alto2txt2fixture/utils.py
def load_multiple_json(\np: str | Path,\ndrill: bool = False,\nfilter_na: bool = True,\ncrash: bool = False,\n) -> list:\n\"\"\"\n    Load multiple `json` files and return a list of their content.\n    Args:\n        p: The path to search for `json` files\n        drill: A flag indicating whether to drill down the subdirectories\n            or not. Default is `False`\n        filter_na: A flag indicating whether to filter out the content that\n            is `None`. Default is `True`.\n        crash: A flag indicating whether to raise an exception when an\n            error occurs while loading a `json` file. Default is `False`.\n    Returns:\n        A `list` of the content of the loaded `json` files.\n    \"\"\"\nfiles = list_json_files(p, drill=drill)\ncontent = [load_json(x, crash=crash) for x in files]\nreturn [x for x in content if x] if filter_na else content\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.lock","title":"lock","text":"
lock(lockfile: Path) -> None\n

Writes an empty lockfile, after making sure the parent directory exists.

Parameters:

Name Type Description Default lockfile Path

The path to the lock file to be created

required

Returns:

Type Description None

None

Source code in alto2txt2fixture/utils.py
def lock(lockfile: Path) -> None:\n\"\"\"\n    Writes a '.' to a lockfile, after making sure the parent directory exists.\n    Args:\n        lockfile: The path to the lock file to be created\n    Returns:\n        None\n    \"\"\"\nlockfile.parent.mkdir(parents=True, exist_ok=True)\nlockfile.write_text(\"\")\nreturn\n
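Example (a sketch with a hypothetical lockfile path):
>>> from pathlib import Path
>>> lock(Path('tests/lock-example/lockfile'))
>>> Path('tests/lock-example/lockfile').exists()
True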
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.save_fixture","title":"save_fixture","text":"
save_fixture(\ngenerator: Sequence | Generator = [],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n

Saves fixtures generated by a generator to separate JSON files.

This function takes a generator and saves the generated fixtures to separate JSON files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file parameter.

Parameters:

Name Type Description Default generator Sequence | Generator

A generator that yields the fixtures to be saved.

[] prefix str

A string prefix to be added to the file names of the saved fixtures.

'' output_path PathLike | str

Path to folder fixtures are saved to

settings.OUTPUT max_elements_per_file int

Maximum JSON records saved in each file

settings.MAX_ELEMENTS_PER_FILE add_created bool

Whether to add created_at and updated_at timestamps

True json_indent int

Number of indent spaces per line in saved JSON

JSON_INDENT

Returns:

Type Description None

This function saves the fixtures to files but does not return any value.

Example
>>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n...              prefix='test', output_path='tests/')\n>>> imported_fixture = load_json('tests/test-1.json')\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> 'created_at' in imported_fixture[1]['fields']\nTrue\n
Source code in alto2txt2fixture/utils.py
def save_fixture(\ngenerator: Sequence | Generator = [],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None:\n\"\"\"Saves fixtures generated by a generator to separate JSON files.\n    This function takes a generator and saves the generated fixtures to\n    separate JSON files. The fixtures are saved in batches, where each batch\n    is determined by the ``max_elements_per_file`` parameter.\n    Args:\n        generator: A generator that yields the fixtures to be saved.\n        prefix: A string prefix to be added to the file names of the\n            saved fixtures.\n        output_path: Path to folder fixtures are saved to\n        max_elements_per_file: Maximum `JSON` records saved in each file\n        add_created: Whether to add `created_at` and `updated_at` `timestamps`\n        json_indent: Number of indent spaces per line in saved `JSON`\n    Returns:\n        This function saves the fixtures to files but does not return\n            any value.\n    Example:\n        ```pycon\n        >>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n        ...              prefix='test', output_path='tests/')\n        >>> imported_fixture = load_json('tests/test-1.json')\n        >>> imported_fixture[1]['pk']\n        2\n        >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        >>> 'created_at' in imported_fixture[1]['fields']\n        True\n        ```\n    \"\"\"\ninternal_counter = 1\ncounter = 1\nlst = []\nPath(output_path).mkdir(parents=True, exist_ok=True)\nfor item in generator:\nlst.append(item)\ninternal_counter += 1\nif internal_counter > max_elements_per_file:\nwrite_json(\np=Path(f\"{output_path}/{prefix}-{counter}.json\"),\no=lst,\nadd_created=add_created,\njson_indent=json_indent,\n)\n# Save up some memory\ndel lst\ngc.collect()\n# Re-instantiate\nlst = []\ninternal_counter = 1\ncounter += 1\nelse:\nwrite_json(\np=Path(f\"{output_path}/{prefix}-{counter}.json\"),\no=lst,\nadd_created=add_created,\njson_indent=json_indent,\n)\nreturn\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.write_json","title":"write_json","text":"
write_json(\np: str | Path,\no: dict,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n

Easier access to writing json files. Checks whether parent exists.

Parameters:

Name Type Description Default p str | Path

Path to write json to

required o dict

Object to write to json file

required add_created bool

If set to True, adds created_at and updated_at to the dictionary's fields. If created_at and updated_at already exist in the fields, they will be forcefully updated.

True json_indent int

What indentation format to write the JSON file out in

JSON_INDENT

Returns:

Type Description None

None

Example

>>> path = 'test-write-json/example.json'\n>>> write_json(p=path,\n...            o=NEWSPAPER_COLLECTION_METADATA,\n...            add_created=True)\n>>> imported_fixture = load_json(path)\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n

Source code in alto2txt2fixture/utils.py
def write_json(\np: str | Path, o: dict, add_created: bool = True, json_indent: int = JSON_INDENT\n) -> None:\n\"\"\"\n    Easier access to writing `json` files. Checks whether parent exists.\n    Args:\n        p: Path to write `json` to\n        o: Object to write to `json` file\n        add_created:\n            If set to True will add `created_at` and `updated_at`\n            to the dictionary's fields. If `created_at` and `updated_at`\n            already exist in the fields, they will be forcefully updated.\n        json_indent:\n            What indetation format to write out `JSON` file in\n    Returns:\n        None\n    Example:\n        ```pycon\n        >>> path = 'test-write-json/example.json'\n        >>> write_json(p=path,\n        ...            o=NEWSPAPER_COLLECTION_METADATA,\n        ...            add_created=True)\n        >>> imported_fixture = load_json(path)\n        >>> imported_fixture[1]['pk']\n        2\n        >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        ```\n        `\n    \"\"\"\np = get_path_from(p)\nif not (isinstance(o, dict) or isinstance(o, list)):\nraise RuntimeError(f\"Unable to handle data of type: {type(o)}\")\ndef _append_created_fields(o: dict):\n\"\"\"Add `created_at` and `updated_at` fields to a `dict` with `FixtureDict` values.\"\"\"\nreturn dict(\n**{k: v for k, v in o.items() if not k == \"fields\"},\nfields=dict(\n**{\nk: v\nfor k, v in o[\"fields\"].items()\nif not k == \"created_at\" and not k == \"updated_at\"\n},\n**{\"created_at\": NOW_str, \"updated_at\": NOW_str},\n),\n)\ntry:\nif add_created and isinstance(o, dict):\no = _append_created_fields(o)\nelif add_created and isinstance(o, list):\no = [_append_created_fields(x) for x in o]\nexcept KeyError:\nerror(\"An unknown error occurred (in write_json)\")\np.parent.mkdir(parents=True, exist_ok=True)\np.write_text(json.dumps(o, indent=json_indent))\nreturn\n
"},{"location":"tutorial/first-steps.html","title":"First Steps","text":""},{"location":"tutorial/first-steps.html#installing","title":"Installing","text":"

The installation process should be fairly straightforward using poetry:

$ poetry install\n

However, this is only the first step in the process. As the script works through the alto2txt collections, you will need to choose either the slower option \u2014 mounting them to your computer (using blobfuse) \u2014 or the faster option \u2014 downloading the required zip files from the Azure storage to your local hard drive. Both of those options are described in the two following sections.

"},{"location":"tutorial/first-steps.html#connecting-alto2txt-to-the-program","title":"Connecting alto2txt to the program","text":""},{"location":"tutorial/first-steps.html#downloading-local-copies-of-alto2txt-on-your-computer","title":"Downloading local copies of alto2txt on your computer","text":"

This option will take up a lot of hard drive space

As of the time of writing, downloading all of alto2txt\u2019s metadata takes up about 185GB on your local drive.

You do not have to download all of the collections or all of the zip files for each collection, as long as you are aware that the resulting fixtures will be limited in scope.

"},{"location":"tutorial/first-steps.html#step-1-log-in-to-azure-using-microsoft-azure-storage-explorer","title":"Step 1: Log in to Azure using Microsoft Azure Storage Explorer","text":"

Microsoft Azure Storage Explorer (MASE) is a great and free tool for downloading content off Azure. Your first step is to download and install this product on your local computer.

Once you have opened MASE, you will need to sign into the appropriate Azure account.

"},{"location":"tutorial/first-steps.html#step-2-download-the-alto2txt-blob-container-to-your-hard-drive","title":"Step 2: Download the alto2txt blob container to your hard drive","text":"

On your left-hand side, you should see a menu where you can navigate to the correct \u201cblob container\u201d: Living with Machines > Storage Accounts > alto2txt > Blob Containers:

You will want to replicate the same structure as the Blob Container itself in a folder on your hard drive:

Once you have the structure set up, you are ready to download all of the files needed. For each of the blob containers, make sure that you download the metadata directory only onto your computer:

Select all of the files and press the download button:

Make sure you save all the zip files inside the correct local folder:

The \u201cActivities\u201d bar will now show you the progress and speed:

"},{"location":"tutorial/first-steps.html#mounting-alto2txt-on-your-computer","title":"Mounting alto2txt on your computer","text":"

This option will only work on a Linux or UNIX computer

If you have a Mac, your only option is the one above (downloading local copies).

"},{"location":"tutorial/first-steps.html#step-1-install-blobfuse","title":"Step 1: Install BlobFuse","text":"

Follow the instructions for installing BlobFuse and the instructions for how to prepare your drive for mounting.

"},{"location":"tutorial/first-steps.html#step-2-set-up-sas-tokens","title":"Step 2: Set up SAS tokens","text":"

Follow the instructions for setting up access to your Azure storage account.

"},{"location":"tutorial/first-steps.html#step-3-mount-your-blobs","title":"Step 3: Mount your blobs","text":"

TODO #3: Write this section.

Note that you can also search on the internet for ideas on how to create local scripts to facilitate easier connection next time.

"}]} \ No newline at end of file diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 0000000..8713024 --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"alto2txt2fixture","text":"

alto2txt2fixture is a standalone tool to convert alto2txt XML output and other related datasets into JSON (and where feasible CSV) data with corresponding relational IDs to ease general use and ingestion into a relational database.

We target the JSON produced for importing into lwmdb: a database built using the Django Python web framework, following its database fixture structure.

"},{"location":"index.html#installation-and-simple-use","title":"Installation and simple use","text":"

We provide a command line interface to process alto2txt XML files stored locally (or mounted via azure blobfuse), and we automate a means of downloading additional public data.

"},{"location":"index.html#installation","title":"Installation","text":"

We recommend downloading a copy of the repository or using git clone. From a local copy use poetry to install dependencies:

$ cd alto2txt2fixture\n$ poetry install\n

If you would like to test, render documentation and/or contribute to the code, include the dev dependencies in a local install:

$ poetry install --with dev\n
"},{"location":"index.html#simple-use","title":"Simple use","text":"

To process newspaper metadata with a local copy of alto2txt XML results, it's easiest to have that data in the same folder as your alto2txt2fixture checkout and poetry-installed folder. Once arranged, you should be able to begin the JSON conversion with

$ poetry run a2t2f-news\n

To generate related data in JSON and CSV form, assuming you have an internet connection and access to a living-with-machines Azure account, the following will download related data into JSON and CSV files. The JSON results should be consistent with lwmdb tables for ease of import.

$ poetry run a2t2f-adj\n
"},{"location":"running.html","title":"Running the Program","text":""},{"location":"running.html#using-poetry-to-run","title":"Using poetry to run","text":"

The program should run automatically with the following command:

$ poetry run a2t2f-news\n

Alternatively, if you don\u2019t want to use the standard poetry script, you can run the (somewhat convoluted) poetry run alto2txt2fixture/run.py directly and provide any optional parameters. You can see a list of all the \u201cOptional parameters\u201d below. For example, if you want to only include the hmd collection:

$ poetry run alto2txt2fixture/run.py --collections hmd\n
"},{"location":"running.html#alternative-run-the-script-without-poetry","title":"Alternative: Run the script without poetry","text":"

If you find yourself in trouble with poetry, the program should run perfectly fine on its own, assuming the dependencies are installed. The same command, then, would be:

$ python alto2txt2fixture/run.py --collections hmd\n

Note

See the list under [tool.poetry.dependencies] in pyproject.toml for a list of dependencies that would need to be installed for alto2txt2fixture to work outside a python poetry environment.

"},{"location":"running.html#optional-parameters","title":"Optional parameters","text":"

The program has a number of optional parameters that you can choose to include or not. The table below describes each parameter, how to pass it to the program, and what its defaults are.

Flag Description Default value -c, --collections Which collections to process in the mounted alto2txt directory hmd, lwm, jisc, bna -o, --output Into which directory should the processed files be put? ./output/fixtures/ -m, --mountpoint Where is the alto2txt directories mounted? ./input/alto2txt/ -t, --test-config Print the config table but do not run False"},{"location":"running.html#successfully-running-the-program-an-example","title":"Successfully running the program: An example","text":""},{"location":"understanding-results.html","title":"Understanding the Results","text":""},{"location":"understanding-results.html#the-resulting-file-structure","title":"The resulting file structure","text":"

The examples below follow standard settings

If you choose other settings for when you run the program, your output directory may look different from the information on this page.

"},{"location":"understanding-results.html#reports","title":"Reports","text":"

Reports are automatically generated with a unique hash as the overarching folder structure. Inside the reports directory, you\u2019ll find a JSON file for each alto2txt directory (organised by NLP identifier).

The report structure, thus, looks like this:

The JSON file has some good troubleshooting information. You\u2019ll find that the contents are structured as a Python dictionary (or JavaScript Object). Here is an example:

Here is an explanation of each of the keys in the dictionary:

Key Explanation Data type path The input path for the zip file that is being converted. string bytes The size of the input zip file represented in bytes. integer size The size of the input zip file represented in a human-readable string. string contents #TODO #3 integer start Date and time when processing started (see also end below). datestring newspaper_paths #TODO #3 list (string) publication_codes A list of the NLPs that are contained in the input zip file. list (string) issue_paths A list of all the issue paths that are contained in the cache directory. list (string) item_paths A list of all the item paths that are contained in the cache directory. list (string) end Date and time when processing ended (see also start above). datestring seconds Seconds that the script spent interpreting the zip file (should be added to the microseconds below). integer microseconds Microseconds that the script spent interpreting the zip file (should be added to the seconds above). integer"},{"location":"understanding-results.html#fixtures","title":"Fixtures","text":"

The most important output of the script is contained in the fixtures directory. This directory contains JSON files for all the different tables in the corresponding Django metadata database (i.e. DataProvider, Digitisation, Ingest, Issue, Newspaper, and Item). The numbering at the end of each file indicates the order of the files as they are divided into a maximum of 2e6 elements*:

Each JSON file contains a Python-like list (JavaScript Array) of dictionaries (JavaScript Objects), which have a primary key (pk), the related database model (in the example below the Django newspapers app\u2019s newspaper table), and a nested dictionary/Object which contains all the values for the database\u2019s table entry:
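As a minimal sketch (with hypothetical field values), an entry in such a file has this shape:
>>> example_entry = {
...     'pk': 1,
...     'model': 'newspapers.newspaper',
...     'fields': {'publication_code': '0002646', 'title': 'The Example Gazette'},
... }
>>> example_entry['model']
'newspapers.newspaper'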

* The maximum elements per file can be adjusted in the settings.py file\u2019s settings object\u2019s MAX_ELEMENTS_PER_FILE value.

"},{"location":"reference/SUMMARY.html","title":"SUMMARY","text":"
  • alto2txt2fixture
    • __main__
    • cli
    • create_adjacent_tables
    • jisc
    • log
    • parser
    • patterns
    • router
    • settings
    • types
    • utils
"},{"location":"reference/alto2txt2fixture/index.html","title":"alto2txt2fixture","text":""},{"location":"reference/alto2txt2fixture/__main__.html","title":"__main__","text":"

Entry point for alto2txt2fixture.parse to convert alto2txt XML -> JSON.

This module defines the run function which is the main driver for the entire process.

It imports various functions from other modules and uses them to route and parse XML data generated by alto2txt.

The following steps are performed in the run function:

  1. Parses command line arguments using the parse_args function. If no arguments are provided, the default values are taken from the settings module.
  2. Prints a setup report to the console, showing the values of the relevant parameters.
  3. Calls the route function to route alto2txt data into subdirectories with structured files.
  4. Calls the parse function to parse the resulting JSON files.
  5. Calls the clear_cache function to clear the cache.

If the script is run as a main program (i.e. if the name of the script is __main__), the run() function is executed.

Note: at present this does not include any functionality from create_adjacent_tables.py

"},{"location":"reference/alto2txt2fixture/__main__.html#alto2txt2fixture.__main__.parse_args","title":"parse_args","text":"
parse_args(argv: list[str] | None = None) -> Namespace\n

Manage command line arguments for run()

This constructs an ArgumentParser instance to manage configuring calls of run() for newspaper XML to JSON conversion.

Parameters:

Name Type Description Default argv list[str] | None

If None, treat as equivalent of ['--help']; if a list of str, pass those options to ArgumentParser

None

Returns:

Type Description Namespace

A Namespace dict-like configuration for run()

Source code in alto2txt2fixture/__main__.py
def parse_args(argv: list[str] | None = None) -> Namespace:\n\"\"\"Manage command line arguments for `run()`\n    This constructs an `ArgumentParser` instance to manage\n    configurating calls of `run()` to manage `newspaper`\n    `XML` to `JSON` converstion.\n    Arguments:\n        argv:\n            If `None` treat as equivalent of ['--help`],\n            if a `list` of `str` pass those options to `ArgumentParser`\n    Returns:\n        A `Namespace` `dict`-like configuration for `run()`\n    \"\"\"\nargv = None if not argv else argv\nparser = ArgumentParser(\nprog=\"a2t2f-news\",\ndescription=\"Process alto2txt XML into and Django JSON Fixture files\",\nepilog=(\n\"Note: this is still in beta mode and contributions welcome\\n\\n\" + __doc__\n),\nformatter_class=RawTextHelpFormatter,\n)\nparser.add_argument(\n\"-c\",\n\"--collections\",\nnargs=\"+\",\nhelp=\"<Optional> Set collections\",\nrequired=False,\n)\nparser.add_argument(\n\"-m\",\n\"--mountpoint\",\ntype=str,\nhelp=\"<Optional> Mountpoint\",\nrequired=False,\n)\nparser.add_argument(\n\"-o\",\n\"--output\",\ntype=str,\nhelp=\"<Optional> Set an output directory\",\nrequired=False,\n)\nparser.add_argument(\n\"-t\",\n\"--test-config\",\ndefault=False,\nhelp=\"Only print the configuration\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"-f\",\n\"--show-fixture-tables\",\ndefault=True,\nhelp=\"Print included fixture table configurations\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"--export-fixture-tables\",\ndefault=True,\nhelp=\"Experimental: export fixture tables prior to data processing\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"--data-provider-field\",\ntype=str,\ndefault=DATA_PROVIDER_INDEX,\nhelp=\"Key for indexing DataProvider records\",\n)\nreturn parser.parse_args(argv)\n
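Example (a sketch of argument parsing only; no conversion is run):
>>> args = parse_args(['--collections', 'hmd', '--test-config'])
>>> args.collections
['hmd']
>>> args.test_config
True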
"},{"location":"reference/alto2txt2fixture/__main__.html#alto2txt2fixture.__main__.run","title":"run","text":"
run(local_args: list[str] | None = None) -> None\n

Manage running newspaper XML to JSON conversion.

First parse_args is called for command line arguments including:

  • collections
  • output
  • mountpoint

If any of these arguments are specified, they will be used, otherwise they will default to the values in the settings module.

The show_setup function is then called to display the configurations being used.

The route function is then called to route the alto2txt files into subdirectories with structured files.

The parse function is then called to parse the resulting JSON files.

Finally, the clear_cache function is called to clear the cache (pending the user's confirmation).

Parameters:

Name Type Description Default local_args list[str] | None

Options passed to parse_args()

None Source code in alto2txt2fixture/__main__.py
def run(local_args: list[str] | None = None) -> None:\n\"\"\"Manage running newspaper `XML` to `JSON` conversion.\n    First `parse_args` is called for command line arguments including:\n    - `collections`\n    - `output`\n    - `mountpoint`\n    If any of these arguments are specified, they will be used, otherwise they\n    will default to the values in the `settings` module.\n    The `show_setup` function is then called to display the configurations\n    being used.\n    The `route` function is then called to route the alto2txt files into\n    subdirectories with structured files.\n    The `parse` function is then called to parse the resulting JSON files.\n    Finally, the `clear_cache` function is called to clear the cache\n    (pending the user's confirmation).\n    Arguments:\n        local_args:\n            Options passed to `parse_args()`\n    \"\"\"\nargs: Namespace = parse_args(argv=local_args)\nif args.collections:\nCOLLECTIONS = [x.lower() for x in args.collections]\nelse:\nCOLLECTIONS = settings.COLLECTIONS\nif args.output:\nOUTPUT = args.output.rstrip(\"/\")\nelse:\nOUTPUT = settings.OUTPUT\nif args.mountpoint:\nMOUNTPOINT = args.mountpoint.rstrip(\"/\")\nelse:\nMOUNTPOINT = settings.MOUNTPOINT\nshow_setup(\nCOLLECTIONS=COLLECTIONS,\nOUTPUT=OUTPUT,\nCACHE_HOME=settings.CACHE_HOME,\nMOUNTPOINT=MOUNTPOINT,\nJISC_PAPERS_CSV=settings.JISC_PAPERS_CSV,\nREPORT_DIR=settings.REPORT_DIR,\nMAX_ELEMENTS_PER_FILE=settings.MAX_ELEMENTS_PER_FILE,\n)\nif args.show_fixture_tables:\n# Show a table of fixtures used, defaults to DataProvider Table\nshow_fixture_tables(settings, data_provider_index=args.data_provider_field)\nif args.export_fixture_tables:\nexport_fixtures(\nfixture_tables=settings.FIXTURE_TABLES,\npath=OUTPUT,\nformats=settings.FIXTURE_TABLES_FORMATS,\n)\nif not args.test_config:\n# Routing alto2txt into subdirectories with structured files\nroute(\nCOLLECTIONS,\nsettings.CACHE_HOME,\nMOUNTPOINT,\nsettings.JISC_PAPERS_CSV,\nsettings.REPORT_DIR,\n)\n# Parsing the resulting JSON files\nparse(\nCOLLECTIONS,\nsettings.CACHE_HOME,\nOUTPUT,\nsettings.MAX_ELEMENTS_PER_FILE,\n)\nclear_cache(settings.CACHE_HOME)\n
"},{"location":"reference/alto2txt2fixture/cli.html","title":"cli","text":""},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_fixture_tables","title":"show_fixture_tables","text":"
show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]\n

Print fixture tables specified in settings.fixture_tables in rich.Table format.

Parameters:

Name Type Description Default run_settings dotdict

alto2txt2fixture run configuration

settings print_in_call bool

whether to print to console (will use console variable if so)

True data_provider_index str

key to index dataprovider from NEWSPAPER_COLLECTION_METADATA

DATA_PROVIDER_INDEX

Returns:

Type Description list[Table]

A list of rich.Table renders from configurations in run_settings.FIXTURE_TABLES

Example
>>> fixture_tables: list[Table] = show_fixture_tables(\n...     settings,\n...     print_in_call=False)\n>>> len(fixture_tables)\n1\n>>> fixture_tables[0].title\n'dataprovider'\n>>> [column.header for column in fixture_tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n>>> fixture_tables = show_fixture_tables(settings)\n... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n
Note

The example test may fail at some terminal sizes. Try increasing the width of the terminal window or screen used to check before raising an issue.

Source code in alto2txt2fixture/cli.py
def show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]:\n\"\"\"Print fixture tables specified in ``settings.fixture_tables`` in `rich.Table` format.\n    Arguments:\n        run_settings: `alto2txt2fixture` run configuration\n        print_in_call: whether to print to console (will use ``console`` variable if so)\n        data_provider_index: key to index `dataprovider` from ``NEWSPAPER_COLLECTION_METADATA``\n    Returns:\n        A `list` of `rich.Table` renders from configurations in ``run_settings.FIXTURE_TABLES``\n    Example:\n        ```pycon\n        >>> fixture_tables: list[Table] = show_fixture_tables(\n        ...     settings,\n        ...     print_in_call=False)\n        >>> len(fixture_tables)\n        1\n        >>> fixture_tables[0].title\n        'dataprovider'\n        >>> [column.header for column in fixture_tables[0].columns]\n        ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n        >>> fixture_tables = show_fixture_tables(settings)\n        ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n        <BLANKLINE>\n        ...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n        ```\n    Note:\n        It is possible for the example test to fail in different screen sizes. Try\n        increasing the window or screen width of terminal used to check before\n        raising an issue.\n    \"\"\"\nif run_settings.FIXTURE_TABLES:\nif \"dataprovider\" in run_settings.FIXTURE_TABLES:\ncheck_newspaper_collection_configuration(\nrun_settings.COLLECTIONS,\nrun_settings.FIXTURE_TABLES[\"dataprovider\"],\ndata_provider_index=data_provider_index,\n)\nconsole_tables: list[Table] = list(\ngen_fixture_tables(run_settings.FIXTURE_TABLES)\n)\nif print_in_call:\nfor console_table in console_tables:\nconsole.print(console_table)\nreturn console_tables\nelse:\nreturn []\n
"},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_setup","title":"show_setup","text":"
show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs: str) -> None\n

Generate a rich.table.Table for printing configuration to console.

Source code in alto2txt2fixture/cli.py
def show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs) -> None:\n\"\"\"Generate a `rich.table.Table` for printing configuration to console.\"\"\"\nif clear and os.name == \"posix\":\nos.system(\"clear\")\nelif clear:\nos.system(\"cls\")\ntable = Table(title=title)\ntable.add_column(\"Setting\", justify=\"right\", style=\"cyan\", no_wrap=True)\ntable.add_column(\"Value\", style=\"magenta\")\nfor key, value in kwargs.items():\ntable.add_row(str(key), str(value))\nconsole.print(table)\nreturn\n
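A short usage sketch: every keyword argument becomes one Setting/Value row in the printed table. The values here are hypothetical:

>>> show_setup(clear=False, COLLECTIONS="['hmd']", OUTPUT="./output")  # doctest: +SKIP

With clear=True (the default) the terminal is cleared first, via clear on POSIX or cls otherwise.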
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html","title":"create_adjacent_tables","text":""},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.correct_dict","title":"correct_dict","text":"
correct_dict(o: dict) -> list\n

Returns a list with corrected data from a provided dictionary.

Source code in alto2txt2fixture/create_adjacent_tables.py
def correct_dict(o: dict) -> list:\n\"\"\"Returns a list with corrected data from a provided dictionary.\"\"\"\nreturn [(k, v[0], v[1]) for k, v in o.items() if not v[0].startswith(\"Q\")] + [\n(k, v[1], v[0]) for k, v in o.items() if v[0].startswith(\"Q\")\n]\n
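An illustrative call with hypothetical Wikidata-style values: entries whose first element starts with Q are swapped so the label precedes the Q identifier:

>>> correct_dict({
...     "Q100": ("Essex", "Q23240"),
...     "Q200": ("Q84", "London"),
... })
[('Q100', 'Essex', 'Q23240'), ('Q200', 'London', 'Q84')]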
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.csv2json_list","title":"csv2json_list","text":"
csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list\n

Save csv_path as a json file and return as a list.

Source code in alto2txt2fixture/create_adjacent_tables.py
def csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list:\n\"\"\"Save `csv_path` as a `json` file and return as a `list`.\"\"\"\njson_data = []\n# See this suggestion for `nan` values: https://stackoverflow.com/a/62691803/678486\ndf = (\npd.read_csv(csv_path, index_col=0).fillna(np.nan).replace([np.nan], [None])\n)  # fillna(None)\nif \"political_leanings\" in df.columns:\ndf[\"political_leanings\"] = df[\"political_leanings\"].apply(json.loads)\nif \"prices\" in df.columns:\ndf[\"prices\"] = df[\"prices\"].apply(json.loads)\nmodel = Path(csv_path).stem.lower()\nfor pk, row in df.iterrows():\nfields = row.to_dict()\njson_data.append({\"pk\": pk, \"model\": model, \"fields\": fields})\n(Path(output_path) / csv_path).parent.mkdir(parents=True, exist_ok=True)\nPath(output_path / f\"{Path(csv_path).stem}.json\").write_text(\njson.dumps(json_data, indent=indent)\n)\nif not saved is None:\nsaved.append(output_path / f\"{Path(csv_path).stem}.json\")\nreturn json_data\n
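A minimal sketch, assuming a one-row Place.csv written to the working directory; the resulting Place.json is saved under output_path and the fixture list is returned:

>>> import pandas as pd
>>> from pathlib import Path
>>> df = pd.DataFrame({"label": ["London"]}, index=pd.Index(["Q84"], name="pk"))
>>> df.to_csv("Place.csv")
>>> csv2json_list("Place.csv", output_path=Path("."))
[{'pk': 'Q84', 'model': 'place', 'fields': {'label': 'London'}}]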
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.download_data","title":"download_data","text":"
download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None\n

Download files in files_dict, overwrite if specified.

Parameters:

files_dict (RemoteDataFilesType): dict of related files to download. Default: {}
overwrite (bool): whether to overwrite LOCAL_CACHE files or not. Default: OVERWRITE
exclude (list[str]): list of files to exclude from files_dict. Default: []

Example
>>> tmp: Path = getfixture('tmpdir')\n>>> set_path: Path = tmp.chdir()\n>>> download_data(exclude=[\n...     \"mitchells\", \"Newspaper-1\", \"linking\"\n... ])  # doctest: +ELLIPSIS\nExcluding mitchells...\nExcluding Newspaper-1...\nExcluding linking...\nDownloading cache...dict_admin_counties.json\n100% ... 37/37 bytes\nDownloading cache...dict_countries.json\n100% ... 33.2/33.2 kB\nDownloading cache...dict_historic_counties.json\n100% ... 41.4/41.4 kB\nDownloading cache...nlp_loc_wikidata_concat.csv\n100% ... 59.8/59.8 kB\nDownloading cache...wikidata_gazetteer_selected_columns.csv\n100% ... 47.8/47.8 MB\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None:\n\"\"\"Download files in ``files_dict``, overwrite if specified.\n    Args:\n        files_dict: `dict` of related files to download\n        overwrite: `bool` to overwrite ``LOCAL_CACHE`` files or not\n        exclude: `list` of files to exclude from ``files_dict``\n    Example:\n        ```pycon\n        >>> tmp: Path = getfixture('tmpdir')\n        >>> set_path: Path = tmp.chdir()\n        >>> download_data(exclude=[\n        ...     \"mitchells\", \"Newspaper-1\", \"linking\"\n        ... ])  # doctest: +ELLIPSIS\n        Excluding mitchells...\n        Excluding Newspaper-1...\n        Excluding linking...\n        Downloading cache...dict_admin_counties.json\n        100% ... 37/37 bytes\n        Downloading cache...dict_countries.json\n        100% ... 33.2/33.2 kB\n        Downloading cache...dict_historic_counties.json\n        100% ... 41.4/41.4 kB\n        Downloading cache...nlp_loc_wikidata_concat.csv\n        100% ... 59.8/59.8 kB\n        Downloading cache...wikidata_gazetteer_selected_columns.csv\n        100% ... 47.8/47.8 MB\n        ```\n    \"\"\"\nif not files_dict:\nfiles_dict = deepcopy(FILES)\nfor data_source in exclude:\nif data_source in files_dict:\nprint(f\"Excluding {data_source}...\")\nfiles_dict.pop(data_source, 0)\nelse:\nlogger.warning(\nf'\"{data_source}\" not an option to exclude from {files_dict}'\n)\n# Describe whether local file exists\nfor k in files_dict.keys():\nfiles_dict[k][\"exists\"] = files_dict[k][\"local\"].exists()\nfiles_to_download = [\n(v[\"remote\"], v[\"local\"], v[\"exists\"])\nfor v in files_dict.values()\nif \"exists\" in v and not v[\"exists\"] or overwrite\n]\nfor url, out, exists in files_to_download:\nrmtree(Path(out), ignore_errors=True) if exists else None\nprint(f\"Downloading {out}\")\nPath(out).parent.mkdir(parents=True, exist_ok=True)\nassert isinstance(url, str)\nwith urlopen(url) as response, open(out, \"wb\") as out_file:\ntotal: int = int(response.info()[\"Content-length\"])\nwith Progress(\n\"[progress.percentage]{task.percentage:>3.0f}%\",\nBarColumn(),  # removed bar_width=None to avoid too long when resized\nDownloadColumn(),\n) as progress:\ndownload_task = progress.add_task(\"Download\", total=total)\nfor chunk in response:\nout_file.write(chunk)\nprogress.update(download_task, advance=len(chunk))\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_list","title":"get_list","text":"
get_list(x)\n

Get a list from a string which contains <SEP> as a separator. If no string is encountered, the function returns an empty list.

Source code in alto2txt2fixture/create_adjacent_tables.py
def get_list(x):\n\"\"\"Get a list from a string, which contains <SEP> as separator. If no\n    string is encountered, the function returns an empty list.\"\"\"\nreturn x.split(\"<SEP>\") if isinstance(x, str) else []\n
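Two illustrative calls, one with the <SEP> separator and one with a non-string (for example a NaN read from csv):

>>> get_list("Liberal<SEP>Conservative")
['Liberal', 'Conservative']
>>> get_list(None)
[]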
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_outpaths_dict","title":"get_outpaths_dict","text":"
get_outpaths_dict(\nnames: Sequence[str], module_name: str\n) -> TableOutputConfigType\n

Return a dict of csv and json paths for each module_name table.

The csv and json filenames are generated by prefixing each name with module_name.

Parameters:

names (Sequence[str]): iterable of names of each module_name's component. Main target is csv and json table names. Required.
module_name (str): name of the module each name is part of, added as a prefix. Required.

Returns:

A TableOutputConfigType: a dict of table names and output csv and json filenames.

Example
>>> from pprint import pprint\n>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n                      'json': 'mitchells.PoliticalLeaning.json'},\n 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConfigType:\n\"\"\"Return a `dict` of `csv` and `json` paths for each `module_name` table.\n    The `csv` and `json` paths\n    Args:\n        names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names\n        module_name: name of module each name is part of, that is added as a prefix\n    Returns:\n        A ``TableOutputConfigType``: a `dict` of table ``names`` and output\n            `csv` and `json` filenames.\n    Example:\n        ```pycon\n        >>> from pprint import pprint\n        >>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n        {'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n         'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n         'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n                              'json': 'mitchells.PoliticalLeaning.json'},\n         'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n        ```\n    \"\"\"\nreturn {\nname: OutputPathDict(\ncsv=f\"{module_name}.{name}.csv\",\njson=f\"{module_name}.{name}.json\",\n)\nfor name in names\n}\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.run","title":"run","text":"
run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None\n

Download, process and link files_dict to json and csv.

Note

This will require access to https://zooniversedata.blob.core.windows.net/downloads/.

Source code in alto2txt2fixture/create_adjacent_tables.py
def run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None:\n\"\"\"Download, process and link ``files_dict`` to `json` and `csv`.\n    Note:\n        This will require access to `https://zooniversedata.blob.core.windows.net/downloads/`.\n    \"\"\"\n# Ensure time_stamp from the point of calling `run`\nif not time_stamp:\ntime_stamp = get_now(as_str=False).strftime(TIME_FORMAT)\n# Ensure an independent deepcopy of FILES to avoid modifying subsequent runs\nif not files_dict:\nfiles_dict = deepcopy(FILES)\n# Download non-existing files\ndownload_data(files_dict=files_dict, overwrite=files_to_download_overwrite)\n# Create the output directory (defined in output_path)\noutput_path.mkdir(exist_ok=True, parents=True)\n# Read all the Wikidata Q values from Mitchells\nmitchells_df = pd.read_csv(files_dict[\"mitchells\"][\"local\"], index_col=0)\nmitchell_wikidata_mentions = sorted(\nlist(mitchells_df.PLACE_PUB_WIKI.unique()),\nkey=lambda x: int(x.replace(\"Q\", \"\")),\n)\n# Set up wikidata_gazetteer\ngaz_cols = [\"wikidata_id\", \"english_label\", \"latitude\", \"longitude\", \"geonamesIDs\"]\nwikidata_gazetteer = pd.read_csv(\nfiles_dict[\"wikidata_gazetteer_selected_columns\"][\"local\"], usecols=gaz_cols\n)\nwikidata_gazetteer.rename(\n{\n\"wikidata_id\": \"place_wikidata_id\",\n\"english_label\": \"place_label\",\n\"geonamesIDs\": \"geonames_ids\",\n},\naxis=1,\ninplace=True,\n)\n# Read in + fix all dictionaries\ndict_historic_counties = json.loads(\nPath(files_dict[\"dict_historic_counties\"][\"local\"]).read_text()\n)\ndict_admin_counties = json.loads(\nPath(files_dict[\"dict_admin_counties\"][\"local\"]).read_text()\n)\ndict_countries = json.loads(Path(files_dict[\"dict_countries\"][\"local\"]).read_text())\ndict_historic_counties = correct_dict(dict_historic_counties)\ndict_admin_counties = correct_dict(dict_admin_counties)\ndict_countries = correct_dict(dict_countries)\n# Create assisting frames\nhistorical_counties_df = pd.DataFrame(\ndict_historic_counties,\ncolumns=[\"place_wikidata_id\", \"hcounty_label\", \"hcounty_wikidata_id\"],\n)\nadmin_county_df = pd.DataFrame(\ndict_admin_counties,\ncolumns=[\n\"place_wikidata_id\",\n\"admin_county_label\",\n\"admin_county_wikidata_id\",\n],\n)\ncountries_df = pd.DataFrame(\ndict_countries,\ncolumns=[\"place_wikidata_id\", \"country_label\", \"country_wikidata_id\"],\n)\nwikidata_gazetteer = wikidata_gazetteer[\nwikidata_gazetteer.place_wikidata_id.isin(mitchell_wikidata_mentions)\n].sort_values(\"place_wikidata_id\")\nwikidata_gazetteer[\"place_pk\"] = np.arange(1, len(wikidata_gazetteer) + 1)\nwikidata_gazetteer = wikidata_gazetteer[\n[\"place_pk\"] + [x for x in wikidata_gazetteer.columns if not x == \"place_pk\"]\n]\n# Merge wikidata_gazetteer with all the assisting frames (and rename the\n# resulting columns)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, historical_counties_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, admin_county_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, countries_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer.rename(\n{\n\"admin_county_label\": \"admin_county__label\",\n\"admin_county_wikidata_id\": \"admin_county__wikidata_id\",\n\"hcounty_label\": \"historic_county__label\",\n\"hcounty_wikidata_id\": \"historic_county__wikidata_id\",\n\"country_label\": 
\"country__label\",\n\"country_wikidata_id\": \"country__wikidata_id\",\n},\naxis=1,\ninplace=True,\n)\n# Split back up into dataframes specific for the tables\nhistoric_county_table = (\nwikidata_gazetteer[[\"historic_county__label\", \"historic_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nhistoric_county_table = historic_county_table.replace({\"\": np.nan}).dropna()\nhistoric_county_table[\"historic_county__pk\"] = np.arange(\n1, len(historic_county_table) + 1\n)\nadmin_county_table = (\nwikidata_gazetteer[[\"admin_county__label\", \"admin_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nadmin_county_table = admin_county_table.replace({\"\": np.nan}).dropna()\nadmin_county_table[\"admin_county__pk\"] = np.arange(1, len(admin_county_table) + 1)\ncountry_table = (\nwikidata_gazetteer[[\"country__label\", \"country__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\ncountry_table = country_table.replace({\"\": np.nan}).dropna()\ncountry_table[\"country__pk\"] = np.arange(1, len(country_table) + 1)\n# Set up place_table from wikidata_gazetteer\nplace_table = wikidata_gazetteer.copy()\nplace_table = (\npd.merge(\nplace_table,\nhistoric_county_table,\non=[\"historic_county__label\", \"historic_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"historic_county__label\", \"historic_county__wikidata_id\"], axis=1)\n.rename({\"historic_county__pk\": \"historic_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\nadmin_county_table,\non=[\"admin_county__label\", \"admin_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"admin_county__label\", \"admin_county__wikidata_id\"], axis=1)\n.rename({\"admin_county__pk\": \"admin_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\ncountry_table,\non=[\"country__label\", \"country__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"country__label\", \"country__wikidata_id\"], axis=1)\n.rename({\"country__pk\": \"country_id\"}, axis=1)\n)\nplace_table.fillna(\"\", inplace=True)\nplace_table.set_index(\"place_pk\", inplace=True)\nplace_table.rename(\n{\"place_label\": \"label\", \"place_wikidata_id\": \"wikidata_id\"},\naxis=1,\ninplace=True,\n)\nplace_table[\"historic_county_id\"] = (\nplace_table[\"historic_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"admin_county_id\"] = (\nplace_table[\"admin_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"country_id\"] = (\nplace_table[\"country_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table.index.rename(\"pk\", inplace=True)\nplace_table.rename(\n{\n\"historic_county_id\": \"historic_county\",\n\"admin_county_id\": \"admin_county\",\n\"country_id\": \"country\",\n},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.set_index(\"historic_county__pk\", inplace=True)\nhistoric_county_table.rename(\n{x: x.split(\"__\")[1] for x in historic_county_table.columns},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.index.rename(\"pk\", inplace=True)\nadmin_county_table.set_index(\"admin_county__pk\", inplace=True)\nadmin_county_table.rename(\n{x: x.split(\"__\")[1] for x in admin_county_table.columns}, axis=1, inplace=True\n)\nadmin_county_table.index.rename(\"pk\", inplace=True)\ncountry_table.set_index(\"country__pk\", inplace=True)\ncountry_table.rename(\n{x: x.split(\"__\")[1] for x in country_table.columns}, axis=1, inplace=True\n)\ncountry_table.index.rename(\"pk\", inplace=True)\n# Adding created_at, updated_at to all the 
gazetteer tables\nplace_table[\"created_at\"] = time_stamp\nplace_table[\"updated_at\"] = time_stamp\nadmin_county_table[\"created_at\"] = time_stamp\nadmin_county_table[\"updated_at\"] = time_stamp\nhistoric_county_table[\"created_at\"] = time_stamp\nhistoric_county_table[\"updated_at\"] = time_stamp\ncountry_table[\"created_at\"] = time_stamp\ncountry_table[\"updated_at\"] = time_stamp\n# Save CSV files for gazetteer tables\nplace_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"])\nadmin_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"]\n)\nhistoric_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"]\n)\ncountry_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"])\nsaved.extend(\n[\noutput_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"],\n]\n)\n# Fix up Mitchells (already loaded)\nmitchells_df[\"politics\"] = mitchells_df.POLITICS.apply(get_list)\nmitchells_df[\"persons\"] = mitchells_df.PERSONS.apply(get_list)\nmitchells_df[\"organisations\"] = mitchells_df.ORGANIZATIONS.apply(get_list)\nmitchells_df[\"price\"] = mitchells_df.PRICE.apply(get_list)\nmitchells_df.rename(\n{\n\"ID\": \"mpd_id\",\n\"TITLE\": \"title\",\n\"politics\": \"political_leaning_raw\",\n\"price\": \"price_raw\",\n\"YEAR\": \"year\",\n\"PLACE_PUB_WIKI\": \"place_of_publication_id\",\n\"ESTABLISHED_DATE\": \"date_established_raw\",\n\"PUBLISED_DATE\": \"day_of_publication_raw\",\n},\naxis=1,\ninplace=True,\n)\ndrop_cols = [\n\"CHAIN_ID\",\n\"POLITICS\",\n\"PERSONS\",\n\"ORGANIZATIONS\",\n\"PRICE\",\n\"PLACE_PUB\",\n\"PLACE_PUB_COORD\",\n\"PLACES\",\n\"PLACES_TRES\",\n\"TEXT\",\n]\nmitchells_df.drop(columns=drop_cols, inplace=True)\n# Create derivative tables (from Mitchells) = political_leanings, prices,\n# issues\npolitical_leanings = sorted(\nlist(set([y.strip() for x in mitchells_df.political_leaning_raw for y in x]))\n)\npolitical_leanings_table = pd.DataFrame()\npolitical_leanings_table[\"political_leaning__pk\"] = np.arange(\n1, len(political_leanings) + 1\n)\npolitical_leanings_table[\"political_leaning__label\"] = political_leanings\nexport = political_leanings_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"political_leaning__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nprices = sorted(list(set([y.strip() for x in mitchells_df.price_raw for y in x])))\nprices_table = pd.DataFrame()\nprices_table[\"price__pk\"] = np.arange(1, len(prices) + 1)\nprices_table[\"price__label\"] = prices\nexport = prices_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"price__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nissues = 
sorted(list(mitchells_df.year.unique()))\nissues_table = pd.DataFrame()\nissues_table[\"issue__pk\"] = np.arange(1, len(issues) + 1)\nissues_table[\"issue__year\"] = issues\nexport = issues_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"issue__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\n# Set up linking on Mitchells dataframe\nlinking_df = pd.read_csv(\nfiles_dict[\"linking\"][\"local\"],\nindex_col=0,\ndtype={\"NLP\": str},\nusecols=[\n\"NLP\",\n\"Title\",\n\"AcquiredYears\",\n\"Editions\",\n\"EditionTitles\",\n\"City\",\n\"Publisher\",\n\"UnavailableYears\",\n\"Collection\",\n\"UK\",\n\"Complete\",\n\"Notes\",\n\"County\",\n\"HistoricCounty\",\n\"First date held\",\n\"Publication title\",\n\"link_to_mpd\",\n],\n)\nlinking_df[\"NLP\"] = linking_df.index\nlinking_df.rename(\n{\"link_to_mpd\": \"mpd_id\", \"NLP\": \"newspaper\"}, axis=1, inplace=True\n)\n# Link Mitchells with all the other data\nmitchells_df = pd.merge(mitchells_df, linking_df, on=\"mpd_id\", how=\"inner\")\n# Create entry_table\nentry_table = mitchells_df.copy()\nentry_table[\"place_of_circulation_raw\"] = \"\"\nentry_table[\"publication_district_raw\"] = \"\"\nentry_table[\"publication_county_raw\"] = \"\"\n# TODO: What happened to the three columns above? (Check w Kaspar?)\n# Only keep relevant columns\nentry_table = entry_table[\n[\n\"title\",\n\"political_leaning_raw\",\n\"price_raw\",\n\"year\",\n\"date_established_raw\",\n\"day_of_publication_raw\",\n\"place_of_circulation_raw\",\n\"publication_district_raw\",\n\"publication_county_raw\",\n\"organisations\",\n\"persons\",\n\"place_of_publication_id\",\n\"newspaper\",\n]\n]\n# Fix refs to political_leanings_table\nrev = political_leanings_table.set_index(\"political_leaning__label\")\nentry_table[\"political_leanings\"] = entry_table.political_leaning_raw.apply(\nlambda x: [rev.at[y, \"political_leaning__pk\"] for y in x]\n)\n# Fix refs to prices_table\nrev = prices_table.set_index(\"price__label\")\nentry_table[\"prices\"] = entry_table.price_raw.apply(\nlambda x: [rev.at[y.strip(), \"price__pk\"] for y in x]\n)\n# Fix refs to issues_table\nrev = issues_table.set_index(\"issue__year\")\nentry_table[\"issue\"] = entry_table.year.apply(lambda x: rev.at[x, \"issue__pk\"])\n# Fix refs to place_table\nrev = place_table.copy()\nrev[\"place__pk\"] = rev.index\nrev.set_index(\"wikidata_id\", inplace=True)\nentry_table[\"place_of_publication\"] = entry_table.place_of_publication_id.apply(\ntest_place, rev=rev\n)\nentry_table.drop(columns=[\"place_of_publication_id\"], inplace=True)\n# Set up ref to newspapers\nrev = json.loads(files_dict[\"Newspaper-1\"][\"local\"].read_text())\nrev = [dict(pk=v[\"pk\"], **v[\"fields\"]) for v in rev]\nrev = pd.DataFrame(rev)\nrev.set_index(\"publication_code\", inplace=True)\nentry_table[\"newspaper\"] = entry_table.newspaper.str.zfill(7)\nentry_table[\"newspaper\"] = entry_table.newspaper.apply(test_paper, rev=rev)\n# Create PK for entries\nentry_table[\"pk\"] = np.arange(1, len(entry_table) + 1)\n# Sort columns in entries file\nentry_table = entry_table[\n[\"pk\"] + [col for col in entry_table.columns if not col == \"pk\"]\n]\n# Add created_at, modified_at to entry_table\nentry_table[\"created_at\"] 
= time_stamp\nentry_table[\"updated_at\"] = time_stamp\n# Export entry_table\nentry_table.set_index(\"pk\").to_csv(\noutput_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"]\n)\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"])\n# ######\u00a0NOW WE CAN EASILY CREATE JSON files_dict\nfor csv_file_path in output_path.glob(\"*.csv\"):\ncsv2json_list(csv_file_path)\nprint(\"Finished - saved files:\")\nprint(\"- \" + \"\\n- \".join([str(x) for x in saved]))\n
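A hedged usage sketch; this downloads the remote files listed in FILES (network access required, see the Note above) and writes the csv and json fixtures under output_path:

>>> from pathlib import Path
>>> run(output_path=Path("./output"))  # doctest: +SKIP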
"},{"location":"reference/alto2txt2fixture/jisc.html","title":"jisc","text":""},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.get_jisc_title","title":"get_jisc_title","text":"
get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str\n

Match a newspaper title with jisc_papers records.

Takes an input_sub_path, a publication_code, and an (optional) abbreviation for any newspaper to locate the title in the jisc_papers DataFrame. jisc_papers is usually loaded via the setup_jisc_papers function.

Parameters:

title (str): target newspaper title. Required.
issue_date (str): target newspaper issue_date. Required.
jisc_papers (pd.DataFrame): DataFrame of jisc_papers to match. Required.
input_sub_path (str): path of files to narrow down the query. Required.
publication_code (str): unique code to match newspaper records. Required.
abbr (str | None): an optional abbreviation of the newspaper title. Default: None

Returns:

str: the matched title, the abbr fallback, or a string estimating the JISC equivalent newspaper title.

Source code in alto2txt2fixture/jisc.py
def get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str:\n\"\"\"\n    Match a newspaper ``title`` with ``jisc_papers`` records.\n    Takes an ``input_sub_path``, a ``publication_code``, and an (optional)\n    abbreviation for any newspaper to locate the ``title`` in the\n    ``jisc_papers`` `DataFrame`. ``jisc_papers`` is usually loaded via the\n    ``setup_jisc_papers`` function.\n    Args:\n        title: target newspaper title\n        issue_date: target newspaper issue_date\n        jisc_papers: `DataFrame` of `jisc_papers` to match\n        input_sub_path: path of files to narrow down query input_sub_path\n        publication_code: unique codes to match newspaper records\n        abbr: an optional abbreviation of the newspaper title\n    Returns:\n        Matched ``title`` `str` or ``abbr``.\n    Returns:\n        A string estimating the JISC equivalent newspaper title\n    \"\"\"\n# First option, search the input_sub_path for a valid-looking publication_code\ng = PUBLICATION_CODE.findall(input_sub_path)\nif len(g) == 1:\npublication_code = g[0]\n# Let's see if we can find title:\ntitle = (\njisc_papers[\njisc_papers.publication_code == publication_code\n].title.to_list()[0]\nif jisc_papers[\njisc_papers.publication_code == publication_code\n].title.count()\n== 1\nelse title\n)\nreturn title\n# Second option, look through JISC papers for best match (on publication_code if we have it, but abbr more importantly if we have it)\nif abbr:\n_publication_code = publication_code\npublication_code = abbr\nif jisc_papers.abbr[jisc_papers.abbr == publication_code].count():\ndate = datetime.strptime(issue_date, \"%Y-%m-%d\")\nmask = (\n(jisc_papers.abbr == publication_code)\n& (date >= jisc_papers.start_date)\n& (date <= jisc_papers.end_date)\n)\nfiltered = jisc_papers.loc[mask]\nif filtered.publication_code.count() == 1:\npublication_code = filtered.publication_code.to_list()[0]\ntitle = filtered.title.to_list()[0]\nreturn title\n# Last option: let's find all the possible titles in the jisc_papers for the abbreviation, and if it's just one unique title, let's pick it!\nif abbr:\ntest = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test[0]\nelse:\nmask1 = (jisc_papers.abbr == publication_code) & (\njisc_papers.publication_code == _publication_code\n)\ntest1 = jisc_papers.loc[mask1]\ntest1 = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test1[0]\n# Fallback: if abbreviation is set, we'll return that:\nif abbr:\n# For these exceptions, see issue comment:\n# https://github.com/alan-turing-institute/Living-with-Machines/issues/2453#issuecomment-1050652587\nif abbr == \"IPJL\":\nreturn \"Ipswich Journal\"\nelif abbr == \"BHCH\":\nreturn \"Bath Chronicle\"\nelif abbr == \"LSIR\":\nreturn \"Leeds Intelligencer\"\nelif abbr == \"AGER\":\nreturn \"Lancaster Gazetter, And General Advertiser For Lancashire West\"\nreturn abbr\nraise RuntimeError(f\"Title {title} could not be found.\")\n
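An illustrative call with hypothetical values; the returned title depends entirely on the contents of the configured JISC papers CSV:

>>> jisc_papers = setup_jisc_papers()  # doctest: +SKIP
>>> get_jisc_title(
...     title="Unknown Title",
...     issue_date="1855-03-08",
...     jisc_papers=jisc_papers,
...     input_sub_path="0003040/1855/0308",
...     publication_code="0003040",
... )  # doctest: +SKIP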
"},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.setup_jisc_papers","title":"setup_jisc_papers","text":"
setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame\n

Create a DataFrame from the JISC_PAPERS_CSV file specified in settings.

Returns:

pd.DataFrame: DataFrame with all JISC titles.

Source code in alto2txt2fixture/jisc.py
def setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame:\n\"\"\"\n    Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.\n    Returns:\n        `DataFrame` with all JISC titles.\n    \"\"\"\nif not Path(path).exists():\nraise RuntimeError(\nf\"Could not find required JISC papers file. Put {Path(path).name} in {Path(path).parent} or correct the settings with a different path.\"\n)\nmonths = {\n\"Jan\": 1,\n\"Feb\": 2,\n\"Mar\": 3,\n\"Apr\": 4,\n\"May\": 5,\n\"Jun\": 6,\n\"June\": 6,\n\"Jul\": 7,\n\"July\": 7,\n\"Aug\": 8,\n\"Sep\": 9,\n\"Sept\": 9,\n\"Oct\": 10,\n\"Nov\": 11,\n\"Dec\": 12,\n\"Dec.\": 12,\n}\njisc_papers = pd.read_csv(\npath,\nusecols=[\n\"Newspaper Title\",\n\"NLP\",\n\"Abbr\",\n\"StartD\",\n\"StartM\",\n\"StartY\",\n\"EndD\",\n\"EndM\",\n\"EndY\",\n],\n)\njisc_papers[\"start_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.StartY),\nmonth=months[x.StartM.strip(\".\").strip()],\nday=int(x.StartD),\n),\naxis=1,\n)\njisc_papers[\"end_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.EndY), month=months[x.EndM.strip(\".\").strip()], day=int(x.EndD)\n),\naxis=1,\n)\njisc_papers.drop(\n[\"StartD\", \"StartM\", \"StartY\", \"EndD\", \"EndM\", \"EndY\"],\naxis=\"columns\",\ninplace=True,\n)\njisc_papers.rename(\n{\"Newspaper Title\": \"title\", \"NLP\": \"publication_code\", \"Abbr\": \"abbr\"},\naxis=1,\ninplace=True,\n)\njisc_papers[\"title\"] = jisc_papers[\"title\"].apply(\nlambda x: \"The \" + x[:-5] if x.strip()[-5:].lower() == \", the\" else x\n)\njisc_papers[\"publication_code\"] = jisc_papers[\"publication_code\"].apply(\nlambda x: str(x).zfill(7)\n)\nreturn jisc_papers\n
"},{"location":"reference/alto2txt2fixture/log.html","title":"log","text":""},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.error","title":"error","text":"
error(msg: str, crash: bool = True, silent: bool = True) -> None\n

Print msg in colorama Fore.RED and exit().

If both crash=True and silent=True, the message is printed and exit() is called; if crash=True and silent=False, a RuntimeError is raised; otherwise the message is printed and execution continues.

Source code in alto2txt2fixture/log.py
def error(msg: str, crash: bool = True, silent: bool = True) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.RED` and `exit()`\n    If `silent` `exit()` after call, else `raise` `RuntimeError` if ``crash=True``.\"\"\"\nif crash and silent:\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nexit()\nelif crash:\nraise RuntimeError(msg) from None\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.info","title":"info","text":"
info(msg: str) -> None\n

Print msg in colorama Fore.CYAN colour.

Source code in alto2txt2fixture/log.py
def info(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.CYAN` colour.\"\"\"\nprint(f\"{Fore.CYAN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.success","title":"success","text":"
success(msg: str) -> None\n

Print msg in colorama Fore.GREEN colour.

Source code in alto2txt2fixture/log.py
def success(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.GREEN` colour.\"\"\"\nprint(f\"{Fore.GREEN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.warning","title":"warning","text":"
warning(msg: str) -> None\n

Print msg in colorama Fore.YELLOW colour, prefixed with "Warning: ".

Source code in alto2txt2fixture/log.py
def warning(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.YELLOW` colour.\"\"\"\nprint(f\"{Fore.YELLOW}Warning: {msg}{Style.RESET_ALL}\")\nreturn\n
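The four helpers share the same shape; for illustration:

>>> info("Parsing collection hmd...")       # cyan
>>> success("674 fixtures saved")           # green
>>> warning("2 files skipped")              # yellow, prefixed with "Warning: "
>>> error("Missing metadata", crash=False)  # red, does not exit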
"},{"location":"reference/alto2txt2fixture/parser.html","title":"parser","text":""},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.fixtures","title":"fixtures","text":"
fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]\n

Generates fixtures for a specified model using a list of files.

This function takes a list of files and generates fixtures for a specified model. The fixtures can be used to populate a database or perform other data-related operations.

Parameters:

filelist (list): A list of files to process and generate fixtures from. Default: []
model (str): The name of the model for which fixtures are generated. Default: ''
translate (dict): A nested dictionary representing the translation mapping for fields. The structure of the translator follows the format:

    {
        'part1': {
            'part2': {
                'translated_field': 'pk'
            }
        }
    }

The translated fields will be used as keys, and their corresponding primary keys (obtained from the provided files) will be used as values in the generated fixtures. Default: {}
rename (dict): A nested dictionary representing the field renaming mapping. The structure of the dictionary follows the format:

    {
        'part1': {
            'part2': 'new_field_name'
        }
    }

The fields specified in the dictionary will be renamed to the provided new field names in the generated fixtures. Default: {}
uniq_keys (list): A list of fields that need to be considered for uniqueness in the fixtures. If specified, the fixtures will yield only unique items based on the combination of these fields. Default: []

Yields:

FixtureDict from model, pk and dict of fields.

Returns:

Generator[FixtureDict, None, None]: this function generates fixtures but does not return any value.

Source code in alto2txt2fixture/parser.py
def fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]:\n\"\"\"\n    Generates fixtures for a specified model using a list of files.\n    This function takes a list of files and generates fixtures for a specified\n    model. The fixtures can be used to populate a database or perform other\n    data-related operations.\n    Args:\n        filelist: A list of files to process and generate fixtures from.\n        model: The name of the model for which fixtures are generated.\n            translate: A nested dictionary representing the translation mapping\n            for fields. The structure of the translator follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': {\n                        'translated_field': 'pk'\n                    }\n                }\n            }\n            ```\n            The translated fields will be used as keys, and their\n            corresponding primary keys (obtained from the provided files) will\n            be used as values in the generated fixtures.\n        rename: A nested dictionary representing the field renaming\n            mapping. The structure of the dictionary follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': 'new_field_name'\n                }\n            }\n            ```\n            The fields specified in the dictionary will be renamed to the\n            provided new field names in the generated fixtures.\n        uniq_keys: A list of fields that need to be considered for\n            uniqueness in the fixtures. If specified, the fixtures will yield\n            only unique items based on the combination of these fields.\n    Yields:\n        `FixtureDict` from ``model``, ``pk`` and `dict` of ``fields``.\n    Returns:\n        This function generates fixtures but does not return any value.\n    \"\"\"\nfilelist = sorted(filelist, key=lambda x: str(x).split(\"/\")[:-1])\ncount = len(filelist)\n# Process JSONL\nif [x for x in filelist if \".jsonl\" in x.name]:\npk = 0\n# In the future, we might want to show progress here (tqdm or suchlike)\nfor file in filelist:\nfor line in file.read_text().splitlines():\npk += 1\nline = json.loads(line)\nyield FixtureDict(\npk=pk,\nmodel=model,\nfields=dict(**get_fields(line, translate=translate, rename=rename)),\n)\nreturn\nelse:\n# Process JSON\npks = [x for x in range(1, count + 1)]\nif len(uniq_keys):\nuniq_files = list(uniq(filelist, uniq_keys))\ncount = len(uniq_files)\nzipped = zip(uniq_files, pks)\nelse:\nzipped = zip(filelist, pks)\nfor x in tqdm(\nzipped, total=count, desc=f\"{model} ({count:,} objs)\", leave=False\n):\nyield FixtureDict(\npk=x[1],\nmodel=model,\nfields=dict(**get_fields(x[0], translate=translate, rename=rename)),\n)\nreturn\n
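A small sketch, assuming a single JSON file on disk and that FixtureDict behaves as a dict (a TypedDict); timestamps are also added to the fields, so only stable values are shown:

>>> from pathlib import Path
>>> _ = Path("dp.json").write_text('{"name": "bl-hmd"}')
>>> fixture = next(fixtures(
...     filelist=[Path("dp.json")],
...     model="newspapers.dataprovider",
... ))
>>> fixture["pk"], fixture["model"], fixture["fields"]["name"]
(1, 'newspapers.dataprovider', 'bl-hmd')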
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_fields","title":"get_fields","text":"
get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict\n

Retrieves fields from a file and performs modifications and checks.

This function takes a file (in various formats: Path, str, or dict) and processes its fields. It retrieves the fields from the file and performs modifications, translations, and checks on the fields.

Parameters:

file (Union[Path, str, dict]): The file from which the fields are retrieved. Required.
translate (dict): A nested dictionary representing the translation mapping for fields, following the format:

    {
        'part1': {
            'part2': {
                'translated_field': 'pk'
            }
        }
    }

The translated fields will be used to replace the original fields in the retrieved fields. Default: {}
rename (dict): A nested dictionary representing the field renaming mapping, following the format:

    {
        'part1': {
            'part2': 'new_field_name'
        }
    }

The fields specified in the dictionary will be renamed to the provided new field names in the retrieved fields. Default: {}
allow_null (bool): Determines whether to allow None values for relational fields. If set to True, relational fields with missing values will be assigned None. If set to False, an error will be raised. Default: False

Returns:

dict: A dictionary representing the retrieved fields from the file, with modifications and checks applied.

Raises:

RuntimeError: If the file type is unsupported or if an error occurs during field retrieval or processing.

Source code in alto2txt2fixture/parser.py
def get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict:\n\"\"\"\n    Retrieves fields from a file and performs modifications and checks.\n    This function takes a file (in various formats: `Path`, `str`, or `dict`)\n    and processes its fields. It retrieves the fields from the file and\n    performs modifications, translations, and checks on the fields.\n    Args:\n        file: The file from which the fields are retrieved.\n        translate: A nested dictionary representing the translation mapping\n            for fields. The structure of the translator follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': {\n                        'translated_field': 'pk'\n                    }\n                }\n            }\n            ```\n            The translated fields will be used to replace the original fields\n            in the retrieved fields.\n        rename: A nested dictionary representing the field renaming\n            mapping. The structure of the dictionary follows the format:\n            ```python\n            {\n                'part1': {\n                    'part2': 'new_field_name'\n                }\n            }\n            ```\n            The fields specified in the dictionary will be renamed to the\n            provided new field names in the retrieved fields.\n        allow_null: Determines whether to allow ``None`` values for\n            relational fields. If set to ``True``, relational fields with\n            missing values will be assigned ``None``. If set to ``False``, an\n            error will be raised.\n    Returns:\n        A dictionary representing the retrieved fields from the file,\n            with modifications and checks applied.\n    Raises:\n        RuntimeError: If the file type is unsupported or if an error occurs\n            during field retrieval or processing.\n    \"\"\"\nif isinstance(file, Path):\ntry:\nfields = json.loads(file.read_text())\nexcept Exception as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, str):\nif \"\\n\" in file:\nraise RuntimeError(\"File has multiple lines.\")\ntry:\nfields = json.loads(file)\nexcept json.decoder.JSONDecodeError as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, dict):\nfields = file\nelse:\nraise RuntimeError(f\"Cannot process type {type(file)}.\")\n# Fix relational fields for any file\nfor key in [key for key in fields.keys() if \"__\" in key]:\nparts = key.split(\"__\")\ntry:\nbefore = fields[key]\nif before:\nbefore = before.replace(\"---\", \"/\")\nloc = translate.get(parts[0], {}).get(parts[1], {})\nfields[key] = loc.get(before)\nif fields[key] is None:\nraise RuntimeError(\nf\"Cannot translate fields.{key} from {before}: {loc}\"\n)\nexcept AttributeError:\nif allow_null:\nfields[key] = None\nelse:\nprint(\n\"Content had relational fields, but something went wrong in parsing the data:\"\n)\nprint(\"file\", file)\nprint(\"fields\", fields)\nprint(\"KEY:\", key)\nraise RuntimeError()\nnew_name = rename.get(parts[0], {}).get(parts[1], None)\nif new_name:\nfields[new_name] = fields[key]\ndel fields[key]\nfields[\"created_at\"] = NOW_str\nfields[\"updated_at\"] = NOW_str\ntry:\nfields[\"item_type\"] = str(fields[\"item_type\"]).upper()\nexcept KeyError:\npass\ntry:\nif fields[\"ocr_quality_mean\"] == \"\":\nfields[\"ocr_quality_mean\"] = 0\nexcept KeyError:\npass\ntry:\nif 
fields[\"ocr_quality_sd\"] == \"\":\nfields[\"ocr_quality_sd\"] = 0\nexcept KeyError:\npass\nreturn fields\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_key_from","title":"get_key_from","text":"
get_key_from(item: Path, x: str) -> str\n

Retrieves a specific key from a file and returns its value.

This function reads a file and extracts the value of a specified key. If the key is not found or an error occurs while processing the file, a warning is printed, and an empty string is returned.

Parameters:

item (Path): The file from which the key is extracted. Required.
x (str): The key to be retrieved from the file. Required.

Returns:

str: The value of the specified key from the file.

Source code in alto2txt2fixture/parser.py
def get_key_from(item: Path, x: str) -> str:\n\"\"\"\n    Retrieves a specific key from a file and returns its value.\n    This function reads a file and extracts the value of a specified\n    key. If the key is not found or an error occurs while processing\n    the file, a warning is printed, and an empty string is returned.\n    Args:\n        item: The file from which the key is extracted.\n        x: The key to be retrieved from the file.\n    Returns:\n        The value of the specified key from the file.\n    \"\"\"\nresult = json.loads(item.read_text()).get(x, None)\nif not result:\nprint(f\"[WARN] Could not find key {x} in {item}\")\nresult = \"\"\nreturn result\n
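For illustration, with a small file written on the fly:

>>> from pathlib import Path
>>> _ = Path("meta.json").write_text('{"name": "bl-hmd"}')
>>> get_key_from(Path("meta.json"), "name")
'bl-hmd'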
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_translator","title":"get_translator","text":"
get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict\n

Converts a list of fields into a nested dictionary representing a translator.

Parameters:

fields (list[TranslatorTuple]): A list of tuples representing fields to be translated. Default: [TranslatorTuple('', '', [])]

Returns:

dict: A nested dictionary representing the translator, following the format:

    {
        'part1': {
            'part2': {
                'translated_field': 'pk'
            }
        }
    }

Example
>>> fields = [\n...     TranslatorTuple(\n...         start='start__field1',\n...         finish='field1',\n...         lst=[{\n...             'fields': {'field1': 'translation1'},\n...             'pk': 1}],\n...      )]\n>>> get_translator(fields)\n{'start': {'field1': {'translation1': 1}}}\n
Source code in alto2txt2fixture/parser.py
def get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict:\n\"\"\"\n    Converts a list of fields into a nested dictionary representing a\n    translator.\n    Args:\n        fields: A list of tuples representing fields to be translated.\n    Returns:\n        A nested dictionary representing the translator. The structure of\n            the dictionary follows the format:\n            ```python\n            {\n                'part1': {\n                      'part2': {\n                          'translated_field': 'pk'\n                      }\n                }\n            }\n            ```\n    Example:\n        ```pycon\n        >>> fields = [\n        ...     TranslatorTuple(\n        ...         start='start__field1',\n        ...         finish='field1',\n        ...         lst=[{\n        ...             'fields': {'field1': 'translation1'},\n        ...             'pk': 1}],\n        ...      )]\n        >>> get_translator(fields)\n        {'start': {'field1': {'translation1': 1}}}\n        ```\n    \"\"\"\n_ = dict()\nfor field in fields:\nstart, finish, lst = field\npart1, part2 = start.split(\"__\")\nif part1 not in _:\n_[part1] = {}\nif part2 not in _[part1]:\n_[part1][part2] = {}\nif isinstance(finish, str):\n_[part1][part2] = {o[\"fields\"][finish]: o[\"pk\"] for o in lst}\nelif isinstance(finish, list):\n_[part1][part2] = {\n\"-\".join([o[\"fields\"][x] for x in finish]): o[\"pk\"] for o in lst\n}\nreturn _\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.parse","title":"parse","text":"
parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None\n

Parses files from collections and generates fixtures for various models.

This function processes files from the specified collections and generates fixtures for different models, such as newspapers.dataprovider, newspapers.ingest, newspapers.digitisation, newspapers.newspaper, newspapers.issue, and newspapers.item.

It performs various steps, such as file listing, fixture generation, translation mapping, renaming fields, and saving fixtures to files.

Parameters:

collections (list): A list of collections from which files are processed and fixtures are generated. Required.
cache_home (str): The directory path where the collections are located. Required.
output (str): The directory path where the fixtures will be saved. Required.
max_elements_per_file (int): The maximum number of elements per file when saving fixtures. Required.

Returns:

None: this function generates fixtures but does not return any value.

Source code in alto2txt2fixture/parser.py
def parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None:\n\"\"\"\n    Parses files from collections and generates fixtures for various models.\n    This function processes files from the specified collections and generates\n    fixtures for different models, such as `newspapers.dataprovider`,\n    `newspapers.ingest`, `newspapers.digitisation`, `newspapers.newspaper`,\n    `newspapers.issue`, and `newspapers.item`.\n    It performs various steps, such as file listing, fixture generation,\n    translation mapping, renaming fields, and saving fixtures to files.\n    Args:\n        collections: A list of collections from which files are\n            processed and fixtures are generated.\n        cache_home: The directory path where the collections are located.\n        output: The directory path where the fixtures will be saved.\n        max_elements_per_file: The maximum number of elements per file\n            when saving fixtures.\n    Returns:\n        This function generates fixtures but does not return any value.\n    \"\"\"\nglobal CACHE_HOME\nglobal OUTPUT\nglobal MAX_ELEMENTS_PER_FILE\nCACHE_HOME = cache_home\nOUTPUT = output\nMAX_ELEMENTS_PER_FILE = max_elements_per_file\n# Set up output directory\nreset_fixture_dir(OUTPUT)\n# Get file lists\nprint(\"\\nGetting file lists...\")\ndef issues_in_x(x):\nreturn \"issues\" in str(x.parent).split(\"/\")\ndef newspapers_in_x(x):\nreturn not any(\n[\ncondition\nfor y in str(x.parent).split(\"/\")\nfor condition in [\n\"issues\" in y,\n\"ingest\" in y,\n\"digitisation\" in y,\n\"data-provider\" in y,\n]\n]\n)\nall_json = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.json\")\n]\nall_jsonl = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.jsonl\")\n]\nprint(f\"--> {len(all_json):,} JSON files altogether\")\nprint(f\"--> {len(all_jsonl):,} JSONL files altogether\")\nprint(\"\\nSetting up fixtures...\")\n# Process data providers\ndef data_provider_in_x(x):\nreturn \"data-provider\" in str(x.parent).split(\"/\")\ndata_provider_json = list(\nfixtures(\nmodel=\"newspapers.dataprovider\",\nfilelist=[x for x in all_json if data_provider_in_x(x)],\nuniq_keys=[\"name\"],\n)\n)\nprint(f\"--> {len(data_provider_json):,} DataProvider fixtures\")\n# Process ingest\ndef ingest_in_x(x):\nreturn \"ingest\" in str(x.parent).split(\"/\")\ningest_json = list(\nfixtures(\nmodel=\"newspapers.ingest\",\nfilelist=[x for x in all_json if ingest_in_x(x)],\nuniq_keys=[\"lwm_tool_name\", \"lwm_tool_version\"],\n)\n)\nprint(f\"--> {len(ingest_json):,} Ingest fixtures\")\n# Process digitisation\ndef digitisation_in_x(x):\nreturn \"digitisation\" in str(x.parent).split(\"/\")\ndigitisation_json = list(\nfixtures(\nmodel=\"newspapers.digitisation\",\nfilelist=[x for x in all_json if digitisation_in_x(x)],\nuniq_keys=[\"software\"],\n)\n)\nprint(f\"--> {len(digitisation_json):,} Digitisation fixtures\")\n# Process newspapers\nnewspaper_json = list(\nfixtures(\nmodel=\"newspapers.newspaper\",\nfilelist=[file for file in all_json if newspapers_in_x(file)],\n)\n)\nprint(f\"--> {len(newspaper_json):,} Newspaper fixtures\")\n# Process issue\ntranslate = get_translator(\n[\nTranslatorTuple(\n\"publication__publication_code\", \"publication_code\", newspaper_json\n)\n]\n)\nrename = {\"publication\": {\"publication_code\": \"newspaper_id\"}}\nissue_json = list(\nfixtures(\nmodel=\"newspapers.issue\",\nfilelist=[file for file in all_json if 
issues_in_x(file)],\ntranslate=translate,\nrename=rename,\n)\n)\nprint(f\"--> {len(issue_json):,} Issue fixtures\")\n# Create translator/clear up memory before processing items\ntranslate = get_translator(\n[\n(\"issue__issue_identifier\", \"issue_code\", issue_json),\n(\"digitisation__software\", \"software\", digitisation_json),\n(\"data_provider__name\", \"name\", data_provider_json),\n(\n\"ingest__lwm_tool_identifier\",\n[\"lwm_tool_name\", \"lwm_tool_version\"],\ningest_json,\n),\n]\n)\nrename = {\n\"issue\": {\"issue_identifier\": \"issue_id\"},\n\"digitisation\": {\"software\": \"digitisation_id\"},\n\"data_provider\": {\"name\": \"data_provider_id\"},\n\"ingest\": {\"lwm_tool_identifier\": \"ingest_id\"},\n}\nsave_fixture(newspaper_json, \"Newspaper\")\nsave_fixture(issue_json, \"Issue\")\ndel newspaper_json\ndel issue_json\ngc.collect()\nprint(\"\\nSaving...\")\nsave_fixture(digitisation_json, \"Digitisation\")\nsave_fixture(ingest_json, \"Ingest\")\nsave_fixture(data_provider_json, \"DataProvider\")\n# Process items\nitem_json = fixtures(\nmodel=\"newspapers.item\",\nfilelist=all_jsonl,\ntranslate=translate,\nrename=rename,\n)\nsave_fixture(item_json, \"Item\")\nreturn\n
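A hedged call sketch, assuming the cache has already been populated by route:

>>> parse(
...     collections=["hmd"],
...     cache_home="./cache",
...     output="./output",
...     max_elements_per_file=2000000,
... )  # doctest: +SKIP

Note that reset_fixture_dir will prompt for confirmation before emptying the output directory.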
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.reset_fixture_dir","title":"reset_fixture_dir","text":"
reset_fixture_dir(output: str | Path) -> None\n

Resets the fixture directory by removing all JSON files inside it.

This function takes a directory path (output) as input and removes all JSON files within the directory.

Prior to removal, it prompts the user for confirmation to proceed. If the user confirms, the function clears the fixture directory by deleting the JSON files.

Parameters:

output (str | Path): The directory path of the fixture directory to be reset. Required.

Raises:

RuntimeError: If the output directory is not specified as a string.

Source code in alto2txt2fixture/parser.py
def reset_fixture_dir(output: str | Path) -> None:\n\"\"\"\n    Resets the fixture directory by removing all JSON files inside it.\n    This function takes a directory path (``output``) as input and removes all\n    JSON files within the directory.\n    Prior to removal, it prompts the user for confirmation to proceed. If the\n    user confirms, the function clears the fixture directory by deleting the\n    JSON files.\n    Args:\n        output: The directory path of the fixture directory to be reset.\n    Raises:\n        RuntimeError: If the ``output`` directory is not specified as a string.\n    \"\"\"\nif not isinstance(output, str):\nraise RuntimeError(\"`output` directory needs to be specified as a string.\")\noutput = Path(output)\ny = input(\nf\"This command will automatically empty the fixture directory ({output.absolute()}). \"\n\"Do you want to proceed? [y/N]\"\n)\nif not y.lower() == \"y\":\noutput.mkdir(parents=True, exist_ok=True)\nreturn\nprint(\"\\nClearing up the fixture directory\")\n# Ensure directory exists\noutput.mkdir(parents=True, exist_ok=True)\n# Drop all JSON files\n[x.unlink() for x in Path(output).glob(\"*.json\")]\nreturn\n
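For illustration (interactive, so skipped in doctests):

>>> reset_fixture_dir("./output")  # doctest: +SKIP

Answering anything other than y leaves existing JSON fixtures in place; the directory is still created if missing.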
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.uniq","title":"uniq","text":"
uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]\n

Generates unique items from a list of files based on specified keys.

This function takes a list of files and yields unique items based on a combination of keys. The keys are extracted from each file using the get_key_from function, and duplicate items are ignored.

Parameters:

Name Type Description Default filelist list

A list of files from which unique items are generated.

required keys list

A list of keys used for uniqueness. Each key specifies a field to be used for uniqueness checking in the generated items.

[]

Yields:

Type Description Any

A unique item from filelist.

Source code in alto2txt2fixture/parser.py
def uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]:\n\"\"\"\n    Generates unique items from a list of files based on specified keys.\n    This function takes a list of files and yields unique items based on a\n    combination of keys. The keys are extracted from each file using the\n    ``get_key_from`` function, and duplicate items are ignored.\n    Args:\n        filelist: A list of files from which unique items are\n            generated.\n        keys: A list of keys used for uniqueness. Each key specifies\n            a field to be used for uniqueness checking in the generated\n            items.\n    Yields:\n        A unique item from `filelist`.\n    \"\"\"\nseen = set()\nfor item in filelist:\nkey = \"-\".join([get_key_from(item, x) for x in keys])\nif key not in seen:\nseen.add(key)\nyield item\nelse:\n# Drop it if duplicate\npass\n
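Example

A standalone sketch of the same seen-set pattern uniq implements, with a stand-in key function (uniq itself derives keys via get_key_from):

```python
def first_seen(items, key=lambda x: x):
    """Yield each item whose key has not been seen before."""
    seen = set()
    for item in items:
        k = key(item)
        if k not in seen:
            seen.add(k)
            yield item

print(list(first_seen(["a.json", "b.json", "a.json"])))
# ['a.json', 'b.json']
```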
"},{"location":"reference/alto2txt2fixture/patterns.html","title":"patterns","text":"

Useful regular expressions, initially just PUBLICATION_CODE.

"},{"location":"reference/alto2txt2fixture/router.html","title":"router","text":""},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive","title":"Archive","text":"
Archive(\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n)\n

Manage extracting information from a ZIP archive.

The Archive class represents a zip archive of XML files. The class is used to extract information from a ZIP archive, and it contains several methods to process the data contained in the archive.

open(Archive) context manager

An Archive can be opened with a context manager, which creates a meta object that records timings for the object. When closed, it saves the meta JSON to the correct paths.

Attributes:

Name Type Description path Path

The path to the zip archive.

collection str

The collection of the XML files in the archive. Default is \"\".

report Path

The file path of the report file for the archive.

report_id str

The report ID for the archive. If not provided, a random UUID is generated.

report_parent Path

The parent directory of the report file for the archive.

jisc_papers pd.DataFrame

A DataFrame of JISC papers.

size str | float

The size of the archive, in human-readable format.

size_raw str | float

The raw size of the archive, in bytes.

roots Generator[ET.Element, None, None]

The root elements of the XML documents contained in the archive.

meta dotdict

Metadata about the archive, such as its path, size, and number of contents.

json_indent int

Indentation formatting of JSON output

Raises:

Type Description RuntimeError

If the path does not exist.

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n):\n\"\"\"Constructor method.\"\"\"\nself.path: Path = Path(path)\nif not self.path.exists():\nraise RuntimeError(\"Path does not exist.\")\nself.size: str | float = get_size_from_path(self.path)\nself.size_raw: str | float = get_size_from_path(self.path, raw=True)\nself.zip_file: zipfile.ZipFile = zipfile.ZipFile(self.path)\nself.collection: str = collection\nself.roots: Generator[ET.Element, None, None] = self.get_roots()\nself.meta: dotdict = dotdict(\npath=str(self.path),\nbytes=self.size_raw,\nsize=self.size,\ncontents=len(self.filelist),\n)\nif not report_id:\nself.report_id: str = str(uuid.uuid4())\nelse:\nself.report_id = report_id\nself.jisc_papers: pd.DataFrame = jisc_papers\nself.report_parent: Path = Path(f\"{REPORT_DIR}/{self.report_id}\")\nself.report: Path = (\nself.report_parent / f\"{self.path.stem.replace('_metadata', '')}.json\"\n)\nself.json_indent: int = json_indent\n
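Example

A sketch of the context-manager usage described above, mirroring how route() iterates archives; the archive path is hypothetical:

```python
from alto2txt2fixture.router import Archive

# Illustrative path: a zip of alto2txt metadata XML files
archive = Archive("metadata/example_metadata.zip", collection="hmd")
with archive as _:
    for document in archive.documents:
        ...  # process each Document yielded from the zip
# On exit, the archive's meta JSON report is saved to its report path
```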
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.documents","title":"documents property","text":"
documents\n

Property that calls the get_documents method

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.filelist","title":"filelist property","text":"
filelist\n

Returns the list of files in the zip file

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.__len__","title":"__len__","text":"
__len__()\n

The number of files inside the zip archive.

Source code in alto2txt2fixture/router.py
def __len__(self):\n\"\"\"The number of files inside the zip archive.\"\"\"\nreturn len(self.filelist)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_documents","title":"get_documents","text":"
get_documents() -> Generator[Document, None, None]\n

A generator that yields instances of the Document class for each XML file in the ZIP archive.

It uses the tqdm library to display a progress bar in the terminal while it is running.

If the contents of the ZIP file are not empty, the method creates an instance of the Document class by passing the root element of the XML file, the collection name, meta information about the archive, and the JISC papers data frame (if provided) to the constructor of the Document class. The instance of the Document class is then returned by the generator.

Yields:

Type Description Document

Document class instance for each unzipped XML file.

Source code in alto2txt2fixture/router.py
def get_documents(self) -> Generator[Document, None, None]:\n\"\"\"\n    A generator that yields instances of the Document class for each XML\n    file in the ZIP archive.\n    It uses the `tqdm` library to display a progress bar in the terminal\n    while it is running.\n    If the contents of the ZIP file are not empty, the method creates an\n    instance of the ``Document`` class by passing the root element of the XML\n    file, the collection name, meta information about the archive, and the\n    JISC papers data frame (if provided) to the constructor of the\n    ``Document`` class. The instance of the ``Document`` class is then\n    returned by the generator.\n    Yields:\n        ``Document`` class instance for each unzipped `XML` file.\n    \"\"\"\nfor xml_file in tqdm(\nself.filelist,\ndesc=f\"{Path(self.zip_file.filename).stem} ({self.meta.size})\",\nleave=False,\ncolour=\"green\",\n):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield Document(\nroot=ET.fromstring(xml),\ncollection=self.collection,\nmeta=self.meta,\njisc_papers=self.jisc_papers,\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_roots","title":"get_roots","text":"
get_roots() -> Generator[ET.Element, None, None]\n

Yields the root elements of the XML documents contained in the archive.

Source code in alto2txt2fixture/router.py
def get_roots(self) -> Generator[ET.Element, None, None]:\n\"\"\"\n    Yields the root elements of the XML documents contained in the archive.\n    \"\"\"\nfor xml_file in tqdm(self.filelist, leave=False, colour=\"blue\"):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield ET.fromstring(xml)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache","title":"Cache","text":"
Cache()\n

The Cache class provides a blueprint for creating and managing cache data. The class has several methods that help in getting the cache path, converting the data to a dictionary, and writing the cache data to a file.

It is inherited by many other classes in this module.

Initializes the Cache class object.

Source code in alto2txt2fixture/router.py
def __init__(self):\n\"\"\"\n    Initializes the Cache class object.\n    \"\"\"\npass\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.__str__","title":"__str__","text":"
__str__() -> str\n

Returns the string representation of the cache data as a dictionary.

Source code in alto2txt2fixture/router.py
def __str__(self) -> str:\n\"\"\"\n    Returns the string representation of the cache data as a dictionary.\n    \"\"\"\nreturn str(self.as_dict())\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

Converts the cache data to a dictionary and returns it.

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    Converts the cache data to a dictionary and returns it.\n    \"\"\"\nreturn {}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the cache path, which is used to store the cache data. The path is normally constructed using some of the object's properties (collection, kind, and id) but can be changed when inherited.

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the cache path, which is used to store the cache data.\n    The path is normally constructed using some of the object's\n    properties (collection, kind, and id) but can be changed when\n    inherited.\n    \"\"\"\nreturn Path(f\"{CACHE_HOME}/{self.collection}/{self.kind}/{self.id}.json\")\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.write_to_cache","title":"write_to_cache","text":"
write_to_cache(json_indent: int = JSON_INDENT) -> Optional[bool]\n

Writes the cache data to a file at the specified cache path. The cache data is first converted to a dictionary using the as_dict method. If the cache path already exists, the function returns True.

Source code in alto2txt2fixture/router.py
def write_to_cache(self, json_indent: int = JSON_INDENT) -> Optional[bool]:\n\"\"\"\n    Writes the cache data to a file at the specified cache path. The cache\n    data is first converted to a dictionary using the as_dict method. If\n    the cache path already exists, the function returns True.\n    \"\"\"\npath = self.get_cache_path()\ntry:\nif path.exists():\nreturn True\nexcept AttributeError:\nerror(\nf\"Error occurred when getting cache path for \"\nf\"{self.kind}: {path}. It was not of expected \"\nf\"type Path but of type {type(path)}:\",\n)\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"w+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent))\nreturn\n
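Example

A minimal sketch of a hypothetical Cache subclass, overriding as_dict and get_cache_path so that write_to_cache has something to persist:

```python
from pathlib import Path

class MinimalCache(Cache):
    """Hypothetical subclass for illustration only."""

    kind = "minimal"  # used in error messages

    def __init__(self, collection: str, id: str, data: dict):
        self.collection = collection
        self.id = id
        self.data = data

    def as_dict(self) -> dict:
        return self.data

    def get_cache_path(self) -> Path:
        # Same collection/kind/id layout as the base class
        return Path(f"./cache/{self.collection}/{self.kind}/{self.id}.json")

MinimalCache("hmd", "0001", {"software": "example"}).write_to_cache()
```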
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Collection","title":"Collection","text":"
Collection(name: str = 'hmd', jisc_papers: Optional[pd.DataFrame] = None)\n

A Collection represents a group of newspaper archives from a given alto2txt metadata output.

A Collection is initialised with a name and an optional pandas DataFrame of JISC papers. The archives property returns an iterable of the Archive objects within the collection.

Attributes:

Name Type Description name str

Name of the collection (default \"hmd\")

jisc_papers pandas.DataFrame

DataFrame of JISC papers, optional

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, name: str = \"hmd\", jisc_papers: Optional[pd.DataFrame] = None):\n\"\"\"Constructor method.\"\"\"\nself.name: str = name\nself.jisc_papers: pd.DataFrame | None = jisc_papers\nself.dir: Path = Path(f\"{MNT}/{self.name}-alto2txt/metadata\")\nself.zip_files: list[Path] = sorted(\nlist(self.dir.glob(\"*.zip\")), key=lambda x: x.stat().st_size\n)\nself.zip_file_count: int = sum([1 for _ in self.dir.glob(\"*.zip\")])\nself.report_id: str = str(uuid.uuid4())\nself.empty: bool = self.zip_file_count == 0\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider","title":"DataProvider","text":"
DataProvider(collection: str)\n

Bases: Cache

The DataProvider class extends the Cache class and represents a newspaper data provider. The class has several properties and methods that allow creation of a data provider object and the manipulation of its data.

Attributes:

Name Type Description collection str

A string representing publication collection

kind str

Indication of object type, defaults to data-provider

providers_meta_data list[FixtureDict]

structured dict of metadata for known collection sources

collection_type str

related data sources and potential linkage source

index_field str

field name for querying existing records

Example
>>> from pprint import pprint\n>>> hmd = DataProvider(\"hmd\")\n>>> hmd.pk\n2\n>>> pprint(hmd.as_dict())\n{'code': 'bl-hmd',\n 'collection': 'newspapers',\n 'legacy_code': 'hmd',\n 'name': 'Heritage Made Digital',\n 'source_note': 'British Library-funded digitised newspapers provided by the '\n                'British Newspaper Archive'}\n

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, collection: str):\n\"\"\"Constructor method.\"\"\"\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.meta_data","title":"meta_data property","text":"
meta_data: FixtureDict | dict\n

Return self.providers_meta_data[self.collection] or {}.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.meta_data_fields","title":"meta_data_fields property","text":"
meta_data_fields: FixtureDict | dict\n

Return self.providers_meta_data[self.collection] or {}.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.pk","title":"pk property","text":"
pk: int | None\n

Return pk if provided via providers_meta_data, else None.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.providers_index_dict","title":"providers_index_dict property","text":"
providers_index_dict: dict[str, FixtureDict]\n

Return all self.index_field values from providers_meta_data.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

Return a dict of the data provider object.

Returns:

Type Description dict

Dictionary representation of the DataProvider object

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    Return a `dict` of the data provider object.\n    Returns:\n        Dictionary representation of the DataProvider object\n    \"\"\"\nif self.meta_data:\nreturn {\n\"name\": self.meta_data_fields[\"name\"],\n\"code\": self.meta_data_fields[\"code\"],\n\"legacy_code\": self.collection,\n\"source_note\": self.meta_data_fields[\"source_note\"],\n\"collection\": self.collection_type,\n}\nelse:\nreturn {\n\"name\": self.collection,\n\"code\": slugify(self.collection),\n\"source_note\": \"\",\n\"legacy_code\": None,\n\"collection\": self.collection_type,\n}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation","title":"Digitisation","text":"
Digitisation(root: ET.Element, collection: str = '')\n

Bases: Cache

The Digitisation class extends the Cache class and represents a newspaper digitisation. The class has several properties and methods that allow creation of a digitisation object and the manipulation of its data.

Attributes:

Name Type Description root ET.Element

An xml element that represents the root of the publication

collection str

A string that represents the collection of the publication

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'digitisation'\n

A string that represents the type of the object, set to \"digitisation\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the digitisation object.

Returns:

Type Description dict

Dictionary representation of the Digitisation object

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the digitisation\n    object.\n    Returns:\n        Dictionary representation of the Digitising object\n    \"\"\"\ndic = {\nx.tag: x.text or \"\"\nfor x in self.root.findall(\"./process/*\")\nif x.tag\nin [\n\"xml_flavour\",\n\"software\",\n\"mets_namespace\",\n\"alto_namespace\",\n]\n}\nif not dic.get(\"software\"):\nreturn {}\nreturn dic\n
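Example

A sketch with a minimal XML fragment; the tags mirror those as_dict reads under ./process/, and the values are illustrative:

```python
import xml.etree.ElementTree as ET

root = ET.fromstring(
    "<doc><process>"
    "<software>Example OCR</software>"
    "<xml_flavour>alto</xml_flavour>"
    "</process></doc>"
)
digitisation = Digitisation(root=root, collection="hmd")
print(digitisation.as_dict())
# {'software': 'Example OCR', 'xml_flavour': 'alto'}
```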
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document","title":"Document","text":"
Document(*args, **kwargs)\n

The Document class is a representation of a document that contains information about a publication, newspaper, item, digitisation, and ingest. This class holds all the relevant information about a document in a structured manner and provides properties that can be used to access different aspects of the document.

Attributes:

Name Type Description collection str | None

A string that represents the collection of the publication

root ET.Element | None

An XML element that represents the root of the publication

zip_file str | None

A path to a valid zip file

jisc_papers pd.DataFrame | None

A pandas DataFrame object that holds information about the JISC papers

meta dotdict | None

TODO

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, *args, **kwargs):\n\"\"\"Constructor method.\"\"\"\nself.collection: str | None = kwargs.get(\"collection\")\nif not self.collection or not isinstance(self.collection, str):\nraise RuntimeError(\"A valid collection must be passed\")\nself.root: ET.Element | None = kwargs.get(\"root\")\nif not self.root or not isinstance(self.root, ET.Element):\nraise RuntimeError(\"A valid XML root must be passed\")\nself.zip_file: str | None = kwargs.get(\"zip_file\")\nif self.zip_file and not isinstance(self.zip_file, str):\nraise RuntimeError(\"A valid zip file must be passed\")\nself.jisc_papers: pd.DataFrame | None = kwargs.get(\"jisc_papers\")\nif not isinstance(self.jisc_papers, pd.DataFrame):\nraise RuntimeError(\n\"A valid DataFrame containing JISC papers must be passed\"\n)\nself.meta: dotdict | None = kwargs.get(\"meta\")\nself._publication_elem = None\nself._input_sub_path = None\nself._ingest = None\nself._digitisation = None\nself._item = None\nself._issue = None\nself._newspaper = None\nself._data_provider = None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document.publication","title":"publication property","text":"
publication: ET.Element\n

This property returns an ElementTree Element representing the publication information in the XML document.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest","title":"Ingest","text":"
Ingest(root: ET.Element, collection: str = '')\n

Bases: Cache

The Ingest class extends the Cache class and represents a newspaper ingest. The class has several properties and methods that allow the creation of an ingest object and the manipulation of its data.

Attributes:

Name Type Description root ET.Element

An xml element that represents the root of the publication

collection str

A string that represents the collection of the publication

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'ingest'\n

A string that represents the type of the object, set to \"ingest\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the ingest object.

Returns:

Type Description dict

Dictionary representation of the Ingest object

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the ingest\n    object.\n    Returns:\n        Dictionary representation of the Ingest object\n    \"\"\"\nreturn {\nf\"lwm_tool_{x.tag}\": x.text or \"\"\nfor x in self.root.findall(\"./process/lwm_tool/*\")\n}\n
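Example

A sketch with a minimal XML fragment; as_dict prefixes each tag under ./process/lwm_tool/ with lwm_tool_, and the values here are illustrative:

```python
import xml.etree.ElementTree as ET

root = ET.fromstring(
    "<doc><process><lwm_tool>"
    "<name>alto2txt</name><version>1.0.0</version>"
    "</lwm_tool></process></doc>"
)
ingest = Ingest(root=root)
print(ingest.as_dict())
# {'lwm_tool_name': 'alto2txt', 'lwm_tool_version': '1.0.0'}
```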
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue","title":"Issue","text":"
Issue(\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n)\n

Bases: Cache

The Issue class extends the Cache class and represents a newspaper issue. The class has several properties and methods that allow the creation of an issue object and the manipulation of its data.

Attributes:

Name Type Description root

An xml element that represents the root of the publication

newspaper Newspaper | None

The parent newspaper

collection str

A string that represents the collection of the publication

input_sub_path str

TODO

meta dotdict

TODO

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nself.publication: ET.Element = publication\nself.newspaper: Newspaper | None = newspaper\nself.collection: str = collection\nself.input_sub_path: str = input_sub_path\nself.meta: dotdict = meta\nself._issue = None\nself._issue_date = None\npath: str = str(self.get_cache_path())\nif not self.meta.issue_paths:\nself.meta.issue_paths = [path]\nelif path not in self.meta.issue_paths:\nself.meta.issue_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_code","title":"issue_code property","text":"
issue_code: str\n

Sets up and saves the issue code for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_date","title":"issue_date property","text":"
issue_date: str\n

Sets up and saves the issue date for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'issue'\n

A string that represents the type of the object, set to \"issue\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the issue object.

Returns:

Type Description dict

Dictionary representation of the Issue object

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the issue\n    object.\n    Returns:\n        Dictionary representation of the Issue object\n    \"\"\"\nif not self._issue:\nself._issue = dict(\nissue_code=self.issue_code,\nissue_date=self.issue_date,\npublication__publication_code=self.newspaper.publication_code,\ninput_sub_path=self.input_sub_path,\n)\nreturn self._issue\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the path to the cache file for the issue object.

Returns:

Type Description Path

Path to the cache file for the issue object

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the path to the cache file for the issue object.\n    Returns:\n        Path to the cache file for the issue object\n    \"\"\"\njson_file = f\"/{self.newspaper.publication_code}/issues/{self.issue_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item","title":"Item","text":"
Item(\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n)\n

Bases: Cache

The Item class extends the Cache class and represents a newspaper item, i.e. an article. The class has several properties and methods that allow the creation of an article object and the manipulation of its data.

Attributes:

Name Type Description root ET.Element

An xml element that represents the root of the publication

issue_code str

A string that represents the issue code

digitisation dict

TODO

ingest dict

TODO

collection str

A string that represents the collection of the publication

newspaper Newspaper | None

The parent newspaper

meta dotdict

TODO

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nif not isinstance(newspaper, Newspaper):\nraise RuntimeError(\"Expected newspaper to be of type router.Newspaper\")\nself.root: ET.Element = root\nself.issue_code: str = issue_code\nself.digitisation: dict = digitisation\nself.ingest: dict = ingest\nself.collection: str = collection\nself.newspaper: Newspaper | None = newspaper\nself.meta: dotdict = meta\nself._item_elem = None\nself._item_code = None\nself._item = None\npath: str = str(self.get_cache_path())\nif not self.meta.item_paths:\nself.meta.item_paths = [path]\nelif path not in self.meta.item_paths:\nself.meta.item_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_code","title":"item_code property","text":"
item_code: str\n

Sets up and saves the item code for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_elem","title":"item_elem property","text":"
item_elem\n

Sets up and saves the issue XML item for easy access as a property.

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'item'\n

A string that represents the type of the object, set to \"item\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the item object (i.e. article).

Returns:

Type Description dict

Dictionary representation of the Item object

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the item object\n    (i.e. article).\n    Returns:\n        Dictionary representation of the Item object\n    \"\"\"\nif not self._item:\nself._item = {\nf\"{x.tag}\": x.text or \"\"\nfor x in self.item_elem.findall(\"*\")\nif x.tag\nin [\n\"title\",\n\"word_count\",\n\"ocr_quality_mean\",\n\"ocr_quality_sd\",\n\"plain_text_file\",\n\"item_type\",\n]\n}\nself._item[\"title\"] = self._item.get(\"title\", \"\")[:2097151]\nself._item = {\n\"item_code\": self.item_code,\n\"word_count\": self._item.get(\"word_count\", 0),\n\"title\": self._item.get(\"title\"),\n\"item_type\": self._item.get(\"item_type\"),\n\"input_filename\": self._item.get(\"plain_text_file\", \"\"),\n\"ocr_quality_mean\": self._item.get(\"ocr_quality_mean\", 0),\n\"ocr_quality_sd\": self._item.get(\"ocr_quality_sd\", 0),\n\"digitisation__software\": self.digitisation.id,\n\"ingest__lwm_tool_identifier\": self.ingest.id,\n\"issue__issue_identifier\": self.issue_code,\n\"data_provider__name\": self.collection,\n}\nreturn self._item\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the path to the cache file for the item (article) object.

Returns:

Type Description Path

Path to the cache file for the article object

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the path to the cache file for the item (article) object.\n    Returns:\n        Path to the cache file for the article object\n    \"\"\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ f\"/{self.newspaper.publication_code}/items.jsonl\"\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.write_to_cache","title":"write_to_cache","text":"
write_to_cache(json_indent = JSON_INDENT) -> None\n

Special cache-write method that appends to the cache file, rather than overwriting it, at the end of the process.

Returns:

Type Description None

None.

Source code in alto2txt2fixture/router.py
def write_to_cache(self, json_indent=JSON_INDENT) -> None:\n\"\"\"\n    Special cache-write function that appends rather than writes at the\n    end of the process.\n    Returns:\n        None.\n    \"\"\"\npath = self.get_cache_path()\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"a+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent) + \"\\n\")\nreturn\n
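Example

The append behaviour amounts to one JSON record per line (JSONL). A standalone sketch of the same pattern, with a hypothetical path and record:

```python
import json
from pathlib import Path

path = Path("./cache/hmd/items.jsonl")  # hypothetical cache path
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "a+") as f:
    # Append one JSON object per line, as the cache-write above does
    f.write(json.dumps({"item_code": "example-item-0001"}) + "\n")
```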
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper","title":"Newspaper","text":"
Newspaper(\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n)\n

Bases: Cache

The Newspaper class extends the Cache class and represents a newspaper.

The class has several properties and methods that allow the creation of a newspaper object and the manipulation of its data.

Attributes:

Name Type Description root

An xml element that represents the root of the publication.

collection

A string that represents the collection of the publication.

meta

A dotdict object that holds metadata about the publication.

jisc_papers

A pandas DataFrame object for JISC paper information.

Constructor method.

Source code in alto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.publication = root.find(\"./publication\")\nself.input_sub_path = root.find(\"./process/input_sub_path\").text\nself.issue_date = self.publication.find(\"./issue/date\").text\nself.collection = collection\nself.meta = meta\nself.jisc_papers = jisc_papers\nself._newspaper = None\nself._title = None\nself._publication_code = None\npath = str(self.get_cache_path())\nif not self.meta.newspaper_paths:\nself.meta.newspaper_paths = []\nelif path not in self.meta.newspaper_paths:\nself.meta.newspaper_paths.append(path)\nif not self.meta.publication_codes:\nself.meta.publication_codes = [self.publication_code]\nelif self.publication_code not in self.meta.publication_codes:\nself.meta.publication_codes.append(self.publication_code)\nself.zip_file = Path(meta.path).name\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.kind","title":"kind class-attribute instance-attribute","text":"
kind = 'newspaper'\n

A string that represents the type of the object, set to \"newspaper\".

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.number_paths","title":"number_paths property","text":"
number_paths: list\n

Returns the nested directories in which we want to save the cache file.

Returns:

Type Description list

List of the desired directories in descending order

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code","title":"publication_code property","text":"
publication_code: str\n

A property that returns the code of the publication.

Returns:

Type Description str

The code of the publication

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.title","title":"title property","text":"
title: str\n

A property that returns the title of the newspaper.

Returns:

Type Description str

The title of the newspaper

"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.as_dict","title":"as_dict","text":"
as_dict() -> dict\n

A method that returns a dictionary representation of the newspaper object.

Returns:

Type Description dict

Dictionary representation of the Newspaper object

Source code in alto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n    A method that returns a dictionary representation of the newspaper\n    object.\n    Returns:\n        Dictionary representation of the Newspaper object\n    \"\"\"\nif not self._newspaper:\nself._newspaper = dict(\n**dict(publication_code=self.publication_code, title=self.title),\n**{\nx.tag: x.text or \"\"\nfor x in self.publication.findall(\"*\")\nif x.tag in [\"location\"]\n},\n)\nreturn self._newspaper\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.get_cache_path","title":"get_cache_path","text":"
get_cache_path() -> Path\n

Returns the path to the cache file for the newspaper object.

Returns:

Type Description Path

Path to the cache file for the newspaper object

Source code in alto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n    Returns the path to the cache file for the newspaper object.\n    Returns:\n        Path to the cache file for the newspaper object\n    \"\"\"\njson_file = f\"/{self.publication_code}/{self.publication_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\" + \"/\".join(self.number_paths) + json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code_from_input_sub_path","title":"publication_code_from_input_sub_path","text":"
publication_code_from_input_sub_path() -> str | None\n

A method that returns the publication code from the input sub-path of the publication process.

Returns:

Type Description str | None

The code of the publication

Source code in alto2txt2fixture/router.py
def publication_code_from_input_sub_path(self) -> str | None:\n\"\"\"\n    A method that returns the publication code from the input sub-path of\n    the publication process.\n    Returns:\n        The code of the publication\n    \"\"\"\ng = PUBLICATION_CODE.findall(self.input_sub_path)\nif len(g) == 1:\nreturn g[0]\nreturn None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.route","title":"route","text":"
route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None\n

This function sets up the path for the alto2txt mountpoint, sets up the JISC papers, and routes the collections for processing.

Parameters:

Name Type Description Default collections list

List of collection names

required cache_home str

Directory path for the cache

required mountpoint str

Directory path for the alto2txt mountpoint

required jisc_papers_path str

Path to the JISC papers

required report_dir str

Path to the report directory

required

Returns:

Type Description None

None

Source code in alto2txt2fixture/router.py
def route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None:\n\"\"\"\n    This function is responsible for setting up the path for the alto2txt\n    mountpoint, setting up the JISC papers and routing the collections for\n    processing.\n    Args:\n        collections: List of collection names\n        cache_home: Directory path for the cache\n        mountpoint: Directory path for the alto2txt mountpoint\n        jisc_papers_path: Path to the JISC papers\n        report_dir: Path to the report directory\n    Returns:\n        None\n    \"\"\"\nglobal CACHE_HOME\nglobal MNT\nglobal REPORT_DIR\nCACHE_HOME = cache_home\nREPORT_DIR = report_dir\nMNT = Path(mountpoint) if isinstance(mountpoint, str) else mountpoint\nif not MNT.exists():\nerror(\nf\"The mountpoint provided for alto2txt does not exist. \"\nf\"Either create a local copy or blobfuse it to \"\nf\"`{MNT.absolute()}`.\"\n)\njisc_papers = setup_jisc_papers(path=jisc_papers_path)\nfor collection_name in collections:\ncollection = Collection(name=collection_name, jisc_papers=jisc_papers)\nif collection.empty:\nerror(\nf\"It looks like {collection_name} is empty in the \"\nf\"alto2txt mountpoint: `{collection.dir.absolute()}`.\"\n)\nfor archive in collection.archives:\nwith archive as _:\n[\n(\ndoc.item.write_to_cache(),\ndoc.newspaper.write_to_cache(),\ndoc.issue.write_to_cache(),\ndoc.data_provider.write_to_cache(),\ndoc.ingest.write_to_cache(),\ndoc.digitisation.write_to_cache(),\n)\nfor doc in archive.documents\n]\nreturn\n
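Example

A hypothetical invocation; every path and the collection name below are illustrative (the mountpoint must exist, or route reports an error as shown in the source above):

```python
from alto2txt2fixture.router import route

# Illustrative values only
route(
    collections=["hmd"],
    cache_home="./cache",
    mountpoint="./alto2txt-mnt",
    jisc_papers_path="./jisc_papers.csv",
    report_dir="./reports",
)
```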
"},{"location":"reference/alto2txt2fixture/settings.html","title":"settings","text":"

The settings module provides configuration for running alto2txt2fixture.

Most of these are managed via the settings variable in this module.

Note

See the command line interface parameters documentation for ways to modify settings at run time.

Attributes:

Name Type Description JSON_INDENT

Amount of indentation to include in output JSON files

DATA_PROVIDER_INDEX Final[str]

The field used to index DataProvider records

NEWSPAPER_COLLECTION_METADATA Final[list[FixtureDict]]

A list of FixtureDicts specifying specific newspaper data providers

SETUP_TITLE str

the title printed at the command line via the cli.show_setup() function

settings dotdict

a dotdict configuration for running the newspaper portions of alto2txt2fixture
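Example

Settings are read like any attribute of a dotdict; a brief sketch (COLLECTIONS appears elsewhere in this reference as a default, e.g. for check_newspaper_collection_configuration):

```python
from alto2txt2fixture.settings import settings

print(settings.COLLECTIONS)  # the configured newspaper collections
```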

"},{"location":"reference/alto2txt2fixture/types.html","title":"types","text":""},{"location":"reference/alto2txt2fixture/types.html#alto2txt2fixture.types.FixtureDict","title":"FixtureDict","text":"

Bases: TypedDict

A dict structure to ease use as a json database fixture.

Attributes:

Name Type Description pk int

an id to uniquely define and query each entry

model str

what model a given record is for

fields dict[str, Any]

a dict of record information conforming to model table
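Example

A minimal record in this shape; the values are illustrative:

```python
from alto2txt2fixture.types import FixtureDict

fixture: FixtureDict = {
    "pk": 1,
    "model": "newspapers.dataprovider",
    "fields": {"name": "Example Provider"},
}
```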

"},{"location":"reference/alto2txt2fixture/types.html#alto2txt2fixture.types.TranslatorTuple","title":"TranslatorTuple","text":"

Bases: NamedTuple

A named tuple of fields for translation.

Attributes:

Name Type Description start str

A string representing the starting field name.

finish str | list

A string or list specifying the field(s) to be translated. If it is a string, the translated field will be a direct mapping of the specified field in each item of the input list. If it is a list, the translated field will be a hyphen-separated concatenation of the specified fields in each item of the input list.

lst list[dict]

A list of dictionaries representing the items to be translated. Each dictionary should contain the necessary fields for translation, with the field names specified in the start parameter.
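Example

A sketch mirroring how parse constructs a translator for issues; the fixture data are illustrative:

```python
from alto2txt2fixture.types import TranslatorTuple

# One newspaper fixture record providing the lookup values
newspaper_fixtures = [
    {"pk": 1, "fields": {"publication_code": "0003040"}},
]
t = TranslatorTuple(
    start="publication__publication_code",
    finish="publication_code",
    lst=newspaper_fixtures,
)
```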

"},{"location":"reference/alto2txt2fixture/types.html#alto2txt2fixture.types.dotdict","title":"dotdict","text":"

Bases: dict

dot.notation access to dictionary attributes
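Example

A brief sketch of attribute-style access:

```python
from alto2txt2fixture.types import dotdict

d = dotdict({"path": "cache/hmd", "size": "1.2 MB"})
print(d.path)  # 'cache/hmd', equivalent to d["path"]
```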

"},{"location":"reference/alto2txt2fixture/utils.html","title":"utils","text":""},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.check_newspaper_collection_configuration","title":"check_newspaper_collection_configuration","text":"
check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[\nFixtureDict\n] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]\n

Check the names in collections match the names in newspaper_collections.

Parameters:

Name Type Description Default collections Iterable[str]

Names of newspaper collections, defaults to settings.COLLECTIONS

settings.COLLECTIONS newspaper_collections Iterable[FixtureDict]

Newspaper collections in a list of FixtureDict format. Defaults to settings.FIXTURE_TABLE['dataprovider']

NEWSPAPER_COLLECTION_METADATA data_provider_index str

dict fields key used to check matching collections names

DATA_PROVIDER_INDEX

Returns:

Type Description set[str]

A set of collections without a matching newspaper_collections record.

Example
>>> check_newspaper_collection_configuration()\nset()\n
Source code in alto2txt2fixture/utils.py
def check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]:\n\"\"\"Check the names in `collections` match the names in `newspaper_collections`.\n    Arguments:\n        collections:\n            Names of newspaper collections, defaults to ``settings.COLLECTIONS``\n        newspaper_collections:\n            Newspaper collections in a list of `FixtureDict` format. Defaults\n                to ``settings.FIXTURE_TABLE['dataprovider]``\n        data_provider_index:\n            `dict` `fields` `key` used to check matchiching `collections` name\n    Returns:\n        A set of ``collections`` without a matching `newspaper_collections` record.\n    Example:\n        ```pycon\n        >>> check_newspaper_collection_configuration()\n        set()\n        ```\n    \"\"\"\nnewspaper_collection_names: tuple[str, ...] = tuple(\ndict_from_list_fixture_fields(\nnewspaper_collections, field_name=data_provider_index\n).keys()\n)\ncollection_diff: set[str] = set(collections) - set(newspaper_collection_names)\nif collection_diff:\nwarning(\nf\"{len(collection_diff)} `collections` \"\nf\"not in `newspaper_collections`: {collection_diff}\"\n)\nreturn collection_diff\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.clear_cache","title":"clear_cache","text":"
clear_cache(dir: str | Path) -> None\n

Clears the cache directory by removing all .json files in it.

Parameters:

Name Type Description Default dir str | Path

The path of the directory to be cleared.

required Source code in alto2txt2fixture/utils.py
def clear_cache(dir: str | Path) -> None:\n\"\"\"\n    Clears the cache directory by removing all `.json` files in it.\n    Args:\n        dir: The path of the directory to be cleared.\n    \"\"\"\ndir = get_path_from(dir)\ny = input(\nf\"Do you want to erase the cache path now that the \"\nf\"files have been generated ({dir.absolute()})? [y/N]\"\n)\nif y.lower() == \"y\":\ninfo(\"Clearing up the cache directory\")\nfor x in dir.glob(\"*.json\"):\nx.unlink()\n
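Example

A minimal usage sketch; the directory path is illustrative:

```python
from alto2txt2fixture.utils import clear_cache

# Prompts for confirmation before removing any .json files under ./cache
clear_cache("./cache")
```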
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.create_lookup","title":"create_lookup","text":"
create_lookup(lst: list = [], on: list = []) -> dict\n

Create a lookup dictionary from a list of dictionaries.

Parameters:

Name Type Description Default lst list

A list of dictionaries that should be used to generate the lookup.

[] on list

A list of keys from the dictionaries in the list that should be used as the keys in the lookup.

[]

Returns:

Type Description dict

The generated lookup dictionary.

Source code in alto2txt2fixture/utils.py
def create_lookup(lst: list = [], on: list = []) -> dict:\n\"\"\"\n    Create a lookup dictionary from a list of dictionaries.\n    Args:\n        lst: A list of dictionaries that should be used to generate the lookup.\n        on: A list of keys from the dictionaries in the list that should be used as the keys in the lookup.\n    Returns:\n        The generated lookup dictionary.\n    \"\"\"\nreturn {get_key(x, on): x[\"pk\"] for x in lst}\n
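Example

A sketch assuming get_key derives each lookup key from the on fields; the records are illustrative:

```python
from alto2txt2fixture.utils import create_lookup

records = [
    {"pk": 1, "publication_code": "0003040"},
    {"pk": 2, "publication_code": "0002090"},
]
print(create_lookup(records, on=["publication_code"]))
# Expected shape: {'0003040': 1, '0002090': 2}
# (exact key format depends on get_key; with a single `on` field
# it is typically that field's value)
```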
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.dict_from_list_fixture_fields","title":"dict_from_list_fixture_fields","text":"
dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]\n

Create a dict from fixture_list with field_name as key.

Parameters:

Name Type Description Default fixture_list Iterable[FixtureDict]

list of FixtureDict with field_name key fields.

NEWSPAPER_COLLECTION_METADATA field_name str

key for values within fixture_list fields.

DATA_PROVIDER_INDEX

Returns:

Type Description dict[str, FixtureDict]

A dict where extracted field_name is key for related FixtureDict values.

Example
>>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n>>> fixture_dict['hmd']['pk']\n2\n>>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture_dict['hmd']['fields']['code']\n'bl-hmd'\n
Source code in alto2txt2fixture/utils.py
def dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]:\n\"\"\"Create a `dict` from ``fixture_list`` with ``attr_name`` as `key`.\n    Args:\n        fixture_list: `list` of `FixtureDict` with ``attr_name`` key `fields`.\n        field_name: key for values within ``fixture_list`` `fields`.\n    Returns:\n        A `dict` where extracted `field_name` is key for related `FixtureDict` values.\n    Example:\n        ```pycon\n        >>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n        >>> fixture_dict['hmd']['pk']\n        2\n        >>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        >>> fixture_dict['hmd']['fields']['code']\n        'bl-hmd'\n        ```\n    \"\"\"\nreturn {record[\"fields\"][field_name]: record for record in fixture_list}\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.export_fixtures","title":"export_fixtures","text":"
export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None\n

Export fixture_tables in formats.

Note

This is still in an experimental phase of development and not recommended for production.

Parameters:

Name Type Description Default fixture_tables dict[str, Sequence[FixtureDict]]

dict of table name (e.g. dataprovider) and FixtureDict

required path str | PathLike

Path to save exports in

settings.FIXTURE_TABLES_OUTPUT prefix str

str to prefix export filenames with

'test-' formats Sequence[EXPORT_FORMATS]

list of EXPORT_FORMATS to export

settings.FIXTURE_TABLES_FORMATS Example
>>> test_fixture_tables: dict[str, FixtureDict] = {\n...     'test0': NEWSPAPER_COLLECTION_METADATA,\n...     'test1': NEWSPAPER_COLLECTION_METADATA}\n>>> export_fixtures(test_fixture_tables, path='tests/')\n...     # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...Warning: Saving test0...\n...Warning: Saving test1...\n>>> from pandas import read_csv\n>>> fixture0_json = load_json('tests/test-test0-1.json')\n>>> fixture0_df = read_csv('tests/test-test0-1.csv')\n>>> fixture1_json = load_json('tests/test-test1-1.json')\n>>> fixture1_df = read_csv('tests/test-test1-1.csv')\n>>> fixture0_json == fixture1_json\nTrue\n>>> all(fixture0_df == fixture1_df)\nTrue\n>>> all(field in fixture0_json[0]['fields']\n...     for field in ['created_at', 'updated_at'])\nTrue\n>>> fixture0_json[1]['pk']\n2\n>>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n[2, 'hmd']\n
Source code in alto2txt2fixture/utils.py
def export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None:\n\"\"\"Export ``fixture_tables`` in ``formats``.\n    Note:\n        This is still in experimental phase of development and not recommended\n        for production.\n    Args:\n        fixture_tables: `dict` of table name (eg: `dataprovider`) and `FixtureDict`\n        path: Path to save exports in\n        prefix: `str` to prefix export filenames with\n        formats: list of `EXPORT_FORMATS` to export\n    Example:\n        ```pycon\n        >>> test_fixture_tables: dict[str, FixtureDict] = {\n        ...     'test0': NEWSPAPER_COLLECTION_METADATA,\n        ...     'test1': NEWSPAPER_COLLECTION_METADATA}\n        >>> export_fixtures(test_fixture_tables, path='tests/')\n        ...     # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n        <BLANKLINE>\n        ...Warning: Saving test0...\n        ...Warning: Saving test1...\n        >>> from pandas import read_csv\n        >>> fixture0_json = load_json('tests/test-test0-1.json')\n        >>> fixture0_df = read_csv('tests/test-test0-1.csv')\n        >>> fixture1_json = load_json('tests/test-test1-1.json')\n        >>> fixture1_df = read_csv('tests/test-test1-1.csv')\n        >>> fixture0_json == fixture1_json\n        True\n        >>> all(fixture0_df == fixture1_df)\n        True\n        >>> all(field in fixture0_json[0]['fields']\n        ...     for field in ['created_at', 'updated_at'])\n        True\n        >>> fixture0_json[1]['pk']\n        2\n        >>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        >>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n        [2, 'hmd']\n        ```\n    \"\"\"\nfor table_name, records in fixture_tables.items():\nwarning(\nf\"Saving {table_name} fixture in {formats} formats \"\nf\"to {path} *without* checks...\"\n)\nif \"json\" in formats:\nsave_fixture(\nrecords,\nprefix=f\"{prefix}{table_name}\",\noutput_path=path,\nadd_created=add_created,\n)\nif \"csv\" in formats:\nfixtures_dict2csv(records, prefix=f\"{prefix}{table_name}\", output_path=path)\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.filter_json_fields","title":"filter_json_fields","text":"
filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs: Hashable\n) -> dict | list\n

Return keys and values from json_results where any fields equal value.

Parameters:

Name Type Description Default file_path PathLike | None

The file path to load based on extension and filter

None fields Sequence[str]

Which fields to check equal value

[] value Hashable

Value to filter by

''

Returns:

Type Description dict | list

A dict of records indexed by pk which fit filter criteria

Raises:

Type Description ValueError

file_path must have a .json suffix

Example
>>> from pprint import pprint\n>>> entry_fixture: dict = [\n...     {\"pk\": 4889, \"model\": \"mitchells.entry\",\n...      \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n...                 \"price_raw\": ['2d'],\n...                 \"year\": 1920,\n...                 \"date_established_raw\": \"1857\",\n...                 \"persons\": [], \"newspaper\": \"\"}},\n...      {\"pk\": 9207, \"model\": \"mitchells.entry\",\n...       \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n...                  \"price_raw\": ['2 \u00bd d', '3 \u00bd d'],\n...                  \"year\": 1856,\n...                  \"date_established_raw\": \"1848\",\n...                  \"persons\": ['Stephen Soulby'],\n...                  \"newspaper\": \"\",}},\n...     {\"pk\": 15, \"model\": \"mitchells.entry\",\n...      \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n...                 \"price_raw\": ['2d', '3d'],\n...                 \"year\": 1857,\n...                 \"date_established_raw\": \"November , 1842\",\n...                 \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n...                 \"newspaper\": 1187}}\n...     ]\n>>> pprint(filter_json_fields(entry_fixture,\n...                           fields=(\"newspaper\", \"persons\"),\n...                           value=\"\"))\n[{'fields': {'date_established_raw': '1857',\n             'newspaper': '',\n             'persons': [],\n             'price_raw': ['2d'],\n             'title': 'BIRMINGHAM POST .',\n             'year': 1920},\n  'model': 'mitchells.entry',\n  'pk': 4889},\n {'fields': {'date_established_raw': '1848',\n             'newspaper': '',\n             'persons': ['Stephen Soulby'],\n             'price_raw': ['2 \u00bd d', '3 \u00bd d'],\n             'title': 'ULVERSTONE ADVERTISER .',\n             'year': 1856},\n  'model': 'mitchells.entry',\n  'pk': 9207}]\n
Source code in alto2txt2fixture/utils.py
def filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs,\n) -> dict | list:\n\"\"\"Return `keys` and `values` from `json_dict` where any `fields` equal `value`.\n    Args:\n        file_path: The file `path` to load based on extension and filter\n        fields: Which fields to check equal `value`\n        value: Value to filter by\n    Returns:\n        A `dict` of records indexed by `pk` which fit filter criteria\n    Raises:\n        ValueError: ``file_path`` must have a `.json` `suffix`\n    Example:\n        ```pycon\n        >>> from pprint import pprint\n        >>> entry_fixture: dict = [\n        ...     {\"pk\": 4889, \"model\": \"mitchells.entry\",\n        ...      \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n        ...                 \"price_raw\": ['2d'],\n        ...                 \"year\": 1920,\n        ...                 \"date_established_raw\": \"1857\",\n        ...                 \"persons\": [], \"newspaper\": \"\"}},\n        ...      {\"pk\": 9207, \"model\": \"mitchells.entry\",\n        ...       \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n        ...                  \"price_raw\": ['2 \\u00bd d', '3 \\u00bd d'],\n        ...                  \"year\": 1856,\n        ...                  \"date_established_raw\": \"1848\",\n        ...                  \"persons\": ['Stephen Soulby'],\n        ...                  \"newspaper\": \"\",}},\n        ...     {\"pk\": 15, \"model\": \"mitchells.entry\",\n        ...      \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n        ...                 \"price_raw\": ['2d', '3d'],\n        ...                 \"year\": 1857,\n        ...                 \"date_established_raw\": \"November , 1842\",\n        ...                 \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n        ...                 \"newspaper\": 1187}}\n        ...     ]\n        >>> pprint(filter_json_fields(entry_fixture,\n        ...                           fields=(\"newspaper\", \"persons\"),\n        ...                           value=\"\"))\n        [{'fields': {'date_established_raw': '1857',\n                     'newspaper': '',\n                     'persons': [],\n                     'price_raw': ['2d'],\n                     'title': 'BIRMINGHAM POST .',\n                     'year': 1920},\n          'model': 'mitchells.entry',\n          'pk': 4889},\n         {'fields': {'date_established_raw': '1848',\n                     'newspaper': '',\n                     'persons': ['Stephen Soulby'],\n                     'price_raw': ['2 \\u00bd d', '3 \\u00bd d'],\n                     'title': 'ULVERSTONE ADVERTISER .',\n                     'year': 1856},\n          'model': 'mitchells.entry',\n          'pk': 9207}]\n        ```\n    \"\"\"\nif not json_results:\nassert file_path\ntry:\nassert Path(file_path).suffix == \".json\"\nexcept AssertionError:\nraise ValueError(f\"{file_path} must be `json` format.\")\njson_results = load_json(Path(file_path), **kwargs)\nassert json_results\nif isinstance(json_results, dict):\nreturn {\nk: v\nfor k, v in json_results.items()\nif any(v[\"fields\"][field] == value for field in fields)\n}\nelse:\nreturn [\nv\nfor v in json_results\nif any(v[\"fields\"][field] == value for field in fields)\n]\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_fields","title":"fixture_fields","text":"
fixture_fields(\nfixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]\n

Generate a tuple of FixtureDict field names.

Note

This is not in the utils module to avoid a circular import.

Parameters:

Name Type Description Default fixture_dict FixtureDict

A FixtureDict instance to extract names from fields

required include_pk bool

Whether to include the pk (primary key) column

True Example
>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n('name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> hmd_dict: dict[str, Any] = fixture_fields(\n...     NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n>>> hmd_dict['code']\n'bl-hmd'\n>>> hmd_dict['pk']\n2\n>>> hmd_dict = fixture_fields(\n...     NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n>>> 'pk' in hmd_dict\nFalse\n
Source code in alto2txt2fixture/utils.py
def fixture_fields(\n    fixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]:\n    \"\"\"Generate a tuple of `FixtureDict` `field` names.\n    Note:\n        This is not in the `utils` module to avoid a circular import.\n    Args:\n        fixture_dict: A `FixtureDict` instance to extract names from `fields`\n        include_pk: Whether to include the `pk` (primary key) column\n    Example:\n        ```pycon\n        >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n        ('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n        >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n        ('name', 'code', 'legacy_code', 'collection', 'source_note')\n        >>> hmd_dict: dict[str, Any] = fixture_fields(\n        ...     NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n        >>> hmd_dict['code']\n        'bl-hmd'\n        >>> hmd_dict['pk']\n        2\n        >>> hmd_dict = fixture_fields(\n        ...     NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n        >>> 'pk' in hmd_dict\n        False\n        ```\n    \"\"\"\n    fields: OrderedDict[str, Any] = OrderedDict(fixture_dict[\"fields\"])\n    if include_pk:\n        fields[\"pk\"] = fixture_dict[\"pk\"]\n        fields.move_to_end(\"pk\", last=False)\n    if as_dict:\n        return fields\n    else:\n        return tuple(fields.keys())\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_or_default_dict","title":"fixture_or_default_dict","text":"
fixture_or_default_dict(\nkey: str,\nfixture_dict: dict[str, FixtureDict],\ndefault_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict\n

Return a FixtureDict from fixture_list via key index, else default_dict.

Parameters:

Name Type Description Default key str

a str to query fixture_dict with

required fixture_dict dict[str, FixtureDict]

a dict of str to FixtureDict, often generated by dict_from_list_fixture_fields

required default_dict FixtureDict | dict

a dict to return if key is not in fixture_dict index

{} Example
>>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n...     NEWSPAPER_COLLECTION_METADATA)\n>>> hmd_dict: FixtureDict = fixture_or_default_dict(\n...     'hmd', newspaper_dict\n... )\n>>> fixture_or_default_dict(\n...     'hmd', NEWSPAPER_COLLECTION_METADATA\n... )\n{}\n>>> fixture_or_default_dict(\n...     'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n... )\n{'a': 'default'}\n
Source code in alto2txt2fixture/utils.py
def fixture_or_default_dict(\n    key: str,\n    fixture_dict: dict[str, FixtureDict],\n    default_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict:\n    \"\"\"Return a `FixtureDict` from ``fixture_list`` via ``key`` index, else ``default_dict``.\n    Args:\n        key:\n            a `str` to query ``fixture_dict`` with\n        fixture_dict: a `dict` of `str` to `FixtureDict`, often generated by\n             ``dict_from_list_fixture_fields``\n        default_dict: a `dict` to return if ``key`` is not in\n            ``fixture_dict`` index\n    Example:\n        ```pycon\n        >>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n        ...     NEWSPAPER_COLLECTION_METADATA)\n        >>> hmd_dict: FixtureDict = fixture_or_default_dict(\n        ...     'hmd', newspaper_dict\n        ... )\n        >>> fixture_or_default_dict(\n        ...     'hmd', NEWSPAPER_COLLECTION_METADATA\n        ... )\n        {}\n        >>> fixture_or_default_dict(\n        ...     'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n        ... )\n        {'a': 'default'}\n        ```\n    \"\"\"\n    if key in fixture_dict:\n        return fixture_dict[key]\n    else:\n        return default_dict\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixtures_dict2csv","title":"fixtures_dict2csv","text":"
fixtures_dict2csv(\nfixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nindex: bool = False,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None\n

Saves fixtures generated by a generator to separate CSV files.

This function takes an Iterable or Generator of fixtures and saves them to separate CSV files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file parameter.

Parameters:

Name Type Description Default fixtures Iterable[FixtureDict] | Generator[FixtureDict, None, None]

An Iterable or Generator of the fixtures to be saved.

required prefix str

A string prefix to be added to the file names of the saved fixtures.

'' output_path PathLike | str

Path to folder fixtures are saved to

settings.OUTPUT max_elements_per_file int

Maximum JSON records saved in each file

settings.MAX_ELEMENTS_PER_FILE

Returns:

Type Description None

This function saves fixtures to files and does not return a value.

Example
>>> from pandas import read_csv\n>>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n...                   prefix='test', output_path='tests/')\n>>> imported_fixture = read_csv('tests/test-1.csv')\n>>> imported_fixture.iloc[1]['pk']\n2\n>>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n'hmd'\n
Source code in alto2txt2fixture/utils.py
def fixtures_dict2csv(\n    fixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\n    prefix: str = \"\",\n    output_path: PathLike | str = settings.OUTPUT,\n    index: bool = False,\n    max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None:\n    \"\"\"Saves fixtures generated by a generator to separate `CSV` files.\n    This function takes an `Iterable` or `Generator` of fixtures and saves them to\n    separate `CSV` files. The fixtures are saved in batches, where each batch\n    is determined by the ``max_elements_per_file`` parameter.\n    Args:\n        fixtures: An `Iterable` or `Generator` of the fixtures to be saved.\n        prefix: A string prefix to be added to the file names of the\n            saved fixtures.\n        output_path: Path to folder fixtures are saved to\n        max_elements_per_file: Maximum `JSON` records saved in each file\n    Returns:\n        This function saves fixtures to files and does not return a value.\n    Example:\n        ```pycon\n        >>> from pandas import read_csv\n        >>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n        ...                   prefix='test', output_path='tests/')\n        >>> imported_fixture = read_csv('tests/test-1.csv')\n        >>> imported_fixture.iloc[1]['pk']\n        2\n        >>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n        'hmd'\n        ```\n    \"\"\"\n    internal_counter: int = 1\n    counter: int = 1\n    lst: list = []\n    Path(output_path).mkdir(parents=True, exist_ok=True)\n    for item in fixtures:\n        lst.append(fixture_fields(item, as_dict=True))\n        internal_counter += 1\n        if internal_counter > max_elements_per_file:\n            df: DataFrame = DataFrame.from_records(lst)\n            df.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\n            # Save up some memory\n            del lst\n            gc.collect()\n            # Re-instantiate\n            lst: list = []\n            internal_counter = 1\n            counter += 1\n        else:\n            # Rewrite the current (partial) batch on each iteration\n            df: DataFrame = DataFrame.from_records(lst)\n            df.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\n    return\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.gen_fixture_tables","title":"gen_fixture_tables","text":"
gen_fixture_tables(\nfixture_tables: dict[str, list[FixtureDict]] = {},\ninclude_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]\n

Generator of rich.Table instances from FixtureDict configuration tables.

Parameters:

Name Type Description Default fixture_tables dict[str, list[FixtureDict]]

dict where key is for Table title and value is a FixtureDict

{} include_fixture_pk_column bool

whether to include the pk field from FixtureDict

True Example
>>> table_name: str = \"data_provider\"\n>>> tables = tuple(\n...     gen_fixture_tables(\n...         {table_name: NEWSPAPER_COLLECTION_METADATA}\n...     ))\n>>> len(tables)\n1\n>>> assert tables[0].title == table_name\n>>> [column.header for column in tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n
Source code in alto2txt2fixture/utils.py
def gen_fixture_tables(\n    fixture_tables: dict[str, list[FixtureDict]] = {},\n    include_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]:\n    \"\"\"Generator of `rich.Table` instances from `FixtureDict` configuration tables.\n    Args:\n        fixture_tables: `dict` where `key` is for `Table` title and `value` is a `FixtureDict`\n        include_fixture_pk_column: whether to include the `pk` field from `FixtureDict`\n    Example:\n        ```pycon\n        >>> table_name: str = \"data_provider\"\n        >>> tables = tuple(\n        ...     gen_fixture_tables(\n        ...         {table_name: NEWSPAPER_COLLECTION_METADATA}\n        ...     ))\n        >>> len(tables)\n        1\n        >>> assert tables[0].title == table_name\n        >>> [column.header for column in tables[0].columns]\n        ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n        ```\n    \"\"\"\n    for name, fixture_records in fixture_tables.items():\n        fixture_table: Table = Table(title=name)\n        for i, fixture_dict in enumerate(fixture_records):\n            if i == 0:\n                [\n                    fixture_table.add_column(name)\n                    for name in fixture_fields(fixture_dict, include_fixture_pk_column)\n                ]\n            row_values: tuple[str, ...] = tuple(\n                str(x) for x in (fixture_dict[\"pk\"], *fixture_dict[\"fields\"].values())\n            )\n            fixture_table.add_row(*row_values)\n        yield fixture_table\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_chunked_zipfiles","title":"get_chunked_zipfiles","text":"
get_chunked_zipfiles(path: Path) -> list\n

This function takes in a Path object path and returns a list of lists of zipfiles, sorted by size and split into chunks once their number exceeds the threshold defined in the settings object (see settings.CHUNK_THRESHOLD).

Note: the function will also skip zip files above a certain file size, which can be specified in the settings object (see settings.SKIP_FILE_SIZE).

Parameters:

Name Type Description Default path Path

The input path where the zipfiles are located

required

Returns:

Type Description list

A list of lists of zipfiles, where each inner list represents a chunk of zipfiles.

Source code in alto2txt2fixture/utils.py
def get_chunked_zipfiles(path: Path) -> list:\n    \"\"\"This function takes in a `Path` object `path` and returns a list of lists\n    of `zipfiles` sorted and chunked according to certain conditions defined\n    in the `settings` object (see `settings.CHUNK_THRESHOLD`).\n    Note: the function will also skip zip files above a certain file size, which\n    can be specified in the `settings` object (see `settings.SKIP_FILE_SIZE`).\n    Args:\n        path: The input path where the zipfiles are located\n    Returns:\n        A list of lists of `zipfiles`, each inner list represents a chunk of\n            zipfiles.\n    \"\"\"\n    zipfiles = sorted(\n        path.glob(\"*.zip\"),\n        key=lambda x: x.stat().st_size,\n        reverse=settings.START_WITH_LARGEST,\n    )\n    zipfiles = [x for x in zipfiles if x.stat().st_size <= settings.SKIP_FILE_SIZE]\n    if len(zipfiles) > settings.CHUNK_THRESHOLD:\n        chunks = array_split(zipfiles, len(zipfiles) / settings.CHUNK_THRESHOLD)\n    else:\n        chunks = [zipfiles]\n    return chunks\n
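A minimal usage sketch (the newspapers/ directory below is hypothetical, and the number of chunks depends on settings.CHUNK_THRESHOLD and settings.SKIP_FILE_SIZE):
>>> from pathlib import Path\n>>> # 'newspapers/' is a hypothetical directory of alto2txt zip files\n>>> chunks = get_chunked_zipfiles(Path('newspapers/'))  # doctest: +SKIP\n>>> len(chunks) >= 1  # doctest: +SKIP\nTrue\n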
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_key","title":"get_key","text":"
get_key(x: dict = dict(), on: list = []) -> str\n

Get a string key from a dictionary using values from specified keys.

Parameters:

Name Type Description Default x dict

A dictionary from which the key is generated.

dict() on list

A list of keys from the dictionary that should be used to generate the key.

[]

Returns:

Type Description str

The generated string key.

Source code in alto2txt2fixture/utils.py
def get_key(x: dict = dict(), on: list = []) -> str:\n    \"\"\"\n    Get a string key from a dictionary using values from specified keys.\n    Args:\n        x: A dictionary from which the key is generated.\n        on: A list of keys from the dictionary that should be used to\n            generate the key.\n    Returns:\n        The generated string key.\n    \"\"\"\n    return f\"{'-'.join([str(x['fields'][y]) for y in on])}\"\n
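A short, self-contained example; the record below is made up, shaped like a fixture entry:
>>> # a made-up record shaped like a fixture entry\n>>> get_key({'fields': {'publication_code': '0002645', 'year': 1855}},\n...         on=['publication_code', 'year'])\n'0002645-1855'\n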
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_lockfile","title":"get_lockfile","text":"
get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path\n

Provides the path to any given lockfile, which controls whether any existing files should be overwritten or not.

Parameters:

Name Type Description Default collection str

Collection folder name

required kind NewspaperElements

Either newspaper or issue or item

required dic dict

A dictionary with required information for either kind passed

required

Returns:

Type Description Path

Path to the resulting lockfile

Source code in alto2txt2fixture/utils.py
def get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path:\n    \"\"\"\n    Provides the path to any given lockfile, which controls whether any\n    existing files should be overwritten or not.\n    Args:\n        collection: Collection folder name\n        kind: Either `newspaper` or `issue` or `item`\n        dic: A dictionary with required information for either `kind` passed\n    Returns:\n        Path to the resulting lockfile\n    \"\"\"\n    p: Path\n    base = Path(f\"cache-lockfiles/{collection}\")\n    if kind == \"newspaper\":\n        p = base / f\"newspapers/{dic['publication_code']}\"\n    elif kind == \"issue\":\n        p = base / f\"issues/{dic['publication__publication_code']}/{dic['issue_code']}\"\n    elif kind == \"item\":\n        try:\n            if dic.get(\"issue_code\"):\n                p = base / f\"items/{dic['issue_code']}/{dic['item_code']}\"\n            elif dic.get(\"issue__issue_identifier\"):\n                p = base / f\"items/{dic['issue__issue_identifier']}/{dic['item_code']}\"\n        except KeyError:\n            error(\"An unknown error occurred (in get_lockfile)\")\n    else:\n        p = base / \"lockfile\"\n    p.parent.mkdir(parents=True, exist_ok=True) if settings.WRITE_LOCKFILES else None\n    return p\n
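A usage sketch (the publication code is made up, the PosixPath repr assumes a POSIX system, and the parent directory is only created when settings.WRITE_LOCKFILES is set):
>>> # '0002645' is a made-up publication code\n>>> get_lockfile('hmd', 'newspaper', {'publication_code': '0002645'})  # doctest: +SKIP\nPosixPath('cache-lockfiles/hmd/newspapers/0002645')\n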
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_now","title":"get_now","text":"
get_now(as_str: bool = False) -> datetime.datetime | str\n

Return datetime.now() as either a string or datetime object.

Parameters:

Name Type Description Default as_str bool

Whether to return now time as a str or not, default: False

False

Returns:

Type Description datetime.datetime | str

datetime.now() in pytz.UTC time zone as a string if as_str, else as a datetime.datetime object.

Source code in alto2txt2fixture/utils.py
def get_now(as_str: bool = False) -> datetime.datetime | str:\n    \"\"\"\n    Return `datetime.now()` as either a string or `datetime` object.\n    Args:\n        as_str: Whether to return `now` `time` as a `str` or not, default: `False`\n    Returns:\n        `datetime.now()` in `pytz.UTC` time zone as a string if `as_str`, else\n            as a `datetime.datetime` object.\n    \"\"\"\n    now = datetime.datetime.now(tz=pytz.UTC)\n    if as_str:\n        return str(now)\n    else:\n        assert isinstance(now, datetime.datetime)\n        return now\n
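A quick check of both return types:
>>> import datetime\n>>> isinstance(get_now(), datetime.datetime)\nTrue\n>>> isinstance(get_now(as_str=True), str)\nTrue\n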
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_path_from","title":"get_path_from","text":"
get_path_from(p: str | Path) -> Path\n

Converts an input value into a Path object if it's not already one.

Parameters:

Name Type Description Default p str | Path

The input value, which can be a string or a Path object.

required

Returns:

Type Description Path

The input value as a Path object.

Source code in alto2txt2fixture/utils.py
def get_path_from(p: str | Path) -> Path:\n    \"\"\"\n    Converts an input value into a Path object if it's not already one.\n    Args:\n        p: The input value, which can be a string or a Path object.\n    Returns:\n        The input value as a Path object.\n    \"\"\"\n    if isinstance(p, str):\n        p = Path(p)\n    if not isinstance(p, Path):\n        raise RuntimeError(f\"Unable to handle type: {type(p)}\")\n    return p\n
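A short example (the path is made up, and the PosixPath repr assumes a POSIX system):
>>> get_path_from('cache/newspapers.json')  # repr assumes a POSIX system\nPosixPath('cache/newspapers.json')\n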
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_size_from_path","title":"get_size_from_path","text":"
get_size_from_path(p: str | Path, raw: bool = False) -> str | float\n

Returns a human-readable size string (or a raw byte count) for any given file.

Parameters:

Name Type Description Default p str | Path

Path to read the size from

required raw bool

Whether to return the file size as total number of bytes or a human-readable MB/GB amount

False

Returns:

Type Description str | float

A str with an MB or GB suffix for the size if not raw, otherwise the raw size in bytes.

Source code in alto2txt2fixture/utils.py
def get_size_from_path(p: str | Path, raw: bool = False) -> str | float:\n    \"\"\"\n    Returns a human-readable size string (or a raw byte count) for any given file.\n    Args:\n        p: Path to read the size from\n        raw: Whether to return the file size as total number of bytes or\n            a human-readable MB/GB amount\n    Returns:\n        A `str` with an `MB` or `GB` suffix for the size if not `raw`,\n            otherwise the raw size in bytes.\n    \"\"\"\n    p = get_path_from(p)\n    bytes = p.stat().st_size\n    if raw:\n        return bytes\n    rel_size: float | int | str = round(bytes / 1000 / 1000 / 1000, 1)\n    assert not isinstance(rel_size, str)\n    if rel_size < 0.5:\n        rel_size = round(bytes / 1000 / 1000, 1)\n        rel_size = f\"{rel_size}MB\"\n    else:\n        rel_size = f\"{rel_size}GB\"\n    return rel_size\n
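A runnable sketch that writes a 2MB scratch file (size-example.bin is a throwaway name):
>>> from pathlib import Path\n>>> p = Path('size-example.bin')  # throwaway scratch file\n>>> _ = p.write_text('x' * 2_000_000)\n>>> get_size_from_path(p, raw=True)\n2000000\n>>> get_size_from_path(p)\n'2.0MB'\n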
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.glob_filter","title":"glob_filter","text":"
glob_filter(p: str) -> list\n

Return an ordered glob of files, filtering out unwanted hidden files such as macOS .DS_Store.

Parameters:

Name Type Description Default p str

Path to a directory to filter

required

Returns:

Type Description list

Sorted list of files contained in the provided path, excluding those whose names start with a .

Source code in alto2txt2fixture/utils.py
def glob_filter(p: str) -> list:\n    \"\"\"\n    Return an ordered glob of files, filtering out unwanted hidden files such as macOS `.DS_Store`.\n    Args:\n        p: Path to a directory to filter\n    Returns:\n        Sorted list of files contained in the provided path without the ones\n        whose names start with a `.`\n    \"\"\"\n    return sorted([x for x in get_path_from(p).glob(\"*\") if not x.name.startswith(\".\")])\n
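A usage sketch (output depends on the directory's contents):
>>> # results depend on the directory's contents\n>>> files = glob_filter('.')  # doctest: +SKIP\n>>> any(f.name.startswith('.') for f in files)  # doctest: +SKIP\nFalse\n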
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.list_json_files","title":"list_json_files","text":"
list_json_files(\np: str | Path,\ndrill: bool = False,\nexclude_names: list = [],\ninclude_names: list = [],\n) -> Generator[Path, None, None] | list[Path]\n

List json files under the path specified in p.

Parameters:

Name Type Description Default p str | Path

The path to search for json files

required drill bool

A flag indicating whether to drill down the subdirectories or not. Default is False

False exclude_names list

A list of file names to exclude from the search result. Default is an empty list

[] include_names list

A list of file names to include in search result. Ignored if the exclude_names argument is provided. Default is an empty list

[]

Returns:

Type Description Generator[Path, None, None] | list[Path]

A list of Path objects pointing to the found json files

Source code in alto2txt2fixture/utils.py
def list_json_files(\n    p: str | Path,\n    drill: bool = False,\n    exclude_names: list = [],\n    include_names: list = [],\n) -> Generator[Path, None, None] | list[Path]:\n    \"\"\"\n    List `json` files under the path specified in ``p``.\n    Args:\n        p: The path to search for `json` files\n        drill: A flag indicating whether to drill down the subdirectories\n            or not. Default is ``False``\n        exclude_names: A list of file names to exclude from the search\n            result. Default is an empty list\n        include_names: A list of file names to include in search result.\n            Ignored if ``exclude_names`` is provided.\n            Default is an empty list\n    Returns:\n        A list of `Path` objects pointing to the found `json` files\n    \"\"\"\n    q: str = \"**/*.json\" if drill else \"*.json\"\n    files = get_path_from(p).glob(q)\n    if exclude_names:\n        files = list({x for x in files if x.name not in exclude_names})\n    elif include_names:\n        files = list({x for x in files if x.name in include_names})\n    return sorted(files)\n
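A runnable sketch using a scratch folder (json-list-example is a throwaway name; the PosixPath repr assumes a POSIX system):
>>> from pathlib import Path\n>>> folder = Path('json-list-example')  # throwaway scratch folder\n>>> folder.mkdir(exist_ok=True)\n>>> _ = (folder / 'a.json').write_text('{}')\n>>> _ = (folder / 'b.json').write_text('{}')\n>>> list_json_files(folder, exclude_names=['b.json'])\n[PosixPath('json-list-example/a.json')]\n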
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_json","title":"load_json","text":"
load_json(p: str | Path, crash: bool = False) -> dict | list\n

Easier access to reading json files.

Parameters:

Name Type Description Default p str | Path

Path to read json from

required crash bool

Whether the program should crash if there is a json decode error, default: False

False

Returns:

Type Description dict | list

The decoded json contents from the path, or an empty dictionary if the file cannot be decoded and crash is set to False

Source code in alto2txt2fixture/utils.py
def load_json(p: str | Path, crash: bool = False) -> dict | list:\n    \"\"\"\n    Easier access to reading `json` files.\n    Args:\n        p: Path to read `json` from\n        crash: Whether the program should crash if there is a `json` decode\n            error, default: ``False``\n    Returns:\n        The decoded `json` contents from the path, or an empty dictionary\n        if the file cannot be decoded and ``crash`` is set to ``False``\n    \"\"\"\n    p = get_path_from(p)\n    try:\n        return json.loads(p.read_text())\n    except json.JSONDecodeError:\n        msg = f\"Error: {p.read_text()}\"\n        error(msg, crash=crash)\n    return {}\n
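A runnable sketch with a scratch file (load-json-example.json is a throwaway name):
>>> from pathlib import Path\n>>> p = Path('load-json-example.json')  # throwaway scratch file\n>>> _ = p.write_text('{\"pk\": 1, \"model\": \"mitchells.entry\"}')\n>>> load_json(p)\n{'pk': 1, 'model': 'mitchells.entry'}\n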
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_multiple_json","title":"load_multiple_json","text":"
load_multiple_json(\np: str | Path,\ndrill: bool = False,\nfilter_na: bool = True,\ncrash: bool = False,\n) -> list\n

Load multiple json files and return a list of their content.

Parameters:

Name Type Description Default p str | Path

The path to search for json files

required drill bool

A flag indicating whether to drill down the subdirectories or not. Default is False

False filter_na bool

A flag indicating whether to filter out the content that is None. Default is True.

True crash bool

A flag indicating whether to raise an exception when an error occurs while loading a json file. Default is False.

False

Returns:

Type Description list

A list of the content of the loaded json files.

Source code in alto2txt2fixture/utils.py
def load_multiple_json(\n    p: str | Path,\n    drill: bool = False,\n    filter_na: bool = True,\n    crash: bool = False,\n) -> list:\n    \"\"\"\n    Load multiple `json` files and return a list of their content.\n    Args:\n        p: The path to search for `json` files\n        drill: A flag indicating whether to drill down the subdirectories\n            or not. Default is `False`\n        filter_na: A flag indicating whether to filter out the content that\n            is `None`. Default is `True`.\n        crash: A flag indicating whether to raise an exception when an\n            error occurs while loading a `json` file. Default is `False`.\n    Returns:\n        A `list` of the content of the loaded `json` files.\n    \"\"\"\n    files = list_json_files(p, drill=drill)\n    content = [load_json(x, crash=crash) for x in files]\n    return [x for x in content if x] if filter_na else content\n
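A runnable sketch using a scratch folder with two files (multi-json-example is a throwaway name):
>>> from pathlib import Path\n>>> folder = Path('multi-json-example')  # throwaway scratch folder\n>>> folder.mkdir(exist_ok=True)\n>>> _ = (folder / 'a.json').write_text('{\"pk\": 1}')\n>>> _ = (folder / 'b.json').write_text('{\"pk\": 2}')\n>>> load_multiple_json(folder)\n[{'pk': 1}, {'pk': 2}]\n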
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.lock","title":"lock","text":"
lock(lockfile: Path) -> None\n

Creates an empty lockfile, after making sure the parent directory exists.

Parameters:

Name Type Description Default lockfile Path

The path to the lock file to be created

required

Returns:

Type Description None

None

Source code in alto2txt2fixture/utils.py
def lock(lockfile: Path) -> None:\n    \"\"\"\n    Creates an empty lockfile, after making sure the parent directory exists.\n    Args:\n        lockfile: The path to the lock file to be created\n    Returns:\n        None\n    \"\"\"\n    lockfile.parent.mkdir(parents=True, exist_ok=True)\n    lockfile.write_text(\"\")\n    return\n
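A short runnable sketch (the lockfile path is a throwaway name):
>>> from pathlib import Path\n>>> lockfile = Path('cache-lockfiles/example-lock')  # throwaway path\n>>> lock(lockfile)\n>>> lockfile.exists()\nTrue\n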
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.save_fixture","title":"save_fixture","text":"
save_fixture(\ngenerator: Sequence | Generator = [],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n

Saves fixtures generated by a generator to separate JSON files.

This function takes a generator and saves the generated fixtures to separate JSON files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file parameter.

Parameters:

Name Type Description Default generator Sequence | Generator

A generator that yields the fixtures to be saved.

[] prefix str

A string prefix to be added to the file names of the saved fixtures.

'' output_path PathLike | str

Path to folder fixtures are saved to

settings.OUTPUT max_elements_per_file int

Maximum JSON records saved in each file

settings.MAX_ELEMENTS_PER_FILE add_created bool

Whether to add created_at and updated_at timestamps

True json_indent int

Number of indent spaces per line in saved JSON

JSON_INDENT

Returns:

Type Description None

This function saves the fixtures to files but does not return any value.

Example
>>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n...              prefix='test', output_path='tests/')\n>>> imported_fixture = load_json('tests/test-1.json')\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> 'created_at' in imported_fixture[1]['fields']\nTrue\n
Source code in alto2txt2fixture/utils.py
def save_fixture(\n    generator: Sequence | Generator = [],\n    prefix: str = \"\",\n    output_path: PathLike | str = settings.OUTPUT,\n    max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n    add_created: bool = True,\n    json_indent: int = JSON_INDENT,\n) -> None:\n    \"\"\"Saves fixtures generated by a generator to separate JSON files.\n    This function takes a generator and saves the generated fixtures to\n    separate JSON files. The fixtures are saved in batches, where each batch\n    is determined by the ``max_elements_per_file`` parameter.\n    Args:\n        generator: A generator that yields the fixtures to be saved.\n        prefix: A string prefix to be added to the file names of the\n            saved fixtures.\n        output_path: Path to folder fixtures are saved to\n        max_elements_per_file: Maximum `JSON` records saved in each file\n        add_created: Whether to add `created_at` and `updated_at` `timestamps`\n        json_indent: Number of indent spaces per line in saved `JSON`\n    Returns:\n        This function saves the fixtures to files but does not return\n            any value.\n    Example:\n        ```pycon\n        >>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n        ...              prefix='test', output_path='tests/')\n        >>> imported_fixture = load_json('tests/test-1.json')\n        >>> imported_fixture[1]['pk']\n        2\n        >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        >>> 'created_at' in imported_fixture[1]['fields']\n        True\n        ```\n    \"\"\"\n    internal_counter = 1\n    counter = 1\n    lst = []\n    Path(output_path).mkdir(parents=True, exist_ok=True)\n    for item in generator:\n        lst.append(item)\n        internal_counter += 1\n        if internal_counter > max_elements_per_file:\n            write_json(\n                p=Path(f\"{output_path}/{prefix}-{counter}.json\"),\n                o=lst,\n                add_created=add_created,\n                json_indent=json_indent,\n            )\n            # Save up some memory\n            del lst\n            gc.collect()\n            # Re-instantiate\n            lst = []\n            internal_counter = 1\n            counter += 1\n        else:\n            # Rewrite the current (partial) batch on each iteration\n            write_json(\n                p=Path(f\"{output_path}/{prefix}-{counter}.json\"),\n                o=lst,\n                add_created=add_created,\n                json_indent=json_indent,\n            )\n    return\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.write_json","title":"write_json","text":"
write_json(\np: str | Path,\no: dict,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n

Easier access to writing json files. Creates the parent directory if needed.

Parameters:

Name Type Description Default p str | Path

Path to write json to

required o dict

Object to write to json file

required add_created bool

If set to True, created_at and updated_at will be added to the dictionary's fields. If created_at and updated_at already exist in the fields, they will be forcefully updated.

True json_indent int

Number of indent spaces per line in the saved JSON

JSON_INDENT

Returns:

Type Description None

None

Example

>>> path = 'test-write-json/example.json'\n>>> write_json(p=path,\n...            o=NEWSPAPER_COLLECTION_METADATA,\n...            add_created=True)\n>>> imported_fixture = load_json(path)\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n

Source code in alto2txt2fixture/utils.py
def write_json(\n    p: str | Path, o: dict, add_created: bool = True, json_indent: int = JSON_INDENT\n) -> None:\n    \"\"\"\n    Easier access to writing `json` files. Creates the parent directory if needed.\n    Args:\n        p: Path to write `json` to\n        o: Object to write to `json` file\n        add_created:\n            If set to True will add `created_at` and `updated_at`\n            to the dictionary's fields. If `created_at` and `updated_at`\n            already exist in the fields, they will be forcefully updated.\n        json_indent:\n            Number of indent spaces per line in the saved `JSON`\n    Returns:\n        None\n    Example:\n        ```pycon\n        >>> path = 'test-write-json/example.json'\n        >>> write_json(p=path,\n        ...            o=NEWSPAPER_COLLECTION_METADATA,\n        ...            add_created=True)\n        >>> imported_fixture = load_json(path)\n        >>> imported_fixture[1]['pk']\n        2\n        >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n        'hmd'\n        ```\n    \"\"\"\n    p = get_path_from(p)\n    if not (isinstance(o, dict) or isinstance(o, list)):\n        raise RuntimeError(f\"Unable to handle data of type: {type(o)}\")\n    def _append_created_fields(o: dict):\n        \"\"\"Add `created_at` and `updated_at` fields to a `dict` with `FixtureDict` values.\"\"\"\n        return dict(\n            **{k: v for k, v in o.items() if not k == \"fields\"},\n            fields=dict(\n                **{\n                    k: v\n                    for k, v in o[\"fields\"].items()\n                    if not k == \"created_at\" and not k == \"updated_at\"\n                },\n                **{\"created_at\": NOW_str, \"updated_at\": NOW_str},\n            ),\n        )\n    try:\n        if add_created and isinstance(o, dict):\n            o = _append_created_fields(o)\n        elif add_created and isinstance(o, list):\n            o = [_append_created_fields(x) for x in o]\n    except KeyError:\n        error(\"An unknown error occurred (in write_json)\")\n    p.parent.mkdir(parents=True, exist_ok=True)\n    p.write_text(json.dumps(o, indent=json_indent))\n    return\n
"},{"location":"tutorial/first-steps.html","title":"First Steps","text":""},{"location":"tutorial/first-steps.html#installing","title":"Installing","text":"

The installation process should be fairly straightforward using poetry:

$ poetry install\n

However, this is only the first step in the process. As the script works through the alto2txt collections, you will need to choose either the slower option \u2014 mounting them to your computer (using blobfuse) \u2014\u00a0or the faster option \u2014 downloading the required zip files from the Azure storage to your local hard drive. In the two following sections, both of those options are described.

"},{"location":"tutorial/first-steps.html#connecting-alto2txt-to-the-program","title":"Connecting alto2txt to the program","text":""},{"location":"tutorial/first-steps.html#downloading-local-copies-of-alto2txt-on-your-computer","title":"Downloading local copies of alto2txt on your computer","text":"

This option will take up a lot of hard drive space

As of the time of writing, downloading all of alto2txt\u2019s metadata takes up about 185GB on your local drive.

You do not have to download all of the collections or all of the zip files for each collection, as long as you are aware that the resulting fixtures will be limited in scope.

"},{"location":"tutorial/first-steps.html#step-1-log-in-to-azure-using-microsoft-azure-storage-explorer","title":"Step 1: Log in to Azure using Microsoft Azure Storage Explorer","text":"

Microsoft Azure Storage Explorer (MASE) is a free tool for downloading content from Azure. Your first step is to download and install this product on your local computer.

Once you have opened MASE, you will need to sign into the appropriate Azure account.

"},{"location":"tutorial/first-steps.html#step-2-download-the-alto2txt-blob-container-to-your-hard-drive","title":"Step 2: Download the alto2txt blob container to your hard drive","text":"

On your left-hand side, you should see a menu where you can navigate to the correct \u201cblob container\u201d: Living with Machines > Storage Accounts > alto2txt > Blob Containers:

You will want to replicate the same structure as the Blob Container itself in a folder on your hard drive:

Once you have the structure set up, you are ready to download all of the files needed. For each of the blob containers, make sure that you download the metadata directory only onto your computer:

Select all of the files and press the download button:

Make sure you save all the zip files inside the correct local folder:

The \u201cActivities\u201d bar will now show you the progress and speed:

"},{"location":"tutorial/first-steps.html#mounting-alto2txt-on-your-computer","title":"Mounting alto2txt on your computer","text":"

This option will only work on a Linux or UNIX computer

If you have a Mac, your only option is the one above: downloading local copies of alto2txt.

"},{"location":"tutorial/first-steps.html#step-1-install-blobfuse","title":"Step 1: Install BlobFuse","text":"

Follow the instructions for installing BlobFuse and the instructions for how to prepare your drive for mounting.

"},{"location":"tutorial/first-steps.html#step-2-set-up-sas-tokens","title":"Step 2: Set up SAS tokens","text":"

Follow the instructions for setting up access to your Azure storage account.

"},{"location":"tutorial/first-steps.html#step-3-mount-your-blobs","title":"Step 3: Mount your blobs","text":"

TODO #3: Write this section.

Note that you can also search on the internet for ideas on how to create local scripts to facilitate easier connection next time.

"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..c297f9f --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,83 @@ + + + + https://living-with-machines.github.io/alto2txt2fixture/index.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/running.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/understanding-results.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/index.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/__main__.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/cli.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/create_adjacent_tables.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/jisc.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/log.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/parser.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/patterns.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/router.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/settings.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/types.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/reference/alto2txt2fixture/utils.html + 2023-08-23 + daily + + + https://living-with-machines.github.io/alto2txt2fixture/tutorial/first-steps.html + 2023-08-23 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 0000000..4f19a08 Binary files /dev/null and b/sitemap.xml.gz differ diff --git a/tutorial/first-steps.html b/tutorial/first-steps.html new file mode 100644 index 0000000..e0aa469 --- /dev/null +++ b/tutorial/first-steps.html @@ -0,0 +1,862 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + First Steps - alto2txt2fixture + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + + + + + +
+
+ + + + + + + +

First Steps

+ +

Installing

+

The installation process should be fairly straightforward using poetry:

+
$ poetry install
+
+

However, this is only the first step in the process. As the script works through the alto2txt collections, you will need to choose either the slower option — mounting them to your computer (using blobfuse) — or the faster option — downloading the required zip files from the Azure storage to your local hard drive. In the two following sections, both of those options are described.

+

Connecting alto2txt to the program

+

Downloading local copies of alto2txt on your computer

+
+

This option will take up a lot of hard drive space

+

As of the time of writing, downloading all of alto2txt’s metadata takes up about 185GB on your local drive.

+
+

You do not have to download all of the collections or all of the zip files for each collection, as long as you are aware that the resulting fixtures will be limited in scope.

+

Step 1: Log in to Azure using Microsoft Azure Storage Explorer

+

Microsoft Azure Storage Explorer (MASE) is a free tool for downloading content from Azure. Your first step is to download and install this product on your local computer.

+

Once you have opened MASE, you will need to sign into the appropriate Azure account.

+

Step 2: Download the alto2txt blob container to your hard drive

+

On your left-hand side, you should see a menu where you can navigate to the correct “blob container”: Living with Machines > Storage Accounts > alto2txt > Blob Containers:

+

../img/azure-storage.png

+

You will want to replicate the same structure as the Blob Container itself in a folder on your hard drive:

+

../img/local-storage.png

+

Once you have the structure set up, you are ready to download all of the files needed. For each of the blob containers, make sure that you download the metadata directory only onto your computer:

+

../img/metadata-fulltext.png

+

Select all of the files and press the download button:

+

../img/files-selected.png

+

Make sure you save all the zip files inside the correct local folder:

+

../img/ensure-correct-folder.png

+

The “Activities” bar will now show you the progress and speed:

+

../img/activity-bar.png

+

Mounting alto2txt on your computer

+
+

This option will only work on a Linux or UNIX computer

+

If you have a Mac, your only option is the one above: downloading local copies of alto2txt.

+
+

Step 1: Install BlobFuse

+

Follow the instructions for installing BlobFuse and the instructions for how to prepare your drive for mounting.

+

Step 2: Set up SAS tokens

+

Follow the instructions for setting up access to your Azure storage account.

+

Step 3: Mount your blobs

+

TODO #3: Write this section.

+

Note that you can also search on the internet for ideas on how to create local scripts to facilitate easier connection next time.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/understanding-results.html b/understanding-results.html new file mode 100644 index 0000000..27d5dec --- /dev/null +++ b/understanding-results.html @@ -0,0 +1,802 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Understanding the Results - alto2txt2fixture + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Understanding the Results

+ +

The resulting file structure

+
+

The examples below follow standard settings

+

If you choose other settings for when you run the program, your output directory may look different from the information on this page.

+
+

Reports

+

Reports are automatically generated with a unique hash as the overarching folder structure. Inside the reports directory, you’ll find a JSON file for each alto2txt directory (organised by NLP identifier).

+

The report structure, thus, looks like this:

+

img/output-report-dir.png

+

The JSON file contains useful troubleshooting information. You’ll find that the contents are structured as a Python dictionary (or JavaScript Object). Here is an example:

+

img/output-report-json.png

+

Here is an explanation of each of the keys in the dictionary:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Key | Explanation | Data type
path | The input path for the zip file that is being converted. | string
bytes | The size of the input zip file represented in bytes. | integer
size | The size of the input zip file represented in a human-readable string. | string
contents | #TODO #3 | integer
start | Date and time when processing started (see also end below). | datestring
newspaper_paths | #TODO #3 | list (string)
publication_codes | A list of the NLPs that are contained in the input zip file. | list (string)
issue_paths | A list of all the issue paths that are contained in the cache directory. | list (string)
item_paths | A list of all the item paths that are contained in the cache directory. | list (string)
end | Date and time when processing ended (see also start above). | datestring
seconds | Seconds that the script spent interpreting the zip file (should be added to the microseconds below). | integer
microseconds | Microseconds that the script spent interpreting the zip file (should be added to the seconds above). | integer
+

Fixtures

+

The most important output of the script is contained in the fixtures directory. This directory contains JSON files for all the different tables in the corresponding Django metadata database (i.e. DataProvider, Digitisation, Ingest, Issue, Newspaper, and Item). The numbering at the end of each file indicates the order of the files as they are divided into a maximum of 2e6 elements*:

+

img/output-fixtures-dir.png

+

Each JSON file contains a Python-like list (JavaScript Array) of dictionaries (JavaScript Objects), which have a primary key (pk), the related database model (in the example below the Django newspapers app’s newspaper table), and a nested dictionary/Object which contains all the values for the database’s table entry:

+

img/output-fixtures-json.png

+
+
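
For reference, here is a minimal sketch of the shape of a single fixture entry, written as a Python dictionary (the model name and field values below are illustrative, not taken from a real collection):

+

>>> fixture_entry = {  # illustrative values, not from a real collection
...     "pk": 1,
...     "model": "newspapers.newspaper",
...     "fields": {"title": "An Example Newspaper"},
... }

+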

* The maximum elements per file can be adjusted via the MAX_ELEMENTS_PER_FILE value of the settings object in settings.py.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file