diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/404.html b/404.html
new file mode 100644
index 00000000..fceaa194
--- /dev/null
+++ b/404.html
@@ -0,0 +1,785 @@
+Spark ClickHouse Connector

404 - Not found

\ No newline at end of file
diff --git a/assets/images/favicon.png b/assets/images/favicon.png
new file mode 100644
index 00000000..1cf13b9f
Binary files /dev/null and b/assets/images/favicon.png differ
diff --git a/assets/javascripts/bundle.b4d07000.min.js b/assets/javascripts/bundle.b4d07000.min.js
new file mode 100644
index 00000000..3c0bdad9
--- /dev/null
+++ b/assets/javascripts/bundle.b4d07000.min.js
@@ -0,0 +1,29 @@

Deployment

+

Jar

+

Put clickhouse-spark-runtime-3.3_2.12-0.7.2.jar and clickhouse-jdbc-0.4.5-all.jar into $SPARK_HOME/jars/. You then don't need to bundle the jars into your Spark application, and --jars is not required when using spark-shell or spark-sql (again, for SQL-only use cases, Apache Kyuubi is recommended for production).

+

Configuration

+

Persist catalog configurations in $SPARK_HOME/conf/spark-defaults.conf, so that --conf options are not required when using spark-shell or spark-sql.

+
spark.sql.catalog.ck_01=xenon.clickhouse.ClickHouseCatalog
+spark.sql.catalog.ck_01.host=10.0.0.1
+spark.sql.catalog.ck_01.protocol=http
+spark.sql.catalog.ck_01.http_port=8123
+spark.sql.catalog.ck_01.user=app
+spark.sql.catalog.ck_01.password=pwd
+spark.sql.catalog.ck_01.database=default
+
+spark.sql.catalog.ck_02=xenon.clickhouse.ClickHouseCatalog
+spark.sql.catalog.ck_02.host=10.0.0.2
+spark.sql.catalog.ck_02.protocol=http
+spark.sql.catalog.ck_02.http_port=8123
+spark.sql.catalog.ck_02.user=app
+spark.sql.catalog.ck_02.password=pwd
+spark.sql.catalog.ck_02.database=default
+
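With the runtime jar in $SPARK_HOME/jars/ and the catalogs above persisted in spark-defaults.conf, a plain spark-shell session can reach both instances without extra flags. A minimal sketch, assuming the ck_01 and ck_02 catalogs defined above; the database and table names are placeholders.

```scala
// spark-shell picks up the catalogs registered in spark-defaults.conf,
// so no --jars or --conf flags are needed here.
// `default.events` is a hypothetical table name; replace it with a real one.
spark.sql("SHOW TABLES IN ck_01.default").show()

val events = spark.table("ck_01.default.events") // resolved through the ck_01 catalog
events.printSchema()

// the same table name resolved against the second instance
spark.table("ck_02.default.events").count()
```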
\ No newline at end of file
diff --git a/best_practices/index.html b/best_practices/index.html
new file mode 100644
index 00000000..297e9087
--- /dev/null
+++ b/best_practices/index.html
@@ -0,0 +1,803 @@
+Index - Spark ClickHouse Connector

TODO

\ No newline at end of file
diff --git a/configurations/01_catalog_configurations/index.html b/configurations/01_catalog_configurations/index.html
new file mode 100644
index 00000000..fb217624
--- /dev/null
+++ b/configurations/01_catalog_configurations/index.html
@@ -0,0 +1,876 @@
+01 catalog configurations - Spark ClickHouse Connector

01 catalog configurations

+ + +

Single Instance

+

Suppose you have one ClickHouse instance, installed on 10.0.0.1 and exposing HTTP on port 8123.

+

Edit $SPARK_HOME/conf/spark-defaults.conf.

+
########################################
+## register a catalog named "clickhouse"
+########################################
+spark.sql.catalog.clickhouse                      xenon.clickhouse.ClickHouseCatalog
+
+################################################
+## basic configurations for "clickhouse" catalog
+################################################
+spark.sql.catalog.clickhouse.host                 10.0.0.1
+spark.sql.catalog.clickhouse.protocol             http
+spark.sql.catalog.clickhouse.http_port            8123
+spark.sql.catalog.clickhouse.user                 default
+spark.sql.catalog.clickhouse.password
+spark.sql.catalog.clickhouse.database             default
+
+###############################################################
+## custom options of clickhouse-client for "clickhouse" catalog
+###############################################################
+spark.sql.catalog.clickhouse.option.async         false
+spark.sql.catalog.clickhouse.option.client_name   spark
+
+

Then you can access ClickHouse table <ck_db>.<ck_table> from Spark SQL by using clickhouse.<ck_db>.<ck_table>.
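The same catalog also works with the DataFrame API. A minimal sketch for spark-shell, assuming the clickhouse catalog above; test.visits and test.visits_daily are hypothetical tables that already exist in ClickHouse.

```scala
import org.apache.spark.sql.functions._

// Read through the catalog, aggregate, and append back via DataFrameWriterV2.
val visits = spark.table("clickhouse.test.visits")

val daily = visits
  .groupBy(col("site_id"), to_date(col("event_time")).as("day"))
  .count()

daily.writeTo("clickhouse.test.visits_daily").append()
```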

+

Cluster

+

For a ClickHouse cluster, give each instance a unique catalog name.

+

Suppose you have two ClickHouse instances: one named clickhouse1, installed on 10.0.0.1 and exposing HTTP on port 8123, and another named clickhouse2, installed on 10.0.0.2 and exposing HTTP on port 8123.

+

Edit $SPARK_HOME/conf/spark-defaults.conf.

+
spark.sql.catalog.clickhouse1                xenon.clickhouse.ClickHouseCatalog
+spark.sql.catalog.clickhouse1.host           10.0.0.1
+spark.sql.catalog.clickhouse1.protocol       http
+spark.sql.catalog.clickhouse1.http_port      8123
+spark.sql.catalog.clickhouse1.user           default
+spark.sql.catalog.clickhouse1.password
+spark.sql.catalog.clickhouse1.database       default
+spark.sql.catalog.clickhouse1.option.async   false
+
+spark.sql.catalog.clickhouse2                xenon.clickhouse.ClickHouseCatalog
+spark.sql.catalog.clickhouse2.host           10.0.0.2
+spark.sql.catalog.clickhouse2.protocol       http
+spark.sql.catalog.clickhouse2.http_port      8123
+spark.sql.catalog.clickhouse2.user           default
+spark.sql.catalog.clickhouse2.password
+spark.sql.catalog.clickhouse2.database       default
+spark.sql.catalog.clickhouse2.option.async   false
+
+

Then you can access the clickhouse1 table <ck_db>.<ck_table> from Spark SQL as clickhouse1.<ck_db>.<ck_table>, and the clickhouse2 table <ck_db>.<ck_table> as clickhouse2.<ck_db>.<ck_table>.
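Since both catalogs live in the same Spark session, you can move data between the two instances in a single statement. A minimal sketch, assuming the clickhouse1 and clickhouse2 catalogs above; db.events and its event_date column are placeholders, and the target table must already exist.

```scala
// Copy one day of data from the first instance to the second.
spark.sql(
  """INSERT INTO clickhouse2.db.events
    |SELECT * FROM clickhouse1.db.events
    |WHERE event_date = '2023-01-01'
    |""".stripMargin)
```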

\ No newline at end of file
diff --git a/configurations/02_sql_configurations/index.html b/configurations/02_sql_configurations/index.html
new file mode 100644
index 00000000..1a2e2113
--- /dev/null
+++ b/configurations/02_sql_configurations/index.html
@@ -0,0 +1,931 @@
+02 sql configurations - Spark ClickHouse Connector

02 sql configurations

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Key | Default | Description | Since |
| --- | --- | --- | --- |
| spark.clickhouse.ignoreUnsupportedTransform | false | ClickHouse supports using complex expressions as sharding keys or partition values, e.g. cityHash64(col_1, col_2), which are currently not supported by Spark. If true, ignore the unsupported expressions, otherwise fail fast w/ an exception. Note, when spark.clickhouse.write.distributed.convertLocal is enabled, ignoring unsupported sharding keys may corrupt the data. | 0.4.0 |
| spark.clickhouse.read.compression.codec | lz4 | The codec used to decompress data for reading. Supported codecs: none, lz4. | 0.5.0 |
| spark.clickhouse.read.distributed.convertLocal | true | When reading a Distributed table, read the local table instead. If true, ignore spark.clickhouse.read.distributed.useClusterNodes. | 0.1.0 |
| spark.clickhouse.read.format | json | Serialize format for reading. Supported formats: json, binary. | 0.6.0 |
| spark.clickhouse.read.runtimeFilter.enabled | false | Enable runtime filter for reading. | 0.8.0 |
| spark.clickhouse.read.splitByPartitionId | true | If true, construct the input partition filter by the virtual column _partition_id instead of the partition value. There are known bugs when assembling SQL predicates by partition value. This feature requires ClickHouse Server v21.6+. | 0.4.0 |
| spark.clickhouse.useNullableQuerySchema | false | If true, mark all fields of the query schema as nullable when executing CREATE/REPLACE TABLE ... AS SELECT ... on creating the table. Note, this configuration requires SPARK-43390 (available in Spark 3.5); w/o this patch, it always acts as true. | 0.8.0 |
| spark.clickhouse.write.batchSize | 10000 | The number of records per batch on writing to ClickHouse. | 0.1.0 |
| spark.clickhouse.write.compression.codec | lz4 | The codec used to compress data for writing. Supported codecs: none, lz4. | 0.3.0 |
| spark.clickhouse.write.distributed.convertLocal | false | When writing a Distributed table, write to the local table instead. If true, ignore spark.clickhouse.write.distributed.useClusterNodes. | 0.1.0 |
| spark.clickhouse.write.distributed.useClusterNodes | true | Write to all nodes of the cluster when writing a Distributed table. | 0.1.0 |
| spark.clickhouse.write.format | arrow | Serialize format for writing. Supported formats: json, arrow. | 0.4.0 |
| spark.clickhouse.write.localSortByKey | true | If true, do a local sort by sort keys before writing. | 0.3.0 |
| spark.clickhouse.write.localSortByPartition | (not set) | If true, do a local sort by partition before writing. If not set, it equals spark.clickhouse.write.repartitionByPartition. | 0.3.0 |
| spark.clickhouse.write.maxRetry | 3 | The maximum number of retries for a single batch write that failed with retryable codes. | 0.1.0 |
| spark.clickhouse.write.repartitionByPartition | true | Whether to repartition data by ClickHouse partition keys to meet the distribution of the ClickHouse table before writing. | 0.3.0 |
| spark.clickhouse.write.repartitionNum | 0 | Repartitioning data to meet the distribution of the ClickHouse table is required before writing; use this conf to specify the repartition number. A value less than 1 means no requirement. | 0.1.0 |
| spark.clickhouse.write.repartitionStrictly | false | If true, Spark will strictly distribute incoming records across partitions to satisfy the required distribution before passing the records to the data source table on write. Otherwise, Spark may apply certain optimizations to speed up the query but break the distribution requirement. Note, this configuration requires SPARK-37523 (available in Spark 3.4); w/o this patch, it always acts as true. | 0.3.0 |
| spark.clickhouse.write.retryInterval | 10s | The interval in seconds between write retries. | 0.1.0 |
| spark.clickhouse.write.retryableErrorCodes | 241 | The retryable error codes returned by the ClickHouse server when a write fails. | 0.1.0 |
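These are regular Spark SQL configurations, so they can also be supplied when the Spark session is created rather than only in spark-defaults.conf. A minimal sketch; the application name and values are illustrative only.

```scala
import org.apache.spark.sql.SparkSession

// Connector SQL configurations passed at session construction time;
// the same keys could equally go into spark-defaults.conf or --conf.
val spark = SparkSession.builder()
  .appName("scc-demo") // hypothetical application name
  .config("spark.clickhouse.write.batchSize", "20000")
  .config("spark.clickhouse.write.retryInterval", "30s")
  .config("spark.clickhouse.read.format", "json")
  .getOrCreate()
```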
\ No newline at end of file
diff --git a/configurations/index.html b/configurations/index.html
new file mode 100644
index 00000000..b9b699f5
--- /dev/null
+++ b/configurations/index.html
@@ -0,0 +1,1051 @@
+Index - Spark ClickHouse Connector

Configurations

+

Catalog Configurations

+ + +

Single Instance

+

Suppose you have one ClickHouse instance, installed on 10.0.0.1 and exposing HTTP on port 8123.

+

Edit $SPARK_HOME/conf/spark-defaults.conf.

+
########################################
+## register a catalog named "clickhouse"
+########################################
+spark.sql.catalog.clickhouse                      xenon.clickhouse.ClickHouseCatalog
+
+################################################
+## basic configurations for "clickhouse" catalog
+################################################
+spark.sql.catalog.clickhouse.host                 10.0.0.1
+spark.sql.catalog.clickhouse.protocol             http
+spark.sql.catalog.clickhouse.http_port            8123
+spark.sql.catalog.clickhouse.user                 default
+spark.sql.catalog.clickhouse.password
+spark.sql.catalog.clickhouse.database             default
+
+###############################################################
+## custom options of clickhouse-client for "clickhouse" catalog
+###############################################################
+spark.sql.catalog.clickhouse.option.async         false
+spark.sql.catalog.clickhouse.option.client_name   spark
+
+

Then you can access ClickHouse table <ck_db>.<ck_table> from Spark SQL by using clickhouse.<ck_db>.<ck_table>.

+

Cluster

+

For a ClickHouse cluster, give each instance a unique catalog name.

+

Suppose you have two ClickHouse instances: one named clickhouse1, installed on 10.0.0.1 and exposing HTTP on port 8123, and another named clickhouse2, installed on 10.0.0.2 and exposing HTTP on port 8123.

+

Edit $SPARK_HOME/conf/spark-defaults.conf.

+
spark.sql.catalog.clickhouse1                xenon.clickhouse.ClickHouseCatalog
+spark.sql.catalog.clickhouse1.host           10.0.0.1
+spark.sql.catalog.clickhouse1.protocol       http
+spark.sql.catalog.clickhouse1.http_port      8123
+spark.sql.catalog.clickhouse1.user           default
+spark.sql.catalog.clickhouse1.password
+spark.sql.catalog.clickhouse1.database       default
+spark.sql.catalog.clickhouse1.option.async   false
+
+spark.sql.catalog.clickhouse2                xenon.clickhouse.ClickHouseCatalog
+spark.sql.catalog.clickhouse2.host           10.0.0.2
+spark.sql.catalog.clickhouse2.protocol       http
+spark.sql.catalog.clickhouse2.http_port      8123
+spark.sql.catalog.clickhouse2.user           default
+spark.sql.catalog.clickhouse2.password
+spark.sql.catalog.clickhouse2.database       default
+spark.sql.catalog.clickhouse2.option.async   false
+
+

Then you can access the clickhouse1 table <ck_db>.<ck_table> from Spark SQL as clickhouse1.<ck_db>.<ck_table>, and the clickhouse2 table <ck_db>.<ck_table> as clickhouse2.<ck_db>.<ck_table>.

+ + +

SQL Configurations

+

SQL configurations can be overridden by SET <key>=<value> at runtime.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Key | Default | Description | Since |
| --- | --- | --- | --- |
| spark.clickhouse.ignoreUnsupportedTransform | false | ClickHouse supports using complex expressions as sharding keys or partition values, e.g. cityHash64(col_1, col_2), which are currently not supported by Spark. If true, ignore the unsupported expressions, otherwise fail fast w/ an exception. Note, when spark.clickhouse.write.distributed.convertLocal is enabled, ignoring unsupported sharding keys may corrupt the data. | 0.4.0 |
| spark.clickhouse.read.compression.codec | lz4 | The codec used to decompress data for reading. Supported codecs: none, lz4. | 0.5.0 |
| spark.clickhouse.read.distributed.convertLocal | true | When reading a Distributed table, read the local table instead. If true, ignore spark.clickhouse.read.distributed.useClusterNodes. | 0.1.0 |
| spark.clickhouse.read.format | json | Serialize format for reading. Supported formats: json, binary. | 0.6.0 |
| spark.clickhouse.read.runtimeFilter.enabled | false | Enable runtime filter for reading. | 0.8.0 |
| spark.clickhouse.read.splitByPartitionId | true | If true, construct the input partition filter by the virtual column _partition_id instead of the partition value. There are known bugs when assembling SQL predicates by partition value. This feature requires ClickHouse Server v21.6+. | 0.4.0 |
| spark.clickhouse.useNullableQuerySchema | false | If true, mark all fields of the query schema as nullable when executing CREATE/REPLACE TABLE ... AS SELECT ... on creating the table. Note, this configuration requires SPARK-43390 (available in Spark 3.5); w/o this patch, it always acts as true. | 0.8.0 |
| spark.clickhouse.write.batchSize | 10000 | The number of records per batch on writing to ClickHouse. | 0.1.0 |
| spark.clickhouse.write.compression.codec | lz4 | The codec used to compress data for writing. Supported codecs: none, lz4. | 0.3.0 |
| spark.clickhouse.write.distributed.convertLocal | false | When writing a Distributed table, write to the local table instead. If true, ignore spark.clickhouse.write.distributed.useClusterNodes. | 0.1.0 |
| spark.clickhouse.write.distributed.useClusterNodes | true | Write to all nodes of the cluster when writing a Distributed table. | 0.1.0 |
| spark.clickhouse.write.format | arrow | Serialize format for writing. Supported formats: json, arrow. | 0.4.0 |
| spark.clickhouse.write.localSortByKey | true | If true, do a local sort by sort keys before writing. | 0.3.0 |
| spark.clickhouse.write.localSortByPartition | (not set) | If true, do a local sort by partition before writing. If not set, it equals spark.clickhouse.write.repartitionByPartition. | 0.3.0 |
| spark.clickhouse.write.maxRetry | 3 | The maximum number of retries for a single batch write that failed with retryable codes. | 0.1.0 |
| spark.clickhouse.write.repartitionByPartition | true | Whether to repartition data by ClickHouse partition keys to meet the distribution of the ClickHouse table before writing. | 0.3.0 |
| spark.clickhouse.write.repartitionNum | 0 | Repartitioning data to meet the distribution of the ClickHouse table is required before writing; use this conf to specify the repartition number. A value less than 1 means no requirement. | 0.1.0 |
| spark.clickhouse.write.repartitionStrictly | false | If true, Spark will strictly distribute incoming records across partitions to satisfy the required distribution before passing the records to the data source table on write. Otherwise, Spark may apply certain optimizations to speed up the query but break the distribution requirement. Note, this configuration requires SPARK-37523 (available in Spark 3.4); w/o this patch, it always acts as true. | 0.3.0 |
| spark.clickhouse.write.retryInterval | 10s | The interval in seconds between write retries. | 0.1.0 |
| spark.clickhouse.write.retryableErrorCodes | 241 | The retryable error codes returned by the ClickHouse server when a write fails. | 0.1.0 |
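As noted above, these keys can be overridden at runtime with SET. A minimal sketch of adjusting a few write-path settings from an existing session; the values are purely illustrative.

```scala
// Override connector behaviour for the current session only.
spark.sql("SET spark.clickhouse.write.batchSize=20000")
spark.sql("SET spark.clickhouse.write.repartitionByPartition=false")

// The same keys can be set and read back programmatically.
spark.conf.set("spark.clickhouse.write.compression.codec", "none")
println(spark.conf.get("spark.clickhouse.write.batchSize"))
```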
\ No newline at end of file
diff --git a/developers/01_build_and_test/index.html b/developers/01_build_and_test/index.html
new file mode 100644
index 00000000..6d323138
--- /dev/null
+++ b/developers/01_build_and_test/index.html
@@ -0,0 +1,901 @@
+Build and Test - Spark ClickHouse Connector
+ + + + + + + +

Build and Test

+

Build

+

Check out the source code from GitHub

+
git clone https://github.com/housepower/spark-clickhouse-connector.git
+
+

Build w/o test

+
./gradlew clean build -x test
+
+

Go to spark-3.3/clickhouse-spark-runtime/build/libs/ to find the output jar clickhouse-spark-runtime-3.3_2.12-0.8.0-SNAPSHOT.jar.

+

Test

+

The project leverages Testcontainers and Docker Compose for integration tests. Install Docker and Docker Compose before running the tests, and check the Testcontainers documentation for more details if you'd like to run the tests against a remote Docker daemon.

+

Run all tests

+

./gradlew clean test

+

Run a single test

+

./gradlew test --tests=ConvertDistToLocalWriteSuite

+

Test against custom ClickHouse image

+

CLICKHOUSE_IMAGE=custom-org/clickhouse-server:custom-tag ./gradlew test

\ No newline at end of file
diff --git a/developers/02_docs_and_website/index.html b/developers/02_docs_and_website/index.html
new file mode 100644
index 00000000..39034dab
--- /dev/null
+++ b/developers/02_docs_and_website/index.html
@@ -0,0 +1,934 @@
+Docs and Website - Spark ClickHouse Connector
+ + + + + + + +

Docs and Website

+

Setup Python

+

Follow the official Python documentation to install Python.

+

Setup pyenv on macOS (optional)

+

Optionally, we recommend managing Python environments with pyenv.

+

Install from Homebrew

+
brew install pyenv pyenv-virtualenv
+
+

Setup in ~/.zshrc

+
eval "$(pyenv init -)"
+eval "$(pyenv virtualenv-init -)"
+
+

Install virtualenv

+
pyenv install 3.9.13
+pyenv virtualenv 3.9.13 scc
+
+

Set the local virtualenv for the project

+
pyenv local scc
+
+

Install dependencies

+
pip install -r requirements.txt
+
+

Preview website

+
mkdocs serve
+
+

Open http://127.0.0.1:8000/ in your browser.

\ No newline at end of file
diff --git a/developers/03_private_release/index.html b/developers/03_private_release/index.html
new file mode 100644
index 00000000..ad14fd46
--- /dev/null
+++ b/developers/03_private_release/index.html
@@ -0,0 +1,911 @@
+Private Release - Spark ClickHouse Connector
+ + + + + + + +

Private Release

+
+

Tip

+

A private release means deploying to a private Nexus repository. Please make sure you have been granted access to your company's private Nexus repository.

+
+

Repository and Authentication

+

Configure Gradle in ~/.gradle/gradle.properties.

+
mavenUser=xxx
+mavenPassword=xxx
+mavenReleasesRepo=xxx
+mavenSnapshotsRepo=xxx
+
+

Upgrade Version

+

Modify the version in version.txt and docker/.env-dev.

+

Build and Deploy

+

Publish to Maven Repository using ./gradlew publish

\ No newline at end of file
diff --git a/developers/04_public_release/index.html b/developers/04_public_release/index.html
new file mode 100644
index 00000000..30b432be
--- /dev/null
+++ b/developers/04_public_release/index.html
@@ -0,0 +1,921 @@
+Public Release - Spark ClickHouse Connector
+ + + + + + + +

Public Release

+
+

Notice

+

A public release means deploying to Maven Central. Only core team members are granted permission to deploy to the public repository.

+
+
+

Note

+

Most of the steps for a public release are done by the GitHub workflow.

+
+

Snapshot Release

+

The daily snapshot release is managed by the Publish Snapshot workflow, which is scheduled to deploy at midnight every day.

+

Feature Release

+
    +
  1. Cut a new branch from the master branch, e.g. branch-0.3;
  2. Update the version in version.txt and docker/.env-dev, e.g. from 0.3.0-SNAPSHOT to 0.3.0;
  3. Create a new tag, e.g. v0.3.0; it will trigger the Publish Release workflow;
  4. Verify, close, and release in the Sonatype Repository;
  5. Announce in GitHub Release;
  6. Update the version in version.txt and docker/.env-dev, e.g. from 0.3.0 to 0.3.1-SNAPSHOT;
  7. Update the version on the master branch in version.txt and docker/.env-dev, e.g. from 0.3.0-SNAPSHOT to 0.4.0-SNAPSHOT;
  8. Publish the Docker image after the jars are available in Maven Central; generally this takes a few minutes after step 3.
+
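
For illustration, steps 1 and 3 might look like the following git commands (the branch and tag names are just the examples from the list above):

git checkout master
+git checkout -b branch-0.3
+# ... bump version.txt and docker/.env-dev to 0.3.0 and commit ...
+git tag v0.3.0
+git push origin branch-0.3 v0.3.0   # pushing the tag triggers the Publish Release workflow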

Patch Release

+

Just omit step 1 and step 7 from the feature release process.

+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/developers/index.html b/developers/index.html new file mode 100644 index 00000000..141c9d92 --- /dev/null +++ b/developers/index.html @@ -0,0 +1,803 @@ + + + + + + + + + + + + + + + + + + + + Index - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

TODO

+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/imgs/scc_overview.drawio b/imgs/scc_overview.drawio new file mode 100644 index 00000000..ce953e93 --- /dev/null +++ b/imgs/scc_overview.drawio @@ -0,0 +1 @@ +7Vtbd5s4EP41fqwPIG5+9K3JyWm72SZ7urtvBFRDghEVcmzvr98RCGMQju34Jp8mDzEaDbL45hvNSIM7aDhd3FAvDb+SAMcdQwsWHTTqGIauazZ8cMmykLiOEExoFAilSvAQ/YeFUBPSWRTgrKbICIlZlNaFPkkS7LOazKOUzOtqP0lc/9bUm2BJ8OB7sSz9EQUsFE9haZX8FkeTkK0eWPRMvVJZCLLQC8h8TYTGHTSkhLDiaroY4piDV+JS3Pd5Q+9qYhQnbJcbRv2bOPk+H6dJP3Lm5OvL091fn/gNfJhXL56JJxazZcsSAkpmSYD5KFoHDeZhxPBD6vm8dw5GB1nIpjG0dLj8GcXxkMSEQjshCSgN5ImKub9iyvBiTSQmfoPJFDO6BJWyt0RVsMhyRXte2WQlC9ft4QihJ3gwWY1dQQUXAq19kLOuAzm7jpzRQxJyptOCHEB3IuT0fYDTtwN3DJSMOkq2JvNrpbOOUu9U9ELqgWTYqoFkqgeSqakG0l7r1HlAskzVQNJ1CRQcQCIgmoSykExI4sXjSjqoL++VzhdCUgHWM2ZsKbIab8ZIHUpAkC7/5vd3rbL5jxgub4wWtdZStDaaICMz6uM3nlMkX8yjE8y2r9EcgzcNSnHssei1niy1WSe/tU+pt1xTSEmUsGxt5HsuWFtxrEbYtxopTkNfd9/Uh4tiBhVPVo9yAHVkf/qtqYOukjrIuAR10Ad1WuL5lVHHtC9BHfMS1Dk9BayrpIDd3J+egwK2lNiNF9gHm1E5mTn7hqpXB8REO2Z49skyPOv3WGudqwrTqHcFYdrZ6GiX30ohpJyj2dfkaBk4Cuvzg+LO9mO3CyUw2zOO5ilnMVFxV2XPfb1zr8xGuttqUNPSG7Ms1jJplsdyW3ej28pp99mPiRzl3LZ3TW77/vio7xogHSUCZNOJyvPFjQGymXhpZwiQJVHfOmNUeneyMyncY5PisBrURU4tzwCzfZW+19w1nsX3dDk7HVGAhCoX45B+6RhntFXV7Jjx6ivJLVnBZf+akbLjU5b7Qh8UDDNd5PCU/XA14Z/DOPJfbsksw6A1jGcZ4yYoBofJFuMXqpJlAFBWh59i+ErvKVfgbsq9sJhE3u3F0SSBax+sA1+DBtwqke/FfdExjYIg93bBcfgaa9CxRu115oxR8oIbwvo6cQQ+NIusTgsfjDY+GCfjQ1sBcT8+6O4GPvxLAFKcchLsS4e9HJWPJJZqeJxTvUZgNpY2R7Nk2+kttkMns518/HU0Xwa06Qso9NMUvBoiDEk+XLnmyo1CG2rZvhjaWV25JQxGAEb0NGPw5Ib2mJvg0jHRMu2u69Sws7WeHBY1u1u+YHQeb5I3yyrCZwIurqsgfL2rgM+w1YQPyftHFeHTDQ6fVv3pCkIpv133hUA0qUAs4tgTLUNYFno00PPNVB7tFCgebcHZdLpy9rERaetUSBuHIi0fkqiwICiCrrxV2wVdQyUet0UrRdCVNz57ont57ralUoqg2/bK5ZG2lbePj/cwqcn3e5gbBBHbm3Joi/98y5KXsozPdw9/fIMPzPzux9al7pSoxhnLaNm6tL4Qf4SQ/ec0fB3cPeuj5IfxHN0t/Wzg7PI+/FkqMUFrGbRxBvu+g1+33SQHlkGl89nm68FHKoNKhcyDyqDSLN9dBoVm9fOhQr36ERYa/w8= \ No newline at end of file diff --git a/imgs/scc_overview.drawio.png b/imgs/scc_overview.drawio.png new file mode 100644 index 00000000..784cee51 Binary files /dev/null and b/imgs/scc_overview.drawio.png differ diff --git a/imgs/scc_read_bucket_join.drawio b/imgs/scc_read_bucket_join.drawio new file mode 100644 index 00000000..2446a945 --- /dev/null +++ b/imgs/scc_read_bucket_join.drawio @@ -0,0 +1 @@ 
+7Vpdc5s4FP01zGwfnAGJD/sxdtK02XS3E+9Od/uyo4AKWsuIChHb/fUrgYBgkQ9v7ZikfTK6uhLinHMv0sUWnC3XFxxlyQcWYWoBO1pb8MwCwHFsX/4oy6ayjANtiDmJtFNrmJNvWBttbS1IhPOOo2CMCpJ1jSFLUxyKjg1xzlZdty+Mdu+aoRgbhnmIqGn9RCKR6Kfw7Nb+DpM4Ec0D654lqp21IU9QxFZ3TPDcgjPOmKiulusZpgq8Gpdq3Nt7epuFcZyKpwxYf7ahWK3ejt6PRsX1p4tF9jscwUk1zS2ihX5ivVqxqSHgrEgjrGaxLThdJUTgeYZC1buSpEtbIpZUthx5+YVQOmOUcdlOWSqdpuZC9dpvMRd4fcekF36B2RILvpEuda+rQdQqAnV71XLijbUtucMHqIlCWgdxM3cLlbzQaO2AnPMigfOeCpwTHAo4EBhA4UjGnG4yLhIWsxTR89Y67ULZ+lwxlmkA/8VCbHQCQYVgXXjxmoi/1PATT7f+1pOp67P13cZGN+7FP2cFD/FDz6gzFeIxFg+JSCdDBcCDdHJMkSC33aS0d2qg85OahppgUNSAXdKN83i62Udu8bdyi2/mFgf05JbJwVLL+PXrF+5blnroR0bkUhpyG5I0ue72+6AKID1qi7dmGd+Rioand2gPTu+T169398fQuzs8vbvbe8ej6x3aBiivTu/ej6F3b3h6b85Bg9G73wOST4U6LbKSzBYt/2vB6o5RXkr5VDoAN1uX6NT98ipWvzNKwsU7VuRYes1okQvM68nlWqv5K1eDGImn6KLPsbwluikdlPxVEFWLKLsRJXEqr0NJjrwNnCpSSIjoqe5YkigqgzVTeith9KaWd9Z/Ls4FZwu8ZeyG+R7ksL299QNTDqBPDuBQcgi+Ww7O+B45fGYSUZwpDeyqhp3CVM2kEy1wD1b1cLeSm+97JnVOD3XwUNSNDerOiJQxuSmEhA7Yf5TBc/wM6J+Mg60kODGToO2fAO8Z0TOLlENEz3UleuPhoVcLfeDwQXug8JmV3iHC5/gDhc+sXF0x+epvgateMze8fsPkCeKRU25xM7lLQWZRcgDY+nZwYr5U7kXXOxS6Zp1kR3SPv+/uCfyhoGueyp+CLhiSdnveSkNB1zwD7oju8bXbs2N6fnTJ7W/R9Z/R5sPi4+XXX9OLxaW/GJnKe4ZSidVXu/h/dZD609ijH2qg38/Q83yXccwz+jxEqandX+RRm/8j5wIztY4okmfm/I3pl6A8qSATRBCWkjTuDgU9gxiPMO942qfz2Zujh4cz8Tqx4To9JZVJz0HM38N2pDcszJTxwsIieBlhYdYqnhIWpY5l67XHhRscMS4ekEuHsPdpWtaDLhnpI+7KArCFtRxiX7e244MMYRfk5vvd0UB2zZP43gq6Ekq+UPrOyq2RCpGf9dxHDnGgp8IP7P2UdGWz/W9f9dWk/YckPP8P \ No newline at end of file diff --git a/imgs/scc_read_bucket_join.drawio.png b/imgs/scc_read_bucket_join.drawio.png new file mode 100644 index 00000000..755ea8e5 Binary files /dev/null and b/imgs/scc_read_bucket_join.drawio.png differ diff --git a/imgs/scc_read_pushdown_disable.drawio b/imgs/scc_read_pushdown_disable.drawio new file mode 100644 index 00000000..c513b00f --- /dev/null +++ b/imgs/scc_read_pushdown_disable.drawio @@ -0,0 +1 @@ +7Zrbcps6FIafxjPthTOAANuXsZ2k56ZJOt3tzR4FFFCMERXCh/30W4AAg7AdT3xQk14ZLQlZfOvXQlqiA0bTxRWFkf+ZuCjoGJq76IBxxzB0XbP5T2pZ5pZ+Txg8il3RqDLc4v+QMGrCmmAXxbWGjJCA4ahudEgYIofVbJBSMq83eyBB/V8j6CHJcOvAQLb+wC7zxVNYWmV/h7Dns/KBRc0UFo2FIfahS+YrJnDRASNKCMuvposRClJ4BZf8vss1teXAKArZU25Y/NIAm88vu++73eTmx9Uk+gq6YJB3M4NBIp5YjJYtCwSUJKGL0l60DhjOfczQbQSdtHbOnc5tPpsGvKTzywccBCMSEMrLIQl5o6E8UDH2GaIMLVZMYuBXiEwRo0vepKg1BcRCRaI4r1xiFTZ/xR1mISMoZOCVXVek+IWAtQM4/Y/kZplPBKf3DgXO6EmgkMunnCgSynzikRAGF5V1WEdZtflESCQAPiLGliJ+wISROl60wOyf9PYzS5R+is7S6/FitbAUhbX8Y5JQB216RhGoIPUQ2yQiEQtTABvdSVEAGZ7VY9LeXQP0V+Oa/REXt14TzIdSTjyrV594ZnM+5doQdzX8Vg7jGbNsl/Ckbw9P+4hFdiMW2XIs0o2WWDQ4WCjqv3y9g+PovXTSifQO1NM70JTT++Dl6918HXo31dO72VxrnlzvQJOgvDi9W69D75Z6ei/3Tcro3W6BZAcs3V2SzJkVLft3QoqKbpxJ+Zw3MMxokdEp6vmVl/6OAuxM3pEkRrzVKEhihmjROR9r3n/eVHIM58nq9Cnifwnvswap/NNJlA8iq4YB9kJ+7XDn8L8Bw9Qp2IHBuaiYYtfNJmuU6i3DaA071rh9Hx0zSiaoYaxP8z3Iobm8tXuyHIw2ORiHkkPv2XLQ+2vk8ItwoihKNbCrGnaapmlPItAa5sGyJGYjuNm2JbtOb3EdOJTr+pLrxpjLGN8njKMztLts8pw+Atpn/V4jCA7kIKjZZ4Z1RHpyTlNFeqbJ6fXVo1cIXXF8QFMUn5wZVhGfbiuKT85cfSL81V+By18z97R4w8Q+pK6eLXEjvkqBchJTAba21juTXypr6VqHoivnSXake/p1d8vEV4WuvCt/Cl1DJe22vJVUoSvvAXeke3rttqyYjk8Xz764N9/d5efJ9YffH8OryQd70pWVd4RUyVqiW9MexcnZ1iM2oNYRmy7v0W8dGMrafcO32vRf3pcxSsfhunzPHL89uYCNRpKvPGBf0S5oO1C297BgWI4v8c8HnT7OvvlR5LrOxY35lMy/0koGLUpufVBdKSEDWcjXlDymn+Cs1fLp5QuK+CnkC4yWnN3gQPJtT+TLu629Je04SjrhDc6j7PXHMAn/5uy2LNTL77dWcz/agdJ2rfO8LW2naEDbFKe2xrO9n1w8i7q8a7+GXMDZ8vLc8yjyIGtZYr4Zff3+5Y63UTbElVv1Y4S4TY5eQftHA9VbvqHbF1BerD4Izc/Oqs9qwcX/ \ No newline at end of file diff --git a/imgs/scc_read_pushdown_disable.drawio.png 
b/imgs/scc_read_pushdown_disable.drawio.png new file mode 100644 index 00000000..859312d3 Binary files /dev/null and b/imgs/scc_read_pushdown_disable.drawio.png differ diff --git a/imgs/scc_read_pushdown_enable.drawio b/imgs/scc_read_pushdown_enable.drawio new file mode 100644 index 00000000..c6e4ba47 --- /dev/null +++ b/imgs/scc_read_pushdown_enable.drawio @@ -0,0 +1 @@ +7ZrbcpswEIafxjPthT2AODiXPuTQmbTpNOmkzU1HAQVUy4gKEdt9+goQYBB244kd06RXRqtFFt/+WrSye2AyX54zGAUfqYdIz9C8ZQ9Me4ah65otPlLLKrcMHWnwGfakU2W4xr+RNGrSmmAPxTVHTinhOKobXRqGyOU1G2SMLupuD5TUvzWCPlIM1y4kqvUWezyQT2Fplf0CYT/g5QPLnjksnKUhDqBHF2smcNoDE0Ypz6/mywkiKbyCS37f2YbecmIMhfwpNyzvNMAXi7P+h34/+XJ7PouuQB+c5MM8QpLIJ5az5asCAaNJ6KF0FK0HxosAc3QdQTftXYigC1vA50S0dHH5gAmZUEKZaIc0FE5jdaJy7o+IcbRcM8mJnyM6R5ythEvRa0qIUkXAlu1FFRNrKG3BejyANEKpA78cu0IlLiStHcjp/yQ4y3wqOOdQ4AxHAYU8seZkkzIeUJ+GkJxW1nEdZeVzSWkkAf5EnK9kAoEJp3W8aIn5t/T2gSVb3+Vg6fV0ud5YycZG/jFNmIu2PaPMVJD5iG8TkUyGKYCt4WSIQI4f60lp76EB+psJzf6Iy1s/UyymUi48y6kvPLO5nnJtyLsacSun8YxVtkt60v+envaRi+xGLmpJ4rrRkotODpaKhq9f7+Bl9F4G6Uh6B93TO9A6p/eT1693823o3eye3s3mXvPoei82/69Z79bb0LvVPb2XdVNn9G63QLIJT6tLmgWzomX/SmjR0Y8zKY+Eg2FGy4xO0S+u/PRzQrA7u6BJjITXhCQxR6wYXMw1Hz93VQIjePI6fYbEV8L7zCGVf7qI8klk3ZBgPxTXrgiO+BowToOCXUhGsmOOPS9brFGqtwyjNe5Z0/Y6OuaMzlDDWF/me5BDc3trO6ocjDY5GIeSg/NsOejDDXK4o4IoilIN7KqGnZZpOpJMtIZ5sFMSs5HcbNtSQ6e3hO5gp0tDJXRTLGSM7xMu0BnaTbZ4jp8B7cHQaSTBEzUJavbAsF6Qnnqo2UV6pinoDbtHrxB6x/EBraP41JPhLuLT7Y7iU0+uLql49Vfg8tfMPSveMHEAmadnW9xI7FKgeojZAba25gzUl8pGutah6KrnJDvSPf6+u2Xhd4WuWpU/ha7RJe22vJW6QletAXeke3zttuyYukJXLR5Hvs+QD3kL2XeTq6+fboS7KAjZD+y9PzpZo3H6ZOpq+QXafum0D/UmM9V91N7KcYGSzYTDKMqEzTEN/1fj26rxMvTrJZ22n2pcNKu/ceQHXtWfYcDpHw== \ No newline at end of file diff --git a/imgs/scc_read_pushdown_enable.drawio.png b/imgs/scc_read_pushdown_enable.drawio.png new file mode 100644 index 00000000..0d7608c0 Binary files /dev/null and b/imgs/scc_read_pushdown_enable.drawio.png differ diff --git a/imgs/scc_read_sort_merge_join.drawio b/imgs/scc_read_sort_merge_join.drawio new file mode 100644 index 00000000..bddf8dfe --- /dev/null +++ b/imgs/scc_read_sort_merge_join.drawio @@ -0,0 +1 @@ 
+7Vvdc+I2EP9b+sDM5YGMv4HHhOSS61ynnUtnrncvHWErtoqwXFkE6F/flS1jbJlALhCUXJ6wVqu1/NsPrVai545nyxuOsuQ3FmHac6xo2XOveo5j21YAP5KyKinDgSLEnESKqSbckf+wIlqKOicRzhuMgjEqSNYkhixNcSgaNMQ5WzTZ7hltvjVDMdYIdyGiOvUriUSivsK3avotJnEi1h+semaoYlaEPEERW2yQ3OueO+aMifJpthxjKsGrcCnHfdzSu54Yx6nYZ8Dyu+WKxeJj/1O/P//y9Waa/e723VEp5gHRufpiNVuxqiDgbJ5GWEqxeu7lIiEC32UolL0LUDrQEjGj0LLh8Z5QOmaUcWinLAWmS32iau4PmAu83CCpid9gNsOCr4Cl6vUUiMqKBqq5qFXiDxUt2VCHV4GPlBnEa9E1UvCgwHoCcParxM339gTOHhwLOGegAYUjcDnVZFwkLGYpotc19bIJZc3zmbFMAfgPFmKl4geaC9aEFy+J+EsOP/dV65sSJp+vlpuNlWpsxT9ncx7ix75RBSrEYyweMyIVCyUAj6qTY4oEeWjGpIOrxrXfVbNWzcAo1ThPCTf27nBziNgStGJLoMcW2+mILaOjhZbh27df99BmqYb+wQhMZa3ctZKUcr32elA6kBrV0tt6Gs8IRebZu2sZZ++jt2/v3s9h75559u61c8eT27traaC8OXv3fw57982z9/U+yBh7DzpACqiQu0VWKLNGK/h3zqqOfl6Y8gUwOF62LNCp+uEplr9jSsLpLZvnGLjGdJ4LzCvhMNdSfsmqKQbwFE30OYZXoknBIM1fOlE5iaIbURKn8ByCcuA17qVUCgkRvVAdMxJFhbNm0t4KGP3Lnn/VvS/OBWdT3CI23fwA5tBOb4OOmoPTZQ5HKzkMnm0O9nCLOXxngCjOpA081Rqe5KZSkgq0jne0qofXCm5B4OuqsztU5x5LdUNNdVcEzJhM5gKgc6w/C+c5fQQMzoeDVhAc6UHQCs4d/wXR02uUJqLneYDe0Dz0KkM3HD7XMhQ+vdJrInx2YCh8euXqM4OlvwauXGYmvFph8gTxyC5S3AyyFKQXJQ3ANrAG5/qishVd/1jo6nWSJ6J7+ry7w/FNQVffle+DrmOS7XasSqagq+8Bn4ju6W23I2MyBF3X08AxtFYC6PPVxiDZ/FbJk416WNF6fo2lOnbbeQhUVSUMOQSy9YLAXYhS3VE+wL6e/w2ynLGcRxTBBj0/O7m32CO/4Sqe3VFhGXXsy4KjVRT1EPLuJa0T0N1ecvAa/fO8ZHudJM+kt3TUSfSKyD6OxXhUtM50xo3qSfnSQ1RPDpEODIxzQf/dBXetP7td8ODHws87mNQ3fdfLMEFp3JHVJShPSnwEEYSlJI2bbuZYlnlrl+uc3HFOc5Hq5Z3A3dMJXLNuUzlddxh2rENhWQuXtXoeTz5YpQNUP2cFjlZR1L9HM0JXJestpg9YHqhs9G8U/R1V9Fcd5UtlT8r4DPZbdd8D4gTBLyUxEnMu790+yheibBvLQrmA7PSs0gIsCqaFeR8+PwQv10cynkGQUCKdkiZPm/rqBEmS14dIVR8Bg07Vm6zqU4sewUHYPciv3qQOGKTPFheDN16zgMW8ObG1LPiWyZSAOCmzPHzqK69v8E1QOI0LB+u39Oh4QxXKvJF68CttFlIjHDKOZPTri4SE0xTy9XI4SSEoVvi0eTd0+SjfxnQafPeUIdEGJyJ5RtGqYqcklWeEv5BZBuECpVvSJeh7LF2yLu7GryhNakf7dX30dNHefU+Tdm1AdqdJB79d8bwVQq8v/niaFKCZ9IN0kssfI3Km9mbDgJzpNDdEX94j/FeaM+k14T1XFsNM3YAFQy8WfkrT4obHr4x01TY+9xy3hrUYYn2paacH2XWbINsnB9nTz9YPdkULoORTad9ZcdghY/77Da1GkjZoXvMZ6dbgWB3W8AMXtKBZ/1GvvANZ/93Rvf4f \ No newline at end of file diff --git a/imgs/scc_read_sort_merge_join.drawio.png b/imgs/scc_read_sort_merge_join.drawio.png new file mode 100644 index 00000000..a71daf41 Binary files /dev/null and b/imgs/scc_read_sort_merge_join.drawio.png differ diff --git a/imgs/scc_write_rebalance_sort.drawio b/imgs/scc_write_rebalance_sort.drawio new file mode 100644 index 00000000..353e206b --- /dev/null +++ b/imgs/scc_write_rebalance_sort.drawio @@ -0,0 +1 @@ 
+7Zzrc5s4EMD/Gn9sBiTE42PixEnnHtNpbqZ39002qs0EIyrk2r6//iQesUF4ahoEIlNnxsErGeTf7mq1y2MG59vDI8Pp5g8akngGrPAwg/czAHxki3cpOBYC5PuFYM2isBDZJ8Fz9B8phVYp3UUhyWodOaUxj9K6cEWThKx4TYYZo/t6t680rh81xWuiCJ5XOFalX6KQb6qfZZ3kTyRab6oj21bZssVV51KQbXBI92ci+DCDc0YpL7a2hzmJJbuKS/G9xYXW14ExkvBrvuAGDx+fksNv/+CXv/dP1sLhfz5+cJ1iN99xvCt/cTlafqwQMLpLQiL3Ys3g3X4TcfKc4pVs3QudC9mGb2PxyRabX6M4ntOYMvE5oYnodKcOtBz7d8I4OZyJyoE/ErolnB1Fl6q1olpakVt93p90gvxStjnXh1cKcWkH69d9n1CJjZJWB3K+PUlyAF5JDkJd5IBGcDMAF4sgmM97ouc36AUqPdACD+hi5+plFwSSXj/sXqdwU9h507E74BjGzp8QO88wdsF02EFkGDtb84SnM1hAMDa8Kc14tmnwpjTlOabBm9Cc55gWa6sdTwFeM2CMbnlAZ1rWNzzTsouK1RTSC8e09ALA6cBrWt74bquzDtV3wLCCG2QWPqjZcfvEh4Bhjgs1O26v8ExL0KBmx+1z1mvCG99t0YQsz7TFCpxQaaA5541veRPKbpuWNzo8Z6gcI+OMvpCzFit/9VSuCupYHTQ21qEWMVqxAmQa1qGCzLDW6o+Ndajwo9dagWnWOqHKoQJvbJuslDcJeM15cnR4U6ocQsPcFmmO3b2mgb5p8CZUOVTgje62E8qhoWua5U0oh4Z+s+w6Pj7NJ9h1Fr/GhzehlV6L7Y0977kTWuspy5XR4Wle62l13NHhdYq49o/h9QDJbqRibkuN0G6jFGij1Cm0DkMJuMZR6hRBh6HkNG82GJ9Sp2r9MJSQYxyltgWFG3M5PVPxO89xud92tGr4kOV3Hd2KDsBJDzmeql1sreX/eRytXp7oLiOi1zzeZZywaudisMX+i66KZgRQXsfPiDgkXuYdZHDBO06LQeTNOI7WidheCe2Iw8A7qZVohePbsmEbhaH88l1Ko4TnHNHdDN03AlF5F0m9gFgK69FNwwzstdxoMvD1/m0rpG72YPsX7OFfKpCSVBpBV3Po5KhyT+VNccBpV28fM15jieEBpOrObtGdtnuEPHWBdh8JQ46WOy7YAeuv3H3exra+ZpvPF4veTlk67o3vNebHQJ0fLfemQj0MVjBprI7gVd0/ahRWtYCnG6uE+uC2nR4KvWDZ2+kh11Dg6lVH7wO4DSRw6/SyDYSv5n+/U7E6OWEvAuGSVTEw22AWil9iMZKKhRS2e9bMpTOlSx/J0DaIZlzvRg2aF3WDdOlGzTo76uaNmcUoummbpgzRh5rfXqMPoMtXer0CvyUaG0JdzZc7Uu/XC3otB7YsLQ2h/hP5t5pZdU7RLqXsn8kSxzhZyYx9KQE+Sx1HyVpskkMqcvAsoslMpp/iO9YnLHJsXkjOm5uWokoq03qmjL/tWEWzi7fSnJJlJv99YtEW5wbwQo7ZxSTz0jB/FSfyVuQ1b9lwVY9x3JYUF+laRvk/UZ647C5ZipM3ucsXJmY00YFv5PuqqGnka2mcyHfKwvJziDmWHemZjRWH73lITVdozuG1Rt2Dyd06O7k128UkkzqWu40xW+fzDOarzQ9G8stHLy2wmz6KWkrKCA5ZQvTVeklvJWWxemAvosNtmq84+Fm0+WUQeStU1peg9aKFQS3iipOfYpZI5WYmZoqY3MqnkAkiYcTIKteyaKE7OfwO1V9bVn/zB5pVjyGzboBvodPLgVWP8ivWjWMFZ39eX7G0XixGrqqTtkDqaFPJFWda37dKQGCaSq44rfu+VeJA01RyxTnk960S5I2nko/M4zjIjvYx+LYMyeeA3qYDPwhPAdWC8/orVwe8CKmV3bD3bL+JnWMau2GftdAvO31XrYqPp4ex5m1nT7SFD/8D \ No newline at end of file diff --git a/imgs/scc_write_rebalance_sort.drawio.png b/imgs/scc_write_rebalance_sort.drawio.png new file mode 100644 index 00000000..8c6a231d Binary files /dev/null and b/imgs/scc_write_rebalance_sort.drawio.png differ diff --git a/imgs/spark_centralized_metastore.drawio b/imgs/spark_centralized_metastore.drawio new file mode 100644 index 00000000..746ec2bd --- /dev/null +++ b/imgs/spark_centralized_metastore.drawio @@ -0,0 +1 @@ 
+7Vjfc6IwEP5rfNRBKGof1dp6U522Y2+uvZdOhAg5A8uExR/96y+BIFg463TOq525BzX77Qay334bgg1rGGxuBIn8KbiUN0zD3TSsq4ZptttGR/4oZJshva4GPMFcHVQAM/ZKNWhoNGEujfcCEYAji/ZBB8KQOriHESFgvR+2AL5/14h4tALMHMKr6A/moq+zsI0CH1Pm+bhLWHsCkgdrIPaJC+sSZI0a1lAAYDYKNkPKFXk5L9m86z94dwsTNMRjJoxnd+70id2Nmsnt/CEhbuK2m/oqK8ITnbBeLG5zBqgrCdEmCPTBg5DwUYEOBCShS9VtDGkVMROASIJtCf6iiFtdXZIgSMjHgGtvNRW9rhgS4eh1fCO3jzNndi8er0fN7c/4dTIZNvVykQiP4oE4O4tTuZRuoIm6oRBQFFsZICgnyFb7xSdaQ94urqBZDjTT9awfWvUh1qVaIjVkQSrQwYoKZFKWEzKn/B5ihgxC6Z8DIgQygCvHgDhLLy3HEDgI6XfpgiQcS1foc+apmaiKMyBxlLXNgm1UCQfpDfs5auSIHPuIqun6KnfzOok4ELe1ZksWUJeRFghPwsqOlC3HDgQBhLEcoZ8Ec/m7UB9LfvUj4vj0ZSZrvnzhUi6teKWmm72eEW2ate5WFHqHxKISpJuD5dVeq6N7Um9Kdt6j66LF2/ZFy+hmsF/q8J5xIlHYFVGMlRDNTlq9uZAjT42mFEmMIGhFM0UfqqZa+wypZDDtn7XcnY/sueNptC/f0GjV0JhTW+awcyoO2xefsX3RDcMnNb1la+u55Lna6CunxjY3QplvaZIyn8u+Ylpq5fM+vlV2vuRW2al0xXQ7e5icnfYvukdq3zqZ9u3z0b7xIe23T6b97pfUfrei/SFnznIMSXx+m3+n7hl6qgaoPctWH6Cf0RAfF2nvSJGaZyXSXoX17zEVFebz82wS8L6D6mj6/pG2/sRa4hoS5Cykw92738Fd4nhtm/b7Bxvrn55rqo9BQT0Wo2JaKmbOqxuCzBX36RI0Zq9pbMaUEm+c6Vi5iabZkcRRUcN/wFw3bZIIWIhpjvagYV9JZME4z185QghVUIwClvQNuN9ef6FUnbelMuxKqXo1pTJPVqrqrv2/VPWlujxZqaRZ/LGS+kp/T1mj3w== \ No newline at end of file diff --git a/imgs/spark_centralized_metastore.drawio.png b/imgs/spark_centralized_metastore.drawio.png new file mode 100644 index 00000000..6dc92581 Binary files /dev/null and b/imgs/spark_centralized_metastore.drawio.png differ diff --git a/imgs/spark_multi_catalog.drawio b/imgs/spark_multi_catalog.drawio new file mode 100644 index 00000000..74ae47d3 --- /dev/null +++ b/imgs/spark_multi_catalog.drawio @@ -0,0 +1 @@ +1VdRc5swDP41eWyOQKHpY0LTdrf0rrt0t24vPQcU8GIQZ0RC+utnExPIaNPsbl2zhwTpk2yjTzKye46flDeSZfEdhiB6thWWPeeqZ9uDgeWph0Y2W2R4YYBI8tA4NcCMP4MBLYMWPIR8z5EQBfFsHwwwTSGgPYxJiet9twWK/VUzFkEHmAVMdNFvPKTYROFaDX4LPIppF7CxJKx2NkAesxDXLciZ9BxfItJWSkofhCav5mU77voV6+7FJKR0zIBP7PPDLJjdy4frydnmR/48nfpng+F2mhUThYnYvC1tagogVIwYFSXFGGHKxKRBxxKLNAS9jqW0xmeKmClwoMCfQLQx6WUFoYJiSoSxdmMx4eVYyAAOBGBel5iMgA74uVs/HUtrAcPUDWACJDfKQYJgxFf72WemiKKdX8OzEgzVf0L75UewDCWnx5b8XU/Vd412VZqZK2VTK6mK97GttEZptRlWafW498+od1IZ3e3yf5vRk83OxWll5+2vnPo8Z1rkSdURxiuQxFUfmLI5iHvMOXFMlX2ORJgoB6ENYxYsoyqNPgqUyh7CghWCWjOMBI/0SNJJHbM82/apBS916sfVgqMatWpEyTGR7nIjHbt9XWQCWdhf8yVPIOSsjzJSsNYzrSs5wCTBNFcSxUUyV8+F/jnqb5SxIIanmcr58kmoMuvnKz3cHg6trDx70dzP0uhQsegAoTyYXmN1PLM9zCnArdv7uumpA/e8b5myiVstdWi9U1G4naK41YVoe1X25lJJkZbugFhOKKFTM83+1ZtxHXMCxWC1f9bqOHRkjzueRvfyNxrtF2issTaH3ntx6HU4vNvMvkxPjqnz849m6qLDlC94sLzFIj+9wvKsj6ZrF8P/ejAdHtko7ZNqlN3rwNcc5KvNskjEKCDd997uly+3wxbXWJDgKfi7m9zBk8rxxW27b381nb9T20ptLnSVrXUtdia/AA== \ No newline at end of file diff --git a/imgs/spark_multi_catalog.drawio.png b/imgs/spark_multi_catalog.drawio.png new file mode 100644 index 00000000..8083c11e Binary files /dev/null and b/imgs/spark_multi_catalog.drawio.png differ diff --git a/index.html b/index.html new file mode 100644 index 00000000..40e6cfc7 --- /dev/null +++ b/index.html @@ -0,0 +1,969 @@ + + + + + + + + + + + + + + + + + + + + + + Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + + + + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Overview

+

Spark ClickHouse Connector is a high-performance connector built on top of Spark DataSource V2.

+
+

Overview

+
+

Requirements

+
    +
  1. Basic knowledge of Apache Spark and ClickHouse.
  2. An available ClickHouse single node or cluster.
  3. An available Spark cluster; the Spark version should be 3.3 or above, because we need the interfaces of Spark DataSource V2 added in 3.3.0.
  4. Make sure your network policy satisfies the following requirement: both the driver and the executors of Spark need to access the ClickHouse HTTP port. If you are using it to access a ClickHouse cluster, ensure connectivity between the Spark driver and executors and each node of the ClickHouse cluster.
+

Notes

+
    +
  1. Integration tests are based on Java 8 & 11, Scala 2.12 & 2.13, Spark 3.3, and ClickHouse v23.3, with both a single ClickHouse instance and a ClickHouse cluster.
+

Compatible Matrix

+

For older versions, please refer to the compatibility matrix below.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Version | Compatible Spark Versions | ClickHouse JDBC version
0.8.0   | Spark 3.3, 3.4            | 0.4.6
0.7.2   | Spark 3.3, 3.4            | 0.4.6
0.6.0   | Spark 3.3                 | 0.3.2-patch11
0.5.0   | Spark 3.2, 3.3            | 0.3.2-patch11
0.4.0   | Spark 3.2, 3.3            | Not depend on
0.3.0   | Spark 3.2, 3.3            | Not depend on
0.2.1   | Spark 3.2                 | Not depend on
0.1.2   | Spark 3.2                 | Not depend on
+ + + + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/internals/01_catalog/index.html b/internals/01_catalog/index.html new file mode 100644 index 00000000..8249781b --- /dev/null +++ b/internals/01_catalog/index.html @@ -0,0 +1,859 @@ + + + + + + + + + + + + + + + + + + + + + + + + Catalog - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Catalog Management

+

One important end-user-facing feature of DataSource V2 is support for multiple catalogs.

+

In the early stages of Spark, there was no catalog concept. Usually, users rely on Hive Metastore or Glue to manage table metadata, hence users must register external DataSource tables in the centralized metastore before accessing them.

+

In the centralized metastore model, a table is identified by <database>.<table>.

+
+

Overview

+
+

For example, register a MySQL table in the metastore, then access it using Spark SQL.

+
CREATE TABLE <db>.<tbl>
+USING org.apache.spark.sql.jdbc
+OPTIONS (
+  url      "jdbc:mysql://<mysql_host>:<mysql_port>",
+  dbtable  "<mysql_db>.<mysql_tbl>",
+  user     "<mysql_username>",
+  password "<mysql_password>"
+);
+
+
SELECT * FROM <db>.<tbl>;
+INSERT INTO <db>.<tbl> SELECT ...
+
+

Things changed in DataSource V2: starting from Spark 3.0, the catalog concept was introduced to allow Spark to discover tables automatically by registering catalog plugins.

+

The default catalog has a fixed name spark_catalog, and typically, a table is identified by <catalog>.<database>.<table>.

+
+

Overview

+
+

For example, we can register a PostgreSQL database as a Spark catalog named pg, and access it using Spark SQL.

+
# spark-defaults.conf
+spark.sql.catalog.pg=org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog
+spark.sql.catalog.pg.url=jdbc:postgresql://<pg_host>:<pg_port>/<pg_db>
+spark.sql.catalog.pg.driver=org.postgresql.Driver
+spark.sql.catalog.pg.user=<pg_username>
+spark.sql.catalog.pg.password=<pg_password>
+
+
SELECT * FROM pg.<db>.<tbl>;
+INSERT INTO pg.<db>.<tbl> SELECT ...
+
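
In the same spirit, this connector exposes ClickHouse as a Spark catalog. A minimal spark-defaults.conf sketch looks like the following (host and credentials are placeholders; the full option list is documented on the Configurations page):

# spark-defaults.conf
+spark.sql.catalog.clickhouse=xenon.clickhouse.ClickHouseCatalog
+spark.sql.catalog.clickhouse.host=<ck_host>
+spark.sql.catalog.clickhouse.protocol=http
+spark.sql.catalog.clickhouse.http_port=8123
+spark.sql.catalog.clickhouse.user=default
+spark.sql.catalog.clickhouse.password=
+spark.sql.catalog.clickhouse.database=default

SELECT * FROM clickhouse.<ck_db>.<ck_table>;
+INSERT INTO clickhouse.<ck_db>.<ck_table> SELECT ...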
+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/internals/02_read/index.html b/internals/02_read/index.html new file mode 100644 index 00000000..0abf5c02 --- /dev/null +++ b/internals/02_read/index.html @@ -0,0 +1,926 @@ + + + + + + + + + + + + + + + + + + + + + + + + Reading - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

How reading of the connector works?

+

Push Down

+

Spark supports pushing down the processing of queries, or parts of queries, into the connected data source. This means that a specific predicate, aggregation function, or other operation could be passed through to ClickHouse for processing.

+

The results of this push down can include the following benefits:

+
    +
  • Improved overall query performance
  • Reduced network traffic between Spark and ClickHouse
  • Reduced load on ClickHouse
+

These benefits often result in significant cost reduction.

+

The connector implements most push down interfaces defined by DataSource V2, such as SupportsPushDownLimit, +SupportsPushDownFilters, SupportsPushDownAggregates, SupportsPushDownRequiredColumns.

+

The example below shows how SupportsPushDownAggregates and SupportsPushDownRequiredColumns work.

+
+

Overview +

+
Push Down disabled
+
+
+

Overview +

+
Push Down enabled
+
+
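
In SQL terms, the scenario illustrated above is an aggregation over a subset of columns: with aggregate and required-column push down, ClickHouse evaluates the partial aggregation and returns only the grouped results instead of raw rows. An illustrative query, using the tbl_sql table from the quick start:

SELECT m, COUNT(*) FROM clickhouse.test_db.tbl_sql GROUP BY m;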

Bucket Join

+

Sort merge join is a general solution for the inner join of two large tables. It requires shuffling both tables by the join key first, then doing a local sort by the join key within each data partition, and finally doing a stream-stream-like lookup to produce the final result.

+

In some cases, the tables are stored collocated by join keys. With Storage-Partitioned Join (or V2 Bucket Join), Spark can leverage the existing ClickHouse table layout to eliminate the expensive shuffle and sort operations.

+
+

Overview +

+
Sort Merge Join
+
+
+

Overview +

+
Bucket Join
+
+ + + + + + +
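
As an illustration (the table and column names here are hypothetical), the kind of query that benefits is an equi-join on the key by which both ClickHouse tables are collocated:

SELECT l.id, l.value, r.value
+FROM clickhouse.db.left_tbl  AS l
+JOIN clickhouse.db.right_tbl AS r ON l.id = r.id;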
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/internals/03_write/index.html b/internals/03_write/index.html new file mode 100644 index 00000000..e7dd72e4 --- /dev/null +++ b/internals/03_write/index.html @@ -0,0 +1,840 @@ + + + + + + + + + + + + + + + + + + + + + + Writing - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

How writing of the connector works?

+

As we know, ClickHouse MergeTree is an LSM-like format; it is not optimized for frequent and random record insertion, and batch append operations are recommended for ingesting large amounts of data.

+

So, to achieve better performance, we should re-organize the DataFrame to fit the ClickHouse data layout before inserting.

+

SPARK-23889 allows a data source connector to expose the sorting and clustering requirements of the DataFrame before writing. By default, for a Distributed table, this connector requires the DataFrame clustered by [sharding keys, partition keys] and sorted by [sharding keys, partition keys, ordering keys]; for a normal *MergeTree table, this connector requires the DataFrame clustered by [partition keys] and sorted by [partition keys, ordering keys].
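
Conceptually, for the tbl table from the quick start (partition key m, ordering key id), this is equivalent to organizing a DataFrame df by hand as in the rough sketch below; in practice the connector declares these requirements via DataSource V2 and Spark applies them automatically, so you normally do not need to do this yourself:

df.repartition($"m")                  // cluster rows by the partition key
+  .sortWithinPartitions($"m", $"id") // sort by partition key, then ordering key
+  .writeTo("clickhouse.test_db.tbl")
+  .append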

+
+

Warning

+

Limitation: Spark does NOT support expressions in sharding keys and partition keys w/o +SPARK-39607.

+
+
+

Overview

+
+

In some cases, the strict data distribution requirements may lead to low parallelism and data skew, and finally cause bad performance. SPARK-37523 (requires Spark 3.4+) was introduced to allow relaxing the data distribution requirements to overcome those shortcomings.

+

Also, you can consider disabling some configurations like +spark.clickhouse.write.repartitionByPartition to avoid such performance degradation.
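
Since SQL configurations can be overridden at runtime with SET <key>=<value>, disabling it for a session might look like this (an illustrative snippet; see the Configurations page for the full list of write options):

SET spark.clickhouse.write.repartitionByPartition=false;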

+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/internals/index.html b/internals/index.html new file mode 100644 index 00000000..0f88d8a4 --- /dev/null +++ b/internals/index.html @@ -0,0 +1,816 @@ + + + + + + + + + + + + + + + + + + + + + + + + Index - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Overview Design

+

At a high level, Spark ClickHouse Connector is a connector built on top of Spark DataSource V2 and the ClickHouse HTTP protocol.

+
+

Overview

+
+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/quick_start/01_get_the_library/index.html b/quick_start/01_get_the_library/index.html new file mode 100644 index 00000000..76008d4f --- /dev/null +++ b/quick_start/01_get_the_library/index.html @@ -0,0 +1,967 @@ + + + + + + + + + + + + + + + + + + + + + + + + Get the Library - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Get the Library

+

Download the Library

+

The naming pattern of the binary jar is

+
clickhouse-spark-runtime-${spark_binary_version}_${scala_binary_version}-${version}.jar
+
+
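
For example, for Spark 3.3, Scala 2.12, and connector version 0.7.2, the jar name is clickhouse-spark-runtime-3.3_2.12-0.7.2.jar.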

You can find all available released jars under the Maven Central Repository and all daily-build SNAPSHOT jars under the Sonatype OSS Snapshots Repository.

+

Import as Dependency

+

Gradle

+
dependencies {
+  implementation("com.github.housepower:clickhouse-spark-runtime-3.3_2.12:0.7.2")
+  implementation("com.clickhouse:clickhouse-jdbc:0.4.5:all") { transitive = false }
+}
+
+

Add the following repository if you want to use a SNAPSHOT version.

+
repositories {
+  maven { url = "https://oss.sonatype.org/content/repositories/snapshots" }
+}
+
+

Maven

+
<dependency>
+  <groupId>com.github.housepower</groupId>
+  <artifactId>clickhouse-spark-runtime-3.3_2.12</artifactId>
+  <version>0.7.2</version>
+</dependency>
+<dependency>
+  <groupId>com.clickhouse</groupId>
+  <artifactId>clickhouse-jdbc</artifactId>
+  <classifier>all</classifier>
+  <version>0.4.5</version>
+  <exclusions>
+    <exclusion>
+      <groupId>*</groupId>
+      <artifactId>*</artifactId>
+    </exclusion>
+  </exclusions>
+</dependency>
+
+

Add the following repository if you want to use a SNAPSHOT version.

+
<repositories>
+  <repository>
+    <id>sonatype-oss-snapshots</id>
+    <name>Sonatype OSS Snapshots Repository</name>
+    <url>https://oss.sonatype.org/content/repositories/snapshots</url>
+  </repository>
+</repositories>
+
+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/quick_start/02_play_with_spark_sql/index.html b/quick_start/02_play_with_spark_sql/index.html new file mode 100644 index 00000000..1be9c4fa --- /dev/null +++ b/quick_start/02_play_with_spark_sql/index.html @@ -0,0 +1,969 @@ + + + + + + + + + + + + + + + + + + + + + + + + Play with Spark SQL - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Play with Spark SQL

+

Note: For SQL-only use cases, Apache Kyuubi is recommended +for Production.

+

Launch Spark SQL CLI

+
$SPARK_HOME/bin/spark-sql \
+  --conf spark.sql.catalog.clickhouse=xenon.clickhouse.ClickHouseCatalog \
+  --conf spark.sql.catalog.clickhouse.host=${CLICKHOUSE_HOST:-127.0.0.1} \
+  --conf spark.sql.catalog.clickhouse.protocol=http \
+  --conf spark.sql.catalog.clickhouse.http_port=${CLICKHOUSE_HTTP_PORT:-8123} \
+  --conf spark.sql.catalog.clickhouse.user=${CLICKHOUSE_USER:-default} \
+  --conf spark.sql.catalog.clickhouse.password=${CLICKHOUSE_PASSWORD:-} \
+  --conf spark.sql.catalog.clickhouse.database=default \
+  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar
+
+

The following argument

+
  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar
+
+

can be replaced by

+
  --repositories https://{maven-central-mirror or private-nexus-repo} \
+  --packages com.github.housepower:clickhouse-spark-runtime-3.3_2.12:0.7.2,com.clickhouse:clickhouse-jdbc:0.4.5:all
+
+

to avoid copying the jar to your Spark client node.

+

Operations

+

Basic operations, e.g. create database, create table, write table, read table, etc.

+
spark-sql> use clickhouse;
+Time taken: 0.016 seconds
+
+spark-sql> create database if not exists test_db;
+Time taken: 0.022 seconds
+
+spark-sql> show databases;
+default
+system
+test_db
+Time taken: 0.289 seconds, Fetched 3 row(s)
+
+spark-sql> CREATE TABLE test_db.tbl_sql (
+         >   create_time TIMESTAMP NOT NULL,
+         >   m           INT       NOT NULL COMMENT 'part key',
+         >   id          BIGINT    NOT NULL COMMENT 'sort key',
+         >   value       STRING
+         > ) USING ClickHouse
+         > PARTITIONED BY (m)
+         > TBLPROPERTIES (
+         >   engine = 'MergeTree()',
+         >   order_by = 'id',
+         >   settings.index_granularity = 8192
+         > );
+Time taken: 0.242 seconds
+
+spark-sql> insert into test_db.tbl_sql values
+         > (timestamp'2021-01-01 10:10:10', 1, 1L, '1'),
+         > (timestamp'2022-02-02 10:10:10', 2, 2L, '2')
+         > as tabl(create_time, m, id, value);
+Time taken: 0.276 seconds
+
+spark-sql> select * from test_db.tbl_sql;
+2021-01-01 10:10:10 1   1   1
+2022-02-02 10:10:10 2   2   2
+Time taken: 0.116 seconds, Fetched 2 row(s)
+
+spark-sql> insert into test_db.tbl_sql select * from test_db.tbl_sql;
+Time taken: 1.028 seconds
+
+spark-sql> insert into test_db.tbl_sql select * from test_db.tbl_sql;
+Time taken: 0.462 seconds
+
+spark-sql> select count(*) from test_db.tbl_sql;
+6
+Time taken: 1.421 seconds, Fetched 1 row(s)
+
+spark-sql> select * from test_db.tbl_sql;
+2021-01-01 10:10:10 1   1   1
+2021-01-01 10:10:10 1   1   1
+2021-01-01 10:10:10 1   1   1
+2022-02-02 10:10:10 2   2   2
+2022-02-02 10:10:10 2   2   2
+2022-02-02 10:10:10 2   2   2
+Time taken: 0.123 seconds, Fetched 6 row(s)
+
+spark-sql> delete from test_db.tbl_sql where id = 1;
+Time taken: 0.129 seconds
+
+spark-sql> select * from test_db.tbl_sql;
+2022-02-02 10:10:10 2   2   2
+2022-02-02 10:10:10 2   2   2
+2022-02-02 10:10:10 2   2   2
+Time taken: 0.101 seconds, Fetched 3 row(s)
+
+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/quick_start/03_play_with_spark_shell/index.html b/quick_start/03_play_with_spark_shell/index.html new file mode 100644 index 00000000..03e16a9e --- /dev/null +++ b/quick_start/03_play_with_spark_shell/index.html @@ -0,0 +1,1000 @@ + + + + + + + + + + + + + + + + + + + + + + + + Play with Spark Shell - Spark ClickHouse Connector + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Play with Spark Shell

+

Launch Spark Shell

+
$SPARK_HOME/bin/spark-shell \
+  --conf spark.sql.catalog.clickhouse=xenon.clickhouse.ClickHouseCatalog \
+  --conf spark.sql.catalog.clickhouse.host=${CLICKHOUSE_HOST:-127.0.0.1} \
+  --conf spark.sql.catalog.clickhouse.protocol=http \
+  --conf spark.sql.catalog.clickhouse.http_port=${CLICKHOUSE_HTTP_PORT:-8123} \
+  --conf spark.sql.catalog.clickhouse.user=${CLICKHOUSE_USER:-default} \
+  --conf spark.sql.catalog.clickhouse.password=${CLICKHOUSE_PASSWORD:-} \
+  --conf spark.sql.catalog.clickhouse.database=default \
+  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar
+
+

The following argument

+
  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar
+
+

can be replaced by

+
  --repositories https://{maven-central-mirror or private-nexus-repo} \
+  --packages com.github.housepower:clickhouse-spark-runtime-3.3_2.12:0.7.2,com.clickhouse:clickhouse-jdbc:0.4.5:all
+
+

to avoid copying the jar to your Spark client node.

+

Operations

+

Basic operations, e.g. create database, create table, write table, read table, etc.

+
scala> spark.sql("use clickhouse")
+res0: org.apache.spark.sql.DataFrame = []
+
+scala> spark.sql("create database test_db")
+res1: org.apache.spark.sql.DataFrame = []
+
+scala> spark.sql("show databases").show
++---------+
+|namespace|
++---------+
+|  default|
+|   system|
+|  test_db|
++---------+
+
+scala> spark.sql("""
+     | CREATE TABLE test_db.tbl (
+     |   create_time TIMESTAMP NOT NULL,
+     |   m           INT       NOT NULL COMMENT 'part key',
+     |   id          BIGINT    NOT NULL COMMENT 'sort key',
+     |   value       STRING
+     | ) USING ClickHouse
+     | PARTITIONED BY (m)
+     | TBLPROPERTIES (
+     |   engine = 'MergeTree()',
+     |   order_by = 'id',
+     |   settings.index_granularity = 8192
+     | )
+     | """)
+res2: org.apache.spark.sql.DataFrame = []
+
+scala> :paste
+// Entering paste mode (ctrl-D to finish)
+
+spark.createDataFrame(Seq(
+    ("2021-01-01 10:10:10", 1L, "1"),
+    ("2022-02-02 10:10:10", 2L, "2")
+)).toDF("create_time", "id", "value")
+    .withColumn("create_time", to_timestamp($"create_time"))
+    .withColumn("m", month($"create_time"))
+    .select($"create_time", $"m", $"id", $"value")
+    .writeTo("test_db.tbl")
+    .append
+
+// Exiting paste mode, now interpreting.
+
+scala> spark.table("test_db.tbl").show
++-------------------+---+---+-----+
+|        create_time|  m| id|value|
++-------------------+---+---+-----+
+|2021-01-01 10:10:10|  1|  1|    1|
+|2022-02-02 10:10:10|  2|  2|    2|
++-------------------+---+---+-----+
+
+scala> spark.sql("DELETE FROM test_db.tbl WHERE id=1")
+res3: org.apache.spark.sql.DataFrame = []
+
+scala> spark.table("test_db.tbl").show
++-------------------+---+---+-----+
+|        create_time|  m| id|value|
++-------------------+---+---+-----+
+|2022-02-02 10:10:10|  2|  2|    2|
++-------------------+---+---+-----+
+
+

Execute ClickHouse native SQL.

+
scala> val options = Map(
+     |     "host" -> "clickhouse",
+     |     "protocol" -> "http",
+     |     "http_port" -> "8123",
+     |     "user" -> "default",
+     |     "password" -> ""
+     | )
+
+scala> val sql = """
+     | |CREATE TABLE test_db.person (
+     | |  id    Int64,
+     | |  name  String,
+     | |  age Nullable(Int32)
+     | |)
+     | |ENGINE = MergeTree()
+     | |ORDER BY id
+     | """.stripMargin
+
+scala> spark.executeCommand("xenon.clickhouse.ClickHouseCommandRunner", sql, options) 
+
+scala> spark.sql("show tables in clickhouse_s1r1.test_db").show
++---------+---------+-----------+
+|namespace|tableName|isTemporary|
++---------+---------+-----------+
+|  test_db|   person|      false|
++---------+---------+-----------+
+
+scala> spark.table("clickhouse_s1r1.test_db.person").printSchema
+root
+ |-- id: long (nullable = false)
+ |-- name: string (nullable = false)
+ |-- age: integer (nullable = true)
+
+ + + + + + +
+
+ + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 00000000..25826cd9 --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Overview","text":"

Spark ClickHouse Connector is a high-performance connector built on top of Spark DataSource V2.

"},{"location":"#requirements","title":"Requirements","text":"
  1. Basic knowledge of Apache Spark and ClickHouse.
  2. An available ClickHouse single node or cluster.
  3. An available Spark cluster; the Spark version should be 3.3 or above, because we need the interfaces of Spark DataSource V2 added in 3.3.0.
  4. Make sure your network policy satisfies the following requirement: both the driver and the executors of Spark need to access the ClickHouse HTTP port. If you are using it to access a ClickHouse cluster, ensure connectivity between the Spark driver and executors and each node of the ClickHouse cluster.
"},{"location":"#notes","title":"Notes","text":"
  1. Integration tests are based on Java 8 & 11, Scala 2.12 & 2.13, Spark 3.3, and ClickHouse v23.3, with both a single ClickHouse instance and a ClickHouse cluster.
"},{"location":"#compatible-matrix","title":"Compatible Matrix","text":"

For older versions, please refer to the compatibility matrix.

Version Compatible Spark Versions ClickHouse JDBC version 0.8.0 Spark 3.3, 3.4 0.4.6 0.7.2 Spark 3.3, 3.4 0.4.6 0.6.0 Spark 3.3 0.3.2-patch11 0.5.0 Spark 3.2, 3.3 0.3.2-patch11 0.4.0 Spark 3.2, 3.3 Not depend on 0.3.0 Spark 3.2, 3.3 Not depend on 0.2.1 Spark 3.2 Not depend on 0.1.2 Spark 3.2 Not depend on"},{"location":"best_practices/","title":"TODO","text":""},{"location":"best_practices/01_deployment/","title":"Deployment","text":""},{"location":"best_practices/01_deployment/#jar","title":"Jar","text":"

Put clickhouse-spark-runtime-3.3_2.12-0.7.2.jar and clickhouse-jdbc-0.4.5-all.jar into $SPARK_HOME/jars/, then you don't need to bundle the jar into your Spark application, and --jars is not required when using spark-shell or spark-sql (again, for SQL-only use cases, Apache Kyuubi is recommended for production).

"},{"location":"best_practices/01_deployment/#configuration","title":"Configuration","text":"

Persist catalog configurations into $SPARK_HOME/conf/spark-defaults.conf, then --confs are not required when using spark-shell or spark-sql.

spark.sql.catalog.ck_01=xenon.clickhouse.ClickHouseCatalog\nspark.sql.catalog.ck_01.host=10.0.0.1\nspark.sql.catalog.ck_01.protocol=http\nspark.sql.catalog.ck_01.http_port=8123\nspark.sql.catalog.ck_01.user=app\nspark.sql.catalog.ck_01.password=pwd\nspark.sql.catalog.ck_01.database=default\n\nspark.sql.catalog.ck_02=xenon.clickhouse.ClickHouseCatalog\nspark.sql.catalog.ck_02.host=10.0.0.2\nspark.sql.catalog.ck_02.protocol=http\nspark.sql.catalog.ck_02.http_port=8123\nspark.sql.catalog.ck_02.user=app\nspark.sql.catalog.ck_02.password=pwd\nspark.sql.catalog.ck_02.database=default\n
"},{"location":"configurations/","title":"Configurations","text":""},{"location":"configurations/#catalog-configurations","title":"Catalog Configurations","text":""},{"location":"configurations/#single-instance","title":"Single Instance","text":"

Suppose you have one ClickHouse instance which is installed on 10.0.0.1 and exposes HTTP on port 8123.

Edit $SPARK_HOME/conf/spark-defaults.conf.

########################################\n## register a catalog named \"clickhouse\"\n########################################\nspark.sql.catalog.clickhouse                      xenon.clickhouse.ClickHouseCatalog\n\n################################################\n## basic configurations for \"clickhouse\" catalog\n################################################\nspark.sql.catalog.clickhouse.host                 10.0.0.1\nspark.sql.catalog.clickhouse.protocol             http\nspark.sql.catalog.clickhouse.http_port            8123\nspark.sql.catalog.clickhouse.user                 default\nspark.sql.catalog.clickhouse.password\nspark.sql.catalog.clickhouse.database             default\n\n###############################################################\n## custom options of clickhouse-client for \"clickhouse\" catalog\n###############################################################\nspark.sql.catalog.clickhouse.option.async         false\nspark.sql.catalog.clickhouse.option.client_name   spark\n

Then you can access ClickHouse table <ck_db>.<ck_table> from Spark SQL by using clickhouse.<ck_db>.<ck_table>.

"},{"location":"configurations/#cluster","title":"Cluster","text":"

For a ClickHouse cluster, give a unique catalog name to each instance.

Suppose you have two ClickHouse instances, one installed on 10.0.0.1 and exposes HTTP on port 8123 named clickhouse1, and another installed on 10.0.0.2 and exposes HTTP on port 8123 named clickhouse2.

Edit $SPARK_HOME/conf/spark-defaults.conf.

spark.sql.catalog.clickhouse1                xenon.clickhouse.ClickHouseCatalog\nspark.sql.catalog.clickhouse1.host           10.0.0.1\nspark.sql.catalog.clickhouse1.protocol       http\nspark.sql.catalog.clickhouse1.http_port      8123\nspark.sql.catalog.clickhouse1.user           default\nspark.sql.catalog.clickhouse1.password\nspark.sql.catalog.clickhouse1.database       default\nspark.sql.catalog.clickhouse1.option.async   false\n\nspark.sql.catalog.clickhouse2                xenon.clickhouse.ClickHouseCatalog\nspark.sql.catalog.clickhouse2.host           10.0.0.2\nspark.sql.catalog.clickhouse2.protocol       http\nspark.sql.catalog.clickhouse2.http_port      8123\nspark.sql.catalog.clickhouse2.user           default\nspark.sql.catalog.clickhouse2.password\nspark.sql.catalog.clickhouse2.database       default\nspark.sql.catalog.clickhouse2.option.async   false\n

Then you can access clickhouse1 table <ck_db>.<ck_table> from Spark SQL by clickhouse1.<ck_db>.<ck_table>, and access clickhouse2 table <ck_db>.<ck_table> by clickhouse2.<ck_db>.<ck_table>.

"},{"location":"configurations/#sql-configurations","title":"SQL Configurations","text":"

SQL Configurations can be overridden by SET <key>=<value> at runtime.

Key Default Description Since spark.clickhouse.ignoreUnsupportedTransform false ClickHouse supports using complex expressions as sharding keys or partition values, e.g. cityHash64(col_1, col_2), and those can not be supported by Spark now. If true, ignore the unsupported expressions, otherwise fail fast w/ an exception. Note, when spark.clickhouse.write.distributed.convertLocal is enabled, ignore unsupported sharding keys may corrupt the data. 0.4.0 spark.clickhouse.read.compression.codec lz4 The codec used to decompress data for reading. Supported codecs: none, lz4. 0.5.0 spark.clickhouse.read.distributed.convertLocal true When reading Distributed table, read local table instead of itself. If true, ignore spark.clickhouse.read.distributed.useClusterNodes. 0.1.0 spark.clickhouse.read.format json Serialize format for reading. Supported formats: json, binary 0.6.0 spark.clickhouse.read.runtimeFilter.enabled false Enable runtime filter for reading. 0.8.0 spark.clickhouse.read.splitByPartitionId true If true, construct input partition filter by virtual column _partition_id, instead of partition value. There are known bugs to assemble SQL predication by partition value. This feature requires ClickHouse Server v21.6+ 0.4.0 spark.clickhouse.useNullableQuerySchema false If true, mark all the fields of the query schema as nullable when executing CREATE/REPLACE TABLE ... AS SELECT ... on creating the table. Note, this configuration requires SPARK-43390(available in Spark 3.5), w/o this patch, it always acts as true. 0.8.0 spark.clickhouse.write.batchSize 10000 The number of records per batch on writing to ClickHouse. 0.1.0 spark.clickhouse.write.compression.codec lz4 The codec used to compress data for writing. Supported codecs: none, lz4. 0.3.0 spark.clickhouse.write.distributed.convertLocal false When writing Distributed table, write local table instead of itself. If true, ignore spark.clickhouse.write.distributed.useClusterNodes. 0.1.0 spark.clickhouse.write.distributed.useClusterNodes true Write to all nodes of cluster when writing Distributed table. 0.1.0 spark.clickhouse.write.format arrow Serialize format for writing. Supported formats: json, arrow 0.4.0 spark.clickhouse.write.localSortByKey true If true, do local sort by sort keys before writing. 0.3.0 spark.clickhouse.write.localSortByPartition If true, do local sort by partition before writing. If not set, it equals to spark.clickhouse.write.repartitionByPartition. 0.3.0 spark.clickhouse.write.maxRetry 3 The maximum number of write we will retry for a single batch write failed with retryable codes. 0.1.0 spark.clickhouse.write.repartitionByPartition true Whether to repartition data by ClickHouse partition keys to meet the distributions of ClickHouse table before writing. 0.3.0 spark.clickhouse.write.repartitionNum 0 Repartition data to meet the distributions of ClickHouse table is required before writing, use this conf to specific the repartition number, value less than 1 mean no requirement. 0.1.0 spark.clickhouse.write.repartitionStrictly false If true, Spark will strictly distribute incoming records across partitions to satisfy the required distribution before passing the records to the data source table on write. Otherwise, Spark may apply certain optimizations to speed up the query but break the distribution requirement. Note, this configuration requires SPARK-37523(available in Spark 3.4), w/o this patch, it always acts as true. 0.3.0 spark.clickhouse.write.retryInterval 10s The interval in seconds between write retry. 
0.1.0 spark.clickhouse.write.retryableErrorCodes 241 The retryable error codes returned by ClickHouse server when write failing. 0.1.0"},{"location":"configurations/01_catalog_configurations/","title":"01 catalog configurations","text":""},{"location":"configurations/01_catalog_configurations/#single-instance","title":"Single Instance","text":"

Suppose you have one ClickHouse instance which is installed on 10.0.0.1 and exposes HTTP on port 8123.

Edit $SPARK_HOME/conf/spark-defaults.conf.

########################################\n## register a catalog named \"clickhouse\"\n########################################\nspark.sql.catalog.clickhouse                      xenon.clickhouse.ClickHouseCatalog\n\n################################################\n## basic configurations for \"clickhouse\" catalog\n################################################\nspark.sql.catalog.clickhouse.host                 10.0.0.1\nspark.sql.catalog.clickhouse.protocol             http\nspark.sql.catalog.clickhouse.http_port            8123\nspark.sql.catalog.clickhouse.user                 default\nspark.sql.catalog.clickhouse.password\nspark.sql.catalog.clickhouse.database             default\n\n###############################################################\n## custom options of clickhouse-client for \"clickhouse\" catalog\n###############################################################\nspark.sql.catalog.clickhouse.option.async         false\nspark.sql.catalog.clickhouse.option.client_name   spark\n

Then you can access ClickHouse table <ck_db>.<ck_table> from Spark SQL by using clickhouse.<ck_db>.<ck_table>.

"},{"location":"configurations/01_catalog_configurations/#cluster","title":"Cluster","text":"

For a ClickHouse cluster, give a unique catalog name to each instance.

Suppose you have two ClickHouse instances, one installed on 10.0.0.1 and exposes HTTP on port 8123 named clickhouse1, and another installed on 10.0.0.2 and exposes HTTP on port 8123 named clickhouse2.

Edit $SPARK_HOME/conf/spark-defaults.conf.

spark.sql.catalog.clickhouse1                xenon.clickhouse.ClickHouseCatalog\nspark.sql.catalog.clickhouse1.host           10.0.0.1\nspark.sql.catalog.clickhouse1.protocol       http\nspark.sql.catalog.clickhouse1.http_port      8123\nspark.sql.catalog.clickhouse1.user           default\nspark.sql.catalog.clickhouse1.password\nspark.sql.catalog.clickhouse1.database       default\nspark.sql.catalog.clickhouse1.option.async   false\n\nspark.sql.catalog.clickhouse2                xenon.clickhouse.ClickHouseCatalog\nspark.sql.catalog.clickhouse2.host           10.0.0.2\nspark.sql.catalog.clickhouse2.protocol       http\nspark.sql.catalog.clickhouse2.http_port      8123\nspark.sql.catalog.clickhouse2.user           default\nspark.sql.catalog.clickhouse2.password\nspark.sql.catalog.clickhouse2.database       default\nspark.sql.catalog.clickhouse2.option.async   false\n

Then you can access clickhouse1 table <ck_db>.<ck_table> from Spark SQL by clickhouse1.<ck_db>.<ck_table>, and access clickhouse2 table <ck_db>.<ck_table> by clickhouse2.<ck_db>.<ck_table>.

"},{"location":"configurations/02_sql_configurations/","title":"02 sql configurations","text":"Key Default Description Since spark.clickhouse.ignoreUnsupportedTransform false ClickHouse supports using complex expressions as sharding keys or partition values, e.g. cityHash64(col_1, col_2), and those can not be supported by Spark now. If true, ignore the unsupported expressions, otherwise fail fast w/ an exception. Note, when spark.clickhouse.write.distributed.convertLocal is enabled, ignore unsupported sharding keys may corrupt the data. 0.4.0 spark.clickhouse.read.compression.codec lz4 The codec used to decompress data for reading. Supported codecs: none, lz4. 0.5.0 spark.clickhouse.read.distributed.convertLocal true When reading Distributed table, read local table instead of itself. If true, ignore spark.clickhouse.read.distributed.useClusterNodes. 0.1.0 spark.clickhouse.read.format json Serialize format for reading. Supported formats: json, binary 0.6.0 spark.clickhouse.read.runtimeFilter.enabled false Enable runtime filter for reading. 0.8.0 spark.clickhouse.read.splitByPartitionId true If true, construct input partition filter by virtual column _partition_id, instead of partition value. There are known bugs to assemble SQL predication by partition value. This feature requires ClickHouse Server v21.6+ 0.4.0 spark.clickhouse.useNullableQuerySchema false If true, mark all the fields of the query schema as nullable when executing CREATE/REPLACE TABLE ... AS SELECT ... on creating the table. Note, this configuration requires SPARK-43390(available in Spark 3.5), w/o this patch, it always acts as true. 0.8.0 spark.clickhouse.write.batchSize 10000 The number of records per batch on writing to ClickHouse. 0.1.0 spark.clickhouse.write.compression.codec lz4 The codec used to compress data for writing. Supported codecs: none, lz4. 0.3.0 spark.clickhouse.write.distributed.convertLocal false When writing Distributed table, write local table instead of itself. If true, ignore spark.clickhouse.write.distributed.useClusterNodes. 0.1.0 spark.clickhouse.write.distributed.useClusterNodes true Write to all nodes of cluster when writing Distributed table. 0.1.0 spark.clickhouse.write.format arrow Serialize format for writing. Supported formats: json, arrow 0.4.0 spark.clickhouse.write.localSortByKey true If true, do local sort by sort keys before writing. 0.3.0 spark.clickhouse.write.localSortByPartition If true, do local sort by partition before writing. If not set, it equals to spark.clickhouse.write.repartitionByPartition. 0.3.0 spark.clickhouse.write.maxRetry 3 The maximum number of write we will retry for a single batch write failed with retryable codes. 0.1.0 spark.clickhouse.write.repartitionByPartition true Whether to repartition data by ClickHouse partition keys to meet the distributions of ClickHouse table before writing. 0.3.0 spark.clickhouse.write.repartitionNum 0 Repartition data to meet the distributions of ClickHouse table is required before writing, use this conf to specific the repartition number, value less than 1 mean no requirement. 0.1.0 spark.clickhouse.write.repartitionStrictly false If true, Spark will strictly distribute incoming records across partitions to satisfy the required distribution before passing the records to the data source table on write. Otherwise, Spark may apply certain optimizations to speed up the query but break the distribution requirement. Note, this configuration requires SPARK-37523(available in Spark 3.4), w/o this patch, it always acts as true. 
0.3.0 spark.clickhouse.write.retryInterval 10s The interval in seconds between write retry. 0.1.0 spark.clickhouse.write.retryableErrorCodes 241 The retryable error codes returned by ClickHouse server when write failing. 0.1.0"},{"location":"developers/","title":"TODO","text":""},{"location":"developers/01_build_and_test/","title":"Build and Test","text":""},{"location":"developers/01_build_and_test/#build","title":"Build","text":"

Check out source code from GitHub

git clone https://github.com/housepower/spark-clickhouse-connector.git\n

Build w/o test

./gradlew clean build -x test\n

Go to spark-3.3/clickhouse-spark-runtime/build/libs/ to find the output jar clickhouse-spark-runtime-3.3_2.12-0.8.0-SNAPSHOT.jar.

"},{"location":"developers/01_build_and_test/#test","title":"Test","text":"

The project leverages Testcontainers and Docker Compose for integration tests. You should install Docker and Docker Compose before running tests, and check the Testcontainers documentation for more details if you'd like to run tests against a remote Docker daemon.

Run all tests

./gradlew clean test

Run a single test

./gradlew test --tests=ConvertDistToLocalWriteSuite

Test against custom ClickHouse image

CLICKHOUSE_IMAGE=custom-org/clickhouse-server:custom-tag ./gradlew test

"},{"location":"developers/02_docs_and_website/","title":"Docs and Website","text":""},{"location":"developers/02_docs_and_website/#setup-python","title":"Setup Python","text":"

Follow the official Python documentation to install it.

"},{"location":"developers/02_docs_and_website/#setup-pyenv-on-macos-optional","title":"Setup pyenv on macOS (optional)","text":"

Optionally, we recommend managing Python environments with pyenv.

Install from Homebrew

brew install pyenv pyenv-virtualenv\n

Setup in ~/.zshrc

eval \"$(pyenv init -)\"\neval \"$(pyenv virtualenv-init -)\"\n

Install virtualenv

pyenv install 3.9.13\npyenv virtualenv 3.9.13 scc\n

Localize virtualenv

pyenv local scc\n
"},{"location":"developers/02_docs_and_website/#install-dependencies","title":"Install dependencies","text":"
pip install -r requirements.txt\n
"},{"location":"developers/02_docs_and_website/#preview-website","title":"Preview website","text":"
mkdocs serve\n

Open http://127.0.0.1:8000/ in browser.

"},{"location":"developers/03_private_release/","title":"Private Release","text":"

Tip

Private Release means deploying to a private Nexus Repository. Please make sure you are granted access to your company's private Nexus Repository.

"},{"location":"developers/03_private_release/#repository-and-authentication","title":"Repository and Authentication","text":"

Configure Gradle in ~/.gradle/gradle.properties.

mavenUser=xxx\nmavenPassword=xxx\nmavenReleasesRepo=xxx\nmavenSnapshotsRepo=xxx\n
"},{"location":"developers/03_private_release/#upgrade-version","title":"Upgrade Version","text":"

Modify version in version.txt and docker/.env-dev

"},{"location":"developers/03_private_release/#build-and-deploy","title":"Build and Deploy","text":"

Publish to Maven Repository using ./gradlew publish

"},{"location":"developers/04_public_release/","title":"Public Release","text":"

Notice

Public Release means deploying to Maven Central. Only core team members are granted permission to deploy to the public repository.

Note

Most of the steps for a public release are done by the GitHub workflow.

"},{"location":"developers/04_public_release/#snapshot-release","title":"Snapshot Release","text":"

The daily snapshot release is managed by the Publish Snapshot workflow; it is scheduled to be deployed at midnight every day.

"},{"location":"developers/04_public_release/#feature-release","title":"Feature Release","text":"
  1. Cut new branch from master branch, e.g. branch-0.3;
  2. Update version in version.txt and docker/.env-dev, e.g. from 0.3.0-SNAPSHOT to 0.3.0;
  3. Create new tag, e.g. v0.3.0, it will trigger the Publish Release workflow;
  4. Verify, close, and release in Sonatype Repository
  5. Announce in GitHub Release
  6. Update version in version.txt and docker/.env-dev, e.g. from 0.3.0 to 0.3.1-SNAPSHOT;
  7. Update version on master branch in version.txt and docker/.env-dev, e.g. from 0.3.0-SNAPSHOT to 0.4.0-SNAPSHOT;
  8. Publish the Docker image after the jars are available in Maven Central; this generally takes a few minutes after step 3.
"},{"location":"developers/04_public_release/#patch-release","title":"Patch Release","text":"

Just omit step 1 and step 7 from the feature release process.

"},{"location":"internals/","title":"Overview Design","text":"

At a high level, Spark ClickHouse Connector is a connector built on top of Spark DataSource V2 and the ClickHouse HTTP protocol.

"},{"location":"internals/01_catalog/","title":"Catalog Management","text":"

One important end-user-facing feature of DataSource V2 is support for multiple catalogs.

In its early stages, Spark did not have a catalog concept; users typically relied on Hive Metastore or Glue to manage table metadata, so external DataSource tables had to be registered in the centralized metastore before they could be accessed.

In the centralized metastore model, a table is identified by <database>.<table>.

For example, register a MySQL table in the metastore, then access it using Spark SQL.

CREATE TABLE <db>.<tbl>\nUSING org.apache.spark.sql.jdbc\nOPTIONS (\n  url      \"jdbc:mysql://<mysql_host>:<mysql_port>\",\n  dbtable  \"<mysql_db>.<mysql_tbl>\",\n  user     \"<mysql_username>\",\n  password \"<mysql_password>\"\n);\n
SELECT * FROM <db>.<tbl>;\nINSERT INTO <db>.<tbl> SELECT ...\n

Things changed with DataSource V2: starting from Spark 3.0, the catalog concept was introduced, allowing Spark to discover tables automatically through registered catalog plugins.

The default catalog has a fixed name spark_catalog, and typically, a table is identified by <catalog>.<database>.<table>.

For example, we can register a PostgreSQL database as a Spark catalog named pg, and access it using Spark SQL.

# spark-defaults.conf\nspark.sql.catalog.pg=org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog\nspark.sql.catalog.pg.url=jdbc:postgresql://<pg_host>:<pg_port>/<pg_db>\nspark.sql.catalog.pg.driver=org.postgresql.Driver\nspark.sql.catalog.pg.user=<pg_username>\nspark.sql.catalog.pg.password=<pg_password>\n
SELECT * FROM pg.<db>.<tbl>;\nINSERT INTO pg.<db>.<tbl> SELECT ...\n
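Similarly, the ClickHouse catalog provided by this connector is registered the same way. The Scala sketch below wires it up programmatically at session build time; the host, port, and credentials are placeholder assumptions for a local instance, and the option keys mirror those used in the Quick Start.

import org.apache.spark.sql.SparkSession

// Register a ClickHouse catalog named "clickhouse"; the values below are local-default assumptions.
val spark = SparkSession.builder()
  .appName("clickhouse-catalog-demo")
  .config("spark.sql.catalog.clickhouse", "xenon.clickhouse.ClickHouseCatalog")
  .config("spark.sql.catalog.clickhouse.host", "127.0.0.1")
  .config("spark.sql.catalog.clickhouse.protocol", "http")
  .config("spark.sql.catalog.clickhouse.http_port", "8123")
  .config("spark.sql.catalog.clickhouse.user", "default")
  .config("spark.sql.catalog.clickhouse.password", "")
  .config("spark.sql.catalog.clickhouse.database", "default")
  .getOrCreate()

// Tables are then addressed as <catalog>.<database>.<table>.
spark.sql("SHOW NAMESPACES IN clickhouse").show()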
"},{"location":"internals/02_read/","title":"How reading of the connector works?","text":""},{"location":"internals/02_read/#push-down","title":"Push Down","text":"

Spark supports pushing down the processing of queries, or parts of queries, into the connected data source. This means that a specific predicate, aggregation function, or other operation could be passed through to ClickHouse for processing.

The results of this push down can include the following benefits:

  • Improved overall query performance

  • Reduced network traffic between Spark and ClickHouse

  • Reduced load on ClickHouse

These benefits often result in significant cost reduction.

The connector implements most push down interfaces defined by DataSource V2, such as SupportsPushDownLimit, SupportsPushDownFilters, SupportsPushDownAggregates, SupportsPushDownRequiredColumns.

The example below shows how SupportsPushDownAggregates and SupportsPushDownRequiredColumns work.
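For instance, here is a hedged sketch using the tbl table from the Quick Start (the table and column names are assumptions for illustration):

// With push down enabled, only the m column needs to be read (SupportsPushDownRequiredColumns)
// and the COUNT aggregation is evaluated by ClickHouse (SupportsPushDownAggregates),
// so Spark receives pre-aggregated rows instead of scanning the full table.
spark.sql("SELECT m, COUNT(*) AS cnt FROM clickhouse.test_db.tbl GROUP BY m").show()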

Push Down disabled

Push Down enabled"},{"location":"internals/02_read/#bucket-join","title":"Bucket Join","text":"

Sort merge join is a general solution for an inner join of two large tables: both tables are first shuffled by the join key, then locally sorted by the join key within each data partition, and finally a stream-stream-like lookup produces the final result.

In some cases, the tables are stored collocated by their join keys; with Storage-Partitioned Join (a.k.a. V2 Bucket Join), Spark can leverage the existing ClickHouse table layout to eliminate the expensive shuffle and sort operations.
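As a hedged illustration, suppose two ClickHouse tables are collocated on the join key id (the table names orders and payments below are hypothetical); when Storage-Partitioned Join kicks in, a query like the following can skip the shuffle and sort stages.

// Hypothetical collocated tables; whether the shuffle is actually eliminated depends on
// the Spark version (3.4+ for Storage-Partitioned Join) and the tables' physical layout.
spark.sql(
  """SELECT o.id, o.value, p.value
    |FROM clickhouse.test_db.orders o
    |JOIN clickhouse.test_db.payments p ON o.id = p.id
    |""".stripMargin).show()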

Sort Merge Join

Bucket Join"},{"location":"internals/03_write/","title":"How writing of the connector works?","text":"

The ClickHouse MergeTree engine uses an LSM-like format; it is not optimized for frequent, random record insertion, so batch append operations are recommended for ingesting large amounts of data.

So, to achieve better performance, the DataFrame should be re-organized to fit the ClickHouse data layout before inserting.

SPARK-23889 allows a data source connector to expose the sorting and clustering requirements of a DataFrame before writing. By default, for a Distributed table, this connector requires the DataFrame to be clustered by [sharding keys, partition keys] and sorted by [sharding keys, partition keys, ordering keys]; for a normal *MergeTree table, this connector requires the DataFrame to be clustered by [partition keys] and sorted by [partition keys, ordering keys].
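Conceptually, for the tbl table from the Quick Start (partitioned by m and ordered by id), the requested layout is as if the DataFrame had been reorganized manually before writing. The sketch below is illustrative only; the connector declares these requirements through the DataSource V2 write interfaces, so no manual repartitioning is needed, and df stands for any DataFrame matching the table's schema.

import org.apache.spark.sql.functions.col

df.repartition(col("m"))                       // cluster by the partition key
  .sortWithinPartitions(col("m"), col("id"))   // sort by partition key, then ordering key
  .writeTo("clickhouse.test_db.tbl")
  .append()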

Warning

Limitation: Spark does NOT support expressions in sharding keys and partition keys without SPARK-39607.

In some cases, the strict data distribution requirements may lead to low parallelism and data skew, and ultimately cause poor performance. SPARK-37523 (requires Spark 3.4+) was introduced to allow relaxing the data distribution requirements to overcome those shortcomings.

Also, you can consider disabling some configurations like spark.clickhouse.write.repartitionByPartition to avoid such performance degradation.
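For example, here is a hedged sketch of relaxing that setting before a write, assuming the option is read from the session's SQL conf at write time and df is a DataFrame matching the target table's schema:

// Relax the write-time reorganization if it leads to low parallelism or data skew.
spark.conf.set("spark.clickhouse.write.repartitionByPartition", "false")
df.writeTo("clickhouse.test_db.tbl").append()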

"},{"location":"quick_start/01_get_the_library/","title":"Get the Library","text":""},{"location":"quick_start/01_get_the_library/#download-the-library","title":"Download the Library","text":"

The naming pattern of the binary jar is

clickhouse-spark-runtime-${spark_binary_version}_${scala_binary_version}-${version}.jar\n

You can find all available release jars in the Maven Central Repository and all daily-built SNAPSHOT jars in the Sonatype OSS Snapshots Repository.
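For example, the release jar for Spark 3.3 and Scala 2.12 at connector version 0.7.2 is named clickhouse-spark-runtime-3.3_2.12-0.7.2.jar.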

"},{"location":"quick_start/01_get_the_library/#import-as-dependency","title":"Import as Dependency","text":""},{"location":"quick_start/01_get_the_library/#gradle","title":"Gradle","text":"
dependencies {\n  implementation(\"com.github.housepower:clickhouse-spark-runtime-3.3_2.12:0.7.2\")\n  implementation(\"com.clickhouse:clickhouse-jdbc:0.4.5:all\") { transitive = false }\n}\n

Add the following repository if you want to use a SNAPSHOT version.

repositories {\n  maven { url = \"https://oss.sonatype.org/content/repositories/snapshots\" }\n}\n
"},{"location":"quick_start/01_get_the_library/#maven","title":"Maven","text":"
<dependency>\n  <groupId>com.github.housepower</groupId>\n  <artifactId>clickhouse-spark-runtime-3.3_2.12</artifactId>\n  <version>0.7.2</version>\n</dependency>\n<dependency>\n  <groupId>com.clickhouse</groupId>\n  <artifactId>clickhouse-jdbc</artifactId>\n  <classifier>all</classifier>\n  <version>0.4.5</version>\n  <exclusions>\n    <exclusion>\n      <groupId>*</groupId>\n      <artifactId>*</artifactId>\n    </exclusion>\n  </exclusions>\n</dependency>\n

Add the following repository if you want to use a SNAPSHOT version.

<repositories>\n  <repository>\n    <id>sonatype-oss-snapshots</id>\n    <name>Sonatype OSS Snapshots Repository</name>\n    <url>https://oss.sonatype.org/content/repositories/snapshots</url>\n  </repository>\n</repositories>\n
"},{"location":"quick_start/02_play_with_spark_sql/","title":"Play with Spark SQL","text":"

Note: For SQL-only use cases, Apache Kyuubi is recommended for production.

"},{"location":"quick_start/02_play_with_spark_sql/#launch-spark-sql-cli","title":"Launch Spark SQL CLI","text":"
$SPARK_HOME/bin/spark-sql \\\n  --conf spark.sql.catalog.clickhouse=xenon.clickhouse.ClickHouseCatalog \\\n  --conf spark.sql.catalog.clickhouse.host=${CLICKHOUSE_HOST:-127.0.0.1} \\\n  --conf spark.sql.catalog.clickhouse.protocol=http \\\n  --conf spark.sql.catalog.clickhouse.http_port=${CLICKHOUSE_HTTP_PORT:-8123} \\\n  --conf spark.sql.catalog.clickhouse.user=${CLICKHOUSE_USER:-default} \\\n  --conf spark.sql.catalog.clickhouse.password=${CLICKHOUSE_PASSWORD:-} \\\n  --conf spark.sql.catalog.clickhouse.database=default \\\n  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar\n

The following argument

  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar\n

can be replaced by

  --repositories https://{maven-central-mirror or private-nexus-repo} \\\n  --packages com.github.housepower:clickhouse-spark-runtime-3.3_2.12:0.7.2,com.clickhouse:clickhouse-jdbc:0.4.5:all\n

to avoid copying the jars to your Spark client node.

"},{"location":"quick_start/02_play_with_spark_sql/#operations","title":"Operations","text":"

Basic operations, e.g. create database, create table, write table, read table, etc.

spark-sql> use clickhouse;\nTime taken: 0.016 seconds\n\nspark-sql> create database if not exists test_db;\nTime taken: 0.022 seconds\n\nspark-sql> show databases;\ndefault\nsystem\ntest_db\nTime taken: 0.289 seconds, Fetched 3 row(s)\n\nspark-sql> CREATE TABLE test_db.tbl_sql (\n         >   create_time TIMESTAMP NOT NULL,\n         >   m           INT       NOT NULL COMMENT 'part key',\n         >   id          BIGINT    NOT NULL COMMENT 'sort key',\n         >   value       STRING\n         > ) USING ClickHouse\n         > PARTITIONED BY (m)\n         > TBLPROPERTIES (\n         >   engine = 'MergeTree()',\n         >   order_by = 'id',\n         >   settings.index_granularity = 8192\n         > );\nTime taken: 0.242 seconds\n\nspark-sql> insert into test_db.tbl_sql values\n         > (timestamp'2021-01-01 10:10:10', 1, 1L, '1'),\n         > (timestamp'2022-02-02 10:10:10', 2, 2L, '2')\n         > as tabl(create_time, m, id, value);\nTime taken: 0.276 seconds\n\nspark-sql> select * from test_db.tbl_sql;\n2021-01-01 10:10:10 1   1   1\n2022-02-02 10:10:10 2   2   2\nTime taken: 0.116 seconds, Fetched 2 row(s)\n\nspark-sql> insert into test_db.tbl_sql select * from test_db.tbl_sql;\nTime taken: 1.028 seconds\n\nspark-sql> insert into test_db.tbl_sql select * from test_db.tbl_sql;\nTime taken: 0.462 seconds\n\nspark-sql> select count(*) from test_db.tbl_sql;\n6\nTime taken: 1.421 seconds, Fetched 1 row(s)\n\nspark-sql> select * from test_db.tbl_sql;\n2021-01-01 10:10:10 1   1   1\n2021-01-01 10:10:10 1   1   1\n2021-01-01 10:10:10 1   1   1\n2022-02-02 10:10:10 2   2   2\n2022-02-02 10:10:10 2   2   2\n2022-02-02 10:10:10 2   2   2\nTime taken: 0.123 seconds, Fetched 6 row(s)\n\nspark-sql> delete from test_db.tbl_sql where id = 1;\nTime taken: 0.129 seconds\n\nspark-sql> select * from test_db.tbl_sql;\n2022-02-02 10:10:10 2   2   2\n2022-02-02 10:10:10 2   2   2\n2022-02-02 10:10:10 2   2   2\nTime taken: 0.101 seconds, Fetched 3 row(s)\n
"},{"location":"quick_start/03_play_with_spark_shell/","title":"Play with Spark Shell","text":""},{"location":"quick_start/03_play_with_spark_shell/#launch-spark-shell","title":"Launch Spark Shell","text":"
$SPARK_HOME/bin/spark-shell \\\n  --conf spark.sql.catalog.clickhouse=xenon.clickhouse.ClickHouseCatalog \\\n  --conf spark.sql.catalog.clickhouse.host=${CLICKHOUSE_HOST:-127.0.0.1} \\\n  --conf spark.sql.catalog.clickhouse.protocol=http \\\n  --conf spark.sql.catalog.clickhouse.http_port=${CLICKHOUSE_HTTP_PORT:-8123} \\\n  --conf spark.sql.catalog.clickhouse.user=${CLICKHOUSE_USER:-default} \\\n  --conf spark.sql.catalog.clickhouse.password=${CLICKHOUSE_PASSWORD:-} \\\n  --conf spark.sql.catalog.clickhouse.database=default \\\n  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar\n

The following argument

  --jars /path/clickhouse-spark-runtime-3.3_2.12-0.7.2.jar,/path/clickhouse-jdbc-0.4.5-all.jar\n

can be replaced by

  --repositories https://{maven-central-mirror or private-nexus-repo} \\\n  --packages com.github.housepower:clickhouse-spark-runtime-3.3_2.12:0.7.2,com.clickhouse:clickhouse-jdbc:0.4.5:all\n

to avoid copying the jars to your Spark client node.

"},{"location":"quick_start/03_play_with_spark_shell/#operations","title":"Operations","text":"

Basic operations, e.g. create database, create table, write table, read table, etc.

scala> spark.sql(\"use clickhouse\")\nres0: org.apache.spark.sql.DataFrame = []\n\nscala> spark.sql(\"create database test_db\")\nres1: org.apache.spark.sql.DataFrame = []\n\nscala> spark.sql(\"show databases\").show\n+---------+\n|namespace|\n+---------+\n|  default|\n|   system|\n|  test_db|\n+---------+\n\nscala> spark.sql(\"\"\"\n     | CREATE TABLE test_db.tbl (\n     |   create_time TIMESTAMP NOT NULL,\n     |   m           INT       NOT NULL COMMENT 'part key',\n     |   id          BIGINT    NOT NULL COMMENT 'sort key',\n     |   value       STRING\n     | ) USING ClickHouse\n     | PARTITIONED BY (m)\n     | TBLPROPERTIES (\n     |   engine = 'MergeTree()',\n     |   order_by = 'id',\n     |   settings.index_granularity = 8192\n     | )\n     | \"\"\")\nres2: org.apache.spark.sql.DataFrame = []\n\nscala> :paste\n// Entering paste mode (ctrl-D to finish)\n\nspark.createDataFrame(Seq(\n    (\"2021-01-01 10:10:10\", 1L, \"1\"),\n    (\"2022-02-02 10:10:10\", 2L, \"2\")\n)).toDF(\"create_time\", \"id\", \"value\")\n    .withColumn(\"create_time\", to_timestamp($\"create_time\"))\n    .withColumn(\"m\", month($\"create_time\"))\n    .select($\"create_time\", $\"m\", $\"id\", $\"value\")\n    .writeTo(\"test_db.tbl\")\n    .append\n\n// Exiting paste mode, now interpreting.\n\nscala> spark.table(\"test_db.tbl\").show\n+-------------------+---+---+-----+\n|        create_time|  m| id|value|\n+-------------------+---+---+-----+\n|2021-01-01 10:10:10|  1|  1|    1|\n|2022-02-02 10:10:10|  2|  2|    2|\n+-------------------+---+---+-----+\n\nscala> spark.sql(\"DELETE FROM test_db.tbl WHERE id=1\")\nres3: org.apache.spark.sql.DataFrame = []\n\nscala> spark.table(\"test_db.tbl\").show\n+-------------------+---+---+-----+\n|        create_time|  m| id|value|\n+-------------------+---+---+-----+\n|2022-02-02 10:10:10|  2|  2|    2|\n+-------------------+---+---+-----+\n

Execute ClickHouse native SQL.

scala> val options = Map(\n     |     \"host\" -> \"clickhouse\",\n     |     \"protocol\" -> \"http\",\n     |     \"http_port\" -> \"8123\",\n     |     \"user\" -> \"default\",\n     |     \"password\" -> \"\"\n     | )\n\nscala> val sql = \"\"\"\n     | |CREATE TABLE test_db.person (\n     | |  id    Int64,\n     | |  name  String,\n     | |  age Nullable(Int32)\n     | |)\n     | |ENGINE = MergeTree()\n     | |ORDER BY id\n     | \"\"\".stripMargin\n\nscala> spark.executeCommand(\"xenon.clickhouse.ClickHouseCommandRunner\", sql, options) \n\nscala> spark.sql(\"show tables in clickhouse_s1r1.test_db\").show\n+---------+---------+-----------+\n|namespace|tableName|isTemporary|\n+---------+---------+-----------+\n|  test_db|   person|      false|\n+---------+---------+-----------+\n\nscala> spark.table(\"clickhouse_s1r1.test_db.person\").printSchema\nroot\n |-- id: long (nullable = false)\n |-- name: string (nullable = false)\n |-- age: integer (nullable = true)\n
"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 00000000..a1c67ef1 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,93 @@ + + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + + None + 2023-10-23 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 00000000..31f61501 Binary files /dev/null and b/sitemap.xml.gz differ