diff --git a/dapla-manual/notebooks/spark/deltalake-intro.ipynb b/dapla-manual/notebooks/spark/deltalake-intro.ipynb index 328eaebf..d781f68e 100644 --- a/dapla-manual/notebooks/spark/deltalake-intro.ipynb +++ b/dapla-manual/notebooks/spark/deltalake-intro.ipynb @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "7398549f-f7cf-4298-8254-ece4a08fa5d9", "metadata": { "tags": [] @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e037f70d-e6f8-4fd3-8a18-8b358ca9613f", "metadata": { "tags": [] @@ -83,12 +83,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "5c5dd7d8-b1de-477b-8521-637698cc0f48", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 10|\n", + "| 11|\n", + "| 12|\n", + "| 13|\n", + "| 14|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "# Genererer noe data med Spark\n", "data = spark.range(10, 15)\n", @@ -107,12 +124,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "b5e9dc42-599f-414b-ab95-c7a6e94780c3", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.38 ms, sys: 1.7 ms, total: 5.08 ms\n", + "Wall time: 5.59 s\n" + ] + } + ], "source": [ "%%time\n", "data.write.format(\"delta\").mode(\"overwrite\").save(\n", @@ -130,12 +156,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "cb0a2616-f5cb-4648-ae6a-f11b6d0e6093", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ssb-dapla-felles-data-delt-prod/temp4/_delta_log',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000000.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-9b3b81a9-2771-4fb4-9f0f-659fd160d643-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00001-0f2f8ba5-3161-41e8-b5d1-2084128a5bed-c000.snappy.parquet']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from dapla import FileClient\n", "\n", @@ -180,12 +221,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "bc78bc00-1ef9-49d9-b8a0-87b3a7ec74ab", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 13|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable = DeltaTable.forPath(spark, \"gs://ssb-dapla-felles-data-delt-prod/temp4\")\n", "deltaTable.toDF().show()" @@ -203,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "42699f66-ed3a-4fbb-ab54-a5f6874b0a54", "metadata": { "tags": [] @@ -226,12 +284,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "2b10a295-843b-41e8-b407-f8fe1daf275b", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 15|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable2 = DeltaTable.forPath(spark, \"gs://ssb-dapla-felles-data-delt-prod/temp4\")\n", "deltaTable2.toDF().show()" @@ -247,12 +322,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "4258ccf3-6701-496c-baea-1bbff4274e5e", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 15|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable.toDF().show()" ] @@ -275,12 +367,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "4c64e127-54ed-4a2b-8d35-a052faf985d0", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 20|\n", + "| 21|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "new_data = [(20,), (21,)]\n", "new_df = spark.createDataFrame(new_data, [\"id\"])\n", @@ -297,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "d3e6b418-3cb7-4360-bbd0-e09d7f82faf0", "metadata": { "tags": [] @@ -311,12 +417,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "a1182c79-36a5-4d4d-beb0-a72b3d377caf", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 15|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "| 21|\n", + "| 20|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable.toDF().show()" ] @@ -333,12 +458,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "f805cfdb-9081-4d6a-8840-48898aaa3956", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ssb-dapla-felles-data-delt-prod/temp4/_delta_log',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000000.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000001.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000002.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-73e5052f-1b82-48da-ab37-2cbc01bb46c1-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-9b3b81a9-2771-4fb4-9f0f-659fd160d643-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-d04d0ca2-8e8b-42e9-a8a3-0fed9a0e4e41-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00001-0f2f8ba5-3161-41e8-b5d1-2084128a5bed-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00001-30d707e4-dd9a-4bfd-a4c7-7fbb1933e9ae-c000.snappy.parquet']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Lister ut filene i bøtta\n", "fs = FileClient.get_gcs_file_system()\n", @@ -355,12 +500,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "df9e687c-b0f3-4c5b-9327-6619196e5348", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"commitInfo\": {\n", + " \"timestamp\": 1696942544879,\n", + " \"operation\": \"WRITE\",\n", + " \"operationParameters\": {\n", + " \"mode\": \"Append\",\n", + " \"partitionBy\": \"[]\"\n", + " },\n", + " \"readVersion\": 1,\n", + " \"isolationLevel\": \"Serializable\",\n", + " \"isBlindAppend\": true,\n", + " \"operationMetrics\": {\n", + " \"numFiles\": \"2\",\n", + " \"numOutputRows\": \"2\",\n", + " \"numOutputBytes\": \"956\"\n", + " },\n", + " \"engineInfo\": \"Apache-Spark/3.3.1 Delta-Lake/2.3.0\",\n", + " \"txnId\": \"a3dcd582-8362-4fc2-a8ce-57613d2eb2b8\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00000-73e5052f-1b82-48da-ab37-2cbc01bb46c1-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 478,\n", + " \"modificationTime\": 1696942544755,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":1,\\\"minValues\\\":{\\\"id\\\":20},\\\"maxValues\\\":{\\\"id\\\":20},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00001-30d707e4-dd9a-4bfd-a4c7-7fbb1933e9ae-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 478,\n", + " \"modificationTime\": 1696942544833,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":1,\\\"minValues\\\":{\\\"id\\\":21},\\\"maxValues\\\":{\\\"id\\\":21},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n" + ] + } + ], "source": [ "import json\n", "\n", @@ -390,12 +585,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "71ff024b-ebee-409e-943f-7625b7d78a8a", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+\n", + "|version| timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend| operationMetrics|userMetadata| engineInfo|\n", + "+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+\n", + "| 2|2023-10-10 12:55:...| null| null| WRITE|{mode -> Append, ...|null| null| null| 1| Serializable| true|{numFiles -> 2, n...| null|Apache-Spark/3.3....|\n", + "| 1|2023-10-10 12:55:...| null| null| UPDATE|{predicate -> (id...|null| null| null| 0| Serializable| false|{numRemovedFiles ...| null|Apache-Spark/3.3....|\n", + "| 0|2023-10-10 12:55:...| null| null| WRITE|{mode -> Overwrit...|null| null| null| null| Serializable| false|{numFiles -> 2, n...| null|Apache-Spark/3.3....|\n", + "+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+\n", + "\n" + ] + } + ], "source": [ "history = deltaTable.history()\n", "history.show()" @@ -411,12 +621,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "a596ee4f-5773-4f24-a1fb-621574f4e1ba", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['version',\n", + " 'timestamp',\n", + " 'userId',\n", + " 'userName',\n", + " 'operation',\n", + " 'operationParameters',\n", + " 'job',\n", + " 'notebook',\n", + " 'clusterId',\n", + " 'readVersion',\n", + " 'isolationLevel',\n", + " 'isBlindAppend',\n", + " 'operationMetrics',\n", + " 'userMetadata',\n", + " 'engineInfo']" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Oversikt over alle kolonner som finnes i historien\n", "history.columns" @@ -424,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "1baa4548-8ab3-406b-a76e-ea9b4191bf71", "metadata": { "tags": [] @@ -439,12 +674,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "5178957e-f735-4434-8b1b-d94a9d77ee93", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+-----------------------+---------+--------------------------------------+\n", + "|version| timestamp|operation| operationParameters|\n", + "+-------+-----------------------+---------+--------------------------------------+\n", + "| 2|2023-10-10 12:55:45.014| WRITE| {mode -> Append, partitionBy -> []}|\n", + "| 1|2023-10-10 12:55:37.054| UPDATE| {predicate -> (id#4452L = 13)}|\n", + "| 0|2023-10-10 12:55:29.048| WRITE|{mode -> Overwrite, partitionBy -> []}|\n", + "+-------+-----------------------+---------+--------------------------------------+\n", + "\n" + ] + } + ], "source": [ "# Display the selected columns\n", "selected_history.show(truncate=50)" @@ -462,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "80128e45-f7a3-4050-b34e-539b814e4f45", "metadata": { "tags": [] @@ -475,12 +725,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "b2ec5ce9-87d2-41c0-ac51-5cf425b95be3", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'comment': 'Kontaktet oppgavegiver og kranglet!',\n", + " 'manueltEditert': 'True',\n", + " 'maskineltEditert': 'False'}" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "#| label: show-meta\n", "#| code-fold: true\n", @@ -506,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "2f5ae1c3-64b8-4864-b174-443481aa5364", "metadata": { "tags": [] @@ -523,7 +786,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "66e15b79-7bd1-44ec-a278-e5f758122ea6", "metadata": { "tags": [] @@ -539,12 +802,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "da236532-cacb-4e23-8669-f323944a2326", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+----------+---------+-------------------+------------+\n", + "|version| timestamp|operation|operationParameters|userMetadata|\n", + "+-------+----------+---------+-------------------+------------+\n", + "| 3|2023-10...| WRITE| {mode -...| {\"comme...|\n", + "| 2|2023-10...| WRITE| {mode -...| null|\n", + "| 1|2023-10...| UPDATE| {predic...| null|\n", + "| 0|2023-10...| WRITE| {mode -...| null|\n", + "+-------+----------+---------+-------------------+------------+\n", + "\n" + ] + } + ], "source": [ "# Show the operation details, including metadata\n", "history_df.select(\n", @@ -554,12 +833,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "8004531b-c729-4f63-9495-1bd45adce7ea", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------------------------------------+\n", + "|version| userMetadata|\n", + "+-------+--------------------------------------------------+\n", + "| 3|{\"comment\": \"Kontaktet oppgavegiver og kranglet...|\n", + "| 2| null|\n", + "| 1| null|\n", + "| 0| null|\n", + "+-------+--------------------------------------------------+\n", + "\n" + ] + } + ], "source": [ "history_df.select(\"version\", \"userMetadata\").show(truncate=50)" ] @@ -574,12 +869,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "f5099b8b-408b-420d-828d-6689c746160f", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"commitInfo\": {\n", + " \"timestamp\": 1696942553907,\n", + " \"operation\": \"WRITE\",\n", + " \"operationParameters\": {\n", + " \"mode\": \"Append\",\n", + " \"partitionBy\": \"[]\"\n", + " },\n", + " \"readVersion\": 2,\n", + " \"isolationLevel\": \"Serializable\",\n", + " \"isBlindAppend\": false,\n", + " \"operationMetrics\": {\n", + " \"numFiles\": \"2\",\n", + " \"numOutputRows\": \"7\",\n", + " \"numOutputBytes\": \"989\"\n", + " },\n", + " \"userMetadata\": \"{\\\"comment\\\": \\\"Kontaktet oppgavegiver og kranglet!\\\", \\\"manueltEditert\\\": \\\"True\\\", \\\"maskineltEditert\\\": \\\"False\\\"}\",\n", + " \"engineInfo\": \"Apache-Spark/3.3.1 Delta-Lake/2.3.0\",\n", + " \"txnId\": \"e7de92bf-b0f9-4341-8bbb-9b382f2f3eb6\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00000-96369f3d-fe4a-4365-a0df-00c813027399-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 503,\n", + " \"modificationTime\": 1696942553856,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":5,\\\"minValues\\\":{\\\"id\\\":10},\\\"maxValues\\\":{\\\"id\\\":15},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00001-0f1bc8e6-093b-49a9-ad0b-78d5a148cfb6-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 486,\n", + " \"modificationTime\": 1696942553853,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":2,\\\"minValues\\\":{\\\"id\\\":20},\\\"maxValues\\\":{\\\"id\\\":21},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n" + ] + } + ], "source": [ "from dapla import FileClient\n", "\n", diff --git a/dapla-manual/notebooks/spark/pyspark-intro.ipynb b/dapla-manual/notebooks/spark/pyspark-intro.ipynb index 3951d546..d019fa79 100644 --- a/dapla-manual/notebooks/spark/pyspark-intro.ipynb +++ b/dapla-manual/notebooks/spark/pyspark-intro.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "9a3d798c-570d-4112-99ba-b3f4d694ca80", "metadata": { "tags": [] @@ -57,12 +57,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "b10a7561-5cf0-47af-9e34-c07974449c8c", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
SparkContext
\n", + "\n", + " \n", + "\n", + "v3.3.1
local[*]
pyspark-shell