diff --git a/dapla-manual/notebooks/spark/deltalake-intro.ipynb b/dapla-manual/notebooks/spark/deltalake-intro.ipynb index 328eaebf..d781f68e 100644 --- a/dapla-manual/notebooks/spark/deltalake-intro.ipynb +++ b/dapla-manual/notebooks/spark/deltalake-intro.ipynb @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "7398549f-f7cf-4298-8254-ece4a08fa5d9", "metadata": { "tags": [] @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e037f70d-e6f8-4fd3-8a18-8b358ca9613f", "metadata": { "tags": [] @@ -83,12 +83,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "5c5dd7d8-b1de-477b-8521-637698cc0f48", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 10|\n", + "| 11|\n", + "| 12|\n", + "| 13|\n", + "| 14|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "# Genererer noe data med Spark\n", "data = spark.range(10, 15)\n", @@ -107,12 +124,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "b5e9dc42-599f-414b-ab95-c7a6e94780c3", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.38 ms, sys: 1.7 ms, total: 5.08 ms\n", + "Wall time: 5.59 s\n" + ] + } + ], "source": [ "%%time\n", "data.write.format(\"delta\").mode(\"overwrite\").save(\n", @@ -130,12 +156,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "cb0a2616-f5cb-4648-ae6a-f11b6d0e6093", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ssb-dapla-felles-data-delt-prod/temp4/_delta_log',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000000.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-9b3b81a9-2771-4fb4-9f0f-659fd160d643-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00001-0f2f8ba5-3161-41e8-b5d1-2084128a5bed-c000.snappy.parquet']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from dapla import FileClient\n", "\n", @@ -180,12 +221,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "bc78bc00-1ef9-49d9-b8a0-87b3a7ec74ab", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 13|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable = DeltaTable.forPath(spark, \"gs://ssb-dapla-felles-data-delt-prod/temp4\")\n", "deltaTable.toDF().show()" @@ -203,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "42699f66-ed3a-4fbb-ab54-a5f6874b0a54", "metadata": { "tags": [] @@ -226,12 +284,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "2b10a295-843b-41e8-b407-f8fe1daf275b", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 15|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable2 = DeltaTable.forPath(spark, \"gs://ssb-dapla-felles-data-delt-prod/temp4\")\n", "deltaTable2.toDF().show()" @@ -247,12 +322,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "4258ccf3-6701-496c-baea-1bbff4274e5e", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 15|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable.toDF().show()" ] @@ -275,12 +367,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "4c64e127-54ed-4a2b-8d35-a052faf985d0", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 20|\n", + "| 21|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "new_data = [(20,), (21,)]\n", "new_df = spark.createDataFrame(new_data, [\"id\"])\n", @@ -297,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "d3e6b418-3cb7-4360-bbd0-e09d7f82faf0", "metadata": { "tags": [] @@ -311,12 +417,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "a1182c79-36a5-4d4d-beb0-a72b3d377caf", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "| id|\n", + "+---+\n", + "| 12|\n", + "| 15|\n", + "| 14|\n", + "| 10|\n", + "| 11|\n", + "| 21|\n", + "| 20|\n", + "+---+\n", + "\n" + ] + } + ], "source": [ "deltaTable.toDF().show()" ] @@ -333,12 +458,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "f805cfdb-9081-4d6a-8840-48898aaa3956", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ssb-dapla-felles-data-delt-prod/temp4/_delta_log',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000000.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000001.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/_delta_log/00000000000000000002.json',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-73e5052f-1b82-48da-ab37-2cbc01bb46c1-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-9b3b81a9-2771-4fb4-9f0f-659fd160d643-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00000-d04d0ca2-8e8b-42e9-a8a3-0fed9a0e4e41-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00001-0f2f8ba5-3161-41e8-b5d1-2084128a5bed-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp4/part-00001-30d707e4-dd9a-4bfd-a4c7-7fbb1933e9ae-c000.snappy.parquet']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Lister ut filene i bøtta\n", "fs = FileClient.get_gcs_file_system()\n", @@ -355,12 +500,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "df9e687c-b0f3-4c5b-9327-6619196e5348", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"commitInfo\": {\n", + " \"timestamp\": 1696942544879,\n", + " \"operation\": \"WRITE\",\n", + " \"operationParameters\": {\n", + " \"mode\": \"Append\",\n", + " \"partitionBy\": \"[]\"\n", + " },\n", + " \"readVersion\": 1,\n", + " \"isolationLevel\": \"Serializable\",\n", + " \"isBlindAppend\": true,\n", + " \"operationMetrics\": {\n", + " \"numFiles\": \"2\",\n", + " \"numOutputRows\": \"2\",\n", + " \"numOutputBytes\": \"956\"\n", + " },\n", + " \"engineInfo\": \"Apache-Spark/3.3.1 Delta-Lake/2.3.0\",\n", + " \"txnId\": \"a3dcd582-8362-4fc2-a8ce-57613d2eb2b8\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00000-73e5052f-1b82-48da-ab37-2cbc01bb46c1-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 478,\n", + " \"modificationTime\": 1696942544755,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":1,\\\"minValues\\\":{\\\"id\\\":20},\\\"maxValues\\\":{\\\"id\\\":20},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00001-30d707e4-dd9a-4bfd-a4c7-7fbb1933e9ae-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 478,\n", + " \"modificationTime\": 1696942544833,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":1,\\\"minValues\\\":{\\\"id\\\":21},\\\"maxValues\\\":{\\\"id\\\":21},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n" + ] + } + ], "source": [ "import json\n", "\n", @@ -390,12 +585,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "71ff024b-ebee-409e-943f-7625b7d78a8a", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+\n", + "|version| timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend| operationMetrics|userMetadata| engineInfo|\n", + "+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+\n", + "| 2|2023-10-10 12:55:...| null| null| WRITE|{mode -> Append, ...|null| null| null| 1| Serializable| true|{numFiles -> 2, n...| null|Apache-Spark/3.3....|\n", + "| 1|2023-10-10 12:55:...| null| null| UPDATE|{predicate -> (id...|null| null| null| 0| Serializable| false|{numRemovedFiles ...| null|Apache-Spark/3.3....|\n", + "| 0|2023-10-10 12:55:...| null| null| WRITE|{mode -> Overwrit...|null| null| null| null| Serializable| false|{numFiles -> 2, n...| null|Apache-Spark/3.3....|\n", + "+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+\n", + "\n" + ] + } + ], "source": [ "history = deltaTable.history()\n", "history.show()" @@ -411,12 +621,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "a596ee4f-5773-4f24-a1fb-621574f4e1ba", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['version',\n", + " 'timestamp',\n", + " 'userId',\n", + " 'userName',\n", + " 'operation',\n", + " 'operationParameters',\n", + " 'job',\n", + " 'notebook',\n", + " 'clusterId',\n", + " 'readVersion',\n", + " 'isolationLevel',\n", + " 'isBlindAppend',\n", + " 'operationMetrics',\n", + " 'userMetadata',\n", + " 'engineInfo']" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Oversikt over alle kolonner som finnes i historien\n", "history.columns" @@ -424,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "1baa4548-8ab3-406b-a76e-ea9b4191bf71", "metadata": { "tags": [] @@ -439,12 +674,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "5178957e-f735-4434-8b1b-d94a9d77ee93", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+-----------------------+---------+--------------------------------------+\n", + "|version| timestamp|operation| operationParameters|\n", + "+-------+-----------------------+---------+--------------------------------------+\n", + "| 2|2023-10-10 12:55:45.014| WRITE| {mode -> Append, partitionBy -> []}|\n", + "| 1|2023-10-10 12:55:37.054| UPDATE| {predicate -> (id#4452L = 13)}|\n", + "| 0|2023-10-10 12:55:29.048| WRITE|{mode -> Overwrite, partitionBy -> []}|\n", + "+-------+-----------------------+---------+--------------------------------------+\n", + "\n" + ] + } + ], "source": [ "# Display the selected columns\n", "selected_history.show(truncate=50)" @@ -462,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "80128e45-f7a3-4050-b34e-539b814e4f45", "metadata": { "tags": [] @@ -475,12 +725,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "b2ec5ce9-87d2-41c0-ac51-5cf425b95be3", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'comment': 'Kontaktet oppgavegiver og kranglet!',\n", + " 'manueltEditert': 'True',\n", + " 'maskineltEditert': 'False'}" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "#| label: show-meta\n", "#| code-fold: true\n", @@ -506,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "2f5ae1c3-64b8-4864-b174-443481aa5364", "metadata": { "tags": [] @@ -523,7 +786,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "66e15b79-7bd1-44ec-a278-e5f758122ea6", "metadata": { "tags": [] @@ -539,12 +802,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "da236532-cacb-4e23-8669-f323944a2326", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+----------+---------+-------------------+------------+\n", + "|version| timestamp|operation|operationParameters|userMetadata|\n", + "+-------+----------+---------+-------------------+------------+\n", + "| 3|2023-10...| WRITE| {mode -...| {\"comme...|\n", + "| 2|2023-10...| WRITE| {mode -...| null|\n", + "| 1|2023-10...| UPDATE| {predic...| null|\n", + "| 0|2023-10...| WRITE| {mode -...| null|\n", + "+-------+----------+---------+-------------------+------------+\n", + "\n" + ] + } + ], "source": [ "# Show the operation details, including metadata\n", "history_df.select(\n", @@ -554,12 +833,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "8004531b-c729-4f63-9495-1bd45adce7ea", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------------------------------------+\n", + "|version| userMetadata|\n", + "+-------+--------------------------------------------------+\n", + "| 3|{\"comment\": \"Kontaktet oppgavegiver og kranglet...|\n", + "| 2| null|\n", + "| 1| null|\n", + "| 0| null|\n", + "+-------+--------------------------------------------------+\n", + "\n" + ] + } + ], "source": [ "history_df.select(\"version\", \"userMetadata\").show(truncate=50)" ] @@ -574,12 +869,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "f5099b8b-408b-420d-828d-6689c746160f", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"commitInfo\": {\n", + " \"timestamp\": 1696942553907,\n", + " \"operation\": \"WRITE\",\n", + " \"operationParameters\": {\n", + " \"mode\": \"Append\",\n", + " \"partitionBy\": \"[]\"\n", + " },\n", + " \"readVersion\": 2,\n", + " \"isolationLevel\": \"Serializable\",\n", + " \"isBlindAppend\": false,\n", + " \"operationMetrics\": {\n", + " \"numFiles\": \"2\",\n", + " \"numOutputRows\": \"7\",\n", + " \"numOutputBytes\": \"989\"\n", + " },\n", + " \"userMetadata\": \"{\\\"comment\\\": \\\"Kontaktet oppgavegiver og kranglet!\\\", \\\"manueltEditert\\\": \\\"True\\\", \\\"maskineltEditert\\\": \\\"False\\\"}\",\n", + " \"engineInfo\": \"Apache-Spark/3.3.1 Delta-Lake/2.3.0\",\n", + " \"txnId\": \"e7de92bf-b0f9-4341-8bbb-9b382f2f3eb6\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00000-96369f3d-fe4a-4365-a0df-00c813027399-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 503,\n", + " \"modificationTime\": 1696942553856,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":5,\\\"minValues\\\":{\\\"id\\\":10},\\\"maxValues\\\":{\\\"id\\\":15},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"add\": {\n", + " \"path\": \"part-00001-0f1bc8e6-093b-49a9-ad0b-78d5a148cfb6-c000.snappy.parquet\",\n", + " \"partitionValues\": {},\n", + " \"size\": 486,\n", + " \"modificationTime\": 1696942553853,\n", + " \"dataChange\": true,\n", + " \"stats\": \"{\\\"numRecords\\\":2,\\\"minValues\\\":{\\\"id\\\":20},\\\"maxValues\\\":{\\\"id\\\":21},\\\"nullCount\\\":{\\\"id\\\":0}}\"\n", + " }\n", + "}\n", + "--------------------------------------------------\n" + ] + } + ], "source": [ "from dapla import FileClient\n", "\n", diff --git a/dapla-manual/notebooks/spark/pyspark-intro.ipynb b/dapla-manual/notebooks/spark/pyspark-intro.ipynb index 3951d546..d019fa79 100644 --- a/dapla-manual/notebooks/spark/pyspark-intro.ipynb +++ b/dapla-manual/notebooks/spark/pyspark-intro.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "9a3d798c-570d-4112-99ba-b3f4d694ca80", "metadata": { "tags": [] @@ -57,12 +57,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "b10a7561-5cf0-47af-9e34-c07974449c8c", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.3.1
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
pyspark-shell
\n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "spark.sparkContext" ] @@ -87,12 +116,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3d9d1e1d-8173-47b6-a70b-be84c38a7bbc", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+\n", + "| Date|\n", + "+----------+\n", + "|2000-01-01|\n", + "|2000-02-01|\n", + "|2000-03-01|\n", + "|2000-04-01|\n", + "|2000-05-01|\n", + "+----------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], "source": [ "# Genererer månedlige data\n", "dates_df = spark.range(1).select(\n", @@ -119,12 +166,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "b292de78-6144-47a6-ba9e-0fc2947dd7b9", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------+------------------+\n", + "| serie00| serie01|\n", + "+------------------+------------------+\n", + "|10.410703377184355| 21.06318801110079|\n", + "|10.509249410154466| 19.5674295298024|\n", + "| 9.618310122060274|17.635805093465642|\n", + "| 9.691112692298294|18.593842915949082|\n", + "| 9.903675228685067|20.012215769058564|\n", + "+------------------+------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], "source": [ "# Genererer random walk data\n", "schema = StructType(\n", @@ -151,12 +216,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "2aee8d1d-0cd5-4065-8b76-433df3a62085", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+----+-------+-----+------------------+------------------+\n", + "| Date|Year|Quarter|Month| serie00| serie01|\n", + "+----------+----+-------+-----+------------------+------------------+\n", + "|2000-01-01|2000| 1| 01| 9.495232388801012| 19.016168503192|\n", + "|2000-02-01|2000| 1| 02| 10.70952411634649|21.404467063442723|\n", + "|2000-03-01|2000| 1| 03|11.118293927071951| 21.25035527677261|\n", + "|2000-04-01|2000| 2| 04| 9.346911680164684|19.982136698759238|\n", + "|2000-05-01|2000| 2| 05| 9.663303382177363|19.925236690504494|\n", + "+----------+----+-------+-----+------------------+------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], "source": [ "#| label: gen-df\n", "#| code-fold: true\n", @@ -203,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "b51d6660-e506-42ae-9968-852ec4bc9399", "metadata": { "tags": [] @@ -225,12 +308,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "ebc60a1f-d443-409d-ac7c-4e6e8460ef9d", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ssb-dapla-felles-data-delt-prod/temp/',\n", + " 'ssb-dapla-felles-data-delt-prod/temp/timeseries.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp/timeseries.parquet/',\n", + " 'ssb-dapla-felles-data-delt-prod/temp/timeseries.parquet/_SUCCESS',\n", + " 'ssb-dapla-felles-data-delt-prod/temp/timeseries.parquet/part-00000-b32e7299-0590-4b31-bcc2-dc3d58725529-c000.snappy.parquet',\n", + " 'ssb-dapla-felles-data-delt-prod/temp/timeseries.parquet/part-00001-b32e7299-0590-4b31-bcc2-dc3d58725529-c000.snappy.parquet']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from dapla import FileClient\n", "\n", @@ -261,12 +360,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "a91e4e9f-b1fd-4d13-b90f-c1431e366eda", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+----+-------+-----+-----------------+-----------------+\n", + "| Date|Year|Quarter|Month| serie66| serie55|\n", + "+----------+----+-------+-----+-----------------+-----------------+\n", + "|2000-01-01|2000| 1| 01|670.2679830025959|562.4312808525777|\n", + "|2000-02-01|2000| 1| 02|675.4233411662802|562.5168447360121|\n", + "|2000-03-01|2000| 1| 03|687.3412458214908|568.6203957584232|\n", + "|2000-04-01|2000| 2| 04|673.1128047244955|557.4633871253379|\n", + "|2000-05-01|2000| 2| 05| 667.513406101114|561.7766450346327|\n", + "+----------+----+-------+-----+-----------------+-----------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], "source": [ "df_ts = spark.read.parquet(\n", " \"gs://ssb-dapla-felles-data-delt-prod/temp/timeseries.parquet\"\n", @@ -286,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "18d6a9fd-6daa-4d08-98f7-8be1efe6b4b3", "metadata": { "tags": [] @@ -306,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "f2c00323-1ce4-4ca6-911a-7197cb35e77a", "metadata": { "tags": [] @@ -326,12 +443,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "8ffe42a4-9c6b-4186-8623-3026f82e85ed", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+----+-------+-----+------------------+------------------+\n", + "| Date|Year|Quarter|Month| serie00| serie01|\n", + "+----------+----+-------+-----+------------------+------------------+\n", + "|2010-01-01|2010| 1| 01| 11.26910423907778|21.730128215168268|\n", + "|2010-02-01|2010| 1| 02| 8.722783282690738| 17.46851086792347|\n", + "|2010-03-01|2010| 1| 03|10.376873608348605|20.109386343182802|\n", + "|2010-04-01|2010| 2| 04|11.459959305590926|21.995141825523866|\n", + "|2010-05-01|2010| 2| 05|10.441456792180572| 21.25096473981906|\n", + "+----------+----+-------+-----+------------------+------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], "source": [ "result_df = spark.sql(query)\n", "result_df.select(\"Date\", \"Year\", \"Quarter\", \"Month\", \"serie00\", \"serie01\").show(5)" @@ -349,12 +484,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "7c35ea7d-b94d-4acd-8fe7-d73c65398e73", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------------------+------------------+------------------+\n", + "|Quarter| Sum| Average| Maximum|\n", + "+-------+------------------+------------------+------------------+\n", + "| 1|363.95869885234185|10.109963857009497|11.829453550532005|\n", + "| 3|365.68324879453405| 10.15786802207039|12.233378837422391|\n", + "| 4| 342.2334082209804|10.065688477087658|12.210138970053695|\n", + "| 2| 361.991445506568|10.055317930738001|12.276030776082463|\n", + "+-------+------------------+------------------+------------------+\n", + "\n" + ] + } + ], "source": [ "from pyspark.sql import functions as F\n", "\n", @@ -377,12 +528,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "d84f9e97-9b97-442b-be1e-fd5a53597b34", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+-------+------------------+------------------+------------------+\n", + "|Year|Quarter| Sum| Average| Maximum|\n", + "+----+-------+------------------+------------------+------------------+\n", + "|2000| 1|31.323050432219453|10.441016810739818|11.118293927071951|\n", + "|2000| 2|28.911192473027377| 9.637064157675793| 9.900977410685329|\n", + "|2000| 3|33.670797229797415|11.223599076599138|12.233378837422391|\n", + "|2000| 4|28.094793356286914| 9.364931118762305| 10.32000478359274|\n", + "|2001| 1|31.636678535169537|10.545559511723178|11.367822302191831|\n", + "|2001| 2|29.629770128521507| 9.876590042840503|11.135215930381191|\n", + "|2001| 3| 30.75408440118315| 10.25136146706105|10.723803326978505|\n", + "|2001| 4|30.361048932627902| 10.1203496442093|10.368365984482093|\n", + "|2002| 1|31.184163218551227|10.394721072850409|10.550579652234951|\n", + "|2002| 2|29.128978392451202| 9.7096594641504|10.186365745367246|\n", + "+----+-------+------------------+------------------+------------------+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], "source": [ "# Assuming df_ts is your DataFrame\n", "df_ts.createOrReplaceTempView(\"temp_table\")\n", diff --git a/dapla-manual/notebooks/spark/sparkr-intro.ipynb b/dapla-manual/notebooks/spark/sparkr-intro.ipynb index 4e3b8559..1b4c7fb4 100644 --- a/dapla-manual/notebooks/spark/sparkr-intro.ipynb +++ b/dapla-manual/notebooks/spark/sparkr-intro.ipynb @@ -26,12 +26,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "8e85a49a-c02d-443c-a516-50eeca8c1ddd", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Java ref type org.apache.spark.sql.SparkSession id 1 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "spark" ] @@ -46,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "8c256b40-6f6d-4a3b-941f-6e4be26c1af5", "metadata": { "tags": [] @@ -58,12 +68,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "7ed99872-26bd-442e-ac73-502e315ac638", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+----+-------+-----+------------------+------------------+\n", + "| Date|Year|Quarter|Month| serie00| serie01|\n", + "+----------+----+-------+-----+------------------+------------------+\n", + "|2000-01-01|2000| 1| 01| 9.495232388801012| 19.016168503192|\n", + "|2000-02-01|2000| 1| 02| 10.70952411634649|21.404467063442723|\n", + "|2000-03-01|2000| 1| 03|11.118293927071951| 21.25035527677261|\n", + "|2000-04-01|2000| 2| 04| 9.346911680164684|19.982136698759238|\n", + "|2000-05-01|2000| 2| 05| 9.663303382177363|19.925236690504494|\n", + "+----------+----+-------+-----+------------------+------------------+\n", + "only showing top 5 rows\n" + ] + } + ], "source": [ "#| label: print-cell\n", "selectedColumns <- select(file, \"Date\", \"Year\", \"Quarter\", \"Month\", \"serie00\", \"serie01\")\n", @@ -82,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "354a5c54-53c7-4b21-a83b-f549a8a90289", "metadata": { "tags": [] diff --git a/dapla-manual/statistikkere/hva-er-dapla-team.ipynb b/dapla-manual/statistikkere/hva-er-dapla-team.ipynb index b0ce140d..ccc40e72 100644 --- a/dapla-manual/statistikkere/hva-er-dapla-team.ipynb +++ b/dapla-manual/statistikkere/hva-er-dapla-team.ipynb @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -227,7 +227,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" - } + }, + "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 diff --git a/dapla-manual/statistikkere/tilgangsstyring.ipynb b/dapla-manual/statistikkere/tilgangsstyring.ipynb index 0fed6eb3..385f9fd5 100644 --- a/dapla-manual/statistikkere/tilgangsstyring.ipynb +++ b/dapla-manual/statistikkere/tilgangsstyring.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -110,7 +110,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" - } + }, + "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 diff --git a/dapla-manual/statistikkere/transfer-service.ipynb b/dapla-manual/statistikkere/transfer-service.ipynb index e9acea11..6e28d828 100644 --- a/dapla-manual/statistikkere/transfer-service.ipynb +++ b/dapla-manual/statistikkere/transfer-service.ipynb @@ -118,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [