From 307ce9918a894793a856256fba16af1d6f47f9a8 Mon Sep 17 00:00:00 2001 From: Gerhard Brueckl Date: Thu, 16 Jan 2025 10:17:41 +0100 Subject: [PATCH] fix notebook serialization issue for markdown --- CHANGELOG.md | 3 +++ README.md | 10 +++++++--- package.json | 2 +- .../notebook/DatabricksNotebookSerializer.ts | 20 ++++++++++++------- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a80c92b..b12ed54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Release Notes +**v2.4.1:** +- fix issue with notebook serialization not working properly for markdown + **v2.4.0:** - added interactive kernel again ([202](/../../issues/202)) diff --git a/README.md b/README.md index b4758f9..fee36e4 100644 --- a/README.md +++ b/README.md @@ -229,12 +229,12 @@ Whenever a notebook is opened from either the local sync folder or via the [Virt If you are using the [Databricks Extension Connection Manager](#setup-and-configuration-databricks-extension-connection-manager) we will also create a generic notebook kernel for you which used the configured cluster. -To work with non-`.ipynb` notebooks, you can also open source files from Databricks as notebooks (e.g. `.sql`, `.scala`, `.r`). For this to work you need to add `workbench.editorAssociations` for the file types to your VSCode settings. The important part is to use `databricks-notebook` as default editor: +To work with non-`.ipynb` notebooks (=`SOURCE` format), you can also open source files from Databricks as notebooks (e.g. `.py`, `.sql`, `.scala`, `.r`). This is very common if you open files from a Git Repository used in Databricks. To open those `SOURCE` files, you need to add `workbench.editorAssociations` for the file types to your VSCode settings. 
The important part is to use `databricks-notebook` as default editor: ```json "settings": { "workbench.editorAssociations":{ - "*.py": "databricks-notebook", + "**/notebooks/**/*.py": "databricks-notebook", "*.scala": "databricks-notebook", "*.sql": "databricks-notebook", "*.r": "databricks-notebook" @@ -243,10 +243,14 @@ To work with non-`.ipynb` notebooks, you can also open source files from Databri } ``` +The example above will open all `.py` files, where any parent folder is called `notebooks`, as a VSCode notebook. The same applies to all Scala, SQL and R files. +To control how notebooks are serialized in Git, please check [Manage notebook format](https://docs.databricks.com/en/notebooks/notebook-format.html) + However, there are some technical restrictions working with those files. While they behave like notebooks, they are still just source files in the background which means, the output of executed cells is not persisted. So it can happen that if you save the notebook and it is then reloaded from the source (which can happen automatically in the background), your cell outputs are lost. Also, please make sure that the file extensions you configure here are the same as you configured in your `exportFormats`! +To execute those files as notebooks, you can only use Kernels provided by Databricks Power Tools and no local kernels! -As it is also possible to maintain Python libraries within DAtabricks using [Workspace Files](https://docs.databricks.com/files/workspace.html) there is a clash between file extensions of workspace files and Python notebooks downloaded in source format hence there can be some issues when creating new files etc. Therefore it is recommended to keep using `.ipynb` format for Python notebooks and `.py` for workspace files used in libraries. 
+As it is also possible to maintain Python libraries within Databricks using [Workspace Files](https://docs.databricks.com/files/workspace.html) there is a clash between file extensions of workspace files and Python notebooks downloaded in source format hence there can be some issues when creating new files etc. Therefore it is recommended to keep using `.ipynb` format for Python notebooks and `.py` for workspace files used in libraries. ## Execution Modes We distinguish between Live-execution and Offline-execution. In Live-execution mode, files are opened directly from Databricks by mounting the Databricks Workspace into your VSCode Workspace using `wsfs:/` URI scheme. In this mode there is no intermediate local copy but you work directly against the Databricks Workspace. Everything you run must already exist online in the Databricks Workspace. diff --git a/package.json b/package.json index b074794..c3fe8f2 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "databricks-vscode", "displayName": "Databricks Power Tools", "description": "Run notebooks cell-by-cell, browse and edit your Databricks Workspace, DBFS, Clusters, Jobs, Secrets, Repos and SQL. 
Supports Azure Databricks, Databricks on AWS and Databricks on GCP.", - "version": "2.4.0", + "version": "2.4.1", "publisher": "paiqo", "icon": "resources/databricks_extension.png", "author": { diff --git a/src/vscode/notebook/DatabricksNotebookSerializer.ts b/src/vscode/notebook/DatabricksNotebookSerializer.ts index 242b733..f07f7bb 100644 --- a/src/vscode/notebook/DatabricksNotebookSerializer.ts +++ b/src/vscode/notebook/DatabricksNotebookSerializer.ts @@ -47,12 +47,20 @@ export class DatabricksNotebookSerializer implements vscode.NotebookSerializer { "magic": "%sql", "commentCharacters": "--", "fileExtension": ".sql" + }, + { + "databricksLanguage": undefined, + "vscodeLanguage": "markdown", + "magic": "%md", + "commentCharacters": undefined, + "fileExtension": undefined } ] ; public async deserializeNotebook(data: Uint8Array, token: vscode.CancellationToken): Promise { var contents = Buffer.from(data).toString(); + contents = contents.replace(/\r/gm, ""); // remove any carriage returns var firstLineWithCode: number = 1; const lines: string[] = contents.trimStart().split("\n"); @@ -76,7 +84,7 @@ export class DatabricksNotebookSerializer implements vscode.NotebookSerializer { notebookLanguage = languages[0]; } else { - // its Python or R + // its Python or R which use the same comment-character const rAssignments = contents.split("<-").length; const pythonAssignments = contents.split("=").length; @@ -100,19 +108,17 @@ export class DatabricksNotebookSerializer implements vscode.NotebookSerializer { let firstLine = rawCell.split("\n")[0]; let firstLineValues = firstLine.split(/\s+/gm); let magic = firstLineValues[2]; + cellLanguage = this.LANGUAGE_MAPPING.find(x => x.magic == magic); if (magic == "%md") { cell.kind = vscode.NotebookCellKind.Markup; cell.value = cell.value.replace(new RegExp(`^${commentChars} ${this.MAGIC_PREFIX} ${magic}\n`, "gm"), ""); } - else { - cellLanguage = this.LANGUAGE_MAPPING.find(x => x.magic == magic); - if (cellLanguage) { - 
cell.metadata = { "cellLanguage": cellLanguage }; - } + if (cellLanguage) { + cell.metadata = { "cellLanguage": cellLanguage }; } - cell.value = cell.value.replace(new RegExp(`^${commentChars} ${this.MAGIC_PREFIX} `, "gm"), ""); + cell.value = cell.value.replace(new RegExp(`^${commentChars} ${this.MAGIC_PREFIX}`, "gm"), ""); } cell.languageId = cell.metadata.cellLanguage.vscodeLanguage; notebook.cells.push(cell);