From 7cab163db778f2f25ad34f0ff4b12a3f08566542 Mon Sep 17 00:00:00 2001 From: joke1196 Date: Tue, 28 Jan 2025 08:51:57 +0000 Subject: [PATCH 1/2] Create rule S7187 --- rules/S7187/metadata.json | 2 ++ rules/S7187/python/metadata.json | 25 ++++++++++++++++++ rules/S7187/python/rule.adoc | 44 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 rules/S7187/metadata.json create mode 100644 rules/S7187/python/metadata.json create mode 100644 rules/S7187/python/rule.adoc diff --git a/rules/S7187/metadata.json b/rules/S7187/metadata.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/rules/S7187/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S7187/python/metadata.json b/rules/S7187/python/metadata.json new file mode 100644 index 00000000000..b1742f82691 --- /dev/null +++ b/rules/S7187/python/metadata.json @@ -0,0 +1,25 @@ +{ + "title": "FIXME", + "type": "CODE_SMELL", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "5min" + }, + "tags": [ + ], + "defaultSeverity": "Major", + "ruleSpecification": "RSPEC-7187", + "sqKey": "S7187", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "unknown", + "code": { + "impacts": { + "MAINTAINABILITY": "HIGH", + "RELIABILITY": "MEDIUM", + "SECURITY": "LOW" + }, + "attribute": "CONVENTIONAL" + } +} diff --git a/rules/S7187/python/rule.adoc b/rules/S7187/python/rule.adoc new file mode 100644 index 00000000000..caae0d69054 --- /dev/null +++ b/rules/S7187/python/rule.adoc @@ -0,0 +1,44 @@ +FIXME: add a description + +// If you want to factorize the description uncomment the following line and create the file. +//include::../description.adoc[] + +== Why is this an issue? + +FIXME: remove the unused optional headers (that are commented out) + +//=== What is the potential impact? 
+ +== How to fix it +//== How to fix it in FRAMEWORK NAME + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +FIXME +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +FIXME +---- + +//=== How does this work? + +//=== Pitfalls + +//=== Going the extra mile + + +//== Resources +//=== Documentation +//=== Articles & blog posts +//=== Conference presentations +//=== Standards +//=== External coding guidelines +//=== Benchmarks From 55266d91b69feca562bd3224365ecb97a24e887a Mon Sep 17 00:00:00 2001 From: David Kunzmann Date: Tue, 28 Jan 2025 15:31:50 +0100 Subject: [PATCH 2/2] Create rule S7187: PySpark Pandas DataFrame columns should not use a reserved name --- rules/S7187/python/metadata.json | 10 ++++----- rules/S7187/python/rule.adoc | 38 +++++++++++++++----------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/rules/S7187/python/metadata.json b/rules/S7187/python/metadata.json index b1742f82691..08a4e812622 100644 --- a/rules/S7187/python/metadata.json +++ b/rules/S7187/python/metadata.json @@ -1,5 +1,5 @@ { - "title": "FIXME", + "title": "PySpark Pandas DataFrame columns should not use a reserved name", "type": "CODE_SMELL", "status": "ready", "remediation": { @@ -7,18 +7,18 @@ "constantCost": "5min" }, "tags": [ + "data-science", + "pyspark" ], "defaultSeverity": "Major", "ruleSpecification": "RSPEC-7187", "sqKey": "S7187", "scope": "All", "defaultQualityProfiles": ["Sonar way"], - "quickfix": "unknown", + "quickfix": "infeasible", "code": { "impacts": { - "MAINTAINABILITY": "HIGH", - "RELIABILITY": "MEDIUM", - "SECURITY": "LOW" + "RELIABILITY": "MEDIUM" }, "attribute": "CONVENTIONAL" } diff --git a/rules/S7187/python/rule.adoc b/rules/S7187/python/rule.adoc index caae0d69054..926bdbf50df 100644 --- a/rules/S7187/python/rule.adoc +++ b/rules/S7187/python/rule.adoc @@ -1,16 +1,18 @@ -FIXME: add a description - -// If you want to factorize the 
description uncomment the following line and create the file. -//include::../description.adoc[] +This rule raises an issue when a PySpark Pandas DataFrame column name is set to a reserved name. == Why is this an issue? -FIXME: remove the unused optional headers (that are commented out) +PySpark offers powerful APIs to work with Pandas DataFrames in a distributed environment. +While the integration between PySpark and Pandas is seamless, there are some caveats that should be taken into account. -//=== What is the potential impact? +The Pandas API on Spark uses some special column names for internal purposes. +These column names contain leading `++__++` and trailing `++__++`. +Therefore, when using PySpark with Pandas and naming or renaming columns, +it is discouraged to use such reserved column names as they are not guaranteed to yield the expected results. == How to fix it -//== How to fix it in FRAMEWORK NAME + +To fix this issue, provide a column name without leading and trailing `++__++`. === Code examples @@ -18,27 +20,23 @@ FIXME: remove the unused optional headers (that are commented out) [source,python,diff-id=1,diff-type=noncompliant] ---- -FIXME +import pyspark.pandas as ps + +df = ps.DataFrame({'__value__': [1, 2, 3]}) # Noncompliant: __value__ is a reserved column name ---- ==== Compliant solution [source,python,diff-id=1,diff-type=compliant] ---- -FIXME ----- +import pyspark.pandas as ps -//=== How does this work? +df = ps.DataFrame({'value': [1, 2, 3]}) # Compliant +---- -//=== Pitfalls -//=== Going the extra mile +== Resources +=== Documentation +* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/best_practices.html#avoid-reserved-column-names[Best Practices] -//== Resources -//=== Documentation -//=== Articles & blog posts -//=== Conference presentations -//=== Standards -//=== External coding guidelines -//=== Benchmarks