diff --git a/rules/S7187/python/metadata.json b/rules/S7187/python/metadata.json
index b1742f82691..08a4e812622 100644
--- a/rules/S7187/python/metadata.json
+++ b/rules/S7187/python/metadata.json
@@ -1,5 +1,5 @@
 {
-  "title": "FIXME",
+  "title": "PySpark Pandas DataFrame columns should not use a reserved name",
   "type": "CODE_SMELL",
   "status": "ready",
   "remediation": {
@@ -7,18 +7,18 @@
     "constantCost": "5min"
   },
   "tags": [
+    "data-science",
+    "pyspark"
   ],
   "defaultSeverity": "Major",
   "ruleSpecification": "RSPEC-7187",
   "sqKey": "S7187",
   "scope": "All",
   "defaultQualityProfiles": ["Sonar way"],
-  "quickfix": "unknown",
+  "quickfix": "infeasible",
   "code": {
     "impacts": {
-      "MAINTAINABILITY": "HIGH",
-      "RELIABILITY": "MEDIUM",
-      "SECURITY": "LOW"
+      "RELIABILITY": "MEDIUM"
     },
     "attribute": "CONVENTIONAL"
   }
diff --git a/rules/S7187/python/rule.adoc b/rules/S7187/python/rule.adoc
index caae0d69054..e3d01d6e861 100644
--- a/rules/S7187/python/rule.adoc
+++ b/rules/S7187/python/rule.adoc
@@ -1,16 +1,17 @@
-FIXME: add a description
-
-// If you want to factorize the description uncomment the following line and create the file.
-//include::../description.adoc[]
-
+This rule raises an issue when a PySpark Pandas DataFrame column name is set to a reserved name.
 == Why is this an issue?
 
-FIXME: remove the unused optional headers (that are commented out)
+PySpark offers powerful APIs to work with Pandas DataFrames in a distributed environment.
+While the integration between PySpark and Pandas is seamless, there are some caveats that should be taken into account.
+Spark Pandas API uses some special column names for internal purposes.
+These column names contain leading `++__++` and trailing `++__++`.
+Therefore, when using PySpark with Pandas and naming or renaming columns,
+it is discouraged to use such reserved column names as they are not guaranteed to yield the expected results.
 
-//=== What is the potential impact?
 
 == How to fix it
-//== How to fix it in FRAMEWORK NAME
+
+To fix this issue, provide a column name without leading and trailing `++__++`.
 
 === Code examples
 
@@ -18,27 +19,23 @@
 
 [source,python,diff-id=1,diff-type=noncompliant]
 ----
-FIXME
+import pyspark.pandas as ps
+
+df = ps.DataFrame({'__value__': [1, 2, 3]}) # Noncompliant: __value__ is a reserved column name
 ----
 
 ==== Compliant solution
 
 [source,python,diff-id=1,diff-type=compliant]
 ----
-FIXME
-----
+import pyspark.pandas as ps
 
-//=== How does this work?
+df = ps.DataFrame({'value': [1, 2, 3]}) # Compliant
+----
 
-//=== Pitfalls
 
-//=== Going the extra mile
+== Resources
+=== Documentation
+* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/best_practices.html#avoid-reserved-column-names[Best Practices]
 
 
-//== Resources
-//=== Documentation
-//=== Articles & blog posts
-//=== Conference presentations
-//=== Standards
-//=== External coding guidelines
-//=== Benchmarks
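
For illustration only, outside the patch itself: a minimal sketch of bringing an already-built DataFrame into compliance by renaming the reserved column. It assumes the pandas-on-Spark `rename` API mirrors `pandas.DataFrame.rename` and that a Spark session is available.

[source,python]
----
import pyspark.pandas as ps

# Hypothetical example: a DataFrame created with a reserved column name
# (leading and trailing double underscores), as flagged by this rule.
df = ps.DataFrame({'__value__': [1, 2, 3]})

# Rename the column to a plain name so it no longer matches the reserved
# pattern; `rename` is assumed here to behave like pandas.DataFrame.rename.
df = df.rename(columns={'__value__': 'value'})

print(list(df.columns))  # expected: ['value']
----

Renaming keeps the existing data intact, which is usually simpler than rebuilding the DataFrame with a compliant column name.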