[SPARK-43665][CONNECT][PS] Enable PandasSQLStringFormatter.vformat to work with Spark Connect

itholic · zhengruifeng · commit daa984430739 · 2023-07-12T11:27:18.000+08:00
### What changes were proposed in this pull request? This PR enables the SQL parity test `test_sql_with_python_objects` for pandas API on Spark with Spark Connect. ### Why are the changes needed? To increase the API coverage for pandas API on Spark with Spark Connect. ### Does this PR introduce _any_ user-facing change? This enables `ps.sql` with Python objects on Spark Connect. ### How was this patch tested? Reused the existing SQL tests. Closes #41931 from itholic/SPARK-43665. Authored-by: itholic <haejoon.lee@databricks.com> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/python/pyspark/pandas/sql_formatter.py b/python/pyspark/pandas/sql_formatter.py
@@ -25,7 +25,6 @@
 
 from pyspark.pandas.internal import InternalFrame
 from pyspark.pandas.namespace import _get_index_map
-from pyspark.sql.functions import lit
 from pyspark import pandas as ps
 from pyspark.sql import SparkSession
 from pyspark.pandas.utils import default_session
@@ -265,7 +264,10 @@ def _convert_value(self, val: Any, name: str) -> Optional[str]:
             val._to_spark().createOrReplaceTempView(df_name)
             return df_name
         elif isinstance(val, str):
-            return lit(val)._jc.expr().sql()  # for escaped characters.
+            # This is matched to behavior from JVM implementation.
+            # See `sql` definition from `sql/catalyst/src/main/scala/org/apache/spark/
+            # sql/catalyst/expressions/literals.scala`
+            return "'" + val.replace("\\", "\\\\").replace("'", "\\'") + "'"
         else:
             return val
 
diff --git a/python/pyspark/pandas/tests/connect/test_parity_sql.py b/python/pyspark/pandas/tests/connect/test_parity_sql.py
@@ -30,12 +30,6 @@ def test_sql_with_index_col(self):
     def test_sql_with_pandas_on_spark_objects(self):
         super().test_sql_with_pandas_on_spark_objects()
 
-    @unittest.skip(
-        "TODO(SPARK-43665): Enable PandasSQLStringFormatter.vformat to work with Spark Connect."
-    )
-    def test_sql_with_python_objects(self):
-        super().test_sql_with_python_objects()
-
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.connect.test_parity_sql import *  # noqa: F401
diff --git a/python/pyspark/pandas/tests/test_sql.py b/python/pyspark/pandas/tests/test_sql.py
@@ -81,6 +81,14 @@ def test_sql_with_python_objects(self):
             ps.sql("SELECT id FROM range(10) WHERE id IN {pred}", col="lit", pred=(1, 2, 3)),
             ps.DataFrame({"id": [1, 2, 3]}),
         )
+        self.assert_eq(
+            ps.sql("SELECT {col} as a FROM range(1)", col="a'''c''d"),
+            ps.DataFrame({"a": ["a'''c''d"]}),
+        )
+        self.assert_eq(
+            ps.sql("SELECT id FROM range(10) WHERE id IN {pred}", col="a'''c''d", pred=(1, 2, 3)),
+            ps.DataFrame({"id": [1, 2, 3]}),
+        )
 
     def test_sql_with_pandas_on_spark_objects(self):
         psdf = ps.DataFrame({"a": [1, 2, 3, 4]})