Skip to content

Commit 422a670

Browse files
Support insert-into-by-name for generated columns
## Description
Spark 3.4 no longer requires users to provide _all_ columns in insert-by-name queries. This means Delta can now support omitting generated columns from the column list in such queries. This PR adds support for this and adds some additional tests related to the changed by-name support.

Resolves #1215

Adds unit tests.

## Does this PR introduce _any_ user-facing changes?
Yes. Users will be able to omit generated columns from the column list when inserting by name.

Closes #1743

GitOrigin-RevId: 8694fab3d93b71b4230bf6f5dd0f2a21be6f3634
1 parent 9fac2e6 commit 422a670

File tree

4 files changed

+212
-17
lines changed

4 files changed

+212
-17
lines changed

core/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -830,12 +830,6 @@ class DeltaAnalysis(session: SparkSession)
830830
*/
831831
private def needsSchemaAdjustmentByName(query: LogicalPlan, targetAttrs: Seq[Attribute],
832832
deltaTable: DeltaTableV2): Boolean = {
833-
// TODO: update this to allow columns with default expressions to not be
834-
// specified (i.e. generated columns)
835-
if (targetAttrs.length != query.output.length) {
836-
throw QueryCompilationErrors.writeTableWithMismatchedColumnsError(
837-
targetAttrs.length, query.output.length, query)
838-
}
839833
insertIntoByNameMissingColumn(query, targetAttrs, deltaTable)
840834
val userSpecifiedNames = if (session.sessionState.conf.caseSensitiveAnalysis) {
841835
query.output.map(a => (a.name, a)).toMap

core/src/test/scala/org/apache/spark/sql/delta/DeltaDataFrameWriterV2Suite.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,12 @@ class DeltaDataFrameWriterV2Suite
538538
checkAnswer(
539539
spark.table(s"delta.`$location`"),
540540
Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c")))
541+
542+
// allows missing columns
543+
Seq(4L).toDF("id").writeTo(s"delta.`$location`").append()
544+
checkAnswer(
545+
spark.table(s"delta.`$location`"),
546+
Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"), Row(4L, null)))
541547
}
542548

543549
test("Create: basic behavior by path") {

core/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTableSuite.scala

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,105 @@ class DeltaInsertIntoSQLSuite
7676
}
7777
}
7878

79+
test("insertInto: append by name") {
80+
import testImplicits._
81+
val t1 = "tbl"
82+
withTable(t1) {
83+
sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format")
84+
val df = Seq((1L, "a"), (2L, "b"), (3L, "c")).toDF("id", "data")
85+
sql(s"INSERT INTO $t1(id, data) VALUES(1L, 'a')")
86+
// Can be in a different order
87+
sql(s"INSERT INTO $t1(data, id) VALUES('b', 2L)")
88+
// Can be cast automatically
89+
sql(s"INSERT INTO $t1(data, id) VALUES('c', 3)")
90+
verifyTable(t1, df)
91+
withSQLConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES.key -> "false") {
92+
// Missing columns
93+
assert(intercept[AnalysisException] {
94+
sql(s"INSERT INTO $t1(data) VALUES(4)")
95+
}.getMessage.contains("Column id is not specified in INSERT"))
96+
// Missing columns with matching dataType
97+
assert(intercept[AnalysisException] {
98+
sql(s"INSERT INTO $t1(data) VALUES('b')")
99+
}.getMessage.contains("Column id is not specified in INSERT"))
100+
}
101+
// Duplicate columns
102+
assert(intercept[AnalysisException](
103+
sql(s"INSERT INTO $t1(data, data) VALUES(5)")).getMessage.nonEmpty)
104+
}
105+
}
106+
107+
test("insertInto: overwrite by name") {
108+
import testImplicits._
109+
val t1 = "tbl"
110+
withTable(t1) {
111+
sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format")
112+
sql(s"INSERT OVERWRITE $t1(id, data) VALUES(1L, 'a')")
113+
verifyTable(t1, Seq((1L, "a")).toDF("id", "data"))
114+
// Can be in a different order
115+
sql(s"INSERT OVERWRITE $t1(data, id) VALUES('b', 2L)")
116+
verifyTable(t1, Seq((2L, "b")).toDF("id", "data"))
117+
// Can be cast automatically
118+
sql(s"INSERT OVERWRITE $t1(data, id) VALUES('c', 3)")
119+
verifyTable(t1, Seq((3L, "c")).toDF("id", "data"))
120+
withSQLConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES.key -> "false") {
121+
// Missing columns
122+
assert(intercept[AnalysisException] {
123+
sql(s"INSERT OVERWRITE $t1(data) VALUES(4)")
124+
}.getMessage.contains("Column id is not specified in INSERT"))
125+
// Missing columns with matching datatype
126+
assert(intercept[AnalysisException] {
127+
sql(s"INSERT OVERWRITE $t1(data) VALUES(4L)")
128+
}.getMessage.contains("Column id is not specified in INSERT"))
129+
}
130+
// Duplicate columns
131+
assert(intercept[AnalysisException](
132+
sql(s"INSERT OVERWRITE $t1(data, data) VALUES(5)")).getMessage.nonEmpty)
133+
}
134+
}
135+
136+
dynamicOverwriteTest("insertInto: dynamic overwrite by name") {
137+
import testImplicits._
138+
val t1 = "tbl"
139+
withTable(t1) {
140+
sql(s"CREATE TABLE $t1 (id bigint, data string, data2 string) " +
141+
s"USING $v2Format PARTITIONED BY (id)")
142+
sql(s"INSERT OVERWRITE $t1(id, data, data2) VALUES(1L, 'a', 'b')")
143+
verifyTable(t1, Seq((1L, "a", "b")).toDF("id", "data", "data2"))
144+
// Can be in a different order
145+
sql(s"INSERT OVERWRITE $t1(data, data2, id) VALUES('b', 'd', 2L)")
146+
verifyTable(t1, Seq((1L, "a", "b"), (2L, "b", "d")).toDF("id", "data", "data2"))
147+
// Can be cast automatically
148+
sql(s"INSERT OVERWRITE $t1(data, data2, id) VALUES('c', 'e', 1)")
149+
verifyTable(t1, Seq((1L, "c", "e"), (2L, "b", "d")).toDF("id", "data", "data2"))
150+
withSQLConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES.key -> "false") {
151+
// Missing columns
152+
assert(intercept[AnalysisException] {
153+
sql(s"INSERT OVERWRITE $t1(data, id) VALUES('c', 1)")
154+
}.getMessage.contains("Column data2 is not specified in INSERT"))
155+
// Missing columns with matching datatype
156+
assert(intercept[AnalysisException] {
157+
sql(s"INSERT OVERWRITE $t1(data, id) VALUES('c', 1L)")
158+
}.getMessage.contains("Column data2 is not specified in INSERT"))
159+
}
160+
// Duplicate columns
161+
assert(intercept[AnalysisException](
162+
sql(s"INSERT OVERWRITE $t1(data, data) VALUES(5)")).getMessage.nonEmpty)
163+
}
164+
}
165+
166+
test("insertInto: static partition column name should not be used in the column list") {
167+
withTable("t") {
168+
sql(s"CREATE TABLE t(i STRING, c string) USING $v2Format PARTITIONED BY (c)")
169+
checkError(
170+
exception = intercept[AnalysisException] {
171+
sql("INSERT OVERWRITE t PARTITION (c='1') (c) VALUES ('2')")
172+
},
173+
errorClass = "STATIC_PARTITION_COLUMN_IN_INSERT_COLUMN_LIST",
174+
parameters = Map("staticName" -> "c"))
175+
}
176+
}
177+
79178

80179
Seq(("ordinal", ""), ("name", "(id, col2, col)")).foreach { case (testName, values) =>
81180
test(s"INSERT OVERWRITE schema evolution works for array struct types - $testName") {

core/src/test/scala/org/apache/spark/sql/delta/GeneratedColumnSuite.scala

Lines changed: 107 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,14 @@ trait GeneratedColumnSuiteBase extends GeneratedColumnTest {
209209
assert(errMsg.contains(str))
210210
}
211211

212+
protected def testTableUpdateDPO(
213+
testName: String)(updateFunc: (String, String) => Seq[Row]): Unit = {
214+
withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
215+
SQLConf.PartitionOverwriteMode.DYNAMIC.toString) {
216+
testTableUpdate("dpo_" + testName)(updateFunc)
217+
}
218+
}
219+
212220
testTableUpdate("append_data") { (table, path) =>
213221
Seq(
214222
Tuple5(1L, "foo", "2020-10-11 12:30:30", 100, "2020-11-12")
@@ -269,6 +277,26 @@ trait GeneratedColumnSuiteBase extends GeneratedColumnTest {
269277
100, 1000, sqlDate("2020-11-12")) :: Nil
270278
}
271279

280+
testTableUpdate("insert_into_by_name_provide_all_columns") { (table, _) =>
281+
sql(s"INSERT INTO $table (c5, c6, c7_g_p, c8, c1, c2_g, c3_p, c4_g_p) VALUES" +
282+
s"('2020-10-11 12:30:30', 100, 1000, '2020-11-12', 1, 11, 'foo', '2020-10-11')")
283+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
284+
100, 1000, sqlDate("2020-11-12")) :: Nil
285+
}
286+
287+
testTableUpdate("insert_into_by_name_not_provide_generated_columns") { (table, _) =>
288+
sql(s"INSERT INTO $table (c6, c8, c1, c3_p, c5) VALUES" +
289+
s"(100, '2020-11-12', 1L, 'foo', '2020-10-11 12:30:30')")
290+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
291+
100, 1000, sqlDate("2020-11-12")) :: Nil
292+
}
293+
294+
testTableUpdate("insert_into_by_name_with_some_generated_columns") { (table, _) =>
295+
sql(s"INSERT INTO $table (c5, c6, c8, c1, c3_p, c4_g_p) VALUES" +
296+
s"('2020-10-11 12:30:30', 100, '2020-11-12', 1L, 'foo', '2020-10-11')")
297+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
298+
100, 1000, sqlDate("2020-11-12")) :: Nil
299+
}
272300

273301
testTableUpdate("insert_into_select_provide_all_columns") { (table, path) =>
274302
sql(s"INSERT INTO $table SELECT " +
@@ -277,6 +305,17 @@ trait GeneratedColumnSuiteBase extends GeneratedColumnTest {
277305
100, 1000, sqlDate("2020-11-12")) :: Nil
278306
}
279307

308+
testTableUpdate("insert_into_by_name_not_provide_normal_columns") { (table, _) =>
309+
val e = intercept[AnalysisException] {
310+
withSQLConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES.key -> "false") {
311+
sql(s"INSERT INTO $table (c6, c8, c1, c3_p) VALUES" +
312+
s"(100, '2020-11-12', 1L, 'foo')")
313+
}
314+
}
315+
errorContains(e.getMessage, "Column c5 is not specified in INSERT")
316+
Nil
317+
}
318+
280319
testTableUpdate("insert_overwrite_values_provide_all_columns") { (table, path) =>
281320
sql(s"INSERT OVERWRITE TABLE $table VALUES" +
282321
s"(1, 11, 'foo', '2020-10-11', '2020-10-11 12:30:30', 100, 1000, '2020-11-12')")
@@ -291,27 +330,84 @@ trait GeneratedColumnSuiteBase extends GeneratedColumnTest {
291330
100, 1000, sqlDate("2020-11-12")) :: Nil
292331
}
293332

333+
testTableUpdate("insert_overwrite_by_name_provide_all_columns") { (table, _) =>
334+
sql(s"INSERT OVERWRITE $table (c5, c6, c7_g_p, c8, c1, c2_g, c3_p, c4_g_p) VALUES" +
335+
s"('2020-10-11 12:30:30', 100, 1000, '2020-11-12', 1, 11, 'foo', '2020-10-11')")
336+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
337+
100, 1000, sqlDate("2020-11-12")) :: Nil
338+
}
294339

295-
testTableUpdate("dpo_insert_overwrite_values_provide_all_columns") { (table, path) =>
296-
withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
297-
SQLConf.PartitionOverwriteMode.DYNAMIC.toString) {
298-
sql(s"INSERT OVERWRITE TABLE $table VALUES" +
299-
s"(1, 11, 'foo', '2020-10-11', '2020-10-11 12:30:30', 100, 1000, '2020-11-12')")
300-
}
340+
testTableUpdate("insert_overwrite_by_name_not_provide_generated_columns") { (table, _) =>
341+
sql(s"INSERT OVERWRITE $table (c6, c8, c1, c3_p, c5) VALUES" +
342+
s"(100, '2020-11-12', 1L, 'foo', '2020-10-11 12:30:30')")
301343
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
302344
100, 1000, sqlDate("2020-11-12")) :: Nil
303345
}
304346

305-
testTableUpdate("dpo_insert_overwrite_select_provide_all_columns") { (table, path) =>
306-
withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
307-
SQLConf.PartitionOverwriteMode.DYNAMIC.toString) {
308-
sql(s"INSERT OVERWRITE TABLE $table SELECT " +
309-
s"1, 11, 'foo', '2020-10-11', '2020-10-11 12:30:30', 100, 1000, '2020-11-12'")
347+
testTableUpdate("insert_overwrite_by_name_with_some_generated_columns") { (table, _) =>
348+
sql(s"INSERT OVERWRITE $table (c5, c6, c8, c1, c3_p, c4_g_p) VALUES" +
349+
s"('2020-10-11 12:30:30', 100, '2020-11-12', 1L, 'foo', '2020-10-11')")
350+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
351+
100, 1000, sqlDate("2020-11-12")) :: Nil
352+
}
353+
354+
testTableUpdate("insert_overwrite_by_name_not_provide_normal_columns") { (table, _) =>
355+
val e = intercept[AnalysisException] {
356+
withSQLConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES.key -> "false") {
357+
sql(s"INSERT OVERWRITE $table (c6, c8, c1, c3_p) VALUES" +
358+
s"(100, '2020-11-12', 1L, 'foo')")
359+
}
310360
}
361+
errorContains(e.getMessage, "Column c5 is not specified in INSERT")
362+
Nil
363+
}
364+
365+
testTableUpdateDPO("insert_overwrite_values_provide_all_columns") { (table, path) =>
366+
sql(s"INSERT OVERWRITE TABLE $table VALUES" +
367+
s"(1, 11, 'foo', '2020-10-11', '2020-10-11 12:30:30', 100, 1000, '2020-11-12')")
368+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
369+
100, 1000, sqlDate("2020-11-12")) :: Nil
370+
}
371+
372+
testTableUpdateDPO("insert_overwrite_select_provide_all_columns") { (table, path) =>
373+
sql(s"INSERT OVERWRITE TABLE $table SELECT " +
374+
s"1, 11, 'foo', '2020-10-11', '2020-10-11 12:30:30', 100, 1000, '2020-11-12'")
311375
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
312376
100, 1000, sqlDate("2020-11-12")) :: Nil
313377
}
314378

379+
testTableUpdateDPO("insert_overwrite_by_name_values_provide_all_columns") { (table, _) =>
380+
sql(s"INSERT OVERWRITE $table (c5, c6, c7_g_p, c8, c1, c2_g, c3_p, c4_g_p) VALUES" +
381+
s"(CAST('2020-10-11 12:30:30' AS TIMESTAMP), 100, 1000, CAST('2020-11-12' AS DATE), " +
382+
s"1L, 11L, 'foo', CAST('2020-10-11' AS DATE))")
383+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
384+
100, 1000, sqlDate("2020-11-12")) :: Nil
385+
}
386+
387+
testTableUpdateDPO(
388+
"insert_overwrite_by_name_not_provide_generated_columns") { (table, _) =>
389+
sql(s"INSERT OVERWRITE $table (c6, c8, c1, c3_p, c5) VALUES" +
390+
s"(100, CAST('2020-11-12' AS DATE), 1L, 'foo', CAST('2020-10-11 12:30:30' AS TIMESTAMP))")
391+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
392+
100, 1000, sqlDate("2020-11-12")) :: Nil
393+
}
394+
395+
testTableUpdateDPO("insert_overwrite_by_name_with_some_generated_columns") { (table, _) =>
396+
sql(s"INSERT OVERWRITE $table (c5, c6, c8, c1, c3_p, c4_g_p) VALUES" +
397+
s"(CAST('2020-10-11 12:30:30' AS TIMESTAMP), 100, CAST('2020-11-12' AS DATE), 1L, " +
398+
s"'foo', CAST('2020-10-11' AS DATE))")
399+
Row(1L, 11L, "foo", sqlDate("2020-10-11"), sqlTimestamp("2020-10-11 12:30:30"),
400+
100, 1000, sqlDate("2020-11-12")) :: Nil
401+
}
402+
403+
testTableUpdateDPO("insert_overwrite_by_name_not_provide_normal_columns") { (table, _) =>
404+
val e = intercept[AnalysisException] {
405+
sql(s"INSERT OVERWRITE $table (c6, c8, c1, c3_p) VALUES" +
406+
s"(100, '2020-11-12', 1L, 'foo')")
407+
}
408+
assert(e.getMessage.contains("Column c5 is not specified in INSERT"))
409+
Nil
410+
}
315411

316412
testTableUpdate("delete") { (table, path) =>
317413
Seq(

0 commit comments

Comments
 (0)