address

aykut-bozkurt · aykut-bozkurt · commit cd30884d00a4 · 2025-04-07T10:47:03.000+03:00
diff --git a/src/arrow_parquet/field_ids.rs b/src/arrow_parquet/field_ids.rs
@@ -1,5 +1,6 @@
 use std::{collections::HashMap, fmt::Display, str::FromStr};
 
+use arrow_schema::{DataType, Schema};
 use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Clone, Default)]
@@ -50,6 +51,7 @@ pub(crate) struct FieldIdMapping {
 }
 
 impl FieldIdMapping {
+    /// Returns the field ID, if any, from `FieldIdMapping` for the given field path.
     pub(crate) fn field_id(&self, field_path: &[String]) -> Option<i32> {
         if field_path.is_empty() {
             panic!("Field path is empty");
@@ -65,6 +67,94 @@ impl FieldIdMapping {
             None => None,
         }
     }
+
+    /// Validates that every field name in the `FieldIdMapping` exists in the provided Arrow schema.
+    /// Nested mappings are checked recursively against the children of struct, list and map fields.
+    ///
+    /// # Errors
+    /// Returns a message naming the offending field when a mapped name is missing from the schema,
+    /// or when a nested mapping is given for a field that is not a struct, list or map.
+    fn validate_against_schema(&self, arrow_schema: &Schema) -> Result<(), String> {
+        // Build a map from field name to &Field for quick lookups
+        let mut arrow_field_map = HashMap::new();
+        for field in arrow_schema.fields() {
+            arrow_field_map.insert(field.name().clone(), field);
+        }
+
+        // Check every field name in the JSON mapping
+        for (field_name, mapping_item) in &self.fields {
+            if field_name == "__root_field_id" {
+                // Skip the root field, as it doesn't exist in the Arrow schema
+                continue;
+            }
+
+            // Ensure the field exists in the Arrow schema
+            let arrow_field = match arrow_field_map.get(field_name) {
+                Some(f) => f,
+                None => {
+                    return Err(format!(
+                        "Field '{}' in the mapping does not exist in the Arrow schema.\nAvailable fields: {:?}",
+                        field_name,
+                        arrow_schema
+                            .fields()
+                            .iter()
+                            .map(|f| f.name())
+                            .collect::<Vec<_>>()
+                    ));
+                }
+            };
+
+            match mapping_item {
+                // If the JSON item is an integer field ID, we're done
+                FieldIdMappingItem::FieldId(_id) => {}
+
+                // If the JSON item is a nested mapping, we need to validate it
+                FieldIdMappingItem::FieldIdMapping(mapping) => match arrow_field.data_type() {
+                    DataType::Struct(subfields) => {
+                        // We expect the JSON keys to include something like:
+                        //   "__root_field_id": <int>,
+                        //   "field_name": <int or nested mapping>
+                        let subschema = Schema::new(subfields.clone());
+                        mapping.validate_against_schema(&subschema)?;
+                    }
+                    DataType::List(element_field) => {
+                        // We expect the JSON keys to include something like:
+                        //   "__root_field_id": <int>,
+                        //   "element": <int or nested mapping>
+                        let element_schema = Schema::new(vec![element_field.clone()]);
+                        mapping.validate_against_schema(&element_schema)?;
+                    }
+                    DataType::Map(entry_field, _) => {
+                        // We expect the JSON keys to include something like:
+                        //   "__root_field_id": <int>,
+                        //   "key": <int or nested mapping>
+                        //   "val": <int or nested mapping>
+                        match entry_field.data_type() {
+                            DataType::Struct(entry_fields) => {
+                                let entry_schema = Schema::new(entry_fields.clone());
+                                mapping.validate_against_schema(&entry_schema)?;
+                            }
+                            other_type => {
+                                return Err(format!(
+                                    "Map entry field should be a struct, but got '{:?}' for field '{}'",
+                                    other_type, field_name
+                                ));
+                            }
+                        }
+                        // Deliberately no early return: sibling fields after a map must be checked.
+                    }
+                    other_type => {
+                        return Err(format!(
+                            "Unexpected data type '{:?}' for field '{}'",
+                            other_type, field_name
+                        ));
+                    }
+                },
+            }
+        }
+
+        Ok(())
+    }
 }
 
 pub(crate) fn field_id_mapping_from_json_string(
@@ -76,3 +166,15 @@ pub(crate) fn field_id_mapping_from_json_string(
 fn field_id_mapping_to_json_string(field_id_mapping: &FieldIdMapping) -> String {
     serde_json::to_string(field_id_mapping).unwrap()
 }
+
+/// Validates an explicit `FieldIdMapping` against the provided Arrow schema.
+///
+/// `FieldIds::None` and `FieldIds::Auto` are accepted without any check.
+pub(crate) fn validate_field_ids(field_ids: FieldIds, arrow_schema: &Schema) -> Result<(), String> {
+    match field_ids {
+        // Only an explicit mapping is handed to the schema check; its result is returned as-is.
+        FieldIds::Explicit(mapping) => mapping.validate_against_schema(arrow_schema),
+        // Both remaining variants succeed immediately.
+        FieldIds::None | FieldIds::Auto => Ok(()),
+    }
+}
diff --git a/src/arrow_parquet/parquet_writer.rs b/src/arrow_parquet/parquet_writer.rs
@@ -12,6 +12,7 @@ use pgrx::{heap_tuple::PgHeapTuple, AllocatedByRust, PgTupleDesc};
 use crate::{
     arrow_parquet::{
         compression::PgParquetCompressionWithLevel,
+        field_ids::validate_field_ids,
         pg_to_arrow::context::collect_pg_to_arrow_attribute_contexts,
         schema_parser::{
             parquet_schema_string_from_attributes, parse_arrow_schema_from_attributes,
@@ -62,7 +63,10 @@ impl ParquetWriterContext {
             parquet_schema_string_from_attributes(&attributes, field_ids.clone())
         );
 
-        let schema = parse_arrow_schema_from_attributes(&attributes, field_ids);
+        let schema = parse_arrow_schema_from_attributes(&attributes, field_ids.clone());
+
+        validate_field_ids(field_ids, &schema).unwrap_or_else(|e| panic!("{e}"));
+
         let schema = Arc::new(schema);
 
         let writer_props = Self::writer_props(tupledesc, options);
diff --git a/src/pgrx_tests/copy_options.rs b/src/pgrx_tests/copy_options.rs
@@ -1006,4 +1006,156 @@ mod tests {
             ]
         );
     }
+
+    #[pg_test]
+    fn test_explicit_field_ids_with_missing_field_ids() { // COPY TO with a partial explicit mapping
+        let setup_commands = "create type dog as (id int, name text);
+                              create type person as (id int, dog dog, dogs dog[]);
+                              create table test_table(a int, b text, c person, d person[]);"; // nested composite + array columns
+        Spi::run(setup_commands).unwrap();
+
+        let explicit_field_ids = "{\"a\": 10,
+                                   \"c\": {
+                                            \"__root_field_id\": 100,
+                                            \"id\": 200,
+                                            \"dog\": {
+                                                        \"id\": 400,
+                                                        \"name\": 500
+                                                     },
+                                            \"dogs\": {
+                                                        \"__root_field_id\": 600,
+                                                        \"element\": {
+                                                                        \"__root_field_id\": 700,
+                                                                        \"id\": 800,
+                                                                        \"name\": 900
+                                                                     }
+                                                      }
+                                          },
+                                   \"d\": {
+                                            \"__root_field_id\": 1000,
+                                            \"element\": {
+                                                            \"dog\": {
+                                                                        \"__root_field_id\": 1300,
+                                                                        \"id\": 1400,
+                                                                        \"name\": 1500
+                                                                     },
+                                                            \"dogs\": {
+                                                                        \"__root_field_id\": 1600,
+                                                                        \"element\": {
+                                                                                        \"__root_field_id\": 1700,
+                                                                                        \"id\": 1800
+                                                                                     }
+                                                                      }
+                                                         }
+                                          }
+                                  }"; // no entry for "b"; some "__root_field_id" and leaf keys are left out on purpose
+
+        let copy_to_parquet = format!(
+            "copy test_table to '{LOCAL_TEST_FILE_PATH}' with (field_ids '{explicit_field_ids}');"
+        );
+        Spi::run(&copy_to_parquet).unwrap(); // the COPY itself must succeed despite the gaps in the mapping
+
+        let fields = Spi::connect(|client| { // read back (field_id, name) for every node of the written schema
+            let parquet_schema_command = format!(
+                "select field_id, name from parquet.schema('{}') order by 1,2;",
+                LOCAL_TEST_FILE_PATH
+            );
+
+            let tup_table = client.select(&parquet_schema_command, None, &[]).unwrap(); // ordered by field_id, then name
+            let mut results = Vec::new();
+
+            for row in tup_table {
+                let field_id = row["field_id"].value::<i64>().unwrap(); // kept as Option: compared with Some(..)/None below
+                let name = row["name"].value::<String>().unwrap().unwrap();
+
+                results.push((field_id, name));
+            }
+
+            results
+        });
+
+        assert_eq!(
+            fields,
+            vec![
+                (Some(10), "a".into()), // entries given in the mapping come back with their explicit ids
+                (Some(100), "c".into()),
+                (Some(200), "id".into()),
+                (Some(400), "id".into()),
+                (Some(500), "name".into()),
+                (Some(600), "dogs".into()),
+                (Some(700), "element".into()),
+                (Some(800), "id".into()),
+                (Some(900), "name".into()),
+                (Some(1000), "d".into()),
+                (Some(1300), "dog".into()),
+                (Some(1400), "id".into()),
+                (Some(1500), "name".into()),
+                (Some(1600), "dogs".into()),
+                (Some(1700), "element".into()),
+                (Some(1800), "id".into()),
+                (None, "arrow_schema".into()), // from here on: schema nodes expected to carry no field id
+                (None, "b".into()),
+                (None, "dog".into()),
+                (None, "element".into()),
+                (None, "id".into()),
+                (None, "list".into()),
+                (None, "list".into()),
+                (None, "list".into()),
+                (None, "name".into()),
+            ]
+        );
+    }
+
+    #[pg_test]
+    #[should_panic(expected = "Available fields: [\"a\", \"b\", \"c\", \"d\"]")] // error must list the table's top-level columns
+    fn test_explicit_field_ids_invalid_json() {
+        let setup_commands = "create type dog as (id int, name text);
+                              create type person as (id int, dog dog, dogs dog[]);
+                              create table test_table(a int, b text, c person, d person[]);";
+        Spi::run(setup_commands).unwrap();
+
+        let explicit_field_ids = "{\"aa\": 10, \"b\": 12}"; // "aa" is not a column of test_table
+
+        let copy_to_parquet = format!(
+            "copy test_table to '{LOCAL_TEST_FILE_PATH}' with (field_ids '{explicit_field_ids}');"
+        );
+        Spi::run(&copy_to_parquet).unwrap(); // expected to panic, per the should_panic attribute above
+    }
+
+    #[pg_test]
+    #[should_panic(expected = "Available fields: [\"id\", \"name\"]")] // error must list the fields at the nested level
+    fn test_explicit_field_ids_another_invalid_json() {
+        let setup_commands = "create type dog as (id int, name text);
+                              create type person as (id int, dog dog, dogs dog[]);
+                              create table test_table(a int, b text, c person, d person[]);";
+        Spi::run(setup_commands).unwrap();
+
+        let explicit_field_ids = "{\"a\": 10,
+                                   \"d\": {
+                                            \"__root_field_id\": 1000,
+                                            \"element\": {
+                                                            \"__root_field_id\": 1100,
+                                                            \"id\": 1200,
+                                                            \"dog\": {
+                                                                        \"__root_field_id\": 1300,
+                                                                        \"id\": 1400,
+                                                                        \"name\": 1500
+                                                                     },
+                                                            \"dogs\": {
+                                                                        \"__root_field_id\": 1600,
+                                                                        \"element\": {
+                                                                                        \"__root_field_id\": 1700,
+                                                                                        \"iddd\": 1800,
+                                                                                        \"name\": 1900
+                                                                                     }
+                                                                      }
+                                                         }
+                                          }
+                                  }"; // "iddd" sits under d -> element -> dogs -> element; type dog only has id and name
+
+        let copy_to_parquet = format!(
+            "copy test_table to '{LOCAL_TEST_FILE_PATH}' with (field_ids '{explicit_field_ids}');"
+        );
+        Spi::run(&copy_to_parquet).unwrap(); // expected to panic, per the should_panic attribute above
+    }
 }