
Commit 462c1a0

address
1 parent 59715d6 commit 462c1a0

File tree

6 files changed: +95, -56 lines


README.md

Lines changed: 3 additions & 3 deletions
@@ -248,9 +248,9 @@ Supported authorization methods' priority order is shown below:
 ## Copy Options
 `pg_parquet` supports the following options in the `COPY TO` command:
 - `format parquet`: you need to specify this option to read or write Parquet files which do not end with the `.parquet[.<compression>]` extension,
-- `file_size_bytes <int>`: the total byte size per Parquet file. When set, the parquet files, with target size, are created under parent directory (named the same as file name without file extension). By default, when not specified, a single file is generated without creating a parent folder.
-- `row_group_size <int>`: the number of rows in each row group while writing Parquet files. The default row group size is `122880`,
-- `row_group_size_bytes <int>`: the total byte size of rows in each row group while writing Parquet files. The default row group size bytes is `row_group_size * 1024`,
+- `file_size_bytes <string>`: the target size of each Parquet file. When set, Parquet files of roughly the target size are created under a parent directory named the same as the file name. By default, when not specified, a single file is generated without creating a parent folder. You can specify the size in plain bytes, like `file_size_bytes 2000000`, or with a unit (KB, MB, or GB), like `file_size_bytes '1MB'`,
+- `row_group_size <int64>`: the number of rows in each row group while writing Parquet files. The default row group size is `122880`,
+- `row_group_size_bytes <int64>`: the total byte size of rows in each row group while writing Parquet files. The default row group size bytes is `row_group_size * 1024`,
 - `compression <string>`: the compression format to use while writing Parquet files. The supported compression formats are `uncompressed`, `snappy`, `gzip`, `brotli`, `lz4`, `lz4raw` and `zstd`. The default compression format is `snappy`. If not specified, the compression format is determined by the file extension,
 - `compression_level <int>`: the compression level to use while writing Parquet files. Compression levels are only supported for the `gzip`, `zstd` and `brotli` compression formats. The default compression level is `6` for `gzip (0-10)`, `1` for `zstd (1-22)` and `1` for `brotli (0-11)`.
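A minimal usage sketch of the revised option, mirroring the SQL that the updated test in this commit runs via pgrx's `Spi`; the table name, output path, and surrounding function are made up for illustration and would need to run inside a Postgres backend context (e.g. a `#[pg_test]`).

```rust
// Illustrative sketch only: issues the documented COPY TO with the new string-valued
// file_size_bytes option. "measurements" and the /tmp path are hypothetical names.
use pgrx::Spi;

fn copy_with_file_size_limit() {
    Spi::run(
        "copy measurements to '/tmp/measurements.parquet' \
         with (format parquet, file_size_bytes '1MB')",
    )
    .unwrap();

    // With file_size_bytes set, '/tmp/measurements.parquet' is created as a folder and the
    // data is split into data_0.parquet, data_1.parquet, ... files of roughly 1MB each.
}
```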

src/parquet_copy_hook/copy_to_dest_receiver.rs

Lines changed: 3 additions & 6 deletions
@@ -167,12 +167,9 @@ pub(crate) extern "C" fn copy_startup(
     };
     parquet_dest.natts = tupledesc.len();

-    parquet_dest.target_batch_size = if parquet_dest.copy_options.row_group_size < RECORD_BATCH_SIZE
-    {
-        parquet_dest.copy_options.row_group_size
-    } else {
-        RECORD_BATCH_SIZE
-    };
+    // handle when row group size is set less than RECORD_BATCH_SIZE
+    parquet_dest.target_batch_size =
+        std::cmp::min(parquet_dest.copy_options.row_group_size, RECORD_BATCH_SIZE);

     let uri = unsafe { CStr::from_ptr(parquet_dest.uri) }
         .to_str()
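The removed if/else and the new `std::cmp::min` call compute the same value. A standalone sketch of that clamping behaviour, with `RECORD_BATCH_SIZE` given a placeholder value since its real definition is outside this diff:

```rust
// Standalone sketch; 65_536 is a placeholder, not the extension's actual RECORD_BATCH_SIZE,
// and i64 is an assumed type for the sketch.
const RECORD_BATCH_SIZE: i64 = 65_536;

fn target_batch_size(row_group_size: i64) -> i64 {
    // identical to the removed if/else: never batch more rows than one row group holds
    std::cmp::min(row_group_size, RECORD_BATCH_SIZE)
}

fn main() {
    assert_eq!(target_batch_size(1_000), 1_000);    // small row groups shrink the batch
    assert_eq!(target_batch_size(122_880), 65_536); // otherwise capped at RECORD_BATCH_SIZE
}
```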

src/parquet_copy_hook/copy_to_split_dest_receiver.rs

Lines changed: 2 additions & 17 deletions
@@ -115,26 +115,11 @@ impl CopyToParquetSplitDestReceiver {
             .to_str()
             .expect("invalid uri");

-        let file_name = Path::new(uri)
-            .file_name()
-            .expect("invalid uri")
-            .to_str()
-            .expect("invalid uri");
-
-        let (file_name_prefix, file_extension) = match file_name.find('.') {
-            Some(index) => file_name.split_at(index),
-            None => (file_name, ""),
-        };
-
-        let parent_folder = Path::new(uri)
-            .parent()
-            .expect("invalid uri")
-            .join(file_name_prefix);
+        let parent_folder = Path::new(uri);

-        // append child id to final part of uri
         let file_id = self.current_child_id;

-        let child_uri = parent_folder.join(format!("data_{file_id}{file_extension}"));
+        let child_uri = parent_folder.join(format!("data_{file_id}.parquet"));

         child_uri.to_str().expect("invalid uri").as_pg_cstr()
     }
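A self-contained sketch of the resulting layout under the simplified naming: the COPY target URI itself acts as the parent folder, and every child file is `data_<id>.parquet` regardless of the original extension. The `/tmp/output.parquet` path and the helper name `child_uri` are examples, not code from this commit.

```rust
use std::path::{Path, PathBuf};

// Sketch of the simplified child naming; '/tmp/output.parquet' is an example target URI.
fn child_uri(uri: &str, file_id: u32) -> PathBuf {
    Path::new(uri).join(format!("data_{file_id}.parquet"))
}

fn main() {
    let child = child_uri("/tmp/output.parquet", 0);
    assert_eq!(child.to_str().unwrap(), "/tmp/output.parquet/data_0.parquet");

    let child = child_uri("/tmp/output.parquet", 7);
    assert_eq!(child.to_str().unwrap(), "/tmp/output.parquet/data_7.parquet");
}
```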

src/parquet_copy_hook/copy_utils.rs

Lines changed: 70 additions & 6 deletions
@@ -64,13 +64,16 @@ pub(crate) fn validate_copy_to_options(p_stmt: &PgBox<PlannedStmt>, uri_info: Pa
     let file_size_bytes_option = copy_stmt_get_option(p_stmt, "file_size_bytes");

     if !file_size_bytes_option.is_null() {
-        let file_size_bytes = unsafe { defGetInt64(file_size_bytes_option.as_ptr()) };
+        let file_size_bytes = unsafe { defGetString(file_size_bytes_option.as_ptr()) };

-        const ONE_MB: i64 = 1024 * 1024;
+        let file_size_bytes = unsafe {
+            CStr::from_ptr(file_size_bytes)
+                .to_str()
+                .expect("file_size_bytes option is not a valid CString")
+        };

-        if file_size_bytes < ONE_MB {
-            panic!("file_size_bytes must be at least {ONE_MB} bytes");
-        }
+        parse_file_size(file_size_bytes)
+            .unwrap_or_else(|e| panic!("file_size_bytes option is not valid: {}", e));
     }

     let row_group_size_option = copy_stmt_get_option(p_stmt, "row_group_size");

@@ -204,7 +207,16 @@ pub(crate) fn copy_to_stmt_file_size_bytes(p_stmt: &PgBox<PlannedStmt>) -> i64 {
     if file_size_bytes_option.is_null() {
         INVALID_FILE_SIZE_BYTES
     } else {
-        unsafe { defGetInt64(file_size_bytes_option.as_ptr()) }
+        let file_size_bytes = unsafe { defGetString(file_size_bytes_option.as_ptr()) };
+
+        let file_size_bytes = unsafe {
+            CStr::from_ptr(file_size_bytes)
+                .to_str()
+                .expect("file_size_bytes option is not a valid CString")
+        };
+
+        parse_file_size(file_size_bytes)
+            .unwrap_or_else(|e| panic!("file_size_bytes option is not valid: {}", e)) as i64
     }
 }

@@ -590,3 +602,55 @@ pub(crate) fn create_filtered_tupledesc_for_relation<'a>(

     filtered_tupledesc
 }
+
+/// Parses a size string like "1MB", "512KB", or just "1000000" into a byte count.
+/// Enforces a minimum of 1MB.
+fn parse_file_size(size_str: &str) -> Result<u64, String> {
+    // Normalize casing and trim whitespace
+    let size_str = size_str.trim().to_uppercase();
+
+    // Find the first non-digit character
+    let mut idx = 0;
+    for c in size_str.chars() {
+        if !c.is_ascii_digit() {
+            break;
+        }
+        idx += 1;
+    }
+
+    // If there's no numeric portion, return an error
+    if idx == 0 {
+        return Err(format!("No numeric value found in '{}'", size_str));
+    }
+
+    // Split into numeric part and (optional) unit
+    let num_part = &size_str[..idx];
+    let unit_part = size_str[idx..].trim();
+
+    // Convert the numeric portion
+    let mut bytes = match num_part.parse::<u64>() {
+        Ok(n) => n,
+        Err(_) => return Err(format!("Invalid numeric portion in '{}'", size_str)),
+    };
+
+    // Interpret the suffix, if present
+    match unit_part {
+        "" => { /* no suffix: treat as bytes */ }
+        "KB" => bytes *= 1_024,
+        "MB" => bytes *= 1_024 * 1_024,
+        "GB" => bytes *= 1_024 * 1_024 * 1_024,
+        _ => {
+            return Err(format!(
+                "Unrecognized unit '{}'. Allowed units are KB, MB or GB.",
+                unit_part
+            ))
+        }
+    }
+
+    // Enforce a minimum of 1MB
+    if bytes < 1_024 * 1_024 {
+        return Err(format!("Minimum allowed size is 1MB. Got {} bytes.", bytes));
+    }
+
+    Ok(bytes)
+}
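A hypothetical unit-test sketch for `parse_file_size` (not part of this commit), with expected values derived directly from the rules above: binary units, an optional KB/MB/GB suffix, and the 1MB minimum.

```rust
// Hypothetical test, assuming parse_file_size is in scope; not part of this commit.
#[test]
fn parse_file_size_examples() {
    assert_eq!(parse_file_size("1MB"), Ok(1_048_576));
    assert_eq!(parse_file_size("2097152"), Ok(2_097_152));   // bare byte count, no unit
    assert_eq!(parse_file_size(" 1gb "), Ok(1_073_741_824)); // whitespace/case are normalized
    assert!(parse_file_size("100KB").is_err());              // below the 1MB minimum
    assert!(parse_file_size("1TB").is_err());                // unsupported unit
    assert!(parse_file_size("MB").is_err());                 // no numeric portion
}
```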

src/pgrx_tests/common.rs

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@ pub(crate) fn comma_separated_copy_options(options: &HashMap<String, CopyOptionV
 }

 pub(crate) const LOCAL_TEST_FILE_PATH: &str = "/tmp/pg_parquet_test.parquet";
-pub(crate) const LOCAL_TEST_FOLDER_PATH: &str = "/tmp/pg_parquet_test";

 pub(crate) struct TestTable<T: IntoDatum + FromDatum> {
     uri: String,

src/pgrx_tests/copy_options.rs

Lines changed: 17 additions & 23 deletions
@@ -5,9 +5,7 @@ mod tests {
     use pgrx::{pg_test, Spi};

     use crate::{
-        pgrx_tests::common::{
-            CopyOptionValue, TestTable, LOCAL_TEST_FILE_PATH, LOCAL_TEST_FOLDER_PATH,
-        },
+        pgrx_tests::common::{CopyOptionValue, TestTable, LOCAL_TEST_FILE_PATH},
         PgParquetCompression,
     };

@@ -282,15 +280,16 @@
     }

     #[pg_test]
-    #[should_panic(expected = "file_size_bytes must be at least 1048576 bytes")]
+    #[should_panic(expected = "Minimum allowed size is 1MB. Got 102400 bytes.")]
     fn test_invalid_file_size_bytes() {
-        let parent_folder = Path::new(LOCAL_TEST_FOLDER_PATH);
+        let parent_folder = Path::new(LOCAL_TEST_FILE_PATH);
         std::fs::remove_dir_all(parent_folder).ok();
+        std::fs::remove_file(parent_folder).ok();

         let mut copy_options = HashMap::new();
         copy_options.insert(
             "file_size_bytes".to_string(),
-            CopyOptionValue::IntOption(100),
+            CopyOptionValue::StringOption("100KB".into()),
         );

         let test_table = TestTable::<i32>::new("int4".into()).with_copy_to_options(copy_options);

@@ -414,8 +413,6 @@

     #[pg_test]
     fn test_file_size_bytes() {
-        let parent_folder = Path::new(LOCAL_TEST_FOLDER_PATH);
-
         let uris = [
             // with ".parquet" extension
             LOCAL_TEST_FILE_PATH.to_string(),

@@ -432,29 +429,26 @@

         for (uri, expected_file_count) in uris.into_iter().zip(expected_file_counts) {
             // cleanup
+
+            // drop tables
             Spi::run("drop table if exists test_expected, test_result;").unwrap();
-            std::fs::remove_dir_all(parent_folder).ok();

-            const ONE_MB: i32 = 1024 * 1024;
+            let parent_folder = Path::new(&uri);
+
+            // remove if there is a directory
+            std::fs::remove_dir(parent_folder).ok();
+
+            // remove if there is a file
+            std::fs::remove_file(parent_folder).ok();
+
             let setup_commands = format!(
                 "create table test_expected(a text);\n\
                  create table test_result(a text);\n\
                  insert into test_expected select 'hellooooo' || i from generate_series(1, 1000000) i;\n\
-                 copy test_expected to '{uri}' with (format parquet, file_size_bytes {ONE_MB})");
+                 copy test_expected to '{uri}' with (format parquet, file_size_bytes '1MB')");
             Spi::run(&setup_commands).unwrap();

             // assert file count
-            let file_name = Path::new(&uri)
-                .file_name()
-                .expect("invalid uri")
-                .to_str()
-                .expect("invalid uri");
-
-            let file_extension = file_name
-                .find('.')
-                .map(|idx| &file_name[idx..])
-                .unwrap_or("");
-
             let mut file_entries = parent_folder
                 .read_dir()
                 .unwrap()

@@ -473,7 +467,7 @@

             // assert file paths
             for (file_idx, file_entry) in file_entries.iter().enumerate() {
-                let expected_path = parent_folder.join(format!("data_{file_idx}{file_extension}"));
+                let expected_path = parent_folder.join(format!("data_{file_idx}.parquet"));

                 let expected_path = expected_path.to_str().unwrap();
