Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ COPY table FROM 's3://mybucket/data.parquet' WITH (format 'parquet');
- [Copy FROM/TO Parquet files TO/FROM Postgres tables](#copy-tofrom-parquet-files-fromto-postgres-tables)
- [Inspect Parquet schema](#inspect-parquet-schema)
- [Inspect Parquet metadata](#inspect-parquet-metadata)
- [Inspect Parquet column statistics](#inspect-parquet-column-statistics)
- [Object Store Support](#object-store-support)
- [Copy Options](#copy-options)
- [Configuration](#configuration)
Expand Down Expand Up @@ -157,6 +158,30 @@ SELECT uri, encode(key, 'escape') as key, encode(value, 'escape') as value FROM
(1 row)
```

### Inspect Parquet column statistics

You can call `SELECT * FROM parquet.column_stats(<uri>)` to discover the column statistics of the Parquet file at the given uri, such as the min and max values of each column.

```sql
SELECT * FROM parquet.column_stats('/tmp/product_example.parquet');
column_id | field_id | stats_min | stats_max | stats_null_count | stats_distinct_count
-----------+----------+----------------------------+----------------------------+------------------+----------------------
4 | 7 | item 1 | item 2 | 1 |
6 | 11 | 1 | 1 | 1 |
7 | 12 | | | 2 |
10 | 17 | | | 2 |
0 | 0 | 1 | 1 | 0 |
11 | 18 | 2025-03-11 14:01:22.045739 | 2025-03-11 14:01:22.045739 | 0 |
3 | 6 | 1 | 2 | 1 |
12 | 19 | 2022-05-01 19:00:00+03 | 2022-05-01 19:00:00+03 | 0 |
8 | 15 | | | 2 |
5 | 8 | 1 | 2 | 1 |
9 | 16 | | | 2 |
1 | 2 | 1 | 1 | 0 |
2 | 3 | product 1 | product 1 | 0 |
(13 rows)
```

## Object Store Support
`pg_parquet` supports reading and writing Parquet files from/to `S3`, `Azure Blob Storage` and `http(s)` object stores.

Expand Down
1 change: 1 addition & 0 deletions src/parquet_udfs.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub(crate) mod metadata;
pub(crate) mod schema;
pub(crate) mod stats;
56 changes: 9 additions & 47 deletions src/parquet_udfs/metadata.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use ::parquet::file::statistics::Statistics;
use pgrx::{iter::TableIterator, name, pg_extern, pg_schema};

use crate::arrow_parquet::uri_utils::{
ensure_access_privilege_to_uri, parquet_metadata_from_uri, uri_as_string, ParsedUriInfo,
use crate::{
arrow_parquet::uri_utils::{
ensure_access_privilege_to_uri, parquet_metadata_from_uri, uri_as_string, ParsedUriInfo,
},
parquet_udfs::stats::{stats_max_value_to_pg_str, stats_min_value_to_pg_str},
};

#[pg_schema]
Expand Down Expand Up @@ -69,10 +71,12 @@ mod parquet {
let mut stats_null_count = None;
let mut stats_distinct_count = None;

let column_descriptor = column.column_descr();

if let Some(statistics) = column.statistics() {
stats_min = stats_min_value_to_str(statistics);
stats_min = stats_min_value_to_pg_str(statistics, column_descriptor);

stats_max = stats_max_value_to_str(statistics);
stats_max = stats_max_value_to_pg_str(statistics, column_descriptor);

stats_null_count = statistics.null_count_opt().map(|v| v as i64);

Expand Down Expand Up @@ -215,45 +219,3 @@ mod parquet {
TableIterator::new(rows)
}
}

/// Renders the minimum value held by `statistics` as a `String`.
///
/// Returns `None` when the statistics carry no minimum (`min_opt()` is `None`).
/// Byte-array variants are rendered as UTF-8 text when `as_utf8()` succeeds and
/// fall back to the value's own `to_string()` output otherwise.
fn stats_min_value_to_str(statistics: &Statistics) -> Option<String> {
    match statistics {
        Statistics::Boolean(typed) => typed.min_opt().map(ToString::to_string),
        Statistics::Int32(typed) => typed.min_opt().map(ToString::to_string),
        Statistics::Int64(typed) => typed.min_opt().map(ToString::to_string),
        Statistics::Int96(typed) => typed.min_opt().map(ToString::to_string),
        Statistics::Float(typed) => typed.min_opt().map(ToString::to_string),
        Statistics::Double(typed) => typed.min_opt().map(ToString::to_string),
        Statistics::ByteArray(typed) => typed.min_opt().map(|bytes| {
            // Prefer the decoded text; use the raw rendering if it is not valid UTF-8.
            bytes
                .as_utf8()
                .map(|text| text.to_string())
                .unwrap_or_else(|_| bytes.to_string())
        }),
        Statistics::FixedLenByteArray(typed) => typed.min_opt().map(|bytes| {
            // Same fallback rule as the variable-length byte array case.
            bytes
                .as_utf8()
                .map(|text| text.to_string())
                .unwrap_or_else(|_| bytes.to_string())
        }),
    }
}

/// Renders the maximum value held by `statistics` as a `String`.
///
/// Returns `None` when the statistics carry no maximum (`max_opt()` is `None`).
/// Byte-array variants are rendered as UTF-8 text when `as_utf8()` succeeds and
/// fall back to the value's own `to_string()` output otherwise.
fn stats_max_value_to_str(statistics: &Statistics) -> Option<String> {
    match statistics {
        Statistics::Boolean(typed) => typed.max_opt().map(ToString::to_string),
        Statistics::Int32(typed) => typed.max_opt().map(ToString::to_string),
        Statistics::Int64(typed) => typed.max_opt().map(ToString::to_string),
        Statistics::Int96(typed) => typed.max_opt().map(ToString::to_string),
        Statistics::Float(typed) => typed.max_opt().map(ToString::to_string),
        Statistics::Double(typed) => typed.max_opt().map(ToString::to_string),
        Statistics::ByteArray(typed) => typed.max_opt().map(|bytes| {
            // Prefer the decoded text; use the raw rendering if it is not valid UTF-8.
            bytes
                .as_utf8()
                .map(|text| text.to_string())
                .unwrap_or_else(|_| bytes.to_string())
        }),
        Statistics::FixedLenByteArray(typed) => typed.max_opt().map(|bytes| {
            // Same fallback rule as the variable-length byte array case.
            bytes
                .as_utf8()
                .map(|text| text.to_string())
                .unwrap_or_else(|_| bytes.to_string())
        }),
    }
}
Loading