Skip to content

Commit 5f53ea9

Browse files
committed
Improved stats inspection
Previously, we printed column statistics for each column per row group via `parquet.metadata(uri)`. With the new UDF `parquet.column_stats(uri)`, we print stats for each column aggregated across row groups. Stats for some of the types were printed in a text format that could not be converted to the actual Postgres type. This PR also makes sure the output format is convertible to the actual Postgres type.
1 parent fa728df commit 5f53ea9

File tree

6 files changed

+753
-45
lines changed

6 files changed

+753
-45
lines changed

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ COPY table FROM 's3://mybucket/data.parquet' WITH (format 'parquet');
2121
- [Copy FROM/TO Parquet files TO/FROM Postgres tables](#copy-tofrom-parquet-files-fromto-postgres-tables)
2222
- [Inspect Parquet schema](#inspect-parquet-schema)
2323
- [Inspect Parquet metadata](#inspect-parquet-metadata)
24+
- [Inspect Parquet column statistics](#inspect-parquet-column-statistics)
2425
- [Object Store Support](#object-store-support)
2526
- [Copy Options](#copy-options)
2627
- [Configuration](#configuration)
@@ -155,6 +156,29 @@ SELECT uri, encode(key, 'escape') as key, encode(value, 'escape') as value FROM
155156
(1 row)
156157
```
157158

159+
### Inspect Parquet column statistics
160+
You can call `SELECT * FROM parquet.column_stats(<uri>)` to discover the column statistics of the Parquet file at the given uri, such as the min and max values for each column.
161+
162+
```sql
163+
SELECT * FROM parquet.column_stats('/tmp/product_example.parquet');
164+
field_id | stats_min | stats_max | stats_null_count | stats_distinct_count
165+
----------+----------------------------+----------------------------+------------------+----------------------
166+
19 | 2022-05-01 16:00:00 | 2022-05-01 16:00:00 | 0 |
167+
15 | | | 2 |
168+
3 | product 1 | product 1 | 0 |
169+
2 | 1 | 1 | 0 |
170+
0 | 1 | 1 | 0 |
171+
6 | 1 | 2 | 1 |
172+
7 | item 1 | item 2 | 1 |
173+
16 | | | 2 |
174+
12 | | | 2 |
175+
18 | 2025-01-29 02:28:35.193773 | 2025-01-29 02:28:35.193773 | 0 |
176+
11 | 1 | 1 | 1 |
177+
8 | 1 | 2 | 1 |
178+
17 | | | 2 |
179+
(13 rows)
180+
```
181+
158182
## Object Store Support
159183
`pg_parquet` supports reading and writing Parquet files from/to `S3` and `Azure Blob Storage` object stores.
160184

src/parquet_udfs.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub(crate) mod metadata;
22
pub(crate) mod schema;
3+
pub(crate) mod stats;

src/parquet_udfs/metadata.rs

Lines changed: 6 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use ::parquet::file::statistics::Statistics;
21
use pgrx::{iter::TableIterator, name, pg_extern, pg_schema};
32

43
use crate::arrow_parquet::uri_utils::{
@@ -7,6 +6,8 @@ use crate::arrow_parquet::uri_utils::{
76

87
#[pg_schema]
98
mod parquet {
9+
use crate::parquet_udfs::stats::{stats_max_value_to_pg_str, stats_min_value_to_pg_str};
10+
1011
use super::*;
1112

1213
#[pg_extern]
@@ -65,10 +66,12 @@ mod parquet {
6566
let mut stats_null_count = None;
6667
let mut stats_distinct_count = None;
6768

69+
let column_descriptor = column.column_descr();
70+
6871
if let Some(statistics) = column.statistics() {
69-
stats_min = stats_min_value_to_str(statistics);
72+
stats_min = stats_min_value_to_pg_str(statistics, column_descriptor);
7073

71-
stats_max = stats_max_value_to_str(statistics);
74+
stats_max = stats_max_value_to_pg_str(statistics, column_descriptor);
7275

7376
stats_null_count = statistics.null_count_opt().map(|v| v as i64);
7477

@@ -202,45 +205,3 @@ mod parquet {
202205
TableIterator::new(rows)
203206
}
204207
}
205-
206-
fn stats_min_value_to_str(statistics: &Statistics) -> Option<String> {
207-
match &statistics {
208-
Statistics::Boolean(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
209-
Statistics::Int32(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
210-
Statistics::Int64(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
211-
Statistics::Int96(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
212-
Statistics::Float(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
213-
Statistics::Double(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
214-
Statistics::ByteArray(val_stats) => val_stats.min_opt().map(|v| match v.as_utf8() {
215-
Ok(v) => v.to_string(),
216-
Err(_) => v.to_string(),
217-
}),
218-
Statistics::FixedLenByteArray(val_stats) => {
219-
val_stats.min_opt().map(|v| match v.as_utf8() {
220-
Ok(v) => v.to_string(),
221-
Err(_) => v.to_string(),
222-
})
223-
}
224-
}
225-
}
226-
227-
fn stats_max_value_to_str(statistics: &Statistics) -> Option<String> {
228-
match statistics {
229-
Statistics::Boolean(statistics) => statistics.max_opt().map(|v| v.to_string()),
230-
Statistics::Int32(statistics) => statistics.max_opt().map(|v| v.to_string()),
231-
Statistics::Int64(statistics) => statistics.max_opt().map(|v| v.to_string()),
232-
Statistics::Int96(statistics) => statistics.max_opt().map(|v| v.to_string()),
233-
Statistics::Float(statistics) => statistics.max_opt().map(|v| v.to_string()),
234-
Statistics::Double(statistics) => statistics.max_opt().map(|v| v.to_string()),
235-
Statistics::ByteArray(statistics) => statistics.max_opt().map(|v| match v.as_utf8() {
236-
Ok(v) => v.to_string(),
237-
Err(_) => v.to_string(),
238-
}),
239-
Statistics::FixedLenByteArray(statistics) => {
240-
statistics.max_opt().map(|v| match v.as_utf8() {
241-
Ok(v) => v.to_string(),
242-
Err(_) => v.to_string(),
243-
})
244-
}
245-
}
246-
}

0 commit comments

Comments
 (0)