Skip to content

Commit b626eb4

Browse files
Improved stats inspection (#101)
Previously, we printed column statistics for each column per row group via `parquet.metadata(uri)`. With the new UDF `parquet.column_stats(uri)`, we print the statistics for each column aggregated across all row groups. Stats for some types were previously printed in a text format that could not be converted to the actual Postgres type; this PR also makes sure the output format is convertible to the actual Postgres type.
1 parent 1b5878d commit b626eb4

File tree

6 files changed

+877
-47
lines changed

6 files changed

+877
-47
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ COPY table FROM 's3://mybucket/data.parquet' WITH (format 'parquet');
2121
- [Copy FROM/TO Parquet files TO/FROM Postgres tables](#copy-tofrom-parquet-files-fromto-postgres-tables)
2222
- [Inspect Parquet schema](#inspect-parquet-schema)
2323
- [Inspect Parquet metadata](#inspect-parquet-metadata)
24+
- [Inspect Parquet column statistics](#inspect-parquet-column-statistics)
2425
- [Object Store Support](#object-store-support)
2526
- [Copy Options](#copy-options)
2627
- [Configuration](#configuration)
@@ -157,6 +158,30 @@ SELECT uri, encode(key, 'escape') as key, encode(value, 'escape') as value FROM
157158
(1 row)
158159
```
159160

161+
### Inspect Parquet column statistics
162+
163+
You can call `SELECT * FROM parquet.column_stats(<uri>)` to discover the column statistics of the Parquet file at the given uri, such as the min and max values of each column.
164+
165+
```sql
166+
SELECT * FROM parquet.column_stats('/tmp/product_example.parquet')
167+
column_id | field_id | stats_min | stats_max | stats_null_count | stats_distinct_count
168+
-----------+----------+----------------------------+----------------------------+------------------+----------------------
169+
4 | 7 | item 1 | item 2 | 1 |
170+
6 | 11 | 1 | 1 | 1 |
171+
7 | 12 | | | 2 |
172+
10 | 17 | | | 2 |
173+
0 | 0 | 1 | 1 | 0 |
174+
11 | 18 | 2025-03-11 14:01:22.045739 | 2025-03-11 14:01:22.045739 | 0 |
175+
3 | 6 | 1 | 2 | 1 |
176+
12 | 19 | 2022-05-01 19:00:00+03 | 2022-05-01 19:00:00+03 | 0 |
177+
8 | 15 | | | 2 |
178+
5 | 8 | 1 | 2 | 1 |
179+
9 | 16 | | | 2 |
180+
1 | 2 | 1 | 1 | 0 |
181+
2 | 3 | product 1 | product 1 | 0 |
182+
(13 rows)
183+
```
184+
160185
## Object Store Support
161186
`pg_parquet` supports reading and writing Parquet files from/to `S3`, `Azure Blob Storage` and `http(s)` object stores.
162187

src/parquet_udfs.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub(crate) mod metadata;
22
pub(crate) mod schema;
3+
pub(crate) mod stats;

src/parquet_udfs/metadata.rs

Lines changed: 9 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
use ::parquet::file::statistics::Statistics;
21
use pgrx::{iter::TableIterator, name, pg_extern, pg_schema};
32

4-
use crate::arrow_parquet::uri_utils::{
5-
ensure_access_privilege_to_uri, parquet_metadata_from_uri, uri_as_string, ParsedUriInfo,
3+
use crate::{
4+
arrow_parquet::uri_utils::{
5+
ensure_access_privilege_to_uri, parquet_metadata_from_uri, uri_as_string, ParsedUriInfo,
6+
},
7+
parquet_udfs::stats::{stats_max_value_to_pg_str, stats_min_value_to_pg_str},
68
};
79

810
#[pg_schema]
@@ -69,10 +71,12 @@ mod parquet {
6971
let mut stats_null_count = None;
7072
let mut stats_distinct_count = None;
7173

74+
let column_descriptor = column.column_descr();
75+
7276
if let Some(statistics) = column.statistics() {
73-
stats_min = stats_min_value_to_str(statistics);
77+
stats_min = stats_min_value_to_pg_str(statistics, column_descriptor);
7478

75-
stats_max = stats_max_value_to_str(statistics);
79+
stats_max = stats_max_value_to_pg_str(statistics, column_descriptor);
7680

7781
stats_null_count = statistics.null_count_opt().map(|v| v as i64);
7882

@@ -215,45 +219,3 @@ mod parquet {
215219
TableIterator::new(rows)
216220
}
217221
}
218-
219-
fn stats_min_value_to_str(statistics: &Statistics) -> Option<String> {
220-
match &statistics {
221-
Statistics::Boolean(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
222-
Statistics::Int32(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
223-
Statistics::Int64(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
224-
Statistics::Int96(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
225-
Statistics::Float(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
226-
Statistics::Double(val_stats) => val_stats.min_opt().map(|v| v.to_string()),
227-
Statistics::ByteArray(val_stats) => val_stats.min_opt().map(|v| match v.as_utf8() {
228-
Ok(v) => v.to_string(),
229-
Err(_) => v.to_string(),
230-
}),
231-
Statistics::FixedLenByteArray(val_stats) => {
232-
val_stats.min_opt().map(|v| match v.as_utf8() {
233-
Ok(v) => v.to_string(),
234-
Err(_) => v.to_string(),
235-
})
236-
}
237-
}
238-
}
239-
240-
fn stats_max_value_to_str(statistics: &Statistics) -> Option<String> {
241-
match statistics {
242-
Statistics::Boolean(statistics) => statistics.max_opt().map(|v| v.to_string()),
243-
Statistics::Int32(statistics) => statistics.max_opt().map(|v| v.to_string()),
244-
Statistics::Int64(statistics) => statistics.max_opt().map(|v| v.to_string()),
245-
Statistics::Int96(statistics) => statistics.max_opt().map(|v| v.to_string()),
246-
Statistics::Float(statistics) => statistics.max_opt().map(|v| v.to_string()),
247-
Statistics::Double(statistics) => statistics.max_opt().map(|v| v.to_string()),
248-
Statistics::ByteArray(statistics) => statistics.max_opt().map(|v| match v.as_utf8() {
249-
Ok(v) => v.to_string(),
250-
Err(_) => v.to_string(),
251-
}),
252-
Statistics::FixedLenByteArray(statistics) => {
253-
statistics.max_opt().map(|v| match v.as_utf8() {
254-
Ok(v) => v.to_string(),
255-
Err(_) => v.to_string(),
256-
})
257-
}
258-
}
259-
}

0 commit comments

Comments
 (0)