Skip to content

Commit d97f8ad

Browse files
committed
Numeric improvement and fix
**Problem** Previously, we were writing unbounded numerics, which do not specify precision and scale (i.e. `numeric`), as text since they can be too large to represent as a parquet decimal. Most of the time users omit the precision for numeric columns, so those columns were written as text. That prevented execution engines from pushing down some operators on the numeric type. **Improvement** We now read/write unbounded numerics as numeric(38, 16) in the parquet file. We throw a runtime error if an unbounded numeric value exceeds 22 digits before the decimal point or 16 digits after it. For values that hit the error, we give a hint to change the column type to a numeric(p,s) with the precision and scale specified, which gets rid of the error. **Fix** Arrow-to-pg conversions were not correct in some cases, e.g. when there is no decimal point. These cases are fixed and covered by tests.
1 parent 631cb1a commit d97f8ad

File tree

9 files changed

+523
-133
lines changed

9 files changed

+523
-133
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,12 @@ There is currently only one GUC parameter to enable/disable the `pg_parquet`:
225225
| `crunchy_map`(5) | GROUP | MAP |
226226

227227
> [!WARNING]
228-
> - (1) The `numeric` types with <= `38` precision is represented as `FIXED_LEN_BYTE_ARRAY(16)` with `DECIMAL(128)` logical type. The `numeric` types with > `38` precision is represented as `BYTE_ARRAY` with `STRING` logical type.
228+
> - (1) The `numeric` type is written with the smallest possible memory width to the parquet file, as follows:
229+
> * `numeric(P <= 9, S)` is represented as `INT32` with `DECIMAL` logical type
230+
> * `numeric(9 < P <= 18, S)` is represented as `INT64` with `DECIMAL` logical type
231+
> * `numeric(18 < P <= 38, S)` is represented as `FIXED_LEN_BYTE_ARRAY(9-16)` with `DECIMAL` logical type
232+
> * `numeric(38 < P, S)` is represented as `BYTE_ARRAY` with `STRING` logical type
233+
> * `numeric` (precision and scale not specified) is allowed by Postgres. These are represented with a default precision (38) and scale (16) instead of being written as strings. You get a runtime error if your table tries to read or write a numeric value that is not allowed by the default precision and scale (22 integral digits before the decimal point, 16 digits after the decimal point).
229234
> - (2) The `date` type is represented according to `Unix epoch` when writing to Parquet files. It is converted back according to `PostgreSQL epoch` when reading from Parquet files.
230235
> - (3) The `timestamptz` and `timetz` types are adjusted to `UTC` when writing to Parquet files. They are converted back with `UTC` timezone when reading from Parquet files.
231236
> - (4) The `geometry` type is represented as `BYTE_ARRAY` encoded as `WKB` when `postgis` extension is created. Otherwise, it is represented as `BYTE_ARRAY` with `STRING` logical type.

src/arrow_parquet/arrow_to_pg.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,7 @@ use crate::{
2424
geometry::{is_postgis_geometry_type, Geometry},
2525
map::{is_map_type, Map},
2626
pg_arrow_type_conversions::{
27-
extract_precision_from_numeric_typmod, extract_scale_from_numeric_typmod,
28-
MAX_DECIMAL_PRECISION,
27+
extract_precision_and_scale_from_numeric_typmod, should_write_numeric_as_text,
2928
},
3029
},
3130
};
@@ -66,8 +65,8 @@ pub(crate) struct ArrowToPgAttributeContext {
6665
is_map: bool,
6766
attribute_contexts: Option<Vec<ArrowToPgAttributeContext>>,
6867
attribute_tupledesc: Option<PgTupleDesc<'static>>,
69-
precision: Option<usize>,
70-
scale: Option<usize>,
68+
precision: Option<u32>,
69+
scale: Option<u32>,
7170
}
7271

7372
impl ArrowToPgAttributeContext {
@@ -127,8 +126,9 @@ impl ArrowToPgAttributeContext {
127126
let precision;
128127
let scale;
129128
if attribute_typoid == NUMERICOID {
130-
precision = Some(extract_precision_from_numeric_typmod(typmod));
131-
scale = Some(extract_scale_from_numeric_typmod(typmod));
129+
let (p, s) = extract_precision_and_scale_from_numeric_typmod(typmod);
130+
precision = Some(p);
131+
scale = Some(s);
132132
} else {
133133
precision = None;
134134
scale = None;
@@ -263,7 +263,7 @@ fn to_pg_nonarray_datum(
263263
.precision
264264
.expect("missing precision in context");
265265

266-
if precision > MAX_DECIMAL_PRECISION {
266+
if should_write_numeric_as_text(precision) {
267267
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
268268

269269
to_pg_datum!(
@@ -415,7 +415,7 @@ fn to_pg_array_datum(
415415
.precision
416416
.expect("missing precision in context");
417417

418-
if precision > MAX_DECIMAL_PRECISION {
418+
if should_write_numeric_as_text(precision) {
419419
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
420420

421421
to_pg_datum!(

src/arrow_parquet/arrow_to_pg/numeric.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,28 @@ impl ArrowArrayToPgType<AnyNumeric> for Decimal128Array {
1111
if self.is_null(0) {
1212
None
1313
} else {
14+
let precision = context.precision.expect("Expected precision");
1415
let scale = context.scale.expect("Expected scale");
15-
Some(i128_to_numeric(self.value(0), scale))
16+
17+
Some(i128_to_numeric(
18+
self.value(0),
19+
precision,
20+
scale,
21+
context.typmod,
22+
))
1623
}
1724
}
1825
}
1926

2027
// Numeric[]
2128
impl ArrowArrayToPgType<Vec<Option<AnyNumeric>>> for Decimal128Array {
2229
fn to_pg_type(self, context: &ArrowToPgAttributeContext) -> Option<Vec<Option<AnyNumeric>>> {
30+
let precision = context.precision.expect("Expected precision");
2331
let scale = context.scale.expect("Expected scale");
32+
2433
let mut vals = vec![];
2534
for val in self.iter() {
26-
let val = val.map(|v| i128_to_numeric(v, scale));
35+
let val = val.map(|v| i128_to_numeric(v, precision, scale, context.typmod));
2736
vals.push(val);
2837
}
2938
Some(vals)

src/arrow_parquet/pg_to_arrow.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@ use crate::{
2323
geometry::{is_postgis_geometry_type, Geometry},
2424
map::{is_map_type, Map},
2525
pg_arrow_type_conversions::{
26-
extract_precision_from_numeric_typmod, extract_scale_from_numeric_typmod,
27-
MAX_DECIMAL_PRECISION,
26+
extract_precision_and_scale_from_numeric_typmod, should_write_numeric_as_text,
2827
},
2928
},
3029
};
@@ -65,8 +64,8 @@ pub(crate) struct PgToArrowAttributeContext {
6564
is_geometry: bool,
6665
is_map: bool,
6766
attribute_contexts: Option<Vec<PgToArrowAttributeContext>>,
68-
scale: Option<usize>,
69-
precision: Option<usize>,
67+
scale: Option<u32>,
68+
precision: Option<u32>,
7069
}
7170

7271
impl PgToArrowAttributeContext {
@@ -126,8 +125,9 @@ impl PgToArrowAttributeContext {
126125
let precision;
127126
let scale;
128127
if attribute_typoid == NUMERICOID {
129-
precision = Some(extract_precision_from_numeric_typmod(typmod));
130-
scale = Some(extract_scale_from_numeric_typmod(typmod));
128+
let (p, s) = extract_precision_and_scale_from_numeric_typmod(typmod);
129+
precision = Some(p);
130+
scale = Some(s);
131131
} else {
132132
precision = None;
133133
scale = None;
@@ -274,7 +274,7 @@ fn to_arrow_primitive_array(
274274
.precision
275275
.expect("missing precision in context");
276276

277-
if precision > MAX_DECIMAL_PRECISION {
277+
if should_write_numeric_as_text(precision) {
278278
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
279279

280280
to_arrow_primitive_array!(FallbackToText, tuples, attribute_context)
@@ -359,7 +359,7 @@ fn to_arrow_list_array(
359359
.precision
360360
.expect("missing precision in context");
361361

362-
if precision > MAX_DECIMAL_PRECISION {
362+
if should_write_numeric_as_text(precision) {
363363
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
364364

365365
to_arrow_list_array!(pgrx::Array<FallbackToText>, tuples, attribute_context)

src/arrow_parquet/pg_to_arrow/numeric.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ impl PgTypeToArrowArray<AnyNumeric> for Vec<Option<AnyNumeric>> {
2222

2323
let numerics = self
2424
.into_iter()
25-
.map(|numeric| numeric.map(numeric_to_i128))
25+
.map(|numeric| {
26+
numeric
27+
.map(|numeric| numeric_to_i128(numeric, context.typmod, context.field.name()))
28+
})
2629
.collect::<Vec<_>>();
2730

2831
let numeric_array = Decimal128Array::from(numerics)
@@ -43,7 +46,10 @@ impl PgTypeToArrowArray<AnyNumeric> for Vec<Option<Vec<Option<AnyNumeric>>>> {
4346
.into_iter()
4447
.flatten()
4548
.flatten()
46-
.map(|numeric| numeric.map(numeric_to_i128))
49+
.map(|numeric| {
50+
numeric
51+
.map(|numeric| numeric_to_i128(numeric, context.typmod, context.field.name()))
52+
})
4753
.collect::<Vec<_>>();
4854

4955
let precision = context

src/arrow_parquet/schema_parser.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ use crate::{
1818
geometry::is_postgis_geometry_type,
1919
map::is_map_type,
2020
pg_arrow_type_conversions::{
21-
extract_precision_from_numeric_typmod, extract_scale_from_numeric_typmod,
22-
MAX_DECIMAL_PRECISION,
21+
extract_precision_and_scale_from_numeric_typmod, should_write_numeric_as_text,
2322
},
2423
},
2524
};
@@ -213,10 +212,9 @@ fn parse_primitive_schema(
213212
INT4OID => Field::new(elem_name, arrow::datatypes::DataType::Int32, true),
214213
INT8OID => Field::new(elem_name, arrow::datatypes::DataType::Int64, true),
215214
NUMERICOID => {
216-
let precision = extract_precision_from_numeric_typmod(typmod);
217-
let scale = extract_scale_from_numeric_typmod(typmod);
215+
let (precision, scale) = extract_precision_and_scale_from_numeric_typmod(typmod);
218216

219-
if precision > MAX_DECIMAL_PRECISION {
217+
if should_write_numeric_as_text(precision) {
220218
Field::new(elem_name, arrow::datatypes::DataType::Utf8, true)
221219
} else {
222220
Field::new(

0 commit comments

Comments
 (0)