Skip to content

Commit da43062

Browse files
committed
Numeric improvement and fix
**Problem** Previously, we were writing unbounded numerics — those that do not specify precision and scale (i.e. `numeric`) — as text, since they can be too large to represent as a parquet decimal. Most of the time users omit the precision for numeric columns, so these columns were written as text. That prevented execution engines from pushing down some operators on the numeric type. **Improvement** We now read/write unbounded numerics as numeric(38, 16) in the parquet file. We throw a runtime error if an unbounded numeric value exceeds 22 digits before the decimal point or 16 digits after it. For values that hit this error, we give a hint to change the column type to a numeric(p, s) with precision and scale specified, which gets rid of the error. **Fix** Arrow-to-pg conversions were not correct in some cases, e.g. when there is no decimal point. These cases are fixed and covered by tests.
1 parent 78fc489 commit da43062

File tree

9 files changed

+485
-123
lines changed

9 files changed

+485
-123
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,12 @@ There is currently only one GUC parameter to enable/disable the `pg_parquet`:
225225
| `crunchy_map`(5) | GROUP | MAP |
226226

227227
> [!WARNING]
228-
> - (1) The `numeric` types with <= `38` precision is represented as `FIXED_LEN_BYTE_ARRAY(16)` with `DECIMAL(128)` logical type. The `numeric` types with > `38` precision is represented as `BYTE_ARRAY` with `STRING` logical type.
228+
> - (1) The `numeric` type is written with the smallest possible memory width to the parquet file, as follows:
229+
> * `numeric(P <= 9, S)` is represented as `INT32` with `DECIMAL` logical type
230+
> * `numeric(9 < P <= 18, S)` is represented as `INT64` with `DECIMAL` logical type
231+
> * `numeric(18 < P <= 38, S)` is represented as `FIXED_LEN_BYTE_ARRAY(9-16)` with `DECIMAL` logical type
232+
> * `numeric(38 < P, S)` is represented as `BYTE_ARRAY` with `STRING` logical type
233+
> * `numeric` (precision and scale not specified) is allowed by Postgres. These are represented with a default precision (38) and scale (16) instead of being written as strings. You get a runtime error if your table tries to read or write a numeric value that is not allowed by the default precision and scale (22 integral digits before the decimal point, 16 digits after the decimal point).
229234
> - (2) The `date` type is represented according to `Unix epoch` when writing to Parquet files. It is converted back according to `PostgreSQL epoch` when reading from Parquet files.
230235
> - (3) The `timestamptz` and `timetz` types are adjusted to `UTC` when writing to Parquet files. They are converted back with `UTC` timezone when reading from Parquet files.
231236
> - (4) The `geometry` type is represented as `BYTE_ARRAY` encoded as `WKB` when `postgis` extension is created. Otherwise, it is represented as `BYTE_ARRAY` with `STRING` logical type.

src/arrow_parquet/arrow_to_pg.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use crate::{
2525
map::{is_map_type, Map},
2626
pg_arrow_type_conversions::{
2727
extract_precision_from_numeric_typmod, extract_scale_from_numeric_typmod,
28-
MAX_DECIMAL_PRECISION,
28+
should_write_numeric_as_text,
2929
},
3030
},
3131
};
@@ -66,8 +66,8 @@ pub(crate) struct ArrowToPgAttributeContext {
6666
is_map: bool,
6767
attribute_contexts: Option<Vec<ArrowToPgAttributeContext>>,
6868
attribute_tupledesc: Option<PgTupleDesc<'static>>,
69-
precision: Option<usize>,
70-
scale: Option<usize>,
69+
precision: Option<i32>,
70+
scale: Option<i32>,
7171
}
7272

7373
impl ArrowToPgAttributeContext {
@@ -263,7 +263,7 @@ fn to_pg_nonarray_datum(
263263
.precision
264264
.expect("missing precision in context");
265265

266-
if precision > MAX_DECIMAL_PRECISION {
266+
if should_write_numeric_as_text(precision) {
267267
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
268268

269269
to_pg_datum!(
@@ -415,7 +415,7 @@ fn to_pg_array_datum(
415415
.precision
416416
.expect("missing precision in context");
417417

418-
if precision > MAX_DECIMAL_PRECISION {
418+
if should_write_numeric_as_text(precision) {
419419
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
420420

421421
to_pg_datum!(

src/arrow_parquet/arrow_to_pg/numeric.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,28 @@ impl ArrowArrayToPgType<AnyNumeric> for Decimal128Array {
1111
if self.is_null(0) {
1212
None
1313
} else {
14+
let precision = context.precision.expect("Expected precision");
1415
let scale = context.scale.expect("Expected scale");
15-
Some(i128_to_numeric(self.value(0), scale))
16+
17+
Some(i128_to_numeric(
18+
self.value(0),
19+
precision,
20+
scale,
21+
context.typmod,
22+
))
1623
}
1724
}
1825
}
1926

2027
// Numeric[]
2128
impl ArrowArrayToPgType<Vec<Option<AnyNumeric>>> for Decimal128Array {
2229
fn to_pg_type(self, context: &ArrowToPgAttributeContext) -> Option<Vec<Option<AnyNumeric>>> {
30+
let precision = context.precision.expect("Expected precision");
2331
let scale = context.scale.expect("Expected scale");
32+
2433
let mut vals = vec![];
2534
for val in self.iter() {
26-
let val = val.map(|v| i128_to_numeric(v, scale));
35+
let val = val.map(|v| i128_to_numeric(v, precision, scale, context.typmod));
2736
vals.push(val);
2837
}
2938
Some(vals)

src/arrow_parquet/pg_to_arrow.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use crate::{
2424
map::{is_map_type, Map},
2525
pg_arrow_type_conversions::{
2626
extract_precision_from_numeric_typmod, extract_scale_from_numeric_typmod,
27-
MAX_DECIMAL_PRECISION,
27+
should_write_numeric_as_text,
2828
},
2929
},
3030
};
@@ -65,8 +65,8 @@ pub(crate) struct PgToArrowAttributeContext {
6565
is_geometry: bool,
6666
is_map: bool,
6767
attribute_contexts: Option<Vec<PgToArrowAttributeContext>>,
68-
scale: Option<usize>,
69-
precision: Option<usize>,
68+
scale: Option<i32>,
69+
precision: Option<i32>,
7070
}
7171

7272
impl PgToArrowAttributeContext {
@@ -274,7 +274,7 @@ fn to_arrow_primitive_array(
274274
.precision
275275
.expect("missing precision in context");
276276

277-
if precision > MAX_DECIMAL_PRECISION {
277+
if should_write_numeric_as_text(precision) {
278278
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
279279

280280
to_arrow_primitive_array!(FallbackToText, tuples, attribute_context)
@@ -359,7 +359,7 @@ fn to_arrow_list_array(
359359
.precision
360360
.expect("missing precision in context");
361361

362-
if precision > MAX_DECIMAL_PRECISION {
362+
if should_write_numeric_as_text(precision) {
363363
reset_fallback_to_text_context(attribute_context.typoid, attribute_context.typmod);
364364

365365
to_arrow_list_array!(pgrx::Array<FallbackToText>, tuples, attribute_context)

src/arrow_parquet/pg_to_arrow/numeric.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ impl PgTypeToArrowArray<AnyNumeric> for Vec<Option<AnyNumeric>> {
2222

2323
let numerics = self
2424
.into_iter()
25-
.map(|numeric| numeric.map(numeric_to_i128))
25+
.map(|numeric| {
26+
numeric
27+
.map(|numeric| numeric_to_i128(numeric, context.typmod, context.field.name()))
28+
})
2629
.collect::<Vec<_>>();
2730

2831
let numeric_array = Decimal128Array::from(numerics)
@@ -43,7 +46,10 @@ impl PgTypeToArrowArray<AnyNumeric> for Vec<Option<Vec<Option<AnyNumeric>>>> {
4346
.into_iter()
4447
.flatten()
4548
.flatten()
46-
.map(|numeric| numeric.map(numeric_to_i128))
49+
.map(|numeric| {
50+
numeric
51+
.map(|numeric| numeric_to_i128(numeric, context.typmod, context.field.name()))
52+
})
4753
.collect::<Vec<_>>();
4854

4955
let precision = context

src/arrow_parquet/schema_parser.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use crate::{
1919
map::is_map_type,
2020
pg_arrow_type_conversions::{
2121
extract_precision_from_numeric_typmod, extract_scale_from_numeric_typmod,
22-
MAX_DECIMAL_PRECISION,
22+
should_write_numeric_as_text,
2323
},
2424
},
2525
};
@@ -214,11 +214,12 @@ fn parse_primitive_schema(
214214
INT8OID => Field::new(elem_name, arrow::datatypes::DataType::Int64, true),
215215
NUMERICOID => {
216216
let precision = extract_precision_from_numeric_typmod(typmod);
217-
let scale = extract_scale_from_numeric_typmod(typmod);
218217

219-
if precision > MAX_DECIMAL_PRECISION {
218+
if should_write_numeric_as_text(precision) {
220219
Field::new(elem_name, arrow::datatypes::DataType::Utf8, true)
221220
} else {
221+
let scale = extract_scale_from_numeric_typmod(typmod);
222+
222223
Field::new(
223224
elem_name,
224225
arrow::datatypes::DataType::Decimal128(precision as _, scale as _),

0 commit comments

Comments
 (0)