Skip to content

Commit 58424d9

Browse files
Micah Kornfield, emkornfield, pitrou
authored and committed
ARROW-13806: [C++][Python] Add support for new MonthDayNano Interval Type
- Refactored ObjectWriter helpers from arrow_to_pandas so they can be used for plain Python types as well (generalized the lowest level so it can work on both PyObject** and an adapter for PyList). - Added DateOffset to the static pandas imports. - Tried to start laying out code in a way that uses C++ for Array.to_pylist (feel free to comment). - Support importing from timedelta, relativedelta and DateOffset types (this is mostly duck typing; the one complication is that relativedelta has a property `weeks` that is automatically calculated, so some type checking is necessary). Open questions: - Should we be more strict on duck-typed imports? I chose generality over performance here (rechecking non-present attributes, etc.). - Is the new arrow_to_python_internal.h desirable (I think this can be easily extended for other types)? - My Python is rusty and my Python C-API even more so; please don't assume I know exactly what I'm doing :) Closes apache#11302 from emkornfield/interval_python Lead-authored-by: Micah Kornfield <[email protected]> Co-authored-by: emkornfield <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 84099c1 commit 58424d9

25 files changed

+695
-47
lines changed

cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,13 @@ std::shared_ptr<CastFunction> GetDurationCast() {
505505
return func;
506506
}
507507

508+
std::shared_ptr<CastFunction> GetIntervalCast() {
509+
auto func = std::make_shared<CastFunction>("cast_month_day_nano_interval",
510+
Type::INTERVAL_MONTH_DAY_NANO);
511+
AddCommonCasts(Type::INTERVAL_MONTH_DAY_NANO, kOutputTargetType, func.get());
512+
return func;
513+
}
514+
508515
std::shared_ptr<CastFunction> GetTime32Cast() {
509516
auto func = std::make_shared<CastFunction>("cast_time32", Type::TIME32);
510517
AddCommonCasts(Type::TIME32, kOutputTargetType, func.get());
@@ -579,6 +586,7 @@ std::vector<std::shared_ptr<CastFunction>> GetTemporalCasts() {
579586
functions.push_back(GetDate32Cast());
580587
functions.push_back(GetDate64Cast());
581588
functions.push_back(GetDurationCast());
589+
functions.push_back(GetIntervalCast());
582590
functions.push_back(GetTime32Cast());
583591
functions.push_back(GetTime64Cast());
584592
functions.push_back(GetTimestampCast());

cpp/src/arrow/python/arrow_to_pandas.cc

Lines changed: 53 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848

4949
#include "arrow/compute/api.h"
5050

51+
#include "arrow/python/arrow_to_python_internal.h"
5152
#include "arrow/python/common.h"
5253
#include "arrow/python/datetime.h"
5354
#include "arrow/python/decimal.h"
@@ -574,24 +575,6 @@ inline void ConvertIntegerNoNullsCast(const PandasOptions& options,
574575
}
575576
}
576577

577-
// Generic Array -> PyObject** converter that handles object deduplication, if
578-
// requested
579-
template <typename ArrayType, typename WriteValue>
580-
inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
581-
PyObject** out_values) {
582-
const bool has_nulls = arr.null_count() > 0;
583-
for (int64_t i = 0; i < arr.length(); ++i) {
584-
if (has_nulls && arr.IsNull(i)) {
585-
Py_INCREF(Py_None);
586-
*out_values = Py_None;
587-
} else {
588-
RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
589-
}
590-
++out_values;
591-
}
592-
return Status::OK();
593-
}
594-
595578
template <typename T, typename Enable = void>
596579
struct MemoizationTraits {
597580
using Scalar = typename T::c_type;
@@ -604,14 +587,15 @@ struct MemoizationTraits<T, enable_if_has_string_view<T>> {
604587
using Scalar = util::string_view;
605588
};
606589

590+
// Generic Array -> PyObject** converter that handles object deduplication, if
591+
// requested
607592
template <typename Type, typename WrapFunction>
608593
inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data,
609594
WrapFunction&& wrap_func, PyObject** out_values) {
610595
using ArrayType = typename TypeTraits<Type>::ArrayType;
611596
using Scalar = typename MemoizationTraits<Type>::Scalar;
612597

613-
// TODO(fsaintjacques): propagate memory pool.
614-
::arrow::internal::ScalarMemoTable<Scalar> memo_table(default_memory_pool());
598+
::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
615599
std::vector<PyObject*> unique_values;
616600
int32_t memo_size = 0;
617601

@@ -636,11 +620,11 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra
636620
};
637621

638622
for (int c = 0; c < data.num_chunks(); c++) {
639-
const auto& arr = checked_cast<const ArrayType&>(*data.chunk(c));
623+
const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
640624
if (options.deduplicate_objects) {
641-
RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values));
625+
RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values));
642626
} else {
643-
RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values));
627+
RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values));
644628
}
645629
out_values += arr.length();
646630
}
@@ -1097,6 +1081,42 @@ struct ObjectWriterVisitor {
10971081
return Status::OK();
10981082
}
10991083

1084+
template <typename Type>
1085+
enable_if_t<std::is_same<Type, MonthDayNanoIntervalType>::value, Status> Visit(
1086+
const Type& type) {
1087+
OwnedRef args(PyTuple_New(0));
1088+
OwnedRef kwargs(PyDict_New());
1089+
RETURN_IF_PYERROR();
1090+
auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval,
1091+
PyObject** out) {
1092+
DCHECK(internal::BorrowPandasDataOffsetType() != nullptr);
1093+
// DateOffset objects do not add nanoseconds component to pd.Timestamp.
1094+
// as of Pandas 1.3.3
1095+
// (https://github.com/pandas-dev/pandas/issues/43892).
1096+
// So convert microseconds and remainder to preserve data
1097+
// but give users more expected results.
1098+
int64_t microseconds = interval.nanoseconds / 1000;
1099+
int64_t nanoseconds;
1100+
if (interval.nanoseconds >= 0) {
1101+
nanoseconds = interval.nanoseconds % 1000;
1102+
} else {
1103+
nanoseconds = -((-interval.nanoseconds) % 1000);
1104+
}
1105+
1106+
PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months));
1107+
PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days));
1108+
PyDict_SetItemString(kwargs.obj(), "microseconds",
1109+
PyLong_FromLongLong(microseconds));
1110+
PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds));
1111+
*out =
1112+
PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj());
1113+
RETURN_IF_PYERROR();
1114+
return Status::OK();
1115+
};
1116+
return ConvertAsPyObjects<MonthDayNanoIntervalType>(options, data, to_date_offset,
1117+
out_values);
1118+
}
1119+
11001120
Status Visit(const Decimal128Type& type) {
11011121
OwnedRef decimal;
11021122
OwnedRef Decimal;
@@ -1171,7 +1191,8 @@ struct ObjectWriterVisitor {
11711191
std::is_same<DictionaryType, Type>::value ||
11721192
std::is_same<DurationType, Type>::value ||
11731193
std::is_same<ExtensionType, Type>::value ||
1174-
std::is_base_of<IntervalType, Type>::value ||
1194+
(std::is_base_of<IntervalType, Type>::value &&
1195+
!std::is_same<MonthDayNanoIntervalType, Type>::value) ||
11751196
std::is_base_of<UnionType, Type>::value,
11761197
Status>
11771198
Visit(const Type& type) {
@@ -1869,13 +1890,14 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
18691890
case Type::LARGE_STRING: // fall through
18701891
case Type::BINARY: // fall through
18711892
case Type::LARGE_BINARY:
1872-
case Type::NA: // fall through
1873-
case Type::FIXED_SIZE_BINARY: // fall through
1874-
case Type::STRUCT: // fall through
1875-
case Type::TIME32: // fall through
1876-
case Type::TIME64: // fall through
1877-
case Type::DECIMAL128: // fall through
1878-
case Type::DECIMAL256: // fall through
1893+
case Type::NA: // fall through
1894+
case Type::FIXED_SIZE_BINARY: // fall through
1895+
case Type::STRUCT: // fall through
1896+
case Type::TIME32: // fall through
1897+
case Type::TIME64: // fall through
1898+
case Type::DECIMAL128: // fall through
1899+
case Type::DECIMAL256: // fall through
1900+
case Type::INTERVAL_MONTH_DAY_NANO: // fall through
18791901
*output_type = PandasWriter::OBJECT;
18801902
break;
18811903
case Type::DATE32: // fall through
cpp/src/arrow/python/arrow_to_python_internal.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once
19+
20+
#include "arrow/array.h"
21+
#include "arrow/python/platform.h"
22+
23+
namespace arrow {
24+
namespace py {
25+
namespace internal {
26+
// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic
27+
// to the .cc file and move this there as well if we can.
28+
29+
// Converts array to a sequency of python objects.
30+
template <typename ArrayType, typename WriteValue, typename Assigner>
31+
inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
32+
Assigner out_values) {
33+
// TODO(ARROW-12976): Use visitor here?
34+
const bool has_nulls = arr.null_count() > 0;
35+
for (int64_t i = 0; i < arr.length(); ++i) {
36+
if (has_nulls && arr.IsNull(i)) {
37+
Py_INCREF(Py_None);
38+
*out_values = Py_None;
39+
} else {
40+
RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
41+
}
42+
++out_values;
43+
}
44+
return Status::OK();
45+
}
46+
47+
} // namespace internal
48+
} // namespace py
49+
} // namespace arrow

cpp/src/arrow/python/datetime.cc

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,12 @@
2020
#include <chrono>
2121
#include <iomanip>
2222

23+
#include "arrow/array.h"
24+
#include "arrow/python/arrow_to_python_internal.h"
2325
#include "arrow/python/common.h"
2426
#include "arrow/python/helpers.h"
2527
#include "arrow/python/platform.h"
28+
#include "arrow/scalar.h"
2629
#include "arrow/status.h"
2730
#include "arrow/type.h"
2831
#include "arrow/util/logging.h"
@@ -71,6 +74,26 @@ bool MatchFixedOffset(const std::string& tz, util::string_view* sign,
7174
return iter == (tz.data() + tz.size());
7275
}
7376

77+
// Type object backing the MonthDayNano struct sequence. It starts out
// zero-initialized; NewMonthDayNanoTupleType() treats a null tp_name as
// "not yet initialized" and fills it in via PyStructSequence_InitType2().
static PyTypeObject MonthDayNanoTupleType = {};
78+
79+
// Strips constness from a C string.
//
// Workaround for Python versions < 3.7, where the string members of the
// PyStructSequence structs were declared non-const (C++ does not allow
// assigning string literals to `char*`). The returned pointer aliases `text`.
constexpr char* NonConst(const char* text) { return const_cast<char*>(text); }
84+
85+
static PyStructSequence_Field MonthDayNanoField[] = {
86+
{NonConst("months"), NonConst("The number of months in the interval")},
87+
{NonConst("days"), NonConst("The number days in the interval")},
88+
{NonConst("nanoseconds"), NonConst("The number of nanoseconds in the interval")},
89+
{nullptr, nullptr}};
90+
91+
static PyStructSequence_Desc MonthDayNanoTupleDesc = {
92+
NonConst("MonthDayNano"),
93+
NonConst("A calendar interval consisting of months, days and nanoseconds."),
94+
MonthDayNanoField,
95+
/*n_in_sequence=*/3};
96+
7497
} // namespace
7598

7699
PyDateTime_CAPI* datetime_api = nullptr;
@@ -270,6 +293,16 @@ static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_
270293
return Status::OK();
271294
}
272295

296+
// Returns a new reference to the MonthDayNano struct sequence type, initializing
// the static type object on first use (detected by a null tp_name). Aborts the
// interpreter via Py_FatalError() if initialization fails.
PyObject* NewMonthDayNanoTupleType() {
  const bool needs_init = MonthDayNanoTupleType.tp_name == nullptr;
  if (needs_init &&
      PyStructSequence_InitType2(&MonthDayNanoTupleType, &MonthDayNanoTupleDesc) != 0) {
    Py_FatalError("Could not initialize MonthDayNanoTuple");
  }
  PyObject* type_object = reinterpret_cast<PyObject*>(&MonthDayNanoTupleType);
  Py_INCREF(type_object);
  return type_object;
}
305+
273306
Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
274307
int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
275308
RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
@@ -450,6 +483,84 @@ Result<std::string> TzinfoToString(PyObject* tzinfo) {
450483
return PyTZInfo_utcoffset_hhmm(tzinfo);
451484
}
452485

486+
// Builds a MonthDayNano struct sequence (months, days, nanoseconds) from an
// Arrow interval value.
//
// Returns a new reference, or nullptr with a Python error set if the tuple or
// any of its integer fields could not be created.
//
// NOTE(review): presumably requires that NewMonthDayNanoTupleType() has already
// initialized MonthDayNanoTupleType -- confirm against callers.
PyObject* MonthDayNanoIntervalToNamedTuple(
    const MonthDayNanoIntervalType::MonthDayNanos& interval) {
  OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType));
  if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) {
    return nullptr;
  }
  // PyStructSequence_SetItem() steals the reference to `value`. A null `value`
  // means the integer allocation failed; report that to the caller instead of
  // storing a null slot in a tuple that is then returned as valid.
  auto set_field = [&tuple](Py_ssize_t pos, PyObject* value) -> bool {
    if (ARROW_PREDICT_FALSE(value == nullptr)) {
      return false;
    }
    PyStructSequence_SetItem(tuple.obj(), pos, value);
    return true;
  };
  // Short-circuiting stops creating further integers after the first failure;
  // `tuple` releases the partially filled sequence on the early return.
  if (!set_field(/*pos=*/0, PyLong_FromLong(interval.months)) ||
      !set_field(/*pos=*/1, PyLong_FromLong(interval.days)) ||
      !set_field(/*pos=*/2, PyLong_FromLongLong(interval.nanoseconds))) {
    return nullptr;
  }
  return tuple.detach();
}
498+
499+
namespace {
500+
501+
// Wrapper around a Python list object that mimics dereference and assignment
502+
// operations.
503+
struct PyListAssigner {
504+
public:
505+
explicit PyListAssigner(PyObject* list) : list_(list) { DCHECK(PyList_Check(list_)); }
506+
507+
PyListAssigner& operator*() { return *this; }
508+
509+
void operator=(PyObject* obj) {
510+
if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) {
511+
Py_FatalError("list did not have the correct preallocated size.");
512+
}
513+
}
514+
515+
PyListAssigner& operator++() {
516+
current_index_++;
517+
return *this;
518+
}
519+
520+
PyListAssigner& operator+=(int64_t offset) {
521+
current_index_ += offset;
522+
return *this;
523+
}
524+
525+
private:
526+
PyObject* list_;
527+
int64_t current_index_ = 0;
528+
};
529+
530+
} // namespace
531+
532+
// Converts a MonthDayNano interval array to a new Python list with one entry per
// element: a MonthDayNano struct sequence for valid values, None for nulls.
// Returns a new reference to the list, or an error status if a Python error
// occurred.
Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
    const MonthDayNanoIntervalArray& array) {
  OwnedRef out_list(PyList_New(array.length()));
  RETURN_IF_PYERROR();

  // Writes the struct sequence for one non-null value into the current slot.
  auto write_tuple = [](const MonthDayNanoIntervalType::MonthDayNanos& interval,
                        PyListAssigner& out) -> Status {
    PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval);
    if (ARROW_PREDICT_FALSE(tuple == nullptr)) {
      RETURN_IF_PYERROR();
    }

    *out = tuple;
    return Status::OK();
  };

  PyListAssigner out_objects(out_list.obj());
  RETURN_NOT_OK(internal::WriteArrayObjects(array, write_tuple, out_objects));
  return out_list.detach();
}
553+
554+
// Converts a MonthDayNano interval scalar to a Python object: a new reference to
// None when the scalar is null, otherwise a new MonthDayNano struct sequence.
// Returns an error status if building the struct sequence raised a Python error.
Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
    const MonthDayNanoIntervalScalar& scalar) {
  if (!scalar.is_valid) {
    Py_INCREF(Py_None);
    return Py_None;
  }
  PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(scalar.value);
  if (ARROW_PREDICT_FALSE(tuple == nullptr)) {
    // Surface the pending Python error as a Status rather than handing back a
    // successful Result that wraps a null pointer.
    RETURN_IF_PYERROR();
  }
  return tuple;
}
563+
453564
} // namespace internal
454565
} // namespace py
455566
} // namespace arrow

0 commit comments

Comments
 (0)