ARROW-13806: [C++][Python] Add support for new MonthDayNano Interval Type #11302

Closed
wants to merge 24 commits
8 changes: 8 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
@@ -505,6 +505,13 @@ std::shared_ptr<CastFunction> GetDurationCast() {
return func;
}

std::shared_ptr<CastFunction> GetIntervalCast() {
auto func = std::make_shared<CastFunction>("cast_month_day_nano_interval",
Type::INTERVAL_MONTH_DAY_NANO);
AddCommonCasts(Type::INTERVAL_MONTH_DAY_NANO, kOutputTargetType, func.get());
return func;
}

std::shared_ptr<CastFunction> GetTime32Cast() {
auto func = std::make_shared<CastFunction>("cast_time32", Type::TIME32);
AddCommonCasts(Type::TIME32, kOutputTargetType, func.get());
@@ -579,6 +586,7 @@ std::vector<std::shared_ptr<CastFunction>> GetTemporalCasts() {
functions.push_back(GetDate32Cast());
functions.push_back(GetDate64Cast());
functions.push_back(GetDurationCast());
functions.push_back(GetIntervalCast());
functions.push_back(GetTime32Cast());
functions.push_back(GetTime64Cast());
functions.push_back(GetTimestampCast());
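For context, a minimal sketch (not part of the diff) of what registering the new cast function enables from Python. It assumes a pyarrow build containing this change, that tuples of (months, days, nanoseconds) convert to the new type, and that AddCommonCasts registers the identity and from-null casts:

import pyarrow as pa

ty = pa.month_day_nano_interval()
arr = pa.array([(1, 15, 3_600_000_000_000), None], type=ty)

# Identity cast registered by AddCommonCasts above.
assert arr.cast(ty).type == ty

# Cast from null should also be one of the common casts.
print(pa.nulls(2).cast(ty))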
84 changes: 53 additions & 31 deletions cpp/src/arrow/python/arrow_to_pandas.cc
@@ -48,6 +48,7 @@

#include "arrow/compute/api.h"

#include "arrow/python/arrow_to_python_internal.h"
#include "arrow/python/common.h"
#include "arrow/python/datetime.h"
#include "arrow/python/decimal.h"
@@ -574,24 +575,6 @@ inline void ConvertIntegerNoNullsCast(const PandasOptions& options,
}
}

// Generic Array -> PyObject** converter that handles object deduplication, if
// requested
template <typename ArrayType, typename WriteValue>
inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
PyObject** out_values) {
const bool has_nulls = arr.null_count() > 0;
for (int64_t i = 0; i < arr.length(); ++i) {
if (has_nulls && arr.IsNull(i)) {
Py_INCREF(Py_None);
*out_values = Py_None;
} else {
RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
}
++out_values;
}
return Status::OK();
}

template <typename T, typename Enable = void>
struct MemoizationTraits {
using Scalar = typename T::c_type;
@@ -604,14 +587,15 @@ struct MemoizationTraits<T, enable_if_has_string_view<T>> {
using Scalar = util::string_view;
};

// Generic Array -> PyObject** converter that handles object deduplication, if
// requested
template <typename Type, typename WrapFunction>
inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data,
WrapFunction&& wrap_func, PyObject** out_values) {
using ArrayType = typename TypeTraits<Type>::ArrayType;
using Scalar = typename MemoizationTraits<Type>::Scalar;

// TODO(fsaintjacques): propagate memory pool.
::arrow::internal::ScalarMemoTable<Scalar> memo_table(default_memory_pool());
::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
std::vector<PyObject*> unique_values;
int32_t memo_size = 0;

@@ -636,11 +620,11 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra
};

for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = checked_cast<const ArrayType&>(*data.chunk(c));
const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
if (options.deduplicate_objects) {
RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values));
RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values));
} else {
RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values));
RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values));
}
out_values += arr.length();
}
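As an aside, a hedged Python-level sketch (not from the diff) of the object deduplication that ConvertAsPyObjects implements via the memo table; deduplicate_objects is a real to_pandas option, and the object-sharing behaviour shown is the expected effect rather than a guaranteed one:

import pyarrow as pa

table = pa.table({"s": ["spam", "spam", "eggs"]})
df = table.to_pandas(deduplicate_objects=True)

# With deduplication, repeated values should be backed by the same Python object.
print(df["s"][0] is df["s"][1])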
@@ -1097,6 +1081,42 @@ struct ObjectWriterVisitor {
return Status::OK();
}

template <typename Type>
enable_if_t<std::is_same<Type, MonthDayNanoIntervalType>::value, Status> Visit(
const Type& type) {
OwnedRef args(PyTuple_New(0));
OwnedRef kwargs(PyDict_New());
RETURN_IF_PYERROR();
auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval,
PyObject** out) {
DCHECK(internal::BorrowPandasDataOffsetType() != nullptr);
// As of pandas 1.3.3, DateOffset objects do not apply their nanoseconds
// component when added to a pd.Timestamp
// (https://github.com/pandas-dev/pandas/issues/43892), so split the value
// into microseconds plus a nanosecond remainder to preserve the data while
// giving users the results they expect.
int64_t microseconds = interval.nanoseconds / 1000;
int64_t nanoseconds;
if (interval.nanoseconds >= 0) {
nanoseconds = interval.nanoseconds % 1000;
} else {
nanoseconds = -((-interval.nanoseconds) % 1000);
}

PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months));
PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days));
PyDict_SetItemString(kwargs.obj(), "microseconds",
PyLong_FromLongLong(microseconds));
PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds));
*out =
PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj());
RETURN_IF_PYERROR();
return Status::OK();
};
return ConvertAsPyObjects<MonthDayNanoIntervalType>(options, data, to_date_offset,
out_values);
}
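To make the split above concrete, a hedged Python rendering (not part of the diff) of the same conversion; pd.DateOffset and its keyword arguments are the real pandas API, while the helper name is made up for illustration:

import pandas as pd

def interval_to_date_offset(months, days, nanoseconds):
    # Truncate toward zero like the C++ integer division above, then keep the
    # sub-microsecond part as a separate nanoseconds component.
    microseconds = abs(nanoseconds) // 1000
    if nanoseconds < 0:
        microseconds = -microseconds
    remainder = nanoseconds - microseconds * 1000
    return pd.DateOffset(months=months, days=days,
                         microseconds=microseconds, nanoseconds=remainder)

print(pd.Timestamp("2021-01-01") + interval_to_date_offset(1, 15, 3_600_123_456))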

Status Visit(const Decimal128Type& type) {
OwnedRef decimal;
OwnedRef Decimal;
@@ -1171,7 +1191,8 @@ struct ObjectWriterVisitor {
std::is_same<DictionaryType, Type>::value ||
std::is_same<DurationType, Type>::value ||
std::is_same<ExtensionType, Type>::value ||
std::is_base_of<IntervalType, Type>::value ||
(std::is_base_of<IntervalType, Type>::value &&
!std::is_same<MonthDayNanoIntervalType, Type>::value) ||
std::is_base_of<UnionType, Type>::value,
Status>
Visit(const Type& type) {
@@ -1869,13 +1890,14 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
case Type::LARGE_STRING: // fall through
case Type::BINARY: // fall through
case Type::LARGE_BINARY:
case Type::NA: // fall through
case Type::FIXED_SIZE_BINARY: // fall through
case Type::STRUCT: // fall through
case Type::TIME32: // fall through
case Type::TIME64: // fall through
case Type::DECIMAL128: // fall through
case Type::DECIMAL256: // fall through
case Type::NA: // fall through
case Type::FIXED_SIZE_BINARY: // fall through
case Type::STRUCT: // fall through
case Type::TIME32: // fall through
case Type::TIME64: // fall through
case Type::DECIMAL128: // fall through
case Type::DECIMAL256: // fall through
case Type::INTERVAL_MONTH_DAY_NANO: // fall through
*output_type = PandasWriter::OBJECT;
break;
case Type::DATE32: // fall through
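Taken together, the arrow_to_pandas.cc changes mean month_day_nano_interval columns land in pandas as object columns of DateOffset values. A hedged sketch (not from the diff), assuming a pyarrow build with this change:

import pyarrow as pa

table = pa.table({"iv": pa.array([(1, 2, 3_000)], type=pa.month_day_nano_interval())})
df = table.to_pandas()

print(df["iv"].dtype)     # expected: object
print(type(df["iv"][0]))  # expected: a pandas DateOffset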
49 changes: 49 additions & 0 deletions cpp/src/arrow/python/arrow_to_python_internal.h
@@ -0,0 +1,49 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/array.h"
#include "arrow/python/platform.h"

namespace arrow {
namespace py {
namespace internal {
// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic
// to the .cc file and move this there as well if we can.

// Converts an array to a sequence of Python objects.
template <typename ArrayType, typename WriteValue, typename Assigner>
inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
Assigner out_values) {
// TODO(ARROW-12976): Use visitor here?
const bool has_nulls = arr.null_count() > 0;
for (int64_t i = 0; i < arr.length(); ++i) {
if (has_nulls && arr.IsNull(i)) {
Py_INCREF(Py_None);
*out_values = Py_None;
} else {
RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
}
++out_values;
}
return Status::OK();
}

} // namespace internal
} // namespace py
} // namespace arrow
111 changes: 111 additions & 0 deletions cpp/src/arrow/python/datetime.cc
@@ -20,9 +20,12 @@
#include <chrono>
#include <iomanip>

#include "arrow/array.h"
#include "arrow/python/arrow_to_python_internal.h"
#include "arrow/python/common.h"
#include "arrow/python/helpers.h"
#include "arrow/python/platform.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/logging.h"
@@ -71,6 +74,26 @@ bool MatchFixedOffset(const std::string& tz, util::string_view* sign,
return iter == (tz.data() + tz.size());
}

static PyTypeObject MonthDayNanoTupleType = {};

constexpr char* NonConst(const char* st) {
// Hack for Python versions < 3.7, where the members of PyStructSequence_Field
// and PyStructSequence_Desc were non-const (C++ doesn't like assigning string
// literals to these types).
return const_cast<char*>(st);
Comment on lines +80 to +82

Member: Do we still support Python versions < 3.7? I thought we stopped shipping binary wheels for these versions, but maybe we still support building from source.

Contributor Author: We do; I asked that we didn't drop it last release. I would also ask that we don't drop it this release, i.e. keep it through its full Python support cycle: a number of pyarrow consumers try to keep supporting a Python version until it reaches end of support upstream, which is the end of this year. This was caught because we run it in CI.
}

static PyStructSequence_Field MonthDayNanoField[] = {
{NonConst("months"), NonConst("The number of months in the interval")},
{NonConst("days"), NonConst("The number days in the interval")},
{NonConst("nanoseconds"), NonConst("The number of nanoseconds in the interval")},
{nullptr, nullptr}};

static PyStructSequence_Desc MonthDayNanoTupleDesc = {
NonConst("MonthDayNano"),
NonConst("A calendar interval consisting of months, days and nanoseconds."),
MonthDayNanoField,
/*n_in_sequence=*/3};

} // namespace

PyDateTime_CAPI* datetime_api = nullptr;
@@ -270,6 +293,16 @@ static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_
return Status::OK();
}

PyObject* NewMonthDayNanoTupleType() {
if (MonthDayNanoTupleType.tp_name == nullptr) {
if (PyStructSequence_InitType2(&MonthDayNanoTupleType, &MonthDayNanoTupleDesc) != 0) {
Py_FatalError("Could not initialize MonthDayNanoTuple");
}
}
Py_INCREF(&MonthDayNanoTupleType);
return (PyObject*)&MonthDayNanoTupleType;
}

Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
@@ -450,6 +483,84 @@ Result<std::string> TzinfoToString(PyObject* tzinfo) {
return PyTZInfo_utcoffset_hhmm(tzinfo);
}

PyObject* MonthDayNanoIntervalToNamedTuple(
const MonthDayNanoIntervalType::MonthDayNanos& interval) {
OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType));
if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) {
return nullptr;
}
PyStructSequence_SetItem(tuple.obj(), /*pos=*/0, PyLong_FromLong(interval.months));
PyStructSequence_SetItem(tuple.obj(), /*pos=*/1, PyLong_FromLong(interval.days));
PyStructSequence_SetItem(tuple.obj(), /*pos=*/2,
PyLong_FromLongLong(interval.nanoseconds));
return tuple.detach();
}
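For orientation, a hedged sketch (not part of the diff) of how this named tuple surfaces in Python; it assumes a pyarrow build that wires the scalar conversion below into Scalar.as_py():

import pyarrow as pa

s = pa.scalar((3, 20, 1_500), type=pa.month_day_nano_interval())
v = s.as_py()
print(v.months, v.days, v.nanoseconds)   # expected: 3 20 1500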

namespace {

// Wrapper around a Python list object that mimics dereference and assignment
// operations.
struct PyListAssigner {
public:
explicit PyListAssigner(PyObject* list) : list_(list) { DCHECK(PyList_Check(list_)); }

PyListAssigner& operator*() { return *this; }

void operator=(PyObject* obj) {
if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) {
Py_FatalError("list did not have the correct preallocated size.");
}
}

PyListAssigner& operator++() {
current_index_++;
return *this;
}

PyListAssigner& operator+=(int64_t offset) {
current_index_ += offset;
return *this;
}

private:
PyObject* list_;
int64_t current_index_ = 0;
};

} // namespace

Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
const MonthDayNanoIntervalArray& array) {
OwnedRef out_list(PyList_New(array.length()));
RETURN_IF_PYERROR();
PyListAssigner out_objects(out_list.obj());
auto& interval_array =
arrow::internal::checked_cast<const MonthDayNanoIntervalArray&>(array);
RETURN_NOT_OK(internal::WriteArrayObjects(
interval_array,
[&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) {
PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval);
if (ARROW_PREDICT_FALSE(tuple == nullptr)) {
RETURN_IF_PYERROR();
}

*out = tuple;
return Status::OK();
},
out_objects));
return out_list.detach();
}
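And the array-level counterpart, again a hedged sketch under the same assumptions:

import pyarrow as pa

arr = pa.array([(3, 20, 1_500), None], type=pa.month_day_nano_interval())
print(arr.to_pylist())   # expected: a MonthDayNano named tuple followed by None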

Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
const MonthDayNanoIntervalScalar& scalar) {
if (scalar.is_valid) {
return internal::MonthDayNanoIntervalToNamedTuple(scalar.value);
} else {
Py_INCREF(Py_None);
return Py_None;
}
}

} // namespace internal
} // namespace py
} // namespace arrow