From 108638bb925643d3d1fb0802d8e5f792cb6f205d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 3 Oct 2021 15:29:58 -0700 Subject: [PATCH 01/24] ARROW-13806: [C++][Python] Add support for new Interval Type - Refactored ObjectWriter helpers from arrow_to_pandas, so they can be used for plain python types as well (generalized the lowest level so it can work on both PyObject** and an adapter for PyList). - Add DateOffset to static pandas imports - Tried to start laying out code in a way to use C++ for Array.to_pylist (feel free to comment). Support importing from timedelta, relativedelta and DateOffset types (this is actually mostly duck typing; the one complication is that relativedelta has a property weeks that is automatically calculated, so some type checking is necessary). Open questions: - Should we be more strict on duck typing imports? I chose generality over performance here (rechecking non-present attributes, etc.). - Is the new arrow_to_python.h desirable (I think this can be easily extended for other types)? 
- My python is rusty and Python C-API even more so, please don't assume I know exactly what I'm doing :) --- cpp/src/arrow/python/CMakeLists.txt | 1 + cpp/src/arrow/python/api.h | 1 + cpp/src/arrow/python/arrow_to_pandas.cc | 113 ++++++--------------- cpp/src/arrow/python/datetime.cc | 38 +++++++ cpp/src/arrow/python/datetime.h | 15 +++ cpp/src/arrow/python/helpers.cc | 12 +++ cpp/src/arrow/python/helpers.h | 3 + cpp/src/arrow/python/inference.cc | 47 ++++++++- cpp/src/arrow/python/python_to_arrow.cc | 127 +++++++++++++++++++++++- python/pyarrow/__init__.py | 11 +- python/pyarrow/array.pxi | 25 +++++ python/pyarrow/includes/libarrow.pxd | 9 ++ python/pyarrow/lib.pxd | 4 + python/pyarrow/lib.pyx | 4 + python/pyarrow/scalar.pxi | 27 +++++ python/pyarrow/tests/test_array.py | 96 ++++++++++++++++++ python/pyarrow/tests/test_pandas.py | 13 +++ python/pyarrow/tests/test_scalars.py | 29 +++++- python/pyarrow/types.pxi | 9 ++ 19 files changed, 492 insertions(+), 92 deletions(-) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 40f351b56a5d2..c3b501a049ee0 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -28,6 +28,7 @@ add_dependencies(arrow_python-all arrow_python arrow_python-tests) set(ARROW_PYTHON_SRCS arrow_to_pandas.cc + arrow_to_python.cc benchmark.cc common.cc datetime.cc diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index a0b13d6d13013..b170470d2785a 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -18,6 +18,7 @@ #pragma once #include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/arrow_to_python.h" #include "arrow/python/common.h" #include "arrow/python/datetime.h" #include "arrow/python/deserialize.h" diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 9b9fc03a72478..4ec2e15f6a982 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ 
b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -48,6 +48,7 @@ #include "arrow/compute/api.h" +#include "arrow/python/arrow_to_python.h" #include "arrow/python/common.h" #include "arrow/python/datetime.h" #include "arrow/python/decimal.h" @@ -574,79 +575,6 @@ inline void ConvertIntegerNoNullsCast(const PandasOptions& options, } } -// Generic Array -> PyObject** converter that handles object deduplication, if -// requested -template -inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, - PyObject** out_values) { - const bool has_nulls = arr.null_count() > 0; - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); - } - ++out_values; - } - return Status::OK(); -} - -template -struct MemoizationTraits { - using Scalar = typename T::c_type; -}; - -template -struct MemoizationTraits> { - // For binary, we memoize string_view as a scalar value to avoid having to - // unnecessarily copy the memory into the memo table data structure - using Scalar = util::string_view; -}; - -template -inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, - WrapFunction&& wrap_func, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - using Scalar = typename MemoizationTraits::Scalar; - - // TODO(fsaintjacques): propagate memory pool. 
- ::arrow::internal::ScalarMemoTable memo_table(default_memory_pool()); - std::vector unique_values; - int32_t memo_size = 0; - - auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { - int32_t memo_index; - RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); - if (memo_index == memo_size) { - // New entry - RETURN_NOT_OK(wrap_func(value, out_values)); - unique_values.push_back(*out_values); - ++memo_size; - } else { - // Duplicate entry - Py_INCREF(unique_values[memo_index]); - *out_values = unique_values[memo_index]; - } - return Status::OK(); - }; - - auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { - return wrap_func(value, out_values); - }; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - if (options.deduplicate_objects) { - RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values)); - } else { - RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values)); - } - out_values += arr.length(); - } - return Status::OK(); -} - Status ConvertStruct(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { if (data.num_chunks() == 0) { @@ -970,6 +898,17 @@ class TypedPandasWriter : public PandasWriter { Status Allocate() override { return AllocateNDArray(NPY_TYPE); } }; +template +inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, + WrapFunction&& wrap_func, PyObject** out_values) { + ArrowToPythonObjectOptions to_object_options; + to_object_options.pool = options.pool; + to_object_options.deduplicate_objects = options.deduplicate_objects; + + return internal::ConvertAsPyObjects(to_object_options, data, wrap_func, + out_values); +} + struct ObjectWriterVisitor { const PandasOptions& options; const ChunkedArray& data; @@ -1097,6 +1036,16 @@ struct ObjectWriterVisitor { return Status::OK(); } + template + enable_if_t::value, Status> Visit( + const Type& type) { + ArrowToPython arrow_to_python; + 
ArrowToPythonObjectOptions to_py_options; + to_py_options.pool = options.pool; + to_py_options.deduplicate_objects = options.deduplicate_objects; + return arrow_to_python.ToNumpyObjectArray(to_py_options, data, out_values); + } + Status Visit(const Decimal128Type& type) { OwnedRef decimal; OwnedRef Decimal; @@ -1171,7 +1120,8 @@ struct ObjectWriterVisitor { std::is_same::value || std::is_same::value || std::is_same::value || - std::is_base_of::value || + (std::is_base_of::value && + !std::is_same::value) || std::is_base_of::value, Status> Visit(const Type& type) { @@ -1869,13 +1819,14 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::LARGE_STRING: // fall through case Type::BINARY: // fall through case Type::LARGE_BINARY: - case Type::NA: // fall through - case Type::FIXED_SIZE_BINARY: // fall through - case Type::STRUCT: // fall through - case Type::TIME32: // fall through - case Type::TIME64: // fall through - case Type::DECIMAL128: // fall through - case Type::DECIMAL256: // fall through + case Type::NA: // fall through + case Type::FIXED_SIZE_BINARY: // fall through + case Type::STRUCT: // fall through + case Type::TIME32: // fall through + case Type::TIME64: // fall through + case Type::DECIMAL128: // fall through + case Type::DECIMAL256: // fall through + case Type::INTERVAL_MONTH_DAY_NANO: // fall through *output_type = PandasWriter::OBJECT; break; case Type::DATE32: // fall through diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 4b18918cbcf74..e6a84a3cc4859 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -71,6 +71,19 @@ bool MatchFixedOffset(const std::string& tz, util::string_view* sign, return iter == (tz.data() + tz.size()); } +static PyTypeObject MonthDayNanoTupleType = {0, 0}; + +static PyStructSequence_Field MonthDayNanoField[] = { + {"months", "The number of months in the interval"}, + {"days", "The number days in the 
interval"}, + {"nanoseconds", "The number of nanoseconds in the interval"}, + {nullptr, nullptr}}; + +static PyStructSequence_Desc MonthDayNanoTupleDesc = { + "MonthDayNanoTuple", "A interval consistent of months, days and nanoseconds.", + MonthDayNanoField, + /*n_in_sequence=*/3}; + } // namespace PyDateTime_CAPI* datetime_api = nullptr; @@ -270,6 +283,18 @@ static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_ return Status::OK(); } +PyObject* NewMonthDayNanoTupleType() { + if (MonthDayNanoTupleType.tp_name == nullptr) { + if (PyStructSequence_InitType2(&MonthDayNanoTupleType, &MonthDayNanoTupleDesc) != 0) { + Py_FatalError("Could not initialize MonthDayNanoTuple"); + } + } + Py_INCREF(&MonthDayNanoTupleType); + return (PyObject*)&MonthDayNanoTupleType; +} + +PyTypeObject* BorrowMonthDayNanoTupleType() { return &MonthDayNanoTupleType; } + Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { int64_t hour = 0, minute = 0, second = 0, microsecond = 0; RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); @@ -450,6 +475,19 @@ Result TzinfoToString(PyObject* tzinfo) { return PyTZInfo_utcoffset_hhmm(tzinfo); } +Result MonthDayNanoIntervalToNamedTuple( + const MonthDayNanoIntervalType::MonthDayNanos& interval) { + OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType)); + if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) { + return nullptr; + } + PyStructSequence_SetItem(tuple.obj(), /*pos=*/0, PyLong_FromLong(interval.months)); + PyStructSequence_SetItem(tuple.obj(), /*pos=*/1, PyLong_FromLong(interval.days)); + PyStructSequence_SetItem(tuple.obj(), /*pos=*/2, + PyLong_FromLongLong(interval.nanoseconds)); + return tuple.detach(); +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 0072cdda4cb46..405676c39dfea 100644 --- a/cpp/src/arrow/python/datetime.h +++ 
b/cpp/src/arrow/python/datetime.h @@ -42,6 +42,10 @@ extern PyDateTime_CAPI* datetime_api; ARROW_PYTHON_EXPORT void InitDatetime(); +// Returns the MonthDayNano namedtuple type (increments the reference count). +ARROW_PYTHON_EXPORT +PyObject* NewMonthDayNanoTupleType(); + ARROW_PYTHON_EXPORT inline int64_t PyTime_to_us(PyObject* pytime) { return (PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL + @@ -178,6 +182,17 @@ Result StringToTzinfo(const std::string& tz); ARROW_PYTHON_EXPORT Result TzinfoToString(PyObject* pytzinfo); +/// Converts MonthDayNano to a python dictionary. +/// +/// Returns a named tuple (pyarrow.MonthDayNanoTuple) containin attributes +/// "months", "days", "nanoseconds" in the given order +/// with values extracted from the fields on interval. +/// +/// GIL must be held when calling this method. +ARROW_PYTHON_EXPORT +Result MonthDayNanoIntervalToNamedTuple( + const MonthDayNanoIntervalType::MonthDayNanos& interval); + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 75a77c640bbc4..c37bfde2bc329 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -63,6 +63,7 @@ std::shared_ptr GetPrimitiveType(Type::type type) { GET_PRIMITIVE_TYPE(STRING, utf8); GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary); GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8); + GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval); default: return nullptr; } @@ -273,6 +274,7 @@ static PyObject* pandas_NaT = nullptr; static PyObject* pandas_Timedelta = nullptr; static PyObject* pandas_Timestamp = nullptr; static PyTypeObject* pandas_NaTType = nullptr; +static PyObject* pandas_DateOffset = nullptr; } // namespace @@ -321,6 +323,14 @@ void InitPandasStaticData() { pandas_NA = ref.obj(); } + // Import DateOffset type + OwnedRef offsets; + if (internal::ImportModule("pandas.tseries.offsets", &offsets).ok()) { + if 
(internal::ImportFromModule(offsets.obj(), "DateOffset", &ref).ok()) { + pandas_DateOffset = ref.obj(); + } + } + pandas_static_initialized = true; } @@ -347,6 +357,8 @@ bool IsPandasTimestamp(PyObject* obj) { return pandas_Timestamp && PyObject_IsInstance(obj, pandas_Timestamp); } +PyObject* BorrowPandasDataOffsetType() { return pandas_DateOffset; } + Status InvalidValue(PyObject* obj, const std::string& why) { auto obj_as_str = PyObject_StdStringRepr(obj); return Status::Invalid("Could not convert ", std::move(obj_as_str), " with type ", diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 19288756c0ed6..a8e5f80b60678 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -87,6 +87,9 @@ bool IsPandasTimedelta(PyObject* obj); // \brief Check that obj is a pandas.Timestamp instance bool IsPandasTimestamp(PyObject* obj); +// \brief Returned a borrowed reference to the pandas.tseries.offsets.DateOffset +PyObject* BorrowPandasDataOffsetType(); + // \brief Check whether obj is a floating-point NaN ARROW_PYTHON_EXPORT bool PyFloat_IsNaN(PyObject* obj); diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 5dcabcd9c5ed7..35d1482819226 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -39,6 +39,43 @@ namespace arrow { namespace py { +namespace { +// Assigns a tuple to interval_types_tuple containing the nametuple for +// MonthDayNanoIntervalType and if present dateutil's relativedelta and +// pandas DateOffset. +Status ImportPresentIntervalTypes(OwnedRefNoGIL* interval_types_tuple) { + OwnedRef relative_delta_module; + // These are Optional imports so swallow errors. + OwnedRef relative_delta_type; + // Try to import pandas to get types. 
+ internal::InitPandasStaticData(); + if (internal::ImportModule("dateutil.relativedelta", &relative_delta_module).ok()) { + RETURN_NOT_OK(internal::ImportFromModule(relative_delta_module.obj(), "relativedelta", + &relative_delta_type)); + } + + PyObject* date_offset_type = internal::BorrowPandasDataOffsetType(); + interval_types_tuple->reset( + PyTuple_New(1 + (date_offset_type != nullptr ? 1 : 0) + + (relative_delta_type.obj() != nullptr ? 1 : 0))); + RETURN_IF_PYERROR(); + int index = 0; + PyTuple_SetItem(interval_types_tuple->obj(), index++, + internal::NewMonthDayNanoTupleType()); + RETURN_IF_PYERROR(); + if (date_offset_type != nullptr) { + Py_XINCREF(date_offset_type); + PyTuple_SetItem(interval_types_tuple->obj(), index++, date_offset_type); + RETURN_IF_PYERROR(); + } + if (relative_delta_type.obj() != nullptr) { + PyTuple_SetItem(interval_types_tuple->obj(), index++, relative_delta_type.detach()); + RETURN_IF_PYERROR(); + } + return Status::OK(); +} + +} // namespace #define _NUMPY_UNIFY_NOOP(DTYPE) \ case NPY_##DTYPE: \ @@ -304,10 +341,12 @@ class TypeInferrer { list_count_(0), struct_count_(0), numpy_dtype_count_(0), + interval_count_(0), max_decimal_metadata_(std::numeric_limits::min(), std::numeric_limits::min()), decimal_type_() { ARROW_CHECK_OK(internal::ImportDecimalType(&decimal_type_)); + ARROW_CHECK_OK(ImportPresentIntervalTypes(&interval_types_)); } /// \param[in] obj a Python object in the sequence @@ -365,6 +404,8 @@ class TypeInferrer { } else if (PyObject_IsInstance(obj, decimal_type_.obj())) { RETURN_NOT_OK(max_decimal_metadata_.Update(obj)); ++decimal_count_; + } else if (PyObject_IsInstance(obj, interval_types_.obj())) { + ++interval_count_; } else { return internal::InvalidValue(obj, "did not recognize Python value type when inferring " @@ -489,6 +530,8 @@ class TypeInferrer { *out = binary(); } else if (unicode_count_) { *out = utf8(); + } else if (interval_count_) { + *out = month_day_nano_interval(); } else { *out = null(); } @@ 
-631,6 +674,7 @@ class TypeInferrer { int64_t list_count_; int64_t struct_count_; int64_t numpy_dtype_count_; + int64_t interval_count_; std::unique_ptr list_inferrer_; std::map struct_inferrers_; @@ -640,9 +684,8 @@ class TypeInferrer { internal::DecimalMetadata max_decimal_metadata_; - // Place to accumulate errors - // std::vector errors_; OwnedRefNoGIL decimal_type_; + OwnedRefNoGIL interval_types_; }; // Non-exhaustive type inference diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 3e9369d73ea47..c5450e79e6b41 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -33,6 +33,7 @@ #include "arrow/array/builder_dict.h" #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" +#include "arrow/array/builder_time.h" #include "arrow/chunked_array.h" #include "arrow/status.h" #include "arrow/type.h" @@ -68,6 +69,95 @@ using internal::MakeConverter; namespace py { +enum class MonthDayNanoField { kMonths, kWeeksAndDays, kDaysOnly, kNanoseconds }; + +template +struct MonthDayNanoTraits; + +struct MonthDayNanoAttrData { + const char* name; + const int64_t multiplier; +}; + +template <> +struct MonthDayNanoTraits { + using c_type = int32_t; + static constexpr char name[] = "months"; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = { + {"years", 1}, {"months", /*months_in_year=*/12}, {nullptr, 0}}; + +template <> +struct MonthDayNanoTraits { + using c_type = int32_t; + static constexpr char name[] = "days"; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = + {{"weeks", 1}, {"days", /*days_in_week=*/7}, {nullptr, 0}}; + +template <> +struct MonthDayNanoTraits { + using c_type = int32_t; + static constexpr char name[] = "days"; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = { 
+ {"days", 1}, {nullptr, 0}}; + +template <> +struct MonthDayNanoTraits { + using c_type = int64_t; + static constexpr char name[] = "nanoseconds"; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = + {{"hours", 1}, + {"minutes", /*minutes_in_hours=*/60}, + {"seconds", /*seconds_in_minute=*/60}, + {"milliseconds", /*milliseconds_in_seconds*/ 1000}, + {"microseconds", /*microseconds_in_millseconds=*/1000}, + {"nanoseconds", /*nanoseconds_in_microseconds=*/1000}, + {nullptr, 0}}; + +template +struct PopulateMonthDayNano { + using Traits = MonthDayNanoTraits; + inline static Status Field(PyObject* obj, + typename MonthDayNanoTraits::c_type* out, + bool* found_attrs) { + *out = 0; + for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0; + ++attr) { + if (attr->multiplier != 1 && + ::arrow::internal::MultiplyWithOverflow( + static_cast(attr->multiplier), *out, out)) { + return Status::Invalid("Overflow on: ", (attr - 1)->name); + } + if (PyObject_HasAttrString(obj, attr->name)) { + OwnedRef field_value(PyObject_GetAttrString(obj, attr->name)); + RETURN_IF_PYERROR(); + *found_attrs = true; + if (field_value.obj() == Py_None) { + continue; + } + typename Traits::c_type value; + RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name)); + if (::arrow::internal::AddWithOverflow(*out, value, out)) { + return Status::Invalid("Overflow on: ", attr->name); + } + } + } + + return Status::OK(); + } +}; + // Utility for converting single python objects to their intermediate C representations // which can be fed to the typed builders class PyValue { @@ -304,6 +394,34 @@ class PyValue { return value; } + static Result Convert( + const MonthDayNanoIntervalType* /*type*/, const O& /*options*/, I obj) { + MonthDayNanoIntervalType::MonthDayNanos output; + bool found_attrs = false; + RETURN_NOT_OK(PopulateMonthDayNano::Field( + obj, &output.months, &found_attrs)); + // on 
relativeoffset weeks is a property calculated from days. On + // DateOffset is is a field on its own. timedelta doesn't have a weeks + // attribute. + PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType(); + bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj); + if (!is_date_offset) { + RETURN_NOT_OK(PopulateMonthDayNano::Field( + obj, &output.days, &found_attrs)); + } else { + RETURN_NOT_OK(PopulateMonthDayNano::Field( + obj, &output.days, &found_attrs)); + } + RETURN_NOT_OK(PopulateMonthDayNano::Field( + obj, &output.nanoseconds, &found_attrs)); + + if (ARROW_PREDICT_FALSE(!found_attrs) && !is_date_offset) { + // date_offset can have zero fields. + return Status::TypeError("No temporal attributes found on object."); + } + return output; + } + static Result Convert(const DurationType* type, const O&, I obj) { int64_t value; if (PyDelta_Check(obj)) { @@ -438,8 +556,9 @@ struct PyConverterTrait; template struct PyConverterTrait< - T, enable_if_t::value && !is_interval_type::value && - !is_extension_type::value>> { + T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; @@ -478,7 +597,9 @@ template class PyPrimitiveConverter< T, enable_if_t::value || is_number_type::value || is_decimal_type::value || is_date_type::value || - is_time_type::value>> : public PrimitiveConverter { + is_time_type::value || + std::is_same::value>> + : public PrimitiveConverter { public: Status Append(PyObject* value) override { // Since the required space has been already allocated in the Extend functions we can diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 684c97315bb33..6a705cff47f49 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -64,10 +64,10 @@ def parse_git(root, **kwargs): if _gc_enabled: _gc.enable() -from pyarrow.lib import (BuildInfo, RuntimeInfo, VersionInfo, - 
cpp_build_info, cpp_version, cpp_version_info, - runtime_info, cpu_count, set_cpu_count, - enable_signal_handlers, +from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNanoTuple, + VersionInfo, cpp_build_info, cpp_version, + cpp_version_info, runtime_info, cpu_count, + set_cpu_count, enable_signal_handlers, io_thread_count, set_io_thread_count) @@ -94,6 +94,7 @@ def show_versions(): int8, int16, int32, int64, uint8, uint16, uint32, uint64, time32, time64, timestamp, date32, date64, duration, + month_day_nano_interval, float16, float32, float64, binary, string, utf8, large_binary, large_string, large_utf8, @@ -137,6 +138,7 @@ def show_versions(): DictionaryArray, Date32Array, Date64Array, TimestampArray, Time32Array, Time64Array, DurationArray, + MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, @@ -148,6 +150,7 @@ def show_versions(): Date32Scalar, Date64Scalar, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, + MonthDayNanoIntervalScalar, BinaryScalar, LargeBinaryScalar, StringScalar, LargeStringScalar, FixedSizeBinaryScalar, DictionaryScalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index c9a4f3efb5e6f..5fc35da448cc3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1508,6 +1508,30 @@ cdef class DurationArray(NumericArray): Concrete class for Arrow arrays of duration data type. """ +cdef class MonthDayNanoIntervalArray(Array): + """ + Concrete class for Arrow arrays of interval[MonthDayNano] type. + """ + + def to_pylist(self): + """ + Convert to a list of native Python objects. + + is installed the objects will be + pd.tseries.offsets.DateOffset objects. Otherwise they are + pyarrow.MonthDayNanoTuple objects. 
+ + Returns + ------- + lst : list + """ + cdef: + CResult[PyObject*] maybe_py_list + PyObject* py_list + maybe_py_list = ARROW_TO_PYTHON.ToPyList(deref(self.sp_array)) + py_list = GetResultValue(maybe_py_list) + return PyObject_to_object(py_list) + cdef class HalfFloatArray(FloatingPointArray): """ Concrete class for Arrow arrays of float16 data type. @@ -2389,6 +2413,7 @@ cdef dict _array_classes = { _Type_TIME32: Time32Array, _Type_TIME64: Time64Array, _Type_DURATION: DurationArray, + _Type_INTERVAL_MONTH_DAY_NANO: MonthDayNanoIntervalArray, _Type_HALF_FLOAT: HalfFloatArray, _Type_FLOAT: FloatArray, _Type_DOUBLE: DoubleArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 3ab62581d8ff1..c7af6a4095163 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -107,6 +107,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_TIME32" arrow::Type::TIME32" _Type_TIME64" arrow::Type::TIME64" _Type_DURATION" arrow::Type::DURATION" + _Type_INTERVAL_MONTH_DAY_NANO" arrow::Type::INTERVAL_MONTH_DAY_NANO" _Type_BINARY" arrow::Type::BINARY" _Type_STRING" arrow::Type::STRING" @@ -2213,6 +2214,14 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py": CResult[shared_ptr[CDataType]] InferArrowType( object obj, object mask, c_bool pandas_null_sentinels) + cdef cppclass ArrowToPython: + CResult[PyObject*] ToPyList(const CArray& array) + CResult[PyObject*] ToPrimitive(const CScalar& scalar) + CResult[PyObject*] ToLogical(const CScalar& scalar) + +cdef extern from "arrow/python/api.h" namespace "arrow::py::internal": + cdef object NewMonthDayNanoTupleType() + cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CDataType] GetPrimitiveType(Type type) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 592533e70156d..e3b07f40496bf 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -399,6 +399,10 @@ cdef class 
ExtensionArray(Array): pass +cdef class MonthDayNanoIntervalArray(Array): + pass + + cdef wrap_array_output(PyObject* output) cdef wrap_datum(const CDatum& datum) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 0d86df601369a..9bfda108eab98 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -37,6 +37,10 @@ arrow_init_numpy() # (used from some of our C++ code, see e.g. ARROW-5260) import_pyarrow() +cdef libarrow.ArrowToPython ARROW_TO_PYTHON + +MonthDayNanoTuple = NewMonthDayNanoTupleType() + def cpu_count(): """ diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index a306165585157..a5724db843180 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -514,6 +514,32 @@ cdef class DurationScalar(Scalar): return datetime.timedelta(microseconds=sp.value // 1000) +cdef class MonthDayNanoIntervalScalar(Scalar): + """ + Concrete class for month, day, nanosecond scalars. + """ + + @property + def value(self): + """ + Returns this value pyarrow.MonthDayNanoTuple. + """ + cdef PyObject* val + val = GetResultValue(ARROW_TO_PYTHON.ToPrimitive( + (deref(self.wrapped.get())))) + return PyObject_to_object(val) + + def as_py(self): + """ + Return this value as a Pandas DateOffset instance if Pandas is present + otherwise as a named tuple containing months days and nanoseconds. + """ + cdef PyObject* val + val = GetResultValue(ARROW_TO_PYTHON.ToLogical( + (deref(self.wrapped.get())))) + return PyObject_to_object(val) + + cdef class BinaryScalar(Scalar): """ Concrete class for binary-like scalars. 
@@ -940,6 +966,7 @@ cdef dict _scalar_classes = { _Type_DICTIONARY: DictionaryScalar, _Type_SPARSE_UNION: UnionScalar, _Type_DENSE_UNION: UnionScalar, + _Type_INTERVAL_MONTH_DAY_NANO: MonthDayNanoIntervalScalar, _Type_EXTENSION: ExtensionScalar, } diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 97a0eac651a4d..3003093bfb7fc 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1486,6 +1486,7 @@ def test_cast_from_null(): pa.timestamp('us', tz='UTC'), pa.timestamp('us', tz='Europe/Paris'), pa.duration('us'), + pa.month_day_nano_interval(), pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.list_(pa.int8())), pa.field('c', pa.string())]), @@ -2162,6 +2163,101 @@ def test_array_from_numpy_ascii(): assert arrow_arr.equals(expected) +@pytest.mark.nopandas +def test_interval_array_from_timedelta(): + data = [ + None, + datetime.timedelta(days=1, seconds=1, microseconds=1, + milliseconds=1, minutes=1, hours=1, weeks=1)] + + # From timedelta (explicit type required) + arr = pa.array(data, pa.month_day_nano_interval()) + assert isinstance(arr, pa.MonthDayNanoIntervalArray) + assert arr.type == pa.month_day_nano_interval() + expected_list = [ + None, + pa.MonthDayNanoTuple([0, 8, + (datetime.timedelta(seconds=1, microseconds=1, + milliseconds=1, minutes=1, + hours=1) // + datetime.timedelta(microseconds=1)) * 1000])] + expected = pa.array(expected_list) + assert arr.equals(expected) + assert arr.to_pylist() == expected_list + +# dateutil is dependency of pandas + + +@pytest.mark.pandas +def test_interval_array_from_relativedelta(): + from dateutil.relativedelta import relativedelta + from pandas.tseries.offsets import DateOffset + data = [ + None, + relativedelta(years=1, months=1, + days=1, seconds=1, microseconds=1, + minutes=1, hours=1, weeks=1, leapdays=1)] + # Note leapdays are ignored. 
+ + # From relativedelta + arr = pa.array(data) + assert isinstance(arr, pa.MonthDayNanoIntervalArray) + assert arr.type == pa.month_day_nano_interval() + expected_list = [ + None, + pa.MonthDayNanoTuple([13, 8, + (datetime.timedelta(seconds=1, microseconds=1, + minutes=1, hours=1) // + datetime.timedelta(microseconds=1)) * 1000])] + expected = pa.array(expected_list) + assert arr.equals(expected) + assert arr.to_pylist() == [ + None, DateOffset(months=13, days=8, + microseconds=( + datetime.timedelta(seconds=1, microseconds=1, + minutes=1, hours=1) // + datetime.timedelta(microseconds=1)), + nanoseconds=0)] + with pytest.raises(ValueError): + pa.array([DateOffset(years=((1 << 32) // 12), months=100)]) + with pytest.raises(ValueError): + pa.array([DateOffset(weeks=((1 << 32) // 7), days=100)]) + with pytest.raises(ValueError): + pa.array([DateOffset(seconds=((1 << 64) // 1000000000), + nanoseconds=1)]) + with pytest.raises(ValueError): + pa.array([DateOffset(microseconds=((1 << 64) // 100))]) + + +@pytest.mark.pandas +def test_interval_array_from_dateoffset(): + from pandas.tseries.offsets import DateOffset + data = [ + None, + DateOffset(years=1, months=1, + days=1, seconds=1, microseconds=1, + minutes=1, hours=1, weeks=1, nanoseconds=1), + DateOffset()] + + arr = pa.array(data) + assert isinstance(arr, pa.MonthDayNanoIntervalArray) + assert arr.type == pa.month_day_nano_interval() + expected_list = [ + None, + pa.MonthDayNanoTuple([13, 8, 3661000001001]), + pa.MonthDayNanoTuple([0, 0, 0])] + expected = pa.array(expected_list) + assert arr.equals(expected) + assert arr.to_pylist() == [ + None, DateOffset(months=13, days=8, + microseconds=( + datetime.timedelta(seconds=1, microseconds=1, + minutes=1, hours=1) // + datetime.timedelta(microseconds=1)), + nanoseconds=1), + DateOffset(months=0, days=0, microseconds=0, nanoseconds=0)] + + def test_array_from_numpy_unicode(): dtypes = ['U5'] diff --git a/python/pyarrow/tests/test_pandas.py 
b/python/pyarrow/tests/test_pandas.py index f25f161dd735e..8b1f42130cc6b 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1495,6 +1495,19 @@ def test_timedeltas_nulls(self): expected_schema=schema, ) + def test_month_day_nano_interval(self): + from pandas.tseries.offsets import DateOffset + df = pd.DataFrame({ + 'date_offset': [None, + DateOffset(days=3600, months=3600, microseconds=3, + nanoseconds=600)] + }) + field = pa.field('date_offset', pa.month_day_nano_interval()) + schema = pa.schema([field]) + _check_pandas_roundtrip( + df, + expected_schema=schema) + # ---------------------------------------------------------------------- # Conversion tests for string and binary types. diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 86dfe949cb497..c345d14724099 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -64,6 +64,8 @@ pa.Time32Scalar, pa.Time32Value), (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value), (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue), + (pa.MonthDayNanoTuple([1, -1, -10100]), None, + pa.MonthDayNanoIntervalScalar, None), ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar, pa.MapValue), @@ -78,8 +80,9 @@ def test_basics(value, ty, klass, deprecated): assert hash(s) == hash(s) assert s.is_valid is True assert s != None # noqa: E711 - with pytest.warns(FutureWarning): - assert isinstance(s, deprecated) + if deprecated is not None: + with pytest.warns(FutureWarning): + assert isinstance(s, deprecated) s = pa.scalar(None, type=s.type) assert s.is_valid is False @@ -364,6 +367,28 @@ def test_duration_nanos_nopandas(): arr[0].as_py() +@pytest.mark.pandas +def test_month_day_nano_interval_pandas(): + from pandas.tseries.offsets import DateOffset + + triple = pa.MonthDayNanoTuple([3600, 3600, 
3600]) + arr = pa.array([triple]) + expected = DateOffset(days=3600, months=3600, microseconds=3, + nanoseconds=600) + assert isinstance(arr[0].as_py(), DateOffset) + assert arr[0].as_py() == expected + assert arr[0].value == triple + + +@pytest.mark.nopandas +def test_month_day_nano_interval_nopandas(): + triple = pa.MonthDayNanoTuple([3600, 3600, 3600]) + arr = pa.array([triple]) + assert isinstance(arr[0].as_py(), pa.MonthDayNanoTuple) + assert arr[0].as_py() == triple + assert arr[0].value == triple + + @pytest.mark.parametrize('value', ['foo', 'maƱana']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.string(), pa.StringScalar), diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index b4e0b659df528..03ef08e68f329 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2176,6 +2176,14 @@ def duration(unit): return out +def month_day_nano_interval(): + """ + Create instance of a interval representing the time between two calendar + instances represented as a triple of months, days and nanoseconds. + """ + return primitive_type(_Type_INTERVAL_MONTH_DAY_NANO) + + def date32(): """ Create instance of 32-bit date (days since UNIX epoch 1970-01-01). 
@@ -2724,6 +2732,7 @@ cdef dict _type_aliases = { 'duration[ms]': duration('ms'), 'duration[us]': duration('us'), 'duration[ns]': duration('ns'), + 'month_day_nano_interval': month_day_nano_interval(), } From f3f0273203f2a5dbc4080743ecb16f51f4bced0c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 4 Oct 2021 09:35:21 -0700 Subject: [PATCH 02/24] add missing files --- cpp/src/arrow/python/arrow_to_python.cc | 201 ++++++++++++++++++++++++ cpp/src/arrow/python/arrow_to_python.h | 161 +++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100644 cpp/src/arrow/python/arrow_to_python.cc create mode 100644 cpp/src/arrow/python/arrow_to_python.h diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc new file mode 100644 index 0000000000000..50a00a5542429 --- /dev/null +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -0,0 +1,201 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures + +#include "arrow/python/arrow_to_python.h" + +#include "arrow/python/datetime.h" +#include "arrow/python/helpers.h" +#include "arrow/result_internal.h" +#include "arrow/scalar.h" + +namespace arrow { +namespace py { +namespace { + +Status CheckInterval(const DataType& datatype) { + if (datatype.id() != Type::INTERVAL_MONTH_DAY_NANO) { + return Status::NotImplemented( + "Only MonthDayIntervalNanoIntervalType supported. Provided.", + datatype.ToString()); + } + return Status::OK(); +} + +// Wrapper around a Python list object that mimics dereference and assignment +// operations. +struct PyListAssigner { + public: + explicit PyListAssigner(PyObject* list) : list_(list) { DCHECK(PyList_Check(list_)); } + + PyListAssigner& operator*() { return *this; } + + void operator=(PyObject* obj) { + if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) { + Py_FatalError("list did not have the correct preallocated size."); + } + } + + PyListAssigner& operator++() { + current_index_++; + return *this; + } + + PyListAssigner& operator+=(int64_t offset) { + current_index_ += offset; + return *this; + } + + private: + PyObject* list_; + int64_t current_index_ = 0; +}; + +// Args and Kwargs are passed in to avoid reallocation for batch conversion. +template +Status ConvertToDateOffset(const MonthDayNanoIntervalType::MonthDayNanos& interval, + PyObject* date_offset_constructor, PyObject* args, + PyObject* kwargs, OutputType out) { + DCHECK(internal::BorrowPandasDataOffsetType()); + RETURN_IF_PYERROR(); + // TimeDelta objects do not add nanoseconds component to timestamp. + // so convert microseconds and remainder to preserve data + // but give users more expected results. 
+ int64_t microseconds = interval.nanoseconds / 1000; + int64_t nanoseconds; + if (interval.nanoseconds >= 0) { + nanoseconds = interval.nanoseconds % 1000; + } else { + nanoseconds = -((-interval.nanoseconds) % 1000); + } + + PyDict_SetItemString(kwargs, "months", PyLong_FromLong(interval.months)); + PyDict_SetItemString(kwargs, "days", PyLong_FromLong(interval.days)); + PyDict_SetItemString(kwargs, "microseconds", PyLong_FromLongLong(microseconds)); + PyDict_SetItemString(kwargs, "nanoseconds", PyLong_FromLongLong(nanoseconds)); + *out = PyObject_Call(internal::BorrowPandasDataOffsetType(), args, kwargs); + RETURN_IF_PYERROR(); + return Status::OK(); +} + +} // namespace + +Result ArrowToPython::ToPyList(const Array& array) { + RETURN_NOT_OK(Init()); + RETURN_NOT_OK(CheckInterval(*array.type())); + OwnedRef out_list(PyList_New(array.length())); + RETURN_IF_PYERROR(); + PyListAssigner out_objects(out_list.obj()); + auto& interval_array = + arrow::internal::checked_cast(array); + PyObject* date_offset_type = internal::BorrowPandasDataOffsetType(); + if (date_offset_type != nullptr) { + OwnedRef args(PyTuple_New(0)); + OwnedRef kwargs(PyDict_New()); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(internal::WriteArrayObjects( + interval_array, + [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, + PyListAssigner& out) { + return ConvertToDateOffset(interval, date_offset_type, args.obj(), kwargs.obj(), + out); + }, + out_objects)); + } else { + RETURN_NOT_OK(internal::WriteArrayObjects( + interval_array, + [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, + PyListAssigner& out) { + ASSIGN_OR_RAISE(PyObject * tuple, + internal::MonthDayNanoIntervalToNamedTuple(interval)); + if (ARROW_PREDICT_FALSE(tuple == nullptr)) { + RETURN_IF_PYERROR(); + } + + *out = tuple; + return Status::OK(); + }, + out_objects)); + } + return out_list.detach(); +} + +Status ArrowToPython::ToNumpyObjectArray(const ArrowToPythonObjectOptions& options, + const ChunkedArray& array, + 
PyObject** out_objects) { + RETURN_NOT_OK(Init()); + RETURN_NOT_OK(CheckInterval(*array.type())); + OwnedRef args(PyTuple_New(0)); + OwnedRef kwargs(PyDict_New()); + RETURN_IF_PYERROR(); + return internal::ConvertAsPyObjects( + options, array, + [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyObject** out) { + return ConvertToDateOffset(interval, internal::BorrowPandasDataOffsetType(), + args.obj(), kwargs.obj(), out); + }, + out_objects); +} + +Result ArrowToPython::ToLogical(const Scalar& scalar) { + // Pandas's DateOffset is the best type in the python ecosystem to + // for MonthDayNano interval type so use that if it is available. + // Otherwise use the primitive type. (this logical is similar to how + // we handle timestamps today, since datetime.datetime doesn't support nanos). + // In this case timedelta doesn't support months, years or nanos. + RETURN_NOT_OK(Init()); + if (internal::BorrowPandasDataOffsetType() != nullptr) { + RETURN_NOT_OK(CheckInterval(*scalar.type)); + if (!scalar.is_valid) { + Py_INCREF(Py_None); + return Py_None; + } + OwnedRef args(PyTuple_New(0)); + OwnedRef kwargs(PyDict_New()); + RETURN_IF_PYERROR(); + PyObject* out; + RETURN_NOT_OK(ConvertToDateOffset( + arrow::internal::checked_cast(scalar).value, + internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj(), &out)); + return out; + } else { + return ToPrimitive(scalar); + } +} + +Result ArrowToPython::ToPrimitive(const Scalar& scalar) { + RETURN_NOT_OK(Init()); + RETURN_NOT_OK(CheckInterval(*scalar.type)); + if (scalar.is_valid) { + return internal::MonthDayNanoIntervalToNamedTuple( + arrow::internal::checked_cast(scalar).value); + } else { + Py_INCREF(Py_None); + return Py_None; + } +} + +Status ArrowToPython::Init() { + // relies on GIL for interpretation. 
+ internal::InitPandasStaticData(); + return Status::OK(); +} + +} // namespace py +} // namespace arrow diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h new file mode 100644 index 0000000000000..8a752d53fc752 --- /dev/null +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures + +#pragma once + +#include "arrow/chunked_array.h" +#include "arrow/python/common.h" +#include "arrow/python/platform.h" +#include "arrow/util/hashing.h" + +namespace arrow { + +class Array; +class Scalar; + +namespace py { + +struct ArrowToPythonObjectOptions { + MemoryPool* pool = default_memory_pool(); + bool deduplicate_objects = false; +}; + +class ARROW_PYTHON_EXPORT ArrowToPython { + public: + // \brief Converts the given Array to a PyList object. Returns NULL if there + // is an error converting the Array. The list elements are the same ones + // generated via ToLogical() + // + // N.B. This has limited type support. ARROW-12976 tracks extending the implementation. 
+ Result ToPyList(const Array& array); + + // Populates out_objects with the result of converting the array values + // to python objects. The same logic as ToLogical(). + // + // N.B. Not all types are supported. ARROW-12976 tracks extending the implementation. + Status ToNumpyObjectArray(const ArrowToPythonObjectOptions& options, + const ChunkedArray& array, PyObject** out_objects); + + // \brief Converts the given Scalar to a python object that best corresponds + // with the Scalar's type. + // + // For example timestamp[ms] is translated into datetime.datetime. + // + // N.B. This has limited type support. ARROW-12976 tracks extending the implementation. + Result ToLogical(const Scalar& scalar); + + // \brief Converts the given Scalar the type that is closed to its arrow + // representation. + // + // For instance timestamp would be translated to a integer representing an + // offset from the unix epoch. + // + // Returns nullptr on error. + // + // GIL must be health when calling this method. + // N.B. This has limited type support. ARROW-12976 tracks full implementation. + Result ToPrimitive(const Scalar& scalar); + + private: + Status Init(); +}; + +namespace internal { +// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic +// to the .cc file and move this there as well if we can. + +// Generic Array -> PyObject** converter that handles object deduplication, if +// requested +template +inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, + Assigner out_values) { + // TODO(ARROW-12976): Use visitor here? 
+ const bool has_nulls = arr.null_count() > 0; + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); + } + ++out_values; + } + return Status::OK(); +} + +template +struct MemoizationTraits { + using Scalar = typename T::c_type; +}; + +template +struct MemoizationTraits> { + // For binary, we memoize string_view as a scalar value to avoid having to + // unnecessarily copy the memory into the memo table data structure + using Scalar = util::string_view; +}; + +template +inline Status ConvertAsPyObjects(const ArrowToPythonObjectOptions& options, + const ChunkedArray& data, WrapFunction&& wrap_func, + PyObject** out_values) { + using ArrayType = typename TypeTraits::ArrayType; + using Scalar = typename MemoizationTraits::Scalar; + + ::arrow::internal::ScalarMemoTable memo_table(options.pool); + std::vector unique_values; + int32_t memo_size = 0; + + auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { + int32_t memo_index; + RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); + if (memo_index == memo_size) { + // New entry + RETURN_NOT_OK(wrap_func(value, out_values)); + unique_values.push_back(*out_values); + ++memo_size; + } else { + // Duplicate entry + Py_INCREF(unique_values[memo_index]); + *out_values = unique_values[memo_index]; + } + return Status::OK(); + }; + + auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { + return wrap_func(value, out_values); + }; + + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = arrow::internal::checked_cast(*data.chunk(c)); + if (options.deduplicate_objects) { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values)); + } else { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values)); + } + out_values += arr.length(); + } + return Status::OK(); +} + +} // namespace internal + +} // namespace py +} // 
namespace arrow From 47aa6f7b142e8a77335896f79aea0a024c32e291 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 4 Oct 2021 20:58:40 -0700 Subject: [PATCH 03/24] Update python/pyarrow/scalar.pxi Co-authored-by: Weston Pace --- python/pyarrow/scalar.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index a5724db843180..716a72869c95e 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -522,7 +522,7 @@ cdef class MonthDayNanoIntervalScalar(Scalar): @property def value(self): """ - Returns this value pyarrow.MonthDayNanoTuple. + Returns this value as a pyarrow.MonthDayNanoTuple. """ cdef PyObject* val val = GetResultValue(ARROW_TO_PYTHON.ToPrimitive( From faae9714e6690b9632cca1a16ac1cfe1512f20ba Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 4 Oct 2021 21:50:48 -0700 Subject: [PATCH 04/24] wip --- cpp/src/arrow/python/arrow_to_pandas.cc | 106 +++++++++++--- cpp/src/arrow/python/arrow_to_python.cc | 97 +------------ cpp/src/arrow/python/arrow_to_python.h | 135 +++--------------- .../arrow/python/arrow_to_python_internal.h | 52 +++++++ cpp/src/arrow/python/datetime.cc | 4 +- cpp/src/arrow/python/datetime.h | 4 +- python/pyarrow/__init__.py | 2 +- python/pyarrow/includes/libarrow.pxd | 1 - python/pyarrow/lib.pyx | 3 +- 9 files changed, 165 insertions(+), 239 deletions(-) create mode 100644 cpp/src/arrow/python/arrow_to_python_internal.h diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 4ec2e15f6a982..cec3d0b819460 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -48,7 +48,7 @@ #include "arrow/compute/api.h" -#include "arrow/python/arrow_to_python.h" +#include "arrow/python/arrow_to_python_internal.h" #include "arrow/python/common.h" #include "arrow/python/datetime.h" #include "arrow/python/decimal.h" @@ -575,6 +575,65 @@ inline void 
ConvertIntegerNoNullsCast(const PandasOptions& options, } } +template +struct MemoizationTraits { + using Scalar = typename T::c_type; +}; + +template +struct MemoizationTraits> { + // For binary, we memoize string_view as a scalar value to avoid having to + // unnecessarily copy the memory into the memo table data structure + using Scalar = util::string_view; +}; + +// Generic Array -> PyObject** converter that handles object deduplication, if +// requested +template +inline Status ConvertAsPyObjects(const PandasOptions& options, + const ChunkedArray& data, WrapFunction&& wrap_func, + PyObject** out_values) { + using ArrayType = typename TypeTraits::ArrayType; + using Scalar = typename MemoizationTraits::Scalar; + + ::arrow::internal::ScalarMemoTable memo_table(options.pool); + std::vector unique_values; + int32_t memo_size = 0; + + auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { + int32_t memo_index; + RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); + if (memo_index == memo_size) { + // New entry + RETURN_NOT_OK(wrap_func(value, out_values)); + unique_values.push_back(*out_values); + ++memo_size; + } else { + // Duplicate entry + Py_INCREF(unique_values[memo_index]); + *out_values = unique_values[memo_index]; + } + return Status::OK(); + }; + + auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { + return wrap_func(value, out_values); + }; + + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = arrow::internal::checked_cast(*data.chunk(c)); + if (options.deduplicate_objects) { + RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values)); + } else { + RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values)); + } + out_values += arr.length(); + } + return Status::OK(); +} + + + Status ConvertStruct(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { if (data.num_chunks() == 0) { @@ -898,17 +957,6 @@ class TypedPandasWriter : public PandasWriter 
{ Status Allocate() override { return AllocateNDArray(NPY_TYPE); } }; -template -inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, - WrapFunction&& wrap_func, PyObject** out_values) { - ArrowToPythonObjectOptions to_object_options; - to_object_options.pool = options.pool; - to_object_options.deduplicate_objects = options.deduplicate_objects; - - return internal::ConvertAsPyObjects(to_object_options, data, wrap_func, - out_values); -} - struct ObjectWriterVisitor { const PandasOptions& options; const ChunkedArray& data; @@ -1039,11 +1087,35 @@ struct ObjectWriterVisitor { template enable_if_t::value, Status> Visit( const Type& type) { - ArrowToPython arrow_to_python; - ArrowToPythonObjectOptions to_py_options; - to_py_options.pool = options.pool; - to_py_options.deduplicate_objects = options.deduplicate_objects; - return arrow_to_python.ToNumpyObjectArray(to_py_options, data, out_values); + OwnedRef args(PyTuple_New(0)); + OwnedRef kwargs(PyDict_New()); + RETURN_IF_PYERROR(); + auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, + PyObject** out) { + DCHECK(internal::BorrowPandasDataOffsetType() != nullptr); + // TimeDelta objects do not add nanoseconds component to timestamp. + // so convert microseconds and remainder to preserve data + // but give users more expected results. 
+ int64_t microseconds = interval.nanoseconds / 1000; + int64_t nanoseconds; + if (interval.nanoseconds >= 0) { + nanoseconds = interval.nanoseconds % 1000; + } else { + nanoseconds = -((-interval.nanoseconds) % 1000); + } + + PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months)); + PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days)); + PyDict_SetItemString(kwargs.obj(), "microseconds", PyLong_FromLongLong(microseconds)); + PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds)); + *out = PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj()); + RETURN_IF_PYERROR(); + return Status::OK(); + + } + return internal::ConvertAsPyObjects( + options, array, to_date_offset, out_objects); + } Status Visit(const Decimal128Type& type) { diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc index 50a00a5542429..bcb0cf30bfe8f 100644 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -20,6 +20,7 @@ #include "arrow/python/arrow_to_python.h" +#include "arrow/python/arrow_to_python_internal.h" #include "arrow/python/datetime.h" #include "arrow/python/helpers.h" #include "arrow/result_internal.h" @@ -67,63 +68,20 @@ struct PyListAssigner { int64_t current_index_ = 0; }; -// Args and Kwargs are passed in to avoid reallocation for batch conversion. -template -Status ConvertToDateOffset(const MonthDayNanoIntervalType::MonthDayNanos& interval, - PyObject* date_offset_constructor, PyObject* args, - PyObject* kwargs, OutputType out) { - DCHECK(internal::BorrowPandasDataOffsetType()); - RETURN_IF_PYERROR(); - // TimeDelta objects do not add nanoseconds component to timestamp. - // so convert microseconds and remainder to preserve data - // but give users more expected results. 
- int64_t microseconds = interval.nanoseconds / 1000; - int64_t nanoseconds; - if (interval.nanoseconds >= 0) { - nanoseconds = interval.nanoseconds % 1000; - } else { - nanoseconds = -((-interval.nanoseconds) % 1000); - } - - PyDict_SetItemString(kwargs, "months", PyLong_FromLong(interval.months)); - PyDict_SetItemString(kwargs, "days", PyLong_FromLong(interval.days)); - PyDict_SetItemString(kwargs, "microseconds", PyLong_FromLongLong(microseconds)); - PyDict_SetItemString(kwargs, "nanoseconds", PyLong_FromLongLong(nanoseconds)); - *out = PyObject_Call(internal::BorrowPandasDataOffsetType(), args, kwargs); - RETURN_IF_PYERROR(); - return Status::OK(); -} - } // namespace Result ArrowToPython::ToPyList(const Array& array) { - RETURN_NOT_OK(Init()); RETURN_NOT_OK(CheckInterval(*array.type())); OwnedRef out_list(PyList_New(array.length())); RETURN_IF_PYERROR(); PyListAssigner out_objects(out_list.obj()); auto& interval_array = arrow::internal::checked_cast(array); - PyObject* date_offset_type = internal::BorrowPandasDataOffsetType(); - if (date_offset_type != nullptr) { - OwnedRef args(PyTuple_New(0)); - OwnedRef kwargs(PyDict_New()); - RETURN_IF_PYERROR(); - RETURN_NOT_OK(internal::WriteArrayObjects( - interval_array, - [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, - PyListAssigner& out) { - return ConvertToDateOffset(interval, date_offset_type, args.obj(), kwargs.obj(), - out); - }, - out_objects)); - } else { RETURN_NOT_OK(internal::WriteArrayObjects( interval_array, [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) { - ASSIGN_OR_RAISE(PyObject * tuple, - internal::MonthDayNanoIntervalToNamedTuple(interval)); + PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval); if (ARROW_PREDICT_FALSE(tuple == nullptr)) { RETURN_IF_PYERROR(); } @@ -132,55 +90,10 @@ Result ArrowToPython::ToPyList(const Array& array) { return Status::OK(); }, out_objects)); - } return out_list.detach(); } -Status 
ArrowToPython::ToNumpyObjectArray(const ArrowToPythonObjectOptions& options, - const ChunkedArray& array, - PyObject** out_objects) { - RETURN_NOT_OK(Init()); - RETURN_NOT_OK(CheckInterval(*array.type())); - OwnedRef args(PyTuple_New(0)); - OwnedRef kwargs(PyDict_New()); - RETURN_IF_PYERROR(); - return internal::ConvertAsPyObjects( - options, array, - [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyObject** out) { - return ConvertToDateOffset(interval, internal::BorrowPandasDataOffsetType(), - args.obj(), kwargs.obj(), out); - }, - out_objects); -} - -Result ArrowToPython::ToLogical(const Scalar& scalar) { - // Pandas's DateOffset is the best type in the python ecosystem to - // for MonthDayNano interval type so use that if it is available. - // Otherwise use the primitive type. (this logical is similar to how - // we handle timestamps today, since datetime.datetime doesn't support nanos). - // In this case timedelta doesn't support months, years or nanos. - RETURN_NOT_OK(Init()); - if (internal::BorrowPandasDataOffsetType() != nullptr) { - RETURN_NOT_OK(CheckInterval(*scalar.type)); - if (!scalar.is_valid) { - Py_INCREF(Py_None); - return Py_None; - } - OwnedRef args(PyTuple_New(0)); - OwnedRef kwargs(PyDict_New()); - RETURN_IF_PYERROR(); - PyObject* out; - RETURN_NOT_OK(ConvertToDateOffset( - arrow::internal::checked_cast(scalar).value, - internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj(), &out)); - return out; - } else { - return ToPrimitive(scalar); - } -} - Result ArrowToPython::ToPrimitive(const Scalar& scalar) { - RETURN_NOT_OK(Init()); RETURN_NOT_OK(CheckInterval(*scalar.type)); if (scalar.is_valid) { return internal::MonthDayNanoIntervalToNamedTuple( @@ -191,11 +104,5 @@ Result ArrowToPython::ToPrimitive(const Scalar& scalar) { } } -Status ArrowToPython::Init() { - // relies on GIL for interpretation. 
- internal::InitPandasStaticData(); - return Status::OK(); -} - } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index 8a752d53fc752..18595b745a22f 100644 --- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -20,10 +20,8 @@ #pragma once -#include "arrow/chunked_array.h" #include "arrow/python/common.h" #include "arrow/python/platform.h" -#include "arrow/util/hashing.h" namespace arrow { @@ -32,130 +30,29 @@ class Scalar; namespace py { -struct ArrowToPythonObjectOptions { - MemoryPool* pool = default_memory_pool(); - bool deduplicate_objects = false; -}; - +/// \brief Utility class for converting Arrow to Python obects. A class instead +/// +/// A class is chosen because in the future some amount of state will be +/// (e.g. imported python classes), doing this one lazily will be helpful +/// and having members present avoids static C++ variables. class ARROW_PYTHON_EXPORT ArrowToPython { public: - // \brief Converts the given Array to a PyList object. Returns NULL if there - // is an error converting the Array. The list elements are the same ones - // generated via ToLogical() - // - // N.B. This has limited type support. ARROW-12976 tracks extending the implementation. + /// \brief Converts the given Array to a PyList object. Returns NULL if there + /// is an error converting the Array. The list elements are the same ones + /// generated via ToLogical() + /// + /// N.B. This has limited type support. ARROW-12976 tracks extending the implementation. Result ToPyList(const Array& array); - // Populates out_objects with the result of converting the array values - // to python objects. The same logic as ToLogical(). - // - // N.B. Not all types are supported. ARROW-12976 tracks extending the implementation. 
- Status ToNumpyObjectArray(const ArrowToPythonObjectOptions& options, - const ChunkedArray& array, PyObject** out_objects); - - // \brief Converts the given Scalar to a python object that best corresponds - // with the Scalar's type. - // - // For example timestamp[ms] is translated into datetime.datetime. - // - // N.B. This has limited type support. ARROW-12976 tracks extending the implementation. - Result ToLogical(const Scalar& scalar); - - // \brief Converts the given Scalar the type that is closed to its arrow - // representation. - // - // For instance timestamp would be translated to a integer representing an + /// \brief Converts the given Scalar the type that is closest to its arrow + /// representation. + /// + /// For instance timestamp would be translated to a integer representing an // offset from the unix epoch. - // - // Returns nullptr on error. - // - // GIL must be health when calling this method. - // N.B. This has limited type support. ARROW-12976 tracks full implementation. + /// + /// N.B. This has limited type support. ARROW-12976 tracks full implementation. Result ToPrimitive(const Scalar& scalar); - - private: - Status Init(); -}; - -namespace internal { -// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic -// to the .cc file and move this there as well if we can. - -// Generic Array -> PyObject** converter that handles object deduplication, if -// requested -template -inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, - Assigner out_values) { - // TODO(ARROW-12976): Use visitor here? 
- const bool has_nulls = arr.null_count() > 0; - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); - } - ++out_values; - } - return Status::OK(); -} - -template -struct MemoizationTraits { - using Scalar = typename T::c_type; }; -template -struct MemoizationTraits> { - // For binary, we memoize string_view as a scalar value to avoid having to - // unnecessarily copy the memory into the memo table data structure - using Scalar = util::string_view; -}; - -template -inline Status ConvertAsPyObjects(const ArrowToPythonObjectOptions& options, - const ChunkedArray& data, WrapFunction&& wrap_func, - PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - using Scalar = typename MemoizationTraits::Scalar; - - ::arrow::internal::ScalarMemoTable memo_table(options.pool); - std::vector unique_values; - int32_t memo_size = 0; - - auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { - int32_t memo_index; - RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); - if (memo_index == memo_size) { - // New entry - RETURN_NOT_OK(wrap_func(value, out_values)); - unique_values.push_back(*out_values); - ++memo_size; - } else { - // Duplicate entry - Py_INCREF(unique_values[memo_index]); - *out_values = unique_values[memo_index]; - } - return Status::OK(); - }; - - auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { - return wrap_func(value, out_values); - }; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = arrow::internal::checked_cast(*data.chunk(c)); - if (options.deduplicate_objects) { - RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values)); - } else { - RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values)); - } - out_values += arr.length(); - } - return Status::OK(); -} - -} // namespace internal - } // namespace py } // namespace 
arrow diff --git a/cpp/src/arrow/python/arrow_to_python_internal.h b/cpp/src/arrow/python/arrow_to_python_internal.h new file mode 100644 index 0000000000000..f9474b089844c --- /dev/null +++ b/cpp/src/arrow/python/arrow_to_python_internal.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures + +#pragma once + +#include "arrow/python/platform.h" +#include "arrow/array.h" + +namespace arrow { +namespace py { +namespace internal { +// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic +// to the .cc file and move this there as well if we can. + +// Converts array to a sequency of python objects. +template +inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, + Assigner out_values) { + // TODO(ARROW-12976): Use visitor here? 
+ const bool has_nulls = arr.null_count() > 0; + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); + } + ++out_values; + } + return Status::OK(); +} + +} // namespace internal +} // namespace py +} // namespace arrow diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index e6a84a3cc4859..1c30b0cfafeeb 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -293,8 +293,6 @@ PyObject* NewMonthDayNanoTupleType() { return (PyObject*)&MonthDayNanoTupleType; } -PyTypeObject* BorrowMonthDayNanoTupleType() { return &MonthDayNanoTupleType; } - Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { int64_t hour = 0, minute = 0, second = 0, microsecond = 0; RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); @@ -475,7 +473,7 @@ Result TzinfoToString(PyObject* tzinfo) { return PyTZInfo_utcoffset_hhmm(tzinfo); } -Result MonthDayNanoIntervalToNamedTuple( +PyObject* MonthDayNanoIntervalToNamedTuple( const MonthDayNanoIntervalType::MonthDayNanos& interval) { OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType)); if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) { diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 405676c39dfea..f11446a759a01 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -184,13 +184,13 @@ Result TzinfoToString(PyObject* pytzinfo); /// Converts MonthDayNano to a python dictionary. /// -/// Returns a named tuple (pyarrow.MonthDayNanoTuple) containin attributes +/// Returns a named tuple (pyarrow.MonthDayNanoTuple) containing attributes /// "months", "days", "nanoseconds" in the given order /// with values extracted from the fields on interval. /// /// GIL must be held when calling this method. 
ARROW_PYTHON_EXPORT -Result<PyObject*> MonthDayNanoIntervalToNamedTuple( +PyObject* MonthDayNanoIntervalToNamedTuple( const MonthDayNanoIntervalType::MonthDayNanos& interval); } // namespace internal diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 6a705cff47f49..1ec229d538133 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -64,7 +64,7 @@ def parse_git(root, **kwargs): if _gc_enabled: _gc.enable() -from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNanoTuple, +from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNano, VersionInfo, cpp_build_info, cpp_version, cpp_version_info, runtime_info, cpu_count, set_cpu_count, enable_signal_handlers, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c7af6a4095163..abea0d25e5496 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2217,7 +2217,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py": cdef cppclass ArrowToPython: CResult[PyObject*] ToPyList(const CArray& array) CResult[PyObject*] ToPrimitive(const CScalar& scalar) - CResult[PyObject*] ToLogical(const CScalar& scalar) cdef extern from "arrow/python/api.h" namespace "arrow::py::internal": cdef object NewMonthDayNanoTupleType() diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 9bfda108eab98..221b5299c6f64 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -37,9 +37,10 @@ arrow_init_numpy() # (used from some of our C++ code, see e.g. ARROW-5260) import_pyarrow() +# Singleton object to do conversion from C++ to Python.
cdef libarrow.ArrowToPython ARROW_TO_PYTHON -MonthDayNanoTuple = NewMonthDayNanoTupleType() +MonthDayNano = NewMonthDayNanoTupleType() def cpu_count(): From 299d97b6cefd09f60ec15734f243086cffc01325 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 4 Oct 2021 22:47:59 -0700 Subject: [PATCH 05/24] address feedback --- cpp/src/arrow/python/arrow_to_pandas.cc | 63 +++++++++---------- cpp/src/arrow/python/arrow_to_python.cc | 25 ++++---- cpp/src/arrow/python/arrow_to_python.h | 3 +- .../arrow/python/arrow_to_python_internal.h | 2 +- cpp/src/arrow/python/datetime.cc | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 22 ++++--- python/pyarrow/array.pxi | 12 +++- python/pyarrow/scalar.pxi | 14 ++--- python/pyarrow/tests/test_array.py | 30 +++++---- python/pyarrow/tests/test_scalars.py | 22 ++----- python/pyarrow/types.pxi | 4 +- 11 files changed, 92 insertions(+), 107 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index cec3d0b819460..8e50d12432992 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -590,9 +590,8 @@ struct MemoizationTraits> { // Generic Array -> PyObject** converter that handles object deduplication, if // requested template -inline Status ConvertAsPyObjects(const PandasOptions& options, - const ChunkedArray& data, WrapFunction&& wrap_func, - PyObject** out_values) { +inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, + WrapFunction&& wrap_func, PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; using Scalar = typename MemoizationTraits::Scalar; @@ -632,8 +631,6 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, return Status::OK(); } - - Status ConvertStruct(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { if (data.num_chunks() == 0) { @@ -1087,35 +1084,35 @@ struct ObjectWriterVisitor { template enable_if_t::value, Status> Visit( const 
Type& type) { - OwnedRef args(PyTuple_New(0)); - OwnedRef kwargs(PyDict_New()); - RETURN_IF_PYERROR(); - auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, - PyObject** out) { - DCHECK(internal::BorrowPandasDataOffsetType() != nullptr); - // TimeDelta objects do not add nanoseconds component to timestamp. - // so convert microseconds and remainder to preserve data - // but give users more expected results. - int64_t microseconds = interval.nanoseconds / 1000; - int64_t nanoseconds; - if (interval.nanoseconds >= 0) { - nanoseconds = interval.nanoseconds % 1000; - } else { - nanoseconds = -((-interval.nanoseconds) % 1000); - } - - PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months)); - PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days)); - PyDict_SetItemString(kwargs.obj(), "microseconds", PyLong_FromLongLong(microseconds)); - PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds)); - *out = PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj()); - RETURN_IF_PYERROR(); - return Status::OK(); - - } - return internal::ConvertAsPyObjects( - options, array, to_date_offset, out_objects); + OwnedRef args(PyTuple_New(0)); + OwnedRef kwargs(PyDict_New()); + RETURN_IF_PYERROR(); + auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, + PyObject** out) { + DCHECK(internal::BorrowPandasDataOffsetType() != nullptr); + // TimeDelta objects do not add nanoseconds component to timestamp. + // so convert microseconds and remainder to preserve data + // but give users more expected results. 
+ int64_t microseconds = interval.nanoseconds / 1000; + int64_t nanoseconds; + if (interval.nanoseconds >= 0) { + nanoseconds = interval.nanoseconds % 1000; + } else { + nanoseconds = -((-interval.nanoseconds) % 1000); + } + PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months)); + PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days)); + PyDict_SetItemString(kwargs.obj(), "microseconds", + PyLong_FromLongLong(microseconds)); + PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds)); + *out = + PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj()); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects( + options, data, to_date_offset, out_values); } Status Visit(const Decimal128Type& type) { diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc index bcb0cf30bfe8f..eb0aab0f12f98 100644 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -77,19 +77,18 @@ Result ArrowToPython::ToPyList(const Array& array) { PyListAssigner out_objects(out_list.obj()); auto& interval_array = arrow::internal::checked_cast(array); - RETURN_NOT_OK(internal::WriteArrayObjects( - interval_array, - [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, - PyListAssigner& out) { - PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval); - if (ARROW_PREDICT_FALSE(tuple == nullptr)) { - RETURN_IF_PYERROR(); - } - - *out = tuple; - return Status::OK(); - }, - out_objects)); + RETURN_NOT_OK(internal::WriteArrayObjects( + interval_array, + [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) { + PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval); + if (ARROW_PREDICT_FALSE(tuple == nullptr)) { + RETURN_IF_PYERROR(); + } + + *out = tuple; + return Status::OK(); + }, + out_objects)); return out_list.detach(); } diff --git 
a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index 18595b745a22f..11fcd3e4aaee9 100644 --- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -41,7 +41,8 @@ class ARROW_PYTHON_EXPORT ArrowToPython { /// is an error converting the Array. The list elements are the same ones /// generated via ToLogical() /// - /// N.B. This has limited type support. ARROW-12976 tracks extending the implementation. + /// N.B. This has limited type support. ARROW-12976 tracks extending the + /// implementation. Result ToPyList(const Array& array); /// \brief Converts the given Scalar the type that is closest to its arrow diff --git a/cpp/src/arrow/python/arrow_to_python_internal.h b/cpp/src/arrow/python/arrow_to_python_internal.h index f9474b089844c..abdf48b270612 100644 --- a/cpp/src/arrow/python/arrow_to_python_internal.h +++ b/cpp/src/arrow/python/arrow_to_python_internal.h @@ -20,8 +20,8 @@ #pragma once -#include "arrow/python/platform.h" #include "arrow/array.h" +#include "arrow/python/platform.h" namespace arrow { namespace py { diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 1c30b0cfafeeb..1f4c986ec75d6 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -80,7 +80,7 @@ static PyStructSequence_Field MonthDayNanoField[] = { {nullptr, nullptr}}; static PyStructSequence_Desc MonthDayNanoTupleDesc = { - "MonthDayNanoTuple", "A interval consistent of months, days and nanoseconds.", + "MonthDayNano", "A calendar interval consisting of months, days and nanoseconds.", MonthDayNanoField, /*n_in_sequence=*/3}; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index c5450e79e6b41..10cde4d4dc106 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -128,30 +128,32 @@ const MonthDayNanoAttrData MonthDayNanoTraits:: template struct PopulateMonthDayNano { 
using Traits = MonthDayNanoTraits; + using field_c_type = typename Traits::c_type; inline static Status Field(PyObject* obj, - typename MonthDayNanoTraits::c_type* out, + field_c_type* out, bool* found_attrs) { *out = 0; for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0; ++attr) { if (attr->multiplier != 1 && ::arrow::internal::MultiplyWithOverflow( - static_cast(attr->multiplier), *out, out)) { - return Status::Invalid("Overflow on: ", (attr - 1)->name); + static_cast(attr->multiplier), *out, out)) { + return Status::Invalid("Overflow on: ", (attr - 1)->name, " for: ", internal::PyObject_StdStringRepr(obj)); } - if (PyObject_HasAttrString(obj, attr->name)) { + OwnedRef field_value(PyObject_GetAttrString(obj, attr->name)); + if (field_value.obj() == nullptr) { + // No attribute present, skip to the next one. + PyErr_Clear(); + continue; + } RETURN_IF_PYERROR(); *found_attrs = true; - if (field_value.obj() == Py_None) { - continue; - } - typename Traits::c_type value; + field_c_type value; RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name)); if (::arrow::internal::AddWithOverflow(*out, value, out)) { - return Status::Invalid("Overflow on: ", attr->name); + return Status::Invalid("Overflow on: ", attr->name, " for: ", internal::PyObject_StdStringRepr(obj)); } - } } return Status::OK(); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5fc35da448cc3..cb33f15091476 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -163,6 +163,12 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, representation). Timezone-naive data will be implicitly interpreted as UTC. + Pandas's DateOffsets and dateutil.relativedelta.relativedetla are by + default converted as MonthDayNanoIntervalArray. relativedelta leapday's + are ignored as are all absolute fields on both objects. 
datetime.timedelta + also be converted to MonthDayNanoIntervalArray but require passing + MonthDayIntervalType explicitly. + Converting to dictionary array will promote to a wider integer type for indices if the number of distinct values cannot be represented, even if the index type was explicitly set. This means that if there are more than @@ -1508,6 +1514,7 @@ cdef class DurationArray(NumericArray): Concrete class for Arrow arrays of duration data type. """ + cdef class MonthDayNanoIntervalArray(Array): """ Concrete class for Arrow arrays of interval[MonthDayNano] type. @@ -1517,9 +1524,7 @@ cdef class MonthDayNanoIntervalArray(Array): """ Convert to a list of native Python objects. - is installed the objects will be - pd.tseries.offsets.DateOffset objects. Otherwise they are - pyarrow.MonthDayNanoTuple objects. + pyarrow.MonthDayNano is used as the native representation. Returns ------- @@ -1532,6 +1537,7 @@ cdef class MonthDayNanoIntervalArray(Array): py_list = GetResultValue(maybe_py_list) return PyObject_to_object(py_list) + cdef class HalfFloatArray(FloatingPointArray): """ Concrete class for Arrow arrays of float16 data type. diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 716a72869c95e..72fac799b3eba 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -516,26 +516,22 @@ cdef class DurationScalar(Scalar): cdef class MonthDayNanoIntervalScalar(Scalar): """ - Concrete class for month, day, nanosecond scalars. + Concrete class for month, day, nanosecond interval scalars. """ @property def value(self): """ - Returns this value as a pyarrow.MonthDayNanoTuple. + Same as self.as_py() """ - cdef PyObject* val - val = GetResultValue(ARROW_TO_PYTHON.ToPrimitive( - (deref(self.wrapped.get())))) - return PyObject_to_object(val) + return self.as_py() def as_py(self): """ - Return this value as a Pandas DateOffset instance if Pandas is present - otherwise as a named tuple containing months days and nanoseconds. 
+ Returns this value as a pyarrow.MonthDayNano. """ cdef PyObject* val - val = GetResultValue(ARROW_TO_PYTHON.ToLogical( + val = GetResultValue(ARROW_TO_PYTHON.ToPrimitive( (deref(self.wrapped.get())))) return PyObject_to_object(val) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 3003093bfb7fc..7be1c676ae453 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2163,7 +2163,6 @@ def test_array_from_numpy_ascii(): assert arrow_arr.equals(expected) -@pytest.mark.nopandas def test_interval_array_from_timedelta(): data = [ None, @@ -2176,20 +2175,19 @@ def test_interval_array_from_timedelta(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNanoTuple([0, 8, - (datetime.timedelta(seconds=1, microseconds=1, - milliseconds=1, minutes=1, - hours=1) // - datetime.timedelta(microseconds=1)) * 1000])] + pa.MonthDayNano([0, 8, + (datetime.timedelta(seconds=1, microseconds=1, + milliseconds=1, minutes=1, + hours=1) // + datetime.timedelta(microseconds=1)) * 1000])] expected = pa.array(expected_list) assert arr.equals(expected) assert arr.to_pylist() == expected_list -# dateutil is dependency of pandas - @pytest.mark.pandas def test_interval_array_from_relativedelta(): + # dateutil is dependency of pandas from dateutil.relativedelta import relativedelta from pandas.tseries.offsets import DateOffset data = [ @@ -2205,13 +2203,13 @@ def test_interval_array_from_relativedelta(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNanoTuple([13, 8, - (datetime.timedelta(seconds=1, microseconds=1, - minutes=1, hours=1) // - datetime.timedelta(microseconds=1)) * 1000])] + pa.MonthDayNano([13, 8, + (datetime.timedelta(seconds=1, microseconds=1, + minutes=1, hours=1) // + datetime.timedelta(microseconds=1)) * 1000])] expected = pa.array(expected_list) assert arr.equals(expected) - assert arr.to_pylist() == [ + assert 
arr.to_pandas().tolist() == [ None, DateOffset(months=13, days=8, microseconds=( datetime.timedelta(seconds=1, microseconds=1, @@ -2244,11 +2242,11 @@ def test_interval_array_from_dateoffset(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNanoTuple([13, 8, 3661000001001]), - pa.MonthDayNanoTuple([0, 0, 0])] + pa.MonthDayNano([13, 8, 3661000001001]), + pa.MonthDayNano([0, 0, 0])] expected = pa.array(expected_list) assert arr.equals(expected) - assert arr.to_pylist() == [ + assert arr.to_pandas().tolist() == [ None, DateOffset(months=13, days=8, microseconds=( datetime.timedelta(seconds=1, microseconds=1, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index c345d14724099..5c774003ad668 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -64,7 +64,7 @@ pa.Time32Scalar, pa.Time32Value), (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value), (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue), - (pa.MonthDayNanoTuple([1, -1, -10100]), None, + (pa.MonthDayNano([1, -1, -10100]), None, pa.MonthDayNanoIntervalScalar, None), ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar, @@ -367,24 +367,10 @@ def test_duration_nanos_nopandas(): arr[0].as_py() -@pytest.mark.pandas -def test_month_day_nano_interval_pandas(): - from pandas.tseries.offsets import DateOffset - - triple = pa.MonthDayNanoTuple([3600, 3600, 3600]) - arr = pa.array([triple]) - expected = DateOffset(days=3600, months=3600, microseconds=3, - nanoseconds=600) - assert isinstance(arr[0].as_py(), DateOffset) - assert arr[0].as_py() == expected - assert arr[0].value == triple - - -@pytest.mark.nopandas -def test_month_day_nano_interval_nopandas(): - triple = pa.MonthDayNanoTuple([3600, 3600, 3600]) +def test_month_day_nano_interval(): + triple = pa.MonthDayNano([3600, 3600, 
3600]) arr = pa.array([triple]) - assert isinstance(arr[0].as_py(), pa.MonthDayNanoTuple) + assert isinstance(arr[0].as_py(), pa.MonthDayNano) assert arr[0].as_py() == triple assert arr[0].value == triple diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 03ef08e68f329..233d207de01d3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2178,8 +2178,8 @@ def duration(unit): def month_day_nano_interval(): """ - Create instance of a interval representing the time between two calendar - instances represented as a triple of months, days and nanoseconds. + Create instance of an interval representing months, days and nanoseconds + between two dates. """ return primitive_type(_Type_INTERVAL_MONTH_DAY_NANO) From 096797b38e9fbeba5c5e09900eecd93e98d5ba56 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 4 Oct 2021 22:53:19 -0700 Subject: [PATCH 06/24] last format/lint/anonymous namespace --- cpp/src/arrow/python/arrow_to_pandas.cc | 4 +-- cpp/src/arrow/python/python_to_arrow.cc | 37 +++++++++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 8e50d12432992..1ab9aa35422a9 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -1111,8 +1111,8 @@ struct ObjectWriterVisitor { RETURN_IF_PYERROR(); return Status::OK(); }; - return ConvertAsPyObjects( - options, data, to_date_offset, out_values); + return ConvertAsPyObjects(options, data, to_date_offset, + out_values); } Status Visit(const Decimal128Type& type) { diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 10cde4d4dc106..75012dfe58480 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -69,6 +69,7 @@ using internal::MakeConverter; namespace py { +namespace { enum class MonthDayNanoField { kMonths, kWeeksAndDays, 
kDaysOnly, kNanoseconds }; template @@ -129,37 +130,39 @@ template struct PopulateMonthDayNano { using Traits = MonthDayNanoTraits; using field_c_type = typename Traits::c_type; - inline static Status Field(PyObject* obj, - field_c_type* out, - bool* found_attrs) { + inline static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) { *out = 0; for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0; ++attr) { if (attr->multiplier != 1 && ::arrow::internal::MultiplyWithOverflow( static_cast(attr->multiplier), *out, out)) { - return Status::Invalid("Overflow on: ", (attr - 1)->name, " for: ", internal::PyObject_StdStringRepr(obj)); + return Status::Invalid("Overflow on: ", (attr - 1)->name, + " for: ", internal::PyObject_StdStringRepr(obj)); } - OwnedRef field_value(PyObject_GetAttrString(obj, attr->name)); - if (field_value.obj() == nullptr) { - // No attribute present, skip to the next one. - PyErr_Clear(); - continue; - } - RETURN_IF_PYERROR(); - *found_attrs = true; - field_c_type value; - RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name)); - if (::arrow::internal::AddWithOverflow(*out, value, out)) { - return Status::Invalid("Overflow on: ", attr->name, " for: ", internal::PyObject_StdStringRepr(obj)); - } + OwnedRef field_value(PyObject_GetAttrString(obj, attr->name)); + if (field_value.obj() == nullptr) { + // No attribute present, skip to the next one. 
+ PyErr_Clear(); + continue; + } + RETURN_IF_PYERROR(); + *found_attrs = true; + field_c_type value; + RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name)); + if (::arrow::internal::AddWithOverflow(*out, value, out)) { + return Status::Invalid("Overflow on: ", attr->name, + " for: ", internal::PyObject_StdStringRepr(obj)); + } } return Status::OK(); } }; +} // namespace + // Utility for converting single python objects to their intermediate C representations // which can be fed to the typed builders class PyValue { From 67527dc6027b7bdb45b53a72c4c862354517b7ff Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 4 Oct 2021 22:58:13 -0700 Subject: [PATCH 07/24] add all the rest of apis to anonymous i python_to_arrow --- cpp/src/arrow/python/python_to_arrow.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 75012dfe58480..0960ff8ccc73d 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -161,8 +161,6 @@ struct PopulateMonthDayNano { } }; -} // namespace - // Utility for converting single python objects to their intermediate C representations // which can be fed to the typed builders class PyValue { @@ -1121,6 +1119,8 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz return Status::OK(); } +} // namespace + Result> ConvertPySequence(PyObject* obj, PyObject* mask, PyConversionOptions options, MemoryPool* pool) { From 9845763dd2cfec5822b2e3affcc051c2cb4cac4e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 Oct 2021 00:08:31 -0700 Subject: [PATCH 08/24] some ci fixes --- cpp/src/arrow/python/arrow_to_python.h | 2 +- cpp/src/arrow/python/inference.cc | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index 11fcd3e4aaee9..d7435816ff3ee 100644 
--- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -26,7 +26,7 @@ namespace arrow { class Array; -class Scalar; +struct Scalar; namespace py { diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 35d1482819226..db5f0896a95bc 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -399,7 +399,9 @@ class TypeInferrer { RETURN_NOT_OK(VisitNdarray(obj, keep_going)); } else if (PyDict_Check(obj)) { RETURN_NOT_OK(VisitDict(obj)); - } else if (PyList_Check(obj) || PyTuple_Check(obj)) { + } else if (PyList_Check(obj) || + (PyTuple_Check(obj) && + !PyObject_IsInstance(obj, PyTuple_GetItem(interval_types_.obj(), 0)))) { RETURN_NOT_OK(VisitList(obj, keep_going)); } else if (PyObject_IsInstance(obj, decimal_type_.obj())) { RETURN_NOT_OK(max_decimal_metadata_.Update(obj)); From 51fa76bf577e86e99cc095e852948e57db067834 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 Oct 2021 00:12:35 -0700 Subject: [PATCH 09/24] try to fix py 3.6 --- cpp/src/arrow/python/datetime.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 1f4c986ec75d6..7e7b1c843de00 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -73,14 +73,21 @@ bool MatchFixedOffset(const std::string& tz, util::string_view* sign, static PyTypeObject MonthDayNanoTupleType = {0, 0}; +constexpr char* NonConst(const char* st) { + // Hack for python versions < 3.7 where members of PyStruct members + // where non-const (C++ doesn't like assigning string literals to these types) + return const_cast(st); +} + static PyStructSequence_Field MonthDayNanoField[] = { - {"months", "The number of months in the interval"}, - {"days", "The number days in the interval"}, - {"nanoseconds", "The number of nanoseconds in the interval"}, + {NonConst("months"), NonConst("The number of 
months in the interval")}, + {NonConst("days"), NonConst("The number days in the interval")}, + {NonConst("nanoseconds"), NonConst("The number of nanoseconds in the interval")}, {nullptr, nullptr}}; static PyStructSequence_Desc MonthDayNanoTupleDesc = { - "MonthDayNano", "A calendar interval consisting of months, days and nanoseconds.", + NonConst("MonthDayNano"), + NonConst("A calendar interval consisting of months, days and nanoseconds."), MonthDayNanoField, /*n_in_sequence=*/3}; From f5b27504c780e0358cf8942fc984f5d35d5610b0 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 Oct 2021 00:45:43 -0700 Subject: [PATCH 10/24] Add common casts --- cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc index 0fe537ebbaf49..5f16f1e9db455 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc @@ -505,6 +505,13 @@ std::shared_ptr GetDurationCast() { return func; } +std::shared_ptr GetIntervalCast() { + auto func = std::make_shared("cast_month_day_nano_interval", + Type::INTERVAL_MONTH_DAY_NANO); + AddCommonCasts(Type::INTERVAL_MONTH_DAY_NANO, kOutputTargetType, func.get()); + return func; +} + std::shared_ptr GetTime32Cast() { auto func = std::make_shared("cast_time32", Type::TIME32); AddCommonCasts(Type::TIME32, kOutputTargetType, func.get()); @@ -579,6 +586,7 @@ std::vector> GetTemporalCasts() { functions.push_back(GetDate32Cast()); functions.push_back(GetDate64Cast()); functions.push_back(GetDurationCast()); + functions.push_back(GetIntervalCast()); functions.push_back(GetTime32Cast()); functions.push_back(GetTime64Cast()); functions.push_back(GetTimestampCast()); From a9ca3edbcd0a89734cf10dc15fe23acd340e4a4e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 Oct 2021 16:30:03 -0700 Subject: [PATCH 11/24] address more 
comments --- cpp/src/arrow/python/arrow_to_pandas.cc | 6 ++++-- cpp/src/arrow/python/arrow_to_python.h | 18 ++++++++---------- cpp/src/arrow/python/datetime.cc | 2 +- python/pyarrow/array.pxi | 4 ++-- python/pyarrow/tests/test_scalars.py | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 1ab9aa35422a9..3f386ad529ad8 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -1090,8 +1090,10 @@ struct ObjectWriterVisitor { auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyObject** out) { DCHECK(internal::BorrowPandasDataOffsetType() != nullptr); - // TimeDelta objects do not add nanoseconds component to timestamp. - // so convert microseconds and remainder to preserve data + // DateOffset objects do not add nanoseconds component to pd.Timestamp. + // as of Pandas 1.3.3 + // (https://github.com/pandas-dev/pandas/issues/43892). + // So convert microseconds and remainder to preserve data // but give users more expected results. int64_t microseconds = interval.nanoseconds / 1000; int64_t nanoseconds; diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index d7435816ff3ee..1dd46a0231385 100644 --- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -15,9 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - +// Utilities for converting arrow to python (non-pandas) objects. #pragma once #include "arrow/python/common.h" @@ -30,16 +28,16 @@ struct Scalar; namespace py { -/// \brief Utility class for converting Arrow to Python obects. A class instead +/// \brief Utility class for converting Arrow to Python obects. 
/// /// A class is chosen because in the future some amount of state will be -/// (e.g. imported python classes), doing this one lazily will be helpful -/// and having members present avoids static C++ variables. +/// (e.g. imported python classes), doing this once lazily is helpful. +/// A class allows for keeping the state as member variables instead of +/// static variables. It is expected cython code will instantiate this +/// class as a singleton on module class. class ARROW_PYTHON_EXPORT ArrowToPython { public: - /// \brief Converts the given Array to a PyList object. Returns NULL if there - /// is an error converting the Array. The list elements are the same ones - /// generated via ToLogical() + /// \brief Converts the given Array to a PyList object. /// /// N.B. This has limited type support. ARROW-12976 tracks extending the /// implementation. @@ -49,7 +47,7 @@ class ARROW_PYTHON_EXPORT ArrowToPython { /// representation. /// /// For instance timestamp would be translated to a integer representing an - // offset from the unix epoch. + /// offset from the unix epoch. /// /// N.B. This has limited type support. ARROW-12976 tracks full implementation. 
Result ToPrimitive(const Scalar& scalar); diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 7e7b1c843de00..193c47ce7a4d2 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -71,7 +71,7 @@ bool MatchFixedOffset(const std::string& tz, util::string_view* sign, return iter == (tz.data() + tz.size()); } -static PyTypeObject MonthDayNanoTupleType = {0, 0}; +static PyTypeObject MonthDayNanoTupleType = {0, 0, 0, 0, 0, 0}; constexpr char* NonConst(const char* st) { // Hack for python versions < 3.7 where members of PyStruct members diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index cb33f15091476..a2eb5524ed943 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -163,10 +163,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, representation). Timezone-naive data will be implicitly interpreted as UTC. - Pandas's DateOffsets and dateutil.relativedelta.relativedetla are by + Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by default converted as MonthDayNanoIntervalArray. relativedelta leapday's are ignored as are all absolute fields on both objects. datetime.timedelta - also be converted to MonthDayNanoIntervalArray but require passing + can also be converted to MonthDayNanoIntervalArray but require passing MonthDayIntervalType explicitly. 
Converting to dictionary array will promote to a wider integer type for diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 5c774003ad668..778ce1066aba9 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -368,7 +368,7 @@ def test_duration_nanos_nopandas(): def test_month_day_nano_interval(): - triple = pa.MonthDayNano([3600, 3600, 3600]) + triple = pa.MonthDayNano([-3600, 1800, -50]) arr = pa.array([triple]) assert isinstance(arr[0].as_py(), pa.MonthDayNano) assert arr[0].as_py() == triple From 88a97db904e08db3200f5c9b51871823b107d61f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 Oct 2021 21:49:07 -0700 Subject: [PATCH 12/24] try empty initializer --- cpp/src/arrow/python/datetime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 193c47ce7a4d2..e2ecd3fb6d8ef 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -71,7 +71,7 @@ bool MatchFixedOffset(const std::string& tz, util::string_view* sign, return iter == (tz.data() + tz.size()); } -static PyTypeObject MonthDayNanoTupleType = {0, 0, 0, 0, 0, 0}; +static PyTypeObject MonthDayNanoTupleType = {}; constexpr char* NonConst(const char* st) { // Hack for python versions < 3.7 where members of PyStruct members From f1a6d15ff2b313dbba0807fbdab6844f4e493b99 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 Oct 2021 23:21:52 -0700 Subject: [PATCH 13/24] rename ToPrimitive as ToPyObject. 
Update docs and removeunussed variables --- cpp/src/arrow/python/arrow_to_python.cc | 2 +- cpp/src/arrow/python/arrow_to_python.h | 13 ++++++++----- cpp/src/arrow/python/python_to_arrow.cc | 4 ---- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/scalar.pxi | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc index eb0aab0f12f98..a25141e71ed5a 100644 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -92,7 +92,7 @@ Result ArrowToPython::ToPyList(const Array& array) { return out_list.detach(); } -Result ArrowToPython::ToPrimitive(const Scalar& scalar) { +Result ArrowToPython::ToPyObject(const Scalar& scalar) { RETURN_NOT_OK(CheckInterval(*scalar.type)); if (scalar.is_valid) { return internal::MonthDayNanoIntervalToNamedTuple( diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index 1dd46a0231385..98f66b2356198 100644 --- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -39,18 +39,21 @@ class ARROW_PYTHON_EXPORT ArrowToPython { public: /// \brief Converts the given Array to a PyList object. /// + /// The list consists of the same as calling ToPyObject on each scalar + /// in the array.. + /// /// N.B. This has limited type support. ARROW-12976 tracks extending the /// implementation. Result ToPyList(const Array& array); - /// \brief Converts the given Scalar the type that is closest to its arrow - /// representation. + /// \brief Converts the given Scalar the type to its logical equivalent type + /// in python. /// - /// For instance timestamp would be translated to a integer representing an - /// offset from the unix epoch. + /// For instance Decimal128 and Decimal256 would be converted to + /// decimal.Decimal. /// /// N.B. This has limited type support. ARROW-12976 tracks full implementation. 
- Result ToPrimitive(const Scalar& scalar); + Result ToPyObject(const Scalar& scalar); }; } // namespace py diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 0960ff8ccc73d..4b51205034a9f 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -83,7 +83,6 @@ struct MonthDayNanoAttrData { template <> struct MonthDayNanoTraits { using c_type = int32_t; - static constexpr char name[] = "months"; static const MonthDayNanoAttrData attrs[]; }; @@ -93,7 +92,6 @@ const MonthDayNanoAttrData MonthDayNanoTraits::attrs template <> struct MonthDayNanoTraits { using c_type = int32_t; - static constexpr char name[] = "days"; static const MonthDayNanoAttrData attrs[]; }; @@ -103,7 +101,6 @@ const MonthDayNanoAttrData MonthDayNanoTraits: template <> struct MonthDayNanoTraits { using c_type = int32_t; - static constexpr char name[] = "days"; static const MonthDayNanoAttrData attrs[]; }; @@ -113,7 +110,6 @@ const MonthDayNanoAttrData MonthDayNanoTraits::att template <> struct MonthDayNanoTraits { using c_type = int64_t; - static constexpr char name[] = "nanoseconds"; static const MonthDayNanoAttrData attrs[]; }; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index abea0d25e5496..223d64f44bd6d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2216,7 +2216,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py": cdef cppclass ArrowToPython: CResult[PyObject*] ToPyList(const CArray& array) - CResult[PyObject*] ToPrimitive(const CScalar& scalar) + CResult[PyObject*] ToPyObject(const CScalar& scalar) cdef extern from "arrow/python/api.h" namespace "arrow::py::internal": cdef object NewMonthDayNanoTupleType() diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 72fac799b3eba..3f7e2f5e00439 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -531,7 +531,7 @@ 
cdef class MonthDayNanoIntervalScalar(Scalar): Returns this value as a pyarrow.MonthDayNano. """ cdef PyObject* val - val = GetResultValue(ARROW_TO_PYTHON.ToPrimitive( + val = GetResultValue(ARROW_TO_PYTHON.ToPyObject( (deref(self.wrapped.get())))) return PyObject_to_object(val) From 19d407276cc2e92c7dd1d47cef4831df575b1eee Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 Oct 2021 13:36:41 -0700 Subject: [PATCH 14/24] remove arrow_to_python for now --- cpp/src/arrow/python/CMakeLists.txt | 1 - cpp/src/arrow/python/arrow_to_python.cc | 107 ------------------------ cpp/src/arrow/python/arrow_to_python.h | 60 ------------- cpp/src/arrow/python/datetime.cc | 68 +++++++++++++++ cpp/src/arrow/python/datetime.h | 16 ++++ python/pyarrow/array.pxi | 4 +- python/pyarrow/includes/libarrow.pxd | 17 +++- python/pyarrow/lib.pyx | 2 - python/pyarrow/scalar.pxi | 9 +- 9 files changed, 106 insertions(+), 178 deletions(-) delete mode 100644 cpp/src/arrow/python/arrow_to_python.cc delete mode 100644 cpp/src/arrow/python/arrow_to_python.h diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index c3b501a049ee0..40f351b56a5d2 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -28,7 +28,6 @@ add_dependencies(arrow_python-all arrow_python arrow_python-tests) set(ARROW_PYTHON_SRCS arrow_to_pandas.cc - arrow_to_python.cc benchmark.cc common.cc datetime.cc diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc deleted file mode 100644 index a25141e71ed5a..0000000000000 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#include "arrow/python/arrow_to_python.h" - -#include "arrow/python/arrow_to_python_internal.h" -#include "arrow/python/datetime.h" -#include "arrow/python/helpers.h" -#include "arrow/result_internal.h" -#include "arrow/scalar.h" - -namespace arrow { -namespace py { -namespace { - -Status CheckInterval(const DataType& datatype) { - if (datatype.id() != Type::INTERVAL_MONTH_DAY_NANO) { - return Status::NotImplemented( - "Only MonthDayIntervalNanoIntervalType supported. Provided.", - datatype.ToString()); - } - return Status::OK(); -} - -// Wrapper around a Python list object that mimics dereference and assignment -// operations. 
-struct PyListAssigner { - public: - explicit PyListAssigner(PyObject* list) : list_(list) { DCHECK(PyList_Check(list_)); } - - PyListAssigner& operator*() { return *this; } - - void operator=(PyObject* obj) { - if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) { - Py_FatalError("list did not have the correct preallocated size."); - } - } - - PyListAssigner& operator++() { - current_index_++; - return *this; - } - - PyListAssigner& operator+=(int64_t offset) { - current_index_ += offset; - return *this; - } - - private: - PyObject* list_; - int64_t current_index_ = 0; -}; - -} // namespace - -Result ArrowToPython::ToPyList(const Array& array) { - RETURN_NOT_OK(CheckInterval(*array.type())); - OwnedRef out_list(PyList_New(array.length())); - RETURN_IF_PYERROR(); - PyListAssigner out_objects(out_list.obj()); - auto& interval_array = - arrow::internal::checked_cast(array); - RETURN_NOT_OK(internal::WriteArrayObjects( - interval_array, - [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) { - PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval); - if (ARROW_PREDICT_FALSE(tuple == nullptr)) { - RETURN_IF_PYERROR(); - } - - *out = tuple; - return Status::OK(); - }, - out_objects)); - return out_list.detach(); -} - -Result ArrowToPython::ToPyObject(const Scalar& scalar) { - RETURN_NOT_OK(CheckInterval(*scalar.type)); - if (scalar.is_valid) { - return internal::MonthDayNanoIntervalToNamedTuple( - arrow::internal::checked_cast(scalar).value); - } else { - Py_INCREF(Py_None); - return Py_None; - } -} - -} // namespace py -} // namespace arrow diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h deleted file mode 100644 index 98f66b2356198..0000000000000 --- a/cpp/src/arrow/python/arrow_to_python.h +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Utilities for converting arrow to python (non-pandas) objects. -#pragma once - -#include "arrow/python/common.h" -#include "arrow/python/platform.h" - -namespace arrow { - -class Array; -struct Scalar; - -namespace py { - -/// \brief Utility class for converting Arrow to Python obects. -/// -/// A class is chosen because in the future some amount of state will be -/// (e.g. imported python classes), doing this once lazily is helpful. -/// A class allows for keeping the state as member variables instead of -/// static variables. It is expected cython code will instantiate this -/// class as a singleton on module class. -class ARROW_PYTHON_EXPORT ArrowToPython { - public: - /// \brief Converts the given Array to a PyList object. - /// - /// The list consists of the same as calling ToPyObject on each scalar - /// in the array.. - /// - /// N.B. This has limited type support. ARROW-12976 tracks extending the - /// implementation. - Result ToPyList(const Array& array); - - /// \brief Converts the given Scalar the type to its logical equivalent type - /// in python. - /// - /// For instance Decimal128 and Decimal256 would be converted to - /// decimal.Decimal. - /// - /// N.B. This has limited type support. ARROW-12976 tracks full implementation. 
- Result ToPyObject(const Scalar& scalar); -}; - -} // namespace py -} // namespace arrow diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index e2ecd3fb6d8ef..8c954998f0e0a 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -20,9 +20,12 @@ #include #include +#include "arrow/array.h" +#include "arrow/python/arrow_to_python_internal.h" #include "arrow/python/common.h" #include "arrow/python/helpers.h" #include "arrow/python/platform.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" @@ -493,6 +496,71 @@ PyObject* MonthDayNanoIntervalToNamedTuple( return tuple.detach(); } +namespace { + +// Wrapper around a Python list object that mimics dereference and assignment +// operations. +struct PyListAssigner { + public: + explicit PyListAssigner(PyObject* list) : list_(list) { DCHECK(PyList_Check(list_)); } + + PyListAssigner& operator*() { return *this; } + + void operator=(PyObject* obj) { + if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) { + Py_FatalError("list did not have the correct preallocated size."); + } + } + + PyListAssigner& operator++() { + current_index_++; + return *this; + } + + PyListAssigner& operator+=(int64_t offset) { + current_index_ += offset; + return *this; + } + + private: + PyObject* list_; + int64_t current_index_ = 0; +}; + +} // namespace + +Result MonthDayNanoIntervalArrayToPyList( + const MonthDayNanoIntervalArray& array) { + OwnedRef out_list(PyList_New(array.length())); + RETURN_IF_PYERROR(); + PyListAssigner out_objects(out_list.obj()); + auto& interval_array = + arrow::internal::checked_cast(array); + RETURN_NOT_OK(internal::WriteArrayObjects( + interval_array, + [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) { + PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval); + if (ARROW_PREDICT_FALSE(tuple == nullptr)) { + 
RETURN_IF_PYERROR(); + } + + *out = tuple; + return Status::OK(); + }, + out_objects)); + return out_list.detach(); +} + +Result MonthDayNanoIntervalScalarToPyObject( + const MonthDayNanoIntervalScalar& scalar) { + if (scalar.is_valid) { + return internal::MonthDayNanoIntervalToNamedTuple(scalar.value); + } else { + Py_INCREF(Py_None); + return Py_None; + } +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index f11446a759a01..529b74f04d56d 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -34,6 +34,10 @@ #define PyDateTimeAPI ::arrow::py::internal::datetime_api namespace arrow { + +class MonthDayNanoIntervalArray; +class MonthDayNanoIntervalScalar; + namespace py { namespace internal { @@ -193,6 +197,18 @@ ARROW_PYTHON_EXPORT PyObject* MonthDayNanoIntervalToNamedTuple( const MonthDayNanoIntervalType::MonthDayNanos& interval); +/// \brief Converts the given Array to a PyList object containing +/// pyarrow.MonthDayNano objects. +ARROW_PYTHON_EXPORT +Result MonthDayNanoIntervalArrayToPyList( + const MonthDayNanoIntervalArray& array); + +/// \brief Converts the Scalar object to a pyarrow.MonthDayNano (or None if +/// it isn't valid).
+ARROW_PYTHON_EXPORT +Result MonthDayNanoIntervalScalarToPyObject( + const MonthDayNanoIntervalScalar& scalar); + } // namespace internal } // namespace py } // namespace arrow diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a2eb5524ed943..e92b5ec982eb8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1533,7 +1533,9 @@ cdef class MonthDayNanoIntervalArray(Array): cdef: CResult[PyObject*] maybe_py_list PyObject* py_list - maybe_py_list = ARROW_TO_PYTHON.ToPyList(deref(self.sp_array)) + CMonthDayNanoIntervalArray* array + array = self.sp_array.get() + maybe_py_list = MonthDayNanoIntervalArrayToPyList(deref(array)) py_list = GetResultValue(maybe_py_list) return PyObject_to_object(py_list) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 223d64f44bd6d..815238f112c6e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -558,6 +558,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CDurationArray" arrow::DurationArray"(CArray): int64_t Value(int i) + cdef cppclass CMonthDayNanoIntervalArray \ + "arrow::MonthDayNanoIntervalArray"(CArray): + pass + cdef cppclass CHalfFloatArray" arrow::HalfFloatArray"(CArray): uint16_t Value(int i) @@ -1000,6 +1004,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CDurationScalar" arrow::DurationScalar"(CScalar): int64_t value + cdef cppclass CMonthDayNanoIntervalScalar \ + "arrow::MonthDayNanoIntervalScalar"(CScalar): + pass + cdef cppclass CBaseBinaryScalar" arrow::BaseBinaryScalar"(CScalar): shared_ptr[CBuffer] value @@ -2214,12 +2222,13 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py": CResult[shared_ptr[CDataType]] InferArrowType( object obj, object mask, c_bool pandas_null_sentinels) - cdef cppclass ArrowToPython: - CResult[PyObject*] ToPyList(const CArray& array) - CResult[PyObject*] ToPyObject(const CScalar& scalar) cdef extern from 
"arrow/python/api.h" namespace "arrow::py::internal": - cdef object NewMonthDayNanoTupleType() + object NewMonthDayNanoTupleType() + CResult[PyObject*] MonthDayNanoIntervalArrayToPyList( + const CMonthDayNanoIntervalArray& array) + CResult[PyObject*] MonthDayNanoIntervalScalarToPyObject( + const CMonthDayNanoIntervalScalar& scalar) cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 221b5299c6f64..e081c14dcdc4b 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -37,8 +37,6 @@ arrow_init_numpy() # (used from some of our C++ code, see e.g. ARROW-5260) import_pyarrow() -# Singleton object to do conversion from C++ to Python. -cdef libarrow.ArrowToPython ARROW_TO_PYTHON MonthDayNano = NewMonthDayNanoTupleType() diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 3f7e2f5e00439..49c94873cd1f0 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -530,9 +530,12 @@ cdef class MonthDayNanoIntervalScalar(Scalar): """ Returns this value as a pyarrow.MonthDayNano. 
""" - cdef PyObject* val - val = GetResultValue(ARROW_TO_PYTHON.ToPyObject( - (deref(self.wrapped.get())))) + cdef: + PyObject* val + CMonthDayNanoIntervalScalar* scalar + scalar = self.wrapped.get() + val = GetResultValue(MonthDayNanoIntervalScalarToPyObject( + deref(scalar))) return PyObject_to_object(val) From 3cac885c2267db1670e8119cbed2f0340b17e9f1 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Wed, 6 Oct 2021 14:28:29 -0700 Subject: [PATCH 15/24] Apply suggestions from Joris's code review Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/python/datetime.h | 4 ++-- python/pyarrow/array.pxi | 2 +- python/pyarrow/tests/test_pandas.py | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 529b74f04d56d..e20c905376587 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -186,9 +186,9 @@ Result StringToTzinfo(const std::string& tz); ARROW_PYTHON_EXPORT Result TzinfoToString(PyObject* pytzinfo); -/// Converts MonthDayNano to a python dictionary. +/// Converts MonthDayNano to a python namedtuple. /// -/// Returns a named tuple (pyarrow.MonthDayNanoTuple) containing attributes +/// Returns a named tuple (pyarrow.MonthDayNano) containing attributes /// "months", "days", "nanoseconds" in the given order /// with values extracted from the fields on interval. /// diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e92b5ec982eb8..e9ee33c84699d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -167,7 +167,7 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, default converted as MonthDayNanoIntervalArray. relativedelta leapday's are ignored as are all absolute fields on both objects. datetime.timedelta can also be converted to MonthDayNanoIntervalArray but require passing - MonthDayIntervalType explicitly. + MonthDayNanoIntervalType explicitly. 
Converting to dictionary array will promote to a wider integer type for indices if the number of distinct values cannot be represented, even if diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 8b1f42130cc6b..be3ccda67e200 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1502,8 +1502,7 @@ def test_month_day_nano_interval(self): DateOffset(days=3600, months=3600, microseconds=3, nanoseconds=600)] }) - field = pa.field('date_offset', pa.month_day_nano_interval()) - schema = pa.schema([field]) + schema = pa.schema([('date_offset', pa.month_day_nano_interval()]) _check_pandas_roundtrip( df, expected_schema=schema) From b4c45013ae97f53903111b79c1a5a158b143849b Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 Oct 2021 14:44:52 -0700 Subject: [PATCH 16/24] address small comments --- cpp/src/arrow/python/arrow_to_python_internal.h | 2 -- cpp/src/arrow/python/helpers.cc | 2 +- python/pyarrow/tests/test_array.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_python_internal.h b/cpp/src/arrow/python/arrow_to_python_internal.h index abdf48b270612..486aaf6d92874 100644 --- a/cpp/src/arrow/python/arrow_to_python_internal.h +++ b/cpp/src/arrow/python/arrow_to_python_internal.h @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures #pragma once diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index c37bfde2bc329..38f93cd053c1c 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -325,7 +325,7 @@ void InitPandasStaticData() { // Import DateOffset type OwnedRef offsets; - if (internal::ImportModule("pandas.tseries.offsets", &offsets).ok()) { + if (internal::ImportModule("pandas", &offsets).ok()) { if (internal::ImportFromModule(offsets.obj(), "DateOffset", &ref).ok()) { pandas_DateOffset = ref.obj(); } diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 7be1c676ae453..bbd0ad048f58d 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2189,7 +2189,7 @@ def test_interval_array_from_timedelta(): def test_interval_array_from_relativedelta(): # dateutil is dependency of pandas from dateutil.relativedelta import relativedelta - from pandas.tseries.offsets import DateOffset + from pandas import DateOffset data = [ None, relativedelta(years=1, months=1, From c64015237203100158ded1e8d201dedffba81f31 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 Oct 2021 14:48:50 -0700 Subject: [PATCH 17/24] simplify --- cpp/src/arrow/python/arrow_to_python_internal.h | 1 - cpp/src/arrow/python/helpers.cc | 7 ++----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_python_internal.h b/cpp/src/arrow/python/arrow_to_python_internal.h index 486aaf6d92874..514cda3200123 100644 --- a/cpp/src/arrow/python/arrow_to_python_internal.h +++ b/cpp/src/arrow/python/arrow_to_python_internal.h @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
- #pragma once #include "arrow/array.h" diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 38f93cd053c1c..8e3bebb26f18b 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -324,11 +324,8 @@ void InitPandasStaticData() { } // Import DateOffset type - OwnedRef offsets; - if (internal::ImportModule("pandas", &offsets).ok()) { - if (internal::ImportFromModule(offsets.obj(), "DateOffset", &ref).ok()) { - pandas_DateOffset = ref.obj(); - } + if (internal::ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) { + pandas_DateOffset = ref.obj(); } pandas_static_initialized = true; From b5aadb1959e0206e2335129a3fcf6fd26bc4f436 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 Oct 2021 15:12:37 -0700 Subject: [PATCH 18/24] remove bad include --- cpp/src/arrow/python/api.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index b170470d2785a..a0b13d6d13013 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -18,7 +18,6 @@ #pragma once #include "arrow/python/arrow_to_pandas.h" -#include "arrow/python/arrow_to_python.h" #include "arrow/python/common.h" #include "arrow/python/datetime.h" #include "arrow/python/deserialize.h" From c2aa56e5ddfdbdcea46dc5630cb36886a2cad39c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 Oct 2021 15:41:20 -0700 Subject: [PATCH 19/24] address more feedback --- docs/source/python/api/arrays.rst | 4 ++++ docs/source/python/api/datatypes.rst | 2 ++ python/pyarrow/lib.pyx | 1 + python/pyarrow/tests/strategies.py | 7 ++++++- python/pyarrow/tests/test_pandas.py | 2 +- python/pyarrow/tests/test_schema.py | 1 + python/pyarrow/tests/test_types.py | 14 +++++++++++++- python/pyarrow/types.py | 17 +++++++++++++++-- 8 files changed, 43 insertions(+), 5 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 17b061dc7d82d..37e02616dc974 100644 --- 
a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -65,6 +65,8 @@ may expose data type-specific methods or properties. Date32Array Date64Array TimestampArray + DurationArray + MonthDayNanoIntervalArray Decimal128Array DictionaryArray ListArray @@ -115,6 +117,8 @@ classes may expose data type-specific methods or properties. Date32Scalar Date64Scalar TimestampScalar + DurationScalar + MonthDayNanoIntervalScalar Decimal128Scalar DictionaryScalar ListScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 90e387ee07a87..48a254a001391 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -47,6 +47,8 @@ These should be used to create Arrow data types and schemas. timestamp date32 date64 + duration + month_day_nano_interval binary string utf8 diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index e081c14dcdc4b..0c9cbcc5bcc9f 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -99,6 +99,7 @@ Type_TIMESTAMP = _Type_TIMESTAMP Type_TIME32 = _Type_TIME32 Type_TIME64 = _Type_TIME64 Type_DURATION = _Type_DURATION +Type_INTERVAL_MONTH_DAY_NANO = _Type_INTERVAL_MONTH_DAY_NANO Type_BINARY = _Type_BINARY Type_STRING = _Type_STRING Type_LARGE_BINARY = _Type_LARGE_BINARY diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 92b0d3617c0a5..d314785ff64c8 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -105,11 +105,15 @@ pa.duration, st.sampled_from(['s', 'ms', 'us', 'ns']) ) +interval_types = st.sampled_from( + pa.month_day_nano_interval() +) temporal_types = st.one_of( date_types, time_types, timestamp_types, - duration_types + duration_types, + interval_types ) primitive_types = st.one_of( @@ -372,6 +376,7 @@ def tables(draw, type, rows=None, max_fields=None): # discovers ARROW-10210 # timestamp_types, # duration_types + interval_types, binary_type, string_type,
large_binary_type, diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index be3ccda67e200..cdfcb19590975 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1502,7 +1502,7 @@ def test_month_day_nano_interval(self): DateOffset(days=3600, months=3600, microseconds=3, nanoseconds=600)] }) - schema = pa.schema([('date_offset', pa.month_day_nano_interval()]) + schema = pa.schema(['date_offset', pa.month_day_nano_interval()]) _check_pandas_roundtrip( df, expected_schema=schema) diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 3ea9bd0ce7b83..f26eaaf5fc14f 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -137,6 +137,7 @@ def test_type_for_alias(): ('duration[ms]', pa.duration('ms')), ('duration[us]', pa.duration('us')), ('duration[ns]', pa.duration('ns')), + ('month_day_nano_interval', pa.month_day_nano_interval()), ] for val, expected in cases: diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e4192c6701b66..07715b985bdbd 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -241,8 +241,10 @@ def test_is_temporal_date_time_timestamp(): time_types = [pa.time32('s'), pa.time64('ns')] timestamp_types = [pa.timestamp('ms')] duration_types = [pa.duration('ms')] + interval_types = [pa.month_day_nano_interval()] - for case in date_types + time_types + timestamp_types + duration_types: + for case in (date_types + time_types + timestamp_types + duration_types + + interval_types): assert types.is_temporal(case) for case in date_types: @@ -250,24 +252,34 @@ def test_is_temporal_date_time_timestamp(): assert not types.is_time(case) assert not types.is_timestamp(case) assert not types.is_duration(case) + assert not types.is_interval(case) for case in time_types: assert types.is_time(case) assert not types.is_date(case) assert not 
types.is_timestamp(case) assert not types.is_duration(case) + assert not types.is_interval(case) for case in timestamp_types: assert types.is_timestamp(case) assert not types.is_date(case) assert not types.is_time(case) assert not types.is_duration(case) + assert not types.is_interval(case) for case in duration_types: assert types.is_duration(case) assert not types.is_date(case) assert not types.is_time(case) assert not types.is_timestamp(case) + assert not types.is_interval(case) + + for case in interval_types: + assert types.is_interval(case) + assert not types.is_date(case) + assert not types.is_time(case) + assert not types.is_timestamp(case) assert not types.is_temporal(pa.int32()) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 041946d66e657..0617210dc1908 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -34,8 +34,10 @@ _DECIMAL_TYPES = {lib.Type_DECIMAL128, lib.Type_DECIMAL256} _DATE_TYPES = {lib.Type_DATE32, lib.Type_DATE64} _TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64} -_TEMPORAL_TYPES = {lib.Type_TIMESTAMP, - lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES +_INTERVAL_TYPES = {lib.Type_INTERVAL_MONTH_DAY_NANO} +_TEMPORAL_TYPES = ({lib.Type_TIMESTAMP, + lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES | + _INTERVAL_TYPES) _UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION} _NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT, lib.Type_MAP} | _UNION_TYPES @@ -526,6 +528,17 @@ def is_dictionary(t): return t.id == lib.Type_DICTIONARY +def is_interval(t): + """ + Return Tre if the value is an instance of an interval type. + + Parameters + ---------- + t : DataType + """ + return t.id == lib.Type_INTERVAL_MONTH_DAY_NANO + + def is_primitive(t): """ Return True if the value is an instance of a primitive type.
From 0e47a98d333024482538582f00ca9ff62400630a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 Oct 2021 15:59:15 -0700 Subject: [PATCH 20/24] fix pandas test --- python/pyarrow/tests/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index cdfcb19590975..112c7938edd80 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1502,7 +1502,7 @@ def test_month_day_nano_interval(self): DateOffset(days=3600, months=3600, microseconds=3, nanoseconds=600)] }) - schema = pa.schema(['date_offset', pa.month_day_nano_interval()]) + schema = pa.schema([('date_offset', pa.month_day_nano_interval())]) _check_pandas_roundtrip( df, expected_schema=schema) From 3b2db245126c10cd2376eaa9538f68b72b20d28c Mon Sep 17 00:00:00 2001 From: emkornfield Date: Wed, 6 Oct 2021 21:19:56 -0700 Subject: [PATCH 21/24] Apply suggestions from code review from Weston Co-authored-by: Weston Pace --- python/pyarrow/array.pxi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e9ee33c84699d..d85687f7d3934 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -164,10 +164,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, UTC. Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by - default converted as MonthDayNanoIntervalArray. relativedelta leapday's + default converted as MonthDayNanoIntervalArray. relativedelta leapdays are ignored as are all absolute fields on both objects. datetime.timedelta - can also be converted to MonthDayNanoIntervalArray but require passing - MonthDayNanoIntervalType explicitly. + can also be converted to MonthDayNanoIntervalArray but this requires + passing MonthDayNanoIntervalType explicitly. 
Converting to dictionary array will promote to a wider integer type for indices if the number of distinct values cannot be represented, even if From 5a5c73b1797d01290e616a9b79b01e88b069cd5b Mon Sep 17 00:00:00 2001 From: emkornfield Date: Wed, 6 Oct 2021 22:01:32 -0700 Subject: [PATCH 22/24] Update python/pyarrow/types.pxi Co-authored-by: Weston Pace --- python/pyarrow/types.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 233d207de01d3..43549982d7279 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2178,7 +2178,7 @@ def duration(unit): def month_day_nano_interval(): """ - Create instance of an interval representing months, days and nanoseconds + Create instance of an interval type representing months, days and nanoseconds between two dates. """ return primitive_type(_Type_INTERVAL_MONTH_DAY_NANO) From 9467051368498680207830395258d034f53bce46 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 7 Oct 2021 00:18:53 -0700 Subject: [PATCH 23/24] fix lint --- python/pyarrow/types.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 43549982d7279..8795e4d3af907 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2178,8 +2178,8 @@ def duration(unit): def month_day_nano_interval(): """ - Create instance of an interval type representing months, days and nanoseconds - between two dates. + Create instance of an interval type representing months, days and + nanoseconds between two dates. 
""" return primitive_type(_Type_INTERVAL_MONTH_DAY_NANO) From 93108a906a3bcbcd334f34a7e78c8ef8459a8d97 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 7 Oct 2021 09:36:36 +0200 Subject: [PATCH 24/24] Nits --- cpp/src/arrow/python/datetime.h | 15 ++++++--------- cpp/src/arrow/python/helpers.cc | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 3 ++- docs/source/python/api/arrays.rst | 2 +- python/pyarrow/scalar.pxi | 2 +- python/pyarrow/types.py | 4 ++-- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index e20c905376587..dd07710aaf6a4 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -24,6 +24,7 @@ #include "arrow/python/visibility.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/util/logging.h" // By default, PyDateTimeAPI is a *static* variable. This forces @@ -34,10 +35,6 @@ #define PyDateTimeAPI ::arrow::py::internal::datetime_api namespace arrow { - -class MonthDayNanoIntervalArray; -class MonthDayNanoIntervalScalar; - namespace py { namespace internal { @@ -186,9 +183,9 @@ Result StringToTzinfo(const std::string& tz); ARROW_PYTHON_EXPORT Result TzinfoToString(PyObject* pytzinfo); -/// Converts MonthDayNano to a python namedtuple. +/// \brief Convert MonthDayNano to a python namedtuple. /// -/// Returns a named tuple (pyarrow.MonthDayNano) containing attributes +/// Return a named tuple (pyarrow.MonthDayNano) containing attributes /// "months", "days", "nanoseconds" in the given order /// with values extracted from the fields on interval. /// @@ -197,14 +194,14 @@ ARROW_PYTHON_EXPORT PyObject* MonthDayNanoIntervalToNamedTuple( const MonthDayNanoIntervalType::MonthDayNanos& interval); -/// \brief Converts the given Array to a PyList object contain +/// \brief Convert the given Array to a PyList object containing /// pyarrow.MonthDayNano objects. 
ARROW_PYTHON_EXPORT Result MonthDayNanoIntervalArrayToPyList( const MonthDayNanoIntervalArray& array); -/// \brief Converts the Scalar obect to a pyarrow.MonthDayNano (or None if -/// is isn't valid. +/// \brief Convert the Scalar obect to a pyarrow.MonthDayNano (or None if +/// is isn't valid). ARROW_PYTHON_EXPORT Result MonthDayNanoIntervalScalarToPyObject( const MonthDayNanoIntervalScalar& scalar); diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 8e3bebb26f18b..bcaf656d785c7 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -324,7 +324,7 @@ void InitPandasStaticData() { } // Import DateOffset type - if (internal::ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) { + if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) { pandas_DateOffset = ref.obj(); } diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 4b51205034a9f..d13b9083d233a 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -126,7 +126,8 @@ template struct PopulateMonthDayNano { using Traits = MonthDayNanoTraits; using field_c_type = typename Traits::c_type; - inline static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) { + + static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) { *out = 0; for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0; ++attr) { diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 37e02616dc974..dbc4c0bd19643 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -65,7 +65,7 @@ may expose data type-specific methods or properties. 
Date32Array Date64Array TimestampArray - DuratoinArray + DurationArray MonthDayNanoIntervalArray Decimal128Array DictionaryArray diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 49c94873cd1f0..80fcc00286154 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -528,7 +528,7 @@ cdef class MonthDayNanoIntervalScalar(Scalar): def as_py(self): """ - Returns this value as a pyarrow.MonthDayNano. + Return this value as a pyarrow.MonthDayNano. """ cdef: PyObject* val diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 0617210dc1908..f239c883b45a5 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -36,7 +36,7 @@ _TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64} _INTERVAL_TYPES = {lib.Type_INTERVAL_MONTH_DAY_NANO} _TEMPORAL_TYPES = ({lib.Type_TIMESTAMP, - lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES | + lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES | _INTERVAL_TYPES) _UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION} _NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT, @@ -530,7 +530,7 @@ def is_dictionary(t): def is_interval(t): """ - Return Tre if the value is an instance of an interval type. + Return True if the value is an instance of an interval type. Parameters ----------