-
-
Notifications
You must be signed in to change notification settings - Fork 19.4k
API/BUG: Handling Dtype Coercions in Series/Index (GH 15832) #15859
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 21 commits
676a4e5
e12bca7
9fc617b
8b463cb
43456a5
faa5c5c
1c90e7e
278c2fb
a8cd752
bbdea4b
d2e26ac
3c868a4
20ac5c6
14ed83b
3d0e76f
1726408
1f8e9b7
83cfc5d
417188a
939ae11
86e9d5e
359086d
50950f5
012fb57
35a5ff1
b1e6632
a1033cb
b78f4cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |
| is_timedelta64_dtype, is_dtype_equal, | ||
| is_float_dtype, is_complex_dtype, | ||
| is_integer_dtype, | ||
| is_unsigned_integer_dtype, | ||
| is_datetime_or_timedelta_dtype, | ||
| is_bool_dtype, is_scalar, | ||
| _string_dtypes, | ||
|
|
@@ -1026,3 +1027,28 @@ def find_common_type(types): | |
| return np.object | ||
|
|
||
| return np.find_common_type(types, []) | ||
|
|
||
|
|
||
| def maybe_cast_to_integer(arr, dtype): | ||
| """ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This actually can take any dtype and will return a casted version. It happens to check for integer/unsigned integer dtypes. So pls expand a little. |
||
| Find a common data type among the given dtypes. | ||
|
||
|
|
||
| Parameters | ||
| ---------- | ||
| arr : array | ||
|
||
| dtype : dtype | ||
|
|
||
| Returns | ||
| ------- | ||
| integer or unsigned integer array (or raise if the dtype is incompatible) | ||
|
||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the spirit of good documentation, let's add some examples here!
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Excellent! Nice examples. |
||
| """ | ||
|
|
||
| if is_unsigned_integer_dtype(dtype) and (np.asarray(arr) < 0).any(): | ||
|
||
| raise OverflowError("Trying to coerce negative values to negative " | ||
| "integers") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this then needs to cast, no? e.g. (before the return)
|
||
| elif is_integer_dtype(dtype) and is_float_dtype(np.asarray(arr)): | ||
| if ((np.asarray(arr) % np.asarray(arr).astype(int)) > 0).any(): | ||
|
||
| raise OverflowError("Trying to coerce float values to integers") | ||
| else: | ||
|
||
| return arr | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,6 +52,7 @@ | |
| from pandas.core.ops import _comp_method_OBJECT_ARRAY | ||
| from pandas.core.strings import StringAccessorMixin | ||
| from pandas.core.config import get_option | ||
| from pandas.core.dtypes.cast import maybe_cast_to_integer | ||
|
||
|
|
||
|
|
||
| # simplify | ||
|
|
@@ -212,12 +213,21 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
| if is_integer_dtype(dtype): | ||
| inferred = lib.infer_dtype(data) | ||
| if inferred == 'integer': | ||
| data = maybe_cast_to_integer(data, dtype=dtype) | ||
| data = np.array(data, copy=copy, dtype=dtype) | ||
|
||
| elif inferred in ['floating', 'mixed-integer-float']: | ||
| if isnull(data).any(): | ||
| raise ValueError('cannot convert float ' | ||
| 'NaN to integer') | ||
|
|
||
| if is_integer_dtype(dtype) and not \ | ||
|
||
| (np.asarray(data).astype(int) == 0).all(): | ||
| if ((np.asarray(data) % np.asarray(data). | ||
| astype(int)) > 0).any(): | ||
| raise OverflowError("Trying to coerce " | ||
| "float values to " | ||
| "integers") | ||
|
|
||
| # If we are actually all equal to integers, | ||
| # then coerce to integer. | ||
| try: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,7 +35,8 @@ | |
| from pandas.core.dtypes.cast import ( | ||
| maybe_upcast, infer_dtype_from_scalar, | ||
| maybe_convert_platform, | ||
| maybe_cast_to_datetime, maybe_castable) | ||
| maybe_cast_to_datetime, maybe_castable, | ||
| maybe_cast_to_integer) | ||
| from pandas.core.dtypes.missing import isnull, notnull | ||
|
|
||
| from pandas.core.common import (is_bool_indexer, | ||
|
|
@@ -2916,9 +2917,11 @@ def _try_cast(arr, take_fast_path): | |
| return arr | ||
|
|
||
| try: | ||
| subarr = maybe_cast_to_integer(arr, dtype) | ||
| subarr = maybe_cast_to_datetime(arr, dtype) | ||
| if not is_extension_type(subarr): | ||
| subarr = np.array(subarr, dtype=dtype, copy=copy) | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am going to move this entire section (e.g.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if you want to move part/all of this ok with that as well.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| except (ValueError, TypeError): | ||
| if is_categorical_dtype(dtype): | ||
| subarr = Categorical(arr) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -678,6 +678,17 @@ def test_constructor_corner(self): | |
| with tm.assert_raises_regex(TypeError, 'casting'): | ||
| Int64Index(arr_with_floats) | ||
|
|
||
| def test_constructor_overflow_coercion_signed_to_unsigned(self): | ||
| # GH 15832 | ||
| for t in ['uint8', 'uint16', 'uint32', 'uint64']: | ||
|
||
| with pytest.raises(OverflowError): | ||
| Index([-1], dtype=t) | ||
|
|
||
| def test_constructor_overflow_coercion_float_to_int(self): | ||
| # GH 15832 | ||
| with pytest.raises(OverflowError): | ||
|
||
| Index([1, 2, 3.5], dtype=int) | ||
|
|
||
| def test_coerce_list(self): | ||
| # coerce things | ||
| arr = Index([1, 2, 3, 4]) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -285,8 +285,10 @@ def test_setitem_index_int64(self): | |
| self._assert_setitem_index_conversion(obj, 5, exp_index, np.int64) | ||
|
|
||
| # int + float -> float | ||
| exp_index = pd.Index([0, 1, 2, 3, 1.1]) | ||
| self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.float64) | ||
| with pytest.raises(OverflowError): | ||
| exp_index = pd.Index([0, 1, 2, 3, 1.1]) | ||
|
||
| self._assert_setitem_index_conversion(obj, 1.1, exp_index, | ||
| np.float64) | ||
|
|
||
| # int + object -> object | ||
| exp_index = pd.Index([0, 1, 2, 3, 'x']) | ||
|
|
@@ -373,8 +375,9 @@ def test_insert_index_int64(self): | |
| self._assert_insert_conversion(obj, 1, exp, np.int64) | ||
|
|
||
| # int + float -> float | ||
| exp = pd.Index([1, 1.1, 2, 3, 4]) | ||
| self._assert_insert_conversion(obj, 1.1, exp, np.float64) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same, this is a legit operation |
||
| with pytest.raises(OverflowError): | ||
| exp = pd.Index([1, 1.1, 2, 3, 4]) | ||
| self._assert_insert_conversion(obj, 1.1, exp, np.float64) | ||
|
|
||
| # int + bool -> int | ||
| exp = pd.Index([1, 0, 2, 3, 4]) | ||
|
|
@@ -622,7 +625,8 @@ def test_where_series_int64(self): | |
| self._where_int64_common(pd.Series) | ||
|
|
||
| def test_where_index_int64(self): | ||
| self._where_int64_common(pd.Index) | ||
| with pytest.raises(OverflowError): | ||
|
||
| self._where_int64_common(pd.Index) | ||
|
|
||
| def _where_float64_common(self, klass): | ||
| obj = klass([1.1, 2.2, 3.3, 4.4]) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2080,24 +2080,26 @@ def test_table_values_dtypes_roundtrip(self): | |
| assert df1.dtypes[0] == 'float32' | ||
|
|
||
| # check with mixed dtypes | ||
| df1 = DataFrame(dict([(c, Series(np.random.randn(5), dtype=c)) | ||
| for c in ['float32', 'float64', 'int32', | ||
| 'int64', 'int16', 'int8']])) | ||
| df1['string'] = 'foo' | ||
| df1['float322'] = 1. | ||
| df1['float322'] = df1['float322'].astype('float32') | ||
| df1['bool'] = df1['float32'] > 0 | ||
| df1['time1'] = Timestamp('20130101') | ||
| df1['time2'] = Timestamp('20130102') | ||
|
|
||
| store.append('df_mixed_dtypes1', df1) | ||
| result = store.select('df_mixed_dtypes1').get_dtype_counts() | ||
| expected = Series({'float32': 2, 'float64': 1, 'int32': 1, | ||
| 'bool': 1, 'int16': 1, 'int8': 1, | ||
| 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) | ||
| result = result.sort_index() | ||
| result = expected.sort_index() | ||
| tm.assert_series_equal(result, expected) | ||
| with pytest.raises(OverflowError): | ||
| df1 = DataFrame(dict([(c, Series(np.random.randn(5), dtype=c)) | ||
|
||
| for c in ['float32', 'float64', 'int32', | ||
| 'int64', 'int16', 'int8']])) | ||
| df1['string'] = 'foo' | ||
| df1['float322'] = 1. | ||
| df1['float322'] = df1['float322'].astype('float32') | ||
| df1['bool'] = df1['float32'] > 0 | ||
| df1['time1'] = Timestamp('20130101') | ||
| df1['time2'] = Timestamp('20130102') | ||
|
|
||
| store.append('df_mixed_dtypes1', df1) | ||
| result = store.select('df_mixed_dtypes1').get_dtype_counts() | ||
| expected = Series({'float32': 2, 'float64': 1, 'int32': 1, | ||
| 'bool': 1, 'int16': 1, 'int8': 1, | ||
| 'int64': 1, 'object': 1, | ||
| 'datetime64[ns]': 2}) | ||
| result = result.sort_index() | ||
| result = expected.sort_index() | ||
| tm.assert_series_equal(result, expected) | ||
|
|
||
| def test_table_mixed_dtypes(self): | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,30 +1,26 @@ | ||
| # coding=utf-8 | ||
| # pylint: disable-msg=E1101,W0612 | ||
|
|
||
| import pytest | ||
|
|
||
| from datetime import datetime, timedelta | ||
|
|
||
| from numpy import nan | ||
| import numpy as np | ||
| import numpy.ma as ma | ||
| import pandas as pd | ||
|
|
||
| from pandas.core.dtypes.common import ( | ||
| is_categorical_dtype, | ||
| is_datetime64tz_dtype) | ||
| import pytest | ||
| from numpy import nan | ||
| from pandas import (Index, Series, isnull, date_range, | ||
| NaT, period_range, MultiIndex, IntervalIndex) | ||
| from pandas.core.indexes.datetimes import Timestamp, DatetimeIndex | ||
| from pandas import compat | ||
| from pandas.compat import lrange, range, zip, OrderedDict, long | ||
|
|
||
| import pandas.util.testing as tm | ||
| from pandas._libs import lib | ||
| from pandas._libs.tslib import iNaT | ||
|
|
||
| from pandas.compat import lrange, range, zip, OrderedDict, long | ||
| from pandas import compat | ||
| from pandas.core.dtypes.common import ( | ||
| is_categorical_dtype, | ||
| is_datetime64tz_dtype) | ||
| from pandas.core.indexes.datetimes import Timestamp, DatetimeIndex | ||
| from pandas.util.testing import assert_series_equal | ||
| import pandas.util.testing as tm | ||
|
|
||
| from .common import TestData | ||
|
|
||
|
|
||
|
|
@@ -888,3 +884,14 @@ def test_constructor_generic_timestamp_deprecated(self): | |
| msg = "cannot convert datetimelike" | ||
| with tm.assert_raises_regex(TypeError, msg): | ||
| Series([], dtype='M8[ps]') | ||
|
|
||
| def test_constructor_overflow_coercion_signed_to_unsigned(self): | ||
| # GH 15832 | ||
| for t in ['uint8', 'uint16', 'uint32', 'uint64']: | ||
| with pytest.raises(OverflowError): | ||
| Series([-1], dtype=t) | ||
|
|
||
| def test_constructor_overflow_coercion_float_to_int(self): | ||
| # GH 15832 | ||
| with pytest.raises(OverflowError): | ||
|
||
| Series([1, 2, 3.5], dtype=int) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rename to
maybe_cast_to_integer_arrayUh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback : Perhaps we could just generalize to accept scalar as well as
array(as consistent with other casting functions in this module) and keep the name as is?