Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Fixed regressions
- Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`)
- Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
- Fixed a bug in :meth:`read_csv` with ``float_precision="high"`` that caused a segfault or incorrect parsing of long exponent strings (:issue:`38753`)

.. ---------------------------------------------------------------------------

Expand Down
11 changes: 7 additions & 4 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1726,7 +1726,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
// Process string of digits.
num_digits = 0;
n = 0;
while (isdigit_ascii(*p)) {
while (num_digits < max_digits && isdigit_ascii(*p)) {
n = n * 10 + (*p - '0');
num_digits++;
p++;
Expand All @@ -1747,10 +1747,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
} else if (exponent > 0) {
number *= e[exponent];
} else if (exponent < -308) { // Subnormal
if (exponent < -616) // Prevent invalid array access.
if (exponent < -616) { // Prevent invalid array access.
number = 0.;
number /= e[-308 - exponent];
number /= e[308];
} else {
number /= e[-308 - exponent];
number /= e[308];
}

} else {
number /= e[-exponent];
}
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/io/parser/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,33 @@ def python_parser_only(request):
return request.param


def _get_all_parser_float_precision_combinations():
    """
    Pair every parser with each float precision it allows.

    Returns a dict with two parallel lists: ``"params"`` holds the
    ``(parser, precision)`` tuples and ``"ids"`` holds the matching
    ``"<parser_id>-<precision>"`` labels.
    """
    combos = [
        (parser, parser_id, precision)
        for parser, parser_id in zip(_all_parsers, _all_parser_ids)
        for precision in parser.float_precision_choices
    ]

    return {
        "params": [(parser, precision) for parser, _, precision in combos],
        "ids": [f"{parser_id}-{precision}" for _, parser_id, precision in combos],
    }


# The helper returns a dict whose keys ("params" and "ids") are exactly the
# keyword arguments of pytest.fixture, so build it once and unpack it instead
# of calling the helper twice.
@pytest.fixture(**_get_all_parser_float_precision_combinations())
def all_parsers_all_precisions(request):
    """
    Fixture for all allowable combinations of parser
    and float precision.

    Each parametrized run receives one ``(parser, precision)`` tuple.
    """
    return request.param


_utf_values = [8, 16, 32]

_encoding_seps = ["", "-", "_"]
Expand Down
39 changes: 34 additions & 5 deletions pandas/tests/io/parser/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pytest

from pandas._libs.tslib import Timestamp
from pandas.compat import is_platform_linux
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
import pandas.util._test_decorators as td

Expand Down Expand Up @@ -1259,15 +1260,14 @@ def test_float_parser(all_parsers):
tm.assert_frame_equal(result, expected)


def test_scientific_no_exponent(all_parsers):
def test_scientific_no_exponent(all_parsers_all_precisions):
# see gh-12215
df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
data = df.to_csv(index=False)
parser = all_parsers
parser, precision = all_parsers_all_precisions

for precision in parser.float_precision_choices:
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
tm.assert_frame_equal(df_roundtrip, df)
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
tm.assert_frame_equal(df_roundtrip, df)


@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
Expand Down Expand Up @@ -1351,6 +1351,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
    """
    A value written with a very negative exponent is read back as 0.0
    for every parser / float precision combination.
    """
    # GH#38753
    parser, precision = all_parsers_all_precisions
    csv_text = f"data\n10E{neg_exp}"

    expected = DataFrame({"data": [0.0]})
    result = parser.read_csv(StringIO(csv_text), float_precision=precision)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
# GH#38753
parser, precision = all_parsers_all_precisions
data = f"data\n10E{exp}"
result = parser.read_csv(StringIO(data), float_precision=precision)
if precision == "round_trip":
if exp == 999999999999999999 and is_platform_linux():
mark = pytest.mark.xfail(reason="On Linux gives object result")
request.node.add_marker(mark)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This discrepancy is unrelated to this PR and also occurs on master; I have opened #38794 for it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, can you put the issue number in the xfail reason itself?


value = np.inf if exp > 0 else 0.0
expected = DataFrame({"data": [value]})
else:
expected = DataFrame({"data": [f"10E{exp}"]})

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
# see gh-9535
Expand Down