Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 8d7d3fb

Browse files
jeffknupp authored and jreback committed Jul 23, 2017
BUG: Use size_t to avoid array index overflow; add missing malloc of error_msg
Fix a few locations where a parser's `error_msg` buffer is written to without having been previously allocated. This manifested as a double free during exception handling code making use of the `error_msg`. Additionally, use `size_t/ssize_t` where array indices or lengths will be stored. Previously, int32_t was used and would overflow on columns with very large amounts of data (i.e. greater than INTMAX bytes). xref #14696 closes #16798 Author: Jeff Knupp <[email protected]> Author: Jeff Knupp <[email protected]> Closes #17040 from jeffknupp/16790-core-on-large-csv and squashes the following commits: 6a1ba23 [Jeff Knupp] Clear up prose a5d5677 [Jeff Knupp] Fix linting issues 4380c53 [Jeff Knupp] Fix linting issues 7b1cd8d [Jeff Knupp] Fix linting issues e3cb9c1 [Jeff Knupp] Add unit test plus '--high-memory' option, *off by default*. 2ab4971 [Jeff Knupp] Remove debugging code 2930eaa [Jeff Knupp] Fix line length to conform to linter rules e4dfd19 [Jeff Knupp] Revert printf format strings; fix more comment alignment 3171674 [Jeff Knupp] Fix some leftover size_t references 0985cf3 [Jeff Knupp] Remove debugging code; fix type cast 669d99b [Jeff Knupp] Fix linting errors re: line length 1f24847 [Jeff Knupp] Fix comment alignment; add whatsnew entry e04d12a [Jeff Knupp] Switch to use int64_t rather than size_t due to portability concerns. d5c75e8 [Jeff Knupp] BUG: Use size_t to avoid array index overflow; add missing malloc of error_msg
1 parent ee6412a commit 8d7d3fb

File tree

7 files changed

+187
-139
lines changed

7 files changed

+187
-139
lines changed
 

‎doc/source/whatsnew/v0.21.0.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,8 @@ I/O
264264
^^^
265265

266266
- Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
267-
267+
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
268+
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
268269
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
269270

270271
- Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)

‎pandas/_libs/parsers.pyx

Lines changed: 79 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
121121
io_callback cb_io
122122
io_cleanup cb_cleanup
123123

124-
int chunksize # Number of bytes to prepare for each chunk
125-
char *data # pointer to data to be processed
126-
int datalen # amount of data available
127-
int datapos
124+
int64_t chunksize # Number of bytes to prepare for each chunk
125+
char *data # pointer to data to be processed
126+
int64_t datalen # amount of data available
127+
int64_t datapos
128128

129129
# where to write out tokenized data
130130
char *stream
131-
int stream_len
132-
int stream_cap
131+
int64_t stream_len
132+
int64_t stream_cap
133133

134134
# Store words in (potentially ragged) matrix for now, hmm
135135
char **words
136-
int *word_starts # where we are in the stream
137-
int words_len
138-
int words_cap
136+
int64_t *word_starts # where we are in the stream
137+
int64_t words_len
138+
int64_t words_cap
139139

140-
char *pword_start # pointer to stream start of current field
141-
int word_start # position start of current field
140+
char *pword_start # pointer to stream start of current field
141+
int64_t word_start # position start of current field
142142

143-
int *line_start # position in words for start of line
144-
int *line_fields # Number of fields in each line
145-
int lines # Number of lines observed
146-
int file_lines # Number of file lines observed (with bad/skipped)
147-
int lines_cap # Vector capacity
143+
int64_t *line_start # position in words for start of line
144+
int64_t *line_fields # Number of fields in each line
145+
int64_t lines # Number of lines observed
146+
int64_t file_lines # Number of lines observed (with bad/skipped)
147+
int64_t lines_cap # Vector capacity
148148

149149
# Tokenizing stuff
150150
ParserState state
@@ -177,14 +177,14 @@ cdef extern from "parser/tokenizer.h":
177177
# thousands separator (comma, period)
178178
char thousands
179179

180-
int header # Boolean: 1: has header, 0: no header
181-
int header_start # header row start
182-
int header_end # header row end
180+
int header # Boolean: 1: has header, 0: no header
181+
int64_t header_start # header row start
182+
int64_t header_end # header row end
183183

184184
void *skipset
185185
PyObject *skipfunc
186186
int64_t skip_first_N_rows
187-
int skipfooter
187+
int64_t skipfooter
188188
# pick one, depending on whether the converter requires GIL
189189
double (*double_converter_nogil)(const char *, char **,
190190
char, char, char, int) nogil
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
195195
char *warn_msg
196196
char *error_msg
197197

198-
int skip_empty_lines
198+
int64_t skip_empty_lines
199199

200200
ctypedef struct coliter_t:
201201
char **words
202-
int *line_start
203-
int col
202+
int64_t *line_start
203+
int64_t col
204204

205205
ctypedef struct uint_state:
206206
int seen_sint
@@ -210,7 +210,8 @@ cdef extern from "parser/tokenizer.h":
210210
void uint_state_init(uint_state *self)
211211
int uint64_conflict(uint_state *self)
212212

213-
void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
213+
void coliter_setup(coliter_t *it, parser_t *parser,
214+
int64_t i, int64_t start) nogil
214215
void COLITER_NEXT(coliter_t, const char *) nogil
215216

216217
parser_t* parser_new()
@@ -289,14 +290,14 @@ cdef class TextReader:
289290
object true_values, false_values
290291
object handle
291292
bint na_filter, verbose, has_usecols, has_mi_columns
292-
int parser_start
293+
int64_t parser_start
293294
list clocks
294295
char *c_encoding
295296
kh_str_t *false_set
296297
kh_str_t *true_set
297298

298299
cdef public:
299-
int leading_cols, table_width, skipfooter, buffer_lines
300+
int64_t leading_cols, table_width, skipfooter, buffer_lines
300301
object allow_leading_cols
301302
object delimiter, converters, delim_whitespace
302303
object na_values
@@ -730,7 +731,8 @@ cdef class TextReader:
730731
Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa
731732
char *word
732733
object name
733-
int status, hr, data_line
734+
int status
735+
int64_t hr, data_line
734736
char *errors = "strict"
735737
cdef StringPath path = _string_path(self.c_encoding)
736738

@@ -949,8 +951,8 @@ cdef class TextReader:
949951

950952
cdef _read_rows(self, rows, bint trim):
951953
cdef:
952-
int buffered_lines
953-
int irows, footer = 0
954+
int64_t buffered_lines
955+
int64_t irows, footer = 0
954956

955957
self._start_clock()
956958

@@ -1018,12 +1020,13 @@ cdef class TextReader:
10181020

10191021
def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
10201022
cdef:
1021-
Py_ssize_t i, nused
1023+
int64_t i
1024+
int nused
10221025
kh_str_t *na_hashset = NULL
1023-
int start, end
1026+
int64_t start, end
10241027
object name, na_flist, col_dtype = None
10251028
bint na_filter = 0
1026-
Py_ssize_t num_cols
1029+
int64_t num_cols
10271030

10281031
start = self.parser_start
10291032

@@ -1195,7 +1198,7 @@ cdef class TextReader:
11951198
return col_res, na_count
11961199

11971200
cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
1198-
int start, int end,
1201+
int64_t start, int64_t end,
11991202
bint na_filter,
12001203
bint user_dtype,
12011204
kh_str_t *na_hashset,
@@ -1275,7 +1278,7 @@ cdef class TextReader:
12751278
raise TypeError("the dtype %s is not "
12761279
"supported for parsing" % dtype)
12771280

1278-
cdef _string_convert(self, Py_ssize_t i, int start, int end,
1281+
cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
12791282
bint na_filter, kh_str_t *na_hashset):
12801283

12811284
cdef StringPath path = _string_path(self.c_encoding)
@@ -1336,6 +1339,7 @@ cdef class TextReader:
13361339
kh_destroy_str(table)
13371340

13381341
cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
1342+
cdef int64_t j
13391343
if self.has_usecols and self.names is not None:
13401344
if (not callable(self.usecols) and
13411345
len(self.names) == len(self.usecols)):
@@ -1427,8 +1431,8 @@ cdef inline StringPath _string_path(char *encoding):
14271431
# ----------------------------------------------------------------------
14281432
# Type conversions / inference support code
14291433

1430-
cdef _string_box_factorize(parser_t *parser, int col,
1431-
int line_start, int line_end,
1434+
cdef _string_box_factorize(parser_t *parser, int64_t col,
1435+
int64_t line_start, int64_t line_end,
14321436
bint na_filter, kh_str_t *na_hashset):
14331437
cdef:
14341438
int error, na_count = 0
@@ -1480,8 +1484,8 @@ cdef _string_box_factorize(parser_t *parser, int col,
14801484

14811485
return result, na_count
14821486

1483-
cdef _string_box_utf8(parser_t *parser, int col,
1484-
int line_start, int line_end,
1487+
cdef _string_box_utf8(parser_t *parser, int64_t col,
1488+
int64_t line_start, int64_t line_end,
14851489
bint na_filter, kh_str_t *na_hashset):
14861490
cdef:
14871491
int error, na_count = 0
@@ -1533,8 +1537,8 @@ cdef _string_box_utf8(parser_t *parser, int col,
15331537

15341538
return result, na_count
15351539

1536-
cdef _string_box_decode(parser_t *parser, int col,
1537-
int line_start, int line_end,
1540+
cdef _string_box_decode(parser_t *parser, int64_t col,
1541+
int64_t line_start, int64_t line_end,
15381542
bint na_filter, kh_str_t *na_hashset,
15391543
char *encoding):
15401544
cdef:
@@ -1592,8 +1596,8 @@ cdef _string_box_decode(parser_t *parser, int col,
15921596

15931597

15941598
@cython.boundscheck(False)
1595-
cdef _categorical_convert(parser_t *parser, int col,
1596-
int line_start, int line_end,
1599+
cdef _categorical_convert(parser_t *parser, int64_t col,
1600+
int64_t line_start, int64_t line_end,
15971601
bint na_filter, kh_str_t *na_hashset,
15981602
char *encoding):
15991603
"Convert column data into codes, categories"
@@ -1663,8 +1667,8 @@ cdef _categorical_convert(parser_t *parser, int col,
16631667
kh_destroy_str(table)
16641668
return np.asarray(codes), result, na_count
16651669

1666-
cdef _to_fw_string(parser_t *parser, int col, int line_start,
1667-
int line_end, size_t width):
1670+
cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
1671+
int64_t line_end, int64_t width):
16681672
cdef:
16691673
Py_ssize_t i
16701674
coliter_t it
@@ -1680,11 +1684,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
16801684

16811685
return result
16821686

1683-
cdef inline void _to_fw_string_nogil(parser_t *parser, int col,
1684-
int line_start, int line_end,
1687+
cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
1688+
int64_t line_start, int64_t line_end,
16851689
size_t width, char *data) nogil:
16861690
cdef:
1687-
Py_ssize_t i
1691+
int64_t i
16881692
coliter_t it
16891693
const char *word = NULL
16901694

@@ -1699,7 +1703,8 @@ cdef char* cinf = b'inf'
16991703
cdef char* cposinf = b'+inf'
17001704
cdef char* cneginf = b'-inf'
17011705

1702-
cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
1706+
cdef _try_double(parser_t *parser, int64_t col,
1707+
int64_t line_start, int64_t line_end,
17031708
bint na_filter, kh_str_t *na_hashset, object na_flist):
17041709
cdef:
17051710
int error, na_count = 0
@@ -1808,7 +1813,8 @@ cdef inline int _try_double_nogil(parser_t *parser,
18081813

18091814
return 0
18101815

1811-
cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
1816+
cdef _try_uint64(parser_t *parser, int64_t col,
1817+
int64_t line_start, int64_t line_end,
18121818
bint na_filter, kh_str_t *na_hashset):
18131819
cdef:
18141820
int error
@@ -1842,8 +1848,9 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
18421848

18431849
return result
18441850

1845-
cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
1846-
int line_end, bint na_filter,
1851+
cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
1852+
int64_t line_start,
1853+
int64_t line_end, bint na_filter,
18471854
const kh_str_t *na_hashset,
18481855
uint64_t *data, uint_state *state) nogil:
18491856
cdef:
@@ -1879,7 +1886,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
18791886

18801887
return 0
18811888

1882-
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
1889+
cdef _try_int64(parser_t *parser, int64_t col,
1890+
int64_t line_start, int64_t line_end,
18831891
bint na_filter, kh_str_t *na_hashset):
18841892
cdef:
18851893
int error, na_count = 0
@@ -1906,8 +1914,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
19061914

19071915
return result, na_count
19081916

1909-
cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
1910-
int line_end, bint na_filter,
1917+
cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
1918+
int64_t line_start,
1919+
int64_t line_end, bint na_filter,
19111920
const kh_str_t *na_hashset, int64_t NA,
19121921
int64_t *data, int *na_count) nogil:
19131922
cdef:
@@ -1944,7 +1953,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
19441953

19451954
return 0
19461955

1947-
cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
1956+
cdef _try_bool(parser_t *parser, int64_t col,
1957+
int64_t line_start, int64_t line_end,
19481958
bint na_filter, kh_str_t *na_hashset):
19491959
cdef:
19501960
int na_count
@@ -1966,8 +1976,9 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
19661976
return None, None
19671977
return result.view(np.bool_), na_count
19681978

1969-
cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
1970-
int line_end, bint na_filter,
1979+
cdef inline int _try_bool_nogil(parser_t *parser, int64_t col,
1980+
int64_t line_start,
1981+
int64_t line_end, bint na_filter,
19711982
const kh_str_t *na_hashset, uint8_t NA,
19721983
uint8_t *data, int *na_count) nogil:
19731984
cdef:
@@ -2006,7 +2017,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
20062017
data += 1
20072018
return 0
20082019

2009-
cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
2020+
cdef _try_bool_flex(parser_t *parser, int64_t col,
2021+
int64_t line_start, int64_t line_end,
20102022
bint na_filter, const kh_str_t *na_hashset,
20112023
const kh_str_t *true_hashset,
20122024
const kh_str_t *false_hashset):
@@ -2032,8 +2044,9 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
20322044
return None, None
20332045
return result.view(np.bool_), na_count
20342046

2035-
cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,
2036-
int line_end, bint na_filter,
2047+
cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
2048+
int64_t line_start,
2049+
int64_t line_end, bint na_filter,
20372050
const kh_str_t *na_hashset,
20382051
const kh_str_t *true_hashset,
20392052
const kh_str_t *false_hashset,
@@ -2251,8 +2264,8 @@ for k in list(na_values):
22512264
na_values[np.dtype(k)] = na_values[k]
22522265

22532266

2254-
cdef _apply_converter(object f, parser_t *parser, int col,
2255-
int line_start, int line_end,
2267+
cdef _apply_converter(object f, parser_t *parser, int64_t col,
2268+
int64_t line_start, int64_t line_end,
22562269
char* c_encoding):
22572270
cdef:
22582271
int error
@@ -2296,7 +2309,7 @@ def _to_structured_array(dict columns, object names, object usecols):
22962309

22972310
object name, fnames, field_type
22982311
Py_ssize_t i, offset, nfields, length
2299-
int stride, elsize
2312+
int64_t stride, elsize
23002313
char *buf
23012314

23022315
if names is None:
@@ -2344,10 +2357,10 @@ def _to_structured_array(dict columns, object names, object usecols):
23442357

23452358
return recs
23462359

2347-
cdef _fill_structured_column(char *dst, char* src, int elsize,
2348-
int stride, int length, bint incref):
2360+
cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
2361+
int64_t stride, int64_t length, bint incref):
23492362
cdef:
2350-
Py_ssize_t i
2363+
int64_t i
23512364

23522365
if incref:
23532366
util.transfer_object_column(dst, src, stride, length)

‎pandas/_libs/src/parser/tokenizer.c

Lines changed: 61 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,9 @@ static void free_if_not_null(void **ptr) {
6969
7070
*/
7171

72-
static void *grow_buffer(void *buffer, int length, int *capacity, int space,
73-
int elsize, int *error) {
74-
int cap = *capacity;
72+
static void *grow_buffer(void *buffer, size_t length, size_t *capacity,
73+
size_t space, size_t elsize, int *error) {
74+
size_t cap = *capacity;
7575
void *newbuffer = buffer;
7676

7777
// Can we fit potentially nbytes tokens (+ null terminators) in the stream?
@@ -169,7 +169,7 @@ int parser_cleanup(parser_t *self) {
169169
}
170170

171171
int parser_init(parser_t *self) {
172-
int sz;
172+
size_t sz;
173173

174174
/*
175175
Initialize data buffers
@@ -196,14 +196,14 @@ int parser_init(parser_t *self) {
196196
sz = STREAM_INIT_SIZE / 10;
197197
sz = sz ? sz : 1;
198198
self->words = (char **)malloc(sz * sizeof(char *));
199-
self->word_starts = (int *)malloc(sz * sizeof(int));
199+
self->word_starts = (size_t *)malloc(sz * sizeof(size_t));
200200
self->words_cap = sz;
201201
self->words_len = 0;
202202

203203
// line pointers and metadata
204-
self->line_start = (int *)malloc(sz * sizeof(int));
204+
self->line_start = (size_t *)malloc(sz * sizeof(size_t));
205205

206-
self->line_fields = (int *)malloc(sz * sizeof(int));
206+
self->line_fields = (size_t *)malloc(sz * sizeof(size_t));
207207

208208
self->lines_cap = sz;
209209
self->lines = 0;
@@ -247,7 +247,8 @@ void parser_del(parser_t *self) {
247247
}
248248

249249
static int make_stream_space(parser_t *self, size_t nbytes) {
250-
int i, status, cap;
250+
size_t i, cap;
251+
int status;
251252
void *orig_ptr, *newptr;
252253

253254
// Can we fit potentially nbytes tokens (+ null terminators) in the stream?
@@ -304,11 +305,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
304305
"self->words_cap=%d\n",
305306
nbytes, self->words_cap))
306307
newptr = safe_realloc((void *)self->word_starts,
307-
sizeof(int) * self->words_cap);
308+
sizeof(int64_t) * self->words_cap);
308309
if (newptr == NULL) {
309310
return PARSER_OUT_OF_MEMORY;
310311
} else {
311-
self->word_starts = (int *)newptr;
312+
self->word_starts = (int64_t *)newptr;
312313
}
313314
}
314315

@@ -317,8 +318,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
317318
*/
318319
cap = self->lines_cap;
319320
self->line_start =
320-
(int *)grow_buffer((void *)self->line_start, self->lines + 1,
321-
&self->lines_cap, nbytes, sizeof(int), &status);
321+
(int64_t *)grow_buffer((void *)self->line_start, self->lines + 1,
322+
&self->lines_cap, nbytes, sizeof(int64_t), &status);
322323
TRACE((
323324
"make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n",
324325
self->lines + 1, self->lines_cap, nbytes, status))
@@ -331,11 +332,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
331332
TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n",
332333
nbytes))
333334
newptr = safe_realloc((void *)self->line_fields,
334-
sizeof(int) * self->lines_cap);
335+
sizeof(int64_t) * self->lines_cap);
335336
if (newptr == NULL) {
336337
return PARSER_OUT_OF_MEMORY;
337338
} else {
338-
self->line_fields = (int *)newptr;
339+
self->line_fields = (int64_t *)newptr;
339340
}
340341
}
341342

@@ -350,7 +351,7 @@ static int push_char(parser_t *self, char c) {
350351
("push_char: ERROR!!! self->stream_len(%d) >= "
351352
"self->stream_cap(%d)\n",
352353
self->stream_len, self->stream_cap))
353-
int bufsize = 100;
354+
size_t bufsize = 100;
354355
self->error_msg = (char *)malloc(bufsize);
355356
snprintf(self->error_msg, bufsize,
356357
"Buffer overflow caught - possible malformed input file.\n");
@@ -367,7 +368,7 @@ int P_INLINE end_field(parser_t *self) {
367368
("end_field: ERROR!!! self->words_len(%zu) >= "
368369
"self->words_cap(%zu)\n",
369370
self->words_len, self->words_cap))
370-
int bufsize = 100;
371+
size_t bufsize = 100;
371372
self->error_msg = (char *)malloc(bufsize);
372373
snprintf(self->error_msg, bufsize,
373374
"Buffer overflow caught - possible malformed input file.\n");
@@ -399,8 +400,8 @@ int P_INLINE end_field(parser_t *self) {
399400
}
400401

401402
static void append_warning(parser_t *self, const char *msg) {
402-
int ex_length;
403-
int length = strlen(msg);
403+
size_t ex_length;
404+
size_t length = strlen(msg);
404405
void *newptr;
405406

406407
if (self->warn_msg == NULL) {
@@ -420,19 +421,21 @@ static int end_line(parser_t *self) {
420421
char *msg;
421422
int fields;
422423
int ex_fields = self->expected_fields;
423-
int bufsize = 100; // for error or warning messages
424+
size_t bufsize = 100; // for error or warning messages
424425

425426
fields = self->line_fields[self->lines];
426427

427428
TRACE(("end_line: Line end, nfields: %d\n", fields));
428429

430+
TRACE(("end_line: lines: %d\n", self->lines));
429431
if (self->lines > 0) {
430432
if (self->expected_fields >= 0) {
431433
ex_fields = self->expected_fields;
432434
} else {
433435
ex_fields = self->line_fields[self->lines - 1];
434436
}
435437
}
438+
TRACE(("end_line: ex_fields: %d\n", ex_fields));
436439

437440
if (self->state == START_FIELD_IN_SKIP_LINE ||
438441
self->state == IN_FIELD_IN_SKIP_LINE ||
@@ -450,7 +453,7 @@ static int end_line(parser_t *self) {
450453
return 0;
451454
}
452455

453-
if (!(self->lines <= self->header_end + 1) &&
456+
if (!(self->lines <= (int64_t) self->header_end + 1) &&
454457
(self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
455458
// increment file line count
456459
self->file_lines++;
@@ -485,10 +488,13 @@ static int end_line(parser_t *self) {
485488
}
486489
} else {
487490
// missing trailing delimiters
488-
if ((self->lines >= self->header_end + 1) && fields < ex_fields) {
491+
if ((self->lines >= (int64_t) self->header_end + 1) &&
492+
fields < ex_fields) {
489493
// might overrun the buffer when closing fields
490494
if (make_stream_space(self, ex_fields - fields) < 0) {
491-
self->error_msg = "out of memory";
495+
size_t bufsize = 100;
496+
self->error_msg = (char *)malloc(bufsize);
497+
snprintf(self->error_msg, bufsize, "out of memory");
492498
return -1;
493499
}
494500

@@ -507,7 +513,7 @@ static int end_line(parser_t *self) {
507513
TRACE((
508514
"end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n",
509515
self->lines, self->lines_cap))
510-
int bufsize = 100;
516+
size_t bufsize = 100;
511517
self->error_msg = (char *)malloc(bufsize);
512518
snprintf(self->error_msg, bufsize,
513519
"Buffer overflow caught - "
@@ -568,7 +574,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
568574
self->datalen = bytes_read;
569575

570576
if (status != REACHED_EOF && self->data == NULL) {
571-
int bufsize = 200;
577+
size_t bufsize = 200;
572578
self->error_msg = (char *)malloc(bufsize);
573579

574580
if (status == CALLING_READ_FAILED) {
@@ -599,7 +605,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
599605
if (slen >= self->stream_cap) { \
600606
TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
601607
self->stream_cap)) \
602-
int bufsize = 100; \
608+
size_t bufsize = 100; \
603609
self->error_msg = (char *)malloc(bufsize); \
604610
snprintf(self->error_msg, bufsize, \
605611
"Buffer overflow caught - possible malformed input file.\n");\
@@ -626,7 +632,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
626632
stream = self->stream + self->stream_len; \
627633
slen = self->stream_len; \
628634
self->state = STATE; \
629-
if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \
635+
if (line_limit > 0 && self->lines == start_lines + (size_t)line_limit) { \
630636
goto linelimit; \
631637
}
632638

@@ -641,7 +647,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
641647
stream = self->stream + self->stream_len; \
642648
slen = self->stream_len; \
643649
self->state = STATE; \
644-
if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \
650+
if (line_limit > 0 && self->lines == start_lines + (size_t)line_limit) { \
645651
goto linelimit; \
646652
}
647653

@@ -712,15 +718,17 @@ int skip_this_line(parser_t *self, int64_t rownum) {
712718
}
713719
}
714720

715-
int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
716-
int i, slen;
721+
int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
722+
int64_t i, slen;
717723
int should_skip;
718724
char c;
719725
char *stream;
720726
char *buf = self->data + self->datapos;
721727

722728
if (make_stream_space(self, self->datalen - self->datapos) < 0) {
723-
self->error_msg = "out of memory";
729+
size_t bufsize = 100;
730+
self->error_msg = (char *)malloc(bufsize);
731+
snprintf(self->error_msg, bufsize, "out of memory");
724732
return -1;
725733
}
726734

@@ -1025,7 +1033,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
10251033
PUSH_CHAR(c);
10261034
self->state = IN_FIELD;
10271035
} else {
1028-
int bufsize = 100;
1036+
size_t bufsize = 100;
10291037
self->error_msg = (char *)malloc(bufsize);
10301038
snprintf(self->error_msg, bufsize,
10311039
"delimiter expected after quote in quote");
@@ -1079,7 +1087,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
10791087
--i;
10801088
buf--; // let's try this character again (HACK!)
10811089
if (line_limit > 0 &&
1082-
self->lines == start_lines + (int)line_limit) {
1090+
self->lines == start_lines + line_limit) {
10831091
goto linelimit;
10841092
}
10851093
}
@@ -1121,7 +1129,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
11211129
}
11221130

11231131
static int parser_handle_eof(parser_t *self) {
1124-
int bufsize = 100;
1132+
size_t bufsize = 100;
11251133

11261134
TRACE(
11271135
("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
@@ -1165,9 +1173,9 @@ static int parser_handle_eof(parser_t *self) {
11651173
}
11661174

11671175
int parser_consume_rows(parser_t *self, size_t nrows) {
1168-
int i, offset, word_deletions, char_count;
1176+
size_t i, offset, word_deletions, char_count;
11691177

1170-
if ((int)nrows > self->lines) {
1178+
if (nrows > self->lines) {
11711179
nrows = self->lines;
11721180
}
11731181

@@ -1204,7 +1212,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
12041212
self->word_start -= char_count;
12051213

12061214
/* move line metadata */
1207-
for (i = 0; i < self->lines - (int)nrows + 1; ++i) {
1215+
for (i = 0; i < self->lines - nrows + 1; ++i) {
12081216
offset = i + nrows;
12091217
self->line_start[i] = self->line_start[offset] - word_deletions;
12101218
self->line_fields[i] = self->line_fields[offset];
@@ -1227,23 +1235,24 @@ int parser_trim_buffers(parser_t *self) {
12271235
size_t new_cap;
12281236
void *newptr;
12291237

1230-
int i;
1238+
int64_t i;
12311239

12321240
/* trim words, word_starts */
12331241
new_cap = _next_pow2(self->words_len) + 1;
1234-
if ((int)new_cap < self->words_cap) {
1242+
if (new_cap < self->words_cap) {
12351243
TRACE(("parser_trim_buffers: new_cap < self->words_cap\n"));
12361244
newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *));
12371245
if (newptr == NULL) {
12381246
return PARSER_OUT_OF_MEMORY;
12391247
} else {
12401248
self->words = (char **)newptr;
12411249
}
1242-
newptr = safe_realloc((void *)self->word_starts, new_cap * sizeof(int));
1250+
newptr = safe_realloc((void *)self->word_starts,
1251+
new_cap * sizeof(int64_t));
12431252
if (newptr == NULL) {
12441253
return PARSER_OUT_OF_MEMORY;
12451254
} else {
1246-
self->word_starts = (int *)newptr;
1255+
self->word_starts = (int64_t *)newptr;
12471256
self->words_cap = new_cap;
12481257
}
12491258
}
@@ -1254,7 +1263,7 @@ int parser_trim_buffers(parser_t *self) {
12541263
("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
12551264
"%zu\n",
12561265
new_cap, self->stream_cap, self->lines_cap));
1257-
if ((int)new_cap < self->stream_cap) {
1266+
if (new_cap < self->stream_cap) {
12581267
TRACE(
12591268
("parser_trim_buffers: new_cap < self->stream_cap, calling "
12601269
"safe_realloc\n"));
@@ -1282,19 +1291,21 @@ int parser_trim_buffers(parser_t *self) {
12821291

12831292
/* trim line_start, line_fields */
12841293
new_cap = _next_pow2(self->lines) + 1;
1285-
if ((int)new_cap < self->lines_cap) {
1294+
if (new_cap < self->lines_cap) {
12861295
TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
1287-
newptr = safe_realloc((void *)self->line_start, new_cap * sizeof(int));
1296+
newptr = safe_realloc((void *)self->line_start,
1297+
new_cap * sizeof(int64_t));
12881298
if (newptr == NULL) {
12891299
return PARSER_OUT_OF_MEMORY;
12901300
} else {
1291-
self->line_start = (int *)newptr;
1301+
self->line_start = (int64_t *)newptr;
12921302
}
1293-
newptr = safe_realloc((void *)self->line_fields, new_cap * sizeof(int));
1303+
newptr = safe_realloc((void *)self->line_fields,
1304+
new_cap * sizeof(int64_t));
12941305
if (newptr == NULL) {
12951306
return PARSER_OUT_OF_MEMORY;
12961307
} else {
1297-
self->line_fields = (int *)newptr;
1308+
self->line_fields = (int64_t *)newptr;
12981309
self->lines_cap = new_cap;
12991310
}
13001311
}
@@ -1303,7 +1314,7 @@ int parser_trim_buffers(parser_t *self) {
13031314
}
13041315

13051316
void debug_print_parser(parser_t *self) {
1306-
int j, line;
1317+
int64_t j, line;
13071318
char *token;
13081319

13091320
for (line = 0; line < self->lines; ++line) {
@@ -1324,18 +1335,18 @@ void debug_print_parser(parser_t *self) {
13241335

13251336
int _tokenize_helper(parser_t *self, size_t nrows, int all) {
13261337
int status = 0;
1327-
int start_lines = self->lines;
1338+
int64_t start_lines = self->lines;
13281339

13291340
if (self->state == FINISHED) {
13301341
return 0;
13311342
}
13321343

13331344
TRACE((
13341345
"_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n",
1335-
(int)nrows, self->datapos, self->datalen));
1346+
nrows, self->datapos, self->datalen));
13361347

13371348
while (1) {
1338-
if (!all && self->lines - start_lines >= (int)nrows) break;
1349+
if (!all && self->lines - start_lines >= nrows) break;
13391350

13401351
if (self->datapos == self->datalen) {
13411352
status = parser_buffer_bytes(self, self->chunksize);

‎pandas/_libs/src/parser/tokenizer.h

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -137,30 +137,30 @@ typedef struct parser_t {
137137
io_callback cb_io;
138138
io_cleanup cb_cleanup;
139139

140-
int chunksize; // Number of bytes to prepare for each chunk
141-
char *data; // pointer to data to be processed
142-
int datalen; // amount of data available
143-
int datapos;
140+
int64_t chunksize; // Number of bytes to prepare for each chunk
141+
char *data; // pointer to data to be processed
142+
int64_t datalen; // amount of data available
143+
int64_t datapos;
144144

145145
// where to write out tokenized data
146146
char *stream;
147-
int stream_len;
148-
int stream_cap;
147+
int64_t stream_len;
148+
int64_t stream_cap;
149149

150150
// Store words in (potentially ragged) matrix for now, hmm
151151
char **words;
152-
int *word_starts; // where we are in the stream
153-
int words_len;
154-
int words_cap;
152+
int64_t *word_starts; // where we are in the stream
153+
int64_t words_len;
154+
int64_t words_cap;
155155

156-
char *pword_start; // pointer to stream start of current field
157-
int word_start; // position start of current field
156+
char *pword_start; // pointer to stream start of current field
157+
int64_t word_start; // position start of current field
158158

159-
int *line_start; // position in words for start of line
160-
int *line_fields; // Number of fields in each line
161-
int lines; // Number of (good) lines observed
162-
int file_lines; // Number of file lines observed (including bad or skipped)
163-
int lines_cap; // Vector capacity
159+
int64_t *line_start; // position in words for start of line
160+
int64_t *line_fields; // Number of fields in each line
161+
int64_t lines; // Number of (good) lines observed
162+
int64_t file_lines; // Number of lines (including bad or skipped)
163+
int64_t lines_cap; // Vector capacity
164164

165165
// Tokenizing stuff
166166
ParserState state;
@@ -193,9 +193,9 @@ typedef struct parser_t {
193193
// thousands separator (comma, period)
194194
char thousands;
195195

196-
int header; // Boolean: 1: has header, 0: no header
197-
int header_start; // header row start
198-
int header_end; // header row end
196+
int header; // Boolean: 1: has header, 0: no header
197+
int64_t header_start; // header row start
198+
int64_t header_end; // header row end
199199

200200
void *skipset;
201201
PyObject *skipfunc;
@@ -216,7 +216,7 @@ typedef struct parser_t {
216216

217217
typedef struct coliter_t {
218218
char **words;
219-
int *line_start;
219+
int64_t *line_start;
220220
int col;
221221
} coliter_t;
222222

@@ -225,7 +225,7 @@ coliter_t *coliter_new(parser_t *self, int i);
225225

226226
#define COLITER_NEXT(iter, word) \
227227
do { \
228-
const int i = *iter.line_start++ + iter.col; \
228+
const int64_t i = *iter.line_start++ + iter.col; \
229229
word = i < *iter.line_start ? iter.words[i] : ""; \
230230
} while (0)
231231

‎pandas/conftest.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ def pytest_addoption(parser):
99
parser.addoption("--skip-slow", action="store_true",
1010
help="skip slow tests")
1111
parser.addoption("--skip-network", action="store_true",
12-
help="run network tests")
12+
help="skip network tests")
13+
parser.addoption("--run-highmemory", action="store_true",
14+
help="run high memory tests")
1315
parser.addoption("--only-slow", action="store_true",
1416
help="run only slow tests")
1517

@@ -24,6 +26,11 @@ def pytest_runtest_setup(item):
2426
if 'network' in item.keywords and item.config.getoption("--skip-network"):
2527
pytest.skip("skipping due to --skip-network")
2628

29+
if 'high_memory' in item.keywords and not item.config.getoption(
30+
"--run-highmemory"):
31+
pytest.skip(
32+
"skipping high memory test since --run-highmemory was not set")
33+
2734

2835
# Configurations for all tests and all test modules
2936

‎pandas/tests/io/parser/test_parsers.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# -*- coding: utf-8 -*-
22

33
import os
4+
from io import StringIO
5+
6+
import pytest
47

58
import pandas.util.testing as tm
69

@@ -25,6 +28,18 @@
2528
from .dtypes import DtypeTests
2629

2730

31+
@pytest.mark.high_memory
32+
def test_bytes_exceed_2gb():
33+
"""Read from a "CSV" that has a column larger than 2GB.
34+
35+
GH 16798
36+
"""
37+
csv = StringIO('strings\n' + '\n'.join(
38+
['x' * (1 << 20) for _ in range(2100)]))
39+
df = read_csv(csv, low_memory=False)
40+
assert not df.empty
41+
42+
2843
class BaseParser(CommentTests, CompressionTests,
2944
ConverterTests, DialectTests,
3045
HeaderTests, IndexColTests,

‎setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ markers =
2727
single: mark a test as single cpu only
2828
slow: mark a test as slow
2929
network: mark a test as network
30+
high_memory: mark a test as a high-memory test

0 commit comments

Comments
 (0)
Please sign in to comment.