From b2c4211b10c3da552bb832eb56ede0cc0356d575 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 25 Nov 2016 05:02:08 -0500 Subject: [PATCH] MAINT: Cleanup pandas/src/parser Remove dead code and reformat for style using Google's C++ style guide. Also adds Google's cpplint (fork) to the style checking for Travis. --- ci/lint.sh | 12 + pandas/src/parser/io.c | 118 +-- pandas/src/parser/io.h | 47 +- pandas/src/parser/tokenizer.c | 1831 ++++++++++++++------------------- pandas/src/parser/tokenizer.h | 166 ++- 5 files changed, 921 insertions(+), 1253 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index d6390a16b763e..7ab97bfc6d328 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -35,6 +35,18 @@ if [ "$LINT" ]; then done echo "Linting *.pxi.in DONE" + # readability/casting: Warnings about C casting instead of C++ casting + # runtime/int: Warnings about using C number types instead of C++ ones + # build/include_subdir: Warnings about prefacing included header files with directory + pip install cpplint + + echo "Linting *.c and *.h" + cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/parser + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting *.c and *.h DONE" + echo "Check for invalid testing" grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas if [ $? = "0" ]; then diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c index 566de72804968..562d6033ce3eb 100644 --- a/pandas/src/parser/io.c +++ b/pandas/src/parser/io.c @@ -1,12 +1,20 @@ -#include "io.h" +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ - /* - On-disk FILE, uncompressed - */ +#include "io.h" +/* + On-disk FILE, uncompressed +*/ void *new_file_source(char *fname, size_t buffer_size) { - file_source *fs = (file_source *) malloc(sizeof(file_source)); + file_source *fs = (file_source *)malloc(sizeof(file_source)); fs->fp = fopen(fname, "rb"); if (fs->fp == NULL) { @@ -18,7 +26,7 @@ void *new_file_source(char *fname, size_t buffer_size) { fs->initial_file_pos = ftell(fs->fp); // Only allocate this heap memory if we are not memory-mapping the file - fs->buffer = (char*) malloc((buffer_size + 1) * sizeof(char)); + fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { return NULL; @@ -27,25 +35,11 @@ void *new_file_source(char *fname, size_t buffer_size) { memset(fs->buffer, 0, buffer_size + 1); fs->buffer[buffer_size] = '\0'; - return (void *) fs; + return (void *)fs; } - -// XXX handle on systems without the capability - - -/* - * void *new_file_buffer(FILE *f, int buffer_size) - * - * Allocate a new file_buffer. - * Returns NULL if the memory allocation fails or if the call to mmap fails. - * - * buffer_size is ignored. - */ - - -void* new_rd_source(PyObject *obj) { - rd_source *rds = (rd_source *) malloc(sizeof(rd_source)); +void *new_rd_source(PyObject *obj) { + rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); /* hold on to this object */ Py_INCREF(obj); @@ -53,7 +47,7 @@ void* new_rd_source(PyObject *obj) { rds->buffer = NULL; rds->position = 0; - return (void*) rds; + return (void *)rds; } /* @@ -63,9 +57,7 @@ void* new_rd_source(PyObject *obj) { */ int del_file_source(void *fs) { - // fseek(FS(fs)->fp, FS(fs)->initial_file_pos, SEEK_SET); - if (fs == NULL) - return 0; + if (fs == NULL) return 0; /* allocated on the heap */ free(FS(fs)->buffer); @@ -89,13 +81,11 @@ int del_rd_source(void *rds) { */ - -void* buffer_file_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { +void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { file_source *src = FS(source); - *bytes_read = fread((void*) src->buffer, sizeof(char), nbytes, - src->fp); + *bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp); if (*bytes_read == 0) { *status = REACHED_EOF; @@ -103,13 +93,11 @@ void* buffer_file_bytes(void *source, size_t nbytes, *status = 0; } - return (void*) src->buffer; - + return (void *)src->buffer; } - -void* buffer_rd_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { +void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { PyGILState_STATE state; PyObject *result, *func, *args, *tmp; @@ -125,21 +113,18 @@ void* buffer_rd_bytes(void *source, size_t nbytes, args = Py_BuildValue("(i)", nbytes); func = PyObject_GetAttrString(src->obj, "read"); - /* printf("%s\n", PyBytes_AsString(PyObject_Repr(func))); */ /* TODO: does this release the GIL? */ result = PyObject_CallObject(func, args); Py_XDECREF(args); Py_XDECREF(func); - /* PyObject_Print(PyObject_Type(result), stdout, 0); */ if (result == NULL) { PyGILState_Release(state); *bytes_read = 0; *status = CALLING_READ_FAILED; return NULL; - } - else if (!PyBytes_Check(result)) { + } else if (!PyBytes_Check(result)) { tmp = PyUnicode_AsUTF8String(result); Py_XDECREF(result); result = tmp; @@ -154,8 +139,7 @@ void* buffer_rd_bytes(void *source, size_t nbytes, /* hang on to the Python object */ src->buffer = result; - retval = (void*) PyBytes_AsString(result); - + retval = (void *)PyBytes_AsString(result); PyGILState_Release(state); @@ -165,21 +149,18 @@ void* buffer_rd_bytes(void *source, size_t nbytes, return retval; } - #ifdef HAVE_MMAP -#include #include +#include -void *new_mmap(char *fname) -{ +void *new_mmap(char *fname) { struct stat buf; int fd; memory_map *mm; - /* off_t position; */ off_t filesize; - mm = (memory_map *) malloc(sizeof(memory_map)); + mm = (memory_map *)malloc(sizeof(memory_map)); mm->fp = fopen(fname, "rb"); fd = fileno(mm->fp); @@ -187,20 +168,19 @@ void *new_mmap(char *fname) fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); return NULL; } - filesize = buf.st_size; /* XXX This might be 32 bits. */ - + filesize = buf.st_size; /* XXX This might be 32 bits. */ if (mm == NULL) { /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: malloc() failed.\n"); return NULL; } - mm->size = (off_t) filesize; + mm->size = (off_t)filesize; mm->line_number = 0; mm->fileno = fd; mm->position = ftell(mm->fp); - mm->last_pos = (off_t) filesize; + mm->last_pos = (off_t)filesize; mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); if (mm->memmap == NULL) { @@ -210,30 +190,20 @@ void *new_mmap(char *fname) mm = NULL; } - return (void*) mm; + return (void *)mm; } - -int del_mmap(void *src) -{ +int del_mmap(void *src) { munmap(MM(src)->memmap, MM(src)->size); fclose(MM(src)->fp); - - /* - * With a memory mapped file, there is no need to do - * anything if restore == RESTORE_INITIAL. - */ - /* if (restore == RESTORE_FINAL) { */ - /* fseek(FB(fb)->file, FB(fb)->current_pos, SEEK_SET); */ - /* } */ free(src); return 0; } -void* buffer_mmap_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { +void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { void *retval; memory_map *src = MM(source); @@ -264,19 +234,15 @@ void* buffer_mmap_bytes(void *source, size_t nbytes, /* kludgy */ -void *new_mmap(char *fname) { - return NULL; -} +void *new_mmap(char *fname) { return NULL; } -int del_mmap(void *src) { - return 0; -} +int del_mmap(void *src) { return 0; } /* don't use this! */ -void* buffer_mmap_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { - return NULL; +void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { + return NULL; } #endif diff --git a/pandas/src/parser/io.h b/pandas/src/parser/io.h index 2ae72ff8a7fe0..5a0c2b2b5e4a4 100644 --- a/pandas/src/parser/io.h +++ b/pandas/src/parser/io.h @@ -1,14 +1,23 @@ +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#ifndef PANDAS_SRC_PARSER_IO_H_ +#define PANDAS_SRC_PARSER_IO_H_ + #include "Python.h" #include "tokenizer.h" - typedef struct _file_source { /* The file being read. */ FILE *fp; char *buffer; - /* Size of the file, in bytes. */ - /* off_t size; */ /* file position when the file_buffer was created. */ off_t initial_file_pos; @@ -16,15 +25,9 @@ typedef struct _file_source { /* Offset in the file of the data currently in the buffer. */ off_t buffer_file_pos; - /* Actual number of bytes in the current buffer. (Can be less than buffer_size.) */ + /* Actual number of bytes in the current buffer. (Can be less than + * buffer_size.) */ off_t last_pos; - - /* Size (in bytes) of the buffer. */ - // off_t buffer_size; - - /* Pointer to the buffer. */ - // char *buffer; - } file_source; #define FS(source) ((file_source *)source) @@ -34,7 +37,6 @@ typedef struct _file_source { #endif typedef struct _memory_map { - FILE *fp; /* Size of the file, in bytes. */ @@ -49,22 +51,20 @@ typedef struct _memory_map { off_t position; off_t last_pos; char *memmap; - } memory_map; -#define MM(src) ((memory_map*) src) +#define MM(src) ((memory_map *)src) void *new_mmap(char *fname); int del_mmap(void *src); -void* buffer_mmap_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status); - +void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status); typedef struct _rd_source { - PyObject* obj; - PyObject* buffer; + PyObject *obj; + PyObject *buffer; size_t position; } rd_source; @@ -77,9 +77,10 @@ void *new_rd_source(PyObject *obj); int del_file_source(void *src); int del_rd_source(void *src); -void* buffer_file_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status); +void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status); -void* buffer_rd_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status); +void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status); +#endif // PANDAS_SRC_PARSER_IO_H_ diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 450abcf6c325c..1ea62d66345bd 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -9,61 +9,33 @@ See LICENSE for the license */ - /* - Low-level ascii-file processing for pandas. Combines some elements from - Python's built-in csv module and Warren Weckesser's textreader project on - GitHub. See Python Software Foundation License and BSD licenses for these. +/* - */ +Low-level ascii-file processing for pandas. Combines some elements from +Python's built-in csv module and Warren Weckesser's textreader project on +GitHub. See Python Software Foundation License and BSD licenses for these. +*/ #include "tokenizer.h" #include -#include #include - - -//#define READ_ERROR_OUT_OF_MEMORY 1 - - -/* -* restore: -* RESTORE_NOT (0): -* Free memory, but leave the file position wherever it -* happend to be. -* RESTORE_INITIAL (1): -* Restore the file position to the location at which -* the file_buffer was created. -* RESTORE_FINAL (2): -* Put the file position at the next byte after the -* data read from the file_buffer. -* -#define RESTORE_NOT 0 -#define RESTORE_INITIAL 1 -#define RESTORE_FINAL 2 -*/ +#include static void *safe_realloc(void *buffer, size_t size) { void *result; - // OS X is weird + // OSX is weird. // http://stackoverflow.com/questions/9560609/ // different-realloc-behaviour-in-linux-and-osx result = realloc(buffer, size); - TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, result)) + TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, + result)) -/* if (result != NULL) { - // errno gets set to 12 on my OS Xmachine in some cases even when the - // realloc succeeds. annoying - errno = 0; - } else { - return buffer; - }*/ return result; } - void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { // column i, starting at 0 self->words = parser->words; @@ -73,7 +45,7 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { coliter_t *coliter_new(parser_t *self, int i) { // column i, starting at 0 - coliter_t *iter = (coliter_t*) malloc(sizeof(coliter_t)); + coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t)); if (NULL == iter) { return NULL; @@ -83,36 +55,28 @@ coliter_t *coliter_new(parser_t *self, int i) { return iter; } - - /* int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error); */ - /* uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); */ - - -static void free_if_not_null(void **ptr) { +static void free_if_not_null(void **ptr) { TRACE(("free_if_not_null %p\n", *ptr)) if (*ptr != NULL) { free(*ptr); *ptr = NULL; } - } - - - - /* +} - Parser / tokenizer +/* - */ + Parser / tokenizer +*/ -static void *grow_buffer(void *buffer, int length, int *capacity, - int space, int elsize, int *error) { +static void *grow_buffer(void *buffer, int length, int *capacity, int space, + int elsize, int *error) { int cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - while ( (length + space >= cap) && (newbuffer != NULL) ){ - cap = cap? cap << 1 : 2; + while ((length + space >= cap) && (newbuffer != NULL)) { + cap = cap ? cap << 1 : 2; buffer = newbuffer; newbuffer = safe_realloc(newbuffer, elsize * cap); } @@ -122,15 +86,14 @@ static void *grow_buffer(void *buffer, int length, int *capacity, // and return the last good realloc'd buffer so it can be freed *error = errno; newbuffer = buffer; - } else { + } else { // realloc worked, update *capacity and set *error to 0 // sigh, multiple return values *capacity = cap; *error = 0; } return newbuffer; - } - +} void parser_set_default_options(parser_t *self) { self->decimal = '.'; @@ -139,7 +102,7 @@ void parser_set_default_options(parser_t *self) { // For tokenization self->state = START_RECORD; - self->delimiter = ','; // XXX + self->delimiter = ','; // XXX self->delim_whitespace = 0; self->doublequote = 0; @@ -161,17 +124,13 @@ void parser_set_default_options(parser_t *self) { self->thousands = '\0'; self->skipset = NULL; - self-> skip_first_N_rows = -1; + self->skip_first_N_rows = -1; self->skip_footer = 0; } -int get_parser_memory_footprint(parser_t *self) { - return 0; -} +int get_parser_memory_footprint(parser_t *self) { return 0; } -parser_t* parser_new() { - return (parser_t*) calloc(1, sizeof(parser_t)); -} +parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); } int parser_clear_data_buffers(parser_t *self) { free_if_not_null((void *)&self->stream); @@ -183,14 +142,14 @@ int parser_clear_data_buffers(parser_t *self) { } int parser_cleanup(parser_t *self) { - int status = 0; + int status = 0; // XXX where to put this - free_if_not_null((void *) &self->error_msg); - free_if_not_null((void *) &self->warn_msg); + free_if_not_null((void *)&self->error_msg); + free_if_not_null((void *)&self->warn_msg); if (self->skipset != NULL) { - kh_destroy_int64((kh_int64_t*) self->skipset); + kh_destroy_int64((kh_int64_t *)self->skipset); self->skipset = NULL; } @@ -207,8 +166,6 @@ int parser_cleanup(parser_t *self) { return status; } - - int parser_init(parser_t *self) { int sz; @@ -225,7 +182,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = (char*) malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = (char *)malloc(STREAM_INIT_SIZE * sizeof(char)); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -235,16 +192,16 @@ int parser_init(parser_t *self) { // word pointers and metadata sz = STREAM_INIT_SIZE / 10; - sz = sz? sz : 1; - self->words = (char**) malloc(sz * sizeof(char*)); - self->word_starts = (int*) malloc(sz * sizeof(int)); + sz = sz ? sz : 1; + self->words = (char **)malloc(sz * sizeof(char *)); + self->word_starts = (int *)malloc(sz * sizeof(int)); self->words_cap = sz; self->words_len = 0; // line pointers and metadata - self->line_start = (int*) malloc(sz * sizeof(int)); + self->line_start = (int *)malloc(sz * sizeof(int)); - self->line_fields = (int*) malloc(sz * sizeof(int)); + self->line_fields = (int *)malloc(sz * sizeof(int)); self->lines_cap = sz; self->lines = 0; @@ -253,7 +210,6 @@ int parser_init(parser_t *self) { if (self->stream == NULL || self->words == NULL || self->word_starts == NULL || self->line_start == NULL || self->line_fields == NULL) { - parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -279,7 +235,6 @@ int parser_init(parser_t *self) { return 0; } - void parser_free(parser_t *self) { // opposite of parser_init parser_cleanup(self); @@ -292,20 +247,21 @@ static int make_stream_space(parser_t *self, size_t nbytes) { // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - /* TRACE(("maybe growing buffers\n")); */ - /* TOKEN STREAM */ - orig_ptr = (void *) self->stream; - TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) - self->stream = (char*) grow_buffer((void *) self->stream, - self->stream_len, - &self->stream_cap, nbytes * 2, - sizeof(char), &status); - TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", - self->stream, self->stream_len, self->stream_cap, status)) + orig_ptr = (void *)self->stream; + TRACE( + ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", + nbytes)) + self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, + sizeof(char), &status); + TRACE( + ("make_stream_space: self->stream=%p, self->stream_len = %zu, " + "self->stream_cap=%zu, status=%zu\n", + self->stream, self->stream_len, self->stream_cap, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -313,95 +269,86 @@ static int make_stream_space(parser_t *self, size_t nbytes) { // realloc sets errno when moving buffer? if (self->stream != orig_ptr) { - // uff - /* TRACE(("Moving word pointers\n")) */ - self->pword_start = self->stream + self->word_start; - for (i = 0; i < self->words_len; ++i) - { + for (i = 0; i < self->words_len; ++i) { self->words[i] = self->stream + self->word_starts[i]; } } - /* WORD VECTORS */ cap = self->words_cap; - self->words = (char**) grow_buffer((void *) self->words, - self->words_len, - &self->words_cap, nbytes, - sizeof(char*), &status); - TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", - self->words_len, self->words_cap, nbytes, status)) + self->words = + (char **)grow_buffer((void *)self->words, self->words_len, + &self->words_cap, nbytes, sizeof(char *), &status); + TRACE( + ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " + "%d)\n", + self->words_len, self->words_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } - // realloc took place if (cap != self->words_cap) { - TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, self->words_cap=%d\n", nbytes, self->words_cap)) - newptr = safe_realloc((void *) self->word_starts, sizeof(int) * self->words_cap); + TRACE( + ("make_stream_space: cap != self->words_cap, nbytes = %d, " + "self->words_cap=%d\n", + nbytes, self->words_cap)) + newptr = safe_realloc((void *)self->word_starts, + sizeof(int) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int*) newptr; + self->word_starts = (int *)newptr; } } - /* LINE VECTORS */ - /* - printf("Line_start: "); - - for (j = 0; j < self->lines + 1; ++j) { - printf("%d ", self->line_fields[j]); - } - printf("\n"); - - printf("lines_cap: %d\n", self->lines_cap); - */ cap = self->lines_cap; - self->line_start = (int*) grow_buffer((void *) self->line_start, - self->lines + 1, - &self->lines_cap, nbytes, - sizeof(int), &status); - TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", - self->lines + 1, self->lines_cap, nbytes, status)) + self->line_start = + (int *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, sizeof(int), &status); + TRACE(( + "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + self->lines + 1, self->lines_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } // realloc took place if (cap != self->lines_cap) { - TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) - newptr = safe_realloc((void *) self->line_fields, sizeof(int) * self->lines_cap); + TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", + nbytes)) + newptr = safe_realloc((void *)self->line_fields, + sizeof(int) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int*) newptr; + self->line_fields = (int *)newptr; } } - /* TRACE(("finished growing buffers\n")); */ - return 0; } - static int push_char(parser_t *self, char c) { - /* TRACE(("pushing %c \n", c)) */ - TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) + TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", + self->stream_len + 1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { - TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", - self->stream_len, self->stream_cap)) - self->error_msg = (char*) malloc(64); - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); + TRACE( + ("push_char: ERROR!!! self->stream_len(%d) >= " + "self->stream_cap(%d)\n", + self->stream_len, self->stream_cap)) + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; } self->stream[self->stream_len++] = c; @@ -410,11 +357,15 @@ static int push_char(parser_t *self, char c) { int P_INLINE end_field(parser_t *self) { // XXX cruft -// self->numeric_field = 0; if (self->words_len >= self->words_cap) { - TRACE(("end_field: ERROR!!! self->words_len(%zu) >= self->words_cap(%zu)\n", self->words_len, self->words_cap)) - self->error_msg = (char*) malloc(64); - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); + TRACE( + ("end_field: ERROR!!! self->words_len(%zu) >= " + "self->words_cap(%zu)\n", + self->words_len, self->words_cap)) + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; } @@ -426,8 +377,8 @@ int P_INLINE end_field(parser_t *self) { TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); - TRACE(("end_field: Saw word %s at: %d. Total: %d\n", - self->pword_start, self->word_start, self->words_len + 1)) + TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start, + self->word_start, self->words_len + 1)) self->word_starts[self->words_len] = self->word_start; self->words_len++; @@ -442,29 +393,29 @@ int P_INLINE end_field(parser_t *self) { return 0; } - static void append_warning(parser_t *self, const char *msg) { int ex_length; int length = strlen(msg); void *newptr; if (self->warn_msg == NULL) { - self->warn_msg = (char*) malloc(length + 1); - strcpy(self->warn_msg, msg); + self->warn_msg = (char *)malloc(length + 1); + strncpy(self->warn_msg, msg, strlen(msg) + 1); } else { ex_length = strlen(self->warn_msg); newptr = safe_realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { - self->warn_msg = (char*) newptr; - strcpy(self->warn_msg + ex_length, msg); + self->warn_msg = (char *)newptr; + strncpy(self->warn_msg + ex_length, msg, strlen(msg) + 1); } } } static int end_line(parser_t *self) { + char *msg; int fields; int ex_fields = self->expected_fields; - char *msg; + int bufsize = 100; // for error or warning messages fields = self->line_fields[self->lines]; @@ -478,11 +429,10 @@ static int end_line(parser_t *self) { } } - if (self->state == START_FIELD_IN_SKIP_LINE || \ - self->state == IN_FIELD_IN_SKIP_LINE || \ - self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \ - self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE - ) { + if (self->state == START_FIELD_IN_SKIP_LINE || + self->state == IN_FIELD_IN_SKIP_LINE || + self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || + self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; @@ -495,9 +445,8 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= self->header_end + 1) - && (self->expected_fields < 0 && fields > ex_fields) - && !(self->usecols)) { + if (!(self->lines <= self->header_end + 1) && + (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -509,8 +458,9 @@ static int end_line(parser_t *self) { // file_lines is now the actual file line number (starting at 1) if (self->error_bad_lines) { - self->error_msg = (char*) malloc(100); - sprintf(self->error_msg, "Expected %d fields in line %d, saw %d\n", + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Expected %d fields in line %d, saw %d\n", ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -520,9 +470,10 @@ static int end_line(parser_t *self) { // simply skip bad lines if (self->warn_bad_lines) { // pass up error message - msg = (char*) malloc(100); - sprintf(msg, "Skipping line %d: expected %d fields, saw %d\n", - self->file_lines, ex_fields, fields); + msg = (char *)malloc(bufsize); + snprintf(msg, bufsize, + "Skipping line %d: expected %d fields, saw %d\n", + self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } @@ -530,14 +481,13 @@ static int end_line(parser_t *self) { } else { // missing trailing delimiters if ((self->lines >= self->header_end + 1) && fields < ex_fields) { - // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { self->error_msg = "out of memory"; return -1; } - while (fields < ex_fields){ + while (fields < ex_fields) { end_field(self); fields++; } @@ -549,15 +499,21 @@ static int end_line(parser_t *self) { // good line, set new start point if (self->lines >= self->lines_cap) { - TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) \ - self->error_msg = (char*) malloc(100); \ - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); \ - return PARSER_OUT_OF_MEMORY; \ + TRACE(( + "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", + self->lines, self->lines_cap)) + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - " + "possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; } - self->line_start[self->lines] = (self->line_start[self->lines - 1] + - fields); + self->line_start[self->lines] = + (self->line_start[self->lines - 1] + fields); - TRACE(("end_line: new line start: %d\n", self->line_start[self->lines])); + TRACE( + ("end_line: new line start: %d\n", self->line_start[self->lines])); // new line start with 0 fields self->line_fields[self->lines] = 0; @@ -574,10 +530,10 @@ int parser_add_skiprow(parser_t *self, int64_t row) { int ret = 0; if (self->skipset == NULL) { - self->skipset = (void*) kh_init_int64(); + self->skipset = (void *)kh_init_int64(); } - set = (kh_int64_t*) self->skipset; + set = (kh_int64_t *)self->skipset; k = kh_put_int64(set, row, &ret); set->keys[k] = row; @@ -601,18 +557,21 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { status = 0; self->datapos = 0; self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); - TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", - nbytes, bytes_read, status)); + TRACE(( + "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + nbytes, bytes_read, status)); self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { - self->error_msg = (char*) malloc(200); + int bufsize = 200; + self->error_msg = (char *)malloc(bufsize); if (status == CALLING_READ_FAILED) { - sprintf(self->error_msg, ("Calling read(nbytes) on source failed. " - "Try engine='python'.")); + snprintf(self->error_msg, bufsize, + "Calling read(nbytes) on source failed. " + "Try engine='python'."); } else { - sprintf(self->error_msg, "Unknown error in IO callback"); + snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); } return -1; } @@ -622,93 +581,96 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { return status; } - /* Tokenization macros and state machine code */ -// printf("pushing %c\n", c); - -#define PUSH_CHAR(c) \ - TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= maxstreamsize) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, maxstreamsize)) \ - self->error_msg = (char*) malloc(100); \ - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); \ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ +#define PUSH_CHAR(c) \ + TRACE( \ + ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= maxstreamsize) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, \ + maxstreamsize)) \ + int bufsize = 100; \ + self->error_msg = (char *)malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n");\ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ slen++; // This is a little bit of a hack but works for now -#define END_FIELD() \ - self->stream_len = slen; \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ +#define END_FIELD() \ + self->stream_len = slen; \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ slen = self->stream_len; -#define END_LINE_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - \ - } - -#define END_LINE_AND_FIELD_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - \ +#define END_LINE_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + } + +#define END_LINE_AND_FIELD_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ } #define END_LINE() END_LINE_STATE(START_RECORD) #define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) -#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \ - (self->lineterminator != '\0' && \ - c == self->lineterminator)) +#define IS_TERMINATOR(c) \ + ((self->lineterminator == '\0' && c == '\n') || \ + (self->lineterminator != '\0' && c == self->lineterminator)) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) // don't parse '\r' with a custom line terminator #define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r')) -#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar)) +#define IS_COMMENT_CHAR(c) \ + ((self->commentchar != '\0' && c == self->commentchar)) #define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar)) -#define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \ - self->skipinitialspace)) +#define IS_SKIPPABLE_SPACE(c) \ + ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) ((!self->delim_whitespace && c == self->delimiter) || \ - (self->delim_whitespace && IS_WHITESPACE(c))) +#define IS_DELIMITER(c) \ + ((!self->delim_whitespace && c == self->delimiter) || \ + (self->delim_whitespace && IS_WHITESPACE(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ self->datapos = i; \ - TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); + TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ + self->datalen)); #define CHECK_FOR_BOM() \ if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ @@ -718,16 +680,14 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipset != NULL) { - return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != - ((kh_int64_t*)self->skipset)->n_buckets ); - } - else { - return ( rownum <= self->skip_first_N_rows ); + return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != + ((kh_int64_t *)self->skipset)->n_buckets); + } else { + return (rownum <= self->skip_first_N_rows); } } -int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) -{ +int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { int i, slen; long maxstreamsize; char c; @@ -749,368 +709,364 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) CHECK_FOR_BOM(); } - for (i = self->datapos; i < self->datalen; ++i) - { + for (i = self->datapos; i < self->datalen; ++i) { // next character in file c = *buf++; - TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch(self->state) { - - case START_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_DELIMITER(c)) { - // Do nothing, we're starting a new field again. - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; + TRACE( + ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " + "state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); - case IN_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } - break; - - case IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - if (self->doublequote) { - self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; + switch (self->state) { + case START_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_DELIMITER(c)) { + // Do nothing, we're starting a new field again. } else { self->state = IN_FIELD_IN_SKIP_LINE; } - } - break; - - case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case WHITESPACE_LINE: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; break; - } else if (!self->delim_whitespace) { - if (IS_WHITESPACE(c) && c != self->delimiter) { - ; - } else { // backtrack - // use i + 1 because buf has been incremented but not i - do { - --buf; - --i; - } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); - // reached a newline rather than the beginning - if (IS_TERMINATOR(*buf)) { - ++buf; // move pointer to first char after newline - ++i; - } - self->state = START_FIELD; + case IN_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } break; - } - // fall through - case EAT_WHITESPACE: - if (IS_TERMINATOR(c)) { - END_LINE(); - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - break; - } else if (!IS_WHITESPACE(c)) { - self->state = START_FIELD; - // fall through to subsequent state - } else { - // if whitespace char, keep slurping + case IN_QUOTED_FIELD_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + if (self->doublequote) { + self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + } break; - } - case START_RECORD: - // start of record - if (skip_this_line(self, self->file_lines)) { + case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } else { self->state = IN_FIELD_IN_SKIP_LINE; - - if (IS_TERMINATOR(c)) { - END_LINE(); - } } break; - } else if (IS_TERMINATOR(c)) { - // \n\r possible? - if (self->skip_empty_lines) { + + case WHITESPACE_LINE: + if (IS_TERMINATOR(c)) { self->file_lines++; - } else { - END_LINE(); - } - break; - } else if (IS_CARRIAGE(c)) { - if (self->skip_empty_lines) { + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; - } else { + break; + } else if (!self->delim_whitespace) { + if (IS_WHITESPACE(c) && c != self->delimiter) { + } else { // backtrack + // use i + 1 because buf has been incremented but not i + do { + --buf; + --i; + } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); + + // reached a newline rather than the beginning + if (IS_TERMINATOR(*buf)) { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + } + // fall through + + case EAT_WHITESPACE: + if (IS_TERMINATOR(c)) { + END_LINE(); + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; + break; + } else if (!IS_WHITESPACE(c)) { + self->state = START_FIELD; + // fall through to subsequent state + } else { + // if whitespace char, keep slurping + break; } - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_LINE_COMMENT; - break; - } else if (IS_WHITESPACE(c)) { - if (self->delim_whitespace) { + + case START_RECORD: + // start of record + if (skip_this_line(self, self->file_lines)) { + if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + + if (IS_TERMINATOR(c)) { + END_LINE(); + } + } + break; + } else if (IS_TERMINATOR(c)) { + // \n\r possible? if (self->skip_empty_lines) { - self->state = WHITESPACE_LINE; + self->file_lines++; } else { - self->state = EAT_WHITESPACE; + END_LINE(); } break; - } else if (c != self->delimiter && self->skip_empty_lines) { - self->state = WHITESPACE_LINE; + } else if (IS_CARRIAGE(c)) { + if (self->skip_empty_lines) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else { + self->state = EAT_CRNL; + } + break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_LINE_COMMENT; break; + } else if (IS_WHITESPACE(c)) { + if (self->delim_whitespace) { + if (self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + } else { + self->state = EAT_WHITESPACE; + } + break; + } else if (c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; + } + // fall through } - // fall through - } - // normal character - fall through - // to handle as START_FIELD - self->state = START_FIELD; + // normal character - fall through + // to handle as START_FIELD + self->state = START_FIELD; - case START_FIELD: - // expecting field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_QUOTE(c)) { - // start quoted field - self->state = IN_QUOTED_FIELD; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_SKIPPABLE_SPACE(c)) { - // ignore space at start of field - ; - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - // save empty field + case START_FIELD: + // expecting field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_QUOTE(c)) { + // start quoted field + self->state = IN_QUOTED_FIELD; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_SKIPPABLE_SPACE(c)) { + // ignore space at start of field + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + // save empty field + END_FIELD(); + } + } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); + self->state = EAT_COMMENT; + } else { + // begin new unquoted field + PUSH_CHAR(c); + self->state = IN_FIELD; } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // begin new unquoted field - // if (self->delim_whitespace && \ - // self->quoting == QUOTE_NONNUMERIC) { - // self->numeric_field = 1; - // } + break; + case ESCAPED_CHAR: PUSH_CHAR(c); self->state = IN_FIELD; - } - break; + break; - case ESCAPED_CHAR: - PUSH_CHAR(c); - self->state = IN_FIELD; - break; + case EAT_LINE_COMMENT: + if (IS_TERMINATOR(c)) { + self->file_lines++; + self->state = START_RECORD; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; - case EAT_LINE_COMMENT: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; + case IN_FIELD: + // in unquoted field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); - case IN_FIELD: - // in unquoted field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_COMMENT_CHAR(c)) { + END_FIELD(); + self->state = EAT_COMMENT; } else { - self->state = START_FIELD; + // normal character - save in field + PUSH_CHAR(c); } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; + break; - case IN_QUOTED_FIELD: - // in quoted field - if (IS_ESCAPE_CHAR(c)) { - // possible escape character - self->state = ESCAPE_IN_QUOTED_FIELD; - } else if (IS_QUOTE(c)) { - if (self->doublequote) { - // double quote - " represented by "" - self->state = QUOTE_IN_QUOTED_FIELD; + case IN_QUOTED_FIELD: + // in quoted field + if (IS_ESCAPE_CHAR(c)) { + // possible escape character + self->state = ESCAPE_IN_QUOTED_FIELD; + } else if (IS_QUOTE(c)) { + if (self->doublequote) { + // double quote - " represented by "" + self->state = QUOTE_IN_QUOTED_FIELD; + } else { + // end of quote part of field + self->state = IN_FIELD; + } } else { - // end of quote part of field - self->state = IN_FIELD; + // normal character - save in field + PUSH_CHAR(c); } - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - // double quote - seen a quote in an quoted field - if (IS_QUOTE(c)) { - // save "" as " + break; + case ESCAPE_IN_QUOTED_FIELD: PUSH_CHAR(c); self->state = IN_QUOTED_FIELD; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (!self->strict) { - PUSH_CHAR(c); - self->state = IN_FIELD; - } else { - self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, - "delimiter expected after " - "quote in quote"); - goto parsingerror; - } - break; + break; - case EAT_COMMENT: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - } - break; + case QUOTE_IN_QUOTED_FIELD: + // double quote - seen a quote in an quoted field + if (IS_QUOTE(c)) { + // save "" as " - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - } else if (IS_DELIMITER(c)){ + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); - if (self->delim_whitespace) { - END_LINE_STATE(EAT_WHITESPACE); + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (!self->strict) { + PUSH_CHAR(c); + self->state = IN_FIELD; } else { - // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(START_FIELD); + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "delimiter expected after quote in quote"); + goto parsingerror; } - } else { - if (self->delim_whitespace) { - /* XXX - * first character of a new record--need to back up and reread - * to handle properly... - */ - i--; buf--; // back up one character (HACK!) - END_LINE_STATE(START_RECORD); - } else { - // \r line terminator - // UGH. we don't actually want - // to consume the token. fix this later - self->stream_len = slen; - if (end_line(self) < 0) { - goto parsingerror; - } + break; - stream = self->stream + self->stream_len; - slen = self->stream_len; - self->state = START_RECORD; + case EAT_COMMENT: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + } + break; + + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL: + if (c == '\n') { + END_LINE(); + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + END_LINE_STATE(EAT_WHITESPACE); + } else { + // Handle \r-delimited files + END_LINE_AND_FIELD_STATE(START_FIELD); + } + } else { + if (self->delim_whitespace) { + /* XXX + * first character of a new record--need to back up and + * reread + * to handle properly... + */ + i--; + buf--; // back up one character (HACK!) + END_LINE_STATE(START_RECORD); + } else { + // \r line terminator + // UGH. we don't actually want + // to consume the token. fix this later + self->stream_len = slen; + if (end_line(self) < 0) { + goto parsingerror; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + self->state = START_RECORD; - --i; buf--; // let's try this character again (HACK!) - if (line_limit > 0 && self->lines == start_lines + line_limit) { - goto linelimit; + --i; + buf--; // let's try this character again (HACK!) + if (line_limit > 0 && + self->lines == start_lines + line_limit) { + goto linelimit; + } } } - } - break; + break; - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL_NOP: // inside an ignored comment line - self->state = START_RECORD; - // \r line terminator -- parse this character again - if (c != '\n' && !IS_DELIMITER(c)) { - --i; - --buf; - } - break; - default: - break; + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL_NOP: // inside an ignored comment line + self->state = START_RECORD; + // \r line terminator -- parse this character again + if (c != '\n' && !IS_DELIMITER(c)) { + --i; + --buf; + } + break; + default: + break; } } @@ -1134,39 +1090,41 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) } static int parser_handle_eof(parser_t *self) { - TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) + int bufsize = 100; - if (self->datalen != 0) - return -1; + TRACE( + ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) - switch (self->state) { - case START_RECORD: - case WHITESPACE_LINE: - case EAT_CRNL_NOP: - case EAT_LINE_COMMENT: - return 0; + if (self->datalen != 0) return -1; - case ESCAPE_IN_QUOTED_FIELD: - case IN_QUOTED_FIELD: - self->error_msg = (char*)malloc(100); - sprintf(self->error_msg, "EOF inside string starting at line %d", - self->file_lines); - return -1; + switch (self->state) { + case START_RECORD: + case WHITESPACE_LINE: + case EAT_CRNL_NOP: + case EAT_LINE_COMMENT: + return 0; - case ESCAPED_CHAR: - self->error_msg = (char*)malloc(100); - sprintf(self->error_msg, "EOF following escape character"); - return -1; + case ESCAPE_IN_QUOTED_FIELD: + case IN_QUOTED_FIELD: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "EOF inside string starting at line %d", self->file_lines); + return -1; - case IN_FIELD: - case START_FIELD: - case QUOTE_IN_QUOTED_FIELD: - if (end_field(self) < 0) + case ESCAPED_CHAR: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "EOF following escape character"); return -1; - break; - default: - break; + case IN_FIELD: + case START_FIELD: + case QUOTE_IN_QUOTED_FIELD: + if (end_field(self) < 0) return -1; + break; + + default: + break; } if (end_line(self) < 0) @@ -1183,19 +1141,19 @@ int parser_consume_rows(parser_t *self, size_t nrows) { } /* do nothing */ - if (nrows == 0) - return 0; + if (nrows == 0) return 0; /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; char_count = (self->word_starts[word_deletions - 1] + strlen(self->words[word_deletions - 1]) + 1); - TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); + TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, + char_count)); /* move stream, only if something to move */ if (char_count < self->stream_len) { - memmove((void*) self->stream, (void*) (self->stream + char_count), + memmove((void *)self->stream, (void *)(self->stream + char_count), self->stream_len - char_count); } /* buffer counts */ @@ -1213,26 +1171,14 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* move current word pointer to stream */ self->pword_start -= char_count; self->word_start -= char_count; - /* - printf("Line_start: "); - for (i = 0; i < self->lines + 1; ++i) { - printf("%d ", self->line_fields[i]); - } - printf("\n"); - */ + /* move line metadata */ - for (i = 0; i < self->lines - nrows + 1; ++i) - { + for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; - - /* TRACE(("First word in line %d is now %s\n", i, */ - /* self->words[self->line_start[i]])); */ - self->line_fields[i] = self->line_fields[offset]; } self->lines -= nrows; - /* self->line_fields[self->lines] = 0; */ return 0; } @@ -1256,47 +1202,50 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - newptr = safe_realloc((void*) self->words, new_cap * sizeof(char*)); + newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->words = (char**) newptr; + self->words = (char **)newptr; } - newptr = safe_realloc((void*) self->word_starts, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->word_starts, new_cap * sizeof(int)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int*) newptr; + self->word_starts = (int *)newptr; self->words_cap = new_cap; } } /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", - new_cap, self->stream_cap, self->lines_cap)); + TRACE( + ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " + "%zu\n", + new_cap, self->stream_cap, self->lines_cap)); if (new_cap < self->stream_cap) { - TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); - newptr = safe_realloc((void*) self->stream, new_cap); + TRACE( + ("parser_trim_buffers: new_cap < self->stream_cap, calling " + "safe_realloc\n")); + newptr = safe_realloc((void *)self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - // Update the pointers in the self->words array (char **) if `safe_realloc` - // moved the `self->stream` buffer. This block mirrors a similar block in + // Update the pointers in the self->words array (char **) if + // `safe_realloc` + // moved the `self->stream` buffer. This block mirrors a similar + // block in // `make_stream_space`. if (self->stream != newptr) { - /* TRACE(("Moving word pointers\n")) */ - self->pword_start = (char*) newptr + self->word_start; + self->pword_start = (char *)newptr + self->word_start; - for (i = 0; i < self->words_len; ++i) - { - self->words[i] = (char*) newptr + self->word_starts[i]; + for (i = 0; i < self->words_len; ++i) { + self->words[i] = (char *)newptr + self->word_starts[i]; } } self->stream = newptr; self->stream_cap = new_cap; - } } @@ -1304,17 +1253,17 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = safe_realloc((void*) self->line_start, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_start, new_cap * sizeof(int)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_start = (int*) newptr; + self->line_start = (int *)newptr; } - newptr = safe_realloc((void*) self->line_fields, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_fields, new_cap * sizeof(int)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int*) newptr; + self->line_fields = (int *)newptr; self->lines_cap = new_cap; } } @@ -1326,12 +1275,10 @@ void debug_print_parser(parser_t *self) { int j, line; char *token; - for (line = 0; line < self->lines; ++line) - { + for (line = 0; line < self->lines; ++line) { printf("(Parsed) Line %d: ", line); - for (j = 0; j < self->line_fields[j]; ++j) - { + for (j = 0; j < self->line_fields[j]; ++j) { token = self->words[j + self->line_start[line]]; printf("%s ", token); } @@ -1339,13 +1286,6 @@ void debug_print_parser(parser_t *self) { } } -/*int clear_parsed_lines(parser_t *self, size_t nlines) { - // TODO. move data up in stream, shift relevant word pointers - - return 0; -}*/ - - /* nrows : number of rows to tokenize (or until reach EOF) all : tokenize all the data vs. certain number of rows @@ -1359,12 +1299,12 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { return 0; } - TRACE(("_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", \ - (int) nrows, self->datapos, self->datalen)); + TRACE(( + "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", + (int)nrows, self->datapos, self->datalen)); while (1) { - if (!all && self->lines - start_lines >= nrows) - break; + if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { status = parser_buffer_bytes(self, self->chunksize); @@ -1379,15 +1319,19 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { } } - TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", - self->datalen - self->datapos, self->datalen, self->datapos)); + TRACE( + ("_tokenize_helper: Trying to process %d bytes, datalen=%d, " + "datapos= %d\n", + self->datalen - self->datapos, self->datalen, self->datapos)); status = tokenize_bytes(self, nrows, start_lines); if (status < 0) { // XXX - TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, breaking\n", - status)); + TRACE( + ("_tokenize_helper: Status %d returned from tokenize_bytes, " + "breaking\n", + status)); status = -1; break; } @@ -1406,86 +1350,11 @@ int tokenize_all_rows(parser_t *self) { return status; } -/* SEL - does not look like this routine is used anywhere -void test_count_lines(char *fname) { - clock_t start = clock(); - - char *buffer, *tmp; - size_t bytes, lines = 0; - int i; - FILE *fp = fopen(fname, "rb"); - - buffer = (char*) malloc(CHUNKSIZE * sizeof(char)); - - while(1) { - tmp = buffer; - bytes = fread((void *) buffer, sizeof(char), CHUNKSIZE, fp); - // printf("Read %d bytes\n", bytes); - - if (bytes == 0) { - break; - } - - for (i = 0; i < bytes; ++i) - { - if (*tmp++ == '\n') { - lines++; - } - } - } - - - printf("Saw %d lines\n", (int) lines); - - free(buffer); - fclose(fp); - - printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC); -}*/ - - P_INLINE void uppercase(char *p) { - for ( ; *p; ++p) *p = toupper(*p); -} - -/* SEL - does not look like these routines are used anywhere -P_INLINE void lowercase(char *p) { - for ( ; *p; ++p) *p = tolower(*p); + for (; *p; ++p) *p = toupper(*p); } -int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal) -{ - char *p_end; - - *p_real = xstrtod(item, &p_end, decimal, sci, '\0', FALSE); - if (*p_end == '\0') { - *p_imag = 0.0; - return errno == 0; - } - if (*p_end == 'i' || *p_end == 'j') { - *p_imag = *p_real; - *p_real = 0.0; - ++p_end; - } - else { - if (*p_end == '+') { - ++p_end; - } - *p_imag = xstrtod(p_end, &p_end, decimal, sci, '\0', FALSE); - if (errno || ((*p_end != 'i') && (*p_end != 'j'))) { - return FALSE; - } - ++p_end; - } - while(*p_end == ' ') { - ++p_end; - } - return *p_end == '\0'; -}*/ - - -int P_INLINE to_longlong(char *item, long long *p_value) -{ +int P_INLINE to_longlong(char *item, long long *p_value) { char *p_end; // Try integer conversion. We explicitly give the base to be 10. If @@ -1500,65 +1369,26 @@ int P_INLINE to_longlong(char *item, long long *p_value) return (errno == 0) && (!*p_end); } -/* does not look like this routine is used anywhere -int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep) -{ - int i, pos, status, n = strlen(item), count = 0; - char *tmp; - char *p_end; - - for (i = 0; i < n; ++i) - { - if (*(item + i) == tsep) { - count++; - } - } - - if (count == 0) { - return to_longlong(item, p_value); - } - - tmp = (char*) malloc((n - count + 1) * sizeof(char)); - if (tmp == NULL) { - return 0; - } - - pos = 0; - for (i = 0; i < n; ++i) - { - if (item[i] != tsep) - tmp[pos++] = item[i]; - } - - tmp[pos] = '\0'; - - status = to_longlong(tmp, p_value); - free(tmp); - - return status; -}*/ - int to_boolean(const char *item, uint8_t *val) { char *tmp; int i, status = 0; + int bufsize = sizeof(char) * (strlen(item) + 1); static const char *tstrs[1] = {"TRUE"}; static const char *fstrs[1] = {"FALSE"}; - tmp = malloc(sizeof(char) * (strlen(item) + 1)); - strcpy(tmp, item); + tmp = malloc(bufsize); + strncpy(tmp, item, bufsize); uppercase(tmp); - for (i = 0; i < 1; ++i) - { + for (i = 0; i < 1; ++i) { if (strcmp(tmp, tstrs[i]) == 0) { *val = 1; goto done; } } - for (i = 0; i < 1; ++i) - { + for (i = 0; i < 1; ++i) { if (strcmp(tmp, fstrs[i]) == 0) { *val = 0; goto done; @@ -1572,27 +1402,19 @@ int to_boolean(const char *item, uint8_t *val) { return status; } -// #define TEST - #ifdef TEST -int main(int argc, char *argv[]) -{ +int main(int argc, char *argv[]) { double x, y; long long xi; int status; char *s; - //s = "0.10e-3-+5.5e2i"; - // s = "1-0j"; - // status = to_complex(s, &x, &y, 'e', '.'); s = "123,789"; status = to_longlong_thousands(s, &xi, ','); printf("s = '%s'\n", s); printf("status = %d\n", status); - printf("x = %d\n", (int) xi); - - // printf("x = %lg, y = %lg\n", x, y); + printf("x = %d\n", (int)xi); return 0; } @@ -1621,10 +1443,12 @@ int main(int argc, char *argv[]) // may be used to endorse or promote products derived from this software // without specific prior written permission. // -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -1643,197 +1467,185 @@ int main(int argc, char *argv[]) // * Add tsep argument for thousands separator // -double xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing) -{ - double number; - int exponent; - int negative; - char *p = (char *) str; - double p10; - int n; - int num_digits; - int num_decimals; - - errno = 0; - - // Skip leading whitespace - while (isspace(*p)) p++; - - // Handle optional sign - negative = 0; - switch (*p) - { - case '-': negative = 1; // Fall through to increment position - case '+': p++; - } - - number = 0.; - exponent = 0; - num_digits = 0; - num_decimals = 0; - - // Process string of digits - while (isdigit(*p)) - { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } - - // Process decimal part - if (*p == decimal) - { - p++; - - while (isdigit(*p)) - { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - exponent -= num_decimals; - } - - if (num_digits == 0) - { - errno = ERANGE; - return 0.0; - } - - // Correct for sign - if (negative) number = -number; - - // Process an exponent string - if (toupper(*p) == toupper(sci)) - { - // Handle optional sign +double xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing) { + double number; + int exponent; + int negative; + char *p = (char *)str; + double p10; + int n; + int num_digits; + int num_decimals; + + errno = 0; + + // Skip leading whitespace. + while (isspace(*p)) p++; + + // Handle optional sign. negative = 0; - switch (*++p) - { - case '-': negative = 1; // Fall through to increment pos - case '+': p++; + switch (*p) { + case '-': + negative = 1; // Fall through to increment position. + case '+': + p++; } - // Process string of digits + number = 0.; + exponent = 0; num_digits = 0; - n = 0; - while (isdigit(*p)) - { - n = n * 10 + (*p - '0'); - num_digits++; - p++; + num_decimals = 0; + + // Process string of digits. + while (isdigit(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + + p += (tsep != '\0' && *p == tsep); } - if (negative) - exponent -= n; - else - exponent += n; + // Process decimal part. + if (*p == decimal) { + p++; + + while (isdigit(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + num_decimals++; + } - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) - p--; - } + exponent -= num_decimals; + } + if (num_digits == 0) { + errno = ERANGE; + return 0.0; + } - if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) - { + // Correct for sign. + if (negative) number = -number; - errno = ERANGE; - return HUGE_VAL; - } + // Process an exponent string. + if (toupper(*p) == toupper(sci)) { + // Handle optional sign. + negative = 0; + switch (*++p) { + case '-': + negative = 1; // Fall through to increment pos. + case '+': + p++; + } - // Scale the result - p10 = 10.; - n = exponent; - if (n < 0) n = -n; - while (n) - { - if (n & 1) - { - if (exponent < 0) - number /= p10; - else - number *= p10; + // Process string of digits. + num_digits = 0; + n = 0; + while (isdigit(*p)) { + n = n * 10 + (*p - '0'); + num_digits++; + p++; + } + + if (negative) + exponent -= n; + else + exponent += n; + + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) p--; } - n >>= 1; - p10 *= p10; - } + if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { + errno = ERANGE; + return HUGE_VAL; + } - if (number == HUGE_VAL) { - errno = ERANGE; - } + // Scale the result. + p10 = 10.; + n = exponent; + if (n < 0) n = -n; + while (n) { + if (n & 1) { + if (exponent < 0) + number /= p10; + else + number *= p10; + } + n >>= 1; + p10 *= p10; + } - if (skip_trailing) { - // Skip trailing whitespace - while (isspace(*p)) p++; - } + if (number == HUGE_VAL) { + errno = ERANGE; + } - if (endptr) *endptr = p; + if (skip_trailing) { + // Skip trailing whitespace. + while (isspace(*p)) p++; + } + if (endptr) *endptr = p; - return number; + return number; } -double precise_xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing) -{ +double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing) { double number; int exponent; int negative; - char *p = (char *) str; + char *p = (char *)str; int num_digits; int num_decimals; int max_digits = 17; int n; - // Cache powers of 10 in memory - static double e[] = {1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, - 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, - 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, - 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, - 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, - 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, - 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, - 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80, - 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90, - 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100, - 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110, - 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, - 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, - 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, - 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150, - 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160, - 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170, - 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180, - 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190, - 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200, - 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, - 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, - 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, - 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, - 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250, - 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260, - 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270, - 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280, - 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, - 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, - 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; + // Cache powers of 10 in memory. + static double e[] = { + 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, + 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, + 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, + 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, + 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, + 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, + 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, + 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, + 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, + 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, + 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, + 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, + 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, + 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, + 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, + 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, + 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, + 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, + 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, + 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, + 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, + 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, + 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, + 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, + 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, + 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, + 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, + 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, + 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, + 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, + 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; errno = 0; - // Skip leading whitespace + // Skip leading whitespace. while (isspace(*p)) p++; - // Handle optional sign + // Handle optional sign. negative = 0; - switch (*p) - { - case '-': negative = 1; // Fall through to increment position - case '+': p++; + switch (*p) { + case '-': + negative = 1; // Fall through to increment position. + case '+': + p++; } number = 0.; @@ -1841,66 +1653,59 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, num_digits = 0; num_decimals = 0; - // Process string of digits - while (isdigit(*p)) - { - if (num_digits < max_digits) - { + // Process string of digits. + while (isdigit(*p)) { + if (num_digits < max_digits) { number = number * 10. + (*p - '0'); num_digits++; - } - else + } else { ++exponent; + } p++; p += (tsep != '\0' && *p == tsep); } // Process decimal part - if (*p == decimal) - { + if (*p == decimal) { p++; - while (num_digits < max_digits && isdigit(*p)) - { + while (num_digits < max_digits && isdigit(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; num_decimals++; } - if (num_digits >= max_digits) // consume extra decimal digits - while (isdigit(*p)) - ++p; + if (num_digits >= max_digits) // Consume extra decimal digits. + while (isdigit(*p)) ++p; exponent -= num_decimals; } - if (num_digits == 0) - { + if (num_digits == 0) { errno = ERANGE; return 0.0; } - // Correct for sign + // Correct for sign. if (negative) number = -number; - // Process an exponent string - if (toupper(*p) == toupper(sci)) - { + // Process an exponent string. + if (toupper(*p) == toupper(sci)) { // Handle optional sign negative = 0; - switch (*++p) - { - case '-': negative = 1; // Fall through to increment pos - case '+': p++; + switch (*++p) { + case '-': + negative = 1; // Fall through to increment pos. + case '+': + p++; } - // Process string of digits + // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) - { + while (isdigit(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1911,33 +1716,28 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, else exponent += n; - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) - p--; + // If no digits after the 'e'/'E', un-consume it. + if (num_digits == 0) p--; } - if (exponent > 308) - { + if (exponent > 308) { errno = ERANGE; return HUGE_VAL; - } - else if (exponent > 0) + } else if (exponent > 0) { number *= e[exponent]; - else if (exponent < -308) // subnormal - { - if (exponent < -616) // prevent invalid array access + } else if (exponent < -308) { // Subnormal + if (exponent < -616) // Prevent invalid array access. number = 0.; number /= e[-308 - exponent]; number /= e[308]; - } - else + } else { number /= e[-exponent]; + } - if (number == HUGE_VAL || number == -HUGE_VAL) - errno = ERANGE; + if (number == HUGE_VAL || number == -HUGE_VAL) errno = ERANGE; if (skip_trailing) { - // Skip trailing whitespace + // Skip trailing whitespace. while (isspace(*p)) p++; } @@ -1945,9 +1745,8 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, return number; } -double round_trip(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing) -{ +double round_trip(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing) { #if PY_VERSION_HEX >= 0x02070000 return PyOS_string_to_double(p, q, 0); #else @@ -1955,31 +1754,12 @@ double round_trip(const char *p, char **q, char decimal, char sci, #endif } -/* -float strtof(const char *str, char **endptr) -{ - return (float) strtod(str, endptr); -} - - -long double strtold(const char *str, char **endptr) -{ - return strtod(str, endptr); -} - -double atof(const char *str) -{ - return strtod(str, NULL); -} -*/ - // End of xstrtod code // --------------------------------------------------------------------------- int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, - int *error, char tsep) -{ - const char *p = (const char *) p_item; + int *error, char tsep) { + const char *p = (const char *)p_item; int isneg = 0; int64_t number = 0; int d; @@ -1993,8 +1773,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (*p == '-') { isneg = 1; ++p; - } - else if (*p == '+') { + } else if (*p == '+') { p++; } @@ -2023,11 +1802,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } @@ -2036,25 +1813,20 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, while (isdigit(d)) { if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } } } - } - else { + } else { // If number is less than pre_max, at least one more digit // can be processed without overflowing. int64_t pre_max = int_max / 10; int dig_pre_max = int_max % 10; - //printf("pre_max = %lld dig_pre_max = %d\n", pre_max, dig_pre_max); - // Process the digits. d = *p; if (tsep != '\0') { @@ -2067,12 +1839,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } @@ -2081,12 +1851,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, while (isdigit(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } @@ -2108,66 +1876,3 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, *error = 0; return number; } - -/* does not look like this routine is used anywhere -uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error) -{ - int d, dig_pre_max; - uint64_t pre_max; - const char *p = (const char *) p_item; - uint64_t number = 0; - - // Skip leading spaces. - while (isspace(*p)) { - ++p; - } - - // Handle sign. - if (*p == '-') { - *error = ERROR_MINUS_SIGN; - return 0; - } - if (*p == '+') { - p++; - } - - // Check that there is a first digit. - if (!isdigit(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - pre_max = uint_max / 10; - dig_pre_max = uint_max % 10; - - // Process the digits. - d = *p; - while (isdigit(d)) { - if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - } - else { - *error = ERROR_OVERFLOW; - return 0; - } - } - - // Skip trailing spaces. - while (isspace(*p)) { - ++p; - } - - // Did we use up all the characters? - if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } - - *error = 0; - return number; -} -*/ diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 487c1265d9358..e01812f1c5520 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -9,29 +9,29 @@ See LICENSE for the license */ -#ifndef _PARSER_COMMON_H_ -#define _PARSER_COMMON_H_ +#ifndef PANDAS_SRC_PARSER_TOKENIZER_H_ +#define PANDAS_SRC_PARSER_TOKENIZER_H_ -#include "Python.h" +#include #include -#include #include +#include #include -#include +#include "Python.h" #include -#define ERROR_OK 0 -#define ERROR_NO_DIGITS 1 -#define ERROR_OVERFLOW 2 -#define ERROR_INVALID_CHARS 3 -#define ERROR_MINUS_SIGN 4 +#define ERROR_OK 0 +#define ERROR_NO_DIGITS 1 +#define ERROR_OVERFLOW 2 +#define ERROR_INVALID_CHARS 3 +#define ERROR_MINUS_SIGN 4 #include "../headers/stdint.h" #include "khash.h" -#define CHUNKSIZE 1024*256 +#define CHUNKSIZE 1024 * 256 #define KB 1024 #define MB 1024 * KB #define STREAM_INIT_SIZE 32 @@ -40,15 +40,15 @@ See LICENSE for the license #define CALLING_READ_FAILED 2 #ifndef P_INLINE - #if defined(__GNUC__) - #define P_INLINE static __inline__ - #elif defined(_MSC_VER) - #define P_INLINE - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define P_INLINE static inline - #else - #define P_INLINE - #endif +#if defined(__GNUC__) +#define P_INLINE static __inline__ +#elif defined(_MSC_VER) +#define P_INLINE +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define P_INLINE static inline +#else +#define P_INLINE +#endif #endif #if defined(_MSC_VER) @@ -62,41 +62,34 @@ See LICENSE for the license */ #define FALSE 0 -#define TRUE 1 - -/* Maximum number of columns in a file. */ -#define MAX_NUM_COLUMNS 2000 +#define TRUE 1 -/* Maximum number of characters in single field. */ - -#define FIELD_BUFFER_SIZE 2000 +// Maximum number of columns in a file. +#define MAX_NUM_COLUMNS 2000 +// Maximum number of characters in single field. +#define FIELD_BUFFER_SIZE 2000 /* * Common set of error types for the read_rows() and tokenize() * functions. */ - -#define ERROR_OUT_OF_MEMORY 1 -#define ERROR_INVALID_COLUMN_INDEX 10 +#define ERROR_OUT_OF_MEMORY 1 +#define ERROR_INVALID_COLUMN_INDEX 10 #define ERROR_CHANGED_NUMBER_OF_FIELDS 12 -#define ERROR_TOO_MANY_CHARS 21 -#define ERROR_TOO_MANY_FIELDS 22 -#define ERROR_NO_DATA 23 - - -/* #define VERBOSE */ +#define ERROR_TOO_MANY_CHARS 21 +#define ERROR_TOO_MANY_FIELDS 22 +#define ERROR_NO_DATA 23 +// #define VERBOSE #if defined(VERBOSE) #define TRACE(X) printf X; #else #define TRACE(X) #endif - #define PARSER_OUT_OF_MEMORY -1 - /* * XXX Might want to couple count_rows() with read_rows() to avoid duplication * of some file I/O. @@ -108,7 +101,6 @@ See LICENSE for the license */ #define WORD_BUFFER_SIZE 4000 - typedef enum { START_RECORD, START_FIELD, @@ -131,12 +123,14 @@ typedef enum { } ParserState; typedef enum { - QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE + QUOTE_MINIMAL, + QUOTE_ALL, + QUOTE_NONNUMERIC, + QUOTE_NONE } QuoteStyle; - -typedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, - int *status); +typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, + int *status); typedef int (*io_cleanup)(void *src); typedef struct parser_t { @@ -156,38 +150,38 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int *word_starts; // where we are in the stream + int *word_starts; // where we are in the stream int words_len; int words_cap; - char *pword_start; // pointer to stream start of current field - int word_start; // position start of current field + char *pword_start; // pointer to stream start of current field + int word_start; // position start of current field - int *line_start; // position in words for start of line - int *line_fields; // Number of fields in each line - int lines; // Number of (good) lines observed - int file_lines; // Number of file lines observed (including bad or skipped) - int lines_cap; // Vector capacity + int *line_start; // position in words for start of line + int *line_fields; // Number of fields in each line + int lines; // Number of (good) lines observed + int file_lines; // Number of file lines observed (including bad or skipped) + int lines_cap; // Vector capacity // Tokenizing stuff ParserState state; - int doublequote; /* is " represented by ""? */ - char delimiter; /* field separator */ - int delim_whitespace; /* delimit by consuming space/tabs instead */ - char quotechar; /* quote character */ - char escapechar; /* escape character */ + int doublequote; /* is " represented by ""? */ + char delimiter; /* field separator */ + int delim_whitespace; /* delimit by consuming space/tabs instead */ + char quotechar; /* quote character */ + char escapechar; /* escape character */ char lineterminator; - int skipinitialspace; /* ignore spaces following delimiter? */ - int quoting; /* style of quoting to write */ + int skipinitialspace; /* ignore spaces following delimiter? */ + int quoting; /* style of quoting to write */ // krufty, hmm =/ int numeric_field; char commentchar; int allow_embedded_newline; - int strict; /* raise exception on bad CSV */ + int strict; /* raise exception on bad CSV */ - int usecols; // Boolean: 1: usecols provided, 0: none provided + int usecols; // Boolean: 1: usecols provided, 0: none provided int expected_fields; int error_bad_lines; @@ -200,9 +194,9 @@ typedef struct parser_t { // thousands separator (comma, period) char thousands; - int header; // Boolean: 1: has header, 0: no header - int header_start; // header row start - int header_end; // header row end + int header; // Boolean: 1: has header, 0: no header + int header_start; // header row start + int header_end; // header row end void *skipset; int64_t skip_first_N_rows; @@ -216,7 +210,6 @@ typedef struct parser_t { int skip_empty_lines; } parser_t; - typedef struct coliter_t { char **words; int *line_start; @@ -226,15 +219,13 @@ typedef struct coliter_t { void coliter_setup(coliter_t *self, parser_t *parser, int i, int start); coliter_t *coliter_new(parser_t *self, int i); -/* #define COLITER_NEXT(iter) iter->words[iter->line_start[iter->line++] + iter->col] */ -// #define COLITER_NEXT(iter) iter.words[iter.line_start[iter.line++] + iter.col] +#define COLITER_NEXT(iter, word) \ + do { \ + const int i = *iter.line_start++ + iter.col; \ + word = i < *iter.line_start ? iter.words[i] : ""; \ + } while (0) -#define COLITER_NEXT(iter, word) do { \ - const int i = *iter.line_start++ + iter.col; \ - word = i < *iter.line_start ? iter.words[i]: ""; \ - } while(0) - -parser_t* parser_new(void); +parser_t *parser_new(void); int parser_init(parser_t *self); @@ -256,24 +247,17 @@ int tokenize_nrows(parser_t *self, size_t nrows); int tokenize_all_rows(parser_t *self); -/* - - Have parsed / type-converted a chunk of data and want to free memory from the - token stream - - */ -//int clear_parsed_lines(parser_t *self, size_t nlines); - -int64_t str_to_int64(const char *p_item, int64_t int_min, - int64_t int_max, int *error, char tsep); -//uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); - -double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); -double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); -double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); -//int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); -//int P_INLINE to_longlong(char *item, long long *p_value); -//int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); +// Have parsed / type-converted a chunk of data +// and want to free memory from the token stream + +int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, + int *error, char tsep); +double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing); +double precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing); +double round_trip(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing); int to_boolean(const char *item, uint8_t *val); -#endif // _PARSER_COMMON_H_ +#endif // PANDAS_SRC_PARSER_TOKENIZER_H_