From 778da5312fd5abbe3433f1317470fa7fdfa7236a Mon Sep 17 00:00:00 2001 From: John Myles White Date: Wed, 19 Jun 2013 22:49:09 -0400 Subject: [PATCH 1/5] Clean and fast readtable implementation --- src/DataFrames.jl | 2 +- src/io.jl | 926 ++++++++++++++++++------------------- test/data/complex_data.csv | 1 + test/data/messy.csv | 5 + test/io.jl | 242 +--------- 5 files changed, 476 insertions(+), 700 deletions(-) create mode 100644 test/data/messy.csv diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 73e738341e..c318c59a92 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -159,7 +159,7 @@ export # reconcile_groups, range, rbind, read_minibatch, - read_table, + readtable, reldiff, removeNA, rename_group!, diff --git a/src/io.jl b/src/io.jl index 9ec939f64f..ea46df8123 100644 --- a/src/io.jl +++ b/src/io.jl @@ -1,502 +1,468 @@ -const DEFAULT_BOOLEAN_STRINGS = - ["T", "F", "t", "f", "TRUE", "FALSE", "true", "false"] -const DEFAULT_TRUE_STRINGS = - ["T", "t", "TRUE", "true"] -const DEFAULT_FALSE_STRINGS = - ["F", "f", "FALSE", "false"] - -const DEFAULT_QUOTATION_CHARACTER = '"' -const DEFAULT_SEPARATOR = ',' - -const DEFAULT_MISSINGNESS_INDICATORS = - ["", "NA", "#NA", "N/A", "#N/A", "NULL", "."] - -function parse_bool(x::String) - if contains(DEFAULT_TRUE_STRINGS, x) - return true - elseif contains(DEFAULT_FALSE_STRINGS, x) - return false - else - error("Could not parse bool") - end +function readnrows!(io::IO, + buffer::Vector{Uint8}, + eol_indices::Vector{Int}, + separator_indices::Vector{Int}, + nrows::Int, + eol::Char, + separator::Char, + quotemark::Char) + bytesread::Int = 0 + nrowsread::Int = 0 + eolsread::Int = 0 + separatorsread::Int = 0 + + inquotes::Bool = false + inescape::Bool = false + + chr::Uint8 = ' ' + + buffer_size::Int = length(buffer) + eol_size::Int = length(eol_indices) + separator_size::Int = length(separator_indices) + + while !eof(io) && nrowsread < nrows + bytesread += 1 + chr = read(io, Uint8) + + if buffer_size < bytesread + buffer_size *= 2 + resize!(buffer, buffer_size) + end + buffer[bytesread] = chr + + if !inquotes + if chr == eol + nrowsread +=1 + eolsread +=1 + if eol_size < eolsread + eol_size *= 2 + resize!(eol_indices, eol_size) + end + eol_indices[eolsread] = bytesread + end + + if chr == separator + separatorsread += 1 + if separator_size < separatorsread + separator_size *= 2 + resize!(separator_indices, separator_size) + end + separator_indices[separatorsread] = bytesread + end + + if chr == quotemark && !inescape + inquotes = true + end + else + if chr == quotemark && !inescape + inquotes = false + end + end + + if chr == '\\' + inescape = true + else + inescape = false + end + end + + return nrowsread, bytesread, eolsread, separatorsread end -############################################################################## -# -# Low-level text parsing -# -############################################################################## - -function make_extract_string() - extract_cache = IOBuffer(Array(Uint8, 500), true, true) - # Do we really need a closure? - # Why not just keep passing this argument in? - function f(this, left::Int, right::Int, omitlist::Set = Set()) - extract_cache_size = right - left - if extract_cache_size > extract_cache.size - extract_cache = IOBuffer(Array(Uint8, extract_cache_size), true, true) - end - seek(extract_cache, 0) # necessary? - if length(this) >= 1 - while isvalid(this, right) && right > left && this[right] == ' ' - right -= 1 - end - i = left - while i <= right - lasti = i - ch, i = next(this, i) - if !contains(omitlist, lasti) - print(extract_cache, ch) - end - end - return takebuf_string(extract_cache) - else - return "" - end - end - return f -end -extract_string = make_extract_string() - -const STATE_EXPECTING_VALUE = 0 -const STATE_IN_BARE = 1 -const STATE_IN_QUOTED = 2 -const STATE_POSSIBLE_EOQUOTED = 3 -const STATE_EXPECTING_SEP = 4 - -# Read one line of delimited text -# This is complex because delimited text can contain EOL inside quoted fields -function read_separated_line(io, - separator::Char, - quotation_character::Char) - # Indexes into the current line for the current item - left = 0 - right = 0 - - # Was using RopeString for efficient appends, but rare case and makes - # UTF-8 processing harder - this = Base.chomp!(readline(io)) - - # Short-circuit on the empty line - if this == "" - return Array(UTF8String, 0) - end - - # 5-state machine. See list of possible states above - state = STATE_EXPECTING_VALUE - - # Index of characters to remove - omitlist = Set() - - # Where are we - i = start(this) - eol = false - - # Will eventually return a Vector of strings - num_elems = 0 - ret = Array(ByteString, 0) - - # off we go! use manual loops because this can grow - while true - eol = done(this, i) - if !eol - this_i = i - this_char, i = next(this, i) - end - if state == STATE_EXPECTING_VALUE - if eol - num_elems += 1 - push!(ret, "") - break - elseif this_char == ' ' - continue - elseif this_char == separator - num_elems += 1 - push!(ret, "") - elseif this_char == quotation_character - left = this_i + 1 - state = STATE_IN_QUOTED - else - left = this_i - state = STATE_IN_BARE - end - elseif state == STATE_IN_BARE - if eol - right = this_i - num_elems += 1 - push!(ret, extract_string(this, left, right)) - break - elseif this_char == separator - right = this_i - 1 - num_elems += 1 - push!(ret, extract_string(this, left, right)) - state = STATE_EXPECTING_VALUE - else - continue - end - elseif state == STATE_IN_QUOTED - if eol - this = string(this, "\n", Base.chomp!(readline(io))) - elseif this_char == quotation_character - state = STATE_POSSIBLE_EOQUOTED - else - continue - end - elseif state == STATE_POSSIBLE_EOQUOTED - if eol - right = this_i - 1 - num_elems += 1 - push!(ret, extract_string(this, left, right, omitlist)) - break - elseif this_char == quotation_character - add!(omitlist, this_i) - state = STATE_IN_QUOTED - elseif this_char == separator - right = this_i - 2 - num_elems += 1 - push!(ret, extract_string(this, left, right, omitlist)) - empty!(omitlist) - state = STATE_EXPECTING_VALUE - elseif this_char == ' ' - right = this_i - 2 - num_elems += 1 - push!(ret, extract_string(this, left, right, omitlist)) - empty!(omitlist) - state = STATE_EXPECTING_SEP - else - error("unexpected character after a quote") - end - elseif state == STATE_EXPECTING_SEP - if eol - break - elseif this_char == ' ' - continue - elseif this_char == separator - state = STATE_EXPECTING_VALUE - else - error("expecting a separator but got something else") - end - end - end - ret -end - -# Read data line-by-line -function read_separated_text(io::IO, - nrows::Int, - separator::Char, - quotation_character::Char) - # Read one line to determine the number of columns - i = 1 - sp = read_separated_line(io, separator, quotation_character) - # Find a way to reuse this buffer - ncols = length(sp) - - # If the line is blank, return a 0x0 array to signify this - if ncols == 0 - return Array(UTF8String, 0, 0) - end - - # Otherwise, allocate an array to store all of the text we'll read - text_data = Array(UTF8String, nrows, ncols) - text_data[i, :] = sp - - # Loop until we've read nrows of text or run out of text - while i < nrows - sp = read_separated_line(io, separator, quotation_character) - if length(sp) == ncols - i += 1 - text_data[i, :] = sp - else - break - end - end - - # Return as much text as we read - return text_data[1:i, :] -end - -############################################################################## -# -# Inferential steps -# -############################################################################## - -function determine_separator{T <: String}(filename::T) - if ismatch(r"csv$", filename) - return ',' - elseif ismatch(r"tsv$", filename) - return '\t' - elseif ismatch(r"wsv$", filename) - return ' ' - else - error("Unable to determine separator used in $filename") - end -end +function buffermatch(buffer::Vector{Uint8}, + left::Int, + right::Int, + exemplars::Vector{ASCIIString}) + l::Int = right - left + 1 -function determine_nrows{T <: String}(filename::T, header::Bool) - total_lines = countlines(filename) - if header - return total_lines - 1 - else - return total_lines - end -end - -function determine_column_names(io::IO, - separator::Char, - quotation_character::Char, - header::Bool) - seek(io, 0) - fields = read_separated_line(io, separator, quotation_character) + for index in 1:length(exemplars) + exemplar::ASCIIString = exemplars[index] + if length(exemplar) == l + isamatch::Bool = true - if length(fields) == 0 - error("Failed to determine column names from an empty data source") - end + for i in 0:(l - 1) + isamatch &= buffer[left + i] == exemplar[1 + i] + end - column_names = header ? fields : generate_column_names(length(fields)) - seek(io, 0) - return column_names -end + if isamatch + return true + end + end + end -function convert_to_dataframe{R <: String, - S <: String, - T <: String}(text_data::Matrix{R}, - missingness_indicators::Vector{S}, - column_names::Vector{T}) - # Keep a record of number of rows and columns - nrows, ncols = size(text_data, 1), length(column_names) - - # Short-circuit if the text data is empty - if nrows == 0 - column_types = {Any for i in 1:ncols} - return DataFrame(column_types, column_names, 0) - end - - # Store the columns as a set of DataVector's inside an Array of Any's - columns = Array(Any, ncols) - - # Convert each column of text into a DataVector of the - # appropriate type - dtime = 0.0 - for j in 1:ncols - is_missing = BitVector(nrows) - for i in 1:nrows - value_missing = contains(missingness_indicators, text_data[i, j]) - if value_missing - text_data[i, j] = utf8("0") - is_missing[i] = true - else - is_missing[i] = false - end - end - values = Array(Int64, nrows) - try - for i in 1:nrows - values[i] = parseint(text_data[i, j]) - end - catch - try - values = Array(Float64, nrows) - for i in 1:nrows - values[i] = parsefloat(text_data[i, j]) - end - catch - try - values = Array(Bool, nrows) - for i in 1:nrows - values[i] = parse_bool(text_data[i, j]) - end - catch - values = text_data[:, j] - end - end - end - columns[j] = DataArray(values, is_missing) - end - - # Prepare the DataFrame we'll return - df = DataFrame(columns, column_names) - return df + return false end -############################################################################## -# -# Text input -# -############################################################################## - -# Read at most N lines from an IO object -# Then return a minibatch of at most N rows as a DataFrame -# Add column_types, force_types option -function read_minibatch{R <: String, - S <: String}(io::IO, - separator::Char, - quotation_character::Char, - missingness_indicators::Vector{R}, - column_names::Vector{S}, - minibatch_size::Int) - # Represent data as an array of strings before type conversion - text_data = read_separated_text(io, minibatch_size, separator, quotation_character) - - # Convert text data to a DataFrame - return convert_to_dataframe(text_data, missingness_indicators, column_names) +# All of these functions return three items: +# Parsed value, Success indicator, Missing indicator + +# TODO: Align more closely with parseint code +function bytestoint(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_nonstrings) + return 0, true, true + end + + value::Int = 0 + power::Int = 1 + index::Int = right + byte::Uint8 = buffer[index] + + while index > left + if '0' <= byte <= '9' + value += (byte - '0') * power + power *= 10 + else + return value, false, false + end + index -= 1 + byte = buffer[index] + end + + if byte == '-' + return -value, true, false + elseif byte == '+' + return value, true, false + elseif '0' <= byte <= '9' + value += (byte - '0') * power + return value, true, false + else + return value, false, false + end end -# Read an entire data set into a DataFrame from an IO -# TODO: Do only IO-pass through the data -function read_table{R <: String, - S <: String}(io::IO, - separator::Char, - quotation_character::Char, - missingness_indicators::Vector{R}, - header::Bool, - column_names::Vector{S}, - nrows::Int) - # Return to start of stream - seek(io, 0) - - # Read first line to remove header in advance - if header - readline(io) - end - - # Represent data as an array of strings before type conversion - text_data = read_separated_text(io, nrows, separator, quotation_character) - - # Short-circuit if data set is empty except for a header line - if size(text_data, 1) == 0 - column_types = {Any for i in 1:length(column_names)} - return DataFrame(column_types, column_names, 0) - else - # Convert text data to a DataFrame - df = convert_to_dataframe(text_data, missingness_indicators, column_names) - return df - end +let out::Vector{Float64} = Array(Float64, 1) + global bytestofloat + function bytestofloat(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_nonstrings) + return 0.0, true, true + end + + success = ccall(:jl_substrtod, + Int32, + (Ptr{Uint8}, Int, Int, Ptr{Float64}), + buffer, + left - 1, + right - left + 1, + out) == 0 + + return out[1], success, false + end end -function read_table{T <: String}(filename::T) - # Do inference for missing configuration settings - separator = determine_separator(filename) - quotation_character = DEFAULT_QUOTATION_CHARACTER - missingness_indicators = DEFAULT_MISSINGNESS_INDICATORS - header = true - nrows = determine_nrows(filename, header) - io = open(filename, "r") - column_names = determine_column_names(io, separator, quotation_character, header) - df = read_table(io, - separator, - quotation_character, - missingness_indicators, - header, - column_names, - nrows) - close(io) - return df +function bytestobool(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}, + true_strings::Vector{ASCIIString}, + false_strings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_nonstrings) + return false, true, true + end + + if buffermatch(buffer, left, right, true_strings) + return true, true, false + elseif buffermatch(buffer, left, right, false_strings) + return false, true, false + else + return false, false, false + end end +function bytestostring(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_strings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_strings) + return "", true, true + end -############################################################################## -# -# Text output -# -############################################################################## - -# Quotation rules -function in_quotes(val::String, quotation_character::Char) - string(quotation_character, val, quotation_character) -end -function in_quotes(val::Real, quotation_character::Char) - string(val) -end -function in_quotes(val::Any, quotation_character::Char) - string(quotation_character, string(val), quotation_character) + return bytestring(buffer[left:right]), true, false end -# TODO: write_table should do more to react to the type of each column -# Need to increase precision of string representation of Float64's -function print_table(io::IO, - df::DataFrame, - separator::Char, - quotation_character::Char, - header::Bool) - n, p = nrow(df), ncol(df) - if header - column_names = colnames(df) - for j in 1:p - if j < p - print(io, in_quotes(column_names[j], quotation_character)) - print(io, separator) - else - println(io, in_quotes(column_names[j], quotation_character)) - end - end - end - for i in 1:n - for j in 1:p - if j < p - print(io, in_quotes(df[i, j], quotation_character)) - print(io, separator) - else - println(io, in_quotes(df[i, j], quotation_character)) - end - end - end +function builddf(rows::Int, + cols::Int, + bytes::Int, + eols::Int, + separators::Int, + buffer::Vector{Uint8}, + eol_indices::Vector{Int}, + separator_indices::Vector{Int}, + separator::Char, + eol::Char, + quotemark::Char, + missing_nonstrings::Vector{ASCIIString}, + missing_strings::Vector{ASCIIString}, + true_strings::Vector{ASCIIString}, + false_strings::Vector{ASCIIString}, + ignore_padding::Bool, + strings_as_factors::Bool) + columns::Vector{Any} = Array(Any, cols) + + for j in 1:cols + values = Array(Int, rows) + missing::BitVector = falses(rows) + isint::Bool = true + isfloat::Bool = true + isbool::Bool = true + isstring::Bool = true + + i::Int = 0 + + while i < rows + i += 1 + + # Determine left and right boundaries of field + if j == 1 + if i == 1 + left = 1 + else + left = eol_indices[i - 1] + 1 + end + else + left = separator_indices[(i - 1) * (cols - 1) + j - 1] + 1 + end + + if j == cols + if i == rows + if buffer[bytes] == eol + right = bytes - 1 + else + right = bytes + end + else + right = eol_indices[i] - 1 + end + else + right = separator_indices[(i - 1) * (cols - 1) + j] - 1 + end + + # Ignore left-and-right whitespace padding + if ignore_padding + while left < right && buffer[left] == ' ' + left += 1 + end + while left < right && buffer[right] == ' ' + right -= 1 + end + end + + # (1) Try to parse values as Int's + if isint + values[i], success, missing[i] = + bytestoint(buffer, left, right, missing_nonstrings) + if success + continue + else + isint = false + values = convert(Array{Float64}, values) + end + end + + # (2) Try to parse as Float64's + if isfloat + values[i], success, missing[i] = + bytestofloat(buffer, left, right, missing_nonstrings) + if success + continue + else + isfloat = false + values = Array(Bool, rows) + i = 1 + end + end + + # If we go this far, we should ignore quote marks on the boundaries + while left < right && buffer[left] == quotemark + left += 1 + end + while left < right && buffer[right] == quotemark + right -= 1 + end + + # (3) Try to parse as Bool's + if isbool + values[i], success, missing[i] = + bytestobool(buffer, left, right, + missing_nonstrings, + true_strings, false_strings) + if success + continue + else + isbool = false + values = Array(UTF8String, rows) + i = 1 + end + end + + # (4) Fallback to UTF8String + if left == right && buffer[right] == quotemark + # Empty string special method + values[i], success, missing[i] = "", true, false + else + values[i], success, missing[i] = + bytestostring(buffer, left, right, missing_strings) + end + end + + if strings_as_factors && isstring + columns[j] = PooledDataArray(values, missing) + else + columns[j] = DataArray(values, missing) + end + end + + # Need to pass this in + column_names = DataFrames.generate_column_names(cols) + + return DataFrame(columns, column_names) end -function print_table(io::IO, - df::DataFrame, - separator::Char, - quotation_character::Char) - print_table(io, df, separator, quotation_character, true) +function readtable(io::IO; + header::Bool = true, + separator::Char = ',', + eol::Char = '\n', + quotemark::Char = '"', + missing_nonstrings::Vector{ASCIIString} = ["", "NA"], + missing_strings::Vector{ASCIIString} = ["NA"], + true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], + false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], + strings_as_factors::Bool = false, + ignore_padding::Bool = true, + decimal::Char = '.', + colnames::Vector{UTF8String} = Array(UTF8String, 0), + coltypes::Vector{Any} = Array(Any, 0), + nrows::Int = -1, + skip_lines_at_start::Int = 0, + clean_colnames::Bool = true, + skip_blanklines::Bool = true, + comment::Char = '#', + encoding::Symbol = :utf8) + + # Allocate buffers to conserve memory + buffer::Vector{Uint8} = Array(Uint8, 2^20) + eol_indices::Vector{Int} = Array(Int, 1) + separator_indices::Vector{Int} = Array(Int, 1) + + # Skip lines at the start + skipped_lines::Int = 0 + while skipped_lines < skip_lines_at_start + # TODO: Use a method that doesn't allocate buffers + readuntil(io, eol) + end + + # Deal with header + # TODO: Extract column names during this step + if header + column_text = readuntil(io, eol) + end + + # Separate text into fields + rows, bytes, eols, separators = + readnrows!(io, + buffer, + eol_indices, + separator_indices, + nrows, + eol, + separator, + quotemark) + + # Determine the number of columns + cols = fld(separators, rows) + 1 + + # Confirm that the number of columns is consistent across rows + if rem(separators, rows) != 0 + error(@sprintf "Every line must have %d columns" cols) + end + + # Parse contents of a buffer into a DataFrame + df = builddf(rows, + cols, + bytes, + eols, + separators, + buffer, + eol_indices, + separator_indices, + separator, + eol, + quotemark, + missing_nonstrings, + missing_strings, + true_strings, + false_strings, + ignore_padding, + strings_as_factors) + + # Clean up column names if requested + if clean_colnames + clean_colnames!(df) + end + + # Return the final DataFrame + return df end -function print_table(df::DataFrame, separator::Char, quotation_character::Char) - print_table(OUTPUT_STREAM, df, separator, quotation_character, true) +function readtable(filename::String; + header::Bool = true, + separator::Char = ',', + eol::Char = '\n', + quotemark::Char = '"', + missing_nonstrings::Vector{ASCIIString} = ["", "NA"], + missing_strings::Vector{ASCIIString} = ["NA"], + true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], + false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], + strings_as_factors::Bool = false, + ignore_padding::Bool = true, + decimal::Char = '.', + colnames::Vector{UTF8String} = Array(UTF8String, 0), + coltypes::Vector{Any} = Array(Any, 0), + nrows::Int = -1, + skip_lines_at_start::Int = 0, + clean_colnames::Bool = true, + skip_blanklines::Bool = true, + comment::Char = '#', + encoding::Symbol = :utf8) + + # Open an IO stream + io = open(filename, "r") + + # If user wants all rows, overestimate nrows + if nrows == -1 + nrows = filesize(filename) + end + + # Use the IO stream method for readtable() + df = readtable(io, + header = header, + separator = separator, + eol = eol, + quotemark = quotemark, + missing_nonstrings = missing_nonstrings, + missing_strings = missing_strings, + true_strings = true_strings, + false_strings = false_strings, + strings_as_factors = strings_as_factors, + ignore_padding = ignore_padding, + decimal = decimal, + colnames = colnames, + coltypes = coltypes, + nrows = nrows, + skip_lines_at_start = skip_lines_at_start, + clean_colnames = clean_colnames, + skip_blanklines = clean_colnames, + comment = comment, + encoding = encoding) + + # Close the IO stream + close(io) + + # Return the resulting DataFrame + return df end - -function print_table(df::DataFrame) - print_table(OUTPUT_STREAM, - df, - DEFAULT_SEPARATOR, - DEFAULT_QUOTATION_CHARACTER, - true) -end - -function write_table(filename::String, - df::DataFrame, - separator::Char, - quotation_character::Char) - io = open(filename, "w") - print_table(io, df, separator, quotation_character) - close(io) -end - -# Infer configuration settings from filename -function write_table(filename::String, df::DataFrame) - separator = determine_separator(filename) - quotation_character = DEFAULT_QUOTATION_CHARACTER - write_table(filename, df, separator, quotation_character) -end - -############################################################################## -# -# Binary serialization -# -############################################################################## - -# Wrappers for serialization -function save(filename, d) - f = open(filename, "w") - serialize(f, d) - close(f) -end - -function load_df(filename) - f = open(filename) - dd = deserialize(f) - close(f) - return dd -end - -# end diff --git a/test/data/complex_data.csv b/test/data/complex_data.csv index fbdf0a5783..15bea6b698 100644 --- a/test/data/complex_data.csv +++ b/test/data/complex_data.csv @@ -1,2 +1,3 @@ +C1,C2,C3,C4,C5 a,"b","c,d",1.0,1 a,"b","",, diff --git a/test/data/messy.csv b/test/data/messy.csv new file mode 100644 index 0000000000..251c075224 --- /dev/null +++ b/test/data/messy.csv @@ -0,0 +1,5 @@ +"A","B","C","D","E" +1 , 2, 3.1 ,"true","X" +3 , 4, 2.3 ,"false","Y" +NA,,NA,NA,NA +,NA,,,NA diff --git a/test/io.jl b/test/io.jl index 6b28c7529e..811c3dc0ee 100644 --- a/test/io.jl +++ b/test/io.jl @@ -1,222 +1,26 @@ -# unit tests of extract_string -x = "12345678" -@assert DataFrames.extract_string(x, 3, 6) == "3456" -@assert DataFrames.extract_string(x, 3, 3) == "3" -@assert DataFrames.extract_string(x, 3, 6, Set(3)) == "456" -@assert DataFrames.extract_string(x, 3, 6, Set(5, 3)) == "46" -x = "\"Güerín\",\"Sí\",\"No\"" -@assert DataFrames.extract_string(x, 2, 7, Set(3)) == "Gerí" -@assert DataFrames.extract_string("", 0,0,Set()) == "" - -# Handle the empty string properly -test0 = IOString("") -res0 = DataFrames.read_separated_line(test0, ',', '"') -@assert isempty(res0) - -test1 = IOString("I'm A,I'm B,I'm C,-0.3932755625236671,20.157657978753534") -res1 = DataFrames.read_separated_line(test1, ',', '"') -@assert res1[2] == "I'm B" -@assert res1[4] == "-0.3932755625236671" - -test2 = IOString("123, 456 , \"789\",TRUE") -res2 = DataFrames.read_separated_line(test2, ',', '"') -@assert res2[2] == "456" -@assert res2[3] == "789" - -test3 = IOString("123 ,456 , \"789\" , \"TRUE\"") -res3 = DataFrames.read_separated_line(test3, ',', '"') -@assert res3[2] == "456" -@assert res3[4] == "TRUE" - -test4 = IOString("123 ,456 , , \"TRUE\"") -res4 = DataFrames.read_separated_line(test4, ',', '"') -@assert res4[3] == "" -@assert res4[4] == "TRUE" - -test5 = IOString("123 ,456 , \"a\"\"b\" ,\"TRUE\"") -res5 = DataFrames.read_separated_line(test5, ',', '"') -@assert res5[3] == "a\"b" -@assert res5[4] == "TRUE" - -test6 = IOString("123 ,456 , \"a -b\" ,\"TRUE\"") -res6 = DataFrames.read_separated_line(test6, ',', '"') -@assert res6[3] == "a\nb" -@assert res6[4] == "TRUE" - -# Should this be one NA? -# test7 = IOString("") -# res7 = DataFrames.read_separated_line(test7, ',', '"') -# @assert length(res7) == 1 - -test8 = IOString("a,\"b\",\"cd\",1.0,1\na,\"b\",\"cd\",1.0,1") -res8 = DataFrames.read_separated_line(test8, ',', '"') -@assert length(res8) == 5 -@assert res8[5] == "1" - -test9 = IOString("\"Güerín\",\"Sí\",\"No\"") -res9 = DataFrames.read_separated_line(test9, ',', '"') -@assert res9[2] == "Sí" - -test10 = IOString("1,2,3,,") -res10 = DataFrames.read_separated_line(test10, ',', '"') -@assert length(res10) == 5 - -filename = Pkg.dir("DataFrames", "test", "data", "simple_data.csv") -open(filename,"r") do io - t1 = read_table(io, ',', '"', DataFrames.DEFAULT_MISSINGNESS_INDICATORS, false, ["1","2","3","4","5"], 2) - @assert nrow(t1) == 2 - @assert t1[1,2] == "b" +using DataFrames + +filenames = ["test/data/big_data.csv", + "test/data/bool.csv", + "test/data/complex_data.csv", + "test/data/corrupt_utf8.csv", + "test/data/corrupt_utf8_short.csv", + "test/data/messy.csv", + "test/data/movies.csv", + "test/data/sample_data.csv", + "test/data/simple_data.csv", + "test/data/space_after_delimiter.csv", + "test/data/space_around_delimiter.csv", + "test/data/space_before_delimiter.csv", + "test/data/types.csv", + "test/data/utf8.csv"] + +for filename in filenames + df = readtable(filename) end +filename = "test/data/sample_data.tsv" +df = readtable(filename, separator = '\t') - -# Test separated line splitting -# -# TODO: Test minimially-quoted -# TODO: Test only-strings-quoted - -separators = [',', '\t', ' '] -quotation_characters = ['\'', '"'] - -# Test all-entries-quoted for all quote characters and separators -items = {"a", "b", "c,d", "1.0", "1"} -item_buffer = Array(UTF8String, length(items)) - -# TODO: make this work with new splitting code -# for separator in separators -# for quotation_character in quotation_characters -# line = join(map(x -> string(quotation_character, x, quotation_character), -# items), -# separator) -# current_item_buffer = Array(Char, strlen(line)) -# split_results = DataFrames.split_separated_line(line, separator, quotation_character, item_buffer, current_item_buffer) -# @assert all(split_results .== items) -# end -# end - -# Test reading -@assert DataFrames.determine_separator("blah.csv") == ',' -@assert DataFrames.determine_separator("blah.tsv") == '\t' -@assert DataFrames.determine_separator("blah.wsv") == ' ' -# @assert DataFrames.determine_separator("blah.txt") -# Need to change to use @expects to test that error gets raised - -filename = Pkg.dir("DataFrames", "test", "data", "big_data.csv") -separator = DataFrames.determine_separator(filename) -quotation_character = '"' -missingness_indicators = ["", "NA"] -header = true -column_names = UTF8String["A", "B", "C", "D", "E"] -minibatch_size = 10 - -file = open(filename, "r") -readline(file) -minibatch = read_minibatch(file, - separator, - quotation_character, - missingness_indicators, - column_names, - minibatch_size) -@assert nrow(minibatch) == minibatch_size -@assert ncol(minibatch) == length(column_names) -@assert colnames(minibatch) == column_names -@assert eltype(minibatch[:, 1]) == UTF8String -@assert eltype(minibatch[:, 2]) == UTF8String -@assert eltype(minibatch[:, 3]) == UTF8String -@assert eltype(minibatch[:, 4]) == Float64 -@assert eltype(minibatch[:, 5]) == Float64 -close(file) - -@elapsed df = read_table(filename) -@assert nrow(df) == 10_000 -@assert ncol(df) == 5 -@assert colnames(df) == column_names -@assert typeof(df[:, 1]) == DataVector{UTF8String} -@assert typeof(df[:, 2]) == DataVector{UTF8String} -@assert typeof(df[:, 3]) == DataVector{UTF8String} -@assert typeof(df[:, 4]) == DataVector{Float64} -@assert typeof(df[:, 5]) == DataVector{Float64} - -# TODO: Split apart methods that perform seek() from those that don't -text_data = convert(Array{UTF8String, 2}, (["1" "3" "A"; "2" "3" "NA"; "3" "3.1" "C"])) - -true_df = DataFrame(quote - x1 = DataArray([1, 2, 3]) - x2 = DataArray([3, 3, 3.1]) - x3 = DataArray(UTF8String["A", "", "C"], [false, true, false]) - end) -df = DataFrames.convert_to_dataframe(text_data, - ["", "NA"], - ["x1", "x2", "x3"]) -@assert isequal(df, true_df) -@assert isequal(eltype(df["x1"]), Int64) -@assert isequal(eltype(df["x2"]), Float64) -@assert isequal(eltype(df["x3"]), UTF8String) - -filename = Pkg.dir("DataFrames", "test", "data", "big_data.csv") -separator = DataFrames.determine_separator(filename) -quotation_character = '"' -missingness_indicators = ["", "NA"] -header = true - -nrows = DataFrames.determine_nrows(filename, header) -@assert nrows == 10_000 - -io = open(filename, "r") - -column_names = DataFrames.determine_column_names(io, separator, quotation_character, header) -@assert column_names == UTF8String["A", "B", "C", "D", "E"] - -seek(io, 0) -if header - readline(io) -end -text_data = DataFrames.read_separated_text(io, nrows, separator, quotation_character) -@assert eltype(text_data) == UTF8String -@assert size(text_data) == (10_000, 5) - -df = read_table(io, - separator, - quotation_character, - missingness_indicators, - header, - column_names, - nrows) -@assert nrow(df) == 10_000 -@assert ncol(df) == 5 -@assert colnames(df) == column_names -@assert eltype(df[:, 1]) == UTF8String -@assert eltype(df[:, 2]) == UTF8String -@assert eltype(df[:, 3]) == UTF8String -@assert eltype(df[:, 4]) == Float64 -@assert eltype(df[:, 5]) == Float64 - -df = read_table(filename) -@assert nrow(df) == 10_000 -@assert ncol(df) == 5 -@assert colnames(df) == column_names -@assert eltype(df[:, 1]) == UTF8String -@assert eltype(df[:, 2]) == UTF8String -@assert eltype(df[:, 3]) == UTF8String -@assert eltype(df[:, 4]) == Float64 -@assert eltype(df[:, 5]) == Float64 - -# TODO: Add test case in which data file has header, but no rows -# Example "RDatasets/data/Zelig/sna.ex.csv" -# "","Var1","Var2","Var3","Var4","Var5" - -# Additional data sets - -@elapsed df = read_table("test/data/big_data.csv") -# TODO: Make this faster -@elapsed df = read_table("test/data/movies.csv") -# TODO: Release this data set publicly -#@elapsed df = read_table("test/data/bigrams.tsv") -@elapsed df = read_table("test/data/utf8.csv") -@elapsed df = read_table("test/data/bool.csv") -@elapsed df = read_table("test/data/types.csv") -@elapsed df = read_table("test/data/space_after_delimiter.csv") -@elapsed df = read_table("test/data/space_before_delimiter.csv") -@elapsed df = read_table("test/data/space_around_delimiter.csv") -@elapsed df = read_table("test/data/corrupt_utf8.csv") +filename = "test/data/sample_data.wsv" +df = readtable(filename, separator = ' ') From a1b5db98568bebabb9a51669be775fdc2ff3173a Mon Sep 17 00:00:00 2001 From: John Myles White Date: Thu, 20 Jun 2013 09:46:51 -0400 Subject: [PATCH 2/5] Implement additional IO features Change some keyword argument names Allow user to select whether to ignore whitespace padding Allow user to decide whether to convert strings to factors Read in column names correctly Allow user to skip lines at the start of a file Restore printtable and writetable Fix test that broke with Julia changes --- src/io.jl | 1042 ++++++++++++++++++++++++++++++-------------------- test/data.jl | 2 +- 2 files changed, 624 insertions(+), 420 deletions(-) diff --git a/src/io.jl b/src/io.jl index ea46df8123..7494de5d6f 100644 --- a/src/io.jl +++ b/src/io.jl @@ -1,96 +1,96 @@ function readnrows!(io::IO, - buffer::Vector{Uint8}, - eol_indices::Vector{Int}, - separator_indices::Vector{Int}, - nrows::Int, - eol::Char, - separator::Char, - quotemark::Char) - bytesread::Int = 0 - nrowsread::Int = 0 - eolsread::Int = 0 - separatorsread::Int = 0 - - inquotes::Bool = false - inescape::Bool = false - - chr::Uint8 = ' ' - - buffer_size::Int = length(buffer) - eol_size::Int = length(eol_indices) - separator_size::Int = length(separator_indices) - - while !eof(io) && nrowsread < nrows - bytesread += 1 - chr = read(io, Uint8) - - if buffer_size < bytesread - buffer_size *= 2 - resize!(buffer, buffer_size) - end - buffer[bytesread] = chr - - if !inquotes - if chr == eol - nrowsread +=1 - eolsread +=1 - if eol_size < eolsread - eol_size *= 2 - resize!(eol_indices, eol_size) - end - eol_indices[eolsread] = bytesread - end - - if chr == separator - separatorsread += 1 - if separator_size < separatorsread - separator_size *= 2 - resize!(separator_indices, separator_size) - end - separator_indices[separatorsread] = bytesread - end - - if chr == quotemark && !inescape - inquotes = true - end - else - if chr == quotemark && !inescape - inquotes = false - end - end - - if chr == '\\' - inescape = true - else - inescape = false - end - end - - return nrowsread, bytesread, eolsread, separatorsread + buffer::Vector{Uint8}, + eol_indices::Vector{Int}, + separator_indices::Vector{Int}, + nrows::Int, + eol::Char, + separator::Char, + quotemark::Char) + bytesread::Int = 0 + nrowsread::Int = 0 + eolsread::Int = 0 + separatorsread::Int = 0 + + inquotes::Bool = false + inescape::Bool = false + + chr::Uint8 = ' ' + + buffer_size::Int = length(buffer) + eol_size::Int = length(eol_indices) + separator_size::Int = length(separator_indices) + + while !eof(io) && nrowsread < nrows + bytesread += 1 + chr = read(io, Uint8) + + if buffer_size < bytesread + buffer_size *= 2 + resize!(buffer, buffer_size) + end + buffer[bytesread] = chr + + if !inquotes + if chr == eol + nrowsread +=1 + eolsread +=1 + if eol_size < eolsread + eol_size *= 2 + resize!(eol_indices, eol_size) + end + eol_indices[eolsread] = bytesread + end + + if chr == separator + separatorsread += 1 + if separator_size < separatorsread + separator_size *= 2 + resize!(separator_indices, separator_size) + end + separator_indices[separatorsread] = bytesread + end + + if chr == quotemark && !inescape + inquotes = true + end + else + if chr == quotemark && !inescape + inquotes = false + end + end + + if chr == '\\' + inescape = true + else + inescape = false + end + end + + return nrowsread, bytesread, eolsread, separatorsread end function buffermatch(buffer::Vector{Uint8}, - left::Int, - right::Int, - exemplars::Vector{ASCIIString}) - l::Int = right - left + 1 - - for index in 1:length(exemplars) - exemplar::ASCIIString = exemplars[index] - if length(exemplar) == l - isamatch::Bool = true - - for i in 0:(l - 1) - isamatch &= buffer[left + i] == exemplar[1 + i] - end - - if isamatch - return true - end - end - end - - return false + left::Int, + right::Int, + exemplars::Vector{ASCIIString}) + l::Int = right - left + 1 + + for index in 1:length(exemplars) + exemplar::ASCIIString = exemplars[index] + if length(exemplar) == l + isamatch::Bool = true + + for i in 0:(l - 1) + isamatch &= buffer[left + i] == exemplar[1 + i] + end + + if isamatch + return true + end + end + end + + return false end # All of these functions return three items: @@ -98,371 +98,575 @@ end # TODO: Align more closely with parseint code function bytestoint(buffer::Vector{Uint8}, - left::Int, - right::Int, - missing_nonstrings::Vector{ASCIIString}) - if buffermatch(buffer, left, right, missing_nonstrings) - return 0, true, true - end - - value::Int = 0 - power::Int = 1 - index::Int = right - byte::Uint8 = buffer[index] - - while index > left - if '0' <= byte <= '9' - value += (byte - '0') * power - power *= 10 - else - return value, false, false - end - index -= 1 - byte = buffer[index] - end - - if byte == '-' - return -value, true, false - elseif byte == '+' - return value, true, false - elseif '0' <= byte <= '9' - value += (byte - '0') * power - return value, true, false - else - return value, false, false - end + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_nonstrings) + return 0, true, true + end + + value::Int = 0 + power::Int = 1 + index::Int = right + byte::Uint8 = buffer[index] + + while index > left + if '0' <= byte <= '9' + value += (byte - '0') * power + power *= 10 + else + return value, false, false + end + index -= 1 + byte = buffer[index] + end + + if byte == '-' + return -value, true, false + elseif byte == '+' + return value, true, false + elseif '0' <= byte <= '9' + value += (byte - '0') * power + return value, true, false + else + return value, false, false + end end let out::Vector{Float64} = Array(Float64, 1) - global bytestofloat - function bytestofloat(buffer::Vector{Uint8}, - left::Int, - right::Int, - missing_nonstrings::Vector{ASCIIString}) - if buffermatch(buffer, left, right, missing_nonstrings) - return 0.0, true, true - end - - success = ccall(:jl_substrtod, - Int32, - (Ptr{Uint8}, Int, Int, Ptr{Float64}), - buffer, - left - 1, - right - left + 1, - out) == 0 - - return out[1], success, false - end + global bytestofloat + function bytestofloat(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_nonstrings) + return 0.0, true, true + end + + success = ccall(:jl_substrtod, + Int32, + (Ptr{Uint8}, Int, Int, Ptr{Float64}), + buffer, + left - 1, + right - left + 1, + out) == 0 + + return out[1], success, false + end end function bytestobool(buffer::Vector{Uint8}, - left::Int, - right::Int, - missing_nonstrings::Vector{ASCIIString}, - true_strings::Vector{ASCIIString}, - false_strings::Vector{ASCIIString}) - if buffermatch(buffer, left, right, missing_nonstrings) - return false, true, true - end - - if buffermatch(buffer, left, right, true_strings) - return true, true, false - elseif buffermatch(buffer, left, right, false_strings) - return false, true, false - else - return false, false, false - end + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}, + true_strings::Vector{ASCIIString}, + false_strings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_nonstrings) + return false, true, true + end + + if buffermatch(buffer, left, right, true_strings) + return true, true, false + elseif buffermatch(buffer, left, right, false_strings) + return false, true, false + else + return false, false, false + end end function bytestostring(buffer::Vector{Uint8}, - left::Int, - right::Int, - missing_strings::Vector{ASCIIString}) - if buffermatch(buffer, left, right, missing_strings) - return "", true, true - end - - return bytestring(buffer[left:right]), true, false + left::Int, + right::Int, + missing_strings::Vector{ASCIIString}) + if buffermatch(buffer, left, right, missing_strings) + return "", true, true + end + + return bytestring(buffer[left:right]), true, false end function builddf(rows::Int, - cols::Int, - bytes::Int, - eols::Int, - separators::Int, - buffer::Vector{Uint8}, - eol_indices::Vector{Int}, - separator_indices::Vector{Int}, - separator::Char, - eol::Char, - quotemark::Char, - missing_nonstrings::Vector{ASCIIString}, - missing_strings::Vector{ASCIIString}, - true_strings::Vector{ASCIIString}, - false_strings::Vector{ASCIIString}, - ignore_padding::Bool, - strings_as_factors::Bool) - columns::Vector{Any} = Array(Any, cols) - - for j in 1:cols - values = Array(Int, rows) - missing::BitVector = falses(rows) - isint::Bool = true - isfloat::Bool = true - isbool::Bool = true - isstring::Bool = true - - i::Int = 0 - - while i < rows - i += 1 - - # Determine left and right boundaries of field - if j == 1 - if i == 1 - left = 1 - else - left = eol_indices[i - 1] + 1 - end - else - left = separator_indices[(i - 1) * (cols - 1) + j - 1] + 1 - end - - if j == cols - if i == rows - if buffer[bytes] == eol - right = bytes - 1 - else - right = bytes - end - else - right = eol_indices[i] - 1 - end - else - right = separator_indices[(i - 1) * (cols - 1) + j] - 1 - end - - # Ignore left-and-right whitespace padding - if ignore_padding - while left < right && buffer[left] == ' ' - left += 1 - end - while left < right && buffer[right] == ' ' - right -= 1 - end - end - - # (1) Try to parse values as Int's - if isint - values[i], success, missing[i] = - bytestoint(buffer, left, right, missing_nonstrings) - if success - continue - else - isint = false - values = convert(Array{Float64}, values) - end - end - - # (2) Try to parse as Float64's - if isfloat - values[i], success, missing[i] = - bytestofloat(buffer, left, right, missing_nonstrings) - if success - continue - else - isfloat = false - values = Array(Bool, rows) - i = 1 - end - end - - # If we go this far, we should ignore quote marks on the boundaries - while left < right && buffer[left] == quotemark - left += 1 - end - while left < right && buffer[right] == quotemark - right -= 1 - end - - # (3) Try to parse as Bool's - if isbool - values[i], success, missing[i] = - bytestobool(buffer, left, right, - missing_nonstrings, - true_strings, false_strings) - if success - continue - else - isbool = false - values = Array(UTF8String, rows) - i = 1 - end - end - - # (4) Fallback to UTF8String - if left == right && buffer[right] == quotemark - # Empty string special method - values[i], success, missing[i] = "", true, false - else - values[i], success, missing[i] = - bytestostring(buffer, left, right, missing_strings) - end - end - - if strings_as_factors && isstring - columns[j] = PooledDataArray(values, missing) - else - columns[j] = DataArray(values, missing) - end - end - - # Need to pass this in - column_names = DataFrames.generate_column_names(cols) - - return DataFrame(columns, column_names) + cols::Int, + bytes::Int, + eols::Int, + separators::Int, + buffer::Vector{Uint8}, + eol_indices::Vector{Int}, + separator_indices::Vector{Int}, + separator::Char, + eol::Char, + quotemark::Char, + missing_nonstrings::Vector{ASCIIString}, + missing_strings::Vector{ASCIIString}, + true_strings::Vector{ASCIIString}, + false_strings::Vector{ASCIIString}, + ignorespace::Bool, + makefactors::Bool) + columns::Vector{Any} = Array(Any, cols) + + for j in 1:cols + values = Array(Int, rows) + missing::BitVector = falses(rows) + isint::Bool = true + isfloat::Bool = true + isbool::Bool = true + isstring::Bool = true + + i::Int = 0 + + while i < rows + i += 1 + + # Determine left and right boundaries of field + if j == 1 + if i == 1 + left = 1 + else + left = eol_indices[i - 1] + 1 + end + else + left = separator_indices[(i - 1) * (cols - 1) + j - 1] + 1 + end + + if j == cols + if i == rows + if buffer[bytes] == eol + right = bytes - 1 + else + right = bytes + end + else + right = eol_indices[i] - 1 + end + else + right = separator_indices[(i - 1) * (cols - 1) + j] - 1 + end + + # Ignore left-and-right whitespace padding + if ignorespace + while left < right && buffer[left] == ' ' + left += 1 + end + while left < right && buffer[right] == ' ' + right -= 1 + end + end + + # (1) Try to parse values as Int's + if isint + values[i], success, missing[i] = + bytestoint(buffer, left, right, missing_nonstrings) + if success + continue + else + isint = false + values = convert(Array{Float64}, values) + end + end + + # (2) Try to parse as Float64's + if isfloat + values[i], success, missing[i] = + bytestofloat(buffer, left, right, missing_nonstrings) + if success + continue + else + isfloat = false + values = Array(Bool, rows) + i = 1 + end + end + + # If we go this far, we should ignore quote marks on the boundaries + while left < right && buffer[left] == quotemark + left += 1 + end + while left < right && buffer[right] == quotemark + right -= 1 + end + + # (3) Try to parse as Bool's + if isbool + values[i], success, missing[i] = + bytestobool(buffer, left, right, + missing_nonstrings, + true_strings, false_strings) + if success + continue + else + isbool = false + values = Array(UTF8String, rows) + i = 1 + end + end + + # (4) Fallback to UTF8String + if left == right && buffer[right] == quotemark + # Empty string special method + values[i], success, missing[i] = "", true, false + else + values[i], success, missing[i] = + bytestostring(buffer, left, right, missing_strings) + end + end + + if makefactors && isstring + columns[j] = PooledDataArray(values, missing) + else + columns[j] = DataArray(values, missing) + end + end + + # Need to pass this in + column_names = DataFrames.generate_column_names(cols) + + return DataFrame(columns, column_names) end +function parseline(buffer::Vector{Uint8}, + upper::Int, + eol::Char, + separator::Char, + quotemark::Char) + column_names = Array(UTF8String, 0) + if upper == 1 + return column_names + end + + left::Int = 1 + right::Int = -1 + index::Int = -1 + atbound::Bool = false + inquotes::Bool = false + inescape::Bool = false + chr::Uint8 = uint8(' ') + + while left < upper + chr = buffer[left] + while chr == ' ' + left += 1 + chr = buffer[left] + end + + right = left - 1 + + atbound = false + while right < upper && !atbound + right += 1 + chr = buffer[right] + if !inquotes + if chr == separator + atbound = true + elseif chr == quotemark && !inescape + inquotes = true + end + else + if chr == quotemark && !inescape + inquotes = false + end + end + + if chr == '\\' + inescape = true + else + inescape = false + end + end + + inquotes = false + inescape = false + + if buffer[left] == quotemark + left += 1 + end + + index = right + chr = buffer[index] + while index > left && + (chr == ' ' || chr == eol || + chr == separator || chr == quotemark) + index -= 1 + chr = buffer[index] + end + + push!(column_names, bytestring(buffer[left:index])) + + left = right + 1 + end + + return column_names +end + +# TODO: Respect coltypes +# TODO: Skip blanklines +# TODO: Skip comment lines +# TODO: Use file encoding information function readtable(io::IO; - header::Bool = true, - separator::Char = ',', - eol::Char = '\n', - quotemark::Char = '"', - missing_nonstrings::Vector{ASCIIString} = ["", "NA"], - missing_strings::Vector{ASCIIString} = ["NA"], - true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], - false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], - strings_as_factors::Bool = false, - ignore_padding::Bool = true, - decimal::Char = '.', + header::Bool = true, + separator::Char = ',', + eol::Char = '\n', + quotemark::Char = '"', + missing_nonstrings::Vector{ASCIIString} = ["", "NA"], + missing_strings::Vector{ASCIIString} = ["NA"], + true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], + false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], + makefactors::Bool = false, + ignorespace::Bool = true, + decimal::Char = '.', colnames::Vector{UTF8String} = Array(UTF8String, 0), coltypes::Vector{Any} = Array(Any, 0), nrows::Int = -1, - skip_lines_at_start::Int = 0, - clean_colnames::Bool = true, - skip_blanklines::Bool = true, + skipstartlines::Int = 0, + cleancolnames::Bool = true, + skipblanklines::Bool = true, comment::Char = '#', encoding::Symbol = :utf8) - # Allocate buffers to conserve memory - buffer::Vector{Uint8} = Array(Uint8, 2^20) - eol_indices::Vector{Int} = Array(Int, 1) - separator_indices::Vector{Int} = Array(Int, 1) - - # Skip lines at the start - skipped_lines::Int = 0 - while skipped_lines < skip_lines_at_start - # TODO: Use a method that doesn't allocate buffers - readuntil(io, eol) - end - - # Deal with header - # TODO: Extract column names during this step - if header - column_text = readuntil(io, eol) - end - - # Separate text into fields - rows, bytes, eols, separators = - readnrows!(io, - buffer, - eol_indices, - separator_indices, - nrows, - eol, - separator, - quotemark) - - # Determine the number of columns - cols = fld(separators, rows) + 1 - - # Confirm that the number of columns is consistent across rows - if rem(separators, rows) != 0 - error(@sprintf "Every line must have %d columns" cols) - end - - # Parse contents of a buffer into a DataFrame - df = builddf(rows, - cols, - bytes, - eols, - separators, - buffer, - eol_indices, - separator_indices, - separator, - eol, - quotemark, - missing_nonstrings, - missing_strings, - true_strings, - false_strings, - ignore_padding, - strings_as_factors) - - # Clean up column names if requested - if clean_colnames - clean_colnames!(df) - end - - # Return the final DataFrame - return df + # Allocate buffers to conserve memory + buffer::Vector{Uint8} = Array(Uint8, 2^20) + eol_indices::Vector{Int} = Array(Int, 1) + separator_indices::Vector{Int} = Array(Int, 1) + chr::Uint8 = uint8(' ') + + # Skip lines at the start + skipped_lines::Int = 0 + while skipped_lines < skipstartlines + while chr != eol + chr = read(io, Uint8) + end + skipped_lines += 1 + end + + # Deal with header + if header + chr = uint8(' ') + headerbytesread = 0 + headerbytes = Array(Uint8, 2^16) + headerbytes_size = length(headerbytes) + while chr != eol + chr = read(io, Uint8) + headerbytesread += 1 + if headerbytesread > headerbytes_size + headerbytes_size *= 2 + resize!(headerbytes, headerbytes_size) + end + headerbytes[headerbytesread] = chr + end + column_names = + parseline(headerbytes, headerbytesread, eol, separator, quotemark) + end + + # Separate text into fields + rows, bytes, eols, separators = + readnrows!(io, + buffer, + eol_indices, + separator_indices, + nrows, + eol, + separator, + quotemark) + + # Determine the number of columns + cols = fld(separators, rows) + 1 + + # Confirm that the number of columns is consistent across rows + if rem(separators, rows) != 0 + error(@sprintf "Every line must have %d columns" cols) + end + + # Parse contents of a buffer into a DataFrame + df = builddf(rows, + cols, + bytes, + eols, + separators, + buffer, + eol_indices, + separator_indices, + separator, + eol, + quotemark, + missing_nonstrings, + missing_strings, + true_strings, + false_strings, + ignorespace, + makefactors) + + # Set up column names based on user input and header + if isempty(colnames) + if header + colnames!(df, column_names) + end + else + colnames!(df, colnames) + end + + # Clean up column names if requested + if cleancolnames + clean_colnames!(df) + end + + # Return the final DataFrame + return df end function readtable(filename::String; - header::Bool = true, - separator::Char = ',', - eol::Char = '\n', - quotemark::Char = '"', - missing_nonstrings::Vector{ASCIIString} = ["", "NA"], - missing_strings::Vector{ASCIIString} = ["NA"], - true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], - false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], - strings_as_factors::Bool = false, - ignore_padding::Bool = true, - decimal::Char = '.', + header::Bool = true, + separator::Char = getseparator(filename), + eol::Char = '\n', + quotemark::Char = '"', + missing_nonstrings::Vector{ASCIIString} = ["", "NA"], + missing_strings::Vector{ASCIIString} = ["NA"], + true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], + false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], + makefactors::Bool = false, + ignorespace::Bool = true, + decimal::Char = '.', colnames::Vector{UTF8String} = Array(UTF8String, 0), coltypes::Vector{Any} = Array(Any, 0), nrows::Int = -1, - skip_lines_at_start::Int = 0, - clean_colnames::Bool = true, - skip_blanklines::Bool = true, + skipstartlines::Int = 0, + cleancolnames::Bool = false, + skipblanklines::Bool = true, comment::Char = '#', encoding::Symbol = :utf8) - # Open an IO stream - io = open(filename, "r") - - # If user wants all rows, overestimate nrows - if nrows == -1 - nrows = filesize(filename) - end - - # Use the IO stream method for readtable() - df = readtable(io, - header = header, - separator = separator, - eol = eol, - quotemark = quotemark, - missing_nonstrings = missing_nonstrings, - missing_strings = missing_strings, - true_strings = true_strings, - false_strings = false_strings, - strings_as_factors = strings_as_factors, - ignore_padding = ignore_padding, - decimal = decimal, + # Open an IO stream + io = open(filename, "r") + + # If user wants all rows, overestimate nrows + if nrows == -1 + nrows = filesize(filename) + end + + # Use the IO stream method for readtable() + df = readtable(io, + header = header, + separator = separator, + eol = eol, + quotemark = quotemark, + missing_nonstrings = missing_nonstrings, + missing_strings = missing_strings, + true_strings = true_strings, + false_strings = false_strings, + makefactors = makefactors, + ignorespace = ignorespace, + decimal = decimal, colnames = colnames, coltypes = coltypes, nrows = nrows, - skip_lines_at_start = skip_lines_at_start, - clean_colnames = clean_colnames, - skip_blanklines = clean_colnames, + skipstartlines = skipstartlines, + cleancolnames = cleancolnames, + skipblanklines = cleancolnames, comment = comment, encoding = encoding) - # Close the IO stream - close(io) + # Close the IO stream + close(io) + + # Return the resulting DataFrame + return df +end + +function getseparator(filename::String) + if ismatch(r"csv$", filename) + return ',' + elseif ismatch(r"tsv$", filename) + return '\t' + elseif ismatch(r"wsv$", filename) + return ' ' + else + error("Unable to determine separator used in $filename") + end +end + +############################################################################## +# +# Text output +# +############################################################################## + +quoted(val::String, quotemark::Char) = string(quotemark, val, quotemark) +quoted(val::Real, quotemark::Char) = string(val) +quoted(val::Any, quotemark::Char) = string(quotemark, string(val), quotemark) + +# TODO: Increase precision of string representation of Float64's +function printtable(io::IO, + df::DataFrame; + separator::Char = ',', + quotemark::Char = '"', + header::Bool = true) + n, p = size(df) + if header + column_names = colnames(df) + for j in 1:p + if j < p + print(io, quoted(column_names[j], quotemark)) + print(io, separator) + else + println(io, quoted(column_names[j], quotemark)) + end + end + end + for i in 1:n + for j in 1:p + if j < p + print(io, quoted(df[i, j], quotemark)) + print(io, separator) + else + println(io, quoted(df[i, j], quotemark)) + end + end + end + return +end + +function printtable(df::DataFrame; + separator::Char = ',', + quotemark::Char = '"', + header::Bool = true) + printtable(OUTPUT_STREAM, + df, + separator = separator, + quotemark = quotemark, + header = header) + return +end + +# Infer configuration settings from filename +function writetable(filename::String, + df::DataFrame; + separator::Char = getseparator(filename), + quotemark::Char = '"', + header::Bool = true) + io = open(filename, "w") + printtable(io, + df, + separator = separator, + quotemark = quotemark, + header = header) + close(io) + return +end + +############################################################################## +# +# Binary serialization +# +############################################################################## + +function save(filename::String, df::AbstractDataFrame) + f = open(filename, "w") + serialize(f, df) + close(f) + return +end - # Return the resulting DataFrame - return df +function load_df(filename::String) + f = open(filename) + dd = deserialize(f) + close(f) + return dd end diff --git a/test/data.jl b/test/data.jl index ce947c0299..97e8e4799e 100644 --- a/test/data.jl +++ b/test/data.jl @@ -93,7 +93,7 @@ test_group("DataVector to something else") @assert all(convert(Vector{Int}, dvint2) .== [5:8]) @assert all([i + 1 for i in dvint2] .== [6:9]) @assert all([length(x)::Int for x in dvstr] == [3, 3, 1, 4]) -@assert repr(dvint) == "[1, 2, NA, 4]" +@assert repr(dvint) == "[1,2,NA,4]" test_group("PooledDataVector to something else") @assert all(removeNA(pdvstr) .== ["one", "one", "two", "two", "one", "one"]) From 7587b7b73098793083aea4b86ef8c63bdfa92c3e Mon Sep 17 00:00:00 2001 From: John Myles White Date: Thu, 20 Jun 2013 11:15:59 -0400 Subject: [PATCH 3/5] Fix small bugs re. naming and EOF --- src/DataFrames.jl | 4 ++-- src/io.jl | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index c318c59a92..68ca1fba8b 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -155,7 +155,7 @@ export # reconcile_groups, PooledDataArray, PooledDataMatrix, PooledDataVector, - print_table, + printtable, range, rbind, read_minibatch, @@ -198,7 +198,7 @@ export # reconcile_groups, within!, within, without, - write_table, + writetable, xtab, xtabs, stack_df, diff --git a/src/io.jl b/src/io.jl index 7494de5d6f..e3e66c80ab 100644 --- a/src/io.jl +++ b/src/io.jl @@ -261,6 +261,8 @@ function builddf(rows::Int, else isint = false values = convert(Array{Float64}, values) + values[i], success, missing[i] = + bytestoint(buffer, left, right, missing_nonstrings) end end @@ -431,7 +433,7 @@ function readtable(io::IO; # Skip lines at the start skipped_lines::Int = 0 while skipped_lines < skipstartlines - while chr != eol + while !eof(io) && chr != eol chr = read(io, Uint8) end skipped_lines += 1 @@ -443,7 +445,7 @@ function readtable(io::IO; headerbytesread = 0 headerbytes = Array(Uint8, 2^16) headerbytes_size = length(headerbytes) - while chr != eol + while !eof(io) && chr != eol chr = read(io, Uint8) headerbytesread += 1 if headerbytesread > headerbytes_size From dbcd128068c5256dde87c09a521c5c1e61873f06 Mon Sep 17 00:00:00 2001 From: John Myles White Date: Thu, 20 Jun 2013 12:36:18 -0400 Subject: [PATCH 4/5] Bug fixes Deal with missing EOL's at the end of data files Encode completely blank string fields as NA's Provide more diagnostic information when data can't be read --- src/io.jl | 46 ++++++++++++++++++++++++++++++++++++++------- test/data/messy.csv | 1 + 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/io.jl b/src/io.jl index e3e66c80ab..09bf4e6283 100644 --- a/src/io.jl +++ b/src/io.jl @@ -66,6 +66,16 @@ function readnrows!(io::IO, end end + # Deal with files that do not include a final EOL + if eof(io) && chr != eol + nrowsread += 1 + if eol_size < eolsread + eol_size *= 2 + resize!(eol_indices, eol_size) + end + eol_indices[eolsread + 1] = bytesread + 1 + end + return nrowsread, bytesread, eolsread, separatorsread end @@ -101,7 +111,8 @@ function bytestoint(buffer::Vector{Uint8}, left::Int, right::Int, missing_nonstrings::Vector{ASCIIString}) - if buffermatch(buffer, left, right, missing_nonstrings) + + if left > right || buffermatch(buffer, left, right, missing_nonstrings) return 0, true, true end @@ -139,7 +150,7 @@ let out::Vector{Float64} = Array(Float64, 1) left::Int, right::Int, missing_nonstrings::Vector{ASCIIString}) - if buffermatch(buffer, left, right, missing_nonstrings) + if left > right || buffermatch(buffer, left, right, missing_nonstrings) return 0.0, true, true end @@ -161,7 +172,7 @@ function bytestobool(buffer::Vector{Uint8}, missing_nonstrings::Vector{ASCIIString}, true_strings::Vector{ASCIIString}, false_strings::Vector{ASCIIString}) - if buffermatch(buffer, left, right, missing_nonstrings) + if left > right || buffermatch(buffer, left, right, missing_nonstrings) return false, true, true end @@ -177,7 +188,12 @@ end function bytestostring(buffer::Vector{Uint8}, left::Int, right::Int, - missing_strings::Vector{ASCIIString}) + missing_strings::Vector{ASCIIString}, + quotemark::Char) + if left > right && buffer[right] != quotemark + return "", true, true + end + if buffermatch(buffer, left, right, missing_strings) return "", true, true end @@ -308,7 +324,7 @@ function builddf(rows::Int, values[i], success, missing[i] = "", true, false else values[i], success, missing[i] = - bytestostring(buffer, left, right, missing_strings) + bytestostring(buffer, left, right, missing_strings, quotemark) end end @@ -473,8 +489,24 @@ function readtable(io::IO; cols = fld(separators, rows) + 1 # Confirm that the number of columns is consistent across rows - if rem(separators, rows) != 0 - error(@sprintf "Every line must have %d columns" cols) + if separators != rows * (cols - 1) + linenumber = -1 + j = -1 + for i in 1:rows + bound = eol_indices[i] + j = 1 + while separator_indices[(i - 1) * (cols - 1) + j] < bound + j += 1 + end + if j != cols + linenumber = i + break + end + end + msg1 = @sprintf "Every line must have %d columns\n" cols + msg2 = @sprintf "Reading failed at line %d with %d columns\n" linenumber j + msg3 = @sprintf "Saw %d rows, but %d fields\n" rows separators + error(string(msg1, msg2, msg3)) end # Parse contents of a buffer into a DataFrame diff --git a/test/data/messy.csv b/test/data/messy.csv index 251c075224..b3e96ef21b 100644 --- a/test/data/messy.csv +++ b/test/data/messy.csv @@ -3,3 +3,4 @@ 3 , 4, 2.3 ,"false","Y" NA,,NA,NA,NA ,NA,,,NA +,,,, From 54f43745caafd7720c64ccb7abaeee22b2796fc0 Mon Sep 17 00:00:00 2001 From: John Myles White Date: Thu, 20 Jun 2013 14:23:29 -0400 Subject: [PATCH 5/5] Deprecate *_table for *table --- src/DataFrames.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 68ca1fba8b..f9212fa6db 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -221,6 +221,16 @@ export # reconcile_groups, read_rda, vecbind +############################################################################## +## +## Deprecations +## +############################################################################## + +Base.@deprecate read_table readtable +Base.@deprecate print_table printtable +Base.@deprecate write_table writetable + ############################################################################## ## ## Load files