Skip to content

Commit

Permalink
Merge pull request #764 from JuliaStats/crlf
Browse files Browse the repository at this point in the history
IO updates and testing
  • Loading branch information
garborg committed Feb 12, 2015
2 parents 2e84403 + caa7c62 commit b6e6525
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 102 deletions.
88 changes: 29 additions & 59 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,82 +57,52 @@ function printtable(df::AbstractDataFrame;
quotemark::Char = '"')
printtable(STDOUT,
df,
header = header,
separator = separator,
quotemark = quotemark,
header = header)
quotemark = quotemark)
return
end

# Infer configuration settings from filename
function writetable(filename::String,
df::AbstractDataFrame;
header::Bool = ifelse(append,false,true),
header::Bool = true,
separator::Char = getseparator(filename),
quotemark::Char = '"',
append::Bool = false,
check_column_names::Bool = false
)



if append==false
# Case 0: Move to end of the if statement

elseif !isfile(filename)
# Case 1: There is no file to append to ==> Create a file
append == false
else
# Case 2: There is a file.

# Checking number of columns and matching headers

# Read first line
read_io = open(filename, "r")
file_headers = Base.split(readline(read_io),separator)
close(read_io)


append::Bool = false)

if append && isfile(filename)
file_df = readtable(filename, header = false, nrows = 1)

# Check if number of columns matches
if length(file_headers) != length(names(df))
error("Number of columns mismatch between file and DataFrame")
if size(file_df, 2) != size(df, 2)
throw(DimensionMismatch("Number of columns differ between file and DataFrame"))
end

# In append mode, 'header' triggers a check for matching names
if header
if any(i -> symbol(file_df[1, i]) != index(df)[i], 1:size(df, 2))
throw(KeyError("Column names don't match names in file"))
end

if check_column_names==true # Perform column name check

# Convert to String
file_headers = [convert(String,i) for i in file_headers]

# Parse headers
map!(k-> Base.strip(Base.strip(k),'\"'),file_headers)
header = false
end
end

# Change to symbol
file_headers = map(m -> symbol(m),file_headers)

if any(file_headers.!= names(df)) # Check that all column names matches
error("Column headers are not matching with first line of file")
end
end
if endswith(filename, ".bz") || endswith(filename, ".bz2")
throw(ArgumentError("BZip2 compression not yet implemented"))
end

mode_selected = ifelse(append, "a", "w")

if endswith(filename, ".gz")
io = gzopen(filename, mode_selected)
elseif endswith(filename, ".bz") || endswith(filename, ".bz2")
error("BZip2 compression not yet implemented")
else
io = open(filename, mode_selected)

openfunc = endswith(filename, ".gz") ? gzopen : open

openfunc(filename, append ? "a" : "w") do io
printtable(io,
df,
header = header,
separator = separator,
quotemark = quotemark)
end


printtable(io,
df,
separator = separator,
quotemark = quotemark,
header = header)
close(io)

return
end

Expand Down
2 changes: 1 addition & 1 deletion src/dataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ function readtable(pathname::String;
allowescapes::Bool = false)
# Open an IO stream based on pathname
# (1) Path is an HTTP or FTP URL
if beginswith(pathname, "http://") || beginswith(pathname, "ftp://")
if startswith(pathname, "http://") || startswith(pathname, "ftp://")
error("URL retrieval not yet implemented")
# (2) Path is GZip file
elseif endswith(pathname, ".gz")
Expand Down
2 changes: 1 addition & 1 deletion src/statsmodels/statsmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ function Base.show(io::IO, model::DataFrameModels)
println(io, "$(typeof(model)):\n\nCoefficients:")
show(io, ct)
catch e
if isa(e, String) && beginswith(e, "coeftable is not defined")
if isa(e, String) && startswith(e, "coeftable is not defined")
show(io, model.model)
else
rethrow(e)
Expand Down
8 changes: 8 additions & 0 deletions test/data/comments/before_after_data_windows.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Some people put comments in CSV files
"A","B","C","D"
1,2,3,4# Some people put comments in CSV files
1,2,3,4
1,2,3,4
1,2,3,4
1,2,3,4
# Some people put comments in CSV files
6 changes: 6 additions & 0 deletions test/data/comments/middata_windows.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"A","B","C","D"
1,2,3,4# Some people put comments in CSV files
1,2,3,4
1,2,3,4
1,2,3,4
1,2,3,4
9 changes: 9 additions & 0 deletions test/data/skiplines/skipfront_windows.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
%
% Stuff to skip at the top
%
"A","B","C","D"
1,2,3,4
1,2,3,4
1,2,3,4
1,2,3,4
1,2,3,4
36 changes: 0 additions & 36 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -282,39 +282,3 @@ module TestDataFrame
@test deleterows!(df, [2, 3]) === df
@test isequal(df, DataFrame(a=@data([1]), b=@data([3.])))
end


# Test writetable with append

df1 = DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))
df2 = DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]))
df3 = DataFrame(a=@data([1, 2, 3]), c=@data([4, 5, 6])) # 2nd column mismatch
df3b = DataFrame(a=@data([1, 2, 3]), b=@data([4, 5, 6]),c=@data([4, 5, 6])) # number of columns mismatch

writetable("test1.csv",df1,header=true)
writetable("test1.csv",df2, header=false,append=true)

# TEST 1: Does append = true work ...(expected: append to file)

df4 = readtable("test1.csv")

@test nrow(df4) == nrow(df1) + nrow(df2)


# TEST 2: Does append = false work ... (expected: overwrite file)

writetable("test2.csv",df1,header=true)
writetable("test2.csv",df2,header=true)

df5 = readtable("test2.csv")

@test nrow(df5) == nrow(df2)

# TEST 3: Does the check_column_names flag work ...
writetable("test3.csv",df2, header=true, append=false)
@test_throws ErrorException writetable("test3.csv",df3, header=false,append=true,check_column_names=true)


# TEST 4: Check that the column-count check works ...
writetable("test4.csv",df3, header=true, append=false)
@test_throws ErrorException writetable("test4.csv",df3b, header=false,append=true)
54 changes: 49 additions & 5 deletions test/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -119,15 +119,27 @@ module TestIO
df3 = readtable("$data/skiplines/skipfront.csv", skipstart = 3)
df4 = readtable("$data/skiplines/skipfront.csv", skipstart = 4, header = false)
names!(df4, names(df1))
# df5 = readtable("$data/skiplines/skipfront.csv", skipstart = 3, skiprows = 5:6)
# df6 = readtable("$data/skiplines/skipfront.csv", skipstart = 3, header = false, skiprows = [4, 6])
# names!(df6, names(df1))
df5 = readtable("$data/comments/before_after_data_windows.csv", allowcomments = true)
df6 = readtable("$data/comments/middata_windows.csv", allowcomments = true)
df7 = readtable("$data/skiplines/skipfront_windows.csv", skipstart = 3)
df8 = readtable("$data/skiplines/skipfront_windows.csv", skipstart = 4, header = false)
names!(df8, names(df1))
# df9 = readtable("$data/skiplines/skipfront.csv", skipstart = 3, skiprows = 5:6)
# df10 = readtable("$data/skiplines/skipfront.csv", skipstart = 3, header = false, skiprows = [4, 6])
# names!(df10, names(df1))

@test df2 == df1
@test df3 == df1
@test df4 == df1
# @test df5 == df1[3:end]
# @test df6 == df1[[1, 3:end]]

# Windows EOLS
@test df5 == df1
@test df6 == df1
@test df7 == df1
@test df8 == df1

# @test df9 == df1[3:end]
# @test df10 == df1[[1, 3:end]]

function normalize_eol!(df)
for (name, col) in eachcol(df)
Expand Down Expand Up @@ -314,4 +326,36 @@ module TestIO

io = IOBuffer(abnormal*",%_B*\tC*,end\n1,2,3\n")
@test names(readtable(io)) == ns

# Test writetable with append

df1 = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6]))
df2 = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6]))
df3 = DataFrame(a = @data([1, 2, 3]), c = @data([4, 5, 6])) # 2nd column mismatch
df3b = DataFrame(a = @data([1, 2, 3]), b = @data([4, 5, 6]), c = @data([4, 5, 6])) # number of columns mismatch

tf = tempname()

# Written as normal if file doesn't exist
writetable(tf, df1, append = true)
@test readtable(tf) == df1

# Appends to existing file if append == true
writetable(tf, df1)
writetable(tf, df2, header = false, append = true)
@test readtable(tf) == vcat(df1, df2)

# Overwrites file if append == false
writetable(tf, df1)
writetable(tf, df2)
@test readtable(tf) == df2

# Enforces matching column names iff append == true && header == true
writetable(tf, df2)
@test_throws KeyError writetable(tf, df3, append = true)
writetable(tf, df3, header = false, append = true)

# Enforces matching column count if append == true
writetable(tf, df3)
@test_throws DimensionMismatch writetable(tf, df3b, header = false, append = true)
end

0 comments on commit b6e6525

Please sign in to comment.