Skip to content

Commit

Permalink
fread NA and blank in logical columns, #567
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle committed Jun 27, 2014
1 parent 8c6edf9 commit 24ce773
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 23 deletions.
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,17 +143,19 @@ DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.3+ - will use GForce.

* `setorder()` and `DT[order(.)]` handles `integer64` type in descending order as well. Closes [#703](https://github.com/Rdatatable/data.table/issues/703).

* `setorder()` and `setorderv()` gain `na.last = TRUE/FALSE`. Closes [#706](https://github.com/Rdatatable/data.table/issues/706).

#### BUG FIXES

* `fread()` :
* now accepts line breaks inside quoted fields. Thanks to Clayton Stanley for highlighting :
http://stackoverflow.com/questions/21006661/fread-and-a-quoted-multi-line-column-value

* now accepts trailing backslash in quoted fields. Thanks to user2970844 for highlighting :
http://stackoverflow.com/questions/24375832/fread-and-column-with-a-trailing-backslash

* `setorder()` and `setorderv()` gain `na.last = TRUE/FALSE`. Closes [#706](https://github.com/Rdatatable/data.table/issues/706).

#### BUG FIXES


* Blank and `"NA"` values in logical columns (`T`,`True`,`TRUE`) no longer cause them to be read as character, [#567](https://github.com/Rdatatable/data.table/issues/567). Tests added.

* When joining to fewer columns than the key has, using one of the later key columns explicitly in j repeated the first value. A problem introduced by v1.9.2 and not caught bythe 1,220 tests, or tests in 37 dependent packages. Test added. Many thanks to Michele Carriero for reporting.
```R
DT = data.table(a=1:2, b=letters[1:6], key="a,b") # keyed by a and b
Expand Down
26 changes: 16 additions & 10 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
.lasttest <<- num
ntest <<- ntest + 1
lastnum <<- num
xsub = substitute(x)
ysub = substitute(y)
if (is.null(output)) err <<- try(x,TRUE)
else out = gsub("NULL$","",paste(capture.output(print(err<<-try(x,TRUE))),collapse=""))
if (!is.null(error) || !is.null(warning)) {
Expand All @@ -61,7 +63,7 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
length(grep(patt,err)) &&
type==observedtype)) {
cat("Test",num,"didn't produce correct",type,":\n")
cat(">",deparse(substitute(x)),"\n")
cat(">",deparse(xsub),"\n")
cat("Expected ",type,": '",txt,"'\n",sep="")
if (!inherits(err,"try-error"))
cat("Observed: no error or warning\n")
Expand Down Expand Up @@ -90,7 +92,7 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
output = gsub("[)]","[)]",output)
if (!length(grep(output,out))) {
cat("Test",num,"didn't produce correct output:\n")
cat(">",deparse(substitute(x)),"\n")
cat(">",deparse(xsub),"\n")
cat("Expected: '",output,"'\n",sep="")
cat("Observed: '",out,"'\n",sep="")
nfail <<- nfail + 1
Expand All @@ -102,7 +104,7 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
if (missing(y)) {
if (isTRUE(as.vector(x))) return() # as.vector to drop names of a named vector such as returned by system.time
cat("Test",num,"expected TRUE but observed:\n")
cat(">",deparse(substitute(x)),"\n")
cat(">",deparse(xsub),"\n")
if (is.data.table(x)) compactprint(x) else print(x)
nfail <<- nfail + 1
whichfail <<- c(whichfail, num)
Expand Down Expand Up @@ -131,9 +133,9 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
if (is.atomic(x) && is.atomic(y) && isTRUE(all.equal(x,y))) return() # For test 617 on r-prerel-solaris-sparc on 7 Mar 2013
}
cat("Test",num,"ran without errors but failed check that x equals y:\n")
cat("> x =",deparse(substitute(x)),"\n")
cat("> x =",deparse(xsub),"\n")
if (is.data.table(x)) compactprint(x) else if (length(x)>6) {cat("First 6 of", length(x),":");print(head(x))} else print(x)
cat("> y =",deparse(substitute(y)),"\n")
cat("> y =",deparse(ysub),"\n")
if (is.data.table(y)) compactprint(y) else if (length(y)>6) {cat("First 6 of", length(y),":");print(head(y))} else print(y)
nfail <<- nfail + 1
whichfail <<- c(whichfail, num)
Expand Down Expand Up @@ -2593,7 +2595,7 @@ test(904, fread("A,B,C,D\n1,3,foo,5\n2,4,barbaz,6"), DT<-data.table(A=1:2,B=3:4,
test(905, fread('A,B,C,D\n1,3,foo,5\n2,4,"barbaz",6'), DT)
test(906, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,r,baz",6'), DT[2,C:="ba,r,baz"])
test(907, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,\\"r,baz",6'), DT[2,C:='ba,\\"r,baz']) # \" protected ok, but \ needs taking off too (TO DO)
test(908, fread("A,B,C\n1,3,\n2,4,\n"), data.table(A=1:2,B=3:4,C=NA_integer_))
test(908, fread("A,B,C\n1,3,\n2,4,\n"), data.table(A=1:2,B=3:4,C=NA)) # where NA is type logical

test(909, fread("
Date and Time,Open,High,Low,Close,Volume
Expand Down Expand Up @@ -4248,11 +4250,11 @@ test(1265, DT[,.N,by=id]$N, INT(1,1,1,1))
test(1266, getNumericRounding(), 0L)
setNumericRounding(2)

# fread reading TRUE, #4766 TO DO
DF = data.frame(I=1:3, L=c(T,F,NA))
# fread reading NA in logical columns, #4766
DF = data.frame(I=1:3, L=c(T,F,NA), R=3.14)
write.csv(DF,f<-tempfile(),row.names=F)
# read.csv(f)
# test(1267, fread(f)$L, c(TRUE, FALSE, NA))
test(1267.1, fread(f)$L, c(TRUE, FALSE, NA))
test(1267.2, fread(f), as.data.table(read.csv(f)))
unlink(f)

### FR #2722 test begins here ###
Expand Down Expand Up @@ -4845,6 +4847,10 @@ test(1341.3, DT[, list(f(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9)
bla <- data.table(x=c(1,1,2,2), y=c(1,1,1,1))
test(1342, unique(bla)[, bla := 2L], data.table(x=c(1,2),y=1,bla=2L))

# blank and NA fields in logical columns
test(1343, fread("A,B\n1,TRUE\n2,\n3,F"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
test(1344, fread("A,B\n1,T\n2,NA\n3,"), data.table(A=1:3, B=c(TRUE,NA,NA)))


##########################

Expand Down
35 changes: 27 additions & 8 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ static clock_t tCoerce, tCoerceAlloc;
#define SXP_NULL 5 // NILSXP i.e. skip column (last so that all types can be bumped up to it by user)
static const char TypeName[6][10] = {"LGL","INT","INT64","REAL","STR","NULL"}; // for messages and errors
static int TypeSxp[6] = {LGLSXP,INTSXP,REALSXP,REALSXP,STRSXP,NILSXP};
static union {double d; long long l; Rboolean b;} u;
static union {double d; long long l; int b;} u; // b=boolean, can hold NA_LOGICAL
#define NUT 8 // Number of User Types (just for colClasses where "numeric"/"double" are equivalent)
static const char UserTypeName[NUT][10] = {"logical", "integer", "integer64", "numeric", "character", "NULL", "double", "CLASS" }; // important that first 6 correspond to TypeName. "CLASS" is the fall back to character then as.class at R level ("CLASS" string is just a placeholder).
static int UserTypeNameMap[NUT] = { SXP_LGL, SXP_INT, SXP_INT64, SXP_REAL, SXP_STR, SXP_NULL, SXP_REAL, SXP_STR };
Expand Down Expand Up @@ -231,6 +231,11 @@ static inline Rboolean Strtob()
ch = start+1;
if (*ch=='a' && *++ch=='l' && *++ch=='s' && *++ch=='e' && (++ch==eof || *ch==sep || *ch==eol)) return(TRUE);
}
else if (ch==eof || *ch==sep || *ch==eol ||
(ch<eof-1 && *ch=='N' && *++ch=='A' && (++ch==eof || *ch==sep || *ch==eol))) {
u.b = NA_LOGICAL;
return(TRUE);
}
ch = start;
return(FALSE); // invalid boolean, need to bump type.
}
Expand Down Expand Up @@ -279,9 +284,19 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
}
setAttrib(newv, R_ClassSymbol, newtype==SXP_INT64 ? ScalarString(mkChar("integer64")) : R_NilValue);
switch(newtype) {
case SXP_INT :
switch(oldtype) {
case SXP_LGL :
for (i=0; i<sofar; i++) INTEGER(newv)[i] = INTEGER(v)[i];
break;
default :
sprintf(errormsg, "Internal error: attempt to bump from type %d to type %d. Please report to datatable-help.", oldtype, newtype);
EXIT();
}
break;
case SXP_INT64:
switch(oldtype) {
case SXP_INT :
case SXP_LGL : case SXP_INT :
for (i=0; i<sofar; i++) REAL(newv)[i] = (INTEGER(v)[i]==NA_INTEGER ? NA_REAL : (u.l=(long long)INTEGER(v)[i],u.d));
break;
default :
Expand All @@ -291,7 +306,7 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
break;
case SXP_REAL:
switch(oldtype) {
case SXP_INT :
case SXP_LGL : case SXP_INT :
for (i=0; i<sofar; i++) REAL(newv)[i] = (INTEGER(v)[i]==NA_INTEGER ? NA_REAL : (double)INTEGER(v)[i]);
break;
case SXP_INT64 :
Expand All @@ -303,10 +318,10 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
}
break;
case SXP_STR:
warning("Bumped column %d to type character on data row %d, field contains '%.*s'. Coercing previously read values in this column from integer or numeric back to character which may not be lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If reporting to datatable-help, please rerun and include the output from verbose=TRUE.\n", col+1, sofar+1, MsgLimit(lch-ch), ch);
warning("Bumped column %d to type character on data row %d, field contains '%.*s'. Coercing previously read values in this column from logical, integer or numeric back to character which may not be lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If reporting to datatable-help, please rerun and include the output from verbose=TRUE.\n", col+1, sofar+1, MsgLimit(lch-ch), ch);
static char buffer[129]; // 25 to hold [+-]2^63, with spare space to be safe and snprintf too
switch(oldtype) {
case SXP_INT :
case SXP_LGL : case SXP_INT :
// This for loop takes 0.000007 seconds if the bump occurs on line 179, for example, timing showed
for (i=0; i<sofar; i++) {
if (INTEGER(v)[i] == NA_INTEGER)
Expand Down Expand Up @@ -422,7 +437,11 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
if (fstat(fd,&stat_buf) == -1) {close(fd); error("Opened file ok but couldn't obtain file size: %s", fnam);}
filesize = stat_buf.st_size;
if (filesize<=0) {close(fd); error("File is empty: %s", fnam);}
if (verbose) Rprintf("File opened, filesize is %.3f GB\n", 1.0*filesize/(1024*1024*1024));
if (verbose) Rprintf("File opened, filesize is %.3f GB.\nMemory mapping ... ", 1.0*filesize/(1024*1024*1024));
// Would be nice to print 'Memory mapping' when not verbose, but then it would also print for small files
// which would be annoying. If we could estimate if the mmap was likely to take more than 2 seconds (and thus
// the % meter to kick in) then that'd be ideal. A simple size check isn't enough because it might already
// be cached from a previous run. Perhaps spawn an event, which is cancelled if mmap returns within 2 secs.
#ifdef MAP_POPULATE
mmp = (const char *)mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0); // TO DO?: MAP_HUGETLB
#else
Expand Down Expand Up @@ -451,7 +470,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
if (filesize<=0) { CloseHandle(hFile); error("File is empty: %s", fnam); }
hMap=CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL); // filesize+1 not allowed here, unlike mmap where +1 is zero'd
if (hMap==NULL) { CloseHandle(hFile); error("This is Windows, CreateFileMapping returned error %d for file %s", GetLastError(), fnam); }
if (verbose) Rprintf("File opened, filesize is %.3f GB\n", 1.0*filesize/(1024*1024*1024));
if (verbose) Rprintf("File opened, filesize is %.3f GB.\nMemory mapping ... ", 1.0*filesize/(1024*1024*1024));
mmp = (const char *)MapViewOfFile(hMap,FILE_MAP_READ,0,0,filesize);
if (mmp == NULL) {
CloseHandle(hMap);
Expand All @@ -472,7 +491,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
if (EOF > -1) error("Internal error. EOF is not -1 or less\n");
if (mmp[filesize-1] < 0) error("mmap'd region has EOF at the end");
eof = mmp+filesize; // byte after last byte of file. Never dereference eof as it's not mapped.
if (verbose) Rprintf("File is opened and mapped ok\n");
if (verbose) Rprintf("ok\n"); // to end 'Memory mapping ... '
}
clock_t tMap = clock();
// From now use EXIT() wrapper instead of error(), to close it on Windows so as not to lock the file after an error.
Expand Down

0 comments on commit 24ce773

Please sign in to comment.