fread NA and blank in logical columns, #567

Rdatatable · Jun 27, 2014 · 24ce773 · 24ce773
1 parent 8c6edf9
commit 24ce773
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -143,17 +143,19 @@ DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.3+ - will use GForce.
 
   *  `setorder()` and `DT[order(.)]` handles `integer64` type in descending order as well. Closes [#703](https://github.com/Rdatatable/data.table/issues/703).
 
+  *  `setorder()` and `setorderv()` gain `na.last = TRUE/FALSE`. Closes [#706](https://github.com/Rdatatable/data.table/issues/706).
+
+#### BUG FIXES
+
   * `fread()` :
     * now accepts line breaks inside quoted fields. Thanks to Clayton Stanley for highlighting :
     http://stackoverflow.com/questions/21006661/fread-and-a-quoted-multi-line-column-value
 
     * now accepts trailing backslash in quoted fields. Thanks to user2970844 for highlighting :
     http://stackoverflow.com/questions/24375832/fread-and-column-with-a-trailing-backslash
-
-  *  `setorder()` and `setorderv()` gain `na.last = TRUE/FALSE`. Closes [#706](https://github.com/Rdatatable/data.table/issues/706).
-
-#### BUG FIXES
-
+
+    * Blank and `"NA"` values in logical columns (`T`,`True`,`TRUE`) no longer cause them to be read as character, [#567](https://github.com/Rdatatable/data.table/issues/567). Tests added.
+
   *  When joining to fewer columns than the key has, using one of the later key columns explicitly in j repeated the first value. A problem introduced by v1.9.2 and not caught bythe 1,220 tests, or tests in 37 dependent packages. Test added. Many thanks to Michele Carriero for reporting.
   ```R
   DT = data.table(a=1:2, b=letters[1:6], key="a,b")    # keyed by a and b

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -48,6 +48,8 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
     .lasttest <<- num
     ntest <<- ntest + 1
     lastnum <<- num
+    xsub = substitute(x)
+    ysub = substitute(y)
     if (is.null(output)) err <<- try(x,TRUE)
     else out = gsub("NULL$","",paste(capture.output(print(err<<-try(x,TRUE))),collapse=""))
     if (!is.null(error) || !is.null(warning)) {
@@ -61,7 +63,7 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
                length(grep(patt,err)) &&
                type==observedtype)) {
             cat("Test",num,"didn't produce correct",type,":\n")
-            cat(">",deparse(substitute(x)),"\n")
+            cat(">",deparse(xsub),"\n")
             cat("Expected ",type,": '",txt,"'\n",sep="")
             if (!inherits(err,"try-error"))
                 cat("Observed: no error or warning\n")
@@ -90,7 +92,7 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
         output = gsub("[)]","[)]",output)
         if (!length(grep(output,out))) {
             cat("Test",num,"didn't produce correct output:\n")
-            cat(">",deparse(substitute(x)),"\n")
+            cat(">",deparse(xsub),"\n")
             cat("Expected: '",output,"'\n",sep="")
             cat("Observed: '",out,"'\n",sep="")
             nfail <<- nfail + 1
@@ -102,7 +104,7 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
     if (missing(y)) {
         if (isTRUE(as.vector(x))) return()  # as.vector to drop names of a named vector such as returned by system.time
         cat("Test",num,"expected TRUE but observed:\n")
-        cat(">",deparse(substitute(x)),"\n")
+        cat(">",deparse(xsub),"\n")
         if (is.data.table(x)) compactprint(x) else print(x)
         nfail <<- nfail + 1
         whichfail <<- c(whichfail, num)
@@ -131,9 +133,9 @@ test = function(num,x,y,error=NULL,warning=NULL,output=NULL) {
         if (is.atomic(x) && is.atomic(y) && isTRUE(all.equal(x,y))) return()   # For test 617 on r-prerel-solaris-sparc on 7 Mar 2013
     }
     cat("Test",num,"ran without errors but failed check that x equals y:\n")
-    cat("> x =",deparse(substitute(x)),"\n")
+    cat("> x =",deparse(xsub),"\n")
     if (is.data.table(x)) compactprint(x) else if (length(x)>6) {cat("First 6 of", length(x),":");print(head(x))} else print(x)
-    cat("> y =",deparse(substitute(y)),"\n")
+    cat("> y =",deparse(ysub),"\n")
     if (is.data.table(y)) compactprint(y) else if (length(y)>6) {cat("First 6 of", length(y),":");print(head(y))} else print(y)
     nfail <<- nfail + 1
     whichfail <<- c(whichfail, num)
@@ -2593,7 +2595,7 @@ test(904, fread("A,B,C,D\n1,3,foo,5\n2,4,barbaz,6"), DT<-data.table(A=1:2,B=3:4,
 test(905, fread('A,B,C,D\n1,3,foo,5\n2,4,"barbaz",6'), DT)
 test(906, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,r,baz",6'), DT[2,C:="ba,r,baz"])
 test(907, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,\\"r,baz",6'), DT[2,C:='ba,\\"r,baz'])  # \" protected ok, but \ needs taking off too (TO DO)
-test(908, fread("A,B,C\n1,3,\n2,4,\n"), data.table(A=1:2,B=3:4,C=NA_integer_))
+test(908, fread("A,B,C\n1,3,\n2,4,\n"), data.table(A=1:2,B=3:4,C=NA)) # where NA is type logical
 
 test(909, fread("
 Date and Time,Open,High,Low,Close,Volume
@@ -4248,11 +4250,11 @@ test(1265, DT[,.N,by=id]$N, INT(1,1,1,1))
 test(1266, getNumericRounding(), 0L)
 setNumericRounding(2)
 
-# fread reading TRUE, #4766  TO DO
-DF = data.frame(I=1:3, L=c(T,F,NA))
+# fread reading NA in logical columns, #4766
+DF = data.frame(I=1:3, L=c(T,F,NA), R=3.14)
 write.csv(DF,f<-tempfile(),row.names=F)
-# read.csv(f)     
-# test(1267, fread(f)$L, c(TRUE, FALSE, NA))
+test(1267.1, fread(f)$L, c(TRUE, FALSE, NA))
+test(1267.2, fread(f), as.data.table(read.csv(f)))
 unlink(f)
 
 ### FR #2722 test begins here ###
@@ -4845,6 +4847,10 @@ test(1341.3, DT[, list(f(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9)
 bla <- data.table(x=c(1,1,2,2), y=c(1,1,1,1))
 test(1342, unique(bla)[, bla := 2L], data.table(x=c(1,2),y=1,bla=2L))
 
+# blank and NA fields in logical columns
+test(1343, fread("A,B\n1,TRUE\n2,\n3,F"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
+test(1344, fread("A,B\n1,T\n2,NA\n3,"), data.table(A=1:3, B=c(TRUE,NA,NA)))
+
 
 ##########################
 

diff --git a/src/fread.c b/src/fread.c
@@ -70,7 +70,7 @@ static clock_t tCoerce, tCoerceAlloc;
 #define SXP_NULL   5   // NILSXP i.e. skip column (last so that all types can be bumped up to it by user)
 static const char TypeName[6][10] = {"LGL","INT","INT64","REAL","STR","NULL"};  // for messages and errors
 static int TypeSxp[6] = {LGLSXP,INTSXP,REALSXP,REALSXP,STRSXP,NILSXP};
-static union {double d; long long l; Rboolean b;} u;
+static union {double d; long long l; int b;} u;   // b=boolean, can hold NA_LOGICAL
 #define NUT        8   // Number of User Types (just for colClasses where "numeric"/"double" are equivalent)
 static const char UserTypeName[NUT][10] = {"logical", "integer", "integer64", "numeric", "character", "NULL", "double", "CLASS" };  // important that first 6 correspond to TypeName.  "CLASS" is the fall back to character then as.class at R level ("CLASS" string is just a placeholder).
 static int UserTypeNameMap[NUT] = { SXP_LGL, SXP_INT, SXP_INT64, SXP_REAL, SXP_STR, SXP_NULL, SXP_REAL, SXP_STR };
@@ -231,6 +231,11 @@ static inline Rboolean Strtob()
         ch = start+1;
         if (*ch=='a' && *++ch=='l' && *++ch=='s' && *++ch=='e' && (++ch==eof || *ch==sep || *ch==eol)) return(TRUE);
     }
+    else if (ch==eof || *ch==sep || *ch==eol ||
+             (ch<eof-1 && *ch=='N' && *++ch=='A' && (++ch==eof || *ch==sep || *ch==eol))) {
+        u.b = NA_LOGICAL;
+        return(TRUE);
+    }
     ch = start;
     return(FALSE);     // invalid boolean, need to bump type.
 }
@@ -279,9 +284,19 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
     }
     setAttrib(newv, R_ClassSymbol, newtype==SXP_INT64 ? ScalarString(mkChar("integer64")) : R_NilValue);
     switch(newtype) {
+    case SXP_INT :
+        switch(oldtype) {
+        case SXP_LGL :
+            for (i=0; i<sofar; i++) INTEGER(newv)[i] = INTEGER(v)[i];
+            break;
+        default :
+            sprintf(errormsg, "Internal error: attempt to bump from type %d to type %d. Please report to datatable-help.", oldtype, newtype);
+            EXIT();
+        }
+        break;
     case SXP_INT64:
         switch(oldtype) {
-        case SXP_INT :
+        case SXP_LGL : case SXP_INT :
             for (i=0; i<sofar; i++) REAL(newv)[i] = (INTEGER(v)[i]==NA_INTEGER ? NA_REAL : (u.l=(long long)INTEGER(v)[i],u.d));
             break;
         default :
@@ -291,7 +306,7 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
         break;
     case SXP_REAL:
         switch(oldtype) {
-        case SXP_INT :
+        case SXP_LGL : case SXP_INT :
             for (i=0; i<sofar; i++) REAL(newv)[i] = (INTEGER(v)[i]==NA_INTEGER ? NA_REAL : (double)INTEGER(v)[i]);
             break;
         case SXP_INT64 :
@@ -303,10 +318,10 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
         }
         break;
     case SXP_STR:
-        warning("Bumped column %d to type character on data row %d, field contains '%.*s'. Coercing previously read values in this column from integer or numeric back to character which may not be lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If reporting to datatable-help, please rerun and include the output from verbose=TRUE.\n", col+1, sofar+1, MsgLimit(lch-ch), ch);
+        warning("Bumped column %d to type character on data row %d, field contains '%.*s'. Coercing previously read values in this column from logical, integer or numeric back to character which may not be lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If reporting to datatable-help, please rerun and include the output from verbose=TRUE.\n", col+1, sofar+1, MsgLimit(lch-ch), ch);
         static char buffer[129];  // 25 to hold [+-]2^63, with spare space to be safe and snprintf too
         switch(oldtype) {
-        case SXP_INT :
+        case SXP_LGL : case SXP_INT :
             // This for loop takes 0.000007 seconds if the bump occurs on line 179, for example, timing showed
             for (i=0; i<sofar; i++) {
                 if (INTEGER(v)[i] == NA_INTEGER)
@@ -422,7 +437,11 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
         if (fstat(fd,&stat_buf) == -1) {close(fd); error("Opened file ok but couldn't obtain file size: %s", fnam);}
         filesize = stat_buf.st_size;
         if (filesize<=0) {close(fd); error("File is empty: %s", fnam);}
-        if (verbose) Rprintf("File opened, filesize is %.3f GB\n", 1.0*filesize/(1024*1024*1024));
+        if (verbose) Rprintf("File opened, filesize is %.3f GB.\nMemory mapping ... ", 1.0*filesize/(1024*1024*1024));
+        // Would be nice to print 'Memory mapping' when not verbose, but then it would also print for small files
+        // which would be annoying. If we could estimate if the mmap was likely to take more than 2 seconds (and thus
+        // the % meter to kick in) then that'd be ideal. A simple size check isn't enough because it might already
+        // be cached from a previous run. Perhaps spawn an event, which is cancelled if mmap returns within 2 secs.
 #ifdef MAP_POPULATE
         mmp = (const char *)mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);    // TO DO?: MAP_HUGETLB
 #else
@@ -451,7 +470,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
         if (filesize<=0) { CloseHandle(hFile); error("File is empty: %s", fnam); }
         hMap=CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL); // filesize+1 not allowed here, unlike mmap where +1 is zero'd
         if (hMap==NULL) { CloseHandle(hFile); error("This is Windows, CreateFileMapping returned error %d for file %s", GetLastError(), fnam); }
-        if (verbose) Rprintf("File opened, filesize is %.3f GB\n", 1.0*filesize/(1024*1024*1024));
+        if (verbose) Rprintf("File opened, filesize is %.3f GB.\nMemory mapping ... ", 1.0*filesize/(1024*1024*1024));
         mmp = (const char *)MapViewOfFile(hMap,FILE_MAP_READ,0,0,filesize);
         if (mmp == NULL) {
             CloseHandle(hMap);
@@ -472,7 +491,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
         if (EOF > -1) error("Internal error. EOF is not -1 or less\n");
         if (mmp[filesize-1] < 0) error("mmap'd region has EOF at the end");
         eof = mmp+filesize;  // byte after last byte of file.  Never dereference eof as it's not mapped.
-        if (verbose) Rprintf("File is opened and mapped ok\n");
+        if (verbose) Rprintf("ok\n");  // to end 'Memory mapping ... '
     }
     clock_t tMap = clock();
     // From now use EXIT() wrapper instead of error(), to close it on Windows so as not to lock the file after an error.