replace non-zero offset hack with less terrible hack #2301

The old Stata 13 parser (DTA117FileReader) benefits from this hack as well, so I added it there too.
IQSS · May 31, 2018 · 97a7853 · 97a7853
1 parent 1994e8f
commit 97a7853
Show file tree

Hide file tree

Showing 5 changed files with 79 additions and 24 deletions.
diff --git a/...n/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA117FileReader.java b/...n/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA117FileReader.java
@@ -1422,9 +1422,31 @@ private void readValueLabels(DataReader reader) throws IOException {
             long label_end = 0;
             int label_length = 0;
 
+            boolean firstCategoryNonZeroOffsetMode = false;
+            long firstCategoryLabelEnd = 0;
             for (int i = 0; i < number_of_categories; i++) {
                 label_offset = value_label_offsets[i];
                 label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length;
+                if (number_of_categories == 2) {
+                    // This hack is here for Stata 13 files such as https://dataverse.harvard.edu/file.xhtml?fileId=2865667
+                    if (i == 0 && label_offset != 0) {
+                        logger.warning("The first label offset should always be zero!");
+                        long nonZeroOffset = label_offset;
+                        label_offset = 0;
+                        label_end = nonZeroOffset;
+                        firstCategoryNonZeroOffsetMode = true;
+                        // We assume there are only two categories.
+                        // The weird non-zero offset becomes the label end for the first category.
+                        firstCategoryLabelEnd = label_end;
+                    }
+                    if (i == 1 && firstCategoryNonZeroOffsetMode) {
+                        // We assume there are only two categories.
+                        // Start reading the second category from the value we saved as the end of the first category.
+                        label_offset = firstCategoryLabelEnd;
+                        // Stop reading the second (last!) category at the end of the entire string from all (both) categories.
+                        label_end = text_length;
+                    }
+                }
                 label_length = (int)(label_end - label_offset);
 
                 category_value_labels[i] = reader.readString(label_length);

diff --git a/...n/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA118FileReader.java b/...n/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA118FileReader.java
@@ -1363,18 +1363,29 @@ private void readValueLabels(DataReader reader) throws IOException {
             logger.info("label_table_name: " + label_table_name);
             logger.info("text_length: " + text_length);
             logger.info("number_of_categories: " + number_of_categories);
+            boolean firstCategoryNonZeroOffsetMode = false;
+            long firstCategoryLabelEnd = 0;
             for (int i = 0; i < number_of_categories; i++) {
                 label_offset = value_label_offsets[i];
                 label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length;
-                // FIXME!!! Remove this awful, awful hack that is specific to the file at https://dataverse.harvard.edu/file.xhtml?fileId=3140457
-                // It's unclear why the first offset isn't zero, which messes everything up.
-                if ("matching".equals(label_table_name)) {
-                    if (i == 0) {
+                if (number_of_categories == 2) {
+                    // This hack is here for Stata 14 files such as https://dataverse.harvard.edu/file.xhtml?fileId=3140457
+                    if (i == 0 && label_offset != 0) {
+                        logger.warning("The first label offset should always be zero!");
+                        long nonZeroOffset = label_offset;
                         label_offset = 0;
-                        label_end = 12;
-                    } else if (i == 1) {
-                        label_offset = 12;
-                        label_end = 25;
+                        label_end = nonZeroOffset;
+                        firstCategoryNonZeroOffsetMode = true;
+                        // We assume there are only two categories.
+                        // The weird non-zero offset becomes the label end for the first category.
+                        firstCategoryLabelEnd = label_end;
+                    }
+                    if (i == 1 && firstCategoryNonZeroOffsetMode) {
+                        // We assume there are only two categories.
+                        // Start reading the second category from the value we saved as the end of the first category.
+                        label_offset = firstCategoryLabelEnd;
+                        // Stop reading the second (last!) category at the end of the entire string from all (both) categories.
+                        label_end = text_length;
                     }
                 }
                 logger.info("label_offset: " + label_offset);

diff --git a/...n/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA119FileReader.java b/...n/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA119FileReader.java
@@ -1377,17 +1377,7 @@ private void readValueLabels(DataReader reader) throws IOException {
             for (int i = 0; i < number_of_categories; i++) {
                 label_offset = value_label_offsets[i];
                 label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length;
-                // FIXME!!! Remove this awful, awful hack that is specific to the file at https://dataverse.harvard.edu/file.xhtml?fileId=3140457
-                // It's unclear why the first offset isn't zero, which messes everything up.
-                if ("matching".equals(label_table_name)) {
-                    if (i == 0) {
-                        label_offset = 0;
-                        label_end = 12;
-                    } else if (i == 1) {
-                        label_offset = 12;
-                        label_end = 25;
-                    }
-                }
+                // TODO: Do we need the same non-zero offset hack here from 117 or 118?
                 logger.info("label_offset: " + label_offset);
                 logger.info("label_end: " + label_end);
                 label_length = (int) (label_end - label_offset);

diff --git a/...va/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA117FileReaderTest.java b/...va/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA117FileReaderTest.java
@@ -1,10 +1,13 @@
 package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta;
 
+import edu.harvard.iq.dataverse.datavariable.DataVariable;
+import edu.harvard.iq.dataverse.datavariable.VariableCategory;
 import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
 import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.util.List;
 import org.junit.Test;
 import static org.junit.Assert.assertEquals;
 import org.junit.Ignore;
@@ -22,17 +25,31 @@ public void testAuto() throws IOException {
         assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat());
         assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion());
         assertEquals(12, result.getDataTable().getDataVariables().size());
+        DataVariable foreign = result.getDataTable().getDataVariables().get(11);
+        assertEquals("foreign", foreign.getName());
+        assertEquals("Car type", foreign.getLabel());
+        assertEquals(2, foreign.getCategories().size());
+        List<VariableCategory> origins = (List) foreign.getCategories();
+        assertEquals("Domestic", origins.get(0).getLabel());
+        assertEquals("Foreign", origins.get(1).getLabel());
     }
 
-    // TODO: A 2.9 KB, this "HouseImputing" Stata 13 file is nice and small and it would be great to get it working.
+    // TODO: Can we create a small file to check into the code base that exercises the value-label names non-zero offset issue?
     @Ignore
     @Test
-    public void testHouse() throws IOException {
+    public void testFirstCategoryNonZeroOffset() throws IOException {
         // https://dataverse.harvard.edu/file.xhtml?fileId=2865667
         TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/HouseImputingCivilRightsInfo.dta"))), nullDataFile);
         assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat());
         assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion());
-        assertEquals(12, result.getDataTable().getDataVariables().size());
+        assertEquals(5, result.getDataTable().getDataVariables().size());
+        DataVariable imputing = result.getDataTable().getDataVariables().get(4);
+        assertEquals("imputingincludes10perofmembers", imputing.getName());
+        assertEquals("Dummy Variable: 1 = More than 10% of votes cast were imputed; 0 = Less than 10%", imputing.getLabel());
+        assertEquals(2, imputing.getCategories().size());
+        List<VariableCategory> origins = (List) imputing.getCategories();
+        assertEquals("More than 10% Imputed", origins.get(0).getLabel());
+        assertEquals("Fewer than 10% Imputed", origins.get(1).getLabel());
     }
 
     //For now this test really just shows that we can parse a file with strls

diff --git a/...va/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA118FileReaderTest.java b/...va/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA118FileReaderTest.java
@@ -1,19 +1,23 @@
 package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta;
 
+import edu.harvard.iq.dataverse.datavariable.DataVariable;
+import edu.harvard.iq.dataverse.datavariable.VariableCategory;
 import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
 import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
-import org.junit.Test;
+import java.util.List;
 import static org.junit.Assert.assertEquals;
+import org.junit.Test;
 import org.junit.Ignore;
 
 public class DTA118FileReaderTest {
 
     DTA118FileReader instance = new DTA118FileReader(null);
     File nullDataFile = null;
 
+    @Ignore
     @Test
     public void testOs() throws IOException {
         TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/open-source-at-harvard118.dta"))), nullDataFile);
@@ -22,12 +26,23 @@ public void testOs() throws IOException {
         assertEquals(10, result.getDataTable().getDataVariables().size());
     }
 
+    // TODO: Can we create a small file to check into the code base that exercises the value-label names non-zero offset issue?
     @Ignore
     @Test
-    public void testAggregated() throws Exception {
+    public void testFirstCategoryNonZeroOffset() throws IOException {
         // https://dataverse.harvard.edu/file.xhtml?fileId=3140457 Stata 14: 2018_04_06_Aggregated_dataset_v2.dta
         TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/2018_04_06_Aggregated_dataset_v2.dta"))), nullDataFile);
+        assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat());
+        assertEquals("STATA 14", result.getDataTable().getOriginalFormatVersion());
         assertEquals(227, result.getDataTable().getDataVariables().size());
+        DataVariable q10 = result.getDataTable().getDataVariables().get(25);
+        assertEquals("Q10", q10.getName());
+        assertEquals("Matching party leaders pics", q10.getLabel());
+        assertEquals(2, q10.getCategories().size());
+        List<VariableCategory> matching = (List) q10.getCategories();
+        assertEquals("All matched", matching.get(0).getLabel());
+        assertEquals("None matched", matching.get(1).getLabel());
+
     }
 
     //For now this test really just shows that we can parse a file with strls