Skip to content

Commit

Permalink
replace non-zero offset hack with less terrible hack #2301
Browse files Browse the repository at this point in the history
The old Stata 13 parser benefits from this hack as well so I added it.
  • Loading branch information
pdurbin committed May 31, 2018
1 parent 1994e8f commit 97a7853
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1422,9 +1422,31 @@ private void readValueLabels(DataReader reader) throws IOException {
long label_end = 0;
int label_length = 0;

boolean firstCategoryNonZeroOffsetMode = false;
long firstCategoryLabelEnd = 0;
for (int i = 0; i < number_of_categories; i++) {
label_offset = value_label_offsets[i];
label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length;
if (number_of_categories == 2) {
// This hack is here for Stata 13 files such as https://dataverse.harvard.edu/file.xhtml?fileId=2865667
if (i == 0 && label_offset != 0) {
logger.warning("The first label offset should always be zero!");
long nonZeroOffset = label_offset;
label_offset = 0;
label_end = nonZeroOffset;
firstCategoryNonZeroOffsetMode = true;
// We assume there are only two categories.
// The weird non-zero offset becomes the label end for the first category.
firstCategoryLabelEnd = label_end;
}
if (i == 1 && firstCategoryNonZeroOffsetMode) {
// We assume there are only two categories.
// Start reading the second category from the value we saved as the end of the first category.
label_offset = firstCategoryLabelEnd;
// Stop reading the second (last!) category at the end of the entire string from all (both) categories.
label_end = text_length;
}
}
label_length = (int)(label_end - label_offset);

category_value_labels[i] = reader.readString(label_length);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1363,18 +1363,29 @@ private void readValueLabels(DataReader reader) throws IOException {
logger.info("label_table_name: " + label_table_name);
logger.info("text_length: " + text_length);
logger.info("number_of_categories: " + number_of_categories);
boolean firstCategoryNonZeroOffsetMode = false;
long firstCategoryLabelEnd = 0;
for (int i = 0; i < number_of_categories; i++) {
label_offset = value_label_offsets[i];
label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length;
// FIXME!!! Remove this awful, awful hack that is specific to the file at https://dataverse.harvard.edu/file.xhtml?fileId=3140457
// It's unclear why the first offset isn't zero, which messes everything up.
if ("matching".equals(label_table_name)) {
if (i == 0) {
if (number_of_categories == 2) {
// This hack is here for Stata 14 files such as https://dataverse.harvard.edu/file.xhtml?fileId=3140457
if (i == 0 && label_offset != 0) {
logger.warning("The first label offset should always be zero!");
long nonZeroOffset = label_offset;
label_offset = 0;
label_end = 12;
} else if (i == 1) {
label_offset = 12;
label_end = 25;
label_end = nonZeroOffset;
firstCategoryNonZeroOffsetMode = true;
// We assume there are only two categories.
// The weird non-zero offset becomes the label end for the first category.
firstCategoryLabelEnd = label_end;
}
if (i == 1 && firstCategoryNonZeroOffsetMode) {
// We assume there are only two categories.
// Start reading the second category from the value we saved as the end of the first category.
label_offset = firstCategoryLabelEnd;
// Stop reading the second (last!) category at the end of the entire string from all (both) categories.
label_end = text_length;
}
}
logger.info("label_offset: " + label_offset);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1377,17 +1377,7 @@ private void readValueLabels(DataReader reader) throws IOException {
for (int i = 0; i < number_of_categories; i++) {
label_offset = value_label_offsets[i];
label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length;
// FIXME!!! Remove this awful, awful hack that is specific to the file at https://dataverse.harvard.edu/file.xhtml?fileId=3140457
// It's unclear why the first offset isn't zero, which messes everything up.
if ("matching".equals(label_table_name)) {
if (i == 0) {
label_offset = 0;
label_end = 12;
} else if (i == 1) {
label_offset = 12;
label_end = 25;
}
}
// TODO: Do we need the same non-zero offset hack here from 117 or 118?
logger.info("label_offset: " + label_offset);
logger.info("label_end: " + label_end);
label_length = (int) (label_end - label_offset);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta;

import edu.harvard.iq.dataverse.datavariable.DataVariable;
import edu.harvard.iq.dataverse.datavariable.VariableCategory;
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import org.junit.Ignore;
Expand All @@ -22,17 +25,31 @@ public void testAuto() throws IOException {
assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat());
assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion());
assertEquals(12, result.getDataTable().getDataVariables().size());
DataVariable foreign = result.getDataTable().getDataVariables().get(11);
assertEquals("foreign", foreign.getName());
assertEquals("Car type", foreign.getLabel());
assertEquals(2, foreign.getCategories().size());
List<VariableCategory> origins = (List) foreign.getCategories();
assertEquals("Domestic", origins.get(0).getLabel());
assertEquals("Foreign", origins.get(1).getLabel());
}

// TODO: At 2.9 KB, this "HouseImputing" Stata 13 file is nice and small and it would be great to get it working.
// TODO: Can we create a small file to check into the code base that exercises the value-label names non-zero offset issue?
@Ignore
@Test
public void testHouse() throws IOException {
public void testFirstCategoryNonZeroOffset() throws IOException {
// https://dataverse.harvard.edu/file.xhtml?fileId=2865667
TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/HouseImputingCivilRightsInfo.dta"))), nullDataFile);
assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat());
assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion());
assertEquals(12, result.getDataTable().getDataVariables().size());
assertEquals(5, result.getDataTable().getDataVariables().size());
DataVariable imputing = result.getDataTable().getDataVariables().get(4);
assertEquals("imputingincludes10perofmembers", imputing.getName());
assertEquals("Dummy Variable: 1 = More than 10% of votes cast were imputed; 0 = Less than 10%", imputing.getLabel());
assertEquals(2, imputing.getCategories().size());
List<VariableCategory> origins = (List) imputing.getCategories();
assertEquals("More than 10% Imputed", origins.get(0).getLabel());
assertEquals("Fewer than 10% Imputed", origins.get(1).getLabel());
}

//For now this test really just shows that we can parse a file with strls
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta;

import edu.harvard.iq.dataverse.datavariable.DataVariable;
import edu.harvard.iq.dataverse.datavariable.VariableCategory;
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.junit.Test;
import java.util.List;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
import org.junit.Ignore;

public class DTA118FileReaderTest {

DTA118FileReader instance = new DTA118FileReader(null);
File nullDataFile = null;

@Ignore
@Test
public void testOs() throws IOException {
TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/open-source-at-harvard118.dta"))), nullDataFile);
Expand All @@ -22,12 +26,23 @@ public void testOs() throws IOException {
assertEquals(10, result.getDataTable().getDataVariables().size());
}

// TODO: Can we create a small file to check into the code base that exercises the value-label names non-zero offset issue?
@Ignore
@Test
public void testAggregated() throws Exception {
public void testFirstCategoryNonZeroOffset() throws IOException {
// https://dataverse.harvard.edu/file.xhtml?fileId=3140457 Stata 14: 2018_04_06_Aggregated_dataset_v2.dta
TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/2018_04_06_Aggregated_dataset_v2.dta"))), nullDataFile);
assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat());
assertEquals("STATA 14", result.getDataTable().getOriginalFormatVersion());
assertEquals(227, result.getDataTable().getDataVariables().size());
DataVariable q10 = result.getDataTable().getDataVariables().get(25);
assertEquals("Q10", q10.getName());
assertEquals("Matching party leaders pics", q10.getLabel());
assertEquals(2, q10.getCategories().size());
List<VariableCategory> matching = (List) q10.getCategories();
assertEquals("All matched", matching.get(0).getLabel());
assertEquals("None matched", matching.get(1).getLabel());

}

//For now this test really just shows that we can parse a file with strls
Expand Down

0 comments on commit 97a7853

Please sign in to comment.