-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv2rdf.java
357 lines (280 loc) · 12.1 KB
/
csv2rdf.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/* Ruth Helfinstein
July 2012
Translates a CSV file to RDF using a very simple translation.
Assumes the first line of the CSV holds the attribute names.
Takes as input which attribute is the ID of the instance.
Takes as input the type of the instance (which is not in the CSV).
The CSV file must be complete (the same number of items on each line).
Does only very minimal error checking.
compile:
javac -classpath '.:opencsv-2.3.jar' csv2rdf.java
make jar:
jar cmf manifest.txt csv2rdf.jar *.class au
run
java -jar csv2rdf.jar <csv file name>
changes:
12.09.24 be sure to use the actual rdf_name of the class, not the name in the class list
when writing the data (class names are lower case in the list for lookup)
12.09.18 check for properties which are classes when writing out property tags and instance data
12.08.18 fix fixAttributeName to really strip the quotes. Ooops
12.07.27 redo to use configuration file to help with translation
step 1: implement data structures for keeping track of classes and properties
with current interface.
step 2: read and process configuration file.
12.06.27 change spaces to underscores in attribute names.
trim leading/trailing spaces from data.
*/
import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
public class csv2rdf {
// Default CSV input file name; main uses it when no file name is given on the command line.
private static final String INPUT_FILE="input.csv";
// When true, main prints extra diagnostic output (class list, instances and property values).
public static final Boolean DEBUG = false;
// Shared counter read and incremented by renameBlankAttribute; main resets it to 0 before
// reading the configuration and again before reading the CSV header.
static int blankCounter; // keep track of blanks seen in config file and then in header file.
/**
 * Returns a file name without its extension (for example "input.csv" becomes "input").
 * Only a dot inside the final path component counts as the start of an extension, so a
 * dot in a directory name (as in "data.v2/input" or "./input") leaves the name unchanged.
 *
 * @param name file name or path; must not be null
 * @return name up to, but not including, the last dot of its final path component,
 *         or name itself when that component contains no dot
 */
public static String BaseFileName(String name) {
    // the final path component starts right after the last separator (either style), if any
    int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
    int p = name.lastIndexOf('.');
    if (p <= lastSeparator) {
        // no dot at all (p == -1), or the last dot belongs to a directory name
        return name;
    }
    return name.substring(0, p);
}
/**
 * Reads a configuration file and builds a CSVConfig from its lines.
 *
 * Each line is expected to have exactly three fields:
 *   csv_name, type(rdf_name), class_name
 * where type can be "class" or "property" and class_name is the name of the rdf class
 * to use or the superclass. Lines with any other number of fields are reported on
 * standard output and skipped. Only minimal error checking is done.
 *
 * @param configFile path of the configuration file
 * @return the configuration built from the well-formed lines
 * @throws IOException if the file cannot be opened or read
 */
public static CSVConfig readConfigFile(String configFile) throws IOException {
    CSVConfig config = new CSVConfig();
    CSVReader configReader = new CSVReader(new FileReader(configFile)); // throws if there is no config file
    try {
        System.out.println("Reading Configuration File: " + configFile);
        String[] nextLine;
        while ((nextLine = configReader.readNext()) != null) {
            // process one line at a time.
            if (nextLine.length == 3) {
                // fix this name the same way as the header (trim, underscores for blanks, unlabeled1, etc)
                String csv_name = renameBlankAttribute(fixAttributeName(nextLine[0]));

                // split out what's inside the parentheses
                String[] typeInfo = nextLine[1].split("[\\(\\)]");
                // split() drops trailing empty strings, so a field such as "()" yields an
                // empty array; treat that like an empty type instead of indexing past the end
                String itemType = (typeInfo.length > 0) ? typeInfo[0].trim().toLowerCase() : "";
                String rdf_name = "";
                if (typeInfo.length > 1) {
                    rdf_name = fixAttributeName(typeInfo[1]); // may still be blank after this.
                }
                if (rdf_name.length() == 0) {
                    rdf_name = csv_name; // there's not a different rdf_name for this item
                }

                // sanitize in case there are quotes, etc; this also covers a
                // one-character field such as a lone quote
                String class_name = fixAttributeName(nextLine[2]);

                // OK now add this. The config object receives the type along with the names.
                config.addItem(itemType, csv_name, rdf_name, class_name);
            } else {
                System.out.println("Badly formed instance, skipping");
                System.out.print(" <");
                for (int i = 0; i < nextLine.length; i++) {
                    System.out.print(nextLine[i] + " ");
                }
                System.out.println(">");
            }
        }
    } finally {
        // close the file even when reading a line fails part way through
        configReader.close();
    }
    return config;
}
/**
 * Builds a configuration from the header line of the CSV file when no configuration
 * file exists, and writes that configuration out to the matching "-config.csv" file.
 *
 * If the CSV file cannot be read, or has no header line at all, an empty configuration
 * is returned; main reports the missing or empty input file later when it reads it.
 *
 * @param fileName path of the CSV input file
 * @return the configuration built from the header, or an empty configuration
 */
public static CSVConfig doManualConfig(String fileName) {
    CSVConfig config;
    try {
        CSVReader reader = new CSVReader(new FileReader(fileName)); // if no file, handled below
        String[] attributes;
        try {
            attributes = reader.readNext(); // header line
        } finally {
            reader.close(); // done with the file for our test, even if the read failed
        }
        if (attributes == null) {
            // empty file: there is no header to configure from, so do not hand a null
            // header to CSVConfig and do not write a config file
            config = new CSVConfig();
        } else {
            // create a config from the header line (per the original note, this asks the user questions)
            config = new CSVConfig(attributes);
            String configName = configFileName(fileName);
            System.out.println("Writing config file to " + configName);
            config.writeToFile(configName);
        }
    } catch (IOException i) {
        // no file. will catch it later.
        config = new CSVConfig();
    }
    return config;
}
/**
 * Sanitizes an attribute name read from the csv or the config file so that it
 * works in the rdf file: surrounding whitespace is trimmed, each space becomes
 * an underscore, double quotes are removed and each slash becomes a hyphen.
 * (Open question kept from the original: should '#' characters be removed too?)
 *
 * @param att raw attribute name; must not be null
 * @return the sanitized name
 */
public static String fixAttributeName(String att) {
    return att.trim()
            .replace(' ', '_')
            .replace("\"", "") // remove any quotes, too
            .replace('/', '-');
}
/**
 * Gives an empty attribute name a generated label: "unlabeled1", "unlabeled2", ...
 * The number comes from the shared blankCounter, which is incremented for every
 * empty name seen. Non-empty names are returned untouched.
 * Used to sanitize the csv header data and the csv name in the config.
 *
 * @param att attribute name, possibly empty; must not be null
 * @return att itself when it is not empty, otherwise the next "unlabeledN" label
 */
public static String renameBlankAttribute(String att) {
    if (att.length() != 0) {
        return att;
    }
    blankCounter++;
    return "unlabeled" + blankCounter;
}
/**
 * Sanitizes every attribute name of a header line in place: each entry is run
 * through fixAttributeName and then through renameBlankAttribute, so blank
 * attributes end up as "unlabeled1", "unlabeled2", ... in column order.
 *
 * @param attributes header fields; the array contents are overwritten
 */
public static void fixAttributes(String[] attributes) {
    for (int col = 0; col < attributes.length; col++) {
        String sanitized = fixAttributeName(attributes[col]);
        attributes[col] = renameBlankAttribute(sanitized);
    }
}
/**
 * Tells whether a field holds real data.
 *
 * @param s field value; must not be null
 * @return false when s is the empty string or the word "null" in any letter case,
 *         true otherwise
 */
public static boolean containsData(String s) {
    if (s.isEmpty()) {
        return false;
    }
    return !"null".equalsIgnoreCase(s);
}
/**
 * Trims leading and trailing whitespace from every field of a data line, in place.
 *
 * @param data fields of one csv line; the array contents are overwritten
 */
public static void fixInputLine(String[] data) {
    int idx = 0;
    while (idx < data.length) {
        String raw = data[idx];
        data[idx] = raw.trim(); // fix up the data
        idx++;
    }
}
/**
 * Derives the configuration file name that belongs to a csv file: the name
 * returned by BaseFileName followed by "-config.csv".
 *
 * @param fileName name of the csv input file
 * @return name of the matching configuration file
 */
public static String configFileName(String fileName) {
    String base = BaseFileName(fileName);
    return base + "-config.csv";
}
/**
 * Command-line entry point: translates a CSV file into an RDF file.
 *
 * Steps:
 *   1. Take the input file name from args[0], or use INPUT_FILE when no argument is given.
 *   2. Read the matching "-config.csv" configuration file; if that fails with an
 *      IOException, build a configuration from the CSV header via doManualConfig.
 *   3. Print the classes and their properties.
 *   4. Read the CSV, record the column of every header attribute in the configuration,
 *      and write the class/property descriptions followed by one instance per class
 *      for every well-formed data line to "<base name>.rdf".
 *
 * Data lines whose field count differs from the header are reported and skipped.
 *
 * @param args optional; args[0] is the CSV file to translate
 * @throws IOException declared for calls made outside the inner try blocks
 */
public static void main(String[] args) throws IOException {
    // use the file named on the command line, or the default if none is specified.
    String fileName = (args.length > 0) ? args[0] : INPUT_FILE;
    String baseFileName = BaseFileName(fileName);

    CSVConfig config;
    blankCounter = 0; // before we read the file, reset the blank counter

    // is there a configuration file?
    try {
        config = readConfigFile(configFileName(fileName));
    } catch (IOException ioe) {
        // no configuration file, just ask a few questions and do simple configuration from that.
        // use info from the header of the main file.
        System.out.println("No configuration file, will construct one from " + fileName);
        config = doManualConfig(fileName);
        if (DEBUG) {
            System.out.println("CONFIG file: ");
            config.writeToFile("");
            System.out.println();
        }
    }

    // we have now got our configuration. Print it out if we're debugging.
    if (DEBUG) {
        System.out.println("Class list: ");
        for (String cName : config.classes()) {
            System.out.print(cName + ", ");
        }
        System.out.println();
    }

    if (config.numClasses() > 0) {
        System.out.println("\nClasses and Properties");
    }
    for (String cName : config.classes()) {
        CSVConfig.HeaderClass c = config.getClass(cName);
        System.out.print(c.rdf_name());
        String s = config.superClassOf(cName);
        if (s != null && s.length() > 0) {
            System.out.print(" (" + s + ")");
        }
        System.out.print(": ");
        for (String pName : config.getProperties(cName)) {
            System.out.print(pName + " ");
        }
        System.out.println("");
    }
    System.out.println("");

    // Now read in the input file and process it using the information we stored from the configuration step.
    try { // handle error where file doesn't exist.
        CSVReader reader = new CSVReader(new FileReader(fileName));
        try {
            System.out.println("Reading CSV from " + fileName);
            blankCounter = 0; // before we read the file, reset the blank counter
            String[] attributes = reader.readNext(); // header line
            if (attributes != null) {
                int numAttributes = attributes.length;
                fixAttributes(attributes); // consistent with the config file...

                // go through header line and put the column numbers into the config.
                for (int i = 0; i < numAttributes; i++) {
                    // i is the column
                    config.setItemColumn(attributes[i], i);
                }

                String outputFile = baseFileName + ".rdf";
                System.out.println("Writing RDF to " + outputFile);
                RDFWriter writer = new RDFWriter(outputFile);
                writer.startRDF(); // this writes header and beginning part of the file.

                // write out the classes and property descriptions
                for (String cName : config.classes()) {
                    // write the class description
                    CSVConfig.HeaderClass c = config.getClass(cName); // definitely there.
                    String superClassName = config.superClassOf(c); // make sure we get the right name RH 12.09.24
                    String className = c.rdf_name(); // make sure we get the right capitalization
                    writer.writeClassInfo(className, superClassName);

                    // write the property descriptions
                    for (String propName : config.getProperties(cName)) {
                        // look the property up under the class-list key (cName), the same key
                        // used for getClass/getProperties and for the instance data below;
                        // the rdf_name (className) is only for what gets written out
                        String propClass = config.propertyIsClass(config.getProperty(cName, propName));
                        writer.writePropertyTag(propName, className, propClass);
                    }
                }

                // now write the individual instances
                String[] nextLine;
                while ((nextLine = reader.readNext()) != null) {
                    // process one line at a time.
                    // a well formed line has exactly as many fields as the header.
                    if (nextLine.length == numAttributes) {
                        fixInputLine(nextLine); // trims each string

                        // drive the writing of the data from the classes/properties in config
                        for (String cName : config.classes()) {
                            CSVConfig.HeaderItem item = config.getClass(cName);
                            int col = item.column();
                            // if there is no data for the class field, don't write this line at all for that class
                            if ((col != -1) && containsData(nextLine[col])) {
                                // need to sanitize this data (underscores, etc) because it's an ID
                                writer.startInstance(fixAttributeName(nextLine[col]), item.rdf_name());
                                if (DEBUG) {
                                    System.out.println("Instance of " + item.rdf_name() + " " + '"' + nextLine[col] + '"');
                                }

                                for (String propName : config.getProperties(cName)) {
                                    // write a line for each property that has data
                                    CSVConfig.HeaderItem propItem = config.getProperty(cName, propName);
                                    if (propItem != null) {
                                        int propCol = propItem.column();
                                        if ((propCol != -1) && containsData(nextLine[propCol])) {
                                            // we have a column for this property, and that column has data in this instance
                                            String propClass = config.propertyIsClass(propItem);
                                            // write the data differently if it's also a class
                                            if (propClass.length() == 0) {
                                                writer.writeAttributeData(propName, nextLine[propCol]);
                                            } else {
                                                // in this case it is an ID so sanitize
                                                writer.writeAttributeDataResource(propName, fixAttributeName(nextLine[propCol]));
                                            }
                                            if (DEBUG) {
                                                System.out.println(" " + propName + " "
                                                        + '"' + nextLine[propCol] + '"' + " (" + propClass + ")");
                                            }
                                        }
                                    }
                                }
                                writer.endInstance();
                            }
                        }
                        // done with this line of the file.
                        if (DEBUG) {
                            System.out.println("---");
                        }
                    } else {
                        System.out.println("Badly formed instance, skipping");
                        System.out.print(" <");
                        for (int i = 0; i < nextLine.length; i++) {
                            System.out.print(nextLine[i] + " ");
                        }
                        System.out.println(">");
                    }
                }
                writer.endRDF(); // anything that goes at the end of the RDF (and closes the file)
            } else {
                System.out.println("Empty input file:" + fileName);
            }
        } finally {
            // always release the input file: also for an empty file or a failure part way through
            reader.close();
        }
    } catch (IOException ioe) {
        System.err.println("Could not open file:" + fileName);
    }
}
}