Skip to content
This repository has been archived by the owner on Nov 21, 2018. It is now read-only.

page metadata dump truck #5

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 149 additions & 0 deletions src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToSnippets.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// ugly hack, <[email protected]>

package org.clueweb.clueweb12.app;

import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.clueweb.clueweb12.ClueWeb12WarcRecord;
import org.clueweb.clueweb12.mapred.ClueWeb12InputFormat;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class DumpWarcRecordsToSnippets extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(DumpWarcRecordsToSnippets.class);

private static enum Records { TOTAL, PAGES, ERRORS };

private static final int SLEN = 1000;

private static class MyMapper extends MapReduceBase implements
Mapper<Writable, ClueWeb12WarcRecord, Text, Text> {
private static final Text KEY = new Text();
private static final Text VALUE = new Text();

public void configure(JobConf job) {}

public void map(Writable key, ClueWeb12WarcRecord doc, OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
reporter.incrCounter(Records.TOTAL, 1);

String docid = doc.getHeaderMetadataItem("WARC-TREC-ID");
if (docid != null) {
reporter.incrCounter(Records.PAGES, 1);
try {
KEY.set("\""+docid+"\"");
// get title tag
String url = doc.getHeaderMetadataItem("WARC-Target-URI");
Document parsedDoc = Jsoup.parse(doc.getContent());
String parsedDocTxt = parsedDoc.text().replaceAll("[\\r\\n\\t]+", " ");
String title = parsedDoc.title();
String snippet =parsedDocTxt .substring(0, Math.min(parsedDocTxt.length(),SLEN));
VALUE.set("\"" + url + "\"\t\"" +title + "\"\t\"" + snippet + "\"" );
output.collect(KEY, VALUE);
} catch (Exception e) {
// If Jsoup throws any exceptions, catch and move on.
reporter.incrCounter(Records.ERRORS, 1);
}
}
}
}

public DumpWarcRecordsToSnippets() {}

public static final String INPUT_OPTION = "input";
public static final String OUTPUT_OPTION = "output";

/**
* Runs this tool.
*/
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
Options options = new Options();

options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path").create(INPUT_OPTION));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("output path").create(OUTPUT_OPTION));

CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}

if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}

String input = cmdline.getOptionValue(INPUT_OPTION);
String output = cmdline.getOptionValue(OUTPUT_OPTION);

LOG.info("Tool name: " + DumpWarcRecordsToSnippets.class.getSimpleName());
LOG.info(" - input: " + input);
LOG.info(" - output: " + output);

JobConf conf = new JobConf(getConf(), DumpWarcRecordsToSnippets.class);
conf.setJobName(DumpWarcRecordsToSnippets.class.getSimpleName() + ":" + input);

conf.setNumReduceTasks(0);

FileInputFormat.addInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, new Path(output));

conf.setInputFormat(ClueWeb12InputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
conf.setMapperClass(MyMapper.class);

RunningJob job = JobClient.runJob(conf);
Counters counters = job.getCounters();
int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();

LOG.info("Read " + numDocs + " docs.");

return 0;
}

/**
* Dispatches command-line arguments to the tool via the <code>ToolRunner</code>.
*/
public static void main(String[] args) throws Exception {
LOG.info("Running " + DumpWarcRecordsToSnippets.class.getCanonicalName() + " with args "
+ Arrays.toString(args));
ToolRunner.run(new DumpWarcRecordsToSnippets(), args);
}
}