
Commit

Document extraction app added
chauff committed Jul 26, 2013
1 parent 8b7d392 commit db38e53
Showing 5 changed files with 79 additions and 530 deletions.
48 changes: 48 additions & 0 deletions README.md
@@ -60,6 +60,54 @@ The parameters are:
+ `trecinputfile`: HDFS path to the TREC result file which is used as the starting point for filtering



De-duplication
--------------
To increase diversity, duplicate documents can be removed from the result ranking (in effect pushing lower-ranked results up the ranking).

A simple cosine-similarity approach is implemented in `DuplicateFiltering`: every document at rank `x` is compared to all non-duplicate documents at higher ranks. If its cosine similarity to any of them reaches the threshold, the document is filtered out.
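
For illustration, the filtering rule boils down to the following minimal, self-contained sketch. This is not the app's actual `MyReducer` code; the class name `DedupSketch`, its method names, and the map-based document representation (termid to weight) are assumptions made for this example:

```
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class DedupSketch {
  // Cosine similarity between two sparse term-weight vectors (termid -> weight).
  static double cosine(Map<Integer, Float> a, Map<Integer, Float> b) {
    double dot = 0, normA = 0, normB = 0;
    for (Map.Entry<Integer, Float> e : a.entrySet()) {
      Float w = b.get(e.getKey());
      if (w != null) {
        dot += e.getValue() * w;
      }
      normA += e.getValue() * e.getValue();
    }
    for (float w : b.values()) {
      normB += w * w;
    }
    return (normA == 0 || normB == 0) ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  // Walks the ranking best-first; a document is dropped if its cosine similarity
  // to any higher-ranked, already-kept document reaches the threshold.
  static List<Map<Integer, Float>> deduplicate(List<Map<Integer, Float>> ranking, double threshold) {
    List<Map<Integer, Float>> kept = new ArrayList<>();
    for (Map<Integer, Float> doc : ranking) {
      boolean duplicate = false;
      for (Map<Integer, Float> earlier : kept) {
        if (cosine(doc, earlier) >= threshold) {
          duplicate = true;
          break;
        }
      }
      if (!duplicate) {
        kept.add(doc); // lower-ranked documents implicitly move up
      }
    }
    return kept;
  }
}
```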

To run the code, call:

```
$ hadoop jar clueweb-tools-0.3-SNAPSHOT-fatjar.jar \
org.clueweb.clueweb12.app.DuplicateFiltering \
-cosineSimThreshold 0.8 \
-dictionary /data/private/clueweb12/derived/dictionary.XXX \
-docvector /data/private/clueweb12/derived/docvectors.XXX/*/part* \
-output /user/chauff/res.dir1000.porter.deduplicated \
-topk 1000 \
-trecinputfile /user/chauff/res.dir1000.porter
```

The parameters (apart from the usual ones) are:
+ `cosineSimThreshold`: documents with a cosine similarity at or above this threshold are removed from the result file
+ `trecinputfile`: file in TREC result format which is used as a starting point for de-duplication
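
For reference, a TREC result file (the format expected for `trecinputfile`) has one line per retrieved document: `qid Q0 docid rank score run_tag`. The lines below use made-up docids, scores and run tag and are purely illustrative:

```
201 Q0 clueweb12-0000tw-05-12114 1 -5.8432 example-run
201 Q0 clueweb12-0108wb-20-29661 2 -5.9011 example-run
```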


Document extraction
-------------------

A helper app: given a file with a list of docids, it extracts the corresponding documents' content from the WARC files.
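
The HTML handling is controlled by the `keephtml` parameter described below; conceptually it amounts to the following minimal sketch using Jsoup (which the app itself relies on). The class and method names here are made up for illustration:

```
import org.jsoup.Jsoup;

public class KeepHtmlSketch {
  // keepHtml = true: return the raw WARC payload unchanged (HTML source kept).
  // keepHtml = false: parse the payload and keep only the visible text.
  public static String process(String warcContent, boolean keepHtml) {
    return keepHtml ? warcContent : Jsoup.parse(warcContent).text();
  }
}
```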

To run the code, call:

```
$ hadoop jar clueweb-tools-0.3-SNAPSHOT-fatjar.jar \
org.clueweb.clueweb12.app.DocumentExtractor \
-docidsfile /user/chauff/docids \
-input /data/private/clueweb12/Disk*/*/*/*.warc.gz \
-keephtml false \
-output /user/chauff/docids-output
```

The parameters are:
+ `docidsfile`: a file with one docid per line; the documents with these docids are extracted from the WARC input files (an example is shown below)
+ `input`: list of WARC input files
+ `keephtml`: either `true` (keep the HTML source of each document) or `false` (parse the documents and remove the HTML)
+ `output`: folder where the documents' content is stored, one file per docid
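
An illustrative `docidsfile` (the docids below are made up and only show the expected one-docid-per-line layout):

```
clueweb12-0000tw-00-00013
clueweb12-0000tw-05-12114
clueweb12-0108wb-20-29661
```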


Retrieval runs
--------------
The files `runs/res.dir1000.{standard,porter}` contain the baseline results of running the above retrieval program (i.e., LM with Dirichlet smoothing and mu=1000) with `standard` and `porter` preprocessing, respectively.
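
For reference, Dirichlet smoothing scores a query term $t$ against a document $d$ with the standard estimate

```
p(t | d) = ( tf(t,d) + mu * p(t | C) ) / ( |d| + mu ),   with mu = 1000
```

where `tf(t,d)` is the frequency of `t` in `d`, `|d|` is the document length, and `p(t | C)` is the term's relative frequency in the collection.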
13 changes: 13 additions & 0 deletions src/main/java/org/clueweb/clueweb12/app/DocumentExtractor.java
@@ -54,6 +54,7 @@
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -78,6 +79,10 @@
public class DocumentExtractor extends Configured implements Tool {
private static final Logger LOG = Logger
.getLogger(DocumentExtractor.class);

private static enum Records {
DOCUMENTS_FOUND
};

private static boolean keepHTML;
private static final HashMap<String,String> docidMap = Maps.newHashMap();
@@ -121,6 +126,7 @@ public void map(LongWritable key, ClueWeb12WarcRecord doc,
content = Jsoup.parse(content).text();
}
docidMap.put(docid, content);
context.getCounter(Records.DOCUMENTS_FOUND).increment(1);

} catch (Exception e) {
// If Jsoup throws any exceptions, catch and move on.
@@ -221,18 +227,25 @@ public int run(String[] args) throws Exception {
job.setJarByClass(DocumentExtractor.class);

FileInputFormat.setInputPaths(job, input);

job.setInputFormatClass(ClueWeb12InputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);

job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(NullWritable.class);

job.setMapperClass(MyMapper.class);
job.setNumReduceTasks(0);

FileSystem.get(getConf()).delete(new Path(output), true);

long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime)
/ 1000.0 + " seconds");

int numDocsFound = (int) job.getCounters().findCounter(Records.DOCUMENTS_FOUND).getValue();
LOG.info("Number of documents found: "+numDocsFound);

return 0;
}
21 changes: 17 additions & 4 deletions src/main/java/org/clueweb/clueweb12/app/DuplicateFiltering.java
@@ -1,6 +1,7 @@
/*
* ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
@@ -45,6 +46,7 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
@@ -74,6 +76,7 @@
import org.apache.log4j.Logger;
import org.clueweb.data.PForDocVector;
import org.clueweb.data.TermStatistics;

import tl.lin.data.array.IntArrayWritable;
import tl.lin.data.pair.PairOfIntString;
import tl.lin.data.pair.PairOfInts;
@@ -84,6 +87,10 @@
public class DuplicateFiltering extends Configured implements Tool {

private static final Logger LOG = Logger.getLogger(DuplicateFiltering.class);

private static enum Records {
DUPLICATES
};

/*
* we need to emit [termid1 weight1 termid2 weight2 ...] as value in
@@ -165,13 +172,13 @@ public void setup(Context context) throws IOException {
@Override
public void map(Text key, IntArrayWritable ints, Context context)
throws IOException, InterruptedException {

// is the document of interest to us?
if (!docidResults.containsKey(key.toString())) {
return;
}

PForDocVector.fromIntArrayWritable(ints, DOC);

// tfMap of the document
HashMap<Integer, Integer> tfMap = Maps.newHashMap();
@@ -213,6 +220,7 @@ private static class MyReducer extends
private float cosineSimThreshold;
private static final NullWritable nullKey = NullWritable.get();
private static final Text valueOut = new Text();

// outer key: qid, inner key: rank, inner value: term weights of the
// document at rank for query
private static final HashMap<Integer, HashMap<Integer, HashMap<Integer, Float>>> termWeightsPerQuery = Maps
@@ -309,6 +317,7 @@ public void cleanup(Context context) throws IOException,

if (sim >= cosineSimThreshold) {
termWeights.remove(r);
context.getCounter(Records.DUPLICATES).increment(1);
break;
}
}
@@ -343,7 +352,7 @@ public int run(String[] args) throws Exception {
Options options = new Options();

options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path (pfor format expected, add * to retrieve files)")
.create(DOCVECTOR_OPTION));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path").create(TREC_RESULT_FILE));
@@ -430,6 +439,10 @@ public int run(String[] args) throws Exception {
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime)
/ 1000.0 + " seconds");

int numDuplicates = (int) job.getCounters().findCounter(Records.DUPLICATES).getValue();
LOG.info("Number of duplicates: "+numDuplicates);

return 0;
}
