
Commit

Document extraction app added
chauff committed Jul 26, 2013
1 parent 8b7d392 commit db38e53
Showing 5 changed files with 79 additions and 530 deletions.
48 changes: 48 additions & 0 deletions README.md
@@ -60,6 +60,54 @@ The parameters are:
+ `trecinputfile`: HDFS path to the TREC result file which is used as the starting point for filtering



De-duplication
--------------
To increase diversity, duplicate documents can be removed from the result ranking (in effect pushing lower-ranked results up the ranking).

A simple cosine-similarity approach is implemented in `DuplicateFiltering`: every document at rank `x` is compared to all non-duplicate documents at higher ranks. If its cosine similarity to any of them reaches the threshold, the document is filtered out.
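
For illustration, the filtering rule boils down to the following minimal, self-contained sketch. This is not the app's actual `MyReducer` code; the class name `DedupSketch`, its method names, and the map-based document representation (termid to weight) are assumptions made for this example:

```
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class DedupSketch {
  // Cosine similarity between two sparse term-weight vectors (termid -> weight).
  static double cosine(Map<Integer, Float> a, Map<Integer, Float> b) {
    double dot = 0, normA = 0, normB = 0;
    for (Map.Entry<Integer, Float> e : a.entrySet()) {
      Float w = b.get(e.getKey());
      if (w != null) {
        dot += e.getValue() * w;
      }
      normA += e.getValue() * e.getValue();
    }
    for (float w : b.values()) {
      normB += w * w;
    }
    return (normA == 0 || normB == 0) ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  // Walks the ranking best-first; a document is dropped if its cosine similarity
  // to any higher-ranked, already-kept document reaches the threshold.
  static List<Map<Integer, Float>> deduplicate(List<Map<Integer, Float>> ranking, double threshold) {
    List<Map<Integer, Float>> kept = new ArrayList<>();
    for (Map<Integer, Float> doc : ranking) {
      boolean duplicate = false;
      for (Map<Integer, Float> earlier : kept) {
        if (cosine(doc, earlier) >= threshold) {
          duplicate = true;
          break;
        }
      }
      if (!duplicate) {
        kept.add(doc); // lower-ranked documents implicitly move up
      }
    }
    return kept;
  }
}
```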

To run the code, call:

```
$ hadoop jar clueweb-tools-0.3-SNAPSHOT-fatjar.jar \
org.clueweb.clueweb12.app.DuplicateFiltering \
-cosineSimThreshold 0.8 \
-dictionary /data/private/clueweb12/derived/dictionary.XXX \
-docvector /data/private/clueweb12/derived/docvectors.XXX/*/part* \
-output /user/chauff/res.dir1000.porter.deduplicated \
-topk 1000 \
-trecinputfile /user/chauff/res.dir1000.porter
```

The parameters (apart from the usual ones) are:
+ `cosineSimThreshold`: documents with a cosine similarity at or above this threshold are removed from the result file
+ `trecinputfile`: file in TREC result format which is used as a starting point for de-duplication
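
For reference, a TREC result file (the format expected for `trecinputfile`) has one line per retrieved document: `qid Q0 docid rank score run_tag`. The lines below use made-up docids, scores and run tag and are purely illustrative:

```
201 Q0 clueweb12-0000tw-05-12114 1 -5.8432 example-run
201 Q0 clueweb12-0108wb-20-29661 2 -5.9011 example-run
```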


Document extraction
-------------------

A helper app: given a file with a list of docids, it extracts the corresponding documents' content from the WARC files.
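
The HTML handling is controlled by the `keephtml` parameter described below; conceptually it amounts to the following minimal sketch using Jsoup (which the app itself relies on). The class and method names here are made up for illustration:

```
import org.jsoup.Jsoup;

public class KeepHtmlSketch {
  // keepHtml = true: return the raw WARC payload unchanged (HTML source kept).
  // keepHtml = false: parse the payload and keep only the visible text.
  public static String process(String warcContent, boolean keepHtml) {
    return keepHtml ? warcContent : Jsoup.parse(warcContent).text();
  }
}
```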

To run the code, call:

```
$ hadoop jar clueweb-tools-0.3-SNAPSHOT-fatjar.jar \
org.clueweb.clueweb12.app.DocumentExtractor \
-docidsfile /user/chauff/docids \
-input /data/private/clueweb12/Disk*/*/*/*.warc.gz \
-keephtml false \
-output /user/chauff/docids-output
```

The parameters are:
+ `docidsfile`: a file with one docid per line; the documents with these docids are extracted from the WARC input files (an example is shown below)
+ `input`: list of WARC input files
+ `keephtml`: either `true` (keep the HTML source of each document) or `false` (parse the documents and remove the HTML)
+ `output`: folder where the documents' content is stored, one file per docid
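
An illustrative `docidsfile` (the docids below are made up and only show the expected one-docid-per-line layout):

```
clueweb12-0000tw-00-00013
clueweb12-0000tw-05-12114
clueweb12-0108wb-20-29661
```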


Retrieval runs
--------------
The files `runs/res.dir1000.{standard,porter}` contain the baseline results of running the above retrieval program (i.e., LM with Dirichlet smoothing and mu=1000) with `standard` and `porter` preprocessing, respectively.
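
For reference, Dirichlet smoothing scores a query term $t$ against a document $d$ with the standard estimate

```
p(t | d) = ( tf(t,d) + mu * p(t | C) ) / ( |d| + mu ),   with mu = 1000
```

where `tf(t,d)` is the frequency of `t` in `d`, `|d|` is the document length, and `p(t | C)` is the term's relative frequency in the collection.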
13 changes: 13 additions & 0 deletions src/main/java/org/clueweb/clueweb12/app/DocumentExtractor.java
@@ -54,6 +54,7 @@
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -78,6 +79,10 @@
public class DocumentExtractor extends Configured implements Tool {
private static final Logger LOG = Logger
.getLogger(DocumentExtractor.class);

private static enum Records {
DOCUMENTS_FOUND
};

private static boolean keepHTML;
private static final HashMap<String,String> docidMap = Maps.newHashMap();
@@ -121,6 +126,7 @@ public void map(LongWritable key, ClueWeb12WarcRecord doc,
content = Jsoup.parse(content).text();
}
docidMap.put(docid, content);
context.getCounter(Records.DOCUMENTS_FOUND).increment(1);

} catch (Exception e) {
// If Jsoup throws any exceptions, catch and move on.
@@ -221,18 +227,25 @@ public int run(String[] args) throws Exception {
job.setJarByClass(DocumentExtractor.class);

FileInputFormat.setInputPaths(job, input);

job.setInputFormatClass(ClueWeb12InputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);

job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(NullWritable.class);

job.setMapperClass(MyMapper.class);
job.setNumReduceTasks(0);

FileSystem.get(getConf()).delete(new Path(output), true);

long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime)
/ 1000.0 + " seconds");

int numDocsFound = (int) job.getCounters().findCounter(Records.DOCUMENTS_FOUND).getValue();
LOG.info("Number of documents found: "+numDocsFound);

return 0;
}
21 changes: 17 additions & 4 deletions src/main/java/org/clueweb/clueweb12/app/DuplicateFiltering.java
@@ -1,6 +1,7 @@
/*
* ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
@@ -45,6 +46,7 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
@@ -74,6 +76,7 @@
import org.apache.log4j.Logger;
import org.clueweb.data.PForDocVector;
import org.clueweb.data.TermStatistics;

import tl.lin.data.array.IntArrayWritable;
import tl.lin.data.pair.PairOfIntString;
import tl.lin.data.pair.PairOfInts;
@@ -84,6 +87,10 @@
public class DuplicateFiltering extends Configured implements Tool {

private static final Logger LOG = Logger.getLogger(DuplicateFiltering.class);

private static enum Records {
DUPLICATES
};

/*
* we need to emit [termid1 weight1 termid2 weight2 ...] as value in
@@ -165,13 +172,13 @@ public void setup(Context context) throws IOException {
@Override
public void map(Text key, IntArrayWritable ints, Context context)
throws IOException, InterruptedException {

// is the document of interest to us?
if (!docidResults.containsKey(key.toString())) {
return;
}

PForDocVector.fromIntArrayWritable(ints, DOC);

// tfMap of the document
HashMap<Integer, Integer> tfMap = Maps.newHashMap();
@@ -213,6 +220,7 @@ private static class MyReducer extends
private float cosineSimThreshold;
private static final NullWritable nullKey = NullWritable.get();
private static final Text valueOut = new Text();

// outer key: qid, inner key: rank, inner value: term weights of the
// document at rank for query
private static final HashMap<Integer, HashMap<Integer, HashMap<Integer, Float>>> termWeightsPerQuery = Maps
@@ -309,6 +317,7 @@ public void cleanup(Context context) throws IOException,

if (sim >= cosineSimThreshold) {
termWeights.remove(r);
context.getCounter(Records.DUPLICATES).increment(1);
break;
}
}
@@ -343,7 +352,7 @@ public int run(String[] args) throws Exception {
Options options = new Options();

options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path (pfor format expected, add * to retrieve files)")
.create(DOCVECTOR_OPTION));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path").create(TREC_RESULT_FILE));
@@ -430,6 +439,10 @@ public int run(String[] args) throws Exception {
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime)
/ 1000.0 + " seconds");

int numDuplicates = (int) job.getCounters().findCounter(Records.DUPLICATES).getValue();
LOG.info("Number of duplicates: "+numDuplicates);

return 0;
}
