Skip to content
This repository has been archived by the owner on Nov 21, 2018. It is now read-only.

Commit

Permalink
Updated filtering for near-duplicates and spam scores
Browse files Browse the repository at this point in the history
  • Loading branch information
chauff committed Jul 23, 2013
1 parent c63c320 commit 4fd0b07
Show file tree
Hide file tree
Showing 46 changed files with 5,856 additions and 5,154 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
.DS_Store
.settings
.classpath
.project
target/
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ $ hadoop jar clueweb-tools-X.X-SNAPSHOT-fatjar.jar \
-smoothing 1000 \
-output /user/chauff/res.dir1000 \
-queries /user/chauff/web.queries.trec2013 \
-vbdocvector /data/private/clueweb12/derived/docvectors.20130710/segm*/part* \
-docvector /data/private/clueweb12/derived/docvectors.20130710/segm*/part* \
-topk 1000
```

Expand All @@ -28,7 +28,7 @@ The parameters are:
+ `smoothing`: the smoothing parameter in the LM-based retrieval model; a value of <=1 automatically backs off to smoothing with linear interpolation while a value >1 runs Dirichlet smoothing (default is 1000)
+ `output`: folder in which the TREC results are collected (in TREC result file format); to merge everything into one file in the end call `hadoop fs -getmerge /user/chauff/res.dir1000 res.dir1000`; the merged result file should run smoothly through `trec_eval`
+ `queries`: HDFS path to query file (assumed format is the same as this year's distributed query file, i.e. per line [queryID]:[term1] [term2] ...)
+ `vbdocvector`: HDFS path to the document vectors created by the clueweb tools; beware of the necessity for using `*` to identify the files (instead of just the folder)
+ `docvector`: HDFS path to the document vectors created by the clueweb tools; beware of the necessity for using `*` to identify the files (instead of just the folder); PFor format expected
+ `topk`: number of results that should be returned per query (default is 1000)


Expand Down
303 changes: 303 additions & 0 deletions org.eclipse.jdt.core.prefs

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions org.eclipse.jdt.ui.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
eclipse.preferences.version=1
formatter_profile=_clueweb
formatter_settings_version=12
org.eclipse.jdt.ui.exception.name=e
org.eclipse.jdt.ui.gettersetter.use.is=true
org.eclipse.jdt.ui.keywordthis=false
org.eclipse.jdt.ui.overrideannotation=true
31 changes: 19 additions & 12 deletions src/main/java/org/clueweb/clueweb09/ClueWeb09WarcRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ public class ClueWeb09WarcRecord extends Indexable {
* @return the read line (or null if eof)
* @throws java.io.IOException
*/
private static String readLineFromInputStream(DataInputStream in) throws IOException {
private static String readLineFromInputStream(DataInputStream in)
throws IOException {
StringBuilder retString = new StringBuilder();

boolean keepReading = true;
Expand Down Expand Up @@ -170,8 +171,8 @@ private static String readLineFromInputStream(DataInputStream in) throws IOExcep
 * @return the content bytes (w/ the headerBuffer populated)
* @throws java.io.IOException
*/
private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuffer)
throws IOException {
private static byte[] readNextRecord(DataInputStream in,
StringBuffer headerBuffer) throws IOException {
if (in == null) {
return null;
}
Expand Down Expand Up @@ -202,18 +203,21 @@ private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuff
// make sure we get the content length here
int contentLength = -1;
boolean foundContentLength = false;
while (!foundContentLength && inHeader && ((line = readLineFromInputStream(in)) != null)) {
while (!foundContentLength && inHeader
&& ((line = readLineFromInputStream(in)) != null)) {
if ((line.trim().length() == 0) && foundContentLength) {
inHeader = false;
} else {
headerBuffer.append(line);
headerBuffer.append(NEWLINE);
String[] thisHeaderPieceParts = line.split(":", 2);
if (thisHeaderPieceParts.length == 2) {
if (thisHeaderPieceParts[0].toLowerCase().startsWith("content-length")) {
if (thisHeaderPieceParts[0].toLowerCase().startsWith(
"content-length")) {
foundContentLength = true;
try {
contentLength = Integer.parseInt(thisHeaderPieceParts[1].trim());
contentLength = Integer
.parseInt(thisHeaderPieceParts[1].trim());
} catch (NumberFormatException nfEx) {
contentLength = -1;
}
Expand Down Expand Up @@ -262,7 +266,8 @@ private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuff
* @return a WARC record (or null if eof)
* @throws java.io.IOException
*/
public static ClueWeb09WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
public static ClueWeb09WarcRecord readNextWarcRecord(DataInputStream in)
throws IOException {
StringBuffer recordHeader = new StringBuffer();
byte[] recordContent = readNextRecord(in, recordHeader);
if (recordContent == null) {
Expand Down Expand Up @@ -348,7 +353,8 @@ public void write(DataOutput out) throws IOException {
out.writeUTF(dateString);
out.writeUTF(recordType);
out.writeInt(metadata.size());
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
Iterator<Entry<String, String>> metadataIterator = metadata
.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
out.writeUTF(thisEntry.getKey());
Expand Down Expand Up @@ -390,7 +396,8 @@ public String toString() {
retBuffer.append("WARC-Date: " + dateString + NEWLINE);

retBuffer.append("WARC-Record-ID: " + UUID + NEWLINE);
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
Iterator<Entry<String, String>> metadataIterator = metadata
.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
retBuffer.append(thisEntry.getKey());
Expand Down Expand Up @@ -644,10 +651,10 @@ public String getContent() {
String str = getContentUTF8();
int i = str.indexOf("Content-Length:");
int j = str.indexOf("\n", i);
return str.substring(j+1);

return str.substring(j + 1);
}

public String getDisplayContentType() {
return "text/html";
}
Expand Down
174 changes: 91 additions & 83 deletions src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsNew.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,87 +41,95 @@
import org.clueweb.clueweb09.mapreduce.ClueWeb09InputFormat;

public class CountWarcRecordsNew extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(CountWarcRecordsNew.class);

private static enum Records { TOTAL, PAGES };

private static class MyMapper
extends Mapper<LongWritable, ClueWeb09WarcRecord, NullWritable, NullWritable> {
@Override
public void map(LongWritable key, ClueWeb09WarcRecord doc, Context context)
throws IOException, InterruptedException {
context.getCounter(Records.TOTAL).increment(1);

String docid = doc.getHeaderMetadataItem("WARC-TREC-ID");
if (docid != null) {
context.getCounter(Records.PAGES).increment(1);
}
}
}

public CountWarcRecordsNew() {}

public static final String INPUT_OPTION = "input";

/**
* Runs this tool.
*/
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
Options options = new Options();

options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path").create(INPUT_OPTION));

CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}

if (!cmdline.hasOption(INPUT_OPTION)) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}

String input = cmdline.getOptionValue(INPUT_OPTION);

LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
LOG.info(" - input: " + input);

Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input);
job.setJarByClass(CountWarcRecordsNew.class);
job.setNumReduceTasks(0);

FileInputFormat.addInputPaths(job, input);

job.setInputFormatClass(ClueWeb09InputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);
job.setMapperClass(MyMapper.class);

job.waitForCompletion(true);

Counters counters = job.getCounters();
int numDocs = (int) counters.findCounter(Records.PAGES).getValue();
LOG.info("Read " + numDocs + " docs.");

return 0;
}

/**
* Dispatches command-line arguments to the tool via the <code>ToolRunner</code>.
*/
public static void main(String[] args) throws Exception {
LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName() + " with args "
+ Arrays.toString(args));
ToolRunner.run(new CountWarcRecordsNew(), args);
}
private static final Logger LOG = Logger
.getLogger(CountWarcRecordsNew.class);

private static enum Records {
TOTAL, PAGES
};

private static class MyMapper
extends
Mapper<LongWritable, ClueWeb09WarcRecord, NullWritable, NullWritable> {
@Override
public void map(LongWritable key, ClueWeb09WarcRecord doc,
Context context) throws IOException, InterruptedException {
context.getCounter(Records.TOTAL).increment(1);

String docid = doc.getHeaderMetadataItem("WARC-TREC-ID");
if (docid != null) {
context.getCounter(Records.PAGES).increment(1);
}
}
}

public CountWarcRecordsNew() {
}

public static final String INPUT_OPTION = "input";

/**
* Runs this tool.
*/
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
Options options = new Options();

options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path").create(INPUT_OPTION));

CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
System.err.println("Error parsing command line: "
+ exp.getMessage());
return -1;
}

if (!cmdline.hasOption(INPUT_OPTION)) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}

String input = cmdline.getOptionValue(INPUT_OPTION);

LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
LOG.info(" - input: " + input);

Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName()
+ ":" + input);
job.setJarByClass(CountWarcRecordsNew.class);
job.setNumReduceTasks(0);

FileInputFormat.addInputPaths(job, input);

job.setInputFormatClass(ClueWeb09InputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);
job.setMapperClass(MyMapper.class);

job.waitForCompletion(true);

Counters counters = job.getCounters();
int numDocs = (int) counters.findCounter(Records.PAGES).getValue();
LOG.info("Read " + numDocs + " docs.");

return 0;
}

/**
* Dispatches command-line arguments to the tool via the
* <code>ToolRunner</code>.
*/
public static void main(String[] args) throws Exception {
LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName()
+ " with args " + Arrays.toString(args));
ToolRunner.run(new CountWarcRecordsNew(), args);
}
}
Loading

0 comments on commit 4fd0b07

Please sign in to comment.