Skip to content
This repository has been archived by the owner on Nov 21, 2018. It is now read-only.

Commit

Permalink
Updated filtering for near-duplicates and spam scores
Browse files Browse the repository at this point in the history
  • Loading branch information
chauff committed Jul 23, 2013
1 parent c63c320 commit 4fd0b07
Show file tree
Hide file tree
Showing 46 changed files with 5,856 additions and 5,154 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
.DS_Store
.settings
.classpath
.project
target/
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ $ hadoop jar clueweb-tools-X.X-SNAPSHOT-fatjar.jar \
-smoothing 1000 \
-output /user/chauff/res.dir1000 \
-queries /user/chauff/web.queries.trec2013 \
-vbdocvector /data/private/clueweb12/derived/docvectors.20130710/segm*/part* \
-docvector /data/private/clueweb12/derived/docvectors.20130710/segm*/part* \
-topk 1000
```

Expand All @@ -28,7 +28,7 @@ The parameters are:
+ `smoothing`: the smoothing parameter in the LM-based retrieval model; a value of <=1 automatically backs off to smoothing with linear interpolation while a value >1 runs Dirichlet smoothing (default is 1000)
+ `output`: folder in which the TREC results are collected (in TREC result file format); to merge everything into one file in the end call `hadoop fs -getmerge /user/chauff/res.dir1000 res.dir1000`; the merged result file should run smoothly through `trec_eval`
+ `queries`: HDFS path to query file (assumed format is the same as this year's distributed query file, i.e. per line [queryID]:[term1] [term2] ...)
+ `vbdocvector`: HDFS path to the document vectors created by the clueweb tools; beware of the necessity for using `*` to identify the files (instead of just the folder)
+ `docvector`: HDFS path to the document vectors created by the clueweb tools; beware of the necessity for using `*` to identify the files (instead of just the folder); PFor format expected
+ `topk`: number of results that should be returned per query (default is 1000)


Expand Down
303 changes: 303 additions & 0 deletions org.eclipse.jdt.core.prefs

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions org.eclipse.jdt.ui.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
eclipse.preferences.version=1
formatter_profile=_clueweb
formatter_settings_version=12
org.eclipse.jdt.ui.exception.name=e
org.eclipse.jdt.ui.gettersetter.use.is=true
org.eclipse.jdt.ui.keywordthis=false
org.eclipse.jdt.ui.overrideannotation=true
31 changes: 19 additions & 12 deletions src/main/java/org/clueweb/clueweb09/ClueWeb09WarcRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ public class ClueWeb09WarcRecord extends Indexable {
* @return the read line (or null if eof)
* @throws java.io.IOException
*/
private static String readLineFromInputStream(DataInputStream in) throws IOException {
private static String readLineFromInputStream(DataInputStream in)
throws IOException {
StringBuilder retString = new StringBuilder();

boolean keepReading = true;
Expand Down Expand Up @@ -170,8 +171,8 @@ private static String readLineFromInputStream(DataInputStream in) throws IOExcep
 * @return the content bytes (w/ the headerBuffer populated)
* @throws java.io.IOException
*/
private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuffer)
throws IOException {
private static byte[] readNextRecord(DataInputStream in,
StringBuffer headerBuffer) throws IOException {
if (in == null) {
return null;
}
Expand Down Expand Up @@ -202,18 +203,21 @@ private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuff
// make sure we get the content length here
int contentLength = -1;
boolean foundContentLength = false;
while (!foundContentLength && inHeader && ((line = readLineFromInputStream(in)) != null)) {
while (!foundContentLength && inHeader
&& ((line = readLineFromInputStream(in)) != null)) {
if ((line.trim().length() == 0) && foundContentLength) {
inHeader = false;
} else {
headerBuffer.append(line);
headerBuffer.append(NEWLINE);
String[] thisHeaderPieceParts = line.split(":", 2);
if (thisHeaderPieceParts.length == 2) {
if (thisHeaderPieceParts[0].toLowerCase().startsWith("content-length")) {
if (thisHeaderPieceParts[0].toLowerCase().startsWith(
"content-length")) {
foundContentLength = true;
try {
contentLength = Integer.parseInt(thisHeaderPieceParts[1].trim());
contentLength = Integer
.parseInt(thisHeaderPieceParts[1].trim());
} catch (NumberFormatException nfEx) {
contentLength = -1;
}
Expand Down Expand Up @@ -262,7 +266,8 @@ private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuff
* @return a WARC record (or null if eof)
* @throws java.io.IOException
*/
public static ClueWeb09WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
public static ClueWeb09WarcRecord readNextWarcRecord(DataInputStream in)
throws IOException {
StringBuffer recordHeader = new StringBuffer();
byte[] recordContent = readNextRecord(in, recordHeader);
if (recordContent == null) {
Expand Down Expand Up @@ -348,7 +353,8 @@ public void write(DataOutput out) throws IOException {
out.writeUTF(dateString);
out.writeUTF(recordType);
out.writeInt(metadata.size());
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
Iterator<Entry<String, String>> metadataIterator = metadata
.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
out.writeUTF(thisEntry.getKey());
Expand Down Expand Up @@ -390,7 +396,8 @@ public String toString() {
retBuffer.append("WARC-Date: " + dateString + NEWLINE);

retBuffer.append("WARC-Record-ID: " + UUID + NEWLINE);
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
Iterator<Entry<String, String>> metadataIterator = metadata
.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
retBuffer.append(thisEntry.getKey());
Expand Down Expand Up @@ -644,10 +651,10 @@ public String getContent() {
String str = getContentUTF8();
int i = str.indexOf("Content-Length:");
int j = str.indexOf("\n", i);
return str.substring(j+1);

return str.substring(j + 1);
}

public String getDisplayContentType() {
return "text/html";
}
Expand Down
174 changes: 91 additions & 83 deletions src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsNew.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,87 +41,95 @@
import org.clueweb.clueweb09.mapreduce.ClueWeb09InputFormat;

public class CountWarcRecordsNew extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(CountWarcRecordsNew.class);

private static enum Records { TOTAL, PAGES };

private static class MyMapper
extends Mapper<LongWritable, ClueWeb09WarcRecord, NullWritable, NullWritable> {
@Override
public void map(LongWritable key, ClueWeb09WarcRecord doc, Context context)
throws IOException, InterruptedException {
context.getCounter(Records.TOTAL).increment(1);

String docid = doc.getHeaderMetadataItem("WARC-TREC-ID");
if (docid != null) {
context.getCounter(Records.PAGES).increment(1);
}
}
}

public CountWarcRecordsNew() {}

public static final String INPUT_OPTION = "input";

/**
* Runs this tool.
*/
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
Options options = new Options();

options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path").create(INPUT_OPTION));

CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}

if (!cmdline.hasOption(INPUT_OPTION)) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}

String input = cmdline.getOptionValue(INPUT_OPTION);

LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
LOG.info(" - input: " + input);

Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input);
job.setJarByClass(CountWarcRecordsNew.class);
job.setNumReduceTasks(0);

FileInputFormat.addInputPaths(job, input);

job.setInputFormatClass(ClueWeb09InputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);
job.setMapperClass(MyMapper.class);

job.waitForCompletion(true);

Counters counters = job.getCounters();
int numDocs = (int) counters.findCounter(Records.PAGES).getValue();
LOG.info("Read " + numDocs + " docs.");

return 0;
}

/**
* Dispatches command-line arguments to the tool via the <code>ToolRunner</code>.
*/
public static void main(String[] args) throws Exception {
LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName() + " with args "
+ Arrays.toString(args));
ToolRunner.run(new CountWarcRecordsNew(), args);
}
private static final Logger LOG = Logger
.getLogger(CountWarcRecordsNew.class);

private static enum Records {
TOTAL, PAGES
};

private static class MyMapper
extends
Mapper<LongWritable, ClueWeb09WarcRecord, NullWritable, NullWritable> {
@Override
public void map(LongWritable key, ClueWeb09WarcRecord doc,
Context context) throws IOException, InterruptedException {
context.getCounter(Records.TOTAL).increment(1);

String docid = doc.getHeaderMetadataItem("WARC-TREC-ID");
if (docid != null) {
context.getCounter(Records.PAGES).increment(1);
}
}
}

public CountWarcRecordsNew() {
}

public static final String INPUT_OPTION = "input";

/**
* Runs this tool.
*/
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
Options options = new Options();

options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("input path").create(INPUT_OPTION));

CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
System.err.println("Error parsing command line: "
+ exp.getMessage());
return -1;
}

if (!cmdline.hasOption(INPUT_OPTION)) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}

String input = cmdline.getOptionValue(INPUT_OPTION);

LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
LOG.info(" - input: " + input);

Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName()
+ ":" + input);
job.setJarByClass(CountWarcRecordsNew.class);
job.setNumReduceTasks(0);

FileInputFormat.addInputPaths(job, input);

job.setInputFormatClass(ClueWeb09InputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);
job.setMapperClass(MyMapper.class);

job.waitForCompletion(true);

Counters counters = job.getCounters();
int numDocs = (int) counters.findCounter(Records.PAGES).getValue();
LOG.info("Read " + numDocs + " docs.");

return 0;
}

/**
* Dispatches command-line arguments to the tool via the
* <code>ToolRunner</code>.
*/
public static void main(String[] args) throws Exception {
LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName()
+ " with args " + Arrays.toString(args));
ToolRunner.run(new CountWarcRecordsNew(), args);
}
}
Loading

0 comments on commit 4fd0b07

Please sign in to comment.