diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d081115
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+/bin
+/target
+/.settings
+/.classpath
+/.DS_Store
+/.project
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..f2ae202
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,18 @@
+
+ boilerpipe
+
+ Copyright (c) 2009-2011 Christian Kohlschütter
+
+ The author licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ 
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f84a305
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+boilerpipe
+======
+
+Repackaging of [boilerpipe](https://code.google.com/p/boilerpipe/) published on Maven Central Repository.
+
+Overview
+--------
+This is a repackaging of the latest sources of [boilerpipe](https://code.google.com/p/boilerpipe/) with some improvements:
+
+  * Published on Maven Central Repository
+  * Media extraction (YouTube videos, Vimeo videos and images) within an article, from the [Netbreeze-GmbH fork](https://github.com/Netbreeze-GmbH/boilerpipe)
+
+Getting started
+-----
+
+The best way to start is to look at [boilerpipe QuickStart](https://code.google.com/p/boilerpipe/wiki/QuickStart)
+
+### Including the SDK in your project
+
+Simply add a new dependency to your `pom.xml`:
+
+```xml
+    <dependency>
+        <groupId>com.syncthemall</groupId>
+        <artifactId>boilerpipe</artifactId>
+        <version>1.2.1</version>
+    </dependency>
+```
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..5c92a19
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,147 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>com.syncthemall</groupId>
+	<artifactId>boilerpipe</artifactId>
+	<version>1.2.1</version>
+	<packaging>jar</packaging>
+	<name>boilerpipe</name>
+	<description>Repackaging of boilerpipe with media extraction improvements, published on Maven Central Repository.</description>
+	<url>https://github.com/vanduynslagerp/boilerpipe</url>
+	<licenses>
+		<license>
+			<name>Apache License 2.0</name>
+			<distribution>repo</distribution>
+			<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+		</license>
+	</licenses>
+	<scm>
+		<connection>scm:git:git@github.com:vanduynslagerp/boilerpipe.git</connection>
+		<developerConnection>scm:git:git@github.com:vanduynslagerp/boilerpipe.git</developerConnection>
+		<url>https://github.com/vanduynslagerp/boilerpipe</url>
+	</scm>
+	<developers>
+		<developer>
+			<id>1</id>
+			<name>Christian Kohlschütter</name>
+			<url>http://www.kohlschutter.com</url>
+			<roles>
+				<role>project initiator</role>
+			</roles>
+		</developer>
+		<developer>
+			<id>2</id>
+			<name>Manuel Codiga</name>
+			<email>manuel.codiga@gmail.com</email>
+			<roles>
+				<role>contributor</role>
+			</roles>
+		</developer>
+	</developers>
+	<dependencies>
+		<dependency>
+			<groupId>net.sourceforge.nekohtml</groupId>
+			<artifactId>nekohtml</artifactId>
+			<version>1.9.18</version>
+		</dependency>
+		<dependency>
+			<groupId>xerces</groupId>
+			<artifactId>xercesImpl</artifactId>
+			<version>2.11.0</version>
+		</dependency>
+	</dependencies>
+	<build>
+		<defaultGoal>package</defaultGoal>
+		<finalName>${project.artifactId}-${project.version}</finalName>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-jar-plugin</artifactId>
+				<version>2.4</version>
+				<configuration>
+					<archive>
+						<manifest>
+							<addClasspath>true</addClasspath>
+							<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
+							<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
+						</manifest>
+					</archive>
+				</configuration>
+			</plugin>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-resources-plugin</artifactId>
+				<version>2.6</version>
+				<configuration>
+					<encoding>${project.build.sourceEncoding}</encoding>
+				</configuration>
+			</plugin>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>3.1</version>
+				<configuration>
+					<encoding>${project.build.sourceEncoding}</encoding>
+					<source>${source.version}</source>
+					<target>${source.version}</target>
+				</configuration>
+			</plugin>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-source-plugin</artifactId>
+				<version>2.2.1</version>
+				<executions>
+					<execution>
+						<id>attach-sources</id>
+						<goals>
+							<goal>jar</goal>
+						</goals>
+					</execution>
+				</executions>
+			</plugin>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-release-plugin</artifactId>
+				<version>2.4.1</version>
+			</plugin>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-javadoc-plugin</artifactId>
+				<version>2.9</version>
+				<executions>
+					<execution>
+						<id>attach-javadocs</id>
+						<goals>
+							<goal>jar</goal>
+						</goals>
+					</execution>
+				</executions>
+			</plugin>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-gpg-plugin</artifactId>
+				<version>1.4</version>
+				<executions>
+					<execution>
+						<id>sign-artifacts</id>
+						<phase>verify</phase>
+						<goals>
+							<goal>sign</goal>
+						</goals>
+					</execution>
+				</executions>
+			</plugin>
+		</plugins>
+	</build>
+	<distributionManagement>
+		<repository>
+			<id>nexus-releases</id>
+			<name>Nexus Release Repository</name>
+			<url>http://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
+		</repository>
+	</distributionManagement>
+	<properties>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+		<source.version>1.6</source.version>
+	</properties>
+</project>
\ No newline at end of file
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..9bb88d3
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1 @@
+/.DS_Store
diff --git a/src/main/.gitignore b/src/main/.gitignore
new file mode 100644
index 0000000..9bb88d3
--- /dev/null
+++ b/src/main/.gitignore
@@ -0,0 +1 @@
+/.DS_Store
diff --git a/src/main/java/.gitignore b/src/main/java/.gitignore
new file mode 100644
index 0000000..9bb88d3
--- /dev/null
+++ b/src/main/java/.gitignore
@@ -0,0 +1 @@
+/.DS_Store
diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java
new file mode 100644
index 0000000..febbe96
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java
@@ -0,0 +1,10 @@
+package de.l3s.boilerpipe;
+
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * A source that can be represented as a {@link TextDocument}; call {@link #toTextDocument()} to obtain it.
+ */
+public interface BoilerpipeDocumentSource {
+    TextDocument toTextDocument() throws BoilerpipeProcessingException;
+}
diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeExtractor.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeExtractor.java
new file mode 100644
index 0000000..fcc8aab
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeExtractor.java
@@ -0,0 +1,58 @@
+package de.l3s.boilerpipe;
+
+import java.io.Reader;
+
+import org.xml.sax.InputSource;
+
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Describes a complete filter pipeline that can also extract text from HTML input.
+ * 
+ * @author Christian Kohlschütter
+ */
+public interface BoilerpipeExtractor extends BoilerpipeFilter {
+    /**
+     * Extracts text from the HTML code given as a String.
+     * 
+     * @param html
+     *            The HTML code as a String.
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException if the processing pipeline fails.
+     */
+    public String getText(final String html)
+            throws BoilerpipeProcessingException;
+
+    /**
+     * Extracts text from the HTML code available from the given
+     * {@link InputSource}.
+     * 
+     * @param is
+     *            The InputSource containing the HTML
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException if the processing pipeline fails.
+     */
+    public String getText(final InputSource is)
+            throws BoilerpipeProcessingException;
+
+    /**
+     * Extracts text from the HTML code available from the given {@link Reader}.
+     * 
+     * @param r
+     *            The Reader containing the HTML
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException if the processing pipeline fails.
+     */
+    public String getText(final Reader r) throws BoilerpipeProcessingException;
+
+    /**
+     * Extracts text from the given {@link TextDocument} object.
+     * 
+     * @param doc
+     *            The {@link TextDocument}.
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException if the processing pipeline fails.
+     */
+    public String getText(TextDocument doc)
+            throws BoilerpipeProcessingException;
+}
diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeFilter.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeFilter.java
new file mode 100644
index 0000000..8a15f77
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeFilter.java
@@ -0,0 +1,40 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe;
+
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * A generic filter: takes a {@link TextDocument} and processes it, reporting
+ * whether the document was changed.
+ * 
+ * @author Christian Kohlschütter
+ */
+public interface BoilerpipeFilter {
+    /**
+     * Processes the given document <code>doc</code>.
+     * 
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if changes have been made to the
+     *         {@link TextDocument}.
+     * @throws BoilerpipeProcessingException if the processing fails.
+     */
+    boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException;
+}
diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java
new file mode 100644
index 0000000..bcb603d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java
@@ -0,0 +1,35 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe;
+
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * A source that returns {@link TextDocument}s.
+ * 
+ * @author Christian Kohlschütter
+ */
+public interface BoilerpipeInput {
+    /**
+     * Returns a {@link TextDocument} provided by this source.
+     * 
+     * @return  A {@link TextDocument}.
+     * @throws BoilerpipeProcessingException if the processing fails.
+     */
+    TextDocument getTextDocument() throws BoilerpipeProcessingException;
+}
diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java
new file mode 100644
index 0000000..f3a9cc4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java
@@ -0,0 +1,43 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe;
+
+/**
+ * Checked exception that signals a failure in the processing pipeline.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class BoilerpipeProcessingException extends Exception {
+    private static final long serialVersionUID = 1L;
+
+    /** Creates an exception with neither a detail message nor a cause. */
+    public BoilerpipeProcessingException() {
+    }
+    /** Creates an exception with the given detail message. */
+    public BoilerpipeProcessingException(String message) {
+        super(message);
+    }
+    /** Creates an exception that wraps the given cause. */
+    public BoilerpipeProcessingException(Throwable cause) {
+        super(cause);
+    }
+    /** Creates an exception with the given detail message and cause. */
+    public BoilerpipeProcessingException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/conditions/TextBlockCondition.java b/src/main/java/de/l3s/boilerpipe/conditions/TextBlockCondition.java
new file mode 100644
index 0000000..df92f10
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/conditions/TextBlockCondition.java
@@ -0,0 +1,37 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.conditions;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.labels.ConditionalLabelAction;
+
+/**
+ * Evaluates whether a given {@link TextBlock} meets a certain condition.
+ * Useful in combination with {@link ConditionalLabelAction}.
+ * 
+ * @author Christian Kohlschuetter
+ */
+public interface TextBlockCondition {
+    /**
+     * Returns <code>true</code> iff the given {@link TextBlock} tb meets the defined condition.
+     * 
+     * @param tb The {@link TextBlock} to evaluate.
+     * @return <code>true</code> iff the condition is met.
+     */
+    boolean meetsCondition(final TextBlock tb);
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/Image.java b/src/main/java/de/l3s/boilerpipe/document/Image.java
new file mode 100644
index 0000000..91abc66
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/Image.java
@@ -0,0 +1,97 @@
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents an Image resource that is contained in the document.
+ * 
+ * Any of the attributes may be null, except for "src".
+ *  
+ * @author Christian Kohlschuetter
+ */
+public class Image extends Media implements Comparable<Image> {
+	private final String src;
+	private final String width;
+	private final String height;
+	private final String alt;
+	private final int area;
+
+	/**
+	 * Creates an image from raw attribute values; width, height and alt are trimmed (blank becomes null).
+	 * @throws NullPointerException if <code>src</code> is <code>null</code>
+	 */
+	public Image(final String src, final String width, final String height, final String alt) {
+		if(src == null) {
+			throw new NullPointerException("src attribute must not be null");
+		}
+		this.src = src;
+		this.width = nullTrim(width);
+		this.height = nullTrim(height);
+		this.alt = nullTrim(alt);
+		this.area = computeArea(this.width, this.height); // trimmed values, so " 100 " still parses
+	}
+
+	/** Returns w * h, or -1 if a value is missing, is not an integer, or the product does not fit in an int. */
+	private static int computeArea(final String w, final String h) {
+		if(w == null || h == null) {
+			return -1;
+		}
+		try {
+			final long product = (long)Integer.parseInt(w) * Integer.parseInt(h);
+			return product == (int)product ? (int)product : -1;
+		} catch(NumberFormatException e) {
+			return -1;
+		}
+	}
+	
+	/**
+	 * Returns the src attribute of the image tag in the HTML source.
+	 * It is not always an absolute path!
+	 * 
+	 * @return the src attribute of the image
+	 */
+	public String getSrc() {
+		return src;
+	}
+
+	public String getWidth() {
+		return width;
+	}
+
+	public String getHeight() {
+		return height;
+	}
+
+	public String getAlt() {
+		return alt;
+	}
+	
+	private static String nullTrim(String s) { // null and blank input both yield null
+		if(s == null) {
+			return null;
+		}
+		final String trimmed = s.trim();
+		return trimmed.length() == 0 ? null : trimmed;
+	}
+	
+	/**
+	 * Returns the image's area (width * height), or -1 if width/height weren't both specified, could not be parsed, or the product exceeds the int range.
+	 * 
+	 * @return the image's area
+	 */
+	public int getArea() {
+		return area;
+	}
+	
+	public String toString() {
+		return src+"\twidth="+width+"\theight="+height+"\talt="+alt+"\tarea="+area;
+	}
+
+	public int compareTo(Image o) { // larger area sorts first; equal areas are ordered by src
+		if(o == this) {
+			return 0;
+		}
+		if(area != o.area) {
+			return area > o.area ? -1 : 1;
+		}
+		return src.compareTo(o.src);
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/Media.java b/src/main/java/de/l3s/boilerpipe/document/Media.java
new file mode 100644
index 0000000..8923b24
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/Media.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *       
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Abstract base class for media elements of a document; {@link Image} is one such element.
+ * 
+ * @author manuel.codiga@gmail.com
+ *
+ */
+public abstract class Media {
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/TextBlock.java b/src/main/java/de/l3s/boilerpipe/document/TextBlock.java
new file mode 100644
index 0000000..f7e59ac
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/TextBlock.java
@@ -0,0 +1,286 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.document;
+
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.Set;
+
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Describes a block of text.
+ * 
+ * A block can be an "atomic" text element (i.e., a sequence of text that is not
+ * interrupted by any HTML markup) or a compound of such atomic elements.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class TextBlock implements Cloneable {
+    boolean isContent = false;
+    private CharSequence text;
+    Set<String> labels = null;
+
+    int offsetBlocksStart;
+    int offsetBlocksEnd;
+
+    int numWords;
+    int numWordsInAnchorText;
+    int numWordsInWrappedLines;
+    int numWrappedLines;
+    float textDensity;
+    float linkDensity;
+
+    BitSet containedTextElements;
+
+    private int numFullTextWords = 0;
+	private int tagLevel;
+
+    private static final BitSet EMPTY_BITSET = new BitSet();
+    public static final TextBlock EMPTY_START = new TextBlock("", EMPTY_BITSET,
+            0, 0, 0, 0, -1);
+    public static final TextBlock EMPTY_END = new TextBlock("", EMPTY_BITSET,
+            0, 0, 0, 0, Integer.MAX_VALUE);
+    /** Creates a block with the given text, a <code>null</code> containedTextElements BitSet and all counters zero. */
+    public TextBlock(final String text) {
+        this(text, null, 0,0,0,0,0);
+    }
+    
+    public TextBlock(final String text, final BitSet containedTextElements,
+            final int numWords, final int numWordsInAnchorText,
+            final int numWordsInWrappedLines, final int numWrappedLines,
+            final int offsetBlocks) {
+        this.text = text;
+        this.containedTextElements = containedTextElements;
+        this.numWords = numWords;
+        this.numWordsInAnchorText = numWordsInAnchorText;
+        this.numWordsInWrappedLines = numWordsInWrappedLines;
+        this.numWrappedLines = numWrappedLines;
+        this.offsetBlocksStart = offsetBlocks;
+        this.offsetBlocksEnd = offsetBlocks;
+        initDensities();
+    }
+
+    public boolean isContent() {
+        return isContent;
+    }
+
+    public boolean setIsContent(boolean isContent) {
+        if (isContent != this.isContent) {
+            this.isContent = isContent;
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    public String getText() {
+        return text.toString();
+    }
+
+    public int getNumWords() {
+        return numWords;
+    }
+
+    public int getNumWordsInAnchorText() {
+        return numWordsInAnchorText;
+    }
+
+    public float getTextDensity() {
+        return textDensity;
+    }
+
+    public float getLinkDensity() {
+        return linkDensity;
+    }
+    /** Appends <code>other</code> to this block (texts joined by a newline) and combines counters, offsets, labels and contained text elements. */
+    public void mergeNext(final TextBlock other) {
+        if (!(text instanceof StringBuilder)) {
+            text = new StringBuilder(text);
+        }
+        StringBuilder sb = (StringBuilder) text;
+        sb.append('\n');
+        sb.append(other.text);
+
+        numWords += other.numWords;
+        numWordsInAnchorText += other.numWordsInAnchorText;
+
+        numWordsInWrappedLines += other.numWordsInWrappedLines;
+        numWrappedLines += other.numWrappedLines;
+
+        offsetBlocksStart = Math
+                .min(offsetBlocksStart, other.offsetBlocksStart);
+        offsetBlocksEnd = Math.max(offsetBlocksEnd, other.offsetBlocksEnd);
+
+        initDensities();
+
+        this.isContent |= other.isContent;
+        // Either side may lack a BitSet (TextBlock(String) passes null); there is nothing to merge from a null one.
+        if(containedTextElements == null && other.containedTextElements != null) {
+        	containedTextElements = (BitSet)other.containedTextElements.clone();
+        } else if(other.containedTextElements != null) {
+        	containedTextElements.or(other.containedTextElements);
+        }
+
+        numFullTextWords += other.numFullTextWords;
+
+        if (other.labels != null) {
+            if (labels == null) {
+                labels = new HashSet<String>(other.labels);
+            } else {
+                labels.addAll(other.labels);
+            }
+        }
+        
+        tagLevel = Math.min(tagLevel, other.tagLevel);
+    }
+
+    private void initDensities() {
+        if (numWordsInWrappedLines == 0) { // no wrapped-line counts: treat the block as a single line
+            numWordsInWrappedLines = numWords;
+            numWrappedLines = 1;
+        }
+        textDensity = numWordsInWrappedLines / (float) numWrappedLines;
+        linkDensity = numWords == 0 ? 0 : numWordsInAnchorText / (float) numWords;
+    }
+
+    public int getOffsetBlocksStart() {
+        return offsetBlocksStart;
+    }
+    public int getOffsetBlocksEnd() {
+        return offsetBlocksEnd;
+    }
+
+    public String toString() {
+        return "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl="+tagLevel+"; nw="+numWords+";nwl="+numWrappedLines+";ld="+linkDensity+"]\t"
+                + (isContent?"CONTENT":"boilerplate") + "," + labels + "\n" + getText();
+    }
+
+    /**
+     * Adds an arbitrary String label to this {@link TextBlock}.
+     * 
+     * @param label The label
+     * @see DefaultLabels
+     */
+    public void addLabel(final String label) {
+        if (labels == null) {
+            labels = new HashSet<String>(2);
+        }
+        labels.add(label);
+    }
+
+    /**
+     * Checks whether this TextBlock has the given label.
+     * 
+     * @param label The label
+     * @return <code>true</code> if this block is marked by the given label.
+     */
+    public boolean hasLabel(final String label) {
+        return labels != null && labels.contains(label);
+    }
+    
+    public boolean removeLabel(final String label) {
+    	return labels != null && labels.remove(label);
+    }
+    
+    /**
+     * Returns the labels associated to this TextBlock, or <code>null</code> if no such labels
+     * exist.
+     * 
+     * NOTE: The returned instance is the one used directly in TextBlock. You have full access
+     * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock}
+     * whenever possible.
+     * 
+     * @return Returns the set of labels, or <code>null</code> if no label was added yet.
+     */
+    public Set<String> getLabels() {
+        return labels;
+    }
+    
+    /**
+     * Adds a set of labels to this {@link TextBlock}.
+     * <code>null</code>-references are silently ignored.
+     * 
+     * @param l The labels to be added. 
+     */
+    public void addLabels(final Set<String> l) {
+        if(l == null) {
+            return;
+        }
+        if(this.labels == null) {
+            this.labels = new HashSet<String>(l);
+        } else {
+            this.labels.addAll(l);
+        }
+    }
+    
+    /**
+     * Adds the given labels to this {@link TextBlock}.
+     * A <code>null</code> array is silently ignored.
+     * 
+     * @param l The labels to be added. 
+     */
+    public void addLabels(final String... l) {
+        if(l == null) {
+            return;
+        }
+        if(this.labels == null) {
+            this.labels = new HashSet<String>();
+        }
+        for(final String label : l) {
+            this.labels.add(label);
+        }
+    }
+
+    /**
+     * Returns the containedTextElements BitSet, or <code>null</code>.
+     * @return the containedTextElements BitSet, or <code>null</code>.
+     */
+    public BitSet getContainedTextElements() {
+        return containedTextElements;
+    }
+	/** Creates a copy whose mutable text, label set and BitSet are independent of this block's. */
+	@Override
+	protected TextBlock clone() {
+		final TextBlock clone;
+		try {
+			clone = (TextBlock)super.clone();
+		} catch (CloneNotSupportedException e) {
+			throw new RuntimeException(e);
+		}
+		if(text != null && !(text instanceof String)) {
+			clone.text = new StringBuilder(text);
+		}
+		if(labels != null) { // copy even an empty set, otherwise original and clone would share it
+			clone.labels = new HashSet<String>(labels);
+		}
+		if(containedTextElements != null) {
+			clone.containedTextElements = (BitSet)containedTextElements.clone();
+		}
+		
+		return clone;
+	}
+
+	public int getTagLevel() {
+		return tagLevel;
+	}
+
+	public void setTagLevel(int tagLevel) {
+		this.tagLevel = tagLevel;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/TextDocument.java b/src/main/java/de/l3s/boilerpipe/document/TextDocument.java
new file mode 100644
index 0000000..5ea893c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/TextDocument.java
@@ -0,0 +1,141 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.document;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A text document, consisting of one or more {@link TextBlock}s.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class TextDocument implements Cloneable {
+    /** The blocks of this document, as passed to the constructor. */
+    final List<TextBlock> textBlocks;
+    /** The "main" title; may be <code>null</code>. */
+    String title;
+
+    /**
+     * Creates a new {@link TextDocument} from the given {@link TextBlock}s, without a
+     * title.
+     * 
+     * @param textBlocks
+     *            The text blocks of this document.
+     */
+    public TextDocument(final List<TextBlock> textBlocks) {
+        this(null, textBlocks);
+    }
+
+    /**
+     * Creates a new {@link TextDocument} from the given {@link TextBlock}s and
+     * the given title.
+     * 
+     * @param title
+     *            The "main" title for this text document.
+     * @param textBlocks
+     *            The text blocks of this document.
+     */
+    public TextDocument(final String title, final List<TextBlock> textBlocks) {
+        this.textBlocks = textBlocks;
+        this.title = title;
+    }
+
+    /**
+     * Returns the {@link TextBlock}s of this document.
+     * 
+     * @return A list of {@link TextBlock}s, in sequential order of appearance.
+     */
+    public List<TextBlock> getTextBlocks() {
+        return textBlocks;
+    }
+
+    /**
+     * Returns the "main" title of this document, or <code>null</code> if no
+     * such title has been set.
+     * 
+     * @return The "main" title.
+     */
+    public String getTitle() {
+        return title;
+    }
+
+    /**
+     * Updates the "main" title of this document.
+     * 
+     * @param title The new title.
+     */
+    public void setTitle(final String title) {
+        this.title = title;
+    }
+
+    /**
+     * Returns the text of all blocks that are marked as content.
+     * 
+     * @return The content text.
+     */
+    public String getContent() {
+        return getText(true, false);
+    }
+
+    /**
+     * Returns the {@link TextDocument}'s content, non-content or both.
+     * 
+     * @param includeContent Whether to include TextBlocks marked as "content".
+     * @param includeNonContent Whether to include TextBlocks marked as "non-content".
+     * @return The text of the selected blocks, each followed by a newline.
+     */
+    public String getText(boolean includeContent, boolean includeNonContent) {
+        final StringBuilder out = new StringBuilder();
+        for (final TextBlock tb : getTextBlocks()) {
+            final boolean selected = tb.isContent() ? includeContent : includeNonContent;
+            if (selected) {
+                out.append(tb.getText()).append('\n');
+            }
+        }
+        return out.toString();
+    }
+
+    /**
+     * Returns detailed debugging information about the contained {@link TextBlock}s.
+     *  
+     * @return Debug information.
+     */
+    public String debugString() {
+        final StringBuilder out = new StringBuilder();
+        for (final TextBlock tb : getTextBlocks()) {
+            out.append(tb.toString()).append('\n');
+        }
+        return out.toString();
+    }
+
+    /**
+     * Creates a copy of this document with the same title and a new list holding
+     * clones of all {@link TextBlock}s.
+     * 
+     * @return The copy.
+     */
+    @Override
+    public TextDocument clone() {
+        final List<TextBlock> copies = new ArrayList<TextBlock>(textBlocks.size());
+        for (final TextBlock tb : textBlocks) {
+            copies.add(tb.clone());
+        }
+        return new TextDocument(title, copies);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java b/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java
new file mode 100644
index 0000000..51abe73
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java
@@ -0,0 +1,64 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.document;
+
+/**
+ * Provides shallow statistics (word and block counts) on a given TextDocument.
+ * 
+ * @author Christian Kohlschuetter
+ */
+public final class TextDocumentStatistics {
+    private final int numWords;
+    private final int numBlocks;
+
+    /**
+     * Computes statistics on a given {@link TextDocument}.
+     * @param doc The {@link TextDocument}.
+     * @param contentOnly if true, only blocks marked as content are counted; otherwise all blocks.
+     */
+    public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) {
+        int words = 0;
+        int blocks = 0;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (!contentOnly || block.isContent()) {
+                words += block.getNumWords();
+                blocks++;
+            }
+        }
+        this.numWords = words;
+        this.numBlocks = blocks;
+    }
+
+    /**
+     * Returns the average number of words at block-level (= overall number of words divided by
+     * the number of blocks). The division is done in float arithmetic, so the result is NaN
+     * when no block was counted.
+     * @return Average
+     */
+    public float avgNumWords() {
+        return (float) numWords / numBlocks;
+    }
+
+    /**
+     * Returns the overall number of words in all counted blocks.
+     * @return Sum
+     */
+    public int getNumWords() {
+        return numWords;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/Video.java b/src/main/java/de/l3s/boilerpipe/document/Video.java
new file mode 100644
index 0000000..3c6fa31
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/Video.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *       
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents a video resource which is contained in the document.
+ *
+ * @author Manuel Codiga
+ */
+public class Video extends Media {
+        private final String originUrl;
+        private final String embedUrl;
+
+        /** Creates a video; throws NullPointerException if embedUrl is null, url is stored as given. */
+        public Video(final String url, final String embedUrl) {
+                if (embedUrl == null) {
+                        throw new NullPointerException("embedUrl attribute must not be null");
+                }
+                this.originUrl = url;
+                this.embedUrl = embedUrl;
+        }
+        /** Returns the url value that was passed to the constructor (may be null). */
+        public String getOriginUrl() {
+                return this.originUrl;
+        }
+        /** Returns the embed URL; the constructor guarantees it is not null. */
+        public String getEmbedUrl() {
+                return this.embedUrl;
+        }
+        /** Returns "url: " followed by the origin URL; the embed URL is not part of the string. */
+        public String toString() {
+                return "url: " + this.originUrl;
+        }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java b/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java
new file mode 100644
index 0000000..3bada83
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *       
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents a Vimeo video resource that is contained in the document.
+ *
+ * @author Manuel Codiga
+ */
+public class VimeoVideo extends Video {
+
+	/** Passes the origin URL and the embed URL straight through to the {@link Video} constructor. */
+	public VimeoVideo(final String originUrl, final String embedUrl) {
+		super(originUrl, embedUrl);
+	}
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java b/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java
new file mode 100644
index 0000000..1f80744
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *       
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents a Youtube video resource that is contained in the document.
+ *
+ * @author Manuel Codiga
+ */
+public class YoutubeVideo extends Video {
+
+	/** Passes the origin URL and the embed URL straight through to the {@link Video} constructor. */
+	public YoutubeVideo(final String originUrl, final String embedUrl) {
+		super(originUrl, embedUrl);
+	}
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/package.html b/src/main/java/de/l3s/boilerpipe/document/package.html
new file mode 100644
index 0000000..b80903d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+	<p>The classes in this package represent the simple Boilerpipe
+		document model.</p>
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java b/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java
new file mode 100644
index 0000000..1fea4ca
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java
@@ -0,0 +1,62 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.estimators;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.document.TextDocumentStatistics;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+
+/**
+ * Estimates the "goodness" of a {@link BoilerpipeExtractor} on a given document.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class SimpleEstimator {
+
+    /** The singleton instance of {@link SimpleEstimator}. */
+    public static final SimpleEstimator INSTANCE = new SimpleEstimator();
+
+    /** Below this many words before extraction, the quality is regarded as low. */
+    private static final int MIN_WORDS_BEFORE = 90;
+    /** Below this many words after extraction, the quality is regarded as low. */
+    private static final int MIN_WORDS_AFTER = 70;
+    /** Below this average number of words per block after extraction, the quality is regarded as low. */
+    private static final int MIN_AVG_WORDS_AFTER = 25;
+
+    private SimpleEstimator() {
+        // singleton; use INSTANCE
+    }
+
+    /**
+     * Given the statistics of the document before and after applying the {@link BoilerpipeExtractor},
+     * can we regard the extraction quality (too) low?
+     * 
+     * Works well with {@link DefaultExtractor}, {@link ArticleExtractor} and others.
+     * 
+     * @param dsBefore statistics of the document before applying the extractor
+     * @param dsAfter statistics of the document after applying the extractor
+     * @return true if low quality is to be expected. 
+     */
+    public boolean isLowQuality(final TextDocumentStatistics dsBefore, final TextDocumentStatistics dsAfter) {
+        return dsBefore.getNumWords() < MIN_WORDS_BEFORE
+                || dsAfter.getNumWords() < MIN_WORDS_AFTER
+                || dsAfter.avgNumWords() < MIN_AVG_WORDS_AFTER;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java
new file mode 100644
index 0000000..9013c3f
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java
@@ -0,0 +1,68 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.IgnoreBlocksAfterContentFilter;
+import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier;
+import de.l3s.boilerpipe.filters.english.TerminatingBlocksFinder;
+import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion;
+import de.l3s.boilerpipe.filters.heuristics.DocumentTitleMatchClassifier;
+import de.l3s.boilerpipe.filters.heuristics.ExpandTitleToContentFilter;
+import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
+import de.l3s.boilerpipe.filters.heuristics.LargeBlockSameTagLevelToContentFilter;
+import de.l3s.boilerpipe.filters.heuristics.ListAtEndFilter;
+import de.l3s.boilerpipe.filters.heuristics.TrailingHeadlineToBoilerplateFilter;
+import de.l3s.boilerpipe.filters.simple.BoilerplateBlockFilter;
+
+/**
+ * A full-text extractor which is tuned towards news articles. In this scenario
+ * it achieves higher accuracy than {@link DefaultExtractor}.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class ArticleExtractor extends ExtractorBase {
+    public static final ArticleExtractor INSTANCE = new ArticleExtractor();
+
+    /**
+     * Returns the singleton instance for {@link ArticleExtractor}.
+     */
+    public static ArticleExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /** Runs the article filter chain on doc; returns true if any filter's process call returned true. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        // Every filter runs, in this order, whatever the earlier ones returned.
+        boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
+        changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
+        changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
+        changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
+        changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
+        changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
+        changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
+        changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
+        changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
+        changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
+        changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
+        changed |= ListAtEndFilter.INSTANCE.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java
new file mode 100644
index 0000000..5b95e31
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java
@@ -0,0 +1,49 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.simple.MinClauseWordsFilter;
+import de.l3s.boilerpipe.filters.simple.SplitParagraphBlocksFilter;
+
+/**
+ * A full-text extractor which is tuned towards extracting sentences from news articles.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class ArticleSentencesExtractor extends ExtractorBase {
+    public static final ArticleSentencesExtractor INSTANCE = new ArticleSentencesExtractor();
+
+    /**
+     * Returns the singleton instance for {@link ArticleSentencesExtractor}.
+     */
+    public static ArticleSentencesExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /** Runs {@link ArticleExtractor}, then {@link SplitParagraphBlocksFilter}, then {@link MinClauseWordsFilter}. */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        // All three steps always run, in this order; the result is true if any returned true.
+        boolean changed = ArticleExtractor.INSTANCE.process(doc);
+        changed |= SplitParagraphBlocksFilter.INSTANCE.process(doc);
+        changed |= MinClauseWordsFilter.INSTANCE.process(doc);
+        return changed;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java
new file mode 100644
index 0000000..db970e0
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java
@@ -0,0 +1,106 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.estimators.SimpleEstimator;
+
+/**
+ * A full-text extractor trained on
+ * <a href="http://krdwrd.org/">krdwrd</a>
+ * <a href="https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf">Canola</a>.
+ * Works well with {@link SimpleEstimator}, too.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class CanolaExtractor extends ExtractorBase {
+	public static final CanolaExtractor INSTANCE = new CanolaExtractor();
+
+	/** Returns the singleton instance for {@link CanolaExtractor}. */
+	public static CanolaExtractor getInstance() {
+		return INSTANCE;
+	}
+
+	/** Delegates to {@link #CLASSIFIER} and returns its result. */
+	public boolean process(TextDocument doc)
+			throws BoilerpipeProcessingException {
+		return CLASSIFIER.process(doc);
+	}
+
+	/**
+	 * The actual classifier, exposed. It calls classify once per block, in list
+	 * order, and returns true if any of those calls returned true.
+	 */
+	public static final BoilerpipeFilter CLASSIFIER = new BoilerpipeFilter() {
+		public boolean process(TextDocument doc)
+				throws BoilerpipeProcessingException {
+			List<TextBlock> textBlocks = doc.getTextBlocks();
+			boolean hasChanges = false;
+
+			ListIterator<TextBlock> it = textBlocks.listIterator();
+			if (!it.hasNext()) {
+				return false;
+			}
+			// First block: TextBlock.EMPTY_START stands in for each missing neighbour.
+			TextBlock prevBlock = TextBlock.EMPTY_START;
+			TextBlock currentBlock = it.next();
+			TextBlock nextBlock = it.hasNext() ? it.next()
+					: TextBlock.EMPTY_START;
+
+			hasChanges = classify(prevBlock, currentBlock, nextBlock)
+					| hasChanges;
+			// Only continue if a second block was read above.
+			if (nextBlock != TextBlock.EMPTY_START) {
+				while (it.hasNext()) {
+					prevBlock = currentBlock;
+					currentBlock = nextBlock;
+					nextBlock = it.next();
+					hasChanges = classify(prevBlock, currentBlock, nextBlock)
+							| hasChanges;
+				}
+				// Last block: again EMPTY_START stands in for the missing successor.
+				prevBlock = currentBlock;
+				currentBlock = nextBlock;
+				nextBlock = TextBlock.EMPTY_START;
+				hasChanges = classify(prevBlock, currentBlock, nextBlock)
+						| hasChanges;
+			}
+
+			return hasChanges;
+		}
+
+		/** Sets curr's content flag from the word counts and link densities of prev, curr and next. */
+		protected boolean classify(final TextBlock prev, final TextBlock curr,
+				final TextBlock next) {
+			final boolean isContent = (curr.getLinkDensity() > 0 && next
+					.getNumWords() > 11)
+					|| (curr.getNumWords() > 19 || (next.getNumWords() > 6
+							&& next.getLinkDensity() == 0
+							&& prev.getLinkDensity() == 0 && (curr
+							.getNumWords() > 6 || prev.getNumWords() > 7 || next
+							.getNumWords() > 19)));
+			return curr.setIsContent(isContent);
+		}
+	};
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java b/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java
new file mode 100644
index 0000000..7e43d20
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java
@@ -0,0 +1,42 @@
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+
+/**
+ * Provides quick access to common {@link BoilerpipeExtractor}s.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class CommonExtractors {
+	/**
+	 * {@link ArticleExtractor#INSTANCE}: works very well for most types of Article-like HTML.
+	 */
+	public static final ArticleExtractor ARTICLE_EXTRACTOR = ArticleExtractor.INSTANCE;
+
+	/**
+	 * {@link DefaultExtractor#INSTANCE}: usually worse than {@link ArticleExtractor}, but simpler/no heuristics.
+	 */
+	public static final DefaultExtractor DEFAULT_EXTRACTOR = DefaultExtractor.INSTANCE;
+
+	/**
+	 * {@link LargestContentExtractor#INSTANCE}: like {@link DefaultExtractor}, but keeps the largest text block only.
+	 */
+	public static final LargestContentExtractor LARGEST_CONTENT_EXTRACTOR = LargestContentExtractor.INSTANCE;
+
+	/**
+	 * {@link CanolaExtractor#INSTANCE}: trained on krdwrd Canola (different definition of
+	 * "boilerplate"). You may give it a try.
+	 */
+	public static final CanolaExtractor CANOLA_EXTRACTOR = CanolaExtractor.INSTANCE;
+
+	/**
+	 * Dummy Extractor; should return the input text. Use this to double-check
+	 * that your problem is within a particular {@link BoilerpipeExtractor}, or
+	 * somewhere else.
+	 */
+	public static final KeepEverythingExtractor KEEP_EVERYTHING_EXTRACTOR = KeepEverythingExtractor.INSTANCE;
+
+	/** Not instantiable: this class only holds constants. */
+	private CommonExtractors() {
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java
new file mode 100644
index 0000000..1fd7f33
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.DensityRulesClassifier;
+import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion;
+import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
+
+/**
+ * A quite generic full-text extractor.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class DefaultExtractor extends ExtractorBase {
+    public static final DefaultExtractor INSTANCE = new DefaultExtractor();
+
+    /**
+     * Returns the singleton instance for {@link DefaultExtractor}.
+     */
+    public static DefaultExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /** Runs the three default filters on doc; returns true if any of them returned true. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        // Every filter runs, in this order, whatever the earlier ones returned.
+        boolean changed = SimpleBlockFusionProcessor.INSTANCE.process(doc);
+        changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
+        changed |= DensityRulesClassifier.INSTANCE.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java b/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java
new file mode 100644
index 0000000..f41a243
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java
@@ -0,0 +1,116 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.URL;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
+import de.l3s.boilerpipe.sax.HTMLFetcher;
+
+/**
+ * The base class of Extractors. Also provides some helper methods to quickly
+ * retrieve the text that remained after processing.
+ * 
+ * @author Christian Kohlschütter
+ */
+public abstract class ExtractorBase implements BoilerpipeExtractor {
+
+    /**
+     * Extracts text from the HTML code given as a String.
+     * 
+     * @param html  The HTML code as a String.
+     * @return  The extracted text.
+     * @throws BoilerpipeProcessingException on processing errors; a SAXException is wrapped in one.
+     */
+    public String getText(final String html) throws BoilerpipeProcessingException {
+        try {
+            final InputSource source = new InputSource(new StringReader(html));
+            final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
+            return getText(parsed);
+        } catch (SAXException saxFailure) {
+            throw new BoilerpipeProcessingException(saxFailure);
+        }
+    }
+
+    /**
+     * Extracts text from the HTML code available from the given {@link InputSource}.
+     * 
+     * @param is The InputSource containing the HTML
+     * @return  The extracted text.
+     * @throws BoilerpipeProcessingException on processing errors; a SAXException is wrapped in one.
+     */
+    public String getText(final InputSource is) throws BoilerpipeProcessingException {
+        try {
+            final TextDocument parsed = new BoilerpipeSAXInput(is).getTextDocument();
+            return getText(parsed);
+        } catch (SAXException saxFailure) {
+            throw new BoilerpipeProcessingException(saxFailure);
+        }
+    }
+
+    /**
+     * Extracts text from the HTML code available from the given {@link URL}.
+     * NOTE: This method is mainly to be used for show case purposes. If you are
+     * going to crawl the Web, consider using {@link #getText(InputSource)}
+     * instead.
+     * 
+     * @param url  The URL pointing to the HTML code.
+     * @return  The extracted text.
+     * @throws BoilerpipeProcessingException on processing errors; an IOException is wrapped in one.
+     */
+    public String getText(final URL url) throws BoilerpipeProcessingException {
+        try {
+            return getText(HTMLFetcher.fetch(url).toInputSource());
+        } catch (IOException ioFailure) {
+            throw new BoilerpipeProcessingException(ioFailure);
+        }
+    }
+
+    /**
+     * Extracts text from the HTML code available from the given {@link Reader}.
+     * @param r The Reader containing the HTML
+     * @return  The extracted text.
+     * @throws BoilerpipeProcessingException as thrown by {@link #getText(InputSource)}.
+     */
+    public String getText(final Reader r) throws BoilerpipeProcessingException {
+        final InputSource source = new InputSource(r);
+        return getText(source);
+    }
+
+    /**
+     * Extracts text from the given {@link TextDocument} object.
+     * 
+     * @param doc The {@link TextDocument}.
+     * @return  The extracted text.
+     * @throws BoilerpipeProcessingException if thrown by process(doc).
+     */
+    public String getText(TextDocument doc) throws BoilerpipeProcessingException {
+        // The boolean result of process(doc) is ignored; the text is read from doc afterwards.
+        process(doc);
+        return doc.getContent();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java
new file mode 100644
index 0000000..d1f8afc
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java
@@ -0,0 +1,42 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter;
+
+/**
+ * Marks everything as content.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class KeepEverythingExtractor extends ExtractorBase {
+
+    /** The only instance; the constructor is private. */
+    public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor();
+
+    private KeepEverythingExtractor() {
+    }
+
+    /** Delegates to {@link MarkEverythingContentFilter#INSTANCE} and returns its result. */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final boolean changed = MarkEverythingContentFilter.INSTANCE.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java
new file mode 100644
index 0000000..96a88c0
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
+import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter;
+import de.l3s.boilerpipe.filters.simple.MinWordsFilter;
+
+/**
+ * Marks everything as content (like {@link KeepEverythingExtractor}), but runs
+ * {@link SimpleBlockFusionProcessor} before that and a {@link MinWordsFilter} created
+ * with the given <code>kMin</code> afterwards; see MinWordsFilter for what kMin means.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class KeepEverythingWithMinKWordsExtractor extends ExtractorBase {
+
+    private final MinWordsFilter filter;
+
+    public KeepEverythingWithMinKWordsExtractor(final int kMin) {
+        this.filter = new MinWordsFilter(kMin);
+    }
+
+    /** Runs the three steps in order, all of them always; true if any of them returned true. */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changed = SimpleBlockFusionProcessor.INSTANCE.process(doc);
+        changed |= MarkEverythingContentFilter.INSTANCE.process(doc);
+        changed |= filter.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java
new file mode 100644
index 0000000..8720c5c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java
@@ -0,0 +1,53 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier;
+import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion;
+import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
+
+/**
+ * A full-text extractor which extracts the largest text component of a page.
+ * For news articles, it may perform better than the {@link DefaultExtractor},
+ * but usually worse than {@link ArticleExtractor}.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class LargestContentExtractor extends ExtractorBase {
+    public static final LargestContentExtractor INSTANCE = new LargestContentExtractor();
+
+    private LargestContentExtractor() {
+    }
+
+    /**
+     * Returns the singleton instance for {@link LargestContentExtractor}.
+     */
+    public static LargestContentExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /** Runs the three filters in order, all of them always; true if any of them returned true. */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changed = NumWordsRulesClassifier.INSTANCE.process(doc);
+        changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
+        changed |= KeepLargestBlockFilter.INSTANCE.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java
new file mode 100644
index 0000000..12ece11
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java
@@ -0,0 +1,46 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier;
+
+/**
+ * Generic full-text extractor whose only criterion is the number of words
+ * per block (of the current, the previous and the next block).
+ *
+ * @author Christian Kohlschütter
+ */
+public class NumWordsRulesExtractor extends ExtractorBase {
+    public static final NumWordsRulesExtractor INSTANCE = new NumWordsRulesExtractor();
+
+    /** Returns the shared {@link NumWordsRulesExtractor} instance. */
+    public static NumWordsRulesExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Delegates to {@link NumWordsRulesClassifier#INSTANCE}.
+     * @return the result reported by the classifier
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final boolean classifierResult = NumWordsRulesClassifier.INSTANCE.process(doc);
+        return classifierResult;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/package.html b/src/main/java/de/l3s/boilerpipe/extractors/package.html
new file mode 100644
index 0000000..aae6f19
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+	<p>This package contains some standard extractors (i.e., complete
+		pipelines of BoilerpipeFilters).</p>
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java b/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java
new file mode 100644
index 0000000..52025ef
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java
@@ -0,0 +1,69 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2012 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.debug;
+
+import java.io.PrintWriter;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Prints debug information about the current state of the TextDocument, i.e.
+ * the output of {@link TextDocument#debugString()}.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class PrintDebugFilter implements BoilerpipeFilter {
+	/**
+	 * The default instance of {@link PrintDebugFilter}, which dumps debug
+	 * information to an auto-flushing writer on <code>System.out</code>.
+	 */
+	public static final PrintDebugFilter INSTANCE = new PrintDebugFilter(
+			new PrintWriter(System.out, true));
+	/** The writer that receives the debug output. */
+	private final PrintWriter out;
+
+	/**
+	 * Creates a new instance of {@link PrintDebugFilter}.
+	 *
+	 * Only use this constructor if you are not going to dump the debug
+	 * information to <code>System.out</code> -- for that case, use
+	 * {@link #getInstance()} instead.
+	 *
+	 * @param out The target {@link PrintWriter}. Will not be closed
+	 */
+	public PrintDebugFilter(final PrintWriter out) {
+		this.out = out;
+	}
+
+	/**
+	 * Returns the default instance for {@link PrintDebugFilter}, which dumps
+	 * debug information to <code>System.out</code>.
+	 */
+	public static PrintDebugFilter getInstance() {
+		return INSTANCE;
+	}
+
+	/** Prints the document's debug string; always returns <code>false</code>. */
+	public boolean process(TextDocument doc)
+			throws BoilerpipeProcessingException {
+		out.println(doc.debugString());
+		return false;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java
new file mode 100644
index 0000000..bbda7ba
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java
@@ -0,0 +1,117 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Classifies {@link TextBlock}s as content/not-content through rules that were
+ * determined with the C4.8 machine learning algorithm, as described in the
+ * paper "Boilerplate Detection using Shallow Text Features"; the rules look
+ * at the text densities and link densities of a block and its neighbours.
+ *
+ * @author Christian Kohlschütter
+ */
+public class DensityRulesClassifier implements
+        BoilerpipeFilter {
+    public static final DensityRulesClassifier INSTANCE = new DensityRulesClassifier();
+
+    /**
+     * Returns the shared {@link DensityRulesClassifier} instance.
+     */
+    public static DensityRulesClassifier getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Walks over all text blocks of the document and classifies each one in
+     * the context of its direct neighbours. {@link TextBlock#EMPTY_START}
+     * stands in for the missing neighbour of the first and of the last block.
+     *
+     * @param doc
+     *            the document whose blocks are classified
+     * @return <code>true</code> if {@link #classify} returned
+     *         <code>true</code> for at least one block
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final ListIterator<TextBlock> cursor = doc.getTextBlocks().listIterator();
+        if (!cursor.hasNext()) {
+            return false;
+        }
+
+        TextBlock before = TextBlock.EMPTY_START;
+        TextBlock here = cursor.next();
+        TextBlock after = cursor.hasNext() ? cursor.next() : TextBlock.EMPTY_START;
+        boolean changed = classify(before, here, after);
+        if (after == TextBlock.EMPTY_START) {
+            // No second block to move on to.
+            return changed;
+        }
+
+        // Slide the three-block window forward; the pass in which the cursor
+        // is exhausted classifies the last block and ends the loop.
+        boolean more;
+        do {
+            more = cursor.hasNext();
+            before = here;
+            here = after;
+            after = more ? cursor.next() : TextBlock.EMPTY_START;
+            changed |= classify(before, here, after);
+        } while (more);
+
+        return changed;
+    }
+
+    /**
+     * Applies the decision rules to <code>curr</code> and stores the verdict
+     * via <code>curr.setIsContent(...)</code>.
+     *
+     * @param prev the block before <code>curr</code>
+     * @param curr the block to classify
+     * @param next the block after <code>curr</code>
+     * @return the value returned by <code>curr.setIsContent(...)</code>
+     */
+    protected boolean classify(final TextBlock prev, final TextBlock curr,
+            final TextBlock next) {
+        final boolean content;
+        if (!(curr.getLinkDensity() <= 0.333333)) {
+            // Link density of the block itself above the limit: never content.
+            content = false;
+        } else if (!(prev.getLinkDensity() <= 0.555556)) {
+            // High link density before the block: the next block decides.
+            content = !(next.getTextDensity() <= 11);
+        } else if (!(curr.getTextDensity() <= 9)) {
+            // Dense block: content unless the next block has density 0.
+            content = !(next.getTextDensity() == 0);
+        } else if (!(next.getTextDensity() <= 10)) {
+            // Sparse block followed by a dense one.
+            content = true;
+        } else {
+            // Sparse block and sparse successor: the previous block decides.
+            content = !(prev.getTextDensity() <= 4);
+        }
+        return curr.setIsContent(content);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java b/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java
new file mode 100644
index 0000000..dc72d07
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java
@@ -0,0 +1,40 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Base class for some heuristics that are used by boilerpipe filters.
+ *
+ * @author Christian Kohlschütter
+ */
+abstract class HeuristicFilterBase {
+    /** Same as {@link #getNumFullTextWords(TextBlock, float)} with a minimum text density of 9. */
+    protected static int getNumFullTextWords(final TextBlock tb) {
+        return getNumFullTextWords(tb, 9);
+    }
+
+    /**
+     * Returns the word count of <code>tb</code> if its text density is at
+     * least <code>minTextDensity</code>, and 0 otherwise.
+     */
+    protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
+        return tb.getTextDensity() >= minTextDensity ? tb.getNumWords() : 0;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java
new file mode 100644
index 0000000..1d505be
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java
@@ -0,0 +1,80 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009,2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.Iterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all blocks as "non-content" that occur after blocks that have been
+ * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}; the marked block itself
+ * becomes "non-content" too. These marks are ignored unless a minimum number
+ * of full-text words in content blocks occur before this mark (default: 60).
+ * This can be used in conjunction with an upstream {@link TerminatingBlocksFinder}.
+ *
+ * @author Christian Kohlschütter
+ * @see TerminatingBlocksFinder
+ */
+public final class IgnoreBlocksAfterContentFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final IgnoreBlocksAfterContentFilter DEFAULT_INSTANCE = new IgnoreBlocksAfterContentFilter(60);
+    public static final IgnoreBlocksAfterContentFilter INSTANCE_200 = new IgnoreBlocksAfterContentFilter(200);
+    private final int minNumWords;
+
+    /** Returns the default instance, which uses a minimum of 60 words. */
+    public static IgnoreBlocksAfterContentFilter getDefaultInstance() {
+        return DEFAULT_INSTANCE;
+    }
+
+    /** @param minNumWords full-text words required in content blocks before a mark is honoured */
+    public IgnoreBlocksAfterContentFilter(final int minNumWords) {
+        this.minNumWords = minNumWords;
+    }
+
+    /**
+     * Sets the first honoured end-of-text block and all later blocks to "non-content".
+     *
+     * @return <code>true</code> only if <code>setIsContent(false)</code>
+     *         returned <code>true</code> for at least one block
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+        int numWords = 0;
+        boolean foundEndOfText = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent()) {
+                numWords += getNumFullTextWords(block);
+            }
+            if (numWords >= minNumWords
+                    && block.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT)) {
+                foundEndOfText = true;
+            }
+            if (foundEndOfText) {
+                // Count a change only when setIsContent reports one, instead
+                // of unconditionally claiming that the document was modified.
+                changes |= block.setIsContent(false);
+            }
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java
new file mode 100644
index 0000000..0fdf7dd
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java
@@ -0,0 +1,76 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Scans the document backwards from its last block and marks every block
+ * labelled {@link DefaultLabels#INDICATES_END_OF_TEXT} as "non-content". The
+ * scan stops as soon as the other content blocks passed hold more than 200 words.
+ * This filter can be used in conjunction with an upstream {@link TerminatingBlocksFinder}.
+ *
+ * @author Christian Kohlschütter
+ * @see TerminatingBlocksFinder
+ */
+public final class IgnoreBlocksAfterContentFromEndFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final IgnoreBlocksAfterContentFromEndFilter INSTANCE = new IgnoreBlocksAfterContentFromEndFilter();
+
+    /** Number of content words after which the backwards scan gives up. */
+    private static final int MAX_CONTENT_WORDS = 200;
+
+    private IgnoreBlocksAfterContentFromEndFilter() {
+    }
+
+    /**
+     * @return <code>true</code> if at least one end-of-text block was re-labelled
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        boolean changes = false;
+        int contentWords = 0;
+        // For an empty list the iterator simply has no previous element.
+        final ListIterator<TextBlock> fromEnd = blocks.listIterator(blocks.size());
+        while (fromEnd.hasPrevious()) {
+            final TextBlock block = fromEnd.previous();
+            if (block.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT)) {
+                block.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
+                block.removeLabel(DefaultLabels.MIGHT_BE_CONTENT);
+                block.setIsContent(false);
+                changes = true;
+                continue;
+            }
+            if (!block.isContent()) {
+                continue;
+            }
+            contentWords += block.getNumWords();
+            if (contentWords > MAX_CONTENT_WORDS) {
+                break;
+            }
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java
new file mode 100644
index 0000000..ccf7fd8
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java
@@ -0,0 +1,83 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Keeps the largest {@link TextBlock} only (by the number of words). In case of
+ * more than one block with the same number of words, the first block is chosen.
+ * All discarded blocks are marked "not content" and flagged as
+ * {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ *
+ * As opposed to {@link KeepLargestBlockFilter}, the number of words is
+ * computed using {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}, which only counts
+ * the words of blocks with a text density of at least 9 (believed to be full text).
+ *
+ * NOTE: Without language-specific fine-tuning (i.e., running the default instance), this filter
+ * may lead to suboptimal results. You better use {@link KeepLargestBlockFilter} instead, which
+ * works at the level of number-of-words instead of text densities.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class KeepLargestFulltextBlockFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final KeepLargestFulltextBlockFilter INSTANCE = new KeepLargestFulltextBlockFilter();
+
+    /**
+     * @return <code>false</code> if the document has fewer than two blocks or
+     *         no content block (nothing is modified then), <code>true</code> otherwise
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        if (blocks.size() < 2) {
+            return false;
+        }
+
+        // Strict ">" keeps the first of several equally large content blocks.
+        TextBlock winner = null;
+        int winnerWords = -1;
+        for (TextBlock candidate : blocks) {
+            final int words = candidate.isContent() ? getNumFullTextWords(candidate) : -1;
+            if (words > winnerWords) {
+                winner = candidate;
+                winnerWords = words;
+            }
+        }
+        if (winner == null) {
+            return false;
+        }
+
+        // Everything except the winner is demoted and flagged.
+        for (TextBlock candidate : blocks) {
+            final boolean keep = candidate == winner;
+            candidate.setIsContent(keep);
+            if (!keep) {
+                candidate.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
+            }
+        }
+        return true;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java
new file mode 100644
index 0000000..7962af4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java
@@ -0,0 +1,63 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Keeps only those content blocks which contain at least k full-text words
+ * (measured by {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}). k is 30 by default.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MinFulltextWordsFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final MinFulltextWordsFilter DEFAULT_INSTANCE = new MinFulltextWordsFilter(30);
+    private final int minWords;
+
+    /** Returns the default instance (k = 30). */
+    public static MinFulltextWordsFilter getDefaultInstance() {
+        return DEFAULT_INSTANCE;
+    }
+
+    /** @param minWords the number of full-text words (k) a content block needs to stay content */
+    public MinFulltextWordsFilter(final int minWords) {
+        this.minWords = minWords;
+    }
+
+    /**
+     * Marks every content block with fewer than k full-text words as "not content".
+     *
+     * @return <code>true</code> if at least one block was marked
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean demotedAny = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            final boolean tooShort = block.isContent()
+                    && getNumFullTextWords(block) < minWords;
+            if (tooShort) {
+                block.setIsContent(false);
+                demotedAny = true;
+            }
+        }
+        return demotedAny;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java
new file mode 100644
index 0000000..550252a
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java
@@ -0,0 +1,116 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Classifies {@link TextBlock}s as content/not-content through rules that were
+ * determined with the C4.8 machine learning algorithm, as described in the
+ * paper "Boilerplate Detection using Shallow Text Features" (WSDM 2010); the
+ * rules look at the number of words and the link density per block.
+ *
+ * @author Christian Kohlschütter
+ */
+public class NumWordsRulesClassifier implements BoilerpipeFilter {
+    public static final NumWordsRulesClassifier INSTANCE = new NumWordsRulesClassifier();
+
+    /**
+     * Returns the shared {@link NumWordsRulesClassifier} instance.
+     */
+    public static NumWordsRulesClassifier getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Walks over all text blocks of the document and classifies each one in
+     * the context of its direct neighbours. {@link TextBlock#EMPTY_START}
+     * stands in for the missing neighbour of the first and of the last block.
+     *
+     * @param doc
+     *            the document whose blocks are classified
+     * @return <code>true</code> if {@link #classify} returned
+     *         <code>true</code> for at least one block
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final ListIterator<TextBlock> cursor = doc.getTextBlocks().listIterator();
+        if (!cursor.hasNext()) {
+            return false;
+        }
+
+        TextBlock before = TextBlock.EMPTY_START;
+        TextBlock here = cursor.next();
+        TextBlock after = cursor.hasNext() ? cursor.next() : TextBlock.EMPTY_START;
+        boolean changed = classify(before, here, after);
+        if (after == TextBlock.EMPTY_START) {
+            // No second block to move on to.
+            return changed;
+        }
+
+        // Slide the three-block window forward; the pass in which the cursor
+        // is exhausted classifies the last block and ends the loop.
+        boolean more;
+        do {
+            more = cursor.hasNext();
+            before = here;
+            here = after;
+            after = more ? cursor.next() : TextBlock.EMPTY_START;
+            changed |= classify(before, here, after);
+        } while (more);
+
+        return changed;
+    }
+
+    /**
+     * Applies the decision rules to <code>curr</code> and stores the verdict
+     * via <code>curr.setIsContent(...)</code>.
+     *
+     * @param prev
+     *            the block before <code>curr</code>
+     * @param curr
+     *            the block to classify
+     * @param next
+     *            the block after <code>curr</code>
+     * @return the value returned by <code>curr.setIsContent(...)</code>
+     */
+    protected boolean classify(final TextBlock prev, final TextBlock curr,
+            final TextBlock next) {
+        // Stays false unless the block's own link density is at most 0.333333.
+        boolean content = false;
+        if (curr.getLinkDensity() <= 0.333333) {
+            if (prev.getLinkDensity() <= 0.555556) {
+                // One sufficiently long block among the three is enough.
+                content = curr.getNumWords() > 16
+                        || next.getNumWords() > 15
+                        || prev.getNumWords() > 4;
+            } else {
+                // After a block with a higher link density the bar is higher.
+                content = curr.getNumWords() > 40
+                        || next.getNumWords() > 17;
+            }
+        }
+        return curr.setIsContent(content);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java b/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java
new file mode 100644
index 0000000..0c5c15c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java
@@ -0,0 +1,124 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Finds blocks which are potentially indicating the end of an article text and
+ * marks them with {@link DefaultLabels#INDICATES_END_OF_TEXT}. This can be used
+ * in conjunction with a downstream {@link IgnoreBlocksAfterContentFilter}.
+ * 
+ * @author Christian Kohlsch��tter
+ * @see IgnoreBlocksAfterContentFilter
+ */
+public class TerminatingBlocksFinder implements BoilerpipeFilter {
+	public static final TerminatingBlocksFinder INSTANCE = new TerminatingBlocksFinder();
+
+	/**
+	 * Returns the singleton instance for TerminatingBlocksFinder.
+	 */
+	public static TerminatingBlocksFinder getInstance() {
+		return INSTANCE;
+	}
+
+	// public static long timeSpent = 0;
+
+	public boolean process(TextDocument doc)
+			throws BoilerpipeProcessingException {
+		boolean changes = false;
+
+		// long t = System.currentTimeMillis();
+
+		for (TextBlock tb : doc.getTextBlocks()) {
+			final int numWords = tb.getNumWords();
+			if (numWords < 15) {
+				final String text = tb.getText().trim();
+				final int len = text.length();
+				if (len >= 8) {
+					final String textLC = text.toLowerCase();
+					if (textLC.startsWith("comments")
+							|| startsWithNumber(textLC, len, " comments",
+									" users responded in")
+							|| textLC.startsWith("�� reuters")
+							|| textLC.startsWith("please rate this")
+							|| textLC.startsWith("post a comment")
+							|| textLC.contains("what you think...")
+							|| textLC.contains("add your comment")
+							|| textLC.contains("add comment")
+							|| textLC.contains("reader views")
+							|| textLC.contains("have your say")
+							|| textLC.contains("reader comments")
+							|| textLC.contains("r��tta artikeln")
+							|| textLC.contains("Réagir")
+							|| textLC.contains("Vos réactions ")
+							|| textLC
+									.equals("thanks for your comments - this feedback is now closed")) {
+						tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
+						changes = true;
+					}
+				} else if(tb.getLinkDensity() == 1.0) {
+					if(text.equals("Comment")) {
+						tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
+					}
+				}
+			}
+		}
+
+		// timeSpent += System.currentTimeMillis() - t;
+
+		return changes;
+	}
+
+	/**
+	 * Checks whether the given text t starts with a sequence of digits,
+	 * followed by one of the given strings.
+	 * 
+	 * @param t
+	 *            The text to examine
+	 * @param len
+	 *            The length of the text to examine
+	 * @param str
+	 *            Any strings that may follow the digits.
+	 * @return true if at least one combination matches
+	 */
+	private static boolean startsWithNumber(final String t, final int len,
+			final String... str) {
+		int j = 0;
+		while (j < len && isDigit(t.charAt(j))) {
+			j++;
+		}
+		if (j != 0) {
+			for (String s : str) {
+				if (t.startsWith(s, j)) {
+					return true;
+				}
+			}
+		}
+		return false;
+	}
+
+	private static boolean isDigit(final char c) {
+		return c >= '0' && c <= '9';
+	}
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/package.html b/src/main/java/de/l3s/boilerpipe/filters/english/package.html
new file mode 100644
index 0000000..ec624a9
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/package.html
@@ -0,0 +1,8 @@
+<html>
+<body>
+	<p>The BoilerpipeFilters in this package have only been tested on
+		English text.</p>
+	<p>That is, they will probably work with other Western languages,
+		but may need some parameter tuning to perform well.</p>
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java
new file mode 100644
index 0000000..0922cb1
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java
@@ -0,0 +1,84 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2011 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Copies the labels of each block onto the block that directly follows it,
+ * optionally prepending a fixed prefix to every copied label.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class AddPrecedingLabelsFilter implements BoilerpipeFilter {
+
+    public static final AddPrecedingLabelsFilter INSTANCE = new AddPrecedingLabelsFilter("");
+    public static final AddPrecedingLabelsFilter INSTANCE_PRE = new AddPrecedingLabelsFilter("^");
+
+    private final String labelPrefix;
+
+    /**
+     * Creates a new {@link AddPrecedingLabelsFilter} instance.
+     *
+     * @param labelPrefix text prepended to every label that is copied
+     */
+    public AddPrecedingLabelsFilter(final String labelPrefix) {
+        this.labelPrefix = labelPrefix;
+    }
+
+    /**
+     * Walks the block list from the end towards the start, adding the labels
+     * of each visited block to its successor.
+     *
+     * NOTE(review): the block at index 0 is never visited, so its labels are
+     * not copied to the second block; confirm whether that is intended.
+     *
+     * @return true if at least one visited block had labels to copy
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        final int total = textBlocks.size();
+        if (total < 2) {
+            return false;
+        }
+
+        final ListIterator<TextBlock> cursor = textBlocks.listIterator(total);
+        TextBlock successor = cursor.previous();
+        boolean changes = false;
+        for (int index = total - 2; index >= 1; index--) {
+            final TextBlock current = cursor.previous();
+            final Set<String> labels = current.getLabels();
+            if (labels != null && !labels.isEmpty()) {
+                for (final String label : labels) {
+                    successor.addLabel(labelPrefix + label);
+                }
+                changes = true;
+            }
+            successor = current;
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java
new file mode 100644
index 0000000..10f9d70
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java
@@ -0,0 +1,43 @@
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Flags blocks of at most 10 words whose text matches one of the short
+ * patterns (a date/time-like line or a leading "By ") as content and article metadata.
+ */
+public class ArticleMetadataFilter implements BoilerpipeFilter {
+	private static final Pattern[] PATTERNS_SHORT = {
+		Pattern.compile("^[0-9 \\,\\./]*\\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\\b[0-9 \\,\\:apm\\./]*([CPSDMGET]{2,3})?$"),
+		Pattern.compile("^[Bb]y ")
+	};
+
+	public static final ArticleMetadataFilter INSTANCE = new ArticleMetadataFilter();
+
+	private ArticleMetadataFilter() {
+	}
+
+	public boolean process(TextDocument doc)
+			throws BoilerpipeProcessingException {
+		boolean changed = false;
+		for (final TextBlock block : doc.getTextBlocks()) {
+			if (block.getNumWords() <= 10) {
+				final String text = block.getText();
+				for (final Pattern pattern : PATTERNS_SHORT) {
+					if (pattern.matcher(text).find()) {
+						changed = true;
+						block.setIsContent(true);
+						block.addLabel(DefaultLabels.ARTICLE_METADATA);
+					}
+				}
+			}
+		}
+		return changed;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java
new file mode 100644
index 0000000..510c47f
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java
@@ -0,0 +1,128 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit.
+ * This probably makes sense only in cases where an upstream filter already has removed some blocks.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class BlockProximityFusion implements BoilerpipeFilter {
+
+    private final int maxBlocksDistance;
+
+    public static final BlockProximityFusion MAX_DISTANCE_1 = new BlockProximityFusion(
+            1, false, false);
+    public static final BlockProximityFusion MAX_DISTANCE_1_SAME_TAGLEVEL = new BlockProximityFusion(
+            1, false, true);
+    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY = new BlockProximityFusion(
+            1, true, false);
+    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = new BlockProximityFusion(
+            1, true, true);
+
+    private final boolean contentOnly;
+
+	private final boolean sameTagLevelOnly;
+
+    /**
+     * Creates a new {@link BlockProximityFusion} instance.
+     *
+     * @param maxBlocksDistance The maximum distance in blocks.
+     * @param contentOnly 
+     */
+    public BlockProximityFusion(final int maxBlocksDistance,
+            final boolean contentOnly, final boolean sameTagLevelOnly) {
+        this.maxBlocksDistance = maxBlocksDistance;
+        this.contentOnly = contentOnly;
+		this.sameTagLevelOnly = sameTagLevelOnly;
+    }
+
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        boolean changes = false;
+        TextBlock prevBlock;
+
+        int offset;
+        if (contentOnly) {
+            prevBlock = null;
+            offset = 0;
+            for (TextBlock tb : textBlocks) {
+                offset++;
+                if (tb.isContent()) {
+                    prevBlock = tb;
+                    break;
+                }
+            }
+            if (prevBlock == null) {
+                return false;
+            }
+        } else {
+            prevBlock = textBlocks.get(0);
+            offset = 1;
+        }
+
+        for (Iterator<TextBlock> it = textBlocks.listIterator(offset); it
+                .hasNext();) {
+            TextBlock block = it.next();
+            if (!block.isContent()) {
+                prevBlock = block;
+                continue;
+            }
+            int diffBlocks = block.getOffsetBlocksStart()
+                    - prevBlock.getOffsetBlocksEnd() - 1;
+            if (diffBlocks <= maxBlocksDistance) {
+                boolean ok = true;
+                if (contentOnly) {
+                    if (!prevBlock.isContent()
+                            || !block.isContent()) {
+                        ok = false;
+                    }
+                }
+                if(ok && sameTagLevelOnly && prevBlock.getTagLevel() != block.getTagLevel()) {
+                	ok = false;
+                }
+                if (ok) {
+                    prevBlock.mergeNext(block);
+                    it.remove();
+                    changes = true;
+                } else {
+                    prevBlock = block;
+                }
+            } else {
+                prevBlock = block;
+            }
+        }
+
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java
new file mode 100644
index 0000000..e44fc0c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java
@@ -0,0 +1,72 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Repeatedly merges a block into its content predecessor when the block has
+ * a link density below 0.56 and is not labelled STRICTLY_NOT_CONTENT.
+ */
+public final class ContentFusion implements BoilerpipeFilter {
+
+	public static final ContentFusion INSTANCE = new ContentFusion();
+
+	/** Creates a new {@link ContentFusion} instance. */
+	public ContentFusion() {
+	}
+
+	/** Runs merge passes until one changes nothing; returns true if any block was merged. */
+	public boolean process(TextDocument doc)
+			throws BoilerpipeProcessingException {
+		final List<TextBlock> textBlocks = doc.getTextBlocks();
+		if (textBlocks.size() < 2) {
+			return false;
+		}
+
+		boolean anyChanges = false;
+		boolean changes;
+		do {
+			changes = false;
+			// Restart from the head on every pass; a predecessor carried over
+			// from the previous pass is not adjacent to the block at index 1.
+			TextBlock prevBlock = textBlocks.get(0);
+			for (ListIterator<TextBlock> it = textBlocks.listIterator(1); it.hasNext();) {
+				final TextBlock block = it.next();
+				if (prevBlock.isContent() && block.getLinkDensity() < 0.56
+						&& !block.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT)) {
+					prevBlock.mergeNext(block);
+					it.remove();
+					changes = true;
+					anyChanges = true;
+				} else {
+					prevBlock = block;
+				}
+			}
+		} while (changes);
+
+		return anyChanges;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java
new file mode 100644
index 0000000..f3e4cda
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java
@@ -0,0 +1,173 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks {@link TextBlock}s which contain parts of the HTML
+ * <code>&lt;TITLE&gt;</code> tag, using some heuristics which are quite
+ * specific to the news domain.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class DocumentTitleMatchClassifier implements BoilerpipeFilter {
+
+	/** Separator patterns; the longest part around each one is a title candidate. */
+	private static final String[] LONGEST_PART_SEPARATORS = {
+		"[ ]*[\\|»|-][ ]*",
+		"[ ]*[\\|»|:][ ]*",
+		"[ ]*[\\|»|:\\(\\)][ ]*",
+		"[ ]*[\\|»|:\\(\\)\\-][ ]*",
+		"[ ]*[\\|»|,|:\\(\\)\\-][ ]*",
+		"[ ]*[\\|»|,|:\\(\\)\\-\u00a0][ ]*"
+	};
+
+	/** Punctuation removed from a block's text before the second lookup. */
+	private static final Pattern PAT_REMOVE_CHARACTERS = Pattern.compile("[\\?\\!\\.\\-\\:]+");
+
+	/** Normalized title candidates, or null when no usable title was given. */
+	private final Set<String> potentialTitles;
+
+	/**
+	 * Derives the set of candidate titles from the document title.
+	 * 
+	 * @param title
+	 *            the document title; null, or a title that is empty after
+	 *            normalization, yields a filter that never labels anything
+	 */
+	public DocumentTitleMatchClassifier(String title) {
+		final String normalized = (title == null) ? "" : normalize(title);
+		if (normalized.length() == 0) {
+			this.potentialTitles = null;
+			return;
+		}
+
+		this.potentialTitles = new HashSet<String>();
+		potentialTitles.add(normalized);
+		for (final String separator : LONGEST_PART_SEPARATORS) {
+			final String part = getLongestPart(normalized, separator);
+			if (part != null) {
+				potentialTitles.add(part);
+			}
+		}
+
+		addPotentialTitles(potentialTitles, normalized, "[ ]+[\\|][ ]+", 4);
+		addPotentialTitles(potentialTitles, normalized, "[ ]+[\\-][ ]+", 4);
+		potentialTitles.add(normalized.replaceFirst(" - [^\\-]+$", ""));
+		potentialTitles.add(normalized.replaceFirst("^[^\\-]+ - ", ""));
+	}
+
+	/**
+	 * Returns the candidate titles computed by the constructor.
+	 * 
+	 * @return the internal candidate set, or null if no usable title was given
+	 */
+	public Set<String> getPotentialTitles() {
+		return potentialTitles;
+	}
+
+	/**
+	 * Replaces non-breaking spaces by plain spaces, drops apostrophes, trims
+	 * and lower-cases the given text.
+	 */
+	private static String normalize(final String raw) {
+		return raw.replace('\u00a0', ' ').replace("'", "").trim().toLowerCase();
+	}
+
+	/**
+	 * Splits the title by the pattern and adds every part that has at least
+	 * {@code minWords} words and does not contain ".com".
+	 */
+	private void addPotentialTitles(final Set<String> potentialTitles, final String title, final String pattern, final int minWords) {
+		final String[] parts = title.split(pattern);
+		if (parts.length == 1) {
+			return;
+		}
+		for (final String part : parts) {
+			if (part.contains(".com")) {
+				continue;
+			}
+			if (part.split("[\b ]+").length >= minWords) {
+				potentialTitles.add(part);
+			}
+		}
+	}
+
+	/**
+	 * Splits the title by the pattern and returns the part that wins by word
+	 * count or by length, skipping parts that contain ".com".
+	 * 
+	 * @return the trimmed winner, or null if the pattern did not split the
+	 *         title or no non-empty part qualified
+	 */
+	private String getLongestPart(final String title, final String pattern) {
+		final String[] parts = title.split(pattern);
+		if (parts.length == 1) {
+			return null;
+		}
+		String longestPart = "";
+		int longestNumWords = 0;
+		for (final String part : parts) {
+			if (part.contains(".com")) {
+				continue;
+			}
+			final int numWords = part.split("[\b ]+").length;
+			final boolean moreWords = numWords > longestNumWords;
+			if (moreWords || part.length() > longestPart.length()) {
+				longestNumWords = numWords;
+				longestPart = part;
+			}
+		}
+		return longestPart.length() == 0 ? null : longestPart.trim();
+	}
+
+	/**
+	 * Labels the first block whose normalized text, with or without
+	 * punctuation, equals one of the candidate titles.
+	 * 
+	 * @return true if a block received the TITLE label
+	 */
+	public boolean process(TextDocument doc)
+			throws BoilerpipeProcessingException {
+		if (potentialTitles == null) {
+			return false;
+		}
+		for (final TextBlock tb : doc.getTextBlocks()) {
+			final String text = normalize(tb.getText());
+			boolean matches = potentialTitles.contains(text);
+			if (!matches) {
+				final String stripped = PAT_REMOVE_CHARACTERS.matcher(text).replaceAll("").trim();
+				matches = potentialTitles.contains(stripped);
+			}
+			if (matches) {
+				tb.addLabel(DefaultLabels.TITLE);
+				return true;
+			}
+		}
+		return false;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java
new file mode 100644
index 0000000..7268a45
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java
@@ -0,0 +1,73 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all {@link TextBlock}s "content" which are between the headline and the part that
+ * has already been marked content, if they are marked {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ * 
+ * This filter is quite specific to the news domain.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class ExpandTitleToContentFilter implements BoilerpipeFilter {
+    public static final ExpandTitleToContentFilter INSTANCE = new ExpandTitleToContentFilter();
+
+    /** Returns the singleton instance for ExpandTitleToContentFilter. */
+    public static ExpandTitleToContentFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Promotes MIGHT_BE_CONTENT blocks lying between the last TITLE block seen
+     * before content starts and the first content block.
+     * @return true if setIsContent(true) returned true for at least one block
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        int titleIndex = -1;
+        int firstContentIndex = -1;
+        int index = 0;
+        for (final TextBlock tb : doc.getTextBlocks()) {
+            if (tb.hasLabel(DefaultLabels.TITLE)) {
+                titleIndex = index;
+            }
+            if (tb.isContent()) {
+                firstContentIndex = index;
+                break;
+            }
+            index++;
+        }
+        if (titleIndex == -1 || firstContentIndex <= titleIndex) {
+            return false;
+        }
+        boolean changes = false;
+        for (final TextBlock tb : doc.getTextBlocks().subList(titleIndex, firstContentIndex)) {
+            if (tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT) && tb.setIsContent(true)) {
+                changes = true;
+            }
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java
new file mode 100644
index 0000000..5d4cc31
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java
@@ -0,0 +1,124 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Keeps the largest {@link TextBlock} only (by the number of words). In case of
+ * more than one block with the same number of words, the first block is chosen.
+ * All discarded blocks are marked "not content" and flagged as
+ * {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ * 
+ * Note that, by default, only TextBlocks marked as "content" are taken into consideration.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class KeepLargestBlockFilter implements BoilerpipeFilter {
+	public static final KeepLargestBlockFilter INSTANCE = new KeepLargestBlockFilter(
+			false, 0);
+	public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL = new KeepLargestBlockFilter(
+			true, 0);
+	public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS = new KeepLargestBlockFilter(
+			true, 150);
+	private final boolean expandToSameLevelText;
+	private final int minWords;
+
+	public KeepLargestBlockFilter(boolean expandToSameLevelText, final int minWords) {
+		this.expandToSameLevelText = expandToSameLevelText;
+		this.minWords = minWords;
+	}
+
+	public boolean process(final TextDocument doc)
+			throws BoilerpipeProcessingException {
+		List<TextBlock> textBlocks = doc.getTextBlocks();
+		if (textBlocks.size() < 2) {
+			return false;
+		}
+
+		int maxNumWords = -1;
+		TextBlock largestBlock = null;
+
+		int level = -1;
+
+		int i = 0;
+		int n = -1;
+		for (TextBlock tb : textBlocks) {
+			if (tb.isContent()) {
+				final int nw = tb.getNumWords();
+				
+				if (nw > maxNumWords) {
+					largestBlock = tb;
+					maxNumWords = nw;
+
+					n = i;
+
+					if (expandToSameLevelText) {
+						level = tb.getTagLevel();
+					}
+				}
+			}
+			i++;
+		}
+		for (TextBlock tb : textBlocks) {
+			if (tb == largestBlock) {
+				tb.setIsContent(true);
+				tb.addLabel(DefaultLabels.VERY_LIKELY_CONTENT);
+			} else {
+				tb.setIsContent(false);
+				tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
+			}
+		}
+		if (expandToSameLevelText && n != -1) {
+			
+			for (ListIterator<TextBlock> it = textBlocks.listIterator(n); it
+					.hasPrevious();) {
+				TextBlock tb = it.previous();
+				final int tl = tb.getTagLevel();
+				if(tl < level) {
+					break;
+				} else if(tl == level) {
+					if(tb.getNumWords() >= minWords) {
+						tb.setIsContent(true);
+					}
+				}
+			}
+			for (ListIterator<TextBlock> it = textBlocks.listIterator(n); it
+			.hasNext();) {
+				TextBlock tb = it.next();
+				final int tl = tb.getTagLevel();
+				if(tl < level) {
+					break;
+				} else if(tl == level) {
+					if(tb.getNumWords() >= minWords) {
+						tb.setIsContent(true);
+					}
+				}
+			}
+		}
+
+		return true;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java
new file mode 100644
index 0000000..0ec3836
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java
@@ -0,0 +1,91 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Fuses adjacent blocks if their labels are equal. Only labels that start
+ * with {@link DefaultLabels#MARKUP_PREFIX} take part in the comparison.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class LabelFusion implements BoilerpipeFilter {
+
+    public static final LabelFusion INSTANCE = new LabelFusion();
+
+    /** Creates a new {@link LabelFusion} instance. */
+    private LabelFusion() {
+    }
+
+    /**
+     * Merges every block into its predecessor when both carry the same set
+     * of markup labels.
+     * @return true if at least one block was merged
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        final Iterator<TextBlock> remaining = textBlocks.iterator();
+        TextBlock prevBlock = remaining.next();
+        boolean changes = false;
+        while (remaining.hasNext()) {
+            final TextBlock block = remaining.next();
+            if (!equalLabels(prevBlock.getLabels(), block.getLabels())) {
+                prevBlock = block;
+                continue;
+            }
+            prevBlock.mergeNext(block);
+            remaining.remove();
+            changes = true;
+        }
+        return changes;
+    }
+
+    /** Compares the markup labels of two blocks; a null set never matches. */
+    private boolean equalLabels(Set<String> labels, Set<String> labels2) {
+        if (labels == null || labels2 == null) {
+            return false;
+        }
+        return markupLabelsOnly(labels).equals(markupLabelsOnly(labels2));
+    }
+
+    /** Returns a copy of the set holding only the markup-prefixed labels. */
+    private Set<String> markupLabelsOnly(final Set<String> set1) {
+        final Set<String> markup = new HashSet<String>();
+        for (final String label : set1) {
+            if (label.startsWith(DefaultLabels.MARKUP_PREFIX)) {
+                markup.add(label);
+            }
+        }
+        return markup;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java
new file mode 100644
index 0000000..966e583
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java
@@ -0,0 +1,70 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all blocks as content that:
+ * <ol>
+ * <li>are on the same tag-level as very likely main content (usually the level of the largest block)</li>
+ * <li>have a significant number of words, currently: at least 100</li>
+ * </ol>
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class LargeBlockSameTagLevelToContentFilter implements BoilerpipeFilter {
+    public static final LargeBlockSameTagLevelToContentFilter INSTANCE = new LargeBlockSameTagLevelToContentFilter();
+
+    private LargeBlockSameTagLevelToContentFilter() {
+    }
+
+    /**
+     * Takes the tag level of the first content block labelled
+     * VERY_LIKELY_CONTENT and promotes non-content blocks on that level that
+     * have at least 100 words.
+     * @return true if at least one block was promoted to content
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        int tagLevel = -1;
+        for (final TextBlock tb : doc.getTextBlocks()) {
+            if (tb.isContent() && tb.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
+                tagLevel = tb.getTagLevel();
+                break;
+            }
+        }
+        if (tagLevel == -1) {
+            return false;
+        }
+
+        boolean changes = false;
+        for (final TextBlock tb : doc.getTextBlocks()) {
+            if (tb.isContent() || tb.getNumWords() < 100 || tb.getTagLevel() != tagLevel) {
+                continue;
+            }
+            tb.setIsContent(true);
+            changes = true;
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java
new file mode 100644
index 0000000..dfaae1b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java
@@ -0,0 +1,64 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks nested list-item blocks after the end of the main content.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class ListAtEndFilter implements BoilerpipeFilter {
+	public static final ListAtEndFilter INSTANCE = new ListAtEndFilter();
+
+	private ListAtEndFilter() {
+	}
+
+	/**
+	 * Promotes link-free LI blocks labelled MIGHT_BE_CONTENT that continue a run
+	 * started by a VERY_LIKELY_CONTENT content block, on a deeper tag level.
+	 * @return true if at least one block was turned into content
+	 */
+	public boolean process(final TextDocument doc)
+			throws BoilerpipeProcessingException {
+		boolean changes = false;
+		int tagLevel = Integer.MAX_VALUE;
+		for (final TextBlock tb : doc.getTextBlocks()) {
+			if (tb.isContent() && tb.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
+				tagLevel = tb.getTagLevel();
+				continue;
+			}
+			final boolean nestedListItem = tb.getTagLevel() > tagLevel
+					&& tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)
+					&& tb.hasLabel(DefaultLabels.LI)
+					&& tb.getLinkDensity() == 0;
+			if (nestedListItem) {
+				tb.setIsContent(true);
+				changes = true;
+			} else {
+				tagLevel = Integer.MAX_VALUE;
+			}
+		}
+		return changes;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java
new file mode 100644
index 0000000..e1fc17b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java
@@ -0,0 +1,70 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Merges adjacent blocks whose text densities compare exactly equal.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class SimpleBlockFusionProcessor implements BoilerpipeFilter {
+    public static final SimpleBlockFusionProcessor INSTANCE = new SimpleBlockFusionProcessor();
+
+    /**
+     * Returns the singleton instance of SimpleBlockFusionProcessor.
+     */
+    public static SimpleBlockFusionProcessor getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Merges every block into its predecessor when both have the same text
+     * density, removing the merged block from the document's block list.
+     * @return {@code true} if at least one pair of blocks was merged
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        if (blocks.size() < 2) {
+            return false;
+        }
+        boolean merged = false;
+        TextBlock current = blocks.get(0);
+        final Iterator<TextBlock> rest = blocks.listIterator(1);
+        while (rest.hasNext()) {
+            final TextBlock candidate = rest.next();
+            if (current.getTextDensity() != candidate.getTextDensity()) {
+                current = candidate;
+                continue;
+            }
+            // Equal density: absorb the candidate and keep extending the run.
+            current.mergeNext(candidate);
+            rest.remove();
+            merged = true;
+        }
+        return merged;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java
new file mode 100644
index 0000000..8a5b18d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java
@@ -0,0 +1,66 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Demotes trailing headlines to boilerplate. Walking the document backwards,
+ * every content block labelled {@link DefaultLabels#HEADING} is marked as
+ * boilerplate until the first content block without that label is reached.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class TrailingHeadlineToBoilerplateFilter implements BoilerpipeFilter {
+    public static final TrailingHeadlineToBoilerplateFilter INSTANCE = new TrailingHeadlineToBoilerplateFilter();
+
+    /**
+     * Returns the singleton instance of TrailingHeadlineToBoilerplateFilter.
+     */
+    public static TrailingHeadlineToBoilerplateFilter getInstance() {
+        return INSTANCE;
+    }
+
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        final ListIterator<TextBlock> cursor = blocks.listIterator(blocks.size());
+        boolean demoted = false;
+        while (cursor.hasPrevious()) {
+            final TextBlock block = cursor.previous();
+            if (!block.isContent()) {
+                // Boilerplate between trailing headlines is simply skipped.
+                continue;
+            }
+            if (!block.hasLabel(DefaultLabels.HEADING)) {
+                // Reached regular content: nothing above it is "trailing".
+                break;
+            }
+            block.setIsContent(false);
+            demoted = true;
+        }
+        return demoted;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html b/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html
new file mode 100644
index 0000000..a368224
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+	<p>The BoilerpipeFilters in this package are pure heuristics.</p>
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java
new file mode 100644
index 0000000..aff85a6
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java
@@ -0,0 +1,71 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Removes {@link TextBlock}s which have explicitly been marked as
+ * "not content", optionally sparing blocks that carry a given label.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class BoilerplateBlockFilter implements BoilerpipeFilter {
+	/** Removes every non-content block. */
+	public static final BoilerplateBlockFilter INSTANCE = new BoilerplateBlockFilter(null);
+	/** Removes non-content blocks unless labelled {@link DefaultLabels#TITLE}. */
+	public static final BoilerplateBlockFilter INSTANCE_KEEP_TITLE = new BoilerplateBlockFilter(DefaultLabels.TITLE);
+	private final String labelToKeep;
+
+	/**
+	 * Returns the singleton instance for BoilerplateBlockFilter.
+	 */
+	public static BoilerplateBlockFilter getInstance() {
+		return INSTANCE;
+	}
+
+	/**
+	 * @param labelToKeep label that protects a block from removal, or null for none
+	 */
+	public BoilerplateBlockFilter(final String labelToKeep) {
+		this.labelToKeep = labelToKeep;
+	}
+
+	public boolean process(TextDocument doc)
+			throws BoilerpipeProcessingException {
+		List<TextBlock> textBlocks = doc.getTextBlocks();
+		boolean hasChanges = false;
+		for (Iterator<TextBlock> it = textBlocks.iterator(); it.hasNext();) {
+			TextBlock tb = it.next();
+			// Honour the configured label rather than a hard-coded TITLE.
+			if (!tb.isContent()
+					&& (labelToKeep == null || !tb.hasLabel(labelToKeep))) {
+				it.remove();
+				hasChanges = true;
+			}
+		}
+		return hasChanges;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java
new file mode 100644
index 0000000..a464dbf
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java
@@ -0,0 +1,51 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Flips the "isContent" flag of every {@link TextBlock} in the document.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class InvertedFilter implements BoilerpipeFilter {
+    public static final InvertedFilter INSTANCE = new InvertedFilter();
+
+    private InvertedFilter() {
+    }
+
+    /**
+     * @return {@code true} unless the document has no text blocks
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        for (final TextBlock block : blocks) {
+            final boolean wasContent = block.isContent();
+            block.setIsContent(!wasContent);
+        }
+        // Any non-empty document is changed, since every flag is flipped.
+        return !blocks.isEmpty();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java
new file mode 100644
index 0000000..3178f0b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java
@@ -0,0 +1,59 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks every content block that carries at least one of the given labels
+ * as "boilerplate".
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class LabelToBoilerplateFilter implements BoilerpipeFilter {
+    public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT = new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT);
+
+    private String[] labels;
+
+    public LabelToBoilerplateFilter(final String... label) {
+        this.labels = label;
+    }
+
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean demoted = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (!block.isContent()) {
+                continue;
+            }
+            for (final String label : labels) {
+                if (block.hasLabel(label)) {
+                    // First matching label suffices; skip the remaining ones.
+                    block.setIsContent(false);
+                    demoted = true;
+                    break;
+                }
+            }
+        }
+        return demoted;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java
new file mode 100644
index 0000000..e4bf856
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java
@@ -0,0 +1,56 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Marks every non-content block that carries one of the given labels as "content".
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class LabelToContentFilter implements BoilerpipeFilter {
+    private String[] labels;
+
+    public LabelToContentFilter(final String... label) {
+        this.labels = label;
+    }
+
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean promoted = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (!block.isContent() && carriesConfiguredLabel(block)) {
+                block.setIsContent(true);
+                promoted = true;
+            }
+        }
+        return promoted;
+    }
+
+    private boolean carriesConfiguredLabel(final TextBlock block) {
+        for (final String label : labels) {
+            if (block.hasLabel(label)) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java
new file mode 100644
index 0000000..e888334
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Marks every block of the document as boilerplate.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class MarkEverythingBoilerplateFilter implements BoilerpipeFilter {
+    public static final MarkEverythingBoilerplateFilter INSTANCE = new MarkEverythingBoilerplateFilter();
+    private MarkEverythingBoilerplateFilter() {
+    }
+
+    /**
+     * @return {@code true} if at least one block was content beforehand
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean demotedAny = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (!block.isContent()) {
+                continue;
+            }
+            block.setIsContent(false);
+            demotedAny = true;
+        }
+        return demotedAny;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java
new file mode 100644
index 0000000..8a8b7be
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Marks every block of the document as content.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class MarkEverythingContentFilter implements BoilerpipeFilter {
+    public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter();
+    private MarkEverythingContentFilter() {
+    }
+
+    /**
+     * @return {@code true} if at least one block was boilerplate beforehand
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean promotedAny = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent()) {
+                continue;
+            }
+            block.setIsContent(true);
+            promotedAny = true;
+        }
+        return promotedAny;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java
new file mode 100644
index 0000000..d326059
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java
@@ -0,0 +1,113 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Keeps only blocks that have at least one segment fragment ("clause") with at
+ * least <em>k</em> words (default: 5).
+ * 
+ * NOTE: You might consider using the {@link SplitParagraphBlocksFilter}
+ * upstream.
+ * 
+ * @author Christian Kohlschütter
+ * @see SplitParagraphBlocksFilter
+ */
+public final class MinClauseWordsFilter implements BoilerpipeFilter {
+    // Compiled once for all instances and declared ahead of INSTANCE.
+    private static final Pattern PAT_CLAUSE_DELIMITER = Pattern
+            .compile("[\\p{L}\\d][\\,\\.\\:\\;\\!\\?]+([ \\n\\r]+|$)");
+    private static final Pattern PAT_WHITESPACE = Pattern.compile("[ \\n\\r]+");
+
+    public static final MinClauseWordsFilter INSTANCE = new MinClauseWordsFilter(
+            5, false);
+    private final int minWords;
+    private final boolean acceptClausesWithoutDelimiter;
+
+    public MinClauseWordsFilter(final int minWords) {
+        this(minWords, false);
+    }
+
+    /**
+     * @param minWords minimum number of words a clause needs
+     * @param acceptClausesWithoutDelimiter whether the text after the last
+     *            delimiter (or all of it, if there is none) may count as a clause
+     */
+    public MinClauseWordsFilter(final int minWords,
+            final boolean acceptClausesWithoutDelimiter) {
+        this.minWords = minWords;
+        this.acceptClausesWithoutDelimiter = acceptClausesWithoutDelimiter;
+    }
+
+    /**
+     * Marks content blocks without a sufficiently long clause as boilerplate.
+     *
+     * @return {@code true} if at least one block was demoted
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (tb.isContent() && !hasClause(tb.getText())) {
+                tb.setIsContent(false);
+                changes = true;
+            }
+        }
+        return changes;
+    }
+
+    /**
+     * Tells whether text contains a clause, checking each delimiter-terminated
+     * fragment and, if enabled, the remainder after the last delimiter.
+     */
+    private boolean hasClause(final String text) {
+        final Matcher m = PAT_CLAUSE_DELIMITER.matcher(text);
+        int start = 0;
+        while (m.find()) {
+            // The match begins on the last letter/digit before the delimiter,
+            // so that character still belongs to the clause.
+            if (isClause(text.subSequence(start, m.start() + 1))) {
+                return true;
+            }
+            start = m.end();
+        }
+        // since clauses should *always end* with a delimiter, we normally
+        // don't consider text without one
+        return acceptClausesWithoutDelimiter
+                && isClause(text.subSequence(start, text.length()));
+    }
+
+    /**
+     * Treats text as a clause if one plus its number of whitespace runs reaches minWords.
+     */
+    private boolean isClause(final CharSequence text) {
+        final Matcher m = PAT_WHITESPACE.matcher(text);
+        int n = 1;
+        while (n < minWords && m.find()) {
+            n++;
+        }
+        return n >= minWords;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java
new file mode 100644
index 0000000..a3a49c4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java
@@ -0,0 +1,56 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Demotes content blocks with fewer than <em>k</em> words to boilerplate.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class MinWordsFilter implements BoilerpipeFilter {
+    private final int minWords;
+
+    /**
+     * @param minWords the minimum number of words a content block must have
+     */
+    public MinWordsFilter(final int minWords) {
+        this.minWords = minWords;
+    }
+
+    /**
+     * @return {@code true} if at least one content block was demoted
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean demotedAny = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            final boolean tooShort = block.isContent()
+                    && block.getNumWords() < minWords;
+            if (tooShort) {
+                block.setIsContent(false);
+                demotedAny = true;
+            }
+        }
+        return demotedAny;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java
new file mode 100644
index 0000000..86fae33
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java
@@ -0,0 +1,82 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Splits TextBlocks at paragraph boundaries; the parts keep the content flag and labels.
+ * 
+ * NOTE: This is not fully supported (i.e., it will break highlighting support
+ * via #getContainedTextElements()), but this one probably is necessary for some other
+ * filters.
+ * 
+ * @author Christian Kohlschütter
+ * @see MinClauseWordsFilter
+ */
+public final class SplitParagraphBlocksFilter implements BoilerpipeFilter {
+    public static final SplitParagraphBlocksFilter INSTANCE = new SplitParagraphBlocksFilter();
+
+    /**
+     * Returns the singleton instance of SplitParagraphBlocksFilter.
+     */
+    public static SplitParagraphBlocksFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * @return {@code true} if at least one block was split into paragraphs
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> original = doc.getTextBlocks();
+        final List<TextBlock> result = new ArrayList<TextBlock>();
+        boolean split = false;
+
+        for (final TextBlock block : original) {
+            final String[] parts = block.getText().split("[\n\r]+");
+            if (parts.length >= 2) {
+                final boolean content = block.isContent();
+                final Set<String> labels = block.getLabels();
+                for (final String part : parts) {
+                    final TextBlock piece = new TextBlock(part);
+                    piece.setIsContent(content);
+                    piece.addLabels(labels);
+                    result.add(piece);
+                }
+                split = true;
+            } else {
+                result.add(block);
+            }
+        }
+        if (!split) {
+            return false;
+        }
+        // Mutate the list returned by the document rather than replacing it.
+        original.clear();
+        original.addAll(result);
+        return true;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java
new file mode 100644
index 0000000..28cf002
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java
@@ -0,0 +1,54 @@
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.conditions.TextBlockCondition;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Marks a boilerplate block as content when both of its direct neighbours are
+ * content and the block itself satisfies a given {@link TextBlockCondition}.
+ */
+public class SurroundingToContentFilter implements BoilerpipeFilter {
+    /** Accepts blocks with a link density of zero and more than six words. */
+    public static final SurroundingToContentFilter INSTANCE_TEXT = new SurroundingToContentFilter(new TextBlockCondition() {
+        public boolean meetsCondition(TextBlock tb) {
+            return tb.getLinkDensity() == 0 && tb.getNumWords() > 6;
+        }
+    });
+
+    private final TextBlockCondition cond;
+
+    public SurroundingToContentFilter(final TextBlockCondition cond) {
+        this.cond = cond;
+    }
+
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        List<TextBlock> tbs = doc.getTextBlocks();
+        if (tbs.size() < 3) {
+            return false;
+        }
+
+        boolean hasChanges = false;
+        TextBlock prev = tbs.get(0);
+        TextBlock cur = tbs.get(1);
+        for (Iterator<TextBlock> it = tbs.listIterator(2); it.hasNext();) {
+            final TextBlock next = it.next();
+            if (!cur.isContent() && prev.isContent() && next.isContent()
+                    && cond.meetsCondition(cur)) {
+                cur.setIsContent(true);
+                hasChanges = true;
+            }
+            // Slide the window by one block so that every inner block,
+            // not just every second one, is examined as the middle element.
+            prev = cur;
+            cur = next;
+        }
+        return hasChanges;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/package.html b/src/main/java/de/l3s/boilerpipe/filters/simple/package.html
new file mode 100644
index 0000000..bc7a25d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+	<p>The BoilerpipeFilters in this package are straight-forward and
+		probably not really specific to English.</p>
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java b/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java
new file mode 100644
index 0000000..220e8df
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java
@@ -0,0 +1,43 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import de.l3s.boilerpipe.conditions.TextBlockCondition;
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Adds labels to a {@link TextBlock}, but only if the block meets the given
+ * {@link TextBlockCondition}.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class ConditionalLabelAction extends LabelAction {
+    private final TextBlockCondition condition;
+
+    public ConditionalLabelAction(TextBlockCondition condition, String... labels) {
+        super(labels);
+        this.condition = condition;
+    }
+
+    public void addTo(final TextBlock tb) {
+        if (!condition.meetsCondition(tb)) {
+            return;
+        }
+        addLabelsTo(tb);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java b/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java
new file mode 100644
index 0000000..3c56533
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Some pre-defined labels which can be used in conjunction with
+ * {@link TextBlock#addLabel(String)} and {@link TextBlock#hasLabel(String)}.
+ * All values except {@link #MARKUP_PREFIX} start with "de.l3s.boilerpipe/".
+ * @author Christian Kohlschütter
+ */
+public final class DefaultLabels {
+    public static final String TITLE = "de.l3s.boilerpipe/TITLE";
+    public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
+    public static final String INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT";
+    public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
+    public static final String VERY_LIKELY_CONTENT = "de.l3s.boilerpipe/VERY_LIKELY_CONTENT";
+    public static final String STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT";
+    public static final String HR = "de.l3s.boilerpipe/HR";
+    public static final String LI = "de.l3s.boilerpipe/LI";
+    // Heading-related labels (HEADING, H1, H2, H3).
+    public static final String HEADING = "de.l3s.boilerpipe/HEADING";
+    public static final String H1 = "de.l3s.boilerpipe/H1";
+    public static final String H2 = "de.l3s.boilerpipe/H2";
+    public static final String H3 = "de.l3s.boilerpipe/H3";
+    // NOTE(review): presumably the prefix of labels derived from markup -- confirm against the callers.
+    public static final String MARKUP_PREFIX = "<";
+    
+    private DefaultLabels() {
+    	// not to be instantiated
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java b/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java
new file mode 100644
index 0000000..b725f2e
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import java.util.Arrays;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Attaches a fixed set of labels to {@link TextBlock}s. This base class does so
+ * unconditionally; subclasses may override {@link #addTo(TextBlock)}.
+ * @author Christian Kohlschütter
+ * @see ConditionalLabelAction
+ */
+public class LabelAction {
+    /** Labels to attach; the constructor keeps the caller's array without copying it. */
+    protected final String[] labels;
+    public LabelAction(String... labels) {
+        this.labels = labels;
+    }
+    /** Attaches every label held by this action to {@code tb}. */
+    public void addTo(final TextBlock tb) {
+        this.addLabelsTo(tb);
+    }
+    /** Hands the label array to {@code tb.addLabels(...)}; cannot be overridden. */
+    protected final void addLabelsTo(final TextBlock tb) {
+        tb.addLabels(this.labels);
+    }
+    /** Returns the default {@code Object} representation followed by the labels in braces. */
+    @Override public String toString() {
+        return new StringBuilder(super.toString()).append('{').append(Arrays.asList(labels)).append('}').toString();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/package.html b/src/main/java/de/l3s/boilerpipe/package.html
new file mode 100644
index 0000000..81c88d6
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+	<p>The Boilerpipe top-level package.</p>
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
new file mode 100644
index 0000000..f8cd767
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
@@ -0,0 +1,454 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.LabelAction;
+import de.l3s.boilerpipe.util.UnicodeTokenizer;
+
+/**
+ * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can
+ * be used by different parser implementations, e.g. NekoHTML and TagSoup.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class BoilerpipeHTMLContentHandler implements ContentHandler {
+
+	private final Map<String, TagAction> tagActions; // looked up by element local name in startElement/endElement
+	private String title = null; // set via setTitle(); flushBlock() fills it from TITLE text seen outside the body
+	// Marker tokens that TA_ANCHOR_TEXT writes into tokenBuffer and flushBlock() uses to count linked words.
+	static final String ANCHOR_TEXT_START = "$\ue00a<";
+	static final String ANCHOR_TEXT_END = ">\ue00a$";
+	// tokenBuffer: text plus anchor markers (tokenized for the statistics); textBuffer: the block's plain text.
+	StringBuilder tokenBuffer = new StringBuilder();
+	StringBuilder textBuffer = new StringBuilder();
+	// Nesting counters, incremented and decremented by the tag actions in CommonTagActions.
+	int inBody = 0;
+	int inAnchor = 0;
+	int inIgnorableElement = 0;
+	// tagLevel: current nesting depth; blockTagLevel: depth at the first text of the pending block (-1 = none yet).
+	int tagLevel = 0;
+	int blockTagLevel = -1;
+	// sbLastWasWhitespace: whitespace was the last thing appended; textElementIdx: running count of characters() calls.
+	boolean sbLastWasWhitespace = false;
+	private int textElementIdx = 0;
+	// Blocks produced so far, in the order in which they were flushed.
+	private final List<TextBlock> textBlocks = new ArrayList<TextBlock>();
+	// lastStartTag is read by flushBlock(); lastEndTag and lastEvent are only ever written (hence the suppressions).
+	private String lastStartTag = null;
+	@SuppressWarnings("unused")
+	private String lastEndTag = null;
+	@SuppressWarnings("unused")
+	private Event lastEvent = null;
+	// offsetBlocks: index passed to the next TextBlock; the BitSet collects the text-element indices of the pending block.
+	private int offsetBlocks = 0;
+	private BitSet currentContainedTextElements = new BitSet();
+	// flush: a block boundary is pending; inAnchorText: flushBlock() is between an anchor start and end marker.
+	private boolean flush = false;
+	boolean inAnchorText = false;
+	// labelStacks: one entry per open element (null until a LabelAction is added); fontSizeStack: newest FONT entry first.
+	LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<LabelAction>>();
+	LinkedList<Integer> fontSizeStack = new LinkedList<Integer>();
+
+	/**
+	 * Recycles this instance: resets all per-document state (the tag actions are kept) so it can parse another document.
+	 */
+	public void recycle() {
+		tokenBuffer.setLength(0);
+		textBuffer.setLength(0);
+		title = null; // otherwise a document without a title would report the previous document's title
+		inBody = 0;
+		inAnchor = 0;
+		inIgnorableElement = 0;
+		sbLastWasWhitespace = false;
+		textElementIdx = 0;
+		tagLevel = 0; // an aborted parse may have left open elements behind
+		textBlocks.clear();
+		blockTagLevel = -1;
+		lastStartTag = null;
+		lastEndTag = null;
+		lastEvent = null;
+		labelStacks.clear(); // one entry per element that was still open
+		offsetBlocks = 0;
+		currentContainedTextElements.clear();
+		fontSizeStack.clear(); // one entry per FONT element that was still open
+		flush = false;
+		inAnchorText = false;
+	}
+
+	/**
+	 * Constructs a {@link BoilerpipeHTMLContentHandler} using the shared
+	 * {@link DefaultTagActionMap#INSTANCE}.
+	 */
+	public BoilerpipeHTMLContentHandler() {
+		this(DefaultTagActionMap.INSTANCE);
+	}
+
+	/**
+	 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given
+	 * {@link TagActionMap}.
+	 * The map is stored by reference (not copied) and consulted for every start and end tag.
+	 * @param tagActions
+	 *            The {@link TagActionMap} to use, e.g.
+	 *            {@link DefaultTagActionMap}.
+	 */
+	public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) {
+		this.tagActions = tagActions;
+	}
+
+	// @Override -- hands any text still buffered at the end of the document to flushBlock()
+	public void endDocument() throws SAXException {
+		flushBlock();
+	}
+
+	// @Override -- no-op: this handler does nothing with namespace prefix mappings
+	public void endPrefixMapping(String prefix) throws SAXException {
+	}
+
+	// @Override -- ignorable whitespace contributes at most one blank to the buffers
+	public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+		final boolean needsBlank = !sbLastWasWhitespace;
+		sbLastWasWhitespace = true;
+		if (needsBlank) {
+			textBuffer.append(' ');
+			tokenBuffer.append(' ');
+		}
+	}
+
+	// @Override -- no-op: processing instructions are ignored
+	public void processingInstruction(String target, String data)
+			throws SAXException {
+	}
+
+	// @Override -- no-op: the locator is not stored
+	public void setDocumentLocator(Locator locator) {
+	}
+
+	// @Override -- no-op: skipped entities are ignored
+	public void skippedEntity(String name) throws SAXException {
+	}
+
+	// @Override -- no-op: state is NOT reset here; recycle() is the method that resets it
+	public void startDocument() throws SAXException {
+	}
+
+	// @Override -- no-op: this handler does nothing with namespace prefix mappings
+	public void startPrefixMapping(String prefix, String uri)
+			throws SAXException {
+	}
+
+	// @Override -- opens a label slot for the element and runs its TagAction, if any
+	public void startElement(String uri, String localName, String qName,
+			Attributes atts) throws SAXException {
+		labelStacks.add(null);
+		final TagAction action = tagActions.get(localName);
+		if (action == null) {
+			// tags without an action count as a nesting level and request a flush
+			tagLevel++;
+			flush = true;
+		} else {
+			if (action.changesTagLevel()) {
+				tagLevel++;
+			}
+			final boolean flushRequested = action.start(this, localName, qName, atts);
+			flush = flushRequested | flush;
+		}
+		lastEvent = Event.START_TAG;
+		lastStartTag = localName;
+	}
+
+	// @Override -- runs the element's TagAction, flushes if requested, then closes its label slot
+	public void endElement(String uri, String localName, String qName)
+			throws SAXException {
+		final TagAction action = tagActions.get(localName);
+		final boolean levelChanges;
+		if (action == null) {
+			flush = true; // tags without an action always request a flush ...
+			levelChanges = true; // ... and always count as a nesting level
+		} else {
+			final boolean flushRequested = action.end(this, localName, qName);
+			flush = flushRequested | flush;
+			levelChanges = action.changesTagLevel();
+		}
+		if (levelChanges) {
+			tagLevel--;
+		}
+		if (flush) {
+			flushBlock();
+		}
+		lastEvent = Event.END_TAG;
+		lastEndTag = localName;
+		labelStacks.removeLast();
+	}
+
+	// @Override -- normalizes the whitespace of the chunk and appends it to both buffers
+	public void characters(char[] ch, int start, int length)
+			throws SAXException {
+		textElementIdx++; // counted for every call, including chunks that are dropped below
+
+	
+		if (flush) { // a preceding tag requested a block boundary: close the old block first
+			flushBlock();
+			flush = false;
+		}
+
+		if (inIgnorableElement != 0) {
+			return;
+		}
+
+		char c;
+		boolean startWhitespace = false;
+		boolean endWhitespace = false;
+		if (length == 0) {
+			return;
+		}
+		// Turn every whitespace character into a plain blank. NOTE: this writes into the array passed in.
+		final int end = start + length;
+		for (int i = start; i < end; i++) {
+			if (Character.isWhitespace(ch[i])) {
+				ch[i] = ' ';
+			}
+		}
+		while (start < end) { // strip leading blanks, remembering that there were some
+			c = ch[start];
+			if (c == ' ') {
+				startWhitespace = true;
+				start++;
+				length--;
+			} else {
+				break;
+			}
+		}
+		while (length > 0) { // strip trailing blanks likewise
+			c = ch[start + length - 1];
+			if (c == ' ') {
+				endWhitespace = true;
+				length--;
+			} else {
+				break;
+			}
+		}
+		if (length == 0) { // the chunk was whitespace only: it contributes at most one blank
+			if (startWhitespace || endWhitespace) {
+				if (!sbLastWasWhitespace) {
+					textBuffer.append(' ');
+					tokenBuffer.append(' ');
+				}
+				sbLastWasWhitespace = true;
+			} else { // NOTE(review): appears unreachable -- a non-empty chunk that trims to nothing had leading blanks
+				sbLastWasWhitespace = false;
+			}
+			lastEvent = Event.WHITESPACE;
+			return;
+		}
+		if (startWhitespace) { // keep one separating blank unless whitespace was appended last
+			if (!sbLastWasWhitespace) {
+				textBuffer.append(' ');
+				tokenBuffer.append(' ');
+			}
+		}
+		// The first real text of a block fixes the tag level recorded for that block.
+		if (blockTagLevel == -1) {
+			blockTagLevel = tagLevel;
+		}
+
+		textBuffer.append(ch, start, length);
+		tokenBuffer.append(ch, start, length);
+		if (endWhitespace) {
+			textBuffer.append(' ');
+			tokenBuffer.append(' ');
+		}
+
+		sbLastWasWhitespace = endWhitespace;
+		lastEvent = Event.CHARACTERS;
+		// Remember that this text element contributed to the pending block.
+		currentContainedTextElements.set(textElementIdx);
+	}
+
+	List<TextBlock> getTextBlocks() { // package-private; returns the live internal list, not a copy
+		return textBlocks;
+	}
+
+	public void flushBlock() { // turns the buffered text into a TextBlock (or, outside the body, possibly the title)
+		if (inBody == 0) {
+			if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) {
+				setTitle(tokenBuffer.toString().trim());
+			}
+			textBuffer.setLength(0);
+			tokenBuffer.setLength(0);
+			return; // NOTE(review): blockTagLevel/currentContainedTextElements keep values set by head text -- confirm intended
+		}
+		// Nothing buffered, or a single character with the whitespace flag set: no block is emitted.
+		final int length = tokenBuffer.length();
+		switch (length) {
+		case 0:
+			return;
+		case 1:
+			if (sbLastWasWhitespace) {
+				textBuffer.setLength(0);
+				tokenBuffer.setLength(0);
+				return;
+			}
+		}
+		final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer);
+		// The statistics below are computed as if the words were wrapped at maxLineLength columns.
+		int numWords = 0;
+		int numLinkedWords = 0;
+		int numWrappedLines = 0;
+		int currentLineLength = -1; // don't count the first space
+		final int maxLineLength = 80;
+		int numTokens = 0;
+		int numWordsCurrentLine = 0;
+
+		for (String token : tokens) {
+			if (ANCHOR_TEXT_START.equals(token)) { // marker tokens only toggle the state, they are not counted
+				inAnchorText = true;
+			} else if (ANCHOR_TEXT_END.equals(token)) {
+				inAnchorText = false;
+			} else if (isWord(token)) {
+				numTokens++;
+				numWords++;
+				numWordsCurrentLine++;
+				if (inAnchorText) {
+					numLinkedWords++;
+				}
+				final int tokenLength = token.length();
+				currentLineLength += tokenLength + 1; // +1 for the separating space
+				if (currentLineLength > maxLineLength) { // the word does not fit: it opens a new wrapped line
+					numWrappedLines++;
+					currentLineLength = tokenLength;
+					numWordsCurrentLine = 1;
+				}
+			} else {
+				numTokens++; // token without any letter or number character: counted, but not as a word
+			}
+		}
+		if (numTokens == 0) {
+			return; // NOTE(review): the buffers are not cleared on this path -- confirm intended
+		}
+		int numWordsInWrappedLines;
+		if (numWrappedLines == 0) { // everything fits on one line: all words count and the line count becomes 1
+			numWordsInWrappedLines = numWords;
+			numWrappedLines = 1;
+		} else { // the words on the last, unfinished line are left out
+			numWordsInWrappedLines = numWords - numWordsCurrentLine;
+		}
+
+		TextBlock tb = new TextBlock(textBuffer.toString().trim(),
+				currentContainedTextElements, numWords, numLinkedWords,
+				numWordsInWrappedLines, numWrappedLines, offsetBlocks);
+		currentContainedTextElements = new BitSet(); // the old BitSet now belongs to the block just created
+
+		offsetBlocks++;
+
+		textBuffer.setLength(0);
+		tokenBuffer.setLength(0);
+
+		tb.setTagLevel(blockTagLevel);
+		addTextBlock(tb);
+		blockTagLevel = -1; // the next block determines its own level in characters()
+	}
+
+	/** Labels {@code tb} with the newest known font size and all pending label actions, then stores it. */
+	protected void addTextBlock(final TextBlock tb) {
+		for (final Integer fontSize : fontSizeStack) {
+			if (fontSize != null) {
+				tb.addLabel("font-" + fontSize);
+				break;
+			}
+		}
+		for (final LinkedList<LabelAction> actionsOfElement : labelStacks) {
+			if (actionsOfElement == null) {
+				continue;
+			}
+			for (final LabelAction action : actionsOfElement) {
+				if (action != null) {
+					action.addTo(tb);
+				}
+			}
+		}
+		textBlocks.add(tb);
+	}
+
+	// One character of the Unicode classes L (letter), Nd, Nl or No (numbers).
+	private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern.compile("[\\p{L}\\p{Nd}\\p{Nl}\\p{No}]");
+	private static boolean isWord(final String token) { // a word contains at least one such character
+		final java.util.regex.Matcher wordCharacters = PAT_VALID_WORD_CHARACTER.matcher(token);
+		return wordCharacters.find();
+	}
+
+	private enum Event { // kind of the most recent SAX event seen by this handler
+		START_TAG, END_TAG, CHARACTERS, WHITESPACE
+	}
+
+	public String getTitle() { // null if no non-empty title has been set
+		return title;
+	}
+
+	/** Stores {@code s} as the title; {@code null} and empty strings are ignored. */
+	public void setTitle(String s) {
+		if (s != null && s.length() > 0) {
+			title = s;
+		}
+	}
+
+	/**
+	 * Builds a {@link TextDocument} from the title and the {@link TextBlock}s
+	 * collected so far. Any text that is still buffered is flushed first.
+	 * NOTE: Only call this after parsing.
+	 * @return The {@link TextDocument}
+	 */
+	public TextDocument toTextDocument() {
+		flushBlock(); // just to be sure that nothing stays behind in the buffers
+		final String documentTitle = getTitle();
+		final List<TextBlock> blocks = getTextBlocks();
+		return new TextDocument(documentTitle, blocks);
+	}
+
+	/** Appends one blank to both buffers unless whitespace was the last thing appended. */
+	public void addWhitespaceIfNecessary() {
+		if (sbLastWasWhitespace) { return; }
+		sbLastWasWhitespace = true;
+		textBuffer.append(' ');
+		tokenBuffer.append(' ');
+	}
+
+	/** Registers {@code la} for the innermost open element, creating that element's action list on first use. */
+	public void addLabelAction(final LabelAction la) throws IllegalStateException {
+		LinkedList<LabelAction> actionsOfElement = labelStacks.getLast();
+		if (actionsOfElement == null) {
+			actionsOfElement = new LinkedList<LabelAction>();
+			labelStacks.removeLast();
+			labelStacks.addLast(actionsOfElement);
+		}
+		actionsOfElement.add(la);
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java
new file mode 100644
index 0000000..79dcc72
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java
@@ -0,0 +1,76 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+
+import de.l3s.boilerpipe.BoilerpipeDocumentSource;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * A simple SAX Parser, used by {@link BoilerpipeSAXInput}. The parser uses <a
+ * href="http://nekohtml.sourceforge.net/">CyberNeko</a> to parse HTML content.
+ * @author Christian Kohlschütter
+ */
+public class BoilerpipeHTMLParser extends AbstractSAXParser implements BoilerpipeDocumentSource {
+    private BoilerpipeHTMLContentHandler contentHandler; // null while a non-boilerpipe handler is installed
+    /** Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler. */
+    public BoilerpipeHTMLParser() {
+        this(new BoilerpipeHTMLContentHandler());
+    }
+
+    /**
+     * Constructs a {@link BoilerpipeHTMLParser} using the given {@link BoilerpipeHTMLContentHandler}.
+     * @param contentHandler the handler that collects the text blocks during parsing
+     */
+    public BoilerpipeHTMLParser(BoilerpipeHTMLContentHandler contentHandler) {
+        super(new HTMLConfiguration());
+        setContentHandler(contentHandler);
+    }
+    /** Creates a parser without a content handler; the {@code ignore} argument is not used. */
+    protected BoilerpipeHTMLParser(boolean ignore) {
+        super(new HTMLConfiguration());
+    }
+
+    /** Installs {@code contentHandler} and remembers it for {@link #toTextDocument()}. */
+    public void setContentHandler(final BoilerpipeHTMLContentHandler contentHandler) {
+        this.contentHandler = contentHandler;
+        super.setContentHandler(contentHandler);
+    }
+    /** Overloads are chosen by static type, so a boilerpipe handler can arrive here as well. */
+    public void setContentHandler(final org.xml.sax.ContentHandler contentHandler) {
+        this.contentHandler = (contentHandler instanceof BoilerpipeHTMLContentHandler)
+                ? (BoilerpipeHTMLContentHandler) contentHandler : null;
+        super.setContentHandler(contentHandler);
+    }
+
+    /**
+     * Returns a {@link TextDocument} containing the extracted {@link TextBlock}s.
+     * NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
+     * @return The {@link TextDocument}
+     * @throws IllegalStateException if no {@link BoilerpipeHTMLContentHandler} is installed
+     */
+    public TextDocument toTextDocument() {
+        if (contentHandler == null) {
+            throw new IllegalStateException("No BoilerpipeHTMLContentHandler has been set");
+        }
+        return contentHandler.toTextDocument();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java
new file mode 100644
index 0000000..f95fd41
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java
@@ -0,0 +1,73 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeInput;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * A {@link BoilerpipeInput} that obtains its {@link TextDocument} by running a
+ * SAX-based HTML parser over an {@link InputSource}.
+ * @author Christian Kohlschütter
+ */
+public final class BoilerpipeSAXInput implements BoilerpipeInput {
+    /** The source handed to the parser by {@code getTextDocument}. */
+    private final InputSource is;
+
+    /**
+     * Creates a new instance of {@link BoilerpipeSAXInput} for the given {@link InputSource}.
+     *
+     * @param is the input to parse
+     * @throws SAXException declared in the signature; this constructor itself only stores {@code is}
+     */
+    public BoilerpipeSAXInput(final InputSource is) throws SAXException {
+        this.is = is;
+    }
+
+    /**
+     * Retrieves the {@link TextDocument} using a freshly created {@link BoilerpipeHTMLParser}.
+     */
+    public TextDocument getTextDocument() throws BoilerpipeProcessingException {
+        final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
+        return getTextDocument(defaultParser);
+    }
+
+    /**
+     * Retrieves the {@link TextDocument} using the given HTML parser.
+     *
+     * @param parser The parser used to transform the input into boilerpipe's internal representation.
+     * @return The retrieved {@link TextDocument}
+     * @throws BoilerpipeProcessingException wrapping the I/O or SAX error raised while parsing
+     */
+    public TextDocument getTextDocument(final BoilerpipeHTMLParser parser) throws BoilerpipeProcessingException {
+        try {
+            parser.parse(is);
+            return parser.toTextDocument();
+        } catch (IOException ioFailure) {
+            throw new BoilerpipeProcessingException(ioFailure);
+        } catch (SAXException saxFailure) {
+            throw new BoilerpipeProcessingException(saxFailure);
+        }
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java b/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java
new file mode 100644
index 0000000..7b9c410
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java
@@ -0,0 +1,357 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.labels.LabelAction;
+
+/**
+ * Collects the common {@link TagAction}s that are performed whenever particular tags occur during HTML parsing.
+ * 
+ * @author Christian Kohlschütter
+ */
+public abstract class CommonTagActions {
+
+	private CommonTagActions() { // never called in this file: the class only holds constants and nested classes
+	}
+
+    /** Runs two {@link TagAction}s in sequence; a flush is requested if either one requests it. */
+    public static final class Chained implements TagAction {
+        private final TagAction t1;
+        private final TagAction t2;
+        public Chained(final TagAction t1, final TagAction t2) {
+            this.t1 = t1;
+            this.t2 = t2;
+        }
+
+        public boolean start(BoilerpipeHTMLContentHandler instance,
+                String localName, String qName, Attributes atts) throws SAXException {
+            // both actions must run, hence no short-circuit evaluation
+            final boolean first = t1.start(instance, localName, qName, atts);
+            final boolean second = t2.start(instance, localName, qName, atts);
+            return first | second;
+        }
+        public boolean end(BoilerpipeHTMLContentHandler instance,
+                String localName, String qName) throws SAXException {
+            final boolean first = t1.end(instance, localName, qName);
+            final boolean second = t2.end(instance, localName, qName);
+            return first | second;
+        }
+
+        public boolean changesTagLevel() {
+            return t1.changesTagLevel() ? true : t2.changesTagLevel();
+        }
+    }
+
+    /**
+     * Marks this tag as "ignorable": while at least one such element is open, the
+     * content handler drops all character data. Start and end both request a flush.
+     */
+    public static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
+        public boolean start(final BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName, final Attributes atts) {
+            // a counter rather than a flag, so nested ignorable elements are handled
+            instance.inIgnorableElement += 1;
+            return true;
+        }
+
+        public boolean end(final BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            instance.inIgnorableElement -= 1;
+            return true;
+        }
+
+        public boolean changesTagLevel() {
+            return true;
+        }
+    };
+    
+    /**
+     * Marks this tag as "anchor" (this should usually only be set for the <code>&lt;A&gt;</code> tag).
+     * Anchor tags may not be nested.
+     * 
+     * There is a bug in certain versions of NekoHTML which still allows nested tags.
+     * If boilerpipe encounters such nestings, it prints a warning to stderr and tries to recover.
+     */
+    public static final TagAction TA_ANCHOR_TEXT = new TagAction() {
+
+        public boolean start(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName,
+                final Attributes atts) throws SAXException {
+            if (instance.inAnchor++ > 0) {
+                // as nested A elements are not allowed per specification, we
+                // are probably reaching this branch due to a bug in the XML
+                // parser
+            	System.err.println("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...");
+            	// undo the increment above, so that the counter never exceeds 1
+            	end(instance, localName, qName);
+            }
+            if (instance.inIgnorableElement == 0) {
+                instance.addWhitespaceIfNecessary();
+                instance.tokenBuffer
+                        .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_START);
+                instance.tokenBuffer.append(' ');
+                instance.sbLastWasWhitespace = true;
+            }
+            return false;
+        }
+
+        public boolean end(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            // The "> 0" guard keeps the surplus end tag of a nested pair from driving the counter
+            // negative; otherwise no later anchor would reach 0 and emit its end marker again.
+            if (instance.inAnchor > 0 && --instance.inAnchor == 0 && instance.inIgnorableElement == 0) {
+                instance.addWhitespaceIfNecessary();
+                instance.tokenBuffer
+                        .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_END);
+                instance.tokenBuffer.append(' ');
+                instance.sbLastWasWhitespace = true;
+            }
+            return false;
+        }
+
+        public boolean changesTagLevel() {
+        	return true;
+        }
+    };
+    
+    /**
+     * Marks this tag the body element (this should usually only be set for the <code>&lt;BODY&gt;</code> tag).
+     */
+    public static final TagAction TA_BODY = new TagAction() {
+        public boolean start(final BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName, final Attributes atts) {
+            // flush first, so that pending text is still handled under the old inBody value
+            instance.flushBlock();
+            instance.inBody += 1;
+            return false;
+        }
+
+        public boolean end(final BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            instance.flushBlock();
+            instance.inBody -= 1;
+            return false;
+        }
+
+        public boolean changesTagLevel() {
+            return true;
+        }
+    };
+
+    /**
+     * Marks this tag a simple "inline" element, which generates whitespace, but no new block.
+     */
+    public static final TagAction TA_INLINE_WHITESPACE = new TagAction() {
+
+        public boolean start(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName,
+                final Attributes atts) {
+            instance.addWhitespaceIfNecessary();
+            return false; // false = no flush requested
+        }
+
+        public boolean end(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            instance.addWhitespaceIfNecessary();
+            return false; // false = no flush requested
+        }
+        
+        public boolean changesTagLevel() {
+        	return false;
+        }
+    };
+    
+    /**
+     * @deprecated Use {@link #TA_INLINE_WHITESPACE} instead; this constant refers to the very same object.
+     */
+    @Deprecated
+    public static final TagAction TA_INLINE = TA_INLINE_WHITESPACE;
+    
+    /**
+     * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block.
+     */
+    public static final TagAction TA_INLINE_NO_WHITESPACE = new TagAction() {
+
+        public boolean start(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName,
+                final Attributes atts) {
+            return false; // nothing to do, and no flush requested
+        }
+
+        public boolean end(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            return false; // nothing to do, and no flush requested
+        }
+
+        public boolean changesTagLevel() {
+        	return false;
+        }
+    };
+    private static final Pattern PAT_FONT_SIZE = Pattern
+            .compile("([\\+\\-]?)([0-9])"); // group 1: optional sign, group 2: exactly one digit
+    
+    /**
+     * Explicitly marks this tag a simple "block-level" element: its start and its end both request a block flush.
+     */
+    public static final TagAction TA_BLOCK_LEVEL = new TagAction() {
+
+        public boolean start(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName,
+                final Attributes atts) {
+            return true; // true = request a flush
+        }
+
+        public boolean end(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            return true; // true = request a flush
+        }
+        
+        public boolean changesTagLevel() {
+        	return true;
+        }
+    };    
+    
+    /**
+     * Special TagAction for the <code>&lt;FONT&gt;</code> tag, which keeps track of the
+     * absolute and relative font size.
+     * <p>
+     * Every FONT start tag pushes exactly one entry onto the front of the handler's
+     * font-size stack and the matching end tag removes it again. The entry is the
+     * resolved size, or <code>null</code> when the tag has no usable <code>size</code>
+     * attribute. A usable value is a single digit, optionally preceded by a sign; a
+     * signed value is applied to the closest enclosing known size, or to 3 if there is none.
+     */
+    public static final TagAction TA_FONT = new TagAction() {
+
+        public boolean start(final BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName,
+                final Attributes atts) {
+            final String sizeAttr = atts.getValue("size");
+            Matcher m = null;
+            if (sizeAttr != null) {
+                m = PAT_FONT_SIZE.matcher(sizeAttr);
+            }
+
+            // stays null for a missing or non-matching size attribute
+            Integer resolved = null;
+            if (m != null && m.matches()) {
+                final String sign = m.group(1);
+                final int val = Integer.parseInt(m.group(2));
+                if (sign.length() == 0) {
+                    // absolute
+                    resolved = val;
+                } else {
+                    // relative: find the closest enclosing size that is known
+                    int prevSize = 3;
+                    for (final Integer s : instance.fontSizeStack) {
+                        if (s != null) {
+                            prevSize = s;
+                            break;
+                        }
+                    }
+                    if (sign.charAt(0) == '+') {
+                        resolved = prevSize + val;
+                    } else {
+                        resolved = prevSize - val;
+                    }
+                }
+            }
+
+            instance.fontSizeStack.addFirst(resolved);
+            return false;
+        }
+
+        public boolean end(final BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            // drop the entry pushed by the matching start()
+            instance.fontSizeStack.removeFirst();
+            return false;
+        }
+
+        public boolean changesTagLevel() {
+            return false;
+        }
+    };
+
+    /**
+     * {@link TagAction} for inline elements, which registers a {@link LabelAction} with the content handler
+     * when the element starts; it generates whitespace, but requests no new block.
+     */
+    public static final class InlineTagLabelAction implements TagAction {
+
+        private final LabelAction action;
+
+        public InlineTagLabelAction(final LabelAction action) {
+            this.action = action;
+        }
+
+        public boolean start(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName,
+                final Attributes atts) {
+            instance.addWhitespaceIfNecessary();
+            instance.addLabelAction(action);
+            return false; // false = no flush requested
+        }
+
+        public boolean end(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            instance.addWhitespaceIfNecessary();
+            return false; // false = no flush requested
+        }
+        
+        public boolean changesTagLevel() {
+        	return false;
+        }
+    }
+
+    /**
+     * {@link TagAction} for block-level elements, which registers a {@link LabelAction} with the content handler
+     * when the element starts; start and end both request a block flush.
+     */
+    public static final class BlockTagLabelAction implements TagAction {
+
+        private final LabelAction action;
+
+        public BlockTagLabelAction(final LabelAction action) {
+            this.action = action;
+        }
+
+        public boolean start(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName,
+                final Attributes atts) {
+            instance.addLabelAction(action);
+            return true; // true = request a flush
+        }
+
+        public boolean end(BoilerpipeHTMLContentHandler instance,
+                final String localName, final String qName) {
+            return true; // true = request a flush
+        }
+        
+        public boolean changesTagLevel() {
+        	return true;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java b/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java
new file mode 100644
index 0000000..cf48dac
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java
@@ -0,0 +1,86 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import de.l3s.boilerpipe.labels.DefaultLabels;
+import de.l3s.boilerpipe.labels.LabelAction;
+
+
+/**
+ * Default {@link TagAction}s. Seem to work well.
+ * <p>
+ * The constructor registers one action per upper-case HTML element name; the
+ * preconfigured map is available through {@link #INSTANCE}.
+ * 
+ * @see TagActionMap
+ */
+public class DefaultTagActionMap extends TagActionMap {
+
+    /**
+     * Serialization version identifier.
+     */
+    private static final long serialVersionUID = 1L;
+
+    /** Shared, preconfigured instance of the default mapping. */
+    public static final TagActionMap INSTANCE = new DefaultTagActionMap();
+
+    /**
+     * Populates the map with the default tag actions. Protected, so the
+     * defaults are normally obtained through {@link #INSTANCE} or extended
+     * by a subclass.
+     */
+    protected DefaultTagActionMap() {
+        // Elements registered with the "ignorable element" action.
+        setTagAction("STYLE", CommonTagActions.TA_IGNORABLE_ELEMENT);
+        setTagAction("SCRIPT", CommonTagActions.TA_IGNORABLE_ELEMENT);
+        setTagAction("OPTION", CommonTagActions.TA_IGNORABLE_ELEMENT);
+        setTagAction("OBJECT", CommonTagActions.TA_IGNORABLE_ELEMENT);
+        setTagAction("EMBED", CommonTagActions.TA_IGNORABLE_ELEMENT);
+        setTagAction("APPLET", CommonTagActions.TA_IGNORABLE_ELEMENT);
+        setTagAction("LINK", CommonTagActions.TA_IGNORABLE_ELEMENT);
+
+        setTagAction("A", CommonTagActions.TA_ANCHOR_TEXT);
+        setTagAction("BODY", CommonTagActions.TA_BODY);
+
+        // Inline formatting elements registered with the "no whitespace" inline action.
+        setTagAction("STRIKE", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("U", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("B", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("I", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("EM", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("STRONG", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("SPAN", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        
+        // New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
+        setTagAction("SUP", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        
+        // New in 1.2
+        setTagAction("CODE", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("TT", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("SUB", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        setTagAction("VAR", CommonTagActions.TA_INLINE_NO_WHITESPACE);
+
+
+        // Inline elements registered with the "whitespace" inline action.
+        setTagAction("ABBR", CommonTagActions.TA_INLINE_WHITESPACE);
+        setTagAction("ACRONYM", CommonTagActions.TA_INLINE_WHITESPACE);
+
+        setTagAction("FONT", CommonTagActions.TA_INLINE_NO_WHITESPACE); // could also use TA_FONT 
+
+        // added in 1.1.1
+        setTagAction("NOSCRIPT", CommonTagActions.TA_IGNORABLE_ELEMENT);
+        
+        // New in 1.3
+        // List items and headings get block-level label actions; the three
+        // heading levels additionally share the generic HEADING label.
+		setTagAction("LI", new CommonTagActions.BlockTagLabelAction(
+				new LabelAction(DefaultLabels.LI)));
+		setTagAction("H1", new CommonTagActions.BlockTagLabelAction(
+				new LabelAction(DefaultLabels.H1, DefaultLabels.HEADING)));
+		setTagAction("H2", new CommonTagActions.BlockTagLabelAction(
+				new LabelAction(DefaultLabels.H2, DefaultLabels.HEADING)));
+		setTagAction("H3", new CommonTagActions.BlockTagLabelAction(
+				new LabelAction(DefaultLabels.H3, DefaultLabels.HEADING)));
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java
new file mode 100644
index 0000000..9cf2d87
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java
@@ -0,0 +1,41 @@
+package de.l3s.boilerpipe.sax;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+
+import org.xml.sax.InputSource;
+
+/**
+ * An {@link InputSourceable} for {@link HTMLFetcher}: raw document bytes
+ * together with the charset that should be used to decode them.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class HTMLDocument implements InputSourceable {
+	private final Charset charset;
+	private final byte[] data;
+
+	/**
+	 * Wraps already-encoded document bytes.
+	 * 
+	 * @param data
+	 *            the encoded document; the array is stored as-is, not copied
+	 * @param charset
+	 *            the charset the bytes are encoded in
+	 */
+	public HTMLDocument(final byte[] data, final Charset charset) {
+		this.charset = charset;
+		this.data = data;
+	}
+	
+	/**
+	 * Wraps a document given as text; the text is encoded as UTF-8.
+	 * 
+	 * @param data
+	 *            the document text
+	 */
+	public HTMLDocument(final String data) {
+		this(data.getBytes(Charset.forName("utf-8")), Charset.forName("utf-8"));
+	}
+	
+	/**
+	 * @return the charset of the bytes returned by {@link #getData()}
+	 */
+	public Charset getCharset() {
+		return this.charset;
+	}
+	
+	/**
+	 * @return the internal byte array (not a copy)
+	 */
+	public byte[] getData() {
+		return this.data;
+	}
+	
+	/**
+	 * Creates a fresh {@link InputSource} that reads the document bytes from
+	 * the beginning and announces the document's charset as its encoding.
+	 * 
+	 * @return a new input source on every call
+	 */
+	public InputSource toInputSource() {
+		final ByteArrayInputStream bytes = new ByteArrayInputStream(data);
+		final InputSource source = new InputSource(bytes);
+		source.setEncoding(charset.name());
+		return source;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java
new file mode 100644
index 0000000..2c2e0c4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java
@@ -0,0 +1,79 @@
+package de.l3s.boilerpipe.sax;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * A very simple HTTP/HTML fetcher, really just for demo purposes.
+ * 
+ * @author Christian Kohlschütter
+ */
+public class HTMLFetcher {
+	private HTMLFetcher() {
+	}
+
+	/**
+	 * Extracts the value of a trailing <code>charset</code> parameter from a
+	 * Content-Type header. The parameter name is matched case-insensitively
+	 * and the value may optionally be enclosed in double quotes.
+	 */
+	private static final Pattern PAT_CHARSET = Pattern.compile(
+			"charset=\"?([^\"; ]+)\"?$", Pattern.CASE_INSENSITIVE);
+
+	/**
+	 * Fetches the document at the given URL, using {@link URLConnection}.
+	 * <p>
+	 * Only <code>text/html</code> responses are accepted. The charset is taken
+	 * from the Content-Type header when it names a charset known to this JVM;
+	 * otherwise Cp1252 is assumed. Gzip-encoded bodies are decompressed; any
+	 * other Content-Encoding is reported on stderr and the body is returned
+	 * as received.
+	 * 
+	 * @param url
+	 *            the location of the HTML document
+	 * @return the document at the given URL
+	 * @throws IOException
+	 *             if the content type is missing or not <code>text/html</code>,
+	 *             or if reading the response fails
+	 */
+	public static HTMLDocument fetch(final URL url) throws IOException {
+		final URLConnection conn = url.openConnection();
+		final String ct = conn.getContentType();
+
+		if (ct == null
+				|| !(ct.equals("text/html") || ct.startsWith("text/html;"))) {
+			throw new IOException("Unsupported content type: "+ct);
+		}
+
+		Charset cs = Charset.forName("Cp1252");
+		final Matcher m = PAT_CHARSET.matcher(ct);
+		if (m.find()) {
+			try {
+				cs = Charset.forName(m.group(1));
+			} catch (IllegalArgumentException e) {
+				// keep default: covers both an unsupported charset and a
+				// syntactically illegal charset name sent by the server
+			}
+		}
+
+		final ByteArrayOutputStream bos = new ByteArrayOutputStream();
+		InputStream in = conn.getInputStream();
+		try {
+			final String encoding = conn.getContentEncoding();
+			if (encoding != null) {
+				if ("gzip".equalsIgnoreCase(encoding)) {
+					in = new GZIPInputStream(in);
+				} else {
+					System.err.println("WARN: unsupported Content-Encoding: "
+							+ encoding);
+				}
+			}
+
+			final byte[] buf = new byte[4096];
+			int r;
+			while ((r = in.read(buf)) != -1) {
+				bos.write(buf, 0, r);
+			}
+		} finally {
+			// release the connection's stream even when reading fails
+			in.close();
+		}
+
+		return new HTMLDocument(bos.toByteArray(), cs);
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java
new file mode 100644
index 0000000..4a300c3
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java
@@ -0,0 +1,530 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URL;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Highlights text blocks in an HTML document that have been marked as "content"
+ * in the corresponding {@link TextDocument}.
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class HTMLHighlighter {
+
+	private Map<String, Set<String>> tagWhitelist = null;
+
+	/**
+	 * Creates a new {@link HTMLHighlighter}, which is set-up to return the full
+	 * HTML text, with the extracted text portion <b>highlighted</b>.
+	 * 
+	 * @return a new instance that keeps the default highlight markup and
+	 *         stylesheet
+	 */
+	public static HTMLHighlighter newHighlightingInstance() {
+		return new HTMLHighlighter(false);
+	}
+
+	/**
+	 * Creates a new {@link HTMLHighlighter}, which is set-up to return only the
+	 * extracted HTML text, including enclosed markup.
+	 * 
+	 * @return a new instance with highlight-only output enabled and empty
+	 *         pre/post highlight strings
+	 */
+	public static HTMLHighlighter newExtractingInstance() {
+		return new HTMLHighlighter(true);
+	}
+
+	private HTMLHighlighter(final boolean extractHTML) {
+		if (extractHTML) {
+			setOutputHighlightOnly(true);
+			setExtraStyleSheet("\n<style type=\"text/css\">\n"
+					+ "A:before { content:' '; } \n" //
+					+ "A:after { content:' '; } \n" //
+					+ "SPAN:before { content:' '; } \n" //
+					+ "SPAN:after { content:' '; } \n" //
+					+ "</style>\n");
+			setPreHighlight("");
+			setPostHighlight("");
+		}
+	}
+
+	/**
+	 * Highlights the content of an HTML document that is available as a
+	 * String, by delegating to {@link #process(TextDocument, InputSource)}.
+	 * 
+	 * @param doc
+	 *            The processed {@link TextDocument}.
+	 * @param origHTML
+	 *            The original HTML document.
+	 * @return The highlighted HTML.
+	 * @throws BoilerpipeProcessingException
+	 *             if the HTML cannot be processed
+	 */
+	public String process(final TextDocument doc, final String origHTML)
+			throws BoilerpipeProcessingException {
+		final StringReader htmlReader = new StringReader(origHTML);
+		final InputSource source = new InputSource(htmlReader);
+		return process(doc, source);
+	}
+
+	/**
+	 * Processes the given {@link TextDocument} and the original HTML text (as
+	 * an {@link InputSource}).
+	 * <p>
+	 * When highlight-only output is enabled, the generated markup is cleaned
+	 * up repeatedly until nothing changes any more: element pairs without any
+	 * content are dropped, and an outermost element that merely wraps other
+	 * markup is unwrapped.
+	 * 
+	 * @param doc
+	 *            The processed {@link TextDocument}.
+	 * @param is
+	 *            The original HTML document.
+	 * @return The highlighted HTML.
+	 * @throws BoilerpipeProcessingException
+	 *             if the HTML cannot be parsed
+	 */
+	public String process(final TextDocument doc, final InputSource is)
+			throws BoilerpipeProcessingException {
+		final Implementation implementation = new Implementation();
+		implementation.process(doc, is);
+
+		String html = implementation.html.toString();
+		if (outputHighlightOnly) {
+			Matcher m;
+
+			boolean repeat = true;
+			while (repeat) {
+				repeat = false;
+				m = PAT_TAG_NO_TEXT.matcher(html);
+				if (m.find()) {
+					repeat = true;
+					html = m.replaceAll("");
+				}
+
+				m = PAT_SUPER_TAG.matcher(html);
+				if (m.find()) {
+					repeat = true;
+					// Refer to the capture group instead of passing the
+					// captured HTML as the replacement template: '$' and
+					// '\' inside the page text would otherwise be
+					// interpreted as group references/escapes.
+					html = m.replaceAll("$1");
+				}
+			}
+		}
+
+		return html;
+	}
+
+	private static final Pattern PAT_TAG_NO_TEXT = Pattern
+			.compile("<[^/][^>]*></[^>]*>");
+	private static final Pattern PAT_SUPER_TAG = Pattern
+			.compile("^<[^>]*>(<.*?>)</[^>]*>$");
+
+	/**
+	 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
+	 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
+	 * 
+	 * @param url
+	 *            The location of the HTML document.
+	 * @param extractor
+	 *            The extractor applied to the parsed document before
+	 *            highlighting.
+	 * @return The highlighted HTML.
+	 * @throws IOException
+	 *             if the document cannot be fetched
+	 * @throws BoilerpipeProcessingException
+	 *             if extraction or highlighting fails
+	 * @throws SAXException
+	 *             if the fetched HTML cannot be parsed
+	 */
+	public String process(final URL url, final BoilerpipeExtractor extractor)
+			throws IOException, BoilerpipeProcessingException, SAXException {
+		final HTMLDocument fetched = HTMLFetcher.fetch(url);
+
+		// First pass over the bytes: build the text document and classify it.
+		final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(
+				fetched.toInputSource());
+		final TextDocument doc = saxInput.getTextDocument();
+		extractor.process(doc);
+
+		// Second pass: a fresh input source over the same bytes.
+		return process(doc, fetched.toInputSource());
+	}
+
+	private boolean outputHighlightOnly = false;
+	private String extraStyleSheet = "\n<style type=\"text/css\">\n"
+			+ ".x-boilerpipe-mark1 {" + " text-decoration:none; "
+			+ "background-color: #ffff42 !important; "
+			+ "color: black !important; " + "display:inline !important; "
+			+ "visibility:visible !important; }\n" + //
+			"</style>\n";
+	private String preHighlight = "<span class=\"x-boilerpipe-mark1\">";
+	private String postHighlight = "</span>";
+
+	/**
+	 * If true, only HTML enclosed within highlighted content will be returned
+	 * 
+	 * @return the current setting; <code>false</code> unless changed
+	 */
+	public boolean isOutputHighlightOnly() {
+		return outputHighlightOnly;
+	}
+
+	/**
+	 * Sets whether only HTML enclosed within highlighted content will be
+	 * returned, or the whole HTML document.
+	 * 
+	 * @param outputHighlightOnly
+	 *            <code>true</code> to return only highlighted content
+	 */
+	public void setOutputHighlightOnly(boolean outputHighlightOnly) {
+		this.outputHighlightOnly = outputHighlightOnly;
+	}
+
+	/**
+	 * Returns the extra stylesheet definition that will be inserted in the HEAD
+	 * element.
+	 * 
+	 * By default, this corresponds to a simple definition that marks text in
+	 * class "x-boilerpipe-mark1" as inline text with yellow background.
+	 * 
+	 * @return the stylesheet markup, as plain HTML
+	 */
+	public String getExtraStyleSheet() {
+		return extraStyleSheet;
+	}
+
+	/**
+	 * Sets the extra stylesheet definition that will be inserted in the HEAD
+	 * element.
+	 * 
+	 * To disable, set it to the empty string: ""
+	 * 
+	 * @param extraStyleSheet
+	 *            Plain HTML; it is appended to the output verbatim, without
+	 *            any escaping
+	 */
+	public void setExtraStyleSheet(String extraStyleSheet) {
+		this.extraStyleSheet = extraStyleSheet;
+	}
+
+	/**
+	 * Returns the string that will be inserted before any highlighted HTML
+	 * block.
+	 * 
+	 * By default, this corresponds to
+	 * <code>&lt;span class=&quot;x-boilerpipe-mark1&quot;&gt;</code>
+	 * 
+	 * @return the markup inserted before highlighted text
+	 */
+	public String getPreHighlight() {
+		return preHighlight;
+	}
+
+	/**
+	 * Sets the string that will be inserted prior to any highlighted HTML
+	 * block.
+	 * 
+	 * To disable, set it to the empty string: ""
+	 * 
+	 * @param preHighlight
+	 *            Plain HTML; it is appended to the output verbatim
+	 */
+	public void setPreHighlight(String preHighlight) {
+		this.preHighlight = preHighlight;
+	}
+
+	/**
+	 * Returns the string that will be inserted after any highlighted HTML
+	 * block.
+	 * 
+	 * By default, this corresponds to <code>&lt;/span&gt;</code>
+	 * 
+	 * @return the markup inserted after highlighted text
+	 */
+	public String getPostHighlight() {
+		return postHighlight;
+	}
+
+	/**
+	 * Sets the string that will be inserted after any highlighted HTML block.
+	 * 
+	 * To disable, set it to the empty string: ""
+	 * 
+	 * @param postHighlight
+	 *            Plain HTML; it is appended to the output verbatim
+	 */
+	public void setPostHighlight(String postHighlight) {
+		this.postHighlight = postHighlight;
+	}
+
+	/**
+	 * Per-element hooks invoked by {@link Implementation} around its handling
+	 * of start and end tags. All hooks are no-ops by default; actions override
+	 * only the ones they need.
+	 */
+	private abstract static class TagAction {
+		/** Called at the very beginning of <code>startElement</code>. */
+		void beforeStart(final Implementation instance, final String localName) {
+		}
+
+		/** Called at the end of <code>startElement</code>, even if the tag was not written. */
+		void afterStart(final Implementation instance, final String localName) {
+		}
+
+		/** Called at the very beginning of <code>endElement</code>. */
+		void beforeEnd(final Implementation instance, final String localName) {
+		}
+
+		/** Called at the end of <code>endElement</code>, even if the tag was not written. */
+		void afterEnd(final Implementation instance, final String localName) {
+		}
+	}
+
+	/**
+	 * Suppresses output for an element and everything nested in it: the
+	 * ignore depth is raised before the start tag is handled and lowered only
+	 * after the end tag has been handled, so neither tag is written.
+	 */
+	private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
+		void beforeStart(final Implementation instance, final String localName) {
+			instance.inIgnorableElement++;
+		}
+
+		void afterEnd(final Implementation instance, final String localName) {
+			instance.inIgnorableElement--;
+		}
+	};
+
+	/**
+	 * Like {@link #TA_IGNORABLE_ELEMENT}, but additionally appends the
+	 * highlighter's extra stylesheet to the output when the element's end tag
+	 * is reached. The stylesheet is appended while the ignore counter is still
+	 * raised, i.e. the element's own tags are not written.
+	 */
+	private static final TagAction TA_HEAD = new TagAction() {
+		void beforeStart(final Implementation instance, final String localName) {
+			instance.inIgnorableElement++;
+		}
+
+		void beforeEnd(final Implementation instance, String localName) {
+			// appended unconditionally, regardless of the ignore depth
+			instance.html.append(instance.hl.extraStyleSheet);
+		}
+
+		void afterEnd(final Implementation instance, final String localName) {
+			instance.inIgnorableElement--;
+		}
+	};
+	private static Map<String, TagAction> TAG_ACTIONS = new HashMap<String, TagAction>();
+	static {
+		TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("OBJECT", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT);
+		// NOTE: you might want to comment this out:
+		TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT);
+
+		TAG_ACTIONS.put("HEAD", TA_HEAD);
+	}
+
+	/**
+	 * SAX parser and content handler in one: parses the original HTML and
+	 * re-serializes it into {@link #html}, wrapping the character runs that
+	 * are flagged in {@link #contentBitSet} with the highlighter's pre/post
+	 * highlight strings. One instance is used per processed document.
+	 */
+	private final class Implementation extends AbstractSAXParser implements
+			ContentHandler {
+		/** The generated output markup. */
+		StringBuilder html = new StringBuilder();
+
+		/** Nesting depth of ignorable elements; output is written only while this is 0. */
+		private int inIgnorableElement = 0;
+		/**
+		 * Running number of the current <code>characters</code> event;
+		 * incremented before use, so the first run has index 1.
+		 */
+		private int characterElementIdx = 0;
+		/** Union of the text-element indexes of all content blocks. */
+		private final BitSet contentBitSet = new BitSet();
+		/** The enclosing highlighter, made reachable for the static tag actions. */
+		private final HTMLHighlighter hl = HTMLHighlighter.this;
+
+		Implementation() {
+			super(new HTMLConfiguration());
+			setContentHandler(this);
+		}
+
+		/**
+		 * Collects the text-element indexes of all content blocks of the
+		 * document and then parses the HTML, filling {@link #html}.
+		 * 
+		 * @param doc
+		 *            the processed document whose content blocks are to be
+		 *            highlighted
+		 * @param is
+		 *            the original HTML
+		 * @throws BoilerpipeProcessingException
+		 *             wrapping any {@link SAXException} or {@link IOException}
+		 *             raised while parsing
+		 */
+		void process(final TextDocument doc, final InputSource is)
+				throws BoilerpipeProcessingException {
+			for (TextBlock block : doc.getTextBlocks()) {
+				if (block.isContent()) {
+					final BitSet bs = block.getContainedTextElements();
+					if (bs != null) {
+						contentBitSet.or(bs);
+					}
+				}
+			}
+
+			try {
+				parse(is);
+			} catch (SAXException e) {
+				throw new BoilerpipeProcessingException(e);
+			} catch (IOException e) {
+				throw new BoilerpipeProcessingException(e);
+			}
+		}
+
+		// The following ContentHandler callbacks are intentionally no-ops.
+
+		public void endDocument() throws SAXException {
+		}
+
+		public void endPrefixMapping(String prefix) throws SAXException {
+		}
+
+		public void ignorableWhitespace(char[] ch, int start, int length)
+				throws SAXException {
+		}
+
+		public void processingInstruction(String target, String data)
+				throws SAXException {
+		}
+
+		public void setDocumentLocator(Locator locator) {
+		}
+
+		public void skippedEntity(String name) throws SAXException {
+		}
+
+		public void startDocument() throws SAXException {
+		}
+
+		/**
+		 * Writes the start tag with its attributes unless the element lies
+		 * inside an ignorable element or is excluded by the tag whitelist.
+		 * The registered {@link TagAction} hooks run before and after, the
+		 * latter even when the tag is skipped.
+		 */
+		public void startElement(String uri, String localName, String qName,
+				Attributes atts) throws SAXException {
+			// assumes the parser reports element names in the same (upper)
+			// case as the TAG_ACTIONS keys - TODO confirm
+			TagAction ta = TAG_ACTIONS.get(localName);
+			if (ta != null) {
+				ta.beforeStart(this, localName);
+			}
+
+			// HACK: remove existing highlight
+			// (the SPAN tag itself is still written, only without attributes)
+			boolean ignoreAttrs = false;
+			if ("SPAN".equalsIgnoreCase(localName)) {
+				String classVal = atts.getValue("class");
+				if ("x-boilerpipe-mark1".equals(classVal)) {
+					ignoreAttrs = true;
+				}
+			}
+
+			try {
+				if (inIgnorableElement == 0) {
+					if (outputHighlightOnly) {
+						// boolean highlight = contentBitSet
+						// .get(characterElementIdx);
+
+						// if (!highlight) {
+						// return;
+						// }
+					}
+
+					// null whitelist: no filtering at all; otherwise the tag
+					// must be a key and only the listed attributes are kept
+					final Set<String> whitelistAttributes;
+					if (tagWhitelist == null) {
+						whitelistAttributes = null;
+					} else {
+						whitelistAttributes = tagWhitelist.get(qName);
+						if (whitelistAttributes == null) {
+							// skip
+							return;
+						}
+					}
+
+					html.append('<');
+					html.append(qName);
+					if (!ignoreAttrs) {
+						final int numAtts = atts.getLength();
+						for (int i = 0; i < numAtts; i++) {
+							final String attr = atts.getQName(i);
+
+							if (whitelistAttributes != null
+									&& !whitelistAttributes.contains(attr)) {
+								// skip
+								continue;
+							}
+
+							final String value = atts.getValue(i);
+							html.append(' ');
+							html.append(attr);
+							html.append("=\"");
+							html.append(xmlEncode(value));
+							html.append("\"");
+						}
+					}
+					html.append('>');
+				}
+			} finally {
+				// runs on the early "skip" return as well
+				if (ta != null) {
+					ta.afterStart(this, localName);
+				}
+			}
+		}
+
+		/**
+		 * Writes the end tag unless the element lies inside an ignorable
+		 * element or is not a key of the tag whitelist. The registered
+		 * {@link TagAction} hooks run before and after, the latter even when
+		 * the tag is skipped.
+		 */
+		public void endElement(String uri, String localName, String qName)
+				throws SAXException {
+			TagAction ta = TAG_ACTIONS.get(localName);
+			if (ta != null) {
+				ta.beforeEnd(this, localName);
+			}
+
+			try {
+				if (inIgnorableElement == 0) {
+					if (outputHighlightOnly) {
+						// boolean highlight = contentBitSet
+						// .get(characterElementIdx);
+
+						// if (!highlight) {
+						// return;
+						// }
+					}
+
+					// NOTE(review): startElement skips a tag whose whitelist
+					// entry is null, whereas this check only tests for the
+					// key - a key mapped to null would yield an end tag
+					// without a start tag.
+					if (tagWhitelist != null
+							&& !tagWhitelist.containsKey(qName)) {
+						// skip
+						return;
+					}
+
+					html.append("</");
+					html.append(qName);
+					html.append('>');
+				}
+			} finally {
+				if (ta != null) {
+					ta.afterEnd(this, localName);
+				}
+			}
+		}
+
+		/**
+		 * Writes the (escaped) text run, wrapped in the highlight strings if
+		 * its index is flagged as content. In highlight-only mode, runs that
+		 * are not content are dropped. The run counter advances for every
+		 * event, including those inside ignorable elements.
+		 */
+		public void characters(char[] ch, int start, int length)
+				throws SAXException {
+			characterElementIdx++;
+			if (inIgnorableElement == 0) {
+
+				boolean highlight = contentBitSet.get(characterElementIdx);
+
+				if (!highlight && outputHighlightOnly) {
+					return;
+				}
+
+				if (highlight) {
+					html.append(preHighlight);
+				}
+				html.append(xmlEncode(String.valueOf(ch, start, length)));
+				if (highlight) {
+					html.append(postHighlight);
+				}
+			}
+		}
+
+		public void startPrefixMapping(String prefix, String uri)
+				throws SAXException {
+		}
+
+	}
+
+	private static String xmlEncode(final String in) {
+		if (in == null) {
+			return "";
+		}
+		char c;
+		StringBuilder out = new StringBuilder(in.length());
+
+		for (int i = 0; i < in.length(); i++) {
+			c = in.charAt(i);
+			switch (c) {
+			case '<':
+				out.append("&lt;");
+				break;
+			case '>':
+				out.append("&gt;");
+				break;
+			case '&':
+				out.append("&amp;");
+				break;
+			case '"':
+				out.append("&quot;");
+				break;
+			default:
+				out.append(c);
+			}
+		}
+
+		return out.toString();
+	}
+
+	/**
+	 * Returns the tag whitelist used when writing the output.
+	 * 
+	 * @return a map from element name (as reported in the SAX qName) to the
+	 *         set of attribute names that are kept for that element, or
+	 *         <code>null</code> (the default) if tags and attributes are not
+	 *         filtered
+	 */
+	public Map<String, Set<String>> getTagWhitelist() {
+		return tagWhitelist;
+	}
+
+	/**
+	 * Sets the tag whitelist used when writing the output. Elements that are
+	 * not a key of the map are written without their tags; for whitelisted
+	 * elements only the attributes contained in the mapped set are written.
+	 * 
+	 * @param tagWhitelist
+	 *            map from element name to permitted attribute names, or
+	 *            <code>null</code> to disable filtering; the map is stored
+	 *            as-is, not copied
+	 */
+	public void setTagWhitelist(Map<String, Set<String>> tagWhitelist) {
+		this.tagWhitelist = tagWhitelist;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java b/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java
new file mode 100644
index 0000000..3a9bcbe
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java
@@ -0,0 +1,277 @@
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.Image;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Extracts the images that are enclosed by extracted content. 
+ * 
+ * @author Christian Kohlschütter
+ */
+public final class ImageExtractor {
+	public static final ImageExtractor INSTANCE = new ImageExtractor();
+	
+	/**
+	 * Returns the singleton instance of {@link ImageExtractor}, i.e. the same
+	 * object as {@link #INSTANCE}.
+	 * 
+	 * @return the singleton instance of {@link ImageExtractor}.
+	 */
+	public static ImageExtractor getInstance() {
+		return INSTANCE;
+	}
+
+	/** Not instantiable from outside; use {@link #getInstance()} or {@link #INSTANCE}. */
+	private ImageExtractor() {
+	}
+
+	/**
+	 * Collects the images enclosed by the content of an HTML document that is
+	 * available as a String, by delegating to
+	 * {@link #process(TextDocument, InputSource)}.
+	 * 
+	 * @param doc
+	 *            The processed {@link TextDocument}.
+	 * @param origHTML
+	 *            The original HTML document.
+	 * @return A List of enclosed {@link Image}s
+	 * @throws BoilerpipeProcessingException
+	 *             if the HTML cannot be processed
+	 */
+	public List<Image> process(final TextDocument doc,
+			final String origHTML) throws BoilerpipeProcessingException {
+		final StringReader htmlReader = new StringReader(origHTML);
+		final InputSource source = new InputSource(htmlReader);
+		return process(doc, source);
+	}
+
+	/**
+	 * Parses the original HTML (given as an {@link InputSource}) with a fresh
+	 * handler and returns the images that handler collected for the content
+	 * blocks of the given {@link TextDocument}.
+	 * 
+	 * @param doc
+	 *            The processed {@link TextDocument}.
+	 * @param is
+	 *            The original HTML document.
+	 * @return A List of enclosed {@link Image}s
+	 * @throws BoilerpipeProcessingException
+	 *             if the HTML cannot be parsed
+	 */
+	public List<Image> process(final TextDocument doc,
+			final InputSource is) throws BoilerpipeProcessingException {
+		final Implementation handler = new Implementation();
+		handler.process(doc, is);
+		final List<Image> enclosedImages = handler.linksHighlight;
+		return enclosedImages;
+	}
+	
+	/**
+	 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
+	 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
+	 * 
+	 * @param url
+	 *            The location of the HTML document.
+	 * @param extractor
+	 *            The extractor applied to the parsed document before the
+	 *            images are collected.
+	 * @return A List of enclosed {@link Image}s
+	 * @throws IOException
+	 *             if the document cannot be fetched
+	 * @throws BoilerpipeProcessingException
+	 *             if extraction or image collection fails
+	 * @throws SAXException
+	 *             if the fetched HTML cannot be parsed
+	 */
+	public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
+			throws IOException, BoilerpipeProcessingException, SAXException {
+		final HTMLDocument fetched = HTMLFetcher.fetch(url);
+
+		// First pass over the bytes: build the text document and classify it.
+		final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(
+				fetched.toInputSource());
+		final TextDocument doc = saxInput.getTextDocument();
+		extractor.process(doc);
+
+		// Second pass: a fresh input source over the same bytes.
+		return process(doc, fetched.toInputSource());
+	}
+	
+
+	/**
+	 * SAX parser and content handler in one: parses the original HTML and
+	 * collects IMG elements that occur while the handler is in a highlighted
+	 * (content) region. One instance is used per processed document.
+	 */
+	private final class Implementation extends AbstractSAXParser implements
+			ContentHandler {
+		/** The result: images confirmed by a following highlighted text run. */
+		List<Image> linksHighlight = new ArrayList<Image>();
+		/** Images seen since the last highlighted text run, not yet confirmed. */
+		private List<Image> linksBuffer = new ArrayList<Image>();
+
+		/** Nesting depth of ignorable elements; elements and text are examined only while this is 0. */
+		private int inIgnorableElement = 0;
+		/**
+		 * Running number of the current <code>characters</code> event;
+		 * incremented before use, so the first run has index 1.
+		 */
+		private int characterElementIdx = 0;
+		/** Union of the text-element indexes of all content blocks. */
+		private final BitSet contentBitSet = new BitSet();
+		
+		/** Whether the most recent significant text run was flagged as content. */
+		private boolean inHighlight = false;
+
+		Implementation() {
+			super(new HTMLConfiguration());
+			setContentHandler(this);
+		}
+
+		/**
+		 * Collects the text-element indexes of all content blocks of the
+		 * document and then parses the HTML, filling {@link #linksHighlight}.
+		 * 
+		 * @param doc
+		 *            the processed document whose content blocks delimit the
+		 *            images of interest
+		 * @param is
+		 *            the original HTML
+		 * @throws BoilerpipeProcessingException
+		 *             wrapping any {@link SAXException} or {@link IOException}
+		 *             raised while parsing
+		 */
+		void process(final TextDocument doc, final InputSource is)
+				throws BoilerpipeProcessingException {
+			for (TextBlock block : doc.getTextBlocks()) {
+				if (block.isContent()) {
+					final BitSet bs = block.getContainedTextElements();
+					if (bs != null) {
+						contentBitSet.or(bs);
+					}
+				}
+			}
+
+			try {
+				parse(is);
+			} catch (SAXException e) {
+				throw new BoilerpipeProcessingException(e);
+			} catch (IOException e) {
+				throw new BoilerpipeProcessingException(e);
+			}
+		}
+
+		// The following ContentHandler callbacks are intentionally no-ops.
+
+		public void endDocument() throws SAXException {
+		}
+
+		public void endPrefixMapping(String prefix) throws SAXException {
+		}
+
+		public void ignorableWhitespace(char[] ch, int start, int length)
+				throws SAXException {
+		}
+
+		public void processingInstruction(String target, String data)
+				throws SAXException {
+		}
+
+		public void setDocumentLocator(Locator locator) {
+		}
+
+		public void skippedEntity(String name) throws SAXException {
+		}
+
+		public void startDocument() throws SAXException {
+		}
+
+		/**
+		 * Buffers an IMG element with a non-empty <code>src</code> attribute
+		 * if the handler is currently in a highlighted region and not inside
+		 * an ignorable element. The registered {@link TagAction} hooks run
+		 * before and after.
+		 */
+		public void startElement(String uri, String localName, String qName,
+				Attributes atts) throws SAXException {
+			// assumes the parser reports element names in the same (upper)
+			// case as the TAG_ACTIONS keys - TODO confirm
+			TagAction ta = TAG_ACTIONS.get(localName);
+			if (ta != null) {
+				ta.beforeStart(this, localName);
+			}
+
+			try {
+				if (inIgnorableElement == 0) {
+					if(inHighlight && "IMG".equalsIgnoreCase(localName)) {
+						String src = atts.getValue("src");
+						if(src != null && src.length() > 0) {
+							// width, height and alt are passed on as found, possibly null
+							linksBuffer.add(new Image(src, atts.getValue("width"), atts.getValue("height"), atts.getValue("alt")));
+						}
+					}
+				}
+			} finally {
+				if (ta != null) {
+					ta.afterStart(this, localName);
+				}
+			}
+		}
+
+		/**
+		 * Only runs the registered {@link TagAction} hooks; end tags carry no
+		 * image information.
+		 */
+		public void endElement(String uri, String localName, String qName)
+				throws SAXException {
+			TagAction ta = TAG_ACTIONS.get(localName);
+			if (ta != null) {
+				ta.beforeEnd(this, localName);
+			}
+
+			try {
+				if (inIgnorableElement == 0) {
+					//
+				}
+			} finally {
+				if (ta != null) {
+					ta.afterEnd(this, localName);
+				}
+			}
+		}
+
+		/**
+		 * Updates the highlight state from the current text run. Empty or
+		 * whitespace-only runs that are not flagged as content leave the
+		 * state untouched. When a run is flagged as content, all buffered
+		 * images are moved to the result list. The run counter advances for
+		 * every event, including those inside ignorable elements.
+		 */
+		public void characters(char[] ch, int start, int length)
+				throws SAXException {
+			characterElementIdx++;
+			if (inIgnorableElement == 0) {
+
+				boolean highlight = contentBitSet.get(characterElementIdx);
+				if(!highlight) {
+					if(length == 0) {
+						return;
+					}
+					boolean justWhitespace = true;
+					for(int i=start;i<start+length;i++) {
+						if(!Character.isWhitespace(ch[i])) {
+							justWhitespace = false;
+							break;
+						}
+					}
+					if(justWhitespace) {
+						return;
+					}
+				}
+
+				inHighlight = highlight;
+				if(inHighlight) {
+					// images buffered so far are now known to be followed by content
+					linksHighlight.addAll(linksBuffer);
+					linksBuffer.clear();
+				}
+			}
+		}
+
+		public void startPrefixMapping(String prefix, String uri)
+				throws SAXException {
+		}
+
+	}
+	
+	
+	/**
+	 * Makes the handler disregard an element and everything nested in it: the
+	 * ignore depth is raised before the start tag is handled and lowered only
+	 * after the end tag has been handled.
+	 */
+	private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
+		void beforeStart(final Implementation instance, final String localName) {
+			instance.inIgnorableElement++;
+		}
+
+		void afterEnd(final Implementation instance, final String localName) {
+			instance.inIgnorableElement--;
+		}
+	};
+
+	private static Map<String, TagAction> TAG_ACTIONS = new HashMap<String, TagAction>();
+	static {
+		TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT);
+		TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT);
+
+		TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT);
+	}
+	
+	/**
+	 * Per-element hooks invoked by {@link Implementation} around its handling
+	 * of start and end tags. All hooks are no-ops by default; actions override
+	 * only the ones they need.
+	 */
+	private abstract static class TagAction {
+		/** Called at the very beginning of <code>startElement</code>. */
+		void beforeStart(final Implementation instance, final String localName) {
+		}
+
+		/** Called at the end of <code>startElement</code>. */
+		void afterStart(final Implementation instance, final String localName) {
+		}
+
+		/** Called at the very beginning of <code>endElement</code>. */
+		void beforeEnd(final Implementation instance, final String localName) {
+		}
+
+		/** Called at the end of <code>endElement</code>. */
+		void afterEnd(final Implementation instance, final String localName) {
+		}
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java b/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java
new file mode 100644
index 0000000..ef8010e
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java
@@ -0,0 +1,12 @@
+package de.l3s.boilerpipe.sax;
+
+import org.xml.sax.InputSource;
+
+/**
+ * An InputSourceable can return an arbitrary number of new {@link InputSource}s for a given document.
+ * 
+ * @author Christian Kohlschütter
+ */
+public interface InputSourceable {
+	/**
+	 * Creates an {@link InputSource} for the document.
+	 * 
+	 * @return an input source for the document; per the type description,
+	 *         each call is expected to yield a new one
+	 */
+	InputSource toInputSource();
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java b/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java
new file mode 100644
index 0000000..e54a3da
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java
@@ -0,0 +1,105 @@
+package de.l3s.boilerpipe.sax;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+import de.l3s.boilerpipe.labels.LabelAction;
+
+/**
+ * Assigns labels for the element name, CSS classes and id of an element to the
+ * corresponding {@link TextBlock}. All labels start with
+ * {@link DefaultLabels#MARKUP_PREFIX}; class labels continue with <code>.</code>,
+ * id labels with <code>#</code>. Digit runs in class/id values become <code>#</code>.
+ * @author Christian Kohlschütter
+ */
+public final class MarkupTagAction implements TagAction {
+	private static final Pattern PAT_NUM = Pattern.compile("[0-9]+");
+	/** Class names are separated by runs of whitespace, not only by single spaces. */
+	private static final Pattern PAT_WHITESPACE = Pattern.compile("\\s+");
+
+	private final boolean isBlockLevel;
+	/** Labels of each element started but not yet ended via this action, in start order. */
+	private final LinkedList<List<String>> labelStack = new LinkedList<List<String>>();
+
+	/** @param isBlockLevel value returned by start, end and changesTagLevel */
+	public MarkupTagAction(final boolean isBlockLevel) {
+		this.isBlockLevel = isBlockLevel;
+	}
+
+	/**
+	 * Builds the labels of the starting element, combines each with the labels of
+	 * the elements still open, and registers them all as one {@link LabelAction}.
+	 * @return the <code>isBlockLevel</code> flag passed to the constructor
+	 */
+	public boolean start(BoilerpipeHTMLContentHandler instance, String localName,
+			String qName, Attributes atts) throws SAXException {
+		List<String> labels = new ArrayList<String>(5);
+		labels.add(DefaultLabels.MARKUP_PREFIX + localName);
+		String classVal = atts.getValue("class");
+		if (classVal != null) {
+			// Trim before the emptiness test and split on any whitespace: a blank
+			// attribute adds no label and repeated separators add no empty segment.
+			classVal = PAT_NUM.matcher(classVal).replaceAll("#").trim();
+			if (classVal.length() > 0) {
+				labels.add(DefaultLabels.MARKUP_PREFIX + "." + PAT_WHITESPACE.matcher(classVal).replaceAll("."));
+				String[] vals = PAT_WHITESPACE.split(classVal);
+				if (vals.length > 1) {
+					for (String s : vals) {
+						labels.add(DefaultLabels.MARKUP_PREFIX + "." + s);
+					}
+				}
+			}
+		}
+
+		String id = atts.getValue("id");
+		if (id != null && id.length() > 0) {
+			id = PAT_NUM.matcher(id).replaceAll("#");
+			labels.add(DefaultLabels.MARKUP_PREFIX + "#" + id);
+		}
+		Set<String> ancestors = getAncestorLabels();
+		// Two entries per (label, ancestor) pair plus the label itself.
+		List<String> labelsWithAncestors = new ArrayList<String>(
+				(2 * ancestors.size() + 1) * labels.size());
+		for (String l : labels) {
+			for (String an : ancestors) {
+				labelsWithAncestors.add(an);
+				labelsWithAncestors.add(an + " " + l);
+			}
+			labelsWithAncestors.add(l);
+		}
+		instance.addLabelAction(new LabelAction(labelsWithAncestors
+				.toArray(new String[labelsWithAncestors.size()])));
+		labelStack.add(labels);
+		return isBlockLevel;
+	}
+
+	/** Forgets the labels of the most recently started element. @return isBlockLevel */
+	public boolean end(BoilerpipeHTMLContentHandler instance, String localName,
+			String qName) throws SAXException {
+		labelStack.removeLast();
+		return isBlockLevel;
+	}
+
+	/** @return the <code>isBlockLevel</code> flag passed to the constructor */
+	public boolean changesTagLevel() {
+		return isBlockLevel;
+	}
+
+	/** @return the distinct labels of all elements currently on the label stack */
+	private Set<String> getAncestorLabels() {
+		Set<String> set = new HashSet<String>();
+		for (List<String> labels : labelStack) {
+			set.addAll(labels);
+		}
+		return set;
+	}
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java b/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java
new file mode 100644
index 0000000..e6f1943
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *       
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.Image;
+import de.l3s.boilerpipe.document.Media;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.document.VimeoVideo;
+import de.l3s.boilerpipe.document.YoutubeVideo;
+import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
+import de.l3s.boilerpipe.sax.HTMLDocument;
+import de.l3s.boilerpipe.sax.HTMLFetcher;
+
+
+/**
+ * Extracts images as well as YouTube and Vimeo videos that are enclosed by extracted content.
+ *
+ * @author Christian Kohlschütter, manuel.codiga@gmail.com
+ */
+public final class MediaExtractor {
+
+	/** Shared instance handed out by {@link #getInstance()}. */
+        public static final MediaExtractor INSTANCE = new MediaExtractor();
+
+         /**
+          * @return the shared {@link MediaExtractor} instance, {@link #INSTANCE}.
+         */
+        public static MediaExtractor getInstance() {
+                return INSTANCE;
+        }
+
+
+
+        /**
+         * Extracts the media enclosed by the content of an already processed
+         * {@link TextDocument}, reading the markup from the original HTML text.
+         *
+         * @param doc the processed {@link TextDocument}
+         * @param origHTML the original HTML document
+         * @return the {@link Media} found, exactly as returned by
+         *         {@link #process(TextDocument, InputSource)}
+         * @throws BoilerpipeProcessingException if parsing the HTML fails
+         */
+        public List<Media> process(final TextDocument doc, final String origHTML)
+                        throws BoilerpipeProcessingException {
+                final InputSource htmlSource = new InputSource(new StringReader(origHTML));
+                return process(doc, htmlSource);
+        }
+
+        /**
+         * Extracts the media enclosed by the content of an already processed
+         * {@link TextDocument}, reading the markup from an {@link InputSource}.
+         *
+         * @param doc the processed {@link TextDocument}
+         * @param is the original HTML document
+         * @return the {@link Media} collected while parsing <code>is</code>; empty if
+         *         none was found
+         * @throws BoilerpipeProcessingException if parsing the HTML fails
+         */
+        public List<Media> process(final TextDocument doc, final InputSource is)
+                        throws BoilerpipeProcessingException {
+                final Implementation parser = new Implementation();
+                parser.process(doc, is);
+
+                return parser.linksHighlight;
+        }
+
+        /**
+         * Downloads the document at the given {@link URL} with {@link HTMLFetcher},
+         * classifies its text with the given {@link BoilerpipeExtractor} and then
+         * extracts the media enclosed by the resulting content.
+         *
+         * @param url the url of the document to fetch
+         * @param extractor extractor to use
+         * @return the {@link Media} found in the fetched document
+         * @throws IOException propagated from the fetch/parse helpers called here
+         * @throws BoilerpipeProcessingException if text or media extraction fails
+         * @throws SAXException propagated from the fetch/parse helpers called here
+         */
+        @SuppressWarnings("javadoc")
+        public List<Media> process(final URL url, final BoilerpipeExtractor extractor)
+                        throws IOException, BoilerpipeProcessingException, SAXException {
+                final HTMLDocument fetched = HTMLFetcher.fetch(url);
+                final InputSource forText = fetched.toInputSource();
+                final TextDocument textDoc = new BoilerpipeSAXInput(forText).getTextDocument();
+                extractor.process(textDoc);
+
+                // The markup is read a second time, hence a second InputSource.
+                final InputSource forMedia = fetched.toInputSource();
+                return process(textDoc, forMedia);
+        }
+
+        /**
+         * Parses the media (pictures, videos) out of an HTML document given as text.
+         * Exceptions raised while parsing or extracting are not propagated.
+         *
+         * @param doc the HTML document to parse the media out of
+         * @param extractor extractor to use
+         * @return list of extracted media, with size = 0 if no media found, or
+         *         <code>null</code> if any exception was thrown along the way
+         */
+        public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
+                final HTMLDocument htmlDoc = new HTMLDocument(doc);
+
+                try {
+                        final TextDocument textDoc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
+                        extractor.process(textDoc);
+                        return process(textDoc, htmlDoc.toInputSource());
+                } catch (Exception e) {
+                        // Best effort: the cause is dropped and the caller only sees null.
+                        return null;
+                }
+        }
+
+
+        private final class Implementation extends AbstractSAXParser implements
+                        ContentHandler {
+                List<Media> linksHighlight = new ArrayList<Media>(); // media accepted so far; handed to callers as the result
+                private List<Media> linksBuffer = new ArrayList<Media>(); // media seen inside content, waiting for the next content text
+
+                private int inIgnorableElement = 0; // nesting depth inside elements registered in TAG_ACTIONS
+                private int characterElementIdx = 0; // number of characters() calls so far
+                private final BitSet contentBitSet = new BitSet(); // text element indexes contained in content blocks
+
+                private boolean inHighlight = false; // whether the last significant text belonged to content
+
+                Implementation() { // parses with NekoHTML's HTMLConfiguration and receives the SAX events itself
+	                    super(new HTMLConfiguration());
+	                    setContentHandler(this);
+	            }
+
+                /**
+                 * Marks the text elements of every content block of <code>doc</code>, then parses
+                 * <code>is</code>; parse failures are rethrown as {@link BoilerpipeProcessingException}.
+                 */
+                void process(final TextDocument doc, final InputSource is) throws BoilerpipeProcessingException {
+                        for (final TextBlock block : doc.getTextBlocks()) {
+                                final BitSet elements = block.isContent() ? block.getContainedTextElements() : null;
+                                if (elements != null) {
+                                        contentBitSet.or(elements);
+                                }
+                        }
+                        try {
+                                parse(is);
+                        } catch (SAXException parseError) {
+                                throw new BoilerpipeProcessingException(parseError);
+                        } catch (IOException readError) {
+                                throw new BoilerpipeProcessingException(readError);
+                        }
+                }
+
+				public void endDocument() throws SAXException { // no-op: the end of the document needs no handling here
+                }
+                // No-op SAX callback: prefix mappings are not used.
+                public void endPrefixMapping(String prefix) throws SAXException {
+                }
+                // No-op SAX callback: ignorable whitespace is discarded.
+                public void ignorableWhitespace(char[] ch, int start, int length)
+                                throws SAXException {
+                }
+                // No-op SAX callback: processing instructions are discarded.
+                public void processingInstruction(String target, String data)
+                                throws SAXException {
+                }
+                // No-op SAX callback: the locator is not kept.
+                public void setDocumentLocator(Locator locator) {
+                }
+                // No-op SAX callback: skipped entities are discarded.
+                public void skippedEntity(String name) throws SAXException {
+                }
+                // No-op SAX callback: nothing is prepared at document start.
+                public void startDocument() throws SAXException {
+                }
+
+                /**
+                 * Buffers the media referenced by IFRAME and IMG elements that start while the parser is
+                 * inside highlighted content and outside ignorable elements; elements without src are skipped.
+                 */
+                public void startElement(String uri, String localName, String qName,
+                                Attributes atts) throws SAXException {
+                    TagAction ta = TAG_ACTIONS.get(localName);
+                    if (ta != null) {
+                            ta.beforeStart(this, localName);
+                    }
+                    try {
+                        if (inIgnorableElement == 0 && inHighlight) {
+                            if ("IFRAME".equalsIgnoreCase(localName)) {
+                                bufferVideos(atts.getValue("src"));
+                            } else if ("IMG".equalsIgnoreCase(localName)) {
+                                bufferImage(atts);
+                            }
+                        }
+                    } finally {
+                        if (ta != null) {
+                            ta.afterStart(this, localName);
+                        }
+                    }
+                }
+
+                /** Buffers a video for an iframe src pointing to a YouTube or Vimeo player; a missing src or a path without segments is ignored. */
+                private void bufferVideos(final String rawSrc) {
+                    final String src = rawSrc == null ? "" : rawSrc.replaceAll("\\\\\"", "");
+                    final boolean youtube = src.contains("youtube.com/embed/");
+                    final boolean vimeo = src.contains("player.vimeo.com");
+                    if (!youtube && !vimeo) {
+                        return;
+                    }
+                    try {
+                        final String[] pathParts = new URL(src).getPath().split("/");
+                        final String videoId = pathParts.length > 0 ? pathParts[pathParts.length - 1] : null;
+                        if (youtube && videoId != null) {
+                            linksBuffer.add(new YoutubeVideo("http://www.youtube.com/watch?v=" + videoId, src));
+                        }
+                        if (vimeo && videoId != null) {
+                            linksBuffer.add(new VimeoVideo("http://vimeo.com/" + videoId, src));
+                        }
+                    } catch (MalformedURLException ignored) { // e.g. a relative src: no media is recorded
+                    }
+                }
+
+                /** Buffers an image for an IMG element with a non-empty, syntactically valid src. */
+                private void bufferImage(final Attributes atts) {
+                    final String src = atts.getValue("src");
+                    try {
+                        if (src != null && src.length() > 0) {
+                            new URI(src); // validation only; the parsed URI itself is not used
+                            linksBuffer.add(new Image(src, atts.getValue("width"), atts.getValue("height"), atts.getValue("alt")));
+                        }
+                    } catch (URISyntaxException ignored) { // malformed reference: the image is skipped
+                    }
+                }
+
+                /**
+                 * Runs the end-tag hooks of the action registered for this element, if
+                 * there is one. No other work is done at an end tag.
+                 */
+                public void endElement(String uri, String localName, String qName)
+                                throws SAXException {
+                        final TagAction action = TAG_ACTIONS.get(localName);
+                        if (action == null) {
+                                return;
+                        }
+
+                        action.beforeEnd(this, localName);
+                        // Nothing element-specific happens between the two hooks, so
+                        // they simply run back to back; afterEnd is skipped only if
+                        // beforeEnd throws.
+                        action.afterEnd(this, localName);
+                }
+
+                /**
+                 * Tracks whether the current text belongs to content and, if it does, moves
+                 * all buffered media to the result list. Text inside ignorable elements is
+                 * only counted; whitespace-only text outside content changes nothing.
+                 */
+                public void characters(char[] ch, int start, int length) throws SAXException {
+                        characterElementIdx++;
+                        if (inIgnorableElement != 0) {
+                                return;
+                        }
+                        final boolean isContent = contentBitSet.get(characterElementIdx);
+                        if (!isContent) {
+                                // Skip over leading whitespace; reaching the end means the
+                                // text is empty or blank and must not end a highlight.
+                                final int end = start + length;
+                                int pos = start;
+                                while (pos < end && Character.isWhitespace(ch[pos])) {
+                                        pos++;
+                                }
+                                if (pos >= end) {
+                                        return;
+                                }
+                        }
+                        inHighlight = isContent;
+                        if (isContent) {
+                                linksHighlight.addAll(linksBuffer);
+                                linksBuffer.clear();
+                        }
+                }
+
+                public void startPrefixMapping(String prefix, String uri)
+                                throws SAXException { // no-op SAX callback: prefix mappings are not used
+                }
+
+        }
+
+
+        /** Keeps the parser's count of currently open ignorable elements up to date. */
+        @SuppressWarnings("synthetic-access")
+        private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
+                @Override
+                void beforeStart(final Implementation parser, final String tagName) {
+                        parser.inIgnorableElement += 1; // entering: one level deeper
+                }
+                @Override
+                void afterEnd(final Implementation parser, final String tagName) {
+                        parser.inIgnorableElement -= 1; // leaving: one level back up
+                }
+        };
+
+        private static Map<String, TagAction> TAG_ACTIONS = new HashMap<String, TagAction>();
+        static {
+                TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT);
+                TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT);
+                TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT);
+                TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT);
+                TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT);
+                TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT);
+                TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT);
+
+                TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT);
+        }
+
+        /** Callbacks around the start and end of an element; each one is a no-op unless overridden. */
+        private abstract static class TagAction {
+                /** Called at the very beginning of startElement for a registered element. */
+                void beforeStart(final Implementation instance, final String localName) { }
+                /** Called when startElement is done with a registered element,
+                 *  also when handling it failed. */
+                void afterStart(final Implementation instance, final String localName) { }
+
+                /** Called at the very beginning of endElement for a registered element. */
+                void beforeEnd(final Implementation instance, final String localName) { }
+                /** Called when endElement is done with a registered element. */
+                void afterEnd(final Implementation instance, final String localName) { }
+        }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/TagAction.java b/src/main/java/de/l3s/boilerpipe/sax/TagAction.java
new file mode 100644
index 0000000..3ee8dcf
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/TagAction.java
@@ -0,0 +1,39 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Defines an action that is to be performed whenever a particular tag occurs
+ * during HTML parsing.
+ * 
+ * @author Christian Kohlschütter
+ */
+public interface TagAction {
+	/** Called for the start of a tag. NOTE(review): the meaning of the returned flag is defined by the caller, which is not visible here. */
+	boolean start(final BoilerpipeHTMLContentHandler instance,
+			final String localName, final String qName, final Attributes atts)
+			throws SAXException;
+	/** Called for the end of a tag. NOTE(review): the returned flag is presumably interpreted like that of start — confirm. */
+	boolean end(final BoilerpipeHTMLContentHandler instance,
+			final String localName, final String qName) throws SAXException;
+	/** NOTE(review): presumably tells whether this action affects the tag nesting level — confirm against the content handler. */
+	boolean changesTagLevel();
+}
\ No newline at end of file
diff --git a/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java b/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java
new file mode 100644
index 0000000..74ab275
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java
@@ -0,0 +1,60 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.util.HashMap;
+
+/**
+ * Base class for defining a set of {@link TagAction}s that are to be used for the
+ * HTML parsing process.
+ * 
+ * @see DefaultTagActionMap
+ * @author Christian Kohlschütter
+ */
+public abstract class TagActionMap extends HashMap<String, TagAction> {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Sets a particular {@link TagAction} for a given tag. Any existing TagAction for that tag
+     * will be removed and overwritten.
+     * 
+     * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case);
+     *            case conversion uses English rules, so it does not depend on the default locale
+     * @param action The {@link TagAction}
+     */
+    protected void setTagAction(final String tag, final TagAction action) {
+        put(tag.toUpperCase(java.util.Locale.ENGLISH), action);
+        put(tag.toLowerCase(java.util.Locale.ENGLISH), action);
+        put(tag, action);
+    }
+
+    /**
+     * Adds a particular {@link TagAction} for a given tag. If a TagAction already exists for that tag,
+     * a chained action, consisting of the previous and the new {@link TagAction} is created.
+     * 
+     * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
+     * @param action The {@link TagAction}
+     */
+    protected void addTagAction(final String tag, final TagAction action) {
+        // The existing action is looked up under the tag exactly as given.
+        final TagAction previousAction = get(tag);
+        final TagAction effective = (previousAction == null) ? action
+                : new CommonTagActions.Chained(previousAction, action);
+        setTagAction(tag, effective);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/package.html b/src/main/java/de/l3s/boilerpipe/sax/package.html
new file mode 100644
index 0000000..9772244
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+	<p>Classes related to parsing and producing HTML from/to Boilerpipe
+		TextDocuments.</p>
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java b/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java
new file mode 100644
index 0000000..e7997f0
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java
@@ -0,0 +1,45 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.util;
+
+import java.util.regex.Pattern;
+
+/**
+ * Tokenizes text at regex word boundaries (<code>\b</code>). The punctuation characters listed
+ * in the pattern below do not form tokens of their own but stay attached to their neighbours.
+ * @author Christian Kohlschütter
+ */
+public class UnicodeTokenizer {
+    private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b");
+    private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern
+            .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*");
+    /** Runs of spaces and boundary markers (U+2063); both patterns are compiled once, not per call. */
+    private static final Pattern PAT_SEPARATORS = Pattern.compile("[ \u2063]+");
+    private static final Pattern PAT_SPACES = Pattern.compile("[ ]+");
+
+    /**
+     * Tokenizes the text and returns an array of tokens.
+     * @param text The text
+     * @return The tokens; a single empty token if nothing is left after trimming
+     */
+    public static String[] tokenize(final CharSequence text) {
+        final String marked = PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063");
+        final String glued = PAT_NOT_WORD_BOUNDARY.matcher(marked).replaceAll("$1");
+        return PAT_SPACES.split(PAT_SEPARATORS.matcher(glued).replaceAll(" ").trim());
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/util/package.html b/src/main/java/de/l3s/boilerpipe/util/package.html
new file mode 100644
index 0000000..ab7a714
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/util/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+	<p>Some helper classes.</p>
+</body>
+</html>
diff --git a/src/main/java/org/cyberneko/html/HTMLElements.java b/src/main/java/org/cyberneko/html/HTMLElements.java
new file mode 100644
index 0000000..d200373
--- /dev/null
+++ b/src/main/java/org/cyberneko/html/HTMLElements.java
@@ -0,0 +1,794 @@
+/* 
+ * Copyright 2002-2009 Andy Clark, Marc Guillemot
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.cyberneko.html;
+
+/**
+ * Collection of HTML element information.
+ *
+ * @author Andy Clark
+ * @author Ahmed Ashour
+ * @author Marc Guillemot
+ *
+ * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $
+ */
+public class HTMLElements {
+
+    //
+    // Constants
+    //
+    
+    // element codes
+
+    // NOTE: The element codes *must* start with 0 and increment in
+    //       sequence. The parent and closes references, and the lookup
+    //       of an element by its code, depend on this assumption. -Ac
+
+    public static final short A = 0;
+    public static final short ABBR = A+1;
+    public static final short ACRONYM = ABBR+1;
+    public static final short ADDRESS = ACRONYM+1;
+    public static final short APPLET = ADDRESS+1;
+    public static final short AREA = APPLET+1;
+    public static final short B = AREA+1;
+    public static final short BASE = B+1;
+    public static final short BASEFONT = BASE+1;
+    public static final short BDO = BASEFONT+1;
+    public static final short BGSOUND = BDO+1;
+    public static final short BIG = BGSOUND+1;
+    public static final short BLINK = BIG+1;
+    public static final short BLOCKQUOTE = BLINK+1;
+    public static final short BODY = BLOCKQUOTE+1;
+    public static final short BR = BODY+1;
+    public static final short BUTTON = BR+1;
+    public static final short CAPTION = BUTTON+1;
+    public static final short CENTER = CAPTION+1;
+    public static final short CITE = CENTER+1;
+    public static final short CODE = CITE+1;
+    public static final short COL = CODE+1;
+    public static final short COLGROUP = COL+1;
+    public static final short COMMENT = COLGROUP+1;
+    public static final short DEL = COMMENT+1;
+    public static final short DFN = DEL+1;
+    public static final short DIR = DFN+1;
+    public static final short DIV = DIR+1;
+    public static final short DD = DIV+1;
+    public static final short DL = DD+1;
+    public static final short DT = DL+1;
+    public static final short EM = DT+1;
+    public static final short EMBED = EM+1;
+    public static final short FIELDSET = EMBED+1;
+    public static final short FONT = FIELDSET+1;
+    public static final short FORM = FONT+1;
+    public static final short FRAME = FORM+1;
+    public static final short FRAMESET = FRAME+1;
+    public static final short H1 = FRAMESET+1;
+    public static final short H2 = H1+1;
+    public static final short H3 = H2+1;
+    public static final short H4 = H3+1;
+    public static final short H5 = H4+1;
+    public static final short H6 = H5+1;
+    public static final short HEAD = H6+1;
+    public static final short HR = HEAD+1;
+    public static final short HTML = HR+1;
+    public static final short I = HTML+1;
+    public static final short IFRAME = I+1;
+    public static final short ILAYER = IFRAME+1;
+    public static final short IMG = ILAYER+1;
+    public static final short INPUT = IMG+1;
+    public static final short INS = INPUT+1;
+    public static final short ISINDEX = INS+1;
+    public static final short KBD = ISINDEX+1;
+    public static final short KEYGEN = KBD+1;
+    public static final short LABEL = KEYGEN+1;
+    public static final short LAYER = LABEL+1;
+    public static final short LEGEND = LAYER+1;
+    public static final short LI = LEGEND+1;
+    public static final short LINK = LI+1;
+    public static final short LISTING = LINK+1;
+    public static final short MAP = LISTING+1;
+    public static final short MARQUEE = MAP+1;
+    public static final short MENU = MARQUEE+1;
+    public static final short META = MENU+1;
+    public static final short MULTICOL = META+1;
+    public static final short NEXTID = MULTICOL+1;
+    public static final short NOBR = NEXTID+1;
+    public static final short NOEMBED = NOBR+1;
+    public static final short NOFRAMES = NOEMBED+1;
+    public static final short NOLAYER = NOFRAMES+1;
+    public static final short NOSCRIPT = NOLAYER+1;
+    public static final short OBJECT = NOSCRIPT+1;
+    public static final short OL = OBJECT+1;
+    public static final short OPTION = OL+1;
+    public static final short OPTGROUP = OPTION+1;
+    public static final short P = OPTGROUP+1;
+    public static final short PARAM = P+1;
+    public static final short PLAINTEXT = PARAM+1;
+    public static final short PRE = PLAINTEXT+1;
+    public static final short Q = PRE+1;
+    public static final short RB = Q+1;
+    public static final short RBC = RB+1;
+    public static final short RP = RBC+1;
+    public static final short RT = RP+1;
+    public static final short RTC = RT+1;
+    public static final short RUBY = RTC+1;
+    public static final short S = RUBY+1;
+    public static final short SAMP = S+1;
+    public static final short SCRIPT = SAMP+1;
+    public static final short SELECT = SCRIPT+1;
+    public static final short SMALL = SELECT+1;
+    public static final short SOUND = SMALL+1;
+    public static final short SPACER = SOUND+1;
+    public static final short SPAN = SPACER+1;
+    public static final short STRIKE = SPAN+1;
+    public static final short STRONG = STRIKE+1;
+    public static final short STYLE = STRONG+1;
+    public static final short SUB = STYLE+1;
+    public static final short SUP = SUB+1;
+    public static final short TABLE = SUP+1;
+    public static final short TBODY = TABLE+1;
+    public static final short TD = TBODY+1;
+    public static final short TEXTAREA = TD+1;
+    public static final short TFOOT = TEXTAREA+1;
+    public static final short TH = TFOOT+1;
+    public static final short THEAD = TH+1;
+    public static final short TITLE = THEAD+1;
+    public static final short TR = TITLE+1;
+    public static final short TT = TR+1;
+    public static final short U = TT+1;
+    public static final short UL = U+1;
+    public static final short VAR = UL+1;
+    public static final short WBR = VAR+1;
+    public static final short XML = WBR+1;
+    public static final short XMP = XML+1;
+    public static final short UNKNOWN = XMP+1;
+
+    // information
+
+    /** Element information organized by first letter. */
+    protected static final Element[][] ELEMENTS_ARRAY = new Element[26][];
+
+    /** Element information as a contiguous list. */
+    protected static final ElementList ELEMENTS = new ElementList();
+
+    /** No such element. */
+    public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "",  Element.CONTAINER, new short[]{BODY,HEAD}/*HTML*/, null);
+
+    //
+    // Static initializer
+    //
+
+    /**
+     * Initializes the element information.
+     * <p>
+     * <strong>Note:</strong>
+     * <code>getElement(short)</code> indexes the contiguous list by element
+     * code, so elements <em>must</em> be listed here in the same order as
+     * their codes are declared (e.g. OPTION before OPTGROUP).
+     */
+    static {
+        // <!ENTITY % heading "H1|H2|H3|H4|H5|H6">
+        // <!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
+        // <!ENTITY % phrase "EM | STRONG | DFN | CODE | SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
+        // <!ENTITY % special "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
+        // <!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
+        // <!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
+        // <!ENTITY % block "P | %heading; | %list; | %preformatted; | DL | DIV | NOSCRIPT | BLOCKQUOTE | FORM | HR | TABLE | FIELDSET | ADDRESS">
+        // <!ENTITY % flow "%block; | %inline;">
+
+        // initialize array of element information
+        ELEMENTS_ARRAY['A'-'A'] = new Element[] {
+            // A - - (%inline;)* -(A)
+            new Element(A, "A", Element.INLINE, BODY, new short[] {A}),
+            // ABBR - - (%inline;)*
+            new Element(ABBR, "ABBR", Element.INLINE, BODY, null),
+            // ACRONYM - - (%inline;)*
+            new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null),
+            // ADDRESS - - (%inline;)*
+            new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null),
+            // APPLET
+            new Element(APPLET, "APPLET", 0, BODY, null),
+            // AREA - O EMPTY
+            new Element(AREA, "AREA", Element.EMPTY, MAP, null),
+        };
+        ELEMENTS_ARRAY['B'-'A'] = new Element[] {
+            // B - - (%inline;)*
+            new Element(B, "B", Element.INLINE, BODY, null),
+            // BASE - O EMPTY
+            new Element(BASE, "BASE", Element.EMPTY, HEAD, null),
+            // BASEFONT
+            new Element(BASEFONT, "BASEFONT", 0, HEAD, null),
+            // BDO - - (%inline;)*
+            new Element(BDO, "BDO", Element.INLINE, BODY, null),
+            // BGSOUND
+            new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null),
+            // BIG - - (%inline;)*
+            new Element(BIG, "BIG", Element.INLINE, BODY, null),
+            // BLINK
+            new Element(BLINK, "BLINK", Element.INLINE, BODY, null),
+            // BLOCKQUOTE - - (%block;|SCRIPT)+
+            new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}),
+            // BODY O O (%block;|SCRIPT)+ +(INS|DEL)
+            new Element(BODY, "BODY", Element.CONTAINER, HTML, new short[]{HEAD}),
+            // BR - O EMPTY
+            new Element(BR, "BR", Element.EMPTY, BODY, null),
+            // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET)
+            new Element(BUTTON, "BUTTON", 0, BODY, null),
+        };
+        ELEMENTS_ARRAY['C'-'A'] = new Element[] {
+            // CAPTION - - (%inline;)*
+            new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null),
+            // CENTER, 
+            new Element(CENTER, "CENTER", 0, BODY, null),
+            // CITE - - (%inline;)*
+            new Element(CITE, "CITE", Element.INLINE, BODY, null),
+            // CODE - - (%inline;)*
+            new Element(CODE, "CODE", Element.INLINE, BODY, null),
+            // COL - O EMPTY
+            new Element(COL, "COL", Element.EMPTY, TABLE, null),
+            // COLGROUP - O (COL)*
+            new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}),
+            // COMMENT
+            new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null),
+        };
+        ELEMENTS_ARRAY['D'-'A'] = new Element[] {
+            // DEL - - (%flow;)*
+            new Element(DEL, "DEL", 0, BODY, null),
+            // DFN - - (%inline;)*
+            new Element(DFN, "DFN", Element.INLINE, BODY, null),
+            // DIR
+            new Element(DIR, "DIR", 0, BODY, null),
+            // DIV - - (%flow;)*
+            new Element(DIV, "DIV", Element.BLOCK, BODY, new short[]{P}),
+            // DD - O (%flow;)*
+            new Element(DD, "DD", 0, DL, new short[]{DT,DD}),
+            // DL - - (DT|DD)+
+            new Element(DL, "DL", Element.BLOCK, BODY, null),
+            // DT - O (%inline;)*
+            new Element(DT, "DT", 0, DL, new short[]{DT,DD}),
+        };
+        ELEMENTS_ARRAY['E'-'A'] = new Element[] {
+            // EM - - (%inline;)*
+            new Element(EM, "EM", Element.INLINE, BODY, null),
+            // EMBED
+            new Element(EMBED, "EMBED", 0, BODY, null),
+        };
+        ELEMENTS_ARRAY['F'-'A'] = new Element[] {
+            // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*)
+            new Element(FIELDSET, "FIELDSET", 0, BODY, null),
+            // FONT
+            new Element(FONT, "FONT", Element.CONTAINER, BODY, null),
+            // FORM - - (%block;|SCRIPT)+ -(FORM)
+            new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,DIV}, new short[]{BUTTON,P}),
+            // FRAME - O EMPTY
+            new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null),
+            // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?)
+            new Element(FRAMESET, "FRAMESET", 0, HTML, null),
+        };
+        ELEMENTS_ARRAY['H'-'A'] = new Element[] {
+            // (H1|H2|H3|H4|H5|H6) - - (%inline;)*
+            new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+            new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+            new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+            new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+            new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+            new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+            // HEAD O O (%head.content;) +(%head.misc;)
+            new Element(HEAD, "HEAD", 0, HTML, null),
+            // HR - O EMPTY
+            new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}),
+            // HTML O O (%html.content;)
+            new Element(HTML, "HTML", 0, null, null),
+        };
+        ELEMENTS_ARRAY['I'-'A'] = new Element[] {
+            // I - - (%inline;)*
+            new Element(I, "I", Element.INLINE, BODY, null),
+            // IFRAME
+            new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null),
+            // ILAYER
+            new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null),
+            // IMG - O EMPTY
+            new Element(IMG, "IMG", Element.EMPTY, BODY, null),
+            // INPUT - O EMPTY
+            new Element(INPUT, "INPUT", Element.EMPTY, BODY, null),
+            // INS - - (%flow;)*
+            new Element(INS, "INS", 0, BODY, null),
+            // ISINDEX
+            new Element(ISINDEX, "ISINDEX", 0, HEAD, null),
+        };
+        ELEMENTS_ARRAY['K'-'A'] = new Element[] {
+            // KBD - - (%inline;)*
+            new Element(KBD, "KBD", Element.INLINE, BODY, null),
+            // KEYGEN
+            new Element(KEYGEN, "KEYGEN", 0, BODY, null),
+        };
+        ELEMENTS_ARRAY['L'-'A'] = new Element[] {
+            // LABEL - - (%inline;)* -(LABEL)
+            new Element(LABEL, "LABEL", 0, BODY, null),
+            // LAYER
+            new Element(LAYER, "LAYER", Element.BLOCK, BODY, null),
+            // LEGEND - - (%inline;)*
+            new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null),
+            // LI - O (%flow;)*
+            new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}),
+            // LINK - O EMPTY
+            new Element(LINK, "LINK", Element.EMPTY, HEAD, null),
+            // LISTING
+            new Element(LISTING, "LISTING", 0, BODY, null),
+        };
+        ELEMENTS_ARRAY['M'-'A'] = new Element[] {
+            // MAP - - ((%block;) | AREA)+
+            new Element(MAP, "MAP", Element.INLINE, BODY, null),
+            // MARQUEE
+            new Element(MARQUEE, "MARQUEE", 0, BODY, null),
+            // MENU
+            new Element(MENU, "MENU", 0, BODY, null),
+            // META - O EMPTY
+            new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}),
+            // MULTICOL
+            new Element(MULTICOL, "MULTICOL", 0, BODY, null),
+        };
+        ELEMENTS_ARRAY['N'-'A'] = new Element[] {
+            // NEXTID
+            new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null),
+            // NOBR
+            new Element(NOBR, "NOBR", Element.INLINE, BODY, null),
+            // NOEMBED
+            new Element(NOEMBED, "NOEMBED", 0, BODY, null),
+            // NOFRAMES - - (BODY) -(NOFRAMES)
+            new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null),
+            // NOLAYER
+            new Element(NOLAYER, "NOLAYER", 0, BODY, null),
+            // NOSCRIPT - - (%block;)+
+            new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{BODY}, null),
+        };
+        ELEMENTS_ARRAY['O'-'A'] = new Element[] {
+            // OBJECT - - (PARAM | %flow;)*
+            new Element(OBJECT, "OBJECT", 0, BODY, null),
+            // OL - - (LI)+
+            new Element(OL, "OL", Element.BLOCK, BODY, null),
+            // OPTION - O (#PCDATA)
+            new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}),
+            // OPTGROUP - - (OPTION)+
+            new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}),
+        };
+        ELEMENTS_ARRAY['P'-'A'] = new Element[] {
+            // P - O (%inline;)*
+            new Element(P, "P", Element.CONTAINER, BODY, new short[]{P}),
+            // PARAM - O EMPTY
+            new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null),
+            // PLAINTEXT
+            new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null),
+            // PRE - - (%inline;)* -(%pre.exclusion;)
+            new Element(PRE, "PRE", 0, BODY, null),
+        };
+        ELEMENTS_ARRAY['Q'-'A'] = new Element[] {
+            // Q - - (%inline;)*
+            new Element(Q, "Q", Element.INLINE, BODY, null),
+        };
+        ELEMENTS_ARRAY['R'-'A'] = new Element[] {
+            // RB
+            new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}),
+            // RBC
+            new Element(RBC, "RBC", 0, RUBY, null),
+            // RP
+            new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}),
+            // RT
+            new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}),
+            // RTC
+            new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}),
+            // RUBY
+            new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}),
+        };
+        ELEMENTS_ARRAY['S'-'A'] = new Element[] {
+            // S
+            new Element(S, "S", 0, BODY, null),
+            // SAMP - - (%inline;)*
+            new Element(SAMP, "SAMP", Element.INLINE, BODY, null),
+            // SCRIPT - - %Script;
+            new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null),
+            // SELECT - - (OPTGROUP|OPTION)+
+            new Element(SELECT, "SELECT", Element.CONTAINER, BODY, new short[]{SELECT}),
+            // SMALL - - (%inline;)*
+            new Element(SMALL, "SMALL", Element.INLINE, BODY, null),
+            // SOUND
+            new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null),
+            // SPACER
+            new Element(SPACER, "SPACER", Element.EMPTY, BODY, null),
+            // SPAN - - (%inline;)*
+            new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null),
+            // STRIKE
+            new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null),
+            // STRONG - - (%inline;)*
+            new Element(STRONG, "STRONG", Element.INLINE, BODY, null),
+            // STYLE - - %StyleSheet;
+            new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}),
+            // SUB - - (%inline;)*
+            new Element(SUB, "SUB", Element.INLINE, BODY, null),
+            // SUP - - (%inline;)*
+            new Element(SUP, "SUP", Element.INLINE, BODY, null),
+        };
+        ELEMENTS_ARRAY['T'-'A'] = new Element[] {
+            // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
+            new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null),
+            // TBODY O O (TR)+
+            new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}),
+            // TD - O (%flow;)*
+            new Element(TD, "TD", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}),
+            // TEXTAREA - - (#PCDATA)
+            new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null),
+            // TFOOT - O (TR)+
+            new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}),
+            // TH - O (%flow;)*
+            new Element(TH, "TH", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}),
+            // THEAD - O (TR)+
+            new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}),
+            // TITLE - - (#PCDATA) -(%head.misc;)
+            new Element(TITLE, "TITLE", Element.SPECIAL, new short[]{HEAD,BODY}, null),
+            // TR - O (TH|TD)+
+            new Element(TR, "TR", Element.BLOCK, new short[]{TBODY, THEAD, TFOOT}, TABLE, new short[]{TD,TH,TR,COLGROUP}),
+            // TT - - (%inline;)*
+            new Element(TT, "TT", Element.INLINE, BODY, null),
+        };
+        ELEMENTS_ARRAY['U'-'A'] = new Element[] {
+            // U, 
+            new Element(U, "U", Element.INLINE, BODY, null),
+            // UL - - (LI)+
+            new Element(UL, "UL", Element.BLOCK, BODY, null),
+        };
+        ELEMENTS_ARRAY['V'-'A'] = new Element[] {
+            // VAR - - (%inline;)*
+            new Element(VAR, "VAR", Element.INLINE, BODY, null),
+        };
+        ELEMENTS_ARRAY['W'-'A'] = new Element[] {
+            // WBR
+            new Element(WBR, "WBR", Element.EMPTY, BODY, null),
+        };
+        ELEMENTS_ARRAY['X'-'A'] = new Element[] {
+            // XML
+            new Element(XML, "XML", 0, BODY, null),
+            // XMP
+            new Element(XMP, "XMP", Element.SPECIAL, BODY, null),
+        };
+
+        // keep contiguous list of elements for lookups by code
+        for (int i = 0; i < ELEMENTS_ARRAY.length; i++) {
+            Element[] elements = ELEMENTS_ARRAY[i];
+            if (elements != null) {
+                for (int j = 0; j < elements.length; j++) {
+                    Element element = elements[j];
+                    ELEMENTS.addElement(element);
+                }
+            }
+        }
+        ELEMENTS.addElement(NO_SUCH_ELEMENT);
+
+        // initialize cross references to parent elements
+        for (int i = 0; i < ELEMENTS.size; i++) {
+            Element element = ELEMENTS.data[i];
+            if (element.parentCodes != null) {
+                element.parent = new Element[element.parentCodes.length];
+                for (int j = 0; j < element.parentCodes.length; j++) {
+                    element.parent[j] = ELEMENTS.data[element.parentCodes[j]];
+                }
+                element.parentCodes = null;
+            }
+        }
+
+    } // <clinit>()
+
+    //
+    // Public static methods
+    //
+
+    /**
+     * Looks up element information by its numeric element code.
+     * @param code The element code, used as index into the element list.
+     */
+    public static final Element getElement(short code) {
+        final Element[] byCode = ELEMENTS.data;
+        return byCode[code];
+    } // getElement(short):Element
+
+    /**
+     * Looks up an element by name; unknown names yield NO_SUCH_ELEMENT.
+     * @param ename The element name.
+     */
+    public static final Element getElement(String ename) {
+        final Element fallback = NO_SUCH_ELEMENT;
+        return getElement(ename, fallback);
+    } // getElement(String):Element
+
+    /**
+     * Looks up element information by name, ignoring case.
+     *
+     * @param ename The element name.
+     * @param element The default element to return if not found.
+     */
+    public static final Element getElement(String ename, Element element) {
+        if (ename.length() == 0) {
+            return element;
+        }
+        // buckets are keyed by first letter; fold ASCII lower case to upper
+        int first = ename.charAt(0);
+        if (first >= 'a' && first <= 'z') {
+            first = 'A' + first - 'a';
+        }
+        if (first < 'A' || first > 'Z') {
+            return element;
+        }
+        final Element[] bucket = ELEMENTS_ARRAY[first - 'A'];
+        if (bucket != null) {
+            for (Element candidate : bucket) {
+                if (candidate.name.equalsIgnoreCase(ename)) {
+                    return candidate;
+                }
+            }
+        }
+        return element;
+    } // getElement(String,Element):Element
+
+    //
+    // Classes
+    //
+
+    /**
+     * Element information.
+     *
+     * @author Andy Clark
+     */
+    public static class Element {
+
+        //
+        // Constants
+        //
+
+        /** Inline element. */
+        public static final int INLINE = 0x01;
+
+        /** Block element. */
+        public static final int BLOCK = 0x02;
+
+        /** Empty element. */
+        public static final int EMPTY = 0x04;
+
+        /** Container element. */
+        public static final int CONTAINER = 0x08;
+
+        /** Special element. */
+        public static final int SPECIAL = 0x10;
+
+        //
+        // Data
+        //
+
+        /** The element code. */
+        public short code;
+
+        /** The element name. */
+        public String name;
+
+        /** Informational flags (bitwise OR of the constants above). */
+        public int flags;
+
+        /** Parent element codes; HTMLElements clears this after resolving {@link #parent}. */
+        public short[] parentCodes;
+
+        /** Parent elements; null until resolved from {@link #parentCodes}. */
+        public Element[] parent;
+
+        /** The bounding element code. */
+        public short bounds;
+
+        /** List of elements this element can close. */
+        public short[] closes;
+        
+        /** False if this element closes itself and so may not be nested, example: "A" **/
+        boolean nestable = true;
+
+        //
+        // Constructors
+        //
+
+        /** 
+         * Constructs an element object with a single parent and bounds of -1.
+         *
+         * @param code The element code.
+         * @param name The element name.
+         * @param flags Informational flags
+         * @param parent Natural closing parent code.
+         * @param closes List of elements this element can close.
+         */
+        public Element(short code, String name, int flags, 
+                       short parent, short[] closes) {
+            this(code, name, flags, new short[]{parent}, (short)-1, closes);
+        } // <init>(short,String,int,short,short[]);
+
+        /** 
+         * Constructs an element object with a single parent.
+         * @param code The element code.
+         * @param name The element name.
+         * @param flags Informational flags
+         * @param parent Natural closing parent code.
+         * @param bounds The bounding element code.
+         * @param closes List of elements this element can close.
+         */
+        public Element(short code, String name, int flags, 
+                       short parent, short bounds, short[] closes) {
+            this(code, name, flags, new short[]{parent}, bounds, closes);
+        } // <init>(short,String,int,short,short,short[])
+
+        /** 
+         * Constructs an element object with several parents and bounds of -1.
+         *
+         * @param code The element code.
+         * @param name The element name.
+         * @param flags Informational flags
+         * @param parents Natural closing parent codes.
+         * @param closes List of elements this element can close.
+         */
+        public Element(short code, String name, int flags, 
+                       short[] parents, short[] closes) {
+            this(code, name, flags, parents, (short)-1, closes);
+        } // <init>(short,String,int,short[],short[])
+
+        /** 
+         * Constructs an element object; all other constructors delegate here.
+         * @param code The element code.
+         * @param name The element name.
+         * @param flags Informational flags
+         * @param parents Natural closing parent codes.
+         * @param bounds The bounding element code.
+         * @param closes List of elements this element can close.
+         */
+        public Element(short code, String name, int flags, 
+                       short[] parents, short bounds, short[] closes) {
+            this.code = code;
+            this.name = name;
+            this.flags = flags;
+            this.parentCodes = parents;
+            this.parent = null;
+            this.bounds = bounds;
+            this.closes = closes;
+            if(closes != null) {
+                for(int i=0;i<closes.length;i++) {
+                    if(closes[i] == code) {
+                        this.nestable = false;
+                        break;
+                    }
+                }
+            }
+        } // <init>(short,String,int,short[],short,short[])
+
+        //
+        // Public methods
+        //
+
+        /** Returns true if this element is an inline element. */
+        public final boolean isInline() {
+            return (flags & INLINE) != 0;
+        } // isInline():boolean
+
+        /** Returns true if this element is a block element. */
+        public final boolean isBlock() {
+            return (flags & BLOCK) != 0;
+        } // isBlock():boolean
+
+        /** Returns true if this element is an empty element. */
+        public final boolean isEmpty() {
+            return (flags & EMPTY) != 0;
+        } // isEmpty():boolean
+
+        /** Returns true if this element is a container element. */
+        public final boolean isContainer() {
+            return (flags & CONTAINER) != 0;
+        } // isContainer():boolean
+
+        /** 
+         * Returns true if this element is special -- if its content
+         * should be parsed ignoring markup.
+         */
+        public final boolean isSpecial() {
+            return (flags & SPECIAL) != 0;
+        } // isSpecial():boolean
+
+        /**
+         * Returns true if this element can close the specified Element.
+         *
+         * @param tag The code of the element to test.
+         */
+        public boolean closes(short tag) {
+
+            if (closes != null) {
+                for (int i = 0; i < closes.length; i++) {
+                    if (closes[i] == tag) {
+                        return true;
+                    }
+                }
+            }
+            return false;
+
+        } // closes(short):boolean
+
+        //
+        // Object methods
+        //
+
+        /** Returns a hash code for this object, derived from its name. */
+        public int hashCode() {
+            return name.hashCode();
+        } // hashCode():int
+
+        /** Returns true if <code>o</code> is an Element with the same name. */
+        public boolean equals(Object o) {
+            return o instanceof Element && name.equals(((Element)o).name);
+        } // equals(Object):boolean
+
+        /**
+         * Provides a simple representation to make debugging easier
+         */
+        public String toString() {
+            return super.toString() + "(name=" + name + ")";
+        }
+
+        /**
+         * Indicates if the provided element is an accepted parent of current element
+         * @param element the element to test for "paternity"
+         * @return <code>true</code> if <code>element</code> belongs to the {@link #parent}
+         */
+        public boolean isParent(final Element element) {
+            if (parent == null) {
+                return false;
+            }
+            for (int i = 0; i < parent.length; i++) {
+                if (element.code == parent[i].code) {
+                    return true;
+                }
+            }
+            return false;
+        } // isParent(Element):boolean
+    } // class Element
+
+    /** Unsynchronized, growable list of elements. */
+    public static class ElementList {
+
+        //
+        // Data
+        //
+
+        /** Number of slots of <code>data</code> that are in use. */
+        public int size;
+
+        /** Backing array; only the first <code>size</code> slots are filled. */
+        public Element[] data = new Element[120];
+
+        //
+        // Public methods
+        //
+
+        /** Appends an element, growing the backing array by 20 when full. */
+        public void addElement(Element element) {
+            if (data.length == size) {
+                final Element[] grown = new Element[size + 20];
+                System.arraycopy(data, 0, grown, 0, size);
+                data = grown;
+            }
+            data[size] = element;
+            size++;
+        } // addElement(Element)
+    } // class ElementList
+
+} // class HTMLElements
diff --git a/src/main/java/org/cyberneko/html/HTMLTagBalancer.java b/src/main/java/org/cyberneko/html/HTMLTagBalancer.java
new file mode 100644
index 0000000..bb5f385
--- /dev/null
+++ b/src/main/java/org/cyberneko/html/HTMLTagBalancer.java
@@ -0,0 +1,1409 @@
+/* 
+ * Copyright 2002-2009 Andy Clark, Marc Guillemot
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.cyberneko.html;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.xerces.util.XMLAttributesImpl;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.XMLDocumentHandler;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLResourceIdentifier;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.apache.xerces.xni.parser.XMLComponentManager;
+import org.apache.xerces.xni.parser.XMLConfigurationException;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.apache.xerces.xni.parser.XMLDocumentSource;
+import org.cyberneko.html.HTMLElements.Element;
+import org.cyberneko.html.filters.NamespaceBinder;
+import org.cyberneko.html.xercesbridge.XercesBridge;
+@SuppressWarnings("all")
+/**
+ * Balances tags in an HTML document. This component receives document events
+ * and tries to correct many common mistakes that human (and computer) HTML
+ * document authors make. This tag balancer can:
+ * <ul>
+ * <li>add missing parent elements;
+ * <li>automatically close elements with optional end tags; and
+ * <li>handle mis-matched inline element tags.
+ * </ul>
+ * <p>
+ * This component recognizes the following features:
+ * <ul>
+ * <li>http://cyberneko.org/html/features/augmentations
+ * <li>http://cyberneko.org/html/features/report-errors
+ * <li>http://cyberneko.org/html/features/balance-tags/document-fragment
+ * <li>http://cyberneko.org/html/features/balance-tags/ignore-outside-content
+ * </ul>
+ * <p>
+ * This component recognizes the following properties:
+ * <ul>
+ * <li>http://cyberneko.org/html/properties/names/elems
+ * <li>http://cyberneko.org/html/properties/names/attrs
+ * <li>http://cyberneko.org/html/properties/error-reporter
+ * <li>http://cyberneko.org/html/properties/balance-tags/fragment-context-stack
+ * </ul>
+ *
+ * @see HTMLElements
+ *
+ * @author Andy Clark
+ * @author Marc Guillemot
+ *
+ * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $
+ */
+public class HTMLTagBalancer
+    implements XMLDocumentFilter, HTMLComponent {
+
+    //
+    // Constants
+    //
+
+    // features
+
+    /** Namespaces. */
+    protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
+
+    /** Include infoset augmentations. */
+    protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+    /** Report errors. */
+    protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
+
+    /** Document fragment balancing only (deprecated). */
+    protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment";
+
+    /** Document fragment balancing only. */
+    protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment";
+
+    /** Ignore outside content. */
+    protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content";
+
+    /** Recognized features. */
+    private static final String[] RECOGNIZED_FEATURES = {
+        NAMESPACES,
+        AUGMENTATIONS,
+        REPORT_ERRORS,
+        DOCUMENT_FRAGMENT_DEPRECATED,
+        DOCUMENT_FRAGMENT,
+        IGNORE_OUTSIDE_CONTENT,
+    };
+
+    /** Recognized features defaults. */
+    private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
+        null,
+        null,
+        null,
+        null,
+        Boolean.FALSE,
+        Boolean.FALSE,
+    };
+
+    // properties
+
+    /** Modify HTML element names: { "upper", "lower", "default" }. */
+    protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
+
+    /** Modify HTML attribute names: { "upper", "lower", "default" }. */
+    protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
+    
+    /** Error reporter. */
+    protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
+
+    /**
+     * <font color="red">EXPERIMENTAL: may change in next release</font><br/>
+     * Name of the property holding the stack of elements in which context a document fragment should be parsed.
+     **/
+    public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack";
+
+    /** Recognized properties. */
+    private static final String[] RECOGNIZED_PROPERTIES = {
+        NAMES_ELEMS,
+        NAMES_ATTRS,
+        ERROR_REPORTER,
+        FRAGMENT_CONTEXT_STACK,
+    };
+
+    /** Recognized properties defaults. */
+    private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
+        null,
+        null,
+        null,
+        null,
+    };
+
+    // modify HTML names
+
+    /** Don't modify HTML names. */
+    protected static final short NAMES_NO_CHANGE = 0;
+
+    /** Match HTML element names. */
+    protected static final short NAMES_MATCH = 0;
+
+    /** Uppercase HTML names. */
+    protected static final short NAMES_UPPERCASE = 1;
+
+    /** Lowercase HTML names. */
+    protected static final short NAMES_LOWERCASE = 2;
+
+    // static vars
+
+    /** Synthesized event info item. */
+    protected static final HTMLEventInfo SYNTHESIZED_ITEM = 
+        new HTMLEventInfo.SynthesizedItem();
+
+    //
+    // Data
+    //
+
+    // features
+
+    /** Namespaces. */
+    protected boolean fNamespaces;
+
+    /** Include infoset augmentations. */
+    protected boolean fAugmentations;
+    
+    /** Report errors. */
+    protected boolean fReportErrors;
+
+    /** Document fragment balancing only. */
+    protected boolean fDocumentFragment;
+
+    /** Ignore outside content. */
+    protected boolean fIgnoreOutsideContent;
+
+    // properties
+
+    /** Modify HTML element names. */
+    protected short fNamesElems;
+
+    /** Modify HTML attribute names. */
+    protected short fNamesAttrs;
+
+    /** Error reporter. */
+    protected HTMLErrorReporter fErrorReporter;
+
+    // connections
+
+    /** The document source. */
+    protected XMLDocumentSource fDocumentSource;
+
+    /** The document handler. */
+    protected XMLDocumentHandler fDocumentHandler;
+
+    // state
+
+    /** The element stack. */
+    protected final InfoStack fElementStack = new InfoStack();
+
+    /** The inline stack. */
+    protected final InfoStack fInlineStack = new InfoStack();
+
+    /** True if seen anything. Important for xml declaration. */
+    protected boolean fSeenAnything;
+
+    /** True if a doctype declaration has been seen. */
+    protected boolean fSeenDoctype;
+
+    /** True if root element has been seen. */
+    protected boolean fSeenRootElement;
+
+    /** 
+     * True if seen the end of the document element. In other words, 
+     * this variable is set to false <em>until</em> the end &lt;/HTML&gt; 
+     * tag is seen (or synthesized). This is used to ensure that 
+     * extraneous events after the end of the document element do not 
+     * make the document stream ill-formed.
+     */
+    protected boolean fSeenRootElementEnd;
+
+    /** True if seen &lt;head&gt; element. */
+    protected boolean fSeenHeadElement;
+
+    /** True if seen &lt;body&gt; element. */
+    protected boolean fSeenBodyElement;
+
+    /** True if a form is in the stack (allow to discard opening of nested forms) */
+    protected boolean fOpenedForm;
+
+    // temp vars
+
+    /** A qualified name. */
+    private final QName fQName = new QName();
+
+    /** Empty attributes. */
+    private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl();
+
+    /** Augmentations. */
+    private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
+
+    protected HTMLTagBalancingListener tagBalancingListener; // NOTE(review): presumably notified by notifyDiscardedStartElement(...) - confirm
+    private LostText lostText_ = new LostText(); // text received while the element stack is still empty (see characters()); handed back via refeed(this)
+
+    private boolean forcedStartElement_ = false; // set by forceStartElement(), read and cleared at the top of startElement()
+    private boolean forcedEndElement_ = false; // set by consumeBufferedEndElements() before each endElement() call
+
+    /**
+     * Stack of elements determining the context in which a document fragment should be parsed
+     */
+	private QName[] fragmentContextStack_ = null;
+	private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set
+
+    private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList(); // end tags held back until consumeBufferedEndElements() passes them to endElement()
+
+    //
+    // HTMLComponent methods
+    //
+
+    /** Looks up the default state of <code>featureId</code>; null if the id is not recognized or has no default. */
+    public Boolean getFeatureDefault(String featureId) {
+        final String[] ids = RECOGNIZED_FEATURES;
+        final int count = (ids == null) ? 0 : ids.length;
+        int idx = 0;
+        while (idx < count && !ids[idx].equals(featureId)) {
+            idx++;
+        }
+        return idx < count ? RECOGNIZED_FEATURES_DEFAULTS[idx] : null;
+    } // getFeatureDefault(String):Boolean
+
+    /** Looks up the default value of <code>propertyId</code>; null if the id is not recognized or has no default. */
+    public Object getPropertyDefault(String propertyId) {
+        final String[] ids = RECOGNIZED_PROPERTIES;
+        final int count = (ids == null) ? 0 : ids.length;
+        int idx = 0;
+        while (idx < count && !ids[idx].equals(propertyId)) {
+            idx++;
+        }
+        return idx < count ? RECOGNIZED_PROPERTIES_DEFAULTS[idx] : null;
+    } // getPropertyDefault(String):Object
+
+    //
+    // XMLComponent methods
+    //
+
+    /** Returns a copy of the recognized feature ids, so that callers cannot alter the shared static array. */
+    public String[] getRecognizedFeatures() {
+        return (String[]) RECOGNIZED_FEATURES.clone();
+    } // getRecognizedFeatures():String[]
+
+    /** Returns a copy of the recognized property ids, so that callers cannot alter the shared static array. */
+    public String[] getRecognizedProperties() {
+        return (String[]) RECOGNIZED_PROPERTIES.clone();
+    } // getRecognizedProperties():String[]
+
+    /** Re-reads every feature and property this component uses from <code>manager</code>. */
+    public void reset(XMLComponentManager manager) throws XMLConfigurationException {
+        // features
+        fNamespaces = manager.getFeature(NAMESPACES);
+        fAugmentations = manager.getFeature(AUGMENTATIONS);
+        fReportErrors = manager.getFeature(REPORT_ERRORS);
+        // fragment mode: the current feature id is asked first, the deprecated id only if that one is off
+        final boolean fragmentRequested = manager.getFeature(DOCUMENT_FRAGMENT);
+        fDocumentFragment = fragmentRequested || manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED);
+        fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT);
+
+        // properties
+        final Object elemNames = manager.getProperty(NAMES_ELEMS);
+        fNamesElems = getNamesValue(String.valueOf(elemNames));
+        final Object attrNames = manager.getProperty(NAMES_ATTRS);
+        fNamesAttrs = getNamesValue(String.valueOf(attrNames));
+        fErrorReporter = (HTMLErrorReporter) manager.getProperty(ERROR_REPORTER);
+        // element context for fragment parsing; pushed onto the element stack in startDocument
+        fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK);
+    } // reset(XMLComponentManager)
+
+    /** Sets a feature; only augmentations, report-errors and ignore-outside-content are stored, any other id is ignored. */
+    public void setFeature(String featureId, boolean state)
+        throws XMLConfigurationException {
+
+        // the three ids are distinct constants, so at most one branch applies
+        if (featureId.equals(AUGMENTATIONS)) {
+            fAugmentations = state;
+        }
+        else if (featureId.equals(REPORT_ERRORS)) {
+            fReportErrors = state;
+        }
+        else if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) {
+            fIgnoreOutsideContent = state;
+        }
+        // NOTE(review): the namespaces and document-fragment features are not handled
+        // here; within this class they are only read in reset()
+
+    } // setFeature(String,boolean)
+
+    /** Sets a property; only the two name-casing properties are stored, any other id is ignored. */
+    public void setProperty(String propertyId, Object value)
+        throws XMLConfigurationException {
+
+        // the value is converted with String.valueOf, so null is passed on as the text "null"
+        if (propertyId.equals(NAMES_ELEMS)) {
+            fNamesElems = getNamesValue(String.valueOf(value));
+        }
+        else if (propertyId.equals(NAMES_ATTRS)) {
+            fNamesAttrs = getNamesValue(String.valueOf(value));
+        }
+        // NOTE(review): error-reporter and fragment-context-stack are not handled here;
+        // within this class they are only read in reset()
+
+    } // setProperty(String,Object)
+
+    //
+    // XMLDocumentSource methods
+    //
+
+    /** Sets the handler that receives the balanced events; with a null handler the forwarding calls in this class are skipped. */
+    public void setDocumentHandler(XMLDocumentHandler handler) {
+        fDocumentHandler = handler;
+    } // setDocumentHandler(XMLDocumentHandler)
+
+    // @since Xerces 2.1.0
+
+    /** Returns the handler stored by setDocumentHandler, or null if none has been set. */
+    public XMLDocumentHandler getDocumentHandler() {
+        return fDocumentHandler;
+    } // getDocumentHandler():XMLDocumentHandler
+
+    //
+    // XMLDocumentHandler methods
+    //
+
+    // since Xerces-J 2.2.0
+
+    /** Start document: clears the per-document state, seeds the element stack with the fragment context (if any) and forwards the event. */
+    public void startDocument(XMLLocator locator, String encoding, 
+                              NamespaceContext nscontext, Augmentations augs) 
+        throws XNIException {
+
+        // reset state
+        fElementStack.top = 0;
+        if (fragmentContextStack_ != null) {
+        	fragmentContextStackSize_ = fragmentContextStack_.length;
+        	for (int i=0; i<fragmentContextStack_.length; ++i) {
+        		final QName name = fragmentContextStack_[i];
+            	final Element elt = HTMLElements.getElement(name.localpart);
+            	fElementStack.push(new Info(elt, name));
+        	}
+        	// context entries stay on the stack: endDocument() pops only what lies above them
+        }
+        else {
+        	fragmentContextStackSize_ = 0;
+        }
+        fSeenAnything = false;
+        fSeenDoctype = false;
+        fSeenRootElement = false;
+        fSeenRootElementEnd = false;
+        fSeenHeadElement = false;
+        fSeenBodyElement = false;
+        fOpenedForm = false; // endDocument() does not clear it, so a <form> left open earlier would make startElement discard every form of this document
+        endElementsBuffer_.clear(); // drop end tags left over from a parse that never reached endDocument()
+        // pass on event
+        if (fDocumentHandler != null) {
+        	XercesBridge.getInstance().XMLDocumentHandler_startDocument(fDocumentHandler, locator, encoding, nscontext, augs);
+        }
+    
+    } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
+
+    // old methods
+
+    /** XML declaration; forwarded only while nothing else has been seen and a handler is set. */
+    public void xmlDecl(String version, String encoding, String standalone, Augmentations augs) throws XNIException {
+        if (fSeenAnything || fDocumentHandler == null) {
+            return;
+        }
+        fDocumentHandler.xmlDecl(version, encoding, standalone, augs);
+    } // xmlDecl(String,String,String,Augmentations)
+
+    /** Doctype declaration. */
+    public void doctypeDecl(String rootElementName, String publicId, String systemId,
+                            Augmentations augs) throws XNIException {
+        fSeenAnything = true;
+        if (fReportErrors) {
+            if (fSeenRootElement) {
+                fErrorReporter.reportError("HTML2010", null);
+            }
+            else if (fSeenDoctype) {
+                fErrorReporter.reportError("HTML2011", null);
+            }
+        }
+        if (!fSeenRootElement && !fSeenDoctype) {
+            fSeenDoctype = true;
+            if (fDocumentHandler != null) {
+                fDocumentHandler.doctypeDecl(rootElementName, publicId, systemId, augs);
+            }
+        }
+    } // doctypeDecl(String,String,String,Augmentations)
+
+    /** End document: replays buffered end tags, then either synthesizes a skeleton for an empty document or closes what is still open, and forwards the event. */
+    public void endDocument(Augmentations augs) throws XNIException {
+
+    	// </body> and </html> have been buffered to consider outside content
+    	fIgnoreOutsideContent = true; // endElement should not ignore the elements passed from buffer
+    	consumeBufferedEndElements();
+    	// NOTE(review): fIgnoreOutsideContent is not restored here; it is read again from the manager in reset()
+        // handle empty document (no root element seen and not in fragment mode)
+        if (!fSeenRootElement && !fDocumentFragment) {
+            if (fReportErrors) {
+                fErrorReporter.reportError("HTML2000", null);
+            }
+            if (fDocumentHandler != null) {
+            	fSeenRootElementEnd = false; // startElement discards every start tag while this flag is true
+                forceStartBody(); // will force <html> and <head></head>
+                final String body = modifyName("body", fNamesElems);
+                fQName.setValues(null, body, body, null);
+                callEndElement(fQName, synthesizedAugs());
+
+                final String ename = modifyName("html", fNamesElems);
+                fQName.setValues(null, ename, ename, null);
+                callEndElement(fQName, synthesizedAugs());
+            }
+        }
+
+        // pop all remaining elements (the entries of the fragment context stack are left in place)
+        else {
+            int length = fElementStack.top - fragmentContextStackSize_;
+            for (int i = 0; i < length; i++) {
+                Info info = fElementStack.pop();
+                if (fReportErrors) {
+                    String ename = info.qname.rawname;
+                    fErrorReporter.reportWarning("HTML2001", new Object[]{ename});
+                }
+                if (fDocumentHandler != null) {
+                    callEndElement(info.qname, synthesizedAugs());
+                }
+            }
+        }
+
+        // call handler
+        if (fDocumentHandler != null) {
+            fDocumentHandler.endDocument(augs);
+        }
+
+    } // endDocument(Augmentations)
+
+    /**
+     * Passes the end tags held in <code>endElementsBuffer_</code> (like &lt;/body&gt;&lt;/html&gt;) to <code>endElement</code>.
+     * The buffer is cleared before and after, so anything added to it meanwhile is dropped.
+     */
+	private void consumeBufferedEndElements() {
+		final Object[] pending = endElementsBuffer_.toArray();
+		endElementsBuffer_.clear();
+		for (int idx = 0; idx < pending.length; idx++) {
+			final ElementEntry buffered = (ElementEntry) pending[idx];
+			forcedEndElement_ = true;
+			endElement(buffered.name_, buffered.augs_);
+		}
+		endElementsBuffer_.clear();
+	}
+
+    /** Comment: marks the document as started, handles text buffered before the first element, then forwards the comment if a handler is set. */
+    public void comment(XMLString text, Augmentations augs) throws XNIException {
+        fSeenAnything = true;
+        consumeEarlyTextIfNeeded();
+        if (fDocumentHandler != null) {
+            fDocumentHandler.comment(text, augs);
+        }
+    } // comment(XMLString,Augmentations)
+
+	private void consumeEarlyTextIfNeeded() {
+		if (!lostText_.isEmpty()) {
+        	if (!fSeenBodyElement) {
+        		forceStartBody();
+        	}
+            lostText_.refeed(this);
+        }
+	}
+
+    /** Processing instruction: marks the document as started, handles text buffered before the first element, then forwards the event if a handler is set. */
+    public void processingInstruction(String target, XMLString data,
+                                      Augmentations augs) throws XNIException {
+        fSeenAnything = true;
+        consumeEarlyTextIfNeeded();
+        if (fDocumentHandler != null) {
+            fDocumentHandler.processingInstruction(target, data, augs);
+        }
+    } // processingInstruction(String,XMLString,Augmentations)
+
+    /** Start element: discards or repairs the tag as needed (duplicate html/head/body/form, missing parent, elements it closes) before forwarding it. */
+    public void startElement(final QName elem, XMLAttributes attrs, final Augmentations augs)
+        throws XNIException {
+        fSeenAnything = true;
+        
+        final boolean isForcedCreation = forcedStartElement_; // true when the call comes through forceStartElement()
+        forcedStartElement_ = false;
+
+        // check for end of document
+        if (fSeenRootElementEnd) {
+        	notifyDiscardedStartElement(elem, attrs, augs);
+            return;
+        }
+
+        // get element information
+        final HTMLElements.Element element = getElement(elem);
+        final short elementCode = element.code;
+
+        // the creation of some elements like TABLE or SELECT can't be forced. Any others? 
+        if (isForcedCreation && (elementCode == HTMLElements.TABLE || elementCode == HTMLElements.SELECT)) {
+        	return; // don't accept creation
+        }
+
+        // ignore multiple html, head, body elements
+		if (fSeenRootElement && elementCode == HTMLElements.HTML) {
+        	notifyDiscardedStartElement(elem, attrs, augs);
+            return;
+        }
+        if (elementCode == HTMLElements.HEAD) {
+            if (fSeenHeadElement) {
+            	notifyDiscardedStartElement(elem, attrs, augs);
+                return;
+            }
+            fSeenHeadElement = true;
+        }
+        else if (elementCode == HTMLElements.FRAMESET) {
+        	consumeBufferedEndElements(); // </head> (if any) has been buffered
+        }
+        else if (elementCode == HTMLElements.BODY) {
+    		// create <head></head> if none was present
+    		if (!fSeenHeadElement) {
+    			final QName head = createQName("head");
+    			forceStartElement(head, null, synthesizedAugs());
+    			endElement(head, synthesizedAugs());
+    		}
+        	consumeBufferedEndElements(); // </head> (if any) has been buffered
+    		
+            if (fSeenBodyElement) {
+            	notifyDiscardedStartElement(elem, attrs, augs);
+                return;
+            }
+            fSeenBodyElement = true;
+        }
+        else if (elementCode == HTMLElements.FORM) {
+        	if (fOpenedForm) { // a form is already open: nested forms are discarded
+            	notifyDiscardedStartElement(elem, attrs, augs);
+        		return;
+        	}
+        	fOpenedForm = true;
+        }
+        else if (elementCode == HTMLElements.UNKNOWN) {
+        	consumeBufferedEndElements();
+        }
+
+        // check proper parent
+        if (element.parent != null) {
+        	if (!fSeenRootElement && !fDocumentFragment) { // nothing opened yet: synthesize the first declared parent
+                String pname = element.parent[0].name;
+                pname = modifyName(pname, fNamesElems);
+                if (fReportErrors) {
+                    String ename = elem.rawname;
+                    fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname});
+                }
+                final QName qname = new QName(null, pname, pname, null);
+                final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs());
+                if (!parentCreated) {
+                	if (!isForcedCreation) {
+                		notifyDiscardedStartElement(elem, attrs, augs);
+                	}
+            		return;
+                }
+            }
+        	else {
+                HTMLElements.Element preferedParent = element.parent[0];
+                if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) { // a HEAD parent is only looked for before <body> and outside fragment mode
+                    int depth = getParentDepth(element.parent, element.bounds);
+                    if (depth == -1) { // no parent found
+                        final String pname = modifyName(preferedParent.name, fNamesElems);
+                        final QName qname = new QName(null, pname, pname, null);
+                        if (fReportErrors) {
+                            String ename = elem.rawname;
+                            fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname});
+                        }
+                        final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs());
+                        if (!parentCreated) {
+                        	if (!isForcedCreation) {
+                        		notifyDiscardedStartElement(elem, attrs, augs);
+                        	}
+                    		return;
+                        }
+                    }
+                }
+            }
+        }
+
+        // if block element, save immediate parent inline elements (NOTE(review): flags == 0 is taken to mean "block" here - confirm against HTMLElements)
+        int depth = 0;
+        if (element.flags == 0) {
+            int length = fElementStack.top;
+            fInlineStack.top = 0;
+            for (int i = length - 1; i >= 0; i--) {
+                Info info = fElementStack.data[i];
+                if (!info.element.isInline()) {
+                    break;
+                }
+                fInlineStack.push(info);
+                endElement(info.qname, synthesizedAugs()); // closed here, re-opened after the new element (see the end of this method)
+            }
+            depth = fInlineStack.top;
+        }
+
+        // close previous elements
+        // all elements close a <script>
+        // in head, no element has children
+        if ((fElementStack.top > 1 
+        		&& (fElementStack.peek().element.code == HTMLElements.SCRIPT))
+        		|| fElementStack.top > 2 && fElementStack.data[fElementStack.top-2].element.code == HTMLElements.HEAD) {
+            final Info info = fElementStack.pop();
+            if (fDocumentHandler != null) {
+                callEndElement(info.qname, synthesizedAugs());
+            }
+        }
+        if (element.closes != null) {
+            int length = fElementStack.top;
+            for (int i = length - 1; i >= 0; i--) {
+                Info info = fElementStack.data[i];
+
+                // does it close the element we're looking at?
+                if (element.closes(info.element.code)) {
+                    if (fReportErrors) {
+                        String ename = elem.rawname;
+                        String iname = info.qname.rawname;
+                        fErrorReporter.reportWarning("HTML2005", new Object[]{ename,iname});
+                    }
+                    for (int j = length - 1; j >= i; j--) {
+                        info = fElementStack.pop();
+                        if (fDocumentHandler != null) {
+                            // PATCH: Marc-André Morissette
+                            callEndElement(info.qname, synthesizedAugs());
+                        }
+                    }
+                    length = i;
+                    continue;
+                }
+                
+                // should we stop searching?
+                if(element.nestable) {
+                    if (info.element.isBlock() || element.isParent(info.element)) {
+                    	break;
+                    }
+                }
+            }
+        }
+        // TODO: investigate if only table is special here
+        // table closes all opened inline elements
+        else if (elementCode == HTMLElements.TABLE) {
+            for (int i=fElementStack.top-1; i >= 0; i--) {
+                final Info info = fElementStack.data[i];
+                if (!info.element.isInline()) {
+                    break;
+                }
+                endElement(info.qname, synthesizedAugs());
+            }
+        }
+
+        // call handler
+        fSeenRootElement = true;
+        if (element != null && element.isEmpty()) { // element.code was already read above, so the null test is redundant
+            if (attrs == null) {
+                attrs = emptyAttributes();
+            }
+            if (fDocumentHandler != null) {
+                fDocumentHandler.emptyElement(elem, attrs, augs);
+            }
+        }
+        else {
+            boolean inline = element != null && element.isInline();
+            fElementStack.push(new Info(element, elem, inline ? attrs : null)); // attributes are kept only for inline elements; they are reused when such an element is re-opened below
+            if (attrs == null) {
+                attrs = emptyAttributes();
+            }
+            if (fDocumentHandler != null) {
+                callStartElement(elem, attrs, augs);
+            }
+        }
+
+        // re-open inline elements
+        for (int i = 0; i < depth; i++) {
+            Info info = fInlineStack.pop();
+            forceStartElement(info.qname, info.attributes, synthesizedAugs());
+        }
+
+        if (elementCode == HTMLElements.BODY) {
+        	lostText_.refeed(this); // hand text buffered in lostText_ back to this balancer now that <body> is open
+        }
+    } // startElement(QName,XMLAttributes,Augmentations)
+
+    /**
+     * Starts an element on the balancer's own initiative, raising <code>forcedStartElement_</code> first so
+     * that startElement can tell the call apart from a tag coming from the document source.
+     * @return <code>true</code> if the element ended up on top of the element stack (TABLE's creation for instance can't be forced)
+     */
+    private boolean forceStartElement(final QName elem, XMLAttributes attrs, final Augmentations augs) throws XNIException {
+    	forcedStartElement_ = true;
+    	startElement(elem, attrs, augs);
+    	if (fElementStack.top <= 0) {
+    		return false;
+    	}
+    	return elem.equals(fElementStack.peek().qname);
+    }
+
+    private QName createQName(String tagName) {
+		tagName = modifyName(tagName, fNamesElems);
+		return new QName(null, tagName, tagName, NamespaceBinder.XHTML_1_0_URI);
+	}
+
+	/** Empty element: started like any other element, then ended at once if its definition is empty or unknown. */
+    public void emptyElement(final QName element, XMLAttributes attrs, Augmentations augs) throws XNIException {
+        startElement(element, attrs, augs);
+        // browser ignore the closing indication for non empty tags like <form .../> but not for unknown element
+        final HTMLElements.Element definition = getElement(element);
+        final boolean closeImmediately = definition.isEmpty() || definition.code == HTMLElements.UNKNOWN;
+        if (closeImmediately) {
+            endElement(element, augs);
+        }
+    } // emptyElement(QName,XMLAttributes,Augmentations)
+
+	/** Start entity: ignored once the root element has ended; otherwise makes sure a body is open (outside fragment mode) and forwards the event. */
+    public void startGeneralEntity(String name, 
+                                   XMLResourceIdentifier id,
+                                   String encoding,
+                                   Augmentations augs) throws XNIException {
+        fSeenAnything = true;
+
+        // check for end of document
+        if (fSeenRootElementEnd) {
+            return;
+        }
+
+        // insert body, if needed
+        if (!fDocumentFragment) {
+            boolean insertBody = !fSeenRootElement; // no element at all so far: the body must be synthesized
+            if (!insertBody) {
+                Info info = fElementStack.peek();
+                if (info.element.code == HTMLElements.HEAD ||
+                    info.element.code == HTMLElements.HTML) { // still directly in <head> or <html>: end the head, then open the body
+                    String hname = modifyName("head", fNamesElems);
+                    String bname = modifyName("body", fNamesElems);
+                    if (fReportErrors) {
+                        fErrorReporter.reportWarning("HTML2009", new Object[]{hname,bname});
+                    }
+                    fQName.setValues(null, hname, hname, null);
+                    endElement(fQName, synthesizedAugs());
+                    insertBody = true;
+                }
+            }
+            if (insertBody) {
+                forceStartBody();
+            }
+        }
+        
+        // call handler
+        if (fDocumentHandler != null) {
+            fDocumentHandler.startGeneralEntity(name, id, encoding, augs);
+        }
+
+    } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
+
+    /** Synthesizes the missing &lt;body&gt; start tag (startElement creates a missing
+     *  &lt;head&gt; on the way), reporting warning HTML2006 when error reporting is on. */
+	private void forceStartBody() {
+		final QName bodyName = createQName("body");
+		if (fReportErrors) {
+			final Object[] warningArgs = { bodyName.localpart };
+			fErrorReporter.reportWarning("HTML2006", warningArgs);
+		}
+		forceStartElement(bodyName, null, synthesizedAugs());
+	}
+
+    /**
+     * Text declaration. Marks the document as started and forwards the event,
+     * unless the end of the root element has already been seen or no handler
+     * is set.
+     */
+    public void textDecl(String version, String encoding, Augmentations augs)
+        throws XNIException {
+        fSeenAnything = true;
+
+        // nothing is forwarded after the root end tag
+        final boolean forward = !fSeenRootElementEnd && fDocumentHandler != null;
+        if (forward) {
+            fDocumentHandler.textDecl(version, encoding, augs);
+        }
+
+    } // textDecl(String,String,Augmentations)
+
+    /**
+     * End entity. Forwarded to the handler, unless the end of the root
+     * element has already been seen or no handler is set.
+     */
+    public void endGeneralEntity(String name, Augmentations augs) throws XNIException {
+
+        // nothing is forwarded after the root end tag
+        final boolean forward = !fSeenRootElementEnd && fDocumentHandler != null;
+        if (forward) {
+            fDocumentHandler.endGeneralEntity(name, augs);
+        }
+        // unlike startGeneralEntity, fSeenAnything is left untouched here
+
+    } // endGeneralEntity(String,Augmentations)
+
+    /**
+     * Start CDATA section. Text buffered before the first element is handled
+     * first; the event itself is forwarded unless the root element has already ended.
+     */
+    public void startCDATA(Augmentations augs) throws XNIException {
+        fSeenAnything = true;
+
+        // early text is dealt with even if the event ends up not being forwarded
+        consumeEarlyTextIfNeeded();
+
+        // nothing is forwarded after the root end tag
+        final boolean forward = !fSeenRootElementEnd && fDocumentHandler != null;
+        if (forward) {
+            fDocumentHandler.startCDATA(augs);
+        }
+
+    } // startCDATA(Augmentations)
+
+    /**
+     * End CDATA section. Forwarded to the handler, unless the end of the
+     * root element has already been seen or no handler is set.
+     */
+    public void endCDATA(Augmentations augs) throws XNIException {
+
+        // nothing is forwarded after the root end tag
+        if (!fSeenRootElementEnd && fDocumentHandler != null) {
+            fDocumentHandler.endCDATA(augs);
+        }
+        // no other state is touched by this event
+        // (fSeenAnything was already set by startCDATA)
+
+    } // endCDATA(Augmentations)
+
+    /**
+     * Character content event.
+     * <p>
+     * Processing order, as implemented below:
+     * <ol>
+     * <li>Nothing happens once {@code fSeenRootElementEnd} is set.</li>
+     * <li>Outside of fragment mode, text arriving while the element stack is
+     *     empty is stored in {@code lostText_} and not forwarded.</li>
+     * <li>The text is classified as whitespace-only or not, using
+     *     {@link Character#isWhitespace(char)}.</li>
+     * <li>Outside of fragment mode: before the root element has been seen,
+     *     whitespace is dropped and any other text forces a body start;
+     *     whitespace is also dropped when fewer than two elements are open
+     *     or when exactly one end element is buffered; non-whitespace text
+     *     whose innermost open element is HEAD or HTML triggers warning
+     *     {@code HTML2009} (when error reporting is on) and forces a body
+     *     start.</li>
+     * <li>Whatever was not dropped is forwarded to the document handler, if
+     *     one is registered.</li>
+     * </ol>
+     *
+     * @param text the characters (read via {@code ch}, {@code offset} and
+     *             {@code length})
+     * @param augs augmentations accompanying the event
+     * @throws XNIException propagated from the document handler or from the
+     *                      forced element start
+     */
+    public void characters(final XMLString text, final Augmentations augs) throws XNIException {
+        // check for end of document
+        if (fSeenRootElementEnd) {
+            return;
+        }
+
+    	if (fElementStack.top == 0 && !fDocumentFragment) {
+    		// characters before the first opening tag: keep them aside in
+    		// lostText_ instead of emitting them now
+    		lostText_.add(text, augs);
+    		return;
+    	}
+
+        // is this text whitespace?
+        boolean whitespace = true;
+        for (int i = 0; i < text.length; i++) {
+            if (!Character.isWhitespace(text.ch[text.offset + i])) {
+                whitespace = false;
+                break;
+            }
+        }
+
+        if (!fDocumentFragment) {
+            // handle bare characters: whitespace before the root element is
+            // dropped, anything else forces a <body> to be opened
+            if (!fSeenRootElement) {
+                if (whitespace) {
+                    return;
+                }
+                forceStartBody();
+            }
+            
+            if (whitespace && (fElementStack.top < 2 || endElementsBuffer_.size() == 1)) {
+            	// ignore spaces directly within <html>
+            	// (also taken when exactly one end element is buffered)
+            	return;
+            }
+
+            // handle character content in head
+            // NOTE: This frequently happens when the document looks like:
+            //       <title>Title</title>
+            //       And here's some text.
+            else if (!whitespace) {
+                Info info = fElementStack.peek();
+                if (info.element.code == HTMLElements.HEAD ||
+                    info.element.code == HTMLElements.HTML) {
+                    // names are only computed for the warning message
+                    String hname = modifyName("head", fNamesElems);
+                    String bname = modifyName("body", fNamesElems);
+                    if (fReportErrors) {
+                        fErrorReporter.reportWarning("HTML2009", new Object[]{hname,bname});
+                    }
+                    forceStartBody();
+                }
+            }
+        }
+
+        // call handler
+        if (fDocumentHandler != null) {
+            fDocumentHandler.characters(text, augs);
+        }
+
+    } // characters(XMLString,Augmentations)
+
+    /**
+     * Ignorable whitespace event.
+     * <p>
+     * Treated exactly like ordinary character content: the call is passed
+     * unchanged to {@link #characters(XMLString, Augmentations)}.
+     *
+     * @param text the whitespace characters
+     * @param augs augmentations accompanying the event
+     * @throws XNIException propagated from {@code characters}
+     */
+    public void ignorableWhitespace(XMLString text, Augmentations augs)
+        throws XNIException {
+        characters(text, augs);
+    } // ignorableWhitespace(XMLString,Augmentations)
+    
+    /**
+     * End element event.
+     * <p>
+     * Summary of the handling implemented below:
+     * <ul>
+     * <li>After the root element has ended, the end tag is only reported to
+     *     the tag balancing listener as discarded.</li>
+     * <li>When outside content is not ignored, end tags for BODY and HTML are
+     *     stored in {@code endElementsBuffer_} and not processed here.</li>
+     * <li>An HTML end tag marks the end of the document; a FORM end tag
+     *     clears {@code fOpenedForm}; a HEAD end tag that is not a forced one
+     *     is stored in {@code endElementsBuffer_} and not processed here.</li>
+     * <li>When no matching open element is found: for P, a start element is
+     *     forced and this method is called again to close it; for any other
+     *     element that is not an empty element, the listener is told the end
+     *     tag was discarded.</li>
+     * <li>Otherwise all elements above and including the matching one are
+     *     popped and closed (warning {@code HTML2007} for each implicitly
+     *     closed child when error reporting is on), and, when the closed
+     *     element is inline, the inline/FONT elements that were implicitly
+     *     closed are re-opened (warning {@code HTML2008}).</li>
+     * </ul>
+     *
+     * @param element the qualified name of the element being closed
+     * @param augs    augmentations accompanying the event
+     * @throws XNIException propagated from the document handler or from
+     *                      forced element starts
+     */
+    public void endElement(final QName element, final Augmentations augs) throws XNIException {
+    	// snapshot of the flag taken on entry, before any nested call runs
+    	final boolean forcedEndElement = forcedEndElement_;
+        // is there anything to do?
+        if (fSeenRootElementEnd) {
+        	notifyDiscardedEndElement(element, augs);
+            return;
+        }
+        
+        // get element information
+        HTMLElements.Element elem = getElement(element);
+
+        // if we consider outside content, just buffer </body> and </html> to consider them at the very end
+        if (!fIgnoreOutsideContent &&
+            (elem.code == HTMLElements.BODY || elem.code == HTMLElements.HTML)) {
+        	endElementsBuffer_.add(new ElementEntry(element, augs));
+            return;
+        }
+
+        // check for end of document
+        if (elem.code == HTMLElements.HTML) {
+            fSeenRootElementEnd = true;
+        }
+        else if (elem.code == HTMLElements.FORM) {
+        	fOpenedForm = false;
+        }
+        else if (elem.code == HTMLElements.HEAD && !forcedEndElement) {
+        	// consume </head> first when <body> is reached to retrieve content lost between </head> and <body>
+        	endElementsBuffer_.add(new ElementEntry(element, augs));
+        	return;
+        }
+        
+
+        // no matching open element (depth == -1): nothing can be closed
+        int depth = getElementDepth(elem);
+        if (depth == -1) {
+        	if (elem.code == HTMLElements.P) {
+        		// a stray </p> is turned into an empty paragraph: open one,
+        		// then re-enter this method to close it
+        		forceStartElement(element, emptyAttributes(), synthesizedAugs());
+	            endElement(element, augs);
+        	}
+        	else if (!elem.isEmpty()) {
+            	notifyDiscardedEndElement(element, augs);
+        	}
+            return;
+        }
+
+        // find unbalanced inline elements: collect the inline (or FONT)
+        // elements sitting above the one being closed so they can be
+        // re-opened afterwards
+        if (depth > 1 && elem.isInline()) {
+            final int size = fElementStack.top;
+            fInlineStack.top = 0;
+            for (int i = 0; i < depth - 1; i++) {
+                final Info info = fElementStack.data[size - i - 1];
+                final HTMLElements.Element pelem = info.element;
+                
+                if (pelem.isInline() || pelem.code == HTMLElements.FONT) { // TODO: investigate if only FONT
+                    // NOTE: I don't have to make a copy of the info because
+                    //       it will just be popped off of the element stack
+                    //       as soon as we close it, anyway.
+                    fInlineStack.push(info);
+                }
+            }
+        }
+
+        // close children up to appropriate element
+        for (int i = 0; i < depth; i++) {
+            Info info = fElementStack.pop();
+            
+            // every element except the last popped one is closed implicitly
+            if (fReportErrors && i < depth - 1) {
+                String ename = modifyName(element.rawname, fNamesElems);
+                String iname = info.qname.rawname;
+                fErrorReporter.reportWarning("HTML2007", new Object[]{ename,iname});
+            }
+            if (fDocumentHandler != null) {
+                // PATCH: Marc-Andr\u00e8 Morissette
+                // implicit closes get synthesized augmentations; only the
+                // explicitly closed element receives the caller's augs
+                callEndElement(info.qname, i < depth - 1 ? synthesizedAugs() : augs);
+            }
+        }
+
+        // re-open inline elements
+        if (depth > 1) {
+            int size = fInlineStack.top;
+            for (int i = 0; i < size; i++) {
+                Info info = (Info)fInlineStack.pop();
+                XMLAttributes attributes = info.attributes;
+                if (fReportErrors) {
+                    String iname = info.qname.rawname;
+                    fErrorReporter.reportWarning("HTML2008", new Object[]{iname});
+                }
+                forceStartElement(info.qname, attributes, synthesizedAugs());
+            }
+        }
+
+    } // endElement(QName,Augmentations)
+
+    // @since Xerces 2.1.0
+
+	/**
+	 * Sets the document source.
+	 * <p>
+	 * The reference is stored as-is and can be read back through
+	 * {@link #getDocumentSource()}.
+	 *
+	 * @param source the document source to remember
+	 */
+    public void setDocumentSource(XMLDocumentSource source) {
+        fDocumentSource = source;
+    } // setDocumentSource(XMLDocumentSource)
+
+    /**
+     * Returns the document source.
+     *
+     * @return the value last stored in {@code fDocumentSource}
+     */
+    public XMLDocumentSource getDocumentSource() {
+        return fDocumentSource;
+    } // getDocumentSource():XMLDocumentSource
+
+    // removed since Xerces-J 2.3.0
+
+    /**
+     * Start document (three-argument form).
+     * <p>
+     * Delegates to the four-argument {@code startDocument} overload, passing
+     * {@code null} for its third argument.
+     * NOTE(review): the third argument is presumably the namespace context;
+     * confirm against the four-argument overload.
+     *
+     * @param locator  the document locator
+     * @param encoding the document encoding
+     * @param augs     augmentations accompanying the event
+     * @throws XNIException propagated from the four-argument overload
+     */
+    public void startDocument(XMLLocator locator, String encoding, Augmentations augs) 
+        throws XNIException {
+        startDocument(locator, encoding, null, augs);
+    } // startDocument(XMLLocator,String,Augmentations)
+
+    /**
+     * Start of a namespace prefix mapping.
+     * <p>
+     * Dropped once the end of the root element has been seen or when no
+     * document handler is registered; otherwise the event is dispatched to
+     * the handler through {@code XercesBridge}.
+     *
+     * @param prefix the namespace prefix
+     * @param uri    the namespace URI
+     * @param augs   augmentations accompanying the event
+     * @throws XNIException propagated from the bridge call
+     */
+    public void startPrefixMapping(String prefix, String uri, Augmentations augs)
+        throws XNIException {
+
+        if (fSeenRootElementEnd || fDocumentHandler == null) {
+            // document already finished, or nobody is listening
+            return;
+        }
+
+        final XercesBridge bridge = XercesBridge.getInstance();
+        bridge.XMLDocumentHandler_startPrefixMapping(fDocumentHandler, prefix, uri, augs);
+
+    } // startPrefixMapping(String,String,Augmentations)
+
+    /**
+     * End of a namespace prefix mapping.
+     * <p>
+     * Dropped once the end of the root element has been seen or when no
+     * document handler is registered; otherwise the event is dispatched to
+     * the handler through {@code XercesBridge}.
+     *
+     * @param prefix the namespace prefix
+     * @param augs   augmentations accompanying the event
+     * @throws XNIException propagated from the bridge call
+     */
+    public void endPrefixMapping(String prefix, Augmentations augs)
+        throws XNIException {
+
+        if (fSeenRootElementEnd || fDocumentHandler == null) {
+            // document already finished, or nobody is listening
+            return;
+        }
+
+        final XercesBridge bridge = XercesBridge.getInstance();
+        bridge.XMLDocumentHandler_endPrefixMapping(fDocumentHandler, prefix, augs);
+
+    } // endPrefixMapping(String,Augmentations)
+
+    //
+    // Protected methods
+    //
+
+    /** Returns an HTML element. */
+    protected HTMLElements.Element getElement(final QName elementName) {
+    	String name = elementName.rawname;
+        if (fNamespaces && NamespaceBinder.XHTML_1_0_URI.equals(elementName.uri)) {
+            int index = name.indexOf(':');
+            if (index != -1) {
+                name = name.substring(index+1);
+            }
+        }
+        return HTMLElements.getElement(name);
+    } // getElement(String):HTMLElements.Element
+
+    /**
+     * Calls {@code startElement} on the document handler.
+     * <p>
+     * No null check on {@code fDocumentHandler} is made here.
+     *
+     * @param element the element's qualified name
+     * @param attrs   the element's attributes
+     * @param augs    augmentations accompanying the event
+     * @throws XNIException propagated from the document handler
+     */
+    protected final void callStartElement(QName element, XMLAttributes attrs,
+                                          Augmentations augs) 
+        throws XNIException {
+        fDocumentHandler.startElement(element, attrs, augs);
+    } // callStartElement(QName,XMLAttributes,Augmentations)
+
+    /**
+     * Calls {@code endElement} on the document handler.
+     * <p>
+     * No null check on {@code fDocumentHandler} is made here; the call site
+     * in {@code endElement} performs that check before calling.
+     *
+     * @param element the element's qualified name
+     * @param augs    augmentations accompanying the event
+     * @throws XNIException propagated from the document handler
+     */
+    protected final void callEndElement(QName element, Augmentations augs) 
+        throws XNIException {
+        fDocumentHandler.endElement(element, augs);
+    } // callEndElement(QName,Augmentations)
+
+    /**
+     * Finds how far down the element stack the open tag for the given element
+     * sits.
+     * <p>
+     * The stack is searched from the top down to (and including) index
+     * {@code fragmentContextStackSize_}. For an element that is not a
+     * container but is nestable, the search gives up at the first open block
+     * element that is not itself a match.
+     *
+     * @param element The element.
+     * @return 1 when the match is the top of the stack, 2 for the entry
+     *         below it, and so on; -1 if no matching element is found
+     */
+    protected final int getElementDepth(HTMLElements.Element element) {
+        final int stackTop = fElementStack.top;
+        // only nestable, non-container elements are blocked by an open block
+        final boolean stopAtBlock = !element.isContainer() && element.nestable;
+
+        for (int index = stackTop - 1; index >= fragmentContextStackSize_; index--) {
+            final HTMLElements.Element open = fElementStack.data[index].element;
+            if (open.code == element.code) {
+                return stackTop - index;
+            }
+            if (stopAtBlock && open.isBlock()) {
+                return -1;
+            }
+        }
+        return -1;
+    } // getElementDepth(HTMLElements.Element)
+
+    /**
+     * Returns the depth of the open tag associated with the specified
+     * element parent names or -1 if no matching element is found.
+     *
+     * @param parents The parent elements.
+     */
+    protected int getParentDepth(HTMLElements.Element[] parents, short bounds) {
+        if (parents != null) {
+            for (int i = fElementStack.top - 1; i >= 0; i--) {
+                Info info = fElementStack.data[i];
+                if (info.element.code == bounds) {
+                    break;
+                }
+                for (int j = 0; j < parents.length; j++) {
+                    if (info.element.code == parents[j].code) {
+                        return fElementStack.top - i;
+                    }
+                }
+            }
+        }
+        return -1;
+    } // getParentDepth(HTMLElements.Element[],short):int
+
+    /**
+     * Returns a set of empty attributes.
+     * <p>
+     * The shared {@code fEmptyAttrs} instance is cleared and returned, so
+     * every call hands out the same object.
+     *
+     * @return the cleared, shared attributes object
+     */
+    protected final XMLAttributes emptyAttributes() {
+        fEmptyAttrs.removeAllAttributes();
+        return fEmptyAttrs;
+    } // emptyAttributes():XMLAttributes
+
+    /**
+     * Returns an augmentations object carrying only the "synthesized" item.
+     * <p>
+     * When augmentations are disabled ({@code fAugmentations} is false) the
+     * result is {@code null}. Otherwise the shared {@code fInfosetAugs}
+     * instance is emptied, given the single synthesized item and returned,
+     * so every call hands out the same object.
+     *
+     * @return the shared augmentations object, or {@code null}
+     */
+    protected final Augmentations synthesizedAugs() {
+        if (!fAugmentations) {
+            return null;
+        }
+        final HTMLAugmentations shared = fInfosetAugs;
+        shared.removeAllItems();
+        shared.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
+        return shared;
+    } // synthesizedAugs():Augmentations
+
+    //
+    // Protected static methods
+    //
+
+    /**
+     * Modifies the given name based on the specified mode.
+     * <p>
+     * Case conversion is done with a fixed locale ({@code Locale.ENGLISH})
+     * rather than the JVM default. HTML names are ASCII, and converting them
+     * with the default locale gives wrong results under locales with special
+     * casing rules (for example Turkish, where {@code "title"} upper-cases to
+     * {@code "TİTLE"} with a dotted capital I).
+     *
+     * @param name the name to convert
+     * @param mode {@code NAMES_UPPERCASE}, {@code NAMES_LOWERCASE}, or any
+     *             other value to leave the name unchanged
+     * @return the converted name, or {@code name} itself when the mode asks
+     *         for no change
+     */
+    protected static final String modifyName(String name, short mode) {
+        switch (mode) {
+            case NAMES_UPPERCASE: return name.toUpperCase(java.util.Locale.ENGLISH);
+            case NAMES_LOWERCASE: return name.toLowerCase(java.util.Locale.ENGLISH);
+            default: return name;
+        }
+    } // modifyName(String,short):String
+
+    /**
+     * Converts HTML names string value to constant value.
+     * <p>
+     * {@code "lower"} maps to {@link #NAMES_LOWERCASE}, {@code "upper"} to
+     * {@link #NAMES_UPPERCASE}; every other string maps to
+     * {@link #NAMES_NO_CHANGE}. The comparison is case-sensitive.
+     *
+     * @param value the string value; must not be {@code null}
+     * @return the matching names constant
+     *
+     * @see #NAMES_NO_CHANGE
+     * @see #NAMES_LOWERCASE
+     * @see #NAMES_UPPERCASE
+     */
+    protected static final short getNamesValue(String value) {
+        if (value.equals("lower")) {
+            return NAMES_LOWERCASE;
+        }
+        return value.equals("upper") ? NAMES_UPPERCASE : NAMES_NO_CHANGE;
+    } // getNamesValue(String):short
+
+    //
+    // Classes
+    //
+
+    /**
+     * Element info for each start element. This information is used when
+     * closing unbalanced inline elements. For example:
+     * <pre>
+     * &lt;i>unbalanced &lt;b>HTML&lt;/i> content&lt;/b>
+     * </pre>
+     * <p>
+     * It seems that it is a waste of processing and memory to copy the 
+     * attributes for every start element even if there are no unbalanced 
+     * inline elements in the document. However, if the attributes are
+     * <em>not</em> saved, then important attributes such as style
+     * information would be lost.
+     *
+     * @author Andy Clark
+     */
+    public static class Info {
+
+        //
+        // Data
+        //
+
+        /** The element. */
+        public HTMLElements.Element element;
+
+        /** The element qualified name (a private copy made at construction). */
+        public QName qname;
+
+        /**
+         * The element attributes (a private copy made at construction), or
+         * <code>null</code> when the element was created without attributes
+         * or with an empty attribute list.
+         */
+        public XMLAttributes attributes;
+
+        //
+        // Constructors
+        //
+
+        /**
+         * Creates an element information object without attributes.
+         * <p>
+         * <strong>Note:</strong>
+         * This constructor makes a copy of the qualified name.
+         *
+         * @param element The element description.
+         * @param qname   The element qualified name.
+         */
+        public Info(HTMLElements.Element element, QName qname) {
+            this(element, qname, null);
+        } // <init>(HTMLElements.Element,QName)
+
+        /**
+         * Creates an element information object.
+         * <p>
+         * <strong>Note:</strong>
+         * This constructor makes a copy of the qualified name and, when at
+         * least one attribute is present, of the attributes (name, type,
+         * value, non-normalized value and "specified" flag of each).
+         *
+         * @param element    The element description.
+         * @param qname      The element qualified name.
+         * @param attributes The element attributes; may be <code>null</code>.
+         */
+        public Info(HTMLElements.Element element,
+                    QName qname, XMLAttributes attributes) {
+            this.element = element;
+            this.qname = new QName(qname);
+            if (attributes != null) {
+                int length = attributes.getLength();
+                if (length > 0) {
+                    QName aqname = new QName();
+                    XMLAttributes newattrs = new XMLAttributesImpl();
+                    for (int i = 0; i < length; i++) {
+                        attributes.getName(i, aqname);
+                        String type = attributes.getType(i);
+                        String value = attributes.getValue(i);
+                        String nonNormalizedValue = attributes.getNonNormalizedValue(i);
+                        boolean specified = attributes.isSpecified(i);
+                        // Use the index reported by addAttribute rather than
+                        // the source index: the two differ if the target
+                        // merges an attribute with an already present name.
+                        final int index = newattrs.addAttribute(aqname, type, value);
+                        newattrs.setNonNormalizedValue(index, nonNormalizedValue);
+                        newattrs.setSpecified(index, specified);
+                    }
+                    this.attributes = newattrs;
+                }
+            }
+        } // <init>(HTMLElements.Element,QName,XMLAttributes)
+
+        /**
+         * Simple representation to make debugging easier
+         */
+        public String toString() {
+        	return super.toString() + qname;
+        }
+    } // class Info
+
+    /**
+     * Unsynchronized stack of element information, backed by a plain array
+     * that is enlarged by ten slots whenever it is full. Both the array and
+     * the top index are public and are manipulated directly by the balancer.
+     */
+    public static class InfoStack {
+
+        //
+        // Data
+        //
+
+        /** The top of the stack, i.e. the number of entries currently held. */
+        public int top;
+
+        /** The stack data; only indexes below {@link #top} are live. */
+        public Info[] data = new Info[10];
+
+        //
+        // Public methods
+        //
+
+        /** Pushes element information onto the stack, growing it if full. */
+        public void push(Info info) {
+            final boolean full = (top == data.length);
+            if (full) {
+                final Info[] enlarged = new Info[top + 10];
+                System.arraycopy(data, 0, enlarged, 0, top);
+                data = enlarged;
+            }
+            data[top++] = info;
+        } // push(Info)
+
+        /** Returns the top entry without removing it. */
+        public Info peek() {
+            final int last = top - 1;
+            return data[last];
+        } // peek():Info
+
+        /** Removes the top entry and returns it. */
+        public Info pop() {
+            top -= 1;
+            return data[top];
+        } // pop():Info
+
+        /**
+         * Debugging aid: lists the entries from the top of the stack down to
+         * the bottom, separated by commas.
+         */
+        public String toString() {
+            final StringBuffer text = new StringBuffer("InfoStack(");
+            int index = top;
+            while (--index >= 0) {
+                text.append(data[index]);
+                if (index > 0) {
+                    text.append(", ");
+                }
+            }
+            return text.append(")").toString();
+        }
+
+
+    } // class InfoStack
+
+	/**
+	 * Registers the listener that is told about discarded start and end
+	 * elements. The reference is stored as-is; the notify helpers skip
+	 * notification while it is <code>null</code>.
+	 *
+	 * @param tagBalancingListener the listener, or <code>null</code> for none
+	 */
+	void setTagBalancingListener(final HTMLTagBalancingListener tagBalancingListener) {
+		this.tagBalancingListener = tagBalancingListener;
+	}
+
+	/**
+	 * Tells the tag balancing listener that a start element was ignored.
+	 * Does nothing when no listener is registered.
+	 *
+	 * @param elem  the name of the ignored element
+	 * @param attrs the attributes of the ignored element
+	 * @param augs  the augmentations that came with it
+	 */
+    private void notifyDiscardedStartElement(final QName elem, final XMLAttributes attrs,
+            final Augmentations augs) {
+        if (tagBalancingListener == null) {
+            return;
+        }
+        tagBalancingListener.ignoredStartElement(elem, attrs, augs);
+    }
+
+	/**
+	 * Tells the tag balancing listener that an end element was ignored.
+	 * Does nothing when no listener is registered.
+	 *
+	 * @param element the name of the ignored element
+	 * @param augs    the augmentations that came with it
+	 */
+    private void notifyDiscardedEndElement(final QName element, final Augmentations augs) {
+        if (tagBalancingListener == null) {
+            return;
+        }
+        tagBalancingListener.ignoredEndElement(element, augs);
+    }
+
+    /**
+     * Holds what is needed to replay an element event that was placed in a
+     * buffer to be consumed later: a copy of the element name and, when
+     * augmentations were supplied, a copy of those as well.
+     */
+    static class ElementEntry {
+        private final QName name_;
+        private final Augmentations augs_;
+
+        ElementEntry(final QName element, final Augmentations augs) {
+            name_ = new QName(element);
+            if (augs != null) {
+                augs_ = new HTMLAugmentations(augs);
+            }
+            else {
+                augs_ = null;
+            }
+        }
+    }
+} // class HTMLTagBalancer