doc
.
+ *
+ * @param doc
+ * The {@link TextDocument} that is to be processed.
+ * @return {@code true} if changes have been made to the
+ * {@link TextDocument}.
+ * @throws BoilerpipeProcessingException
+ */
+ boolean process(final TextDocument doc)
+ throws BoilerpipeProcessingException;
+}
diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java
new file mode 100644
index 0000000..bcb603d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java
@@ -0,0 +1,35 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe;
+
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * A source that returns {@link TextDocument}s.
+ *
+ * @author Christian Kohlschütter
+ */
+public interface BoilerpipeInput {
+    /**
+     * Supplies a {@link TextDocument}; how the document is obtained is left
+     * to the implementation.
+     *
+     * @return the {@link TextDocument} provided by this source.
+     * @throws BoilerpipeProcessingException
+     *             if the document cannot be provided (the exact failure
+     *             conditions are implementation-specific).
+     */
+    TextDocument getTextDocument() throws BoilerpipeProcessingException;
+}
diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java
new file mode 100644
index 0000000..f3a9cc4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java
@@ -0,0 +1,43 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe;
+
+/**
+ * Exception for signaling failure in the processing pipeline.
+ *
+ * @author Christian Kohlschütter
+ */
+public class BoilerpipeProcessingException extends Exception {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Creates an exception that carries neither a detail message nor a cause.
+     */
+    public BoilerpipeProcessingException() {
+    }
+
+    /**
+     * Creates an exception with a detail message only.
+     *
+     * @param message the detail message.
+     */
+    public BoilerpipeProcessingException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates an exception that wraps another throwable.
+     *
+     * @param cause the underlying cause.
+     */
+    public BoilerpipeProcessingException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * Creates an exception with both a detail message and an underlying cause.
+     *
+     * @param message the detail message.
+     * @param cause the underlying cause.
+     */
+    public BoilerpipeProcessingException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/conditions/TextBlockCondition.java b/src/main/java/de/l3s/boilerpipe/conditions/TextBlockCondition.java
new file mode 100644
index 0000000..df92f10
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/conditions/TextBlockCondition.java
@@ -0,0 +1,37 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.conditions;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.labels.ConditionalLabelAction;
+
+/**
+ * Evaluates whether a given {@link TextBlock} meets a certain condition.
+ * Useful in combination with {@link ConditionalLabelAction}.
+ *
+ * @author Christian Kohlschuetter
+ */
+public interface TextBlockCondition {
+    /**
+     * Tells whether the given {@link TextBlock} satisfies the condition
+     * defined by this implementation.
+     *
+     * @param tb the {@link TextBlock} to examine.
+     * @return {@code true} iff the condition is met.
+     */
+    boolean meetsCondition(final TextBlock tb);
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/Image.java b/src/main/java/de/l3s/boilerpipe/document/Image.java
new file mode 100644
index 0000000..91abc66
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/Image.java
@@ -0,0 +1,97 @@
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents an Image resource that is contained in the document.
+ *
+ * Any of the attributes may be null, except for "src".
+ *
+ * @author Christian Kohlschuetter
+ */
+public class Image extends Media implements Comparable<Image> {
+    private final String src;
+    private final String width;
+    private final String height;
+    private final String alt;
+    private final int area;
+
+    /**
+     * Creates an image description from raw attribute values.
+     *
+     * Width, height and alt are trimmed; values that are {@code null} or
+     * empty after trimming are stored as {@code null}.
+     *
+     * @param src the "src" attribute; must not be {@code null}.
+     * @param width the "width" attribute, may be {@code null}.
+     * @param height the "height" attribute, may be {@code null}.
+     * @param alt the "alt" attribute, may be {@code null}.
+     * @throws NullPointerException if {@code src} is {@code null}.
+     */
+    public Image(final String src, final String width, final String height, final String alt) {
+        if (src == null) {
+            throw new NullPointerException("src attribute must not be null");
+        }
+        this.src = src;
+        this.width = nullTrim(width);
+        this.height = nullTrim(height);
+        this.alt = nullTrim(alt);
+        // Use the trimmed values so that padded attributes such as " 100 "
+        // yield the same area as the values reported by the getters.
+        this.area = computeArea(this.width, this.height);
+    }
+
+    /**
+     * Computes width * height from the (already trimmed) attribute values.
+     *
+     * @return the area, or -1 if either value is missing or not an integer.
+     */
+    private static int computeArea(final String width, final String height) {
+        if (width == null || height == null) {
+            return -1;
+        }
+        try {
+            return Integer.parseInt(width) * Integer.parseInt(height);
+        } catch (NumberFormatException e) {
+            return -1;
+        }
+    }
+
+    /**
+     * Returns the "src" attribute exactly as it was passed to the constructor.
+     * It is not necessarily an absolute path.
+     *
+     * @return the "src" attribute of the image; never {@code null}.
+     */
+    public String getSrc() {
+        return src;
+    }
+
+    /**
+     * @return the trimmed "width" attribute, or {@code null} if absent or empty.
+     */
+    public String getWidth() {
+        return width;
+    }
+
+    /**
+     * @return the trimmed "height" attribute, or {@code null} if absent or empty.
+     */
+    public String getHeight() {
+        return height;
+    }
+
+    /**
+     * @return the trimmed "alt" attribute, or {@code null} if absent or empty.
+     */
+    public String getAlt() {
+        return alt;
+    }
+
+    /**
+     * Trims the given string and maps empty results to {@code null}.
+     */
+    private static String nullTrim(String s) {
+        if (s == null) {
+            return null;
+        }
+        s = s.trim();
+        if (s.length() == 0) {
+            return null;
+        }
+        return s;
+    }
+
+    /**
+     * Returns the image's area (specified by width * height), or -1 if
+     * width/height weren't both specified or could not be parsed.
+     *
+     * @return the image's area
+     */
+    public int getArea() {
+        return area;
+    }
+
+    @Override
+    public String toString() {
+        return src+"\twidth="+width+"\theight="+height+"\talt="+alt+"\tarea="+area;
+    }
+
+    /**
+     * Orders images by descending area; images of equal area are ordered by
+     * their "src" value.
+     *
+     * @param o the image to compare with.
+     * @return a negative value, zero or a positive value as this image sorts
+     *         before, together with, or after {@code o}.
+     */
+    public int compareTo(Image o) {
+        if (o == this) {
+            return 0;
+        }
+        if (area != o.area) {
+            // larger images come first
+            return area > o.area ? -1 : 1;
+        }
+        return src.compareTo(o.src);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/Media.java b/src/main/java/de/l3s/boilerpipe/document/Media.java
new file mode 100644
index 0000000..8923b24
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/Media.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Common base type of the media resources of a document; {@link Image} and
+ * {@link Video} extend it. It declares no members of its own.
+ *
+ * @author manuel.codiga@gmail.com
+ */
+public abstract class Media {
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/TextBlock.java b/src/main/java/de/l3s/boilerpipe/document/TextBlock.java
new file mode 100644
index 0000000..f7e59ac
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/TextBlock.java
@@ -0,0 +1,286 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.document;
+
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.Set;
+
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Describes a block of text.
+ *
+ * A block can be an "atomic" text element (i.e., a sequence of text that is not
+ * interrupted by any HTML markup) or a compound of such atomic elements.
+ *
+ * @author Christian Kohlschütter
+ */
+public class TextBlock implements Cloneable {
+    boolean isContent = false;
+    private CharSequence text;
+    Set<String> labels = null;
+
+    int offsetBlocksStart;
+    int offsetBlocksEnd;
+
+    int numWords;
+    int numWordsInAnchorText;
+    int numWordsInWrappedLines;
+    int numWrappedLines;
+    float textDensity;
+    float linkDensity;
+
+    BitSet containedTextElements;
+
+    private int numFullTextWords = 0;
+    private int tagLevel;
+
+    private static final BitSet EMPTY_BITSET = new BitSet();
+    public static final TextBlock EMPTY_START = new TextBlock("", EMPTY_BITSET,
+            0, 0, 0, 0, -1);
+    public static final TextBlock EMPTY_END = new TextBlock("", EMPTY_BITSET,
+            0, 0, 0, 0, Integer.MAX_VALUE);
+
+    /**
+     * Creates a block that only carries text: all counters and the block
+     * offset are 0 and no contained-text-elements set is attached.
+     *
+     * @param text the block's text.
+     */
+    public TextBlock(final String text) {
+        this(text, null, 0, 0, 0, 0, 0);
+    }
+
+    /**
+     * Creates a block from its text and pre-computed statistics.
+     *
+     * @param text the block's text.
+     * @param containedTextElements the set returned by
+     *            {@link #getContainedTextElements()}; may be {@code null}.
+     *            The instance is stored as-is, not copied.
+     * @param numWords the number of words.
+     * @param numWordsInAnchorText the number of words in anchor text.
+     * @param numWordsInWrappedLines the number of words in wrapped lines; if
+     *            0, {@code numWords} and a single wrapped line are assumed.
+     * @param numWrappedLines the number of wrapped lines.
+     * @param offsetBlocks used as both the start and the end block offset.
+     */
+    public TextBlock(final String text, final BitSet containedTextElements,
+            final int numWords, final int numWordsInAnchorText,
+            final int numWordsInWrappedLines, final int numWrappedLines,
+            final int offsetBlocks) {
+        this.text = text;
+        this.containedTextElements = containedTextElements;
+        this.numWords = numWords;
+        this.numWordsInAnchorText = numWordsInAnchorText;
+        this.numWordsInWrappedLines = numWordsInWrappedLines;
+        this.numWrappedLines = numWrappedLines;
+        this.offsetBlocksStart = offsetBlocks;
+        this.offsetBlocksEnd = offsetBlocks;
+        initDensities();
+    }
+
+    /**
+     * @return {@code true} if this block is currently marked as content.
+     */
+    public boolean isContent() {
+        return isContent;
+    }
+
+    /**
+     * Marks this block as content or non-content.
+     *
+     * @param isContent the new state.
+     * @return {@code true} if the state changed, {@code false} if the block
+     *         already was in the requested state.
+     */
+    public boolean setIsContent(boolean isContent) {
+        if (isContent != this.isContent) {
+            this.isContent = isContent;
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * @return the block's text as a String.
+     */
+    public String getText() {
+        return text.toString();
+    }
+
+    public int getNumWords() {
+        return numWords;
+    }
+
+    public int getNumWordsInAnchorText() {
+        return numWordsInAnchorText;
+    }
+
+    /**
+     * @return the number of words in wrapped lines divided by the number of
+     *         wrapped lines.
+     */
+    public float getTextDensity() {
+        return textDensity;
+    }
+
+    /**
+     * @return the number of anchor-text words divided by the number of words,
+     *         or 0 if the block has no words.
+     */
+    public float getLinkDensity() {
+        return linkDensity;
+    }
+
+    /**
+     * Merges the given block into this one.
+     *
+     * The other block's text is appended after a newline, the word/line
+     * counters are summed, the offset range is widened to cover both blocks,
+     * the densities are recomputed, the content flag is OR-ed, contained text
+     * elements and labels are united, and the smaller tag level is kept.
+     * The other block is not modified.
+     *
+     * @param other the block to merge into this one.
+     */
+    public void mergeNext(final TextBlock other) {
+        if (!(text instanceof StringBuilder)) {
+            text = new StringBuilder(text);
+        }
+        StringBuilder sb = (StringBuilder) text;
+        sb.append('\n');
+        sb.append(other.text);
+
+        numWords += other.numWords;
+        numWordsInAnchorText += other.numWordsInAnchorText;
+
+        numWordsInWrappedLines += other.numWordsInWrappedLines;
+        numWrappedLines += other.numWrappedLines;
+
+        offsetBlocksStart = Math
+                .min(offsetBlocksStart, other.offsetBlocksStart);
+        offsetBlocksEnd = Math.max(offsetBlocksEnd, other.offsetBlocksEnd);
+
+        initDensities();
+
+        this.isContent |= other.isContent;
+
+        // Blocks created via TextBlock(String) carry no BitSet; a missing set
+        // on the other side contributes nothing instead of raising an NPE.
+        if (other.containedTextElements != null) {
+            if (containedTextElements == null) {
+                containedTextElements = (BitSet) other.containedTextElements.clone();
+            } else {
+                containedTextElements.or(other.containedTextElements);
+            }
+        }
+
+        numFullTextWords += other.numFullTextWords;
+
+        if (other.labels != null) {
+            if (labels == null) {
+                labels = new HashSet<String>(other.labels);
+            } else {
+                labels.addAll(other.labels);
+            }
+        }
+
+        tagLevel = Math.min(tagLevel, other.tagLevel);
+    }
+
+    /**
+     * Recomputes text and link density from the current counters. If no
+     * wrapped-line word count is known, all words are treated as one line.
+     */
+    private void initDensities() {
+        if (numWordsInWrappedLines == 0) {
+            numWordsInWrappedLines = numWords;
+            numWrappedLines = 1;
+        }
+        textDensity = numWordsInWrappedLines / (float) numWrappedLines;
+        linkDensity = numWords == 0 ? 0 : numWordsInAnchorText / (float) numWords;
+    }
+
+    public int getOffsetBlocksStart() {
+        return offsetBlocksStart;
+    }
+
+    public int getOffsetBlocksEnd() {
+        return offsetBlocksEnd;
+    }
+
+    @Override
+    public String toString() {
+        return "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl="+tagLevel+"; nw="+numWords+";nwl="+numWrappedLines+";ld="+linkDensity+"]\t"
+                + (isContent?"CONTENT":"boilerplate") + "," + labels + "\n" + getText();
+    }
+
+    /**
+     * Adds an arbitrary String label to this {@link TextBlock}.
+     *
+     * @param label The label
+     * @see DefaultLabels
+     */
+    public void addLabel(final String label) {
+        if (labels == null) {
+            labels = new HashSet<String>(2);
+        }
+        labels.add(label);
+    }
+
+    /**
+     * Checks whether this TextBlock has the given label.
+     *
+     * @param label The label
+     * @return {@code true} if this block is marked by the given label.
+     */
+    public boolean hasLabel(final String label) {
+        return labels != null && labels.contains(label);
+    }
+
+    /**
+     * Removes the given label from this TextBlock.
+     *
+     * @param label The label
+     * @return {@code true} if the label was present and has been removed.
+     */
+    public boolean removeLabel(final String label) {
+        return labels != null && labels.remove(label);
+    }
+
+    /**
+     * Returns the labels associated to this TextBlock, or {@code null} if no
+     * such labels exist.
+     *
+     * NOTE: The returned instance is the one used directly in TextBlock. You have full access
+     * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock}
+     * whenever possible.
+     *
+     * @return Returns the set of labels, or {@code null} if no label was added yet.
+     */
+    public Set<String> getLabels() {
+        return labels;
+    }
+
+    /**
+     * Adds a set of labels to this {@link TextBlock}.
+     * {@code null}-references are silently ignored.
+     *
+     * @param l The labels to be added.
+     */
+    public void addLabels(final Set<String> l) {
+        if (l == null) {
+            return;
+        }
+        if (this.labels == null) {
+            this.labels = new HashSet<String>(l);
+        } else {
+            this.labels.addAll(l);
+        }
+    }
+
+    /**
+     * Adds a set of labels to this {@link TextBlock}.
+     * {@code null}-references are silently ignored.
+     *
+     * @param l The labels to be added.
+     */
+    public void addLabels(final String... l) {
+        if (l == null) {
+            return;
+        }
+        if (this.labels == null) {
+            this.labels = new HashSet<String>();
+        }
+        for (final String label : l) {
+            this.labels.add(label);
+        }
+    }
+
+    /**
+     * Returns the containedTextElements BitSet, or {@code null}.
+     *
+     * @return the containedTextElements BitSet, or {@code null}.
+     */
+    public BitSet getContainedTextElements() {
+        return containedTextElements;
+    }
+
+    /**
+     * Creates a copy of this block whose mutable state (text buffer, labels,
+     * contained text elements) is independent of the original's.
+     */
+    @Override
+    protected TextBlock clone() {
+        final TextBlock clone;
+        try {
+            clone = (TextBlock) super.clone();
+        } catch (CloneNotSupportedException e) {
+            throw new RuntimeException(e);
+        }
+        // A String is immutable and can be shared; any other CharSequence
+        // (the StringBuilder created by mergeNext) must be copied.
+        if (text != null && !(text instanceof String)) {
+            clone.text = new StringBuilder(text);
+        }
+        // Copy even an empty set: sharing it would let labels added to one
+        // block show up on the other.
+        if (labels != null) {
+            clone.labels = new HashSet<String>(labels);
+        }
+        if (containedTextElements != null) {
+            clone.containedTextElements = (BitSet) containedTextElements.clone();
+        }
+
+        return clone;
+    }
+
+    public int getTagLevel() {
+        return tagLevel;
+    }
+
+    public void setTagLevel(int tagLevel) {
+        this.tagLevel = tagLevel;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/TextDocument.java b/src/main/java/de/l3s/boilerpipe/document/TextDocument.java
new file mode 100644
index 0000000..5ea893c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/TextDocument.java
@@ -0,0 +1,141 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.document;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A text document, consisting of one or more {@link TextBlock}s.
+ *
+ * @author Christian Kohlschütter
+ */
+public class TextDocument implements Cloneable {
+    final List<TextBlock> textBlocks;
+    String title;
+
+    /**
+     * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no
+     * title.
+     *
+     * @param textBlocks
+     *            The text blocks of this document.
+     */
+    public TextDocument(final List<TextBlock> textBlocks) {
+        this(null, textBlocks);
+    }
+
+    /**
+     * Creates a new {@link TextDocument} with given {@link TextBlock}s and
+     * given title. The list is stored as-is, not copied.
+     *
+     * @param title
+     *            The "main" title for this text document.
+     * @param textBlocks
+     *            The text blocks of this document.
+     */
+    public TextDocument(final String title, final List<TextBlock> textBlocks) {
+        this.title = title;
+        this.textBlocks = textBlocks;
+    }
+
+    /**
+     * Returns the {@link TextBlock}s of this document. The returned list is
+     * the one held by this document, not a copy.
+     *
+     * @return A list of {@link TextBlock}s, in sequential order of appearance.
+     */
+    public List<TextBlock> getTextBlocks() {
+        return textBlocks;
+    }
+
+    /**
+     * Returns the "main" title for this document, or {@code null} if no
+     * such title has been set.
+     *
+     * @return The "main" title.
+     */
+    public String getTitle() {
+        return title;
+    }
+
+    /**
+     * Updates the "main" title for this document.
+     *
+     * @param title The new title; may be {@code null}.
+     */
+    public void setTitle(final String title) {
+        this.title = title;
+    }
+
+    /**
+     * Returns the {@link TextDocument}'s content, i.e. the text of all blocks
+     * that are marked as content.
+     *
+     * @return The content text.
+     */
+    public String getContent() {
+        return getText(true, false);
+    }
+
+    /**
+     * Returns the {@link TextDocument}'s content, non-content or both.
+     * The text of every selected block is followed by a newline.
+     *
+     * @param includeContent Whether to include TextBlocks marked as "content".
+     * @param includeNonContent Whether to include TextBlocks marked as "non-content".
+     * @return The text.
+     */
+    public String getText(boolean includeContent, boolean includeNonContent) {
+        final StringBuilder sb = new StringBuilder();
+        for (TextBlock block : getTextBlocks()) {
+            final boolean wanted = block.isContent() ? includeContent : includeNonContent;
+            if (!wanted) {
+                continue;
+            }
+            sb.append(block.getText());
+            sb.append('\n');
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns detailed debugging information about the contained {@link TextBlock}s.
+     *
+     * @return Debug information.
+     */
+    public String debugString() {
+        final StringBuilder sb = new StringBuilder();
+        for (TextBlock tb : getTextBlocks()) {
+            sb.append(tb.toString());
+            sb.append('\n');
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Creates a new {@link TextDocument} with the same title and a new list
+     * holding a clone of every {@link TextBlock}. The copy is built through
+     * the constructor; {@code super.clone()} is not used.
+     *
+     * @return The copy.
+     */
+    @Override
+    public TextDocument clone() {
+        final List<TextBlock> list = new ArrayList<TextBlock>(textBlocks.size());
+        for (TextBlock tb : textBlocks) {
+            list.add(tb.clone());
+        }
+        return new TextDocument(title, list);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java b/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java
new file mode 100644
index 0000000..51abe73
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java
@@ -0,0 +1,64 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.document;
+
+/**
+ * Provides shallow statistics on a given TextDocument
+ *
+ * @author Christian Kohlschuetter
+ */
+public final class TextDocumentStatistics {
+    private final int numWords;
+    private final int numBlocks;
+
+    /**
+     * Computes statistics on a given {@link TextDocument}.
+     *
+     * @param doc The {@link TextDocument}.
+     * @param contentOnly if {@code true}, only blocks marked as content are
+     *            counted; otherwise all blocks are counted.
+     */
+    public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) {
+        int words = 0;
+        int blocks = 0;
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (contentOnly && !tb.isContent()) {
+                continue;
+            }
+
+            words += tb.getNumWords();
+            blocks++;
+        }
+        this.numWords = words;
+        this.numBlocks = blocks;
+    }
+
+    /**
+     * Returns the average number of words at block-level (= overall number of words divided by
+     * the number of blocks).
+     *
+     * @return Average, or 0 if no block was counted.
+     */
+    public float avgNumWords() {
+        // Guard the 0/0 case, which would otherwise evaluate to NaN.
+        if (numBlocks == 0) {
+            return 0f;
+        }
+        return numWords / (float) numBlocks;
+    }
+
+    /**
+     * Returns the overall number of words in all counted blocks.
+     *
+     * @return Sum
+     */
+    public int getNumWords() {
+        return numWords;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/Video.java b/src/main/java/de/l3s/boilerpipe/document/Video.java
new file mode 100644
index 0000000..3c6fa31
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/Video.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents an video resource which is contained in the document.
+ *
+ *
+ * @author Manuel Codiga
+ */
+public class Video extends Media {
+    private final String originUrl;
+    private final String embedUrl;
+
+    /**
+     * Creates a video description.
+     *
+     * @param url the value returned by {@link #getOriginUrl()}; not checked
+     *            for {@code null}.
+     * @param embedUrl the value returned by {@link #getEmbedUrl()}; must not
+     *            be {@code null}.
+     * @throws NullPointerException if {@code embedUrl} is {@code null}.
+     */
+    public Video(final String url, final String embedUrl) {
+        if (embedUrl == null) {
+            throw new NullPointerException("embedUrl attribute must not be null");
+        }
+        this.originUrl = url;
+        this.embedUrl = embedUrl;
+    }
+
+    /**
+     * @return the URL given as {@code url} at construction time.
+     */
+    public String getOriginUrl() {
+        return originUrl;
+    }
+
+    /**
+     * @return the embed URL; never {@code null}.
+     */
+    public String getEmbedUrl() {
+        return embedUrl;
+    }
+
+    /**
+     * @return a short description that contains only the origin URL.
+     */
+    @Override
+    public String toString() {
+        final StringBuilder description = new StringBuilder("url: ");
+        description.append(originUrl);
+        return description.toString();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java b/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java
new file mode 100644
index 0000000..3bada83
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents a Vimeo video resource that is contained in the document.
+ *
+ * @author Manuel Codiga
+ */
+public class VimeoVideo extends Video {
+    /**
+     * Passes both URLs unchanged to the {@link Video} constructor.
+     *
+     * @param originUrl the origin URL of the video.
+     * @param embedUrl the embed URL of the video.
+     */
+    public VimeoVideo(String originUrl, String embedUrl) {
+        super(originUrl, embedUrl);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java b/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java
new file mode 100644
index 0000000..1f80744
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.l3s.boilerpipe.document;
+
+/**
+ * Represents a Youtube video resource that is contained in the document.
+ *
+ * @author Manuel Codiga
+ */
+public class YoutubeVideo extends Video {
+    /**
+     * Passes both URLs unchanged to the {@link Video} constructor.
+     *
+     * @param originUrl the origin URL of the video.
+     * @param embedUrl the embed URL of the video.
+     */
+    public YoutubeVideo(String originUrl, String embedUrl) {
+        super(originUrl, embedUrl);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/document/package.html b/src/main/java/de/l3s/boilerpipe/document/package.html
new file mode 100644
index 0000000..b80903d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/document/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+ The classes in this package represent the simple Boilerpipe
+ document model.
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java b/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java
new file mode 100644
index 0000000..1fea4ca
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java
@@ -0,0 +1,62 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.estimators;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.document.TextDocumentStatistics;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+
+/**
+ * Estimates the "goodness" of a {@link BoilerpipeExtractor} on a given document.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class SimpleEstimator {
+
+    /**
+     * The one shared instance of {@link SimpleEstimator}.
+     */
+    public static final SimpleEstimator INSTANCE = new SimpleEstimator();
+
+    private SimpleEstimator() {
+        // only reachable through INSTANCE
+    }
+
+    /**
+     * Decides, from the statistics of a document before and after applying a
+     * {@link BoilerpipeExtractor}, whether the extraction quality has to be
+     * regarded as (too) low.
+     *
+     * Low quality is reported if the document had fewer than 90 words before
+     * extraction, has fewer than 70 words after it, or averages fewer than 25
+     * words per block afterwards.
+     *
+     * Works well with {@link DefaultExtractor}, {@link ArticleExtractor} and others.
+     *
+     * @param dsBefore statistics taken before the extraction.
+     * @param dsAfter statistics taken after the extraction.
+     * @return true if low quality is to be expected.
+     */
+    public boolean isLowQuality(final TextDocumentStatistics dsBefore, final TextDocumentStatistics dsAfter) {
+        final boolean tooFewWords = dsBefore.getNumWords() < 90 || dsAfter.getNumWords() < 70;
+        return tooFewWords || dsAfter.avgNumWords() < 25;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java
new file mode 100644
index 0000000..9013c3f
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java
@@ -0,0 +1,68 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.IgnoreBlocksAfterContentFilter;
+import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier;
+import de.l3s.boilerpipe.filters.english.TerminatingBlocksFinder;
+import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion;
+import de.l3s.boilerpipe.filters.heuristics.DocumentTitleMatchClassifier;
+import de.l3s.boilerpipe.filters.heuristics.ExpandTitleToContentFilter;
+import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
+import de.l3s.boilerpipe.filters.heuristics.LargeBlockSameTagLevelToContentFilter;
+import de.l3s.boilerpipe.filters.heuristics.ListAtEndFilter;
+import de.l3s.boilerpipe.filters.heuristics.TrailingHeadlineToBoilerplateFilter;
+import de.l3s.boilerpipe.filters.simple.BoilerplateBlockFilter;
+
+/**
+ * A full-text extractor which is tuned towards news articles. In this scenario
+ * it achieves higher accuracy than {@link DefaultExtractor}.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ArticleExtractor extends ExtractorBase {
+    public static final ArticleExtractor INSTANCE = new ArticleExtractor();
+
+    /**
+     * Returns the singleton instance for {@link ArticleExtractor}.
+     */
+    public static ArticleExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Runs the complete filter chain of this extractor on the given document.
+     * Every filter is applied, in the order listed, no matter what the
+     * preceding filters reported.
+     *
+     * @param doc the {@link TextDocument} to process.
+     * @return {@code true} if at least one filter returned {@code true}.
+     * @throws BoilerpipeProcessingException if one of the filters throws it.
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        // "|=" never short-circuits, so each step below is always executed.
+        boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
+        changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
+        changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
+        changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
+        changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
+        changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
+        changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
+        changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
+        changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
+        changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
+        changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
+        changed |= ListAtEndFilter.INSTANCE.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java
new file mode 100644
index 0000000..5b95e31
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java
@@ -0,0 +1,49 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.simple.MinClauseWordsFilter;
+import de.l3s.boilerpipe.filters.simple.SplitParagraphBlocksFilter;
+
+/**
+ * A full-text extractor which is tuned towards extracting sentences from news articles.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ArticleSentencesExtractor extends ExtractorBase {
+ public static final ArticleSentencesExtractor INSTANCE = new ArticleSentencesExtractor();
+
+ /**
+ * Returns the singleton instance for {@link ArticleSentencesExtractor}.
+ */
+ public static ArticleSentencesExtractor getInstance() {
+ return INSTANCE;
+ }
+
+ public boolean process(TextDocument doc)
+ throws BoilerpipeProcessingException {
+ return
+
+ ArticleExtractor.INSTANCE.process(doc)
+ | SplitParagraphBlocksFilter.INSTANCE.process(doc)
+ | MinClauseWordsFilter.INSTANCE.process(doc);
+ }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java
new file mode 100644
index 0000000..db970e0
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java
@@ -0,0 +1,106 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.estimators.SimpleEstimator;
+
+/**
+ * A full-text extractor trained on krdwrd Canola. Works well with
+ * {@link SimpleEstimator}, too.
+ *
+ * @author Christian Kohlschütter
+ */
+public class CanolaExtractor extends ExtractorBase {
+    public static final CanolaExtractor INSTANCE = new CanolaExtractor();
+
+    /**
+     * Returns the singleton instance for {@link CanolaExtractor}.
+     */
+    public static CanolaExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /** Delegates to {@link #CLASSIFIER} and returns its change flag. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        return CLASSIFIER.process(doc);
+    }
+
+    /**
+     * The actual classifier, exposed.
+     */
+    public static final BoilerpipeFilter CLASSIFIER = new BoilerpipeFilter() {
+
+        /** Classifies every block, passing its neighbours as context. */
+        public boolean process(TextDocument doc)
+                throws BoilerpipeProcessingException {
+            List<TextBlock> textBlocks = doc.getTextBlocks();
+            boolean hasChanges = false;
+
+            // Typed iterator: a raw one would hand back Object, not TextBlock.
+            ListIterator<TextBlock> it = textBlocks.listIterator();
+            if (!it.hasNext()) {
+                return false;
+            }
+            TextBlock prevBlock = TextBlock.EMPTY_START;
+            TextBlock currentBlock = it.next();
+            TextBlock nextBlock = it.hasNext() ? it.next()
+                    : TextBlock.EMPTY_START;
+            // First block: EMPTY_START stands in for the missing predecessor.
+            hasChanges = classify(prevBlock, currentBlock, nextBlock)
+                    | hasChanges;
+            // Remaining blocks, if any: slide the prev/current/next window.
+            if (nextBlock != TextBlock.EMPTY_START) {
+                while (it.hasNext()) {
+                    prevBlock = currentBlock;
+                    currentBlock = nextBlock;
+                    nextBlock = it.next();
+                    hasChanges = classify(prevBlock, currentBlock, nextBlock)
+                            | hasChanges;
+                }
+                prevBlock = currentBlock;
+                currentBlock = nextBlock;
+                nextBlock = TextBlock.EMPTY_START;
+                hasChanges = classify(prevBlock, currentBlock, nextBlock)
+                        | hasChanges;
+            }
+
+            return hasChanges;
+        }
+
+        /** Evaluates the rule for curr and stores the verdict on it. */
+        protected boolean classify(final TextBlock prev, final TextBlock curr,
+                final TextBlock next) {
+            final boolean isContent = (curr.getLinkDensity() > 0 && next
+                    .getNumWords() > 11)
+                    || (curr.getNumWords() > 19 || (next.getNumWords() > 6
+                            && next.getLinkDensity() == 0
+                            && prev.getLinkDensity() == 0 && (curr
+                            .getNumWords() > 6 || prev.getNumWords() > 7 || next
+                            .getNumWords() > 19)));
+
+            return curr.setIsContent(isContent);
+        }
+    };
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java b/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java
new file mode 100644
index 0000000..7e43d20
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java
@@ -0,0 +1,42 @@
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+
+/**
+ * Convenience holder giving quick access to common
+ * {@link BoilerpipeExtractor}s.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class CommonExtractors {
+
+    /**
+     * Works very well for most kinds of article-like HTML.
+     */
+    public static final ArticleExtractor ARTICLE_EXTRACTOR = ArticleExtractor.INSTANCE;
+
+    /**
+     * Simpler than {@link ArticleExtractor} (no heuristics), but usually worse.
+     */
+    public static final DefaultExtractor DEFAULT_EXTRACTOR = DefaultExtractor.INSTANCE;
+
+    /** Like {@link DefaultExtractor}, but keeps only the largest text block. */
+    public static final LargestContentExtractor LARGEST_CONTENT_EXTRACTOR = LargestContentExtractor.INSTANCE;
+
+    /**
+     * Trained on krdwrd Canola, which uses a different definition of
+     * "boilerplate". Worth a try.
+     */
+    public static final CanolaExtractor CANOLA_EXTRACTOR = CanolaExtractor.INSTANCE;
+
+    /**
+     * Dummy extractor that should return the input text. Helps to check
+     * whether a problem lies within a particular {@link BoilerpipeExtractor}
+     * or somewhere else.
+     */
+    public static final KeepEverythingExtractor KEEP_EVERYTHING_EXTRACTOR = KeepEverythingExtractor.INSTANCE;
+
+    /** Not instantiable: this class only holds constants. */
+    private CommonExtractors() {
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java
new file mode 100644
index 0000000..1fd7f33
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.DensityRulesClassifier;
+import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion;
+import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
+
+/**
+ * A quite generic full-text extractor.
+ *
+ * @author Christian Kohlschütter
+ */
+public class DefaultExtractor extends ExtractorBase {
+ public static final DefaultExtractor INSTANCE = new DefaultExtractor();
+
+ /**
+ * Returns the singleton instance for {@link DefaultExtractor}.
+ */
+ public static DefaultExtractor getInstance() {
+ return INSTANCE;
+ }
+
+ public boolean process(TextDocument doc)
+ throws BoilerpipeProcessingException {
+
+ return
+
+ SimpleBlockFusionProcessor.INSTANCE.process(doc)
+ | BlockProximityFusion.MAX_DISTANCE_1.process(doc)
+ | DensityRulesClassifier.INSTANCE.process(doc);
+ }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java b/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java
new file mode 100644
index 0000000..f41a243
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java
@@ -0,0 +1,116 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.URL;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
+import de.l3s.boilerpipe.sax.HTMLFetcher;
+
+/**
+ * Common superclass of the extractors. It also offers convenience methods
+ * that quickly return the text left over after processing.
+ *
+ * @author Christian Kohlschütter
+ */
+public abstract class ExtractorBase implements BoilerpipeExtractor {
+    /**
+     * Extracts text from HTML code supplied as a String.
+     *
+     * @param html The HTML code as a String.
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException also wraps any {@link SAXException}
+     */
+    public String getText(final String html)
+            throws BoilerpipeProcessingException {
+        final InputSource source = new InputSource(new StringReader(html));
+        try {
+            return getText(new BoilerpipeSAXInput(source).getTextDocument());
+        } catch (SAXException saxFailure) {
+            throw new BoilerpipeProcessingException(saxFailure);
+        }
+    }
+
+    /**
+     * Extracts text from the HTML code read from the given {@link InputSource}.
+     *
+     * @param is The InputSource containing the HTML
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException also wraps any {@link SAXException}
+     */
+    public String getText(final InputSource is)
+            throws BoilerpipeProcessingException {
+        try {
+            final TextDocument parsed = new BoilerpipeSAXInput(is).getTextDocument();
+            return getText(parsed);
+        } catch (SAXException saxFailure) {
+            throw new BoilerpipeProcessingException(saxFailure);
+        }
+    }
+
+    /**
+     * Extracts text from the HTML code fetched from the given {@link URL}.
+     * NOTE: This method is mainly meant for showcase purposes. If you are
+     * going to crawl the Web, consider {@link #getText(InputSource)} instead.
+     *
+     * @param url The URL pointing to the HTML code.
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException also wraps any {@link IOException}
+     */
+    public String getText(final URL url) throws BoilerpipeProcessingException {
+        try {
+            final InputSource fetched = HTMLFetcher.fetch(url).toInputSource();
+            return getText(fetched);
+        } catch (IOException ioFailure) {
+            throw new BoilerpipeProcessingException(ioFailure);
+        }
+    }
+
+    /**
+     * Extracts text from the HTML code read from the given {@link Reader}.
+     *
+     * @param r The Reader containing the HTML
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException
+     */
+    public String getText(final Reader r) throws BoilerpipeProcessingException {
+        final InputSource source = new InputSource(r);
+        return getText(source);
+    }
+
+    /**
+     * Runs {@link #process(TextDocument)} on the document, then returns its content.
+     *
+     * @param doc The {@link TextDocument}.
+     * @return The extracted text.
+     * @throws BoilerpipeProcessingException
+     */
+    public String getText(TextDocument doc) throws BoilerpipeProcessingException {
+        process(doc);
+        return doc.getContent();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java
new file mode 100644
index 0000000..d1f8afc
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java
@@ -0,0 +1,42 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter;
+
+/**
+ * Extractor that marks everything as content.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class KeepEverythingExtractor extends ExtractorBase {
+
+    public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor();
+
+    /** Private: use {@link #INSTANCE}. */
+    private KeepEverythingExtractor() {
+    }
+
+    /** Hands the document to {@link MarkEverythingContentFilter} and returns its result. */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final boolean changed = MarkEverythingContentFilter.INSTANCE.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java
new file mode 100644
index 0000000..96a88c0
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
+import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter;
+import de.l3s.boilerpipe.filters.simple.MinWordsFilter;
+
+/**
+ * Fuses blocks via {@link SimpleBlockFusionProcessor}, marks everything as
+ * content and finally applies a {@link MinWordsFilter} created with the given
+ * threshold (presumably dropping blocks with fewer words -- see that filter).
+ *
+ * @author Christian Kohlschütter
+ */
+public final class KeepEverythingWithMinKWordsExtractor extends ExtractorBase {
+
+    private final MinWordsFilter filter;
+
+    /** @param kMin the word threshold handed to {@link MinWordsFilter} */
+    public KeepEverythingWithMinKWordsExtractor(final int kMin) {
+        this.filter = new MinWordsFilter(kMin);
+    }
+
+    /** Runs all three filters (non-short-circuit {@code |}); true if any reported a change. */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        return SimpleBlockFusionProcessor.INSTANCE.process(doc)
+                | MarkEverythingContentFilter.INSTANCE.process(doc)
+                | filter.process(doc);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java
new file mode 100644
index 0000000..8720c5c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java
@@ -0,0 +1,53 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier;
+import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion;
+import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
+
+/**
+ * A full-text extractor which extracts the largest text component of a page.
+ * For news articles, it may perform better than the {@link DefaultExtractor},
+ * but usually worse than {@link ArticleExtractor}.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LargestContentExtractor extends ExtractorBase {
+ public static final LargestContentExtractor INSTANCE = new LargestContentExtractor();
+
+ private LargestContentExtractor() {
+ }
+
+ /**
+ * Returns the singleton instance for {@link LargestContentExtractor}.
+ */
+ public static LargestContentExtractor getInstance() {
+ return INSTANCE;
+ }
+
+ public boolean process(TextDocument doc)
+ throws BoilerpipeProcessingException {
+ return NumWordsRulesClassifier.INSTANCE.process(doc)
+ | BlockProximityFusion.MAX_DISTANCE_1.process(doc)
+ | KeepLargestBlockFilter.INSTANCE.process(doc);
+ }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java
new file mode 100644
index 0000000..12ece11
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java
@@ -0,0 +1,46 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.extractors;
+
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier;
+
+/**
+ * A fairly generic full-text extractor that relies solely on the number of
+ * words per block (of the current, the previous and the next block).
+ *
+ * @author Christian Kohlschütter
+ */
+public class NumWordsRulesExtractor extends ExtractorBase {
+    public static final NumWordsRulesExtractor INSTANCE = new NumWordsRulesExtractor();
+
+    /** Returns the shared {@link NumWordsRulesExtractor} instance. */
+    public static NumWordsRulesExtractor getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Delegates to {@link NumWordsRulesClassifier}.
+     * @return whatever the classifier reports as its change flag
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final boolean changed = NumWordsRulesClassifier.INSTANCE.process(doc);
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/extractors/package.html b/src/main/java/de/l3s/boilerpipe/extractors/package.html
new file mode 100644
index 0000000..aae6f19
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/extractors/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+ This package contains some standard extractors (i.e., completely
+ piped BoilerpipeFilters).
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java b/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java
new file mode 100644
index 0000000..52025ef
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java
@@ -0,0 +1,69 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2012 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.debug;
+
+import java.io.PrintWriter;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Prints debug information about the current state of the TextDocument
+ * (i.e., prints the result of {@link TextDocument#debugString()}).
+ *
+ * @author Christian Kohlschütter
+ */
+public final class PrintDebugFilter implements BoilerpipeFilter {
+    /**
+     * The default instance of {@link PrintDebugFilter},
+     * which dumps debug information to {@code System.out}.
+     */
+    public static final PrintDebugFilter INSTANCE = new PrintDebugFilter(
+            new PrintWriter(System.out, true));
+    private final PrintWriter out;
+
+    /**
+     * Returns the default instance for {@link PrintDebugFilter},
+     * which dumps debug information to {@code System.out}.
+     */
+    public static PrintDebugFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Creates a new instance of {@link PrintDebugFilter}.
+     *
+     * Only use this constructor if you are not going to dump
+     * the debug information to {@code System.out} --
+     * for this case, use {@link #getInstance()} instead.
+     *
+     * @param out The target {@link PrintWriter}. Will not be closed
+     */
+    public PrintDebugFilter(final PrintWriter out) {
+        this.out = out;
+    }
+
+    /** Prints {@code doc.debugString()} to the writer; always returns false. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        out.println(doc.debugString());
+
+        return false;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java
new file mode 100644
index 0000000..bbda7ba
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java
@@ -0,0 +1,117 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Classifies {@link TextBlock}s as content/not-content through rules that have
+ * been determined using the C4.8 machine learning algorithm, as described in the
+ * paper "Boilerplate Detection using Shallow Text Features", particularly using
+ * text densities and link densities.
+ *
+ * @author Christian Kohlschütter
+ */
+public class DensityRulesClassifier implements
+        BoilerpipeFilter {
+    public static final DensityRulesClassifier INSTANCE = new DensityRulesClassifier();
+
+    /**
+     * Returns the singleton instance for {@link DensityRulesClassifier}.
+     */
+    public static DensityRulesClassifier getInstance() {
+        return INSTANCE;
+    }
+
+    /** Classifies every block with its neighbours as context; true if any classify call reported a change. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        List<TextBlock> textBlocks = doc.getTextBlocks();
+        boolean hasChanges = false;
+        // Typed iterator: a raw one would hand back Object, not TextBlock.
+        ListIterator<TextBlock> it = textBlocks.listIterator();
+        if (!it.hasNext()) {
+            return false;
+        }
+        TextBlock prevBlock = TextBlock.EMPTY_START;
+        TextBlock currentBlock = it.next();
+        TextBlock nextBlock = it.hasNext() ? it.next() : TextBlock.EMPTY_START;
+
+        hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges;
+
+        if (nextBlock != TextBlock.EMPTY_START) {
+            while (it.hasNext()) {
+                prevBlock = currentBlock;
+                currentBlock = nextBlock;
+                nextBlock = it.next();
+                hasChanges = classify(prevBlock, currentBlock, nextBlock)
+                        | hasChanges;
+            }
+            prevBlock = currentBlock;
+            currentBlock = nextBlock;
+            nextBlock = TextBlock.EMPTY_START;
+            hasChanges = classify(prevBlock, currentBlock, nextBlock)
+                    | hasChanges;
+        }
+
+        return hasChanges;
+    }
+
+    /** Walks the density decision tree for curr and stores the verdict on it. */
+    protected boolean classify(final TextBlock prev, final TextBlock curr,
+            final TextBlock next) {
+        final boolean isContent;
+        if (curr.getLinkDensity() <= 0.333333) {
+            if (prev.getLinkDensity() <= 0.555556) {
+                if (curr.getTextDensity() <= 9) {
+                    if (next.getTextDensity() <= 10) {
+                        if (prev.getTextDensity() <= 4) {
+                            isContent = false;
+                        } else {
+                            isContent = true;
+                        }
+                    } else {
+                        isContent = true;
+                    }
+                } else {
+                    if (next.getTextDensity() == 0) {
+                        isContent = false;
+                    } else {
+                        isContent = true;
+                    }
+                }
+            } else {
+                if (next.getTextDensity() <= 11) {
+                    isContent = false;
+                } else {
+                    isContent = true;
+                }
+            }
+        } else {
+            isContent = false;
+        }
+
+        return curr.setIsContent(isContent);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java b/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java
new file mode 100644
index 0000000..dc72d07
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java
@@ -0,0 +1,40 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Base class for some heuristics that are used by boilerpipe filters.
+ *
+ * @author Christian Kohlschütter
+ */
+abstract class HeuristicFilterBase {
+    /** Same as {@link #getNumFullTextWords(TextBlock, float)} with a minimum text density of 9. */
+    protected static int getNumFullTextWords(final TextBlock tb) {
+        return getNumFullTextWords(tb, 9);
+    }
+
+    /**
+     * Returns the block's word count if its text density is at least minTextDensity, else 0.
+     */
+    protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
+        final boolean denseEnough = tb.getTextDensity() >= minTextDensity;
+        return denseEnough ? tb.getNumWords() : 0;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java
new file mode 100644
index 0000000..1d505be
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java
@@ -0,0 +1,80 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009,2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.Iterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all blocks as "non-content" that occur after blocks that have been
+ * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}. These marks are ignored
+ * unless a minimum number of words in content blocks occur before this mark (default: 60).
+ * This can be used in conjunction with an upstream {@link TerminatingBlocksFinder}.
+ *
+ * @author Christian Kohlschütter
+ * @see TerminatingBlocksFinder
+ */
+public final class IgnoreBlocksAfterContentFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final IgnoreBlocksAfterContentFilter DEFAULT_INSTANCE = new IgnoreBlocksAfterContentFilter(
+            60);
+    public static final IgnoreBlocksAfterContentFilter INSTANCE_200 = new IgnoreBlocksAfterContentFilter(
+            200);
+    private final int minNumWords;
+
+    /**
+     * Returns the default instance of {@link IgnoreBlocksAfterContentFilter} (minimum of 60 words).
+     */
+    public static IgnoreBlocksAfterContentFilter getDefaultInstance() {
+        return DEFAULT_INSTANCE;
+    }
+
+    public IgnoreBlocksAfterContentFilter(final int minNumWords) {
+        this.minNumWords = minNumWords;
+    }
+
+    /** Clears the content flag from the first accepted end-of-text block onwards; true if any flag changed. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+        int numWords = 0;
+        boolean foundEndOfText = false;
+        for (Iterator<TextBlock> it = doc.getTextBlocks().iterator(); it.hasNext();) {
+            TextBlock block = it.next();
+
+            final boolean endOfText = block
+                    .hasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
+            if (block.isContent()) {
+                numWords += getNumFullTextWords(block);
+            }
+            if (endOfText && numWords >= minNumWords) {
+                foundEndOfText = true;
+            }
+            if (foundEndOfText) {
+                // Report a change only if setIsContent says the flag was flipped.
+                changes = block.setIsContent(false) | changes;
+            }
+        }
+
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java
new file mode 100644
index 0000000..0fdf7dd
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java
@@ -0,0 +1,76 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all blocks as "non-content" that occur after blocks that have been
+ * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}, and after any content block.
+ * This filter can be used in conjunction with an upstream {@link TerminatingBlocksFinder}.
+ *
+ * @author Christian Kohlschütter
+ * @see TerminatingBlocksFinder
+ */
+public final class IgnoreBlocksAfterContentFromEndFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    /** The single shared instance; this filter has no instance fields. */
+    public static final IgnoreBlocksAfterContentFromEndFilter INSTANCE = new IgnoreBlocksAfterContentFromEndFilter();
+
+    /** The backwards walk stops once more than this many content words have been passed. */
+    private static final int MAX_TRAILING_CONTENT_WORDS = 200;
+
+    private IgnoreBlocksAfterContentFromEndFilter() {
+    }
+
+    /**
+     * Walks the blocks from last to first. Blocks labeled {@link DefaultLabels#INDICATES_END_OF_TEXT}
+     * are set to non-content, get {@link DefaultLabels#STRICTLY_NOT_CONTENT} and lose
+     * {@link DefaultLabels#MIGHT_BE_CONTENT}; other content blocks add to the word total.
+     *
+     * @param doc the document to process
+     * @return true if at least one block labeled as end of text was encountered
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        int words = 0;
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        final ListIterator<TextBlock> it = blocks.listIterator(blocks.size());
+        while (it.hasPrevious()) {
+            final TextBlock tb = it.previous();
+            if (tb.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT)) {
+                tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
+                tb.removeLabel(DefaultLabels.MIGHT_BE_CONTENT);
+                tb.setIsContent(false);
+                changes = true;
+            } else if (tb.isContent()) {
+                words += tb.getNumWords();
+                if (words > MAX_TRAILING_CONTENT_WORDS) {
+                    break;
+                }
+            }
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java
new file mode 100644
index 0000000..ccf7fd8
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java
@@ -0,0 +1,83 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Keeps the largest {@link TextBlock} only (by the number of words). In case of
+ * more than one block with the same number of words, the first block is chosen.
+ * All discarded blocks are marked "not content" and flagged as
+ * {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ *
+ * As opposed to {@link KeepLargestBlockFilter}, the number of words are
+ * computed using {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}, which only counts
+ * words that occur in text elements with at least 9 words and are thus believed to be full text.
+ *
+ * NOTE: Without language-specific fine-tuning (i.e., running the default instance), this filter
+ * may lead to suboptimal results. You better use {@link KeepLargestBlockFilter} instead, which
+ * works at the level of number-of-words instead of text densities.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class KeepLargestFulltextBlockFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final KeepLargestFulltextBlockFilter INSTANCE = new KeepLargestFulltextBlockFilter();
+
+    /**
+     * Keeps only the content block with the highest {@code getNumFullTextWords} value.
+     * @param doc the document to process
+     * @return false if the document has fewer than two blocks or no content block, true otherwise
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+        // The strict '>' keeps the first block when several share the maximum.
+        int max = -1;
+        TextBlock largestBlock = null;
+        for (final TextBlock tb : textBlocks) {
+            if (tb.isContent()) {
+                final int numWords = getNumFullTextWords(tb);
+                if (numWords > max) {
+                    largestBlock = tb;
+                    max = numWords;
+                }
+            }
+        }
+        if (largestBlock == null) {
+            return false;
+        }
+        for (final TextBlock tb : textBlocks) {
+            if (tb == largestBlock) {
+                tb.setIsContent(true);
+            } else {
+                tb.setIsContent(false);
+                tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
+            }
+        }
+        return true;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java
new file mode 100644
index 0000000..7962af4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java
@@ -0,0 +1,63 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Keeps only those content blocks which contain at least k full-text words
+ * (measured by {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}). k is 30 by default.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MinFulltextWordsFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    /** Instance using the default threshold of 30 full-text words. */
+    public static final MinFulltextWordsFilter DEFAULT_INSTANCE = new MinFulltextWordsFilter(30);
+    private final int minWords;
+
+    /** Returns {@link #DEFAULT_INSTANCE}. */
+    public static MinFulltextWordsFilter getDefaultInstance() {
+        return DEFAULT_INSTANCE;
+    }
+
+    /** @param minWords threshold below which a content block is set to non-content */
+    public MinFulltextWordsFilter(final int minWords) {
+        this.minWords = minWords;
+    }
+
+    /**
+     * Sets every content block whose {@code getNumFullTextWords} value is below the threshold to non-content.
+     *
+     * @param doc the document to process
+     * @return true if at least one block was set to non-content
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean demotedAny = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            final boolean tooShort = block.isContent() && getNumFullTextWords(block) < minWords;
+            if (tooShort) {
+                block.setIsContent(false);
+                demotedAny = true;
+            }
+        }
+        return demotedAny;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java
new file mode 100644
index 0000000..550252a
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java
@@ -0,0 +1,116 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Classifies {@link TextBlock}s as content/not-content through rules that have
+ * been determined using the C4.8 machine learning algorithm, as described in
+ * the paper "Boilerplate Detection using Shallow Text Features" (WSDM 2010),
+ * particularly using number of words per block and link density per block.
+ *
+ * @author Christian Kohlschütter
+ */
+public class NumWordsRulesClassifier implements BoilerpipeFilter {
+    /** Shared instance; this classifier has no instance fields. */
+    public static final NumWordsRulesClassifier INSTANCE = new NumWordsRulesClassifier();
+
+    /**
+     * Returns the singleton instance for NumWordsRulesClassifier.
+     */
+    public static NumWordsRulesClassifier getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Classifies every block of the document, handing {@link #classify} the block
+     * together with its predecessor and its successor. {@link TextBlock#EMPTY_START}
+     * stands in for the missing neighbor of the first and of the last block.
+     *
+     * @param doc the document to process
+     * @return true if {@link #classify} reported a change for at least one block
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        final ListIterator<TextBlock> it = textBlocks.listIterator();
+        if (!it.hasNext()) {
+            return false;
+        }
+
+        boolean hasChanges = false;
+        TextBlock prevBlock = TextBlock.EMPTY_START;
+        TextBlock currentBlock = it.next();
+        while (it.hasNext()) {
+            final TextBlock nextBlock = it.next();
+            hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges;
+            prevBlock = currentBlock;
+            currentBlock = nextBlock;
+        }
+        // The last (or only) block has no successor.
+        hasChanges = classify(prevBlock, currentBlock, TextBlock.EMPTY_START) | hasChanges;
+
+        return hasChanges;
+    }
+
+    /**
+     * Applies the decision rules to one block and stores the verdict on it.
+     *
+     * @param prev the preceding block, or the placeholder
+     * @param curr the block being classified
+     * @param next the following block, or the placeholder
+     * @return the value returned by {@code curr.setIsContent(...)}
+     */
+    protected boolean classify(final TextBlock prev, final TextBlock curr,
+            final TextBlock next) {
+        final boolean isContent;
+
+        // Nested like the learned decision tree; the constants are its split points.
+        if (curr.getLinkDensity() <= 0.333333) {
+            if (prev.getLinkDensity() <= 0.555556) {
+                if (curr.getNumWords() <= 16) {
+                    if (next.getNumWords() <= 15) {
+                        isContent = prev.getNumWords() > 4;
+                    } else {
+                        isContent = true;
+                    }
+                } else {
+                    isContent = true;
+                }
+            } else {
+                if (curr.getNumWords() <= 40) {
+                    isContent = next.getNumWords() > 17;
+                } else {
+                    isContent = true;
+                }
+            }
+        } else {
+            // Link density of the current block above 0.333333: never content.
+            isContent = false;
+        }
+
+        return curr.setIsContent(isContent);
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java b/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java
new file mode 100644
index 0000000..0c5c15c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java
@@ -0,0 +1,124 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Finds blocks which are potentially indicating the end of an article text and
+ * marks them with {@link DefaultLabels#INDICATES_END_OF_TEXT}. This can be used
+ * in conjunction with a downstream {@link IgnoreBlocksAfterContentFilter}.
+ *
+ * @author Christian Kohlschütter
+ * @see IgnoreBlocksAfterContentFilter
+ */
+public class TerminatingBlocksFinder implements BoilerpipeFilter {
+    public static final TerminatingBlocksFinder INSTANCE = new TerminatingBlocksFinder();
+
+    /** Returns the singleton instance for TerminatingBlocksFinder. */
+    public static TerminatingBlocksFinder getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Labels blocks of fewer than 15 words whose text matches one of the known
+     * "end of article" phrases with {@link DefaultLabels#INDICATES_END_OF_TEXT}.
+     * Texts shorter than 8 characters are only labeled if they read exactly
+     * "Comment" and the block has a link density of 1.0.
+     *
+     * @param doc The document to process
+     * @return true if at least one block has been labeled
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (tb.getNumWords() >= 15) {
+                continue;
+            }
+            final String text = tb.getText().trim();
+            if (text.length() >= 8) {
+                if (isTerminatingPhrase(text.toLowerCase())) {
+                    tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
+                    changes = true;
+                }
+            } else if (tb.getLinkDensity() == 1.0 && text.equals("Comment")) {
+                tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
+                // Report this label as a change too, like the branch above.
+                changes = true;
+            }
+        }
+        return changes;
+    }
+
+    /**
+     * Checks the lower-cased block text against the known phrases. The phrases
+     * have to be lower case as well. Non-ASCII characters are written as Unicode
+     * escapes so that the literals do not depend on the source file encoding.
+     *
+     * @param textLC The trimmed, lower-cased block text
+     * @return true if the text matches one of the phrases
+     */
+    private static boolean isTerminatingPhrase(final String textLC) {
+        return textLC.startsWith("comments")
+                || startsWithNumber(textLC, textLC.length(), " comments", " users responded in")
+                || textLC.startsWith("\u00a9 reuters") // copyright sign
+                || textLC.startsWith("please rate this")
+                || textLC.startsWith("post a comment")
+                || textLC.contains("what you think...")
+                || textLC.contains("add your comment")
+                || textLC.contains("add comment")
+                || textLC.contains("reader views")
+                || textLC.contains("have your say")
+                || textLC.contains("reader comments")
+                || textLC.contains("r\u00e4tta artikeln") // a with diaeresis
+                || textLC.contains("r\u00e9agir") // e with acute
+                || textLC.contains("vos r\u00e9actions ")
+                || textLC.equals("thanks for your comments - this feedback is now closed");
+    }
+
+    /**
+     * Checks whether the given text t starts with a sequence of digits,
+     * followed by one of the given strings.
+     * @param t The text to examine
+     * @param len The length of the text to examine
+     * @param str Any strings that may follow the digits.
+     * @return true if at least one combination matches
+     */
+    private static boolean startsWithNumber(final String t, final int len, final String... str) {
+        int j = 0;
+        while (j < len && isDigit(t.charAt(j))) {
+            j++;
+        }
+        if (j != 0) {
+            for (String s : str) {
+                if (t.startsWith(s, j)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    private static boolean isDigit(final char c) {
+        return c >= '0' && c <= '9';
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/package.html b/src/main/java/de/l3s/boilerpipe/filters/english/package.html
new file mode 100644
index 0000000..ec624a9
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/package.html
@@ -0,0 +1,8 @@
+
+
+ The BoilerpipeFilters in this package have only been tested on
+ English text.
+ That is, they will probably work with other Western languages,
+ but maybe need some parameter tuning to perform well.
+
+
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java
new file mode 100644
index 0000000..0922cb1
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java
@@ -0,0 +1,84 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2011 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Adds the labels of the preceding block to the current block, optionally adding a prefix.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class AddPrecedingLabelsFilter implements BoilerpipeFilter {
+
+    /** Copies the labels unchanged (empty prefix). */
+    public static final AddPrecedingLabelsFilter INSTANCE = new AddPrecedingLabelsFilter("");
+    /** Copies the labels with a "^" prefix. */
+    public static final AddPrecedingLabelsFilter INSTANCE_PRE = new AddPrecedingLabelsFilter("^");
+
+    private final String labelPrefix;
+
+    /**
+     * Creates a new {@link AddPrecedingLabelsFilter} instance.
+     *
+     * @param labelPrefix the string prepended to every copied label (may be empty)
+     */
+    public AddPrecedingLabelsFilter(final String labelPrefix) {
+        this.labelPrefix = labelPrefix;
+    }
+
+    /**
+     * Copies the labels of a block to the block that follows it, walking the list
+     * backwards so that a label never travels further than one block.
+     *
+     * @param doc the document to process
+     * @return true if at least one visited block had labels to copy
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        boolean changes = false;
+        final ListIterator<TextBlock> it = textBlocks.listIterator(textBlocks.size());
+        TextBlock blockBelow = it.previous();
+        // NOTE(review): as before, the walk stops at the second block, so the labels of
+        // the first block are never copied. Confirm whether that is intended.
+        for (int remaining = textBlocks.size() - 1; remaining > 1; remaining--) {
+            final TextBlock block = it.previous();
+            final Set<String> labels = block.getLabels();
+            if (labels != null && !labels.isEmpty()) {
+                for (final String l : labels) {
+                    blockBelow.addLabel(labelPrefix + l);
+                }
+                changes = true;
+            }
+            blockBelow = block;
+        }
+
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java
new file mode 100644
index 0000000..10f9d70
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java
@@ -0,0 +1,43 @@
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+public class ArticleMetadataFilter implements BoilerpipeFilter {
+    /** Patterns tried on short blocks: a date/time-like line, and a line starting with "By "/"by ". */
+    private static final Pattern[] PATTERNS_SHORT = new Pattern[] {
+            Pattern.compile("^[0-9 \\,\\./]*\\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\\b[0-9 \\,\\:apm\\./]*([CPSDMGET]{2,3})?$"),
+            Pattern.compile("^[Bb]y ")
+    };
+    public static final ArticleMetadataFilter INSTANCE = new ArticleMetadataFilter();
+    private ArticleMetadataFilter() {
+    }
+
+    /**
+     * Marks blocks of at most 10 words whose text matches one of the patterns as content
+     * and labels them {@link DefaultLabels#ARTICLE_METADATA}.
+     * @param doc the document to process
+     * @return true if at least one block matched
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changed = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (block.getNumWords() <= 10) {
+                final String blockText = block.getText();
+                for (final Pattern pattern : PATTERNS_SHORT) {
+                    if (pattern.matcher(blockText).find()) {
+                        block.setIsContent(true);
+                        block.addLabel(DefaultLabels.ARTICLE_METADATA);
+                        changed = true;
+                    }
+                }
+            }
+        }
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java
new file mode 100644
index 0000000..510c47f
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java
@@ -0,0 +1,128 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit.
+ * This probably makes sense only in cases where an upstream filter already has removed some blocks.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class BlockProximityFusion implements BoilerpipeFilter {
+
+    private final int maxBlocksDistance;
+
+    /** Distance 1; a content block may be merged into any kind of preceding block. */
+    public static final BlockProximityFusion MAX_DISTANCE_1 = new BlockProximityFusion(
+            1, false, false);
+    /** Distance 1; both blocks must have the same tag level. */
+    public static final BlockProximityFusion MAX_DISTANCE_1_SAME_TAGLEVEL = new BlockProximityFusion(
+            1, false, true);
+    /** Distance 1; both blocks must be content. */
+    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY = new BlockProximityFusion(
+            1, true, false);
+    /** Distance 1; both blocks must be content and have the same tag level. */
+    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = new BlockProximityFusion(
+            1, true, true);
+
+    private final boolean contentOnly;
+
+    private final boolean sameTagLevelOnly;
+
+    /**
+     * Creates a new {@link BlockProximityFusion} instance.
+     *
+     * @param maxBlocksDistance The maximum distance in blocks.
+     * @param contentOnly If true, a block is only merged into a preceding content block.
+     * @param sameTagLevelOnly If true, only blocks with equal tag levels are merged.
+     */
+    public BlockProximityFusion(final int maxBlocksDistance,
+            final boolean contentOnly, final boolean sameTagLevelOnly) {
+        this.maxBlocksDistance = maxBlocksDistance;
+        this.contentOnly = contentOnly;
+        this.sameTagLevelOnly = sameTagLevelOnly;
+    }
+
+    /**
+     * Merges each content block into the block before it when the two are close enough
+     * and the configured restrictions allow it. Merged blocks are removed from the list.
+     *
+     * @param doc the document to process
+     * @return true if at least one pair of blocks has been merged
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        // Pick the block the scan starts from; 'offset' is the list index right after it.
+        TextBlock prevBlock = null;
+        int offset = 0;
+        if (contentOnly) {
+            for (final TextBlock tb : textBlocks) {
+                offset++;
+                if (tb.isContent()) {
+                    prevBlock = tb;
+                    break;
+                }
+            }
+            if (prevBlock == null) {
+                return false;
+            }
+        } else {
+            prevBlock = textBlocks.get(0);
+            offset = 1;
+        }
+
+        boolean changes = false;
+        for (final Iterator<TextBlock> it = textBlocks.listIterator(offset); it.hasNext();) {
+            final TextBlock block = it.next();
+            if (!block.isContent()) {
+                prevBlock = block;
+                continue;
+            }
+            // Gap between the two blocks, derived from their start/end block offsets.
+            final int diffBlocks = block.getOffsetBlocksStart()
+                    - prevBlock.getOffsetBlocksEnd() - 1;
+            // 'block' is known to be content here, so contentOnly only has to check prevBlock.
+            final boolean fuse = diffBlocks <= maxBlocksDistance
+                    && (!contentOnly || prevBlock.isContent())
+                    && (!sameTagLevelOnly || prevBlock.getTagLevel() == block.getTagLevel());
+            if (fuse) {
+                prevBlock.mergeNext(block);
+                it.remove();
+                changes = true;
+            } else {
+                prevBlock = block;
+            }
+        }
+
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java
new file mode 100644
index 0000000..e44fc0c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java
@@ -0,0 +1,72 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+public final class ContentFusion implements BoilerpipeFilter {
+
+    public static final ContentFusion INSTANCE = new ContentFusion();
+
+    /** Creates a new {@link ContentFusion} instance. */
+    public ContentFusion() {
+    }
+
+    /**
+     * Repeatedly merges a block into the content block before it, provided the block's
+     * link density is below 0.56 and it is not labeled
+     * {@link DefaultLabels#STRICTLY_NOT_CONTENT}; stops after a pass without merges.
+     *
+     * @param doc the document to process
+     * @return true if at least one pair of blocks has been merged
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+        boolean anyChanges = false;
+        boolean changes;
+        do {
+            changes = false;
+            // Restart at the first block; a prevBlock left over from the last pass is stale.
+            TextBlock prevBlock = textBlocks.get(0);
+            for (final ListIterator<TextBlock> it = textBlocks.listIterator(1); it.hasNext();) {
+                final TextBlock block = it.next();
+                if (prevBlock.isContent()
+                        && block.getLinkDensity() < 0.56
+                        && !block.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT)) {
+                    prevBlock.mergeNext(block);
+                    it.remove();
+                    changes = true;
+                    anyChanges = true;
+                } else {
+                    prevBlock = block;
+                }
+            }
+        } while (changes);
+        return anyChanges;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java
new file mode 100644
index 0000000..f3e4cda
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java
@@ -0,0 +1,173 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks {@link TextBlock}s which contain parts of the HTML
+ * <code>&lt;TITLE&gt;</code> tag, using some heuristics which are quite
+ * specific to the news domain.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class DocumentTitleMatchClassifier implements BoilerpipeFilter {
+
+ private final Set potentialTitles;
+
+ public DocumentTitleMatchClassifier(String title) {
+ if (title == null) {
+ this.potentialTitles = null;
+ } else {
+
+ title = title.replace('\u00a0', ' ');
+ title = title.replace("'", "");
+
+ title = title.trim().toLowerCase();
+
+ if (title.length() == 0) {
+ this.potentialTitles = null;
+ } else {
+ this.potentialTitles = new HashSet();
+
+ potentialTitles.add(title);
+
+ String p;
+
+ p = getLongestPart(title, "[ ]*[\\|»|-][ ]*");
+ if (p != null) {
+ potentialTitles.add(p);
+ }
+ p = getLongestPart(title, "[ ]*[\\|»|:][ ]*");
+ if (p != null) {
+ potentialTitles.add(p);
+ }
+ p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)][ ]*");
+ if (p != null) {
+ potentialTitles.add(p);
+ }
+ p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)\\-][ ]*");
+ if (p != null) {
+ potentialTitles.add(p);
+ }
+ p = getLongestPart(title, "[ ]*[\\|»|,|:\\(\\)\\-][ ]*");
+ if (p != null) {
+ potentialTitles.add(p);
+ }
+ p = getLongestPart(title, "[ ]*[\\|»|,|:\\(\\)\\-\u00a0][ ]*");
+ if (p != null) {
+ potentialTitles.add(p);
+ }
+
+ addPotentialTitles(potentialTitles, title, "[ ]+[\\|][ ]+", 4);
+ addPotentialTitles(potentialTitles, title, "[ ]+[\\-][ ]+", 4);
+
+ potentialTitles.add(title.replaceFirst(" - [^\\-]+$", ""));
+ potentialTitles.add(title.replaceFirst("^[^\\-]+ - ", ""));
+ }
+ }
+ }
+
+ public Set getPotentialTitles() {
+ return potentialTitles;
+ }
+
+ private void addPotentialTitles(final Set potentialTitles, final String title, final String pattern, final int minWords) {
+ String[] parts = title.split(pattern);
+ if (parts.length == 1) {
+ return;
+ }
+ for (int i = 0; i < parts.length; i++) {
+ String p = parts[i];
+ if (p.contains(".com")) {
+ continue;
+ }
+ final int numWords = p.split("[\b ]+").length;
+ if (numWords >=minWords) {
+ potentialTitles.add(p);
+ }
+ }
+ }
+
+ private String getLongestPart(final String title, final String pattern) {
+ String[] parts = title.split(pattern);
+ if (parts.length == 1) {
+ return null;
+ }
+ int longestNumWords = 0;
+ String longestPart = "";
+ for (int i = 0; i < parts.length; i++) {
+ String p = parts[i];
+ if (p.contains(".com")) {
+ continue;
+ }
+ final int numWords = p.split("[\b ]+").length;
+ if (numWords > longestNumWords || p.length() > longestPart.length()) {
+ longestNumWords = numWords;
+ longestPart = p;
+ }
+ }
+ if (longestPart.length() == 0) {
+ return null;
+ } else {
+ return longestPart.trim();
+ }
+ }
+
+ private static final Pattern PAT_REMOVE_CHARACTERS = Pattern.compile("[\\?\\!\\.\\-\\:]+");
+
+ public boolean process(TextDocument doc)
+ throws BoilerpipeProcessingException {
+ if (potentialTitles == null) {
+ return false;
+ }
+ boolean changes = false;
+
+ for (final TextBlock tb : doc.getTextBlocks()) {
+ String text = tb.getText();
+
+ text = text.replace('\u00a0', ' ');
+ text = text.replace("'", "");
+
+ text = text.trim().toLowerCase();
+
+ if (potentialTitles.contains(text)) {
+ tb.addLabel(DefaultLabels.TITLE);
+ changes = true;
+ break;
+ }
+
+ text = PAT_REMOVE_CHARACTERS.matcher(text).replaceAll("").trim();
+ if (potentialTitles.contains(text)) {
+ tb.addLabel(DefaultLabels.TITLE);
+ changes = true;
+ break;
+ }
+ }
+ return changes;
+ }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java
new file mode 100644
index 0000000..7268a45
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java
@@ -0,0 +1,73 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all {@link TextBlock}s "content" which are between the headline and the part that
+ * has already been marked content, if they are marked {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ *
+ * The headline is the last block labeled {@link DefaultLabels#TITLE} that precedes the
+ * first content block; the headline block itself is part of the examined range.
+ *
+ * This filter is quite specific to the news domain.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ExpandTitleToContentFilter implements BoilerpipeFilter {
+    public static final ExpandTitleToContentFilter INSTANCE = new ExpandTitleToContentFilter();
+
+    /**
+     * Returns the singleton instance for ExpandTitleToContentFilter.
+     */
+    public static ExpandTitleToContentFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Promotes {@link DefaultLabels#MIGHT_BE_CONTENT} blocks located between
+     * the title and the first content block to content.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block changed its content
+     *         state; <code>false</code> otherwise, in particular when there
+     *         is no title in front of the first content block.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        int titleIndex = -1;
+        int firstContentIndex = -1;
+
+        int index = 0;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (block.hasLabel(DefaultLabels.TITLE)) {
+                titleIndex = index;
+            }
+            if (block.isContent()) {
+                // Nothing behind the first content block is of interest.
+                firstContentIndex = index;
+                break;
+            }
+            index++;
+        }
+
+        if (titleIndex == -1 || firstContentIndex <= titleIndex) {
+            return false;
+        }
+
+        boolean changed = false;
+        for (TextBlock block : doc.getTextBlocks().subList(titleIndex, firstContentIndex)) {
+            if (block.hasLabel(DefaultLabels.MIGHT_BE_CONTENT) && block.setIsContent(true)) {
+                changed = true;
+            }
+        }
+        return changed;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java
new file mode 100644
index 0000000..5d4cc31
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java
@@ -0,0 +1,124 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Keeps the largest {@link TextBlock} only (by the number of words). In case of
+ * more than one block with the same number of words, the first block is chosen.
+ * All discarded blocks are marked "not content" and flagged as
+ * {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ *
+ * Note that, by default, only TextBlocks marked as "content" are taken into consideration.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class KeepLargestBlockFilter implements BoilerpipeFilter {
+    public static final KeepLargestBlockFilter INSTANCE = new KeepLargestBlockFilter(
+            false, 0);
+    public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL = new KeepLargestBlockFilter(
+            true, 0);
+    public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS = new KeepLargestBlockFilter(
+            true, 150);
+    private final boolean expandToSameLevelText;
+    private final int minWords;
+
+    /**
+     * Creates a new {@link KeepLargestBlockFilter}.
+     *
+     * @param expandToSameLevelText
+     *            If <code>true</code>, neighbouring blocks on the same tag
+     *            level as the largest block are marked content as well.
+     * @param minWords
+     *            Minimum number of words a neighbouring block needs in order
+     *            to be marked content during expansion. Not applied to the
+     *            selection of the largest block itself.
+     */
+    public KeepLargestBlockFilter(boolean expandToSameLevelText, final int minWords) {
+        this.expandToSameLevelText = expandToSameLevelText;
+        this.minWords = minWords;
+    }
+
+    /**
+     * Keeps only the largest content block (plus, optionally, its same-level
+     * neighbours) as content.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>false</code> if the document has fewer than two blocks
+     *         (nothing is touched then), <code>true</code> otherwise.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        // Typed list: a raw List cannot be iterated as TextBlock.
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        int maxNumWords = -1;
+        TextBlock largestBlock = null;
+
+        int level = -1;
+
+        int i = 0;
+        int n = -1;
+        for (TextBlock tb : textBlocks) {
+            if (tb.isContent()) {
+                final int nw = tb.getNumWords();
+
+                // Strictly greater: on ties the first block wins.
+                if (nw > maxNumWords) {
+                    largestBlock = tb;
+                    maxNumWords = nw;
+
+                    n = i;
+
+                    if (expandToSameLevelText) {
+                        level = tb.getTagLevel();
+                    }
+                }
+            }
+            i++;
+        }
+        for (TextBlock tb : textBlocks) {
+            if (tb == largestBlock) {
+                tb.setIsContent(true);
+                tb.addLabel(DefaultLabels.VERY_LIKELY_CONTENT);
+            } else {
+                tb.setIsContent(false);
+                tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
+            }
+        }
+        if (expandToSameLevelText && n != -1) {
+
+            // Walk backwards from the largest block until a block on a
+            // shallower tag level is reached.
+            for (ListIterator<TextBlock> it = textBlocks.listIterator(n); it
+                    .hasPrevious();) {
+                TextBlock tb = it.previous();
+                final int tl = tb.getTagLevel();
+                if (tl < level) {
+                    break;
+                } else if (tl == level) {
+                    if (tb.getNumWords() >= minWords) {
+                        tb.setIsContent(true);
+                    }
+                }
+            }
+            // Same walk in forward direction, starting at the largest block.
+            for (ListIterator<TextBlock> it = textBlocks.listIterator(n); it
+                    .hasNext();) {
+                TextBlock tb = it.next();
+                final int tl = tb.getTagLevel();
+                if (tl < level) {
+                    break;
+                } else if (tl == level) {
+                    if (tb.getNumWords() >= minWords) {
+                        tb.setIsContent(true);
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java
new file mode 100644
index 0000000..0ec3836
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java
@@ -0,0 +1,91 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Fuses adjacent blocks if their markup labels (labels starting with
+ * {@link DefaultLabels#MARKUP_PREFIX}) are equal. Blocks without a label set
+ * are never fused.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LabelFusion implements BoilerpipeFilter {
+
+    public static final LabelFusion INSTANCE = new LabelFusion();
+
+    /**
+     * Creates a new {@link LabelFusion} instance.
+     */
+    private LabelFusion() {
+    }
+
+    /**
+     * Merges every block into its predecessor when both carry the same set
+     * of markup labels, removing the merged block from the document.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least two blocks have been fused.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        // Typed collections: raw List/Iterator cannot yield TextBlock.
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        boolean changes = false;
+        TextBlock prevBlock = textBlocks.get(0);
+
+        // Start at the second block; the first one is the initial merge target.
+        for (Iterator<TextBlock> it = textBlocks.listIterator(1); it
+                .hasNext();) {
+            TextBlock block = it.next();
+
+            if (equalLabels(prevBlock.getLabels(), block.getLabels())) {
+                prevBlock.mergeNext(block);
+                it.remove();
+                changes = true;
+            } else {
+                prevBlock = block;
+            }
+        }
+
+        return changes;
+    }
+
+    /**
+     * Compares two label sets by their markup labels only.
+     *
+     * @return <code>false</code> if either set is <code>null</code>,
+     *         otherwise whether the markup-label subsets are equal.
+     */
+    private boolean equalLabels(Set<String> labels, Set<String> labels2) {
+        if (labels == null || labels2 == null) {
+            return false;
+        }
+        return markupLabelsOnly(labels).equals(markupLabelsOnly(labels2));
+    }
+
+    /**
+     * Returns a copy of the given set that only contains the labels starting
+     * with {@link DefaultLabels#MARKUP_PREFIX}; the input set is not modified.
+     */
+    private Set<String> markupLabelsOnly(final Set<String> set1) {
+        Set<String> set = new HashSet<String>(set1);
+        for (Iterator<String> it = set.iterator(); it.hasNext();) {
+            final String label = it.next();
+            if (!label.startsWith(DefaultLabels.MARKUP_PREFIX)) {
+                it.remove();
+            }
+        }
+        return set;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java
new file mode 100644
index 0000000..966e583
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java
@@ -0,0 +1,70 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all blocks as content that:
+ *
+ * - are on the same tag-level as very likely main content (usually the level of the largest block)
+ * - have a significant number of words, currently: at least 100
+ *
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LargeBlockSameTagLevelToContentFilter implements BoilerpipeFilter {
+    public static final LargeBlockSameTagLevelToContentFilter INSTANCE = new LargeBlockSameTagLevelToContentFilter();
+
+    /** Number of words a block needs at least in order to be promoted. */
+    private static final int MIN_NUM_WORDS = 100;
+
+    private LargeBlockSameTagLevelToContentFilter() {
+    }
+
+    /**
+     * Promotes large non-content blocks that share the tag level of the first
+     * content block labeled {@link DefaultLabels#VERY_LIKELY_CONTENT}.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block has been marked
+     *         content; <code>false</code> otherwise, in particular when no
+     *         reference tag level could be determined.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final int referenceLevel = findMainContentTagLevel(doc);
+        if (referenceLevel == -1) {
+            return false;
+        }
+
+        boolean changed = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent()) {
+                continue;
+            }
+            if (block.getNumWords() >= MIN_NUM_WORDS && block.getTagLevel() == referenceLevel) {
+                block.setIsContent(true);
+                changed = true;
+            }
+        }
+        return changed;
+    }
+
+    /**
+     * Returns the tag level of the first content block that is labeled
+     * {@link DefaultLabels#VERY_LIKELY_CONTENT}, or -1 if there is none.
+     */
+    private static int findMainContentTagLevel(final TextDocument doc) {
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent() && block.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
+                return block.getTagLevel();
+            }
+        }
+        return -1;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java
new file mode 100644
index 0000000..dfaae1b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java
@@ -0,0 +1,64 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks nested list-item blocks after the end of the main content.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ListAtEndFilter implements BoilerpipeFilter {
+    public static final ListAtEndFilter INSTANCE = new ListAtEndFilter();
+
+    private ListAtEndFilter() {
+    }
+
+    /**
+     * Marks link-free list items that directly follow a
+     * {@link DefaultLabels#VERY_LIKELY_CONTENT} content block, and are nested
+     * deeper than it, as content. The run ends at the first block that is
+     * neither such main content nor a qualifying list item.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block has been marked content.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changed = false;
+
+        // Integer.MAX_VALUE means "not behind main content": no tag level
+        // can exceed it, so no list item qualifies in that state.
+        int mainContentLevel = Integer.MAX_VALUE;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent()
+                    && block.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
+                mainContentLevel = block.getTagLevel();
+                continue;
+            }
+
+            if (isNestedListItem(block, mainContentLevel)) {
+                block.setIsContent(true);
+                changed = true;
+            } else {
+                mainContentLevel = Integer.MAX_VALUE;
+            }
+        }
+
+        return changed;
+    }
+
+    /**
+     * Tells whether the block is a link-free list item that might be content
+     * and sits on a deeper tag level than the given one.
+     */
+    private static boolean isNestedListItem(final TextBlock block, final int mainContentLevel) {
+        return block.getTagLevel() > mainContentLevel
+                && block.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)
+                && block.hasLabel(DefaultLabels.LI)
+                && block.getLinkDensity() == 0;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java
new file mode 100644
index 0000000..e1fc17b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java
@@ -0,0 +1,70 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Merges two subsequent blocks if their text densities are equal.
+ *
+ * @author Christian Kohlschütter
+ */
+public class SimpleBlockFusionProcessor implements BoilerpipeFilter {
+    public static final SimpleBlockFusionProcessor INSTANCE = new SimpleBlockFusionProcessor();
+
+    /**
+     * Returns the singleton instance for SimpleBlockFusionProcessor.
+     */
+    public static SimpleBlockFusionProcessor getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Merges every block into its predecessor when both report exactly the
+     * same text density, removing the merged block from the document.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least two blocks have been merged.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        // Typed collections: raw List/Iterator cannot yield TextBlock.
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        boolean changes = false;
+
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        TextBlock b1 = textBlocks.get(0);
+        for (Iterator<TextBlock> it = textBlocks.listIterator(1); it.hasNext();) {
+            TextBlock b2 = it.next();
+
+            // Exact equality is intended here, not a tolerance comparison.
+            final boolean similar = (b1.getTextDensity() == b2.getTextDensity());
+
+            if (similar) {
+                b1.mergeNext(b2);
+                it.remove();
+                changes = true;
+            } else {
+                b1 = b2;
+            }
+        }
+
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java
new file mode 100644
index 0000000..8a5b18d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java
@@ -0,0 +1,66 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks trailing headlines ({@link TextBlock}s that have the label {@link DefaultLabels#HEADING})
+ * as boilerplate. Trailing means they are marked content and are below any other content block.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class TrailingHeadlineToBoilerplateFilter implements BoilerpipeFilter {
+    public static final TrailingHeadlineToBoilerplateFilter INSTANCE = new TrailingHeadlineToBoilerplateFilter();
+
+    /**
+     * Returns the singleton instance for TrailingHeadlineToBoilerplateFilter.
+     */
+    public static TrailingHeadlineToBoilerplateFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Walks the document from its end towards its beginning and marks content
+     * blocks labeled {@link DefaultLabels#HEADING} as "not content" until the
+     * first content block without that label is reached. Non-content blocks
+     * are skipped.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one headline has been marked as
+     *         boilerplate.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+
+        // Typed collections: raw List/ListIterator cannot yield TextBlock.
+        final List<TextBlock> list = doc.getTextBlocks();
+
+        for (ListIterator<TextBlock> it = list.listIterator(list.size()); it.hasPrevious();) {
+            TextBlock tb = it.previous();
+            if (tb.isContent()) {
+                if (tb.hasLabel(DefaultLabels.HEADING)) {
+                    tb.setIsContent(false);
+                    changes = true;
+                } else {
+                    // First regular content block from the end: stop.
+                    break;
+                }
+            }
+        }
+
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html b/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html
new file mode 100644
index 0000000..a368224
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html
@@ -0,0 +1,5 @@
+
+
+ The BoilerpipeFilters in this package are pure heuristics.
+
+
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java
new file mode 100644
index 0000000..aff85a6
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java
@@ -0,0 +1,71 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Removes {@link TextBlock}s which have explicitly been marked as
+ * "not content". Optionally, non-content blocks carrying a given label are
+ * kept.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class BoilerplateBlockFilter implements BoilerpipeFilter {
+    public static final BoilerplateBlockFilter INSTANCE = new BoilerplateBlockFilter(
+            null);
+    public static final BoilerplateBlockFilter INSTANCE_KEEP_TITLE = new BoilerplateBlockFilter(
+            DefaultLabels.TITLE);
+    private final String labelToKeep;
+
+    /**
+     * Returns the singleton instance for BoilerplateBlockFilter.
+     */
+    public static BoilerplateBlockFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Creates a new {@link BoilerplateBlockFilter}.
+     *
+     * @param labelToKeep
+     *            Non-content blocks carrying this label are not removed;
+     *            <code>null</code> removes every non-content block.
+     */
+    public BoilerplateBlockFilter(final String labelToKeep) {
+        this.labelToKeep = labelToKeep;
+    }
+
+    /**
+     * Removes all non-content blocks from the document, except those that
+     * carry the configured label (if any).
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block has been removed.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        // Typed collections: raw List/Iterator cannot yield TextBlock.
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        boolean hasChanges = false;
+
+        for (Iterator<TextBlock> it = textBlocks.iterator(); it.hasNext();) {
+            TextBlock tb = it.next();
+            // Check the configured label rather than a hard-coded TITLE, so
+            // that instances created with another label honour that label.
+            if (!tb.isContent()
+                    && (labelToKeep == null || !tb.hasLabel(labelToKeep))) {
+                it.remove();
+                hasChanges = true;
+            }
+        }
+
+        return hasChanges;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java
new file mode 100644
index 0000000..a464dbf
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java
@@ -0,0 +1,51 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Inverts the "isContent" flag for all {@link TextBlock}s
+ *
+ * @author Christian Kohlschütter
+ */
+public final class InvertedFilter implements BoilerpipeFilter {
+    public static final InvertedFilter INSTANCE = new InvertedFilter();
+
+    private InvertedFilter() {
+    }
+
+    /**
+     * Flips the content flag of every block: content becomes "not content"
+     * and vice versa.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>false</code> if the document has no blocks,
+     *         <code>true</code> otherwise.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+
+        // Typed list: a raw List cannot be iterated as TextBlock.
+        final List<TextBlock> tbs = doc.getTextBlocks();
+        if (tbs.isEmpty()) {
+            return false;
+        }
+        for (TextBlock tb : tbs) {
+            tb.setIsContent(!tb.isContent());
+        }
+
+        return true;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java
new file mode 100644
index 0000000..3178f0b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java
@@ -0,0 +1,59 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all blocks that contain a given label as "boilerplate".
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LabelToBoilerplateFilter implements BoilerpipeFilter {
+    public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT = new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT);
+
+    private String[] labels;
+
+    /**
+     * Creates a filter for the given labels.
+     *
+     * @param label
+     *            The labels that turn a content block into boilerplate; one
+     *            matching label is sufficient.
+     */
+    public LabelToBoilerplateFilter(final String... label) {
+        this.labels = label;
+    }
+
+    /**
+     * Marks every content block that carries at least one of the configured
+     * labels as "not content".
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block has been changed.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changed = false;
+
+        for (final TextBlock block : doc.getTextBlocks()) {
+            // Labels are only inspected for blocks that are still content.
+            if (block.isContent() && hasAnyLabel(block)) {
+                block.setIsContent(false);
+                changed = true;
+            }
+        }
+
+        return changed;
+    }
+
+    /**
+     * Tells whether the block carries at least one of the configured labels;
+     * the search stops at the first hit.
+     */
+    private boolean hasAnyLabel(final TextBlock block) {
+        for (final String candidate : labels) {
+            if (block.hasLabel(candidate)) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java
new file mode 100644
index 0000000..e4bf856
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java
@@ -0,0 +1,56 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Marks all blocks that contain a given label as "content".
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LabelToContentFilter implements BoilerpipeFilter {
+    private String[] labels;
+
+    /**
+     * Creates a filter for the given labels.
+     *
+     * @param label
+     *            The labels that turn a non-content block into content; one
+     *            matching label is sufficient.
+     */
+    public LabelToContentFilter(final String... label) {
+        this.labels = label;
+    }
+
+    /**
+     * Marks every non-content block that carries at least one of the
+     * configured labels as content.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block has been changed.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changed = false;
+
+        for (final TextBlock block : doc.getTextBlocks()) {
+            // Labels are only inspected for blocks that are not content yet.
+            if (!block.isContent() && hasAnyLabel(block)) {
+                block.setIsContent(true);
+                changed = true;
+            }
+        }
+
+        return changed;
+    }
+
+    /**
+     * Tells whether the block carries at least one of the configured labels;
+     * the search stops at the first hit.
+     */
+    private boolean hasAnyLabel(final TextBlock block) {
+        for (final String candidate : labels) {
+            if (block.hasLabel(candidate)) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java
new file mode 100644
index 0000000..e888334
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Marks all blocks as boilerplate.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MarkEverythingBoilerplateFilter implements BoilerpipeFilter {
+    public static final MarkEverythingBoilerplateFilter INSTANCE = new MarkEverythingBoilerplateFilter();
+
+    private MarkEverythingBoilerplateFilter() {
+    }
+
+    /**
+     * Clears the content flag of every block that is currently content.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block was content before.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changed = false;
+
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (!block.isContent()) {
+                // Already boilerplate; leave it alone.
+                continue;
+            }
+            block.setIsContent(false);
+            changed = true;
+        }
+
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java
new file mode 100644
index 0000000..8a8b7be
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Marks all blocks as content.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MarkEverythingContentFilter implements BoilerpipeFilter {
+    public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter();
+
+    private MarkEverythingContentFilter() {
+    }
+
+    /**
+     * Sets the content flag of every block that is currently not content.
+     *
+     * @param doc
+     *            The {@link TextDocument} that is to be processed.
+     * @return <code>true</code> if at least one block was not content before.
+     * @throws BoilerpipeProcessingException
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changed = false;
+
+        for (final TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent()) {
+                // Already content; leave it alone.
+                continue;
+            }
+            block.setIsContent(true);
+            changed = true;
+        }
+
+        return changed;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java
new file mode 100644
index 0000000..d326059
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java
@@ -0,0 +1,113 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Keeps only blocks that have at least one segment fragment ("clause") with at
+ * least k words (default: 5).
+ *
+ * NOTE: You might consider using the {@link SplitParagraphBlocksFilter}
+ * upstream.
+ *
+ * @author Christian Kohlschütter
+ * @see SplitParagraphBlocksFilter
+ */
+public final class MinClauseWordsFilter implements BoilerpipeFilter {
+    // Compiled once and shared by all instances (a Pattern is immutable); declared before INSTANCE.
+    private static final Pattern PAT_CLAUSE_DELIMITER = Pattern
+            .compile("[\\p{L}\\d][\\,\\.\\:\\;\\!\\?]+([ \\n\\r]+|$)");
+    private static final Pattern PAT_WHITESPACE = Pattern.compile("[ \\n\\r]+");
+
+    public static final MinClauseWordsFilter INSTANCE = new MinClauseWordsFilter(
+            5, false);
+
+    private final int minWords;
+    private final boolean acceptClausesWithoutDelimiter;
+
+    /** Creates a filter that only accepts delimiter-terminated clauses of at least minWords words. */
+    public MinClauseWordsFilter(final int minWords) {
+        this(minWords, false);
+    }
+
+    /**
+     * @param minWords minimum number of words a clause must have
+     * @param acceptClausesWithoutDelimiter whether the text after the last delimiter is checked, too
+     */
+    public MinClauseWordsFilter(final int minWords,
+            final boolean acceptClausesWithoutDelimiter) {
+        this.minWords = minWords;
+        this.acceptClausesWithoutDelimiter = acceptClausesWithoutDelimiter;
+    }
+
+    /**
+     * Marks every content block that has no sufficiently long clause as non-content.
+     *
+     * @return true if at least one block was changed.
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (!tb.isContent()) {
+                continue;
+            }
+            final String text = tb.getText();
+            final Matcher m = PAT_CLAUSE_DELIMITER.matcher(text);
+            int start = 0;
+            boolean hasClause = false;
+            while (!hasClause && m.find()) {
+                // the match starts on the clause's last letter/digit, hence start() + 1
+                hasClause = isClause(text.subSequence(start, m.start() + 1));
+                start = m.end();
+            }
+
+            // since clauses should *always end* with a delimiter, we normally
+            // don't consider text without one
+            if (acceptClausesWithoutDelimiter) {
+                hasClause |= isClause(text.subSequence(start, text.length()));
+            }
+
+            if (!hasClause) {
+                tb.setIsContent(false);
+                changes = true;
+            }
+        }
+        return changes;
+    }
+
+    /** Returns true if (number of whitespace runs in the text) + 1 reaches minWords. */
+    private boolean isClause(final CharSequence text) {
+        final Matcher m = PAT_WHITESPACE.matcher(text);
+        int n = 1;
+        while (m.find()) {
+            n++;
+            if (n >= minWords) {
+                return true;
+            }
+        }
+        return n >= minWords;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java
new file mode 100644
index 0000000..a3a49c4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java
@@ -0,0 +1,56 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Keeps only those content blocks which contain at least k words.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MinWordsFilter implements BoilerpipeFilter {
+    private final int minWords;
+
+    /**
+     * @param minWords smallest word count a block needs in order to stay content
+     */
+    public MinWordsFilter(final int minWords) {
+        this.minWords = minWords;
+    }
+
+    /**
+     * Demotes every content block with fewer than minWords words to non-content.
+     *
+     * @return true if at least one block was demoted.
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean modified = false;
+        for (final TextBlock block : doc.getTextBlocks()) {
+            final boolean tooShort = block.isContent() && block.getNumWords() < minWords;
+            if (tooShort) {
+                block.setIsContent(false);
+                modified = true;
+            }
+        }
+        return modified;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java
new file mode 100644
index 0000000..86fae33
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java
@@ -0,0 +1,82 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Splits TextBlocks at paragraph boundaries.
+ *
+ * NOTE: This is not fully supported (i.e., it will break highlighting support
+ * via #getContainedTextElements()), but this one probably is necessary for some other
+ * filters.
+ *
+ * @author Christian Kohlschütter
+ * @see MinClauseWordsFilter
+ */
+public final class SplitParagraphBlocksFilter implements BoilerpipeFilter {
+    public static final SplitParagraphBlocksFilter INSTANCE = new SplitParagraphBlocksFilter();
+
+    /**
+     * Returns the singleton instance for SplitParagraphBlocksFilter.
+     */
+    public static SplitParagraphBlocksFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /** Splits every block whose text spans several lines into one block per line; returns true if any was split. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        final List<TextBlock> blocksNew = new ArrayList<TextBlock>();
+
+        for (TextBlock tb : blocks) {
+            final String[] paragraphs = tb.getText().split("[\n\r]+");
+            if (paragraphs.length < 2) {
+                blocksNew.add(tb);
+                continue;
+            }
+            // every paragraph inherits the content flag and the labels of the block it came from
+            final boolean isContent = tb.isContent();
+            final Set<String> labels = tb.getLabels();
+            for (String p : paragraphs) {
+                final TextBlock tbP = new TextBlock(p);
+                tbP.setIsContent(isContent);
+                tbP.addLabels(labels);
+                blocksNew.add(tbP);
+                changes = true;
+            }
+        }
+
+        if (changes) {
+            // the list obtained from the document is rewritten in place
+            blocks.clear();
+            blocks.addAll(blocksNew);
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java
new file mode 100644
index 0000000..28cf002
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java
@@ -0,0 +1,54 @@
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.conditions.TextBlockCondition;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+public class SurroundingToContentFilter implements BoilerpipeFilter {
+    public static final SurroundingToContentFilter INSTANCE_TEXT = new SurroundingToContentFilter(new TextBlockCondition() {
+        // accepts blocks with a link density of zero and more than six words
+        public boolean meetsCondition(TextBlock tb) {
+            return tb.getLinkDensity() == 0 && tb.getNumWords() > 6;
+        }
+    });
+
+    private final TextBlockCondition cond;
+    public SurroundingToContentFilter(final TextBlockCondition cond) {
+        this.cond = cond;
+    }
+    /** Marks a non-content block as content if the blocks visited before and after it are content and it meets the condition. */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+
+        final List<TextBlock> tbs = doc.getTextBlocks();
+        if (tbs.size() < 3) {
+            return false;
+        }
+
+        TextBlock a = tbs.get(0);
+        TextBlock b = tbs.get(1);
+        TextBlock c;
+        boolean hasChanges = false;
+        for (Iterator<TextBlock> it = tbs.listIterator(2); it.hasNext();) {
+            c = it.next();
+            if (!b.isContent() && a.isContent() && c.isContent() && cond.meetsCondition(b)) {
+                b.setIsContent(true);
+                hasChanges = true;
+            }
+            // NOTE(review): the window moves on by two blocks, so only every second block is tested as "b" - confirm intended
+            a = c;
+            if (!it.hasNext()) {
+                break;
+            }
+            b = it.next();
+        }
+
+        return hasChanges;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/package.html b/src/main/java/de/l3s/boilerpipe/filters/simple/package.html
new file mode 100644
index 0000000..bc7a25d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+ The BoilerpipeFilters in this package are straight-forward and
+ probably not really specific to English.
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java b/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java
new file mode 100644
index 0000000..220e8df
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java
@@ -0,0 +1,43 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import de.l3s.boilerpipe.conditions.TextBlockCondition;
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Adds labels to a {@link TextBlock} if the given criteria are met.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ConditionalLabelAction extends LabelAction {
+    private final TextBlockCondition condition;
+
+    /** Creates an action that applies the given labels only to blocks meeting the condition. */
+    public ConditionalLabelAction(TextBlockCondition condition, String... labels) {
+        super(labels);
+        this.condition = condition;
+    }
+
+    /** Labels the block if it meets the condition; otherwise leaves it untouched. */
+    public void addTo(final TextBlock tb) {
+        if (!condition.meetsCondition(tb)) {
+            return;
+        }
+        addLabelsTo(tb);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java b/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java
new file mode 100644
index 0000000..3c56533
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Some pre-defined labels which can be used in conjunction with
+ * {@link TextBlock#addLabel(String)} and {@link TextBlock#hasLabel(String)}.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class DefaultLabels {
+    private DefaultLabels() {
+        // constants only; never instantiated
+    }
+    // labels for headings
+    public static final String HEADING = "de.l3s.boilerpipe/HEADING";
+    public static final String H1 = "de.l3s.boilerpipe/H1";
+    public static final String H2 = "de.l3s.boilerpipe/H2";
+    public static final String H3 = "de.l3s.boilerpipe/H3";
+    // general block labels
+    public static final String TITLE = "de.l3s.boilerpipe/TITLE";
+    public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
+    public static final String INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT";
+    public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
+    public static final String VERY_LIKELY_CONTENT = "de.l3s.boilerpipe/VERY_LIKELY_CONTENT";
+    public static final String STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT";
+    public static final String HR = "de.l3s.boilerpipe/HR";
+    public static final String LI = "de.l3s.boilerpipe/LI";
+    // NOTE(review): presumably the prefix of labels derived from markup tags - confirm against its users
+    public static final String MARKUP_PREFIX = "<";
+}
diff --git a/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java b/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java
new file mode 100644
index 0000000..b725f2e
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import java.util.Arrays;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Helps adding labels to {@link TextBlock}s.
+ *
+ * @author Christian Kohlschütter
+ * @see ConditionalLabelAction
+ */
+public class LabelAction {
+    protected final String[] labels;
+    /** Creates an action for the given labels; the array is stored as passed. */
+    public LabelAction(String... labels) {
+        this.labels = labels;
+    }
+    /** Adds all labels to the block. */
+    public void addTo(final TextBlock tb) {
+        addLabelsTo(tb);
+    }
+    /** Hands the labels to the block; final, so subclasses can rely on it from their own addTo. */
+    protected final void addLabelsTo(final TextBlock tb) {
+        tb.addLabels(labels);
+    }
+    /** Default object description followed by the labels in braces. */
+    public String toString() {
+        return new StringBuilder(super.toString()).append("{").append(Arrays.asList(labels)).append("}").toString();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/package.html b/src/main/java/de/l3s/boilerpipe/package.html
new file mode 100644
index 0000000..81c88d6
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+ The Boilerpipe top-level package.
+</body>
+</html>
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
new file mode 100644
index 0000000..f8cd767
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
@@ -0,0 +1,454 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.LabelAction;
+import de.l3s.boilerpipe.util.UnicodeTokenizer;
+
+/**
+ * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can
+ * be used by different parser implementations, e.g. NekoHTML and TagSoup.
+ *
+ * @author Christian Kohlschütter
+ */
+public class BoilerpipeHTMLContentHandler implements ContentHandler {
+
+ private final Map tagActions;
+ private String title = null;
+
+ static final String ANCHOR_TEXT_START = "$\ue00a<";
+ static final String ANCHOR_TEXT_END = ">\ue00a$";
+
+ StringBuilder tokenBuffer = new StringBuilder();
+ StringBuilder textBuffer = new StringBuilder();
+
+ int inBody = 0;
+ int inAnchor = 0;
+ int inIgnorableElement = 0;
+
+ int tagLevel = 0;
+ int blockTagLevel = -1;
+
+ boolean sbLastWasWhitespace = false;
+ private int textElementIdx = 0;
+
+ private final List textBlocks = new ArrayList();
+
+ private String lastStartTag = null;
+ @SuppressWarnings("unused")
+ private String lastEndTag = null;
+ @SuppressWarnings("unused")
+ private Event lastEvent = null;
+
+ private int offsetBlocks = 0;
+ private BitSet currentContainedTextElements = new BitSet();
+
+ private boolean flush = false;
+ boolean inAnchorText = false;
+
+ LinkedList> labelStacks = new LinkedList>();
+ LinkedList fontSizeStack = new LinkedList();
+
+ /**
+ * Recycles this instance.
+ */
+ public void recycle() {
+ tokenBuffer.setLength(0);
+ textBuffer.setLength(0);
+
+ inBody = 0;
+ inAnchor = 0;
+ inIgnorableElement = 0;
+ sbLastWasWhitespace = false;
+ textElementIdx = 0;
+
+ textBlocks.clear();
+
+ lastStartTag = null;
+ lastEndTag = null;
+ lastEvent = null;
+
+ offsetBlocks = 0;
+ currentContainedTextElements.clear();
+
+ flush = false;
+ inAnchorText = false;
+ }
+
+ /**
+ * Constructs a {@link BoilerpipeHTMLContentHandler} using the
+ * {@link DefaultTagActionMap}.
+ */
+ public BoilerpipeHTMLContentHandler() {
+ this(DefaultTagActionMap.INSTANCE);
+ }
+
+ /**
+ * Constructs a {@link BoilerpipeHTMLContentHandler} using the given
+ * {@link TagActionMap}.
+ *
+ * @param tagActions
+ * The {@link TagActionMap} to use, e.g.
+ * {@link DefaultTagActionMap}.
+ */
+ public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) {
+ this.tagActions = tagActions;
+ }
+
+ // @Override
+ public void endDocument() throws SAXException {
+ flushBlock();
+ }
+
+ // @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+ }
+
+ // @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ if (!sbLastWasWhitespace) {
+ textBuffer.append(' ');
+ tokenBuffer.append(' ');
+ }
+ sbLastWasWhitespace = true;
+ }
+
+ // @Override
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ }
+
+ // @Override
+ public void setDocumentLocator(Locator locator) {
+ }
+
+ // @Override
+ public void skippedEntity(String name) throws SAXException {
+ }
+
+ // @Override
+ public void startDocument() throws SAXException {
+ }
+
+ // @Override
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ }
+
+ // @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes atts) throws SAXException {
+ labelStacks.add(null);
+
+ TagAction ta = tagActions.get(localName);
+ if (ta != null) {
+ if(ta.changesTagLevel()) {
+ tagLevel++;
+ }
+ flush = ta.start(this, localName, qName, atts) | flush;
+ } else {
+ tagLevel++;
+ flush = true;
+ }
+
+ lastEvent = Event.START_TAG;
+ lastStartTag = localName;
+ }
+
+ // @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ TagAction ta = tagActions.get(localName);
+ if (ta != null) {
+ flush = ta.end(this, localName, qName) | flush;
+ } else {
+ flush = true;
+ }
+
+ if(ta == null || ta.changesTagLevel()) {
+ tagLevel--;
+ }
+
+ if (flush) {
+ flushBlock();
+ }
+
+ lastEvent = Event.END_TAG;
+ lastEndTag = localName;
+
+ labelStacks.removeLast();
+ }
+
+ // @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ textElementIdx++;
+
+
+ if (flush) {
+ flushBlock();
+ flush = false;
+ }
+
+ if (inIgnorableElement != 0) {
+ return;
+ }
+
+ char c;
+ boolean startWhitespace = false;
+ boolean endWhitespace = false;
+ if (length == 0) {
+ return;
+ }
+
+ final int end = start + length;
+ for (int i = start; i < end; i++) {
+ if (Character.isWhitespace(ch[i])) {
+ ch[i] = ' ';
+ }
+ }
+ while (start < end) {
+ c = ch[start];
+ if (c == ' ') {
+ startWhitespace = true;
+ start++;
+ length--;
+ } else {
+ break;
+ }
+ }
+ while (length > 0) {
+ c = ch[start + length - 1];
+ if (c == ' ') {
+ endWhitespace = true;
+ length--;
+ } else {
+ break;
+ }
+ }
+ if (length == 0) {
+ if (startWhitespace || endWhitespace) {
+ if (!sbLastWasWhitespace) {
+ textBuffer.append(' ');
+ tokenBuffer.append(' ');
+ }
+ sbLastWasWhitespace = true;
+ } else {
+ sbLastWasWhitespace = false;
+ }
+ lastEvent = Event.WHITESPACE;
+ return;
+ }
+ if (startWhitespace) {
+ if (!sbLastWasWhitespace) {
+ textBuffer.append(' ');
+ tokenBuffer.append(' ');
+ }
+ }
+
+ if (blockTagLevel == -1) {
+ blockTagLevel = tagLevel;
+ }
+
+ textBuffer.append(ch, start, length);
+ tokenBuffer.append(ch, start, length);
+ if (endWhitespace) {
+ textBuffer.append(' ');
+ tokenBuffer.append(' ');
+ }
+
+ sbLastWasWhitespace = endWhitespace;
+ lastEvent = Event.CHARACTERS;
+
+ currentContainedTextElements.set(textElementIdx);
+ }
+
+ List getTextBlocks() {
+ return textBlocks;
+ }
+
+ public void flushBlock() {
+ if (inBody == 0) {
+ if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) {
+ setTitle(tokenBuffer.toString().trim());
+ }
+ textBuffer.setLength(0);
+ tokenBuffer.setLength(0);
+ return;
+ }
+
+ final int length = tokenBuffer.length();
+ switch (length) {
+ case 0:
+ return;
+ case 1:
+ if (sbLastWasWhitespace) {
+ textBuffer.setLength(0);
+ tokenBuffer.setLength(0);
+ return;
+ }
+ }
+ final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer);
+
+ int numWords = 0;
+ int numLinkedWords = 0;
+ int numWrappedLines = 0;
+ int currentLineLength = -1; // don't count the first space
+ final int maxLineLength = 80;
+ int numTokens = 0;
+ int numWordsCurrentLine = 0;
+
+ for (String token : tokens) {
+ if (ANCHOR_TEXT_START.equals(token)) {
+ inAnchorText = true;
+ } else if (ANCHOR_TEXT_END.equals(token)) {
+ inAnchorText = false;
+ } else if (isWord(token)) {
+ numTokens++;
+ numWords++;
+ numWordsCurrentLine++;
+ if (inAnchorText) {
+ numLinkedWords++;
+ }
+ final int tokenLength = token.length();
+ currentLineLength += tokenLength + 1;
+ if (currentLineLength > maxLineLength) {
+ numWrappedLines++;
+ currentLineLength = tokenLength;
+ numWordsCurrentLine = 1;
+ }
+ } else {
+ numTokens++;
+ }
+ }
+ if (numTokens == 0) {
+ return;
+ }
+ int numWordsInWrappedLines;
+ if (numWrappedLines == 0) {
+ numWordsInWrappedLines = numWords;
+ numWrappedLines = 1;
+ } else {
+ numWordsInWrappedLines = numWords - numWordsCurrentLine;
+ }
+
+ TextBlock tb = new TextBlock(textBuffer.toString().trim(),
+ currentContainedTextElements, numWords, numLinkedWords,
+ numWordsInWrappedLines, numWrappedLines, offsetBlocks);
+ currentContainedTextElements = new BitSet();
+
+ offsetBlocks++;
+
+ textBuffer.setLength(0);
+ tokenBuffer.setLength(0);
+
+ tb.setTagLevel(blockTagLevel);
+ addTextBlock(tb);
+ blockTagLevel = -1;
+ }
+
+ protected void addTextBlock(final TextBlock tb) {
+
+ for (Integer l : fontSizeStack) {
+ if (l != null) {
+ tb.addLabel("font-" + l);
+ break;
+ }
+ }
+ for (LinkedList labelStack : labelStacks) {
+ if (labelStack != null) {
+ for (LabelAction labels : labelStack) {
+ if (labels != null) {
+ labels.addTo(tb);
+ }
+ }
+ }
+ }
+
+ textBlocks.add(tb);
+ }
+
+ private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
+ .compile("[\\p{L}\\p{Nd}\\p{Nl}\\p{No}]");
+
+ private static boolean isWord(final String token) {
+ return PAT_VALID_WORD_CHARACTER.matcher(token).find();
+ }
+
+ static private enum Event {
+ START_TAG, END_TAG, CHARACTERS, WHITESPACE
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public void setTitle(String s) {
+ if (s == null || s.length() == 0) {
+ return;
+ }
+ title = s;
+ }
+
+ /**
+ * Returns a {@link TextDocument} containing the extracted {@link TextBlock}
+ * s. NOTE: Only call this after parsing.
+ *
+ * @return The {@link TextDocument}
+ */
+ public TextDocument toTextDocument() {
+ // just to be sure
+ flushBlock();
+
+ return new TextDocument(getTitle(), getTextBlocks());
+ }
+
+ public void addWhitespaceIfNecessary() {
+ if (!sbLastWasWhitespace) {
+ tokenBuffer.append(' ');
+ textBuffer.append(' ');
+ sbLastWasWhitespace = true;
+ }
+ }
+
+ public void addLabelAction(final LabelAction la)
+ throws IllegalStateException {
+ LinkedList labelStack = labelStacks.getLast();
+ if (labelStack == null) {
+ labelStack = new LinkedList();
+ labelStacks.removeLast();
+ labelStacks.add(labelStack);
+ }
+ labelStack.add(la);
+ }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java
new file mode 100644
index 0000000..79dcc72
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java
@@ -0,0 +1,76 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+
+import de.l3s.boilerpipe.BoilerpipeDocumentSource;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * A simple SAX Parser, used by {@link BoilerpipeSAXInput}. The parser uses CyberNeko to parse HTML content.
+ *
+ * @author Christian Kohlschütter
+ */
+public class BoilerpipeHTMLParser extends AbstractSAXParser implements BoilerpipeDocumentSource {
+    /** The boilerpipe handler currently installed, or null if none (or a foreign handler) is set. */
+    private BoilerpipeHTMLContentHandler contentHandler;
+
+    /** Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler. */
+    public BoilerpipeHTMLParser() {
+        this(new BoilerpipeHTMLContentHandler());
+    }
+
+    /** Constructs a {@link BoilerpipeHTMLParser} using the given {@link BoilerpipeHTMLContentHandler}. */
+    public BoilerpipeHTMLParser(BoilerpipeHTMLContentHandler contentHandler) {
+        super(new HTMLConfiguration());
+        setContentHandler(contentHandler);
+    }
+
+    /** Creates a parser without installing a content handler; the argument is not used. */
+    protected BoilerpipeHTMLParser(boolean ignore) {
+        super(new HTMLConfiguration());
+    }
+
+    public void setContentHandler(final BoilerpipeHTMLContentHandler contentHandler) {
+        this.contentHandler = contentHandler;
+        super.setContentHandler(contentHandler);
+    }
+    public void setContentHandler(final org.xml.sax.ContentHandler contentHandler) {
+        // overloads are chosen by static type, so a boilerpipe handler may arrive here as well; keep track of it
+        this.contentHandler = contentHandler instanceof BoilerpipeHTMLContentHandler ? (BoilerpipeHTMLContentHandler) contentHandler : null;
+        super.setContentHandler(contentHandler);
+    }
+
+    /**
+     * Returns a {@link TextDocument} containing the extracted {@link TextBlock}s.
+     * NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
+     *
+     * @return The {@link TextDocument}
+     * @throws IllegalStateException if no {@link BoilerpipeHTMLContentHandler} is installed
+     */
+    public TextDocument toTextDocument() {
+        if (contentHandler == null) {
+            throw new IllegalStateException("no BoilerpipeHTMLContentHandler has been set");
+        }
+        return contentHandler.toTextDocument();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java
new file mode 100644
index 0000000..f95fd41
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java
@@ -0,0 +1,73 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeInput;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Parses an {@link InputSource} using SAX and returns a {@link TextDocument}.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class BoilerpipeSAXInput implements BoilerpipeInput {
+    private final InputSource source;
+
+    /**
+     * Creates a new instance of {@link BoilerpipeSAXInput} that reads from the given {@link InputSource}.
+     *
+     * @param is the source to parse; this constructor only stores it
+     * @throws SAXException declared in the signature; nothing in this constructor raises it
+     */
+    public BoilerpipeSAXInput(final InputSource is) throws SAXException {
+        this.source = is;
+    }
+
+    /**
+     * Parses the input with a newly created {@link BoilerpipeHTMLParser}.
+     */
+    public TextDocument getTextDocument() throws BoilerpipeProcessingException {
+        return getTextDocument(new BoilerpipeHTMLParser());
+    }
+
+    /**
+     * Parses the input with the given parser and returns the document it produced.
+     *
+     * @param parser The parser used to transform the input into boilerpipe's internal representation.
+     * @return The retrieved {@link TextDocument}
+     * @throws BoilerpipeProcessingException if parsing fails with an IOException or a SAXException
+     */
+    public TextDocument getTextDocument(final BoilerpipeHTMLParser parser) throws BoilerpipeProcessingException {
+        try {
+            // checked parser failures are rethrown below with the original exception passed along
+            parser.parse(source);
+            return parser.toTextDocument();
+        } catch (IOException ioFailure) {
+            throw new BoilerpipeProcessingException(ioFailure);
+        } catch (SAXException saxFailure) {
+            throw new BoilerpipeProcessingException(saxFailure);
+        }
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java b/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java
new file mode 100644
index 0000000..7b9c410
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java
@@ -0,0 +1,357 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.labels.LabelAction;
+
+/**
+ * Defines an action that is to be performed whenever a particular tag occurs during HTML parsing.
+ *
+ * @author Christian Kohlschütter
+ */
+public abstract class CommonTagActions {
+
+ private CommonTagActions() { // private: code outside this class cannot instantiate or subclass it
+ }
+
+ /**
+  * A {@link TagAction} that delegates to two other actions. For start and
+  * end both delegates are always invoked (first, then second) and the
+  * results are OR-ed together.
+  */
+ public static final class Chained implements TagAction {
+
+     private final TagAction first;
+     private final TagAction second;
+
+     public Chained(final TagAction t1, final TagAction t2) {
+         first = t1;
+         second = t2;
+     }
+
+     public boolean start(BoilerpipeHTMLContentHandler instance,
+             String localName, String qName, Attributes atts)
+             throws SAXException {
+         // Both calls happen before the results are combined, so the second
+         // action runs even if the first one already returned true.
+         final boolean firstResult = first.start(instance, localName, qName, atts);
+         final boolean secondResult = second.start(instance, localName, qName, atts);
+         return firstResult || secondResult;
+     }
+
+     public boolean end(BoilerpipeHTMLContentHandler instance,
+             String localName, String qName) throws SAXException {
+         final boolean firstResult = first.end(instance, localName, qName);
+         final boolean secondResult = second.end(instance, localName, qName);
+         return firstResult || secondResult;
+     }
+
+     public boolean changesTagLevel() {
+         // Pure query: the second delegate is only asked when the first says no.
+         if (first.changesTagLevel()) {
+             return true;
+         }
+         return second.changesTagLevel();
+     }
+ }
+
+ /**
+  * Marks this tag as "ignorable", i.e. all its inner content is silently skipped.
+  * Implemented as a counter on the handler: start increments
+  * inIgnorableElement, end decrements it.
+  */
+ public static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
+
+     public boolean start(final BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName,
+             final Attributes atts) {
+         handler.inIgnorableElement += 1;
+         return true;
+     }
+
+     public boolean end(final BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName) {
+         handler.inIgnorableElement -= 1;
+         return true;
+     }
+
+     public boolean changesTagLevel() {
+         return true;
+     }
+ };
+
+ /**
+ * Marks this tag as "anchor" (this should usually only be set for the
+ * <code>&lt;A&gt;</code> tag). Anchor tags may not be nested.
+ *
+ * There is a bug in certain versions of NekoHTML which still allows nested tags.
+ * If boilerpipe encounters such a nesting, this action prints a warning to
+ * <code>System.err</code> and calls its own end handler once before handling
+ * the nested start. Although start is declared to throw SAXException, nothing
+ * in this implementation throws one.
+ *
+ * Outside ignorable elements, start appends ANCHOR_TEXT_START and end appends
+ * ANCHOR_TEXT_END (each followed by a space) to the handler's token buffer.
+ * The end marker is only written when the anchor counter drops back to zero.
+ */
+ public static final TagAction TA_ANCHOR_TEXT = new TagAction() {
+
+ public boolean start(BoilerpipeHTMLContentHandler instance,
+ final String localName, final String qName,
+ final Attributes atts) throws SAXException {
+ // post-increment: the comparison sees the anchor depth before this start tag
+ if (instance.inAnchor++ > 0) {
+ // as nested A elements are not allowed per specification, we
+ // are probably reaching this branch due to a bug in the XML
+ // parser
+ System.err.println("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...");
+
+ // end() decrements inAnchor again; it writes the end marker only
+ // if that brings the counter to zero
+ end(instance, localName, qName);
+ }
+ if (instance.inIgnorableElement == 0) {
+ instance.addWhitespaceIfNecessary();
+ instance.tokenBuffer
+ .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_START);
+ instance.tokenBuffer.append(' ');
+ // a space was just appended, so record that the buffer ends in whitespace
+ instance.sbLastWasWhitespace = true;
+ }
+ return false;
+ }
+
+ public boolean end(BoilerpipeHTMLContentHandler instance,
+ final String localName, final String qName) {
+ // pre-decrement: only the end tag that closes the outermost anchor emits the marker
+ if (--instance.inAnchor == 0) {
+ if (instance.inIgnorableElement == 0) {
+ instance.addWhitespaceIfNecessary();
+ instance.tokenBuffer
+ .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_END);
+ instance.tokenBuffer.append(' ');
+ instance.sbLastWasWhitespace = true;
+ }
+ }
+ return false;
+ }
+
+ public boolean changesTagLevel() {
+ return true;
+ }
+ };
+
+ /**
+  * Marks this tag as the body element (this should usually only be set for the
+  * <code>&lt;BODY&gt;</code> tag). Start and end each flush the current block
+  * first and then adjust the handler's inBody counter.
+  */
+ public static final TagAction TA_BODY = new TagAction() {
+     public boolean start(final BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName,
+             final Attributes atts) {
+         handler.flushBlock();
+         handler.inBody += 1;
+         return false;
+     }
+
+     public boolean end(final BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName) {
+         handler.flushBlock();
+         handler.inBody -= 1;
+         return false;
+     }
+
+     public boolean changesTagLevel() {
+         return true;
+     }
+ };
+
+ /**
+  * Marks this tag as a simple "inline" element, which generates whitespace, but no new block:
+  * both start and end ask the handler to add whitespace if necessary.
+  */
+ public static final TagAction TA_INLINE_WHITESPACE = new TagAction() {
+
+     public boolean start(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName,
+             final Attributes atts) {
+         handler.addWhitespaceIfNecessary();
+         return false;
+     }
+
+     public boolean end(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName) {
+         handler.addWhitespaceIfNecessary();
+         return false;
+     }
+
+     public boolean changesTagLevel() {
+         return false;
+     }
+ };
+
+ /**
+ * Alias for {@link #TA_INLINE_WHITESPACE}; both names refer to the same instance.
+ *
+ * @deprecated Use {@link #TA_INLINE_WHITESPACE} instead
+ */
+ @Deprecated
+ public static final TagAction TA_INLINE = TA_INLINE_WHITESPACE;
+
+ /**
+  * Marks this tag as a simple "inline" element, which neither generates whitespace, nor a new block.
+  * The action does not touch the handler at all.
+  */
+ public static final TagAction TA_INLINE_NO_WHITESPACE = new TagAction() {
+
+     public boolean start(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName,
+             final Attributes atts) {
+         // nothing to record for this kind of element
+         return false;
+     }
+
+     public boolean end(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName) {
+         return false;
+     }
+
+     public boolean changesTagLevel() {
+         return false;
+     }
+ };
+ /**
+ * Accepted form of a FONT "size" attribute: an optional sign (group 1)
+ * followed by a single digit (group 2). Used with matches(), so the whole
+ * attribute value has to have this form.
+ */
+ private static final Pattern PAT_FONT_SIZE = Pattern
+ .compile("([\\+\\-]?)([0-9])");
+
+ /**
+  * Explicitly marks this tag as a simple "block-level" element, which always generates whitespace.
+  * The action itself only returns true from start, end and changesTagLevel.
+  */
+ public static final TagAction TA_BLOCK_LEVEL = new TagAction() {
+
+     public boolean start(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName,
+             final Attributes atts) {
+         // NOTE(review): true presumably asks the handler to close the current block -- confirm against TagAction
+         return true;
+     }
+
+     public boolean end(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName) {
+         return true;
+     }
+
+     public boolean changesTagLevel() {
+         return true;
+     }
+ };
+
+ /**
+  * Special TagAction for the <code>&lt;FONT&gt;</code> tag, which keeps track of the
+  * absolute and relative font size.
+  *
+  * Every start pushes exactly one entry onto the front of the handler's
+  * fontSizeStack and every end removes the front entry again. The entry is
+  * null when the element has no "size" attribute or the attribute does not
+  * match {@link #PAT_FONT_SIZE}.
+  */
+ public static final TagAction TA_FONT = new TagAction() {
+
+     public boolean start(final BoilerpipeHTMLContentHandler instance,
+             final String localName, final String qName,
+             final Attributes atts) {
+         // stays null unless a usable size can be derived from the attribute
+         Integer newSize = null;
+
+         final String sizeAttr = atts.getValue("size");
+         final Matcher m = (sizeAttr == null) ? null : PAT_FONT_SIZE
+                 .matcher(sizeAttr);
+         if (m != null && m.matches()) {
+             final String sign = m.group(1);
+             final int amount = Integer.parseInt(m.group(2));
+             if (sign.length() == 0) {
+                 // absolute
+                 newSize = amount;
+             } else {
+                 // relative to the nearest enclosing FONT with a known size,
+                 // or to 3 if there is none
+                 int base = 3;
+                 for (Integer enclosing : instance.fontSizeStack) {
+                     if (enclosing != null) {
+                         base = enclosing;
+                         break;
+                     }
+                 }
+                 newSize = (sign.charAt(0) == '+') ? base + amount : base
+                         - amount;
+             }
+         }
+
+         instance.fontSizeStack.add(0, newSize);
+         return false;
+     }
+
+     public boolean end(final BoilerpipeHTMLContentHandler instance,
+             final String localName, final String qName) {
+         instance.fontSizeStack.removeFirst();
+         return false;
+     }
+
+     public boolean changesTagLevel() {
+         return false;
+     }
+ };
+
+ /**
+  * {@link CommonTagActions} for inline elements, which triggers some {@link LabelAction} on the generated
+  * {@link TextBlock}. Like {@link #TA_INLINE_WHITESPACE}, but start additionally
+  * registers the label action with the handler.
+  */
+ public static final class InlineTagLabelAction implements TagAction {
+
+     private final LabelAction labelAction;
+
+     public InlineTagLabelAction(final LabelAction action) {
+         labelAction = action;
+     }
+
+     public boolean start(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName,
+             final Attributes atts) {
+         handler.addWhitespaceIfNecessary();
+         handler.addLabelAction(labelAction);
+         return false;
+     }
+
+     public boolean end(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName) {
+         handler.addWhitespaceIfNecessary();
+         return false;
+     }
+
+     public boolean changesTagLevel() {
+         return false;
+     }
+ }
+
+ /**
+  * {@link CommonTagActions} for block-level elements, which triggers some {@link LabelAction} on the generated
+  * {@link TextBlock}. Like {@link #TA_BLOCK_LEVEL}, but start additionally
+  * registers the label action with the handler.
+  */
+ public static final class BlockTagLabelAction implements TagAction {
+
+     private final LabelAction labelAction;
+
+     public BlockTagLabelAction(final LabelAction action) {
+         labelAction = action;
+     }
+
+     public boolean start(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName,
+             final Attributes atts) {
+         handler.addLabelAction(labelAction);
+         return true;
+     }
+
+     public boolean end(BoilerpipeHTMLContentHandler handler,
+             final String localName, final String qName) {
+         return true;
+     }
+
+     public boolean changesTagLevel() {
+         return true;
+     }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java b/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java
new file mode 100644
index 0000000..cf48dac
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java
@@ -0,0 +1,86 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import de.l3s.boilerpipe.labels.DefaultLabels;
+import de.l3s.boilerpipe.labels.LabelAction;
+
+
+/**
+ * Default {@link TagAction}s. Seem to work well.
+ *
+ * The constructor registers one action per upper-case element name; elements
+ * that share an action are registered in a loop, in the order listed.
+ *
+ * @see TagActionMap
+ */
+public class DefaultTagActionMap extends TagActionMap {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Shared instance holding the default registrations. */
+    public static final TagActionMap INSTANCE = new DefaultTagActionMap();
+
+    protected DefaultTagActionMap() {
+        final String[] ignorable = { "STYLE", "SCRIPT", "OPTION", "OBJECT",
+                "EMBED", "APPLET", "LINK" };
+        for (final String tag : ignorable) {
+            setTagAction(tag, CommonTagActions.TA_IGNORABLE_ELEMENT);
+        }
+
+        setTagAction("A", CommonTagActions.TA_ANCHOR_TEXT);
+        setTagAction("BODY", CommonTagActions.TA_BODY);
+
+        final String[] inlineNoWhitespace = {
+                "STRIKE", "U", "B", "I", "EM", "STRONG", "SPAN",
+                // New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
+                "SUP",
+                // New in 1.2
+                "CODE", "TT", "SUB", "VAR" };
+        for (final String tag : inlineNoWhitespace) {
+            setTagAction(tag, CommonTagActions.TA_INLINE_NO_WHITESPACE);
+        }
+
+        final String[] inlineWhitespace = { "ABBR", "ACRONYM" };
+        for (final String tag : inlineWhitespace) {
+            setTagAction(tag, CommonTagActions.TA_INLINE_WHITESPACE);
+        }
+
+        setTagAction("FONT", CommonTagActions.TA_INLINE_NO_WHITESPACE); // could also use TA_FONT
+
+        // added in 1.1.1
+        setTagAction("NOSCRIPT", CommonTagActions.TA_IGNORABLE_ELEMENT);
+
+        // New in 1.3
+        final LabelAction listItem = new LabelAction(DefaultLabels.LI);
+        setTagAction("LI", new CommonTagActions.BlockTagLabelAction(listItem));
+
+        final LabelAction heading1 = new LabelAction(DefaultLabels.H1,
+                DefaultLabels.HEADING);
+        setTagAction("H1", new CommonTagActions.BlockTagLabelAction(heading1));
+
+        final LabelAction heading2 = new LabelAction(DefaultLabels.H2,
+                DefaultLabels.HEADING);
+        setTagAction("H2", new CommonTagActions.BlockTagLabelAction(heading2));
+
+        final LabelAction heading3 = new LabelAction(DefaultLabels.H3,
+                DefaultLabels.HEADING);
+        setTagAction("H3", new CommonTagActions.BlockTagLabelAction(heading3));
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java
new file mode 100644
index 0000000..9cf2d87
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java
@@ -0,0 +1,41 @@
+package de.l3s.boilerpipe.sax;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+
+import org.xml.sax.InputSource;
+
+/**
+ * An {@link InputSourceable} for {@link HTMLFetcher}: the raw bytes of an HTML
+ * document together with the {@link Charset} they are encoded in.
+ *
+ * @author Christian Kohlschütter
+ */
+public class HTMLDocument implements InputSourceable {
+    private final byte[] data;
+    private final Charset charset;
+
+    /**
+     * @param data
+     *            The encoded document; the array is stored as passed, not copied.
+     * @param charset
+     *            The charset the bytes are encoded in.
+     */
+    public HTMLDocument(final byte[] data, final Charset charset) {
+        this.charset = charset;
+        this.data = data;
+    }
+
+    /**
+     * Creates a document from text; the text is encoded as UTF-8.
+     *
+     * @param data
+     *            The HTML text.
+     */
+    public HTMLDocument(final String data) {
+        final Charset utf8 = Charset.forName("utf-8");
+        this.data = data.getBytes(utf8);
+        this.charset = utf8;
+    }
+
+    /** @return the charset of {@link #getData()} */
+    public Charset getCharset() {
+        return this.charset;
+    }
+
+    /** @return the internal byte array itself (no copy is made) */
+    public byte[] getData() {
+        return this.data;
+    }
+
+    /**
+     * @return a new {@link InputSource} that reads the bytes from memory and
+     *         has its encoding set to the name of the charset
+     */
+    public InputSource toInputSource() {
+        final ByteArrayInputStream bytes = new ByteArrayInputStream(data);
+        final InputSource source = new InputSource(bytes);
+        source.setEncoding(charset.name());
+        return source;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java
new file mode 100644
index 0000000..2c2e0c4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java
@@ -0,0 +1,79 @@
+package de.l3s.boilerpipe.sax;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * A very simple HTTP/HTML fetcher, really just for demo purposes.
+ *
+ * @author Christian Kohlschütter
+ */
+public class HTMLFetcher {
+    private HTMLFetcher() {
+    }
+
+    /** Extracts the value of a trailing "charset=" parameter from a Content-Type header. */
+    private static final Pattern PAT_CHARSET = Pattern
+            .compile("charset=([^; ]+)$");
+
+    /**
+     * Fetches the document at the given URL, using {@link URLConnection}.
+     *
+     * Only responses whose content type is "text/html" (optionally followed
+     * by parameters) are accepted. The charset is taken from the content
+     * type's "charset=" parameter; if there is none, or it names a charset
+     * that is illegal or unsupported, Cp1252 is used. A "gzip" content
+     * encoding is decoded transparently; any other content encoding is
+     * reported on System.err and the body is returned as received.
+     *
+     * @param url
+     *            The location to fetch.
+     * @return the document at the given URL
+     * @throws IOException
+     *             if the content type is missing or not text/html, or if
+     *             reading the response fails
+     */
+    public static HTMLDocument fetch(final URL url) throws IOException {
+        final URLConnection conn = url.openConnection();
+        final String ct = conn.getContentType();
+
+        if (ct == null
+                || !(ct.equals("text/html") || ct.startsWith("text/html;"))) {
+            throw new IOException("Unsupported content type: " + ct);
+        }
+
+        // ct is known to be non-null from here on
+        Charset cs = Charset.forName("Cp1252");
+        final Matcher m = PAT_CHARSET.matcher(ct);
+        if (m.find()) {
+            cs = charsetOrDefault(m.group(1), cs);
+        }
+
+        InputStream in = conn.getInputStream();
+        try {
+            final String encoding = conn.getContentEncoding();
+            if (encoding != null) {
+                if ("gzip".equalsIgnoreCase(encoding)) {
+                    in = new GZIPInputStream(in);
+                } else {
+                    System.err.println("WARN: unsupported Content-Encoding: "
+                            + encoding);
+                }
+            }
+
+            final ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            final byte[] buf = new byte[4096];
+            int r;
+            while ((r = in.read(buf)) != -1) {
+                bos.write(buf, 0, r);
+            }
+
+            return new HTMLDocument(bos.toByteArray(), cs);
+        } finally {
+            // Release the stream even when decoding or reading fails.
+            in.close();
+        }
+    }
+
+    /**
+     * Resolves a charset name taken from a header, falling back to the given
+     * default when the name cannot be used.
+     */
+    private static Charset charsetOrDefault(final String headerValue,
+            final Charset fallback) {
+        String name = headerValue;
+        // Parameter values may be sent as a quoted string, e.g. charset="utf-8".
+        final int len = name.length();
+        if (len > 1 && name.charAt(0) == '"' && name.charAt(len - 1) == '"') {
+            name = name.substring(1, len - 1);
+        }
+        try {
+            return Charset.forName(name);
+        } catch (IllegalArgumentException e) {
+            // Covers both UnsupportedCharsetException and
+            // IllegalCharsetNameException: keep default
+            return fallback;
+        }
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java
new file mode 100644
index 0000000..4a300c3
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java
@@ -0,0 +1,530 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URL;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Highlights text blocks in an HTML document that have been marked as "content"
+ * in the corresponding {@link TextDocument}.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class HTMLHighlighter {
+
+ /**
+  * Optional output filter: maps an element's qualified name to the set of
+  * attribute names that may be written for it. Elements without an entry
+  * are not written at all. null (the default) disables filtering.
+  */
+ private Map<String, Set<String>> tagWhitelist = null;
+
+ /**
+  * Creates a new {@link HTMLHighlighter}, which is set-up to return the full
+  * HTML text, with the extracted text portion highlighted.
+  *
+  * @return a new instance that keeps the default settings
+  */
+ public static HTMLHighlighter newHighlightingInstance() {
+     final boolean extractHTML = false;
+     return new HTMLHighlighter(extractHTML);
+ }
+
+ /**
+  * Creates a new {@link HTMLHighlighter}, which is set-up to return only the
+  * extracted HTML text, including enclosed markup.
+  *
+  * @return a new instance configured for "highlight only" output
+  */
+ public static HTMLHighlighter newExtractingInstance() {
+     final boolean extractHTML = true;
+     return new HTMLHighlighter(extractHTML);
+ }
+
+ /**
+  * @param extractHTML
+  *            if true, switches to "highlight only" output and replaces the
+  *            stylesheet and the pre/post highlight strings; if false, the
+  *            field defaults stay in effect.
+  */
+ private HTMLHighlighter(final boolean extractHTML) {
+     if (!extractHTML) {
+         return;
+     }
+
+     setOutputHighlightOnly(true);
+     // NOTE(review): this stylesheet consists of two newlines only; markup may
+     // have been lost from the literal -- confirm against the upstream source.
+     setExtraStyleSheet("\n\n");
+     setPreHighlight("");
+     setPostHighlight("");
+ }
+
+ /**
+  * Processes the given {@link TextDocument} and the original HTML text (as a
+  * String). The text is wrapped in an {@link InputSource} and passed on to
+  * {@link #process(TextDocument, InputSource)}.
+  *
+  * @param doc
+  *            The processed {@link TextDocument}.
+  * @param origHTML
+  *            The original HTML document.
+  * @return The highlighted HTML.
+  * @throws BoilerpipeProcessingException
+  */
+ public String process(final TextDocument doc, final String origHTML)
+         throws BoilerpipeProcessingException {
+     final StringReader reader = new StringReader(origHTML);
+     final InputSource source = new InputSource(reader);
+     return process(doc, source);
+ }
+
+ /**
+  * Processes the given {@link TextDocument} and the original HTML text (as
+  * an {@link InputSource}).
+  *
+  * In "highlight only" mode the generated markup is cleaned up afterwards:
+  * every match of PAT_TAG_NO_TEXT is removed and a match of PAT_SUPER_TAG is
+  * replaced by its first group. Both steps are repeated until neither
+  * pattern matches any more.
+  *
+  * @param doc
+  *            The processed {@link TextDocument}.
+  * @param is
+  *            The original HTML document.
+  * @return The highlighted HTML.
+  * @throws BoilerpipeProcessingException
+  *             if the HTML cannot be read or parsed
+  */
+ public String process(final TextDocument doc, final InputSource is)
+         throws BoilerpipeProcessingException {
+     final Implementation implementation = new Implementation();
+     implementation.process(doc, is);
+
+     String html = implementation.html.toString();
+     if (outputHighlightOnly) {
+         boolean repeat = true;
+         while (repeat) {
+             repeat = false;
+
+             Matcher m = PAT_TAG_NO_TEXT.matcher(html);
+             if (m.find()) {
+                 repeat = true;
+                 html = m.replaceAll("");
+             }
+
+             m = PAT_SUPER_TAG.matcher(html);
+             if (m.find()) {
+                 repeat = true;
+                 // The group is document text, not a replacement template:
+                 // quote it so that '$' and '\' in the HTML are kept literally
+                 // instead of being read as group references or escapes.
+                 html = m.replaceAll(Matcher.quoteReplacement(m.group(1)));
+             }
+         }
+     }
+
+     return html;
+ }
+
+ /**
+  * An element without content: a start tag that is immediately followed by
+  * an end tag. The "&lt;/" that begins the end tag is required; without it
+  * the pattern would also swallow elements that contain text.
+  */
+ private static final Pattern PAT_TAG_NO_TEXT = Pattern
+         .compile("<[^/][^>]*></[^>]*>");
+ /**
+  * Markup that, as a whole, is one outer element wrapped around further
+  * markup; group 1 captures what lies between the outer start tag and the
+  * outer end tag.
+  */
+ private static final Pattern PAT_SUPER_TAG = Pattern
+         .compile("^<[^>]*>(<.*?>)</[^>]*>$");
+
+ /**
+  * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
+  * retrieved HTML using the specified {@link BoilerpipeExtractor}.
+  *
+  * The fetched document is parsed twice from memory: once to build the
+  * {@link TextDocument} handed to the extractor, and once to generate the
+  * highlighted output.
+  *
+  * @param url
+  *            The location of the HTML document.
+  * @param extractor
+  *            The extractor that is run on the parsed document.
+  * @return The highlighted HTML.
+  * @throws IOException
+  * @throws BoilerpipeProcessingException
+  * @throws SAXException
+  */
+ public String process(final URL url, final BoilerpipeExtractor extractor)
+         throws IOException, BoilerpipeProcessingException, SAXException {
+     final HTMLDocument fetched = HTMLFetcher.fetch(url);
+
+     final BoilerpipeSAXInput input = new BoilerpipeSAXInput(
+             fetched.toInputSource());
+     final TextDocument doc = input.getTextDocument();
+     extractor.process(doc);
+
+     // a second, fresh InputSource over the same bytes for the highlighting pass
+     return process(doc, fetched.toInputSource());
+ }
+
+ /** If true, only markup belonging to highlighted content is returned. */
+ private boolean outputHighlightOnly = false;
+ /**
+  * Markup appended just before the HEAD element is closed.
+  *
+  * NOTE(review): the rule below was reconstructed from the accessor
+  * documentation ("marks text in class x-boilerpipe-mark1 as inline text
+  * with yellow background"); confirm the exact declarations against the
+  * upstream source.
+  */
+ private String extraStyleSheet = "\n<style type=\"text/css\">\n"
+         + ".x-boilerpipe-mark1 {" + " text-decoration:none; "
+         + "background-color: #ffff42 !important; "
+         + "color: black; " + "display:inline !important; "
+         + "visibility:visible !important; }\n" + "</style>\n";
+ /** Written before every highlighted run of text; the class name is the one startElement checks for. */
+ private String preHighlight = "<span class=\"x-boilerpipe-mark1\">";
+ /** Written after every highlighted run of text; closes the element opened by preHighlight. */
+ private String postHighlight = "</span>";
+
+ /**
+  * If true, only HTML enclosed within highlighted content will be returned.
+  *
+  * @return the current setting
+  */
+ public boolean isOutputHighlightOnly() {
+     return this.outputHighlightOnly;
+ }
+
+ /**
+ * Sets whether only HTML enclosed within highlighted content will be
+ * returned, or the whole HTML document.
+ *
+ * @param outputHighlightOnly
+ * true to return only highlighted content, false to return the whole document
+ */
+ public void setOutputHighlightOnly(boolean outputHighlightOnly) {
+ this.outputHighlightOnly = outputHighlightOnly;
+ }
+
+ /**
+  * Returns the extra stylesheet definition that will be inserted in the HEAD
+  * element.
+  *
+  * The intended default is a simple definition that marks text in
+  * class "x-boilerpipe-mark1" as inline text with yellow background.
+  *
+  * @return the markup that is inserted, as plain HTML
+  */
+ public String getExtraStyleSheet() {
+     return this.extraStyleSheet;
+ }
+
+ /**
+ * Sets the extra stylesheet definition that will be inserted in the HEAD
+ * element. The value is appended verbatim right before the HEAD element is
+ * closed.
+ *
+ * To disable, set it to the empty string: <code>""</code>
+ *
+ * @param extraStyleSheet
+ * Plain HTML
+ */
+ public void setExtraStyleSheet(String extraStyleSheet) {
+ this.extraStyleSheet = extraStyleSheet;
+ }
+
+ /**
+  * Returns the string that will be inserted before any highlighted HTML
+  * block.
+  *
+  * The intended default is
+  * <code>&lt;span class=&quot;x-boilerpipe-mark1&quot;&gt;</code>
+  *
+  * @return the markup written before highlighted text
+  */
+ public String getPreHighlight() {
+     return this.preHighlight;
+ }
+
+ /**
+ * Sets the string that will be inserted prior to any highlighted HTML
+ * block.
+ *
+ * To disable, set it to the empty string: <code>""</code>
+ *
+ * @param preHighlight
+ * Plain HTML, written as-is before each highlighted run of text
+ */
+ public void setPreHighlight(String preHighlight) {
+ this.preHighlight = preHighlight;
+ }
+
+ /**
+  * Returns the string that will be inserted after any highlighted HTML
+  * block.
+  *
+  * The intended default is <code>&lt;/span&gt;</code>
+  *
+  * @return the markup written after highlighted text
+  */
+ public String getPostHighlight() {
+     return this.postHighlight;
+ }
+
+ /**
+ * Sets the string that will be inserted after any highlighted HTML block.
+ *
+ * To disable, set it to the empty string: <code>""</code>
+ *
+ * @param postHighlight
+ * Plain HTML, written as-is after each highlighted run of text
+ */
+ public void setPostHighlight(String postHighlight) {
+ this.postHighlight = postHighlight;
+ }
+
+ /**
+ * Hooks that are run around the handling of particular elements while the
+ * HTML is re-serialized. All four hooks do nothing by default; the
+ * constants below override the ones they need.
+ */
+ private abstract static class TagAction {
+ // called at the beginning of startElement, before anything is written
+ void beforeStart(final Implementation instance, final String localName) {
+ }
+
+ // called at the end of startElement (from a finally block)
+ void afterStart(final Implementation instance, final String localName) {
+ }
+
+ // called at the beginning of endElement, before anything is written
+ void beforeEnd(final Implementation instance, final String localName) {
+ }
+
+ // called at the end of endElement (from a finally block)
+ void afterEnd(final Implementation instance, final String localName) {
+ }
+ }
+
+ /**
+  * Suppresses output for an element and everything inside it: the counter
+  * is raised before the start tag would be written and lowered only after
+  * the end tag has been handled.
+  */
+ private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
+     void beforeStart(final Implementation impl, final String localName) {
+         impl.inIgnorableElement += 1;
+     }
+
+     void afterEnd(final Implementation impl, final String localName) {
+         impl.inIgnorableElement -= 1;
+     }
+ };
+
+ /**
+  * Handles HEAD like an ignorable element, but additionally appends the
+  * highlighter's extra stylesheet to the output when the HEAD end tag is
+  * reached. The append is unconditional, i.e. it happens although the
+  * ignorable counter is still raised at that point.
+  */
+ private static final TagAction TA_HEAD = new TagAction() {
+     void beforeStart(final Implementation impl, final String localName) {
+         impl.inIgnorableElement += 1;
+     }
+
+     void beforeEnd(final Implementation impl, String localName) {
+         final String styleSheet = impl.hl.extraStyleSheet;
+         impl.html.append(styleSheet);
+     }
+
+     void afterEnd(final Implementation impl, final String localName) {
+         impl.inIgnorableElement -= 1;
+     }
+ };
+ /**
+  * Element-specific hooks, looked up by the element's local name in
+  * startElement and endElement. Keys are upper-case element names.
+  */
+ private static final Map<String, TagAction> TAG_ACTIONS = new HashMap<String, TagAction>();
+ static {
+     TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT);
+     TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT);
+     TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT);
+     TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT);
+     TAG_ACTIONS.put("OBJECT", TA_IGNORABLE_ELEMENT);
+     TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT);
+     TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT);
+     // NOTE: you might want to comment this out:
+     TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT);
+
+     TAG_ACTIONS.put("HEAD", TA_HEAD);
+ }
+
+ /**
+  * The SAX parser and content handler that re-serializes the HTML. It
+  * writes start tags, end tags and text to {@link #html}, wrapping text that
+  * belongs to content blocks in the highlighter's pre/post highlight strings.
+  */
+ private final class Implementation extends AbstractSAXParser implements
+         ContentHandler {
+     /** The generated output. */
+     StringBuilder html = new StringBuilder();
+
+     /** Greater than zero while inside an element whose output is suppressed. */
+     private int inIgnorableElement = 0;
+     /** Number of characters() callbacks seen so far, including suppressed ones. */
+     private int characterElementIdx = 0;
+     /** Union of the text element indexes of all content blocks. */
+     private final BitSet contentBitSet = new BitSet();
+     private final HTMLHighlighter hl = HTMLHighlighter.this;
+
+     Implementation() {
+         super(new HTMLConfiguration());
+         setContentHandler(this);
+     }
+
+     /**
+      * Collects the text elements of all content blocks of the document and
+      * then parses the HTML, filling {@link #html}.
+      *
+      * @throws BoilerpipeProcessingException
+      *             wrapping any SAXException or IOException raised by the parser
+      */
+     void process(final TextDocument doc, final InputSource is)
+             throws BoilerpipeProcessingException {
+         for (TextBlock block : doc.getTextBlocks()) {
+             if (block.isContent()) {
+                 final BitSet bs = block.getContainedTextElements();
+                 if (bs != null) {
+                     contentBitSet.or(bs);
+                 }
+             }
+         }
+
+         try {
+             parse(is);
+         } catch (SAXException e) {
+             throw new BoilerpipeProcessingException(e);
+         } catch (IOException e) {
+             throw new BoilerpipeProcessingException(e);
+         }
+     }
+
+     public void endDocument() throws SAXException {
+     }
+
+     public void endPrefixMapping(String prefix) throws SAXException {
+     }
+
+     public void ignorableWhitespace(char[] ch, int start, int length)
+             throws SAXException {
+     }
+
+     public void processingInstruction(String target, String data)
+             throws SAXException {
+     }
+
+     public void setDocumentLocator(Locator locator) {
+     }
+
+     public void skippedEntity(String name) throws SAXException {
+     }
+
+     public void startDocument() throws SAXException {
+     }
+
+     /**
+      * Writes the start tag with its attributes, unless output is currently
+      * suppressed or the element is not on the tag whitelist.
+      */
+     public void startElement(String uri, String localName, String qName,
+             Attributes atts) throws SAXException {
+         TagAction ta = TAG_ACTIONS.get(localName);
+         if (ta != null) {
+             ta.beforeStart(this, localName);
+         }
+
+         // HACK: remove existing highlight
+         boolean ignoreAttrs = false;
+         if ("SPAN".equalsIgnoreCase(localName)) {
+             String classVal = atts.getValue("class");
+             if ("x-boilerpipe-mark1".equals(classVal)) {
+                 ignoreAttrs = true;
+             }
+         }
+
+         try {
+             if (inIgnorableElement == 0) {
+                 final Set<String> whitelistAttributes;
+                 if (tagWhitelist == null) {
+                     whitelistAttributes = null;
+                 } else {
+                     whitelistAttributes = tagWhitelist.get(qName);
+                     if (whitelistAttributes == null) {
+                         // skip; the finally block still runs afterStart
+                         return;
+                     }
+                 }
+
+                 html.append('<');
+                 html.append(qName);
+                 if (!ignoreAttrs) {
+                     final int numAtts = atts.getLength();
+                     for (int i = 0; i < numAtts; i++) {
+                         final String attr = atts.getQName(i);
+
+                         if (whitelistAttributes != null
+                                 && !whitelistAttributes.contains(attr)) {
+                             // skip
+                             continue;
+                         }
+
+                         final String value = atts.getValue(i);
+                         html.append(' ');
+                         html.append(attr);
+                         html.append("=\"");
+                         html.append(xmlEncode(value));
+                         html.append("\"");
+                     }
+                 }
+                 html.append('>');
+             }
+         } finally {
+             if (ta != null) {
+                 ta.afterStart(this, localName);
+             }
+         }
+     }
+
+     /**
+      * Writes the end tag, unless output is currently suppressed or the
+      * element is not on the tag whitelist.
+      */
+     public void endElement(String uri, String localName, String qName)
+             throws SAXException {
+         TagAction ta = TAG_ACTIONS.get(localName);
+         if (ta != null) {
+             ta.beforeEnd(this, localName);
+         }
+
+         try {
+             if (inIgnorableElement == 0) {
+                 if (tagWhitelist != null
+                         && !tagWhitelist.containsKey(qName)) {
+                     // skip; the finally block still runs afterEnd
+                     return;
+                 }
+
+                 // an end tag is "</" + name + ">"
+                 html.append("</");
+                 html.append(qName);
+                 html.append('>');
+             }
+         } finally {
+             if (ta != null) {
+                 ta.afterEnd(this, localName);
+             }
+         }
+     }
+
+     /**
+      * Writes a run of text, wrapped in the pre/post highlight strings if
+      * its index is set in the content bit set. In "highlight only" mode,
+      * text that is not highlighted is dropped.
+      */
+     public void characters(char[] ch, int start, int length)
+             throws SAXException {
+         // counted before the ignorable check: suppressed text still takes up an index
+         characterElementIdx++;
+         if (inIgnorableElement == 0) {
+
+             boolean highlight = contentBitSet.get(characterElementIdx);
+
+             if (!highlight && outputHighlightOnly) {
+                 return;
+             }
+
+             if (highlight) {
+                 html.append(preHighlight);
+             }
+             html.append(xmlEncode(String.valueOf(ch, start, length)));
+             if (highlight) {
+                 html.append(postHighlight);
+             }
+         }
+     }
+
+     public void startPrefixMapping(String prefix, String uri)
+             throws SAXException {
+     }
+
+ }
+
+ /**
+  * Escapes the characters that have a special meaning in markup so that the
+  * text can be written as element content or inside a double-quoted
+  * attribute value.
+  *
+  * @param in
+  *            The text to escape; may be null.
+  * @return The text with &lt;, &gt;, &amp; and the double quote replaced by
+  *         their entity references, or the empty string if in is null.
+  */
+ private static String xmlEncode(final String in) {
+     if (in == null) {
+         return "";
+     }
+     final StringBuilder out = new StringBuilder(in.length());
+
+     for (int i = 0; i < in.length(); i++) {
+         final char c = in.charAt(i);
+         switch (c) {
+         case '<':
+             out.append("&lt;");
+             break;
+         case '>':
+             out.append("&gt;");
+             break;
+         case '&':
+             out.append("&amp;");
+             break;
+         case '"':
+             out.append("&quot;");
+             break;
+         default:
+             out.append(c);
+         }
+     }
+
+     return out.toString();
+ }
+
+ /**
+  * Returns the tag whitelist.
+  *
+  * @return a map from an element's qualified name to the attribute names
+  *         that may be written for it, or null if output is not filtered.
+  *         The internal map itself is returned, not a copy.
+  */
+ public Map<String, Set<String>> getTagWhitelist() {
+     return tagWhitelist;
+ }
+
+ /**
+  * Sets the tag whitelist. When a whitelist is set, only elements that
+  * have an entry are written, and of their attributes only those contained
+  * in the entry's set.
+  *
+  * @param tagWhitelist
+  *            a map from an element's qualified name to the attribute names
+  *            allowed for it, or null to disable filtering. The map is
+  *            stored as passed, not copied.
+  */
+ public void setTagWhitelist(Map<String, Set<String>> tagWhitelist) {
+     this.tagWhitelist = tagWhitelist;
+ }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java b/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java
new file mode 100644
index 0000000..3a9bcbe
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java
@@ -0,0 +1,277 @@
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.Image;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Extracts the images that are enclosed by extracted content.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ImageExtractor {
+ // The single shared instance; also returned by getInstance().
+ public static final ImageExtractor INSTANCE = new ImageExtractor();
+
+ /**
+ * Returns the singleton instance of {@link ImageExtractor}.
+ *
+ * @return the singleton instance of {@link ImageExtractor}.
+ */
+ public static ImageExtractor getInstance() {
+ return INSTANCE;
+ }
+
+ // Private: instances are only created inside this class (see INSTANCE).
+ private ImageExtractor() {
+ }
+
+ /**
+  * Extracts the images enclosed by the content of an already processed
+  * {@link TextDocument}, reading the original HTML from a String.
+  *
+  * @param doc
+  *            The processed {@link TextDocument}.
+  * @param origHTML
+  *            The original HTML document.
+  * @return A List of enclosed {@link Image}s
+  * @throws BoilerpipeProcessingException
+  *             if the HTML cannot be parsed
+  */
+ public List process(final TextDocument doc,
+         final String origHTML) throws BoilerpipeProcessingException {
+     final StringReader htmlReader = new StringReader(origHTML);
+     final InputSource htmlSource = new InputSource(htmlReader);
+     return process(doc, htmlSource);
+ }
+
+ /**
+  * Extracts the images enclosed by the content of an already processed
+  * {@link TextDocument}, reading the original HTML from an
+  * {@link InputSource}. A fresh parser is created for every call.
+  *
+  * @param doc
+  *            The processed {@link TextDocument}.
+  * @param is
+  *            The original HTML document.
+  * @return A List of enclosed {@link Image}s
+  * @throws BoilerpipeProcessingException
+  *             if the HTML cannot be parsed
+  */
+ public List process(final TextDocument doc,
+         final InputSource is) throws BoilerpipeProcessingException {
+     final Implementation parser = new Implementation();
+     parser.process(doc, is);
+     return parser.linksHighlight;
+ }
+
+ /**
+  * Fetches the given {@link URL} using {@link HTMLFetcher}, runs the
+  * specified {@link BoilerpipeExtractor} on the retrieved HTML and returns
+  * the images enclosed by the extracted content.
+  *
+  * @param url
+  *            The URL of the document to fetch.
+  * @param extractor
+  *            The extractor that classifies the document's text blocks.
+  * @return A List of enclosed {@link Image}s
+  * @throws IOException
+  * @throws BoilerpipeProcessingException
+  * @throws SAXException
+  */
+ public List process(final URL url, final BoilerpipeExtractor extractor)
+         throws IOException, BoilerpipeProcessingException, SAXException {
+     final HTMLDocument fetched = HTMLFetcher.fetch(url);
+
+     final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(
+             fetched.toInputSource());
+     final TextDocument doc = saxInput.getTextDocument();
+     extractor.process(doc);
+
+     // The image pass gets its own InputSource, requested after extraction.
+     final InputSource imagePassSource = fetched.toInputSource();
+     return process(doc, imagePassSource);
+ }
+
+
+ private final class Implementation extends AbstractSAXParser implements
+ ContentHandler {
+ // Result list; the outer process(...) methods return it to the caller.
+ List linksHighlight = new ArrayList();
+ // Images found by startElement are added here first.
+ private List linksBuffer = new ArrayList();
+
+ // The handlers only act while this is 0.
+ // NOTE(review): presumably a nesting depth maintained by the ignorable-element
+ // tag action; the code that updates it is not visible in this listing.
+ private int inIgnorableElement = 0;
+ // Incremented once per characters() call and used as index into contentBitSet.
+ private int characterElementIdx = 0;
+ // Union of the contained text elements of all content blocks (filled in process()).
+ private final BitSet contentBitSet = new BitSet();
+
+ // When true, startElement collects IMG elements.
+ private boolean inHighlight = false;
+
+ // Configures the parser with a new HTMLConfiguration and registers this
+ // object as its own content handler.
+ Implementation() {
+ super(new HTMLConfiguration());
+ setContentHandler(this);
+ }
+
+ /**
+  * Merges the text element indexes of all content blocks of {@code doc}
+  * into {@code contentBitSet} and then parses the HTML input.
+  *
+  * @param doc the processed document whose content blocks are considered
+  * @param is the HTML input to parse
+  * @throws BoilerpipeProcessingException wrapping any {@link SAXException}
+  *         or {@link IOException} raised while parsing
+  */
+ void process(final TextDocument doc, final InputSource is)
+         throws BoilerpipeProcessingException {
+     for (final TextBlock block : doc.getTextBlocks()) {
+         if (!block.isContent()) {
+             continue;
+         }
+         final BitSet elements = block.getContainedTextElements();
+         if (elements == null) {
+             continue;
+         }
+         contentBitSet.or(elements);
+     }
+
+     try {
+         parse(is);
+     } catch (SAXException saxFailure) {
+         throw new BoilerpipeProcessingException(saxFailure);
+     } catch (IOException ioFailure) {
+         throw new BoilerpipeProcessingException(ioFailure);
+     }
+ }
+
+ // The following ContentHandler callbacks are intentionally empty: this
+ // handler takes no action on these events.
+ public void endDocument() throws SAXException {
+ }
+
+ public void endPrefixMapping(String prefix) throws SAXException {
+ }
+
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ }
+
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ }
+
+ public void setDocumentLocator(Locator locator) {
+ }
+
+ public void skippedEntity(String name) throws SAXException {
+ }
+
+ public void startDocument() throws SAXException {
+ }
+
+ /**
+  * Runs the tag action hooks registered for {@code localName} and, while
+  * inside highlighted content and outside ignorable elements, buffers an
+  * {@link Image} for every IMG element that has a non-empty src attribute.
+  */
+ public void startElement(String uri, String localName, String qName,
+         Attributes atts) throws SAXException {
+     final TagAction action = TAG_ACTIONS.get(localName);
+     if (action != null) {
+         action.beforeStart(this, localName);
+     }
+
+     try {
+         final boolean collectImage = inIgnorableElement == 0 && inHighlight
+                 && "IMG".equalsIgnoreCase(localName);
+         if (collectImage) {
+             final String src = atts.getValue("src");
+             if (src != null && src.length() > 0) {
+                 final String width = atts.getValue("width");
+                 final String height = atts.getValue("height");
+                 final String alt = atts.getValue("alt");
+                 linksBuffer.add(new Image(src, width, height, alt));
+             }
+         }
+     } finally {
+         // The after-hook must run even if collecting the image failed.
+         if (action != null) {
+             action.afterStart(this, localName);
+         }
+     }
+ }
+
+ /**
+  * Runs the before/after end hooks of the tag action registered for
+  * {@code localName}, if any. No other work is done at element end.
+  */
+ public void endElement(String uri, String localName, String qName)
+         throws SAXException {
+     final TagAction action = TAG_ACTIONS.get(localName);
+     if (action == null) {
+         return;
+     }
+     // Nothing happens between the two hooks; if beforeEnd throws,
+     // afterEnd is not reached.
+     action.beforeEnd(this, localName);
+     action.afterEnd(this, localName);
+ }
+
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ characterElementIdx++;
+ if (inIgnorableElement == 0) {
+
+ boolean highlight = contentBitSet.get(characterElementIdx);
+ if(!highlight) {
+ if(length == 0) {
+ return;
+ }
+ boolean justWhitespace = true;
+ for(int i=start;i TAG_ACTIONS = new HashMap();
+ static {
+ TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT);
+
+ TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT);
+ }
+
+ // Hooks invoked around element start/end events; every hook defaults to a
+ // no-op so that subclasses override only what they need.
+ private abstract static class TagAction {
+ // Called by startElement before the element itself is handled.
+ void beforeStart(final Implementation instance, final String localName) {
+ }
+
+ // Called by startElement after the element has been handled (from a finally block).
+ void afterStart(final Implementation instance, final String localName) {
+ }
+
+ // Called by endElement before the element end is handled.
+ void beforeEnd(final Implementation instance, final String localName) {
+ }
+
+ // Called by endElement after the element end has been handled (from a finally block).
+ void afterEnd(final Implementation instance, final String localName) {
+ }
+ }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java b/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java
new file mode 100644
index 0000000..ef8010e
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java
@@ -0,0 +1,12 @@
+package de.l3s.boilerpipe.sax;
+
+import org.xml.sax.InputSource;
+
+/**
+ * An InputSourceable can return an arbitrary number of new {@link InputSource}s for a given document.
+ *
+ * @author Christian Kohlschütter
+ */
+public interface InputSourceable {
+ /**
+  * Returns an {@link InputSource} for the underlying document.
+  *
+  * @return an {@link InputSource}; per the interface contract above, a new
+  *         one may be requested any number of times
+  */
+ InputSource toInputSource();
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java b/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java
new file mode 100644
index 0000000..e54a3da
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java
@@ -0,0 +1,105 @@
+package de.l3s.boilerpipe.sax;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+import de.l3s.boilerpipe.labels.LabelAction;
+
+/**
+ * Assigns labels for element CSS classes and ids to the corresponding
+ * {@link TextBlock}. CSS classes are prefixed by
+ * {@link DefaultLabels#MARKUP_PREFIX} followed by ".", and IDs are prefixed
+ * by {@link DefaultLabels#MARKUP_PREFIX} followed by "#". Runs of digits in
+ * class and id values are replaced by "#" before the labels are built.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MarkupTagAction implements TagAction {
+
+    private final boolean isBlockLevel;
+    // Labels of the elements that are currently open, outermost first.
+    private final LinkedList<List<String>> labelStack = new LinkedList<List<String>>();
+
+    /**
+     * @param isBlockLevel
+     *            the value returned by {@link #start}, {@link #end} and
+     *            {@link #changesTagLevel()}
+     */
+    public MarkupTagAction(final boolean isBlockLevel) {
+        this.isBlockLevel = isBlockLevel;
+    }
+
+    private static final Pattern PAT_NUM = Pattern.compile("[0-9]+");
+
+    /**
+     * Builds the labels for the opened element (tag name, CSS classes, id),
+     * combines them with the labels of all open ancestors, registers the
+     * result as a {@link LabelAction} on the handler and pushes the
+     * element's own labels onto the stack.
+     *
+     * @return the {@code isBlockLevel} flag given at construction
+     */
+    public boolean start(BoilerpipeHTMLContentHandler instance,
+            String localName, String qName, Attributes atts)
+            throws SAXException {
+        List<String> labels = new ArrayList<String>(5);
+        labels.add(DefaultLabels.MARKUP_PREFIX + localName);
+
+        String classVal = atts.getValue("class");
+
+        if (classVal != null && classVal.length() > 0) {
+            classVal = PAT_NUM.matcher(classVal).replaceAll("#");
+            classVal = classVal.trim();
+            String[] vals = classVal.split("[ ]+");
+            // One label for the complete class attribute ...
+            labels.add(DefaultLabels.MARKUP_PREFIX + "."
+                    + classVal.replace(' ', '.'));
+            // ... and, if there are several classes, one label per class.
+            if (vals.length > 1) {
+                for (String s : vals) {
+                    labels.add(DefaultLabels.MARKUP_PREFIX + "." + s);
+                }
+            }
+        }
+
+        String id = atts.getValue("id");
+        if (id != null && id.length() > 0) {
+            id = PAT_NUM.matcher(id).replaceAll("#");
+            labels.add(DefaultLabels.MARKUP_PREFIX + "#" + id);
+        }
+
+        Set<String> ancestors = getAncestorLabels();
+        // Each label contributes itself plus two entries per ancestor label.
+        List<String> labelsWithAncestors = new ArrayList<String>(
+                (2 * ancestors.size() + 1) * labels.size());
+
+        for (String l : labels) {
+            for (String an : ancestors) {
+                labelsWithAncestors.add(an);
+                labelsWithAncestors.add(an + " " + l);
+            }
+            labelsWithAncestors.add(l);
+        }
+
+        instance.addLabelAction(new LabelAction(labelsWithAncestors
+                .toArray(new String[labelsWithAncestors.size()])));
+
+        labelStack.add(labels);
+
+        return isBlockLevel;
+    }
+
+    /**
+     * Pops the labels of the element that is being closed.
+     *
+     * @return the {@code isBlockLevel} flag given at construction
+     */
+    public boolean end(BoilerpipeHTMLContentHandler instance, String localName,
+            String qName) throws SAXException {
+
+        labelStack.removeLast();
+        return isBlockLevel;
+    }
+
+    /**
+     * @return the {@code isBlockLevel} flag given at construction
+     */
+    public boolean changesTagLevel() {
+        return isBlockLevel;
+    }
+
+    // Collects the distinct labels of all elements currently on the stack.
+    private Set<String> getAncestorLabels() {
+        Set<String> set = new HashSet<String>();
+        for (List<String> labels : labelStack) {
+            if (labels == null) {
+                continue;
+            }
+            set.addAll(labels);
+        }
+        return set;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java b/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java
new file mode 100644
index 0000000..e6f1943
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.Image;
+import de.l3s.boilerpipe.document.Media;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.document.VimeoVideo;
+import de.l3s.boilerpipe.document.YoutubeVideo;
+import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
+import de.l3s.boilerpipe.sax.HTMLDocument;
+import de.l3s.boilerpipe.sax.HTMLFetcher;
+
+
+/**
+ * Extracts youtube and vimeo videos that are enclosed by extracted content.
+ *
+ * @author Christian Kohlschütter, manuel.codiga@gmail.com
+ */
+public final class MediaExtractor {
+
+ /** The single shared instance; also returned by {@link #getInstance()}. */
+ public static final MediaExtractor INSTANCE = new MediaExtractor();
+
+ /**
+ * @return the singleton instance of {@link MediaExtractor}.
+ */
+ public static MediaExtractor getInstance() {
+ return INSTANCE;
+ }
+
+
+
+ /**
+  * Extracts the media (images and embedded videos) enclosed by the content
+  * of an already processed {@link TextDocument}, reading the original HTML
+  * from a String.
+  *
+  * @param doc
+  *            The processed {@link TextDocument}.
+  * @param origHTML
+  *            The original HTML document.
+  * @return A List of the enclosed media
+  * @throws BoilerpipeProcessingException if an error occurs during extraction
+  */
+ public List process(final TextDocument doc, final String origHTML)
+         throws BoilerpipeProcessingException {
+     final StringReader htmlReader = new StringReader(origHTML);
+     final InputSource htmlSource = new InputSource(htmlReader);
+     return process(doc, htmlSource);
+ }
+
+ /**
+  * Extracts the media (images and embedded videos) enclosed by the content
+  * of an already processed {@link TextDocument}, reading the original HTML
+  * from an {@link InputSource}. A fresh parser is created for every call.
+  *
+  * @param doc
+  *            The processed {@link TextDocument}.
+  * @param is
+  *            The original HTML document.
+  * @return A List of the enclosed media
+  * @throws BoilerpipeProcessingException if the HTML cannot be parsed
+  */
+ public List process(final TextDocument doc, final InputSource is)
+         throws BoilerpipeProcessingException {
+     final Implementation parser = new Implementation();
+     parser.process(doc, is);
+     return parser.linksHighlight;
+ }
+
+ /**
+  * Fetches the given {@link URL} using {@link HTMLFetcher}, runs the
+  * specified {@link BoilerpipeExtractor} on the retrieved HTML and returns
+  * the media enclosed by the extracted content.
+  *
+  * @param url the url of the document to fetch
+  * @param extractor extractor to use
+  * @return A List of the enclosed media
+  * @throws IOException
+  * @throws BoilerpipeProcessingException
+  * @throws SAXException
+  */
+ @SuppressWarnings("javadoc")
+ public List process(final URL url, final BoilerpipeExtractor extractor)
+         throws IOException, BoilerpipeProcessingException, SAXException {
+     final HTMLDocument fetched = HTMLFetcher.fetch(url);
+
+     final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(
+             fetched.toInputSource());
+     final TextDocument doc = saxInput.getTextDocument();
+     extractor.process(doc);
+
+     // The media pass gets its own InputSource, requested after extraction.
+     final InputSource mediaPassSource = fetched.toInputSource();
+     return process(doc, mediaPassSource);
+ }
+
+ /**
+  * Parses the media (pictures, videos) out of an HTML document given as a
+  * String.
+  *
+  * @param doc the HTML text to parse the media out of
+  * @param extractor extractor to use
+  * @return list of extracted media (empty if no media is found), or
+  *         {@code null} if any exception occurs during extraction
+  */
+ public List process(String doc, final BoilerpipeExtractor extractor) {
+     final HTMLDocument htmlDoc = new HTMLDocument(doc);
+
+     try {
+         final TextDocument textDoc = new BoilerpipeSAXInput(
+                 htmlDoc.toInputSource()).getTextDocument();
+         extractor.process(textDoc);
+         final InputSource mediaPassSource = htmlDoc.toInputSource();
+         return process(textDoc, mediaPassSource);
+     } catch (Exception e) {
+         // Failures are reported to the caller as a null result.
+         return null;
+     }
+ }
+
+
+ private final class Implementation extends AbstractSAXParser implements
+ ContentHandler {
+ // Result list; the outer process(...) methods return it to the caller.
+ List linksHighlight = new ArrayList();
+ // Media found by startElement (videos, images) are added here first.
+ private List linksBuffer = new ArrayList();
+
+ // The handlers only act while this is 0.
+ // NOTE(review): presumably a nesting depth maintained by the ignorable-element
+ // tag action; the code that updates it is not visible in this listing.
+ private int inIgnorableElement = 0;
+ // Incremented once per characters() call and used as index into contentBitSet.
+ private int characterElementIdx = 0;
+ // Union of the contained text elements of all content blocks (filled in process()).
+ private final BitSet contentBitSet = new BitSet();
+
+ // When true, startElement collects IFRAME and IMG elements.
+ private boolean inHighlight = false;
+
+ // Configures the parser with a new HTMLConfiguration and registers this
+ // object as its own content handler.
+ Implementation() {
+ super(new HTMLConfiguration());
+ setContentHandler(this);
+ }
+
+ /**
+  * Merges the text element indexes of all content blocks of {@code doc}
+  * into {@code contentBitSet} and then parses the HTML input.
+  *
+  * @param doc the processed document whose content blocks are considered
+  * @param is the HTML input to parse
+  * @throws BoilerpipeProcessingException wrapping any {@link SAXException}
+  *         or {@link IOException} raised while parsing
+  */
+ void process(final TextDocument doc, final InputSource is)
+         throws BoilerpipeProcessingException {
+     for (final TextBlock block : doc.getTextBlocks()) {
+         if (!block.isContent()) {
+             continue;
+         }
+         final BitSet elements = block.getContainedTextElements();
+         if (elements == null) {
+             continue;
+         }
+         contentBitSet.or(elements);
+     }
+
+     try {
+         parse(is);
+     } catch (SAXException saxFailure) {
+         throw new BoilerpipeProcessingException(saxFailure);
+     } catch (IOException ioFailure) {
+         throw new BoilerpipeProcessingException(ioFailure);
+     }
+ }
+
+ // The following ContentHandler callbacks are intentionally empty: this
+ // handler takes no action on these events.
+ public void endDocument() throws SAXException {
+ }
+
+ public void endPrefixMapping(String prefix) throws SAXException {
+ }
+
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ }
+
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ }
+
+ public void setDocumentLocator(Locator locator) {
+ }
+
+ public void skippedEntity(String name) throws SAXException {
+ }
+
+ public void startDocument() throws SAXException {
+ }
+
+ /**
+  * Runs the tag action hooks registered for {@code localName} and, while
+  * inside highlighted content and outside ignorable elements, buffers the
+  * media found: YouTube/Vimeo players embedded through IFRAME elements and
+  * images referenced by IMG elements. Elements without a usable src
+  * attribute are skipped instead of raising an exception.
+  */
+ public void startElement(String uri, String localName, String qName,
+         Attributes atts) throws SAXException {
+     TagAction ta = TAG_ACTIONS.get(localName);
+     if (ta != null) {
+         ta.beforeStart(this, localName);
+     }
+
+     try {
+         if (inIgnorableElement == 0 && inHighlight) {
+             if ("IFRAME".equalsIgnoreCase(localName)) {
+                 collectEmbeddedVideo(atts.getValue("src"));
+             }
+             if ("IMG".equalsIgnoreCase(localName)) {
+                 collectImage(atts);
+             }
+         }
+     } finally {
+         if (ta != null) {
+             ta.afterStart(this, localName);
+         }
+     }
+ }
+
+ // Buffers a YoutubeVideo and/or VimeoVideo when the IFRAME src is a
+ // recognised player URL.
+ private void collectEmbeddedVideo(final String rawSrc) {
+     if (rawSrc == null) {
+         // IFRAME without a src attribute: nothing to extract.
+         return;
+     }
+     // Remove backslash-escaped quotes from the attribute value.
+     final String src = rawSrc.replaceAll("\\\\\"", "");
+     if (src.length() == 0) {
+         return;
+     }
+
+     if (src.contains("youtube.com/embed/")) {
+         final String videoId = lastPathSegment(src);
+         if (videoId != null) {
+             linksBuffer.add(new YoutubeVideo(
+                     "http://www.youtube.com/watch?v=" + videoId, src));
+         }
+     }
+
+     if (src.contains("player.vimeo.com")) {
+         final String videoId = lastPathSegment(src);
+         if (videoId != null) {
+             linksBuffer.add(new VimeoVideo("http://vimeo.com/" + videoId, src));
+         }
+     }
+ }
+
+ // Returns the last "/"-separated segment of the URL's path, or null when
+ // the URL is malformed or its path has no segments (e.g. "/").
+ private String lastPathSegment(final String src) {
+     try {
+         final String[] pathParts = new URL(src).getPath().split("/");
+         return pathParts.length == 0 ? null : pathParts[pathParts.length - 1];
+     } catch (MalformedURLException ignored) {
+         // Best effort: an unparsable player URL is skipped.
+         return null;
+     }
+ }
+
+ // Buffers an Image for an IMG element whose src is non-empty and a
+ // syntactically valid URI.
+ private void collectImage(final Attributes atts) {
+     final String src = atts.getValue("src");
+     if (src == null || src.length() == 0) {
+         return;
+     }
+     try {
+         // Constructed only to validate the syntax of src.
+         new URI(src);
+     } catch (URISyntaxException ignored) {
+         // Best effort: images with an invalid src are skipped.
+         return;
+     }
+     linksBuffer.add(new Image(src, atts.getValue("width"),
+             atts.getValue("height"), atts.getValue("alt")));
+ }
+
+ /**
+  * Runs the before/after end hooks of the tag action registered for
+  * {@code localName}, if any. No other work is done at element end.
+  */
+ public void endElement(String uri, String localName, String qName)
+         throws SAXException {
+     final TagAction action = TAG_ACTIONS.get(localName);
+     if (action == null) {
+         return;
+     }
+     // Nothing happens between the two hooks; if beforeEnd throws,
+     // afterEnd is not reached.
+     action.beforeEnd(this, localName);
+     action.afterEnd(this, localName);
+ }
+
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ characterElementIdx++;
+ if (inIgnorableElement == 0) {
+
+ boolean highlight = contentBitSet.get(characterElementIdx);
+ if(!highlight) {
+ if(length == 0) {
+ return;
+ }
+ boolean justWhitespace = true;
+ for(int i=start;i TAG_ACTIONS = new HashMap();
+ static {
+ TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT);
+ TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT);
+
+ TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT);
+ }
+
+ // Hooks invoked around element start/end events; every hook defaults to a
+ // no-op so that subclasses override only what they need.
+ private abstract static class TagAction {
+ // Called by startElement before the element itself is handled.
+ void beforeStart(final Implementation instance, final String localName) {
+ }
+
+ // Called by startElement after the element has been handled (from a finally block).
+ void afterStart(final Implementation instance, final String localName) {
+ }
+
+ // Called by endElement before the element end is handled.
+ void beforeEnd(final Implementation instance, final String localName) {
+ }
+
+ // Called by endElement after the element end has been handled (from a finally block).
+ void afterEnd(final Implementation instance, final String localName) {
+ }
+ }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/TagAction.java b/src/main/java/de/l3s/boilerpipe/sax/TagAction.java
new file mode 100644
index 0000000..3ee8dcf
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/TagAction.java
@@ -0,0 +1,39 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Defines an action that is to be performed whenever a particular tag occurs
+ * during HTML parsing.
+ *
+ * @author Christian Kohlschütter
+ */
+public interface TagAction {
+
+ // Action for the start of a tag; receives the handler, the tag names and
+ // the tag's attributes.
+ // NOTE(review): the meaning of the boolean result is not defined in this
+ // file; MarkupTagAction returns its block-level flag here. Confirm
+ // against BoilerpipeHTMLContentHandler.
+ boolean start(final BoilerpipeHTMLContentHandler instance,
+ final String localName, final String qName, final Attributes atts)
+ throws SAXException;
+
+ // Action for the end of a tag; receives the handler and the tag names.
+ // NOTE(review): same open question about the boolean result as for start().
+ boolean end(final BoilerpipeHTMLContentHandler instance,
+ final String localName, final String qName) throws SAXException;
+
+ // NOTE(review): presumably tells the handler whether this action affects
+ // the tag nesting level; confirm against the caller.
+ boolean changesTagLevel();
+}
\ No newline at end of file
diff --git a/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java b/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java
new file mode 100644
index 0000000..74ab275
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java
@@ -0,0 +1,60 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.util.HashMap;
+
+/**
+ * Base class for defining a set of {@link TagAction}s that are to be used for the
+ * HTML parsing process. Keys are tag names, values the actions registered for them.
+ *
+ * @see DefaultTagActionMap
+ * @author Christian Kohlschütter
+ */
+public abstract class TagActionMap extends HashMap<String, TagAction> {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Sets a particular {@link TagAction} for a given tag. Any existing TagAction for that tag
+     * will be removed and overwritten.
+     *
+     * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
+     * @param action The {@link TagAction}
+     */
+    protected void setTagAction(final String tag, final TagAction action) {
+        // Tag names are ASCII; a fixed locale keeps the keys independent of the
+        // JVM default locale (e.g. the Turkish dotted/dotless i).
+        put(tag.toUpperCase(java.util.Locale.ENGLISH), action);
+        put(tag.toLowerCase(java.util.Locale.ENGLISH), action);
+        put(tag, action);
+    }
+
+    /**
+     * Adds a particular {@link TagAction} for a given tag. If a TagAction already exists for that tag,
+     * a chained action, consisting of the previous and the new {@link TagAction} is created.
+     *
+     * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
+     * @param action The {@link TagAction}
+     */
+    protected void addTagAction(final String tag, final TagAction action) {
+        final TagAction previousAction = get(tag);
+        if (previousAction == null) {
+            setTagAction(tag, action);
+        } else {
+            setTagAction(tag, new CommonTagActions.Chained(previousAction, action));
+        }
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/package.html b/src/main/java/de/l3s/boilerpipe/sax/package.html
new file mode 100644
index 0000000..9772244
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/package.html
@@ -0,0 +1,6 @@
+
+
+ Classes related to parsing and producing HTML from/to Boilerpipe
+ TextDocuments.
+
+
diff --git a/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java b/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java
new file mode 100644
index 0000000..e7997f0
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java
@@ -0,0 +1,45 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.util;
+
+import java.util.regex.Pattern;
+
+/**
+ * Tokenizes text according to Unicode word boundaries and strips off non-word
+ * characters.
+ *
+ * @author Christian Kohlschütter
+ */
+public class UnicodeTokenizer {
+    private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b");
+    private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern
+            .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*");
+    // Runs of spaces and boundary markers; compiled once instead of per call.
+    private static final Pattern PAT_SEPARATORS = Pattern.compile("[ \u2063]+");
+    // Runs of spaces between tokens; compiled once instead of per call.
+    private static final Pattern PAT_SPACES = Pattern.compile("[ ]+");
+
+    /**
+     * Tokenizes the text and returns an array of tokens.
+     * <p>
+     * Every word boundary is first marked with U+2063. Markers adjacent to
+     * one of the listed punctuation characters are removed again, so that
+     * punctuation stays attached to its neighbours. The remaining markers
+     * and spaces are collapsed to single spaces, on which the text is split.
+     *
+     * @param text The text
+     * @return The tokens; for empty input a single empty token
+     */
+    public static String[] tokenize(final CharSequence text) {
+        final String marked = PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063");
+        final String punctuationJoined = PAT_NOT_WORD_BOUNDARY.matcher(marked)
+                .replaceAll("$1");
+        final String normalized = PAT_SEPARATORS.matcher(punctuationJoined)
+                .replaceAll(" ").trim();
+        return PAT_SPACES.split(normalized);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/util/package.html b/src/main/java/de/l3s/boilerpipe/util/package.html
new file mode 100644
index 0000000..ab7a714
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/util/package.html
@@ -0,0 +1,5 @@
+
+
+ Some helper classes.
+
+
diff --git a/src/main/java/org/cyberneko/html/HTMLElements.java b/src/main/java/org/cyberneko/html/HTMLElements.java
new file mode 100644
index 0000000..d200373
--- /dev/null
+++ b/src/main/java/org/cyberneko/html/HTMLElements.java
@@ -0,0 +1,794 @@
+/*
+ * Copyright 2002-2009 Andy Clark, Marc Guillemot
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.cyberneko.html;
+
+/**
+ * Collection of HTML element information.
+ *
+ * @author Andy Clark
+ * @author Ahmed Ashour
+ * @author Marc Guillemot
+ *
+ * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $
+ */
+public class HTMLElements {
+
+ //
+ // Constants
+ //
+
+ // element codes
+
+ // NOTE: The element codes *must* start with 0 and increment in
+ // sequence. The parent and closes references depends on
+ // this assumption. -Ac
+
+ public static final short A = 0;
+ public static final short ABBR = A+1;
+ public static final short ACRONYM = ABBR+1;
+ public static final short ADDRESS = ACRONYM+1;
+ public static final short APPLET = ADDRESS+1;
+ public static final short AREA = APPLET+1;
+ public static final short B = AREA+1;
+ public static final short BASE = B+1;
+ public static final short BASEFONT = BASE+1;
+ public static final short BDO = BASEFONT+1;
+ public static final short BGSOUND = BDO+1;
+ public static final short BIG = BGSOUND+1;
+ public static final short BLINK = BIG+1;
+ public static final short BLOCKQUOTE = BLINK+1;
+ public static final short BODY = BLOCKQUOTE+1;
+ public static final short BR = BODY+1;
+ public static final short BUTTON = BR+1;
+ public static final short CAPTION = BUTTON+1;
+ public static final short CENTER = CAPTION+1;
+ public static final short CITE = CENTER+1;
+ public static final short CODE = CITE+1;
+ public static final short COL = CODE+1;
+ public static final short COLGROUP = COL+1;
+ public static final short COMMENT = COLGROUP+1;
+ public static final short DEL = COMMENT+1;
+ public static final short DFN = DEL+1;
+ public static final short DIR = DFN+1;
+ public static final short DIV = DIR+1;
+ public static final short DD = DIV+1;
+ public static final short DL = DD+1;
+ public static final short DT = DL+1;
+ public static final short EM = DT+1;
+ public static final short EMBED = EM+1;
+ public static final short FIELDSET = EMBED+1;
+ public static final short FONT = FIELDSET+1;
+ public static final short FORM = FONT+1;
+ public static final short FRAME = FORM+1;
+ public static final short FRAMESET = FRAME+1;
+ public static final short H1 = FRAMESET+1;
+ public static final short H2 = H1+1;
+ public static final short H3 = H2+1;
+ public static final short H4 = H3+1;
+ public static final short H5 = H4+1;
+ public static final short H6 = H5+1;
+ public static final short HEAD = H6+1;
+ public static final short HR = HEAD+1;
+ public static final short HTML = HR+1;
+ public static final short I = HTML+1;
+ public static final short IFRAME = I+1;
+ public static final short ILAYER = IFRAME+1;
+ public static final short IMG = ILAYER+1;
+ public static final short INPUT = IMG+1;
+ public static final short INS = INPUT+1;
+ public static final short ISINDEX = INS+1;
+ public static final short KBD = ISINDEX+1;
+ public static final short KEYGEN = KBD+1;
+ public static final short LABEL = KEYGEN+1;
+ public static final short LAYER = LABEL+1;
+ public static final short LEGEND = LAYER+1;
+ public static final short LI = LEGEND+1;
+ public static final short LINK = LI+1;
+ public static final short LISTING = LINK+1;
+ public static final short MAP = LISTING+1;
+ public static final short MARQUEE = MAP+1;
+ public static final short MENU = MARQUEE+1;
+ public static final short META = MENU+1;
+ public static final short MULTICOL = META+1;
+ public static final short NEXTID = MULTICOL+1;
+ public static final short NOBR = NEXTID+1;
+ public static final short NOEMBED = NOBR+1;
+ public static final short NOFRAMES = NOEMBED+1;
+ public static final short NOLAYER = NOFRAMES+1;
+ public static final short NOSCRIPT = NOLAYER+1;
+ public static final short OBJECT = NOSCRIPT+1;
+ public static final short OL = OBJECT+1;
+ public static final short OPTION = OL+1;
+ public static final short OPTGROUP = OPTION+1;
+ public static final short P = OPTGROUP+1;
+ public static final short PARAM = P+1;
+ public static final short PLAINTEXT = PARAM+1;
+ public static final short PRE = PLAINTEXT+1;
+ public static final short Q = PRE+1;
+ public static final short RB = Q+1;
+ public static final short RBC = RB+1;
+ public static final short RP = RBC+1;
+ public static final short RT = RP+1;
+ public static final short RTC = RT+1;
+ public static final short RUBY = RTC+1;
+ public static final short S = RUBY+1;
+ public static final short SAMP = S+1;
+ public static final short SCRIPT = SAMP+1;
+ public static final short SELECT = SCRIPT+1;
+ public static final short SMALL = SELECT+1;
+ public static final short SOUND = SMALL+1;
+ public static final short SPACER = SOUND+1;
+ public static final short SPAN = SPACER+1;
+ public static final short STRIKE = SPAN+1;
+ public static final short STRONG = STRIKE+1;
+ public static final short STYLE = STRONG+1;
+ public static final short SUB = STYLE+1;
+ public static final short SUP = SUB+1;
+ public static final short TABLE = SUP+1;
+ public static final short TBODY = TABLE+1;
+ public static final short TD = TBODY+1;
+ public static final short TEXTAREA = TD+1;
+ public static final short TFOOT = TEXTAREA+1;
+ public static final short TH = TFOOT+1;
+ public static final short THEAD = TH+1;
+ public static final short TITLE = THEAD+1;
+ public static final short TR = TITLE+1;
+ public static final short TT = TR+1;
+ public static final short U = TT+1;
+ public static final short UL = U+1;
+ public static final short VAR = UL+1;
+ public static final short WBR = VAR+1;
+ public static final short XML = WBR+1;
+ public static final short XMP = XML+1;
+ public static final short UNKNOWN = XMP+1;
+
+ // information
+
+ /** Element information organized by first letter. */
+ protected static final Element[][] ELEMENTS_ARRAY = new Element[26][];
+
+ /** Element information as a contiguous list. */
+ protected static final ElementList ELEMENTS = new ElementList();
+
+ /** No such element. */
+ public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY,HEAD}/*HTML*/, null);
+
+ //
+ // Static initializer
+ //
+
+ /**
+ * Initializes the element information.
+ * <p>
+ * <strong>Note:</strong>
+ * The <code>getElement</code> method requires that the HTML elements
+ * are added to the list in alphabetical order. If new elements are
+ * added, then they must be inserted in alphabetical order.
+ */
+ static {
+ //
+ //
+ //
+ //
+ //
+ //
+ //
+ //
+
+ // initialize array of element information
+ ELEMENTS_ARRAY['A'-'A'] = new Element[] {
+ // A - - (%inline;)* -(A)
+ new Element(A, "A", Element.INLINE, BODY, new short[] {A}),
+ // ABBR - - (%inline;)*
+ new Element(ABBR, "ABBR", Element.INLINE, BODY, null),
+ // ACRONYM - - (%inline;)*
+ new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null),
+ // ADDRESS - - (%inline;)*
+ new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null),
+ // APPLET
+ new Element(APPLET, "APPLET", 0, BODY, null),
+ // AREA - O EMPTY
+ new Element(AREA, "AREA", Element.EMPTY, MAP, null),
+ };
+ ELEMENTS_ARRAY['B'-'A'] = new Element[] {
+ // B - - (%inline;)*
+ new Element(B, "B", Element.INLINE, BODY, null),
+ // BASE - O EMPTY
+ new Element(BASE, "BASE", Element.EMPTY, HEAD, null),
+ // BASEFONT
+ new Element(BASEFONT, "BASEFONT", 0, HEAD, null),
+ // BDO - - (%inline;)*
+ new Element(BDO, "BDO", Element.INLINE, BODY, null),
+ // BGSOUND
+ new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null),
+ // BIG - - (%inline;)*
+ new Element(BIG, "BIG", Element.INLINE, BODY, null),
+ // BLINK
+ new Element(BLINK, "BLINK", Element.INLINE, BODY, null),
+ // BLOCKQUOTE - - (%block;|SCRIPT)+
+ new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}),
+ // BODY O O (%block;|SCRIPT)+ +(INS|DEL)
+ new Element(BODY, "BODY", Element.CONTAINER, HTML, new short[]{HEAD}),
+ // BR - O EMPTY
+ new Element(BR, "BR", Element.EMPTY, BODY, null),
+ // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET)
+ new Element(BUTTON, "BUTTON", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['C'-'A'] = new Element[] {
+ // CAPTION - - (%inline;)*
+ new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null),
+ // CENTER,
+ new Element(CENTER, "CENTER", 0, BODY, null),
+ // CITE - - (%inline;)*
+ new Element(CITE, "CITE", Element.INLINE, BODY, null),
+ // CODE - - (%inline;)*
+ new Element(CODE, "CODE", Element.INLINE, BODY, null),
+ // COL - O EMPTY
+ new Element(COL, "COL", Element.EMPTY, TABLE, null),
+ // COLGROUP - O (COL)*
+ new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}),
+ // COMMENT
+ new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null),
+ };
+ ELEMENTS_ARRAY['D'-'A'] = new Element[] {
+ // DEL - - (%flow;)*
+ new Element(DEL, "DEL", 0, BODY, null),
+ // DFN - - (%inline;)*
+ new Element(DFN, "DFN", Element.INLINE, BODY, null),
+ // DIR
+ new Element(DIR, "DIR", 0, BODY, null),
+ // DIV - - (%flow;)*
+ new Element(DIV, "DIV", Element.BLOCK, BODY, new short[]{P}),
+ // DD - O (%flow;)*
+ new Element(DD, "DD", 0, DL, new short[]{DT,DD}),
+ // DL - - (DT|DD)+
+ new Element(DL, "DL", Element.BLOCK, BODY, null),
+ // DT - O (%inline;)*
+ new Element(DT, "DT", 0, DL, new short[]{DT,DD}),
+ };
+ ELEMENTS_ARRAY['E'-'A'] = new Element[] {
+ // EM - - (%inline;)*
+ new Element(EM, "EM", Element.INLINE, BODY, null),
+ // EMBED
+ new Element(EMBED, "EMBED", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['F'-'A'] = new Element[] {
+ // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*)
+ new Element(FIELDSET, "FIELDSET", 0, BODY, null),
+ // FONT
+ new Element(FONT, "FONT", Element.CONTAINER, BODY, null),
+ // FORM - - (%block;|SCRIPT)+ -(FORM)
+ new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,DIV}, new short[]{BUTTON,P}),
+ // FRAME - O EMPTY
+ new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null),
+ // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?)
+ new Element(FRAMESET, "FRAMESET", 0, HTML, null),
+ };
+ ELEMENTS_ARRAY['H'-'A'] = new Element[] {
+ // (H1|H2|H3|H4|H5|H6) - - (%inline;)*
+ new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
+ // HEAD O O (%head.content;) +(%head.misc;)
+ new Element(HEAD, "HEAD", 0, HTML, null),
+ // HR - O EMPTY
+ new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}),
+ // HTML O O (%html.content;)
+ new Element(HTML, "HTML", 0, null, null),
+ };
+ ELEMENTS_ARRAY['I'-'A'] = new Element[] {
+ // I - - (%inline;)*
+ new Element(I, "I", Element.INLINE, BODY, null),
+ // IFRAME
+ new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null),
+ // ILAYER
+ new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null),
+ // IMG - O EMPTY
+ new Element(IMG, "IMG", Element.EMPTY, BODY, null),
+ // INPUT - O EMPTY
+ new Element(INPUT, "INPUT", Element.EMPTY, BODY, null),
+ // INS - - (%flow;)*
+ new Element(INS, "INS", 0, BODY, null),
+ // ISINDEX
+ new Element(ISINDEX, "ISINDEX", 0, HEAD, null),
+ };
+ ELEMENTS_ARRAY['K'-'A'] = new Element[] {
+ // KBD - - (%inline;)*
+ new Element(KBD, "KBD", Element.INLINE, BODY, null),
+ // KEYGEN
+ new Element(KEYGEN, "KEYGEN", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['L'-'A'] = new Element[] {
+ // LABEL - - (%inline;)* -(LABEL)
+ new Element(LABEL, "LABEL", 0, BODY, null),
+ // LAYER
+ new Element(LAYER, "LAYER", Element.BLOCK, BODY, null),
+ // LEGEND - - (%inline;)*
+ new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null),
+ // LI - O (%flow;)*
+ new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}),
+ // LINK - O EMPTY
+ new Element(LINK, "LINK", Element.EMPTY, HEAD, null),
+ // LISTING
+ new Element(LISTING, "LISTING", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['M'-'A'] = new Element[] {
+ // MAP - - ((%block;) | AREA)+
+ new Element(MAP, "MAP", Element.INLINE, BODY, null),
+ // MARQUEE
+ new Element(MARQUEE, "MARQUEE", 0, BODY, null),
+ // MENU
+ new Element(MENU, "MENU", 0, BODY, null),
+ // META - O EMPTY
+ new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}),
+ // MULTICOL
+ new Element(MULTICOL, "MULTICOL", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['N'-'A'] = new Element[] {
+ // NEXTID
+ new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null),
+ // NOBR
+ new Element(NOBR, "NOBR", Element.INLINE, BODY, null),
+ // NOEMBED
+ new Element(NOEMBED, "NOEMBED", 0, BODY, null),
+ // NOFRAMES - - (BODY) -(NOFRAMES)
+ new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null),
+ // NOLAYER
+ new Element(NOLAYER, "NOLAYER", 0, BODY, null),
+ // NOSCRIPT - - (%block;)+
+ new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{BODY}, null),
+ };
+ // NOTE: the entries below follow element-code order (OPTION = OL+1 comes
+ // before OPTGROUP = OPTION+1), not alphabetical order. getElement(short)
+ // indexes the contiguous ELEMENTS list by code, and that list is filled
+ // from these arrays in iteration order, so a mismatch would make the two
+ // codes resolve to each other's element. Lookup by name scans the whole
+ // bucket and is unaffected by the order.
+ // NOTE(review): assumes ElementList.addElement appends in call order -- confirm.
+ ELEMENTS_ARRAY['O'-'A'] = new Element[] {
+     // OBJECT - - (PARAM | %flow;)*
+     new Element(OBJECT, "OBJECT", 0, BODY, null),
+     // OL - - (LI)+
+     new Element(OL, "OL", Element.BLOCK, BODY, null),
+     // OPTION - O (#PCDATA)
+     new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}),
+     // OPTGROUP - - (OPTION)+
+     new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}),
+ };
+ ELEMENTS_ARRAY['P'-'A'] = new Element[] {
+ // P - O (%inline;)*
+ new Element(P, "P", Element.CONTAINER, BODY, new short[]{P}),
+ // PARAM - O EMPTY
+ new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null),
+ // PLAINTEXT
+ new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null),
+ // PRE - - (%inline;)* -(%pre.exclusion;)
+ new Element(PRE, "PRE", 0, BODY, null),
+ };
+ ELEMENTS_ARRAY['Q'-'A'] = new Element[] {
+ // Q - - (%inline;)*
+ new Element(Q, "Q", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['R'-'A'] = new Element[] {
+ // RB
+ new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}),
+ // RBC
+ new Element(RBC, "RBC", 0, RUBY, null),
+ // RP
+ new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}),
+ // RT
+ new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}),
+ // RTC
+ new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}),
+ // RUBY
+ new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}),
+ };
+ ELEMENTS_ARRAY['S'-'A'] = new Element[] {
+ // S
+ new Element(S, "S", 0, BODY, null),
+ // SAMP - - (%inline;)*
+ new Element(SAMP, "SAMP", Element.INLINE, BODY, null),
+ // SCRIPT - - %Script;
+ new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null),
+ // SELECT - - (OPTGROUP|OPTION)+
+ new Element(SELECT, "SELECT", Element.CONTAINER, BODY, new short[]{SELECT}),
+ // SMALL - - (%inline;)*
+ new Element(SMALL, "SMALL", Element.INLINE, BODY, null),
+ // SOUND
+ new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null),
+ // SPACER
+ new Element(SPACER, "SPACER", Element.EMPTY, BODY, null),
+ // SPAN - - (%inline;)*
+ new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null),
+ // STRIKE
+ new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null),
+ // STRONG - - (%inline;)*
+ new Element(STRONG, "STRONG", Element.INLINE, BODY, null),
+ // STYLE - - %StyleSheet;
+ new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}),
+ // SUB - - (%inline;)*
+ new Element(SUB, "SUB", Element.INLINE, BODY, null),
+ // SUP - - (%inline;)*
+ new Element(SUP, "SUP", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['T'-'A'] = new Element[] {
+ // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
+ new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null),
+ // TBODY O O (TR)+
+ new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}),
+ // TD - O (%flow;)*
+ new Element(TD, "TD", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}),
+ // TEXTAREA - - (#PCDATA)
+ new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null),
+ // TFOOT - O (TR)+
+ new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}),
+ // TH - O (%flow;)*
+ new Element(TH, "TH", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}),
+ // THEAD - O (TR)+
+ new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}),
+ // TITLE - - (#PCDATA) -(%head.misc;)
+ new Element(TITLE, "TITLE", Element.SPECIAL, new short[]{HEAD,BODY}, null),
+ // TR - O (TH|TD)+
+ new Element(TR, "TR", Element.BLOCK, new short[]{TBODY, THEAD, TFOOT}, TABLE, new short[]{TD,TH,TR,COLGROUP}),
+ // TT - - (%inline;)*
+ new Element(TT, "TT", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['U'-'A'] = new Element[] {
+ // U,
+ new Element(U, "U", Element.INLINE, BODY, null),
+ // UL - - (LI)+
+ new Element(UL, "UL", Element.BLOCK, BODY, null),
+ };
+ ELEMENTS_ARRAY['V'-'A'] = new Element[] {
+ // VAR - - (%inline;)*
+ new Element(VAR, "VAR", Element.INLINE, BODY, null),
+ };
+ ELEMENTS_ARRAY['W'-'A'] = new Element[] {
+ // WBR
+ new Element(WBR, "WBR", Element.EMPTY, BODY, null),
+ };
+ ELEMENTS_ARRAY['X'-'A'] = new Element[] {
+ // XML
+ new Element(XML, "XML", 0, BODY, null),
+ // XMP
+ new Element(XMP, "XMP", Element.SPECIAL, BODY, null),
+ };
+
+ // keep contiguous list of elements for lookups by code
+ for (int i = 0; i < ELEMENTS_ARRAY.length; i++) {
+ Element[] elements = ELEMENTS_ARRAY[i];
+ if (elements != null) {
+ for (int j = 0; j < elements.length; j++) {
+ Element element = elements[j];
+ ELEMENTS.addElement(element);
+ }
+ }
+ }
+ ELEMENTS.addElement(NO_SUCH_ELEMENT);
+
+ // initialize cross references to parent elements
+ for (int i = 0; i < ELEMENTS.size; i++) {
+ Element element = ELEMENTS.data[i];
+ if (element.parentCodes != null) {
+ element.parent = new Element[element.parentCodes.length];
+ for (int j = 0; j < element.parentCodes.length; j++) {
+ element.parent[j] = ELEMENTS.data[element.parentCodes[j]];
+ }
+ element.parentCodes = null;
+ }
+ }
+
+ } // ()
+
+ //
+ // Public static methods
+ //
+
+ /**
+  * Looks up the element information stored for an element code.
+  *
+  * @param code The element code; it is used directly as an index into the
+  *             contiguous element list, without a range check.
+  * @return The element information found at that index.
+  */
+ public static final Element getElement(short code) {
+     final Element[] byCode = ELEMENTS.data;
+     return byCode[code];
+ } // getElement(short):Element
+
+ /**
+  * Looks up the element information for an element name, falling back to
+  * {@link #NO_SUCH_ELEMENT} when the name is not known.
+  *
+  * @param ename The element name.
+  * @return The matching element, or {@link #NO_SUCH_ELEMENT}.
+  */
+ public static final Element getElement(String ename) {
+     final Element fallback = NO_SUCH_ELEMENT;
+     return getElement(ename, fallback);
+ } // getElement(String):Element
+
+ /**
+  * Looks up the element information for an element name.
+  * <p>
+  * The first character of the name (ASCII letters only, case folded to
+  * upper case) selects a bucket of {@link #ELEMENTS_ARRAY}; the bucket is
+  * then scanned for a name that matches ignoring case.
+  *
+  * @param ename The element name; must not be null.
+  * @param element The default element to return if not found.
+  * @return The matching element, or <code>element</code> if the name is
+  *         empty, does not start with an ASCII letter, or is unknown.
+  */
+ public static final Element getElement(String ename, Element element) {
+
+     // an empty name cannot match anything
+     if (ename.length() == 0) {
+         return element;
+     }
+
+     // fold an ASCII lower-case first letter to upper case
+     int first = ename.charAt(0);
+     if (first >= 'a' && first <= 'z') {
+         first -= 'a' - 'A';
+     }
+     if (first < 'A' || first > 'Z') {
+         return element;
+     }
+
+     // letters without any known element have no bucket
+     final Element[] bucket = ELEMENTS_ARRAY[first - 'A'];
+     if (bucket == null) {
+         return element;
+     }
+
+     int index = 0;
+     while (index < bucket.length) {
+         final Element candidate = bucket[index++];
+         if (candidate.name.equalsIgnoreCase(ename)) {
+             return candidate;
+         }
+     }
+     return element;
+
+ } // getElement(String,Element):Element
+
+ //
+ // Classes
+ //
+
+ /**
+ * Element information.
+ *
+ * @author Andy Clark
+ */
+ public static class Element {
+
+ //
+ // Constants
+ //
+
+ /** Inline element. */
+ public static final int INLINE = 0x01;
+
+ /** Block element. */
+ public static final int BLOCK = 0x02;
+
+ /** Empty element. */
+ public static final int EMPTY = 0x04;
+
+ /** Container element. */
+ public static final int CONTAINER = 0x08;
+
+ /** Special element. */
+ public static final int SPECIAL = 0x10;
+
+ //
+ // Data
+ //
+
+ /** The element code. */
+ public short code;
+
+ /** The element name. */
+ public String name;
+
+ /** Informational flags. */
+ public int flags;
+
+ /** Parent elements. */
+ public short[] parentCodes;
+
+ /** Parent elements. */
+ public Element[] parent;
+
+ /** The bounding element code. */
+ public short bounds;
+
+ /** List of elements this element can close. */
+ public short[] closes;
+
+ /** If set to false, then this element may not be nested, example: "A" **/
+ boolean nestable = true;
+
+ //
+ // Constructors
+ //
+
+ /**
+ * Constructs an element object with a single parent.
+ * Delegates to the full constructor, wrapping <code>parent</code> in a
+ * one-element array and passing <code>-1</code> as the bounds.
+ *
+ * @param code The element code.
+ * @param name The element name.
+ * @param flags Informational flags
+ * @param parent Element code of the natural closing parent.
+ * @param closes List of elements this element can close.
+ */
+ public Element(short code, String name, int flags,
+ short parent, short[] closes) {
+ this(code, name, flags, new short[]{parent}, (short)-1, closes);
+ } // <init>(short,String,int,short,short[])
+
+ /**
+ * Constructs an element object with a single parent and a bounding element.
+ * Delegates to the full constructor, wrapping <code>parent</code> in a
+ * one-element array.
+ *
+ * @param code The element code.
+ * @param name The element name.
+ * @param flags Informational flags
+ * @param parent Element code of the natural closing parent.
+ * @param bounds The bounding element code.
+ * @param closes List of elements this element can close.
+ */
+ public Element(short code, String name, int flags,
+ short parent, short bounds, short[] closes) {
+ this(code, name, flags, new short[]{parent}, bounds, closes);
+ } // <init>(short,String,int,short,short,short[])
+
+ /**
+ * Constructs an element object with several possible parents.
+ * Delegates to the full constructor, passing <code>-1</code> as the bounds.
+ *
+ * @param code The element code.
+ * @param name The element name.
+ * @param flags Informational flags
+ * @param parents Element codes of the natural closing parents.
+ * @param closes List of elements this element can close.
+ */
+ public Element(short code, String name, int flags,
+ short[] parents, short[] closes) {
+ this(code, name, flags, parents, (short)-1, closes);
+ } // <init>(short,String,int,short[],short[])
+
+ /**
+ * Constructs an element object.
+ *
+ * @param code The element code.
+ * @param name The element name.
+ * @param flags Informational flags
+ * @param parents Natural closing parent names.
+ * @param closes List of elements this element can close.
+ */
+ public Element(short code, String name, int flags,
+ short[] parents, short bounds, short[] closes) {
+ this.code = code;
+ this.name = name;
+ this.flags = flags;
+ this.parentCodes = parents;
+ this.parent = null;
+ this.bounds = bounds;
+ this.closes = closes;
+ if(closes != null) {
+ for(int i=0;i(short,String,int,short[],short,short[])
+
+ //
+ // Public methods
+ //
+
+ /**
+  * Tests the {@link #INLINE} bit of this element's flags.
+  *
+  * @return true if this element is an inline element.
+  */
+ public final boolean isInline() {
+     final int inlineBits = flags & INLINE;
+     return inlineBits != 0;
+ } // isInline():boolean
+
+ /**
+  * Tests the {@link #BLOCK} bit of this element's flags.
+  *
+  * @return true if this element is a block element.
+  */
+ public final boolean isBlock() {
+     final int blockBits = flags & BLOCK;
+     return blockBits != 0;
+ } // isBlock():boolean
+
+ /**
+  * Tests the {@link #EMPTY} bit of this element's flags.
+  *
+  * @return true if this element is an empty element.
+  */
+ public final boolean isEmpty() {
+     final int emptyBits = flags & EMPTY;
+     return emptyBits != 0;
+ } // isEmpty():boolean
+
+ /**
+  * Tests the {@link #CONTAINER} bit of this element's flags.
+  *
+  * @return true if this element is a container element.
+  */
+ public final boolean isContainer() {
+     final int containerBits = flags & CONTAINER;
+     return containerBits != 0;
+ } // isContainer():boolean
+
+ /**
+  * Tests the {@link #SPECIAL} bit of this element's flags. The content of
+  * a special element should be parsed ignoring markup.
+  *
+  * @return true if this element is special.
+  */
+ public final boolean isSpecial() {
+     final int specialBits = flags & SPECIAL;
+     return specialBits != 0;
+ } // isSpecial():boolean
+
+ /**
+  * Checks whether the given element code appears in the list of elements
+  * that this element can close.
+  *
+  * @param tag The element code to look for.
+  * @return true if this element can close the specified element; false if
+  *         it cannot or if this element has no closes list.
+  */
+ public boolean closes(short tag) {
+
+     // elements without a closes list close nothing
+     if (closes == null) {
+         return false;
+     }
+
+     // membership test only, so the scan direction does not matter
+     int remaining = closes.length;
+     while (remaining-- > 0) {
+         if (closes[remaining] == tag) {
+             return true;
+         }
+     }
+     return false;
+
+ } // closes(short):boolean
+
+ //
+ // Object methods
+ //
+
+ /**
+  * Returns a hash code for this object, derived from the element name only.
+  *
+  * @return The hash code of the element name.
+  */
+ public int hashCode() {
+     final String key = name;
+     return key.hashCode();
+ } // hashCode():int
+
+ /**
+  * Returns true if the other object is an element with the same name.
+  * <p>
+  * Only the name is compared, which keeps this method consistent with
+  * {@link #hashCode()}. The argument is narrowed to an Element first:
+  * comparing the name string with the argument itself could never match
+  * another Element (not even this one).
+  *
+  * @param o The object to compare with; may be null.
+  * @return true if <code>o</code> is an Element whose name equals this
+  *         element's name.
+  */
+ public boolean equals(Object o) {
+     if (o == this) {
+         return true;
+     }
+     if (!(o instanceof Element)) {
+         return false;
+     }
+     return name.equals(((Element) o).name);
+ } // equals(Object):boolean
+
+ /**
+  * Builds a debugging representation: the default object description
+  * followed by the element name in parentheses.
+  *
+  * @return The default <code>toString()</code> text plus <code>(name=...)</code>.
+  */
+ public String toString() {
+     final StringBuffer text = new StringBuffer(super.toString());
+     text.append("(name=").append(name).append(")");
+     return text.toString();
+ }
+
+ /**
+ * Indicates if the provided element is an accepted parent of current element
+ * @param element the element to test for "paternity"
+ * @return <code>true</code> if <code>element</code> belongs to the {@link #parent}
+ */
+ public boolean isParent(final Element element) {
+ if (parent == null)
+ return false;
+ else {
+ for (int i=0; i
+ * <li>add missing parent elements;
+ * <li>automatically close elements with optional end tags; and
+ * <li>handle mis-matched inline element tags.
+ * </ul>
+ * <p>
+ * This component recognizes the following features:
+ * <ul>
+ * <li>http://cyberneko.org/html/features/augmentations
+ * <li>http://cyberneko.org/html/features/report-errors
+ * <li>http://cyberneko.org/html/features/balance-tags/document-fragment
+ * <li>http://cyberneko.org/html/features/balance-tags/ignore-outside-content
+ * </ul>
+ * <p>
+ * This component recognizes the following properties:
+ * <ul>
+ * <li>http://cyberneko.org/html/properties/names/elems
+ * <li>http://cyberneko.org/html/properties/names/attrs
+ * <li>http://cyberneko.org/html/properties/error-reporter
+ * <li>http://cyberneko.org/html/properties/balance-tags/fragment-context-stack
+ * </ul>
+ * @see HTMLElements
+ *
+ * @author Andy Clark
+ * @author Marc Guillemot
+ *
+ * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $
+ */
+public class HTMLTagBalancer
+ implements XMLDocumentFilter, HTMLComponent {
+
+ //
+ // Constants
+ //
+
+ // features
+
+ /** Namespaces. */
+ protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
+
+ /** Include infoset augmentations. */
+ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
+
+ /** Report errors. */
+ protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
+
+ /** Document fragment balancing only (deprecated). */
+ protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment";
+
+ /** Document fragment balancing only. */
+ protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment";
+
+ /** Ignore outside content. */
+ protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content";
+
+ /** Recognized features. */
+ private static final String[] RECOGNIZED_FEATURES = {
+ NAMESPACES,
+ AUGMENTATIONS,
+ REPORT_ERRORS,
+ DOCUMENT_FRAGMENT_DEPRECATED,
+ DOCUMENT_FRAGMENT,
+ IGNORE_OUTSIDE_CONTENT,
+ };
+
+ /** Recognized features defaults. */
+ private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
+ null,
+ null,
+ null,
+ null,
+ Boolean.FALSE,
+ Boolean.FALSE,
+ };
+
+ // properties
+
+ /** Modify HTML element names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
+
+ /** Modify HTML attribute names: { "upper", "lower", "default" }. */
+ protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
+
+ /** Error reporter. */
+ protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
+
+ /**
+ * EXPERIMENTAL: may change in next release
+ * Name of the property holding the stack of elements in which context a document fragment should be parsed.
+ **/
+ public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack";
+
+ /** Recognized properties. */
+ private static final String[] RECOGNIZED_PROPERTIES = {
+ NAMES_ELEMS,
+ NAMES_ATTRS,
+ ERROR_REPORTER,
+ FRAGMENT_CONTEXT_STACK,
+ };
+
+ /** Recognized properties defaults. */
+ private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
+ null,
+ null,
+ null,
+ null,
+ };
+
+ // modify HTML names
+
+ /** Don't modify HTML names. */
+ protected static final short NAMES_NO_CHANGE = 0;
+
+ /** Match HTML element names. */
+ protected static final short NAMES_MATCH = 0;
+
+ /** Uppercase HTML names. */
+ protected static final short NAMES_UPPERCASE = 1;
+
+ /** Lowercase HTML names. */
+ protected static final short NAMES_LOWERCASE = 2;
+
+ // static vars
+
+ /** Synthesized event info item. */
+ protected static final HTMLEventInfo SYNTHESIZED_ITEM =
+ new HTMLEventInfo.SynthesizedItem();
+
+ //
+ // Data
+ //
+
+ // features
+
+ /** Namespaces. */
+ protected boolean fNamespaces;
+
+ /** Include infoset augmentations. */
+ protected boolean fAugmentations;
+
+ /** Report errors. */
+ protected boolean fReportErrors;
+
+ /** Document fragment balancing only. */
+ protected boolean fDocumentFragment;
+
+ /** Ignore outside content. */
+ protected boolean fIgnoreOutsideContent;
+
+ // properties
+
+ /** Modify HTML element names. */
+ protected short fNamesElems;
+
+ /** Modify HTML attribute names. */
+ protected short fNamesAttrs;
+
+ /** Error reporter. */
+ protected HTMLErrorReporter fErrorReporter;
+
+ // connections
+
+ /** The document source. */
+ protected XMLDocumentSource fDocumentSource;
+
+ /** The document handler. */
+ protected XMLDocumentHandler fDocumentHandler;
+
+ // state
+
+ /** The element stack. */
+ protected final InfoStack fElementStack = new InfoStack();
+
+ /** The inline stack. */
+ protected final InfoStack fInlineStack = new InfoStack();
+
+ /** True if seen anything. Important for xml declaration. */
+ protected boolean fSeenAnything;
+
+ /** True if the doctype declaration has been seen. */
+ protected boolean fSeenDoctype;
+
+ /** True if root element has been seen. */
+ protected boolean fSeenRootElement;
+
+ /**
+ * True if seen the end of the document element. In other words,
+ * this variable is set to false until the end </HTML>
+ * tag is seen (or synthesized). This is used to ensure that
+ * extraneous events after the end of the document element do not
+ * make the document stream ill-formed.
+ */
+ protected boolean fSeenRootElementEnd;
+
+ /** True if seen &lt;head&gt; element. */
+ protected boolean fSeenHeadElement;
+
+ /** True if seen &lt;body&gt; element. */
+ protected boolean fSeenBodyElement;
+
+ /** True if a form is in the stack (allow to discard opening of nested forms) */
+ protected boolean fOpenedForm;
+
+ // temp vars
+
+ /** A qualified name. */
+ private final QName fQName = new QName();
+
+ /** Empty attributes. */
+ private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl();
+
+ /** Augmentations. */
+ private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
+
+ protected HTMLTagBalancingListener tagBalancingListener;
+ private LostText lostText_ = new LostText();
+
+ private boolean forcedStartElement_ = false;
+ private boolean forcedEndElement_ = false;
+
+ /**
+ * Stack of elements determining the context in which a document fragment should be parsed
+ */
+ private QName[] fragmentContextStack_ = null;
+ private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set
+
+ private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList();
+
+ //
+ // HTMLComponent methods
+ //
+
+ /**
+  * Returns the default state for a feature.
+  *
+  * @param featureId The feature identifier to look up.
+  * @return The entry of the defaults table at the position of the first
+  *         matching recognized feature; null if the feature is not
+  *         recognized or if its table entry is itself null.
+  */
+ public Boolean getFeatureDefault(String featureId) {
+     if (RECOGNIZED_FEATURES == null) {
+         return null;
+     }
+     int index = 0;
+     while (index < RECOGNIZED_FEATURES.length) {
+         if (RECOGNIZED_FEATURES[index].equals(featureId)) {
+             return RECOGNIZED_FEATURES_DEFAULTS[index];
+         }
+         index++;
+     }
+     return null;
+ } // getFeatureDefault(String):Boolean
+
+ /**
+  * Returns the default state for a property.
+  *
+  * @param propertyId The property identifier to look up.
+  * @return The entry of the defaults table at the position of the first
+  *         matching recognized property; null if the property is not
+  *         recognized or if its table entry is itself null.
+  */
+ public Object getPropertyDefault(String propertyId) {
+     final String[] ids = RECOGNIZED_PROPERTIES;
+     final int count = (ids == null) ? 0 : ids.length;
+     for (int pos = 0; pos != count; pos++) {
+         if (!ids[pos].equals(propertyId)) {
+             continue;
+         }
+         return RECOGNIZED_PROPERTIES_DEFAULTS[pos];
+     }
+     return null;
+ } // getPropertyDefault(String):Object
+
+ //
+ // XMLComponent methods
+ //
+
+ /**
+  * Returns the identifiers of the features recognized by this component.
+  * <p>
+  * A copy is handed out so that a caller writing into the returned array
+  * cannot alter the RECOGNIZED_FEATURES array that
+  * getFeatureDefault(String) searches.
+  *
+  * @return A new array holding the recognized feature identifiers, or
+  *         <code>null</code> if RECOGNIZED_FEATURES is <code>null</code>.
+  */
+ public String[] getRecognizedFeatures() {
+     if (RECOGNIZED_FEATURES == null) {
+         return null;
+     }
+     // explicit cast keeps this compiling at pre-Java-5 source levels,
+     // where clone() on an array is typed as Object
+     return (String[]) RECOGNIZED_FEATURES.clone();
+ } // getRecognizedFeatures():String[]
+
+ /**
+  * Returns the identifiers of the properties recognized by this component.
+  * <p>
+  * A copy is handed out so that a caller writing into the returned array
+  * cannot alter the RECOGNIZED_PROPERTIES array that
+  * getPropertyDefault(String) searches.
+  *
+  * @return A new array holding the recognized property identifiers, or
+  *         <code>null</code> if RECOGNIZED_PROPERTIES is <code>null</code>.
+  */
+ public String[] getRecognizedProperties() {
+     if (RECOGNIZED_PROPERTIES == null) {
+         return null;
+     }
+     // explicit cast keeps this compiling at pre-Java-5 source levels,
+     // where clone() on an array is typed as Object
+     return (String[]) RECOGNIZED_PROPERTIES.clone();
+ } // getRecognizedProperties():String[]
+
+ /** Resets the component. */
+ public void reset(XMLComponentManager manager)
+ throws XMLConfigurationException {
+
+ // get features
+ fNamespaces = manager.getFeature(NAMESPACES);
+ fAugmentations = manager.getFeature(AUGMENTATIONS);
+ fReportErrors = manager.getFeature(REPORT_ERRORS);
+ fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT) ||
+ manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED);
+ fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT);
+
+ // get properties
+ fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
+ fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS)));
+ fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER);
+
+ fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK);
+
+ } // reset(XMLComponentManager)
+
+ /**
+  * Updates the cached state of one of the features this method handles:
+  * AUGMENTATIONS, REPORT_ERRORS or IGNORE_OUTSIDE_CONTENT.
+  *
+  * @param featureId The identifier of the feature being set.
+  * @param state     The new state of the feature.
+  * @throws XMLConfigurationException Declared by this method; the body
+  *         contains no throw statement.
+  */
+ public void setFeature(String featureId, boolean state)
+     throws XMLConfigurationException {
+
+     if (featureId.equals(AUGMENTATIONS)) {
+         fAugmentations = state;
+     }
+     else if (featureId.equals(REPORT_ERRORS)) {
+         fReportErrors = state;
+     }
+     else if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) {
+         fIgnoreOutsideContent = state;
+     }
+     // any other identifier leaves this component unchanged
+
+ } // setFeature(String,boolean)
+
+ /**
+  * Updates the cached value of one of the properties this method handles:
+  * NAMES_ELEMS or NAMES_ATTRS. The value is converted with
+  * String.valueOf(Object) and then passed through getNamesValue(String).
+  *
+  * @param propertyId The identifier of the property being set.
+  * @param value      The new value of the property.
+  * @throws XMLConfigurationException Declared by this method; the body
+  *         contains no throw statement.
+  */
+ public void setProperty(String propertyId, Object value)
+     throws XMLConfigurationException {
+
+     if (propertyId.equals(NAMES_ELEMS)) {
+         fNamesElems = getNamesValue(String.valueOf(value));
+     }
+     else if (propertyId.equals(NAMES_ATTRS)) {
+         fNamesAttrs = getNamesValue(String.valueOf(value));
+     }
+     // any other identifier leaves this component unchanged
+
+ } // setProperty(String,Object)
+
+ //
+ // XMLDocumentSource methods
+ //
+
+ /**
+  * Stores the document handler in fDocumentHandler.
+  *
+  * @param handler The document handler to store.
+  */
+ public void setDocumentHandler(XMLDocumentHandler handler) {
+     this.fDocumentHandler = handler;
+ } // setDocumentHandler(XMLDocumentHandler)
+
+ // @since Xerces 2.1.0
+
+ /**
+  * Returns the document handler currently held in fDocumentHandler.
+  *
+  * @return The value last stored by setDocumentHandler(XMLDocumentHandler).
+  */
+ public XMLDocumentHandler getDocumentHandler() {
+     return this.fDocumentHandler;
+ } // getDocumentHandler():XMLDocumentHandler
+
+ //
+ // XMLDocumentHandler methods
+ //
+
+ // since Xerces-J 2.2.0
+
+ /** Start document. */
+ public void startDocument(XMLLocator locator, String encoding,
+ NamespaceContext nscontext, Augmentations augs)
+ throws XNIException {
+
+ // reset state
+ fElementStack.top = 0;
+ if (fragmentContextStack_ != null) {
+ fragmentContextStackSize_ = fragmentContextStack_.length;
+ for (int i=0; i and