diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d081115 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +/bin +/target +/.settings +/.classpath +/.DS_Store +/.project diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..f2ae202 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,18 @@ + + boilerpipe + + Copyright (c) 2009-2011 Christian Kohlschütter + + The author licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f84a305 --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +boilerpipe +====== + +Repackaging of [boilerpipe](https://code.google.com/p/boilerpipe/) published on Maven Central Repository. 
+ +Overview +-------- +This is a repackaging of the latest sources of [boilerpipe](https://code.google.com/p/boilerpipe/) with some improvements: + + * Published on Maven Central Repository + * Media extraction (YouTube videos, Vimeo videos and Images) within an article from [Netbreeze-GmbH fork](https://github.com/Netbreeze-GmbH/boilerpipe) + +Getting started +----- + +The best way to start is to look at [boilerpipe QuickStart](https://code.google.com/p/boilerpipe/wiki/QuickStart) + +### Including the library in your project + +Simply add a new dependency to your `pom.xml`: + +```xml +<dependency> + <groupId>com.syncthemall</groupId> + <artifactId>boilerpipe</artifactId> + <version>1.2.1</version> +</dependency> +``` \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..5c92a19 --- /dev/null +++ b/pom.xml @@ -0,0 +1,147 @@ + + 4.0.0 + com.syncthemall + boilerpipe + 1.2.1 + jar + boilerpipe + Repackaging of boilerpipe with media extraction improvements, published on Maven Central Repository. + https://github.com/vanduynslagerp/boilerpipe + + + Apache License 2.0 + repo + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:git:git@github.com:vanduynslagerp/boilerpipe.git + scm:git:git@github.com:vanduynslagerp/boilerpipe.git + https://github.com/vanduynslagerp/boilerpipe + + + + 1 + Christian Kohlschütter + http://www.kohlschutter.com + + project initiator + + + + 2 + Manuel Codiga + manuel.codiga@gmail.com + + contributor + + + + + + net.sourceforge.nekohtml + nekohtml + 1.9.18 + + + xerces + xercesImpl + 2.11.0 + + + + package + ${project.artifactId}-${project.version} + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + true + true + + + + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + ${project.build.sourceEncoding} + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + ${project.build.sourceEncoding} + ${source.version} + ${source.version} + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar + + + + + + 
org.apache.maven.plugins + maven-release-plugin + 2.4.1 + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9 + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.4 + + + sign-artifacts + verify + + sign + + + + + + + + + nexus-releases + Nexus Release Repository + http://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + UTF-8 + 1.6 + + \ No newline at end of file diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..9bb88d3 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1 @@ +/.DS_Store diff --git a/src/main/.gitignore b/src/main/.gitignore new file mode 100644 index 0000000..9bb88d3 --- /dev/null +++ b/src/main/.gitignore @@ -0,0 +1 @@ +/.DS_Store diff --git a/src/main/java/.gitignore b/src/main/java/.gitignore new file mode 100644 index 0000000..9bb88d3 --- /dev/null +++ b/src/main/java/.gitignore @@ -0,0 +1 @@ +/.DS_Store diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java new file mode 100644 index 0000000..febbe96 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java @@ -0,0 +1,10 @@ +package de.l3s.boilerpipe; + +import de.l3s.boilerpipe.document.TextDocument; + +/** + * Something that can be represented as a {@link TextDocument}. + */ +public interface BoilerpipeDocumentSource { + TextDocument toTextDocument() throws BoilerpipeProcessingException; +} diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeExtractor.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeExtractor.java new file mode 100644 index 0000000..fcc8aab --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeExtractor.java @@ -0,0 +1,58 @@ +package de.l3s.boilerpipe; + +import java.io.Reader; + +import org.xml.sax.InputSource; + +import de.l3s.boilerpipe.document.TextDocument; + +/** + * Describes a complete filter pipeline. 
+ * + * @author Christian Kohlschütter + */ +public interface BoilerpipeExtractor extends BoilerpipeFilter { + /** + * Extracts text from the HTML code given as a String. + * + * @param html + * The HTML code as a String. + * @return The extracted text. + * @throws BoilerpipeProcessingException + */ + public String getText(final String html) + throws BoilerpipeProcessingException; + + /** + * Extracts text from the HTML code available from the given + * {@link InputSource}. + * + * @param is + * The InputSource containing the HTML + * @return The extracted text. + * @throws BoilerpipeProcessingException + */ + public String getText(final InputSource is) + throws BoilerpipeProcessingException; + + /** + * Extracts text from the HTML code available from the given {@link Reader}. + * + * @param r + * The Reader containing the HTML + * @return The extracted text. + * @throws BoilerpipeProcessingException + */ + public String getText(final Reader r) throws BoilerpipeProcessingException; + + /** + * Extracts text from the given {@link TextDocument} object. + * + * @param doc + * The {@link TextDocument}. + * @return The extracted text. + * @throws BoilerpipeProcessingException + */ + public String getText(TextDocument doc) + throws BoilerpipeProcessingException; +} diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeFilter.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeFilter.java new file mode 100644 index 0000000..8a15f77 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeFilter.java @@ -0,0 +1,40 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe; + +import de.l3s.boilerpipe.document.TextDocument; + +/** + * A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and + * processes it somehow. + * + * @author Christian Kohlschütter + */ +public interface BoilerpipeFilter { + /** + * Processes the given document doc. + * + * @param doc + * The {@link TextDocument} that is to be processed. + * @return true if changes have been made to the + * {@link TextDocument}. + * @throws BoilerpipeProcessingException + */ + boolean process(final TextDocument doc) + throws BoilerpipeProcessingException; +} diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java new file mode 100644 index 0000000..bcb603d --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java @@ -0,0 +1,35 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package de.l3s.boilerpipe; + +import de.l3s.boilerpipe.document.TextDocument; + +/** + * A source that returns {@link TextDocument}s. + * + * @author Christian Kohlschütter + */ +public interface BoilerpipeInput { + /** + * Returns (somehow) a {@link TextDocument}. + * + * @return A {@link TextDocument}. + * @throws BoilerpipeProcessingException + */ + TextDocument getTextDocument() throws BoilerpipeProcessingException; +} diff --git a/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java b/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java new file mode 100644 index 0000000..f3a9cc4 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/BoilerpipeProcessingException.java @@ -0,0 +1,43 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe; + +/** + * Exception for signaling failure in the processing pipeline. 
/**
 * Checked exception that signals a failure in the processing pipeline.
 *
 * The four constructors mirror those of {@link Exception} and simply
 * delegate to them.
 *
 * @author Christian Kohlschütter
 */
public class BoilerpipeProcessingException extends Exception {
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception with neither a detail message nor a cause.
     */
    public BoilerpipeProcessingException() {
        // The superclass no-arg constructor is invoked implicitly.
    }

    /**
     * Creates an exception with a detail message.
     *
     * @param message
     *            The detail message.
     */
    public BoilerpipeProcessingException(String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps another throwable.
     *
     * @param cause
     *            The underlying cause.
     */
    public BoilerpipeProcessingException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a detail message and an underlying cause.
     *
     * @param message
     *            The detail message.
     * @param cause
     *            The underlying cause.
     */
    public BoilerpipeProcessingException(String message, Throwable cause) {
        super(message, cause);
    }
}
+ */ + boolean meetsCondition(final TextBlock tb); +} diff --git a/src/main/java/de/l3s/boilerpipe/document/Image.java b/src/main/java/de/l3s/boilerpipe/document/Image.java new file mode 100644 index 0000000..91abc66 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/Image.java @@ -0,0 +1,97 @@ +package de.l3s.boilerpipe.document; + +/** + * Represents an Image resource that is contained in the document. + * + * Any of the attributes may be null, except for "src". + * + * @author Christian Kohlschuetter + */ +public class Image extends Media implements Comparable { + private final String src; + private final String width; + private final String height; + private final String alt; + private final int area; + + public Image(final String src, final String width, final String height, final String alt) { + this.src = src; + if(src == null) { + throw new NullPointerException("src attribute must not be null"); + } + this.width = nullTrim(width); + this.height = nullTrim(height); + this.alt = nullTrim(alt); + + if(width != null && height != null) { + int a; + try { + a = Integer.parseInt(width) * Integer.parseInt(height); + } catch(NumberFormatException e) { + a = -1; + } + this.area = a; + } else { + this.area = -1; + } + } + + /** + * gets the src attribut from the image tag in the html source. + * it's not everytime an absolute path! + * + * @return gets the src attribut from the image + */ + public String getSrc() { + return src; + } + + public String getWidth() { + return width; + } + + public String getHeight() { + return height; + } + + public String getAlt() { + return alt; + } + + private static String nullTrim(String s) { + if(s == null) { + return null; + } + s = s.trim(); + if(s.length() == 0) { + return null; + } + return s; + } + + /** + * Returns the image's area (specified by width * height), or -1 if width/height weren't both specified or could not be parsed. 
+ * + * @return the image's area + */ + public int getArea() { + return area; + } + + public String toString() { + return src+"\twidth="+width+"\theight="+height+"\talt="+alt+"\tarea="+area; + } + + public int compareTo(Image o) { + if(o == this) { + return 0; + } + if(area > o.area) { + return -1; + } else if(area == o.area) { + return src.compareTo(o.src); + } else { + return 1; + } + } +} diff --git a/src/main/java/de/l3s/boilerpipe/document/Media.java b/src/main/java/de/l3s/boilerpipe/document/Media.java new file mode 100644 index 0000000..8923b24 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/Media.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package de.l3s.boilerpipe.document; + +/** + * Media class + * + * @author manuel.codiga@gmail.com + * + */ +public abstract class Media { + +} diff --git a/src/main/java/de/l3s/boilerpipe/document/TextBlock.java b/src/main/java/de/l3s/boilerpipe/document/TextBlock.java new file mode 100644 index 0000000..f7e59ac --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/TextBlock.java @@ -0,0 +1,286 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.document; + +import java.util.BitSet; +import java.util.HashSet; +import java.util.Set; + +import de.l3s.boilerpipe.labels.DefaultLabels; + +/** + * Describes a block of text. + * + * A block can be an "atomic" text element (i.e., a sequence of text that is not + * interrupted by any HTML markup) or a compound of such atomic elements. 
+ * + * @author Christian Kohlschütter + */ +public class TextBlock implements Cloneable { + boolean isContent = false; + private CharSequence text; + Set labels = null; + + int offsetBlocksStart; + int offsetBlocksEnd; + + int numWords; + int numWordsInAnchorText; + int numWordsInWrappedLines; + int numWrappedLines; + float textDensity; + float linkDensity; + + BitSet containedTextElements; + + private int numFullTextWords = 0; + private int tagLevel; + + private static final BitSet EMPTY_BITSET = new BitSet(); + public static final TextBlock EMPTY_START = new TextBlock("", EMPTY_BITSET, + 0, 0, 0, 0, -1); + public static final TextBlock EMPTY_END = new TextBlock("", EMPTY_BITSET, + 0, 0, 0, 0, Integer.MAX_VALUE); + + public TextBlock(final String text) { + this(text, null, 0,0,0,0,0); + } + + public TextBlock(final String text, final BitSet containedTextElements, + final int numWords, final int numWordsInAnchorText, + final int numWordsInWrappedLines, final int numWrappedLines, + final int offsetBlocks) { + this.text = text; + this.containedTextElements = containedTextElements; + this.numWords = numWords; + this.numWordsInAnchorText = numWordsInAnchorText; + this.numWordsInWrappedLines = numWordsInWrappedLines; + this.numWrappedLines = numWrappedLines; + this.offsetBlocksStart = offsetBlocks; + this.offsetBlocksEnd = offsetBlocks; + initDensities(); + } + + public boolean isContent() { + return isContent; + } + + public boolean setIsContent(boolean isContent) { + if (isContent != this.isContent) { + this.isContent = isContent; + return true; + } else { + return false; + } + } + + public String getText() { + return text.toString(); + } + + public int getNumWords() { + return numWords; + } + + public int getNumWordsInAnchorText() { + return numWordsInAnchorText; + } + + public float getTextDensity() { + return textDensity; + } + + public float getLinkDensity() { + return linkDensity; + } + + public void mergeNext(final TextBlock other) { + if (!(text instanceof 
StringBuilder)) { + text = new StringBuilder(text); + } + StringBuilder sb = (StringBuilder) text; + sb.append('\n'); + sb.append(other.text); + + numWords += other.numWords; + numWordsInAnchorText += other.numWordsInAnchorText; + + numWordsInWrappedLines += other.numWordsInWrappedLines; + numWrappedLines += other.numWrappedLines; + + offsetBlocksStart = Math + .min(offsetBlocksStart, other.offsetBlocksStart); + offsetBlocksEnd = Math.max(offsetBlocksEnd, other.offsetBlocksEnd); + + initDensities(); + + this.isContent |= other.isContent; + + if(containedTextElements == null) { + containedTextElements = (BitSet)other.containedTextElements.clone(); + } else { + containedTextElements.or(other.containedTextElements); + } + + numFullTextWords += other.numFullTextWords; + + if (other.labels != null) { + if (labels == null) { + labels = new HashSet(other.labels); + } else { + labels.addAll(other.labels); + } + } + + tagLevel = Math.min(tagLevel, other.tagLevel); + } + + private void initDensities() { + if (numWordsInWrappedLines == 0) { + numWordsInWrappedLines = numWords; + numWrappedLines = 1; + } + textDensity = numWordsInWrappedLines / (float) numWrappedLines; + linkDensity = numWords == 0 ? 0 : numWordsInAnchorText / (float) numWords; + } + + public int getOffsetBlocksStart() { + return offsetBlocksStart; + } + public int getOffsetBlocksEnd() { + return offsetBlocksEnd; + } + + public String toString() { + return "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl="+tagLevel+"; nw="+numWords+";nwl="+numWrappedLines+";ld="+linkDensity+"]\t" + + (isContent?"CONTENT":"boilerplate") + "," + labels + "\n" + getText(); + } + + /** + * Adds an arbitrary String label to this {@link TextBlock}. + * + * @param label The label + * @see DefaultLabels + */ + public void addLabel(final String label) { + if (labels == null) { + labels = new HashSet(2); + } + labels.add(label); + } + + /** + * Checks whether this TextBlock has the given label. 
+ * + * @param label The label + * @return true if this block is marked by the given label. + */ + public boolean hasLabel(final String label) { + return labels != null && labels.contains(label); + } + + public boolean removeLabel(final String label) { + return labels != null && labels.remove(label); + } + + /** + * Returns the labels associated to this TextBlock, or null if no such labels + * exist. + * + * NOTE: The returned instance is the one used directly in TextBlock. You have full access + * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock} + * whenever possible. + * + * @return Returns the set of labels, or null if no labels was added yet. + */ + public Set getLabels() { + return labels; + } + + /** + * Adds a set of labels to this {@link TextBlock}. + * null-references are silently ignored. + * + * @param l The labels to be added. + */ + public void addLabels(final Set l) { + if(l == null) { + return; + } + if(this.labels == null) { + this.labels = new HashSet(l); + } else { + this.labels.addAll(l); + } + } + + /** + * Adds a set of labels to this {@link TextBlock}. + * null-references are silently ignored. + * + * @param l The labels to be added. + */ + public void addLabels(final String... l) { + if(l == null) { + return; + } + if(this.labels == null) { + this.labels = new HashSet(); + } + for(final String label : l) { + this.labels.add(label); + } + } + + /** + * Returns the containedTextElements BitSet, or null. + * @return the containedTextElements BitSet, or null. 
+ */ + public BitSet getContainedTextElements() { + return containedTextElements; + } + + @Override + protected TextBlock clone() { + final TextBlock clone; + try { + clone = (TextBlock)super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + if(text != null && !(text instanceof String)) { + clone.text = new StringBuilder(text); + } + if(labels != null && !labels.isEmpty()) { + clone.labels = new HashSet(labels); + } + if(containedTextElements != null) { + clone.containedTextElements = (BitSet)containedTextElements.clone(); + } + + return clone; + } + + public int getTagLevel() { + return tagLevel; + } + + public void setTagLevel(int tagLevel) { + this.tagLevel = tagLevel; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/document/TextDocument.java b/src/main/java/de/l3s/boilerpipe/document/TextDocument.java new file mode 100644 index 0000000..5ea893c --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/TextDocument.java @@ -0,0 +1,141 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.document; + +import java.util.ArrayList; +import java.util.List; + +/** + * A text document, consisting of one or more {@link TextBlock}s. 
+ * + * @author Christian Kohlschütter + */ +public class TextDocument implements Cloneable { + final List textBlocks; + String title; + + /** + * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no + * title. + * + * @param textBlocks + * The text blocks of this document. + */ + public TextDocument(final List textBlocks) { + this(null, textBlocks); + } + + /** + * Creates a new {@link TextDocument} with given {@link TextBlock}s and + * given title. + * + * @param title + * The "main" title for this text document. + * @param textBlocks + * The text blocks of this document. + */ + public TextDocument(final String title, final List textBlocks) { + this.title = title; + this.textBlocks = textBlocks; + } + + /** + * Returns the {@link TextBlock}s of this document. + * + * @return A list of {@link TextBlock}s, in sequential order of appearance. + */ + public List getTextBlocks() { + return textBlocks; + } + + /** + * Returns the "main" title for this document, or null if no + * such title has ben set. + * + * @return The "main" title. + */ + public String getTitle() { + return title; + } + + /** + * Updates the "main" title for this document. + * + * @param title + */ + public void setTitle(final String title) { + this.title = title; + } + + /** + * Returns the {@link TextDocument}'s content. + * + * @return The content text. + */ + public String getContent() { + return getText(true, false); + } + + + /** + * Returns the {@link TextDocument}'s content, non-content or both + * + * @param includeContent Whether to include TextBlocks marked as "content". + * @param includeNonContent Whether to include TextBlocks marked as "non-content". + * @return The text. 
+ */ + public String getText(boolean includeContent, boolean includeNonContent) { + StringBuilder sb = new StringBuilder(); + LOOP: for (TextBlock block : getTextBlocks()) { + if(block.isContent()) { + if(!includeContent) { + continue LOOP; + } + } else { + if(!includeNonContent) { + continue LOOP; + } + } + sb.append(block.getText()); + sb.append('\n'); + } + return sb.toString(); + } + + /** + * Returns detailed debugging information about the contained {@link TextBlock}s. + * + * @return Debug information. + */ + public String debugString() { + StringBuilder sb = new StringBuilder(); + for(TextBlock tb : getTextBlocks()) { + sb.append(tb.toString()); + sb.append('\n'); + } + return sb.toString(); + } + + public TextDocument clone() { + final List list = new ArrayList(textBlocks.size()); + for(TextBlock tb : textBlocks) { + list.add(tb.clone()); + } + return new TextDocument(title, list); + } +} diff --git a/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java b/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java new file mode 100644 index 0000000..51abe73 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/TextDocumentStatistics.java @@ -0,0 +1,64 @@ +/** + * boilerpipe + * + * Copyright (c) 2009, 2010 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package de.l3s.boilerpipe.document; + +/** + * Provides shallow statistics on a given TextDocument + * + * @author Christian Kohlschuetter + */ +public final class TextDocumentStatistics { + private int numWords = 0; + private int numBlocks = 0; + + /** + * Computes statistics on a given {@link TextDocument}. + * + * @param doc The {@link TextDocument}. + * @param contentOnly if true then o + */ + public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) { + for (TextBlock tb : doc.getTextBlocks()) { + if (contentOnly && !tb.isContent()) { + continue; + } + + numWords += tb.getNumWords(); + numBlocks++; + } + } + + /** + * Returns the average number of words at block-level (= overall number of words divided by + * the number of blocks). + * + * @return Average + */ + public float avgNumWords() { + return numWords / (float) numBlocks; + } + + /** + * Returns the overall number of words in all blocks. + * + * @return Sum + */ + public int getNumWords() { + return numWords; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/document/Video.java b/src/main/java/de/l3s/boilerpipe/document/Video.java new file mode 100644 index 0000000..3c6fa31 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/Video.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package de.l3s.boilerpipe.document; + +/** + * Represents an video resource which is contained in the document. + * + * + * @author Manuel Codiga + */ +public class Video extends Media { + private final String originUrl; + private final String embedUrl; + + public Video(final String url, final String embedUrl) { + this.originUrl = url; + this.embedUrl = embedUrl; + if(this.embedUrl == null) { + throw new NullPointerException("embedUrl attribute must not be null"); + } + } + + public String getOriginUrl() { + return originUrl; + } + + public String getEmbedUrl() { + return embedUrl; + } + + public String toString() { + return "url: "+originUrl; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java b/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java new file mode 100644 index 0000000..3bada83 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/VimeoVideo.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package de.l3s.boilerpipe.document; + +/** + * Represents an Vimeo video resource that is contained in the document. 
+ * + * + * @author Manuel Codiga + */ +public class VimeoVideo extends Video { + + public VimeoVideo(String originUrl, String embedUrl) { + super(originUrl, embedUrl); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java b/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java new file mode 100644 index 0000000..1f80744 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/YoutubeVideo.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package de.l3s.boilerpipe.document; + +/** + * Represents a YouTube video resource that is contained in the document. + * Adds nothing to {@link Video}: the constructor only forwards both URLs to the superclass. + * + * @author Manuel Codiga + */ +public class YoutubeVideo extends Video { + + public YoutubeVideo(String originUrl, String embedUrl) { + super(originUrl, embedUrl); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/document/package.html b/src/main/java/de/l3s/boilerpipe/document/package.html new file mode 100644 index 0000000..b80903d --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/document/package.html @@ -0,0 +1,6 @@ + + +

The classes in this package represent the simple Boilerpipe + document model.

+ + diff --git a/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java b/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java new file mode 100644 index 0000000..1fea4ca --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/estimators/SimpleEstimator.java @@ -0,0 +1,62 @@ +/** + * boilerpipe + * + * Copyright (c) 2009, 2010 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.estimators; + +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.document.TextDocumentStatistics; +import de.l3s.boilerpipe.extractors.ArticleExtractor; +import de.l3s.boilerpipe.extractors.DefaultExtractor; + +/** + * Estimates the "goodness" of a {@link BoilerpipeExtractor} on a given document. + * + * @author Christian Kohlschütter + */ +public final class SimpleEstimator { + + /** + * The singleton instance of {@link SimpleEstimator}; the constructor is private. + */ + public static final SimpleEstimator INSTANCE = new SimpleEstimator(); + + private SimpleEstimator() { + } + + /** + * Given the statistics of the document before and after applying the {@link BoilerpipeExtractor}, + * can we regard the extraction quality (too) low? + * + * Works well with {@link DefaultExtractor}, {@link ArticleExtractor} and others. + * + * @param dsBefore statistics of the document before applying the extractor + * @param dsAfter statistics of the document after applying the extractor + * @return true if low quality is to be expected, i.e. if there were fewer than 90 words before, or fewer than 70 words after, or the average number of words per block after is below 25.
+ */ + public boolean isLowQuality(final TextDocumentStatistics dsBefore, final TextDocumentStatistics dsAfter) { + if (dsBefore.getNumWords() < 90 || dsAfter.getNumWords() < 70) { + return true; + } + + if (dsAfter.avgNumWords() < 25) { + return true; + } + + return false; + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java new file mode 100644 index 0000000..9013c3f --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/ArticleExtractor.java @@ -0,0 +1,68 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.filters.english.IgnoreBlocksAfterContentFilter; +import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier; +import de.l3s.boilerpipe.filters.english.TerminatingBlocksFinder; +import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion; +import de.l3s.boilerpipe.filters.heuristics.DocumentTitleMatchClassifier; +import de.l3s.boilerpipe.filters.heuristics.ExpandTitleToContentFilter; +import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter; +import de.l3s.boilerpipe.filters.heuristics.LargeBlockSameTagLevelToContentFilter; +import de.l3s.boilerpipe.filters.heuristics.ListAtEndFilter; +import de.l3s.boilerpipe.filters.heuristics.TrailingHeadlineToBoilerplateFilter; +import de.l3s.boilerpipe.filters.simple.BoilerplateBlockFilter; + +/** + * A full-text extractor which is tuned towards news articles. In this scenario + * it achieves higher accuracy than {@link DefaultExtractor}. + * The filters in {@link #process(TextDocument)} are joined with the non-short-circuit {@code |} operator, so every filter runs, in the listed order, and the result is true if at least one of them returned true. + * @author Christian Kohlschütter + */ +public final class ArticleExtractor extends ExtractorBase { + public static final ArticleExtractor INSTANCE = new ArticleExtractor(); + + /** + * Returns the singleton instance for {@link ArticleExtractor}.
+ */ + public static ArticleExtractor getInstance() { + return INSTANCE; + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + return + + TerminatingBlocksFinder.INSTANCE.process(doc) + | new DocumentTitleMatchClassifier(doc.getTitle()).process(doc) + | NumWordsRulesClassifier.INSTANCE.process(doc) + | IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc) + | TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc) + | BlockProximityFusion.MAX_DISTANCE_1.process(doc) + | BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc) + | BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc) + | KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc) + | ExpandTitleToContentFilter.INSTANCE.process(doc) + | LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc) + | ListAtEndFilter.INSTANCE.process(doc) + ; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java new file mode 100644 index 0000000..5b95e31 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java @@ -0,0 +1,49 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.filters.simple.MinClauseWordsFilter; +import de.l3s.boilerpipe.filters.simple.SplitParagraphBlocksFilter; + +/** + * A full-text extractor which is tuned towards extracting sentences from news articles. + * Runs {@link ArticleExtractor}, then {@link SplitParagraphBlocksFilter}, then {@link MinClauseWordsFilter}; all three always run because they are joined with the non-short-circuit {@code |} operator. + * @author Christian Kohlschütter + */ +public final class ArticleSentencesExtractor extends ExtractorBase { + public static final ArticleSentencesExtractor INSTANCE = new ArticleSentencesExtractor(); + + /** + * Returns the singleton instance for {@link ArticleSentencesExtractor}. + */ + public static ArticleSentencesExtractor getInstance() { + return INSTANCE; + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + return + + ArticleExtractor.INSTANCE.process(doc) + | SplitParagraphBlocksFilter.INSTANCE.process(doc) + | MinClauseWordsFilter.INSTANCE.process(doc); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java new file mode 100644 index 0000000..db970e0 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/CanolaExtractor.java @@ -0,0 +1,106 @@ +/** + * boilerpipe + * + * Copyright (c) 2009, 2010 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package de.l3s.boilerpipe.extractors; + +import java.util.List; +import java.util.ListIterator; + +import de.l3s.boilerpipe.BoilerpipeFilter; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.estimators.SimpleEstimator; + +/** + * A full-text extractor trained on krdwrd Canola. + * Works well with {@link SimpleEstimator}, too. + * + * @author Christian Kohlschütter + */ +public class CanolaExtractor extends ExtractorBase { + public static final CanolaExtractor INSTANCE = new CanolaExtractor(); + + /** + * Returns the singleton instance for {@link CanolaExtractor}. + */ + public static CanolaExtractor getInstance() { + return INSTANCE; + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + + return CLASSIFIER.process(doc); + } + + /** + * The actual classifier, exposed. It classifies every block of the document given its previous and next block, using {@link TextBlock#EMPTY_START} as placeholder where no neighbour exists, and returns true if any call to classify returned true. + */ + public static final BoilerpipeFilter CLASSIFIER = new BoilerpipeFilter() { + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + List<TextBlock> textBlocks = doc.getTextBlocks(); + boolean hasChanges = false; + + ListIterator<TextBlock> it = textBlocks.listIterator(); + if (!it.hasNext()) { + return false; + } + TextBlock prevBlock = TextBlock.EMPTY_START; + TextBlock currentBlock = it.next(); + TextBlock nextBlock = it.hasNext() ?
it.next() + : TextBlock.EMPTY_START; + + hasChanges = classify(prevBlock, currentBlock, nextBlock) + | hasChanges; + + if (nextBlock != TextBlock.EMPTY_START) { + while (it.hasNext()) { + prevBlock = currentBlock; + currentBlock = nextBlock; + nextBlock = it.next(); + hasChanges = classify(prevBlock, currentBlock, nextBlock) + | hasChanges; + } + prevBlock = currentBlock; + currentBlock = nextBlock; + nextBlock = TextBlock.EMPTY_START; + hasChanges = classify(prevBlock, currentBlock, nextBlock) + | hasChanges; + } + + return hasChanges; + } + + protected boolean classify(final TextBlock prev, final TextBlock curr, + final TextBlock next) { + final boolean isContent = (curr.getLinkDensity() > 0 && next + .getNumWords() > 11) + || (curr.getNumWords() > 19 || (next.getNumWords() > 6 + && next.getLinkDensity() == 0 + && prev.getLinkDensity() == 0 && (curr + .getNumWords() > 6 || prev.getNumWords() > 7 || next + .getNumWords() > 19))); + + return curr.setIsContent(isContent); + } + }; +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java b/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java new file mode 100644 index 0000000..7e43d20 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/CommonExtractors.java @@ -0,0 +1,42 @@ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeExtractor; + +/** + * Provides quick access to common {@link BoilerpipeExtractor}s. + * + * @author Christian Kohlschütter + */ +public final class CommonExtractors { + private CommonExtractors() { + } + + /** + * Works very well for most types of Article-like HTML. + */ + public static final ArticleExtractor ARTICLE_EXTRACTOR = ArticleExtractor.INSTANCE; + + /** + * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics. + */ + public static final DefaultExtractor DEFAULT_EXTRACTOR = DefaultExtractor.INSTANCE; + + /** + * Like {@link DefaultExtractor}, but keeps the largest text block only.
+ */ + public static final LargestContentExtractor LARGEST_CONTENT_EXTRACTOR = LargestContentExtractor.INSTANCE; + + + /** + * Trained on krdwrd Canola (different definition of "boilerplate"). You may + * give it a try. + */ + public static final CanolaExtractor CANOLA_EXTRACTOR = CanolaExtractor.INSTANCE; + + /** + * Dummy Extractor; should return the input text. Use this to double-check + * that your problem is within a particular {@link BoilerpipeExtractor}, or + * somewhere else. + */ + public static final KeepEverythingExtractor KEEP_EVERYTHING_EXTRACTOR = KeepEverythingExtractor.INSTANCE; +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java new file mode 100644 index 0000000..1fd7f33 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/DefaultExtractor.java @@ -0,0 +1,50 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.filters.english.DensityRulesClassifier; +import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion; +import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor; + +/** + * A quite generic full-text extractor.
+ * Applies {@link SimpleBlockFusionProcessor}, {@link BlockProximityFusion#MAX_DISTANCE_1} and {@link DensityRulesClassifier} in that order; all three always run because they are joined with the non-short-circuit {@code |} operator. + * @author Christian Kohlschütter + */ +public class DefaultExtractor extends ExtractorBase { + public static final DefaultExtractor INSTANCE = new DefaultExtractor(); + + /** + * Returns the singleton instance for {@link DefaultExtractor}. + */ + public static DefaultExtractor getInstance() { + return INSTANCE; + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + + return + + SimpleBlockFusionProcessor.INSTANCE.process(doc) + | BlockProximityFusion.MAX_DISTANCE_1.process(doc) + | DensityRulesClassifier.INSTANCE.process(doc); + } +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java b/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java new file mode 100644 index 0000000..f41a243 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/ExtractorBase.java @@ -0,0 +1,116 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package de.l3s.boilerpipe.extractors; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; +import de.l3s.boilerpipe.sax.HTMLFetcher; + +/** + * The base class of Extractors. Also provides some helper methods to quickly + * retrieve the text that remained after processing. + * + * @author Christian Kohlschütter + */ +public abstract class ExtractorBase implements BoilerpipeExtractor { + + /** + * Extracts text from the HTML code given as a String. + * + * @param html The HTML code as a String. + * @return The extracted text. + * @throws BoilerpipeProcessingException if building the document fails with a SAXException (which is wrapped), or if processing fails. + */ + public String getText(final String html) + throws BoilerpipeProcessingException { + try { + return getText(new BoilerpipeSAXInput(new InputSource( + new StringReader(html))).getTextDocument()); + } catch (SAXException e) { + throw new BoilerpipeProcessingException(e); + } + } + + /** + * Extracts text from the HTML code available from the given {@link InputSource}. + * + * @param is The InputSource containing the HTML + * @return The extracted text. + * @throws BoilerpipeProcessingException if building the document fails with a SAXException (which is wrapped), or if processing fails. + */ + public String getText(final InputSource is) + throws BoilerpipeProcessingException { + try { + return getText(new BoilerpipeSAXInput(is).getTextDocument()); + } catch (SAXException e) { + throw new BoilerpipeProcessingException(e); + } + } + + /** + * Extracts text from the HTML code available from the given {@link URL}. + * NOTE: This method is mainly to be used for showcase purposes. If you are + * going to crawl the Web, consider using {@link #getText(InputSource)} + * instead. + * + * @param url The URL pointing to the HTML code. + * @return The extracted text.
+ * @throws BoilerpipeProcessingException if fetching the URL fails with an IOException (which is wrapped), or if {@link #getText(InputSource)} fails. + */ + public String getText(final URL url) throws BoilerpipeProcessingException { + try { + return getText(HTMLFetcher.fetch(url).toInputSource()); + } catch (IOException e) { + throw new BoilerpipeProcessingException(e); + } + } + + /** + * Extracts text from the HTML code available from the given {@link Reader}. + * + * @param r The Reader containing the HTML + * @return The extracted text. + * @throws BoilerpipeProcessingException if {@link #getText(InputSource)} fails for the wrapped reader. + */ + public String getText(final Reader r) throws BoilerpipeProcessingException { + return getText(new InputSource(r)); + } + + /** + * Extracts text from the given {@link TextDocument} object. + * + * @param doc The {@link TextDocument}; it is passed to {@code process(doc)} before its content is read. + * @return The extracted text, i.e. {@code doc.getContent()} after processing. + * @throws BoilerpipeProcessingException if {@code process(doc)} throws it. + */ + public String getText(TextDocument doc) + throws BoilerpipeProcessingException { + process(doc); + return doc.getContent(); + } +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java new file mode 100644 index 0000000..d1f8afc --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java @@ -0,0 +1,42 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter; + +/** + * Marks everything as content by delegating to {@link MarkEverythingContentFilter}. + * The constructor is private; use {@link #INSTANCE}. + * @author Christian Kohlschütter + */ +public final class KeepEverythingExtractor extends ExtractorBase { + + public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor(); + + private KeepEverythingExtractor() { + + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + return MarkEverythingContentFilter.INSTANCE.process(doc); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java new file mode 100644 index 0000000..96a88c0 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java @@ -0,0 +1,48 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor; +import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter; +import de.l3s.boilerpipe.filters.simple.MinWordsFilter; + +/** + * An extractor which runs {@link SimpleBlockFusionProcessor}, then marks everything as content + * with {@link MarkEverythingContentFilter}, and finally applies a {@link MinWordsFilter} that was + * created with the {@code kMin} value passed to the constructor (all three steps always run). + * + * @author Christian Kohlschütter + */ +public final class KeepEverythingWithMinKWordsExtractor extends ExtractorBase { + + private final MinWordsFilter filter; + + public KeepEverythingWithMinKWordsExtractor(final int kMin) { + this.filter = new MinWordsFilter(kMin); + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + return SimpleBlockFusionProcessor.INSTANCE.process(doc) + | MarkEverythingContentFilter.INSTANCE.process(doc) + | filter.process(doc); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java new file mode 100644 index 0000000..8720c5c --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/LargestContentExtractor.java @@ -0,0 +1,53 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier; +import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion; +import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter; + +/** + * A full-text extractor which extracts the largest text component of a page. + * For news articles, it may perform better than the {@link DefaultExtractor}, + * but usually worse than {@link ArticleExtractor}. + * Applies {@link NumWordsRulesClassifier}, {@link BlockProximityFusion#MAX_DISTANCE_1} and {@link KeepLargestBlockFilter} in that order; all three always run because they are joined with the non-short-circuit {@code |} operator. + * @author Christian Kohlschütter + */ +public final class LargestContentExtractor extends ExtractorBase { + public static final LargestContentExtractor INSTANCE = new LargestContentExtractor(); + + private LargestContentExtractor() { + } + + /** + * Returns the singleton instance for {@link LargestContentExtractor}. + */ + public static LargestContentExtractor getInstance() { + return INSTANCE; + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + return NumWordsRulesClassifier.INSTANCE.process(doc) + | BlockProximityFusion.MAX_DISTANCE_1.process(doc) + | KeepLargestBlockFilter.INSTANCE.process(doc); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java b/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java new file mode 100644 index 0000000..12ece11 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java @@ -0,0 +1,46 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.extractors; + +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier; + +/** + * A quite generic full-text extractor solely based upon the number of words per + * block (the current, the previous and the next block). + * Processing is delegated entirely to {@link NumWordsRulesClassifier#INSTANCE}. + * @author Christian Kohlschütter + */ +public class NumWordsRulesExtractor extends ExtractorBase { + public static final NumWordsRulesExtractor INSTANCE = new NumWordsRulesExtractor(); + + /** + * Returns the singleton instance for {@link NumWordsRulesExtractor}. + */ + public static NumWordsRulesExtractor getInstance() { + return INSTANCE; + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + + return NumWordsRulesClassifier.INSTANCE.process(doc); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/extractors/package.html b/src/main/java/de/l3s/boilerpipe/extractors/package.html new file mode 100644 index 0000000..aae6f19 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/extractors/package.html @@ -0,0 +1,6 @@ + + +

This package contains some standard extractors (i.e., completely + piped BoilerpipeFilters)

+ + diff --git a/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java b/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java new file mode 100644 index 0000000..52025ef --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/filters/debug/PrintDebugFilter.java @@ -0,0 +1,69 @@ +/** + * boilerpipe + * + * Copyright (c) 2012 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.filters.debug; + +import java.io.PrintWriter; + +import de.l3s.boilerpipe.BoilerpipeFilter; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; + +/** + * Prints debug information about the current state of the TextDocument (= + * calls {@link TextDocument#debugString()}) to the configured {@link PrintWriter}. + * {@link #process(TextDocument)} always returns false. + * @author Christian Kohlschütter + */ +public final class PrintDebugFilter implements BoilerpipeFilter { + /** + * The default instance of {@link PrintDebugFilter}, + * which dumps debug information to System.out through an auto-flushing {@link PrintWriter}. + */ + public static final PrintDebugFilter INSTANCE = new PrintDebugFilter( + new PrintWriter(System.out, true)); + private final PrintWriter out; + + /** + * Returns the default instance for {@link PrintDebugFilter}, + * which dumps debug information to System.out + */ + public static PrintDebugFilter getInstance() { + return INSTANCE; + } + + /** + * Creates a new instance of {@link PrintDebugFilter}.
+ * + * Only use this method if you are not going to dump + * the debug information to System.out -- + * for this case, use {@link #getInstance()} instead. + * + * @param out The target {@link PrintWriter}. Will not be closed by this filter. + */ + public PrintDebugFilter(final PrintWriter out) { + this.out = out; + + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + out.println(doc.debugString()); + + return false; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java new file mode 100644 index 0000000..bbda7ba --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java @@ -0,0 +1,117 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package de.l3s.boilerpipe.filters.english; + +import java.util.List; +import java.util.ListIterator; + +import de.l3s.boilerpipe.BoilerpipeFilter; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; + +/** + * Classifies {@link TextBlock}s as content/not-content through rules that have + * been determined using the C4.8 machine learning algorithm, as described in the + * paper "Boilerplate Detection using Shallow Text Features", particularly using + * text densities and link densities. + * Each block is classified together with its previous and next block; {@link TextBlock#EMPTY_START} stands in where no neighbour exists. + * @author Christian Kohlschütter + */ +public class DensityRulesClassifier implements + BoilerpipeFilter { + public static final DensityRulesClassifier INSTANCE = new DensityRulesClassifier(); + + /** + * Returns the singleton instance for {@link DensityRulesClassifier}. + */ + public static DensityRulesClassifier getInstance() { + return INSTANCE; + } + + public boolean process(TextDocument doc) + throws BoilerpipeProcessingException { + List<TextBlock> textBlocks = doc.getTextBlocks(); + boolean hasChanges = false; + + ListIterator<TextBlock> it = textBlocks.listIterator(); + if (!it.hasNext()) { + return false; + } + TextBlock prevBlock = TextBlock.EMPTY_START; + TextBlock currentBlock = it.next(); + TextBlock nextBlock = it.hasNext() ?
it.next() : TextBlock.EMPTY_START; + + hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; + + if (nextBlock != TextBlock.EMPTY_START) { + while (it.hasNext()) { + prevBlock = currentBlock; + currentBlock = nextBlock; + nextBlock = it.next(); + hasChanges = classify(prevBlock, currentBlock, nextBlock) + | hasChanges; + } + prevBlock = currentBlock; + currentBlock = nextBlock; + nextBlock = TextBlock.EMPTY_START; + hasChanges = classify(prevBlock, currentBlock, nextBlock) + | hasChanges; + } + + return hasChanges; + } + + protected boolean classify(final TextBlock prev, final TextBlock curr, + final TextBlock next) { + final boolean isContent; + + if (curr.getLinkDensity() <= 0.333333) { + if (prev.getLinkDensity() <= 0.555556) { + if (curr.getTextDensity() <= 9) { + if (next.getTextDensity() <= 10) { + if (prev.getTextDensity() <= 4) { + isContent = false; + } else { + isContent = true; + } + } else { + isContent = true; + } + } else { + if (next.getTextDensity() == 0) { + isContent = false; + } else { + isContent = true; + } + } + } else { + if (next.getTextDensity() <= 11) { + isContent = false; + } else { + isContent = true; + } + } + } else { + isContent = false; + } + + return curr.setIsContent(isContent); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java b/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java new file mode 100644 index 0000000..dc72d07 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java @@ -0,0 +1,40 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Shared helpers for the heuristic boilerpipe filters in this package.
+ *
+ * @author Christian Kohlschütter
+ */
+abstract class HeuristicFilterBase {
+
+    /**
+     * Counts the words of a block as full text, using a minimum text
+     * density of 9.
+     *
+     * @param tb the block to inspect
+     * @return the block's word count, or 0 if its text density is below 9
+     */
+    protected static int getNumFullTextWords(final TextBlock tb) {
+        return getNumFullTextWords(tb, 9);
+    }
+
+    /**
+     * Counts the words of a block as full text only if the block is dense
+     * enough.
+     *
+     * @param tb the block to inspect
+     * @param minTextDensity the text density the block must reach
+     * @return the block's word count if its text density is at least
+     *         <code>minTextDensity</code>, otherwise 0
+     */
+    protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
+        final boolean denseEnough = tb.getTextDensity() >= minTextDensity;
+        return denseEnough ? tb.getNumWords() : 0;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java
new file mode 100644
index 0000000..1d505be
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java
@@ -0,0 +1,80 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009,2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.Iterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all blocks as "non-content" that occur after blocks that have been
+ * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}. These marks are ignored
+ * unless a minimum number of words in content blocks occur before this mark (default: 60).
+ * This can be used in conjunction with an upstream {@link TerminatingBlocksFinder}.
+ *
+ * @author Christian Kohlschütter
+ * @see TerminatingBlocksFinder
+ */
+public final class IgnoreBlocksAfterContentFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final IgnoreBlocksAfterContentFilter DEFAULT_INSTANCE = new IgnoreBlocksAfterContentFilter(
+            60);
+    public static final IgnoreBlocksAfterContentFilter INSTANCE_200 = new IgnoreBlocksAfterContentFilter(
+            200);
+    private final int minNumWords;
+
+    /**
+     * Returns the default instance of IgnoreBlocksAfterContentFilter, which
+     * requires 60 full-text words before an end-of-text mark is honoured.
+     */
+    public static IgnoreBlocksAfterContentFilter getDefaultInstance() {
+        return DEFAULT_INSTANCE;
+    }
+
+    /**
+     * Creates a filter with a custom word threshold.
+     *
+     * @param minNumWords the number of full-text words that must have been
+     *            seen in content blocks before an end-of-text mark is honoured
+     */
+    public IgnoreBlocksAfterContentFilter(final int minNumWords) {
+        this.minNumWords = minNumWords;
+    }
+
+    /**
+     * Walks the blocks in document order. Once an end-of-text mark is
+     * reached with enough full-text words counted, that block and every
+     * later block is set to non-content.
+     *
+     * @param doc the document to process in place
+     * @return <code>true</code> if {@link TextBlock#setIsContent(boolean)}
+     *         reported a change for at least one block
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+
+        int numWords = 0;
+        boolean foundEndOfText = false;
+        for (Iterator<TextBlock> it = doc.getTextBlocks().iterator(); it.hasNext();) {
+            final TextBlock block = it.next();
+
+            final boolean endOfText = block
+                    .hasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
+            // The marked block's own words count towards the threshold.
+            if (block.isContent()) {
+                numWords += getNumFullTextWords(block);
+            }
+            if (endOfText && numWords >= minNumWords) {
+                foundEndOfText = true;
+            }
+            if (foundEndOfText) {
+                // Take the change flag from setIsContent() instead of
+                // setting it unconditionally.
+                changes |= block.setIsContent(false);
+            }
+        }
+
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java
new file mode 100644
index 0000000..0fdf7dd
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java
@@ -0,0 +1,76 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Scans the document from its end and turns every block labelled
+ * {@link DefaultLabels#INDICATES_END_OF_TEXT} into a non-content block,
+ * relabelling it {@link DefaultLabels#STRICTLY_NOT_CONTENT}. The scan stops
+ * as soon as more than 200 words of content blocks have been passed.
+ * This filter can be used in conjunction with an upstream {@link TerminatingBlocksFinder}.
+ *
+ * @author Christian Kohlschütter
+ * @see TerminatingBlocksFinder
+ */
+public final class IgnoreBlocksAfterContentFromEndFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final IgnoreBlocksAfterContentFromEndFilter INSTANCE = new IgnoreBlocksAfterContentFromEndFilter(
+            );
+
+    /** Number of content words after which the backward scan gives up. */
+    private static final int MAX_TRAILING_CONTENT_WORDS = 200;
+
+    private IgnoreBlocksAfterContentFromEndFilter() {
+    }
+
+    /**
+     * Processes the document in place, last block first.
+     *
+     * @param doc the document to process
+     * @return <code>true</code> if at least one end-of-text block was
+     *         relabelled
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+        int words = 0;
+
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        // Start behind the last element; an empty list is skipped.
+        final ListIterator<TextBlock> it = blocks.listIterator(blocks.size());
+        while (it.hasPrevious()) {
+            final TextBlock tb = it.previous();
+            if (tb.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT)) {
+                tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
+                tb.removeLabel(DefaultLabels.MIGHT_BE_CONTENT);
+                tb.setIsContent(false);
+                changes = true;
+            } else if (tb.isContent()) {
+                words += tb.getNumWords();
+                if (words > MAX_TRAILING_CONTENT_WORDS) {
+                    break;
+                }
+            }
+        }
+
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java
new file mode 100644
index 0000000..ccf7fd8
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java
@@ -0,0 +1,83 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Keeps the largest {@link TextBlock} only (by the number of words). In case of
+ * more than one block with the same number of words, the first block is chosen.
+ * All discarded blocks are marked "not content" and flagged as
+ * {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ *
+ * As opposed to {@link KeepLargestBlockFilter}, the number of words is
+ * computed using {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}, which only counts
+ * the words of blocks with a text density of at least 9; such blocks are believed to be full text.
+ *
+ * NOTE: Without language-specific fine-tuning (i.e., running the default instance), this filter
+ * may lead to suboptimal results. You better use {@link KeepLargestBlockFilter} instead, which
+ * works at the level of number-of-words instead of text densities.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class KeepLargestFulltextBlockFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final KeepLargestFulltextBlockFilter INSTANCE = new KeepLargestFulltextBlockFilter();
+
+    /**
+     * Keeps the content block with the most full-text words and discards
+     * every other block.
+     *
+     * @param doc the document to process in place
+     * @return <code>false</code> if the document has fewer than two blocks or
+     *         no content block at all, <code>true</code> otherwise
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        // Starting below zero lets the first content block win even when
+        // its full-text word count is 0; the strict ">" keeps the first of
+        // several equally large blocks.
+        int max = -1;
+        TextBlock largestBlock = null;
+        for (TextBlock tb : textBlocks) {
+            if (!tb.isContent()) {
+                continue;
+            }
+            final int numWords = getNumFullTextWords(tb);
+            if (numWords > max) {
+                largestBlock = tb;
+                max = numWords;
+            }
+        }
+
+        if (largestBlock == null) {
+            return false;
+        }
+
+        for (TextBlock tb : textBlocks) {
+            if (tb == largestBlock) {
+                tb.setIsContent(true);
+            } else {
+                tb.setIsContent(false);
+                tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
+            }
+        }
+
+        return true;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java
new file mode 100644
index 0000000..7962af4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java
@@ -0,0 +1,63 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Drops content blocks that contain fewer than k full-text words
+ * (measured by {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}). k is 30 by default.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MinFulltextWordsFilter extends HeuristicFilterBase implements BoilerpipeFilter {
+    public static final MinFulltextWordsFilter DEFAULT_INSTANCE = new MinFulltextWordsFilter(
+            30);
+    private final int minWords;
+
+    /**
+     * Returns the instance that uses the default threshold of 30 words.
+     */
+    public static MinFulltextWordsFilter getDefaultInstance() {
+        return DEFAULT_INSTANCE;
+    }
+
+    /**
+     * Creates a filter with a custom threshold.
+     *
+     * @param minWords the number of full-text words a content block needs
+     *            in order to stay content
+     */
+    public MinFulltextWordsFilter(final int minWords) {
+        this.minWords = minWords;
+    }
+
+    /**
+     * Sets every content block below the threshold to non-content.
+     *
+     * @param doc the document to process in place
+     * @return <code>true</code> if at least one content block was below the
+     *         threshold
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean anyDropped = false;
+
+        for (TextBlock candidate : doc.getTextBlocks()) {
+            // Blocks that are already non-content are left alone.
+            if (candidate.isContent() && getNumFullTextWords(candidate) < minWords) {
+                candidate.setIsContent(false);
+                anyDropped = true;
+            }
+        }
+
+        return anyDropped;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java
new file mode 100644
index 0000000..550252a
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java
@@ -0,0 +1,116 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Classifies {@link TextBlock}s as content/not-content through rules that have
+ * been determined using the C4.8 machine learning algorithm, as described in
+ * the paper "Boilerplate Detection using Shallow Text Features" (WSDM 2010),
+ * particularly using number of words per block and link density per block.
+ *
+ * @author Christian Kohlschütter
+ */
+public class NumWordsRulesClassifier implements BoilerpipeFilter {
+    public static final NumWordsRulesClassifier INSTANCE = new NumWordsRulesClassifier();
+
+    /**
+     * Returns the singleton instance for NumWordsRulesClassifier.
+     */
+    public static NumWordsRulesClassifier getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Classifies every block of the document, judging each block together
+     * with its direct predecessor and successor.
+     *
+     * @param doc the document whose text blocks are classified in place
+     * @return <code>true</code> if at least one call to
+     *         {@link #classify(TextBlock, TextBlock, TextBlock)} returned
+     *         <code>true</code>
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        final ListIterator<TextBlock> it = textBlocks.listIterator();
+        if (!it.hasNext()) {
+            return false;
+        }
+
+        // Slide a (previous, current, next) window over the list.
+        // TextBlock.EMPTY_START stands in for the missing neighbour at
+        // both ends of the list.
+        TextBlock prevBlock = TextBlock.EMPTY_START;
+        TextBlock currentBlock = it.next();
+        TextBlock nextBlock = it.hasNext() ? it.next() : TextBlock.EMPTY_START;
+
+        boolean hasChanges = classify(prevBlock, currentBlock, nextBlock);
+
+        if (nextBlock != TextBlock.EMPTY_START) {
+            while (it.hasNext()) {
+                prevBlock = currentBlock;
+                currentBlock = nextBlock;
+                nextBlock = it.next();
+                // "|=" does not short-circuit: classify() runs for every block.
+                hasChanges |= classify(prevBlock, currentBlock, nextBlock);
+            }
+            // The last block has no successor.
+            prevBlock = currentBlock;
+            currentBlock = nextBlock;
+            nextBlock = TextBlock.EMPTY_START;
+            hasChanges |= classify(prevBlock, currentBlock, nextBlock);
+        }
+
+        return hasChanges;
+    }
+
+    /**
+     * Applies the decision tree to one block. The nesting mirrors the tree
+     * and the thresholds are used exactly as given.
+     *
+     * @param prev the block before <code>curr</code> (or the placeholder)
+     * @param curr the block to classify; its content flag is updated
+     * @param next the block after <code>curr</code> (or the placeholder)
+     * @return the result of {@link TextBlock#setIsContent(boolean)} on
+     *         <code>curr</code>
+     */
+    protected boolean classify(final TextBlock prev, final TextBlock curr,
+            final TextBlock next) {
+        final boolean isContent;
+
+        if (curr.getLinkDensity() <= 0.333333) {
+            if (prev.getLinkDensity() <= 0.555556) {
+                if (curr.getNumWords() <= 16) {
+                    if (next.getNumWords() <= 15) {
+                        if (prev.getNumWords() <= 4) {
+                            isContent = false;
+                        } else {
+                            isContent = true;
+                        }
+                    } else {
+                        isContent = true;
+                    }
+                } else {
+                    isContent = true;
+                }
+            } else {
+                if (curr.getNumWords() <= 40) {
+                    if (next.getNumWords() <= 17) {
+                        isContent = false;
+                    } else {
+                        isContent = true;
+                    }
+                } else {
+                    isContent = true;
+                }
+            }
+        } else {
+            // Link-heavy blocks are never content.
+            isContent = false;
+        }
+
+        return curr.setIsContent(isContent);
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java b/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java
new file mode 100644
index 0000000..0c5c15c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java
@@ -0,0 +1,124 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.english;
+
+import java.util.Locale;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Finds blocks which are potentially indicating the end of an article text and
+ * marks them with {@link DefaultLabels#INDICATES_END_OF_TEXT}. This can be used
+ * in conjunction with a downstream {@link IgnoreBlocksAfterContentFilter}.
+ *
+ * @author Christian Kohlschütter
+ * @see IgnoreBlocksAfterContentFilter
+ */
+public class TerminatingBlocksFinder implements BoilerpipeFilter {
+    public static final TerminatingBlocksFinder INSTANCE = new TerminatingBlocksFinder();
+
+    /**
+     * Returns the singleton instance for TerminatingBlocksFinder.
+     */
+    public static TerminatingBlocksFinder getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Labels every block of fewer than 15 words whose text looks like the
+     * start of a comment or feedback section.
+     *
+     * @param doc the document to process in place
+     * @return <code>true</code> if at least one block was labelled
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (tb.getNumWords() >= 15) {
+                continue;
+            }
+            final String text = tb.getText().trim();
+
+            final boolean terminating;
+            if (text.length() >= 8) {
+                // Locale.ROOT keeps the lower-casing independent of the
+                // JVM's default locale.
+                terminating = isTerminatingPhrase(text.toLowerCase(Locale.ROOT));
+            } else {
+                // Short text: only a pure-link block reading "Comment".
+                terminating = tb.getLinkDensity() == 1.0
+                        && text.equals("Comment");
+            }
+
+            if (terminating) {
+                tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
+                // Report the change for the "Comment" case as well.
+                changes = true;
+            }
+        }
+
+        return changes;
+    }
+
+    /**
+     * Checks a lower-cased block text against the known end-of-article
+     * phrases. All phrases are lower case because the input is; non-ASCII
+     * characters are written as Unicode escapes so that the file encoding
+     * cannot corrupt them.
+     *
+     * @param textLC the trimmed, lower-cased block text
+     * @return <code>true</code> if one of the phrases matches
+     */
+    private static boolean isTerminatingPhrase(final String textLC) {
+        return textLC.startsWith("comments")
+                || startsWithNumber(textLC, textLC.length(), " comments",
+                        " users responded in")
+                || textLC.startsWith("\u00a9 reuters") // copyright sign
+                || textLC.startsWith("please rate this")
+                || textLC.startsWith("post a comment")
+                || textLC.contains("what you think...")
+                || textLC.contains("add your comment")
+                || textLC.contains("add comment")
+                || textLC.contains("reader views")
+                || textLC.contains("have your say")
+                || textLC.contains("reader comments")
+                || textLC.contains("r\u00e4tta artikeln") // a with diaeresis
+                || textLC.contains("r\u00e9agir") // e with acute accent
+                || textLC.contains("vos r\u00e9actions ")
+                || textLC
+                        .equals("thanks for your comments - this feedback is now closed");
+    }
+
+    /**
+     * Checks whether the given text t starts with a sequence of digits,
+     * followed by one of the given strings.
+     *
+     * @param t
+     *            The text to examine
+     * @param len
+     *            The length of the text to examine
+     * @param str
+     *            Any strings that may follow the digits.
+     * @return true if at least one combination matches
+     */
+    private static boolean startsWithNumber(final String t, final int len,
+            final String... str) {
+        int j = 0;
+        while (j < len && isDigit(t.charAt(j))) {
+            j++;
+        }
+        if (j != 0) {
+            for (String s : str) {
+                if (t.startsWith(s, j)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /** Returns true for the ASCII digits 0 to 9 only. */
+    private static boolean isDigit(final char c) {
+        return c >= '0' && c <= '9';
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/english/package.html b/src/main/java/de/l3s/boilerpipe/filters/english/package.html
new file mode 100644
index 0000000..ec624a9
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/english/package.html
@@ -0,0 +1,8 @@
+
+
+

The BoilerpipeFilters in this package have only been tested on + English text.

+

That is, they will probably work with other Western languages, + but may need some parameter tuning to perform well.

+
+
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java
new file mode 100644
index 0000000..0922cb1
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java
@@ -0,0 +1,84 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2011 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Adds the labels of the preceding block to the current block, optionally adding a prefix.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class AddPrecedingLabelsFilter implements BoilerpipeFilter {
+
+    public static final AddPrecedingLabelsFilter INSTANCE = new AddPrecedingLabelsFilter("");
+    public static final AddPrecedingLabelsFilter INSTANCE_PRE = new AddPrecedingLabelsFilter("^");
+
+    private final String labelPrefix;
+
+    /**
+     * Creates a new {@link AddPrecedingLabelsFilter} instance.
+     *
+     * @param labelPrefix the text put in front of every copied label
+     */
+    public AddPrecedingLabelsFilter(final String labelPrefix) {
+        this.labelPrefix = labelPrefix;
+    }
+
+    /**
+     * Copies the labels of each block to the block that follows it.
+     *
+     * @param doc the document to process in place
+     * @return <code>true</code> if at least one block had labels to copy;
+     *         <code>false</code> otherwise, including for documents with
+     *         fewer than two blocks
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        boolean changes = false;
+
+        // Walk bottom-up so that every block is read before it receives
+        // labels from its own predecessor; labels therefore travel exactly
+        // one block down. The loop runs until the first block has been read,
+        // so the second block receives the first block's labels too.
+        final ListIterator<TextBlock> it = textBlocks.listIterator(textBlocks.size());
+        TextBlock blockBelow = it.previous();
+        while (it.hasPrevious()) {
+            final TextBlock block = it.previous();
+
+            final Set<String> labels = block.getLabels();
+            if (labels != null && !labels.isEmpty()) {
+                for (String l : labels) {
+                    blockBelow.addLabel(labelPrefix + l);
+                }
+                changes = true;
+            }
+            blockBelow = block;
+        }
+
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java
new file mode 100644
index 0000000..10f9d70
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java
@@ -0,0 +1,43 @@
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks short blocks that match one of the patterns in PATTERNS_SHORT as
+ * content and labels them {@link DefaultLabels#ARTICLE_METADATA}.
+ */
+public class ArticleMetadataFilter implements BoilerpipeFilter {
+    // First pattern: digits and punctuation around an optional English
+    // month name, with an optional 2-3 letter upper-case suffix.
+    // Second pattern: text starting with "By " or "by ".
+    private static final Pattern[] PATTERNS_SHORT = new Pattern[] {
+            Pattern
+                    .compile("^[0-9 \\,\\./]*\\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\\b[0-9 \\,\\:apm\\./]*([CPSDMGET]{2,3})?$"),
+            Pattern.compile("^[Bb]y ")
+    };
+
+
+    public static final ArticleMetadataFilter INSTANCE = new ArticleMetadataFilter();
+
+    private ArticleMetadataFilter() {
+    }
+
+    /**
+     * Checks every block of at most 10 words against the patterns.
+     *
+     * @param doc the document to process in place
+     * @return <code>true</code> if at least one block matched a pattern
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changed = false;
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (tb.getNumWords() > 10) {
+                continue;
+            }
+            final String text = tb.getText();
+            for (Pattern p : PATTERNS_SHORT) {
+                if (p.matcher(text).find()) {
+                    changed = true;
+                    tb.setIsContent(true);
+                    tb.addLabel(DefaultLabels.ARTICLE_METADATA);
+                }
+            }
+        }
+        return changed;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java
new file mode 100644
index 0000000..510c47f
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java
@@ -0,0 +1,128 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit.
+ * This probably makes sense only in cases where an upstream filter already has removed some blocks.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class BlockProximityFusion implements BoilerpipeFilter {
+
+    private final int maxBlocksDistance;
+
+    public static final BlockProximityFusion MAX_DISTANCE_1 = new BlockProximityFusion(
+            1, false, false);
+    public static final BlockProximityFusion MAX_DISTANCE_1_SAME_TAGLEVEL = new BlockProximityFusion(
+            1, false, true);
+    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY = new BlockProximityFusion(
+            1, true, false);
+    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = new BlockProximityFusion(
+            1, true, true);
+
+    private final boolean contentOnly;
+
+    private final boolean sameTagLevelOnly;
+
+    /**
+     * Creates a new {@link BlockProximityFusion} instance.
+     *
+     * @param maxBlocksDistance The maximum distance in blocks.
+     * @param contentOnly if {@code true}, scanning starts after the first content block
+     *            and a block is only merged into a predecessor that is content as well
+     * @param sameTagLevelOnly if {@code true}, only blocks with equal tag level are merged
+     */
+    public BlockProximityFusion(final int maxBlocksDistance,
+            final boolean contentOnly, final boolean sameTagLevelOnly) {
+        this.maxBlocksDistance = maxBlocksDistance;
+        this.contentOnly = contentOnly;
+        this.sameTagLevelOnly = sameTagLevelOnly;
+    }
+
+    /**
+     * Merges every content block into the block before it when the gap between them
+     * (in original block offsets) is at most {@code maxBlocksDistance} and the
+     * configured restrictions hold. Merged blocks are removed from the document.
+     *
+     * @param doc the document whose block list is modified in place
+     * @return {@code true} if at least one pair of blocks was merged
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        boolean changes = false;
+        TextBlock prevBlock;
+
+        int offset;
+        if (contentOnly) {
+            // Start right after the first content block; without one there is nothing to fuse.
+            prevBlock = null;
+            offset = 0;
+            for (TextBlock tb : textBlocks) {
+                offset++;
+                if (tb.isContent()) {
+                    prevBlock = tb;
+                    break;
+                }
+            }
+            if (prevBlock == null) {
+                return false;
+            }
+        } else {
+            prevBlock = textBlocks.get(0);
+            offset = 1;
+        }
+
+        for (Iterator<TextBlock> it = textBlocks.listIterator(offset); it
+                .hasNext();) {
+            final TextBlock block = it.next();
+            if (!block.isContent()) {
+                prevBlock = block;
+                continue;
+            }
+            final int diffBlocks = block.getOffsetBlocksStart()
+                    - prevBlock.getOffsetBlocksEnd() - 1;
+            // "block" is known to be content at this point, so in content-only mode
+            // only the predecessor still has to be checked.
+            final boolean mergeable = diffBlocks <= maxBlocksDistance
+                    && (!contentOnly || prevBlock.isContent())
+                    && (!sameTagLevelOnly || prevBlock.getTagLevel() == block.getTagLevel());
+            if (mergeable) {
+                prevBlock.mergeNext(block);
+                it.remove();
+                changes = true;
+            } else {
+                prevBlock = block;
+            }
+        }
+
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java
new file mode 100644
index 0000000..e44fc0c
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java
@@ -0,0 +1,72 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+public final class ContentFusion implements BoilerpipeFilter {
+
+    public static final ContentFusion INSTANCE = new ContentFusion();
+
+    /**
+     * Creates a new {@link ContentFusion} instance.
+     *
+     */
+    public ContentFusion() {
+    }
+
+    /**
+     * Repeatedly merges a block into its predecessor when the predecessor is content,
+     * the block's link density is below 0.56 and the block is not labelled
+     * {@link DefaultLabels#STRICTLY_NOT_CONTENT}. Passes are repeated until one
+     * completes without a merge.
+     *
+     * @param doc the document whose block list is modified in place
+     * @return {@code true} if at least one merge took place
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        boolean anyChanges = false;
+        boolean changes;
+        do {
+            changes = false;
+            // Restart from the first block on every pass. Carrying the previous
+            // pass's last block over would let block 1 be merged into a block
+            // that is not its neighbour.
+            TextBlock prevBlock = textBlocks.get(0);
+            for (ListIterator<TextBlock> it = textBlocks.listIterator(1); it
+                    .hasNext();) {
+                final TextBlock block = it.next();
+
+                if (prevBlock.isContent()
+                        && block.getLinkDensity() < 0.56
+                        && !block.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT)) {
+
+                    prevBlock.mergeNext(block);
+                    it.remove();
+                    changes = true;
+                } else {
+                    prevBlock = block;
+                }
+            }
+            anyChanges |= changes;
+        } while (changes);
+
+        // Report what actually happened instead of an unconditional "true".
+        return anyChanges;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java
new file mode 100644
index 0000000..f3e4cda
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java
@@ -0,0 +1,173 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks {@link TextBlock}s which contain parts of the HTML
+ * {@code <TITLE>} tag, using some heuristics which are quite
+ * specific to the news domain.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class DocumentTitleMatchClassifier implements BoilerpipeFilter {
+
+    /** Separator regexes for which the longest title part is added as a candidate. */
+    private static final String[] LONGEST_PART_SEPARATORS = {
+            "[ ]*[\\|»|-][ ]*",
+            "[ ]*[\\|»|:][ ]*",
+            "[ ]*[\\|»|:\\(\\)][ ]*",
+            "[ ]*[\\|»|:\\(\\)\\-][ ]*",
+            "[ ]*[\\|»|,|:\\(\\)\\-][ ]*",
+            "[ ]*[\\|»|,|:\\(\\)\\-\u00a0][ ]*"
+    };
+
+    private static final Pattern PAT_REMOVE_CHARACTERS = Pattern.compile("[\\?\\!\\.\\-\\:]+");
+
+    /** Normalized title candidates, or {@code null} if no usable title was given. */
+    private final Set<String> potentialTitles;
+
+    /**
+     * Derives the set of title candidates from the document title.
+     *
+     * @param title the raw document title; {@code null} or blank disables the filter
+     */
+    public DocumentTitleMatchClassifier(String title) {
+        this.potentialTitles = buildPotentialTitles(title);
+    }
+
+    /**
+     * @return the title candidates matched against block text, or {@code null}
+     *         if the title was {@code null} or empty after normalization
+     */
+    public Set<String> getPotentialTitles() {
+        return potentialTitles;
+    }
+
+    /** Applies the same normalization to titles and block text so they compare equal. */
+    private static String normalize(String s) {
+        s = s.replace('\u00a0', ' ');
+        s = s.replace("'", "");
+        return s.trim().toLowerCase();
+    }
+
+    /** Builds the candidate set: the full title plus several separator-based parts of it. */
+    private static Set<String> buildPotentialTitles(String title) {
+        if (title == null) {
+            return null;
+        }
+        title = normalize(title);
+        if (title.length() == 0) {
+            return null;
+        }
+
+        final Set<String> titles = new HashSet<String>();
+        titles.add(title);
+
+        for (String separator : LONGEST_PART_SEPARATORS) {
+            final String p = getLongestPart(title, separator);
+            if (p != null) {
+                titles.add(p);
+            }
+        }
+
+        addPotentialTitles(titles, title, "[ ]+[\\|][ ]+", 4);
+        addPotentialTitles(titles, title, "[ ]+[\\-][ ]+", 4);
+
+        titles.add(title.replaceFirst(" - [^\\-]+$", ""));
+        titles.add(title.replaceFirst("^[^\\-]+ - ", ""));
+        return titles;
+    }
+
+    /** Adds every part of {@code title} (split by {@code pattern}) that has at least {@code minWords} words. */
+    private static void addPotentialTitles(final Set<String> potentialTitles, final String title, final String pattern, final int minWords) {
+        final String[] parts = title.split(pattern);
+        if (parts.length == 1) {
+            return;
+        }
+        for (String p : parts) {
+            if (p.contains(".com")) {
+                continue;
+            }
+            // NOTE(review): "\b" inside a Java string literal is the backspace character,
+            // not a regex word boundary, so this splits on runs of backspace/space - confirm intent.
+            final int numWords = p.split("[\b ]+").length;
+            if (numWords >= minWords) {
+                potentialTitles.add(p);
+            }
+        }
+    }
+
+    /**
+     * Returns the trimmed "longest" part of {@code title} split by {@code pattern}
+     * (more words than the current best, or more characters), skipping parts that
+     * contain ".com"; {@code null} if the title does not split or no part qualifies.
+     */
+    private static String getLongestPart(final String title, final String pattern) {
+        final String[] parts = title.split(pattern);
+        if (parts.length == 1) {
+            return null;
+        }
+        int longestNumWords = 0;
+        String longestPart = "";
+        for (String p : parts) {
+            if (p.contains(".com")) {
+                continue;
+            }
+            // NOTE(review): same "\b" (backspace) literal as in addPotentialTitles.
+            final int numWords = p.split("[\b ]+").length;
+            if (numWords > longestNumWords || p.length() > longestPart.length()) {
+                longestNumWords = numWords;
+                longestPart = p;
+            }
+        }
+        if (longestPart.length() == 0) {
+            return null;
+        } else {
+            return longestPart.trim();
+        }
+    }
+
+    /**
+     * Labels the first block whose normalized text (with or without the characters
+     * {@code ? ! . - :}) equals one of the title candidates as {@link DefaultLabels#TITLE}.
+     *
+     * @param doc the document whose text blocks are inspected
+     * @return {@code true} if a block was labelled
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        if (potentialTitles == null) {
+            return false;
+        }
+        boolean changes = false;
+
+        for (final TextBlock tb : doc.getTextBlocks()) {
+            String text = normalize(tb.getText());
+
+            if (potentialTitles.contains(text)) {
+                tb.addLabel(DefaultLabels.TITLE);
+                changes = true;
+                break;
+            }
+
+            text = PAT_REMOVE_CHARACTERS.matcher(text).replaceAll("").trim();
+            if (potentialTitles.contains(text)) {
+                tb.addLabel(DefaultLabels.TITLE);
+                changes = true;
+                break;
+            }
+        }
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java
b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java
new file mode 100644
index 0000000..7268a45
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java
@@ -0,0 +1,73 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks all {@link TextBlock}s "content" which are between the headline and the part that
+ * has already been marked content, if they are marked {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ *
+ * This filter is quite specific to the news domain.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ExpandTitleToContentFilter implements BoilerpipeFilter {
+    public static final ExpandTitleToContentFilter INSTANCE = new ExpandTitleToContentFilter();
+
+    /**
+     * Returns the singleton instance for ExpandTitleToContentFilter.
+     */
+    public static ExpandTitleToContentFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Finds the last {@link DefaultLabels#TITLE} block at or before the first content
+     * block and marks the {@link DefaultLabels#MIGHT_BE_CONTENT} blocks from that title
+     * (inclusive) up to the first content block (exclusive) as content.
+     *
+     * @param doc the document whose text blocks are updated in place
+     * @return {@code true} if {@code setIsContent(true)} reported a change for any block
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        int i = 0;
+        int title = -1;
+        int contentStart = -1;
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (tb.hasLabel(DefaultLabels.TITLE)) {
+                title = i;
+            }
+            if (tb.isContent()) {
+                // Nothing after the first content block can change the result.
+                contentStart = i;
+                break;
+            }
+
+            i++;
+        }
+
+        if (contentStart <= title || title == -1) {
+            return false;
+        }
+        boolean changes = false;
+        for (TextBlock tb : doc.getTextBlocks().subList(title, contentStart)) {
+            if (tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)) {
+                changes = tb.setIsContent(true) | changes;
+            }
+        }
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java
new file mode 100644
index 0000000..5d4cc31
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java
@@ -0,0 +1,124 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Keeps the largest {@link TextBlock} only (by the number of words). In case of
+ * more than one block with the same number of words, the first block is chosen.
+ * All discarded blocks are marked "not content" and flagged as
+ * {@link DefaultLabels#MIGHT_BE_CONTENT}.
+ *
+ * Note that, by default, only TextBlocks marked as "content" are taken into consideration.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class KeepLargestBlockFilter implements BoilerpipeFilter {
+    public static final KeepLargestBlockFilter INSTANCE = new KeepLargestBlockFilter(
+            false, 0);
+    public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL = new KeepLargestBlockFilter(
+            true, 0);
+    public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS = new KeepLargestBlockFilter(
+            true, 150);
+    private final boolean expandToSameLevelText;
+    private final int minWords;
+
+    /**
+     * @param expandToSameLevelText if {@code true}, neighbours of the largest block that
+     *            share its tag level are marked content again
+     * @param minWords minimum word count a neighbour needs to be re-marked as content
+     */
+    public KeepLargestBlockFilter(boolean expandToSameLevelText, final int minWords) {
+        this.expandToSameLevelText = expandToSameLevelText;
+        this.minWords = minWords;
+    }
+
+    /**
+     * @param doc the document whose text blocks are updated in place
+     * @return {@code false} for documents with fewer than two blocks, otherwise always
+     *         {@code true}
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        int maxNumWords = -1;
+        TextBlock largestBlock = null;
+
+        int level = -1;
+
+        int i = 0;
+        int n = -1;
+        for (TextBlock tb : textBlocks) {
+            if (tb.isContent()) {
+                final int nw = tb.getNumWords();
+
+                // Strict ">" keeps the first block among equally large ones.
+                if (nw > maxNumWords) {
+                    largestBlock = tb;
+                    maxNumWords = nw;
+
+                    n = i;
+
+                    if (expandToSameLevelText) {
+                        level = tb.getTagLevel();
+                    }
+                }
+            }
+            i++;
+        }
+        for (TextBlock tb : textBlocks) {
+            if (tb == largestBlock) {
+                tb.setIsContent(true);
+                tb.addLabel(DefaultLabels.VERY_LIKELY_CONTENT);
+            } else {
+                tb.setIsContent(false);
+                tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
+            }
+        }
+        if (expandToSameLevelText && n != -1) {
+            // Upwards from the block before the largest one ...
+            for (ListIterator<TextBlock> it = textBlocks.listIterator(n); it
+                    .hasPrevious();) {
+                if (!keepExpanding(it.previous(), level)) {
+                    break;
+                }
+            }
+            // ... and downwards, starting at the largest block itself.
+            for (ListIterator<TextBlock> it = textBlocks.listIterator(n); it
+                    .hasNext();) {
+                if (!keepExpanding(it.next(), level)) {
+                    break;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Marks {@code tb} as content if it is on {@code level} and has at least
+     * {@code minWords} words.
+     *
+     * @return {@code false} if {@code tb} lies above {@code level} (smaller tag level),
+     *         which ends the expansion in that direction
+     */
+    private boolean keepExpanding(final TextBlock tb, final int level) {
+        final int tl = tb.getTagLevel();
+        if (tl < level) {
+            return false;
+        }
+        if (tl == level && tb.getNumWords() >= minWords) {
+            tb.setIsContent(true);
+        }
+        return true;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java
new file mode 100644
index 0000000..0ec3836
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java
@@ -0,0 +1,91 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Fuses adjacent blocks if their markup labels (labels starting with
+ * {@link DefaultLabels#MARKUP_PREFIX}) are equal; all other labels are ignored
+ * in the comparison.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LabelFusion implements BoilerpipeFilter {
+
+    public static final LabelFusion INSTANCE = new LabelFusion();
+
+    /**
+     * Creates a new {@link LabelFusion} instance.
+     */
+    private LabelFusion() {
+    }
+
+    /**
+     * @param doc the document whose block list is modified in place
+     * @return {@code true} if at least one pair of blocks was merged
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        boolean changes = false;
+        TextBlock prevBlock = textBlocks.get(0);
+
+        for (Iterator<TextBlock> it = textBlocks.listIterator(1); it
+                .hasNext();) {
+            final TextBlock block = it.next();
+
+            if (equalLabels(prevBlock.getLabels(), block.getLabels())) {
+                prevBlock.mergeNext(block);
+                it.remove();
+                changes = true;
+            } else {
+                prevBlock = block;
+            }
+        }
+
+        return changes;
+    }
+
+    /** Two label sets are equal if both are non-null and contain the same markup labels. */
+    private boolean equalLabels(Set<String> labels, Set<String> labels2) {
+        if (labels == null || labels2 == null) {
+            return false;
+        }
+        return markupLabelsOnly(labels).equals(markupLabelsOnly(labels2));
+    }
+
+    /** Returns a new set holding only the labels that start with the markup prefix. */
+    private Set<String> markupLabelsOnly(final Set<String> set1) {
+        final Set<String> set = new HashSet<String>();
+        for (String label : set1) {
+            if (label.startsWith(DefaultLabels.MARKUP_PREFIX)) {
+                set.add(label);
+            }
+        }
+        return set;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java
new file mode 100644
index
0000000..966e583 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java @@ -0,0 +1,70 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.filters.heuristics; + +import de.l3s.boilerpipe.BoilerpipeFilter; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.labels.DefaultLabels; + +/** + * Marks all blocks as content that: + *
+ * <ol>
+ * <li>are on the same tag-level as very likely main content (usually the level of the largest block)</li>
+ * <li>have a significant number of words, currently: at least 100</li>
+ * </ol>
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LargeBlockSameTagLevelToContentFilter implements BoilerpipeFilter {
+    public static final LargeBlockSameTagLevelToContentFilter INSTANCE = new LargeBlockSameTagLevelToContentFilter();
+
+    private LargeBlockSameTagLevelToContentFilter() {
+    }
+
+    /**
+     * Promotes non-content blocks with at least 100 words to content when they share
+     * the tag level of the first content block labelled
+     * {@link DefaultLabels#VERY_LIKELY_CONTENT}.
+     *
+     * @param doc the document whose text blocks are updated in place
+     * @return {@code true} if at least one block was promoted
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+
+        final int referenceLevel = findReferenceTagLevel(doc);
+        if (referenceLevel == -1) {
+            return false;
+        }
+
+        boolean promoted = false;
+        for (TextBlock candidate : doc.getTextBlocks()) {
+            if (candidate.isContent()) {
+                continue;
+            }
+            if (candidate.getNumWords() >= 100 && candidate.getTagLevel() == referenceLevel) {
+                candidate.setIsContent(true);
+                promoted = true;
+            }
+        }
+        return promoted;
+    }
+
+    /** Tag level of the first very-likely-content block, or -1 if there is none. */
+    private static int findReferenceTagLevel(final TextDocument doc) {
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (tb.isContent() && tb.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
+                return tb.getTagLevel();
+            }
+        }
+        return -1;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java
new file mode 100644
index 0000000..dfaae1b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/ListAtEndFilter.java
@@ -0,0 +1,64 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks nested list-item blocks after the end of the main content.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ListAtEndFilter implements BoilerpipeFilter {
+    public static final ListAtEndFilter INSTANCE = new ListAtEndFilter();
+
+    private ListAtEndFilter() {
+    }
+
+    /**
+     * After each content block labelled {@link DefaultLabels#VERY_LIKELY_CONTENT},
+     * promotes the directly following run of blocks that are nested deeper, are
+     * labelled {@link DefaultLabels#MIGHT_BE_CONTENT} and {@link DefaultLabels#LI},
+     * and have a link density of zero. The run ends at the first block that does
+     * not qualify.
+     *
+     * @param doc the document whose text blocks are updated in place
+     * @return {@code true} if at least one block was promoted to content
+     */
+    public boolean process(final TextDocument doc)
+            throws BoilerpipeProcessingException {
+
+        boolean promoted = false;
+
+        // MAX_VALUE means "no anchor": no block can be nested deeper than that.
+        int anchorLevel = Integer.MAX_VALUE;
+        for (TextBlock current : doc.getTextBlocks()) {
+            if (current.isContent()
+                    && current.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
+                anchorLevel = current.getTagLevel();
+                continue;
+            }
+
+            final boolean nestedListItem = current.getTagLevel() > anchorLevel
+                    && current.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)
+                    && current.hasLabel(DefaultLabels.LI)
+                    && current.getLinkDensity() == 0;
+            if (!nestedListItem) {
+                anchorLevel = Integer.MAX_VALUE;
+                continue;
+            }
+
+            current.setIsContent(true);
+            promoted = true;
+        }
+
+        return promoted;
+
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java
new file mode 100644
index 0000000..e1fc17b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java
@@ -0,0 +1,70 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Merges two subsequent blocks if their text densities are equal.
+ *
+ * @author Christian Kohlschütter
+ */
+public class SimpleBlockFusionProcessor implements BoilerpipeFilter {
+    public static final SimpleBlockFusionProcessor INSTANCE = new SimpleBlockFusionProcessor();
+
+    /**
+     * Returns the singleton instance for SimpleBlockFusionProcessor.
+     */
+    public static SimpleBlockFusionProcessor getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Merges each block into its predecessor when both report exactly the same
+     * text density; merged blocks are removed from the document.
+     *
+     * @param doc the document whose block list is modified in place
+     * @return {@code true} if at least one pair of blocks was merged
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        boolean changes = false;
+
+        if (textBlocks.size() < 2) {
+            return false;
+        }
+
+        TextBlock b1 = textBlocks.get(0);
+        for (Iterator<TextBlock> it = textBlocks.listIterator(1); it.hasNext();) {
+            final TextBlock b2 = it.next();
+
+            // Exact comparison is intentional here: "similar" means identical density.
+            final boolean similar = (b1.getTextDensity() == b2.getTextDensity());
+
+            if (similar) {
+                b1.mergeNext(b2);
+                it.remove();
+                changes = true;
+            } else {
+                b1 = b2;
+            }
+        }
+
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java
new file mode 100644
index 0000000..8a5b18d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/TrailingHeadlineToBoilerplateFilter.java
@@ -0,0 +1,66 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.heuristics;
+
+import java.util.List;
+import java.util.ListIterator;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Marks trailing headlines ({@link TextBlock}s that have the label {@link DefaultLabels#HEADING})
+ * as boilerplate. Trailing means they are marked content and are below any other content block.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class TrailingHeadlineToBoilerplateFilter implements BoilerpipeFilter {
+    public static final TrailingHeadlineToBoilerplateFilter INSTANCE = new TrailingHeadlineToBoilerplateFilter();
+
+    /**
+     * Returns the singleton instance for TrailingHeadlineToBoilerplateFilter.
+     */
+    public static TrailingHeadlineToBoilerplateFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Walks the blocks from the end of the document and un-marks content blocks
+     * labelled {@link DefaultLabels#HEADING} until the first content block without
+     * that label is reached. Non-content blocks are skipped.
+     *
+     * @param doc the document whose text blocks are updated in place
+     * @return {@code true} if at least one heading was marked as boilerplate
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        boolean changes = false;
+
+        final List<TextBlock> list = doc.getTextBlocks();
+
+        for (ListIterator<TextBlock> it = list.listIterator(list.size()); it.hasPrevious(); ) {
+            final TextBlock tb = it.previous();
+            if (tb.isContent()) {
+                if (tb.hasLabel(DefaultLabels.HEADING)) {
+                    tb.setIsContent(false);
+                    changes = true;
+                } else {
+                    // First real content block from the bottom: everything above stays untouched.
+                    break;
+                }
+            }
+        }
+
+        return changes;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html b/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html
new file mode 100644
index 0000000..a368224
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/heuristics/package.html
@@ -0,0 +1,5 @@
+
+
+

The BoilerpipeFilters in this package are pure heuristics.

+
+
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java
new file mode 100644
index 0000000..aff85a6
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java
@@ -0,0 +1,71 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Removes {@link TextBlock}s which have explicitly been marked as
+ * "not content".
+ *
+ * @author Christian Kohlschütter
+ */
+public final class BoilerplateBlockFilter implements BoilerpipeFilter {
+    public static final BoilerplateBlockFilter INSTANCE = new BoilerplateBlockFilter(
+            null);
+    public static final BoilerplateBlockFilter INSTANCE_KEEP_TITLE = new BoilerplateBlockFilter(
+            DefaultLabels.TITLE);
+    private final String labelToKeep;
+
+    /**
+     * Returns the singleton instance for BoilerplateBlockFilter.
+     */
+    public static BoilerplateBlockFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * @param labelToKeep label that protects a non-content block from removal, or
+     *            {@code null} to remove every non-content block
+     */
+    public BoilerplateBlockFilter(final String labelToKeep) {
+        this.labelToKeep = labelToKeep;
+    }
+
+    /**
+     * Removes all non-content blocks from the document, except those carrying
+     * {@code labelToKeep} (if one was configured).
+     *
+     * @param doc the document whose block list is modified in place
+     * @return {@code true} if at least one block was removed
+     */
+    public boolean process(TextDocument doc)
+            throws BoilerpipeProcessingException {
+        final List<TextBlock> textBlocks = doc.getTextBlocks();
+        boolean hasChanges = false;
+
+        for (Iterator<TextBlock> it = textBlocks.iterator(); it.hasNext();) {
+            final TextBlock tb = it.next();
+            // Honour the configured label; the check used to be hard-wired to
+            // DefaultLabels.TITLE regardless of what was passed to the constructor.
+            if (!tb.isContent()
+                    && (labelToKeep == null || !tb.hasLabel(labelToKeep))) {
+                it.remove();
+                hasChanges = true;
+            }
+        }
+
+        return hasChanges;
+    }
+
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java
new file mode 100644
index 0000000..a464dbf
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/InvertedFilter.java
@@ -0,0 +1,51 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Inverts the "isContent" flag for all {@link TextBlock}s.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class InvertedFilter implements BoilerpipeFilter {
+    public static final InvertedFilter INSTANCE = new InvertedFilter();
+
+    private InvertedFilter() {
+    }
+
+    /**
+     * Flips the content flag of every block.
+     *
+     * @return {@code false} if the document has no blocks, {@code true} otherwise
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final List<TextBlock> tbs = doc.getTextBlocks();
+        for (TextBlock tb : tbs) {
+            tb.setIsContent(!tb.isContent());
+        }
+        // Without blocks nothing was inverted, so no change is reported.
+        return !tbs.isEmpty();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java
new file mode 100644
index 0000000..3178f0b
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java
@@ -0,0 +1,59 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.DefaultLabels;
+
+/**
+ * Demotes content blocks that carry at least one of the given labels to "boilerplate".
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LabelToBoilerplateFilter implements BoilerpipeFilter {
+    public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT = new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT);
+
+    private String[] labels;
+
+    public LabelToBoilerplateFilter(final String... label) {
+        this.labels = label;
+    }
+
+    /**
+     * @return {@code true} if at least one content block was demoted
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (!block.isContent()) {
+                continue;
+            }
+            for (String candidate : labels) {
+                if (block.hasLabel(candidate)) {
+                    block.setIsContent(false);
+                    changes = true;
+                    break; // one matching label is enough
+                }
+            }
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java
new file mode 100644
index 0000000..e4bf856
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java
@@ -0,0 +1,56 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Promotes boilerplate blocks that carry at least one of the given labels to "content".
+ *
+ * @author Christian Kohlschütter
+ */
+public final class LabelToContentFilter implements BoilerpipeFilter {
+    private String[] labels;
+
+    public LabelToContentFilter(final String... label) {
+        this.labels = label;
+    }
+
+    /**
+     * @return {@code true} if at least one boilerplate block was promoted
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent()) {
+                continue;
+            }
+            for (String candidate : labels) {
+                if (block.hasLabel(candidate)) {
+                    block.setIsContent(true);
+                    changes = true;
+                    break; // one matching label is enough
+                }
+            }
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java
new file mode 100644
index 0000000..e888334
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Demotes every block of a document to boilerplate.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MarkEverythingBoilerplateFilter implements BoilerpipeFilter {
+    public static final MarkEverythingBoilerplateFilter INSTANCE = new MarkEverythingBoilerplateFilter();
+
+    private MarkEverythingBoilerplateFilter() {
+    }
+
+    /**
+     * @return {@code true} if at least one block was content before the call
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (!block.isContent()) {
+                continue;
+            }
+            block.setIsContent(false);
+            changes = true;
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java
new file mode 100644
index 0000000..8a8b7be
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java
@@ -0,0 +1,50 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Promotes every block of a document to content.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MarkEverythingContentFilter implements BoilerpipeFilter {
+    public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter();
+
+    private MarkEverythingContentFilter() {
+    }
+
+    /**
+     * @return {@code true} if at least one block was boilerplate before the call
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent()) {
+                continue;
+            }
+            block.setIsContent(true);
+            changes = true;
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java
new file mode 100644
index 0000000..d326059
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java
@@ -0,0 +1,113 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Keeps only blocks that have at least one segment fragment ("clause") with at
+ * least k words (default: 5).
+ *
+ * NOTE: You might consider using the {@link SplitParagraphBlocksFilter}
+ * upstream.
+ *
+ * @author Christian Kohlschütter
+ * @see SplitParagraphBlocksFilter
+ */
+public final class MinClauseWordsFilter implements BoilerpipeFilter {
+    /** Requires five words per clause and ignores text without a closing delimiter. */
+    public static final MinClauseWordsFilter INSTANCE = new MinClauseWordsFilter(
+            5, false);
+
+    /** A letter or digit, then punctuation, then whitespace or the end of the text. */
+    private static final Pattern PAT_CLAUSE_DELIMITER = Pattern
+            .compile("[\\p{L}\\d][\\,\\.\\:\\;\\!\\?]+([ \\n\\r]+|$)");
+    private static final Pattern PAT_WHITESPACE = Pattern.compile("[ \\n\\r]+");
+
+    private final int minWords;
+    private final boolean acceptClausesWithoutDelimiter;
+
+    /** Creates a filter that ignores text without a closing delimiter. */
+    public MinClauseWordsFilter(final int minWords) {
+        this(minWords, false);
+    }
+
+    /**
+     * @param minWords
+     *            minimum number of words a clause must have
+     * @param acceptClausesWithoutDelimiter
+     *            whether the text after the last delimiter may count as a clause
+     */
+    public MinClauseWordsFilter(final int minWords,
+            final boolean acceptClausesWithoutDelimiter) {
+        this.minWords = minWords;
+        this.acceptClausesWithoutDelimiter = acceptClausesWithoutDelimiter;
+    }
+
+    /**
+     * Demotes every content block that has no clause of at least {@code minWords} words.
+     *
+     * @return {@code true} if at least one block was demoted
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock tb : doc.getTextBlocks()) {
+            if (tb.isContent() && !hasClause(tb.getText())) {
+                tb.setIsContent(false);
+                changes = true;
+            }
+        }
+        return changes;
+    }
+
+    /** Tells whether the text contains at least one sufficiently long clause. */
+    private boolean hasClause(final String text) {
+        final Matcher m = PAT_CLAUSE_DELIMITER.matcher(text);
+        int start = 0;
+        while (m.find()) {
+            // A clause ends with the letter or digit in front of the punctuation.
+            if (isClause(text.subSequence(start, m.start() + 1))) {
+                return true;
+            }
+            start = m.end();
+        }
+        // since clauses should *always end* with a delimiter, we normally
+        // don't consider text without one
+        return acceptClausesWithoutDelimiter
+                && isClause(text.subSequence(start, text.length()));
+    }
+
+    /** Estimates the word count as one plus the number of whitespace runs. */
+    private boolean isClause(final CharSequence text) {
+        final Matcher m = PAT_WHITESPACE.matcher(text);
+        int n = 1;
+        while (m.find()) {
+            n++;
+            if (n >= minWords) {
+                return true;
+            }
+        }
+        return n >= minWords;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java
new file mode 100644
index 0000000..a3a49c4
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java
@@ -0,0 +1,56 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Keeps only those content blocks which contain at least k words.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class MinWordsFilter implements BoilerpipeFilter {
+    private final int minWords;
+
+    /**
+     * @param minWords minimum word count a content block must reach to stay content
+     */
+    public MinWordsFilter(final int minWords) {
+        this.minWords = minWords;
+    }
+
+    /**
+     * Demotes every content block with fewer than {@code minWords} words.
+     *
+     * @return {@code true} if at least one block was demoted
+     */
+    public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        for (TextBlock block : doc.getTextBlocks()) {
+            if (block.isContent() && block.getNumWords() < minWords) {
+                block.setIsContent(false);
+                changes = true;
+            }
+        }
+
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java
new file mode 100644
index 0000000..86fae33
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java
@@ -0,0 +1,82 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+
+/**
+ * Splits TextBlocks at paragraph boundaries.
+ *
+ * NOTE: This is not fully supported (i.e., it will break highlighting support
+ * via #getContainedTextElements()), but this one probably is necessary for some other
+ * filters.
+ *
+ * @author Christian Kohlschütter
+ * @see MinClauseWordsFilter
+ */
+public final class SplitParagraphBlocksFilter implements BoilerpipeFilter {
+    public static final SplitParagraphBlocksFilter INSTANCE = new SplitParagraphBlocksFilter();
+
+    /**
+     * Returns the singleton instance for SplitParagraphBlocksFilter.
+     */
+    public static SplitParagraphBlocksFilter getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Replaces every block whose text spans several lines by one new block per line;
+     * each new block receives the content flag and the labels of the block it came from.
+     *
+     * @return {@code true} if at least one block was split
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        boolean changes = false;
+        final List<TextBlock> blocks = doc.getTextBlocks();
+        final List<TextBlock> blocksNew = new ArrayList<TextBlock>(blocks.size());
+        for (TextBlock tb : blocks) {
+            final String[] paragraphs = tb.getText().split("[\n\r]+");
+            if (paragraphs.length < 2) {
+                blocksNew.add(tb);
+                continue;
+            }
+            final boolean isContent = tb.isContent();
+            final Set<String> labels = tb.getLabels();
+            for (String p : paragraphs) {
+                final TextBlock tbP = new TextBlock(p);
+                tbP.setIsContent(isContent);
+                tbP.addLabels(labels);
+                blocksNew.add(tbP);
+            }
+            changes = true;
+        }
+        if (changes) {
+            // Replace the contents of the list that was obtained from the document.
+            blocks.clear();
+            blocks.addAll(blocksNew);
+        }
+        return changes;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java b/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java
new file mode 100644
index 0000000..28cf002
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java
@@ -0,0 +1,54 @@
+package de.l3s.boilerpipe.filters.simple;
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.l3s.boilerpipe.BoilerpipeFilter;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.conditions.TextBlockCondition;
+import de.l3s.boilerpipe.document.TextBlock;
+import
de.l3s.boilerpipe.document.TextDocument;
+
+public class SurroundingToContentFilter implements BoilerpipeFilter {
+    public static final SurroundingToContentFilter INSTANCE_TEXT = new SurroundingToContentFilter(new TextBlockCondition() {
+        public boolean meetsCondition(TextBlock tb) {
+            return tb.getLinkDensity() == 0 && tb.getNumWords() > 6;
+        }
+    });
+
+    private final TextBlockCondition cond;
+
+    /** @param cond condition that a boilerplate block between two content blocks must meet */
+    public SurroundingToContentFilter(final TextBlockCondition cond) {
+        this.cond = cond;
+    }
+
+    /**
+     * Marks a boilerplate block as content if it meets the condition and both its
+     * predecessor and its successor are content.
+     *
+     * @return {@code true} if at least one block was promoted
+     */
+    public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
+        final List<TextBlock> tbs = doc.getTextBlocks();
+        if (tbs.size() < 3) {
+            return false;
+        }
+        boolean hasChanges = false;
+        final Iterator<TextBlock> it = tbs.iterator();
+        TextBlock a = it.next();
+        TextBlock b = it.next();
+        while (it.hasNext()) {
+            final TextBlock c = it.next();
+            if (!b.isContent() && a.isContent() && c.isContent() && cond.meetsCondition(b)) {
+                b.setIsContent(true);
+                hasChanges = true;
+            }
+            // Slide the window by a single block; advancing by two would never
+            // examine the blocks at even positions.
+            a = b;
+            b = c;
+        }
+        return hasChanges;
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/filters/simple/package.html b/src/main/java/de/l3s/boilerpipe/filters/simple/package.html
new file mode 100644
index 0000000..bc7a25d
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/filters/simple/package.html
@@ -0,0 +1,6 @@
+
+
+

The BoilerpipeFilters in this package are straight-forward and + probably not really specific to English.

+
+
diff --git a/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java b/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java
new file mode 100644
index 0000000..220e8df
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/ConditionalLabelAction.java
@@ -0,0 +1,53 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import de.l3s.boilerpipe.conditions.TextBlockCondition;
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Adds labels to a {@link TextBlock} if the given criteria are met.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class ConditionalLabelAction extends LabelAction {
+
+    private final TextBlockCondition condition;
+
+    /**
+     * @param condition
+     *            decides, per block, whether the labels are added
+     * @param labels
+     *            the labels to add
+     */
+    public ConditionalLabelAction(TextBlockCondition condition,
+            String... labels) {
+        super(labels);
+        this.condition = condition;
+    }
+
+    /**
+     * Adds the labels to the block, unless the block fails the condition.
+     */
+    public void addTo(final TextBlock tb) {
+        if (!condition.meetsCondition(tb)) {
+            return;
+        }
+        addLabelsTo(tb);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java b/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java
new file mode 100644
index 0000000..3c56533
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/DefaultLabels.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Some pre-defined labels which can be used in conjunction with
+ * {@link TextBlock#addLabel(String)} and {@link TextBlock#hasLabel(String)}.
+ *
+ * @author Christian Kohlschütter
+ */
+public final class DefaultLabels {
+    public static final String TITLE = "de.l3s.boilerpipe/TITLE";
+    public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
+    public static final String INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT";
+    public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
+    public static final String VERY_LIKELY_CONTENT = "de.l3s.boilerpipe/VERY_LIKELY_CONTENT";
+    public static final String STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT";
+    public static final String HR = "de.l3s.boilerpipe/HR";
+    public static final String LI = "de.l3s.boilerpipe/LI";
+    // Heading labels; presumably HEADING is the generic one and H1-H3 name the level (TODO confirm).
+    public static final String HEADING = "de.l3s.boilerpipe/HEADING";
+    public static final String H1 = "de.l3s.boilerpipe/H1";
+    public static final String H2 = "de.l3s.boilerpipe/H2";
+    public static final String H3 = "de.l3s.boilerpipe/H3";
+    // NOTE(review): looks like a prefix for labels derived from markup tags; confirm against its users.
+    public static final String MARKUP_PREFIX = "<";
+
+    private DefaultLabels() {
+        // not to be instantiated
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java b/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java
new file mode 100644
index 0000000..b725f2e
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/labels/LabelAction.java
@@ -0,0 +1,48 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009, 2010 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.labels;
+
+import java.util.Arrays;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+/**
+ * Adds a fixed set of labels to {@link TextBlock}s.
+ *
+ * @author Christian Kohlschütter
+ * @see ConditionalLabelAction
+ */
+public class LabelAction {
+    protected final String[] labels;
+
+    public LabelAction(String... labels) {
+        this.labels = labels;
+    }
+
+    public void addTo(final TextBlock tb) {
+        addLabelsTo(tb);
+    }
+
+    protected final void addLabelsTo(final TextBlock tb) {
+        tb.addLabels(labels);
+    }
+
+    public String toString() {
+        return new StringBuilder(super.toString()).append("{").append(Arrays.asList(labels)).append("}").toString();
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/package.html b/src/main/java/de/l3s/boilerpipe/package.html
new file mode 100644
index 0000000..81c88d6
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/package.html
@@ -0,0 +1,5 @@
+
+
+

The Boilerpipe top-level package.

+
+
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
new file mode 100644
index 0000000..f8cd767
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
@@ -0,0 +1,472 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.l3s.boilerpipe.sax;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.labels.LabelAction;
+import de.l3s.boilerpipe.util.UnicodeTokenizer;
+
+/**
+ * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can
+ * be used by different parser implementations, e.g. NekoHTML and TagSoup.
+ *
+ * @author Christian Kohlschütter
+ */
+public class BoilerpipeHTMLContentHandler implements ContentHandler {
+
+    private final Map<String, TagAction> tagActions;
+    private String title = null;
+
+    static final String ANCHOR_TEXT_START = "$\ue00a<";
+    static final String ANCHOR_TEXT_END = ">\ue00a$";
+
+    StringBuilder tokenBuffer = new StringBuilder();
+    StringBuilder textBuffer = new StringBuilder();
+
+    int inBody = 0;
+    int inAnchor = 0;
+    int inIgnorableElement = 0;
+
+    int tagLevel = 0;
+    int blockTagLevel = -1;
+
+    boolean sbLastWasWhitespace = false;
+    private int textElementIdx = 0;
+
+    private final List<TextBlock> textBlocks = new ArrayList<TextBlock>();
+
+    private String lastStartTag = null;
+    @SuppressWarnings("unused")
+    private String lastEndTag = null;
+    @SuppressWarnings("unused")
+    private Event lastEvent = null;
+
+    private int offsetBlocks = 0;
+    private BitSet currentContainedTextElements = new BitSet();
+
+    private boolean flush = false;
+    boolean inAnchorText = false;
+
+    LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<LabelAction>>();
+    LinkedList<Integer> fontSizeStack = new LinkedList<Integer>();
+
+    /**
+     * Recycles this instance: clears all state collected from the previous
+     * document, so that the handler can be used for another parse.
+     */
+    public void recycle() {
+        tokenBuffer.setLength(0);
+        textBuffer.setLength(0);
+
+        inBody = 0;
+        inAnchor = 0;
+        inIgnorableElement = 0;
+        sbLastWasWhitespace = false;
+        textElementIdx = 0;
+
+        textBlocks.clear();
+
+        lastStartTag = null;
+        lastEndTag = null;
+        lastEvent = null;
+
+        offsetBlocks = 0;
+        currentContainedTextElements.clear();
+
+        flush = false;
+        inAnchorText = false;
+
+        // Also drop what an aborted or title-less parse would otherwise leak
+        // into the next document.
+        title = null;
+        tagLevel = 0;
+        blockTagLevel = -1;
+        labelStacks.clear();
+        fontSizeStack.clear();
+    }
+
+    /**
+     * Constructs a {@link BoilerpipeHTMLContentHandler} using the
+     * {@link DefaultTagActionMap}.
+     */
+    public BoilerpipeHTMLContentHandler() {
+        this(DefaultTagActionMap.INSTANCE);
+    }
+
+    /**
+     * Constructs a {@link BoilerpipeHTMLContentHandler} using the given
+     * {@link TagActionMap}.
+     *
+     * @param tagActions
+     *            The {@link TagActionMap} to use, e.g.
+     *            {@link DefaultTagActionMap}.
+     */
+    public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) {
+        this.tagActions = tagActions;
+    }
+
+    // @Override
+    public void endDocument() throws SAXException {
+        flushBlock();
+    }
+
+    // @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+    }
+
+    // @Override
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        if (!sbLastWasWhitespace) {
+            textBuffer.append(' ');
+            tokenBuffer.append(' ');
+        }
+        sbLastWasWhitespace = true;
+    }
+
+    // @Override
+    public void processingInstruction(String target, String data)
+            throws SAXException {
+    }
+
+    // @Override
+    public void setDocumentLocator(Locator locator) {
+    }
+
+    // @Override
+    public void skippedEntity(String name) throws SAXException {
+    }
+
+    // @Override
+    public void startDocument() throws SAXException {
+    }
+
+    // @Override
+    public void startPrefixMapping(String prefix, String uri)
+            throws SAXException {
+    }
+
+    // @Override
+    public void startElement(String uri, String localName, String qName,
+            Attributes atts) throws SAXException {
+        labelStacks.add(null);
+
+        TagAction ta = tagActions.get(localName);
+        if (ta != null) {
+            if (ta.changesTagLevel()) {
+                tagLevel++;
+            }
+            flush = ta.start(this, localName, qName, atts) | flush;
+        } else {
+            tagLevel++;
+            flush = true;
+        }
+
+        lastEvent = Event.START_TAG;
+        lastStartTag = localName;
+    }
+
+    // @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        TagAction ta = tagActions.get(localName);
+        if (ta != null) {
+            flush = ta.end(this, localName, qName) | flush;
+        } else {
+            flush = true;
+        }
+
+        if (ta == null || ta.changesTagLevel()) {
+            tagLevel--;
+        }
+
+        if (flush) {
+            flushBlock();
+        }
+
+        lastEvent = Event.END_TAG;
+        lastEndTag = localName;
+
+        labelStacks.removeLast();
+    }
+
+    // @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        textElementIdx++;
+
+
+        if (flush) {
+            flushBlock();
+            flush = false;
+        }
+
+        if (inIgnorableElement != 0) {
+            return;
+        }
+
+        char c;
+        boolean startWhitespace = false;
+        boolean endWhitespace = false;
+        if (length == 0) {
+            return;
+        }
+
+        final int end = start + length;
+        for (int i = start; i < end; i++) {
+            if (Character.isWhitespace(ch[i])) {
+                ch[i] = ' '; // normalizes whitespace in place, i.e. in the array that was passed in
+            }
+        }
+        while (start < end) {
+            c = ch[start];
+            if (c == ' ') {
+                startWhitespace = true;
+                start++;
+                length--;
+            } else {
+                break;
+            }
+        }
+        while (length > 0) {
+            c = ch[start + length - 1];
+            if (c == ' ') {
+                endWhitespace = true;
+                length--;
+            } else {
+                break;
+            }
+        }
+        if (length == 0) {
+            if (startWhitespace || endWhitespace) {
+                if (!sbLastWasWhitespace) {
+                    textBuffer.append(' ');
+                    tokenBuffer.append(' ');
+                }
+                sbLastWasWhitespace = true;
+            } else {
+                sbLastWasWhitespace = false;
+            }
+            lastEvent = Event.WHITESPACE;
+            return;
+        }
+        if (startWhitespace) {
+            if (!sbLastWasWhitespace) {
+                textBuffer.append(' ');
+                tokenBuffer.append(' ');
+            }
+        }
+
+        if (blockTagLevel == -1) {
+            blockTagLevel = tagLevel;
+        }
+
+        textBuffer.append(ch, start, length);
+        tokenBuffer.append(ch, start, length);
+        if (endWhitespace) {
+            textBuffer.append(' ');
+            tokenBuffer.append(' ');
+        }
+
+        sbLastWasWhitespace = endWhitespace;
+        lastEvent = Event.CHARACTERS;
+
+        currentContainedTextElements.set(textElementIdx);
+    }
+
+    List<TextBlock> getTextBlocks() {
+        return textBlocks;
+    }
+
+    /**
+     * Turns the text buffered so far into a {@link TextBlock} and clears the
+     * buffers. Outside of the body, the buffered text is only used to set the
+     * title (if the last start tag was TITLE) and then discarded.
+     */
+    public void flushBlock() {
+        if (inBody == 0) {
+            if ("TITLE".equalsIgnoreCase(lastStartTag)) {
+                setTitle(tokenBuffer.toString().trim());
+            }
+            textBuffer.setLength(0);
+            tokenBuffer.setLength(0);
+            return;
+        }
+
+        final int length = tokenBuffer.length();
+        switch (length) {
+        case 0:
+            return;
+        case 1:
+            if (sbLastWasWhitespace) {
+                textBuffer.setLength(0);
+                tokenBuffer.setLength(0);
+                return;
+            }
+        }
+        final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer);
+
+        int numWords = 0;
+        int numLinkedWords = 0;
+        int numWrappedLines = 0;
+        int currentLineLength = -1; // don't count the first space
+        final int maxLineLength = 80;
+        int numTokens = 0;
+        int numWordsCurrentLine = 0;
+
+        for (String token : tokens) {
+            if (ANCHOR_TEXT_START.equals(token)) {
+                inAnchorText = true;
+            } else if (ANCHOR_TEXT_END.equals(token)) {
+                inAnchorText = false;
+            } else if (isWord(token)) {
+                numTokens++;
+                numWords++;
+                numWordsCurrentLine++;
+                if (inAnchorText) {
+                    numLinkedWords++;
+                }
+                final int tokenLength = token.length();
+                currentLineLength += tokenLength + 1;
+                if (currentLineLength > maxLineLength) {
+                    numWrappedLines++;
+                    currentLineLength = tokenLength;
+                    numWordsCurrentLine = 1;
+                }
+            } else {
+                numTokens++;
+            }
+        }
+        if (numTokens == 0) {
+            return;
+        }
+        int numWordsInWrappedLines;
+        if (numWrappedLines == 0) {
+            numWordsInWrappedLines = numWords;
+            numWrappedLines = 1;
+        } else {
+            numWordsInWrappedLines = numWords - numWordsCurrentLine;
+        }
+
+        TextBlock tb = new TextBlock(textBuffer.toString().trim(),
+                currentContainedTextElements, numWords, numLinkedWords,
+                numWordsInWrappedLines, numWrappedLines, offsetBlocks);
+        currentContainedTextElements = new BitSet();
+
+        offsetBlocks++;
+
+        textBuffer.setLength(0);
+        tokenBuffer.setLength(0);
+
+        tb.setTagLevel(blockTagLevel);
+        addTextBlock(tb);
+        blockTagLevel = -1;
+    }
+
+    protected void addTextBlock(final TextBlock tb) {
+
+        for (Integer l : fontSizeStack) {
+            if (l != null) {
+                tb.addLabel("font-" + l);
+                break;
+            }
+        }
+        for (LinkedList<LabelAction> labelStack : labelStacks) {
+            if (labelStack != null) {
+                for (LabelAction labels : labelStack) {
+                    if (labels != null) {
+                        labels.addTo(tb);
+                    }
+                }
+            }
+        }
+
+        textBlocks.add(tb);
+    }
+
+    private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
+            .compile("[\\p{L}\\p{Nd}\\p{Nl}\\p{No}]");
+
+    private static boolean isWord(final String token) {
+        return PAT_VALID_WORD_CHARACTER.matcher(token).find();
+    }
+
+    static private enum Event {
+        START_TAG, END_TAG, CHARACTERS, WHITESPACE
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String s) {
+        if (s == null || s.length() == 0) {
+            return;
+        }
+        title = s;
+    }
+
+    /**
+     * Returns a {@link TextDocument} containing the extracted {@link TextBlock}
+     * s. NOTE: Only call this after parsing.
+     *
+     * @return The {@link TextDocument}
+     */
+    public TextDocument toTextDocument() {
+        // just to be sure
+        flushBlock();
+
+        return new TextDocument(getTitle(), getTextBlocks());
+    }
+
+    public void addWhitespaceIfNecessary() {
+        if (!sbLastWasWhitespace) {
+            tokenBuffer.append(' ');
+            textBuffer.append(' ');
+            sbLastWasWhitespace = true;
+        }
+    }
+
+    /**
+     * Registers a {@link LabelAction} for the element that is currently open; it is
+     * applied to every {@link TextBlock} added until that element is closed.
+     */
+    public void addLabelAction(final LabelAction la)
+            throws IllegalStateException {
+        LinkedList<LabelAction> labelStack = labelStacks.getLast();
+        if (labelStack == null) {
+            labelStack = new LinkedList<LabelAction>();
+            labelStacks.removeLast();
+            labelStacks.add(labelStack);
+        }
+        labelStack.add(la);
+    }
+}
diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java
new file mode 100644
index 0000000..79dcc72
--- /dev/null
+++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java
@@ -0,0 +1,76 @@
+/**
+ * boilerpipe
+ *
+ * Copyright (c) 2009 Christian Kohlschütter
+ *
+ * The author licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package de.l3s.boilerpipe.sax; + +import org.apache.xerces.parsers.AbstractSAXParser; +import org.cyberneko.html.HTMLConfiguration; + +import de.l3s.boilerpipe.BoilerpipeDocumentSource; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; + +/** + * A simple SAX Parser, used by {@link BoilerpipeSAXInput}. The parser uses CyberNeko to parse HTML content. + * + * @author Christian Kohlschütter + */ +public class BoilerpipeHTMLParser extends AbstractSAXParser implements BoilerpipeDocumentSource { + + private BoilerpipeHTMLContentHandler contentHandler; + + /** + * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler. + */ + public BoilerpipeHTMLParser() { + this(new BoilerpipeHTMLContentHandler()); + } + + /** + * Constructs a {@link BoilerpipeHTMLParser} using the given {@link BoilerpipeHTMLContentHandler}. + * + * @param contentHandler + */ + public BoilerpipeHTMLParser(BoilerpipeHTMLContentHandler contentHandler) { + super(new HTMLConfiguration()); + setContentHandler(contentHandler); + } + + protected BoilerpipeHTMLParser(boolean ignore) { + super(new HTMLConfiguration()); + } + + public void setContentHandler(final BoilerpipeHTMLContentHandler contentHandler) { + this.contentHandler = contentHandler; + super.setContentHandler(contentHandler); + } + public void setContentHandler(final org.xml.sax.ContentHandler contentHandler) { + this.contentHandler = null; + super.setContentHandler(contentHandler); + } + + /** + * Returns a {@link TextDocument} containing the extracted {@link TextBlock} + * s. NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}. 
+ * + * @return The {@link TextDocument} + */ + public TextDocument toTextDocument() { + return contentHandler.toTextDocument(); + } +} \ No newline at end of file diff --git a/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java new file mode 100644 index 0000000..f95fd41 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java @@ -0,0 +1,73 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.sax; + +import java.io.IOException; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.BoilerpipeInput; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextDocument; + +/** + * Parses an {@link InputSource} using SAX and returns a {@link TextDocument}. + * + * @author Christian Kohlschütter + */ +public final class BoilerpipeSAXInput implements BoilerpipeInput { + private final InputSource is; + + /** + * Creates a new instance of {@link BoilerpipeSAXInput} for the given {@link InputSource}. + * + * @param is + * @throws SAXException + */ + public BoilerpipeSAXInput(final InputSource is) throws SAXException { + this.is = is; + } + + /** + * Retrieves the {@link TextDocument} using a default HTML parser. 
+ */ + public TextDocument getTextDocument() throws BoilerpipeProcessingException { + return getTextDocument(new BoilerpipeHTMLParser()); + } + + /** + * Retrieves the {@link TextDocument} using the given HTML parser. + * + * @param parser The parser used to transform the input into boilerpipe's internal representation. + * @return The retrieved {@link TextDocument} + * @throws BoilerpipeProcessingException + */ + public TextDocument getTextDocument(final BoilerpipeHTMLParser parser) throws BoilerpipeProcessingException { + try { + parser.parse(is); + } catch (IOException e) { + throw new BoilerpipeProcessingException(e); + } catch (SAXException e) { + throw new BoilerpipeProcessingException(e); + } + + return parser.toTextDocument(); + } + +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java b/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java new file mode 100644 index 0000000..7b9c410 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/CommonTagActions.java @@ -0,0 +1,357 @@ +/** + * boilerpipe + * + * Copyright (c) 2009, 2010 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package de.l3s.boilerpipe.sax; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.labels.LabelAction; + +/** + * Defines an action that is to be performed whenever a particular tag occurs during HTML parsing. + * + * @author Christian Kohlschütter + */ +public abstract class CommonTagActions { + + private CommonTagActions() { + } + + public static final class Chained implements TagAction { + + private final TagAction t1; + private final TagAction t2; + + public Chained(final TagAction t1, final TagAction t2) { + this.t1 = t1; + this.t2 = t2; + } + + public boolean start(BoilerpipeHTMLContentHandler instance, + String localName, String qName, Attributes atts) + throws SAXException { + return t1.start(instance, localName, qName, atts) + | t2.start(instance, localName, qName, atts); + } + + public boolean end(BoilerpipeHTMLContentHandler instance, + String localName, String qName) throws SAXException { + return t1.end(instance, localName, qName) + | t2.end(instance, localName, qName); + } + + public boolean changesTagLevel() { + return t1.changesTagLevel() || t2.changesTagLevel(); + } + } + + /** + * Marks this tag as "ignorable", i.e. all its inner content is silently skipped. + */ + public static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() { + + public boolean start(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + instance.inIgnorableElement++; + return true; + } + + public boolean end(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + instance.inIgnorableElement--; + return true; + } + + public boolean changesTagLevel() { + return true; + } + }; + + /** + * Marks this tag as "anchor" (this should usually only be set for the <A> tag). + * Anchor tags may not be nested. 
+ * + * There is a bug in certain versions of NekoHTML which still allows nested tags. + * If boilerpipe encounters such nestings, a SAXException is thrown. + */ + public static final TagAction TA_ANCHOR_TEXT = new TagAction() { + + public boolean start(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) throws SAXException { + if (instance.inAnchor++ > 0) { + // as nested A elements are not allowed per specification, we + // are probably reaching this branch due to a bug in the XML + // parser + System.err.println("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."); + + end(instance, localName, qName); + } + if (instance.inIgnorableElement == 0) { + instance.addWhitespaceIfNecessary(); + instance.tokenBuffer + .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_START); + instance.tokenBuffer.append(' '); + instance.sbLastWasWhitespace = true; + } + return false; + } + + public boolean end(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + if (--instance.inAnchor == 0) { + if (instance.inIgnorableElement == 0) { + instance.addWhitespaceIfNecessary(); + instance.tokenBuffer + .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_END); + instance.tokenBuffer.append(' '); + instance.sbLastWasWhitespace = true; + } + } + return false; + } + + public boolean changesTagLevel() { + return true; + } + }; + + /** + * Marks this tag the body element (this should usually only be set for the <BODY> tag). 
+ */ + public static final TagAction TA_BODY = new TagAction() { + public boolean start(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + instance.flushBlock(); + instance.inBody++; + return false; + } + + public boolean end(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + instance.flushBlock(); + instance.inBody--; + return false; + } + + public boolean changesTagLevel() { + return true; + } + }; + + /** + * Marks this tag a simple "inline" element, which generates whitespace, but no new block. + */ + public static final TagAction TA_INLINE_WHITESPACE = new TagAction() { + + public boolean start(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + instance.addWhitespaceIfNecessary(); + return false; + } + + public boolean end(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + instance.addWhitespaceIfNecessary(); + return false; + } + + public boolean changesTagLevel() { + return false; + } + }; + + /** + * @deprecated Use {@link #TA_INLINE_WHITESPACE} instead + */ + @Deprecated + public static final TagAction TA_INLINE = TA_INLINE_WHITESPACE; + + /** + * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block. 
+ */ + public static final TagAction TA_INLINE_NO_WHITESPACE = new TagAction() { + + public boolean start(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + return false; + } + + public boolean end(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + return false; + } + + public boolean changesTagLevel() { + return false; + } + }; + private static final Pattern PAT_FONT_SIZE = Pattern + .compile("([\\+\\-]?)([0-9])"); + + /** + * Explicitly marks this tag a simple "block-level" element, which always generates whitespace + */ + public static final TagAction TA_BLOCK_LEVEL = new TagAction() { + + public boolean start(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + return true; + } + + public boolean end(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + return true; + } + + public boolean changesTagLevel() { + return true; + } + }; + + /** + * Special TagAction for the <FONT> tag, which keeps track of the + * absolute and relative font size. 
+ */ + public static final TagAction TA_FONT = new TagAction() { + + public boolean start(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + + String sizeAttr = atts.getValue("size"); + if (sizeAttr != null) { + Matcher m = PAT_FONT_SIZE.matcher(sizeAttr); + if (m.matches()) { + String rel = m.group(1); + final int val = Integer.parseInt(m.group(2)); + final int size; + if (rel.length() == 0) { + // absolute + size = val; + } else { + // relative + int prevSize; + if (instance.fontSizeStack.isEmpty()) { + prevSize = 3; + } else { + prevSize = 3; + for (Integer s : instance.fontSizeStack) { + if (s != null) { + prevSize = s; + break; + } + } + } + if (rel.charAt(0) == '+') { + size = prevSize + val; + } else { + size = prevSize - val; + } + + } + instance.fontSizeStack.add(0, size); + } else { + instance.fontSizeStack.add(0, null); + } + } else { + instance.fontSizeStack.add(0, null); + } + return false; + } + + public boolean end(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + instance.fontSizeStack.removeFirst(); + return false; + } + + public boolean changesTagLevel() { + return false; + } + }; + + /** + * {@link CommonTagActions} for inline elements, which triggers some {@link LabelAction} on the generated + * {@link TextBlock}. 
+ */ + public static final class InlineTagLabelAction implements TagAction { + + private final LabelAction action; + + public InlineTagLabelAction(final LabelAction action) { + this.action = action; + } + + public boolean start(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + instance.addWhitespaceIfNecessary(); + instance.addLabelAction(action); + return false; + } + + public boolean end(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + instance.addWhitespaceIfNecessary(); + return false; + } + + public boolean changesTagLevel() { + return false; + } + } + + /** + * {@link CommonTagActions} for block-level elements, which triggers some {@link LabelAction} on the generated + * {@link TextBlock}. + */ + public static final class BlockTagLabelAction implements TagAction { + + private final LabelAction action; + + public BlockTagLabelAction(final LabelAction action) { + this.action = action; + } + + public boolean start(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, + final Attributes atts) { + instance.addLabelAction(action); + return true; + } + + public boolean end(BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) { + return true; + } + + public boolean changesTagLevel() { + return true; + } + } +} \ No newline at end of file diff --git a/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java b/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java new file mode 100644 index 0000000..cf48dac --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/DefaultTagActionMap.java @@ -0,0 +1,86 @@ +/** + * boilerpipe + * + * Copyright (c) 2009, 2010 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.sax; + +import de.l3s.boilerpipe.labels.DefaultLabels; +import de.l3s.boilerpipe.labels.LabelAction; + + +/** + * Default {@link TagAction}s. Seem to work well. + * + * @see TagActionMap + */ +public class DefaultTagActionMap extends TagActionMap { + + /** + * + */ + private static final long serialVersionUID = 1L; + + public static final TagActionMap INSTANCE = new DefaultTagActionMap(); + + protected DefaultTagActionMap() { + setTagAction("STYLE", CommonTagActions.TA_IGNORABLE_ELEMENT); + setTagAction("SCRIPT", CommonTagActions.TA_IGNORABLE_ELEMENT); + setTagAction("OPTION", CommonTagActions.TA_IGNORABLE_ELEMENT); + setTagAction("OBJECT", CommonTagActions.TA_IGNORABLE_ELEMENT); + setTagAction("EMBED", CommonTagActions.TA_IGNORABLE_ELEMENT); + setTagAction("APPLET", CommonTagActions.TA_IGNORABLE_ELEMENT); + setTagAction("LINK", CommonTagActions.TA_IGNORABLE_ELEMENT); + + setTagAction("A", CommonTagActions.TA_ANCHOR_TEXT); + setTagAction("BODY", CommonTagActions.TA_BODY); + + setTagAction("STRIKE", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("U", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("B", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("I", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("EM", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("STRONG", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("SPAN", CommonTagActions.TA_INLINE_NO_WHITESPACE); + + // New in 1.1 (especially to improve extraction quality from Wikipedia etc.) 
+ setTagAction("SUP", CommonTagActions.TA_INLINE_NO_WHITESPACE); + + // New in 1.2 + setTagAction("CODE", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("TT", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("SUB", CommonTagActions.TA_INLINE_NO_WHITESPACE); + setTagAction("VAR", CommonTagActions.TA_INLINE_NO_WHITESPACE); + + + setTagAction("ABBR", CommonTagActions.TA_INLINE_WHITESPACE); + setTagAction("ACRONYM", CommonTagActions.TA_INLINE_WHITESPACE); + + setTagAction("FONT", CommonTagActions.TA_INLINE_NO_WHITESPACE); // could also use TA_FONT + + // added in 1.1.1 + setTagAction("NOSCRIPT", CommonTagActions.TA_IGNORABLE_ELEMENT); + + // New in 1.3 + setTagAction("LI", new CommonTagActions.BlockTagLabelAction( + new LabelAction(DefaultLabels.LI))); + setTagAction("H1", new CommonTagActions.BlockTagLabelAction( + new LabelAction(DefaultLabels.H1, DefaultLabels.HEADING))); + setTagAction("H2", new CommonTagActions.BlockTagLabelAction( + new LabelAction(DefaultLabels.H2, DefaultLabels.HEADING))); + setTagAction("H3", new CommonTagActions.BlockTagLabelAction( + new LabelAction(DefaultLabels.H3, DefaultLabels.HEADING))); + } +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java new file mode 100644 index 0000000..9cf2d87 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLDocument.java @@ -0,0 +1,41 @@ +package de.l3s.boilerpipe.sax; + +import java.io.ByteArrayInputStream; +import java.nio.charset.Charset; + +import org.xml.sax.InputSource; + +/** + * An {@link InputSourceable} for {@link HTMLFetcher}. 
+ * + * @author Christian Kohlschütter + */ +public class HTMLDocument implements InputSourceable { + private final Charset charset; + private final byte[] data; + + public HTMLDocument(final byte[] data, final Charset charset) { + this.data = data; + this.charset = charset; + } + + public HTMLDocument(final String data) { + Charset cs = Charset.forName("utf-8"); + this.data = data.getBytes(cs); + this.charset = cs; + } + + public Charset getCharset() { + return charset; + } + + public byte[] getData() { + return data; + } + + public InputSource toInputSource() { + final InputSource is = new InputSource(new ByteArrayInputStream(data)); + is.setEncoding(charset.name()); + return is; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java new file mode 100644 index 0000000..2c2e0c4 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLFetcher.java @@ -0,0 +1,79 @@ +package de.l3s.boilerpipe.sax; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; + +/** + * A very simple HTTP/HTML fetcher, really just for demo purposes. + * + * @author Christian Kohlschütter + */ +public class HTMLFetcher { + private HTMLFetcher() { + } + + private static final Pattern PAT_CHARSET = Pattern + .compile("charset=([^; ]+)$"); + + /** + * Fetches the document at the given URL, using {@link URLConnection}. 
+ * + * @param url + * @return the document at the given URL + * @throws IOException + */ + public static HTMLDocument fetch(final URL url) throws IOException { + final URLConnection conn = url.openConnection(); + final String ct = conn.getContentType(); + + if (ct == null + || !(ct.equals("text/html") || ct.startsWith("text/html;"))) { + throw new IOException("Unsupported content type: "+ct); + } + + Charset cs = Charset.forName("Cp1252"); + if (ct != null) { + Matcher m = PAT_CHARSET.matcher(ct); + if (m.find()) { + final String charset = m.group(1); + try { + cs = Charset.forName(charset); + } catch (UnsupportedCharsetException e) { + // keep default + } + } + } + + InputStream in = conn.getInputStream(); + + final String encoding = conn.getContentEncoding(); + if (encoding != null) { + if ("gzip".equalsIgnoreCase(encoding)) { + in = new GZIPInputStream(in); + } else { + System.err.println("WARN: unsupported Content-Encoding: " + + encoding); + } + } + + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + byte[] buf = new byte[4096]; + int r; + while ((r = in.read(buf)) != -1) { + bos.write(buf, 0, r); + } + in.close(); + + final byte[] data = bos.toByteArray(); + + return new HTMLDocument(data, cs); + } +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java b/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java new file mode 100644 index 0000000..4a300c3 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/HTMLHighlighter.java @@ -0,0 +1,530 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.sax; + +import java.io.IOException; +import java.io.StringReader; +import java.net.URL; +import java.util.BitSet; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.xerces.parsers.AbstractSAXParser; +import org.cyberneko.html.HTMLConfiguration; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; + +/** + * Highlights text blocks in an HTML document that have been marked as "content" + * in the corresponding {@link TextDocument}. + * + * @author Christian Kohlschütter + */ +public final class HTMLHighlighter { + + private Map> tagWhitelist = null; + + /** + * Creates a new {@link HTMLHighlighter}, which is set-up to return the full + * HTML text, with the extracted text portion highlighted. + */ + public static HTMLHighlighter newHighlightingInstance() { + return new HTMLHighlighter(false); + } + + /** + * Creates a new {@link HTMLHighlighter}, which is set-up to return only the + * extracted HTML text, including enclosed markup. 
+ */ + public static HTMLHighlighter newExtractingInstance() { + return new HTMLHighlighter(true); + } + + private HTMLHighlighter(final boolean extractHTML) { + if (extractHTML) { + setOutputHighlightOnly(true); + setExtraStyleSheet("\n\n"); + setPreHighlight(""); + setPostHighlight(""); + } + } + + /** + * Processes the given {@link TextDocument} and the original HTML text (as a + * String). + * + * @param doc + * The processed {@link TextDocument}. + * @param origHTML + * The original HTML document. + * @return The highlighted HTML. + * @throws BoilerpipeProcessingException + */ + public String process(final TextDocument doc, final String origHTML) + throws BoilerpipeProcessingException { + return process(doc, new InputSource(new StringReader(origHTML))); + } + + /** + * Processes the given {@link TextDocument} and the original HTML text (as + * an {@link InputSource}). + * + * @param doc + * The processed {@link TextDocument}. + * The original HTML document. + * @return The highlighted HTML. + * @throws BoilerpipeProcessingException + */ + public String process(final TextDocument doc, final InputSource is) + throws BoilerpipeProcessingException { + final Implementation implementation = new Implementation(); + implementation.process(doc, is); + + String html = implementation.html.toString(); + if (outputHighlightOnly) { + Matcher m; + + boolean repeat = true; + while (repeat) { + repeat = false; + m = PAT_TAG_NO_TEXT.matcher(html); + if (m.find()) { + repeat = true; + html = m.replaceAll(""); + } + + m = PAT_SUPER_TAG.matcher(html); + if (m.find()) { + repeat = true; + html = m.replaceAll(m.group(1)); + } + } + } + + return html; + } + + private static final Pattern PAT_TAG_NO_TEXT = Pattern + .compile("<[^/][^>]*>]*>"); + private static final Pattern PAT_SUPER_TAG = Pattern + .compile("^<[^>]*>(<.*?>)]*>$"); + + /** + * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the + * retrieved HTML using the specified {@link BoilerpipeExtractor}. 
+ * + * The processed {@link TextDocument}. + * The original HTML document. + * @return The highlighted HTML. + * @throws BoilerpipeProcessingException + */ + public String process(final URL url, final BoilerpipeExtractor extractor) + throws IOException, BoilerpipeProcessingException, SAXException { + final HTMLDocument htmlDoc = HTMLFetcher.fetch(url); + + final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()) + .getTextDocument(); + extractor.process(doc); + + final InputSource is = htmlDoc.toInputSource(); + + return process(doc, is); + } + + private boolean outputHighlightOnly = false; + private String extraStyleSheet = "\n\n"; + private String preHighlight = ""; + private String postHighlight = ""; + + /** + * If true, only HTML enclosed within highlighted content will be returned + */ + public boolean isOutputHighlightOnly() { + return outputHighlightOnly; + } + + /** + * Sets whether only HTML enclosed within highlighted content will be + * returned, or the whole HTML document. + */ + public void setOutputHighlightOnly(boolean outputHighlightOnly) { + this.outputHighlightOnly = outputHighlightOnly; + } + + /** + * Returns the extra stylesheet definition that will be inserted in the HEAD + * element. + * + * By default, this corresponds to a simple definition that marks text in + * class "x-boilerpipe-mark1" as inline text with yellow background. + */ + public String getExtraStyleSheet() { + return extraStyleSheet; + } + + /** + * Sets the extra stylesheet definition that will be inserted in the HEAD + * element. + * + * To disable, set it to the empty string: "" + * + * @param extraStyleSheet + * Plain HTML + */ + public void setExtraStyleSheet(String extraStyleSheet) { + this.extraStyleSheet = extraStyleSheet; + } + + /** + * Returns the string that will be inserted before any highlighted HTML + * block. 
+ * + * By default, this corresponds to + * <span class=&qupt;x-boilerpipe-mark1"> + */ + public String getPreHighlight() { + return preHighlight; + } + + /** + * Sets the string that will be inserted prior to any highlighted HTML + * block. + * + * To disable, set it to the empty string: "" + */ + public void setPreHighlight(String preHighlight) { + this.preHighlight = preHighlight; + } + + /** + * Returns the string that will be inserted after any highlighted HTML + * block. + * + * By default, this corresponds to </span> + */ + public String getPostHighlight() { + return postHighlight; + } + + /** + * Sets the string that will be inserted after any highlighted HTML block. + * + * To disable, set it to the empty string: "" + */ + public void setPostHighlight(String postHighlight) { + this.postHighlight = postHighlight; + } + + private abstract static class TagAction { + void beforeStart(final Implementation instance, final String localName) { + } + + void afterStart(final Implementation instance, final String localName) { + } + + void beforeEnd(final Implementation instance, final String localName) { + } + + void afterEnd(final Implementation instance, final String localName) { + } + } + + private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() { + void beforeStart(final Implementation instance, final String localName) { + instance.inIgnorableElement++; + } + + void afterEnd(final Implementation instance, final String localName) { + instance.inIgnorableElement--; + } + }; + + private static final TagAction TA_HEAD = new TagAction() { + void beforeStart(final Implementation instance, final String localName) { + instance.inIgnorableElement++; + } + + void beforeEnd(final Implementation instance, String localName) { + instance.html.append(instance.hl.extraStyleSheet); + } + + void afterEnd(final Implementation instance, final String localName) { + instance.inIgnorableElement--; + } + }; + private static Map TAG_ACTIONS = new HashMap(); + static { + 
TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("OBJECT", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT); + // NOTE: you might want to comment this out: + TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT); + + TAG_ACTIONS.put("HEAD", TA_HEAD); + } + + private final class Implementation extends AbstractSAXParser implements + ContentHandler { + StringBuilder html = new StringBuilder(); + + private int inIgnorableElement = 0; + private int characterElementIdx = 0; + private final BitSet contentBitSet = new BitSet(); + private final HTMLHighlighter hl = HTMLHighlighter.this; + + Implementation() { + super(new HTMLConfiguration()); + setContentHandler(this); + } + + void process(final TextDocument doc, final InputSource is) + throws BoilerpipeProcessingException { + for (TextBlock block : doc.getTextBlocks()) { + if (block.isContent()) { + final BitSet bs = block.getContainedTextElements(); + if (bs != null) { + contentBitSet.or(bs); + } + } + } + + try { + parse(is); + } catch (SAXException e) { + throw new BoilerpipeProcessingException(e); + } catch (IOException e) { + throw new BoilerpipeProcessingException(e); + } + } + + public void endDocument() throws SAXException { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + } + + public void processingInstruction(String target, String data) + throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startDocument() throws SAXException { + } + + public void startElement(String uri, String localName, String qName, + Attributes atts) throws SAXException 
{ + TagAction ta = TAG_ACTIONS.get(localName); + if (ta != null) { + ta.beforeStart(this, localName); + } + + // HACK: remove existing highlight + boolean ignoreAttrs = false; + if ("SPAN".equalsIgnoreCase(localName)) { + String classVal = atts.getValue("class"); + if ("x-boilerpipe-mark1".equals(classVal)) { + ignoreAttrs = true; + } + } + + try { + if (inIgnorableElement == 0) { + if (outputHighlightOnly) { + // boolean highlight = contentBitSet + // .get(characterElementIdx); + + // if (!highlight) { + // return; + // } + } + + final Set whitelistAttributes; + if (tagWhitelist == null) { + whitelistAttributes = null; + } else { + whitelistAttributes = tagWhitelist.get(qName); + if (whitelistAttributes == null) { + // skip + return; + } + } + + html.append('<'); + html.append(qName); + if (!ignoreAttrs) { + final int numAtts = atts.getLength(); + for (int i = 0; i < numAtts; i++) { + final String attr = atts.getQName(i); + + if (whitelistAttributes != null + && !whitelistAttributes.contains(attr)) { + // skip + continue; + } + + final String value = atts.getValue(i); + html.append(' '); + html.append(attr); + html.append("=\""); + html.append(xmlEncode(value)); + html.append("\""); + } + } + html.append('>'); + } + } finally { + if (ta != null) { + ta.afterStart(this, localName); + } + } + } + + public void endElement(String uri, String localName, String qName) + throws SAXException { + TagAction ta = TAG_ACTIONS.get(localName); + if (ta != null) { + ta.beforeEnd(this, localName); + } + + try { + if (inIgnorableElement == 0) { + if (outputHighlightOnly) { + // boolean highlight = contentBitSet + // .get(characterElementIdx); + + // if (!highlight) { + // return; + // } + } + + if (tagWhitelist != null + && !tagWhitelist.containsKey(qName)) { + // skip + return; + } + + html.append("'); + } + } finally { + if (ta != null) { + ta.afterEnd(this, localName); + } + } + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + 
characterElementIdx++; + if (inIgnorableElement == 0) { + + boolean highlight = contentBitSet.get(characterElementIdx); + + if (!highlight && outputHighlightOnly) { + return; + } + + if (highlight) { + html.append(preHighlight); + } + html.append(xmlEncode(String.valueOf(ch, start, length))); + if (highlight) { + html.append(postHighlight); + } + } + } + + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + } + + } + + private static String xmlEncode(final String in) { + if (in == null) { + return ""; + } + char c; + StringBuilder out = new StringBuilder(in.length()); + + for (int i = 0; i < in.length(); i++) { + c = in.charAt(i); + switch (c) { + case '<': + out.append("<"); + break; + case '>': + out.append(">"); + break; + case '&': + out.append("&"); + break; + case '"': + out.append("""); + break; + default: + out.append(c); + } + } + + return out.toString(); + } + + public Map> getTagWhitelist() { + return tagWhitelist; + } + + public void setTagWhitelist(Map> tagWhitelist) { + this.tagWhitelist = tagWhitelist; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java b/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java new file mode 100644 index 0000000..3a9bcbe --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/ImageExtractor.java @@ -0,0 +1,277 @@ +package de.l3s.boilerpipe.sax; + +import java.io.IOException; +import java.io.StringReader; +import java.net.URL; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.xerces.parsers.AbstractSAXParser; +import org.cyberneko.html.HTMLConfiguration; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.Image; 
+import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; + +/** + * Extracts the images that are enclosed by extracted content. + * + * @author Christian Kohlschütter + */ +public final class ImageExtractor { + public static final ImageExtractor INSTANCE = new ImageExtractor(); + + /** + * Returns the singleton instance of {@link ImageExtractor}. + * + * @return the singleton instance of {@link ImageExtractor}. + */ + public static ImageExtractor getInstance() { + return INSTANCE; + } + + private ImageExtractor() { + } + + /** + * Processes the given {@link TextDocument} and the original HTML text (as a + * String). + * + * @param doc + * The processed {@link TextDocument}. + * @param origHTML + * The original HTML document. + * @return A List of enclosed {@link Image}s + * @throws BoilerpipeProcessingException + */ + public List process(final TextDocument doc, + final String origHTML) throws BoilerpipeProcessingException { + return process(doc, new InputSource( + new StringReader(origHTML))); + } + + /** + * Processes the given {@link TextDocument} and the original HTML text (as an + * {@link InputSource}). + * + * @param doc + * The processed {@link TextDocument}. + * The original HTML document. + * @return A List of enclosed {@link Image}s + * @throws BoilerpipeProcessingException + */ + public List process(final TextDocument doc, + final InputSource is) throws BoilerpipeProcessingException { + final Implementation implementation = new Implementation(); + implementation.process(doc, is); + + return implementation.linksHighlight; + } + + /** + * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the + * retrieved HTML using the specified {@link BoilerpipeExtractor}. + * + * The processed {@link TextDocument}. + * The original HTML document. 
+ * @return A List of enclosed {@link Image}s + * @throws BoilerpipeProcessingException + */ + public List process(final URL url, final BoilerpipeExtractor extractor) + throws IOException, BoilerpipeProcessingException, SAXException { + final HTMLDocument htmlDoc = HTMLFetcher.fetch(url); + + final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()) + .getTextDocument(); + extractor.process(doc); + + final InputSource is = htmlDoc.toInputSource(); + + return process(doc, is); + } + + + private final class Implementation extends AbstractSAXParser implements + ContentHandler { + List linksHighlight = new ArrayList(); + private List linksBuffer = new ArrayList(); + + private int inIgnorableElement = 0; + private int characterElementIdx = 0; + private final BitSet contentBitSet = new BitSet(); + + private boolean inHighlight = false; + + Implementation() { + super(new HTMLConfiguration()); + setContentHandler(this); + } + + void process(final TextDocument doc, final InputSource is) + throws BoilerpipeProcessingException { + for (TextBlock block : doc.getTextBlocks()) { + if (block.isContent()) { + final BitSet bs = block.getContainedTextElements(); + if (bs != null) { + contentBitSet.or(bs); + } + } + } + + try { + parse(is); + } catch (SAXException e) { + throw new BoilerpipeProcessingException(e); + } catch (IOException e) { + throw new BoilerpipeProcessingException(e); + } + } + + public void endDocument() throws SAXException { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + } + + public void processingInstruction(String target, String data) + throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startDocument() throws SAXException { + } + + public void startElement(String uri, String localName, String qName, + 
Attributes atts) throws SAXException { + TagAction ta = TAG_ACTIONS.get(localName); + if (ta != null) { + ta.beforeStart(this, localName); + } + + try { + if (inIgnorableElement == 0) { + if(inHighlight && "IMG".equalsIgnoreCase(localName)) { + String src = atts.getValue("src"); + if(src != null && src.length() > 0) { + linksBuffer.add(new Image(src, atts.getValue("width"), atts.getValue("height"), atts.getValue("alt"))); + } + } + } + } finally { + if (ta != null) { + ta.afterStart(this, localName); + } + } + } + + public void endElement(String uri, String localName, String qName) + throws SAXException { + TagAction ta = TAG_ACTIONS.get(localName); + if (ta != null) { + ta.beforeEnd(this, localName); + } + + try { + if (inIgnorableElement == 0) { + // + } + } finally { + if (ta != null) { + ta.afterEnd(this, localName); + } + } + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + characterElementIdx++; + if (inIgnorableElement == 0) { + + boolean highlight = contentBitSet.get(characterElementIdx); + if(!highlight) { + if(length == 0) { + return; + } + boolean justWhitespace = true; + for(int i=start;i TAG_ACTIONS = new HashMap(); + static { + TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT); + + TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT); + } + + private abstract static class TagAction { + void beforeStart(final Implementation instance, final String localName) { + } + + void afterStart(final Implementation instance, final String localName) { + } + + void beforeEnd(final Implementation instance, final String localName) { + } + + void afterEnd(final Implementation instance, final String localName) { + } + } +} diff --git 
a/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java b/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java new file mode 100644 index 0000000..ef8010e --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/InputSourceable.java @@ -0,0 +1,12 @@ +package de.l3s.boilerpipe.sax; + +import org.xml.sax.InputSource; + +/** + * An InputSourceable can return an arbitrary number of new {@link InputSource}s for a given document. + * + * @author Christian Kohlschütter + */ +public interface InputSourceable { + InputSource toInputSource(); +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java b/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java new file mode 100644 index 0000000..e54a3da --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/MarkupTagAction.java @@ -0,0 +1,105 @@ +package de.l3s.boilerpipe.sax; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.labels.DefaultLabels; +import de.l3s.boilerpipe.labels.LabelAction; + +/** + * Assigns labels for element CSS classes and ids to the corresponding + * {@link TextBlock}. 
CSS classes are prefixed by + * {@link DefaultLabels#MARKUP_PREFIX}., and IDs are prefixed by + * {@link DefaultLabels#MARKUP_PREFIX}# + * + * @author Christian Kohlschütter + */ +public final class MarkupTagAction implements TagAction { + + private final boolean isBlockLevel; + private LinkedList> labelStack = new LinkedList>(); + + public MarkupTagAction(final boolean isBlockLevel) { + this.isBlockLevel = isBlockLevel; + } + + private static final Pattern PAT_NUM = Pattern.compile("[0-9]+"); + + public boolean start(BoilerpipeHTMLContentHandler instance, + String localName, String qName, Attributes atts) + throws SAXException { + List labels = new ArrayList(5); + labels.add(DefaultLabels.MARKUP_PREFIX + localName); + + String classVal = atts.getValue("class"); + + if (classVal != null && classVal.length() > 0) { + classVal = PAT_NUM.matcher(classVal).replaceAll("#"); + classVal = classVal.trim(); + String[] vals = classVal.split("[ ]+"); + labels.add(DefaultLabels.MARKUP_PREFIX + "." + + classVal.replace(' ', '.')); + if (vals.length > 1) { + for (String s : vals) { + labels.add(DefaultLabels.MARKUP_PREFIX + "." 
+ s); + } + } + } + + String id = atts.getValue("id"); + if (id != null && id.length() > 0) { + id = PAT_NUM.matcher(id).replaceAll("#"); + labels.add(DefaultLabels.MARKUP_PREFIX + "#" + id); + } + + Set ancestors = getAncestorLabels(); + List labelsWithAncestors = new ArrayList( + (ancestors.size() + 1) * labels.size()); + + for (String l : labels) { + for (String an : ancestors) { + labelsWithAncestors.add(an); + labelsWithAncestors.add(an + " " + l); + } + labelsWithAncestors.add(l); + } + + instance.addLabelAction(new LabelAction(labelsWithAncestors + .toArray(new String[labelsWithAncestors.size()]))); + + labelStack.add(labels); + + return isBlockLevel; + } + + + public boolean end(BoilerpipeHTMLContentHandler instance, String localName, + String qName) throws SAXException { + + labelStack.removeLast(); + return isBlockLevel; + } + + public boolean changesTagLevel() { + return isBlockLevel; + } + + private Set getAncestorLabels() { + Set set = new HashSet(); + for (List labels : labelStack) { + if (labels == null) { + continue; + } + set.addAll(labels); + } + return set; + } +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java b/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java new file mode 100644 index 0000000..e6f1943 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/MediaExtractor.java @@ -0,0 +1,367 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package de.l3s.boilerpipe.sax; + +import java.io.IOException; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.xerces.parsers.AbstractSAXParser; +import org.cyberneko.html.HTMLConfiguration; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + + +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.Image; +import de.l3s.boilerpipe.document.Media; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.document.VimeoVideo; +import de.l3s.boilerpipe.document.YoutubeVideo; +import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; +import de.l3s.boilerpipe.sax.HTMLDocument; +import de.l3s.boilerpipe.sax.HTMLFetcher; + + +/** + * Extracts youtube and vimeo videos that are enclosed by extracted content. + * + * @author Christian Kohlschütter, manuel.codiga@gmail.com + */ +public final class MediaExtractor { + + /** */ + public static final MediaExtractor INSTANCE = new MediaExtractor(); + + /** + * @return the singleton instance of {@link MediaExtractor}. 
+ */ + public static MediaExtractor getInstance() { + return INSTANCE; + } + + + + /** + * Processes the given {@link TextDocument} and the original HTML text (as a + * String). + * + * @param doc + * The processed {@link TextDocument}. + * @param origHTML + * The original HTML document. + * @return A List of enclosed {@link Image}s + * @throws BoilerpipeProcessingException if an error during extraction occure + */ + public List process(final TextDocument doc, final String origHTML) + throws BoilerpipeProcessingException { + return process(doc, new InputSource(new StringReader(origHTML))); + } + + /** + * Processes the given {@link TextDocument} and the original HTML text (as an + * {@link InputSource}). + * + * @param doc + * The processed {@link TextDocument}. + * The original HTML document. + * @return A List of enclosed {@link Image}s + * @throws BoilerpipeProcessingException + */ + public List process(final TextDocument doc, final InputSource is) + throws BoilerpipeProcessingException { + final Implementation implementation = new Implementation(); + implementation.process(doc, is); + + return implementation.linksHighlight; + } + + /** + * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the + * retrieved HTML using the specified {@link BoilerpipeExtractor}. 
+ * @param url the url of the document to fetch + * @param extractor extractor to use + * + * @return A List of enclosed {@link Image}s + * @throws IOException + * @throws BoilerpipeProcessingException + * @throws SAXException + */ + @SuppressWarnings("javadoc") + public List process(final URL url, final BoilerpipeExtractor extractor) + throws IOException, BoilerpipeProcessingException, SAXException { + final HTMLDocument htmlDoc = HTMLFetcher.fetch(url); + + final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()) + .getTextDocument(); + extractor.process(doc); + + final InputSource is = htmlDoc.toInputSource(); + + return process(doc, is); + } + + /** + * parses the media (picture, video) out of doc + * @param doc document to parse the media out + * @param extractor extractor to use + * @return list of extracted media, with size = 0 if no media found + */ + public List process(String doc, final BoilerpipeExtractor extractor) { + final HTMLDocument htmlDoc = new HTMLDocument(doc); + List media = new ArrayList(); + TextDocument tdoc; + + try { + tdoc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument(); + extractor.process(tdoc); + final InputSource is = htmlDoc.toInputSource(); + media = process(tdoc, is); + } catch (Exception e) { + return null; + } + return media; + } + + + private final class Implementation extends AbstractSAXParser implements + ContentHandler { + List linksHighlight = new ArrayList(); + private List linksBuffer = new ArrayList(); + + private int inIgnorableElement = 0; + private int characterElementIdx = 0; + private final BitSet contentBitSet = new BitSet(); + + private boolean inHighlight = false; + + Implementation() { + super(new HTMLConfiguration()); + setContentHandler(this); + } + + void process(final TextDocument doc, final InputSource is) + throws BoilerpipeProcessingException { + for (TextBlock block : doc.getTextBlocks()) { + if (block.isContent()) { + final BitSet bs = 
block.getContainedTextElements(); + if (bs != null) { + contentBitSet.or(bs); + } + } + } + + try { + parse(is); + } catch (SAXException e) { + throw new BoilerpipeProcessingException(e); + } catch (IOException e) { + throw new BoilerpipeProcessingException(e); + } + } + + public void endDocument() throws SAXException { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + } + + public void processingInstruction(String target, String data) + throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startDocument() throws SAXException { + } + + public void startElement(String uri, String localName, String qName, + Attributes atts) throws SAXException { + TagAction ta = TAG_ACTIONS.get(localName); + if (ta != null) { + ta.beforeStart(this, localName); + } + + try { + if (inIgnorableElement == 0) { + if(inHighlight && "IFRAME".equalsIgnoreCase(localName)) { + String src = atts.getValue("src"); + src = src.replaceAll("\\\\\"", ""); + if(src != null && src.length() > 0 && src.contains("youtube.com/embed/")) { + String originUrl = null; + try { + URL url = new URL(src); + String path = url.getPath(); + String[] pathParts = path.split("/"); + originUrl = "http://www.youtube.com/watch?v="+pathParts[pathParts.length-1]; + linksBuffer.add(new YoutubeVideo(originUrl,src)); + } catch (MalformedURLException e) { + } + + } + + if(src != null && src.length() > 0 && src.contains("player.vimeo.com")) { + String originUrl = null; + try { + URL url = new URL(src); + String path = url.getPath(); + String[] pathParts = path.split("/"); + originUrl = "http://vimeo.com/"+pathParts[pathParts.length-1]; + linksBuffer.add(new VimeoVideo(originUrl,src)); + } catch (MalformedURLException e) { + } + + } + } + + + if(inHighlight && "IMG".equalsIgnoreCase(localName)) { 
+ String src = atts.getValue("src"); + try { + URI image = new URI(src); + if(src != null && src.length() > 0) { + linksBuffer.add(new Image(src, atts.getValue("width"), atts.getValue("height"), atts.getValue("alt"))); + } + } catch (URISyntaxException e) { + } + } + } + } finally { + if (ta != null) { + ta.afterStart(this, localName); + } + } + } + + public void endElement(String uri, String localName, String qName) + throws SAXException { + TagAction ta = TAG_ACTIONS.get(localName); + if (ta != null) { + ta.beforeEnd(this, localName); + } + + try { + if (inIgnorableElement == 0) { + // + } + } finally { + if (ta != null) { + ta.afterEnd(this, localName); + } + } + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + characterElementIdx++; + if (inIgnorableElement == 0) { + + boolean highlight = contentBitSet.get(characterElementIdx); + if(!highlight) { + if(length == 0) { + return; + } + boolean justWhitespace = true; + for(int i=start;i TAG_ACTIONS = new HashMap(); + static { + TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT); + TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT); + + TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT); + } + + private abstract static class TagAction { + void beforeStart(final Implementation instance, final String localName) { + } + + void afterStart(final Implementation instance, final String localName) { + } + + void beforeEnd(final Implementation instance, final String localName) { + } + + void afterEnd(final Implementation instance, final String localName) { + } + } +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/TagAction.java b/src/main/java/de/l3s/boilerpipe/sax/TagAction.java new file mode 100644 index 0000000..3ee8dcf --- /dev/null +++ 
b/src/main/java/de/l3s/boilerpipe/sax/TagAction.java @@ -0,0 +1,39 @@ +/** + * boilerpipe + * + * Copyright (c) 2009, 2010 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.sax; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * Defines an action that is to be performed whenever a particular tag occurs + * during HTML parsing. + * + * @author Christian Kohlschütter + */ +public interface TagAction { + + boolean start(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName, final Attributes atts) + throws SAXException; + + boolean end(final BoilerpipeHTMLContentHandler instance, + final String localName, final String qName) throws SAXException; + + boolean changesTagLevel(); +} \ No newline at end of file diff --git a/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java b/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java new file mode 100644 index 0000000..74ab275 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/TagActionMap.java @@ -0,0 +1,60 @@ +/** + * boilerpipe + * + * Copyright (c) 2009, 2010 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.sax; + +import java.util.HashMap; + +/** + * Base class for definition a set of {@link TagAction}s that are to be used for the + * HTML parsing process. + * + * @see DefaultTagActionMap + * @author Christian Kohlschütter + */ +public abstract class TagActionMap extends HashMap { + private static final long serialVersionUID = 1L; + + /** + * Sets a particular {@link TagAction} for a given tag. Any existing TagAction for that tag + * will be removed and overwritten. + * + * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case) + * @param action The {@link TagAction} + */ + protected void setTagAction(final String tag, final TagAction action) { + put(tag.toUpperCase(), action); + put(tag.toLowerCase(), action); + put(tag, action); + } + + /** + * Adds a particular {@link TagAction} for a given tag. If a TagAction already exists for that tag, + * a chained action, consisting of the previous and the new {@link TagAction} is created. + * + * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. 
upper-case) + * @param action The {@link TagAction} + */ + protected void addTagAction(final String tag, final TagAction action) { + TagAction previousAction = get(tag); + if(previousAction == null) { + setTagAction(tag, action); + } else { + setTagAction(tag, new CommonTagActions.Chained(previousAction, action)); + } + } +} diff --git a/src/main/java/de/l3s/boilerpipe/sax/package.html b/src/main/java/de/l3s/boilerpipe/sax/package.html new file mode 100644 index 0000000..9772244 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/sax/package.html @@ -0,0 +1,6 @@ + + +

Classes related to parsing and producing HTML from/to Boilerpipe + TextDocuments.

+ + diff --git a/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java b/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java new file mode 100644 index 0000000..e7997f0 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/util/UnicodeTokenizer.java @@ -0,0 +1,45 @@ +/** + * boilerpipe + * + * Copyright (c) 2009 Christian Kohlschütter + * + * The author licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.l3s.boilerpipe.util; + +import java.util.regex.Pattern; + +/** + * Tokenizes text according to Unicode word boundaries and strips off non-word + * characters. + * + * @author Christian Kohlschütter + */ +public class UnicodeTokenizer { + private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b"); + private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern + .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*"); + + /** + * Tokenizes the text and returns an array of tokens. 
+ * + * @param text The text + * @return The tokens + */ + public static String[] tokenize(final CharSequence text) { + return PAT_NOT_WORD_BOUNDARY.matcher( + PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063")) + .replaceAll("$1").replaceAll("[ \u2063]+", " ").trim().split( + "[ ]+"); + } +} diff --git a/src/main/java/de/l3s/boilerpipe/util/package.html b/src/main/java/de/l3s/boilerpipe/util/package.html new file mode 100644 index 0000000..ab7a714 --- /dev/null +++ b/src/main/java/de/l3s/boilerpipe/util/package.html @@ -0,0 +1,5 @@ + + +

Some helper classes.

+ + diff --git a/src/main/java/org/cyberneko/html/HTMLElements.java b/src/main/java/org/cyberneko/html/HTMLElements.java new file mode 100644 index 0000000..d200373 --- /dev/null +++ b/src/main/java/org/cyberneko/html/HTMLElements.java @@ -0,0 +1,794 @@ +/* + * Copyright 2002-2009 Andy Clark, Marc Guillemot + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.cyberneko.html; + +/** + * Collection of HTML element information. + * + * @author Andy Clark + * @author Ahmed Ashour + * @author Marc Guillemot + * + * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $ + */ +public class HTMLElements { + + // + // Constants + // + + // element codes + + // NOTE: The element codes *must* start with 0 and increment in + // sequence. The parent and closes references depends on + // this assumption. 
-Ac + + public static final short A = 0; + public static final short ABBR = A+1; + public static final short ACRONYM = ABBR+1; + public static final short ADDRESS = ACRONYM+1; + public static final short APPLET = ADDRESS+1; + public static final short AREA = APPLET+1; + public static final short B = AREA+1; + public static final short BASE = B+1; + public static final short BASEFONT = BASE+1; + public static final short BDO = BASEFONT+1; + public static final short BGSOUND = BDO+1; + public static final short BIG = BGSOUND+1; + public static final short BLINK = BIG+1; + public static final short BLOCKQUOTE = BLINK+1; + public static final short BODY = BLOCKQUOTE+1; + public static final short BR = BODY+1; + public static final short BUTTON = BR+1; + public static final short CAPTION = BUTTON+1; + public static final short CENTER = CAPTION+1; + public static final short CITE = CENTER+1; + public static final short CODE = CITE+1; + public static final short COL = CODE+1; + public static final short COLGROUP = COL+1; + public static final short COMMENT = COLGROUP+1; + public static final short DEL = COMMENT+1; + public static final short DFN = DEL+1; + public static final short DIR = DFN+1; + public static final short DIV = DIR+1; + public static final short DD = DIV+1; + public static final short DL = DD+1; + public static final short DT = DL+1; + public static final short EM = DT+1; + public static final short EMBED = EM+1; + public static final short FIELDSET = EMBED+1; + public static final short FONT = FIELDSET+1; + public static final short FORM = FONT+1; + public static final short FRAME = FORM+1; + public static final short FRAMESET = FRAME+1; + public static final short H1 = FRAMESET+1; + public static final short H2 = H1+1; + public static final short H3 = H2+1; + public static final short H4 = H3+1; + public static final short H5 = H4+1; + public static final short H6 = H5+1; + public static final short HEAD = H6+1; + public static final short HR = HEAD+1; 
+ public static final short HTML = HR+1; + public static final short I = HTML+1; + public static final short IFRAME = I+1; + public static final short ILAYER = IFRAME+1; + public static final short IMG = ILAYER+1; + public static final short INPUT = IMG+1; + public static final short INS = INPUT+1; + public static final short ISINDEX = INS+1; + public static final short KBD = ISINDEX+1; + public static final short KEYGEN = KBD+1; + public static final short LABEL = KEYGEN+1; + public static final short LAYER = LABEL+1; + public static final short LEGEND = LAYER+1; + public static final short LI = LEGEND+1; + public static final short LINK = LI+1; + public static final short LISTING = LINK+1; + public static final short MAP = LISTING+1; + public static final short MARQUEE = MAP+1; + public static final short MENU = MARQUEE+1; + public static final short META = MENU+1; + public static final short MULTICOL = META+1; + public static final short NEXTID = MULTICOL+1; + public static final short NOBR = NEXTID+1; + public static final short NOEMBED = NOBR+1; + public static final short NOFRAMES = NOEMBED+1; + public static final short NOLAYER = NOFRAMES+1; + public static final short NOSCRIPT = NOLAYER+1; + public static final short OBJECT = NOSCRIPT+1; + public static final short OL = OBJECT+1; + public static final short OPTION = OL+1; + public static final short OPTGROUP = OPTION+1; + public static final short P = OPTGROUP+1; + public static final short PARAM = P+1; + public static final short PLAINTEXT = PARAM+1; + public static final short PRE = PLAINTEXT+1; + public static final short Q = PRE+1; + public static final short RB = Q+1; + public static final short RBC = RB+1; + public static final short RP = RBC+1; + public static final short RT = RP+1; + public static final short RTC = RT+1; + public static final short RUBY = RTC+1; + public static final short S = RUBY+1; + public static final short SAMP = S+1; + public static final short SCRIPT = SAMP+1; + public 
static final short SELECT = SCRIPT+1; + public static final short SMALL = SELECT+1; + public static final short SOUND = SMALL+1; + public static final short SPACER = SOUND+1; + public static final short SPAN = SPACER+1; + public static final short STRIKE = SPAN+1; + public static final short STRONG = STRIKE+1; + public static final short STYLE = STRONG+1; + public static final short SUB = STYLE+1; + public static final short SUP = SUB+1; + public static final short TABLE = SUP+1; + public static final short TBODY = TABLE+1; + public static final short TD = TBODY+1; + public static final short TEXTAREA = TD+1; + public static final short TFOOT = TEXTAREA+1; + public static final short TH = TFOOT+1; + public static final short THEAD = TH+1; + public static final short TITLE = THEAD+1; + public static final short TR = TITLE+1; + public static final short TT = TR+1; + public static final short U = TT+1; + public static final short UL = U+1; + public static final short VAR = UL+1; + public static final short WBR = VAR+1; + public static final short XML = WBR+1; + public static final short XMP = XML+1; + public static final short UNKNOWN = XMP+1; + + // information + + /** Element information organized by first letter. */ + protected static final Element[][] ELEMENTS_ARRAY = new Element[26][]; + + /** Element information as a contiguous list. */ + protected static final ElementList ELEMENTS = new ElementList(); + + /** No such element. */ + public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY,HEAD}/*HTML*/, null); + + // + // Static initializer + // + + /** + * Initializes the element information. + *

+ * Note: + * The getElement method requires that the HTML elements + * are added to the list in alphabetical order. If new elements are + * added, then they must be inserted in alphabetical order. + */ + static { + // + // + // + // + // + // + // + // + + // initialize array of element information + ELEMENTS_ARRAY['A'-'A'] = new Element[] { + // A - - (%inline;)* -(A) + new Element(A, "A", Element.INLINE, BODY, new short[] {A}), + // ABBR - - (%inline;)* + new Element(ABBR, "ABBR", Element.INLINE, BODY, null), + // ACRONYM - - (%inline;)* + new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null), + // ADDRESS - - (%inline;)* + new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null), + // APPLET + new Element(APPLET, "APPLET", 0, BODY, null), + // AREA - O EMPTY + new Element(AREA, "AREA", Element.EMPTY, MAP, null), + }; + ELEMENTS_ARRAY['B'-'A'] = new Element[] { + // B - - (%inline;)* + new Element(B, "B", Element.INLINE, BODY, null), + // BASE - O EMPTY + new Element(BASE, "BASE", Element.EMPTY, HEAD, null), + // BASEFONT + new Element(BASEFONT, "BASEFONT", 0, HEAD, null), + // BDO - - (%inline;)* + new Element(BDO, "BDO", Element.INLINE, BODY, null), + // BGSOUND + new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null), + // BIG - - (%inline;)* + new Element(BIG, "BIG", Element.INLINE, BODY, null), + // BLINK + new Element(BLINK, "BLINK", Element.INLINE, BODY, null), + // BLOCKQUOTE - - (%block;|SCRIPT)+ + new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}), + // BODY O O (%block;|SCRIPT)+ +(INS|DEL) + new Element(BODY, "BODY", Element.CONTAINER, HTML, new short[]{HEAD}), + // BR - O EMPTY + new Element(BR, "BR", Element.EMPTY, BODY, null), + // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET) + new Element(BUTTON, "BUTTON", 0, BODY, null), + }; + ELEMENTS_ARRAY['C'-'A'] = new Element[] { + // CAPTION - - (%inline;)* + new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null), + // CENTER, + new Element(CENTER, "CENTER", 
0, BODY, null), + // CITE - - (%inline;)* + new Element(CITE, "CITE", Element.INLINE, BODY, null), + // CODE - - (%inline;)* + new Element(CODE, "CODE", Element.INLINE, BODY, null), + // COL - O EMPTY + new Element(COL, "COL", Element.EMPTY, TABLE, null), + // COLGROUP - O (COL)* + new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}), + // COMMENT + new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null), + }; + ELEMENTS_ARRAY['D'-'A'] = new Element[] { + // DEL - - (%flow;)* + new Element(DEL, "DEL", 0, BODY, null), + // DFN - - (%inline;)* + new Element(DFN, "DFN", Element.INLINE, BODY, null), + // DIR + new Element(DIR, "DIR", 0, BODY, null), + // DIV - - (%flow;)* + new Element(DIV, "DIV", Element.BLOCK, BODY, new short[]{P}), + // DD - O (%flow;)* + new Element(DD, "DD", 0, DL, new short[]{DT,DD}), + // DL - - (DT|DD)+ + new Element(DL, "DL", Element.BLOCK, BODY, null), + // DT - O (%inline;)* + new Element(DT, "DT", 0, DL, new short[]{DT,DD}), + }; + ELEMENTS_ARRAY['E'-'A'] = new Element[] { + // EM - - (%inline;)* + new Element(EM, "EM", Element.INLINE, BODY, null), + // EMBED + new Element(EMBED, "EMBED", 0, BODY, null), + }; + ELEMENTS_ARRAY['F'-'A'] = new Element[] { + // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*) + new Element(FIELDSET, "FIELDSET", 0, BODY, null), + // FONT + new Element(FONT, "FONT", Element.CONTAINER, BODY, null), + // FORM - - (%block;|SCRIPT)+ -(FORM) + new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,DIV}, new short[]{BUTTON,P}), + // FRAME - O EMPTY + new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null), + // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?) 
+ new Element(FRAMESET, "FRAMESET", 0, HTML, null), + }; + ELEMENTS_ARRAY['H'-'A'] = new Element[] { + // (H1|H2|H3|H4|H5|H6) - - (%inline;)* + new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), + new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), + new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), + new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), + new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), + new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), + // HEAD O O (%head.content;) +(%head.misc;) + new Element(HEAD, "HEAD", 0, HTML, null), + // HR - O EMPTY + new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}), + // HTML O O (%html.content;) + new Element(HTML, "HTML", 0, null, null), + }; + ELEMENTS_ARRAY['I'-'A'] = new Element[] { + // I - - (%inline;)* + new Element(I, "I", Element.INLINE, BODY, null), + // IFRAME + new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null), + // ILAYER + new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null), + // IMG - O EMPTY + new Element(IMG, "IMG", Element.EMPTY, BODY, null), + // INPUT - O EMPTY + new Element(INPUT, "INPUT", Element.EMPTY, BODY, null), + // INS - - (%flow;)* + new Element(INS, "INS", 0, BODY, null), + // ISINDEX + new Element(ISINDEX, "ISINDEX", 0, HEAD, null), + }; + ELEMENTS_ARRAY['K'-'A'] = new Element[] { + // KBD - - (%inline;)* + new Element(KBD, "KBD", Element.INLINE, BODY, null), + // KEYGEN + new Element(KEYGEN, "KEYGEN", 0, BODY, null), + }; + ELEMENTS_ARRAY['L'-'A'] = new Element[] { + // LABEL - - (%inline;)* -(LABEL) + new Element(LABEL, "LABEL", 0, BODY, null), + // LAYER + new Element(LAYER, "LAYER", Element.BLOCK, BODY, null), + // LEGEND - - (%inline;)* + new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null), + // LI - O 
(%flow;)* + new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}), + // LINK - O EMPTY + new Element(LINK, "LINK", Element.EMPTY, HEAD, null), + // LISTING + new Element(LISTING, "LISTING", 0, BODY, null), + }; + ELEMENTS_ARRAY['M'-'A'] = new Element[] { + // MAP - - ((%block;) | AREA)+ + new Element(MAP, "MAP", Element.INLINE, BODY, null), + // MARQUEE + new Element(MARQUEE, "MARQUEE", 0, BODY, null), + // MENU + new Element(MENU, "MENU", 0, BODY, null), + // META - O EMPTY + new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}), + // MULTICOL + new Element(MULTICOL, "MULTICOL", 0, BODY, null), + }; + ELEMENTS_ARRAY['N'-'A'] = new Element[] { + // NEXTID + new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null), + // NOBR + new Element(NOBR, "NOBR", Element.INLINE, BODY, null), + // NOEMBED + new Element(NOEMBED, "NOEMBED", 0, BODY, null), + // NOFRAMES - - (BODY) -(NOFRAMES) + new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null), + // NOLAYER + new Element(NOLAYER, "NOLAYER", 0, BODY, null), + // NOSCRIPT - - (%block;)+ + new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{BODY}, null), + }; + ELEMENTS_ARRAY['O'-'A'] = new Element[] { + // OBJECT - - (PARAM | %flow;)* + new Element(OBJECT, "OBJECT", 0, BODY, null), + // OL - - (LI)+ + new Element(OL, "OL", Element.BLOCK, BODY, null), + // OPTGROUP - - (OPTION)+ + new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}), + // OPTION - O (#PCDATA) + new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}), + }; + ELEMENTS_ARRAY['P'-'A'] = new Element[] { + // P - O (%inline;)* + new Element(P, "P", Element.CONTAINER, BODY, new short[]{P}), + // PARAM - O EMPTY + new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null), + // PLAINTEXT + new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null), + // PRE - - (%inline;)* -(%pre.exclusion;) + new Element(PRE, "PRE", 0, BODY, null), + }; + ELEMENTS_ARRAY['Q'-'A'] = new Element[] { + // Q - - 
(%inline;)* + new Element(Q, "Q", Element.INLINE, BODY, null), + }; + ELEMENTS_ARRAY['R'-'A'] = new Element[] { + // RB + new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}), + // RBC + new Element(RBC, "RBC", 0, RUBY, null), + // RP + new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}), + // RT + new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}), + // RTC + new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}), + // RUBY + new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}), + }; + ELEMENTS_ARRAY['S'-'A'] = new Element[] { + // S + new Element(S, "S", 0, BODY, null), + // SAMP - - (%inline;)* + new Element(SAMP, "SAMP", Element.INLINE, BODY, null), + // SCRIPT - - %Script; + new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null), + // SELECT - - (OPTGROUP|OPTION)+ + new Element(SELECT, "SELECT", Element.CONTAINER, BODY, new short[]{SELECT}), + // SMALL - - (%inline;)* + new Element(SMALL, "SMALL", Element.INLINE, BODY, null), + // SOUND + new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null), + // SPACER + new Element(SPACER, "SPACER", Element.EMPTY, BODY, null), + // SPAN - - (%inline;)* + new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null), + // STRIKE + new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null), + // STRONG - - (%inline;)* + new Element(STRONG, "STRONG", Element.INLINE, BODY, null), + // STYLE - - %StyleSheet; + new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}), + // SUB - - (%inline;)* + new Element(SUB, "SUB", Element.INLINE, BODY, null), + // SUP - - (%inline;)* + new Element(SUP, "SUP", Element.INLINE, BODY, null), + }; + ELEMENTS_ARRAY['T'-'A'] = new Element[] { + // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+) + new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null), + // TBODY O O (TR)+ + new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}), + // TD - O (%flow;)* + new 
Element(TD, "TD", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}), + // TEXTAREA - - (#PCDATA) + new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null), + // TFOOT - O (TR)+ + new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}), + // TH - O (%flow;)* + new Element(TH, "TH", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}), + // THEAD - O (TR)+ + new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}), + // TITLE - - (#PCDATA) -(%head.misc;) + new Element(TITLE, "TITLE", Element.SPECIAL, new short[]{HEAD,BODY}, null), + // TR - O (TH|TD)+ + new Element(TR, "TR", Element.BLOCK, new short[]{TBODY, THEAD, TFOOT}, TABLE, new short[]{TD,TH,TR,COLGROUP}), + // TT - - (%inline;)* + new Element(TT, "TT", Element.INLINE, BODY, null), + }; + ELEMENTS_ARRAY['U'-'A'] = new Element[] { + // U, + new Element(U, "U", Element.INLINE, BODY, null), + // UL - - (LI)+ + new Element(UL, "UL", Element.BLOCK, BODY, null), + }; + ELEMENTS_ARRAY['V'-'A'] = new Element[] { + // VAR - - (%inline;)* + new Element(VAR, "VAR", Element.INLINE, BODY, null), + }; + ELEMENTS_ARRAY['W'-'A'] = new Element[] { + // WBR + new Element(WBR, "WBR", Element.EMPTY, BODY, null), + }; + ELEMENTS_ARRAY['X'-'A'] = new Element[] { + // XML + new Element(XML, "XML", 0, BODY, null), + // XMP + new Element(XMP, "XMP", Element.SPECIAL, BODY, null), + }; + + // keep contiguous list of elements for lookups by code + for (int i = 0; i < ELEMENTS_ARRAY.length; i++) { + Element[] elements = ELEMENTS_ARRAY[i]; + if (elements != null) { + for (int j = 0; j < elements.length; j++) { + Element element = elements[j]; + ELEMENTS.addElement(element); + } + } + } + ELEMENTS.addElement(NO_SUCH_ELEMENT); + + // initialize cross references to parent elements + for (int i = 0; i < ELEMENTS.size; i++) { + Element element = ELEMENTS.data[i]; + if (element.parentCodes != null) { + element.parent = new Element[element.parentCodes.length]; + for (int j = 0; j < element.parentCodes.length; j++) { + 
element.parent[j] = ELEMENTS.data[element.parentCodes[j]]; + } + element.parentCodes = null; + } + } + + } // () + + // + // Public static methods + // + + /** + * Returns the element information for the specified element code. + * + * @param code The element code. + */ + public static final Element getElement(short code) { + return ELEMENTS.data[code]; + } // getElement(short):Element + + /** + * Returns the element information for the specified element name. + * + * @param ename The element name. + */ + public static final Element getElement(String ename) { + return getElement(ename, NO_SUCH_ELEMENT); + } // getElement(String):Element + + /** + * Returns the element information for the specified element name. + * + * @param ename The element name. + * @param element The default element to return if not found. + */ + public static final Element getElement(String ename, Element element) { + + if (ename.length() > 0) { + int c = ename.charAt(0); + if (c >= 'a' && c <= 'z') { + c = 'A' + c - 'a'; + } + if (c >= 'A' && c <= 'Z') { + Element[] elements = ELEMENTS_ARRAY[c - 'A']; + if (elements != null) { + for (int i = 0; i < elements.length; i++) { + Element elem = elements[i]; + if (elem.name.equalsIgnoreCase(ename)) { + return elem; + } + } + } + } + } + return element; + + } // getElement(String):Element + + // + // Classes + // + + /** + * Element information. + * + * @author Andy Clark + */ + public static class Element { + + // + // Constants + // + + /** Inline element. */ + public static final int INLINE = 0x01; + + /** Block element. */ + public static final int BLOCK = 0x02; + + /** Empty element. */ + public static final int EMPTY = 0x04; + + /** Container element. */ + public static final int CONTAINER = 0x08; + + /** Special element. */ + public static final int SPECIAL = 0x10; + + // + // Data + // + + /** The element code. */ + public short code; + + /** The element name. */ + public String name; + + /** Informational flags. 
*/ + public int flags; + + /** Parent elements. */ + public short[] parentCodes; + + /** Parent elements. */ + public Element[] parent; + + /** The bounding element code. */ + public short bounds; + + /** List of elements this element can close. */ + public short[] closes; + + /** If set to true, then this element may not be nested, example: "A" **/ + boolean nestable = true; + + // + // Constructors + // + + /** + * Constructs an element object. + * + * @param code The element code. + * @param name The element name. + * @param flags Informational flags + * @param parent Natural closing parent name. + * @param closes List of elements this element can close. + */ + public Element(short code, String name, int flags, + short parent, short[] closes) { + this(code, name, flags, new short[]{parent}, (short)-1, closes); + } // (short,String,int,short,short[]); + + /** + * Constructs an element object. + * + * @param code The element code. + * @param name The element name. + * @param flags Informational flags + * @param parent Natural closing parent name. + * @param closes List of elements this element can close. + */ + public Element(short code, String name, int flags, + short parent, short bounds, short[] closes) { + this(code, name, flags, new short[]{parent}, bounds, closes); + } // (short,String,int,short,short,short[]) + + /** + * Constructs an element object. + * + * @param code The element code. + * @param name The element name. + * @param flags Informational flags + * @param parents Natural closing parent names. + * @param closes List of elements this element can close. + */ + public Element(short code, String name, int flags, + short[] parents, short[] closes) { + this(code, name, flags, parents, (short)-1, closes); + } // (short,String,int,short[],short[]) + + /** + * Constructs an element object. + * + * @param code The element code. + * @param name The element name. + * @param flags Informational flags + * @param parents Natural closing parent names. 
+ * @param closes List of elements this element can close. + */ + public Element(short code, String name, int flags, + short[] parents, short bounds, short[] closes) { + this.code = code; + this.name = name; + this.flags = flags; + this.parentCodes = parents; + this.parent = null; + this.bounds = bounds; + this.closes = closes; + if(closes != null) { + for(int i=0;i(short,String,int,short[],short,short[]) + + // + // Public methods + // + + /** Returns true if this element is an inline element. */ + public final boolean isInline() { + return (flags & INLINE) != 0; + } // isInline():boolean + + /** Returns true if this element is a block element. */ + public final boolean isBlock() { + return (flags & BLOCK) != 0; + } // isBlock():boolean + + /** Returns true if this element is an empty element. */ + public final boolean isEmpty() { + return (flags & EMPTY) != 0; + } // isEmpty():boolean + + /** Returns true if this element is a container element. */ + public final boolean isContainer() { + return (flags & CONTAINER) != 0; + } // isContainer():boolean + + /** + * Returns true if this element is special -- if its content + * should be parsed ignoring markup. + */ + public final boolean isSpecial() { + return (flags & SPECIAL) != 0; + } // isSpecial():boolean + + /** + * Returns true if this element can close the specified Element. + * + * @param tag The element. + */ + public boolean closes(short tag) { + + if (closes != null) { + for (int i = 0; i < closes.length; i++) { + if (closes[i] == tag) { + return true; + } + } + } + return false; + + } // closes(short):boolean + + // + // Object methods + // + + /** Returns a hash code for this object. */ + public int hashCode() { + return name.hashCode(); + } // hashCode():int + + /** Returns true if the objects are equal. 
*/ + public boolean equals(Object o) { + return name.equals(o); + } // equals(Object):boolean + + /** + * Provides a simple representation to make debugging easier + */ + public String toString() { + return super.toString() + "(name=" + name + ")"; + } + + /** + * Indicates if the provided element is an accepted parent of current element + * @param element the element to test for "paternity" + * @return true if element belongs to the {@link #parent} + */ + public boolean isParent(final Element element) { + if (parent == null) + return false; + else { + for (int i=0; i + *

  • add missing parent elements; + *
  • automatically close elements with optional end tags; and + *
  • handle mis-matched inline element tags. + * + *

    + * This component recognizes the following features: + *

      + *
    • http://cyberneko.org/html/features/augmentations + *
    • http://cyberneko.org/html/features/report-errors + *
    • http://cyberneko.org/html/features/balance-tags/document-fragment + *
    • http://cyberneko.org/html/features/balance-tags/ignore-outside-content + *
    + *

    + * This component recognizes the following properties: + *

      + *
    • http://cyberneko.org/html/properties/names/elems + *
    • http://cyberneko.org/html/properties/names/attrs + *
    • http://cyberneko.org/html/properties/error-reporter + *
    • http://cyberneko.org/html/properties/balance-tags/fragment-context-stack + *
    + * + * @see HTMLElements + * + * @author Andy Clark + * @author Marc Guillemot + * + * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $ + */ +public class HTMLTagBalancer + implements XMLDocumentFilter, HTMLComponent { + + // + // Constants + // + + // features + + /** Namespaces. */ + protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces"; + + /** Include infoset augmentations. */ + protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations"; + + /** Report errors. */ + protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors"; + + /** Document fragment balancing only (deprecated). */ + protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment"; + + /** Document fragment balancing only. */ + protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment"; + + /** Ignore outside content. */ + protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content"; + + /** Recognized features. */ + private static final String[] RECOGNIZED_FEATURES = { + NAMESPACES, + AUGMENTATIONS, + REPORT_ERRORS, + DOCUMENT_FRAGMENT_DEPRECATED, + DOCUMENT_FRAGMENT, + IGNORE_OUTSIDE_CONTENT, + }; + + /** Recognized features defaults. */ + private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = { + null, + null, + null, + null, + Boolean.FALSE, + Boolean.FALSE, + }; + + // properties + + /** Modify HTML element names: { "upper", "lower", "default" }. */ + protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems"; + + /** Modify HTML attribute names: { "upper", "lower", "default" }. */ + protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs"; + + /** Error reporter. 
*/ + protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter"; + + /** + * EXPERIMENTAL: may change in next release
    + * Name of the property holding the stack of elements in which context a document fragment should be parsed. + **/ + public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack"; + + /** Recognized properties. */ + private static final String[] RECOGNIZED_PROPERTIES = { + NAMES_ELEMS, + NAMES_ATTRS, + ERROR_REPORTER, + FRAGMENT_CONTEXT_STACK, + }; + + /** Recognized properties defaults. */ + private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = { + null, + null, + null, + null, + }; + + // modify HTML names + + /** Don't modify HTML names. */ + protected static final short NAMES_NO_CHANGE = 0; + + /** Match HTML element names. */ + protected static final short NAMES_MATCH = 0; + + /** Uppercase HTML names. */ + protected static final short NAMES_UPPERCASE = 1; + + /** Lowercase HTML names. */ + protected static final short NAMES_LOWERCASE = 2; + + // static vars + + /** Synthesized event info item. */ + protected static final HTMLEventInfo SYNTHESIZED_ITEM = + new HTMLEventInfo.SynthesizedItem(); + + // + // Data + // + + // features + + /** Namespaces. */ + protected boolean fNamespaces; + + /** Include infoset augmentations. */ + protected boolean fAugmentations; + + /** Report errors. */ + protected boolean fReportErrors; + + /** Document fragment balancing only. */ + protected boolean fDocumentFragment; + + /** Ignore outside content. */ + protected boolean fIgnoreOutsideContent; + + // properties + + /** Modify HTML element names. */ + protected short fNamesElems; + + /** Modify HTML attribute names. */ + protected short fNamesAttrs; + + /** Error reporter. */ + protected HTMLErrorReporter fErrorReporter; + + // connections + + /** The document source. */ + protected XMLDocumentSource fDocumentSource; + + /** The document handler. */ + protected XMLDocumentHandler fDocumentHandler; + + // state + + /** The element stack. 
*/ + protected final InfoStack fElementStack = new InfoStack(); + + /** The inline stack. */ + protected final InfoStack fInlineStack = new InfoStack(); + + /** True if seen anything. Important for xml declaration. */ + protected boolean fSeenAnything; + + /** True if the doctype declaration has been seen (presumed from the field name -- TODO confirm). */ + protected boolean fSeenDoctype; + + /** True if root element has been seen. */ + protected boolean fSeenRootElement; + + /** + * True if seen the end of the document element. In other words, + * this variable is set to false until the end </HTML> + * tag is seen (or synthesized). This is used to ensure that + * extraneous events after the end of the document element do not + * make the document stream ill-formed. + */ + protected boolean fSeenRootElementEnd; + + /** True if seen <head> element. */ + protected boolean fSeenHeadElement; + + /** True if seen <body> element. */ + protected boolean fSeenBodyElement; + + /** True if a form is in the stack (allows the start tag of a nested form to be discarded). */ + protected boolean fOpenedForm; + + // temp vars + + /** A qualified name. */ + private final QName fQName = new QName(); + + /** Empty attributes. */ + private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl(); + + /** Augmentations. */ + private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations(); + + protected HTMLTagBalancingListener tagBalancingListener; + private LostText lostText_ = new LostText(); + + private boolean forcedStartElement_ = false; + private boolean forcedEndElement_ = false; + + /** + * Stack of elements determining the context in which a document fragment should be parsed + */ + private QName[] fragmentContextStack_ = null; + private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set + + private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList(); + + // + // HTMLComponent methods + // + + /** Returns the default state for a feature. 
*/ + public Boolean getFeatureDefault(String featureId) { + int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0; + for (int i = 0; i < length; i++) { + if (RECOGNIZED_FEATURES[i].equals(featureId)) { + return RECOGNIZED_FEATURES_DEFAULTS[i]; + } + } + return null; + } // getFeatureDefault(String):Boolean + + /** Returns the default state for a property. */ + public Object getPropertyDefault(String propertyId) { + int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0; + for (int i = 0; i < length; i++) { + if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) { + return RECOGNIZED_PROPERTIES_DEFAULTS[i]; + } + } + return null; + } // getPropertyDefault(String):Object + + // + // XMLComponent methods + // + + /** Returns recognized features. */ + public String[] getRecognizedFeatures() { + return RECOGNIZED_FEATURES; + } // getRecognizedFeatures():String[] + + /** Returns recognized properties. */ + public String[] getRecognizedProperties() { + return RECOGNIZED_PROPERTIES; + } // getRecognizedProperties():String[] + + /** Resets the component. */ + public void reset(XMLComponentManager manager) + throws XMLConfigurationException { + + // get features + fNamespaces = manager.getFeature(NAMESPACES); + fAugmentations = manager.getFeature(AUGMENTATIONS); + fReportErrors = manager.getFeature(REPORT_ERRORS); + fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT) || + manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED); + fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT); + + // get properties + fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS))); + fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS))); + fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER); + + fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK); + + } // reset(XMLComponentManager) + + /** Sets a feature. 
*/ + public void setFeature(String featureId, boolean state) + throws XMLConfigurationException { + + if (featureId.equals(AUGMENTATIONS)) { + fAugmentations = state; + return; + } + if (featureId.equals(REPORT_ERRORS)) { + fReportErrors = state; + return; + } + if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) { + fIgnoreOutsideContent = state; + return; + } + + } // setFeature(String,boolean) + + /** Sets a property. */ + public void setProperty(String propertyId, Object value) + throws XMLConfigurationException { + + if (propertyId.equals(NAMES_ELEMS)) { + fNamesElems = getNamesValue(String.valueOf(value)); + return; + } + + if (propertyId.equals(NAMES_ATTRS)) { + fNamesAttrs = getNamesValue(String.valueOf(value)); + return; + } + + } // setProperty(String,Object) + + // + // XMLDocumentSource methods + // + + /** Sets the document handler. */ + public void setDocumentHandler(XMLDocumentHandler handler) { + fDocumentHandler = handler; + } // setDocumentHandler(XMLDocumentHandler) + + // @since Xerces 2.1.0 + + /** Returns the document handler. */ + public XMLDocumentHandler getDocumentHandler() { + return fDocumentHandler; + } // getDocumentHandler():XMLDocumentHandler + + // + // XMLDocumentHandler methods + // + + // since Xerces-J 2.2.0 + + /** Start document. 
*/ + public void startDocument(XMLLocator locator, String encoding, + NamespaceContext nscontext, Augmentations augs) + throws XNIException { + + // reset state + fElementStack.top = 0; + if (fragmentContextStack_ != null) { + fragmentContextStackSize_ = fragmentContextStack_.length; + for (int i=0; i and have been buffered to consider outside content + fIgnoreOutsideContent = true; // endElement should not ignore the elements passed from buffer + consumeBufferedEndElements(); + + // handle empty document + if (!fSeenRootElement && !fDocumentFragment) { + if (fReportErrors) { + fErrorReporter.reportError("HTML2000", null); + } + if (fDocumentHandler != null) { + fSeenRootElementEnd = false; + forceStartBody(); // will force and + final String body = modifyName("body", fNamesElems); + fQName.setValues(null, body, body, null); + callEndElement(fQName, synthesizedAugs()); + + final String ename = modifyName("html", fNamesElems); + fQName.setValues(null, ename, ename, null); + callEndElement(fQName, synthesizedAugs()); + } + } + + // pop all remaining elements + else { + int length = fElementStack.top - fragmentContextStackSize_; + for (int i = 0; i < length; i++) { + Info info = fElementStack.pop(); + if (fReportErrors) { + String ename = info.qname.rawname; + fErrorReporter.reportWarning("HTML2001", new Object[]{ename}); + } + if (fDocumentHandler != null) { + callEndElement(info.qname, synthesizedAugs()); + } + } + } + + // call handler + if (fDocumentHandler != null) { + fDocumentHandler.endDocument(augs); + } + + } // endDocument(Augmentations) + + /** + * Consume elements that have been buffered, like that are first consumed + * at the end of document + */ + private void consumeBufferedEndElements() { + final List toConsume = new ArrayList(endElementsBuffer_); + endElementsBuffer_.clear(); + for (int i=0; i (if any) has been buffered + } + else if (elementCode == HTMLElements.BODY) { + // create if none was present + if (!fSeenHeadElement) { + final QName head = 
createQName("head"); + forceStartElement(head, null, synthesizedAugs()); + endElement(head, synthesizedAugs()); + } + consumeBufferedEndElements(); // (if any) has been buffered + + if (fSeenBodyElement) { + notifyDiscardedStartElement(elem, attrs, augs); + return; + } + fSeenBodyElement = true; + } + else if (elementCode == HTMLElements.FORM) { + if (fOpenedForm) { + notifyDiscardedStartElement(elem, attrs, augs); + return; + } + fOpenedForm = true; + } + else if (elementCode == HTMLElements.UNKNOWN) { + consumeBufferedEndElements(); + } + + // check proper parent + if (element.parent != null) { + if (!fSeenRootElement && !fDocumentFragment) { + String pname = element.parent[0].name; + pname = modifyName(pname, fNamesElems); + if (fReportErrors) { + String ename = elem.rawname; + fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname}); + } + final QName qname = new QName(null, pname, pname, null); + final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); + if (!parentCreated) { + if (!isForcedCreation) { + notifyDiscardedStartElement(elem, attrs, augs); + } + return; + } + } + else { + HTMLElements.Element preferedParent = element.parent[0]; + if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) { + int depth = getParentDepth(element.parent, element.bounds); + if (depth == -1) { // no parent found + final String pname = modifyName(preferedParent.name, fNamesElems); + final QName qname = new QName(null, pname, pname, null); + if (fReportErrors) { + String ename = elem.rawname; + fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname}); + } + final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); + if (!parentCreated) { + if (!isForcedCreation) { + notifyDiscardedStartElement(elem, attrs, augs); + } + return; + } + } + } + } + } + + // if block element, save immediate parent inline elements + int depth = 0; + if (element.flags == 0) { + int length = 
fElementStack.top; + fInlineStack.top = 0; + for (int i = length - 1; i >= 0; i--) { + Info info = fElementStack.data[i]; + if (!info.element.isInline()) { + break; + } + fInlineStack.push(info); + endElement(info.qname, synthesizedAugs()); + } + depth = fInlineStack.top; + } + + // close previous elements + // all elements close a