forked from pvdlg/boilerpipe
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 366d8b4
Showing
91 changed files
with
9,302 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
/bin | ||
/target | ||
/.settings | ||
/.classpath | ||
/.DS_Store | ||
/.project |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
|
||
boilerpipe | ||
|
||
Copyright (c) 2009-2011 Christian Kohlschütter | ||
|
||
The author licenses this file to You under the Apache License, Version 2.0 | ||
(the "License"); you may not use this file except in compliance with | ||
the License. You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
boilerpipe | ||
====== | ||
|
||
Repackaging of [boilerpipe](https://code.google.com/p/boilerpipe/) published on Maven Central Repository. | ||
|
||
Overview | ||
-------- | ||
This is a repackaging of the latest sources of [boilerpipe](https://code.google.com/p/boilerpipe/) with some improvements: | ||
|
||
* Published on Maven Central Repository | ||
* Media extraction (YouTube videos, Vimeo videos and images) within an article, taken from the [Netbreeze-GmbH fork](https://github.com/Netbreeze-GmbH/boilerpipe) | ||
|
||
Getting started | ||
----- | ||
|
||
The best way to start is to look at [boilerpipe QuickStart](https://code.google.com/p/boilerpipe/wiki/QuickStart) | ||
|
||
### Including the library in your project | ||
|
||
Simply add a new dependency to your `pom.xml`: | ||
|
||
```xml | ||
<dependency> | ||
<groupId>com.syncthemall</groupId> | ||
<artifactId>boilerpipe</artifactId> | ||
<version>1.2.1</version> | ||
</dependency> | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
<groupId>com.syncthemall</groupId> | ||
<artifactId>boilerpipe</artifactId> | ||
<version>1.2.1</version> | ||
<packaging>jar</packaging> | ||
<name>boilerpipe</name> | ||
<description>Repackaging of boilerpipe with media extraction improvements, published on Maven Central Repository.</description> | ||
<url>https://github.com/vanduynslagerp/boilerpipe</url> | ||
<licenses> | ||
<license> | ||
<name>Apache License 2.0</name> | ||
<distribution>repo</distribution> | ||
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> | ||
</license> | ||
</licenses> | ||
<scm> | ||
<connection>scm:git:[email protected]:vanduynslagerp/boilerpipe.git</connection> | ||
<developerConnection>scm:git:[email protected]:vanduynslagerp/boilerpipe.git</developerConnection> | ||
<url>https://github.com/vanduynslagerp/boilerpipe</url> | ||
</scm> | ||
<developers> | ||
<developer> | ||
<id>1</id> | ||
<name>Christian Kohlschütter</name> | ||
<url>http://www.kohlschutter.com</url> | ||
<roles> | ||
<role>project initiator</role> | ||
</roles> | ||
</developer> | ||
<developer> | ||
<id>2</id> | ||
<name>Manuel Codiga</name> | ||
<email>[email protected]</email> | ||
<roles> | ||
<role>contributor</role> | ||
</roles> | ||
</developer> | ||
</developers> | ||
<dependencies> | ||
<dependency> | ||
<groupId>net.sourceforge.nekohtml</groupId> | ||
<artifactId>nekohtml</artifactId> | ||
<version>1.9.18</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>xerces</groupId> | ||
<artifactId>xercesImpl</artifactId> | ||
<version>2.11.0</version> | ||
</dependency> | ||
</dependencies> | ||
<build> | ||
<defaultGoal>package</defaultGoal> | ||
<finalName>${project.artifactId}-${project.version}</finalName> | ||
<plugins> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-jar-plugin</artifactId> | ||
<version>2.4</version> | ||
<configuration> | ||
<archive> | ||
<manifest> | ||
<addClasspath>true</addClasspath> | ||
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries> | ||
<addDefaultImplementationEntries>true</addDefaultImplementationEntries> | ||
</manifest> | ||
</archive> | ||
</configuration> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-resources-plugin</artifactId> | ||
<version>2.6</version> | ||
<configuration> | ||
<encoding>${project.build.sourceEncoding}</encoding> | ||
</configuration> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-compiler-plugin</artifactId> | ||
<version>3.1</version> | ||
<configuration> | ||
<encoding>${project.build.sourceEncoding}</encoding> | ||
<source>${source.version}</source> | ||
<target>${source.version}</target> | ||
</configuration> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-source-plugin</artifactId> | ||
<version>2.2.1</version> | ||
<executions> | ||
<execution> | ||
<id>attach-sources</id> | ||
<goals> | ||
<goal>jar</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-release-plugin</artifactId> | ||
<version>2.4.1</version> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-javadoc-plugin</artifactId> | ||
<version>2.9</version> | ||
<executions> | ||
<execution> | ||
<id>attach-javadocs</id> | ||
<goals> | ||
<goal>jar</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-gpg-plugin</artifactId> | ||
<version>1.4</version> | ||
<executions> | ||
<execution> | ||
<id>sign-artifacts</id> | ||
<phase>verify</phase> | ||
<goals> | ||
<goal>sign</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
</plugins> | ||
</build> | ||
<distributionManagement> | ||
<repository> | ||
<id>nexus-releases</id> | ||
<name>Nexus Release Repository</name> | ||
<url>http://oss.sonatype.org/service/local/staging/deploy/maven2/</url> | ||
</repository> | ||
</distributionManagement> | ||
<properties> | ||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
<source.version>1.6</source.version> | ||
</properties> | ||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/.DS_Store |
10 changes: 10 additions & 0 deletions
10
src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package de.l3s.boilerpipe; | ||
|
||
import de.l3s.boilerpipe.document.TextDocument; | ||
|
||
/** | ||
* Something that can be represented as a {@link TextDocument}. | ||
*/ | ||
public interface BoilerpipeDocumentSource { | ||
TextDocument toTextDocument() throws BoilerpipeProcessingException; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package de.l3s.boilerpipe; | ||
|
||
import java.io.Reader; | ||
|
||
import org.xml.sax.InputSource; | ||
|
||
import de.l3s.boilerpipe.document.TextDocument; | ||
|
||
/** | ||
* Describes a complete filter pipeline. | ||
* | ||
* @author Christian Kohlschütter | ||
*/ | ||
public interface BoilerpipeExtractor extends BoilerpipeFilter { | ||
/** | ||
* Extracts text from the HTML code given as a String. | ||
* | ||
* @param html | ||
* The HTML code as a String. | ||
* @return The extracted text. | ||
* @throws BoilerpipeProcessingException | ||
*/ | ||
public String getText(final String html) | ||
throws BoilerpipeProcessingException; | ||
|
||
/** | ||
* Extracts text from the HTML code available from the given | ||
* {@link InputSource}. | ||
* | ||
* @param is | ||
* The InputSource containing the HTML | ||
* @return The extracted text. | ||
* @throws BoilerpipeProcessingException | ||
*/ | ||
public String getText(final InputSource is) | ||
throws BoilerpipeProcessingException; | ||
|
||
/** | ||
* Extracts text from the HTML code available from the given {@link Reader}. | ||
* | ||
* @param r | ||
* The Reader containing the HTML | ||
* @return The extracted text. | ||
* @throws BoilerpipeProcessingException | ||
*/ | ||
public String getText(final Reader r) throws BoilerpipeProcessingException; | ||
|
||
/** | ||
* Extracts text from the given {@link TextDocument} object. | ||
* | ||
* @param doc | ||
* The {@link TextDocument}. | ||
* @return The extracted text. | ||
* @throws BoilerpipeProcessingException | ||
*/ | ||
public String getText(TextDocument doc) | ||
throws BoilerpipeProcessingException; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/** | ||
* boilerpipe | ||
* | ||
* Copyright (c) 2009 Christian Kohlschütter | ||
* | ||
* The author licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package de.l3s.boilerpipe; | ||
|
||
import de.l3s.boilerpipe.document.TextDocument; | ||
|
||
/** | ||
* A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and | ||
* processes it somehow. | ||
* | ||
* @author Christian Kohlschütter | ||
*/ | ||
public interface BoilerpipeFilter { | ||
/** | ||
* Processes the given document <code>doc</code>. | ||
* | ||
* @param doc | ||
* The {@link TextDocument} that is to be processed. | ||
* @return <code>true</code> if changes have been made to the | ||
* {@link TextDocument}. | ||
* @throws BoilerpipeProcessingException | ||
*/ | ||
boolean process(final TextDocument doc) | ||
throws BoilerpipeProcessingException; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
/** | ||
* boilerpipe | ||
* | ||
* Copyright (c) 2009 Christian Kohlschütter | ||
* | ||
* The author licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package de.l3s.boilerpipe; | ||
|
||
import de.l3s.boilerpipe.document.TextDocument; | ||
|
||
/** | ||
* A source that returns {@link TextDocument}s. | ||
* | ||
* @author Christian Kohlschütter | ||
*/ | ||
public interface BoilerpipeInput { | ||
/** | ||
* Returns (somehow) a {@link TextDocument}. | ||
* | ||
* @return A {@link TextDocument}. | ||
* @throws BoilerpipeProcessingException | ||
*/ | ||
TextDocument getTextDocument() throws BoilerpipeProcessingException; | ||
} |
Oops, something went wrong.