Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
pvdlg committed Apr 15, 2013
0 parents commit 366d8b4
Show file tree
Hide file tree
Showing 91 changed files with 9,302 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/bin
/target
/.settings
/.classpath
/.DS_Store
/.project
18 changes: 18 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

boilerpipe

Copyright (c) 2009-2011 Christian Kohlschütter

The author licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
boilerpipe
======

Repackaging of [boilerpipe](https://code.google.com/p/boilerpipe/) published on Maven Central Repository.

Overview
--------
This is a repackaging of the latest sources of [boilerpipe](https://code.google.com/p/boilerpipe/) with some improvements:

* Published on Maven Central Repository
* Media extraction (Youtube videos, Vimeo videos and Images) within an article from [Netbreeze-GmbH fork](https://github.com/Netbreeze-GmbH/boilerpipe)

Getting started
-----

The best way to start is to look at [boilerpipe QuickStart](https://code.google.com/p/boilerpipe/wiki/QuickStart)

### Including boilerpipe in your project

Simply add a new dependency to your `pom.xml`:

```xml
<dependency>
<groupId>com.syncthemall</groupId>
<artifactId>boilerpipe</artifactId>
<version>1.2.1</version>
</dependency>
```
147 changes: 147 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.syncthemall</groupId>
<artifactId>boilerpipe</artifactId>
<version>1.2.1</version>
<packaging>jar</packaging>
<name>boilerpipe</name>
<description>Repackaging of boilerpipe with media extraction (Youtube videos, Vimeo videos and images), published on Maven Central Repository.</description>
<url>https://github.com/vanduynslagerp/boilerpipe</url>
<licenses>
<license>
<name>Apache License 2.0</name>
<distribution>repo</distribution>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
</license>
</licenses>
<scm>
<connection>scm:git:git@github.com:vanduynslagerp/boilerpipe.git</connection>
<developerConnection>scm:git:git@github.com:vanduynslagerp/boilerpipe.git</developerConnection>
<url>https://github.com/vanduynslagerp/boilerpipe</url>
</scm>
<developers>
<developer>
<id>1</id>
<name>Christian Kohlschütter</name>
<url>http://www.kohlschutter.com</url>
<roles>
<role>project initiator</role>
</roles>
</developer>
<developer>
<id>2</id>
<name>Manuel Codiga</name>
<email>[email protected]</email>
<roles>
<role>contributor</role>
</roles>
</developer>
</developers>
<dependencies>
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.18</version>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.11.0</version>
</dependency>
</dependencies>
<build>
<defaultGoal>package</defaultGoal>
<finalName>${project.artifactId}-${project.version}</finalName>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.6</version>
<configuration>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<encoding>${project.build.sourceEncoding}</encoding>
<source>${source.version}</source>
<target>${source.version}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.4.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.4</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<distributionManagement>
<repository>
<id>nexus-releases</id>
<name>Nexus Release Repository</name>
<url>http://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
</repository>
</distributionManagement>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<source.version>1.6</source.version>
</properties>
</project>
1 change: 1 addition & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/.DS_Store
1 change: 1 addition & 0 deletions src/main/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/.DS_Store
1 change: 1 addition & 0 deletions src/main/java/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/.DS_Store
10 changes: 10 additions & 0 deletions src/main/java/de/l3s/boilerpipe/BoilerpipeDocumentSource.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package de.l3s.boilerpipe;

import de.l3s.boilerpipe.document.TextDocument;

/**
* Something that can be represented as a {@link TextDocument}.
*
* <p>
* Implementations expose that representation through
* {@link #toTextDocument()}.
*/
public interface BoilerpipeDocumentSource {
/**
* Returns this object's representation as a {@link TextDocument}.
*
* @return The {@link TextDocument} representation of this source.
* @throws BoilerpipeProcessingException
*             declared so that implementations can report a failure while
*             producing the document; the exact conditions are
*             implementation-specific.
*/
TextDocument toTextDocument() throws BoilerpipeProcessingException;
}
58 changes: 58 additions & 0 deletions src/main/java/de/l3s/boilerpipe/BoilerpipeExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package de.l3s.boilerpipe;

import java.io.Reader;

import org.xml.sax.InputSource;

import de.l3s.boilerpipe.document.TextDocument;

/**
 * Describes a complete filter pipeline: a {@link BoilerpipeFilter} that can
 * additionally return the extracted text for several kinds of input.
 *
 * @author Christian Kohlschütter
 */
public interface BoilerpipeExtractor extends BoilerpipeFilter {

    /**
     * Extracts text from HTML code supplied as a String.
     *
     * @param html
     *            The HTML code as a String.
     * @return The extracted text.
     * @throws BoilerpipeProcessingException
     *             declared so that implementations can report a failed
     *             extraction.
     */
    String getText(String html) throws BoilerpipeProcessingException;

    /**
     * Extracts text from HTML code made available through an
     * {@link InputSource}.
     *
     * @param is
     *            The InputSource containing the HTML.
     * @return The extracted text.
     * @throws BoilerpipeProcessingException
     *             declared so that implementations can report a failed
     *             extraction.
     */
    String getText(InputSource is) throws BoilerpipeProcessingException;

    /**
     * Extracts text from HTML code made available through a {@link Reader}.
     *
     * @param r
     *            The Reader containing the HTML.
     * @return The extracted text.
     * @throws BoilerpipeProcessingException
     *             declared so that implementations can report a failed
     *             extraction.
     */
    String getText(Reader r) throws BoilerpipeProcessingException;

    /**
     * Extracts text from an existing {@link TextDocument} object.
     *
     * @param doc
     *            The {@link TextDocument}.
     * @return The extracted text.
     * @throws BoilerpipeProcessingException
     *             declared so that implementations can report a failed
     *             extraction.
     */
    String getText(TextDocument doc) throws BoilerpipeProcessingException;
}
40 changes: 40 additions & 0 deletions src/main/java/de/l3s/boilerpipe/BoilerpipeFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**
* boilerpipe
*
* Copyright (c) 2009 Christian Kohlschütter
*
* The author licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe;

import de.l3s.boilerpipe.document.TextDocument;

/**
 * A generic {@link BoilerpipeFilter}: a single processing step that is handed
 * a {@link TextDocument} and processes it somehow.
 *
 * @author Christian Kohlschütter
 */
public interface BoilerpipeFilter {

    /**
     * Processes the given document <code>doc</code>.
     *
     * @param doc
     *            The {@link TextDocument} that is to be processed.
     * @return <code>true</code> if changes have been made to the
     *         {@link TextDocument}.
     * @throws BoilerpipeProcessingException
     *             declared so that implementations can report a failure while
     *             processing the document.
     */
    boolean process(TextDocument doc) throws BoilerpipeProcessingException;
}
35 changes: 35 additions & 0 deletions src/main/java/de/l3s/boilerpipe/BoilerpipeInput.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/**
* boilerpipe
*
* Copyright (c) 2009 Christian Kohlschütter
*
* The author licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe;

import de.l3s.boilerpipe.document.TextDocument;

/**
* A source that returns {@link TextDocument}s.
*
* @author Christian Kohlschütter
*/
public interface BoilerpipeInput {
/**
* Returns a {@link TextDocument}. How the document is obtained is left to
* the implementation.
*
* @return A {@link TextDocument}.
* @throws BoilerpipeProcessingException
*             declared so that implementations can report a failure while
*             obtaining the document; the exact conditions are
*             implementation-specific.
*/
TextDocument getTextDocument() throws BoilerpipeProcessingException;
}
Loading

0 comments on commit 366d8b4

Please sign in to comment.