Skip to content

Commit

Permalink
Rework fulltext crawlers and first prototype
Browse files Browse the repository at this point in the history
Rework fulltext crawlers and first prototype
  • Loading branch information
stefan-kolb committed Aug 18, 2015
1 parent d86c350 commit 353dd8f
Show file tree
Hide file tree
Showing 20 changed files with 539 additions and 326 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[master]
- Fix fulltext crawler for ScienceDirect, SpringerLink, and ACS
- Perform syntax improvements enabled by Java 1.7+ (diamond operator, try-with-resources)
- List of authors is now auto-generated by `scripts/generate-authors.sh` and inserted into L10N About.html
- Remove Mr.DLib support as MR.DLib will be shut down in 2015
Expand Down
3 changes: 3 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ dependencies {
compile 'commons-logging:commons-logging:1.2'

compile 'junit:junit:4.12'

compile 'org.jsoup:jsoup:1.8.3'
compile 'com.mashape.unirest:unirest-java:1.4.6'
}

sourceSets {
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/net/sf/jabref/BibtexEntryType.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ public boolean isVisibleAtNewEntryDialog() {

static {
// Put the standard entry types into the type map.
if (!Globals.prefs.getBoolean(JabRefPreferences.BIBLATEX_MODE)) {
// FIXME: throws NullPointerException when using BibtexEntry without a JabRef instance -> Tests
//if (!Globals.prefs.getBoolean(JabRefPreferences.BIBLATEX_MODE)) {
ALL_TYPES.put("article", BibtexEntryTypes.ARTICLE);
ALL_TYPES.put("inbook", BibtexEntryTypes.INBOOK);
ALL_TYPES.put("book", BibtexEntryTypes.BOOK);
Expand All @@ -127,7 +128,7 @@ public boolean isVisibleAtNewEntryDialog() {
ALL_TYPES.put("misc", BibtexEntryTypes.MISC);
ALL_TYPES.put("other", BibtexEntryTypes.OTHER);
ALL_TYPES.put("ieeetranbstctl", BibtexEntryTypes.IEEETRANBSTCTL);
} else {
/*} else {
ALL_TYPES.put("article", BibLatexEntryTypes.ARTICLE);
ALL_TYPES.put("book", BibLatexEntryTypes.BOOK);
ALL_TYPES.put("inbook", BibLatexEntryTypes.INBOOK);
Expand Down Expand Up @@ -158,7 +159,7 @@ public boolean isVisibleAtNewEntryDialog() {
ALL_TYPES.put("techreport", BibLatexEntryTypes.TECHREPORT);
ALL_TYPES.put("www", BibLatexEntryTypes.WWW);
ALL_TYPES.put("ieeetranbstctl", BibLatexEntryTypes.IEEETRANBSTCTL);
}
}*/
// We need a record of the standard types, in case the user wants
// to remove a customized version. Therefore we clone the map.
STANDARD_TYPES = new TreeMap<String, BibtexEntryType>(ALL_TYPES);
Expand Down
42 changes: 0 additions & 42 deletions src/main/java/net/sf/jabref/external/ACSPdfDownload.java

This file was deleted.

138 changes: 44 additions & 94 deletions src/main/java/net/sf/jabref/external/FindFullText.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,13 @@
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.logic.crawler.ACS;
import net.sf.jabref.logic.crawler.GoogleScholar;
import net.sf.jabref.logic.crawler.ScienceDirect;
import net.sf.jabref.logic.crawler.SpringerLink;
import net.sf.jabref.util.DOI;
import net.sf.jabref.logic.net.URLDownload;

Expand All @@ -36,108 +41,66 @@
*/
public class FindFullText {

private static final int
FOUND_PDF = 0;
private static final int FOUND_PDF = 0;
public static final int WRONG_MIME_TYPE = 1;
public static final int UNKNOWN_DOMAIN = 2;
public static final int LINK_NOT_FOUND = 3;
public static final int IO_EXCEPTION = 4;
public static final int NO_URLS_DEFINED = 5;
public static final int LINK_NOT_FOUND = 2;
public static final int IO_EXCEPTION = 3;

private final List<FullTextFinder> finders = new ArrayList<FullTextFinder>();


public FindFullText() {
finders.add(new ScienceDirectPdfDownload());
finders.add(new SpringerLinkPdfDownload());
finders.add(new ACSPdfDownload());
// Ordering is important, authorities first!
// Publisher
finders.add(new ScienceDirect());
finders.add(new SpringerLink());
finders.add(new ACS());
// Meta search
finders.add(new GoogleScholar());
}

public FindResult findFullText(BibtexEntry entry) {
String urlText = entry.getField("url");
String doiText = entry.getField("doi");
// First try the Doi link, if defined:
if (doiText != null && !doiText.trim().isEmpty()) {
FindResult resDoi = lookForFullTextAtURL(new DOI(doiText).getURL());
if (resDoi.status == FindFullText.FOUND_PDF) {
return resDoi;
} else if (urlText != null && !urlText.trim().isEmpty()) {
FindResult resUrl = lookForFullTextAtURL(urlText);
if (resUrl.status == FindFullText.FOUND_PDF) {
return resUrl;
} else {
return resDoi; // If both URL and Doi fail, we assume that the error code for Doi is
// probably the most relevant.
}
} else {
return resDoi;
}
}
// No Doi? Try URL:
else if (urlText != null && !urlText.trim().isEmpty()) {
return lookForFullTextAtURL(urlText);
}
// No URL either? Return error code.
else {
return new FindResult(FindFullText.NO_URLS_DEFINED, null);
}
}

private FindResult lookForFullTextAtURL(String urlText) {
try {
URL url = new URL(urlText);
url = resolveRedirects(url, 0);
boolean domainKnown = false;
for (FullTextFinder finder : finders) {
if (finder.supportsSite(url)) {
domainKnown = true;
URL result = finder.findFullTextURL(url);
if (result != null) {

// Check the MIME type of this URL to see if it is a PDF. If not,
// it could be because the user doesn't have access:
try {
String mimeType = new URLDownload(result).determineMimeType();
if (mimeType != null && mimeType.toLowerCase().equals("application/pdf")) {
return new FindResult(result, url);
}
else {
new URLDownload(result).downloadToFile(new File("page.html"));
return new FindResult(FindFullText.WRONG_MIME_TYPE, url);
}
} catch (IOException ex) {
ex.printStackTrace();
return new FindResult(FindFullText.IO_EXCEPTION, url);
}
for (FullTextFinder finder : finders) {
try {
Optional<URL> result = finder.findFullText(entry);

if (result.isPresent()) {
// TODO: recheck this!
// Check the MIME type of this URL to see if it is a PDF. If not,
// it could be because the user doesn't have access:
// FIXME: redirection breaks this!
// Property-based software engineering measurement
// http://drum.lib.umd.edu/bitstream/1903/19/2/CS-TR-3368.pdf
// FIXME:
// INFO: Fulltext PDF found @ Google: https://www.uni-bamberg.de/fileadmin/uni/fakultaeten/wiai_lehrstuehle/praktische_informatik/Dateien/Publikationen/sose14-towards-application-portability-in-paas.pdf
// javax.net.ssl.SSLProtocolException: handshake alert: unrecognized_name
// http://stackoverflow.com/questions/7615645/ssl-handshake-alert-unrecognized-name-error-since-upgrade-to-java-1-7-0
String mimeType = new URLDownload(result.get()).determineMimeType();
if (mimeType != null && mimeType.toLowerCase().equals("application/pdf")) {
return new FindResult(result.get(), result.get());
} else {
return new FindResult(WRONG_MIME_TYPE, result.get());
}

}
} catch (IOException ex) {
ex.printStackTrace();
return new FindResult(IO_EXCEPTION, null);
}
if (!domainKnown) {
return new FindResult(FindFullText.UNKNOWN_DOMAIN, url);
} else {
return new FindResult(FindFullText.LINK_NOT_FOUND, url);
}
} catch (MalformedURLException e) {
e.printStackTrace();

} catch (IOException e) {
e.printStackTrace();
}

return null;
return new FindResult(LINK_NOT_FOUND, null);
}

/**
* Follow redirects until the final location is reached. This is necessary to handle Doi links, which
* redirect to publishers' web sites. We need to know the publisher's domain name in order to choose
* which FullTextFinder to use.
* @param url The url to start with.
*
* @param url The url to start with.
* @param redirectCount The number of previous redirects. We will follow a maximum of 5 redirects.
* @return the final URL, or the initial one in case there is no redirect.
* @throws IOException for connection error
*/
private URL resolveRedirects(URL url, int redirectCount) throws IOException {
private static URL resolveRedirects(URL url, int redirectCount) throws IOException {
URLConnection uc = url.openConnection();
if (uc instanceof HttpURLConnection) {
HttpURLConnection huc = (HttpURLConnection) uc;
Expand All @@ -146,7 +109,7 @@ private URL resolveRedirects(URL url, int redirectCount) throws IOException {
int responseCode = huc.getResponseCode();
String location = huc.getHeaderField("location");
huc.disconnect();
if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP && redirectCount < 5) {
if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP || responseCode == HttpURLConnection.HTTP_MOVED_PERM && redirectCount < 5) {
//System.out.println(responseCode);
//System.out.println(location);
try {
Expand Down Expand Up @@ -185,8 +148,7 @@ public static String loadPage(URL url) throws IOException {
sb.append((char) c);
}
return sb.toString();
}
else {
} else {
return null; // TODO: are other types of connection (https?) relevant?
}
} finally {
Expand Down Expand Up @@ -228,16 +190,4 @@ public FindResult(int status, URL originalUrl) {
}
}
}


public static void dumpToFile(String text, File f) {
try {
FileWriter fw = new FileWriter(f);
fw.write(text);
fw.close();
} catch (IOException e) {
e.printStackTrace();

}
}
}
13 changes: 1 addition & 12 deletions src/main/java/net/sf/jabref/external/FindFullTextAction.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@ public void run() {

@Override
public void update() {
//pdfURL = new URL("http://geog-www.sbs.ohio-state.edu/faculty/bmark/abbott_etal_ppp03.pdf");
if (result.url != null) {
//System.out.println("PDF URL: "+result.url);
String bibtexKey = entry.getCiteKey();
String[] dirs = basePanel.metaData().getFileDirectory(GUIGlobals.FILE_FIELD);
if (dirs.length == 0) {
Expand All @@ -72,7 +70,6 @@ public void update() {

@Override
public void downloadComplete(FileListEntry file) {
System.out.println("finished");
FileListTableModel tm = new FileListTableModel();
String oldValue = entry.getField(GUIGlobals.FILE_FIELD);
tm.setContent(oldValue);
Expand All @@ -93,24 +90,16 @@ public void downloadComplete(FileListEntry file) {
else {
String message = null;
switch (result.status) {
case FindFullText.UNKNOWN_DOMAIN:
message = Localization.lang("Unable to find full text article. No search algorithm "
+ "defined for the '%0' web site.", result.host);
break;
case FindFullText.WRONG_MIME_TYPE:
message = Localization.lang("Found pdf link, but received the wrong MIME type. "
+ "This could indicate that you don't have access to the fulltext article.");
break;
case FindFullText.LINK_NOT_FOUND:
message = Localization.lang("Unable to find full text document in the linked web page.");
message = Localization.lang("Unable to find full text document.");
break;
case FindFullText.IO_EXCEPTION:
message = Localization.lang("Connection error when trying to find full text document.");
break;
case FindFullText.NO_URLS_DEFINED:
message = Localization.lang("This entry provides no URL or DOI links.");
break;

}
basePanel.output(Localization.lang("Full text article download failed"));
JOptionPane.showMessageDialog(basePanel.frame(), message, Localization.lang("Full text article download failed"),
Expand Down
30 changes: 12 additions & 18 deletions src/main/java/net/sf/jabref/external/FullTextFinder.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,25 @@
*/
package net.sf.jabref.external;

import net.sf.jabref.BibtexEntry;

import java.net.URL;
import java.io.IOException;
import java.util.Optional;

/**
* This interface is used for classes that try to resolve a full-text PDF url from an article
* web page. Implementing classes should specialize on specific article sites.
* */
* This interface is used for classes that try to resolve a full-text PDF url for a BibTex entry.
* Implementing classes should specialize on specific article sites.
* See e.g. <a href="http://libguides.mit.edu/apis">http://libguides.mit.edu/apis</a>.
*/
public interface FullTextFinder {

/**
* Report whether this FullTextFinder works for the site providing the given URL.
*
* @param url The url to check.
* @return true if the site is supported, false otherwise. If the site might be supported,
* it is best to return true.
*/
boolean supportsSite(URL url);

/**
* Take the source HTML for an article page, and try to find the URL to the
* full text for this article.
* Tries to find a fulltext URL for a given BibTex entry.
*
* @param url The URL to the article's web page.
* @return The fulltext PDF URL, if found, or null if not found.
* @param entry The Bibtex entry
* @return An Optional containing the fulltext PDF URL if found, or an empty Optional if not found.
* @throws NullPointerException if no BibTex entry is given
* @throws java.io.IOException
*/
URL findFullTextURL(URL url) throws IOException;
Optional<URL> findFullText(BibtexEntry entry) throws IOException;
}
Loading

0 comments on commit 353dd8f

Please sign in to comment.