Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combine synchronized storing and indexing, re-organize components, HAL support (no more DOI centric approach), dependency update, import format update #92

Merged
merged 36 commits into from
Apr 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
41b3f48
Update Dockerfile
Nov 2, 2021
fd74782
Prevent glutton from hanging for two minutes
Nov 2, 2021
6a8a2d1
Add max number of connections to ES to config
Nov 2, 2021
e8d4ca5
preparing working branch for version 0.3
kermitt2 Apr 12, 2022
2c2b637
Merge pull request #68 from mkardas/incremental-update
kermitt2 Apr 12, 2022
89eba9c
fix dependency
kermitt2 Apr 13, 2022
9b88fcb
complete merge
kermitt2 Apr 23, 2022
8e17030
avoid exploding the log size when something is going bad with the loa…
kermitt2 Apr 23, 2022
55af8b6
increase the lmdb map size to be safe
kermitt2 Apr 23, 2022
626cca5
review Readme
kermitt2 Jun 22, 2022
a61eac6
review Readme
kermitt2 Jun 22, 2022
a9a15ef
update crossref dump torrent version
kermitt2 May 3, 2023
8ecb21e
some fix for parsing medline/pubmed recent xml
kermitt2 Aug 31, 2023
35f7762
fix other medline import errors
kermitt2 Sep 1, 2023
e42a127
Big Bang change
kermitt2 Oct 20, 2023
9641a59
extending pmid, various fixes
kermitt2 Feb 18, 2024
d2f6a99
review resources
kermitt2 Feb 20, 2024
9936337
complete oai-pmh and tei parser
kermitt2 Feb 20, 2024
eeec3d4
use logback
kermitt2 Feb 20, 2024
fcd4413
add clean gradle tasks, document
kermitt2 Feb 20, 2024
54345b0
add indexer; update dependencies
kermitt2 Feb 22, 2024
f6446e9
add async indexing; fix some mapping; implement HAL API harvester ins…
kermitt2 Feb 23, 2024
46e702d
hal id injection via doi; some fix on server side and dependencies
kermitt2 Feb 24, 2024
346e80d
review hal id injection
kermitt2 Feb 25, 2024
1d2b7b5
fix unused compression for one map
kermitt2 Feb 25, 2024
4a5fa5b
update es client
kermitt2 Feb 25, 2024
d1554a5
review crossref combined storing+indexing
kermitt2 Feb 25, 2024
96ae5a5
review doc, with proper readthedocs
kermitt2 Feb 25, 2024
d8d37ba
cleaning
kermitt2 Feb 25, 2024
43d7957
fix conflict
kermitt2 Feb 25, 2024
e9d0ef4
format variant
kermitt2 Feb 25, 2024
0d34a17
more variants
kermitt2 Feb 25, 2024
c58e836
better logging
kermitt2 Feb 26, 2024
860ff1c
add HAL audit and related process
kermitt2 Feb 27, 2024
da38e29
fix hal id injection
kermitt2 Feb 29, 2024
8efc84e
update tests
kermitt2 Apr 27, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
/.git
/data
.git
data
logs/*
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ data/db
scripts/dump.json
scripts/dump.json.gz
scripts/node_modules
lookup/build
build
.gradle
*.mdb
**/node_modules/**
**/out
**/package-lock.json
lookup/data/db/*
pubmed-glutton/build/*
data/db/*
pubmed-glutton/*
indexing/*
49 changes: 0 additions & 49 deletions Dockerfile

This file was deleted.

638 changes: 19 additions & 619 deletions Readme.md

Large diffs are not rendered by default.

252 changes: 252 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
// Classpath of the build script itself: where Gradle plugins are looked up
// and which plugin artifacts are made available to this script.
buildscript {
    repositories {
        mavenLocal()
        mavenCentral()
        // Gradle plugin portal, addressed through its Maven-style endpoint.
        maven {
            url = 'https://plugins.gradle.org/m2/'
        }
    }

    dependencies {
        // NOTE(review): the coveralls plugin is placed on the classpath but is not
        // applied anywhere in the visible part of this script - confirm it is still needed.
        classpath('gradle.plugin.org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.12.0')
        // Provides the 'com.github.johnrengelman.shadow' plugin applied below.
        classpath('gradle.plugin.com.github.jengelman.gradle.plugins:shadow:7.0.0')
    }
}

// Plugins: 'application' contributes the 'run' task configured at the end of this
// section, and 'com.github.johnrengelman.shadow' contributes the shadowJar and
// shadow distribution tasks used further down.
apply plugin: 'application'
apply plugin: 'java'
apply plugin: 'java-library'
// The legacy 'maven' plugin is deliberately not applied; 'maven-publish' is used instead.
apply plugin: 'maven-publish'
apply plugin: 'com.github.johnrengelman.shadow'

group 'com.scienceminer.glutton'
version '0.3-SNAPSHOT'

// Compile for Java 11 (previously 1.8). The constant is used instead of the
// literal 1.11, which is not an actual Java version designation and was only
// understood as "11" through Gradle's legacy "1.x" version parsing.
sourceCompatibility = JavaVersion.VERSION_11
targetCompatibility = JavaVersion.VERSION_11

// The main class of the application
mainClassName = 'com.scienceminer.glutton.web.LookupServiceApplication'
// Run from the repository root so that relative paths given to the application
// are resolved against it.
tasks.run.workingDir = rootProject.rootDir

// Repositories used to resolve the project's dependencies, in lookup order.
repositories {
    mavenLocal()
    mavenCentral()
}

// Resources of the main source set are read from a single directory.
sourceSets {
    main {
        resources {
            srcDirs = ['src/main/resources']
        }
    }
}

configurations {
    // Remove log4j artifacts from every configuration that exists at this point;
    // presumably so that logback (declared in the dependencies) is the only
    // logging backend - confirm.
    all*.exclude(group: 'org.slf4j', module: 'slf4j-log4j12')
    all*.exclude(group: 'log4j', module: 'log4j')
    all*.exclude(group: 'log4j2', module: 'log4j2')
    // Allow the 'implementation' configuration to be resolved directly.
    implementation.canBeResolved = true
}

// While copying resources, rewrite config/glutton.yml line by line: every
// occurrence of the literal text 'project.version' is replaced by the value of
// the 'version_placeholder' project property.
// NOTE(review): 'version_placeholder' is not defined in the visible part of
// this script; presumably it comes from gradle.properties - confirm.
processResources {
    filesMatching('config/glutton.yml') {
        filter { String line ->
            line.replace('project.version', project.property('version_placeholder'))
        }
    }
}

// Alias task: running 'install' triggers the shadow plugin's installShadowDist.
task install(dependsOn: tasks.installShadowDist)

// Third-party libraries of the project. All coordinates use the single-string
// 'group:name:version' notation for consistency.
dependencies {
    // Unit testing
    testImplementation 'junit:junit:4.12'
    testImplementation 'org.hamcrest:hamcrest-all:1.3'
    testImplementation 'org.easymock:easymock:3.5'

    implementation 'com.google.code.gson:gson:2.8.1'

    // Dropwizard
    implementation 'ru.vyarus:dropwizard-guicey:7.0.0'

    // dropwizard-bom is a Maven BOM (it carries version constraints, no classes),
    // so it is imported as a platform rather than declared as a regular library.
    implementation platform('io.dropwizard:dropwizard-bom:4.0.0')
    implementation 'io.dropwizard:dropwizard-core:4.0.0'
    implementation 'io.dropwizard:dropwizard-assets:4.0.0'
    // NOTE(review): the two testing artifacts below sit on the 'implementation'
    // classpath and therefore end up in the shadow jar; if no main-source class
    // uses them they should probably be 'testImplementation' - confirm before moving.
    implementation 'io.dropwizard:dropwizard-testing:4.0.0'
    implementation 'io.dropwizard.modules:dropwizard-testing-junit4:4.0.0'
    implementation 'io.dropwizard:dropwizard-forms:4.0.0'
    implementation 'io.dropwizard:dropwizard-client:4.0.0'
    implementation 'io.dropwizard:dropwizard-auth:4.0.0'
    implementation 'io.dropwizard.metrics:metrics-core:4.2.22'
    implementation 'io.dropwizard.metrics:metrics-servlets:4.2.22'

    implementation 'jakarta.validation:jakarta.validation-api:2.0.2'

    // Logging: SLF4J API with logback as backend; log4j 2 API calls are routed to SLF4J.
    implementation 'org.slf4j:slf4j-api:1.7.30'
    implementation 'ch.qos.logback:logback-classic:1.2.3'
    implementation 'org.apache.logging.log4j:log4j-to-slf4j:2.8.2'

    implementation 'com.rockymadden.stringmetric:stringmetric-core_2.10:0.27.3'

    // Parsing XML/Json
    implementation 'org.codehaus.woodstox:stax2-api:4.0.0'
    implementation 'org.codehaus.woodstox:woodstox-core-asl:4.4.1'

    // Embedded storage (LMDB), serialization and compression
    implementation 'org.lmdbjava:lmdbjava:0.9.0'
    implementation 'de.ruedigermoeller:fst:2.56'
    implementation 'org.xerial.snappy:snappy-java:1.1.7.2'

    // Search engine clients
    implementation 'org.elasticsearch.client:elasticsearch-rest-high-level-client:7.17.2'
    implementation 'co.elastic.clients:elasticsearch-java:8.12.1'
    implementation('org.apache.solr:solr-solrj:9.5.0') {
        // Jetty HTTP/2 modules pulled in by SolrJ are excluded.
        exclude group: 'org.eclipse.jetty', module: 'jetty-http2'
        exclude group: 'org.eclipse.jetty.http2', module: 'http2-client'
    }

    // Apache Commons utilities
    implementation 'org.apache.commons:commons-collections4:4.1'
    implementation 'commons-beanutils:commons-beanutils:1.9.4'
    implementation 'commons-io:commons-io:2.6'
    implementation 'org.apache.commons:commons-compress:1.20'

    implementation 'org.tukaani:xz:1.8'

    // Jackson (JSON / YAML)
    implementation 'com.fasterxml.jackson.core:jackson-core:2.14.3'
    implementation 'com.fasterxml.jackson.core:jackson-databind:2.14.3'
    implementation 'com.fasterxml.jackson.module:jackson-module-afterburner:2.14.3'
    implementation 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.14.3'

    implementation 'com.opencsv:opencsv:5.0'
}

// The shadow distribution is declared without extra contents; the former
// bundling of data/config is kept below, disabled.
distributions {
    shadow {
        /*contents {
            from(new File(rootProject.rootDir, "data/config")) {
                into "data/config"
            }
        }*/
    }
}

// Single executable "fat" jar holding the application and its dependencies.
shadowJar {
    // 'archiveClassifier' replaces the deprecated 'classifier' property, which
    // no longer exists on archive tasks in Gradle 8.
    archiveClassifier = 'onejar'
    // Merge META-INF/services descriptors coming from different dependencies.
    mergeServiceFiles()
    // Allow the archive to go beyond the classic zip format limits.
    zip64 true
    manifest {
        attributes 'Main-Class': mainClassName
    }
    // Leave logback.xml out of the jar.
    exclude("logback.xml")
    // When the same path is contributed more than once, keep the first copy only.
    duplicatesStrategy = DuplicatesStrategy.EXCLUDE
}

// Only the zip flavour of the shadow distribution stays enabled.
distTar.enabled = false
distZip.enabled = false
shadowDistTar.enabled = false

artifacts {
    archives shadowJar
}

// arguments for exec tasks
// Returns the value of the project property `propName` (for instance supplied
// on the command line with -Pinput=...), or `defaultVal` when it is not set.
ext.getArg = { propName, defaultVal ->
    return project.hasProperty(propName) ? project.property(propName) : defaultVal
}

// Entry point shared by every command task below; the first program argument
// selects the command to execute.
def gluttonMainClass = 'com.scienceminer.glutton.web.LookupServiceApplication'

// java.base packages opened to unnamed modules on Java 9 and later.
// Presumably needed for the reflective access done by lmdbjava and FST - confirm.
def openedPackages = [
    'java.nio',
    'sun.nio.ch',
    'java.text',
    'java.net',
    'java.lang',
    'java.math',
    'java.util',
    'java.util.concurrent'
]

// JVM arguments shared by every command task: a 3 GB maximum heap, plus the
// --add-opens flags when the build runs on a JVM newer than Java 8 (the flag
// does not exist on Java 8).
def gluttonJvmArgs = ['-Xmx3072m']
if (JavaVersion.current().isJava9Compatible()) {
    openedPackages.each { pkg ->
        gluttonJvmArgs << '--add-opens' << ('java.base/' + pkg + '=ALL-UNNAMED')
    }
}

// Each task below runs the application with its own command arguments.
// 'mainClass' is used because JavaExec's 'main' property is deprecated.

// Starts the lookup web service with the default configuration file.
task server(dependsOn: 'classes', type: JavaExec, group: 'service') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'server', 'config/glutton.yml'
    jvmArgs gluttonJvmArgs
}

// Runs the 'crossref' command; -Pinput gives the input, -Pconfig the configuration file.
task crossref(dependsOn: 'classes', type: JavaExec, group: 'data') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'crossref', '--input', getArg('input', ''), getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}

// Runs the 'gap_crossref' command; -Pconfig gives the configuration file.
task gap_crossref(dependsOn: 'classes', type: JavaExec, group: 'data') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'gap_crossref', getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}

// Runs the 'hal' command; -Pconfig gives the configuration file.
task hal(dependsOn: 'classes', type: JavaExec, group: 'data') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'hal', getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}

// Runs the 'pmid' command; -Pconfig gives the configuration file.
// NOTE(review): this task is listed in the 'service' group while the other
// commands of the same shape use 'data' - confirm the grouping is intended.
task pmid(dependsOn: 'classes', type: JavaExec, group: 'service') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'pmid', getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}

// Runs the 'unpaywall' command; -Pinput gives the input, -Pconfig the configuration file.
task unpaywall(dependsOn: 'classes', type: JavaExec, group: 'data') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'unpaywall', '--input', getArg('input', ''), getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}

// Runs the 'istex' command; -Pinput gives the input, -Pconfig the configuration file.
task istex(dependsOn: 'classes', type: JavaExec, group: 'data') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'istex', '--input', getArg('input', ''), getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}

// only used it to re-create the search index from scratch, keeping the LMDB unchanged
task index(dependsOn: 'classes', type: JavaExec, group: 'search') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'index', getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}

// experimental
task hal_audit(dependsOn: 'classes', type: JavaExec, group: 'search') {
    mainClass = gluttonMainClass
    classpath = sourceSets.main.runtimeClasspath
    args 'hal_audit', getArg('config', 'config/glutton.yml')
    jvmArgs gluttonJvmArgs
}
Loading