Skip to content

Commit

Permalink
Update Dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Dec 28, 2024
1 parent fa99823 commit 10ef561
Showing 1 changed file with 15 additions and 8 deletions.
23 changes: 15 additions & 8 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,24 +1,31 @@
ARG BASE_IMAGE=ubuntu:18.04
ARG BASE_IMAGE=python:3.9-slim
FROM $BASE_IMAGE
LABEL maintainer="NeuML"
LABEL repository="paperetl"

# Set Python version (e.g. 3, 3.9)
ARG PYTHON_VERSION=3

# Locale environment variables
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

# Install required packages
RUN apt-get update && \
apt-get -y --no-install-recommends install libxml2 openjdk-8-jdk-headless openjdk-8-jre-headless python3.7 python3-pip unzip wget && \
apt-get -y --no-install-recommends install libxml2 default-jdk-headless default-jre-headless python${PYTHON_VERSION} python3-pip unzip wget && \
rm -rf /var/lib/apt/lists

# Install paperetl project and dependencies
RUN ln -sf /usr/bin/python3.7 /usr/bin/python && \
RUN ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \
python -m pip install --no-cache-dir -U pip wheel setuptools && \
python -m pip install --no-cache-dir paperetl && \
python -c "import nltk; nltk.download('punkt')"
python -c "import nltk; nltk.download(['punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng'])"

# Install GROBID
# This method builds a trimmed-down standalone service. A much simpler alternative is to unzip the archive and then run:
# ./gradlew install && ./gradlew run
RUN wget https://github.com/kermitt2/grobid/archive/0.7.2.zip && \
unzip 0.7.2.zip && rm 0.7.2.zip && mv grobid-0.7.2 grobid-install && \
RUN wget https://github.com/kermitt2/grobid/archive/0.8.1.zip && \
unzip 0.8.1.zip && rm 0.8.1.zip && mv grobid-0.8.1 grobid-install && \
cd grobid-install && ./gradlew clean assemble && \
mkdir -p ../grobid && cd ../grobid && \
unzip ../grobid-install/grobid-home/build/distributions/grobid-home*.zip && \
Expand All @@ -27,7 +34,7 @@ RUN wget https://github.com/kermitt2/grobid/archive/0.7.2.zip && \
rm -rf ~/.gradle ../grobid-install grobid-home/pdf2xml/mac-64/ grobid-home/pdf2xml/win-32/ grobid-home/pdf2xml/win-64

# Clean up build packages
RUN apt-get -y purge openjdk-8-jdk-headless && apt-get -y autoremove
RUN apt-get -y purge default-jdk-headless && apt-get -y autoremove

# Create scripts directory for start.sh
RUN mkdir -p scripts
Expand All @@ -42,4 +49,4 @@ RUN echo "#!/bin/bash" > scripts/start.sh && \
RUN mkdir -p paperetl/data

# Start script
ENTRYPOINT scripts/start.sh
ENTRYPOINT ["scripts/start.sh"]

0 comments on commit 10ef561

Please sign in to comment.