Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] autotex driver using autotex for conversion #81

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions tex2pdf-service/bin/compile_submissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,14 @@ def get_outcome_meta_and_files_info(outcome_file: str) -> tuple[dict, list[str],
@click.option("--tex2pdf-timeout", default=100, help="timeout passed to tex2pdf")
@click.option("--post-timeout", default=600, help="timeout for the complete post")
@click.option("--threads", default=64, help="Number of threads requested for threadpool")
def compile(submissions: str, service: str, score: str, tex2pdf_timeout: int, post_timeout: int, threads: int) -> None:
@click.option("--auto-detect", default=True, help="Use preflight for ZZRM generation")
def compile(submissions: str, service: str, score: str, tex2pdf_timeout: int, post_timeout: int, threads: int, auto_detect: bool) -> None:
"""Compile submissions in a directory."""

def local_submit_tarball(tarball: str) -> None:
outcome_file = tarball_to_outcome_path(tarball)
try:
service_process_tarball(service, tarball, outcome_file, tex2pdf_timeout, post_timeout)
service_process_tarball(service, tarball, outcome_file, tex2pdf_timeout, post_timeout, auto_detect)
except FileExistsError:
logging.info(f"Not recreating already existing {outcome_file}.")
pass
Expand Down
2 changes: 1 addition & 1 deletion tex2pdf-service/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

104 changes: 104 additions & 0 deletions tex2pdf-service/tex2pdf/converter_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import time
import typing
from enum import Enum
from glob import glob

from tex2pdf_tools.preflight import PreflightStatusValues, generate_preflight_response
from tex2pdf_tools.tex_inspection import find_unused_toplevel_files, maybe_bbl
Expand Down Expand Up @@ -680,3 +681,106 @@ def generate_pdf(self) -> str | None:
logger.debug("Directory listing of %s is: %s", self.out_dir, os.listdir(self.out_dir))

return self.outcome.get("pdf_file")

class AutoTeXConverterDriver(ConverterDriver):
    """Converter driver that delegates the TeX-to-PDF conversion to ``autotex.pl``.

    The driver runs ``autotex.pl`` as a subprocess against the input directory,
    then moves the resulting PDF into the output directory and records the run
    (arguments, stdout/stderr, return code, log contents) in ``self.outcome``.
    """

    def __init__(self, work_dir: str, source: str, tag: str | None = None, max_time_budget: float | None = None):
        """Set up the driver without the add-on tree and with an empty ZeroZeroReadMe.

        Args:
            work_dir: Working directory handed to the base driver.
            source: Name of the submission source handed to the base driver.
            tag: Identifier of the submission; passed to ``autotex.pl`` as the paper id.
            max_time_budget: Timeout in seconds for the ``autotex.pl`` run.
        """
        # The remaining ConverterDriver defaults are already fine.
        super().__init__(work_dir, source, use_addon_tree=False, tag=tag, max_time_budget=max_time_budget)
        self.zzrm = ZeroZeroReadMe()

    def generate_pdf(self) -> str | None:
        """Run ``autotex.pl`` and collect its results into ``self.outcome``.

        Returns:
            Path of the PDF inside ``self.out_dir``, or ``None`` when no PDF was produced.

        Raises:
            RuntimeError: More than one PDF matching the submission id was found.
        """
        # Function-scope import: only ``glob`` itself is imported at module level.
        from glob import escape as glob_escape

        logger = get_logger()
        t0 = time.perf_counter()

        # run autotex.pl on the id
        search_path = "/usr/local/bin:/opt_arxiv/bin:/opt_arxiv/arxiv-perl/bin:/usr/sbin:/usr/bin:/bin:/sbin"
        # SECRETS or GOOGLE_APPLICATION_CREDENTIALS is not defined at all at this point but
        # be defensive and squish it anyway.
        cmdenv = {"SECRETS": "?", "GOOGLE_APPLICATION_CREDENTIALS": "?", "PATH": search_path}

        arxiv_id = self.tag
        # maybe it is already source
        worker_args = [
            "autotex.pl", "-f", "fInm", "-q",
            "-S", self.in_dir,   # here the original tarball has been dumped
            "-W", self.out_dir,  # work_dir/out where we expect files
            # TODO currently autotex.pl DOES NOT HONOR THIS!!!
            "-v", "-Z", "-p", arxiv_id,
        ]
        with subprocess.Popen(worker_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                              cwd="/autotex", encoding='iso-8859-1', env=cmdenv) as child:
            process_completion = False
            try:
                (out, err) = child.communicate(timeout=self.max_time_budget)
                process_completion = True
            except subprocess.TimeoutExpired:
                logger.warning("Process timeout %s", shlex.join(worker_args), extra=self.log_extra)
                child.kill()
                # Drain the pipes of the killed process so its output is still recorded.
                (out, err) = child.communicate()
            elapse_time = time.perf_counter() - t0
            t1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            logger.debug("Exec result: return code: %s", child.returncode, extra=self.log_extra)

        # files generated
        #   self.in_dir / tex_cache / arXivID.pdf (might have a version!)
        #   self.in_dir / tex_logs / autotex.log (same name, not good)
        # we need to move them to self.out_dir so that the follow-up packaging
        # into a tarfile works
        #
        # The directory and the id are escaped so that glob metacharacters in
        # them (both derive from the uploaded file name) are matched literally.
        in_dir_pattern = glob_escape(self.in_dir)
        pdf_files = glob(f"{in_dir_pattern}/tex_cache/{glob_escape(arxiv_id)}*.pdf")
        if not pdf_files:
            pdf_file = None
        elif len(pdf_files) > 1:
            raise RuntimeError(f"Multiple PDF files found: {pdf_files}")
        else:
            # move the file to self.out_dir
            pdf_file = os.path.join(self.out_dir, os.path.basename(pdf_files[0]))
            os.rename(pdf_files[0], pdf_file)

        # we use glob here, since we will need to rename the autotex.log created
        # by autotex.pl to arxivID.log *within* autotex.log
        log_files = glob(f"{in_dir_pattern}/tex_logs/autotex.log")
        if not log_files:
            logger.warning("No log files found for %s", arxiv_id)
            log = None
        else:
            # Decode like the subprocess output: iso-8859-1 maps every byte, so
            # a log containing non-UTF-8 bytes cannot raise UnicodeDecodeError.
            with open(log_files[0], encoding='iso-8859-1') as file:
                log = file.read()

        # Create an outcome structure
        # This is unfortunately not well documented and has severe duplication of entries
        # NOTE(review): "start_time" is a perf_counter() reading while "end_time" is a
        # formatted wall-clock string -- confirm what consumers of the outcome expect.
        self.outcome = {
            ID_TAG: self.tag,
            "converters": [{
                "pdf_file": pdf_file,
                "runs": [{
                    "args": worker_args,
                    "stdout": out,
                    "stderr": err,
                    "return_code": child.returncode,
                    "run_env": cmdenv,
                    "start_time": t0, "end_time": t1,
                    "elapse_time": elapse_time,
                    "process_completion": process_completion,
                    "PATH": search_path,
                    "arxiv_id": arxiv_id,
                    "log": log,
                }],
            }],
            "start_time": str(t0),
            "timeout": str(self.max_time_budget),
            "total_time": elapse_time,
            # An empty list (rather than [None]) when nothing was produced.
            "pdf_files": [pdf_file] if pdf_file else [],
            "pdf_file": pdf_file,
            "status": "success" if pdf_file else "fail",
        }

        # we need to get ZZRM
        if self.zzrm is None:
            logger.debug("no self.zzrm found, that should not happen")
            self.zzrm = ZeroZeroReadMe()
        else:
            logger.debug("self.zzrm = %s", self.zzrm)

        return self.outcome.get("pdf_file")

64 changes: 63 additions & 1 deletion tex2pdf-service/tex2pdf/tex2pdf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from starlette.responses import FileResponse, HTMLResponse

from . import MAX_APPENDING_FILES, MAX_TIME_BUDGET, MAX_TOPLEVEL_TEX_FILES, USE_ADDON_TREE
from .converter_driver import ConversionOutcomeMaker, ConverterDriver, PreflightVersion
from .converter_driver import ConversionOutcomeMaker, ConverterDriver, PreflightVersion, AutoTeXConverterDriver
from .fastapi_util import closer
from .pdf_watermark import Watermark
from .service_logger import get_logger
Expand Down Expand Up @@ -251,6 +251,68 @@ async def convert_pdf(
}
return GzipResponse(content, headers=headers, background=closer(content, filename, log_extra))

@app.post('/autotex/',
          responses={
              STATCODE.HTTP_200_OK: {"content": {"application/gzip": {}},
                                     "description": "Conversion result"},
              STATCODE.HTTP_400_BAD_REQUEST: {"model": Message},
              STATCODE.HTTP_422_UNPROCESSABLE_ENTITY: {"model": Message},
              STATCODE.HTTP_500_INTERNAL_SERVER_ERROR: {"model": Message}
          })
async def autotex_pdf(incoming: UploadFile,
                      timeout: typing.Annotated[int | None,
                                                Query(title="Time out", description="Time out in seconds.")] = None,
                      ) -> Response:
    """Get a tarball, and convert to PDF using autotex."""
    # Flow: save the upload into a temporary work dir, run AutoTeXConverterDriver,
    # package the outcome and stream it back as a gzip response. Failures are
    # answered with a JSON message (422 for removed submissions, 500 otherwise).
    filename = incoming.filename if incoming.filename else tempfile.mktemp(prefix="download")
    log_extra = {"source_filename": filename}
    logger = get_logger()
    logger.info("%s", incoming.filename, extra=log_extra)

    # Derive the tag by peeling archive/compression suffixes off the base name,
    # e.g. "x.tar.gz" -> "x".
    tag = os.path.basename(filename)
    while True:
        stem, ext = os.path.splitext(tag)
        if ext not in (".gz", ".zip", ".tar"):
            break
        tag = stem

    with tempfile.TemporaryDirectory(prefix=tag) as tempdir:
        in_dir, out_dir = prep_tempdir(tempdir)
        await save_stream(in_dir, incoming, filename, log_extra)

        # The query parameter is declared as an int, so the conversion to float
        # cannot raise ValueError; no handler is needed.
        timeout_secs = float(MAX_TIME_BUDGET) if timeout is None else float(timeout)

        driver = AutoTeXConverterDriver(tempdir, filename, tag=tag, max_time_budget=timeout_secs)
        try:
            _pdf_file = driver.generate_pdf()
        except RemovedSubmission:
            # TODO how can we detect this???
            logger.info("Archive is marked deleted.", extra=log_extra)
            return JSONResponse(status_code=STATCODE.HTTP_422_UNPROCESSABLE_ENTITY,
                                content={"message": "The source is marked deleted."})
        except Exception as exc:
            logger.error("Exception %s", str(exc), exc_info=True, extra=log_extra)
            return JSONResponse(status_code=STATCODE.HTTP_500_INTERNAL_SERVER_ERROR,
                                content={"message": traceback.format_exc()})

        out_dir_files = os.listdir(out_dir)
        outcome_maker = ConversionOutcomeMaker(tempdir, tag)
        outcome_maker.create_outcome(driver, driver.outcome, outcome_files=out_dir_files)

        # The file is deliberately left open here: the handle is handed to the
        # response and to closer(), which runs as the response's background task.
        content = open(os.path.join(tempdir, outcome_maker.outcome_file), "rb")
        filename = os.path.basename(outcome_maker.outcome_file)
        headers = {
            "Content-Type": "application/gzip",
            "Content-Disposition": f"attachment; filename={filename}",
        }
        return GzipResponse(content, headers=headers,
                            background=closer(content, filename, log_extra))



@app.get("/texlive/info")
async def texlive_info() -> FileResponse:
Expand Down