Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

learn arxiv tex files #742

Merged
merged 25 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3563936
learn arxiv tex files
srdas Apr 23, 2024
f054bfe
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2024
67a680f
learn_arxiv
srdas Apr 23, 2024
b79661d
Merge branch 'learn_arxiv' of https://github.com/srdas/jupyter-ai int…
srdas Apr 23, 2024
2ee016e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2024
5bf86e8
learn arxiv tex files
srdas Apr 23, 2024
d3d0e8c
learn_arxiv
srdas Apr 23, 2024
dd95f3e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2024
480603c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2024
67b514e
Streamlined code for learning arxiv files
srdas Apr 26, 2024
f034e9a
update learn for arxiv
srdas Apr 26, 2024
64f3eab
Merge branch 'learn_arxiv' of https://github.com/srdas/jupyter-ai int…
srdas Apr 26, 2024
2df7d18
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2024
c8295c4
removed extra imports
srdas Apr 26, 2024
045cb02
remove extra imports
srdas Apr 26, 2024
f4713e1
Fix /learn in 2.14.0 (#747)
michaelchia Apr 26, 2024
4cebd6e
learn arxiv tex files
srdas Apr 23, 2024
7ca1061
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2024
d8cea38
update learn for arxiv files
srdas Apr 26, 2024
bdbb001
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2024
bfc42da
Improved code for arxiv files
srdas Apr 30, 2024
26576a8
Merge branch 'main' into learn_arxiv
srdas Apr 30, 2024
f4f09b9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 30, 2024
671ef03
Saves arxiv to root, better exception handling.
3coins May 3, 2024
37086b3
Added arxiv feature to docs.
3coins May 3, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/source/users/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,13 @@ use the `-a` or `--all-files` option.
/learn -a <directory>
```

### Learning arXiv files
The `/learn` command can also download and process papers from the [arXiv](https://arxiv.org/) repository. This feature requires the `arxiv` Python package; run `pip install arxiv` to install it.

```
/learn -r arxiv 2404.18558
```

### Additional chat commands

To clear the chat panel, use the `/clear` command. This does not reset the AI model; the model may still remember previous messages that you sent it, and it may use them to inform its responses.
Expand Down
29 changes: 28 additions & 1 deletion packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Any, Coroutine, List, Optional, Tuple

from dask.distributed import Client as DaskClient
from jupyter_ai.document_loaders.directory import get_embeddings, split
from jupyter_ai.document_loaders.directory import arxiv_to_text, get_embeddings, split
from jupyter_ai.document_loaders.splitter import ExtensionSplitter, NotebookSplitter
from jupyter_ai.models import (
DEFAULT_CHUNK_OVERLAP,
Expand Down Expand Up @@ -44,6 +44,9 @@ def __init__(self, *args, **kwargs):
self.parser.add_argument("-v", "--verbose", action="store_true")
self.parser.add_argument("-d", "--delete", action="store_true")
self.parser.add_argument("-l", "--list", action="store_true")
self.parser.add_argument(
"-r", "--remote", action="store", default=None, type=str
)
self.parser.add_argument(
"-c", "--chunk-size", action="store", default=DEFAULT_CHUNK_SIZE, type=int
)
Expand Down Expand Up @@ -110,6 +113,30 @@ async def process_message(self, message: HumanChatMessage):
self.reply(self._build_list_response())
return

if args.remote:
remote_type = args.remote.lower()
if remote_type == "arxiv":
try:
id = args.path[0]
args.path = [arxiv_to_text(id, self.root_dir)]
self.reply(
f"Learning arxiv file with id **{id}**, saved in **{args.path[0]}**.",
message,
)
except ModuleNotFoundError as e:
self.log.error(e)
self.reply(
"No `arxiv` package found. " "Install with `pip install arxiv`."
)
return
except Exception as e:
self.log.error(e)
self.reply(
"An error occurred while processing the arXiv file. "
f"Please verify that the arxiv id {id} is correct."
)
return

# Make sure the path exists.
if not len(args.path) == 1:
self.reply(f"{self.parser.format_usage()}", message)
Expand Down
48 changes: 48 additions & 0 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import hashlib
import itertools
import os
import tarfile
from datetime import datetime
from pathlib import Path
from typing import List

Expand All @@ -10,6 +12,51 @@
from langchain_community.document_loaders import PyPDFLoader


def arxiv_to_text(id: str, output_dir: str) -> str:
    """Downloads the source archive of a single arXiv paper and combines
    its TeX components into a single file.

    The archive is downloaded into a private temporary directory and the
    ``.tex`` members are streamed straight out of the tarball, so nothing is
    ever extracted into (or deleted from) the current working directory.

    Parameters
    ----------
    id : str
        id for the paper, numbers after "arXiv" in arXiv:xxxx.xxxxx

    output_dir : str
        directory to save the output file

    Returns
    -------
    output: str
        output path to the combined TeX file

    Raises
    ------
    ModuleNotFoundError
        if the optional ``arxiv`` package is not installed
    StopIteration
        if the search returns no result for ``id``
    """

    # Imported lazily: `arxiv` is an optional dependency, and `tempfile` is
    # only needed by this function.
    import tempfile

    import arxiv

    # Old-style ids (e.g. "hep-th/9901001") contain a slash, which must not
    # leak into the output file name as a path separator.
    safe_id = id.replace("/", "-")
    outfile = f"{safe_id}-{datetime.now():%Y-%m-%d-%H-%M}.tex"
    download_filename = "downloaded-paper.tar.gz"
    output_path = os.path.join(output_dir, outfile)

    paper = next(arxiv.Client().results(arxiv.Search(id_list=[id])))

    # The temporary directory is removed on exit, including on error, so the
    # downloaded archive can never be left behind or clash with another call.
    with tempfile.TemporaryDirectory() as tmp_dir:
        paper.download_source(dirpath=tmp_dir, filename=download_filename)
        archive_path = os.path.join(tmp_dir, download_filename)

        # Open the archive first so that an unreadable download fails before
        # the output file is created.
        with tarfile.open(archive_path) as tar, open(
            output_path, "w", encoding="utf-8"
        ) as w:
            for member in tar:
                if not (member.isfile() and member.name.lower().endswith(".tex")):
                    continue
                # Read the member in place instead of extracting it: member
                # names come from an untrusted archive and could otherwise
                # overwrite arbitrary files.
                tex = tar.extractfile(member)
                if tex is None:
                    continue
                with tex:
                    # TeX sources are not guaranteed to be UTF-8; replace
                    # undecodable bytes rather than abort the whole paper.
                    w.write(tex.read().decode("utf-8", errors="replace"))

    return output_path


# Uses pypdf which is used by PyPDFLoader from langchain
def pdf_to_text(path):
pages = PyPDFLoader(path)
Expand Down Expand Up @@ -50,6 +97,7 @@ def path_to_doc(path):
".txt",
".html",
".pdf",
".tex", # added for raw latex files from arxiv
}


Expand Down
2 changes: 1 addition & 1 deletion packages/jupyter-ai/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ test = [

dev = ["jupyter_ai_magics[dev]"]

all = ["jupyter_ai_magics[all]", "pypdf"]
all = ["jupyter_ai_magics[all]", "pypdf", "arxiv"]

[tool.hatch.version]
source = "nodejs"
Expand Down
Loading