2.8.068

chapmanjacobd · Jul 12, 2024 · 88da928 · 88da928
1 parent c879edc
commit 88da928
Show file tree

Hide file tree

Showing 5 changed files with 60 additions and 19 deletions.
diff --git a/.github/README.md b/.github/README.md
@@ -97,7 +97,7 @@ To stop playing press Ctrl+C in either the terminal or mpv
 <details><summary>List all subcommands</summary>
 
     $ library
-    library (v2.8.067; 79 subcommands)
+    library (v2.8.068; 80 subcommands)
 
     Create database subcommands:
     ╭───────────────┬──────────────────────────────────────────╮
@@ -177,6 +177,8 @@ To stop playing press Ctrl+C in either the terminal or mpv
     │ sample-compare │ Compare files using sample-hash and other shortcuts │
     ├────────────────┼─────────────────────────────────────────────────────┤
     │ similar-files  │ Find similar files based on filename and size       │
+    ├────────────────┼─────────────────────────────────────────────────────┤
+    │ llm-map        │ Run LLMs across multiple files                      │
     ╰────────────────┴─────────────────────────────────────────────────────╯
 
     Tabular data subcommands:
@@ -770,9 +772,9 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
         - Set `--fixed-pages` to _always_ fetch the desired number of pages
 
         If the website is supported by --auto-pager, data is fetched twice when using page iteration.
-        As such, page iteration (--max-pages, --fixed-pages, etc) is disabled when using `--auto-pager`.
+        As such, manual page iteration (--max-pages, --fixed-pages, etc) is disabled when using `--auto-pager`.
 
-        You can set unset --fixed-pages for all the playlists in your database by running this command:
+        You can unset --fixed-pages for all the playlists in your database by running this command:
         sqlite your.db "UPDATE playlists SET extractor_config = json_replace(extractor_config, '$.fixed_pages', null)"
 
     To use "&p=1" instead of "&page=1"
@@ -1553,6 +1555,34 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
 
 
 
+</details>
+
+###### llm-map
+
+<details><summary>Run LLMs across multiple files</summary>
+
+    $ library llm-map -h
+    usage: library llm-map LLAMA_FILE [paths ...] [--llama-args LLAMA_ARGS] [--prompt STR] [--text [INT]] [--rename]
+
+    Run a llamafile with a prompt including path names and file contents
+
+    Rename files based on file contents
+
+        library llm-map ./gemma2.llamafile ~/Downloads/booka.pdf --rename --text
+
+        cat llm_map_renames.csv
+        Path,Output
+        /home/xk/Downloads/booka.pdf,/home/xk/Downloads/Mining_Massive_Datasets.pdf
+
+    You can run a GGUF file with this:
+
+        wget -P ~/Downloads https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.9/llamafile-0.8.9
+        chmod +x ~/Downloads/llamafile-0.8.9
+        mv ~/Downloads/llamafile-0.8.9 ~/.local/bin/llamafile  # move it somewhere in your $PATH
+
+        library llm-map --model ~/Downloads/llava-v1.5-7b-Q4_K.gguf --image-model ~/Downloads/llava-v1.5-7b-mmproj-Q4_0.gguf --prompt 'what do you see?' ~/Downloads/comp_*.jpg
+
+
 </details>
 
 ### Tabular data subcommands

diff --git a/xklb/__init__.py b/xklb/__init__.py
@@ -1 +1 @@
-__version__ = "2.8.067"
+__version__ = "2.8.068"
diff --git a/xklb/files/llm_map.py b/xklb/files/llm_map.py
@@ -10,22 +10,25 @@
 
 def parse_args():
     parser = argparse_utils.ArgumentParser(usage=usage.llm_map)
-    parser.add_argument("--prompt", '-q', "--custom-prompt", help="Use a custom prompt")
+    parser.add_argument("--prompt", "-q", "--custom-prompt", help="Use a custom prompt")
     parser.add_argument(
         "--text", type=int, nargs="?", const=1500, help="Pass text file contents of each file to the LLM"
     )
-    parser.add_argument("--images", action="store_true", help="Treat paths as image files")
     parser.add_argument("--rename", action="store_true", help="Use rename prompt")
     parser.add_argument("--output", help="The output CSV file to save the results.")
     arggroups.debug(parser)
 
     parser.add_argument(
-        '--model',
-        '-m',
+        "--model",
+        "-m",
         "--llamafile",
         help="The path to the llamafile to run. If llamafile is in your PATH then you can also specify a GGUF file.",
     )
-    parser.add_argument('--image-model', '--mmproj', help="The path to the LLaVA vision GGUF model.")
+    parser.add_argument(
+        "--image-model",
+        "--mmproj",
+        help="The path to the LLaVA vision GGUF model. When specified, paths will be treated as image files",
+    )
     parser.add_argument(
         "--llama-args", "--custom-args", type=shlex.split, default=[], help="Use custom llamafile arguments"
     )
@@ -43,14 +46,14 @@ def parse_args():
         else:
             raise NotImplementedError
 
-    args.exe = which('llamafile')
+    args.exe = which("llamafile")
     if args.exe:
-        args.llama_args += ['-m', args.model]
+        args.llama_args += ["-m", args.model]
     else:
         args.exe = args.model
 
     if args.image_model:
-        args.llama_args += ['--mmproj', args.image_model]
+        args.llama_args += ["--mmproj", args.image_model]
 
     if args.output is None:
         args.output = f"llm_map_{args.prompt}.csv"
@@ -78,15 +81,15 @@ def llm_map():
         prompt = args.prompt
 
         replacements = {
-            '{path}': "Existing path: " + path,
-            '{abspath}': "Existing path: " + str(Path(path).absolute()),
-            '{name}': "Existing filename: " + Path(path).name,
-            '{stem}': "Existing filename: " + Path(path).stem,
+            "{path}": "Existing path: " + path,
+            "{abspath}": "Existing path: " + str(Path(path).absolute()),
+            "{name}": "Existing filename: " + Path(path).name,
+            "{stem}": "Existing filename: " + Path(path).stem,
         }
         for k, v in replacements.items():
-            prompt.replace(k, '\n' + v + '\n')
+            prompt.replace(k, "\n" + v + "\n")
 
-        if args.images:
+        if args.image_model:
             args.llama_args += ["--image", str(Path(path).absolute())]
         elif args.text:
             file_contents = fs_add.munge_book_tags_fast(path)

diff --git a/xklb/text/extract_text.py b/xklb/text/extract_text.py
@@ -59,7 +59,7 @@ def get_text(args, url):
     if not url.startswith("http") and Path(url).is_file():
         text = fs_add.munge_book_tags_fast(url)
         if text:
-            yield text.get("tags").replace(';', '\n')
+            yield text.get("tags").replace(";", "\n")
         yield None
 
     if args.selenium:

diff --git a/xklb/usage.py b/xklb/usage.py
@@ -1646,4 +1646,12 @@ def play(action) -> str:
         cat llm_map_renames.csv
         Path,Output
         /home/xk/Downloads/booka.pdf,/home/xk/Downloads/Mining_Massive_Datasets.pdf
+
+    You can run a GGUF file with this:
+
+        wget -P ~/Downloads https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.9/llamafile-0.8.9
+        chmod +x ~/Downloads/llamafile-0.8.9
+        mv ~/Downloads/llamafile-0.8.9 ~/.local/bin/llamafile  # move it somewhere in your $PATH
+
+        library llm-map --model ~/Downloads/llava-v1.5-7b-Q4_K.gguf --image-model ~/Downloads/llava-v1.5-7b-mmproj-Q4_0.gguf --prompt 'what do you see?' ~/Downloads/comp_*.jpg
 """