diff --git a/.github/README.md b/.github/README.md index 33439350..a607bb9c 100644 --- a/.github/README.md +++ b/.github/README.md @@ -97,7 +97,7 @@ To stop playing press Ctrl+C in either the terminal or mpv
List all subcommands $ library - library (v2.8.067; 79 subcommands) + library (v2.8.068; 80 subcommands) Create database subcommands: ╭───────────────┬──────────────────────────────────────────╮ @@ -177,6 +177,8 @@ To stop playing press Ctrl+C in either the terminal or mpv │ sample-compare │ Compare files using sample-hash and other shortcuts │ ├────────────────┼─────────────────────────────────────────────────────┤ │ similar-files │ Find similar files based on filename and size │ + ├────────────────┼─────────────────────────────────────────────────────┤ + │ llm-map │ Run LLMs across multiple files │ ╰────────────────┴─────────────────────────────────────────────────────╯ Tabular data subcommands: @@ -770,9 +772,9 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th - Set `--fixed-pages` to _always_ fetch the desired number of pages If the website is supported by --auto-pager data is fetched twice when using page iteration. - As such, page iteration (--max-pages, --fixed-pages, etc) is disabled when using `--auto-pager`. + As such, manual page iteration (--max-pages, --fixed-pages, etc) is disabled when using `--auto-pager`. - You can set unset --fixed-pages for all the playlists in your database by running this command: + You can unset --fixed-pages for all the playlists in your database by running this command: sqlite your.db "UPDATE playlists SET extractor_config = json_replace(extractor_config, '$.fixed_pages', null)" To use "&p=1" instead of "&page=1" @@ -1553,6 +1555,34 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th +
+ +###### llm-map + +
Run LLMs across multiple files + + $ library llm-map -h + usage: library llm-map LLAMA_FILE [paths ...] [--llama-args LLAMA_ARGS] [--prompt STR] [--text [INT]] [--rename] + + Run a llamafile with a prompt including path names and file contents + + Rename files based on file contents + + library llm-map ./gemma2.llamafile ~/Downloads/booka.pdf --rename --text + + cat llm_map_renames.csv + Path,Output + /home/xk/Downloads/booka.pdf,/home/xk/Downloads/Mining_Massive_Datasets.pdf + + You can run a GGUF file with this: + + wget https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.9/llamafile-0.8.9 + chmod +x ~/Downloads/llamafile-0.8.9 + mv ~/Downloads/llamafile-0.8.9 ~/.local/bin/llamafile # move it somewhere in your $PATH + + library llm-map --model ~/Downloads/llava-v1.5-7b-Q4_K.gguf --image-model ~/Downloads/llava-v1.5-7b-mmproj-Q4_0.gguf --prompt 'what do you see?' ~/Downloads/comp_*.jpg + +
### Tabular data subcommands diff --git a/xklb/__init__.py b/xklb/__init__.py index 0ead5b34..cea18e5f 100644 --- a/xklb/__init__.py +++ b/xklb/__init__.py @@ -1 +1 @@ -__version__ = "2.8.067" +__version__ = "2.8.068" diff --git a/xklb/files/llm_map.py b/xklb/files/llm_map.py index 7a0de85e..637c1cf7 100644 --- a/xklb/files/llm_map.py +++ b/xklb/files/llm_map.py @@ -10,22 +10,25 @@ def parse_args(): parser = argparse_utils.ArgumentParser(usage=usage.llm_map) - parser.add_argument("--prompt", '-q', "--custom-prompt", help="Use a custom prompt") + parser.add_argument("--prompt", "-q", "--custom-prompt", help="Use a custom prompt") parser.add_argument( "--text", type=int, nargs="?", const=1500, help="Pass text file contents of each file to the LLM" ) - parser.add_argument("--images", action="store_true", help="Treat paths as image files") parser.add_argument("--rename", action="store_true", help="Use rename prompt") parser.add_argument("--output", help="The output CSV file to save the results.") arggroups.debug(parser) parser.add_argument( - '--model', - '-m', + "--model", + "-m", "--llamafile", help="The path to the llamafile to run. If llamafile is in your PATH then you can also specify a GGUF file.", ) - parser.add_argument('--image-model', '--mmproj', help="The path to the LLaVA vision GGUF model.") + parser.add_argument( + "--image-model", + "--mmproj", + help="The path to the LLaVA vision GGUF model. When specified, paths will be treated as image files", + ) parser.add_argument( "--llama-args", "--custom-args", type=shlex.split, default=[], help="Use custom llamafile arguments" ) @@ -43,14 +46,14 @@ def parse_args(): else: raise NotImplementedError - args.exe = which('llamafile') + args.exe = which("llamafile") if args.exe: - args.llama_args += ['-m', args.model] + args.llama_args += ["-m", args.model] else: args.exe = args.model if args.image_model: - args.llama_args += ['--mmproj', args.image_model] + args.llama_args += ["--mmproj", args.image_model] if args.output is None: args.output = f"llm_map_{args.prompt}.csv" @@ -78,15 +81,15 @@ def llm_map(): prompt = args.prompt replacements = { - '{path}': "Existing path: " + path, - '{abspath}': "Existing path: " + str(Path(path).absolute()), - '{name}': "Existing filename: " + Path(path).name, - '{stem}': "Existing filename: " + Path(path).stem, + "{path}": "Existing path: " + path, + "{abspath}": "Existing path: " + str(Path(path).absolute()), + "{name}": "Existing filename: " + Path(path).name, + "{stem}": "Existing filename: " + Path(path).stem, } for k, v in replacements.items(): - prompt.replace(k, '\n' + v + '\n') + prompt.replace(k, "\n" + v + "\n") - if args.images: + if args.image_model: args.llama_args += ["--image", str(Path(path).absolute())] elif args.text: file_contents = fs_add.munge_book_tags_fast(path) diff --git a/xklb/text/extract_text.py b/xklb/text/extract_text.py index 1f37943b..f4252dba 100644 --- a/xklb/text/extract_text.py +++ b/xklb/text/extract_text.py @@ -59,7 +59,7 @@ def get_text(args, url): if not url.startswith("http") and Path(url).is_file(): text = fs_add.munge_book_tags_fast(url) if text: - yield text.get("tags").replace(';', '\n') + yield text.get("tags").replace(";", "\n") yield None if args.selenium: diff --git a/xklb/usage.py b/xklb/usage.py index 0bff63ef..e0b38b23 100644 --- a/xklb/usage.py +++ b/xklb/usage.py @@ -1646,4 +1646,12 @@ def play(action) -> str: cat llm_map_renames.csv Path,Output /home/xk/Downloads/booka.pdf,/home/xk/Downloads/Mining_Massive_Datasets.pdf + + You can run a GGUF file with this: + + wget https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.9/llamafile-0.8.9 + chmod +x ~/Downloads/llamafile-0.8.9 + mv ~/Downloads/llamafile-0.8.9 ~/.local/bin/llamafile # move it somewhere in your $PATH + + library llm-map --model ~/Downloads/llava-v1.5-7b-Q4_K.gguf --image-model ~/Downloads/llava-v1.5-7b-mmproj-Q4_0.gguf --prompt 'what do you see?' ~/Downloads/comp_*.jpg """