From 57a036b8073151927eb912232a306f42d0318598 Mon Sep 17 00:00:00 2001 From: Peter <74869040+pszemraj@users.noreply.github.com> Date: Sat, 14 Jan 2023 21:24:07 +0100 Subject: [PATCH] CLI + Packaging (#6) - integrate `pyscaffold` structure as backbone for project - enable `lm-api` as CLI command to use from anywhere - default to `openai` and `text-davinci-003` provider - clean up docs Signed-off-by: peter szemraj Signed-off-by: Peter <74869040+pszemraj@users.noreply.github.com> --- .coveragerc | 28 ++ .gitignore | 60 +++- .readthedocs.yml | 23 ++ AUTHORS.md | 4 + CHANGELOG.md | 31 ++ CONTRIBUTING.md | 336 ++++++++++++++++++ README.md | 75 +++- .../api_queries_test_queries_2022-11-29_00.md | 22 ++ pyproject.toml | 9 + requirements.txt | 4 - setup.cfg | 129 +++++++ setup.py | 21 ++ src/lm_api/__init__.py | 18 + query_api.py => src/lm_api/query_api.py | 156 ++++---- .../lm_api/test_goose_api.py | 0 utils.py => src/lm_api/utils.py | 0 tests/conftest.py | 10 + tests/test_skeleton.py | 25 ++ tox.ini | 91 +++++ 19 files changed, 967 insertions(+), 75 deletions(-) create mode 100644 .coveragerc create mode 100644 .readthedocs.yml create mode 100644 AUTHORS.md create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTING.md create mode 100644 data/lm-api-output/api_queries_test_queries_2022-11-29_00.md create mode 100644 pyproject.toml delete mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 src/lm_api/__init__.py rename query_api.py => src/lm_api/query_api.py (55%) rename test_goose_api.py => src/lm_api/test_goose_api.py (100%) rename utils.py => src/lm_api/utils.py (100%) create mode 100644 tests/conftest.py create mode 100644 tests/test_skeleton.py create mode 100644 tox.ini diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..a581717 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,28 @@ +# .coveragerc to control coverage.py +[run] +branch = True +source = lm_api +# omit = bad_file.py + +[paths] 
+source = + src/ + */site-packages/ + +[report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: diff --git a/.gitignore b/.gitignore index 15094ae..50cd519 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,62 @@ # python sys files *.pyc *.pyo -__pycache__/ \ No newline at end of file +__pycache__/ + +# END CUSTOM IGNORE + +# Temporary and binary files +*~ +*.py[cod] +*.so +*.cfg +!.isort.cfg +!setup.cfg +*.orig +*.log + +*.pot +__pycache__/* +.cache/* +.*.swp +*/.ipynb_checkpoints/* +.DS_Store + +# Project files +.ropeproject +.project +.pydevproject +.settings +.idea +.vscode +tags + +# Package files +*.egg +*.eggs/ +.installed.cfg +*.egg-info + +# Unittest and coverage +htmlcov/* +.coverage +.coverage.* +.tox +junit*.xml +coverage.xml +.pytest_cache/ + +# Build and docs folder/files +build/* +dist/* +sdist/* +docs/api/* +docs/_rst/* +docs/_build/* +cover/* +MANIFEST + +# Per-project virtualenvs +.venv*/ +.conda*/ +.python-version diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..21b0814 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,23 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Build documentation with MkDocs +#mkdocs: +# configuration: mkdocs.yml + +# Optionally build your docs in additional formats such as PDF +formats: + - pdf + +python: + version: 3.8 + install: + - requirements: docs/requirements.txt + - {path: ., method: pip} diff --git a/AUTHORS.md 
b/AUTHORS.md new file mode 100644 index 0000000..e800eef --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,4 @@ +# Contributors + +* peter szemraj | [github](https://github.com/pszemraj), [profile page](peterszemraj.ch) +* daniel paleka | [github](https://github.com/dpaleka) diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0abb06a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,31 @@ +### Changelog + +All notable changes to this project will be documented in this file. Dates are displayed in UTC. + +Generated by [`auto-changelog`](https://github.com/CookPete/auto-changelog). + +#### [v0.0.3](https://github.com/pszemraj/lm-api/compare/v0.0.2...v0.0.3) + +> 25 August 2022 + +- Bug fixes on output_dir and model_id for openai [`#3`](https://github.com/pszemraj/lm-api/pull/3) +- Fix choosing an openai model; didn't change anything for Goose, probably still needs fixes [`d5bb05c`](https://github.com/pszemraj/lm-api/commit/d5bb05cf2aedd495afa63b599f8500e8d5734106) +- Fix making output_dir if it doesn't exist [`51836e0`](https://github.com/pszemraj/lm-api/commit/51836e0e0c54c6dfdffb415fffab11ccc1d6a9f2) + +#### [v0.0.2](https://github.com/pszemraj/lm-api/compare/v0.0.1...v0.0.2) + +> 15 August 2022 + +- Support txt [`#2`](https://github.com/pszemraj/lm-api/pull/2) +- improve output formatting and writing [`3071611`](https://github.com/pszemraj/lm-api/commit/3071611c677e57690b2d01c758f6e2490e998076) +- :zap: support text files [`625d2a1`](https://github.com/pszemraj/lm-api/commit/625d2a165720f1974d2bfe18449722a934e6e855) +- :zap: print output loc [`ad99e71`](https://github.com/pszemraj/lm-api/commit/ad99e7160e7f45a5c27b522e965595860f45517e) + +#### v0.0.1 + +> 16 June 2022 + +- Create LICENSE [`#1`](https://github.com/pszemraj/lm-api/pull/1) +- :tada: [`88c509e`](https://github.com/pszemraj/lm-api/commit/88c509e010cfa1462a38ccde6503f4a9d4b9e092) +- 
remove old output files [`cc75095`](https://github.com/pszemraj/lm-api/commit/cc750954c5569fa45e697d10ef31af8b1cbba521) +- consolidate logs [`01f91ff`](https://github.com/pszemraj/lm-api/commit/01f91ffb5fec09261b5b52a77f0e15c261675f06) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..a630bb1 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,336 @@ + +# Contributing + +Welcome to `lm-api` contributor's guide. + +This document focuses on getting any potential contributor familiarized with +the development processes, but [other kinds of contributions] are also appreciated. + +If you are new to using [git] or have never collaborated in a project previously, +please have a look at [contribution-guide.org]. Other resources are also +listed in the excellent [guide created by FreeCodeCamp] [^contrib1]. + +Please notice, all users and contributors are expected to be **open, +considerate, reasonable, and respectful**. When in doubt, +[Python Software Foundation's Code of Conduct] is a good reference in terms of +behavior guidelines. + +## Issue Reports + +If you experience bugs or general issues with `lm-api`, please have a look +on the [issue tracker]. +If you don't see anything useful there, please feel free to fire an issue report. + +:::{tip} +Please don't forget to include the closed issues in your search. +Sometimes a solution was already reported, and the problem is considered +**solved**. +::: + +New issue reports should include information about your programming environment +(e.g., operating system, Python version) and steps to reproduce the problem. +Please try also to simplify the reproduction steps to a very minimal example +that still illustrates the problem you are facing. By removing other factors, +you help us to identify the root cause of the issue. 
+
+## Documentation Improvements
+
+You can help improve `lm-api` docs by making them more readable and coherent, or
+by adding missing information and correcting mistakes.
+
+`lm-api` documentation uses [Sphinx] as its main documentation compiler.
+This means that the docs are kept in the same repository as the project code, and
+that any documentation update is done in the same way as a code contribution.
+
+```{todo} Don't forget to mention which markup language you are using.
+
+   e.g., [reStructuredText] or [CommonMark] with [MyST] extensions.
+```
+
+```{todo} If your project is hosted on GitHub, you can also mention the following tip:
+
+   :::{tip}
+   Please notice that the [GitHub web interface] provides a quick way of
+   proposing changes in `lm-api`'s files. While this mechanism can
+   be tricky for normal code contributions, it works perfectly fine for
+   contributing to the docs, and can be quite handy.
+
+   If you are interested in trying this method out, please navigate to
+   the `docs` folder in the source [repository], find which file you
+   would like to propose changes and click on the little pencil icon at the
+   top, to open [GitHub's code editor]. Once you finish editing the file,
+   please write a message in the form at the bottom of the page describing
+   which changes you have made and what are the motivations behind them and
+   submit your proposal.
+   :::
+```
+
+When working on documentation changes in your local machine, you can
+compile them using [tox]:
+
+```
+tox -e docs
+```
+
+and use Python's built-in web server for a preview in your web browser
+(`http://localhost:8000`):
+
+```
+python3 -m http.server --directory 'docs/_build/html'
+```
+
+## Code Contributions
+
+The architecture currently is quite basic. The primary file is `src/lm_api/query_api.py`, with utils in `src/lm_api/utils.py`. The `src/lm_api/__init__.py` file is used to import the main functions into the `lm_api` namespace. It can also be used as a CLI, with the `lm-api` command.
+
+Before submitting a PR please format your code with [black](https://github.com/psf/black) and [isort](https://github.com/PyCQA/isort).
+
+```bash
+black src/lm_api
+isort src/lm_api
+```
+
+### Submit an issue
+
+Before you work on any non-trivial code contribution it's best to first create
+a report in the [issue tracker] to start a discussion on the subject.
+This often provides additional considerations and avoids unnecessary work.
+
+### Create an environment
+
+Before you start coding, we recommend creating an isolated [virtual environment]
+to avoid any problems with your installed Python packages.
+This can easily be done via either [virtualenv]:
+
+```
+virtualenv <PATH TO VENV>
+source <PATH TO VENV>/bin/activate
+```
+
+or [Miniconda]:
+
+```
+conda create -n lm-api python=3 six virtualenv pytest pytest-cov
+conda activate lm-api
+```
+
+### Clone the repository
+
+1. Create a user account on GitHub if you do not already have one.
+
+2. Fork the project [repository]: click on the *Fork* button near the top of the
+   page. This creates a copy of the code under your account on GitHub.
+
+3. Clone this copy to your local disk:
+
+   ```
+   git clone git@github.com:YourLogin/lm-api.git
+   cd lm-api
+   ```
+
+4. You should run:
+
+   ```
+   pip install -U pip setuptools -e .
+   ```
+
+   to be able to import the package under development in the Python REPL.
+
+   ```{todo} if you are not using pre-commit, please remove the following item:
+   ```
+
+5. Install [pre-commit]:
+
+   ```
+   pip install pre-commit
+   pre-commit install
+   ```
+
+   `lm-api` comes with a lot of hooks configured to automatically help the
+   developer to check the code being written.
+
+### Implement your changes
+
+1. Create a branch to hold your changes:
+
+   ```
+   git checkout -b my-feature
+   ```
+
+   and start making changes. Never work on the main branch!
+
+2. Start your work on this branch. Don't forget to add [docstrings] to new
+   functions, modules and classes, especially if they are part of public APIs.
+
+3.
Add yourself to the list of contributors in `AUTHORS.rst`. + +4. When you’re done editing, do: + + ``` + git add + git commit + ``` + + to record your changes in [git]. + + ```{todo} if you are not using pre-commit, please remove the following item: + ``` + + Please make sure to see the validation messages from [pre-commit] and fix + any eventual issues. + This should automatically use [flake8]/[black] to check/fix the code style + in a way that is compatible with the project. + + :::{important} + Don't forget to add unit tests and documentation in case your + contribution adds an additional feature and is not just a bugfix. + + Moreover, writing a [descriptive commit message] is highly recommended. + In case of doubt, you can check the commit history with: + + ``` + git log --graph --decorate --pretty=oneline --abbrev-commit --all + ``` + + to look for recurring communication patterns. + ::: + +5. Please check that your changes don't break any unit tests with: + + ``` + tox + ``` + + (after having installed [tox] with `pip install tox` or `pipx`). + + You can also use [tox] to run several other pre-configured tasks in the + repository. Try `tox -av` to see a list of the available checks. + +### Submit your contribution + +1. If everything works fine, push your local branch to the remote server with: + + ``` + git push -u origin my-feature + ``` + +2. Go to the web page of your fork and click "Create pull request" + to send your changes for review. + + ```{todo} if you are using GitHub, you can uncomment the following paragraph + + Find more detailed information in [creating a PR]. You might also want to open + the PR as a draft first and mark it as ready for review after the feedbacks + from the continuous integration (CI) system or any required fixes. + + ``` + +### Troubleshooting + +The following tips can be used when facing problems to build or test the +package: + +1. Make sure to fetch all the tags from the upstream [repository]. 
+ The command `git describe --abbrev=0 --tags` should return the version you + are expecting. If you are trying to run CI scripts in a fork repository, + make sure to push all the tags. + You can also try to remove all the egg files or the complete egg folder, i.e., + `.eggs`, as well as the `*.egg-info` folders in the `src` folder or + potentially in the root of your project. + +2. Sometimes [tox] misses out when new dependencies are added, especially to + `setup.cfg` and `docs/requirements.txt`. If you find any problems with + missing dependencies when running a command with [tox], try to recreate the + `tox` environment using the `-r` flag. For example, instead of: + + ``` + tox -e docs + ``` + + Try running: + + ``` + tox -r -e docs + ``` + +3. Make sure to have a reliable [tox] installation that uses the correct + Python version (e.g., 3.7+). When in doubt you can run: + + ``` + tox --version + # OR + which tox + ``` + + If you have trouble and are seeing weird errors upon running [tox], you can + also try to create a dedicated [virtual environment] with a [tox] binary + freshly installed. For example: + + ``` + virtualenv .venv + source .venv/bin/activate + .venv/bin/pip install tox + .venv/bin/tox -e all + ``` + +4. [Pytest can drop you] in an interactive session in the case an error occurs. + In order to do that you need to pass a `--pdb` option (for example by + running `tox -- -k --pdb`). + You can also setup breakpoints manually instead of using the `--pdb` option. + +## Maintainer tasks + +### Releases + +```{todo} This section assumes you are using PyPI to publicly release your package. + + If instead you are using a different/private package index, please update + the instructions accordingly. +``` + +If you are part of the group of maintainers and have correct user permissions +on [PyPI], the following steps can be used to release a new version for +`lm-api`: + +1. Make sure all unit tests are successful. +2. 
Tag the current commit on the main branch with a release tag, e.g., `v1.2.3`. +3. Push the new tag to the upstream [repository], + e.g., `git push upstream v1.2.3` +4. Clean up the `dist` and `build` folders with `tox -e clean` + (or `rm -rf dist build`) + to avoid confusion with old builds and Sphinx docs. +5. Run `tox -e build` and check that the files in `dist` have + the correct version (no `.dirty` or [git] hash) according to the [git] tag. + Also check the sizes of the distributions, if they are too big (e.g., > + 500KB), unwanted clutter may have been accidentally included. +6. Run `tox -e publish -- --repository pypi` and check that everything was + uploaded to [PyPI] correctly. + +[^contrib1]: Even though, these resources focus on open source projects and + communities, the general ideas behind collaborating with other developers + to collectively create software are general and can be applied to all sorts + of environments, including private companies and proprietary code bases. 
+ +[black]: https://pypi.org/project/black/ +[contribution-guide.org]: http://www.contribution-guide.org/ +[descriptive commit message]: https://chris.beams.io/posts/git-commit +[docstrings]: https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html +[flake8]: https://flake8.pycqa.org/en/stable/ +[git]: https://git-scm.com +[guide created by freecodecamp]: https://github.com/freecodecamp/how-to-contribute-to-open-source +[miniconda]: https://docs.conda.io/en/latest/miniconda.html +[other kinds of contributions]: https://opensource.guide/how-to-contribute +[pre-commit]: https://pre-commit.com/ +[pypi]: https://pypi.org/ +[pytest can drop you]: https://docs.pytest.org/en/stable/usage.html#dropping-to-pdb-python-debugger-at-the-start-of-a-test +[python software foundation's code of conduct]: https://www.python.org/psf/conduct/ +[sphinx]: https://www.sphinx-doc.org/en/master/ +[tox]: https://tox.readthedocs.io/en/stable/ +[virtual environment]: https://realpython.com/python-virtual-environments-a-primer/ +[virtualenv]: https://virtualenv.pypa.io/en/stable/ + +```{todo} Please review and change the following definitions: +``` + +[repository]: https://github.com/pszemraj/lm-api +[issue tracker]: https://github.com/pszemraj/lm-api/issues diff --git a/README.md b/README.md index 7d1a7dd..b9ebd2c 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,78 @@ -# lm-api: Utilities large language model API queries + -## Usage +# lm-api + +> Command-line utilities for querying large language models + +This repo is built around making it easy to run a set of queries against a large language model (LM) and get back a set of results via CLI, but it also has basic Python API functionality. + +Queries are expected to be in a pandas-compatible format, and results are written to a text file with markdown formatting for easy viewing/sharing. 
+
+## Installation
-Command line script:
+After cloning, `cd` into the `lm-api` directory and run
+```bash
+# create a virtual environment (optional)
+pip install -e .
+```
+
+Alternatively, directly install via pip+git:
+
+```bash
+# create a virtual environment (optional)
+pip install git+https://github.com/pszemraj/lm-api.git
+```
-This will query each row in the input file and output the results to the output directory using the gpt-j-6b model (on Goose API).
+A quick test can be run with the `src/lm_api/test_goose_api.py` script.
-**IMPORTANT:** your API key must be set in the environment variable `GOOSE` and `OPENAI` or passed as an argument to the script with the `-k` flag.
+## On API Keys
-## Installation
+You will need an API key for each provider you want to query. Currently, the following providers are supported:
+
+- [Goose](https://goose.ai/)
+- [OpenAI](https://beta.openai.com/playground)
+
+API keys can be set in the environment variables `GOOSE` and `OPENAI` or passed as an argument to the script with the `-k` flag.
-- `cd` to the directory containing this file and run `pip install -r requirements.txt`
+## Usage
+
+Command line scripts are located in `src/lm_api/` and become installed as CLI commands. Currently, the commands are limited to `lm-api`. **IMPORTANT: your API key must be set in the environment variable `GOOSE` and `OPENAI` or passed as an argument to the script with the `-k` flag.**
+
+An example:
-A quick test can be run with the `test_goose_api.py` script.
+```bash
+lm-api -i data/test_queries.xlsx -o ./my-test-folder
+```
+
+This will run the queries in `data/test_queries.xlsx` and write the results to a `.md` file in `my-test-folder/` in your current working directory. There are many options for the script, which can be viewed with the `-h` flag (e.g., `lm-api -h`).
+ +```bash +usage: lm-api [-h] [-i INPUT_FILE] [-o OUTPUT_DIR] [-provider PROVIDER_ID] [-k KEY] [-p PREFIX] [-s SUFFIX] [-simple] + [-kc KEY_COLUMN] [-m MODEL_ID] [-n N_TOKENS] [-t TEMPERATURE] [-f2 FREQUENCY_PENALTY] + [-p2 PRESENCE_PENALTY] [-v] +``` + +## TODO / Roadmap + +_Note: this is a work in progress, and the following is a running list of things that need to be done. This may and likely will be updated._ + +- [ ] adjust the `--prefix` and `--suffix` flags to a "prompt engine" switch that can augment/update the prompt with a variety of options (e.g., `--prompt-engine=prefix` or `--prompt-engine=prefix+suffix`) +- [ ] add a simple CLI command that does not require a query file +- [ ] add support for other providers (e.g., textsynth) +- [ ] validate performance as package / adjust as needed (i.e., import `lm_api` should work and have full functionality w.r.t. CLI) +- [ ] setup tests --- + +[![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/) diff --git a/data/lm-api-output/api_queries_test_queries_2022-11-29_00.md b/data/lm-api-output/api_queries_test_queries_2022-11-29_00.md new file mode 100644 index 0000000..62441ca --- /dev/null +++ b/data/lm-api-output/api_queries_test_queries_2022-11-29_00.md @@ -0,0 +1,22 @@ +API QUERIES FOR test_queries + +- text-davinci-003 +- 2022-11-29_00 + ## What is a transformer model? +Response: + + +A transformer model is a type of neural network architecture that uses attention mechanisms to help the model learn language-based tasks. It is a sequence-to-sequence model which means it processes an input sequence and produces an output sequence based on the input. The key component of a transformer model is the attention mechanism which allows the model to focus on specific parts of the input data and build a better understanding of the text. 
This makes transformer models very powerful for tasks like machine translation, text summarization and natural language processing. + + ## How to implement an algorithm to get n-grams from a body of text? +Response: + + +An algorithm to get n-grams from a body of text can be implemented by first tokenizing the text into individual words, then looping through each word, and for each word, creating an n-gram of the word and its surrounding words. This can be done by keeping track of the index of the current word, then retrieving the previous (n-1) words and next (n-1) words and combining them into an n-gram. The resulting n-grams can then be stored in a list or other data structure and returned. + + ## Why are topic models not so useful in practice? +Response: + + +Topic models are not as useful in practice because they have difficulty accurately capturing the meaning of documents. Topic models rely on the assumption that each document is composed of a mixture of topics, and each topic is composed of a mixture of words. This means that they often fail to accurately capture the context of the document, leading to inaccurate results. Additionally, topic models are limited in their ability to capture relationships between topics, making it difficult to accurately identify topics in long documents. To improve the effectiveness of topic models, additional features such as sentiment analysis and semantic analysis can be used to better understand the meaning of documents. + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..89a5bed --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD! 
+requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +# For smarter version schemes and other configuration options, +# check out https://github.com/pypa/setuptools_scm +version_scheme = "no-guess-dev" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8860cc1..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -openai -pandas -openpyxl -tqdm diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..775b618 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,129 @@ +# This file is used to configure your project. +# Read more about the various options under: +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html +# https://setuptools.pypa.io/en/latest/references/keywords.html + +[metadata] +name = lm-api +description = Utilities for querying large language models +author = peter szemraj +author_email = peterszemraj@gmail.com +license = Apache License 2.0 +license_files = LICENSE +long_description = file: README.md +long_description_content_type = text/markdown; charset=UTF-8; variant=GFM +url = https://github.com/pszemraj/lm-api +# Add here related links, for example: +project_urls = + Documentation = https://pyscaffold.org/ +# Source = https://github.com/pyscaffold/pyscaffold/ +# Changelog = https://pyscaffold.org/en/latest/changelog.html +# Tracker = https://github.com/pyscaffold/pyscaffold/issues +# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold +# Download = https://pypi.org/project/PyScaffold/#files +# Twitter = https://twitter.com/PyScaffold + +# Change if running only on Windows, Mac or Linux (comma-separated) +platforms = any + +# Add here all kinds of additional classifiers as defined under +# https://pypi.org/classifiers/ +classifiers = + Development Status :: 4 - Beta + Programming Language :: Python + + +[options] +zip_safe = False +packages = find_namespace: 
+include_package_data = True +package_dir = + =src + +# Require a min/specific Python version (comma-separated conditions) +python_requires = >=3.7 + +# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. +# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in +# new major versions. This works if the required packages follow Semantic Versioning. +# For more information, check out https://semver.org/. +install_requires = + importlib-metadata; python_version<"3.8" + openai + pandas + openpyxl + tqdm + + +[options.packages.find] +where = src +exclude = + tests + +[options.extras_require] +# Add here additional requirements for extra features, to install with: +# `pip install lm-api[PDF]` like: +# PDF = ReportLab; RXP + +# Add here test requirements (semicolon/line-separated) +testing = + setuptools + pytest + pytest-cov + +[options.entry_points] +# Add here console scripts like: +console_scripts = + lm-api = lm_api.query_api:run +# For example: +# console_scripts = +# fibonacci = lm_api.skeleton:run +# And any other entry points, for example: +# pyscaffold.cli = +# awesome = pyscaffoldext.awesome.extension:AwesomeExtension + +[tool:pytest] +# Specify command line options as you would do when invoking pytest directly. +# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml +# in order to write a coverage file that can be read by Jenkins. +# CAUTION: --cov flags may prohibit setting breakpoints while debugging. +# Comment those flags to avoid this pytest issue. 
+addopts = + --cov lm_api --cov-report term-missing + --verbose +norecursedirs = + dist + build + .tox +testpaths = tests +# Use pytest markers to select/deselect specific tests +# markers = +# slow: mark tests as slow (deselect with '-m "not slow"') +# system: mark end-to-end system tests + +[devpi:upload] +# Options for the devpi: PyPI server and packaging tool +# VCS export must be deactivated since we are using setuptools-scm +no_vcs = 1 +formats = bdist_wheel + +[flake8] +# Some sane defaults for the code style checker flake8 +max_line_length = 88 +extend_ignore = E203, W503 +# ^ Black-compatible +# E203 and W503 have edge cases handled by black +exclude = + .tox + build + dist + .eggs + docs/conf.py + +[pyscaffold] +# PyScaffold's parameters when the project was created. +# This will be used when updating. Do not change! +version = 4.3.1 +package = lm_api +extensions = + markdown diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b52a66b --- /dev/null +++ b/setup.py @@ -0,0 +1,21 @@ +""" + Setup file for lm-api. + Use setup.cfg to configure your project. + + This file was generated with PyScaffold 4.3.1. + PyScaffold helps you to put up the scaffold of your new Python project. + Learn more under: https://pyscaffold.org/ +""" +from setuptools import setup + +if __name__ == "__main__": + try: + setup(use_scm_version={"version_scheme": "no-guess-dev"}) + except: # noqa + print( + "\n\nAn error occurred while building the project, " + "please ensure you have the most updated version of setuptools, " + "setuptools_scm and wheel with:\n" + " pip install -U setuptools setuptools_scm wheel\n\n" + ) + raise diff --git a/src/lm_api/__init__.py b/src/lm_api/__init__.py new file mode 100644 index 0000000..768eacc --- /dev/null +++ b/src/lm_api/__init__.py @@ -0,0 +1,18 @@ +import sys + +from . 
 import query_api

+if sys.version_info[:2] >= (3, 8):
+    # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
+    from importlib.metadata import PackageNotFoundError, version  # pragma: no cover
+else:
+    from importlib_metadata import PackageNotFoundError, version  # pragma: no cover
+
+try:
+    # Change here if project is renamed and does not equal the package name
+    dist_name = "lm-api"
+    __version__ = version(dist_name)
+except PackageNotFoundError:  # pragma: no cover
+    __version__ = "unknown"
+finally:
+    del version, PackageNotFoundError diff --git a/query_api.py b/src/lm_api/query_api.py similarity index 55% rename from query_api.py rename to src/lm_api/query_api.py index aa5b5f4..15a5225 100644 --- a/query_api.py +++ b/src/lm_api/query_api.py @@ -1,24 +1,29 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -    query_vs_dataframe.py - loads a CSV/xlsx/JSON file using Pandas, then sends an API query for each row in in the file. Outputs the query and response to a text file. +query_api.py - loads a CSV/xlsx/JSON file using Pandas, then sends an API query for each row in the file. Outputs the query and response to a text file.
- python query_vs_dataframe.py -h for help +Usage: + usage: lm-api/query_api.py [-h] [-i INPUT_FILE] [-o OUTPUT_DIR] [-provider PROVIDER_ID] [-k KEY] [-p PREFIX] + [-kc KEY_COLUMN] [-m MODEL_ID] [-n N_TOKENS] [-t TEMPERATURE] [-f2 FREQUENCY_PENALTY] [-s SUFFIX] + [-simple] [-p2 PRESENCE_PENALTY] [-v] + + call with --help for more information """ import argparse import logging import os -import sys import random +import sys import time from pathlib import Path import openai from tqdm import tqdm -from utils import append_entry_outtxt, df_to_list, flex_load_pandas +from lm_api.utils import append_entry_outtxt, df_to_list, flex_load_pandas logging.basicConfig( level=logging.INFO, @@ -26,51 +31,53 @@ format="%(asctime)s %(message)s", filename=f"api_dataframe_query.log", ) + logger = logging.getLogger(__name__) def query_terms( term_list, - prefix, - suffix, + prefix: str = "", + suffix: str = "", + model_id: str = "text-davinci-003", + n_tokens: int = 128, + frequency_penalty: float = 0.15, + presence_penalty: float = 0.05, + temperature: float = 0.7, + out_path: str or Path = None, + source_path: str or Path = None, verbose=False, - model_id="gpt-neo-20b", - n_tokens=128, - frequency_penalty=0.3, - presence_penalty=0.05, - temperature=1, - out_path=None, - source_path=None, ): """ - query_terms - queries the API for each term in the term_list - - Args: - term_list (list): list of terms to query - prefix (str): prefix to add to each query - suffix (str): suffix to add to each query - verbose (bool, optional): _description_. Defaults to False. - model_id (str, optional): _description_. Defaults to "gpt-neo-20b". - n_tokens (int, optional): _description_. Defaults to 128. - frequency_penalty (float, optional): _description_. Defaults to 0.3. - presence_penalty (float, optional): _description_. Defaults to 0.05. - temperature (int, optional): _description_. Defaults to 1. - out_path (_type_, optional): _description_. Defaults to None. 
+ query_terms - sends a query to the API for each term in the list + + :param term_list: list of terms to query + :param str prefix: prefix to add to each query + :param str suffix: suffix to add to each query + :param str model_id: model id to use for the API query (default: text-davinci-003) + :param int n_tokens: number of tokens to use for the API query (default: 128) + :param float frequency_penalty: frequency penalty to use for the API query (default: 0.15) + :param float presence_penalty: presence penalty to use for the API query (default: 0.05) + :param float temperature: temperature to use for the API query (default: 0.7) + :param str or Path out_path: path to the output file (default: None) + :param str or Path source_path: path to the source file (default: None) + :param bool verbose: verbose output (default: False) + :return list: list of responses from the API """ if verbose: print(f"querying {len(term_list)} terms") for term in tqdm(term_list, desc="querying terms", total=len(term_list)): time.sleep(random.random() * 2) - _query = f"{prefix} {term} {suffix}" - _query_token_count = int(len(_query.split()) / 4) + query = f"{prefix} {term} {suffix}".strip() + _query_token_count = int(len(query.split()) / 4) # approx 4 tokens per word if verbose: - print(f"querying {term}:\n\t{_query}") + print(f"querying {term}:\n\t{query}") # query the API completion = openai.Completion.create( engine=model_id, - prompt=_query, + prompt=query, max_tokens=_query_token_count + n_tokens, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, @@ -78,11 +85,11 @@ def query_terms( ) # remove the prefix and suffix from the query - _query = _query.replace(prefix, "").replace(suffix, "") + query = query.replace(prefix, "").replace(suffix, "") # append the response to the output text file out_file_path = append_entry_outtxt( - _query, + query, completion.choices[0].text, out_path=out_path, model_name=model_id, @@ -96,7 +103,9 @@ def get_parser(): """ get_parser - 
a helper function for the argparse module """ - parser = argparse.ArgumentParser(description="Split text by percent") + parser = argparse.ArgumentParser( + description="Query a list of terms from a pandas-compatible file to a language model API" + ) parser.add_argument( "-i", @@ -119,31 +128,39 @@ def get_parser(): "--provider-id", required=False, type=str, - default="goose", - help="provider to connect to for API. Defaults to goose (openai is other)", + default="openai", + help="provider to connect to for API. Defaults to openai (options: openai, goose)", ) parser.add_argument( "-k", "--key", type=str, default=None, - help="API key for the provider if needed", + help="API key for the provider if needed (or set as environment variable OPENAI or GOOSE)", ) parser.add_argument( "-p", "--prefix", required=False, - default="Explain the following Natural Language Processing (NLP) concept(s):", + default="Explain the following concept(s) to a Master's student in the field:", type=str, - help="prefix to add to each query", + help="prefix to add to each query (spaces added automatically).\nDefaults to:\t'Explain the following concept(s) to a Master's student in the field:'", ) parser.add_argument( "-s", "--suffix", required=False, - default=" An acceptable solution to the problem would be similar to:", + default="An acceptable solution to the problem would be similar to:", type=str, - help="suffix to add to each query", + help="suffix to add to each query (spaces added automatically). 
\nDefaults to:\t'An acceptable solution to the problem would be similar to:'", + ) + parser.add_argument( + "-simple", + "--no_prefix_suffix", + required=False, + default=False, + action="store_true", + help="do not add a prefix or suffix to the query", ) parser.add_argument( "-kc", @@ -151,15 +168,15 @@ def get_parser(): required=False, default="terms", type=str, - help="name of the column in the input file that contains the terms to query", + help="name of the column in the input file that contains the terms to query. Defaults to 'terms'", ) parser.add_argument( "-m", "--model-id", required=False, - default="gpt-neo-20b", # gpt-j-6b + default="text-davinci-003", type=str, - help="model id to use for the API query", + help="model id to use for the API query. OpenAI models (text-davinci-003, ada, etc) Goose models (gpt-neo-20b, gpt-j-6b, etc). Defaults to text-davinci-003", ) parser.add_argument( "-n", @@ -167,7 +184,7 @@ def get_parser(): required=False, default=128, type=int, - help="number of tokens to use for the API query", + help="number of tokens to use for the API query (default: 128)", ) parser.add_argument( "-t", @@ -175,7 +192,7 @@ def get_parser(): required=False, default=0.7, type=float, - help="temperature to use for the API query", + help="temperature to use for the API query (default: 0.7)", ) parser.add_argument( "-f2", @@ -183,7 +200,7 @@ def get_parser(): required=False, default=0.15, type=float, - help="frequency penalty to use for the API query", + help="frequency penalty to use for the API query (default: 0.15)", ) parser.add_argument( "-p2", @@ -191,7 +208,7 @@ def get_parser(): required=False, default=0.05, type=float, - help="presence penalty to use for the API query", + help="presence penalty to use for the API query (default: 0.05)", ) parser.add_argument( "-v", @@ -204,22 +221,26 @@ def get_parser(): return parser -if __name__ == "__main__": +def main(args): PROVIDERS = ["goose", "openai"] - parser = get_parser() - args = 
parser.parse_args() - input_id = ( - Path(args.input_file) - if args.input_file - else Path.cwd() / "data" / "test_queries.xlsx" + + input_id = Path(args.input_file) + assert input_id.exists(), f"input file {str(input_id)} does not exist" + output_dir = ( + input_id.parent / "lm-api-output" + if args.output_dir is None + else Path(args.output_dir) ) - output_dir = Path(args.output_dir) or Path.cwd() / "out" output_dir.mkdir(exist_ok=True) key_column = args.key_column prefix = args.prefix suffix = args.suffix + if args.no_prefix_suffix: + logger.info("no prefix or suffix added to queries") + prefix = "" + suffix = "" model_id = args.model_id key = args.key provider_id = args.provider_id @@ -231,7 +252,8 @@ def get_parser(): verbose = args.verbose env_var = os.environ.get(provider_id.upper()) - openai.api_key = env_var if key is None else str(key) + openai.api_key = env_var if key is None else key + assert openai.api_key is not None, "no API key found" openai.api_base = ( "https://api.goose.ai/v1" if provider_id == "goose" @@ -260,14 +282,15 @@ def get_parser(): engine_ids = [e["id"] for e in engines["data"]] if provider_id == "openai" and model_id not in engine_ids: - print(f"{model_id} not found in openai.Engine.list(). Continue with text-davinci-002?") - if input("y/n: ") == "y": - model_id = "text-davinci-002" + print( + f"{model_id} not found in openai.Engine.list(). Continue with text-davinci-003?" + ) + if input("y/[n]: ") == "y": + model_id = "text-davinci-003" else: print("Exiting. 
Use -m to specify a valid model id") sys.exit() - if input_id.suffix == ".txt": with open(input_id, "r", encoding="utf-8", errors="ignore") as f: terms = f.readlines() @@ -296,3 +319,16 @@ def get_parser(): ) print(f"done, output file:\n\t{out_file_path}") + + +def run(): + """ + run - run the main function + """ + parser = get_parser() + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + run() diff --git a/test_goose_api.py b/src/lm_api/test_goose_api.py similarity index 100% rename from test_goose_api.py rename to src/lm_api/test_goose_api.py diff --git a/utils.py b/src/lm_api/utils.py similarity index 100% rename from utils.py rename to src/lm_api/utils.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e4a64e4 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +""" + Dummy conftest.py for lm_api. + + If you don't know what this is for, just leave it empty. + Read more about conftest.py under: + - https://docs.pytest.org/en/stable/fixture.html + - https://docs.pytest.org/en/stable/writing_plugins.html +""" + +# import pytest diff --git a/tests/test_skeleton.py b/tests/test_skeleton.py new file mode 100644 index 0000000..47e7bf4 --- /dev/null +++ b/tests/test_skeleton.py @@ -0,0 +1,25 @@ +import pytest + +from lm_api.skeleton import fib, main + +__author__ = "peter szemraj" +__copyright__ = "peter szemraj" +__license__ = "MIT" + + +def test_fib(): + """API Tests""" + assert fib(1) == 1 + assert fib(2) == 1 + assert fib(7) == 13 + with pytest.raises(AssertionError): + fib(-10) + + +def test_main(capsys): + """CLI Tests""" + # capsys is a pytest fixture that allows asserts against stdout/stderr + # https://docs.pytest.org/en/stable/capture.html + main(["7"]) + captured = capsys.readouterr() + assert "The 7-th Fibonacci number is 13" in captured.out diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..54f9323 --- /dev/null +++ b/tox.ini @@ -0,0 +1,91 @@ +# Tox configuration file +# Read 
more under https://tox.wiki/ +# THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS! + +[tox] +minversion = 3.24 +envlist = default +isolated_build = True + + +[testenv] +description = Invoke pytest to run automated tests +setenv = + TOXINIDIR = {toxinidir} +passenv = + HOME + SETUPTOOLS_* +extras = + testing +commands = + pytest {posargs} + + +# # To run `tox -e lint` you need to make sure you have a +# # `.pre-commit-config.yaml` file. See https://pre-commit.com +# [testenv:lint] +# description = Perform static analysis and style checks +# skip_install = True +# deps = pre-commit +# passenv = +# HOMEPATH +# PROGRAMDATA +# SETUPTOOLS_* +# commands = +# pre-commit run --all-files {posargs:--show-diff-on-failure} + + +[testenv:{build,clean}] +description = + build: Build the package in isolation according to PEP517, see https://github.com/pypa/build + clean: Remove old distribution files and temporary build artifacts (./build and ./dist) +# https://setuptools.pypa.io/en/stable/build_meta.html#how-to-use-it +skip_install = True +changedir = {toxinidir} +deps = + build: build[virtualenv] +passenv = + SETUPTOOLS_* +commands = + clean: python -c 'import shutil; [shutil.rmtree(p, True) for p in ("build", "dist", "docs/_build")]' + clean: python -c 'import pathlib, shutil; [shutil.rmtree(p, True) for p in pathlib.Path("src").glob("*.egg-info")]' + build: python -m build {posargs} + + +[testenv:{docs,doctests,linkcheck}] +description = + docs: Invoke sphinx-build to build the docs + doctests: Invoke sphinx-build to run doctests + linkcheck: Check for broken links in the documentation +passenv = + SETUPTOOLS_* +setenv = + DOCSDIR = {toxinidir}/docs + BUILDDIR = {toxinidir}/docs/_build + docs: BUILD = html + doctests: BUILD = doctest + linkcheck: BUILD = linkcheck +deps = + -r {toxinidir}/docs/requirements.txt + # ^ requirements.txt shared with Read The Docs +commands = + sphinx-build --color -b {env:BUILD} -d "{env:BUILDDIR}/doctrees" 
"{env:DOCSDIR}" "{env:BUILDDIR}/{env:BUILD}" {posargs} + + +[testenv:publish] +description = + Publish the package you have been developing to a package index server. + By default, it uses testpypi. If you really want to publish your package + to be publicly accessible in PyPI, use the `-- --repository pypi` option. +skip_install = True +changedir = {toxinidir} +passenv = + # See: https://twine.readthedocs.io/en/latest/ + TWINE_USERNAME + TWINE_PASSWORD + TWINE_REPOSITORY + TWINE_REPOSITORY_URL +deps = twine +commands = + python -m twine check dist/* + python -m twine upload {posargs:--repository {env:TWINE_REPOSITORY:testpypi}} dist/*