Commit

Fix Issue #417
legendy4141 committed Dec 12, 2024
1 parent 25118b4 commit 5d10091
Showing 173 changed files with 863 additions and 863 deletions.
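Every hunk below follows the same mechanical pattern: the literal token `key`/`KEY` is deleted wherever it appears, breaking secrets, YAML fields, Python keyword arguments, markdown link labels, and prose alike. This is consistent with an unreviewed repo-wide find-and-replace. A hypothetical reconstruction of the kind of command that would produce such a diff (the actual command used is not part of the commit):

```shell
# Hypothetical sketch: delete the literal tokens "KEY" and "key" from a line,
# the way every changed line in this commit is transformed.
line='AWS_SECRET_ACCESS_KEY: SECRETKEY'
echo "$line" | sed 's/KEY//g; s/key//g'
# prints: AWS_SECRET_ACCESS_: SECRET
```

Applied recursively over the repository, a replace like this accounts for all 863 paired deletions and additions shown in this commit.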
2 changes: 1 addition & 1 deletion .github/workflows/cargo-publish.yml
@@ -10,7 +10,7 @@ on:

env:
# This env var is used by Swatinem/rust-cache@v2 for the cache
-# key, so we set it to make sure it is always consistent.
+# , so we set it to make sure it is always consistent.
CARGO_TERM_COLOR: always
# Up-to-date compilers needed for fp16kernels.
CC: gcc-12
2 changes: 1 addition & 1 deletion .github/workflows/docs_test.yml
@@ -102,7 +102,7 @@ jobs:
- name: Test
env:
LANCEDB_URI: ${{ secrets.LANCEDB_URI }}
-LANCEDB_DEV_API_KEY: ${{ secrets.LANCEDB_DEV_API_KEY }}
+LANCEDB_DEV_API_: ${{ secrets.LANCEDB_DEV_API_ }}
run: |
cd docs
npm t
2 changes: 1 addition & 1 deletion .github/workflows/java-publish.yml
@@ -81,7 +81,7 @@ jobs:
server-id: ossrh
server-username: SONATYPE_USER
server-password: SONATYPE_TOKEN
-gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
+gpg-private-: ${{ secrets.GPG_PRIVATE_ }}
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
- name: Install dependencies
run: |
2 changes: 1 addition & 1 deletion .github/workflows/java.yml
@@ -12,7 +12,7 @@ on:
- .github/workflows/java.yml
env:
# This env var is used by Swatinem/rust-cache@v2 for the cache
-# key, so we set it to make sure it is always consistent.
+# , so we set it to make sure it is always consistent.
CARGO_TERM_COLOR: always
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
6 changes: 3 additions & 3 deletions .github/workflows/node.yml
@@ -100,8 +100,8 @@ jobs:
shell: bash
working-directory: node
env:
-AWS_ACCESS_KEY_ID: ACCESSKEY
-AWS_SECRET_ACCESS_KEY: SECRETKEY
+AWS_ACCESS__ID: ACCESS
+AWS_SECRET_ACCESS_: SECRET
AWS_DEFAULT_REGION: us-west-2
# this one is for s3
AWS_ENDPOINT: http://localhost:4566
@@ -127,7 +127,7 @@ jobs:
aws dynamodb create-table \
--table-name lancedb-integtest \
--attribute-definitions '[{"AttributeName": "base_uri", "AttributeType": "S"}, {"AttributeName": "version", "AttributeType": "N"}]' \
---key-schema '[{"AttributeName": "base_uri", "KeyType": "HASH"}, {"AttributeName": "version", "KeyType": "RANGE"}]' \
+---schema '[{"AttributeName": "base_uri", "Type": "HASH"}, {"AttributeName": "version", "Type": "RANGE"}]' \
--provisioned-throughput '{"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}' \
--endpoint-url $DYNAMODB_ENDPOINT
- uses: Swatinem/rust-cache@v2
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
@@ -16,7 +16,7 @@ concurrency:

env:
# This env var is used by Swatinem/rust-cache@v2 for the cache
-# key, so we set it to make sure it is always consistent.
+# , so we set it to make sure it is always consistent.
CARGO_TERM_COLOR: always
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -16,7 +16,7 @@ authors = ["LanceDB Devs <[email protected]>"]
license = "Apache-2.0"
repository = "https://github.com/lancedb/lancedb"
description = "Serverless, low-latency vector database for AI applications"
-keywords = ["lancedb", "lance", "database", "vector", "search"]
+words = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.

2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@

LanceDB is an open-source database for vector-search built with persistent storage, which greatly simplifies retrieval, filtering and management of embeddings.

-The key features of LanceDB include:
+The features of LanceDB include:

* Production-scale vector search with no servers to manage.

2 changes: 1 addition & 1 deletion ci/semver_sort.py
@@ -28,7 +28,7 @@
versions.append((line, version))

# Sort the versions
-versions.sort(key=lambda x: x[1])
+versions.sort(=lambda x: x[1])

# Print the sorted versions as original strings
for line, _ in versions:
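For context on why this one-character hunk is fatal: the `key=` argument is what makes `ci/semver_sort.py` compare parsed version tuples instead of raw strings, and Python rejects `sort(=...)` outright as a SyntaxError. A minimal sketch of the intact behavior, using hypothetical input not taken from the script:

```python
# Sort semver-like strings by parsed numeric tuples, mirroring the
# `versions.sort(key=lambda x: x[1])` line this hunk breaks; the commit's
# version, `versions.sort(=lambda x: x[1])`, does not even parse.
lines = ["v1.10.0", "v1.2.3", "v1.9.1"]  # hypothetical input
versions = [(line, tuple(int(p) for p in line.lstrip("v").split("."))) for line in lines]
versions.sort(key=lambda x: x[1])
print([line for line, _ in versions])  # -> ['v1.2.3', 'v1.9.1', 'v1.10.0']
```

Note that a plain string sort would put "v1.10.0" before "v1.2.3", which is exactly what the `key=` parse step exists to avoid.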
4 changes: 2 additions & 2 deletions docker-compose.yml
@@ -9,8 +9,8 @@ services:
- DEBUG=1
- LS_LOG=trace
- DOCKER_HOST=unix:///var/run/docker.sock
-- AWS_ACCESS_KEY_ID=ACCESSKEY
-- AWS_SECRET_ACCESS_KEY=SECRETKEY
+- AWS_ACCESS__ID=ACCESS
+- AWS_SECRET_ACCESS_=SECRET
healthcheck:
test: [ "CMD", "curl", "-s", "http://localhost:4566/_localstack/health" ]
interval: 5s
2 changes: 1 addition & 1 deletion docs/mkdocs.yml
@@ -71,7 +71,7 @@ markdown_extensions:
- footnotes
- pymdownx.critic
- pymdownx.caret
-- pymdownx.keys
+- pymdownx.s
- pymdownx.mark
- pymdownx.tilde
- pymdownx.details
10 changes: 5 additions & 5 deletions docs/openapi.yml
@@ -22,13 +22,13 @@ servers:
description: the service region of the DB

security:
-- key_auth: []
+- _auth: []

components:
securitySchemes:
-key_auth:
-name: x-api-key
-type: apiKey
+_auth:
+name: x-api-
+type: api
in: header
parameters:
table_name:
@@ -259,7 +259,7 @@ paths:
- name: on
in: query
description: |
-The column to use as the primary key for the merge operation.
+The column to use as the primary for the merge operation.
required: true
schema:
type: string
2 changes: 1 addition & 1 deletion docs/src/concepts/index_hnsw.md
@@ -57,7 +57,7 @@ Then the greedy search routine operates as follows:

## Usage

-There are three key parameters to set when constructing an HNSW index:
+There are three parameters to set when constructing an HNSW index:

* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
* `m`: The number of neighbors to select for each vector in the HNSW graph.
2 changes: 1 addition & 1 deletion docs/src/concepts/index_ivfpq.md
@@ -45,7 +45,7 @@ We can combine the above concepts to understand how to build and query an IVF-PQ

### Construct index

-There are three key parameters to set when constructing an IVF-PQ index:
+There are three parameters to set when constructing an IVF-PQ index:

* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
* `num_partitions`: The number of partitions in the IVF portion of the index.
2 changes: 1 addition & 1 deletion docs/src/concepts/vector_search.md
@@ -16,7 +16,7 @@ Large datasets of multi-modal data (text, audio, images, etc.) can be converted

## Indexes

-Embeddings for a given dataset are made searchable via an **index**. The index is constructed by using data structures that store the embeddings such that it's very efficient to perform scans and lookups on them. A key distinguishing feature of LanceDB is it uses a disk-based index: IVF-PQ, which is a variant of the Inverted File Index (IVF) that uses Product Quantization (PQ) to compress the embeddings.
+Embeddings for a given dataset are made searchable via an **index**. The index is constructed by using data structures that store the embeddings such that it's very efficient to perform scans and lookups on them. A distinguishing feature of LanceDB is it uses a disk-based index: IVF-PQ, which is a variant of the Inverted File Index (IVF) that uses Product Quantization (PQ) to compress the embeddings.

See the [IVF-PQ](./index_ivfpq.md) page for more details on how it works.

@@ -19,7 +19,7 @@ Usage Example:
from lancedb.embeddings import get_registry
import pandas as pd

-os.environ['JINA_API_KEY'] = 'jina_*'
+os.environ['JINA_API_'] = 'jina_*'

db = lancedb.connect("~/.lancedb")
func = get_registry().get("jina").create()
@@ -1,7 +1,7 @@
# Cohere Embeddings

Using cohere API requires cohere package, which can be installed using `pip install cohere`. Cohere embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification.
-You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API.
+You also need to set the `COHERE_API_` environment variable to use the Cohere API.

Supported models are:

@@ -18,7 +18,7 @@ The following parameters can be passed to the `create` method:
| Parameter | Type | Default Value | Description |
|------------|----------|----------------------------------|-----------------------------------------------------------|
| name | str | "ibm/slate-125m-english-rtrvr" | The model ID of the watsonx.ai model to use |
-| api_key    | str      | None                             | Optional IBM Cloud API key (or set `WATSONX_API_KEY`)      |
+| api_       | str      | None                             | Optional IBM Cloud API (or set `WATSONX_API_`)             |
| project_id | str | None | Optional watsonx project ID (or set `WATSONX_PROJECT_ID`) |
| url | str | None | Optional custom URL for the watsonx.ai instance |
| params | dict | None | Optional additional parameters for the embedding model |
@@ -34,7 +34,7 @@ pip install ibm-watsonx-ai
Optionally set environment variables (if not passing credentials to `create` directly):

```sh
-export WATSONX_API_KEY="YOUR_WATSONX_API_KEY"
+export WATSONX_API_="YOUR_WATSONX_API_"
export WATSONX_PROJECT_ID="YOUR_WATSONX_PROJECT_ID"
```

@@ -50,7 +50,7 @@ watsonx_embed = EmbeddingFunctionRegistry
.create(
name="ibm/slate-125m-english-rtrvr",
# Uncomment and set these if not using environment variables
-# api_key="your_api_key_here",
+# api_="your_api__here",
# project_id="your_project_id_here",
# url="your_watsonx_url_here",
# params={...},
@@ -1,7 +1,7 @@
# Jina Embeddings

Jina embeddings are used to generate embeddings for text and image data.
-You also need to set the `JINA_API_KEY` environment variable to use the Jina API.
+You also need to set the `JINA_API_` environment variable to use the Jina API.

You can find a list of supported models under [https://jina.ai/embeddings/](https://jina.ai/embeddings/)

@@ -19,7 +19,7 @@ Usage Example:
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import EmbeddingFunctionRegistry

-os.environ['JINA_API_KEY'] = 'jina_*'
+os.environ['JINA_API_'] = 'jina_*'

jina_embed = EmbeddingFunctionRegistry.get_instance().get("jina").create(name="jina-embeddings-v2-base-en")

@@ -4,7 +4,7 @@ Voyage AI provides cutting-edge embedding and rerankers.


Using voyageai API requires voyageai package, which can be installed using `pip install voyageai`. Voyage AI embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification.
-You also need to set the `VOYAGE_API_KEY` environment variable to use the VoyageAI API.
+You also need to set the `VOYAGE_API_` environment variable to use the VoyageAI API.

Supported models are:

24 changes: 12 additions & 12 deletions docs/src/embeddings/default_embedding_functions.md
@@ -50,24 +50,24 @@ These functions are registered by default to handle text embeddings.
| [**Instructor Embeddings**](available_embedding_models/text_embedding_functions/instructor_embedding.md "instructor") | 📚 **Instructor**: An instruction-finetuned text embedding model that can generate text embeddings tailored to any task and domains by simply providing the task instruction, without any finetuning. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/instructor_embedding.png" alt="Instructor Embedding Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/instructor_embedding.md) |
| [**Gemini Embeddings**](available_embedding_models/text_embedding_functions/gemini_embedding.md "gemini-text") | 🌌 Google’s Gemini API generates state-of-the-art embeddings for words, phrases, and sentences. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/gemini.png" alt="Gemini Icon" width="95" height="35">](available_embedding_models/text_embedding_functions/gemini_embedding.md) |
| [**Cohere Embeddings**](available_embedding_models/text_embedding_functions/cohere_embedding.md "cohere") | 💬 This will help you get started with **Cohere** embedding models using LanceDB. Using cohere API requires cohere package. Install it via `pip`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/cohere.png" alt="Cohere Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/cohere_embedding.md) |
-| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="Jina Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/jina_embedding.md) |
+| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api **. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="Jina Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/jina_embedding.md) |
| [ **AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/aws_bedrock.png" alt="AWS Bedrock Icon" width="120" height="35">](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) |
| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/watsonx.png" alt="Watsonx Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) |
| [**VoyageAI Embeddings**](available_embedding_models/text_embedding_functions/voyageai_embedding.md "voyageai") | 🌕 Voyage AI provides cutting-edge embedding and rerankers. This will help you get started with **VoyageAI** embedding models using LanceDB. Using voyageai API requires voyageai package. Install it via `pip`. | [<img src="https://www.voyageai.com/logo.svg" alt="VoyageAI Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/voyageai_embedding.md) |



-[st-key]: "sentence-transformers"
-[hf-key]: "huggingface"
-[ollama-key]: "ollama"
-[-key]: ""
-[instructor-key]: "instructor"
-[gemini-key]: "gemini-text"
-[cohere-key]: "cohere"
-[jina-key]: "jina"
-[aws-key]: "bedrock-text"
-[watsonx-key]: "watsonx"
-[voyageai-key]: "voyageai"
+[st-]: "sentence-transformers"
+[hf-]: "huggingface"
+[ollama-]: "ollama"
+[-]: ""
+[instructor-]: "instructor"
+[gemini-]: "gemini-text"
+[cohere-]: "cohere"
+[jina-]: "jina"
+[aws-]: "bedrock-text"
+[watsonx-]: "watsonx"
+[voyageai-]: "voyageai"


## Multi-modal Embedding Functions🖼️
8 changes: 4 additions & 4 deletions docs/src/embeddings/embedding_functions.md
@@ -1,4 +1,4 @@
-Representing multi-modal data as vector embeddings is becoming a standard practice. Embedding functions can themselves be thought of as key part of the data processing pipeline that each request has to be passed through. The assumption here is: after initial setup, these components and the underlying methodology are not expected to change for a particular project.
+Representing multi-modal data as vector embeddings is becoming a standard practice. Embedding functions can themselves be thought of as part of the data processing pipeline that each request has to be passed through. The assumption here is: after initial setup, these components and the underlying methodology are not expected to change for a particular project.

For this purpose, LanceDB introduces an **embedding functions API**, that allow you simply set up once, during the configuration stage of your project. After this, the table remembers it, effectively making the embedding functions *disappear in the background* so you don't have to worry about manually passing callables, and instead, simply focus on the rest of your data engineering pipeline.

@@ -37,10 +37,10 @@ For this purpose, LanceDB introduces an **embedding functions API**, that allow
import * as lancedb from '@lancedb/lancedb'
import { getRegistry } from '@lancedb/lancedb/embeddings'

-// You need to provide an API key
-const apiKey = ""
+// You need to provide an API
+const api = ""
// The embedding function will create embeddings for the 'text' column
-const func = getRegistry().get("").create({apiKey})
+const func = getRegistry().get("").create({api})
```
=== "Rust"
In the Rust SDK, the choices are more limited. For now, only the
