Harrison/modelscope (#5156)

Co-authored-by: thomas-yanxin <[email protected]> Co-authored-by: Dev 2049 <[email protected]>
langchain-ai · May 24, 2023 · 11c26eb · 11c26eb
1 parent 2d5588c
commit 11c26eb
Show file tree

Hide file tree

Showing 5 changed files with 195 additions and 0 deletions.
diff --git a/docs/ecosystem/modelscope.md b/docs/ecosystem/modelscope.md
@@ -0,0 +1,20 @@
+# ModelScope
+
+This page covers how to use the modelscope ecosystem within LangChain.
+It is broken into two parts: installation and setup, and then references to specific modelscope wrappers.
+
+## Installation and Setup
+
+* Install the Python SDK with `pip install modelscope`
+
+## Wrappers
+
+### Embeddings
+
+There exists a modelscope Embeddings wrapper, which you can access with:
+
+```python
+from langchain.embeddings import ModelScopeEmbeddings
+```
+
+For a more detailed walkthrough of this, see [this notebook](../modules/models/text_embedding/examples/modelscope_hub.ipynb).
diff --git a/docs/modules/models/text_embedding/examples/modelscope_hub.ipynb b/docs/modules/models/text_embedding/examples/modelscope_hub.ipynb
@@ -0,0 +1,82 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ModelScope\n",
+    "\n",
+    "Let's load the ModelScope Embedding class."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import ModelScopeEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"damo/nlp_corom_sentence-embedding_english-base\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = ModelScopeEmbeddings(model_id=model_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This is a test document.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_result = embeddings.embed_query(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc_results = embeddings.embed_documents([\"foo\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "chatgpt",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.9.15"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py
@@ -17,6 +17,7 @@
 from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
 from langchain.embeddings.jina import JinaEmbeddings
 from langchain.embeddings.llamacpp import LlamaCppEmbeddings
+from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings
 from langchain.embeddings.mosaicml import MosaicMLInstructorEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
@@ -38,6 +39,7 @@
     "JinaEmbeddings",
     "LlamaCppEmbeddings",
     "HuggingFaceHubEmbeddings",
+    "ModelScopeEmbeddings",
     "TensorflowHubEmbeddings",
     "SagemakerEndpointEmbeddings",
     "HuggingFaceInstructEmbeddings",

diff --git a/langchain/embeddings/modelscope_hub.py b/langchain/embeddings/modelscope_hub.py
@@ -0,0 +1,72 @@
+"""Wrapper around ModelScopeHub embedding models."""
+from typing import Any, List
+
+from pydantic import BaseModel, Extra
+
+from langchain.embeddings.base import Embeddings
+
+
+class ModelScopeEmbeddings(BaseModel, Embeddings):
+    """Wrapper around modelscope_hub embedding models.
+
+    To use, you should have the ``modelscope`` python package installed.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.embeddings import ModelScopeEmbeddings
+            model_id = "damo/nlp_corom_sentence-embedding_english-base"
+            embed = ModelScopeEmbeddings(model_id=model_id)
+    """
+
+    embed: Any
+    model_id: str = "damo/nlp_corom_sentence-embedding_english-base"
+    """Model name to use."""
+
+    def __init__(self, **kwargs: Any):
+        """Initialize the modelscope"""
+        super().__init__(**kwargs)
+        try:
+            from modelscope.pipelines import pipeline
+            from modelscope.utils.constant import Tasks
+
+            self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)
+
+        except ImportError as e:
+            raise ImportError(
+                "Could not import some python packages."
+                "Please install it with `pip install modelscope`."
+            ) from e
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Compute doc embeddings using a modelscope embedding model.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text.
+        """
+        texts = list(map(lambda x: x.replace("\n", " "), texts))
+        inputs = {"source_sentence": texts}
+        embeddings = self.embed(input=inputs)["text_embedding"]
+        return embeddings.tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Compute query embeddings using a modelscope embedding model.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        text = text.replace("\n", " ")
+        inputs = {"source_sentence": [text]}
+        embedding = self.embed(input=inputs)["text_embedding"][0]
+        return embedding.tolist()
diff --git a/tests/integration_tests/embeddings/test_modelscope_hub.py b/tests/integration_tests/embeddings/test_modelscope_hub.py
@@ -0,0 +1,19 @@
+"""Test modelscope embeddings."""
+from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings
+
+
+def test_modelscope_embedding_documents() -> None:
+    """Test modelscope embeddings for documents."""
+    documents = ["foo bar"]
+    embedding = ModelScopeEmbeddings()
+    output = embedding.embed_documents(documents)
+    assert len(output) == 2
+    assert len(output[0]) == 512
+
+
+def test_modelscope_embedding_query() -> None:
+    """Test modelscope embeddings for query."""
+    document = "foo bar"
+    embedding = ModelScopeEmbeddings()
+    output = embedding.embed_query(document)
+    assert len(output) == 512