Skip to content

Commit

Permalink
Modify notebooks pre-webinar
Browse files Browse the repository at this point in the history
  • Loading branch information
blythed committed Nov 13, 2023
1 parent 6faaa14 commit e871300
Show file tree
Hide file tree
Showing 8 changed files with 275 additions and 56 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ integration-testing: ## Execute integration testing
# Run the test
pytest $(PYTEST_ARGUMENTS) ./test/integration

fix-and-test: testenv_init ## Lint the code before testing
fix-and-test: ## Lint the code before testing
# Code formatting
black $(DIRECTORIES)
# Linter and code formatting
Expand Down
4 changes: 1 addition & 3 deletions examples/mnist_torch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,7 @@
"cell_type": "code",
"execution_count": null,
"id": "bf0934cc",
"metadata": {
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"import torchvision\n",
Expand Down
104 changes: 81 additions & 23 deletions examples/multimodal_image_search_clip.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@
"mongodb_uri = os.getenv(\"MONGODB_URI\", \"mongomock://test\")\n",
"db = superduper(mongodb_uri, artifact_store='filesystem://./models/')\n",
"\n",
"# Create a collection for Tiny ImageNet\n",
"imagenet_collection = Collection('tiny-imagenet')"
"# Super-Duper your Database!\n",
"db = superduper(mongodb_uri, artifact_store='filesystem://.data')\n",
"\n",
"collection = Collection('multimodal')"
]
},
{
Expand All @@ -91,26 +93,56 @@
{
"cell_type": "code",
"execution_count": null,
"id": "aaa0a06a",
"id": "5f0f14fb-8e79-4bc6-88af-1a800aecb8db",
"metadata": {},
"outputs": [],
"source": [
"!curl -O https://superduperdb-public.s3.eu-west-1.amazonaws.com/coco_sample.zip\n",
"!unzip coco_sample.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e41e6faa-6b83-46d8-ab37-6de6fd346ee7",
"metadata": {},
"outputs": [],
"source": [
"from superduperdb import Document\n",
"from superduperdb.ext.pillow import pil_image as i\n",
"from datasets import load_dataset\n",
"import glob\n",
"import random\n",
"\n",
"# Load the dataset\n",
"dataset = load_dataset(\"zh-plus/tiny-imagenet\")['valid']\n",
"\n",
"# Wrap images into encodable objects\n",
"dataset = [Document({'image': i(r['image'])}) for r in dataset]\n",
"\n",
"# Randomly sample 1000 images from the dataset\n",
"dataset = random.sample(dataset, 1000)\n",
"\n",
"# Encode and insert images to the database\n",
"db.execute(imagenet_collection.insert_many(dataset), encoders=(i,))"
"images = glob.glob('images_small/*.jpg')\n",
"documents = [Document({'image': i(uri=f'file://{img}')}) for img in images][:500]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b3a63bf-9e1f-4266-823a-7a2208937e01",
"metadata": {},
"outputs": [],
"source": [
"documents[1]"
]
},
{
"cell_type": "markdown",
"id": "c9c7e282",
"metadata": {},
"source": [
"The wrapped Python dictionaries may be inserted directly into the `Datalayer`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c32a91a5",
"metadata": {},
"outputs": [],
"source": [
"db.execute(collection.insert_many(documents), encoders=(i,))"
]
},
{
Expand Down Expand Up @@ -138,6 +170,7 @@
"metadata": {},
"source": [
"## Build Models\n",
"We can now wrap the CLIP model to ready it for multimodal search. It involves 2 components:\n",
"\n",
"Now, let's prepare the CLIP model for multimodal search, which involves two components: `text encoding` and `visual encoding`. After setting up both components, you can perform searches using both images and text to find matching items:"
]
Expand Down Expand Up @@ -206,7 +239,8 @@
" indexing_listener=Listener(\n",
" model=visual_model,\n",
" key='image',\n",
" select=imagenet_collection.find(),\n",
" select=collection.find(),\n",
" predict_kwargs={'batch_size': 10},\n",
" ),\n",
" compatible_listener=Listener(\n",
" model=text_model,\n",
Expand Down Expand Up @@ -238,19 +272,43 @@
"from IPython.display import display\n",
"from superduperdb import Document\n",
"\n",
"# Define the search parameters\n",
"search_term = \"mushroom\"\n",
"num_results = 6\n",
"query_string = 'sports'\n",
"\n",
"# Execute the query\n",
"search_results = db.execute(\n",
" imagenet_collection.like(Document({'text': search_term}), vector_index='my-index', n=num_results).find({})\n",
"out = db.execute(\n",
" collection.like(Document({'text': query_string}), vector_index='my-index', n=3).find({})\n",
")\n",
"\n",
"# Display the images from the search results\n",
"for r in search_results:\n",
" x = r['image'].x\n",
" display(x.resize((300, 300 * int(x.size[1] / x.size[0]))))"
" display(x.resize((300, int(300 * x.size[1] / x.size[0]))))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5e3ac22-044f-4675-976a-68ff9b59efe9",
"metadata": {},
"outputs": [],
"source": [
"img = db.execute(collection.find_one({}))['image']\n",
"img.x"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8569e4f-74f2-4ee5-9674-7829b2fcc62b",
"metadata": {},
"outputs": [],
"source": [
"cur = db.execute(\n",
" collection.like(Document({'image': img}), vector_index='my-index', n=3).find({})\n",
")\n",
"\n",
"for r in cur:\n",
" x = r['image'].x\n",
" display(x.resize((300, int(300 * x.size[1] / x.size[0]))))"
]
},
{
Expand Down
Loading

0 comments on commit e871300

Please sign in to comment.