From dc97ebcffc486a20c9100d98fc4732262403a0b0 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 09:18:47 +0200
Subject: [PATCH 1/8] Added tutorial for recipe search

---
 pyproject.toml                |   1 +
 tutorials/recipe_search.ipynb | 327 ++++++++++++++++++++++++++++++++++
 uv.lock                       |   2 +-
 3 files changed, 329 insertions(+), 1 deletion(-)
 create mode 100644 tutorials/recipe_search.ipynb

diff --git a/pyproject.toml b/pyproject.toml
index 1ab99dc..0ee0532 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,6 +79,7 @@ select = [
     # Print: Forbid print statements
     "T20",
 ]
+
 ignore = [
     # Allow self and cls to be untyped, and allow Any type
     "ANN101", "ANN102", "ANN401",
diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
new file mode 100644
index 0000000..5f9d917
--- /dev/null
+++ b/tutorials/recipe_search.ipynb
@@ -0,0 +1,327 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install the necessary libraries\n",
+    "!pip install numpy datasets scikit-learn model2vec\n",
+    "    \n",
+    "# Import the necessary libraries\n",
+    "import regex\n",
+    "from collections import Counter\n",
+    "\n",
+    "import numpy as np\n",
+    "from datasets import load_dataset, Dataset\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "from model2vec import StaticModel\n",
+    "from model2vec.distill import distill"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['title', 'url', 'category', 'author', 'description', 'rating', 'rating_count', 'review_count', 'ingredients', 'directions', 'prep_time', 'cook_time', 'total_time', 'servings', 'yields', 'calories', 'carbohydrates_g', 'sugars_g', 'fat_g', 'saturated_fat_g', 'cholesterol_mg', 'protein_g', 'dietary_fiber_g', 'sodium_mg', 'calories_from_fat', 'calcium_mg', 'iron_mg', 'magnesium_mg', 'potassium_mg', 'zinc_mg', 'phosphorus_mg', 'vitamin_a_iu_IU', 'niacin_equivalents_mg', 'vitamin_b6_mg', 'vitamin_c_mg', 'folate_mcg', 'thiamin_mg', 'riboflavin_mg', 'vitamin_e_iu_IU', 'vitamin_k_mcg', 'biotin_mcg', 'vitamin_b12_mcg', 'mono_fat_g', 'poly_fat_g', 'trans_fatty_acid_g', 'omega_3_fatty_acid_g', 'omega_6_fatty_acid_g', 'instructions_list', 'image']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the recipe dataset\n",
+    "dataset = load_dataset(\"Shengtao/recipe\", split=\"train\")\n",
+    "# Show the column names\n",
+    "print(dataset.column_names)\n",
+    "# Take the title column as our recipes corpus\n",
+    "recipes = dataset[\"title\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a function to find the most similar titles in a dataset to a given query\n",
+    "def find_most_similar_items(model: StaticModel, dataset: list[str] , query: str , top_k=5) -> list[tuple[str, float]]:\n",
+    "    \"\"\"\n",
+    "    Finds the most similar items in a dataset to the given query using the specified model.\n",
+    "\n",
+    "    :param model: The model used to generate embeddings.\n",
+    "    :param dataset: The dataset of recipe titles.\n",
+    "    :param query: The query recipe title.\n",
+    "    :param top_k: The number of most similar titles to return.\n",
+    "    :return: A list of tuples containing the most similar titles and their cosine similarity scores.\n",
+    "    \"\"\"\n",
+    "    # Generate embeddings for the entire dataset\n",
+    "    embeddings = model.encode(dataset)\n",
+    "\n",
+    "    # Generate embedding for the query\n",
+    "    query_embedding = model.encode(query).reshape(1, -1)\n",
+    "\n",
+    "    # Calculate cosine similarities between the query and dataset\n",
+    "    similarities = cosine_similarity(query_embedding, embeddings)[0]\n",
+    "\n",
+    "    # Get the indices of the most similar items (sorted in descending order)\n",
+    "    most_similar_indices = np.argsort(similarities)[::-1]\n",
+    "\n",
+    "    # Get the top-k most similar titles and their scores\n",
+    "    most_similar_titles = [dataset[i] for i in most_similar_indices[:top_k]]\n",
+    "    most_similar_scores = [similarities[i] for i in most_similar_indices[:top_k]]\n",
+    "\n",
+    "    # Combine titles and scores into a list of tuples\n",
+    "    return list(zip(most_similar_titles, most_similar_scores))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the M2V output model from the HuggingFace hub\n",
+    "model_name = \"minishlab/M2V_base_output\"\n",
+    "model_output = StaticModel.from_pretrained(model_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most similar recipes to 'cheeseburger':\n",
+      "Title: Double Cheeseburger, Similarity Score: 0.9028\n",
+      "Title: Cheeseburger Chowder, Similarity Score: 0.8574\n",
+      "Title: Cheeseburger Sliders, Similarity Score: 0.8413\n",
+      "Title: Cheeseburger Salad, Similarity Score: 0.8384\n",
+      "Title: Cheeseburger Soup I, Similarity Score: 0.8298\n",
+      "\n",
+      "Most similar recipes to 'fattoush':\n",
+      "Title: Fattoush, Similarity Score: 1.0000\n",
+      "Title: Lebanese Fattoush, Similarity Score: 0.8370\n",
+      "Title: Aunty Terese's Fattoush, Similarity Score: 0.7630\n",
+      "Title: Arabic Fattoush Salad, Similarity Score: 0.7588\n",
+      "Title: Authentic Lebanese Fattoush, Similarity Score: 0.7584\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Find recipes using the output embeddings model\n",
+    "top_k = 5\n",
+    "\n",
+    "# Find the most similar recipes to the given queries\n",
+    "query = \"cheeseburger\"\n",
+    "results = find_most_similar_items(model_output, recipes, query, top_k)\n",
+    "print(f\"Most similar recipes to '{query}':\")\n",
+    "for title, score in results:\n",
+    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "    \n",
+    "print()\n",
+    "\n",
+    "query = \"fattoush\"\n",
+    "results = find_most_similar_items(model_output, recipes, query, top_k)\n",
+    "print(f\"Most similar recipes to '{query}':\")\n",
+    "for title, score in results:\n",
+    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the M2V glove model from the HuggingFace hub\n",
+    "model_name = \"minishlab/M2V_base_glove\"\n",
+    "model_glove = StaticModel.from_pretrained(model_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most similar recipes to 'cheeseburger':\n",
+      "Title: Double Cheeseburger, Similarity Score: 0.8744\n",
+      "Title: Cheeseburger Meatloaf, Similarity Score: 0.8246\n",
+      "Title: Cheeseburger Salad, Similarity Score: 0.8160\n",
+      "Title: Hearty American Cheeseburger, Similarity Score: 0.8006\n",
+      "Title: Cheeseburger Chowder, Similarity Score: 0.7989\n",
+      "\n",
+      "Most similar recipes to 'fattoush':\n",
+      "Title: Zucchini and Onion Pancake, Similarity Score: 0.0000\n",
+      "Title: Crab Ball, Similarity Score: 0.0000\n",
+      "Title: Shrimp Eggs Foo Yung, Similarity Score: 0.0000\n",
+      "Title: Thai Shrimp and Snow Peas, Similarity Score: 0.0000\n",
+      "Title: Charbroiled Salmon, Similarity Score: 0.0000\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Find recipes using the GloVe embeddings model\n",
+    "top_k = 5\n",
+    "\n",
+    "# Find the most similar recipes to the given queries\n",
+    "query = \"cheeseburger\"\n",
+    "results = find_most_similar_items(model_glove, recipes, query, top_k)\n",
+    "print(f\"Most similar recipes to '{query}':\")\n",
+    "for title, score in results:\n",
+    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "    \n",
+    "print()\n",
+    "\n",
+    "# NOTE: 'fattoush' is Out-Of-Vocabulary (OOV) for the GloVe model and will return a zero vector.\n",
+    "query = \"fattoush\"\n",
+    "results = find_most_similar_items(model_glove, recipes, query, top_k)\n",
+    "print(f\"Most similar recipes to '{query}':\")\n",
+    "for title, score in results:\n",
+    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set up a regex tokenizer to split texts into words and punctuation\n",
+    "my_regex = regex.compile(r\"\\w+|[^\\w\\s]+\")\n",
+    "\n",
+    "# Function to tokenize texts into a flat list of lowercase tokens\n",
+    "def tokenize_texts(texts: list[str]) -> list[str]:\n",
+    "    \"\"\"Tokenizes a list of texts using a regex tokenizer.\"\"\"\n",
+    "    all_tokens = []\n",
+    "    for text in texts:\n",
+    "        tokens = my_regex.findall(text.lower())\n",
+    "        all_tokens.extend(tokens)\n",
+    "    return all_tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tokenize the recipe titles\n",
+    "tokens = tokenize_texts(recipes)\n",
+    "\n",
+    "# Count the frequency of each token\n",
+    "token_counts = Counter()\n",
+    "batch_size = 10000 \n",
+    "for i in range(0, len(tokens), batch_size):\n",
+    "    batch = tokens[i:i+batch_size]\n",
+    "    token_counts.update(batch)\n",
+    "\n",
+    "# Define vocabulary size and get the most common tokens\n",
+    "vocab_size = 30000\n",
+    "vocab = [word for word, count in token_counts.most_common(vocab_size)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 8/8 [00:08<00:00,  1.04s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Choose a Sentence Transformer model\n",
+    "model_name = \"BAAI/bge-small-en-v1.5\"\n",
+    "\n",
+    "# Distill a model2vec model using the Sentence Transformer model and the custom vocab\n",
+    "model_custom = distill(model_name=model_name, vocabulary=vocab, pca_dims=256)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most similar recipes to 'cheeseburger':\n",
+      "Title: Cheeseburger Salad, Similarity Score: 0.9528\n",
+      "Title: Cheeseburger Casserole, Similarity Score: 0.9030\n",
+      "Title: Cheeseburger Chowder, Similarity Score: 0.8635\n",
+      "Title: Cheeseburger Pie, Similarity Score: 0.8401\n",
+      "Title: Cheeseburger Meatloaf, Similarity Score: 0.8184\n",
+      "\n",
+      "Most similar recipes to 'fattoush':\n",
+      "Title: Fattoush, Similarity Score: 1.0000\n",
+      "Title: Fatoosh, Similarity Score: 0.7488\n",
+      "Title: Lebanese Fattoush, Similarity Score: 0.6344\n",
+      "Title: Arabic Fattoush Salad, Similarity Score: 0.6108\n",
+      "Title: Fattoush (Lebanese Salad), Similarity Score: 0.5669\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Find recipes using the custom distilled model\n",
+    "top_k = 5\n",
+    "\n",
+    "# Find the most similar recipes to the given queries\n",
+    "query = \"cheeseburger\"\n",
+    "results = find_most_similar_items(model_custom, recipes, query, top_k)\n",
+    "print(f\"Most similar recipes to '{query}':\")\n",
+    "for title, score in results:\n",
+    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "    \n",
+    "print()\n",
+    "\n",
+    "#'fattoush' is now in the custom vocabulary and will return related recipes.\n",
+    "query = \"fattoush\"\n",
+    "results = find_most_similar_items(model_custom, recipes, query, top_k)\n",
+    "print(f\"Most similar recipes to '{query}':\")\n",
+    "for title, score in results:\n",
+    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/uv.lock b/uv.lock
index f438d6f..d60e70a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -433,7 +433,7 @@ wheels = [
 
 [[package]]
 name = "model2vec"
-version = "0.1.0"
+version = "0.1.2"
 source = { editable = "." }
 dependencies = [
     { name = "click" },

From bf78b481d22540a909cfe7e74eae5dddb6010b30 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 09:26:46 +0200
Subject: [PATCH 2/8] Switched similarity function

---
 tutorials/recipe_search.ipynb | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
index 5f9d917..719c9c8 100644
--- a/tutorials/recipe_search.ipynb
+++ b/tutorials/recipe_search.ipynb
@@ -15,7 +15,7 @@
     "\n",
     "import numpy as np\n",
     "from datasets import load_dataset, Dataset\n",
-    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from sklearn.metrics import pairwise_distances\n",
     "\n",
     "from model2vec import StaticModel\n",
     "from model2vec.distill import distill"
@@ -66,15 +66,15 @@
     "    # Generate embedding for the query\n",
     "    query_embedding = model.encode(query).reshape(1, -1)\n",
     "\n",
-    "    # Calculate cosine similarities between the query and dataset\n",
-    "    similarities = cosine_similarity(query_embedding, embeddings)[0]\n",
+    "    # Calculate pairwise cosine distances between the query and dataset\n",
+    "    distances = pairwise_distances(query_embedding, embeddings, metric='cosine')[0]\n",
     "\n",
-    "    # Get the indices of the most similar items (sorted in descending order)\n",
-    "    most_similar_indices = np.argsort(similarities)[::-1]\n",
+    "    # Get the indices of the most similar items (sorted in ascending order because smaller distances are better)\n",
+    "    most_similar_indices = np.argsort(distances)\n",
     "\n",
-    "    # Get the top-k most similar titles and their scores\n",
+    "    # Get the top-k most similar titles and their scores (cosine distance to similarity: 1 - distance)\n",
     "    most_similar_titles = [dataset[i] for i in most_similar_indices[:top_k]]\n",
-    "    most_similar_scores = [similarities[i] for i in most_similar_indices[:top_k]]\n",
+    "    most_similar_scores = [1 - distances[i] for i in most_similar_indices[:top_k]]\n",
     "\n",
     "    # Combine titles and scores into a list of tuples\n",
     "    return list(zip(most_similar_titles, most_similar_scores))"

From 75c1a246082f8ed48bef12fc73235d605f660cd2 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 09:32:24 +0200
Subject: [PATCH 3/8] Moved encoding outside of function

---
 tutorials/recipe_search.ipynb | 79 ++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
index 719c9c8..498d3de 100644
--- a/tutorials/recipe_search.ipynb
+++ b/tutorials/recipe_search.ipynb
@@ -45,39 +45,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define a function to find the most similar titles in a dataset to a given query\n",
-    "def find_most_similar_items(model: StaticModel, dataset: list[str] , query: str , top_k=5) -> list[tuple[str, float]]:\n",
+    "def find_most_similar_items(model: StaticModel, embeddings: np.ndarray , query: str , top_k=5) -> list[tuple[int, float]]:\n",
     "    \"\"\"\n",
     "    Finds the most similar items in a dataset to the given query using the specified model.\n",
     "\n",
     "    :param model: The model used to generate embeddings.\n",
-    "    :param dataset: The dataset of recipe titles.\n",
+    "    :param embeddings: The embeddings of the dataset.\n",
     "    :param query: The query recipe title.\n",
     "    :param top_k: The number of most similar titles to return.\n",
     "    :return: A list of tuples containing the most similar titles and their cosine similarity scores.\n",
     "    \"\"\"\n",
-    "    # Generate embeddings for the entire dataset\n",
-    "    embeddings = model.encode(dataset)\n",
-    "\n",
     "    # Generate embedding for the query\n",
     "    query_embedding = model.encode(query).reshape(1, -1)\n",
     "\n",
-    "    # Calculate pairwise cosine distances between the query and dataset\n",
+    "    # Calculate pairwise cosine distances between the query and the precomputed embeddings\n",
     "    distances = pairwise_distances(query_embedding, embeddings, metric='cosine')[0]\n",
     "\n",
     "    # Get the indices of the most similar items (sorted in ascending order because smaller distances are better)\n",
     "    most_similar_indices = np.argsort(distances)\n",
     "\n",
-    "    # Get the top-k most similar titles and their scores (cosine distance to similarity: 1 - distance)\n",
-    "    most_similar_titles = [dataset[i] for i in most_similar_indices[:top_k]]\n",
+    "    # Convert distances to similarity scores (cosine similarity = 1 - cosine distance)\n",
     "    most_similar_scores = [1 - distances[i] for i in most_similar_indices[:top_k]]\n",
     "\n",
-    "    # Combine titles and scores into a list of tuples\n",
-    "    return list(zip(most_similar_titles, most_similar_scores))"
+    "    # Return the top-k most similar indices and similarity scores\n",
+    "    return list(zip(most_similar_indices[:top_k], most_similar_scores))"
    ]
   },
   {
@@ -93,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [
     {
@@ -122,18 +118,20 @@
     "\n",
     "# Find the most similar recipes to the given queries\n",
     "query = \"cheeseburger\"\n",
-    "results = find_most_similar_items(model_output, recipes, query, top_k)\n",
+    "embeddings = model_output.encode(recipes)\n",
+    "\n",
+    "results = find_most_similar_items(model_output, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
-    "for title, score in results:\n",
-    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "for idx, score in results:\n",
+    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
     "    \n",
     "print()\n",
     "\n",
     "query = \"fattoush\"\n",
-    "results = find_most_similar_items(model_output, recipes, query, top_k)\n",
+    "results = find_most_similar_items(model_output, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
-    "for title, score in results:\n",
-    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "for idx, score in results:\n",
+    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
     "    "
    ]
   },
@@ -150,7 +148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
@@ -165,11 +163,11 @@
       "Title: Cheeseburger Chowder, Similarity Score: 0.7989\n",
       "\n",
       "Most similar recipes to 'fattoush':\n",
-      "Title: Zucchini and Onion Pancake, Similarity Score: 0.0000\n",
-      "Title: Crab Ball, Similarity Score: 0.0000\n",
-      "Title: Shrimp Eggs Foo Yung, Similarity Score: 0.0000\n",
-      "Title: Thai Shrimp and Snow Peas, Similarity Score: 0.0000\n",
-      "Title: Charbroiled Salmon, Similarity Score: 0.0000\n"
+      "Title: Simple Macaroni and Cheese, Similarity Score: 0.0000\n",
+      "Title: Fresh Tomato and Cucumber Salad with Buttery Garlic Croutons, Similarity Score: 0.0000\n",
+      "Title: Grilled Cheese, Apple, and Thyme Sandwich, Similarity Score: 0.0000\n",
+      "Title: Poppin' Turkey Salad, Similarity Score: 0.0000\n",
+      "Title: Chili - The Heat is On!, Similarity Score: 0.0000\n"
      ]
     }
    ],
@@ -179,19 +177,20 @@
     "\n",
     "# Find the most similar recipes to the given queries\n",
     "query = \"cheeseburger\"\n",
-    "results = find_most_similar_items(model_glove, recipes, query, top_k)\n",
+    "embeddings = model_glove.encode(recipes)\n",
+    "\n",
+    "results = find_most_similar_items(model_glove, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
-    "for title, score in results:\n",
-    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "for idx, score in results:\n",
+    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
     "    \n",
     "print()\n",
     "\n",
-    "# NOTE: 'fattoush' is Out-Of-Vocabulary (OOV) for the GloVe model and will return a zero vector.\n",
     "query = \"fattoush\"\n",
-    "results = find_most_similar_items(model_glove, recipes, query, top_k)\n",
+    "results = find_most_similar_items(model_glove, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
-    "for title, score in results:\n",
-    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "for idx, score in results:\n",
+    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
     "    "
    ]
   },
@@ -258,7 +257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [
     {
@@ -287,19 +286,21 @@
     "\n",
     "# Find the most similar recipes to the given queries\n",
     "query = \"cheeseburger\"\n",
-    "results = find_most_similar_items(model_custom, recipes, query, top_k)\n",
+    "embeddings = model_custom.encode(recipes)\n",
+    "\n",
+    "results = find_most_similar_items(model_custom, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
-    "for title, score in results:\n",
-    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")\n",
+    "for idx, score in results:\n",
+    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
     "    \n",
     "print()\n",
     "\n",
-    "#'fattoush' is now in the custom vocabulary and will return related recipes.\n",
     "query = \"fattoush\"\n",
-    "results = find_most_similar_items(model_custom, recipes, query, top_k)\n",
+    "results = find_most_similar_items(model_custom, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
-    "for title, score in results:\n",
-    "    print(f\"Title: {title}, Similarity Score: {score:.4f}\")"
+    "for idx, score in results:\n",
+    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
+    "    "
    ]
   }
  ],

From 419d94f6836f6cba60fc84bd025322563d55c5c1 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 10:18:27 +0200
Subject: [PATCH 4/8] Updated code

---
 tutorials/recipe_search.ipynb | 231 ++++++++++++++++++++++------------
 1 file changed, 152 insertions(+), 79 deletions(-)

diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
index 498d3de..40dc429 100644
--- a/tutorials/recipe_search.ipynb
+++ b/tutorials/recipe_search.ipynb
@@ -1,5 +1,17 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Recipe Search using Model2Vec**\n",
+    "\n",
+    "This notebook demonstrates how to use the Model2Vec library to search for recipes based on a given query. Three modes of Model2Vec use are demonstrated:\n",
+    "1. **Using a pre-trained output vocab model**: Uses a pre-trained output embedding model. This is a very small model that uses a subword tokenizer. \n",
+    "2. **Using a pre-trained GloVe vocab model**: Uses a pre-trained GloVe vocab model. This is a larger model that uses a word tokenizer.\n",
+    "3. **Using a custom vocab model**: Uses a custom domain-specific vocab model that is distilled on a vocab created from the recipe dataset. "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -7,15 +19,16 @@
    "outputs": [],
    "source": [
     "# Install the necessary libraries\n",
-    "!pip install numpy datasets scikit-learn model2vec\n",
+    "!pip install numpy datasets scikit-learn transformers model2vec\n",
     "    \n",
     "# Import the necessary libraries\n",
     "import regex\n",
     "from collections import Counter\n",
     "\n",
     "import numpy as np\n",
-    "from datasets import load_dataset, Dataset\n",
+    "from datasets import load_dataset\n",
     "from sklearn.metrics import pairwise_distances\n",
+    "from tokenizers.pre_tokenizers import Whitespace\n",
     "\n",
     "from model2vec import StaticModel\n",
     "from model2vec.distill import distill"
@@ -23,34 +36,106 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['title', 'url', 'category', 'author', 'description', 'rating', 'rating_count', 'review_count', 'ingredients', 'directions', 'prep_time', 'cook_time', 'total_time', 'servings', 'yields', 'calories', 'carbohydrates_g', 'sugars_g', 'fat_g', 'saturated_fat_g', 'cholesterol_mg', 'protein_g', 'dietary_fiber_g', 'sodium_mg', 'calories_from_fat', 'calcium_mg', 'iron_mg', 'magnesium_mg', 'potassium_mg', 'zinc_mg', 'phosphorus_mg', 'vitamin_a_iu_IU', 'niacin_equivalents_mg', 'vitamin_b6_mg', 'vitamin_c_mg', 'folate_mcg', 'thiamin_mg', 'riboflavin_mg', 'vitamin_e_iu_IU', 'vitamin_k_mcg', 'biotin_mcg', 'vitamin_b12_mcg', 'mono_fat_g', 'poly_fat_g', 'trans_fatty_acid_g', 'omega_3_fatty_acid_g', 'omega_6_fatty_acid_g', 'instructions_list', 'image']\n"
+      "                        title  \\\n",
+      "0  Simple Macaroni and Cheese   \n",
+      "1    Gourmet Mushroom Risotto   \n",
+      "2              Dessert Crepes   \n",
+      "3                 Pork Steaks   \n",
+      "4  Quick and Easy Pizza Crust   \n",
+      "\n",
+      "                                                 url              category  \\\n",
+      "0  https://www.allrecipes.com/recipe/238691/simpl...             main-dish   \n",
+      "1  https://www.allrecipes.com/recipe/85389/gourme...             main-dish   \n",
+      "2  https://www.allrecipes.com/recipe/19037/desser...  breakfast-and-brunch   \n",
+      "3  https://www.allrecipes.com/recipe/70463/pork-s...      meat-and-poultry   \n",
+      "4  https://www.allrecipes.com/recipe/20171/quick-...                 bread   \n",
+      "\n",
+      "                  author                                        description  \\\n",
+      "0            g0dluvsugly  A very quick and easy fix to a tasty side-dish...   \n",
+      "1  Myleen Sagrado Sjödin  Authentic Italian-style risotto cooked the slo...   \n",
+      "2                  ANN57  Essential crepe recipe.  Sprinkle warm crepes ...   \n",
+      "3           BABYLOVE1222  My mom came up with this recipe when I was a c...   \n",
+      "4             CHEF RIDER  This is a great recipe when you don't want to ...   \n",
+      "\n",
+      "   rating  rating_count  review_count  \\\n",
+      "0    4.42           834           575   \n",
+      "1    4.80          3388          2245   \n",
+      "2    4.80          1156           794   \n",
+      "3    4.57           689           539   \n",
+      "4    4.70          3741          2794   \n",
+      "\n",
+      "                                         ingredients  \\\n",
+      "0  1 (8 ounce) box elbow macaroni ; ¼ cup butter ...   \n",
+      "1  6 cups chicken broth, divided ; 3 tablespoons ...   \n",
+      "2  4  eggs, lightly beaten ; 1 ⅓ cups milk ; 2 ta...   \n",
+      "3  ¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...   \n",
+      "4  1 (.25 ounce) package active dry yeast ; 1 tea...   \n",
+      "\n",
+      "                                          directions  ... vitamin_k_mcg  \\\n",
+      "0  Bring a large pot of lightly salted water to a...  ...           NaN   \n",
+      "1  In a saucepan, warm the broth over low heat. W...  ...           NaN   \n",
+      "2  In large bowl, whisk together eggs, milk, melt...  ...           NaN   \n",
+      "3  Melt butter in a skillet, and mix in the soy s...  ...           NaN   \n",
+      "4  Preheat oven to 450 degrees F (230 degrees C)....  ...           NaN   \n",
+      "\n",
+      "  biotin_mcg vitamin_b12_mcg  mono_fat_g poly_fat_g  trans_fatty_acid_g  \\\n",
+      "0        NaN             NaN         NaN        NaN                 NaN   \n",
+      "1        NaN             NaN         NaN        NaN                 NaN   \n",
+      "2        NaN             NaN         NaN        NaN                 NaN   \n",
+      "3        NaN             NaN         NaN        NaN                 NaN   \n",
+      "4        NaN             NaN         NaN        NaN                 NaN   \n",
+      "\n",
+      "   omega_3_fatty_acid_g  omega_6_fatty_acid_g  \\\n",
+      "0                   NaN                   NaN   \n",
+      "1                   NaN                   NaN   \n",
+      "2                   NaN                   NaN   \n",
+      "3                   NaN                   NaN   \n",
+      "4                   NaN                   NaN   \n",
+      "\n",
+      "                                   instructions_list  \\\n",
+      "0  ['Bring a large pot of lightly salted water to...   \n",
+      "1  ['Warm broth in a saucepan over low heat.', 'M...   \n",
+      "2  ['Whisk together eggs, milk, flour, melted but...   \n",
+      "3  ['Melt butter in a skillet over medium heat; s...   \n",
+      "4  ['Preheat oven to 450 degrees F (230 degrees C...   \n",
+      "\n",
+      "                                               image  \n",
+      "0  https://www.allrecipes.com/thmb/GZrTl8DBwmRuor...  \n",
+      "1  https://www.allrecipes.com/thmb/xCk4IEjfAYBikO...  \n",
+      "2  https://www.allrecipes.com/thmb/VwULr05JFDluPI...  \n",
+      "3  https://www.allrecipes.com/thmb/mYkvln7o9pb35l...  \n",
+      "4  https://www.allrecipes.com/thmb/V3Llo-ottudIs_...  \n",
+      "\n",
+      "[5 rows x 49 columns]\n"
      ]
     }
    ],
    "source": [
     "# Load the recipe dataset\n",
     "dataset = load_dataset(\"Shengtao/recipe\", split=\"train\")\n",
-    "# Show the column names\n",
-    "print(dataset.column_names)\n",
+    "# Convert the dataset to a pandas DataFrame\n",
+    "dataset = dataset.to_pandas()\n",
+    "# Display the first few rows of the dataset\n",
+    "print(dataset.head())\n",
     "# Take the title column as our recipes corpus\n",
     "recipes = dataset[\"title\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define a function to find the most similar titles in a dataset to a given query\n",
-    "def find_most_similar_items(model: StaticModel, embeddings: np.ndarray , query: str , top_k=5) -> list[tuple[int, float]]:\n",
+    "def find_most_similar_items(model: StaticModel, embeddings: np.ndarray, query: str, top_k=5) -> list[tuple[int, float]]:\n",
     "    \"\"\"\n",
     "    Finds the most similar items in a dataset to the given query using the specified model.\n",
     "\n",
@@ -61,7 +146,7 @@
     "    :return: A list of tuples containing the most similar titles and their cosine similarity scores.\n",
     "    \"\"\"\n",
     "    # Generate embedding for the query\n",
-    "    query_embedding = model.encode(query).reshape(1, -1)\n",
+    "    query_embedding = model.encode(query)[None, :]\n",
     "\n",
     "    # Calculate pairwise cosine distances between the query and the precomputed embeddings\n",
     "    distances = pairwise_distances(query_embedding, embeddings, metric='cosine')[0]\n",
@@ -89,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 91,
    "metadata": {},
    "outputs": [
     {
@@ -97,18 +182,18 @@
      "output_type": "stream",
      "text": [
       "Most similar recipes to 'cheeseburger':\n",
-      "Title: Double Cheeseburger, Similarity Score: 0.9028\n",
-      "Title: Cheeseburger Chowder, Similarity Score: 0.8574\n",
-      "Title: Cheeseburger Sliders, Similarity Score: 0.8413\n",
-      "Title: Cheeseburger Salad, Similarity Score: 0.8384\n",
-      "Title: Cheeseburger Soup I, Similarity Score: 0.8298\n",
+      "Title: `Double Cheeseburger`, Similarity Score: 0.9028\n",
+      "Title: `Cheeseburger Chowder`, Similarity Score: 0.8574\n",
+      "Title: `Cheeseburger Sliders`, Similarity Score: 0.8413\n",
+      "Title: `Cheeseburger Salad`, Similarity Score: 0.8384\n",
+      "Title: `Cheeseburger Soup I`, Similarity Score: 0.8298\n",
       "\n",
       "Most similar recipes to 'fattoush':\n",
-      "Title: Fattoush, Similarity Score: 1.0000\n",
-      "Title: Lebanese Fattoush, Similarity Score: 0.8370\n",
-      "Title: Aunty Terese's Fattoush, Similarity Score: 0.7630\n",
-      "Title: Arabic Fattoush Salad, Similarity Score: 0.7588\n",
-      "Title: Authentic Lebanese Fattoush, Similarity Score: 0.7584\n"
+      "Title: `Fattoush`, Similarity Score: 1.0000\n",
+      "Title: `Lebanese Fattoush`, Similarity Score: 0.8370\n",
+      "Title: `Aunty Terese's Fattoush`, Similarity Score: 0.7630\n",
+      "Title: `Arabic Fattoush Salad`, Similarity Score: 0.7588\n",
+      "Title: `Authentic Lebanese Fattoush`, Similarity Score: 0.7584\n"
      ]
     }
    ],
@@ -123,7 +208,7 @@
     "results = find_most_similar_items(model_output, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
     "for idx, score in results:\n",
-    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
+    "    print(f\"Title: `{recipes[idx]}`, Similarity Score: {score:.4f}\")\n",
     "    \n",
     "print()\n",
     "\n",
@@ -131,7 +216,7 @@
     "results = find_most_similar_items(model_output, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
     "for idx, score in results:\n",
-    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
+    "    print(f\"Title: `{recipes[idx]}`, Similarity Score: {score:.4f}\")\n",
     "    "
    ]
   },
@@ -148,7 +233,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 92,
    "metadata": {},
    "outputs": [
     {
@@ -156,18 +241,18 @@
      "output_type": "stream",
      "text": [
       "Most similar recipes to 'cheeseburger':\n",
-      "Title: Double Cheeseburger, Similarity Score: 0.8744\n",
-      "Title: Cheeseburger Meatloaf, Similarity Score: 0.8246\n",
-      "Title: Cheeseburger Salad, Similarity Score: 0.8160\n",
-      "Title: Hearty American Cheeseburger, Similarity Score: 0.8006\n",
-      "Title: Cheeseburger Chowder, Similarity Score: 0.7989\n",
+      "Title: `Double Cheeseburger`, Similarity Score: 0.8744\n",
+      "Title: `Cheeseburger Meatloaf`, Similarity Score: 0.8246\n",
+      "Title: `Cheeseburger Salad`, Similarity Score: 0.8160\n",
+      "Title: `Hearty American Cheeseburger`, Similarity Score: 0.8006\n",
+      "Title: `Cheeseburger Chowder`, Similarity Score: 0.7989\n",
       "\n",
       "Most similar recipes to 'fattoush':\n",
-      "Title: Simple Macaroni and Cheese, Similarity Score: 0.0000\n",
-      "Title: Fresh Tomato and Cucumber Salad with Buttery Garlic Croutons, Similarity Score: 0.0000\n",
-      "Title: Grilled Cheese, Apple, and Thyme Sandwich, Similarity Score: 0.0000\n",
-      "Title: Poppin' Turkey Salad, Similarity Score: 0.0000\n",
-      "Title: Chili - The Heat is On!, Similarity Score: 0.0000\n"
+      "Title: `Simple Macaroni and Cheese`, Similarity Score: 0.0000\n",
+      "Title: `Fresh Tomato and Cucumber Salad with Buttery Garlic Croutons`, Similarity Score: 0.0000\n",
+      "Title: `Grilled Cheese, Apple, and Thyme Sandwich`, Similarity Score: 0.0000\n",
+      "Title: `Poppin' Turkey Salad`, Similarity Score: 0.0000\n",
+      "Title: `Chili - The Heat is On!`, Similarity Score: 0.0000\n"
      ]
     }
    ],
@@ -182,7 +267,7 @@
     "results = find_most_similar_items(model_glove, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
     "for idx, score in results:\n",
-    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
+    "    print(f\"Title: `{recipes[idx]}`, Similarity Score: {score:.4f}\")\n",
     "    \n",
     "print()\n",
     "\n",
@@ -190,53 +275,37 @@
     "results = find_most_similar_items(model_glove, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
     "for idx, score in results:\n",
-    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
+    "    print(f\"Title: `{recipes[idx]}`, Similarity Score: {score:.4f}\")\n",
     "    "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 85,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Set up a regex tokenizer to split texts into words and punctuation\n",
     "my_regex = regex.compile(r\"\\w+|[^\\w\\s]+\")\n",
     "\n",
-    "# Function to tokenize texts with a progress bar\n",
-    "def tokenize_texts(texts: list[str]) -> list[str]:\n",
-    "    \"\"\"Tokenizes a list of texts using a regex tokenizer.\"\"\"\n",
-    "    all_tokens = []\n",
+    "def create_vocab(texts: list[str], tokenizer, size: int = 30_000) -> Counter[str]:\n",
+    "    \"\"\"Create a vocab from a list of texts.\"\"\"\n",
+    "    counts = Counter()\n",
     "    for text in texts:\n",
-    "        tokens = my_regex.findall(text.lower())\n",
-    "        all_tokens.extend(tokens)\n",
-    "    return all_tokens"
+    "        #tokens = tokenizer(text)\n",
+    "        \n",
+    "        #tokens = tokenizer.tokenize(text)\n",
+    "        tokens = tokenizer.pre_tokenize_str(text.lower())\n",
+    "        tokens = [token for token, _ in tokens]\n",
+    "        #tokens = my_regex.findall(text.lower())\n",
+    "        counts.update(tokens)\n",
+    "    vocab = [word for word, _ in counts.most_common(size)]\n",
+    "    return vocab"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Tokenize the recipe titles\n",
-    "tokens = tokenize_texts(recipes)\n",
-    "\n",
-    "# Count the frequency of each token\n",
-    "token_counts = Counter()\n",
-    "batch_size = 10000 \n",
-    "for i in range(0, len(tokens), batch_size):\n",
-    "    batch = tokens[i:i+batch_size]\n",
-    "    token_counts.update(batch)\n",
-    "\n",
-    "# Define vocabulary size and get the most common tokens\n",
-    "vocab_size = 30000\n",
-    "vocab = [word for word, count in token_counts.most_common(vocab_size)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 88,
    "metadata": {},
    "outputs": [
     {
@@ -248,8 +317,12 @@
     }
    ],
    "source": [
-    "# Choose a Sentence Transformer model\n",
+    "# Choose a Sentence Transformer model and a tokenizer\n",
     "model_name = \"BAAI/bge-small-en-v1.5\"\n",
+    "tokenizer = Whitespace()\n",
+    "\n",
+    "# Create a custom vocab from the recipe titles\n",
+    "vocab = create_vocab(recipes, tokenizer)\n",
     "\n",
     "# Distill a model2vec model using the Sentence Transformer model and the custom vocab\n",
     "model_custom = distill(model_name=model_name, vocabulary=vocab, pca_dims=256)"
@@ -257,7 +330,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 93,
    "metadata": {},
    "outputs": [
     {
@@ -265,18 +338,18 @@
      "output_type": "stream",
      "text": [
       "Most similar recipes to 'cheeseburger':\n",
-      "Title: Cheeseburger Salad, Similarity Score: 0.9528\n",
-      "Title: Cheeseburger Casserole, Similarity Score: 0.9030\n",
-      "Title: Cheeseburger Chowder, Similarity Score: 0.8635\n",
-      "Title: Cheeseburger Pie, Similarity Score: 0.8401\n",
-      "Title: Cheeseburger Meatloaf, Similarity Score: 0.8184\n",
+      "Title: `Cheeseburger Salad`, Similarity Score: 0.9528\n",
+      "Title: `Cheeseburger Casserole`, Similarity Score: 0.9030\n",
+      "Title: `Cheeseburger Chowder`, Similarity Score: 0.8635\n",
+      "Title: `Cheeseburger Pie`, Similarity Score: 0.8401\n",
+      "Title: `Cheeseburger Meatloaf`, Similarity Score: 0.8184\n",
       "\n",
       "Most similar recipes to 'fattoush':\n",
-      "Title: Fattoush, Similarity Score: 1.0000\n",
-      "Title: Fatoosh, Similarity Score: 0.7488\n",
-      "Title: Lebanese Fattoush, Similarity Score: 0.6344\n",
-      "Title: Arabic Fattoush Salad, Similarity Score: 0.6108\n",
-      "Title: Fattoush (Lebanese Salad), Similarity Score: 0.5669\n"
+      "Title: `Fattoush`, Similarity Score: 1.0000\n",
+      "Title: `Fatoosh`, Similarity Score: 0.7488\n",
+      "Title: `Lebanese Fattoush`, Similarity Score: 0.6344\n",
+      "Title: `Arabic Fattoush Salad`, Similarity Score: 0.6108\n",
+      "Title: `Fattoush (Lebanese Salad)`, Similarity Score: 0.5669\n"
      ]
     }
    ],
@@ -291,7 +364,7 @@
     "results = find_most_similar_items(model_custom, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
     "for idx, score in results:\n",
-    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
+    "    print(f\"Title: `{recipes[idx]}`, Similarity Score: {score:.4f}\")\n",
     "    \n",
     "print()\n",
     "\n",
@@ -299,7 +372,7 @@
     "results = find_most_similar_items(model_custom, embeddings, query, top_k)\n",
     "print(f\"Most similar recipes to '{query}':\")\n",
     "for idx, score in results:\n",
-    "    print(f\"Title: {recipes[idx]}, Similarity Score: {score:.4f}\")\n",
+    "    print(f\"Title: `{recipes[idx]}`, Similarity Score: {score:.4f}\")\n",
     "    "
    ]
   }

From 818677982e990356237743110dc7fefb24727b32 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 10:32:59 +0200
Subject: [PATCH 5/8] Added docs

---
 tutorials/recipe_search.ipynb | 267 ++++++++++++++++++++++------------
 1 file changed, 178 insertions(+), 89 deletions(-)

diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
index 40dc429..9a145ee 100644
--- a/tutorials/recipe_search.ipynb
+++ b/tutorials/recipe_search.ipynb
@@ -36,98 +36,136 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 96,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "                        title  \\\n",
-      "0  Simple Macaroni and Cheese   \n",
-      "1    Gourmet Mushroom Risotto   \n",
-      "2              Dessert Crepes   \n",
-      "3                 Pork Steaks   \n",
-      "4  Quick and Easy Pizza Crust   \n",
-      "\n",
-      "                                                 url              category  \\\n",
-      "0  https://www.allrecipes.com/recipe/238691/simpl...             main-dish   \n",
-      "1  https://www.allrecipes.com/recipe/85389/gourme...             main-dish   \n",
-      "2  https://www.allrecipes.com/recipe/19037/desser...  breakfast-and-brunch   \n",
-      "3  https://www.allrecipes.com/recipe/70463/pork-s...      meat-and-poultry   \n",
-      "4  https://www.allrecipes.com/recipe/20171/quick-...                 bread   \n",
-      "\n",
-      "                  author                                        description  \\\n",
-      "0            g0dluvsugly  A very quick and easy fix to a tasty side-dish...   \n",
-      "1  Myleen Sagrado Sjödin  Authentic Italian-style risotto cooked the slo...   \n",
-      "2                  ANN57  Essential crepe recipe.  Sprinkle warm crepes ...   \n",
-      "3           BABYLOVE1222  My mom came up with this recipe when I was a c...   \n",
-      "4             CHEF RIDER  This is a great recipe when you don't want to ...   \n",
-      "\n",
-      "   rating  rating_count  review_count  \\\n",
-      "0    4.42           834           575   \n",
-      "1    4.80          3388          2245   \n",
-      "2    4.80          1156           794   \n",
-      "3    4.57           689           539   \n",
-      "4    4.70          3741          2794   \n",
-      "\n",
-      "                                         ingredients  \\\n",
-      "0  1 (8 ounce) box elbow macaroni ; ¼ cup butter ...   \n",
-      "1  6 cups chicken broth, divided ; 3 tablespoons ...   \n",
-      "2  4  eggs, lightly beaten ; 1 ⅓ cups milk ; 2 ta...   \n",
-      "3  ¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...   \n",
-      "4  1 (.25 ounce) package active dry yeast ; 1 tea...   \n",
-      "\n",
-      "                                          directions  ... vitamin_k_mcg  \\\n",
-      "0  Bring a large pot of lightly salted water to a...  ...           NaN   \n",
-      "1  In a saucepan, warm the broth over low heat. W...  ...           NaN   \n",
-      "2  In large bowl, whisk together eggs, milk, melt...  ...           NaN   \n",
-      "3  Melt butter in a skillet, and mix in the soy s...  ...           NaN   \n",
-      "4  Preheat oven to 450 degrees F (230 degrees C)....  ...           NaN   \n",
-      "\n",
-      "  biotin_mcg vitamin_b12_mcg  mono_fat_g poly_fat_g  trans_fatty_acid_g  \\\n",
-      "0        NaN             NaN         NaN        NaN                 NaN   \n",
-      "1        NaN             NaN         NaN        NaN                 NaN   \n",
-      "2        NaN             NaN         NaN        NaN                 NaN   \n",
-      "3        NaN             NaN         NaN        NaN                 NaN   \n",
-      "4        NaN             NaN         NaN        NaN                 NaN   \n",
-      "\n",
-      "   omega_3_fatty_acid_g  omega_6_fatty_acid_g  \\\n",
-      "0                   NaN                   NaN   \n",
-      "1                   NaN                   NaN   \n",
-      "2                   NaN                   NaN   \n",
-      "3                   NaN                   NaN   \n",
-      "4                   NaN                   NaN   \n",
-      "\n",
-      "                                   instructions_list  \\\n",
-      "0  ['Bring a large pot of lightly salted water to...   \n",
-      "1  ['Warm broth in a saucepan over low heat.', 'M...   \n",
-      "2  ['Whisk together eggs, milk, flour, melted but...   \n",
-      "3  ['Melt butter in a skillet over medium heat; s...   \n",
-      "4  ['Preheat oven to 450 degrees F (230 degrees C...   \n",
-      "\n",
-      "                                               image  \n",
-      "0  https://www.allrecipes.com/thmb/GZrTl8DBwmRuor...  \n",
-      "1  https://www.allrecipes.com/thmb/xCk4IEjfAYBikO...  \n",
-      "2  https://www.allrecipes.com/thmb/VwULr05JFDluPI...  \n",
-      "3  https://www.allrecipes.com/thmb/mYkvln7o9pb35l...  \n",
-      "4  https://www.allrecipes.com/thmb/V3Llo-ottudIs_...  \n",
-      "\n",
-      "[5 rows x 49 columns]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Load the recipe dataset\n",
     "dataset = load_dataset(\"Shengtao/recipe\", split=\"train\")\n",
     "# Convert the dataset to a pandas DataFrame\n",
     "dataset = dataset.to_pandas()\n",
-    "# Display the first few rows of the dataset\n",
-    "print(dataset.head())\n",
     "# Take the title column as our recipes corpus\n",
     "recipes = dataset[\"title\"]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>category</th>\n",
+       "      <th>description</th>\n",
+       "      <th>ingredients</th>\n",
+       "      <th>directions</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Simple Macaroni and Cheese</td>\n",
+       "      <td>main-dish</td>\n",
+       "      <td>A very quick and easy fix to a tasty side-dish...</td>\n",
+       "      <td>1 (8 ounce) box elbow macaroni ; ¼ cup butter ...</td>\n",
+       "      <td>Bring a large pot of lightly salted water to a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Gourmet Mushroom Risotto</td>\n",
+       "      <td>main-dish</td>\n",
+       "      <td>Authentic Italian-style risotto cooked the slo...</td>\n",
+       "      <td>6 cups chicken broth, divided ; 3 tablespoons ...</td>\n",
+       "      <td>In a saucepan, warm the broth over low heat. W...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Dessert Crepes</td>\n",
+       "      <td>breakfast-and-brunch</td>\n",
+       "      <td>Essential crepe recipe.  Sprinkle warm crepes ...</td>\n",
+       "      <td>4  eggs, lightly beaten ; 1 ⅓ cups milk ; 2 ta...</td>\n",
+       "      <td>In large bowl, whisk together eggs, milk, melt...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Pork Steaks</td>\n",
+       "      <td>meat-and-poultry</td>\n",
+       "      <td>My mom came up with this recipe when I was a c...</td>\n",
+       "      <td>¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...</td>\n",
+       "      <td>Melt butter in a skillet, and mix in the soy s...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Quick and Easy Pizza Crust</td>\n",
+       "      <td>bread</td>\n",
+       "      <td>This is a great recipe when you don't want to ...</td>\n",
+       "      <td>1 (.25 ounce) package active dry yeast ; 1 tea...</td>\n",
+       "      <td>Preheat oven to 450 degrees F (230 degrees C)....</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        title              category  \\\n",
+       "0  Simple Macaroni and Cheese             main-dish   \n",
+       "1    Gourmet Mushroom Risotto             main-dish   \n",
+       "2              Dessert Crepes  breakfast-and-brunch   \n",
+       "3                 Pork Steaks      meat-and-poultry   \n",
+       "4  Quick and Easy Pizza Crust                 bread   \n",
+       "\n",
+       "                                         description  \\\n",
+       "0  A very quick and easy fix to a tasty side-dish...   \n",
+       "1  Authentic Italian-style risotto cooked the slo...   \n",
+       "2  Essential crepe recipe.  Sprinkle warm crepes ...   \n",
+       "3  My mom came up with this recipe when I was a c...   \n",
+       "4  This is a great recipe when you don't want to ...   \n",
+       "\n",
+       "                                         ingredients  \\\n",
+       "0  1 (8 ounce) box elbow macaroni ; ¼ cup butter ...   \n",
+       "1  6 cups chicken broth, divided ; 3 tablespoons ...   \n",
+       "2  4  eggs, lightly beaten ; 1 ⅓ cups milk ; 2 ta...   \n",
+       "3  ¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...   \n",
+       "4  1 (.25 ounce) package active dry yeast ; 1 tea...   \n",
+       "\n",
+       "                                          directions  \n",
+       "0  Bring a large pot of lightly salted water to a...  \n",
+       "1  In a saucepan, warm the broth over low heat. W...  \n",
+       "2  In large bowl, whisk together eggs, milk, melt...  \n",
+       "3  Melt butter in a skillet, and mix in the soy s...  \n",
+       "4  Preheat oven to 450 degrees F (230 degrees C)....  "
+      ]
+     },
+     "execution_count": 97,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the first few rows of the dataset for the specified columns\n",
+    "dataset[[\"title\", \"category\", \"description\", \"ingredients\", \"directions\"]].head()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 46,
@@ -161,6 +199,15 @@
     "    return list(zip(most_similar_indices[:top_k], most_similar_scores))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using a pre-trained output vocab model**\n",
+    "\n",
+    "In this part, we will use a pre-trained output vocab model to encode the recipes and search using multiple queries. The output vocab model is very small and fast while still providing good results. Since the model uses a sub-word tokenizer, it is able to handle out-of-vocabulary words and provide good results even for words that are not in the base vocab."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -220,6 +267,22 @@
     "    "
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As can be seen, we get some good results for the queries. The model is able to find recipes that are similar to the query."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using a pre-trained output vocab model**\n",
+    "\n",
+    "In this part, we will use a pre-trained glove vocab model to encode the recipes and search using multiple queries. The glove vocab model is a bit larger and slower than the output vocab model but can provide better results. However, as we will see, it suffers from the out-of-vocabulary problem."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -279,25 +342,44 @@
     "    "
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As can be seen, we get good results when we search for an in-vocabulary query (`cheeseburger`), but when we search for an out-of-vocabulary query (`fattoush`), the model is not able to find any relevant recipes. To fix this, we will now distill a custom vocab model on the recipe dataset."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using a custom vocab model**\n",
+    "\n",
+    "In this part, we will distill a custom vocab model on the recipe dataset and use it to encode the recipes and search using multiple queries. This will create a domain-specific model2vec model. "
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 98,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Set up a regex tokenizer to split texts into words and punctuation\n",
     "my_regex = regex.compile(r\"\\w+|[^\\w\\s]+\")\n",
     "\n",
-    "def create_vocab(texts: list[str], tokenizer, size: int = 30_000) -> Counter[str]:\n",
-    "    \"\"\"Create a vocab from a list of texts.\"\"\"\n",
+    "def create_vocab(texts: list[str], tokenizer: Whitespace, size: int = 30_000) -> list[str]:\n",
+    "    \"\"\"\n",
+    "    Create a vocab from a list of texts.\n",
+    "    \n",
+    "    :param texts: A list of texts.\n",
+    "    :param tokenizer: A whitespace tokenizer.\n",
+    "    :param size: The size of the vocab.\n",
+    "    :return: A vocab sorted by frequency.\n",
+    "    \"\"\"\n",
     "    counts = Counter()\n",
     "    for text in texts:\n",
-    "        #tokens = tokenizer(text)\n",
-    "        \n",
-    "        #tokens = tokenizer.tokenize(text)\n",
     "        tokens = tokenizer.pre_tokenize_str(text.lower())\n",
     "        tokens = [token for token, _ in tokens]\n",
-    "        #tokens = my_regex.findall(text.lower())\n",
     "        counts.update(tokens)\n",
     "    vocab = [word for word, _ in counts.most_common(size)]\n",
     "    return vocab"
@@ -375,6 +457,13 @@
     "    print(f\"Title: `{recipes[idx]}`, Similarity Score: {score:.4f}\")\n",
     "    "
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As can be seen, we now get good results for both queries with our custom vocab model."
+   ]
   }
  ],
  "metadata": {

From 86b0c9008698dfa615eb7dbadc71eba7958b3f99 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 10:36:14 +0200
Subject: [PATCH 6/8] Updated docs

---
 tutorials/recipe_search.ipynb | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
index 9a145ee..dbb462e 100644
--- a/tutorials/recipe_search.ipynb
+++ b/tutorials/recipe_search.ipynb
@@ -166,6 +166,13 @@
     "dataset[[\"title\", \"category\", \"description\", \"ingredients\", \"directions\"]].head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, we will set up a function to handle similarity search that we can use in this tutorial."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 46,
@@ -355,7 +362,7 @@
    "source": [
     "**Using a custom vocab model**\n",
     "\n",
-    "In this part, we will distill a custom vocab model on the recipe dataset and use it to encode the recipes and search using multiple queries. This will create a domain-specific model2vec model. "
+    "In this part, we will distill a custom vocab model on the recipe dataset and use it to encode the recipes and search using multiple queries. This will create a domain-specific model2vec model. First, we will set up a function to create a vocabulary from a list of texts (in our case, a list of recipe titles)."
    ]
   },
   {
@@ -462,7 +469,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As can be seen, we now get good results for both queries with our custom vocab model."
+    "As can be seen, we now get good results for both queries with our custom vocab model since the domain-specific terms are included in the vocab."
    ]
   }
  ],

From d90f2d8db57b952fe7ddcd7c7b4c087d5ac7409a Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 10:37:33 +0200
Subject: [PATCH 7/8] Updated docs

---
 tutorials/recipe_search.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
index dbb462e..36516c8 100644
--- a/tutorials/recipe_search.ipynb
+++ b/tutorials/recipe_search.ipynb
@@ -287,7 +287,7 @@
    "source": [
     "**Using a pre-trained output vocab model**\n",
     "\n",
-    "In this part, we will use a pre-trained glove vocab model to encode the recipes and search using multiple queries. The glove vocab model is a bit larger and slower than the output vocab model but can provide better results. However, as we will see, it suffers from the out-of-vocabulary problem."
+    "In this part, we will use a pre-trained glove vocab model to encode the recipes and search using multiple queries. The glove vocab model is a bit larger and slower than the output vocab model but can provide better results. However, as we will see, it suffers from the out-of-vocabulary problem, since the glove vocab is not designed for the cooking recipe domain."
    ]
   },
   {

From 2e18fdc76b8936518d990f60730be76791c91a16 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Tue, 24 Sep 2024 10:40:50 +0200
Subject: [PATCH 8/8] Updated docs

---
 tutorials/recipe_search.ipynb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tutorials/recipe_search.ipynb b/tutorials/recipe_search.ipynb
index 36516c8..6158d5c 100644
--- a/tutorials/recipe_search.ipynb
+++ b/tutorials/recipe_search.ipynb
@@ -6,7 +6,10 @@
    "source": [
     "**Recipe Search using Model2Vec**\n",
     "\n",
-    "This notebook demonstrates how to use the Model2Vec library to search for recipes based on a given query. Three modes of Model2Vec use are demonstrated:\n",
+    "This notebook demonstrates how to use the Model2Vec library to search for recipes based on a given query. We will use the [recipe dataset](https://huggingface.co/datasets/Shengtao/recipe).\n",
+    "We will use `model2vec` in different modes to search for recipes based on a query, using both our own pre-trained models and a domain-specific model that we will distill ourselves in this tutorial.\n",
+    "\n",
+    "Three modes of Model2Vec use are demonstrated:\n",
     "1. **Using a pre-trained output vocab model**: Uses a pre-trained output embedding model. This is a very small model that uses a subword tokenizer. \n",
     "2. **Using a pre-trained glove vocab model**: Uses pre-trained glove vocab model. This is a larger model that uses a word tokenizer.\n",
     "3. **Using a custom vocab model**: Uses a custom domain-specific vocab model that is distilled on a vocab created from the recipe dataset. "