From 251e5ef72c44a99097d17ae87ca534e236a9c7ff Mon Sep 17 00:00:00 2001
From: Md Fazlul Karim
Date: Fri, 24 Nov 2023 18:53:56 +0600
Subject: [PATCH 1/2] MNIST Updated

---
 docs/hr/content/use_cases/mnist_torch.md | 133 +++++++++++++++--------
 1 file changed, 85 insertions(+), 48 deletions(-)

diff --git a/docs/hr/content/use_cases/mnist_torch.md b/docs/hr/content/use_cases/mnist_torch.md
index 84a218ca2..65cad2aa3 100644
--- a/docs/hr/content/use_cases/mnist_torch.md
+++ b/docs/hr/content/use_cases/mnist_torch.md
@@ -1,29 +1,39 @@
 # MNIST in Database

-## Training and Maintaining MNIST Predictions with SuperDuperDB
+## Training and Managing MNIST Predictions in SuperDuperDB

-This notebook outlines the process of implementing a classic machine learning classification task - MNIST handwritten digit recognition, using a convolutional neural network. However, we introduce a unique twist by performing the task in a database using SuperDuperDB.
+This notebook guides you through the implementation of a classic machine learning task: MNIST handwritten digit recognition. The twist? We perform the task directly in a database using SuperDuperDB.

-## Prerequisites
+This example makes it easy to connect any of your image recognition
+models directly to your database in real time. With SuperDuperDB, you can
+skip complicated MLOps pipelines. It's a new, straightforward way to
+integrate your AI model with your data, ensuring simplicity, efficiency,
+and speed.

-Before diving into the implementation, ensure that you have the necessary libraries installed by running the following commands:
+## Prerequisites
+Before diving into the implementation, ensure that you have the
+necessary libraries installed by running the following commands:

```python
!pip install superduperdb
!pip install torch torchvision matplotlib
```

-## Connect to datastore
+## Connect to datastore

-First, we need to establish a connection to a MongoDB datastore via SuperDuperDB. You can configure the `MongoDB_URI` based on your specific setup.
-Here are some examples of MongoDB URIs:
+First, we need to establish a connection to a MongoDB datastore via
+SuperDuperDB. You can configure the `MONGODB_URI` based on your specific
+setup.

-* For testing (default connection): `mongomock://test`
-* Local MongoDB instance: `mongodb://localhost:27017`
-* MongoDB with authentication: `mongodb://superduper:superduper@mongodb:27017/documents`
-* MongoDB Atlas: `mongodb+srv://<username>:<password>@<cluster>/<database>`
+Here are some examples of MongoDB URIs:
+
+- For testing (default connection): `mongomock://test`
+- Local MongoDB instance: `mongodb://localhost:27017`
+- MongoDB with authentication:
+  `mongodb://superduper:superduper@mongodb:27017/documents`
+- MongoDB Atlas:
+  `mongodb+srv://<username>:<password>@<cluster>/<database>`

```python
from superduperdb import superduper
@@ -31,18 +41,23 @@ from superduperdb.backends.mongodb import Collection
import os

mongodb_uri = os.getenv("MONGODB_URI","mongomock://test")
+
+# SuperDuperDB now handles your MongoDB database
+# It just 'super dupers' your existing database
db = superduper(mongodb_uri)

# Create a collection for MNIST
mnist_collection = Collection('mnist')
```

-
## Load Dataset

-After connecting to MongoDB, we add the MNIST dataset. SuperDuperDB excels at handling "difficult" data types, and we achieve this using an `Encoder`, which works in tandem with the `Document` wrappers. Together, they enable Python dictionaries containing non-JSONable or bytes objects to be inserted into the underlying data infrastructure.
- - +After establishing a connection to MongoDB, the next step is to load the +MNIST dataset. SuperDuperDB's strength lies in handling diverse data +types, especially those that are challenging. To achieve this, we use an +`Encoder` in conjunction with `Document` wrappers. These components +allow Python dictionaries containing non-JSONable or bytes objects to be +seamlessly inserted into the underlying data infrastructure. ```python import torchvision @@ -53,14 +68,18 @@ from superduperdb.backends.mongodb import Collection import random # Load MNIST images as Python objects using the Python Imaging Library. +# Each MNIST item is a tuple (image, label) mnist_data = list(torchvision.datasets.MNIST(root='./data', download=True)) + +# Create a list of Document instances from the MNIST data +# Each Document has an 'img' field (encoded using the Pillow library) and a 'class' field document_list = [Document({'img': pil_image(x[0]), 'class': x[1]}) for x in mnist_data] # Shuffle the data and select a subset of 1000 documents random.shuffle(document_list) data = document_list[:1000] -# Insert the selected data into the mnist_collection +# Insert the selected data into the mnist_collection which we mentioned before like: mnist_collection = Collection('mnist') db.execute( mnist_collection.insert_many(data[:-100]), # Insert all but the last 100 documents encoders=(pil_image,) # Encode images using the Pillow library. @@ -69,7 +88,6 @@ db.execute( Now that the images and their classes are inserted into the database, we can query the data in its original format. Particularly, we can use the `PIL.Image` instances to inspect the data. - ```python # Get and display one of the images r = db.execute(mnist_collection.find_one()) @@ -78,28 +96,33 @@ r.unpack()['img'] ## Build Model -Next, we create our machine learning model. SuperDuperDB supports various frameworks out of the box, and in this case, we are using PyTorch, which is well-suited for computer vision tasks. In this example, we combine torch with torchvision. - -We create `postprocess` and `preprocess` functions to handle the communication with the SuperDuperDB `Datalayer`, and then wrap model, preprocessing and postprocessing to create a native SuperDuperDB handler. - +Following that, we build our machine learning model. SuperDuperDB +conveniently supports various frameworks, and for this example, we opt +for PyTorch, a suitable choice for computer vision tasks. In this +instance, we combine `torch` with `torchvision`. +To facilitate communication with the SuperDuperDB `Datalayer`, we design `postprocess` and `preprocess` functions. These functions are then encapsulated with the model, preprocessing, and postprocessing steps to create a native SuperDuperDB handler. 
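+
+As a quick optional sanity check, you can apply the same transforms that the
+`preprocess` function below will use to a single MNIST image from `mnist_data`,
+and confirm that the result has the single-channel 32x32 shape the network expects:
+
+```python
+# Illustrative check only: run the LeNet-5 input transforms on one PIL image.
+import torchvision
+
+sample_image, sample_label = mnist_data[0]  # a (PIL.Image, int) pair loaded above
+
+transform = torchvision.transforms.Compose([
+    torchvision.transforms.Resize((32, 32)),
+    torchvision.transforms.ToTensor(),
+    torchvision.transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
+])
+
+x = transform(sample_image)
+print(x.shape, sample_label)  # expected: torch.Size([1, 32, 32]) plus the ground-truth digit
+```
+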
```python
import torch

+# Define the LeNet-5 architecture for image classification
class LeNet5(torch.nn.Module):
    def __init__(self, num_classes):
        super().__init__()
+        # Layer 1
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            torch.nn.BatchNorm2d(6),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2))
+        # Layer 2
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            torch.nn.BatchNorm2d(16),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2))
+        # Fully connected layers
        self.fc = torch.nn.Linear(400, 120)
        self.relu = torch.nn.ReLU()
        self.fc1 = torch.nn.Linear(120, 84)
@@ -117,29 +140,33 @@ class LeNet5(torch.nn.Module):
        out = self.fc2(out)
        return out

-
+# Postprocess function for the model output
def postprocess(x):
    return int(x.topk(1)[1].item())

-
+# Preprocess function for input data
def preprocess(x):
    return torchvision.transforms.Compose([
-        torchvision.transforms.Resize((32, 32)),
+        torchvision.transforms.Resize((32, 32)),  # resize to the 32x32 input expected by LeNet-5
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(0.1307,), std=(0.3081,))]
    )(x)

+# Create an instance of the LeNet-5 model
+lenet_model = LeNet5(10)

-# Create and insert a SuperDuperDB model into the database
-model = superduper(LeNet5(10), preprocess=preprocess, postprocess=postprocess, preferred_devices=('cpu',))
+# Create a SuperDuperDB model with the LeNet-5 model, preprocess, and postprocess functions
+# Specify 'preferred_devices' as ('cpu',) indicating CPU preference
+model = superduper(lenet_model, preprocess=preprocess, postprocess=postprocess, preferred_devices=('cpu',))
db.add(model)
```

## Train Model

-Now we are ready to "train" or "fit" the model. Trainable models in SuperDuperDB come with a sklearn-like `.fit` method.
-
-
+Now we are ready to "train" or "fit" the model. Trainable models in
+SuperDuperDB come with a sklearn-like `.fit` method.
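+
+Before calling `.fit`, a short optional check can confirm that the `lenet_model`
+defined above turns a batch of 1x32x32 inputs into one logit per digit class:
+
+```python
+# Illustrative check only: shape test on the raw PyTorch module, not the SuperDuperDB wrapper.
+import torch
+
+dummy_batch = torch.randn(4, 1, 32, 32)  # four fake single-channel 32x32 images
+logits = lenet_model(dummy_batch)
+print(logits.shape)  # expected: torch.Size([4, 10]), one logit per digit class
+```
+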
```python from torch.nn.functional import cross_entropy @@ -153,28 +180,30 @@ job = model.fit( X='img', # Feature matrix used as input data y='class', # Target variable for training db=db, # Database used for data retrieval - select=mnist_collection.find(), # Select the dataset + select=mnist_collection.find(), # Select the dataset from the 'mnist_collection' configuration=TorchTrainerConfiguration( - identifier='my_configuration', - objective=cross_entropy, - loader_kwargs={'batch_size': 10}, - max_iterations=10, - validation_interval=5, + identifier='my_configuration', # Unique identifier for the training configuration + objective=cross_entropy, # The objective function (cross-entropy in this case) + loader_kwargs={'batch_size': 10}, # DataLoader keyword arguments, batch size is set to 10 + max_iterations=10, # Maximum number of training iterations + validation_interval=5, # Interval for validation during training ), - metrics=[Metric(identifier='acc', object=lambda x, y: sum([xx == yy for xx, yy in zip(x, y)]) / len(x))], + metrics=[Metric(identifier='acc', object=lambda x, y: sum([xx == yy for xx, yy in zip(x, y)]) / len(x))], # Define a custom accuracy metric for evaluation during training validation_sets=[ + # Define a validation dataset using a subset of data with '_fold' equal to 'valid' Dataset( identifier='my_valid', select=Collection('mnist').find({'_fold': 'valid'}), ) ], - distributed=False, + distributed=False, # Set to True if distributed training is enabled ) ``` ## Monitoring Training Efficiency -You can monitor the training efficiency with visualization tools like Matplotlib: +You can monitor the training efficiency with visualization tools like +Matplotlib: ```python from matplotlib import pyplot as plt @@ -187,11 +216,9 @@ plt.plot(model.metric_values['my_valid/acc']) plt.show() ``` - ## On-the-fly Predictions -Once the model is trained, you can use it to continuously predict on new data as it arrives. This is set up by enabling a `listener` for the database (without loading all the data client-side). The listen toggle activates the model to make predictions on incoming data changes. - +After training the model, you can continuously predict on new data as it arrives. By activating a `listener` for the database, the model can make predictions on incoming data changes without having to load all the data client-side. The listen toggle triggers the model to predict based on updates in the incoming data. ```python model.predict( @@ -205,9 +232,11 @@ model.predict( We can see that predictions are available in `_outputs.img.lenet5`. - ```python +# Execute find_one() to retrieve a single document from the 'mnist_collection'. 
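+# As noted above, the prediction itself is stored in the document's '_outputs'
+# field (under '_outputs.img.lenet5').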
r = db.execute(mnist_collection.find_one({'_fold': 'valid'})) + +# Unpack the document and extract its content r.unpack() ``` @@ -215,24 +244,32 @@ r.unpack() The models "activated" can be seen here: - ```python +# Show the status of the listener db.show('listener') ``` -We can verify that the model is activated, by inserting the rest of the data: - +We can verify that the model is activated, by inserting the rest of the +data: ```python +# Iterate over the last 100 elements in the 'data' list for r in data[-100:]: + # Update the 'update' field to True for each document r['update'] = True +# Insert the updated documents (with 'update' set to True) into the 'mnist_collection' db.execute(mnist_collection.insert_many(data[-100:])) ``` -You can see that the inserted data, are now also populated with predictions: - +You can see that the inserted data, are now also populated with +predictions: ```python -db.execute(mnist_collection.find_one({'update': True}))['_outputs'] +# Execute find_one() to retrieve a single sample document from 'mnist_collection' +# where the 'update' field is True +sample_document = db.execute(mnist_collection.find_one({'update': True}))['_outputs'] + +# A sample document +print(sample_document) ``` From 389dd016c0e91012d3a15ff1700f3e0e7c658049 Mon Sep 17 00:00:00 2001 From: Md Fazlul Karim Date: Fri, 24 Nov 2023 22:17:12 +0600 Subject: [PATCH 2/2] Sidebar Updated --- docs/hr/docusaurus.config.js | 13 +++++++++++-- docs/hr/sidebars.js | 2 +- docs/hr/src/css/custom.css | 8 +++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/hr/docusaurus.config.js b/docs/hr/docusaurus.config.js index fd341507a..0c4742808 100644 --- a/docs/hr/docusaurus.config.js +++ b/docs/hr/docusaurus.config.js @@ -1,6 +1,6 @@ // @ts-check // Note: type annotations allow type checking and IDEs autocompletion -const lightCodeTheme = require('prism-react-renderer').themes.vsLight; +const lightCodeTheme = require('prism-react-renderer').themes.github; const darkCodeTheme = require('prism-react-renderer').themes.vsDark; /** @type {import('@docusaurus/types').Config} */ @@ -177,6 +177,7 @@ const config = { routeBasePath: 'docs', path: 'content', sidebarPath: require.resolve('./sidebars.js'), + // sidebarCollapsible: true, // Please change this to your repo. // Remove this to remove the "edit this page" links. editUrl: @@ -255,7 +256,7 @@ const config = { }, { label: 'Use cases', - to: '/docs/category/use_cases', + to: '/docs/use_cases', }, { label: 'Blog', @@ -318,6 +319,14 @@ const config = { content: 'https://docs.superduperdb.com/img/superDuperDB_img.png', }, ], + announcementBar: { + id: 'support_us', + content: + '🔮 We are officially launching SuperDuperDB with the release of v0.1 on December 5th on Github! 
🔮', + backgroundColor: '#7628f8', + textColor: '#fff', + isCloseable: true, + }, }), }; diff --git a/docs/hr/sidebars.js b/docs/hr/sidebars.js index aa837dfee..a1b9ebadc 100644 --- a/docs/hr/sidebars.js +++ b/docs/hr/sidebars.js @@ -38,7 +38,7 @@ const sidebars = { type: 'generated-index', description: 'Common and useful use-cases implemented in SuperDuperDB with a walkthrough', - slug: 'use-cases', + // slug: 'use-cases', }, }, // { diff --git a/docs/hr/src/css/custom.css b/docs/hr/src/css/custom.css index a718a2329..8569f0ab7 100644 --- a/docs/hr/src/css/custom.css +++ b/docs/hr/src/css/custom.css @@ -17,6 +17,12 @@ --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); --aa-primary-color-rgb: #7628f8 !important; --aa-muted-color-rgb: #7628f8 !important; + /* --ifm-code-background: #f5f5f5; */ + --prism-background-color: #f5f5f5 !important; +} + +style attribute { + --prism-background-color: #f5f5f5; } /* For readability concerns, you should choose a lighter palette in dark mode. */ @@ -217,5 +223,5 @@ main-wrapper { } pre code { - background-color: #F5F5F5 !important; + background-color: var(--ifm-pre-background); } \ No newline at end of file