From 1ca8977de58f339ef9c66ff1f77cc2d35f290ba9 Mon Sep 17 00:00:00 2001 From: Dan Sun Date: Tue, 29 Dec 2020 10:15:15 -0500 Subject: [PATCH] Update torchserve docs (#1271) * Update torchserve doc * Fix autoscaling/canary example * Reorgnize torchserve examples * Add bert example --- docs/samples/v1beta1/torchserve/README.md | 103 +++++++++------ .../v1beta1/torchserve/autoscaling/README.md | 99 ++++++++++++++ .../{ => autoscaling}/autoscaling.yaml | 5 +- .../samples/v1beta1/torchserve/bert/README.md | 105 +++++++++++++++ .../samples/v1beta1/torchserve/bert/bert.yaml | 10 ++ .../v1beta1/torchserve/bert/config.properties | 6 + .../v1beta1/torchserve/bert/sample_text.txt | 7 + .../v1beta1/torchserve/canary/README.md | 125 ++++++++++++++++++ .../torchserve/{ => canary}/canary.yaml | 2 +- .../v1beta1/torchserve/config.properties | 4 +- .../v1beta1/torchserve/docs/autoscaling.md | 65 --------- .../samples/v1beta1/torchserve/docs/canary.md | 100 -------------- .../{docs/metrics.md => metrics/README.md} | 0 .../{docs => metrics}/images/grafana.png | Bin .../{docs => metrics}/images/prometheus.png | Bin .../images/prometheus_graph.png | Bin .../torchserve/{ => metrics}/metrics.yaml | 0 .../torchserve/model-archiver/README.md | 2 +- docs/samples/v1beta1/torchserve/pv.yaml | 14 -- docs/samples/v1beta1/torchserve/pvc.yaml | 12 -- docs/samples/v1beta1/torchserve/pvpod.yaml | 21 --- 21 files changed, 421 insertions(+), 259 deletions(-) create mode 100644 docs/samples/v1beta1/torchserve/autoscaling/README.md rename docs/samples/v1beta1/torchserve/{ => autoscaling}/autoscaling.yaml (53%) create mode 100644 docs/samples/v1beta1/torchserve/bert/README.md create mode 100644 docs/samples/v1beta1/torchserve/bert/bert.yaml create mode 100644 docs/samples/v1beta1/torchserve/bert/config.properties create mode 100644 docs/samples/v1beta1/torchserve/bert/sample_text.txt create mode 100644 docs/samples/v1beta1/torchserve/canary/README.md rename docs/samples/v1beta1/torchserve/{ => canary}/canary.yaml (95%) delete mode 100644 docs/samples/v1beta1/torchserve/docs/autoscaling.md delete mode 100644 docs/samples/v1beta1/torchserve/docs/canary.md rename docs/samples/v1beta1/torchserve/{docs/metrics.md => metrics/README.md} (100%) rename docs/samples/v1beta1/torchserve/{docs => metrics}/images/grafana.png (100%) rename docs/samples/v1beta1/torchserve/{docs => metrics}/images/prometheus.png (100%) rename docs/samples/v1beta1/torchserve/{docs => metrics}/images/prometheus_graph.png (100%) rename docs/samples/v1beta1/torchserve/{ => metrics}/metrics.yaml (100%) delete mode 100644 docs/samples/v1beta1/torchserve/pv.yaml delete mode 100644 docs/samples/v1beta1/torchserve/pvc.yaml delete mode 100644 docs/samples/v1beta1/torchserve/pvpod.yaml diff --git a/docs/samples/v1beta1/torchserve/README.md b/docs/samples/v1beta1/torchserve/README.md index fb2fabfa0ceb..a246f209dfd8 100644 --- a/docs/samples/v1beta1/torchserve/README.md +++ b/docs/samples/v1beta1/torchserve/README.md @@ -1,39 +1,47 @@ -# Predict on a InferenceService using Torchserve +# Predict on a InferenceService using TorchServe -In this example, we use a trained pytorch mnist model to predict handwritten digits by running an inference service with pytorch torchserve predictor. +In this example, we use a trained pytorch mnist model to predict handwritten digits by running an inference service with [TorchServe](https://github.com/pytorch/serve) predictor. ## Setup 1. 
Your ~/.kube/config should point to a cluster with [KFServing installed](https://github.com/kubeflow/kfserving/#install-kfserving).
2. Your cluster's Istio Ingress gateway must be [network accessible](https://istio.io/latest/docs/tasks/traffic-management/ingress/ingress-control/).

-**__Note__** For prebuilt mnist marfile and config properties use this remote storage:
-
-```storageUri: gs://kfserving-examples/models/torchserve/image_classifier```
-
## Creating model storage with model archive file

-[Torchserve Model Archive Files (MAR)](https://github.com/pytorch/serve/blob/master/model-archiver/README.md)
-
-We obtain the model and dependent files from [here](https://github.com/pytorch/serve/tree/master/examples/image_classifier/mnist)
+TorchServe provides a utility to package all the model artifacts into a single [TorchServe Model Archive (MAR) file](https://github.com/pytorch/serve/blob/master/model-archiver/README.md).

-Refer [model archive file generation](./model-archiver/README.md) for auto generation of marfiles from model and dependent files.
-
-## Create the InferenceService
+You can store your model and dependent files on remote storage or a local persistent volume; the mnist model and dependent files can be obtained
+from [here](https://github.com/pytorch/serve/tree/master/examples/image_classifier/mnist).

-Apply the CRD
+The KFServing/TorchServe integration expects the following model store layout.

```bash
-kubectl apply -f torchserve.yaml
+├── config
+│   ├── config.properties
+├── model-store
+│   ├── densenet_161.mar
+│   ├── mnist.mar
```

-Expected Output
+- For remote storage, you can start the example with the prebuilt mnist MAR file stored on the KFServing example GCS bucket
+`gs://kfserving-examples/models/torchserve/image_classifier`,
+or you can generate the MAR file with `torch-model-archiver` and create the model store on remote storage according to the above layout.

```bash
-$inferenceservice.serving.kubeflow.org/torchserve created
+torch-model-archiver --model-name mnist --version 1.0 \
+--model-file model-archiver/model-store/mnist/mnist.py \
+--serialized-file model-archiver/model-store/mnist/mnist_cnn.pt \
+--handler model-archiver/model-store/mnist/mnist_handler.py
```

-## Torchserve with KFS envelope inference endpoints
+
+- For PVC users, please refer to [model archive file generation](./model-archiver/README.md) for auto-generation of MAR files from
+the model and dependent files.
+
+
+## TorchServe with KFServing envelope inference endpoints
+The KFServing/TorchServe integration supports the KFServing v1 protocol; support for the v2 protocol is in progress.
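
The v1 protocol wraps model inputs in an `instances` list and returns a `predictions` list. As a minimal sketch of how such a request body can be produced (assuming the `{"instances": [{"data": "<base64 image>"}]}` layout used by the sample `mnist.json`, and a hypothetical `0.png` digit image), consider:

```python
import base64
import json

# Build a KFServing v1 protocol request body for the mnist predictor.
# Assumption: the handler accepts a base64-encoded image under the "data" key,
# matching the sample mnist.json shipped with this example.
with open("0.png", "rb") as f:  # hypothetical 28x28 handwritten digit image
    encoded = base64.b64encode(f.read()).decode("utf-8")

payload = {"instances": [{"data": encoded}]}

with open("mnist.json", "w") as f:
    json.dump(payload, f)

# A successful :predict call then returns a v1 response such as {"predictions": ["2"]}.
```

The endpoints exposed by the predictor are summarized in the table below.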
| API | Verb | Path | Payload |
| ------------- | ------------- | ------------- | ------------- |
@@ -42,21 +50,38 @@ $inferenceservice.serving.kubeflow.org/torchserve created

[Sample requests for text and image classification](https://github.com/pytorch/serve/tree/master/kubernetes/kfserving/kf_request_json)

-## Run a prediction
+## Create the InferenceService
+
+For deploying the `InferenceService` on CPU
+```bash
+kubectl apply -f torchserve.yaml
+```
+
+For deploying the `InferenceService` on GPU
+```bash
+kubectl apply -f gpu.yaml
+```
+
+Expected Output
+
+```bash
+$inferenceservice.serving.kubeflow.org/torchserve created
+```
+
+## Inference

The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT`

```bash
-MODEL_NAME=torchserve
+MODEL_NAME=mnist
SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3)
```

-Use [image converter](../imgconv/README.md) to create input request for mnist. For other models refer [input request](https://github.com/pytorch/serve/tree/master/kubernetes/kfserving/kf_request_json)
-
-### Prediction Request
+Use [image converter](../imgconv/README.md) to create the input request for mnist.
+For other models, please refer to the sample [input requests](https://github.com/pytorch/serve/tree/master/kubernetes/kfserving/kf_request_json).

```bash
-curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/mnist:predict -d @./mnist.json
+curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -d @./mnist.json
```

Expected Output
@@ -87,11 +112,13 @@ Expected Output
{"predictions": ["2"]}
```

-### Explanation
+## Explanation

-Model interpretability is an important aspect which help to understand , which of the input features were important for a particular classification. Captum is a model interpretability libarary. The explain function uses Captum's
-integrated graident feature to help us understand, which input features were important for a particular model prediction.
+Model interpretability is an important aspect of ML: it helps to understand which of the input features were important for a particular classification.
+[Captum](https://captum.ai) is a model interpretability library; the KFServing explain endpoint uses Captum's state-of-the-art algorithms, including Integrated
+Gradients, to provide users with an easy way to understand which features contribute to the model output.

-Refer [Captum](https://captum.ai/tutorials/) for more info.
+You can refer to the [Captum tutorials](https://captum.ai/tutorials/) for more examples.
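
To give a rough sense of what the explain endpoint computes server side, here is a minimal, self-contained Captum sketch. The tiny linear `model` and random input are stand-ins (not the actual TorchServe handler code); only the Integrated Gradients call mirrors what the deployed explainer does:

```python
import torch
from captum.attr import IntegratedGradients

# Hypothetical stand-in for the deployed mnist classifier.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10))
model.eval()

x = torch.rand(1, 1, 28, 28)       # one mnist-style image
target = model(x).argmax(dim=1)    # explain the predicted class

ig = IntegratedGradients(model)
attributions, delta = ig.attribute(x, target=target, return_convergence_delta=True)

# `attributions` has the same shape as the input image; the nested array in the
# explain response below is essentially this tensor serialized pixel by pixel.
print(attributions.shape, delta.item())
```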
### Explain Request @@ -128,23 +155,17 @@ Expected Output 317543458, 0.0060051362999805355, -0.0008195376963202741, 0.0041728603512658224, -0.0017597169567888774, -0.0010577007775543158, 0.00046033327178068433, -0.0007674196306044449, -0.0], [-0.0, -0.0, 0.0013386963856532302, 0.00035183178922260837, 0.0030610334903526204, 8.951834979315781e-05, 0.0023676793550483524, -0.0002900551076915047, -0.00207019445286608, -7.61697478482574e-05, 0.0012150086715244216, 0.009831239281792168, 0.003479667642621962, 0.0070584324334114525, 0.004161851261339585, 0.0026146296354490665, -9.194746959222099e-05, 0.0013583866966571571, 0.0016821551239318913, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]]]]} ``` -Get Pods - -```bash -kubectl get pods -n - -NAME READY STATUS RESTARTS AGE -pod/torchserve-predictor-default-8mw55-deployment-57f979c88-f2dkn 2/2 Running 0 4m25s -``` - -## For Autoscaling +## Autoscaling +One of the main serverless inference features is to automatically scale the replicas of an `InferenceService` matching the incoming workload. +KFServing by default enables [Knative Pod Autoscaler](https://knative.dev/docs/serving/autoscaling/) which watches traffic flow and scales up and down +based on the configured metrics. -Configurations for autoscaling pods [Auto scaling](docs/autoscaling.md) +[Autoscaling Example](autoscaling/README.md) ## Canary Rollout +Canary rollout is a deployment strategy when you release a new version of model to a small percent of the production traffic. -Configurations for canary [Canary Deployment](docs/canary.md) - -## For Metrics +[Canary Deployment](canary/README.md) -Configurations for Metrics [Metrics](docs/metrics.md) +## Monitoring +[Expose metrics and setup grafana dashboards](metrics/README.md) diff --git a/docs/samples/v1beta1/torchserve/autoscaling/README.md b/docs/samples/v1beta1/torchserve/autoscaling/README.md new file mode 100644 index 000000000000..cb106d986f8c --- /dev/null +++ b/docs/samples/v1beta1/torchserve/autoscaling/README.md @@ -0,0 +1,99 @@ +# Autoscaling +KFServing supports the implementation of Knative Pod Autoscaler (KPA) and Kubernetes’ Horizontal Pod Autoscaler (HPA). +The features and limitations of each of these Autoscalers are listed below. + +IMPORTANT: If you want to use Kubernetes Horizontal Pod Autoscaler (HPA), you must install [HPA extension](https://knative.dev/docs/install/any-kubernetes-cluster/#optional-serving-extensions) + after you install Knative Serving. + +Knative Pod Autoscaler (KPA) +- Part of the Knative Serving core and enabled by default once Knative Serving is installed. +- Supports scale to zero functionality. +- Does not support CPU-based autoscaling. + +Horizontal Pod Autoscaler (HPA) +- Not part of the Knative Serving core, and must be enabled after Knative Serving installation. +- Does not support scale to zero functionality. +- Supports CPU-based autoscaling. + +## Create InferenceService with concurrency target + + +### Soft limit +You can configure InferenceService with annotation `autoscaling.knative.dev/target` for a soft limit. The soft limit is a targeted limit rather than +a strictly enforced bound, particularly if there is a sudden burst of requests, this value can be exceeded. 
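
As a rule of thumb, the Knative Pod Autoscaler tries to keep the observed concurrency per replica close to the target, so in stable mode the replica count is roughly the number of in-flight requests divided by the target (the real behaviour also involves panic windows, scale bounds and scale-to-zero). A back-of-the-envelope sketch of that relationship:

```python
import math

def expected_replicas(in_flight_requests: int, target: int) -> int:
    """Rough stable-mode estimate for the Knative Pod Autoscaler: it aims to keep
    per-replica concurrency near the target, so replicas ~= ceil(in_flight / target).
    Panic mode, min/max scale bounds and scale-to-zero are ignored here."""
    return math.ceil(in_flight_requests / target)

# The `hey` load test later in this example keeps ~50 requests in flight against
# a target of 10, which is why roughly 5 predictor pods appear.
print(expected_replicas(50, 10))  # -> 5
```

The annotation that sets this target on the `InferenceService` is shown in the spec below.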
+ +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" + annotations: + autoscaling.knative.dev/target: "10" +spec: + predictor: + pytorch: + protocolVersion: v2 + storageUri: "gs://kfserving-examples/models/torchserve/image_classifier" +``` + +### Hard limit + +You can also configure InferenceService with field `containerConcurrency` for a hard limit. The hard limit is an enforced upper bound. +If concurrency reaches the hard limit, surplus requests will be buffered and must wait until enough capacity is free to execute the requests. + +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" +spec: + predictor: + containerConcurrency: 10 + pytorch: + protocolVersion: v2 + storageUri: "gs://kfserving-examples/models/torchserve/image_classifier" +``` + +### Create the InferenceService + +```bash +kubectl apply -f torchserve.yaml +``` + +Expected Output + +```bash +$inferenceservice.serving.kubeflow.org/torchserve created +``` + +## Run inference with concurrent requests + +The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` + +Install hey load generator +```bash +go get -u github.com/rakyll/hey +``` + +Send concurrent inference requests +```bash +MODEL_NAME=mnist +SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +./hey -m POST -z 30s -D ./mnist.json -host ${SERVICE_HOSTNAME} http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict +``` + +### Check the pods that are scaled up +`hey` by default generates 50 requests concurrently, so you can see that the InferenceService scales to 5 pods as the container concurrency target is 10. + +```bash +kubectl get pods -n kfserving-test + +NAME READY STATUS RESTARTS AGE +torchserve-predictor-default-cj2d8-deployment-69444c9c74-67qwb 2/2 Terminating 0 103s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-nnxk8 2/2 Terminating 0 95s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-rq8jq 2/2 Running 0 50m +torchserve-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2/2 Running 0 113s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2/2 Running 0 109s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-xvn7t 2/2 Terminating 0 103s +``` diff --git a/docs/samples/v1beta1/torchserve/autoscaling.yaml b/docs/samples/v1beta1/torchserve/autoscaling/autoscaling.yaml similarity index 53% rename from docs/samples/v1beta1/torchserve/autoscaling.yaml rename to docs/samples/v1beta1/torchserve/autoscaling/autoscaling.yaml index 58e466691017..95c1358c598b 100644 --- a/docs/samples/v1beta1/torchserve/autoscaling.yaml +++ b/docs/samples/v1beta1/torchserve/autoscaling/autoscaling.yaml @@ -1,10 +1,11 @@ -#For example, specify a “concurrency target” of “10”, the autoscaler will try to make sure that every replica receives on average 10 requests at a time. A target is always evaluated against a specified metric. +# For example, specify a “concurrency target” of “10”, the autoscaler will try to make sure that every replica receives on average 10 requests at a time. +# A target is always evaluated against a specified metric. 
apiVersion: "serving.kubeflow.org/v1beta1" kind: "InferenceService" metadata: name: "torchserve" annotations: - autoscaling.knative.dev/target: "5" + autoscaling.knative.dev/target: "10" spec: predictor: pytorch: diff --git a/docs/samples/v1beta1/torchserve/bert/README.md b/docs/samples/v1beta1/torchserve/bert/README.md new file mode 100644 index 000000000000..4de7995c7b6c --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/README.md @@ -0,0 +1,105 @@ +# TorchServe example with Huggingface bert model +In this example we will show how to serve [Huggingface Transformers with TorchServe](https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers) +on KFServing. + +## Model archive file creation + +Clone [pytorch/serve](https://github.com/pytorch/serve) repository, +navigate to `examples/Huggingface_Transformers` and follow the steps for creating the MAR file including serialized model and other dependent files. +TorchServe supports both eager model and torchscript and here we save as the pretrained model. + +```bash +torch-model-archiver --model-name BERTSeqClassification --version 1.0 \ +--serialized-file Transformer_model/pytorch_model.bin \ +--handler ./Transformer_handler_generalized.py \ +--extra-files "Transformer_model/config.json,./setup_config.json,./Seq_classification_artifacts/index_to_name.json" +``` + +## Create the InferenceService + +Apply the CRD + +```bash +kubectl apply -f bert.yaml +``` + +Expected Output + +```bash +$inferenceservice.serving.kubeflow.org/torchserve-bert created +``` + +## Run a prediction + +The first step is to [determine the ingress IP and ports](../../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` + +```bash +MODEL_NAME=torchserve-bert +SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -n -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/BERTSeqClassification:predict -d ./sample_text.txt +``` + +Expected Output + +```bash +* Trying 44.239.20.204... +* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com (44.239.20.204) port 80 (#0) +> PUT /v1/models/BERTSeqClassification:predict HTTP/1.1 +> Host: torchserve-bert.kfserving-test.example.com +> User-Agent: curl/7.47.0 +> Accept: */* +> Content-Length: 79 +> Expect: 100-continue +> +< HTTP/1.1 100 Continue +* We are completely uploaded and fine +< HTTP/1.1 200 OK +< cache-control: no-cache; no-store, must-revalidate, private +< content-length: 8 +< date: Wed, 04 Nov 2020 10:54:49 GMT +< expires: Thu, 01 Jan 1970 00:00:00 UTC +< pragma: no-cache +< x-request-id: 4b54d3ac-185f-444c-b344-b8a785fdeb50 +< x-envoy-upstream-service-time: 2085 +< server: istio-envoy +< +* Connection #0 to host torchserve-bert.kfserving-test.example.com left intact +Accepted +``` + +## Captum Explanations +In order to understand the word importances and attributions when we make an explanation Request, we use Captum Insights for the Hugginface Transformers pre-trained model. +```bash +MODEL_NAME=torchserve-bert +SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -n -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/BERTSeqClassification:explaine -d ./sample_text.txt +``` +Expected output +```bash +* Trying ::1:8080... 
+* Connected to localhost (::1) port 8080 (#0) +> POST /v1/models/BERTSeqClassification:explain HTTP/1.1 +> Host: torchserve-bert.default.example.com +> User-Agent: curl/7.73.0 +> Accept: */* +> Content-Length: 84 +> Content-Type: application/x-www-form-urlencoded +>Handling connection for 8080 + +* upload completely sent off: 84 out of 84 bytes +* Mark bundle as not supporting multiuse +< HTTP/1.1 200 OK +< content-length: 292 +< content-type: application/json; charset=UTF-8 +< date: Sun, 27 Dec 2020 05:53:52 GMT +< server: istio-envoy +< x-envoy-upstream-service-time: 5769 +< +* Connection #0 to host localhost left intact +{"explanations": [{"importances": [0.0, -0.6324463574494716, -0.033115653530477414, 0.2681695752722339, -0.29124745608778546, 0.5422589681903883, -0.3848768219546909, 0.0], +"words": ["[CLS]", "bloomberg", "has", "reported", "on", "the", "economy", "[SEP]"], "delta": -0.0007350619859377225}]} +``` + + diff --git a/docs/samples/v1beta1/torchserve/bert/bert.yaml b/docs/samples/v1beta1/torchserve/bert/bert.yaml new file mode 100644 index 000000000000..0c7d8ab1114e --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/bert.yaml @@ -0,0 +1,10 @@ +apiVersion: serving.kubeflow.org/v1beta1 +kind: InferenceService +metadata: + name: "torchserve-bert" +spec: + predictor: + pytorch: + protocolVersion: v2 + storageUri: gs://kfserving-examples/models/torchserve/huggingface + # storageUri: pvc://model-pv-claim diff --git a/docs/samples/v1beta1/torchserve/bert/config.properties b/docs/samples/v1beta1/torchserve/bert/config.properties new file mode 100644 index 000000000000..2f898c94337f --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/config.properties @@ -0,0 +1,6 @@ +inference_address=http://0.0.0.0:8085 +management_address=http://0.0.0.0:8081 +number_of_netty_threads=4 +job_queue_size=10 +model_store=/mnt/models/model-store +model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"bert":{"1.0":{"defaultVersion":true,"marName":"BERTSeqClassification.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}} diff --git a/docs/samples/v1beta1/torchserve/bert/sample_text.txt b/docs/samples/v1beta1/torchserve/bert/sample_text.txt new file mode 100644 index 000000000000..794139004c91 --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/sample_text.txt @@ -0,0 +1,7 @@ +{ + "instances": [ + { + "data": "Bloomberg has reported on the economy" + } + ] +} diff --git a/docs/samples/v1beta1/torchserve/canary/README.md b/docs/samples/v1beta1/torchserve/canary/README.md new file mode 100644 index 000000000000..0ed53698d2ad --- /dev/null +++ b/docs/samples/v1beta1/torchserve/canary/README.md @@ -0,0 +1,125 @@ +# Canary Rollout + +## Create InferenceService with default model + +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" +spec: + predictor: + pytorch: + protocolVersion: v2 + storageUri: "gs://kfserving-examples/models/torchserve/image_classifier" +``` + +Apply the InferenceService + +```bash +kubectl apply -f torchserve.yaml +``` + +Expected Output + +```bash +$inferenceservice.serving.kubeflow.org/torchserve created +``` + +## Create InferenceService with canary model + +Change the `storageUri` for the new model version and apply the InferenceService + +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" +spec: + predictor: + canaryTrafficPercent: 20 + pytorch: + storageUri: 
"gs://kfserving-examples/models/torchserve/image_classifier/v2" +``` + +Apply the InferenceService + +```bash +kubectl apply -f canary.yaml +``` +You should now see two revisions created +```bash +kubectl get revisions -l serving.kubeflow.org/inferenceservice=torchserve +NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON +torchserve-predictor-default-9lttm torchserve-predictor-default torchserve-predictor-default-9lttm 1 True +torchserve-predictor-default-kxp96 torchserve-predictor-default torchserve-predictor-default-kxp96 2 True +``` + + +## Run a prediction + +The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` + +```bash +MODEL_NAME=mnist +SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -d @./mnist.json +``` + +Expected Output + +```bash +* Trying 52.89.19.61... +* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com (52.89.19.61) port 80 (#0) +> PUT /v1/models/mnist:predict HTTP/1.1 +> Host: torchserve.kfserving-test.example.com +> User-Agent: curl/7.47.0 +> Accept: */* +> Content-Length: 167 +> Expect: 100-continue +> +< HTTP/1.1 100 Continue +* We are completely uploaded and fine +< HTTP/1.1 200 OK +< cache-control: no-cache; no-store, must-revalidate, private +< content-length: 1 +< date: Tue, 27 Oct 2020 08:26:19 GMT +< expires: Thu, 01 Jan 1970 00:00:00 UTC +< pragma: no-cache +< x-request-id: b10cfc9f-cd0f-4cda-9c6c-194c2cdaa517 +< x-envoy-upstream-service-time: 6 +< server: istio-envoy +< +* Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact +{"predictions": ["2"]} +``` + +## Check the traffic split between the two revisions + +```bash +kubectl get pods -l serving.kubeflow.org/inferenceservice=torchserve +NAME READY STATUS RESTARTS AGE +torchserve-predictor-default-9lttm-deployment-7dd5cff4cb-tmmlc 2/2 Running 0 21m +torchserve-predictor-default-kxp96-deployment-5d949864df-bmzfk 2/2 Running 0 20m +``` + +Check the traffic split +```bash +kubectl get ksvc torchserve-predictor-default -oyaml + status: + address: + url: http://torchserve-predictor-default.default.svc.cluster.local + traffic: + - latestRevision: true + percent: 20 + revisionName: torchserve-predictor-default-kxp96 + tag: latest + url: http://latest-torchserve-predictor-default.default.example.com + - latestRevision: false + percent: 80 + revisionName: torchserve-predictor-default-9lttm + tag: prev + url: http://prev-torchserve-predictor-default.default.example.com + url: http://torchserve-predictor-default.default.example.com +``` diff --git a/docs/samples/v1beta1/torchserve/canary.yaml b/docs/samples/v1beta1/torchserve/canary/canary.yaml similarity index 95% rename from docs/samples/v1beta1/torchserve/canary.yaml rename to docs/samples/v1beta1/torchserve/canary/canary.yaml index 324499352241..174cf389c049 100644 --- a/docs/samples/v1beta1/torchserve/canary.yaml +++ b/docs/samples/v1beta1/torchserve/canary/canary.yaml @@ -7,7 +7,7 @@ spec: canaryTrafficPercent: 20 pytorch: protocolVersion: v2 - storageUri: gs://kfserving-examples/models/torchserve/image_classifier + storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v2 resources: limits: memory: 4Gi diff --git a/docs/samples/v1beta1/torchserve/config.properties 
b/docs/samples/v1beta1/torchserve/config.properties index 76e23f6a8f9c..6ad8fba744a1 100644 --- a/docs/samples/v1beta1/torchserve/config.properties +++ b/docs/samples/v1beta1/torchserve/config.properties @@ -1,6 +1,6 @@ -inference_address=http://0.0.0.0:8080 +inference_address=http://0.0.0.0:8085 management_address=http://0.0.0.0:8081 number_of_netty_threads=4 job_queue_size=10 model_store=/mnt/models/model-store -model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"mnist":{"1.0":{"defaultVersion":true,"marName":"mnist.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}} \ No newline at end of file +model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"mnist":{"1.0":{"defaultVersion":true,"marName":"mnist.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}} diff --git a/docs/samples/v1beta1/torchserve/docs/autoscaling.md b/docs/samples/v1beta1/torchserve/docs/autoscaling.md deleted file mode 100644 index 6a9ec0dc808a..000000000000 --- a/docs/samples/v1beta1/torchserve/docs/autoscaling.md +++ /dev/null @@ -1,65 +0,0 @@ -# Autoscaling - -## Deploymnet yaml - -For example, specify a “concurrency target” of “10”, the autoscaler will try to make sure that every replica receives on average 10 requests at a time. -By default the pod scale with concurrency metrics - -Refer [model archive file generation](../model-archiver/README.md) for PV, PVC storage and marfile creation. - -autoscaling.yaml - -```yaml -apiVersion: "serving.kubeflow.org/v1beta1" -kind: "InferenceService" -metadata: - name: "torchserve" - annotations: - autoscaling.knative.dev/target: "5" -spec: - predictor: - pytorch: - protocolVersion: v2 - storageUri: "pvc://model-pv-claim" -``` - -## Create the InferenceService - -Apply the CRD - -```bash -kubectl apply -f torchserve.yaml -``` - -Expected Output - -```bash -$inferenceservice.serving.kubeflow.org/torchserve created -``` - -## Run a prediction - -The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` - -Install hey load generator (go get -u github.com/rakyll/hey). - -```bash -MODEL_NAME=mnist -SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3) - -./hey -m POST -z 30s -D ./mnist.json -host ${SERVICE_HOSTNAME} http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -``` - -### Get Pods - -```bash -kubectl get pods -n kfserving-test - -NAME READY STATUS RESTARTS AGE -torchserve-predictor-default-cj2d8-deployment-69444c9c74-67qwb 2/2 Terminating 0 103s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-nnxk8 2/2 Terminating 0 95s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-rq8jq 2/2 Running 0 50m -torchserve-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2/2 Running 0 113s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2/2 Running 0 109s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-xvn7t 2/2 Terminating 0 103s -``` diff --git a/docs/samples/v1beta1/torchserve/docs/canary.md b/docs/samples/v1beta1/torchserve/docs/canary.md deleted file mode 100644 index a11f346ff12c..000000000000 --- a/docs/samples/v1beta1/torchserve/docs/canary.md +++ /dev/null @@ -1,100 +0,0 @@ -# Canary Rollouts - -## Creating model storage with model archive file - -Refer [model archive file generation](../model-archiver/README.md) for PV, PVC storage and marfile creation. 
- -## Deployment yaml - -### Main model - -```yaml -apiVersion: "serving.kubeflow.org/v1beta1" -kind: "InferenceService" -metadata: - name: "torch-pred" -spec: - predictor: - pytorch: - protocolVersion: v2 - storageUri: "pvc://model-pv-claim" -``` - -### Canary model - -Change the path and deploy - -```yaml -apiVersion: "serving.kubeflow.org/v1beta1" -kind: "InferenceService" -metadata: - name: "torch-pred" -spec: - predictor: - canaryTrafficPercent: 20 - pytorch: - storageUri: "pvc://model-store-claim-1" -``` - -## Create the InferenceService - -Apply the CRD - -```bash -kubectl apply -f torchserve.yaml -``` - -Expected Output - -```bash -$inferenceservice.serving.kubeflow.org/torchserve created -``` - -## Run a prediction - -The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` - -```bash -MODEL_NAME=mnist -SERVICE_HOSTNAME=$(kubectl get inferenceservice torch-pred -o jsonpath='{.status.url}' | cut -d "/" -f 3) - -curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -d @./mnist.json -``` - -Expected Output - -```bash -* Trying 52.89.19.61... -* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com (52.89.19.61) port 80 (#0) -> PUT /v1/models/mnist:predict HTTP/1.1 -> Host: torch-pred.kfserving-test.example.com -> User-Agent: curl/7.47.0 -> Accept: */* -> Content-Length: 167 -> Expect: 100-continue -> -< HTTP/1.1 100 Continue -* We are completely uploaded and fine -< HTTP/1.1 200 OK -< cache-control: no-cache; no-store, must-revalidate, private -< content-length: 1 -< date: Tue, 27 Oct 2020 08:26:19 GMT -< expires: Thu, 01 Jan 1970 00:00:00 UTC -< pragma: no-cache -< x-request-id: b10cfc9f-cd0f-4cda-9c6c-194c2cdaa517 -< x-envoy-upstream-service-time: 6 -< server: istio-envoy -< -* Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact -{"predictions": ["2"]} -``` - -### Get Pods - -```bash -kubectl get pods -n kfserving-test - -NAME READY STATUS RESTARTS AGE -torch-pred-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2/2 Running 0 113s -torch-pred-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2/2 Running 0 109s -``` diff --git a/docs/samples/v1beta1/torchserve/docs/metrics.md b/docs/samples/v1beta1/torchserve/metrics/README.md similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/metrics.md rename to docs/samples/v1beta1/torchserve/metrics/README.md diff --git a/docs/samples/v1beta1/torchserve/docs/images/grafana.png b/docs/samples/v1beta1/torchserve/metrics/images/grafana.png similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/images/grafana.png rename to docs/samples/v1beta1/torchserve/metrics/images/grafana.png diff --git a/docs/samples/v1beta1/torchserve/docs/images/prometheus.png b/docs/samples/v1beta1/torchserve/metrics/images/prometheus.png similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/images/prometheus.png rename to docs/samples/v1beta1/torchserve/metrics/images/prometheus.png diff --git a/docs/samples/v1beta1/torchserve/docs/images/prometheus_graph.png b/docs/samples/v1beta1/torchserve/metrics/images/prometheus_graph.png similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/images/prometheus_graph.png rename to docs/samples/v1beta1/torchserve/metrics/images/prometheus_graph.png diff --git a/docs/samples/v1beta1/torchserve/metrics.yaml 
b/docs/samples/v1beta1/torchserve/metrics/metrics.yaml similarity index 100% rename from docs/samples/v1beta1/torchserve/metrics.yaml rename to docs/samples/v1beta1/torchserve/metrics/metrics.yaml diff --git a/docs/samples/v1beta1/torchserve/model-archiver/README.md b/docs/samples/v1beta1/torchserve/model-archiver/README.md index 57bd04c70799..e79c11dec3b2 100644 --- a/docs/samples/v1beta1/torchserve/model-archiver/README.md +++ b/docs/samples/v1beta1/torchserve/model-archiver/README.md @@ -7,7 +7,7 @@ ## 1. Create PV and PVC -Create a Persistent volume and volume claim. This document uses amazonEBS PV. For AWS EFS storage refer [AWS EFS storage](https://github.com/pytorch/serve/blob/master/kubernetes/EKS/README.md#setup-persistentvolume-backed-by-efs) +Create a Persistent volume and volume claim. This document uses amazonEBS PV. For AWS EFS storage you can refer to [AWS EFS storage](https://github.com/pytorch/serve/blob/master/kubernetes/EKS/README.md#setup-persistentvolume-backed-by-efs) ### 1.1 Create PV diff --git a/docs/samples/v1beta1/torchserve/pv.yaml b/docs/samples/v1beta1/torchserve/pv.yaml deleted file mode 100644 index 6e4511d8749c..000000000000 --- a/docs/samples/v1beta1/torchserve/pv.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: model-pv-volume - labels: - type: "amazonEBS" -spec: - capacity: - storage: 5Gi - accessModes: - - ReadWriteOnce - awsElasticBlockStore: - volumeID: {volume-id} #vol-074ea8934f7b80df5 - fsType: ext4 diff --git a/docs/samples/v1beta1/torchserve/pvc.yaml b/docs/samples/v1beta1/torchserve/pvc.yaml deleted file mode 100644 index da72c56b84fc..000000000000 --- a/docs/samples/v1beta1/torchserve/pvc.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: model-pv-claim - labels: - type: "amazonEBS" -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 1Gi diff --git a/docs/samples/v1beta1/torchserve/pvpod.yaml b/docs/samples/v1beta1/torchserve/pvpod.yaml deleted file mode 100644 index baa71fdb0fa4..000000000000 --- a/docs/samples/v1beta1/torchserve/pvpod.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: model-store-pod -spec: - volumes: - - name: model-store - persistentVolumeClaim: - claimName: model-pv-claim - containers: - - name: model-store - image: ubuntu - command: [ "sleep" ] - args: [ "infinity" ] - volumeMounts: - - mountPath: "/pv" - name: model-store - resources: - limits: - memory: "4Gi" - cpu: "2"