From 1ca8977de58f339ef9c66ff1f77cc2d35f290ba9 Mon Sep 17 00:00:00 2001 From: Dan Sun Date: Tue, 29 Dec 2020 10:15:15 -0500 Subject: [PATCH] Update torchserve docs (#1271) * Update torchserve doc * Fix autoscaling/canary example * Reorgnize torchserve examples * Add bert example --- docs/samples/v1beta1/torchserve/README.md | 103 +++++++++------ .../v1beta1/torchserve/autoscaling/README.md | 99 ++++++++++++++ .../{ => autoscaling}/autoscaling.yaml | 5 +- .../samples/v1beta1/torchserve/bert/README.md | 105 +++++++++++++++ .../samples/v1beta1/torchserve/bert/bert.yaml | 10 ++ .../v1beta1/torchserve/bert/config.properties | 6 + .../v1beta1/torchserve/bert/sample_text.txt | 7 + .../v1beta1/torchserve/canary/README.md | 125 ++++++++++++++++++ .../torchserve/{ => canary}/canary.yaml | 2 +- .../v1beta1/torchserve/config.properties | 4 +- .../v1beta1/torchserve/docs/autoscaling.md | 65 --------- .../samples/v1beta1/torchserve/docs/canary.md | 100 -------------- .../{docs/metrics.md => metrics/README.md} | 0 .../{docs => metrics}/images/grafana.png | Bin .../{docs => metrics}/images/prometheus.png | Bin .../images/prometheus_graph.png | Bin .../torchserve/{ => metrics}/metrics.yaml | 0 .../torchserve/model-archiver/README.md | 2 +- docs/samples/v1beta1/torchserve/pv.yaml | 14 -- docs/samples/v1beta1/torchserve/pvc.yaml | 12 -- docs/samples/v1beta1/torchserve/pvpod.yaml | 21 --- 21 files changed, 421 insertions(+), 259 deletions(-) create mode 100644 docs/samples/v1beta1/torchserve/autoscaling/README.md rename docs/samples/v1beta1/torchserve/{ => autoscaling}/autoscaling.yaml (53%) create mode 100644 docs/samples/v1beta1/torchserve/bert/README.md create mode 100644 docs/samples/v1beta1/torchserve/bert/bert.yaml create mode 100644 docs/samples/v1beta1/torchserve/bert/config.properties create mode 100644 docs/samples/v1beta1/torchserve/bert/sample_text.txt create mode 100644 docs/samples/v1beta1/torchserve/canary/README.md rename docs/samples/v1beta1/torchserve/{ => canary}/canary.yaml (95%) delete mode 100644 docs/samples/v1beta1/torchserve/docs/autoscaling.md delete mode 100644 docs/samples/v1beta1/torchserve/docs/canary.md rename docs/samples/v1beta1/torchserve/{docs/metrics.md => metrics/README.md} (100%) rename docs/samples/v1beta1/torchserve/{docs => metrics}/images/grafana.png (100%) rename docs/samples/v1beta1/torchserve/{docs => metrics}/images/prometheus.png (100%) rename docs/samples/v1beta1/torchserve/{docs => metrics}/images/prometheus_graph.png (100%) rename docs/samples/v1beta1/torchserve/{ => metrics}/metrics.yaml (100%) delete mode 100644 docs/samples/v1beta1/torchserve/pv.yaml delete mode 100644 docs/samples/v1beta1/torchserve/pvc.yaml delete mode 100644 docs/samples/v1beta1/torchserve/pvpod.yaml diff --git a/docs/samples/v1beta1/torchserve/README.md b/docs/samples/v1beta1/torchserve/README.md index fb2fabfa0ceb..a246f209dfd8 100644 --- a/docs/samples/v1beta1/torchserve/README.md +++ b/docs/samples/v1beta1/torchserve/README.md @@ -1,39 +1,47 @@ -# Predict on a InferenceService using Torchserve +# Predict on a InferenceService using TorchServe -In this example, we use a trained pytorch mnist model to predict handwritten digits by running an inference service with pytorch torchserve predictor. +In this example, we use a trained pytorch mnist model to predict handwritten digits by running an inference service with [TorchServe](https://github.com/pytorch/serve) predictor. ## Setup 1. 
Your ~/.kube/config should point to a cluster with [KFServing installed](https://github.com/kubeflow/kfserving/#install-kfserving).
2. Your cluster's Istio Ingress gateway must be [network accessible](https://istio.io/latest/docs/tasks/traffic-management/ingress/ingress-control/).

-**__Note__** For prebuilt mnist marfile and config properties use this remote storage:
-
-```storageUri: gs://kfserving-examples/models/torchserve/image_classifier```
-
## Creating model storage with model archive file

-[Torchserve Model Archive Files (MAR)](https://github.com/pytorch/serve/blob/master/model-archiver/README.md)
-
-We obtain the model and dependent files from [here](https://github.com/pytorch/serve/tree/master/examples/image_classifier/mnist)
+TorchServe provides a utility to package all the model artifacts into a single [TorchServe Model Archive (MAR) file](https://github.com/pytorch/serve/blob/master/model-archiver/README.md).

-Refer [model archive file generation](./model-archiver/README.md) for auto generation of marfiles from model and dependent files.
-
-## Create the InferenceService
+You can store your model and dependent files on remote storage or a local persistent volume; the mnist model and dependent files can be obtained
+from [here](https://github.com/pytorch/serve/tree/master/examples/image_classifier/mnist).

-Apply the CRD
+The KFServing/TorchServe integration expects the following model store layout.

```bash
-kubectl apply -f torchserve.yaml
+├── config
+│   ├── config.properties
+├── model-store
+│   ├── densenet_161.mar
+│   ├── mnist.mar
```

-Expected Output
+- For remote storage, you can start the example with the prebuilt mnist MAR file stored on the KFServing example GCS bucket
+`gs://kfserving-examples/models/torchserve/image_classifier`,
+or you can generate the MAR file with `torch-model-archiver` and create the model store on remote storage according to the above layout.

```bash
-$inferenceservice.serving.kubeflow.org/torchserve created
+torch-model-archiver --model-name mnist --version 1.0 \
+--model-file model-archiver/model-store/mnist/mnist.py \
+--serialized-file model-archiver/model-store/mnist/mnist_cnn.pt \
+--handler model-archiver/model-store/mnist/mnist_handler.py
```

-## Torchserve with KFS envelope inference endpoints
+
+- For PVC users, please refer to [model archive file generation](./model-archiver/README.md) for auto-generation of MAR files from
+the model and dependent files.
+
+
+## TorchServe with KFServing envelope inference endpoints
+The KFServing/TorchServe integration supports the KFServing v1 protocol; support for the v2 protocol is in progress.
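
The v1 protocol wraps model inputs in an `instances` list and returns a `predictions` list. As a minimal sketch of how such a request body can be produced (assuming the `{"instances": [{"data": "<base64 image>"}]}` layout used by the sample `mnist.json`, and a hypothetical `0.png` digit image), consider:

```python
import base64
import json

# Build a KFServing v1 protocol request body for the mnist predictor.
# Assumption: the handler accepts a base64-encoded image under the "data" key,
# matching the sample mnist.json shipped with this example.
with open("0.png", "rb") as f:  # hypothetical 28x28 handwritten digit image
    encoded = base64.b64encode(f.read()).decode("utf-8")

payload = {"instances": [{"data": encoded}]}

with open("mnist.json", "w") as f:
    json.dump(payload, f)

# A successful :predict call then returns a v1 response such as {"predictions": ["2"]}.
```

The endpoints exposed by the predictor are summarized in the table below.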
| API | Verb | Path | Payload |
| ------------- | ------------- | ------------- | ------------- |
@@ -42,21 +50,38 @@ $inferenceservice.serving.kubeflow.org/torchserve created

[Sample requests for text and image classification](https://github.com/pytorch/serve/tree/master/kubernetes/kfserving/kf_request_json)

-## Run a prediction
+## Create the InferenceService
+
+For deploying the `InferenceService` on CPU
+```bash
+kubectl apply -f torchserve.yaml
+```
+
+For deploying the `InferenceService` on GPU
+```bash
+kubectl apply -f gpu.yaml
+```
+
+Expected Output
+
+```bash
+$inferenceservice.serving.kubeflow.org/torchserve created
+```
+
+## Inference

The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT`

```bash
-MODEL_NAME=torchserve
+MODEL_NAME=mnist
SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3)
```

-Use [image converter](../imgconv/README.md) to create input request for mnist. For other models refer [input request](https://github.com/pytorch/serve/tree/master/kubernetes/kfserving/kf_request_json)
-
-### Prediction Request
+Use [image converter](../imgconv/README.md) to create the input request for mnist.
+For other models, please refer to the sample [input requests](https://github.com/pytorch/serve/tree/master/kubernetes/kfserving/kf_request_json).

```bash
-curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/mnist:predict -d @./mnist.json
+curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -d @./mnist.json
```

Expected Output
@@ -87,11 +112,13 @@ Expected Output
{"predictions": ["2"]}
```

-### Explanation
+## Explanation

-Model interpretability is an important aspect which help to understand , which of the input features were important for a particular classification. Captum is a model interpretability libarary. The explain function uses Captum's
-integrated graident feature to help us understand, which input features were important for a particular model prediction.
+Model interpretability is an important aspect of ML: it helps to understand which of the input features were important for a particular classification.
+[Captum](https://captum.ai) is a model interpretability library; the KFServing explain endpoint uses Captum's state-of-the-art algorithms, including Integrated
+Gradients, to provide users with an easy way to understand which features contribute to the model output.

-Refer [Captum](https://captum.ai/tutorials/) for more info.
+You can refer to the [Captum tutorials](https://captum.ai/tutorials/) for more examples.
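
To give a rough sense of what the explain endpoint computes server side, here is a minimal, self-contained Captum sketch. The tiny linear `model` and random input are stand-ins (not the actual TorchServe handler code); only the Integrated Gradients call mirrors what the deployed explainer does:

```python
import torch
from captum.attr import IntegratedGradients

# Hypothetical stand-in for the deployed mnist classifier.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10))
model.eval()

x = torch.rand(1, 1, 28, 28)       # one mnist-style image
target = model(x).argmax(dim=1)    # explain the predicted class

ig = IntegratedGradients(model)
attributions, delta = ig.attribute(x, target=target, return_convergence_delta=True)

# `attributions` has the same shape as the input image; the nested array in the
# explain response below is essentially this tensor serialized pixel by pixel.
print(attributions.shape, delta.item())
```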
### Explain Request @@ -128,23 +155,17 @@ Expected Output 317543458, 0.0060051362999805355, -0.0008195376963202741, 0.0041728603512658224, -0.0017597169567888774, -0.0010577007775543158, 0.00046033327178068433, -0.0007674196306044449, -0.0], [-0.0, -0.0, 0.0013386963856532302, 0.00035183178922260837, 0.0030610334903526204, 8.951834979315781e-05, 0.0023676793550483524, -0.0002900551076915047, -0.00207019445286608, -7.61697478482574e-05, 0.0012150086715244216, 0.009831239281792168, 0.003479667642621962, 0.0070584324334114525, 0.004161851261339585, 0.0026146296354490665, -9.194746959222099e-05, 0.0013583866966571571, 0.0016821551239318913, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]]]]} ``` -Get Pods - -```bash -kubectl get pods -n - -NAME READY STATUS RESTARTS AGE -pod/torchserve-predictor-default-8mw55-deployment-57f979c88-f2dkn 2/2 Running 0 4m25s -``` - -## For Autoscaling +## Autoscaling +One of the main serverless inference features is to automatically scale the replicas of an `InferenceService` matching the incoming workload. +KFServing by default enables [Knative Pod Autoscaler](https://knative.dev/docs/serving/autoscaling/) which watches traffic flow and scales up and down +based on the configured metrics. -Configurations for autoscaling pods [Auto scaling](docs/autoscaling.md) +[Autoscaling Example](autoscaling/README.md) ## Canary Rollout +Canary rollout is a deployment strategy when you release a new version of model to a small percent of the production traffic. -Configurations for canary [Canary Deployment](docs/canary.md) - -## For Metrics +[Canary Deployment](canary/README.md) -Configurations for Metrics [Metrics](docs/metrics.md) +## Monitoring +[Expose metrics and setup grafana dashboards](metrics/README.md) diff --git a/docs/samples/v1beta1/torchserve/autoscaling/README.md b/docs/samples/v1beta1/torchserve/autoscaling/README.md new file mode 100644 index 000000000000..cb106d986f8c --- /dev/null +++ b/docs/samples/v1beta1/torchserve/autoscaling/README.md @@ -0,0 +1,99 @@ +# Autoscaling +KFServing supports the implementation of Knative Pod Autoscaler (KPA) and Kubernetes’ Horizontal Pod Autoscaler (HPA). +The features and limitations of each of these Autoscalers are listed below. + +IMPORTANT: If you want to use Kubernetes Horizontal Pod Autoscaler (HPA), you must install [HPA extension](https://knative.dev/docs/install/any-kubernetes-cluster/#optional-serving-extensions) + after you install Knative Serving. + +Knative Pod Autoscaler (KPA) +- Part of the Knative Serving core and enabled by default once Knative Serving is installed. +- Supports scale to zero functionality. +- Does not support CPU-based autoscaling. + +Horizontal Pod Autoscaler (HPA) +- Not part of the Knative Serving core, and must be enabled after Knative Serving installation. +- Does not support scale to zero functionality. +- Supports CPU-based autoscaling. + +## Create InferenceService with concurrency target + + +### Soft limit +You can configure InferenceService with annotation `autoscaling.knative.dev/target` for a soft limit. The soft limit is a targeted limit rather than +a strictly enforced bound, particularly if there is a sudden burst of requests, this value can be exceeded. 
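
As a rule of thumb, the Knative Pod Autoscaler tries to keep the observed concurrency per replica close to the target, so in stable mode the replica count is roughly the number of in-flight requests divided by the target (the real behaviour also involves panic windows, scale bounds and scale-to-zero). A back-of-the-envelope sketch of that relationship:

```python
import math

def expected_replicas(in_flight_requests: int, target: int) -> int:
    """Rough stable-mode estimate for the Knative Pod Autoscaler: it aims to keep
    per-replica concurrency near the target, so replicas ~= ceil(in_flight / target).
    Panic mode, min/max scale bounds and scale-to-zero are ignored here."""
    return math.ceil(in_flight_requests / target)

# The `hey` load test later in this example keeps ~50 requests in flight against
# a target of 10, which is why roughly 5 predictor pods appear.
print(expected_replicas(50, 10))  # -> 5
```

The annotation that sets this target on the `InferenceService` is shown in the spec below.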
+ +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" + annotations: + autoscaling.knative.dev/target: "10" +spec: + predictor: + pytorch: + protocolVersion: v2 + storageUri: "gs://kfserving-examples/models/torchserve/image_classifier" +``` + +### Hard limit + +You can also configure InferenceService with field `containerConcurrency` for a hard limit. The hard limit is an enforced upper bound. +If concurrency reaches the hard limit, surplus requests will be buffered and must wait until enough capacity is free to execute the requests. + +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" +spec: + predictor: + containerConcurrency: 10 + pytorch: + protocolVersion: v2 + storageUri: "gs://kfserving-examples/models/torchserve/image_classifier" +``` + +### Create the InferenceService + +```bash +kubectl apply -f torchserve.yaml +``` + +Expected Output + +```bash +$inferenceservice.serving.kubeflow.org/torchserve created +``` + +## Run inference with concurrent requests + +The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` + +Install hey load generator +```bash +go get -u github.com/rakyll/hey +``` + +Send concurrent inference requests +```bash +MODEL_NAME=mnist +SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +./hey -m POST -z 30s -D ./mnist.json -host ${SERVICE_HOSTNAME} http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict +``` + +### Check the pods that are scaled up +`hey` by default generates 50 requests concurrently, so you can see that the InferenceService scales to 5 pods as the container concurrency target is 10. + +```bash +kubectl get pods -n kfserving-test + +NAME READY STATUS RESTARTS AGE +torchserve-predictor-default-cj2d8-deployment-69444c9c74-67qwb 2/2 Terminating 0 103s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-nnxk8 2/2 Terminating 0 95s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-rq8jq 2/2 Running 0 50m +torchserve-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2/2 Running 0 113s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2/2 Running 0 109s +torchserve-predictor-default-cj2d8-deployment-69444c9c74-xvn7t 2/2 Terminating 0 103s +``` diff --git a/docs/samples/v1beta1/torchserve/autoscaling.yaml b/docs/samples/v1beta1/torchserve/autoscaling/autoscaling.yaml similarity index 53% rename from docs/samples/v1beta1/torchserve/autoscaling.yaml rename to docs/samples/v1beta1/torchserve/autoscaling/autoscaling.yaml index 58e466691017..95c1358c598b 100644 --- a/docs/samples/v1beta1/torchserve/autoscaling.yaml +++ b/docs/samples/v1beta1/torchserve/autoscaling/autoscaling.yaml @@ -1,10 +1,11 @@ -#For example, specify a “concurrency target” of “10”, the autoscaler will try to make sure that every replica receives on average 10 requests at a time. A target is always evaluated against a specified metric. +# For example, specify a “concurrency target” of “10”, the autoscaler will try to make sure that every replica receives on average 10 requests at a time. +# A target is always evaluated against a specified metric. 
apiVersion: "serving.kubeflow.org/v1beta1" kind: "InferenceService" metadata: name: "torchserve" annotations: - autoscaling.knative.dev/target: "5" + autoscaling.knative.dev/target: "10" spec: predictor: pytorch: diff --git a/docs/samples/v1beta1/torchserve/bert/README.md b/docs/samples/v1beta1/torchserve/bert/README.md new file mode 100644 index 000000000000..4de7995c7b6c --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/README.md @@ -0,0 +1,105 @@ +# TorchServe example with Huggingface bert model +In this example we will show how to serve [Huggingface Transformers with TorchServe](https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers) +on KFServing. + +## Model archive file creation + +Clone [pytorch/serve](https://github.com/pytorch/serve) repository, +navigate to `examples/Huggingface_Transformers` and follow the steps for creating the MAR file including serialized model and other dependent files. +TorchServe supports both eager model and torchscript and here we save as the pretrained model. + +```bash +torch-model-archiver --model-name BERTSeqClassification --version 1.0 \ +--serialized-file Transformer_model/pytorch_model.bin \ +--handler ./Transformer_handler_generalized.py \ +--extra-files "Transformer_model/config.json,./setup_config.json,./Seq_classification_artifacts/index_to_name.json" +``` + +## Create the InferenceService + +Apply the CRD + +```bash +kubectl apply -f bert.yaml +``` + +Expected Output + +```bash +$inferenceservice.serving.kubeflow.org/torchserve-bert created +``` + +## Run a prediction + +The first step is to [determine the ingress IP and ports](../../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` + +```bash +MODEL_NAME=torchserve-bert +SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -n -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/BERTSeqClassification:predict -d ./sample_text.txt +``` + +Expected Output + +```bash +* Trying 44.239.20.204... +* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com (44.239.20.204) port 80 (#0) +> PUT /v1/models/BERTSeqClassification:predict HTTP/1.1 +> Host: torchserve-bert.kfserving-test.example.com +> User-Agent: curl/7.47.0 +> Accept: */* +> Content-Length: 79 +> Expect: 100-continue +> +< HTTP/1.1 100 Continue +* We are completely uploaded and fine +< HTTP/1.1 200 OK +< cache-control: no-cache; no-store, must-revalidate, private +< content-length: 8 +< date: Wed, 04 Nov 2020 10:54:49 GMT +< expires: Thu, 01 Jan 1970 00:00:00 UTC +< pragma: no-cache +< x-request-id: 4b54d3ac-185f-444c-b344-b8a785fdeb50 +< x-envoy-upstream-service-time: 2085 +< server: istio-envoy +< +* Connection #0 to host torchserve-bert.kfserving-test.example.com left intact +Accepted +``` + +## Captum Explanations +In order to understand the word importances and attributions when we make an explanation Request, we use Captum Insights for the Hugginface Transformers pre-trained model. +```bash +MODEL_NAME=torchserve-bert +SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -n -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/BERTSeqClassification:explaine -d ./sample_text.txt +``` +Expected output +```bash +* Trying ::1:8080... 
+* Connected to localhost (::1) port 8080 (#0) +> POST /v1/models/BERTSeqClassification:explain HTTP/1.1 +> Host: torchserve-bert.default.example.com +> User-Agent: curl/7.73.0 +> Accept: */* +> Content-Length: 84 +> Content-Type: application/x-www-form-urlencoded +>Handling connection for 8080 + +* upload completely sent off: 84 out of 84 bytes +* Mark bundle as not supporting multiuse +< HTTP/1.1 200 OK +< content-length: 292 +< content-type: application/json; charset=UTF-8 +< date: Sun, 27 Dec 2020 05:53:52 GMT +< server: istio-envoy +< x-envoy-upstream-service-time: 5769 +< +* Connection #0 to host localhost left intact +{"explanations": [{"importances": [0.0, -0.6324463574494716, -0.033115653530477414, 0.2681695752722339, -0.29124745608778546, 0.5422589681903883, -0.3848768219546909, 0.0], +"words": ["[CLS]", "bloomberg", "has", "reported", "on", "the", "economy", "[SEP]"], "delta": -0.0007350619859377225}]} +``` + + diff --git a/docs/samples/v1beta1/torchserve/bert/bert.yaml b/docs/samples/v1beta1/torchserve/bert/bert.yaml new file mode 100644 index 000000000000..0c7d8ab1114e --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/bert.yaml @@ -0,0 +1,10 @@ +apiVersion: serving.kubeflow.org/v1beta1 +kind: InferenceService +metadata: + name: "torchserve-bert" +spec: + predictor: + pytorch: + protocolVersion: v2 + storageUri: gs://kfserving-examples/models/torchserve/huggingface + # storageUri: pvc://model-pv-claim diff --git a/docs/samples/v1beta1/torchserve/bert/config.properties b/docs/samples/v1beta1/torchserve/bert/config.properties new file mode 100644 index 000000000000..2f898c94337f --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/config.properties @@ -0,0 +1,6 @@ +inference_address=http://0.0.0.0:8085 +management_address=http://0.0.0.0:8081 +number_of_netty_threads=4 +job_queue_size=10 +model_store=/mnt/models/model-store +model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"bert":{"1.0":{"defaultVersion":true,"marName":"BERTSeqClassification.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}} diff --git a/docs/samples/v1beta1/torchserve/bert/sample_text.txt b/docs/samples/v1beta1/torchserve/bert/sample_text.txt new file mode 100644 index 000000000000..794139004c91 --- /dev/null +++ b/docs/samples/v1beta1/torchserve/bert/sample_text.txt @@ -0,0 +1,7 @@ +{ + "instances": [ + { + "data": "Bloomberg has reported on the economy" + } + ] +} diff --git a/docs/samples/v1beta1/torchserve/canary/README.md b/docs/samples/v1beta1/torchserve/canary/README.md new file mode 100644 index 000000000000..0ed53698d2ad --- /dev/null +++ b/docs/samples/v1beta1/torchserve/canary/README.md @@ -0,0 +1,125 @@ +# Canary Rollout + +## Create InferenceService with default model + +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" +spec: + predictor: + pytorch: + protocolVersion: v2 + storageUri: "gs://kfserving-examples/models/torchserve/image_classifier" +``` + +Apply the InferenceService + +```bash +kubectl apply -f torchserve.yaml +``` + +Expected Output + +```bash +$inferenceservice.serving.kubeflow.org/torchserve created +``` + +## Create InferenceService with canary model + +Change the `storageUri` for the new model version and apply the InferenceService + +```yaml +apiVersion: "serving.kubeflow.org/v1beta1" +kind: "InferenceService" +metadata: + name: "torchserve" +spec: + predictor: + canaryTrafficPercent: 20 + pytorch: + storageUri: 
"gs://kfserving-examples/models/torchserve/image_classifier/v2" +``` + +Apply the InferenceService + +```bash +kubectl apply -f canary.yaml +``` +You should now see two revisions created +```bash +kubectl get revisions -l serving.kubeflow.org/inferenceservice=torchserve +NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON +torchserve-predictor-default-9lttm torchserve-predictor-default torchserve-predictor-default-9lttm 1 True +torchserve-predictor-default-kxp96 torchserve-predictor-default torchserve-predictor-default-kxp96 2 True +``` + + +## Run a prediction + +The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` + +```bash +MODEL_NAME=mnist +SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3) + +curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -d @./mnist.json +``` + +Expected Output + +```bash +* Trying 52.89.19.61... +* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com (52.89.19.61) port 80 (#0) +> PUT /v1/models/mnist:predict HTTP/1.1 +> Host: torchserve.kfserving-test.example.com +> User-Agent: curl/7.47.0 +> Accept: */* +> Content-Length: 167 +> Expect: 100-continue +> +< HTTP/1.1 100 Continue +* We are completely uploaded and fine +< HTTP/1.1 200 OK +< cache-control: no-cache; no-store, must-revalidate, private +< content-length: 1 +< date: Tue, 27 Oct 2020 08:26:19 GMT +< expires: Thu, 01 Jan 1970 00:00:00 UTC +< pragma: no-cache +< x-request-id: b10cfc9f-cd0f-4cda-9c6c-194c2cdaa517 +< x-envoy-upstream-service-time: 6 +< server: istio-envoy +< +* Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact +{"predictions": ["2"]} +``` + +## Check the traffic split between the two revisions + +```bash +kubectl get pods -l serving.kubeflow.org/inferenceservice=torchserve +NAME READY STATUS RESTARTS AGE +torchserve-predictor-default-9lttm-deployment-7dd5cff4cb-tmmlc 2/2 Running 0 21m +torchserve-predictor-default-kxp96-deployment-5d949864df-bmzfk 2/2 Running 0 20m +``` + +Check the traffic split +```bash +kubectl get ksvc torchserve-predictor-default -oyaml + status: + address: + url: http://torchserve-predictor-default.default.svc.cluster.local + traffic: + - latestRevision: true + percent: 20 + revisionName: torchserve-predictor-default-kxp96 + tag: latest + url: http://latest-torchserve-predictor-default.default.example.com + - latestRevision: false + percent: 80 + revisionName: torchserve-predictor-default-9lttm + tag: prev + url: http://prev-torchserve-predictor-default.default.example.com + url: http://torchserve-predictor-default.default.example.com +``` diff --git a/docs/samples/v1beta1/torchserve/canary.yaml b/docs/samples/v1beta1/torchserve/canary/canary.yaml similarity index 95% rename from docs/samples/v1beta1/torchserve/canary.yaml rename to docs/samples/v1beta1/torchserve/canary/canary.yaml index 324499352241..174cf389c049 100644 --- a/docs/samples/v1beta1/torchserve/canary.yaml +++ b/docs/samples/v1beta1/torchserve/canary/canary.yaml @@ -7,7 +7,7 @@ spec: canaryTrafficPercent: 20 pytorch: protocolVersion: v2 - storageUri: gs://kfserving-examples/models/torchserve/image_classifier + storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v2 resources: limits: memory: 4Gi diff --git a/docs/samples/v1beta1/torchserve/config.properties 
b/docs/samples/v1beta1/torchserve/config.properties index 76e23f6a8f9c..6ad8fba744a1 100644 --- a/docs/samples/v1beta1/torchserve/config.properties +++ b/docs/samples/v1beta1/torchserve/config.properties @@ -1,6 +1,6 @@ -inference_address=http://0.0.0.0:8080 +inference_address=http://0.0.0.0:8085 management_address=http://0.0.0.0:8081 number_of_netty_threads=4 job_queue_size=10 model_store=/mnt/models/model-store -model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"mnist":{"1.0":{"defaultVersion":true,"marName":"mnist.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}} \ No newline at end of file +model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"mnist":{"1.0":{"defaultVersion":true,"marName":"mnist.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}} diff --git a/docs/samples/v1beta1/torchserve/docs/autoscaling.md b/docs/samples/v1beta1/torchserve/docs/autoscaling.md deleted file mode 100644 index 6a9ec0dc808a..000000000000 --- a/docs/samples/v1beta1/torchserve/docs/autoscaling.md +++ /dev/null @@ -1,65 +0,0 @@ -# Autoscaling - -## Deploymnet yaml - -For example, specify a “concurrency target” of “10”, the autoscaler will try to make sure that every replica receives on average 10 requests at a time. -By default the pod scale with concurrency metrics - -Refer [model archive file generation](../model-archiver/README.md) for PV, PVC storage and marfile creation. - -autoscaling.yaml - -```yaml -apiVersion: "serving.kubeflow.org/v1beta1" -kind: "InferenceService" -metadata: - name: "torchserve" - annotations: - autoscaling.knative.dev/target: "5" -spec: - predictor: - pytorch: - protocolVersion: v2 - storageUri: "pvc://model-pv-claim" -``` - -## Create the InferenceService - -Apply the CRD - -```bash -kubectl apply -f torchserve.yaml -``` - -Expected Output - -```bash -$inferenceservice.serving.kubeflow.org/torchserve created -``` - -## Run a prediction - -The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` - -Install hey load generator (go get -u github.com/rakyll/hey). - -```bash -MODEL_NAME=mnist -SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3) - -./hey -m POST -z 30s -D ./mnist.json -host ${SERVICE_HOSTNAME} http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -``` - -### Get Pods - -```bash -kubectl get pods -n kfserving-test - -NAME READY STATUS RESTARTS AGE -torchserve-predictor-default-cj2d8-deployment-69444c9c74-67qwb 2/2 Terminating 0 103s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-nnxk8 2/2 Terminating 0 95s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-rq8jq 2/2 Running 0 50m -torchserve-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2/2 Running 0 113s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2/2 Running 0 109s -torchserve-predictor-default-cj2d8-deployment-69444c9c74-xvn7t 2/2 Terminating 0 103s -``` diff --git a/docs/samples/v1beta1/torchserve/docs/canary.md b/docs/samples/v1beta1/torchserve/docs/canary.md deleted file mode 100644 index a11f346ff12c..000000000000 --- a/docs/samples/v1beta1/torchserve/docs/canary.md +++ /dev/null @@ -1,100 +0,0 @@ -# Canary Rollouts - -## Creating model storage with model archive file - -Refer [model archive file generation](../model-archiver/README.md) for PV, PVC storage and marfile creation. 
- -## Deployment yaml - -### Main model - -```yaml -apiVersion: "serving.kubeflow.org/v1beta1" -kind: "InferenceService" -metadata: - name: "torch-pred" -spec: - predictor: - pytorch: - protocolVersion: v2 - storageUri: "pvc://model-pv-claim" -``` - -### Canary model - -Change the path and deploy - -```yaml -apiVersion: "serving.kubeflow.org/v1beta1" -kind: "InferenceService" -metadata: - name: "torch-pred" -spec: - predictor: - canaryTrafficPercent: 20 - pytorch: - storageUri: "pvc://model-store-claim-1" -``` - -## Create the InferenceService - -Apply the CRD - -```bash -kubectl apply -f torchserve.yaml -``` - -Expected Output - -```bash -$inferenceservice.serving.kubeflow.org/torchserve created -``` - -## Run a prediction - -The first step is to [determine the ingress IP and ports](../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT` - -```bash -MODEL_NAME=mnist -SERVICE_HOSTNAME=$(kubectl get inferenceservice torch-pred -o jsonpath='{.status.url}' | cut -d "/" -f 3) - -curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -d @./mnist.json -``` - -Expected Output - -```bash -* Trying 52.89.19.61... -* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com (52.89.19.61) port 80 (#0) -> PUT /v1/models/mnist:predict HTTP/1.1 -> Host: torch-pred.kfserving-test.example.com -> User-Agent: curl/7.47.0 -> Accept: */* -> Content-Length: 167 -> Expect: 100-continue -> -< HTTP/1.1 100 Continue -* We are completely uploaded and fine -< HTTP/1.1 200 OK -< cache-control: no-cache; no-store, must-revalidate, private -< content-length: 1 -< date: Tue, 27 Oct 2020 08:26:19 GMT -< expires: Thu, 01 Jan 1970 00:00:00 UTC -< pragma: no-cache -< x-request-id: b10cfc9f-cd0f-4cda-9c6c-194c2cdaa517 -< x-envoy-upstream-service-time: 6 -< server: istio-envoy -< -* Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact -{"predictions": ["2"]} -``` - -### Get Pods - -```bash -kubectl get pods -n kfserving-test - -NAME READY STATUS RESTARTS AGE -torch-pred-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2/2 Running 0 113s -torch-pred-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2/2 Running 0 109s -``` diff --git a/docs/samples/v1beta1/torchserve/docs/metrics.md b/docs/samples/v1beta1/torchserve/metrics/README.md similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/metrics.md rename to docs/samples/v1beta1/torchserve/metrics/README.md diff --git a/docs/samples/v1beta1/torchserve/docs/images/grafana.png b/docs/samples/v1beta1/torchserve/metrics/images/grafana.png similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/images/grafana.png rename to docs/samples/v1beta1/torchserve/metrics/images/grafana.png diff --git a/docs/samples/v1beta1/torchserve/docs/images/prometheus.png b/docs/samples/v1beta1/torchserve/metrics/images/prometheus.png similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/images/prometheus.png rename to docs/samples/v1beta1/torchserve/metrics/images/prometheus.png diff --git a/docs/samples/v1beta1/torchserve/docs/images/prometheus_graph.png b/docs/samples/v1beta1/torchserve/metrics/images/prometheus_graph.png similarity index 100% rename from docs/samples/v1beta1/torchserve/docs/images/prometheus_graph.png rename to docs/samples/v1beta1/torchserve/metrics/images/prometheus_graph.png diff --git a/docs/samples/v1beta1/torchserve/metrics.yaml 
b/docs/samples/v1beta1/torchserve/metrics/metrics.yaml similarity index 100% rename from docs/samples/v1beta1/torchserve/metrics.yaml rename to docs/samples/v1beta1/torchserve/metrics/metrics.yaml diff --git a/docs/samples/v1beta1/torchserve/model-archiver/README.md b/docs/samples/v1beta1/torchserve/model-archiver/README.md index 57bd04c70799..e79c11dec3b2 100644 --- a/docs/samples/v1beta1/torchserve/model-archiver/README.md +++ b/docs/samples/v1beta1/torchserve/model-archiver/README.md @@ -7,7 +7,7 @@ ## 1. Create PV and PVC -Create a Persistent volume and volume claim. This document uses amazonEBS PV. For AWS EFS storage refer [AWS EFS storage](https://github.com/pytorch/serve/blob/master/kubernetes/EKS/README.md#setup-persistentvolume-backed-by-efs) +Create a Persistent volume and volume claim. This document uses amazonEBS PV. For AWS EFS storage you can refer to [AWS EFS storage](https://github.com/pytorch/serve/blob/master/kubernetes/EKS/README.md#setup-persistentvolume-backed-by-efs) ### 1.1 Create PV diff --git a/docs/samples/v1beta1/torchserve/pv.yaml b/docs/samples/v1beta1/torchserve/pv.yaml deleted file mode 100644 index 6e4511d8749c..000000000000 --- a/docs/samples/v1beta1/torchserve/pv.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: model-pv-volume - labels: - type: "amazonEBS" -spec: - capacity: - storage: 5Gi - accessModes: - - ReadWriteOnce - awsElasticBlockStore: - volumeID: {volume-id} #vol-074ea8934f7b80df5 - fsType: ext4 diff --git a/docs/samples/v1beta1/torchserve/pvc.yaml b/docs/samples/v1beta1/torchserve/pvc.yaml deleted file mode 100644 index da72c56b84fc..000000000000 --- a/docs/samples/v1beta1/torchserve/pvc.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: model-pv-claim - labels: - type: "amazonEBS" -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 1Gi diff --git a/docs/samples/v1beta1/torchserve/pvpod.yaml b/docs/samples/v1beta1/torchserve/pvpod.yaml deleted file mode 100644 index baa71fdb0fa4..000000000000 --- a/docs/samples/v1beta1/torchserve/pvpod.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: model-store-pod -spec: - volumes: - - name: model-store - persistentVolumeClaim: - claimName: model-pv-claim - containers: - - name: model-store - image: ubuntu - command: [ "sleep" ] - args: [ "infinity" ] - volumeMounts: - - mountPath: "/pv" - name: model-store - resources: - limits: - memory: "4Gi" - cpu: "2"