From 4ab5c7ca5c053625c4632351392be14cccb6ef71 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Mon, 12 Jul 2021 12:01:10 +0200 Subject: [PATCH 01/42] Initial kubebuilder scaffolding --- pkg/crds/PROJECT | 11 +- .../apis/api/v1alpha1/groupversion_info.go | 36 ++++++ .../apis/api/v1alpha1/realtimeapi_types.go | 54 +++++++++ .../api/v1alpha1/zz_generated.deepcopy.go | 114 ++++++++++++++++++ pkg/crds/config/crd/kustomization.yaml | 3 + .../controllers/api/realtimeapi_controller.go | 63 ++++++++++ pkg/crds/controllers/api/suite_test.go | 80 ++++++++++++ pkg/crds/main.go | 11 ++ 8 files changed, 371 insertions(+), 1 deletion(-) create mode 100644 pkg/crds/apis/api/v1alpha1/groupversion_info.go create mode 100644 pkg/crds/apis/api/v1alpha1/realtimeapi_types.go create mode 100644 pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go create mode 100644 pkg/crds/controllers/api/realtimeapi_controller.go create mode 100644 pkg/crds/controllers/api/suite_test.go diff --git a/pkg/crds/PROJECT b/pkg/crds/PROJECT index 97b1925b84..01bbb7d1e9 100644 --- a/pkg/crds/PROJECT +++ b/pkg/crds/PROJECT @@ -3,7 +3,7 @@ layout: - go.kubebuilder.io/v3 multigroup: true projectName: operator -repo: github.com/cortexlabs/cortex +repo: github.com/cortexlabs/cortex/pkg/crds resources: - api: crdVersion: v1 @@ -14,4 +14,13 @@ resources: kind: BatchJob path: github.com/cortexlabs/cortex/pkg/crds/apis/batch/v1alpha1 version: v1alpha1 +- api: + crdVersion: v1 + namespaced: true + controller: true + domain: cortex.dev + group: api + kind: RealtimeAPI + path: github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1 + version: v1alpha1 version: "3" diff --git a/pkg/crds/apis/api/v1alpha1/groupversion_info.go b/pkg/crds/apis/api/v1alpha1/groupversion_info.go new file mode 100644 index 0000000000..3625dc1527 --- /dev/null +++ b/pkg/crds/apis/api/v1alpha1/groupversion_info.go @@ -0,0 +1,36 @@ +/* +Copyright 2021 Cortex Labs, Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha1 contains API Schema definitions for the api v1alpha1 API group +//+kubebuilder:object:generate=true +//+groupName=api.cortex.dev +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects + GroupVersion = schema.GroupVersion{Group: "api.cortex.dev", Version: "v1alpha1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go new file mode 100644 index 0000000000..91802889ce --- /dev/null +++ b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go @@ -0,0 +1,54 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// RealtimeAPISpec defines the desired state of RealtimeAPI +type RealtimeAPISpec struct { +} + +// RealtimeAPIStatus defines the observed state of RealtimeAPI +type RealtimeAPIStatus struct { +} + +//+kubebuilder:object:root=true +//+kubebuilder:subresource:status + +// RealtimeAPI is the Schema for the realtimeapis API +type RealtimeAPI struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec RealtimeAPISpec `json:"spec,omitempty"` + Status RealtimeAPIStatus `json:"status,omitempty"` +} + +//+kubebuilder:object:root=true + +// RealtimeAPIList contains a list of RealtimeAPI +type RealtimeAPIList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []RealtimeAPI `json:"items"` +} + +func init() { + SchemeBuilder.Register(&RealtimeAPI{}, &RealtimeAPIList{}) +} diff --git a/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go b/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000000..852f080173 --- /dev/null +++ b/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,114 @@ +// +build !ignore_autogenerated + +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. 
+ +package v1alpha1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RealtimeAPI) DeepCopyInto(out *RealtimeAPI) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + out.Status = in.Status +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RealtimeAPI. +func (in *RealtimeAPI) DeepCopy() *RealtimeAPI { + if in == nil { + return nil + } + out := new(RealtimeAPI) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RealtimeAPI) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RealtimeAPIList) DeepCopyInto(out *RealtimeAPIList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]RealtimeAPI, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RealtimeAPIList. +func (in *RealtimeAPIList) DeepCopy() *RealtimeAPIList { + if in == nil { + return nil + } + out := new(RealtimeAPIList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RealtimeAPIList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *RealtimeAPISpec) DeepCopyInto(out *RealtimeAPISpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RealtimeAPISpec. +func (in *RealtimeAPISpec) DeepCopy() *RealtimeAPISpec { + if in == nil { + return nil + } + out := new(RealtimeAPISpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RealtimeAPIStatus) DeepCopyInto(out *RealtimeAPIStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RealtimeAPIStatus. +func (in *RealtimeAPIStatus) DeepCopy() *RealtimeAPIStatus { + if in == nil { + return nil + } + out := new(RealtimeAPIStatus) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/crds/config/crd/kustomization.yaml b/pkg/crds/config/crd/kustomization.yaml index 73e33703bb..59e4b92a53 100644 --- a/pkg/crds/config/crd/kustomization.yaml +++ b/pkg/crds/config/crd/kustomization.yaml @@ -3,17 +3,20 @@ # It should be run by config/default resources: - bases/batch.cortex.dev_batchjobs.yaml +- bases/api.cortex.dev_realtimeapis.yaml #+kubebuilder:scaffold:crdkustomizeresource patchesStrategicMerge: # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. # patches here are for enabling the conversion webhook for each CRD #- patches/webhook_in_batchjobs.yaml +#- patches/webhook_in_realtimeapis.yaml #+kubebuilder:scaffold:crdkustomizewebhookpatch # [CERTMANAGER] To enable webhook, uncomment all the sections with [CERTMANAGER] prefix. # patches here are for enabling the CA injection for each CRD #- patches/cainjection_in_batchjobs.yaml +#- patches/cainjection_in_realtimeapis.yaml #+kubebuilder:scaffold:crdkustomizecainjectionpatch # the following config is for teaching kustomize how to do kustomization for CRDs. 
diff --git a/pkg/crds/controllers/api/realtimeapi_controller.go b/pkg/crds/controllers/api/realtimeapi_controller.go new file mode 100644 index 0000000000..f891b922c2 --- /dev/null +++ b/pkg/crds/controllers/api/realtimeapi_controller.go @@ -0,0 +1,63 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package api + +import ( + "context" + + "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" +) + +// RealtimeAPIReconciler reconciles a RealtimeAPI object +type RealtimeAPIReconciler struct { + client.Client + Log logr.Logger + Scheme *runtime.Scheme +} + +//+kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/finalizers,verbs=update + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// TODO(user): Modify the Reconcile function to compare the state specified by +// the RealtimeAPI object against the actual cluster state, and then +// perform operations to make the cluster state reflect the state specified by +// the user. 
+// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.7.2/pkg/reconcile +func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + _ = r.Log.WithValues("realtimeapi", req.NamespacedName) + + // your logic here + + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *RealtimeAPIReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&apiv1alpha1.RealtimeAPI{}). + Complete(r) +} diff --git a/pkg/crds/controllers/api/suite_test.go b/pkg/crds/controllers/api/suite_test.go new file mode 100644 index 0000000000..134a7234a8 --- /dev/null +++ b/pkg/crds/controllers/api/suite_test.go @@ -0,0 +1,80 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package api + +import ( + "path/filepath" + "testing" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + "sigs.k8s.io/controller-runtime/pkg/envtest/printer" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" + //+kubebuilder:scaffold:imports +) + +// These tests use Ginkgo (BDD-style Go testing framework). 
Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var cfg *rest.Config +var k8sClient client.Client +var testEnv *envtest.Environment + +func TestAPIs(t *testing.T) { + RegisterFailHandler(Fail) + + RunSpecsWithDefaultAndCustomReporters(t, + "Controller Suite", + []Reporter{printer.NewlineReporter{}}) +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + } + + cfg, err := testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + err = apiv1alpha1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + //+kubebuilder:scaffold:scheme + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + +}, 60) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) diff --git a/pkg/crds/main.go b/pkg/crds/main.go index ee8c0c476b..90502d08bc 100644 --- a/pkg/crds/main.go +++ b/pkg/crds/main.go @@ -41,7 +41,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" + apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" batch "github.com/cortexlabs/cortex/pkg/crds/apis/batch/v1alpha1" + apicontrollers "github.com/cortexlabs/cortex/pkg/crds/controllers/api" batchcontrollers "github.com/cortexlabs/cortex/pkg/crds/controllers/batch" //+kubebuilder:scaffold:imports ) @@ -55,6 +57,7 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(batch.AddToScheme(scheme)) + utilruntime.Must(apiv1alpha1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -160,6 +163,14 @@ func main() { setupLog.Error(err, "unable to create 
controller", "controller", "BatchJob") os.Exit(1) } + if err = (&apicontrollers.RealtimeAPIReconciler{ + Client: mgr.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("api").WithName("RealtimeAPI"), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "RealtimeAPI") + os.Exit(1) + } //+kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { From 20b7a6ffc9fce0a822b521c9432e641bba53af44 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Mon, 12 Jul 2021 17:06:01 +0200 Subject: [PATCH 02/42] Populate RealtimeAPI CRD types and add kubebuilder validation annotations --- .../apis/api/v1alpha1/realtimeapi_types.go | 187 ++++++ .../bases/api.cortex.dev_realtimeapis.yaml | 615 ++++++++++++++++++ .../patches/cainjection_in_realtimeapis.yaml | 7 + .../crd/patches/webhook_in_realtimeapis.yaml | 14 + .../config/rbac/realtimeapi_editor_role.yaml | 24 + .../config/rbac/realtimeapi_viewer_role.yaml | 20 + pkg/crds/config/rbac/role.yaml | 26 + .../samples/api_v1alpha1_realtimeapi.yaml | 7 + 8 files changed, 900 insertions(+) create mode 100644 pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml create mode 100644 pkg/crds/config/crd/patches/cainjection_in_realtimeapis.yaml create mode 100644 pkg/crds/config/crd/patches/webhook_in_realtimeapis.yaml create mode 100644 pkg/crds/config/rbac/realtimeapi_editor_role.yaml create mode 100644 pkg/crds/config/rbac/realtimeapi_viewer_role.yaml create mode 100644 pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml diff --git a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go index 91802889ce..e61255039f 100644 --- a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go @@ -17,15 +17,202 @@ limitations under the License. 
package v1alpha1 import ( + "time" + + "github.com/cortexlabs/cortex/pkg/types/status" + kcore "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" ) // RealtimeAPISpec defines the desired state of RealtimeAPI type RealtimeAPISpec struct { + // Pod configuration + // +kubebuilder:validation:Required + Pod PodSpec `json:"pod"` + + // +kubebuilder:validation:Optional + // Autoscaling configuration + Autoscaling AutoscalingSpec `json:"autoscaling"` + + // +kubebuilder:validation:Optional + // List of node groups on which this API can run (default: all node groups are eligible) + NodeGroups []string `json:"node_groups,omitempty"` + + // +kubebuilder:validation:Optional + // Deployment strategy to use when replacing existing replicas with new ones + UpdateStrategy UpdateStratagySpec `json:"update_strategy"` + + // +kubebuilder:validation:Optional + // Networking configuration + Networking NetworkingSpec `json:"networking"` +} + +type PodSpec struct { + // +kubebuilder:validation:Optional + // +kubebuilder:default=8080 + // Port to which requests will be sent to + Port int `json:"port"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default=1 + // Maximum number of requests that will be concurrently sent into the container + MaxConcurrency int `json:"max_concurrency"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default=100 + // Maximum number of requests per replica which will be queued + // (beyond max_concurrency) before requests are rejected with error code 503 + MaxQueueLength int `json:"max_queue_length"` + + // +kubebuilder:validation:Required + // Configurations for the containers to run + Containers []ContainerSpec `json:"containers"` +} + +type ContainerSpec struct { + // +kubebuilder:validation:Required + // Name of the container + Name string `json:"name"` + + // +kubebuilder:validation:Required + // Docker image to use for the container + 
Image string `json:"image"` + + // +kubebuilder:validation:Optional + // Entrypoint (not executed within a shell) + Command []string `json:"command,omitempty"` + + // +kubebuilder:validation:Optional + // Arguments to the entrypoint + Args []string `json:"args,omitempty"` + + // +kubebuilder:validation:Optional + // Environment variables to set in the container + Env []kcore.EnvVar `json:"env,omitempty"` + + // +kubebuilder:validation:Optional + // Compute resource requests + Compute *ComputeSpec `json:"compute,omitempty"` + + // +kubebuilder:validation:Optional + // Periodic probe of container readiness; + // traffic will not be sent into the pod unless all containers' readiness probes are succeeding + ReadinessProbe *kcore.Probe `json:"readiness_probe,omitempty"` + + // +kubebuilder:validation:Optional + // Periodic probe of container liveness; container will be restarted if the probe fails + LivenessProbe *kcore.Probe `json:"liveness_probe,omitempty"` +} + +type ComputeSpec struct { + // +kubebuilder:validation:Optional + // CPU request for the container; one unit of CPU corresponds to one virtual CPU; + // fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix + CPU *resource.Quantity `json:"cpu,omitempty"` + + // +kubebuilder:validation:Optional + // GPU request for the container; one unit of GPU corresponds to one virtual GPU + GPU int `json:"gpu,omitempty"` + + // +kubebuilder:validation:Optional + // Memory request for the container; + // one unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T + // (or their power-of two counterparts: Ki, Mi, Gi, Ti) + Mem *resource.Quantity `json:"mem,omitempty"` + + // +kubebuilder:validation:Optional + // Size of shared memory (/dev/shm) for sharing data between multiple processes + Shm *resource.Quantity `json:"shm,omitempty"` +} + +type AutoscalingSpec struct { + // +kubebuilder:validation:Optional + // 
+kubebuilder:default=1 + // Minimum number of replicas + MinReplicas int `json:"min_replicas,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default=100 + // Maximum number of replicas + MaxReplicas int `json:"max_replicas,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default=1 + // Initial number of replicas + InitReplicas int `json:"init_replicas,omitempty"` + + // +kubebuilder:validation:Optional + // Desired number of in-flight requests per replica (including requests actively being processed as well as queued), + // which the autoscaler tries to maintain + TargetInFlight int `json:"target_in_flight,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="60s" + // Duration over which to average the API's in-flight requests per replica + Window time.Duration `json:"window,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="5m" + // The API will not scale below the highest recommendation made during this period + DownscaleStabilizationPeriod time.Duration `json:"downscale_stabilization_period,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="1m" + // The API will not scale above the lowest recommendation made during this period + UpscaleStabilizationPeriod time.Duration `json:"upscale_stabilization_period,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="750m" + // Maximum factor by which to scale down the API on a single scaling event + MaxDownscaleFactor resource.Quantity `json:"max_downscale_factor,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="1500m" + // Maximum factor by which to scale up the API on a single scaling event + MaxUpscaleFactor resource.Quantity `json:"max_upscale_factor,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="50m" + // Any recommendation falling within this factor below the current number of replicas will not trigger a + 
// scale down event + DownscaleTolerance resource.Quantity `json:"downscale_tolerance,omitempty"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="50m" + // Any recommendation falling within this factor above the current number of replicas will not trigger a scale up event + UpscaleTolerance resource.Quantity `json:"upscale_tolerance,omitempty"` +} + +type UpdateStratagySpec struct { + // +kubebuilder:validation:Optional + // +kubebuilder:default="25%" + // Maximum number of replicas that can be scheduled above the desired number of replicas during an update; + // can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) + // (set to 0 to disable rolling updates) + MaxSurge intstr.IntOrString `json:"max_surge"` + + // +kubebuilder:validation:Optional + // +kubebuilder:default="25%" + // maximum number of replicas that can be unavailable during an update; can be an absolute number, + // e.g. 5, or a percentage of desired replicas, e.g. 
10% + MaxUnavailable intstr.IntOrString `json:"max_unavailable"` +} + +type NetworkingSpec struct { + // +kubebuilder:validation:Optional + // Endpoint for the API + Endpoint string `json:"endpoint,omitempty"` } // RealtimeAPIStatus defines the observed state of RealtimeAPI type RealtimeAPIStatus struct { + Status status.Code `json:"status"` + DesiredReplicas int `json:"desired_replicas"` + CurrentReplicas int `json:"current_replicas"` + ReadyReplicas int `json:"ready_replicas"` + Endpoint string `json:"endpoint,omitempty"` } //+kubebuilder:object:root=true diff --git a/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml new file mode 100644 index 0000000000..5fc4fbd7c8 --- /dev/null +++ b/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml @@ -0,0 +1,615 @@ + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.4.1 + creationTimestamp: null + name: realtimeapis.api.cortex.dev +spec: + group: api.cortex.dev + names: + kind: RealtimeAPI + listKind: RealtimeAPIList + plural: realtimeapis + singular: realtimeapi + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: RealtimeAPI is the Schema for the realtimeapis API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: RealtimeAPISpec defines the desired state of RealtimeAPI + properties: + autoscaling: + description: Autoscaling configuration + properties: + downscale_stabilization_period: + default: 5m + description: The API will not scale below the highest recommendation + made during this period + format: int64 + type: integer + downscale_tolerance: + anyOf: + - type: integer + - type: string + default: 50m + description: Any recommendation falling within this factor below + the current number of replicas will not trigger a scale down + event + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + init_replicas: + default: 1 + description: Initial number of replicas + type: integer + max_downscale_factor: + anyOf: + - type: integer + - type: string + default: 750m + description: Maximum factor by which to scale down the API on + a single scaling event + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + max_replicas: + default: 100 + description: Maximum number of replicas + type: integer + max_upscale_factor: + anyOf: + - type: integer + - type: string + default: 1500m + description: Maximum factor by which to scale up the API on a + single scaling event + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + min_replicas: + default: 1 + description: Minimum number of replicas + type: integer + target_in_flight: + description: Desired number of in-flight requests per replica + (including requests actively being processed as well as queued), + which the autoscaler tries to maintain + type: 
integer + upscale_stabilization_period: + default: 1m + description: The API will not scale above the lowest recommendation + made during this period + format: int64 + type: integer + upscale_tolerance: + anyOf: + - type: integer + - type: string + default: 50m + description: Any recommendation falling within this factor above + the current number of replicas will not trigger a scale up event + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + window: + default: 60s + description: Duration over which to average the API's in-flight + requests per replica + format: int64 + type: integer + type: object + networking: + description: Networking configuration + properties: + endpoint: + description: Endpoint for the API + type: string + type: object + node_groups: + description: 'List of node groups on which this API can run (default: + all node groups are eligible)' + items: + type: string + type: array + pod: + description: Pod configuration + properties: + containers: + description: Configurations for the containers to run + items: + properties: + args: + description: Arguments to the entrypoint + items: + type: string + type: array + command: + description: Entrypoint (not executed within a shell) + items: + type: string + type: array + compute: + description: Compute resource requests + properties: + cpu: + anyOf: + - type: integer + - type: string + description: CPU request for the container; one unit + of CPU corresponds to one virtual CPU; fractional + requests are allowed, and can be specified as a floating + point number or via the "m" suffix + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + gpu: + description: GPU request for the container; one unit + of GPU corresponds to one virtual GPU + type: integer + mem: + anyOf: + - type: integer + - 
type: string + description: 'Memory request for the container; one + unit of memory is one byte and can be expressed as + an integer or by using one of these suffixes: K, M, + G, T (or their power-of two counterparts: Ki, Mi, + Gi, Ti)' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + shm: + anyOf: + - type: integer + - type: string + description: Size of shared memory (/dev/shm) for sharing + data between multiple processes + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + env: + description: Environment variables to set in the container + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. Must + be a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are + expanded using the previous defined environment + variables in the container and any service environment + variables. If a variable cannot be resolved, the + reference in the input string will be unchanged. + The $(VAR_NAME) syntax can be escaped with a double + $$, ie: $$(VAR_NAME). Escaped references will never + be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's + value. Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field of the pod: supports + metadata.name, metadata.namespace, `metadata.labels['''']`, + `metadata.annotations['''']`, spec.nodeName, + spec.serviceAccountName, status.hostIP, status.podIP, + status.podIPs.' + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in + the specified API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource of the container: + only resources limits and requests (limits.cpu, + limits.memory, limits.ephemeral-storage, requests.cpu, + requests.memory and requests.ephemeral-storage) + are currently supported.' + properties: + containerName: + description: 'Container name: required for + volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of + the exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of a secret in the + pod's namespace + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + image: + description: Docker image to use for the container + type: string + liveness_probe: + description: Periodic probe of container liveness; container + will be restarted if the probe fails + properties: + exec: + description: One and only one of the following should + be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's + filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, you need + to explicitly call out to that shell. Exit status + of 0 is treated as live/healthy and non-zero is + unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe + to be considered failed after having succeeded. Defaults + to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. 
+ type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the + host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe + to be considered successful after having failed. Defaults + to 1. Must be 1 for liveness and startup. Minimum + value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving + a TCP port. TCP hooks not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, + defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + timeoutSeconds: + description: 'Number of seconds after which the probe + times out. Defaults to 1 second. Minimum value is + 1. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container + type: string + readiness_probe: + description: Periodic probe of container readiness; traffic + will not be sent into the pod unless all containers' readiness + probes are succeeding + properties: + exec: + description: One and only one of the following should + be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's + filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, you need + to explicitly call out to that shell. Exit status + of 0 is treated as live/healthy and non-zero is + unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe + to be considered failed after having succeeded. Defaults + to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. 
+ type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the + host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe + to be considered successful after having failed. Defaults + to 1. Must be 1 for liveness and startup. Minimum + value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving + a TCP port. TCP hooks not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, + defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + timeoutSeconds: + description: 'Number of seconds after which the probe + times out. Defaults to 1 second. Minimum value is + 1. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + required: + - image + - name + type: object + type: array + max_concurrency: + default: 1 + description: Maximum number of requests that will be concurrently + sent into the container + type: integer + max_queue_length: + default: 100 + description: Maximum number of requests per replica which will + be queued (beyond max_concurrency) before requests are rejected + with error code 503 + type: integer + port: + default: 8080 + description: Port to which requests will be sent to + type: integer + required: + - containers + type: object + update_strategy: + description: Deployment strategy to use when replacing existing replicas + with new ones + properties: + max_surge: + anyOf: + - type: integer + - type: string + default: 25% + description: 'Maximum number of replicas that can be scheduled + above the desired number of replicas during an update; can be + an absolute number, e.g. 5, or a percentage of desired replicas, + e.g. 10% (default: 25%) (set to 0 to disable rolling updates)' + x-kubernetes-int-or-string: true + max_unavailable: + anyOf: + - type: integer + - type: string + default: 25% + description: maximum number of replicas that can be unavailable + during an update; can be an absolute number, e.g. 5, or a percentage + of desired replicas, e.g. 
10% + x-kubernetes-int-or-string: true + type: object + required: + - pod + type: object + status: + description: RealtimeAPIStatus defines the observed state of RealtimeAPI + properties: + current_replicas: + type: integer + desired_replicas: + type: integer + endpoint: + type: string + ready_replicas: + type: integer + status: + type: integer + required: + - current_replicas + - desired_replicas + - ready_replicas + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/pkg/crds/config/crd/patches/cainjection_in_realtimeapis.yaml b/pkg/crds/config/crd/patches/cainjection_in_realtimeapis.yaml new file mode 100644 index 0000000000..a1311cf904 --- /dev/null +++ b/pkg/crds/config/crd/patches/cainjection_in_realtimeapis.yaml @@ -0,0 +1,7 @@ +# The following patch adds a directive for certmanager to inject CA into the CRD +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) + name: realtimeapis.api.cortex.dev diff --git a/pkg/crds/config/crd/patches/webhook_in_realtimeapis.yaml b/pkg/crds/config/crd/patches/webhook_in_realtimeapis.yaml new file mode 100644 index 0000000000..4ee0f5880c --- /dev/null +++ b/pkg/crds/config/crd/patches/webhook_in_realtimeapis.yaml @@ -0,0 +1,14 @@ +# The following patch enables a conversion webhook for the CRD +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: realtimeapis.api.cortex.dev +spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + namespace: system + name: webhook-service + path: /convert diff --git a/pkg/crds/config/rbac/realtimeapi_editor_role.yaml b/pkg/crds/config/rbac/realtimeapi_editor_role.yaml new file mode 100644 index 0000000000..34e836e2e9 --- /dev/null +++ 
b/pkg/crds/config/rbac/realtimeapi_editor_role.yaml @@ -0,0 +1,24 @@ +# permissions for end users to edit realtimeapis. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: realtimeapi-editor-role +rules: +- apiGroups: + - api.cortex.dev + resources: + - realtimeapis + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - api.cortex.dev + resources: + - realtimeapis/status + verbs: + - get diff --git a/pkg/crds/config/rbac/realtimeapi_viewer_role.yaml b/pkg/crds/config/rbac/realtimeapi_viewer_role.yaml new file mode 100644 index 0000000000..004387bf35 --- /dev/null +++ b/pkg/crds/config/rbac/realtimeapi_viewer_role.yaml @@ -0,0 +1,20 @@ +# permissions for end users to view realtimeapis. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: realtimeapi-viewer-role +rules: +- apiGroups: + - api.cortex.dev + resources: + - realtimeapis + verbs: + - get + - list + - watch +- apiGroups: + - api.cortex.dev + resources: + - realtimeapis/status + verbs: + - get diff --git a/pkg/crds/config/rbac/role.yaml b/pkg/crds/config/rbac/role.yaml index 4b64fb36ab..f8b89211e5 100644 --- a/pkg/crds/config/rbac/role.yaml +++ b/pkg/crds/config/rbac/role.yaml @@ -23,6 +23,32 @@ rules: - get - list - watch +- apiGroups: + - api.cortex.dev + resources: + - realtimeapis + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - api.cortex.dev + resources: + - realtimeapis/finalizers + verbs: + - update +- apiGroups: + - api.cortex.dev + resources: + - realtimeapis/status + verbs: + - get + - patch + - update - apiGroups: - batch resources: diff --git a/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml b/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml new file mode 100644 index 0000000000..638b3039f3 --- /dev/null +++ b/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml @@ -0,0 +1,7 @@ +apiVersion: api.cortex.dev/v1alpha1 +kind: RealtimeAPI 
+metadata: + name: realtimeapi-sample +spec: + # Add fields here + foo: bar From 0613ff3e6cd983a10f800ef6f33b60ea3ad4cb19 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Thu, 15 Jul 2021 11:42:48 +0200 Subject: [PATCH 03/42] Initial implementation of the realtime api controller --- .../apis/api/v1alpha1/realtimeapi_types.go | 28 +-- .../controllers/api/realtimeapi_controller.go | 195 ++++++++++++++++-- pkg/types/status/code.go | 3 + 3 files changed, 200 insertions(+), 26 deletions(-) diff --git a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go index e61255039f..4ff5bbe074 100644 --- a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go @@ -50,22 +50,27 @@ type RealtimeAPISpec struct { } type PodSpec struct { - // +kubebuilder:validation:Optional + // +kubebuilder:validation:Required // +kubebuilder:default=8080 // Port to which requests will be sent to Port int `json:"port"` - // +kubebuilder:validation:Optional + // +kubebuilder:validation:Required // +kubebuilder:default=1 // Maximum number of requests that will be concurrently sent into the container MaxConcurrency int `json:"max_concurrency"` - // +kubebuilder:validation:Optional + // +kubebuilder:validation:Required // +kubebuilder:default=100 // Maximum number of requests per replica which will be queued // (beyond max_concurrency) before requests are rejected with error code 503 MaxQueueLength int `json:"max_queue_length"` + // +kubebuilder:validation:Required + // +kubebuilder:default=1 + // Number of desired replicas + Replicas int32 `json:"replicas"` + // +kubebuilder:validation:Required // Configurations for the containers to run Containers []ContainerSpec `json:"containers"` @@ -131,22 +136,17 @@ type AutoscalingSpec struct { // +kubebuilder:validation:Optional // +kubebuilder:default=1 // Minimum number of replicas - MinReplicas int `json:"min_replicas,omitempty"` + MinReplicas int32 
`json:"min_replicas,omitempty"` // +kubebuilder:validation:Optional // +kubebuilder:default=100 // Maximum number of replicas - MaxReplicas int `json:"max_replicas,omitempty"` - - // +kubebuilder:validation:Optional - // +kubebuilder:default=1 - // Initial number of replicas - InitReplicas int `json:"init_replicas,omitempty"` + MaxReplicas int32 `json:"max_replicas,omitempty"` // +kubebuilder:validation:Optional // Desired number of in-flight requests per replica (including requests actively being processed as well as queued), // which the autoscaler tries to maintain - TargetInFlight int `json:"target_in_flight,omitempty"` + TargetInFlight int32 `json:"target_in_flight,omitempty"` // +kubebuilder:validation:Optional // +kubebuilder:default="60s" @@ -209,9 +209,9 @@ type NetworkingSpec struct { // RealtimeAPIStatus defines the observed state of RealtimeAPI type RealtimeAPIStatus struct { Status status.Code `json:"status"` - DesiredReplicas int `json:"desired_replicas"` - CurrentReplicas int `json:"current_replicas"` - ReadyReplicas int `json:"ready_replicas"` + DesiredReplicas int32 `json:"desired_replicas"` + CurrentReplicas int32 `json:"current_replicas"` + ReadyReplicas int32 `json:"ready_replicas"` Endpoint string `json:"endpoint,omitempty"` } diff --git a/pkg/crds/controllers/api/realtimeapi_controller.go b/pkg/crds/controllers/api/realtimeapi_controller.go index f891b922c2..3b3055fa4c 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller.go +++ b/pkg/crds/controllers/api/realtimeapi_controller.go @@ -18,11 +18,23 @@ package api import ( "context" + "fmt" + "github.com/cortexlabs/cortex/pkg/consts" + "github.com/cortexlabs/cortex/pkg/crds/controllers" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/workloads" "github.com/go-logr/logr" + istionetworking "istio.io/client-go/pkg/apis/networking/v1beta1" + kapps "k8s.io/api/apps/v1" + kcore "k8s.io/api/core/v1" + kerrors 
"k8s.io/apimachinery/pkg/api/errors" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" ) @@ -34,23 +46,65 @@ type RealtimeAPIReconciler struct { Scheme *runtime.Scheme } -//+kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/status,verbs=get;update;patch -//+kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/finalizers,verbs=update +// +kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/finalizers,verbs=update +// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch +// +kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=get;list;watch;create;update;patch +// +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;patch +// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the RealtimeAPI object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. 
-// -// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.7.2/pkg/reconcile func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - _ = r.Log.WithValues("realtimeapi", req.NamespacedName) + log := r.Log.WithValues("realtimeapi", req.NamespacedName) - // your logic here + // Step 1: get resource from request + api := apiv1alpha1.RealtimeAPI{} + log.V(1).Info("retrieving resource") + if err := r.Get(ctx, req.NamespacedName, &api); err != nil { + if !kerrors.IsNotFound(err) { + log.Error(err, "failed to retrieve resource") + } + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Step 2: Update status + log.V(1).Info("getting deployment") + deployment, err := r.getDeployment(ctx, api) + if err != nil { + log.Error(err, "failed to get deployment") + return ctrl.Result{}, err + } + + log.V(1).Info("updating status") + if err = r.updateStatus(ctx, &api, deployment); err != nil { + if controllers.IsOptimisticLockError(err) { + log.Info("conflict during status update, retrying") + return ctrl.Result{Requeue: true}, nil + } + log.Error(err, "failed to update status") + return ctrl.Result{}, err + } + + // Step 3: Create or Update Resources + deployOp, err := r.createOrUpdateDeployment(ctx, api) + if err != nil { + return ctrl.Result{}, err + } + log.V(1).Info(fmt.Sprintf("deployment %s", deployOp)) + + svcOp, err := r.createOrUpdateService(ctx, api) + if err != nil { + return ctrl.Result{}, err + } + log.V(1).Info(fmt.Sprintf("service %s", svcOp)) + + vsOp, err := r.createOrUpdateVirtualService(ctx, api) + if err != nil { + return ctrl.Result{}, err + } + log.V(1).Info(fmt.Sprintf("virtual service %s", vsOp)) return ctrl.Result{}, nil } @@ -59,5 +113,122 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) func (r *RealtimeAPIReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). 
For(&apiv1alpha1.RealtimeAPI{}). + Owns(&kapps.Deployment{}). + Owns(&kcore.Service{}). + Owns(&istionetworking.VirtualService{}). Complete(r) } + +func (r *RealtimeAPIReconciler) getDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (*kapps.Deployment, error) { + req := client.ObjectKey{Namespace: api.Namespace, Name: workloads.K8sName(api.Name)} + deployment := kapps.Deployment{} + if err := r.Get(ctx, req, &deployment); err != nil { + if kerrors.IsNotFound(err) { + return nil, nil + } + return nil, err + } + return &deployment, nil +} + +func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *apiv1alpha1.RealtimeAPI, deployment *kapps.Deployment) error { + apiStatus := status.Pending + api.Status.Status = apiStatus // FIXME: handle other status + + endpoint, err := r.getEndpoint(ctx, api) + if err != nil { + return errors.Wrap(err, "failed to get api endpoint") + } + + api.Status.Endpoint = endpoint + if deployment != nil { + api.Status.DesiredReplicas = *deployment.Spec.Replicas + api.Status.CurrentReplicas = deployment.Status.Replicas + api.Status.ReadyReplicas = deployment.Status.ReadyReplicas + } + + if err = r.Status().Update(ctx, api); err != nil { + return err + } + + return nil +} + +func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { + deployment := kapps.Deployment{ + ObjectMeta: kmeta.ObjectMeta{ + Name: workloads.K8sName(api.Name), + Namespace: api.Namespace}, + } + op, err := controllerutil.CreateOrUpdate(ctx, r, &deployment, func() error { + deployment.Spec = r.desiredDeployment(api).Spec + return nil + }) + if err != nil { + return op, err + } + return op, nil +} + +func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { + service := kcore.Service{ + ObjectMeta: kmeta.ObjectMeta{ + Name: workloads.K8sName(api.Name), + Namespace: 
api.Namespace}, + } + op, err := controllerutil.CreateOrUpdate(ctx, r, &service, func() error { + service.Spec = r.desiredService(api).Spec + return nil + }) + if err != nil { + return op, err + } + return op, nil +} + +func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { + vs := istionetworking.VirtualService{ + ObjectMeta: kmeta.ObjectMeta{ + Name: workloads.K8sName(api.Name), + Namespace: api.Namespace}, + } + op, err := controllerutil.CreateOrUpdate(ctx, r, &vs, func() error { + vs.Spec = r.desiredVirtualService(api).Spec + return nil + }) + if err != nil { + return op, err + } + return op, nil +} + +func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *apiv1alpha1.RealtimeAPI) (string, error) { + req := client.ObjectKey{Namespace: consts.IstioNamespace, Name: "ingressgateway-apis"} + svc := kcore.Service{} + if err := r.Get(ctx, req, &svc); err != nil { + return "", err + } + + ingress := svc.Status.LoadBalancer.Ingress + if ingress == nil || len(ingress) == 0 { + return "", nil + } + + endpoint := fmt.Sprintf("http://%s/%s", + svc.Status.LoadBalancer.Ingress[0].Hostname, api.Spec.Networking.Endpoint, + ) + + return endpoint, nil +} + +func (r *RealtimeAPIReconciler) desiredDeployment(api apiv1alpha1.RealtimeAPI) kapps.Deployment { + panic("implement me!") +} + +func (r *RealtimeAPIReconciler) desiredService(api apiv1alpha1.RealtimeAPI) kcore.Service { + panic("implement me!") +} + +func (r *RealtimeAPIReconciler) desiredVirtualService(api apiv1alpha1.RealtimeAPI) istionetworking.VirtualService { + panic("implement me!") +} diff --git a/pkg/types/status/code.go b/pkg/types/status/code.go index 41a8a13d91..3845a913c3 100644 --- a/pkg/types/status/code.go +++ b/pkg/types/status/code.go @@ -20,6 +20,7 @@ type Code int const ( Unknown Code = iota + Pending Stalled Error ErrorImagePull @@ -30,6 +31,7 @@ const ( var _codes = []string{ "status_unknown", 
+ "status_pending", "status_stalled", "status_error", "status_error_image_pull", @@ -42,6 +44,7 @@ var _ = [1]int{}[int(Updating)-(len(_codes)-1)] // Ensure list length matches var _codeMessages = []string{ "unknown", // Unknown + "pending", // Pending "compute unavailable", // Stalled "error", // Error "error (image pull)", // Live From c5ecd0835d68dc84aec6503354994c9013ba0061 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Tue, 20 Jul 2021 12:41:13 +0200 Subject: [PATCH 04/42] Add desired resources methods for realtime api controller --- .../apis/api/v1alpha1/realtimeapi_types.go | 12 +- .../controllers/api/realtimeapi_controller.go | 136 +----- .../api/realtimeapi_controller_helpers.go | 419 ++++++++++++++++++ pkg/workloads/helpers.go | 9 +- pkg/workloads/k8s.go | 13 +- 5 files changed, 449 insertions(+), 140 deletions(-) create mode 100644 pkg/crds/controllers/api/realtimeapi_controller_helpers.go diff --git a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go index 4ff5bbe074..20d1e2efda 100644 --- a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go @@ -53,18 +53,18 @@ type PodSpec struct { // +kubebuilder:validation:Required // +kubebuilder:default=8080 // Port to which requests will be sent to - Port int `json:"port"` + Port int32 `json:"port"` // +kubebuilder:validation:Required // +kubebuilder:default=1 // Maximum number of requests that will be concurrently sent into the container - MaxConcurrency int `json:"max_concurrency"` + MaxConcurrency int32 `json:"max_concurrency"` // +kubebuilder:validation:Required // +kubebuilder:default=100 // Maximum number of requests per replica which will be queued // (beyond max_concurrency) before requests are rejected with error code 503 - MaxQueueLength int `json:"max_queue_length"` + MaxQueueLength int32 `json:"max_queue_length"` // +kubebuilder:validation:Required // +kubebuilder:default=1 @@ -119,7 +119,11 @@ 
type ComputeSpec struct { // +kubebuilder:validation:Optional // GPU request for the container; one unit of GPU corresponds to one virtual GPU - GPU int `json:"gpu,omitempty"` + GPU int64 `json:"gpu,omitempty"` + + // +kubebuilder:validation:Optional + // Inferentia request for the container; one unit of Inf corresponds to one virtual Inf chip + Inf int64 `json:"inf,omitempty"` // +kubebuilder:validation:Optional // Memory request for the container; diff --git a/pkg/crds/controllers/api/realtimeapi_controller.go b/pkg/crds/controllers/api/realtimeapi_controller.go index 3b3055fa4c..0e55e9a99d 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller.go +++ b/pkg/crds/controllers/api/realtimeapi_controller.go @@ -20,30 +20,28 @@ import ( "context" "fmt" - "github.com/cortexlabs/cortex/pkg/consts" + apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" "github.com/cortexlabs/cortex/pkg/crds/controllers" - "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/types/status" - "github.com/cortexlabs/cortex/pkg/workloads" + "github.com/cortexlabs/cortex/pkg/types/clusterconfig" "github.com/go-logr/logr" - istionetworking "istio.io/client-go/pkg/apis/networking/v1beta1" + istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" + kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - - apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" ) +const _terminationGracePeriodSeconds int64 = 60 // seconds + // RealtimeAPIReconciler reconciles a RealtimeAPI object type RealtimeAPIReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme + ClusterConfig *clusterconfig.Config + Log logr.Logger + Scheme *runtime.Scheme 
} // +kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis,verbs=get;list;watch;create;update;patch;delete @@ -115,120 +113,6 @@ func (r *RealtimeAPIReconciler) SetupWithManager(mgr ctrl.Manager) error { For(&apiv1alpha1.RealtimeAPI{}). Owns(&kapps.Deployment{}). Owns(&kcore.Service{}). - Owns(&istionetworking.VirtualService{}). + Owns(&istioclientnetworking.VirtualService{}). Complete(r) } - -func (r *RealtimeAPIReconciler) getDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (*kapps.Deployment, error) { - req := client.ObjectKey{Namespace: api.Namespace, Name: workloads.K8sName(api.Name)} - deployment := kapps.Deployment{} - if err := r.Get(ctx, req, &deployment); err != nil { - if kerrors.IsNotFound(err) { - return nil, nil - } - return nil, err - } - return &deployment, nil -} - -func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *apiv1alpha1.RealtimeAPI, deployment *kapps.Deployment) error { - apiStatus := status.Pending - api.Status.Status = apiStatus // FIXME: handle other status - - endpoint, err := r.getEndpoint(ctx, api) - if err != nil { - return errors.Wrap(err, "failed to get api endpoint") - } - - api.Status.Endpoint = endpoint - if deployment != nil { - api.Status.DesiredReplicas = *deployment.Spec.Replicas - api.Status.CurrentReplicas = deployment.Status.Replicas - api.Status.ReadyReplicas = deployment.Status.ReadyReplicas - } - - if err = r.Status().Update(ctx, api); err != nil { - return err - } - - return nil -} - -func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { - deployment := kapps.Deployment{ - ObjectMeta: kmeta.ObjectMeta{ - Name: workloads.K8sName(api.Name), - Namespace: api.Namespace}, - } - op, err := controllerutil.CreateOrUpdate(ctx, r, &deployment, func() error { - deployment.Spec = r.desiredDeployment(api).Spec - return nil - }) - if err != nil { - return op, err - } - return op, nil -} - -func (r 
*RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { - service := kcore.Service{ - ObjectMeta: kmeta.ObjectMeta{ - Name: workloads.K8sName(api.Name), - Namespace: api.Namespace}, - } - op, err := controllerutil.CreateOrUpdate(ctx, r, &service, func() error { - service.Spec = r.desiredService(api).Spec - return nil - }) - if err != nil { - return op, err - } - return op, nil -} - -func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { - vs := istionetworking.VirtualService{ - ObjectMeta: kmeta.ObjectMeta{ - Name: workloads.K8sName(api.Name), - Namespace: api.Namespace}, - } - op, err := controllerutil.CreateOrUpdate(ctx, r, &vs, func() error { - vs.Spec = r.desiredVirtualService(api).Spec - return nil - }) - if err != nil { - return op, err - } - return op, nil -} - -func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *apiv1alpha1.RealtimeAPI) (string, error) { - req := client.ObjectKey{Namespace: consts.IstioNamespace, Name: "ingressgateway-apis"} - svc := kcore.Service{} - if err := r.Get(ctx, req, &svc); err != nil { - return "", err - } - - ingress := svc.Status.LoadBalancer.Ingress - if ingress == nil || len(ingress) == 0 { - return "", nil - } - - endpoint := fmt.Sprintf("http://%s/%s", - svc.Status.LoadBalancer.Ingress[0].Hostname, api.Spec.Networking.Endpoint, - ) - - return endpoint, nil -} - -func (r *RealtimeAPIReconciler) desiredDeployment(api apiv1alpha1.RealtimeAPI) kapps.Deployment { - panic("implement me!") -} - -func (r *RealtimeAPIReconciler) desiredService(api apiv1alpha1.RealtimeAPI) kcore.Service { - panic("implement me!") -} - -func (r *RealtimeAPIReconciler) desiredVirtualService(api apiv1alpha1.RealtimeAPI) istionetworking.VirtualService { - panic("implement me!") -} diff --git a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go 
b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go new file mode 100644 index 0000000000..3f3c658ad1 --- /dev/null +++ b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go @@ -0,0 +1,419 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package api + +import ( + "context" + "fmt" + + "github.com/cortexlabs/cortex/pkg/consts" + apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/lib/pointer" + "github.com/cortexlabs/cortex/pkg/lib/strings" + "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/cortexlabs/cortex/pkg/workloads" + istionetworking "istio.io/api/networking/v1beta1" + istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" + kapps "k8s.io/api/apps/v1" + kcore "k8s.io/api/core/v1" + kerrors "k8s.io/apimachinery/pkg/api/errors" + kresource "k8s.io/apimachinery/pkg/api/resource" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +func (r *RealtimeAPIReconciler) getDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (*kapps.Deployment, error) { + req := client.ObjectKey{Namespace: api.Namespace, Name: workloads.K8sName(api.Name)} + 
deployment := kapps.Deployment{} + if err := r.Get(ctx, req, &deployment); err != nil { + if kerrors.IsNotFound(err) { + return nil, nil + } + return nil, err + } + return &deployment, nil +} + +func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *apiv1alpha1.RealtimeAPI, deployment *kapps.Deployment) error { + apiStatus := status.Pending + api.Status.Status = apiStatus // FIXME: handle other status + + endpoint, err := r.getEndpoint(ctx, api) + if err != nil { + return errors.Wrap(err, "failed to get api endpoint") + } + + api.Status.Endpoint = endpoint + if deployment != nil { + api.Status.DesiredReplicas = *deployment.Spec.Replicas + api.Status.CurrentReplicas = deployment.Status.Replicas + api.Status.ReadyReplicas = deployment.Status.ReadyReplicas + } + + if err = r.Status().Update(ctx, api); err != nil { + return err + } + + return nil +} + +func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { + deployment := kapps.Deployment{ + ObjectMeta: kmeta.ObjectMeta{ + Name: workloads.K8sName(api.Name), + Namespace: api.Namespace}, + } + op, err := controllerutil.CreateOrUpdate(ctx, r, &deployment, func() error { + deployment.Spec = r.desiredDeployment(api).Spec + return nil + }) + if err != nil { + return op, err + } + return op, nil +} + +func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { + service := kcore.Service{ + ObjectMeta: kmeta.ObjectMeta{ + Name: workloads.K8sName(api.Name), + Namespace: api.Namespace}, + } + op, err := controllerutil.CreateOrUpdate(ctx, r, &service, func() error { + service.Spec = r.desiredService(api).Spec + return nil + }) + if err != nil { + return op, err + } + return op, nil +} + +func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { 
+ vs := istioclientnetworking.VirtualService{ + ObjectMeta: kmeta.ObjectMeta{ + Name: workloads.K8sName(api.Name), + Namespace: api.Namespace}, + } + op, err := controllerutil.CreateOrUpdate(ctx, r, &vs, func() error { + vs.Spec = r.desiredVirtualService(api).Spec + return nil + }) + if err != nil { + return op, err + } + return op, nil +} + +func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *apiv1alpha1.RealtimeAPI) (string, error) { + req := client.ObjectKey{Namespace: consts.IstioNamespace, Name: "ingressgateway-apis"} + svc := kcore.Service{} + if err := r.Get(ctx, req, &svc); err != nil { + return "", err + } + + ingress := svc.Status.LoadBalancer.Ingress + if ingress == nil || len(ingress) == 0 { + return "", nil + } + + endpoint := fmt.Sprintf("http://%s/%s", + svc.Status.LoadBalancer.Ingress[0].Hostname, api.Spec.Networking.Endpoint, + ) + + return endpoint, nil +} + +func (r *RealtimeAPIReconciler) desiredDeployment(api apiv1alpha1.RealtimeAPI) kapps.Deployment { + containers, volumes := r.desiredContainers(api) + + return *k8s.Deployment(&k8s.DeploymentSpec{ + Name: workloads.K8sName(api.Name), + Replicas: api.Spec.Pod.Replicas, + MaxSurge: pointer.String(api.Spec.UpdateStrategy.MaxSurge.String()), + MaxUnavailable: pointer.String(api.Spec.UpdateStrategy.MaxUnavailable.String()), + Labels: map[string]string{ + "apiName": api.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + "apiID": api.Annotations["cortex.dev/api-id"], // TODO: check if can be replaced with resource version + "deploymentID": api.Annotations["cortex.dev/deployment-id"], // FIXME: needs to be created beforehand + "cortex.dev/api": "true", + }, + Annotations: getAPIAnnotations(api), + Selector: map[string]string{ + "apiName": api.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + }, + PodSpec: k8s.PodSpec{ + Labels: map[string]string{ + "apiName": api.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + "deploymentID": 
api.Annotations["cortex.dev/deployment-id"], + "cortex.dev/api": "true", + }, + Annotations: map[string]string{ + "traffic.sidecar.istio.io/excludeOutboundIPRanges": "0.0.0.0/0", + }, + K8sPodSpec: kcore.PodSpec{ + RestartPolicy: kcore.RestartPolicyAlways, + TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), + Containers: containers, + NodeSelector: workloads.NodeSelectors(), + Tolerations: workloads.GenerateResourceTolerations(), + Affinity: workloads.GenerateNodeAffinities(api.Spec.NodeGroups), + Volumes: volumes, + ServiceAccountName: workloads.ServiceAccountName, + }, + }, + }) +} + +func (r *RealtimeAPIReconciler) desiredContainers(api apiv1alpha1.RealtimeAPI) ([]kcore.Container, []kcore.Volume) { + containers, volumes := r.userContainers(api) + proxyContainer, proxyVolume := r.proxyContainer(api) + + containers = append(containers, proxyContainer) + volumes = append(volumes, proxyVolume) + + return containers, volumes +} + +func (r *RealtimeAPIReconciler) desiredService(api apiv1alpha1.RealtimeAPI) kcore.Service { + return *k8s.Service(&k8s.ServiceSpec{ + Name: workloads.K8sName(api.Name), + PortName: "http", + Port: consts.ProxyPortInt32, + TargetPort: consts.ProxyPortInt32, + Annotations: getAPIAnnotations(api), + Labels: map[string]string{ + "apiName": api.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + "cortex.dev/api": "true", + }, + Selector: map[string]string{ + "apiName": api.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + }, + }) +} + +func (r *RealtimeAPIReconciler) desiredVirtualService(api apiv1alpha1.RealtimeAPI) istioclientnetworking.VirtualService { + var activatorWeight int32 + if api.Spec.Pod.Replicas == 0 { + activatorWeight = 100 + } + + return *k8s.VirtualService(&k8s.VirtualServiceSpec{ + Name: workloads.K8sName(api.Name), + Gateways: []string{"apis-gateway"}, + Destinations: []k8s.Destination{ + { + ServiceName: workloads.K8sName(api.Name), + Weight: 100 - activatorWeight, + Port: 
uint32(consts.ProxyPortInt32), + Headers: &istionetworking.Headers{ + Response: &istionetworking.Headers_HeaderOperations{ + Set: map[string]string{ + consts.CortexOriginHeader: "api", + }, + }, + }, + }, + { + ServiceName: consts.ActivatorName, + Weight: activatorWeight, + Port: uint32(consts.ActivatorPortInt32), + Headers: &istionetworking.Headers{ + Request: &istionetworking.Headers_HeaderOperations{ + Set: map[string]string{ + consts.CortexAPINameHeader: api.Name, + consts.CortexTargetServiceHeader: fmt.Sprintf( + "http://%s.%s:%d", + workloads.K8sName(api.Name), + consts.DefaultNamespace, + consts.ProxyPortInt32, + ), + }, + }, + Response: &istionetworking.Headers_HeaderOperations{ + Set: map[string]string{ + consts.CortexOriginHeader: consts.ActivatorName, + }, + }, + }, + }, + }, + PrefixPath: pointer.String(api.Spec.Networking.Endpoint), + Rewrite: pointer.String("/"), + Annotations: getAPIAnnotations(api), + Labels: map[string]string{ + "apiName": api.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + "apiID": api.Annotations["cortex.dev/api-id"], + "deploymentID": api.Annotations["cortex.dev/deployment-id"], + "cortex.dev/api": "true", + }, + }) +} + +func (r *RealtimeAPIReconciler) userContainers(api apiv1alpha1.RealtimeAPI) ([]kcore.Container, []kcore.Volume) { + volumes := []kcore.Volume{ + workloads.MntVolume(), + workloads.CortexVolume(), + workloads.ClientConfigVolume(), + } + containerMounts := []kcore.VolumeMount{ + workloads.MntMount(), + workloads.CortexMount(), + workloads.ClientConfigMount(), + } + + var containers []kcore.Container + for _, container := range api.Spec.Pod.Containers { + containerResourceList := kcore.ResourceList{} + containerResourceLimitsList := kcore.ResourceList{} + securityContext := kcore.SecurityContext{ + Privileged: pointer.Bool(true), + } + + if container.Compute.CPU != nil { + containerResourceList[kcore.ResourceCPU] = *k8s.QuantityPtr(container.Compute.CPU.DeepCopy()) + } + + if container.Compute.Mem != nil 
{ + containerResourceList[kcore.ResourceMemory] = *k8s.QuantityPtr(container.Compute.Mem.DeepCopy()) + } + + if container.Compute.GPU > 0 { + containerResourceList["nvidia.com/gpu"] = *kresource.NewQuantity(container.Compute.GPU, kresource.DecimalSI) + containerResourceLimitsList["nvidia.com/gpu"] = *kresource.NewQuantity(container.Compute.GPU, kresource.DecimalSI) + } + + if container.Compute.Inf > 0 { + totalHugePages := container.Compute.Inf * workloads.HugePagesMemPerInf + containerResourceList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) + containerResourceList["hugepages-2Mi"] = *kresource.NewQuantity(totalHugePages, kresource.BinarySI) + containerResourceLimitsList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) + containerResourceLimitsList["hugepages-2Mi"] = *kresource.NewQuantity(totalHugePages, kresource.BinarySI) + + securityContext.Capabilities = &kcore.Capabilities{ + Add: []kcore.Capability{ + "SYS_ADMIN", + "IPC_LOCK", + }, + } + } + + if container.Compute.Shm != nil { + volumes = append(volumes, workloads.ShmVolume(*container.Compute.Shm, "dshm-"+container.Name)) + containerMounts = append(containerMounts, workloads.ShmMount("dshm-"+container.Name)) + } + + containerEnvVars := workloads.BaseEnvVars + containerEnvVars = append(containerEnvVars, workloads.ClientConfigEnvVar()) + containerEnvVars = append(containerEnvVars, container.Env...) 
+ + containers = append(containers, kcore.Container{ + Name: container.Name, + Image: container.Image, + Command: container.Command, + Args: container.Args, + Env: containerEnvVars, + VolumeMounts: containerMounts, + LivenessProbe: container.LivenessProbe, + ReadinessProbe: container.ReadinessProbe, + Resources: kcore.ResourceRequirements{ + Requests: containerResourceList, + Limits: containerResourceLimitsList, + }, + ImagePullPolicy: kcore.PullAlways, + SecurityContext: &securityContext, + }) + } + + return containers, volumes +} + +func (r *RealtimeAPIReconciler) proxyContainer(api apiv1alpha1.RealtimeAPI) (kcore.Container, kcore.Volume) { + return kcore.Container{ + Name: workloads.ProxyContainerName, + Image: r.ClusterConfig.ImageProxy, + ImagePullPolicy: kcore.PullAlways, + Args: []string{ + "--cluster-config", + consts.DefaultInClusterConfigPath, + "--port", + consts.ProxyPortStr, + "--admin-port", + consts.AdminPortStr, + "--user-port", + strings.Int32(api.Spec.Pod.Port), + "--max-concurrency", + strings.Int32(api.Spec.Pod.MaxConcurrency), + "--max-queue-length", + strings.Int32(api.Spec.Pod.MaxQueueLength), + }, + Ports: []kcore.ContainerPort{ + {Name: consts.AdminPortName, ContainerPort: consts.AdminPortInt32}, + {ContainerPort: consts.ProxyPortInt32}, + }, + Env: workloads.BaseEnvVars, + EnvFrom: workloads.BaseClusterEnvVars(), + VolumeMounts: []kcore.VolumeMount{ + workloads.ClusterConfigMount(), + }, + Resources: kcore.ResourceRequirements{ + Requests: kcore.ResourceList{ + kcore.ResourceCPU: consts.CortexProxyCPU, + kcore.ResourceMemory: consts.CortexProxyMem, + }, + }, + ReadinessProbe: &kcore.Probe{ + Handler: kcore.Handler{ + HTTPGet: &kcore.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromInt(int(consts.AdminPortInt32)), + }, + }, + InitialDelaySeconds: 1, + TimeoutSeconds: 1, + PeriodSeconds: 10, + SuccessThreshold: 1, + FailureThreshold: 1, + }, + }, workloads.ClusterConfigVolume() +} + +func getAPIAnnotations(api apiv1alpha1.RealtimeAPI) 
map[string]string { + return map[string]string{ + userconfig.MinReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MinReplicas), + userconfig.MaxReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MaxReplicas), + userconfig.TargetInFlightAnnotationKey: strings.Int32(api.Spec.Autoscaling.TargetInFlight), + userconfig.WindowAnnotationKey: api.Spec.Autoscaling.Window.String(), + userconfig.DownscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.DownscaleStabilizationPeriod.String(), + userconfig.UpscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.UpscaleStabilizationPeriod.String(), + userconfig.MaxDownscaleFactorAnnotationKey: strings.Float64(api.Spec.Autoscaling.MaxDownscaleFactor.AsApproximateFloat64()), + userconfig.MaxUpscaleFactorAnnotationKey: strings.Float64(api.Spec.Autoscaling.MaxUpscaleFactor.AsApproximateFloat64()), + userconfig.DownscaleToleranceAnnotationKey: strings.Float64(api.Spec.Autoscaling.DownscaleTolerance.AsApproximateFloat64()), + userconfig.UpscaleToleranceAnnotationKey: strings.Float64(api.Spec.Autoscaling.UpscaleTolerance.AsApproximateFloat64()), + } +} diff --git a/pkg/workloads/helpers.go b/pkg/workloads/helpers.go index 1d0bf847ba..c14cb77d7a 100644 --- a/pkg/workloads/helpers.go +++ b/pkg/workloads/helpers.go @@ -228,7 +228,7 @@ func APIConfigMount(name string) kcore.VolumeMount { func ClientConfigMount() kcore.VolumeMount { return kcore.VolumeMount{ Name: _clientConfigDirVolume, - MountPath: path.Join(_clientConfigDir, "cli.yaml"), + MountPath: path.Join(clientConfigDir, "cli.yaml"), SubPath: "cli.yaml", } } @@ -248,3 +248,10 @@ func ShmMount(volumeName string) kcore.VolumeMount { func KubexitMount() kcore.VolumeMount { return k8s.EmptyDirVolumeMount(_kubexitGraveyardName, _kubexitGraveyardMountPath) } + +func ClientConfigEnvVar() kcore.EnvVar { + return kcore.EnvVar{ + Name: "CORTEX_CLI_CONFIG_DIR", + Value: clientConfigDir, + } +} diff --git a/pkg/workloads/k8s.go b/pkg/workloads/k8s.go index 
0523c8cbfe..d4f3aec746 100644 --- a/pkg/workloads/k8s.go +++ b/pkg/workloads/k8s.go @@ -41,7 +41,7 @@ const ( const ( _cortexDirVolumeName = "cortex" _cortexDirMountPath = "/cortex" - _clientConfigDir = "/cortex/client" + clientConfigDir = "/cortex/client" _emptyDirVolumeName = "mnt" _emptyDirMountPath = "/mnt" @@ -70,7 +70,7 @@ var ( _statsdAddress = fmt.Sprintf("prometheus-statsd-exporter.%s:9125", consts.PrometheusNamespace) // each Inferentia chip requires 128 HugePages with each HugePage having a size of 2Mi - _hugePagesMemPerInf = int64(128 * 2 * 1024 * 1024) // bytes + HugePagesMemPerInf = int64(128 * 2 * 1024 * 1024) // bytes ) func AsyncGatewayContainer(api spec.API, queueURL string, volumeMounts []kcore.VolumeMount) kcore.Container { @@ -393,7 +393,7 @@ func userPodContainers(api spec.API) ([]kcore.Container, []kcore.Volume) { } if container.Compute.Inf > 0 { - totalHugePages := container.Compute.Inf * _hugePagesMemPerInf + totalHugePages := container.Compute.Inf * HugePagesMemPerInf containerResourceList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) containerResourceList["hugepages-2Mi"] = *kresource.NewQuantity(totalHugePages, kresource.BinarySI) containerResourceLimitsList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) @@ -413,12 +413,7 @@ func userPodContainers(api spec.API) ([]kcore.Container, []kcore.Volume) { } containerEnvVars := BaseEnvVars - - containerEnvVars = append(containerEnvVars, kcore.EnvVar{ - Name: "CORTEX_CLI_CONFIG_DIR", - Value: _clientConfigDir, - }) - + containerEnvVars = append(containerEnvVars, ClientConfigEnvVar()) if api.Kind != userconfig.TaskAPIKind { containerEnvVars = append(containerEnvVars, kcore.EnvVar{ Name: "CORTEX_PORT", From 3334a92dd603279e864a2e5f2e8c3e13125ef251 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Tue, 20 Jul 2021 15:01:55 +0200 Subject: [PATCH 05/42] Fix CRD types --- 
.../apis/api/v1alpha1/realtimeapi_types.go | 9 +- .../api/v1alpha1/zz_generated.deepcopy.go | 165 +++++++++++++++++- .../bases/api.cortex.dev_realtimeapis.yaml | 37 ++-- pkg/crds/config/rbac/role.yaml | 33 ++++ 4 files changed, 228 insertions(+), 16 deletions(-) diff --git a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go index 20d1e2efda..c569eb0e0e 100644 --- a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go @@ -17,11 +17,10 @@ limitations under the License. package v1alpha1 import ( - "time" - "github.com/cortexlabs/cortex/pkg/types/status" kcore "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" ) @@ -155,17 +154,17 @@ type AutoscalingSpec struct { // +kubebuilder:validation:Optional // +kubebuilder:default="60s" // Duration over which to average the API's in-flight requests per replica - Window time.Duration `json:"window,omitempty"` + Window kmeta.Duration `json:"window,omitempty"` // +kubebuilder:validation:Optional // +kubebuilder:default="5m" // The API will not scale below the highest recommendation made during this period - DownscaleStabilizationPeriod time.Duration `json:"downscale_stabilization_period,omitempty"` + DownscaleStabilizationPeriod kmeta.Duration `json:"downscale_stabilization_period,omitempty"` // +kubebuilder:validation:Optional // +kubebuilder:default="1m" // The API will not scale above the lowest recommendation made during this period - UpscaleStabilizationPeriod time.Duration `json:"upscale_stabilization_period,omitempty"` + UpscaleStabilizationPeriod kmeta.Duration `json:"upscale_stabilization_period,omitempty"` // +kubebuilder:validation:Optional // +kubebuilder:default="750m" diff --git a/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go b/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go 
index 852f080173..a52d87d385 100644 --- a/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go @@ -21,15 +21,152 @@ limitations under the License. package v1alpha1 import ( + "k8s.io/api/core/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoscalingSpec) DeepCopyInto(out *AutoscalingSpec) { + *out = *in + out.Window = in.Window + out.DownscaleStabilizationPeriod = in.DownscaleStabilizationPeriod + out.UpscaleStabilizationPeriod = in.UpscaleStabilizationPeriod + out.MaxDownscaleFactor = in.MaxDownscaleFactor.DeepCopy() + out.MaxUpscaleFactor = in.MaxUpscaleFactor.DeepCopy() + out.DownscaleTolerance = in.DownscaleTolerance.DeepCopy() + out.UpscaleTolerance = in.UpscaleTolerance.DeepCopy() +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoscalingSpec. +func (in *AutoscalingSpec) DeepCopy() *AutoscalingSpec { + if in == nil { + return nil + } + out := new(AutoscalingSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ComputeSpec) DeepCopyInto(out *ComputeSpec) { + *out = *in + if in.CPU != nil { + in, out := &in.CPU, &out.CPU + x := (*in).DeepCopy() + *out = &x + } + if in.Mem != nil { + in, out := &in.Mem, &out.Mem + x := (*in).DeepCopy() + *out = &x + } + if in.Shm != nil { + in, out := &in.Shm, &out.Shm + x := (*in).DeepCopy() + *out = &x + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComputeSpec. +func (in *ComputeSpec) DeepCopy() *ComputeSpec { + if in == nil { + return nil + } + out := new(ComputeSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ContainerSpec) DeepCopyInto(out *ContainerSpec) { + *out = *in + if in.Command != nil { + in, out := &in.Command, &out.Command + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]v1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Compute != nil { + in, out := &in.Compute, &out.Compute + *out = new(ComputeSpec) + (*in).DeepCopyInto(*out) + } + if in.ReadinessProbe != nil { + in, out := &in.ReadinessProbe, &out.ReadinessProbe + *out = new(v1.Probe) + (*in).DeepCopyInto(*out) + } + if in.LivenessProbe != nil { + in, out := &in.LivenessProbe, &out.LivenessProbe + *out = new(v1.Probe) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerSpec. +func (in *ContainerSpec) DeepCopy() *ContainerSpec { + if in == nil { + return nil + } + out := new(ContainerSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NetworkingSpec) DeepCopyInto(out *NetworkingSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NetworkingSpec. +func (in *NetworkingSpec) DeepCopy() *NetworkingSpec { + if in == nil { + return nil + } + out := new(NetworkingSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PodSpec) DeepCopyInto(out *PodSpec) { + *out = *in + if in.Containers != nil { + in, out := &in.Containers, &out.Containers + *out = make([]ContainerSpec, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodSpec. +func (in *PodSpec) DeepCopy() *PodSpec { + if in == nil { + return nil + } + out := new(PodSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RealtimeAPI) DeepCopyInto(out *RealtimeAPI) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec + in.Spec.DeepCopyInto(&out.Spec) out.Status = in.Status } @@ -86,6 +223,15 @@ func (in *RealtimeAPIList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RealtimeAPISpec) DeepCopyInto(out *RealtimeAPISpec) { *out = *in + in.Pod.DeepCopyInto(&out.Pod) + in.Autoscaling.DeepCopyInto(&out.Autoscaling) + if in.NodeGroups != nil { + in, out := &in.NodeGroups, &out.NodeGroups + *out = make([]string, len(*in)) + copy(*out, *in) + } + out.UpdateStrategy = in.UpdateStrategy + out.Networking = in.Networking } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RealtimeAPISpec. @@ -112,3 +258,20 @@ func (in *RealtimeAPIStatus) DeepCopy() *RealtimeAPIStatus { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *UpdateStratagySpec) DeepCopyInto(out *UpdateStratagySpec) { + *out = *in + out.MaxSurge = in.MaxSurge + out.MaxUnavailable = in.MaxUnavailable +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UpdateStratagySpec. 
+func (in *UpdateStratagySpec) DeepCopy() *UpdateStratagySpec { + if in == nil { + return nil + } + out := new(UpdateStratagySpec) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml index 5fc4fbd7c8..1b4a092ae3 100644 --- a/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml @@ -43,8 +43,7 @@ spec: default: 5m description: The API will not scale below the highest recommendation made during this period - format: int64 - type: integer + type: string downscale_tolerance: anyOf: - type: integer @@ -55,10 +54,6 @@ spec: event pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - init_replicas: - default: 1 - description: Initial number of replicas - type: integer max_downscale_factor: anyOf: - type: integer @@ -71,6 +66,7 @@ spec: max_replicas: default: 100 description: Maximum number of replicas + format: int32 type: integer max_upscale_factor: anyOf: @@ -84,18 +80,19 @@ spec: min_replicas: default: 1 description: Minimum number of replicas + format: int32 type: integer target_in_flight: description: Desired number of in-flight requests per replica (including requests actively being processed as well as queued), which the autoscaler tries to maintain + format: int32 type: integer upscale_stabilization_period: default: 1m description: The API will not scale above the lowest recommendation made during this period - format: int64 - type: integer + type: string upscale_tolerance: anyOf: - type: integer @@ -109,8 +106,7 @@ spec: default: 60s description: Duration over which to average the API's in-flight requests per replica - format: int64 - type: integer + type: string type: object networking: description: Networking configuration @@ -158,6 +154,12 @@ spec: gpu: description: GPU 
request for the container; one unit of GPU corresponds to one virtual GPU + format: int64 + type: integer + inf: + description: Inferentia request for the container; one + unit of Inf corresponds to one virtual Inf chip + format: int64 type: integer mem: anyOf: @@ -542,19 +544,31 @@ spec: default: 1 description: Maximum number of requests that will be concurrently sent into the container + format: int32 type: integer max_queue_length: default: 100 description: Maximum number of requests per replica which will be queued (beyond max_concurrency) before requests are rejected with error code 503 + format: int32 type: integer port: default: 8080 description: Port to which requests will be sent to + format: int32 + type: integer + replicas: + default: 1 + description: Number of desired replicas + format: int32 type: integer required: - containers + - max_concurrency + - max_queue_length + - port + - replicas type: object update_strategy: description: Deployment strategy to use when replacing existing replicas @@ -587,12 +601,15 @@ spec: description: RealtimeAPIStatus defines the observed state of RealtimeAPI properties: current_replicas: + format: int32 type: integer desired_replicas: + format: int32 type: integer endpoint: type: string ready_replicas: + format: int32 type: integer status: type: integer diff --git a/pkg/crds/config/rbac/role.yaml b/pkg/crds/config/rbac/role.yaml index f8b89211e5..c6c2b052a2 100644 --- a/pkg/crds/config/rbac/role.yaml +++ b/pkg/crds/config/rbac/role.yaml @@ -23,6 +23,17 @@ rules: - get - list - watch +- apiGroups: + - "" + resources: + - services + verbs: + - create + - get + - list + - patch + - update + - watch - apiGroups: - api.cortex.dev resources: @@ -49,6 +60,17 @@ rules: - get - patch - update +- apiGroups: + - apps + resources: + - deployments + verbs: + - create + - get + - list + - patch + - update + - watch - apiGroups: - batch resources: @@ -86,3 +108,14 @@ rules: - get - patch - update +- apiGroups: + - networking.istio.io + 
resources: + - virtualservices + verbs: + - create + - get + - list + - patch + - update + - watch From a56ece57700d9177be7c3a9bf3c01444de39fe78 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Tue, 20 Jul 2021 15:17:08 +0200 Subject: [PATCH 06/42] Add istio to scheme --- pkg/crds/controllers/api/realtimeapi_controller.go | 1 - pkg/crds/controllers/api/realtimeapi_controller_helpers.go | 6 +++--- pkg/crds/main.go | 2 ++ 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/crds/controllers/api/realtimeapi_controller.go b/pkg/crds/controllers/api/realtimeapi_controller.go index 0e55e9a99d..53e368d856 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller.go +++ b/pkg/crds/controllers/api/realtimeapi_controller.go @@ -25,7 +25,6 @@ import ( "github.com/cortexlabs/cortex/pkg/types/clusterconfig" "github.com/go-logr/logr" istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" - kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" diff --git a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go index 3f3c658ad1..1523665472 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go @@ -82,7 +82,7 @@ func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, ap Name: workloads.K8sName(api.Name), Namespace: api.Namespace}, } - op, err := controllerutil.CreateOrUpdate(ctx, r, &deployment, func() error { + op, err := controllerutil.CreateOrUpdate(ctx, r.Client, &deployment, func() error { deployment.Spec = r.desiredDeployment(api).Spec return nil }) @@ -98,7 +98,7 @@ func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api a Name: workloads.K8sName(api.Name), Namespace: api.Namespace}, } - op, err := controllerutil.CreateOrUpdate(ctx, r, &service, func() error { + op, err := 
controllerutil.CreateOrUpdate(ctx, r.Client, &service, func() error { service.Spec = r.desiredService(api).Spec return nil }) @@ -114,7 +114,7 @@ func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context Name: workloads.K8sName(api.Name), Namespace: api.Namespace}, } - op, err := controllerutil.CreateOrUpdate(ctx, r, &vs, func() error { + op, err := controllerutil.CreateOrUpdate(ctx, r.Client, &vs, func() error { vs.Spec = r.desiredVirtualService(api).Spec return nil }) diff --git a/pkg/crds/main.go b/pkg/crds/main.go index 90502d08bc..aba1346cac 100644 --- a/pkg/crds/main.go +++ b/pkg/crds/main.go @@ -34,6 +34,7 @@ import ( // to ensure that exec-entrypoint and run can make use of them. _ "k8s.io/client-go/plugin/pkg/client/auth" + istioscheme "istio.io/client-go/pkg/clientset/versioned/scheme" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -55,6 +56,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(istioscheme.AddToScheme(scheme)) utilruntime.Must(batch.AddToScheme(scheme)) utilruntime.Must(apiv1alpha1.AddToScheme(scheme)) From 32b6be2c62c89966b1f124fab520c333bdffe5f9 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Tue, 20 Jul 2021 19:43:21 +0200 Subject: [PATCH 07/42] Fix RealtimeAPI CRD defaulting behaviour --- .../apis/api/v1alpha1/realtimeapi_types.go | 10 ++-- .../bases/api.cortex.dev_realtimeapis.yaml | 8 ++- .../samples/api_v1alpha1_realtimeapi.yaml | 14 +++-- .../api/realtimeapi_controller_helpers.go | 54 ++++++++++--------- pkg/crds/main.go | 9 ++-- pkg/types/status/code.go | 1 + 6 files changed, 57 insertions(+), 39 deletions(-) diff --git a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go index c569eb0e0e..997f9b87f0 100644 --- a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go @@ 
-32,18 +32,20 @@ type RealtimeAPISpec struct { Pod PodSpec `json:"pod"` // +kubebuilder:validation:Optional + // +kubebuilder:default={"min_replicas": 1} // Autoscaling configuration Autoscaling AutoscalingSpec `json:"autoscaling"` // +kubebuilder:validation:Optional // List of node groups on which this API can run (default: all node groups are eligible) - NodeGroups []string `json:"node_groups,omitempty"` + NodeGroups []string `json:"node_groups"` // +kubebuilder:validation:Optional + // +kubebuilder:default={"max_surge": "25%", "max_unavailable": "25%"} // Deployment strategy to use when replacing existing replicas with new ones UpdateStrategy UpdateStratagySpec `json:"update_strategy"` - // +kubebuilder:validation:Optional + // +kubebuilder:validation:Required // Networking configuration Networking NetworkingSpec `json:"networking"` } @@ -96,7 +98,6 @@ type ContainerSpec struct { // Environment variables to set in the container Env []kcore.EnvVar `json:"env,omitempty"` - // +kubebuilder:validation:Optional // Compute resource requests Compute *ComputeSpec `json:"compute,omitempty"` @@ -136,12 +137,10 @@ type ComputeSpec struct { } type AutoscalingSpec struct { - // +kubebuilder:validation:Optional // +kubebuilder:default=1 // Minimum number of replicas MinReplicas int32 `json:"min_replicas,omitempty"` - // +kubebuilder:validation:Optional // +kubebuilder:default=100 // Maximum number of replicas MaxReplicas int32 `json:"max_replicas,omitempty"` @@ -211,6 +210,7 @@ type NetworkingSpec struct { // RealtimeAPIStatus defines the observed state of RealtimeAPI type RealtimeAPIStatus struct { + // +kubebuilder:validation:Type=string Status status.Code `json:"status"` DesiredReplicas int32 `json:"desired_replicas"` CurrentReplicas int32 `json:"current_replicas"` diff --git a/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml index 1b4a092ae3..f68726183b 100644 --- 
a/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml @@ -37,6 +37,8 @@ spec: description: RealtimeAPISpec defines the desired state of RealtimeAPI properties: autoscaling: + default: + min_replicas: 1 description: Autoscaling configuration properties: downscale_stabilization_period: @@ -571,6 +573,9 @@ spec: - replicas type: object update_strategy: + default: + max_surge: 25% + max_unavailable: 25% description: Deployment strategy to use when replacing existing replicas with new ones properties: @@ -595,6 +600,7 @@ spec: x-kubernetes-int-or-string: true type: object required: + - networking - pod type: object status: @@ -612,7 +618,7 @@ spec: format: int32 type: integer status: - type: integer + type: string required: - current_replicas - desired_replicas diff --git a/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml b/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml index 638b3039f3..c3295fc4f4 100644 --- a/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml +++ b/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml @@ -1,7 +1,15 @@ apiVersion: api.cortex.dev/v1alpha1 kind: RealtimeAPI metadata: - name: realtimeapi-sample + name: hello-world spec: - # Add fields here - foo: bar + pod: + containers: + - name: api + image: quay.io/cortexlabs-test/realtime-hello-world-cpu:latest + max_concurrency: 1 + max_queue_length: 100 + port: 8080 + replicas: 1 + networking: + endpoint: "/hello-world" diff --git a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go index 1523665472..b6d9b9e610 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go @@ -295,37 +295,39 @@ func (r *RealtimeAPIReconciler) userContainers(api apiv1alpha1.RealtimeAPI) ([]k Privileged: pointer.Bool(true), } - if container.Compute.CPU != nil { - 
containerResourceList[kcore.ResourceCPU] = *k8s.QuantityPtr(container.Compute.CPU.DeepCopy()) - } + if container.Compute != nil { + if container.Compute.CPU != nil { + containerResourceList[kcore.ResourceCPU] = *k8s.QuantityPtr(container.Compute.CPU.DeepCopy()) + } - if container.Compute.Mem != nil { - containerResourceList[kcore.ResourceMemory] = *k8s.QuantityPtr(container.Compute.Mem.DeepCopy()) - } + if container.Compute.Mem != nil { + containerResourceList[kcore.ResourceMemory] = *k8s.QuantityPtr(container.Compute.Mem.DeepCopy()) + } - if container.Compute.GPU > 0 { - containerResourceList["nvidia.com/gpu"] = *kresource.NewQuantity(container.Compute.GPU, kresource.DecimalSI) - containerResourceLimitsList["nvidia.com/gpu"] = *kresource.NewQuantity(container.Compute.GPU, kresource.DecimalSI) - } + if container.Compute.GPU > 0 { + containerResourceList["nvidia.com/gpu"] = *kresource.NewQuantity(container.Compute.GPU, kresource.DecimalSI) + containerResourceLimitsList["nvidia.com/gpu"] = *kresource.NewQuantity(container.Compute.GPU, kresource.DecimalSI) + } - if container.Compute.Inf > 0 { - totalHugePages := container.Compute.Inf * workloads.HugePagesMemPerInf - containerResourceList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) - containerResourceList["hugepages-2Mi"] = *kresource.NewQuantity(totalHugePages, kresource.BinarySI) - containerResourceLimitsList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) - containerResourceLimitsList["hugepages-2Mi"] = *kresource.NewQuantity(totalHugePages, kresource.BinarySI) - - securityContext.Capabilities = &kcore.Capabilities{ - Add: []kcore.Capability{ - "SYS_ADMIN", - "IPC_LOCK", - }, + if container.Compute.Inf > 0 { + totalHugePages := container.Compute.Inf * workloads.HugePagesMemPerInf + containerResourceList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) + 
containerResourceList["hugepages-2Mi"] = *kresource.NewQuantity(totalHugePages, kresource.BinarySI) + containerResourceLimitsList["aws.amazon.com/neuron"] = *kresource.NewQuantity(container.Compute.Inf, kresource.DecimalSI) + containerResourceLimitsList["hugepages-2Mi"] = *kresource.NewQuantity(totalHugePages, kresource.BinarySI) + + securityContext.Capabilities = &kcore.Capabilities{ + Add: []kcore.Capability{ + "SYS_ADMIN", + "IPC_LOCK", + }, + } } - } - if container.Compute.Shm != nil { - volumes = append(volumes, workloads.ShmVolume(*container.Compute.Shm, "dshm-"+container.Name)) - containerMounts = append(containerMounts, workloads.ShmMount("dshm-"+container.Name)) + if container.Compute.Shm != nil { + volumes = append(volumes, workloads.ShmVolume(*container.Compute.Shm, "dshm-"+container.Name)) + containerMounts = append(containerMounts, workloads.ShmMount("dshm-"+container.Name)) + } } containerEnvVars := workloads.BaseEnvVars diff --git a/pkg/crds/main.go b/pkg/crds/main.go index aba1346cac..d4400a5e88 100644 --- a/pkg/crds/main.go +++ b/pkg/crds/main.go @@ -156,7 +156,7 @@ func main() { if err = (&batchcontrollers.BatchJobReconciler{ Client: mgr.GetClient(), Config: batchcontrollers.BatchJobReconcilerConfig{}.ApplyDefaults(), - Log: ctrl.Log.WithName("controllers").WithName("BatchJob"), + Log: ctrl.Log.WithName("controllers").WithName("batch").WithName("BatchJob"), ClusterConfig: clusterConfig, AWS: awsClient, Prometheus: promv1.NewAPI(promClient), @@ -166,9 +166,10 @@ func main() { os.Exit(1) } if err = (&apicontrollers.RealtimeAPIReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("api").WithName("RealtimeAPI"), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + ClusterConfig: clusterConfig, + Log: ctrl.Log.WithName("controllers").WithName("api").WithName("RealtimeAPI"), + Scheme: mgr.GetScheme(), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", 
"RealtimeAPI") os.Exit(1) diff --git a/pkg/types/status/code.go b/pkg/types/status/code.go index 3845a913c3..17fc8ca12f 100644 --- a/pkg/types/status/code.go +++ b/pkg/types/status/code.go @@ -16,6 +16,7 @@ limitations under the License. package status +// +kubebuilder:validation:Type=string type Code int const ( From ad5cfdb857392b9a1f40da9d2157bc13789d8f7a Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 13:04:22 +0200 Subject: [PATCH 08/42] Fix createOrUpdateService method --- .../controllers/api/realtimeapi_controller_helpers.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go index b6d9b9e610..4f4bab31f3 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/api/realtimeapi_controller_helpers.go @@ -99,7 +99,13 @@ func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api a Namespace: api.Namespace}, } op, err := controllerutil.CreateOrUpdate(ctx, r.Client, &service, func() error { - service.Spec = r.desiredService(api).Spec + desiredSvc := r.desiredService(api) + // We need to set fields individually because some are immutable + service.Labels = desiredSvc.Labels + service.Annotations = desiredSvc.Annotations + service.Spec.Type = desiredSvc.Spec.Type + service.Spec.Ports = desiredSvc.Spec.Ports + service.Spec.Selector = desiredSvc.Spec.Selector return nil }) if err != nil { From 164d492f0e4357bb9d9e9d9b0fe4f2fc312cf0fb Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 13:22:17 +0200 Subject: [PATCH 09/42] Rename CRD api group to serverless --- pkg/crds/PROJECT | 4 +- .../v1alpha1/groupversion_info.go | 4 +- .../v1alpha1/realtimeapi_types.go | 0 .../v1alpha1/zz_generated.deepcopy.go | 0 ...> serverless.cortex.dev_realtimeapis.yaml} | 4 +- pkg/crds/config/crd/kustomization.yaml | 4 +- 
pkg/crds/config/rbac/role.yaml | 52 +++++++++---------- ...l => serverless_v1alpha1_realtimeapi.yaml} | 2 +- .../realtimeapi_controller.go | 14 ++--- .../realtimeapi_controller_helpers.go | 36 ++++++------- .../{api => serverless}/suite_test.go | 6 +-- pkg/crds/main.go | 8 +-- 12 files changed, 67 insertions(+), 67 deletions(-) rename pkg/crds/apis/{api => serverless}/v1alpha1/groupversion_info.go (90%) rename pkg/crds/apis/{api => serverless}/v1alpha1/realtimeapi_types.go (100%) rename pkg/crds/apis/{api => serverless}/v1alpha1/zz_generated.deepcopy.go (100%) rename pkg/crds/config/crd/bases/{api.cortex.dev_realtimeapis.yaml => serverless.cortex.dev_realtimeapis.yaml} (99%) rename pkg/crds/config/samples/{api_v1alpha1_realtimeapi.yaml => serverless_v1alpha1_realtimeapi.yaml} (87%) rename pkg/crds/controllers/{api => serverless}/realtimeapi_controller.go (87%) rename pkg/crds/controllers/{api => serverless}/realtimeapi_controller_helpers.go (90%) rename pkg/crds/controllers/{api => serverless}/suite_test.go (93%) diff --git a/pkg/crds/PROJECT b/pkg/crds/PROJECT index 01bbb7d1e9..a80d48987a 100644 --- a/pkg/crds/PROJECT +++ b/pkg/crds/PROJECT @@ -19,8 +19,8 @@ resources: namespaced: true controller: true domain: cortex.dev - group: api + group: serverless kind: RealtimeAPI - path: github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1 + path: github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1 version: v1alpha1 version: "3" diff --git a/pkg/crds/apis/api/v1alpha1/groupversion_info.go b/pkg/crds/apis/serverless/v1alpha1/groupversion_info.go similarity index 90% rename from pkg/crds/apis/api/v1alpha1/groupversion_info.go rename to pkg/crds/apis/serverless/v1alpha1/groupversion_info.go index 3625dc1527..f9193a464a 100644 --- a/pkg/crds/apis/api/v1alpha1/groupversion_info.go +++ b/pkg/crds/apis/serverless/v1alpha1/groupversion_info.go @@ -16,7 +16,7 @@ limitations under the License. 
// Package v1alpha1 contains API Schema definitions for the api v1alpha1 API group //+kubebuilder:object:generate=true -//+groupName=api.cortex.dev +//+groupName=serverless.cortex.dev package v1alpha1 import ( @@ -26,7 +26,7 @@ import ( var ( // GroupVersion is group version used to register these objects - GroupVersion = schema.GroupVersion{Group: "api.cortex.dev", Version: "v1alpha1"} + GroupVersion = schema.GroupVersion{Group: "serverless.cortex.dev", Version: "v1alpha1"} // SchemeBuilder is used to add go types to the GroupVersionKind scheme SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} diff --git a/pkg/crds/apis/api/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go similarity index 100% rename from pkg/crds/apis/api/v1alpha1/realtimeapi_types.go rename to pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go diff --git a/pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go similarity index 100% rename from pkg/crds/apis/api/v1alpha1/zz_generated.deepcopy.go rename to pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go diff --git a/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml similarity index 99% rename from pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml rename to pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index f68726183b..377b3cc658 100644 --- a/pkg/crds/config/crd/bases/api.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -6,9 +6,9 @@ metadata: annotations: controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null - name: realtimeapis.api.cortex.dev + name: realtimeapis.serverless.cortex.dev spec: - group: api.cortex.dev + group: serverless.cortex.dev names: kind: RealtimeAPI listKind: RealtimeAPIList diff --git a/pkg/crds/config/crd/kustomization.yaml 
b/pkg/crds/config/crd/kustomization.yaml index 59e4b92a53..77bbf7b21d 100644 --- a/pkg/crds/config/crd/kustomization.yaml +++ b/pkg/crds/config/crd/kustomization.yaml @@ -3,10 +3,10 @@ # It should be run by config/default resources: - bases/batch.cortex.dev_batchjobs.yaml -- bases/api.cortex.dev_realtimeapis.yaml +- bases/serverless.cortex.dev_realtimeapis.yaml #+kubebuilder:scaffold:crdkustomizeresource -patchesStrategicMerge: +#patchesStrategicMerge: # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. # patches here are for enabling the conversion webhook for each CRD #- patches/webhook_in_batchjobs.yaml diff --git a/pkg/crds/config/rbac/role.yaml b/pkg/crds/config/rbac/role.yaml index c6c2b052a2..2c9787c377 100644 --- a/pkg/crds/config/rbac/role.yaml +++ b/pkg/crds/config/rbac/role.yaml @@ -34,32 +34,6 @@ rules: - patch - update - watch -- apiGroups: - - api.cortex.dev - resources: - - realtimeapis - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - api.cortex.dev - resources: - - realtimeapis/finalizers - verbs: - - update -- apiGroups: - - api.cortex.dev - resources: - - realtimeapis/status - verbs: - - get - - patch - - update - apiGroups: - apps resources: @@ -119,3 +93,29 @@ rules: - patch - update - watch +- apiGroups: + - serverless.cortex.dev + resources: + - realtimeapis + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - serverless.cortex.dev + resources: + - realtimeapis/finalizers + verbs: + - update +- apiGroups: + - serverless.cortex.dev + resources: + - realtimeapis/status + verbs: + - get + - patch + - update diff --git a/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml b/pkg/crds/config/samples/serverless_v1alpha1_realtimeapi.yaml similarity index 87% rename from pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml rename to pkg/crds/config/samples/serverless_v1alpha1_realtimeapi.yaml index c3295fc4f4..a3e9ab2f4f 100644 
--- a/pkg/crds/config/samples/api_v1alpha1_realtimeapi.yaml +++ b/pkg/crds/config/samples/serverless_v1alpha1_realtimeapi.yaml @@ -1,4 +1,4 @@ -apiVersion: api.cortex.dev/v1alpha1 +apiVersion: serverless.cortex.dev/v1alpha1 kind: RealtimeAPI metadata: name: hello-world diff --git a/pkg/crds/controllers/api/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go similarity index 87% rename from pkg/crds/controllers/api/realtimeapi_controller.go rename to pkg/crds/controllers/serverless/realtimeapi_controller.go index 53e368d856..1c1bd1b93d 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -14,13 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -package api +package serverlesscontroller import ( "context" "fmt" - apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/crds/controllers" "github.com/cortexlabs/cortex/pkg/types/clusterconfig" "github.com/go-logr/logr" @@ -43,9 +43,9 @@ type RealtimeAPIReconciler struct { Scheme *runtime.Scheme } -// +kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=api.cortex.dev,resources=realtimeapis/finalizers,verbs=update +// +kubebuilder:rbac:groups=serverless.cortex.dev,resources=realtimeapis,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=serverless.cortex.dev,resources=realtimeapis/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=serverless.cortex.dev,resources=realtimeapis/finalizers,verbs=update // +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch // 
+kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=get;list;watch;create;update;patch // +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;patch @@ -57,7 +57,7 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) log := r.Log.WithValues("realtimeapi", req.NamespacedName) // Step 1: get resource from request - api := apiv1alpha1.RealtimeAPI{} + api := serverless.RealtimeAPI{} log.V(1).Info("retrieving resource") if err := r.Get(ctx, req.NamespacedName, &api); err != nil { if !kerrors.IsNotFound(err) { @@ -109,7 +109,7 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) // SetupWithManager sets up the controller with the Manager. func (r *RealtimeAPIReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&apiv1alpha1.RealtimeAPI{}). + For(&serverless.RealtimeAPI{}). Owns(&kapps.Deployment{}). Owns(&kcore.Service{}). Owns(&istioclientnetworking.VirtualService{}). diff --git a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go similarity index 90% rename from pkg/crds/controllers/api/realtimeapi_controller_helpers.go rename to pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 4f4bab31f3..1beb66a88b 100644 --- a/pkg/crds/controllers/api/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -14,14 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package api +package serverlesscontroller import ( "context" "fmt" "github.com/cortexlabs/cortex/pkg/consts" - apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/pointer" @@ -41,7 +41,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -func (r *RealtimeAPIReconciler) getDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (*kapps.Deployment, error) { +func (r *RealtimeAPIReconciler) getDeployment(ctx context.Context, api serverless.RealtimeAPI) (*kapps.Deployment, error) { req := client.ObjectKey{Namespace: api.Namespace, Name: workloads.K8sName(api.Name)} deployment := kapps.Deployment{} if err := r.Get(ctx, req, &deployment); err != nil { @@ -53,7 +53,7 @@ func (r *RealtimeAPIReconciler) getDeployment(ctx context.Context, api apiv1alph return &deployment, nil } -func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *apiv1alpha1.RealtimeAPI, deployment *kapps.Deployment) error { +func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *serverless.RealtimeAPI, deployment *kapps.Deployment) error { apiStatus := status.Pending api.Status.Status = apiStatus // FIXME: handle other status @@ -76,7 +76,7 @@ func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *apiv1alph return nil } -func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { +func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api serverless.RealtimeAPI) (controllerutil.OperationResult, error) { deployment := kapps.Deployment{ ObjectMeta: kmeta.ObjectMeta{ Name: workloads.K8sName(api.Name), @@ -92,7 +92,7 @@ func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, 
ap return op, nil } -func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { +func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api serverless.RealtimeAPI) (controllerutil.OperationResult, error) { service := kcore.Service{ ObjectMeta: kmeta.ObjectMeta{ Name: workloads.K8sName(api.Name), @@ -114,7 +114,7 @@ func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api a return op, nil } -func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context, api apiv1alpha1.RealtimeAPI) (controllerutil.OperationResult, error) { +func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context, api serverless.RealtimeAPI) (controllerutil.OperationResult, error) { vs := istioclientnetworking.VirtualService{ ObjectMeta: kmeta.ObjectMeta{ Name: workloads.K8sName(api.Name), @@ -130,7 +130,7 @@ func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context return op, nil } -func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *apiv1alpha1.RealtimeAPI) (string, error) { +func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *serverless.RealtimeAPI) (string, error) { req := client.ObjectKey{Namespace: consts.IstioNamespace, Name: "ingressgateway-apis"} svc := kcore.Service{} if err := r.Get(ctx, req, &svc); err != nil { @@ -149,7 +149,7 @@ func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *apiv1alpha return endpoint, nil } -func (r *RealtimeAPIReconciler) desiredDeployment(api apiv1alpha1.RealtimeAPI) kapps.Deployment { +func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) kapps.Deployment { containers, volumes := r.desiredContainers(api) return *k8s.Deployment(&k8s.DeploymentSpec{ @@ -164,7 +164,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api apiv1alpha1.RealtimeAPI) k "deploymentID": 
api.Annotations["cortex.dev/deployment-id"], // FIXME: needs to be created beforehand "cortex.dev/api": "true", }, - Annotations: getAPIAnnotations(api), + Annotations: r.getAPIAnnotations(api), Selector: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), @@ -193,7 +193,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api apiv1alpha1.RealtimeAPI) k }) } -func (r *RealtimeAPIReconciler) desiredContainers(api apiv1alpha1.RealtimeAPI) ([]kcore.Container, []kcore.Volume) { +func (r *RealtimeAPIReconciler) desiredContainers(api serverless.RealtimeAPI) ([]kcore.Container, []kcore.Volume) { containers, volumes := r.userContainers(api) proxyContainer, proxyVolume := r.proxyContainer(api) @@ -203,13 +203,13 @@ func (r *RealtimeAPIReconciler) desiredContainers(api apiv1alpha1.RealtimeAPI) ( return containers, volumes } -func (r *RealtimeAPIReconciler) desiredService(api apiv1alpha1.RealtimeAPI) kcore.Service { +func (r *RealtimeAPIReconciler) desiredService(api serverless.RealtimeAPI) kcore.Service { return *k8s.Service(&k8s.ServiceSpec{ Name: workloads.K8sName(api.Name), PortName: "http", Port: consts.ProxyPortInt32, TargetPort: consts.ProxyPortInt32, - Annotations: getAPIAnnotations(api), + Annotations: r.getAPIAnnotations(api), Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), @@ -222,7 +222,7 @@ func (r *RealtimeAPIReconciler) desiredService(api apiv1alpha1.RealtimeAPI) kcor }) } -func (r *RealtimeAPIReconciler) desiredVirtualService(api apiv1alpha1.RealtimeAPI) istioclientnetworking.VirtualService { +func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI) istioclientnetworking.VirtualService { var activatorWeight int32 if api.Spec.Pod.Replicas == 0 { activatorWeight = 100 @@ -270,7 +270,7 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api apiv1alpha1.RealtimeAP }, PrefixPath: pointer.String(api.Spec.Networking.Endpoint), Rewrite: 
pointer.String("/"), - Annotations: getAPIAnnotations(api), + Annotations: r.getAPIAnnotations(api), Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), @@ -281,7 +281,7 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api apiv1alpha1.RealtimeAP }) } -func (r *RealtimeAPIReconciler) userContainers(api apiv1alpha1.RealtimeAPI) ([]kcore.Container, []kcore.Volume) { +func (r *RealtimeAPIReconciler) userContainers(api serverless.RealtimeAPI) ([]kcore.Container, []kcore.Volume) { volumes := []kcore.Volume{ workloads.MntVolume(), workloads.CortexVolume(), @@ -361,7 +361,7 @@ func (r *RealtimeAPIReconciler) userContainers(api apiv1alpha1.RealtimeAPI) ([]k return containers, volumes } -func (r *RealtimeAPIReconciler) proxyContainer(api apiv1alpha1.RealtimeAPI) (kcore.Container, kcore.Volume) { +func (r *RealtimeAPIReconciler) proxyContainer(api serverless.RealtimeAPI) (kcore.Container, kcore.Volume) { return kcore.Container{ Name: workloads.ProxyContainerName, Image: r.ClusterConfig.ImageProxy, @@ -411,7 +411,7 @@ func (r *RealtimeAPIReconciler) proxyContainer(api apiv1alpha1.RealtimeAPI) (kco }, workloads.ClusterConfigVolume() } -func getAPIAnnotations(api apiv1alpha1.RealtimeAPI) map[string]string { +func (r *RealtimeAPIReconciler) getAPIAnnotations(api serverless.RealtimeAPI) map[string]string { return map[string]string{ userconfig.MinReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MinReplicas), userconfig.MaxReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MaxReplicas), diff --git a/pkg/crds/controllers/api/suite_test.go b/pkg/crds/controllers/serverless/suite_test.go similarity index 93% rename from pkg/crds/controllers/api/suite_test.go rename to pkg/crds/controllers/serverless/suite_test.go index 134a7234a8..5698e2887b 100644 --- a/pkg/crds/controllers/api/suite_test.go +++ b/pkg/crds/controllers/serverless/suite_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing 
permissions and limitations under the License. */ -package api +package serverlesscontroller import ( "path/filepath" @@ -30,7 +30,7 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" - apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" //+kubebuilder:scaffold:imports ) @@ -62,7 +62,7 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) Expect(cfg).NotTo(BeNil()) - err = apiv1alpha1.AddToScheme(scheme.Scheme) + err = serverless.AddToScheme(scheme.Scheme) Expect(err).NotTo(HaveOccurred()) //+kubebuilder:scaffold:scheme diff --git a/pkg/crds/main.go b/pkg/crds/main.go index d4400a5e88..01e08b1d82 100644 --- a/pkg/crds/main.go +++ b/pkg/crds/main.go @@ -42,10 +42,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" - apiv1alpha1 "github.com/cortexlabs/cortex/pkg/crds/apis/api/v1alpha1" batch "github.com/cortexlabs/cortex/pkg/crds/apis/batch/v1alpha1" - apicontrollers "github.com/cortexlabs/cortex/pkg/crds/controllers/api" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" batchcontrollers "github.com/cortexlabs/cortex/pkg/crds/controllers/batch" + serverlesscontrollers "github.com/cortexlabs/cortex/pkg/crds/controllers/serverless" //+kubebuilder:scaffold:imports ) @@ -59,7 +59,7 @@ func init() { utilruntime.Must(istioscheme.AddToScheme(scheme)) utilruntime.Must(batch.AddToScheme(scheme)) - utilruntime.Must(apiv1alpha1.AddToScheme(scheme)) + utilruntime.Must(serverless.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -165,7 +165,7 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "BatchJob") os.Exit(1) } - if err = (&apicontrollers.RealtimeAPIReconciler{ + if err = (&serverlesscontrollers.RealtimeAPIReconciler{ Client: mgr.GetClient(), ClusterConfig: clusterConfig, Log: 
ctrl.Log.WithName("controllers").WithName("api").WithName("RealtimeAPI"), From 5f22fd83b29a6e2609106de7a1404b2d94b3d8d1 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 13:32:29 +0200 Subject: [PATCH 10/42] Update logger name for serverless CRD controllers --- pkg/crds/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/crds/main.go b/pkg/crds/main.go index 01e08b1d82..9948837832 100644 --- a/pkg/crds/main.go +++ b/pkg/crds/main.go @@ -168,7 +168,7 @@ func main() { if err = (&serverlesscontrollers.RealtimeAPIReconciler{ Client: mgr.GetClient(), ClusterConfig: clusterConfig, - Log: ctrl.Log.WithName("controllers").WithName("api").WithName("RealtimeAPI"), + Log: ctrl.Log.WithName("controllers").WithName("serverless").WithName("RealtimeAPI"), Scheme: mgr.GetScheme(), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "RealtimeAPI") From 6a97d37f6ba9dd110784a65db16982d7dc0586bc Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 13:40:01 +0200 Subject: [PATCH 11/42] Add additional print columns to realtime crd --- .../apis/serverless/v1alpha1/realtimeapi_types.go | 4 ++++ .../bases/serverless.cortex.dev_realtimeapis.yaml | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 997f9b87f0..18ff08c4ee 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -220,6 +220,10 @@ type RealtimeAPIStatus struct { //+kubebuilder:object:root=true //+kubebuilder:subresource:status +//+kubebuilder:printcolumn:JSONPath=".status.current_replicas",name="Replicas",type="integer" +//+kubebuilder:printcolumn:JSONPath=".status.ready_replicas",name="Ready",type="integer" +//+kubebuilder:printcolumn:JSONPath=".status.status",name="Status",type="string" 
+//+kubebuilder:printcolumn:JSONPath=".status.endpoint",name="Endpoint",type="string" // RealtimeAPI is the Schema for the realtimeapis API type RealtimeAPI struct { diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 377b3cc658..507793db05 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -16,7 +16,20 @@ spec: singular: realtimeapi scope: Namespaced versions: - - name: v1alpha1 + - additionalPrinterColumns: + - jsonPath: .status.current_replicas + name: Replicas + type: integer + - jsonPath: .status.ready_replicas + name: Ready + type: integer + - jsonPath: .status.status + name: Status + type: string + - jsonPath: .status.endpoint + name: Endpoint + type: string + name: v1alpha1 schema: openAPIV3Schema: description: RealtimeAPI is the Schema for the realtimeapis API From da929a6e7da02009a3ec2c40e95b51b550e3b4c9 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 16:35:57 +0200 Subject: [PATCH 12/42] Fix annotations in realtime crd --- .../serverless/realtimeapi_controller_helpers.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 1beb66a88b..5c2c23d3fd 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -416,9 +416,9 @@ func (r *RealtimeAPIReconciler) getAPIAnnotations(api serverless.RealtimeAPI) ma userconfig.MinReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MinReplicas), userconfig.MaxReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MaxReplicas), userconfig.TargetInFlightAnnotationKey: strings.Int32(api.Spec.Autoscaling.TargetInFlight), - 
userconfig.WindowAnnotationKey: api.Spec.Autoscaling.Window.String(), - userconfig.DownscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.DownscaleStabilizationPeriod.String(), - userconfig.UpscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.UpscaleStabilizationPeriod.String(), + userconfig.WindowAnnotationKey: api.Spec.Autoscaling.Window.Duration.String(), + userconfig.DownscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.DownscaleStabilizationPeriod.Duration.String(), + userconfig.UpscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.UpscaleStabilizationPeriod.Duration.String(), userconfig.MaxDownscaleFactorAnnotationKey: strings.Float64(api.Spec.Autoscaling.MaxDownscaleFactor.AsApproximateFloat64()), userconfig.MaxUpscaleFactorAnnotationKey: strings.Float64(api.Spec.Autoscaling.MaxUpscaleFactor.AsApproximateFloat64()), userconfig.DownscaleToleranceAnnotationKey: strings.Float64(api.Spec.Autoscaling.DownscaleTolerance.AsApproximateFloat64()), From f164ecbd565818573b2ecb1c94479c4d4d0b1249 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 16:36:40 +0200 Subject: [PATCH 13/42] Fix endpoint string on realtime crd status --- .../controllers/serverless/realtimeapi_controller_helpers.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 5c2c23d3fd..5dd1f862bc 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -142,8 +142,9 @@ func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *serverless return "", nil } - endpoint := fmt.Sprintf("http://%s/%s", - svc.Status.LoadBalancer.Ingress[0].Hostname, api.Spec.Networking.Endpoint, + endpoint := urls.Join( + fmt.Sprintf("http://%s", svc.Status.LoadBalancer.Ingress[0].Hostname), + 
api.Spec.Networking.Endpoint, ) return endpoint, nil From 27c83071b419fc91f6fa433f03fdc54faa542be9 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 16:39:49 +0200 Subject: [PATCH 14/42] Update createOrUpdate* methods --- .../serverless/realtimeapi_controller_helpers.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 5dd1f862bc..23c5260a7e 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -24,8 +24,10 @@ import ( serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/lib/maps" "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/strings" + "github.com/cortexlabs/cortex/pkg/lib/urls" "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" @@ -83,7 +85,10 @@ func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, ap Namespace: api.Namespace}, } op, err := controllerutil.CreateOrUpdate(ctx, r.Client, &deployment, func() error { - deployment.Spec = r.desiredDeployment(api).Spec + desiredDeployment := r.desiredDeployment(api) + deployment.Labels = desiredDeployment.Labels + deployment.Annotations = maps.MergeStrMapsString(deployment.Annotations, desiredDeployment.Annotations) + deployment.Spec = desiredDeployment.Spec return nil }) if err != nil { @@ -102,7 +107,7 @@ func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api s desiredSvc := r.desiredService(api) // We need to set fields individually because some are immutable service.Labels = desiredSvc.Labels - 
service.Annotations = desiredSvc.Annotations + service.Annotations = maps.MergeStrMapsString(service.Annotations, desiredSvc.Annotations) service.Spec.Type = desiredSvc.Spec.Type service.Spec.Ports = desiredSvc.Spec.Ports service.Spec.Selector = desiredSvc.Spec.Selector @@ -121,7 +126,10 @@ func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context Namespace: api.Namespace}, } op, err := controllerutil.CreateOrUpdate(ctx, r.Client, &vs, func() error { - vs.Spec = r.desiredVirtualService(api).Spec + desiredVirtualService := r.desiredVirtualService(api) + vs.Labels = desiredVirtualService.Labels + vs.Annotations = maps.MergeStrMapsString(vs.Annotations, desiredVirtualService.Annotations) + vs.Spec = desiredVirtualService.Spec return nil }) if err != nil { From 2f0f481a18810e71675909b65d055c4ac9f02302 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 18:52:08 +0200 Subject: [PATCH 15/42] Create apiID and deploymentID annotations on resource creation --- .../serverless/v1alpha1/realtimeapi_types.go | 2 +- .../serverless.cortex.dev_realtimeapis.yaml | 2 +- .../serverless/realtimeapi_controller.go | 15 +++- .../realtimeapi_controller_helpers.go | 72 +++++++++++++------ 4 files changed, 68 insertions(+), 23 deletions(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 18ff08c4ee..eb82ed3944 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -220,7 +220,7 @@ type RealtimeAPIStatus struct { //+kubebuilder:object:root=true //+kubebuilder:subresource:status -//+kubebuilder:printcolumn:JSONPath=".status.current_replicas",name="Replicas",type="integer" +//+kubebuilder:printcolumn:JSONPath=".spec.replicas",name="Replicas",type="integer" //+kubebuilder:printcolumn:JSONPath=".status.ready_replicas",name="Ready",type="integer" 
//+kubebuilder:printcolumn:JSONPath=".status.status",name="Status",type="string" //+kubebuilder:printcolumn:JSONPath=".status.endpoint",name="Endpoint",type="string" diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 507793db05..2e7a8479a5 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -17,7 +17,7 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: - - jsonPath: .status.current_replicas + - jsonPath: .spec.replicas name: Replicas type: integer - jsonPath: .status.ready_replicas diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go index 1c1bd1b93d..f6cbf8ef6f 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -84,7 +84,20 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, err } - // Step 3: Create or Update Resources + // Step 3: Get or create deployment and API ids + deploymentID, apiID := r.getOrCreateAPIIDs(api) + if api.Annotations["cortex.dev/deployment-id"] == "" || + api.Annotations["cortex.dev/api-id"] == "" { + + log.V(1).Info("creating api and deployment id annotations") + api.Annotations["cortex.dev/deployment-id"] = deploymentID + api.Annotations["cortex.dev/api-id"] = apiID + if err = r.Update(ctx, &api); err != nil { + return ctrl.Result{}, err + } + } + + // Step 4: Create or Update Resources deployOp, err := r.createOrUpdateDeployment(ctx, api) if err != nil { return ctrl.Result{}, err diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 23c5260a7e..cc86eb8135 100644 --- 
a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -17,17 +17,20 @@ limitations under the License. package serverlesscontroller import ( + "bytes" "context" "fmt" "github.com/cortexlabs/cortex/pkg/consts" serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/hash" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/maps" "github.com/cortexlabs/cortex/pkg/lib/pointer" - "github.com/cortexlabs/cortex/pkg/lib/strings" + s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/lib/urls" + "github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" @@ -39,6 +42,7 @@ import ( kresource "k8s.io/apimachinery/pkg/api/resource" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -160,6 +164,7 @@ func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *serverless func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) kapps.Deployment { containers, volumes := r.desiredContainers(api) + deploymentID, apiID := r.getOrCreateAPIIDs(api) return *k8s.Deployment(&k8s.DeploymentSpec{ Name: workloads.K8sName(api.Name), @@ -169,11 +174,11 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), - "apiID": api.Annotations["cortex.dev/api-id"], // TODO: check if can be replaced with resource version - "deploymentID": api.Annotations["cortex.dev/deployment-id"], // FIXME: 
needs to be created beforehand + "apiID": apiID, + "deploymentID": deploymentID, "cortex.dev/api": "true", }, - Annotations: r.getAPIAnnotations(api), + Annotations: r.generateAPIAnnotations(api), Selector: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), @@ -182,7 +187,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), - "deploymentID": api.Annotations["cortex.dev/deployment-id"], + "deploymentID": deploymentID, "cortex.dev/api": "true", }, Annotations: map[string]string{ @@ -218,7 +223,7 @@ func (r *RealtimeAPIReconciler) desiredService(api serverless.RealtimeAPI) kcore PortName: "http", Port: consts.ProxyPortInt32, TargetPort: consts.ProxyPortInt32, - Annotations: r.getAPIAnnotations(api), + Annotations: r.generateAPIAnnotations(api), Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), @@ -237,6 +242,8 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI activatorWeight = 100 } + deploymentID, apiID := r.getOrCreateAPIIDs(api) + return *k8s.VirtualService(&k8s.VirtualServiceSpec{ Name: workloads.K8sName(api.Name), Gateways: []string{"apis-gateway"}, @@ -279,12 +286,12 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI }, PrefixPath: pointer.String(api.Spec.Networking.Endpoint), Rewrite: pointer.String("/"), - Annotations: r.getAPIAnnotations(api), + Annotations: r.generateAPIAnnotations(api), Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), - "apiID": api.Annotations["cortex.dev/api-id"], - "deploymentID": api.Annotations["cortex.dev/deployment-id"], + "apiID": apiID, + "deploymentID": deploymentID, "cortex.dev/api": "true", }, }) @@ -383,11 +390,11 @@ func (r *RealtimeAPIReconciler) proxyContainer(api serverless.RealtimeAPI) (kcor 
"--admin-port", consts.AdminPortStr, "--user-port", - strings.Int32(api.Spec.Pod.Port), + s.Int32(api.Spec.Pod.Port), "--max-concurrency", - strings.Int32(api.Spec.Pod.MaxConcurrency), + s.Int32(api.Spec.Pod.MaxConcurrency), "--max-queue-length", - strings.Int32(api.Spec.Pod.MaxQueueLength), + s.Int32(api.Spec.Pod.MaxQueueLength), }, Ports: []kcore.ContainerPort{ {Name: consts.AdminPortName, ContainerPort: consts.AdminPortInt32}, @@ -420,17 +427,42 @@ func (r *RealtimeAPIReconciler) proxyContainer(api serverless.RealtimeAPI) (kcor }, workloads.ClusterConfigVolume() } -func (r *RealtimeAPIReconciler) getAPIAnnotations(api serverless.RealtimeAPI) map[string]string { +func (r *RealtimeAPIReconciler) getOrCreateAPIIDs(api serverless.RealtimeAPI) (deploymentID string, apiID string) { + deploymentID = api.Annotations["cortex.dev/deployment-id"] + if deploymentID == "" { + deploymentID = k8s.RandomName()[:10] + } + + apiID = api.Annotations["cortex.dev/api-id"] + if apiID == "" { + var buf bytes.Buffer + + buf.WriteString(api.Name) + buf.WriteString(s.Obj(api.TypeMeta)) + buf.WriteString(s.Obj(api.Spec.Pod)) + buf.WriteString(s.Obj(api.Spec.Networking)) + buf.WriteString(s.Obj(api.Spec.Autoscaling)) + buf.WriteString(s.Obj(api.Spec.NodeGroups)) + buf.WriteString(s.Obj(api.Spec.UpdateStrategy)) + specID := hash.Bytes(buf.Bytes())[:32] + + apiID = fmt.Sprintf("%s-%s-%s", spec.MonotonicallyDecreasingID(), deploymentID, specID) + } + + return deploymentID, apiID +} + +func (r *RealtimeAPIReconciler) generateAPIAnnotations(api serverless.RealtimeAPI) map[string]string { return map[string]string{ - userconfig.MinReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MinReplicas), - userconfig.MaxReplicasAnnotationKey: strings.Int32(api.Spec.Autoscaling.MaxReplicas), - userconfig.TargetInFlightAnnotationKey: strings.Int32(api.Spec.Autoscaling.TargetInFlight), + userconfig.MinReplicasAnnotationKey: s.Int32(api.Spec.Autoscaling.MinReplicas), + 
userconfig.MaxReplicasAnnotationKey: s.Int32(api.Spec.Autoscaling.MaxReplicas), + userconfig.TargetInFlightAnnotationKey: s.Int32(api.Spec.Autoscaling.TargetInFlight), userconfig.WindowAnnotationKey: api.Spec.Autoscaling.Window.Duration.String(), userconfig.DownscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.DownscaleStabilizationPeriod.Duration.String(), userconfig.UpscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.UpscaleStabilizationPeriod.Duration.String(), - userconfig.MaxDownscaleFactorAnnotationKey: strings.Float64(api.Spec.Autoscaling.MaxDownscaleFactor.AsApproximateFloat64()), - userconfig.MaxUpscaleFactorAnnotationKey: strings.Float64(api.Spec.Autoscaling.MaxUpscaleFactor.AsApproximateFloat64()), - userconfig.DownscaleToleranceAnnotationKey: strings.Float64(api.Spec.Autoscaling.DownscaleTolerance.AsApproximateFloat64()), - userconfig.UpscaleToleranceAnnotationKey: strings.Float64(api.Spec.Autoscaling.UpscaleTolerance.AsApproximateFloat64()), + userconfig.MaxDownscaleFactorAnnotationKey: s.Float64(api.Spec.Autoscaling.MaxDownscaleFactor.AsApproximateFloat64()), + userconfig.MaxUpscaleFactorAnnotationKey: s.Float64(api.Spec.Autoscaling.MaxUpscaleFactor.AsApproximateFloat64()), + userconfig.DownscaleToleranceAnnotationKey: s.Float64(api.Spec.Autoscaling.DownscaleTolerance.AsApproximateFloat64()), + userconfig.UpscaleToleranceAnnotationKey: s.Float64(api.Spec.Autoscaling.UpscaleTolerance.AsApproximateFloat64()), } } From a51872e84999402547c4cf55f94255409ee2a5c6 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Wed, 21 Jul 2021 18:52:24 +0200 Subject: [PATCH 16/42] Set controller reference on child resources --- .../serverless/realtimeapi_controller_helpers.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index cc86eb8135..fbc934a9f0 100644 --- 
a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -93,6 +93,11 @@ func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, ap deployment.Labels = desiredDeployment.Labels deployment.Annotations = maps.MergeStrMapsString(deployment.Annotations, desiredDeployment.Annotations) deployment.Spec = desiredDeployment.Spec + + if err := ctrl.SetControllerReference(&api, &deployment, r.Scheme); err != nil { + return err + } + return nil }) if err != nil { @@ -115,6 +120,11 @@ func (r *RealtimeAPIReconciler) createOrUpdateService(ctx context.Context, api s service.Spec.Type = desiredSvc.Spec.Type service.Spec.Ports = desiredSvc.Spec.Ports service.Spec.Selector = desiredSvc.Spec.Selector + + if err := ctrl.SetControllerReference(&api, &service, r.Scheme); err != nil { + return err + } + return nil }) if err != nil { @@ -134,6 +144,11 @@ func (r *RealtimeAPIReconciler) createOrUpdateVirtualService(ctx context.Context vs.Labels = desiredVirtualService.Labels vs.Annotations = maps.MergeStrMapsString(vs.Annotations, desiredVirtualService.Annotations) vs.Spec = desiredVirtualService.Spec + + if err := ctrl.SetControllerReference(&api, &vs, r.Scheme); err != nil { + return err + } + return nil }) if err != nil { From 5d830c7cfc89aa0e92f95fcb00d1653e8028e401 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Thu, 22 Jul 2021 12:54:15 +0200 Subject: [PATCH 17/42] Add replica counts to RealtimeAPI CRD status --- .../serverless/v1alpha1/realtimeapi_types.go | 12 +-- .../v1alpha1/zz_generated.deepcopy.go | 1 + .../serverless.cortex.dev_realtimeapis.yaml | 95 +++++++++++++++--- .../realtimeapi_controller_helpers.go | 98 +++++++++++++++++-- pkg/types/status/code.go | 16 +-- pkg/types/status/status.go | 25 ++--- 6 files changed, 198 insertions(+), 49 deletions(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go 
b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index eb82ed3944..023b1967aa 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -211,17 +211,15 @@ type NetworkingSpec struct { // RealtimeAPIStatus defines the observed state of RealtimeAPI type RealtimeAPIStatus struct { // +kubebuilder:validation:Type=string - Status status.Code `json:"status"` - DesiredReplicas int32 `json:"desired_replicas"` - CurrentReplicas int32 `json:"current_replicas"` - ReadyReplicas int32 `json:"ready_replicas"` - Endpoint string `json:"endpoint,omitempty"` + Status status.Code `json:"status"` + ReplicaCounts status.ReplicaCounts `json:"replica_counts"` + Endpoint string `json:"endpoint,omitempty"` } //+kubebuilder:object:root=true //+kubebuilder:subresource:status -//+kubebuilder:printcolumn:JSONPath=".spec.replicas",name="Replicas",type="integer" -//+kubebuilder:printcolumn:JSONPath=".status.ready_replicas",name="Ready",type="integer" +//+kubebuilder:printcolumn:JSONPath=".spec.pod.replicas",name="Replicas",type="integer" +//+kubebuilder:printcolumn:JSONPath=".status.replica_counts.updated.ready",name="Ready",type="integer" //+kubebuilder:printcolumn:JSONPath=".status.status",name="Status",type="string" //+kubebuilder:printcolumn:JSONPath=".status.endpoint",name="Endpoint",type="string" diff --git a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go index a52d87d385..9376932a2d 100644 --- a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go @@ -247,6 +247,7 @@ func (in *RealtimeAPISpec) DeepCopy() *RealtimeAPISpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RealtimeAPIStatus) DeepCopyInto(out *RealtimeAPIStatus) { *out = *in + out.ReplicaCounts = in.ReplicaCounts } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RealtimeAPIStatus. diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 2e7a8479a5..61e2d626f2 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -17,10 +17,10 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: - - jsonPath: .spec.replicas + - jsonPath: .spec.pod.replicas name: Replicas type: integer - - jsonPath: .status.ready_replicas + - jsonPath: .status.replica_counts.updated.ready name: Ready type: integer - jsonPath: .status.status @@ -619,23 +619,90 @@ spec: status: description: RealtimeAPIStatus defines the observed state of RealtimeAPI properties: - current_replicas: - format: int32 - type: integer - desired_replicas: - format: int32 - type: integer endpoint: type: string - ready_replicas: - format: int32 - type: integer + replica_counts: + properties: + requested: + format: int32 + type: integer + stale: + properties: + err_image_pull: + format: int32 + type: integer + failed: + format: int32 + type: integer + initializing: + format: int32 + type: integer + killed: + format: int32 + type: integer + killed_oom: + format: int32 + type: integer + not_ready: + format: int32 + type: integer + pending: + format: int32 + type: integer + ready: + format: int32 + type: integer + stalled: + format: int32 + type: integer + terminating: + format: int32 + type: integer + unknown: + format: int32 + type: integer + type: object + updated: + properties: + err_image_pull: + format: int32 + type: integer + failed: + format: int32 + type: integer + initializing: + format: int32 + type: integer + killed: + format: int32 + type: integer + killed_oom: + 
format: int32 + type: integer + not_ready: + format: int32 + type: integer + pending: + format: int32 + type: integer + ready: + format: int32 + type: integer + stalled: + format: int32 + type: integer + terminating: + format: int32 + type: integer + unknown: + format: int32 + type: integer + type: object + type: object status: type: string required: - - current_replicas - - desired_replicas - - ready_replicas + - replica_counts - status type: object type: object diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index fbc934a9f0..632f11f9fd 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -20,6 +20,7 @@ import ( "bytes" "context" "fmt" + "time" "github.com/cortexlabs/cortex/pkg/consts" serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" @@ -60,21 +61,28 @@ func (r *RealtimeAPIReconciler) getDeployment(ctx context.Context, api serverles } func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *serverless.RealtimeAPI, deployment *kapps.Deployment) error { - apiStatus := status.Pending - api.Status.Status = apiStatus // FIXME: handle other status - - endpoint, err := r.getEndpoint(ctx, api) + var err error + api.Status.Endpoint, err = r.getEndpoint(ctx, api) if err != nil { return errors.Wrap(err, "failed to get api endpoint") } - api.Status.Endpoint = endpoint + apiStatus := status.Pending + api.Status.ReplicaCounts = status.ReplicaCounts{} if deployment != nil { - api.Status.DesiredReplicas = *deployment.Spec.Replicas - api.Status.CurrentReplicas = deployment.Status.Replicas - api.Status.ReadyReplicas = deployment.Status.ReadyReplicas + if deployment.Status.ReadyReplicas == api.Spec.Pod.Replicas { + apiStatus = status.Live + api.Status.ReplicaCounts.Updated.Ready = deployment.Status.ReadyReplicas + // TODO: handle out of date (?) 
+ } else { + if err = r.getReplicaCounts(ctx, api); err != nil { + return err + } + apiStatus = r.getStatusCode(api) + } } + api.Status.Status = apiStatus if err = r.Status().Update(ctx, api); err != nil { return err } @@ -82,6 +90,80 @@ func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *serverles return nil } +func (r *RealtimeAPIReconciler) getReplicaCounts(ctx context.Context, api *serverless.RealtimeAPI) error { + var podList kcore.PodList + if err := r.List(ctx, &podList, client.MatchingLabels{ + "apiName": api.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + "deploymentID": api.Annotations["cortex.dev/deployment-id"], + }); err != nil { + return err + } + for i := range podList.Items { + pod := &podList.Items[i] + if k8s.IsPodReady(pod) { + api.Status.ReplicaCounts.Updated.Ready++ + continue + } + + switch k8s.GetPodStatus(pod) { + case k8s.PodStatusPending: + if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout { + api.Status.ReplicaCounts.Updated.Stalled++ + } else { + api.Status.ReplicaCounts.Updated.Pending++ + } + case k8s.PodStatusInitializing: + api.Status.ReplicaCounts.Updated.Initializing++ + case k8s.PodStatusRunning: + api.Status.ReplicaCounts.Updated.Initializing++ + case k8s.PodStatusErrImagePull: + api.Status.ReplicaCounts.Updated.ErrImagePull++ + case k8s.PodStatusTerminating: + api.Status.ReplicaCounts.Updated.Terminating++ + case k8s.PodStatusFailed: + api.Status.ReplicaCounts.Updated.Failed++ + case k8s.PodStatusKilled: + api.Status.ReplicaCounts.Updated.Killed++ + case k8s.PodStatusKilledOOM: + api.Status.ReplicaCounts.Updated.KilledOOM++ + default: + api.Status.ReplicaCounts.Updated.Unknown++ + } + } + + return nil +} + +func (r *RealtimeAPIReconciler) getStatusCode(api *serverless.RealtimeAPI) status.Code { + counts := api.Status.ReplicaCounts + if counts.Updated.Ready >= api.Spec.Pod.Replicas { + return status.Live + } + + if counts.Updated.ErrImagePull > 0 { + return 
status.ErrorImagePull + } + + if counts.Updated.Failed > 0 || counts.Updated.Killed > 0 { + return status.Error + } + + if counts.Updated.KilledOOM > 0 { + return status.OOM + } + + if counts.Updated.Stalled > 0 { + return status.Stalled + } + + if counts.Updated.Ready >= api.Spec.Autoscaling.MinReplicas { + return status.Live + } + + return status.Updating +} + func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api serverless.RealtimeAPI) (controllerutil.OperationResult, error) { deployment := kapps.Deployment{ ObjectMeta: kmeta.ObjectMeta{ diff --git a/pkg/types/status/code.go b/pkg/types/status/code.go index 17fc8ca12f..11d9c002ea 100644 --- a/pkg/types/status/code.go +++ b/pkg/types/status/code.go @@ -31,14 +31,14 @@ const ( ) var _codes = []string{ - "status_unknown", - "status_pending", - "status_stalled", - "status_error", - "status_error_image_pull", - "status_oom", - "status_live", - "status_updating", + "unknown", + "pending", + "stalled", + "error", + "error_image_pull", + "oom", + "live", + "updating", } var _ = [1]int{}[int(Updating)-(len(_codes)-1)] // Ensure list length matches diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index 6dad4e1992..b1ef426504 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -24,22 +24,23 @@ type Status struct { } type ReplicaCounts struct { - Updated SubReplicaCounts `json:"updated"` - Stale SubReplicaCounts `json:"stale"` - Requested int32 `json:"requested"` + Updated SubReplicaCounts `json:"updated,omitempty"` + Stale SubReplicaCounts `json:"stale,omitempty"` + Requested int32 `json:"requested,omitempty"` } type SubReplicaCounts struct { - Pending int32 `json:"pending"` - Initializing int32 `json:"initializing"` + Pending int32 `json:"pending,omitempty"` + Initializing int32 `json:"initializing,omitempty"` Ready int32 `json:"ready"` - ErrImagePull int32 `json:"err_image_pull"` - Terminating int32 `json:"terminating"` - Failed int32 `json:"failed"` 
- Killed int32 `json:"killed"` - KilledOOM int32 `json:"killed_oom"` - Stalled int32 `json:"stalled"` // pending for a long time - Unknown int32 `json:"unknown"` + NotReady int32 `json:"not_ready,omitempty"` + ErrImagePull int32 `json:"err_image_pull,omitempty"` + Terminating int32 `json:"terminating,omitempty"` + Failed int32 `json:"failed,omitempty"` + Killed int32 `json:"killed,omitempty"` + KilledOOM int32 `json:"killed_oom,omitempty"` + Stalled int32 `json:"stalled,omitempty"` // pending for a long time + Unknown int32 `json:"unknown,omitempty"` } // Worker counts don't have as many failure variations because Jobs clean up dead pods, so counting different failure scenarios isn't interesting From 766f154e4421220a078c067a8ca3aa5cb7f08e3f Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Thu, 22 Jul 2021 14:58:55 +0200 Subject: [PATCH 18/42] Handle api ids annotations --- .../serverless/realtimeapi_controller.go | 22 +++++++++--- .../realtimeapi_controller_helpers.go | 36 +++++++++++-------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go index f6cbf8ef6f..1697be70bf 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -85,13 +85,27 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Step 3: Get or create deployment and API ids - deploymentID, apiID := r.getOrCreateAPIIDs(api) - if api.Annotations["cortex.dev/deployment-id"] == "" || - api.Annotations["cortex.dev/api-id"] == "" { + deploymentID, specID, apiID := r.getOrCreateAPIIDs(api) + idsOutdated := api.Annotations["cortex.dev/deployment-id"] != deploymentID || + api.Annotations["cortex.dev/spec-id"] != specID || + api.Annotations["cortex.dev/api-id"] != apiID - log.V(1).Info("creating api and deployment id annotations") + if 
api.Annotations["cortex.dev/deployment-id"] != deploymentID { + log.V(1).Info("updating deployment id annotation") api.Annotations["cortex.dev/deployment-id"] = deploymentID + } + + if api.Annotations["cortex.dev/spec-id"] != specID { + log.V(1).Info("updating spec id annotation") + api.Annotations["cortex.dev/spec-id"] = specID + } + + if api.Annotations["cortex.dev/api-id"] != apiID { + log.V(1).Info("updating api id annotation") api.Annotations["cortex.dev/api-id"] = apiID + } + + if idsOutdated { if err = r.Update(ctx, &api); err != nil { return ctrl.Result{}, err } diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 632f11f9fd..c561539e50 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -261,7 +261,7 @@ func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *serverless func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) kapps.Deployment { containers, volumes := r.desiredContainers(api) - deploymentID, apiID := r.getOrCreateAPIIDs(api) + deploymentID, _, apiID := r.getOrCreateAPIIDs(api) return *k8s.Deployment(&k8s.DeploymentSpec{ Name: workloads.K8sName(api.Name), @@ -339,7 +339,7 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI activatorWeight = 100 } - deploymentID, apiID := r.getOrCreateAPIIDs(api) + deploymentID, _, apiID := r.getOrCreateAPIIDs(api) return *k8s.VirtualService(&k8s.VirtualServiceSpec{ Name: workloads.K8sName(api.Name), @@ -524,29 +524,35 @@ func (r *RealtimeAPIReconciler) proxyContainer(api serverless.RealtimeAPI) (kcor }, workloads.ClusterConfigVolume() } -func (r *RealtimeAPIReconciler) getOrCreateAPIIDs(api serverless.RealtimeAPI) (deploymentID string, apiID string) { +func (r *RealtimeAPIReconciler) getOrCreateAPIIDs(api serverless.RealtimeAPI) (deploymentID 
string, specID string, apiID string) { deploymentID = api.Annotations["cortex.dev/deployment-id"] if deploymentID == "" { deploymentID = k8s.RandomName()[:10] } + specID = r.getSpecHash(api) + apiID = api.Annotations["cortex.dev/api-id"] - if apiID == "" { - var buf bytes.Buffer - - buf.WriteString(api.Name) - buf.WriteString(s.Obj(api.TypeMeta)) - buf.WriteString(s.Obj(api.Spec.Pod)) - buf.WriteString(s.Obj(api.Spec.Networking)) - buf.WriteString(s.Obj(api.Spec.Autoscaling)) - buf.WriteString(s.Obj(api.Spec.NodeGroups)) - buf.WriteString(s.Obj(api.Spec.UpdateStrategy)) - specID := hash.Bytes(buf.Bytes())[:32] + if apiID == "" || + api.Annotations["cortex.dev/deployment-id"] != deploymentID || + api.Annotations["cortex.dev/spec-id"] != specID { apiID = fmt.Sprintf("%s-%s-%s", spec.MonotonicallyDecreasingID(), deploymentID, specID) } - return deploymentID, apiID + return deploymentID, specID, apiID +} + +func (r *RealtimeAPIReconciler) getSpecHash(api serverless.RealtimeAPI) string { + var buf bytes.Buffer + buf.WriteString(api.Name) + buf.WriteString(s.Obj(api.TypeMeta)) + buf.WriteString(s.Obj(api.Spec.Pod)) + buf.WriteString(s.Obj(api.Spec.Networking)) + buf.WriteString(s.Obj(api.Spec.Autoscaling)) + buf.WriteString(s.Obj(api.Spec.NodeGroups)) + buf.WriteString(s.Obj(api.Spec.UpdateStrategy)) + return hash.Bytes(buf.Bytes())[:32] } func (r *RealtimeAPIReconciler) generateAPIAnnotations(api serverless.RealtimeAPI) map[string]string { From 07af2a828b57eafa811dc9e7f61c7bf479b5e43b Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Sat, 24 Jul 2021 17:39:49 +0200 Subject: [PATCH 19/42] WIP: refactor UpdateAPI function for RealtimeAPI to work with the CRD --- pkg/config/config.go | 2 + .../serverless/v1alpha1/realtimeapi_types.go | 18 +- .../v1alpha1/zz_generated.deepcopy.go | 6 +- .../serverless.cortex.dev_realtimeapis.yaml | 39 ++-- .../serverless/realtimeapi_controller.go | 4 + .../realtimeapi_controller_helpers.go | 10 +- 
pkg/operator/resources/realtimeapi/api.go | 174 ++++++++++++------ 7 files changed, 152 insertions(+), 101 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index eb7bd5e269..0e8b9bc566 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -24,6 +24,7 @@ import ( "github.com/DataDog/datadog-go/statsd" "github.com/cortexlabs/cortex/pkg/consts" batch "github.com/cortexlabs/cortex/pkg/crds/apis/batch/v1alpha1" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/aws" cr "github.com/cortexlabs/cortex/pkg/lib/configreader" "github.com/cortexlabs/cortex/pkg/lib/errors" @@ -55,6 +56,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(batch.AddToScheme(scheme)) + utilruntime.Must(serverless.AddToScheme(scheme)) } func InitConfigs(clusterConfig *clusterconfig.Config, operatorMetadata *clusterconfig.OperatorMetadata) { diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 023b1967aa..66749debe3 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -148,7 +148,7 @@ type AutoscalingSpec struct { // +kubebuilder:validation:Optional // Desired number of in-flight requests per replica (including requests actively being processed as well as queued), // which the autoscaler tries to maintain - TargetInFlight int32 `json:"target_in_flight,omitempty"` + TargetInFlight string `json:"target_in_flight,omitempty"` // +kubebuilder:validation:Optional // +kubebuilder:default="60s" @@ -166,25 +166,25 @@ type AutoscalingSpec struct { UpscaleStabilizationPeriod kmeta.Duration `json:"upscale_stabilization_period,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="750m" + // +kubebuilder:default="0.75" // Maximum factor by which to scale down the API on a single scaling event - 
MaxDownscaleFactor resource.Quantity `json:"max_downscale_factor,omitempty"` + MaxDownscaleFactor string `json:"max_downscale_factor,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="1500m" + // +kubebuilder:default="1.5" // Maximum factor by which to scale up the API on a single scaling event - MaxUpscaleFactor resource.Quantity `json:"max_upscale_factor,omitempty"` + MaxUpscaleFactor string `json:"max_upscale_factor,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="50m" + // +kubebuilder:default="0.5" // Any recommendation falling within this factor below the current number of replicas will not trigger a // scale down event - DownscaleTolerance resource.Quantity `json:"downscale_tolerance,omitempty"` + DownscaleTolerance string `json:"downscale_tolerance,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="50m" + // +kubebuilder:default="0.5" // Any recommendation falling within this factor above the current number of replicas will not trigger a scale up event - UpscaleTolerance resource.Quantity `json:"upscale_tolerance,omitempty"` + UpscaleTolerance string `json:"upscale_tolerance,omitempty"` } type UpdateStratagySpec struct { diff --git a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go index 9376932a2d..030c91042f 100644 --- a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go @@ -31,10 +31,6 @@ func (in *AutoscalingSpec) DeepCopyInto(out *AutoscalingSpec) { out.Window = in.Window out.DownscaleStabilizationPeriod = in.DownscaleStabilizationPeriod out.UpscaleStabilizationPeriod = in.UpscaleStabilizationPeriod - out.MaxDownscaleFactor = in.MaxDownscaleFactor.DeepCopy() - out.MaxUpscaleFactor = in.MaxUpscaleFactor.DeepCopy() - out.DownscaleTolerance = in.DownscaleTolerance.DeepCopy() - out.UpscaleTolerance = in.UpscaleTolerance.DeepCopy() } // 
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoscalingSpec. @@ -224,7 +220,7 @@ func (in *RealtimeAPIList) DeepCopyObject() runtime.Object { func (in *RealtimeAPISpec) DeepCopyInto(out *RealtimeAPISpec) { *out = *in in.Pod.DeepCopyInto(&out.Pod) - in.Autoscaling.DeepCopyInto(&out.Autoscaling) + out.Autoscaling = in.Autoscaling if in.NodeGroups != nil { in, out := &in.NodeGroups, &out.NodeGroups *out = make([]string, len(*in)) diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 61e2d626f2..3e8b6d267b 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -60,38 +60,26 @@ spec: made during this period type: string downscale_tolerance: - anyOf: - - type: integer - - type: string - default: 50m + default: "0.5" description: Any recommendation falling within this factor below the current number of replicas will not trigger a scale down event - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string max_downscale_factor: - anyOf: - - type: integer - - type: string - default: 750m + default: "0.75" description: Maximum factor by which to scale down the API on a single scaling event - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string max_replicas: default: 100 description: Maximum number of replicas format: int32 type: integer max_upscale_factor: - anyOf: - - type: integer - - type: string - default: 1500m + default: "1.5" description: Maximum factor by which to scale up the API on a single scaling event - pattern: 
^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string min_replicas: default: 1 description: Minimum number of replicas @@ -101,22 +89,17 @@ spec: description: Desired number of in-flight requests per replica (including requests actively being processed as well as queued), which the autoscaler tries to maintain - format: int32 - type: integer + type: string upscale_stabilization_period: default: 1m description: The API will not scale above the lowest recommendation made during this period type: string upscale_tolerance: - anyOf: - - type: integer - - type: string - default: 50m + default: "0.5" description: Any recommendation falling within this factor above the current number of replicas will not trigger a scale up event - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string window: default: 60s description: Duration over which to average the API's in-flight @@ -661,6 +644,8 @@ spec: unknown: format: int32 type: integer + required: + - ready type: object updated: properties: @@ -697,6 +682,8 @@ spec: unknown: format: int32 type: integer + required: + - ready type: object type: object status: diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go index 1697be70bf..fa970708c1 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -90,6 +90,10 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) api.Annotations["cortex.dev/spec-id"] != specID || api.Annotations["cortex.dev/api-id"] != apiID + if api.Annotations == nil { + api.Annotations = map[string]string{} + } + if api.Annotations["cortex.dev/deployment-id"] != deploymentID { log.V(1).Info("updating 
deployment id annotation") api.Annotations["cortex.dev/deployment-id"] = deploymentID diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index c561539e50..41316512bc 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -559,13 +559,13 @@ func (r *RealtimeAPIReconciler) generateAPIAnnotations(api serverless.RealtimeAP return map[string]string{ userconfig.MinReplicasAnnotationKey: s.Int32(api.Spec.Autoscaling.MinReplicas), userconfig.MaxReplicasAnnotationKey: s.Int32(api.Spec.Autoscaling.MaxReplicas), - userconfig.TargetInFlightAnnotationKey: s.Int32(api.Spec.Autoscaling.TargetInFlight), + userconfig.TargetInFlightAnnotationKey: api.Spec.Autoscaling.TargetInFlight, userconfig.WindowAnnotationKey: api.Spec.Autoscaling.Window.Duration.String(), userconfig.DownscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.DownscaleStabilizationPeriod.Duration.String(), userconfig.UpscaleStabilizationPeriodAnnotationKey: api.Spec.Autoscaling.UpscaleStabilizationPeriod.Duration.String(), - userconfig.MaxDownscaleFactorAnnotationKey: s.Float64(api.Spec.Autoscaling.MaxDownscaleFactor.AsApproximateFloat64()), - userconfig.MaxUpscaleFactorAnnotationKey: s.Float64(api.Spec.Autoscaling.MaxUpscaleFactor.AsApproximateFloat64()), - userconfig.DownscaleToleranceAnnotationKey: s.Float64(api.Spec.Autoscaling.DownscaleTolerance.AsApproximateFloat64()), - userconfig.UpscaleToleranceAnnotationKey: s.Float64(api.Spec.Autoscaling.UpscaleTolerance.AsApproximateFloat64()), + userconfig.MaxDownscaleFactorAnnotationKey: api.Spec.Autoscaling.MaxDownscaleFactor, + userconfig.MaxUpscaleFactorAnnotationKey: api.Spec.Autoscaling.MaxUpscaleFactor, + userconfig.DownscaleToleranceAnnotationKey: api.Spec.Autoscaling.DownscaleTolerance, + userconfig.UpscaleToleranceAnnotationKey: 
api.Spec.Autoscaling.UpscaleTolerance, } } diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 256b253f8e..84e24bf4b8 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -17,16 +17,18 @@ limitations under the License. package realtimeapi import ( + "context" "fmt" "path/filepath" - "time" + "reflect" "github.com/cortexlabs/cortex/pkg/config" + "github.com/cortexlabs/cortex/pkg/consts" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/lib/pointer" - "github.com/cortexlabs/cortex/pkg/operator/lib/routines" "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" @@ -36,6 +38,11 @@ import ( istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" + kerrors "k8s.io/apimachinery/pkg/api/errors" + kresource "k8s.io/apimachinery/pkg/api/resource" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" ) const _realtimeDashboardUID = "realtimeapi" @@ -45,67 +52,35 @@ func generateDeploymentID() string { } func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) { - prevDeployment, prevService, prevVirtualService, err := getK8sResources(apiConfig.Name) - if err != nil { - return nil, "", err - } - - initialDeploymentTime := time.Now().UnixNano() - deploymentID := generateDeploymentID() - if prevVirtualService != nil && prevVirtualService.Labels["initialDeploymentTime"] != "" { - var err error - initialDeploymentTime, err = k8s.ParseInt64Label(prevVirtualService, "initialDeploymentTime") - if err != nil { - return 
nil, "", err - } - deploymentID = prevVirtualService.Labels["deploymentID"] - } + ctx := context.Background() + var api serverless.RealtimeAPI + key := client.ObjectKey{Namespace: consts.DefaultNamespace, Name: apiConfig.Name} - api := spec.GetAPISpec(apiConfig, initialDeploymentTime, deploymentID, config.ClusterConfig.ClusterUID) - - if prevDeployment == nil { - if err := config.AWS.UploadJSONToS3(api, config.ClusterConfig.Bucket, api.Key); err != nil { - return nil, "", errors.Wrap(err, "upload api spec") - } - - if err := applyK8sResources(api, prevDeployment, prevService, prevVirtualService); err != nil { - routines.RunWithPanicHandler(func() { - _ = deleteK8sResources(api.Name) - }) - return nil, "", err + apiSpec := &spec.API{API: apiConfig} + err := config.K8s.Get(ctx, key, &api) + if err != nil { + if kerrors.IsNotFound(err) { + if kerrors.IsNotFound(err) { + api := APIConfigToK8sResource(*apiConfig) + if err = config.K8s.Create(ctx, &api); err != nil { + return nil, "", errors.Wrap(err, "failed to create realtime api resource") + } + return apiSpec, fmt.Sprintf("creating %s", apiConfig.Resource.UserString()), nil + } } - - return api, fmt.Sprintf("creating %s", api.Resource.UserString()), nil + return nil, "", errors.Wrap(err, "failed to get realtime api resource") } - if prevVirtualService.Labels["specID"] != api.SpecID || prevVirtualService.Labels["deploymentID"] != api.DeploymentID { - isUpdating, err := isAPIUpdating(prevDeployment) - if err != nil { - return nil, "", err - } - if isUpdating && !force { - return nil, "", ErrorAPIUpdating(api.Name) + desiredAPI := APIConfigToK8sResource(*apiConfig) + if !reflect.DeepEqual(api.Spec, desiredAPI.Spec) || force { + api.Spec = desiredAPI.Spec + if err = config.K8s.Update(ctx, &api); err != nil { + return nil, "", errors.Wrap(err, "failed to update realtime api resource") } - - if err := config.AWS.UploadJSONToS3(api, config.ClusterConfig.Bucket, api.Key); err != nil { - return nil, "", errors.Wrap(err, 
"upload api spec") - } - - if err := applyK8sResources(api, prevDeployment, prevService, prevVirtualService); err != nil { - return nil, "", err - } - return api, fmt.Sprintf("updating %s", api.Resource.UserString()), nil + return apiSpec, fmt.Sprintf("updating %s", apiConfig.Resource.UserString()), nil } - // deployment didn't change - isUpdating, err := isAPIUpdating(prevDeployment) - if err != nil { - return nil, "", err - } - if isUpdating { - return api, fmt.Sprintf("%s is already updating", api.Resource.UserString()), nil - } - return api, fmt.Sprintf("%s is up to date", api.Resource.UserString()), nil + return apiSpec, fmt.Sprintf("%s is up to date", apiConfig.Resource.UserString()), nil } func RefreshAPI(apiName string, force bool) (string, error) { @@ -396,3 +371,90 @@ func getDashboardURL(apiName string) string { return dashboardURL } + +func APIConfigToK8sResource(apiConfig userconfig.API) serverless.RealtimeAPI { + var containers []serverless.ContainerSpec + for _, containerConfig := range apiConfig.Pod.Containers { + var env []kcore.EnvVar + for k, v := range containerConfig.Env { + env = append(env, kcore.EnvVar{ + Name: k, + Value: v, + }) + } + + var compute *serverless.ComputeSpec + if containerConfig.Compute != nil { + var cpu *kresource.Quantity + if containerConfig.Compute.CPU != nil { + cpu = &containerConfig.Compute.CPU.Quantity + } + var mem *kresource.Quantity + if containerConfig.Compute.Mem != nil { + mem = &containerConfig.Compute.Mem.Quantity + } + var shm *kresource.Quantity + if containerConfig.Compute.Shm != nil { + shm = &containerConfig.Compute.Shm.Quantity + } + + compute = &serverless.ComputeSpec{ + CPU: cpu, + GPU: containerConfig.Compute.GPU, + Inf: containerConfig.Compute.Inf, + Mem: mem, + Shm: shm, + } + } + + container := serverless.ContainerSpec{ + Name: containerConfig.Name, + Image: containerConfig.Image, + Command: containerConfig.Command, + Args: containerConfig.Args, + Env: env, + Compute: compute, + ReadinessProbe: 
workloads.GetProbeSpec(containerConfig.ReadinessProbe), + LivenessProbe: workloads.GetProbeSpec(containerConfig.LivenessProbe), + } + + containers = append(containers, container) + } + + api := serverless.RealtimeAPI{ + ObjectMeta: kmeta.ObjectMeta{ + Name: apiConfig.Name, + Namespace: consts.DefaultNamespace, + }, + Spec: serverless.RealtimeAPISpec{ + Pod: serverless.PodSpec{ + Port: *apiConfig.Pod.Port, + MaxConcurrency: int32(apiConfig.Pod.MaxConcurrency), + MaxQueueLength: int32(apiConfig.Pod.MaxQueueLength), + Replicas: apiConfig.Autoscaling.InitReplicas, + Containers: containers, + }, + Autoscaling: serverless.AutoscalingSpec{ + MinReplicas: apiConfig.Autoscaling.MinReplicas, + MaxReplicas: apiConfig.Autoscaling.MaxReplicas, + TargetInFlight: fmt.Sprintf("%f", *apiConfig.Autoscaling.TargetInFlight), + Window: kmeta.Duration{Duration: apiConfig.Autoscaling.Window}, + DownscaleStabilizationPeriod: kmeta.Duration{Duration: apiConfig.Autoscaling.DownscaleStabilizationPeriod}, + UpscaleStabilizationPeriod: kmeta.Duration{Duration: apiConfig.Autoscaling.UpscaleStabilizationPeriod}, + MaxDownscaleFactor: fmt.Sprintf("%f", apiConfig.Autoscaling.MaxDownscaleFactor), + MaxUpscaleFactor: fmt.Sprintf("%f", apiConfig.Autoscaling.MaxUpscaleFactor), + DownscaleTolerance: fmt.Sprintf("%f", apiConfig.Autoscaling.DownscaleTolerance), + UpscaleTolerance: fmt.Sprintf("%f", apiConfig.Autoscaling.UpscaleTolerance), + }, + NodeGroups: apiConfig.NodeGroups, + UpdateStrategy: serverless.UpdateStratagySpec{ + MaxSurge: intstr.FromString(apiConfig.UpdateStrategy.MaxSurge), + MaxUnavailable: intstr.FromString(apiConfig.UpdateStrategy.MaxUnavailable), + }, + Networking: serverless.NetworkingSpec{ + Endpoint: *apiConfig.Networking.Endpoint, + }, + }, + } + return api +} From 9feb4e765f5359b3c6dc96c58c97668348047573 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Sun, 25 Jul 2021 12:37:43 +0200 Subject: [PATCH 20/42] Implementation of get, refresh and delete operations for 
realtime apis in the cortex operator --- pkg/operator/endpoints/logs.go | 2 +- pkg/operator/resources/realtimeapi/api.go | 344 ++++++------------ .../resources/realtimeapi/k8s_specs.go | 182 --------- pkg/operator/resources/realtimeapi/status.go | 174 --------- pkg/operator/resources/resources.go | 12 +- 5 files changed, 110 insertions(+), 604 deletions(-) delete mode 100644 pkg/operator/resources/realtimeapi/k8s_specs.go delete mode 100644 pkg/operator/resources/realtimeapi/status.go diff --git a/pkg/operator/endpoints/logs.go b/pkg/operator/endpoints/logs.go index 2d335e27da..dbe10828b1 100644 --- a/pkg/operator/endpoints/logs.go +++ b/pkg/operator/endpoints/logs.go @@ -107,7 +107,7 @@ func GetLogURL(w http.ResponseWriter, r *http.Request) { LogURL: logURL, }) case userconfig.RealtimeAPIKind: - apiResponse, err := realtimeapi.GetAPIByName(deployedResource) + apiResponse, err := realtimeapi.GetAPIByName(apiName) if err != nil { respondError(w, r, err) return diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 84e24bf4b8..9405235faf 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -19,28 +19,27 @@ package realtimeapi import ( "context" "fmt" - "path/filepath" "reflect" + "time" "github.com/cortexlabs/cortex/pkg/config" "github.com/cortexlabs/cortex/pkg/consts" serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/lib/pointer" + s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" 
- istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" - kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" kresource "k8s.io/apimachinery/pkg/api/resource" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + ktypes "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -74,6 +73,8 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) desiredAPI := APIConfigToK8sResource(*apiConfig) if !reflect.DeepEqual(api.Spec, desiredAPI.Spec) || force { api.Spec = desiredAPI.Spec + api.Annotations["cortex.dev/last-updated"] = s.Int64(time.Now().Unix()) + if err = config.K8s.Update(ctx, &api); err != nil { return nil, "", errors.Wrap(err, "failed to update realtime api resource") } @@ -83,281 +84,148 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) return apiSpec, fmt.Sprintf("%s is up to date", apiConfig.Resource.UserString()), nil } -func RefreshAPI(apiName string, force bool) (string, error) { - prevDeployment, prevService, prevVirtualService, err := getK8sResources(apiName) - if err != nil { - return "", err - } else if prevDeployment == nil || prevVirtualService == nil { - return "", errors.ErrorUnexpected("unable to find deployment", apiName) - } - - isUpdating, err := isAPIUpdating(prevDeployment) - if err != nil { - return "", err - } - - if isUpdating && !force { - return "", ErrorAPIUpdating(apiName) - } - - apiID, err := k8s.GetLabel(prevDeployment, "apiID") - if err != nil { - return "", err - } - - api, err := operator.DownloadAPISpec(apiName, apiID) - if err != nil { - return "", err - } - - initialDeploymentTime, err := k8s.ParseInt64Label(prevVirtualService, "initialDeploymentTime") - if err != nil { - return "", err +func RefreshAPI(apiName string) (string, error) { + ctx := context.Background() + api := serverless.RealtimeAPI{ + ObjectMeta: kmeta.ObjectMeta{ + Namespace: 
consts.DefaultNamespace, + Name: apiName, + }, } - api = spec.GetAPISpec(api.API, initialDeploymentTime, generateDeploymentID(), config.ClusterConfig.ClusterUID) - - if err := config.AWS.UploadJSONToS3(api, config.ClusterConfig.Bucket, api.Key); err != nil { - return "", errors.Wrap(err, "upload api spec") + // slashes are encoded as ~1 in the json patch + patch := []byte(fmt.Sprintf( + "[{\"op\": \"replace\", \"path\": \"/metadata/annotations/cortex.dev~1deployment-id\", \"value\": \"%s\" }]", + generateDeploymentID())) + if err := config.K8s.Patch(ctx, &api, client.RawPatch(ktypes.JSONPatchType, patch)); err != nil { + return "", errors.Wrap(err, "failed to get realtime api resource") } - if err := applyK8sResources(api, prevDeployment, prevService, prevVirtualService); err != nil { - return "", err + apiResource := userconfig.Resource{ + Name: apiName, + Kind: userconfig.RealtimeAPIKind, } - return fmt.Sprintf("updating %s", api.Resource.UserString()), nil + return fmt.Sprintf("updating %s", apiResource.UserString()), nil } func DeleteAPI(apiName string, keepCache bool) error { - err := parallel.RunFirstErr( - func() error { - return deleteK8sResources(apiName) - }, - func() error { - if keepCache { - return nil - } - // best effort deletion, swallow errors because there could be weird error messages - _ = deleteBucketResources(apiName) - return nil + ctx := context.Background() + api := serverless.RealtimeAPI{ + ObjectMeta: kmeta.ObjectMeta{ + Name: apiName, + Namespace: consts.DefaultNamespace, }, - ) - - if err != nil { - return err + } + if err := config.K8s.Delete(ctx, &api); err != nil { + return errors.Wrap(err, "failed to delete realtime api resource") } + // TODO: delete bucket resources (?) 
+ return nil } -func GetAllAPIs(pods []kcore.Pod, deployments []kapps.Deployment) ([]schema.APIResponse, error) { - statuses, err := GetAllStatuses(deployments, pods) - if err != nil { - return nil, err - } - - apiNames, apiIDs := namesAndIDsFromStatuses(statuses) - apis, err := operator.DownloadAPISpecs(apiNames, apiIDs) - if err != nil { - return nil, err +func GetAllAPIs() ([]schema.APIResponse, error) { + ctx := context.Background() + apis := serverless.RealtimeAPIList{} + if err := config.K8s.List(ctx, &apis); err != nil { + return nil, errors.Wrap(err, "failed to list realtime api resources") } - realtimeAPIs := make([]schema.APIResponse, len(apis)) - - for i := range apis { - api := apis[i] - endpoint, err := operator.APIEndpoint(&api) - if err != nil { - return nil, err + realtimeAPIs := make([]schema.APIResponse, len(apis.Items)) + for i := range apis.Items { + api := apis.Items[i] + api.Status.ReplicaCounts.Requested = api.Spec.Pod.Replicas + + lastUpdated := api.CreationTimestamp.Unix() + if api.Annotations["cortex.dev/last-updated"] != "" { + var ok bool + lastUpdated, ok = s.ParseInt64(api.Annotations["cortex.dev/last-updated"]) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse 'cortex.dev/last-updated' annotation") + } } realtimeAPIs[i] = schema.APIResponse{ - Spec: api, - Status: &statuses[i], - Endpoint: endpoint, + Spec: spec.API{ + API: &userconfig.API{ + Resource: userconfig.Resource{ + Name: api.Name, + Kind: userconfig.RealtimeAPIKind, + }, + }, + LastUpdated: lastUpdated, + InitialDeploymentTime: api.CreationTimestamp.Unix(), + }, + Status: &status.Status{ + APIName: api.Name, + APIID: api.Annotations["cortex.dev/api-id"], + Code: api.Status.Status, + ReplicaCounts: api.Status.ReplicaCounts, + }, + Endpoint: api.Status.Endpoint, } } return realtimeAPIs, nil } -func namesAndIDsFromStatuses(statuses []status.Status) ([]string, []string) { - apiNames := make([]string, len(statuses)) - apiIDs := make([]string, len(statuses)) +func 
GetAPIByName(apiName string) ([]schema.APIResponse, error) { + ctx := context.Background() - for i, st := range statuses { - apiNames[i] = st.APIName - apiIDs[i] = st.APIID + api := serverless.RealtimeAPI{} + key := client.ObjectKey{Namespace: consts.DefaultNamespace, Name: apiName} + if err := config.K8s.Get(ctx, key, &api); err != nil { + return nil, errors.Wrap(err, "failed to get realtime api resource") } - return apiNames, apiIDs -} + // TODO: needs api id history + //api, err := operator.DownloadAPISpec(st.APIName, st.APIID) + //if err != nil { + // return nil, err + //} -func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - st, err := GetStatus(deployedResource.Name) - if err != nil { - return nil, err - } - - api, err := operator.DownloadAPISpec(st.APIName, st.APIID) - if err != nil { - return nil, err - } + dashboardURL := pointer.String(getDashboardURL(api.Name)) - apiEndpoint, err := operator.APIEndpoint(api) - if err != nil { - return nil, err + lastUpdated := api.CreationTimestamp.Unix() + if api.Annotations["cortex.dev/last-updated"] != "" { + var ok bool + lastUpdated, ok = s.ParseInt64(api.Annotations["cortex.dev/last-updated"]) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse 'cortex.dev/last-updated' annotation") + } } - dashboardURL := pointer.String(getDashboardURL(api.Name)) + api.Status.ReplicaCounts.Requested = api.Spec.Pod.Replicas return []schema.APIResponse{ { - Spec: *api, - Status: st, - Endpoint: apiEndpoint, + Spec: spec.API{ + API: &userconfig.API{ + Resource: userconfig.Resource{ + Name: api.Name, + Kind: userconfig.RealtimeAPIKind, + }, + }, + ID: api.Annotations["cortex.dev/api-id"], + SpecID: api.Annotations["cortex.dev/spec-id"], + DeploymentID: api.Annotations["cortex.dev/deployment-id"], + InitialDeploymentTime: api.CreationTimestamp.Unix(), + LastUpdated: lastUpdated, + }, + Status: &status.Status{ + APIName: api.Name, + APIID: api.Annotations["cortex.dev/api-id"], + 
Code: api.Status.Status, + ReplicaCounts: api.Status.ReplicaCounts, + }, + Endpoint: api.Status.Endpoint, DashboardURL: dashboardURL, }, }, nil } -func getK8sResources(apiName string) (*kapps.Deployment, *kcore.Service, *istioclientnetworking.VirtualService, error) { - var deployment *kapps.Deployment - var service *kcore.Service - var virtualService *istioclientnetworking.VirtualService - - err := parallel.RunFirstErr( - func() error { - var err error - deployment, err = config.K8s.GetDeployment(workloads.K8sName(apiName)) - return err - }, - func() error { - var err error - service, err = config.K8s.GetService(workloads.K8sName(apiName)) - return err - }, - func() error { - var err error - virtualService, err = config.K8s.GetVirtualService(workloads.K8sName(apiName)) - return err - }, - ) - - return deployment, service, virtualService, err -} - -func applyK8sResources(api *spec.API, prevDeployment *kapps.Deployment, prevService *kcore.Service, prevVirtualService *istioclientnetworking.VirtualService) error { - return parallel.RunFirstErr( - func() error { - return applyK8sDeployment(api, prevDeployment) - }, - func() error { - return applyK8sService(api, prevService) - }, - func() error { - return applyK8sVirtualService(api, prevVirtualService) - }, - ) -} - -func applyK8sDeployment(api *spec.API, prevDeployment *kapps.Deployment) error { - newDeployment := deploymentSpec(api, prevDeployment) - - if prevDeployment == nil { - _, err := config.K8s.CreateDeployment(newDeployment) - if err != nil { - return err - } - } else if prevDeployment.Status.ReadyReplicas == 0 { - // Delete deployment if it never became ready - _, _ = config.K8s.DeleteDeployment(workloads.K8sName(api.Name)) - _, err := config.K8s.CreateDeployment(newDeployment) - if err != nil { - return err - } - } else { - _, err := config.K8s.UpdateDeployment(newDeployment) - if err != nil { - return err - } - } - - return nil -} - -func applyK8sService(api *spec.API, prevService *kcore.Service) error { - 
newService := serviceSpec(api) - - if prevService == nil { - _, err := config.K8s.CreateService(newService) - return err - } - - _, err := config.K8s.UpdateService(prevService, newService) - return err -} - -func applyK8sVirtualService(api *spec.API, prevVirtualService *istioclientnetworking.VirtualService) error { - newVirtualService := virtualServiceSpec(api) - - if prevVirtualService == nil { - _, err := config.K8s.CreateVirtualService(newVirtualService) - return err - } - - _, err := config.K8s.UpdateVirtualService(prevVirtualService, newVirtualService) - return err -} - -func deleteK8sResources(apiName string) error { - return parallel.RunFirstErr( - func() error { - _, err := config.K8s.DeleteDeployment(workloads.K8sName(apiName)) - return err - }, - func() error { - _, err := config.K8s.DeleteService(workloads.K8sName(apiName)) - return err - }, - func() error { - _, err := config.K8s.DeleteVirtualService(workloads.K8sName(apiName)) - return err - }, - ) -} - -func deleteBucketResources(apiName string) error { - prefix := filepath.Join(config.ClusterConfig.ClusterUID, "apis", apiName) - return config.AWS.DeleteS3Dir(config.ClusterConfig.Bucket, prefix, true) -} - -// returns true if min_replicas are not ready and no updated replicas have errored -func isAPIUpdating(deployment *kapps.Deployment) (bool, error) { - pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"]) - if err != nil { - return false, err - } - - replicaCounts := getReplicaCounts(deployment, pods) - - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) - if err != nil { - return false, err - } - - if replicaCounts.Updated.Ready < autoscalingSpec.MinReplicas && replicaCounts.Updated.TotalFailed() == 0 { - return true, nil - } - - return false, nil -} - -func isPodSpecLatest(deployment *kapps.Deployment, pod *kcore.Pod) bool { - return deployment.Spec.Template.Labels["podID"] == pod.Labels["podID"] && - 
deployment.Spec.Template.Labels["deploymentID"] == pod.Labels["deploymentID"] -} - func getDashboardURL(apiName string) string { loadBalancerURL, err := operator.LoadBalancerURL() if err != nil { diff --git a/pkg/operator/resources/realtimeapi/k8s_specs.go b/pkg/operator/resources/realtimeapi/k8s_specs.go deleted file mode 100644 index 7a6824d14d..0000000000 --- a/pkg/operator/resources/realtimeapi/k8s_specs.go +++ /dev/null @@ -1,182 +0,0 @@ -/* -Copyright 2021 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package realtimeapi - -import ( - "fmt" - - "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/pointer" - s "github.com/cortexlabs/cortex/pkg/lib/strings" - "github.com/cortexlabs/cortex/pkg/types/spec" - "github.com/cortexlabs/cortex/pkg/workloads" - istionetworking "istio.io/api/networking/v1beta1" - istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" - kapps "k8s.io/api/apps/v1" - kcore "k8s.io/api/core/v1" -) - -var _terminationGracePeriodSeconds int64 = 60 // seconds - -func deploymentSpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.Deployment { - containers, volumes := workloads.RealtimeContainers(*api) - - return k8s.Deployment(&k8s.DeploymentSpec{ - Name: workloads.K8sName(api.Name), - Replicas: getRequestedReplicasFromDeployment(*api, prevDeployment), - MaxSurge: pointer.String(api.UpdateStrategy.MaxSurge), - MaxUnavailable: pointer.String(api.UpdateStrategy.MaxUnavailable), - Labels: map[string]string{ - "apiName": api.Name, - "apiKind": api.Kind.String(), - "apiID": api.ID, - "specID": api.SpecID, - "initialDeploymentTime": s.Int64(api.InitialDeploymentTime), - "deploymentID": api.DeploymentID, - "podID": api.PodID, - "cortex.dev/api": "true", - }, - Annotations: api.ToK8sAnnotations(), - Selector: map[string]string{ - "apiName": api.Name, - "apiKind": api.Kind.String(), - }, - PodSpec: k8s.PodSpec{ - Labels: map[string]string{ - "apiName": api.Name, - "apiKind": api.Kind.String(), - "initialDeploymentTime": s.Int64(api.InitialDeploymentTime), - "deploymentID": api.DeploymentID, - "podID": api.PodID, - "cortex.dev/api": "true", - }, - Annotations: map[string]string{ - "traffic.sidecar.istio.io/excludeOutboundIPRanges": "0.0.0.0/0", - }, - K8sPodSpec: kcore.PodSpec{ - RestartPolicy: "Always", - TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), - Containers: containers, - NodeSelector: workloads.NodeSelectors(), - 
Tolerations: workloads.GenerateResourceTolerations(), - Affinity: workloads.GenerateNodeAffinities(api.NodeGroups), - Volumes: volumes, - ServiceAccountName: workloads.ServiceAccountName, - }, - }, - }) -} - -func serviceSpec(api *spec.API) *kcore.Service { - return k8s.Service(&k8s.ServiceSpec{ - Name: workloads.K8sName(api.Name), - PortName: "http", - Port: consts.ProxyPortInt32, - TargetPort: consts.ProxyPortInt32, - Annotations: api.ToK8sAnnotations(), - Labels: map[string]string{ - "apiName": api.Name, - "apiKind": api.Kind.String(), - "cortex.dev/api": "true", - }, - Selector: map[string]string{ - "apiName": api.Name, - "apiKind": api.Kind.String(), - }, - }) -} - -func virtualServiceSpec(api *spec.API) *istioclientnetworking.VirtualService { - var activatorWeight int32 - if api.Autoscaling.InitReplicas == 0 { - activatorWeight = 100 - } - - return k8s.VirtualService(&k8s.VirtualServiceSpec{ - Name: workloads.K8sName(api.Name), - Gateways: []string{"apis-gateway"}, - Destinations: []k8s.Destination{ - { - ServiceName: workloads.K8sName(api.Name), - Weight: 100 - activatorWeight, - Port: uint32(consts.ProxyPortInt32), - Headers: &istionetworking.Headers{ - Response: &istionetworking.Headers_HeaderOperations{ - Set: map[string]string{ - consts.CortexOriginHeader: "api", - }, - }, - }, - }, - { - ServiceName: consts.ActivatorName, - Weight: activatorWeight, - Port: uint32(consts.ActivatorPortInt32), - Headers: &istionetworking.Headers{ - Request: &istionetworking.Headers_HeaderOperations{ - Set: map[string]string{ - consts.CortexAPINameHeader: api.Name, - consts.CortexTargetServiceHeader: fmt.Sprintf( - "http://%s.%s:%d", - workloads.K8sName(api.Name), - consts.DefaultNamespace, - consts.ProxyPortInt32, - ), - }, - }, - Response: &istionetworking.Headers_HeaderOperations{ - Set: map[string]string{ - consts.CortexOriginHeader: consts.ActivatorName, - }, - }, - }, - }, - }, - PrefixPath: api.Networking.Endpoint, - Rewrite: pointer.String("/"), - Annotations: 
api.ToK8sAnnotations(), - Labels: map[string]string{ - "apiName": api.Name, - "apiKind": api.Kind.String(), - "apiID": api.ID, - "specID": api.SpecID, - "initialDeploymentTime": s.Int64(api.InitialDeploymentTime), - "deploymentID": api.DeploymentID, - "podID": api.PodID, - "cortex.dev/api": "true", - }, - }) -} - -func getRequestedReplicasFromDeployment(api spec.API, deployment *kapps.Deployment) int32 { - requestedReplicas := api.Autoscaling.InitReplicas - - if deployment != nil && deployment.Spec.Replicas != nil && *deployment.Spec.Replicas > 0 { - requestedReplicas = *deployment.Spec.Replicas - } - - if requestedReplicas < api.Autoscaling.MinReplicas { - requestedReplicas = api.Autoscaling.MinReplicas - } - - if requestedReplicas > api.Autoscaling.MaxReplicas { - requestedReplicas = api.Autoscaling.MaxReplicas - } - - return requestedReplicas -} diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go deleted file mode 100644 index a65716f35c..0000000000 --- a/pkg/operator/resources/realtimeapi/status.go +++ /dev/null @@ -1,174 +0,0 @@ -/* -Copyright 2021 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package realtimeapi - -import ( - "sort" - "time" - - "github.com/cortexlabs/cortex/pkg/config" - "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/parallel" - "github.com/cortexlabs/cortex/pkg/types/status" - "github.com/cortexlabs/cortex/pkg/types/userconfig" - "github.com/cortexlabs/cortex/pkg/workloads" - kapps "k8s.io/api/apps/v1" - kcore "k8s.io/api/core/v1" -) - -func GetStatus(apiName string) (*status.Status, error) { - var deployment *kapps.Deployment - var pods []kcore.Pod - - err := parallel.RunFirstErr( - func() error { - var err error - deployment, err = config.K8s.GetDeployment(workloads.K8sName(apiName)) - return err - }, - func() error { - var err error - pods, err = config.K8s.ListPodsByLabel("apiName", apiName) - return err - }, - ) - if err != nil { - return nil, err - } - - if deployment == nil { - return nil, errors.ErrorUnexpected("unable to find deployment", apiName) - } - - return apiStatus(deployment, pods) -} - -func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status.Status, error) { - statuses := make([]status.Status, len(deployments)) - for i := range deployments { - st, err := apiStatus(&deployments[i], pods) - if err != nil { - return nil, err - } - statuses[i] = *st - } - - sort.Slice(statuses, func(i, j int) bool { - return statuses[i].APIName < statuses[j].APIName - }) - - return statuses, nil -} - -func apiStatus(deployment *kapps.Deployment, allPods []kcore.Pod) (*status.Status, error) { - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) - if err != nil { - return nil, err - } - - status := &status.Status{} - status.APIName = deployment.Labels["apiName"] - status.APIID = deployment.Labels["apiID"] - status.ReplicaCounts = getReplicaCounts(deployment, allPods) - status.Code = getStatusCode(&status.ReplicaCounts, autoscalingSpec.MinReplicas) - - return 
status, nil -} - -func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.ReplicaCounts { - counts := status.ReplicaCounts{} - counts.Requested = *deployment.Spec.Replicas - - for i := range pods { - pod := pods[i] - if pod.Labels["apiName"] != deployment.Labels["apiName"] { - continue - } - addPodToReplicaCounts(&pods[i], deployment, &counts) - } - - return counts -} - -func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts *status.ReplicaCounts) { - var subCounts *status.SubReplicaCounts - if isPodSpecLatest(deployment, pod) { - subCounts = &counts.Updated - } else { - subCounts = &counts.Stale - } - - if k8s.IsPodReady(pod) { - subCounts.Ready++ - return - } - - switch k8s.GetPodStatus(pod) { - case k8s.PodStatusPending: - if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout { - subCounts.Stalled++ - } else { - subCounts.Pending++ - } - case k8s.PodStatusInitializing: - subCounts.Initializing++ - case k8s.PodStatusRunning: - subCounts.Initializing++ - case k8s.PodStatusErrImagePull: - subCounts.ErrImagePull++ - case k8s.PodStatusTerminating: - subCounts.Terminating++ - case k8s.PodStatusFailed: - subCounts.Failed++ - case k8s.PodStatusKilled: - subCounts.Killed++ - case k8s.PodStatusKilledOOM: - subCounts.KilledOOM++ - default: - subCounts.Unknown++ - } -} - -func getStatusCode(counts *status.ReplicaCounts, minReplicas int32) status.Code { - if counts.Updated.Ready >= counts.Requested { - return status.Live - } - - if counts.Updated.ErrImagePull > 0 { - return status.ErrorImagePull - } - - if counts.Updated.Failed > 0 || counts.Updated.Killed > 0 { - return status.Error - } - - if counts.Updated.KilledOOM > 0 { - return status.OOM - } - - if counts.Updated.Stalled > 0 { - return status.Stalled - } - - if counts.Updated.Ready >= minReplicas { - return status.Live - } - - return status.Updating -} diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go 
index 87069c2136..6eeb95b3b0 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -174,7 +174,7 @@ func RefreshAPI(apiName string, force bool) (string, error) { switch deployedResource.Kind { case userconfig.RealtimeAPIKind: - return realtimeapi.RefreshAPI(apiName, force) + return realtimeapi.RefreshAPI(apiName) case userconfig.AsyncAPIKind: return asyncapi.RefreshAPI(apiName, force) default: @@ -297,25 +297,19 @@ func GetAPIs() ([]schema.APIResponse, error) { return nil, err } - var realtimeAPIDeployments []kapps.Deployment var asyncAPIDeployments []kapps.Deployment for _, deployment := range deployments { switch deployment.Labels["apiKind"] { - case userconfig.RealtimeAPIKind.String(): - realtimeAPIDeployments = append(realtimeAPIDeployments, deployment) case userconfig.AsyncAPIKind.String(): asyncAPIDeployments = append(asyncAPIDeployments, deployment) } } - var realtimeAPIPods []kcore.Pod var batchAPIPods []kcore.Pod var taskAPIPods []kcore.Pod var asyncAPIPods []kcore.Pod for _, pod := range pods { switch pod.Labels["apiKind"] { - case userconfig.RealtimeAPIKind.String(): - realtimeAPIPods = append(realtimeAPIPods, pod) case userconfig.BatchAPIKind.String(): batchAPIPods = append(batchAPIPods, pod) case userconfig.TaskAPIKind.String(): @@ -340,7 +334,7 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - realtimeAPIList, err := realtimeapi.GetAllAPIs(realtimeAPIPods, realtimeAPIDeployments) + realtimeAPIList, err := realtimeapi.GetAllAPIs() if err != nil { return nil, err } @@ -387,7 +381,7 @@ func GetAPI(apiName string) ([]schema.APIResponse, error) { switch deployedResource.Kind { case userconfig.RealtimeAPIKind: - apiResponse, err = realtimeapi.GetAPIByName(deployedResource) + apiResponse, err = realtimeapi.GetAPIByName(apiName) if err != nil { return nil, err } From 532c8f25b318b92dbbc8f4b22ce5e715346243e6 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Sun, 25 Jul 2021 12:47:57 +0200 Subject: [PATCH 
21/42] Fix linting errors --- pkg/operator/resources/realtimeapi/api.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 9405235faf..76dcafb517 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -242,7 +242,8 @@ func getDashboardURL(apiName string) string { func APIConfigToK8sResource(apiConfig userconfig.API) serverless.RealtimeAPI { var containers []serverless.ContainerSpec - for _, containerConfig := range apiConfig.Pod.Containers { + for i := range apiConfig.Pod.Containers { + containerConfig := apiConfig.Pod.Containers[i] var env []kcore.EnvVar for k, v := range containerConfig.Env { env = append(env, kcore.EnvVar{ From 7c7afa06ef74b1fe0c1ad5aafe3fb3ce11be6467 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Mon, 26 Jul 2021 13:48:12 +0200 Subject: [PATCH 22/42] Fix typo in UpdateStrategySpec struct --- pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go | 4 ++-- .../apis/serverless/v1alpha1/zz_generated.deepcopy.go | 8 ++++---- pkg/operator/resources/realtimeapi/api.go | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 66749debe3..59403ca18e 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -43,7 +43,7 @@ type RealtimeAPISpec struct { // +kubebuilder:validation:Optional // +kubebuilder:default={"max_surge": "25%", "max_unavailable": "25%"} // Deployment strategy to use when replacing existing replicas with new ones - UpdateStrategy UpdateStratagySpec `json:"update_strategy"` + UpdateStrategy UpdateStrategySpec `json:"update_strategy"` // +kubebuilder:validation:Required // Networking configuration @@ -187,7 +187,7 @@ type AutoscalingSpec struct { UpscaleTolerance string 
`json:"upscale_tolerance,omitempty"` } -type UpdateStratagySpec struct { +type UpdateStrategySpec struct { // +kubebuilder:validation:Optional // +kubebuilder:default="25%" // Maximum number of replicas that can be scheduled above the desired number of replicas during an update; diff --git a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go index 030c91042f..df2ba5ab3d 100644 --- a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go @@ -257,18 +257,18 @@ func (in *RealtimeAPIStatus) DeepCopy() *RealtimeAPIStatus { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *UpdateStratagySpec) DeepCopyInto(out *UpdateStratagySpec) { +func (in *UpdateStrategySpec) DeepCopyInto(out *UpdateStrategySpec) { *out = *in out.MaxSurge = in.MaxSurge out.MaxUnavailable = in.MaxUnavailable } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UpdateStratagySpec. -func (in *UpdateStratagySpec) DeepCopy() *UpdateStratagySpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UpdateStrategySpec. 
+func (in *UpdateStrategySpec) DeepCopy() *UpdateStrategySpec { if in == nil { return nil } - out := new(UpdateStratagySpec) + out := new(UpdateStrategySpec) in.DeepCopyInto(out) return out } diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 76dcafb517..7815fa4f31 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -316,7 +316,7 @@ func APIConfigToK8sResource(apiConfig userconfig.API) serverless.RealtimeAPI { UpscaleTolerance: fmt.Sprintf("%f", apiConfig.Autoscaling.UpscaleTolerance), }, NodeGroups: apiConfig.NodeGroups, - UpdateStrategy: serverless.UpdateStratagySpec{ + UpdateStrategy: serverless.UpdateStrategySpec{ MaxSurge: intstr.FromString(apiConfig.UpdateStrategy.MaxSurge), MaxUnavailable: intstr.FromString(apiConfig.UpdateStrategy.MaxUnavailable), }, From 424cba79b428235e5f2b1c6d8c95057fb9d3e910 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Mon, 26 Jul 2021 13:49:04 +0200 Subject: [PATCH 23/42] Declare slice size when known beforehand --- pkg/operator/resources/realtimeapi/api.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 7815fa4f31..1e0b1ca327 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -109,7 +109,7 @@ func RefreshAPI(apiName string) (string, error) { return fmt.Sprintf("updating %s", apiResource.UserString()), nil } -func DeleteAPI(apiName string, keepCache bool) error { +func DeleteAPI(apiName string, _ bool) error { ctx := context.Background() api := serverless.RealtimeAPI{ ObjectMeta: kmeta.ObjectMeta{ @@ -241,7 +241,7 @@ func getDashboardURL(apiName string) string { } func APIConfigToK8sResource(apiConfig userconfig.API) serverless.RealtimeAPI { - var containers []serverless.ContainerSpec + containers := make([]serverless.ContainerSpec, 
len(apiConfig.Pod.Containers)) for i := range apiConfig.Pod.Containers { containerConfig := apiConfig.Pod.Containers[i] var env []kcore.EnvVar From b8d8b6a474a66b1576e493620a12e55606f25108 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Tue, 27 Jul 2021 12:15:21 +0200 Subject: [PATCH 24/42] Rename helper function --- pkg/operator/resources/realtimeapi/api.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 1e0b1ca327..6fa7cb452d 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -60,7 +60,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) if err != nil { if kerrors.IsNotFound(err) { if kerrors.IsNotFound(err) { - api := APIConfigToK8sResource(*apiConfig) + api = K8sResourceFromAPIConfig(*apiConfig) if err = config.K8s.Create(ctx, &api); err != nil { return nil, "", errors.Wrap(err, "failed to create realtime api resource") } @@ -70,7 +70,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) return nil, "", errors.Wrap(err, "failed to get realtime api resource") } - desiredAPI := APIConfigToK8sResource(*apiConfig) + desiredAPI := K8sResourceFromAPIConfig(*apiConfig) if !reflect.DeepEqual(api.Spec, desiredAPI.Spec) || force { api.Spec = desiredAPI.Spec api.Annotations["cortex.dev/last-updated"] = s.Int64(time.Now().Unix()) @@ -240,7 +240,8 @@ func getDashboardURL(apiName string) string { return dashboardURL } -func APIConfigToK8sResource(apiConfig userconfig.API) serverless.RealtimeAPI { +// K8sResourceFromAPIConfig converts a cortex API config into a realtime API CRD resource +func K8sResourceFromAPIConfig(apiConfig userconfig.API) serverless.RealtimeAPI { containers := make([]serverless.ContainerSpec, len(apiConfig.Pod.Containers)) for i := range apiConfig.Pod.Containers { containerConfig := apiConfig.Pod.Containers[i] 
From 39ddc6b9f8fce632c59b9a77c60b082b1c6ecf5c Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Tue, 27 Jul 2021 19:15:04 +0200 Subject: [PATCH 25/42] Fix `cortex get` behaviour by uploading specs to S3 --- .../serverless/v1alpha1/realtimeapi_types.go | 53 ++++- .../serverless/realtimeapi_controller.go | 5 +- .../realtimeapi_controller_helpers.go | 57 +---- pkg/crds/controllers/serverless/suite_test.go | 3 +- pkg/operator/resources/realtimeapi/api.go | 198 ++++++++++-------- pkg/operator/resources/realtimeapi/errors.go | 34 --- 6 files changed, 176 insertions(+), 174 deletions(-) delete mode 100644 pkg/operator/resources/realtimeapi/errors.go diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 59403ca18e..d3247de9b8 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -17,11 +17,18 @@ limitations under the License. package v1alpha1 import ( + "bytes" + "fmt" + + "github.com/cortexlabs/cortex/pkg/lib/hash" + "github.com/cortexlabs/cortex/pkg/lib/k8s" + s "github.com/cortexlabs/cortex/pkg/lib/strings" + "github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/types/userconfig" kcore "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" ) @@ -225,20 +232,54 @@ type RealtimeAPIStatus struct { // RealtimeAPI is the Schema for the realtimeapis API type RealtimeAPI struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` + kmeta.TypeMeta `json:",inline"` + kmeta.ObjectMeta `json:"metadata,omitempty"` Spec RealtimeAPISpec `json:"spec,omitempty"` Status RealtimeAPIStatus `json:"status,omitempty"` } +// GetOrCreateAPIIDs retrieves API ids from annotations or creates them 
if they don't exist +func (api RealtimeAPI) GetOrCreateAPIIDs() (deploymentID, podID, specID, apiID string) { + deploymentID = api.Annotations["cortex.dev/deployment-id"] + if deploymentID == "" { + deploymentID = k8s.RandomName()[:10] + } + + var buf bytes.Buffer + + buf.WriteString(api.Name) + buf.WriteString(api.Name) + buf.WriteString(userconfig.RealtimeAPIKind.String()) + buf.WriteString(s.Obj(api.Spec.Pod)) + podID = hash.Bytes(buf.Bytes()) + + buf.Reset() + buf.WriteString(podID) + buf.WriteString(s.Obj(api.Spec.Networking)) + buf.WriteString(s.Obj(api.Spec.Autoscaling)) + buf.WriteString(s.Obj(api.Spec.NodeGroups)) + buf.WriteString(s.Obj(api.Spec.UpdateStrategy)) + specID = hash.Bytes(buf.Bytes())[:32] + + apiID = api.Annotations["cortex.dev/api-id"] + if apiID == "" || + api.Annotations["cortex.dev/deployment-id"] != deploymentID || + api.Annotations["cortex.dev/spec-id"] != specID { + + apiID = fmt.Sprintf("%s-%s-%s", spec.MonotonicallyDecreasingID(), deploymentID, specID) + } + + return deploymentID, podID, specID, apiID +} + //+kubebuilder:object:root=true // RealtimeAPIList contains a list of RealtimeAPI type RealtimeAPIList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` - Items []RealtimeAPI `json:"items"` + kmeta.TypeMeta `json:",inline"` + kmeta.ListMeta `json:"metadata,omitempty"` + Items []RealtimeAPI `json:"items"` } func init() { diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go index fa970708c1..dab7d0e7a4 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -85,7 +85,7 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Step 3: Get or create deployment and API ids - deploymentID, specID, apiID := r.getOrCreateAPIIDs(api) + deploymentID, podID, specID, apiID := api.GetOrCreateAPIIDs() idsOutdated := 
api.Annotations["cortex.dev/deployment-id"] != deploymentID || api.Annotations["cortex.dev/spec-id"] != specID || api.Annotations["cortex.dev/api-id"] != apiID @@ -100,7 +100,8 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if api.Annotations["cortex.dev/spec-id"] != specID { - log.V(1).Info("updating spec id annotation") + log.V(1).Info("updating pod and spec id annotations") + api.Annotations["cortex.dev/pod-id"] = podID api.Annotations["cortex.dev/spec-id"] = specID } diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 41316512bc..047ef42a43 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -17,7 +17,6 @@ limitations under the License. package serverlesscontroller import ( - "bytes" "context" "fmt" "time" @@ -25,13 +24,11 @@ import ( "github.com/cortexlabs/cortex/pkg/consts" serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/hash" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/maps" "github.com/cortexlabs/cortex/pkg/lib/pointer" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/lib/urls" - "github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" @@ -247,7 +244,7 @@ func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *serverless } ingress := svc.Status.LoadBalancer.Ingress - if ingress == nil || len(ingress) == 0 { + if len(ingress) == 0 { return "", nil } @@ -261,7 +258,6 @@ func (r *RealtimeAPIReconciler) getEndpoint(ctx context.Context, api *serverless func (r *RealtimeAPIReconciler) 
desiredDeployment(api serverless.RealtimeAPI) kapps.Deployment { containers, volumes := r.desiredContainers(api) - deploymentID, _, apiID := r.getOrCreateAPIIDs(api) return *k8s.Deployment(&k8s.DeploymentSpec{ Name: workloads.K8sName(api.Name), @@ -271,8 +267,8 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), - "apiID": apiID, - "deploymentID": deploymentID, + "apiID": api.Annotations["cortex.dev/api-id"], + "deploymentID": api.Annotations["cortex.dev/deployment-id"], "cortex.dev/api": "true", }, Annotations: r.generateAPIAnnotations(api), @@ -284,7 +280,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), - "deploymentID": deploymentID, + "deploymentID": api.Annotations["cortex.dev/deployment-id"], "cortex.dev/api": "true", }, Annotations: map[string]string{ @@ -339,8 +335,6 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI activatorWeight = 100 } - deploymentID, _, apiID := r.getOrCreateAPIIDs(api) - return *k8s.VirtualService(&k8s.VirtualServiceSpec{ Name: workloads.K8sName(api.Name), Gateways: []string{"apis-gateway"}, @@ -387,8 +381,8 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), - "apiID": apiID, - "deploymentID": deploymentID, + "apiID": api.Annotations["cortex.dev/api-id"], + "deploymentID": api.Annotations["cortex.dev/deployment-id"], "cortex.dev/api": "true", }, }) @@ -406,8 +400,8 @@ func (r *RealtimeAPIReconciler) userContainers(api serverless.RealtimeAPI) ([]kc workloads.ClientConfigMount(), } - var containers []kcore.Container - for _, container := range api.Spec.Pod.Containers { + containers := make([]kcore.Container, 
len(api.Spec.Pod.Containers)) + for i, container := range api.Spec.Pod.Containers { containerResourceList := kcore.ResourceList{} containerResourceLimitsList := kcore.ResourceList{} securityContext := kcore.SecurityContext{ @@ -453,7 +447,7 @@ func (r *RealtimeAPIReconciler) userContainers(api serverless.RealtimeAPI) ([]kc containerEnvVars = append(containerEnvVars, workloads.ClientConfigEnvVar()) containerEnvVars = append(containerEnvVars, container.Env...) - containers = append(containers, kcore.Container{ + containers[i] = kcore.Container{ Name: container.Name, Image: container.Image, Command: container.Command, @@ -468,7 +462,7 @@ func (r *RealtimeAPIReconciler) userContainers(api serverless.RealtimeAPI) ([]kc }, ImagePullPolicy: kcore.PullAlways, SecurityContext: &securityContext, - }) + } } return containers, volumes @@ -524,37 +518,6 @@ func (r *RealtimeAPIReconciler) proxyContainer(api serverless.RealtimeAPI) (kcor }, workloads.ClusterConfigVolume() } -func (r *RealtimeAPIReconciler) getOrCreateAPIIDs(api serverless.RealtimeAPI) (deploymentID string, specID string, apiID string) { - deploymentID = api.Annotations["cortex.dev/deployment-id"] - if deploymentID == "" { - deploymentID = k8s.RandomName()[:10] - } - - specID = r.getSpecHash(api) - - apiID = api.Annotations["cortex.dev/api-id"] - if apiID == "" || - api.Annotations["cortex.dev/deployment-id"] != deploymentID || - api.Annotations["cortex.dev/spec-id"] != specID { - - apiID = fmt.Sprintf("%s-%s-%s", spec.MonotonicallyDecreasingID(), deploymentID, specID) - } - - return deploymentID, specID, apiID -} - -func (r *RealtimeAPIReconciler) getSpecHash(api serverless.RealtimeAPI) string { - var buf bytes.Buffer - buf.WriteString(api.Name) - buf.WriteString(s.Obj(api.TypeMeta)) - buf.WriteString(s.Obj(api.Spec.Pod)) - buf.WriteString(s.Obj(api.Spec.Networking)) - buf.WriteString(s.Obj(api.Spec.Autoscaling)) - buf.WriteString(s.Obj(api.Spec.NodeGroups)) - buf.WriteString(s.Obj(api.Spec.UpdateStrategy)) - 
return hash.Bytes(buf.Bytes())[:32] -} - func (r *RealtimeAPIReconciler) generateAPIAnnotations(api serverless.RealtimeAPI) map[string]string { return map[string]string{ userconfig.MinReplicasAnnotationKey: s.Int32(api.Spec.Autoscaling.MinReplicas), diff --git a/pkg/crds/controllers/serverless/suite_test.go b/pkg/crds/controllers/serverless/suite_test.go index 5698e2887b..96c9626946 100644 --- a/pkg/crds/controllers/serverless/suite_test.go +++ b/pkg/crds/controllers/serverless/suite_test.go @@ -23,7 +23,6 @@ import ( . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/controller-runtime/pkg/envtest/printer" @@ -37,7 +36,7 @@ import ( // These tests use Ginkgo (BDD-style Go testing framework). Refer to // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. -var cfg *rest.Config +//var cfg *rest.Config var k8sClient client.Client var testEnv *envtest.Environment diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 6fa7cb452d..cd7284f276 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -19,6 +19,7 @@ package realtimeapi import ( "context" "fmt" + "path/filepath" "reflect" "time" @@ -27,6 +28,8 @@ import ( serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/lib/maps" + "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/lib/pointer" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/operator/operator" @@ -55,7 +58,6 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) var api serverless.RealtimeAPI key := client.ObjectKey{Namespace: 
consts.DefaultNamespace, Name: apiConfig.Name} - apiSpec := &spec.API{API: apiConfig} err := config.K8s.Get(ctx, key, &api) if err != nil { if kerrors.IsNotFound(err) { @@ -64,6 +66,23 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) if err = config.K8s.Create(ctx, &api); err != nil { return nil, "", errors.Wrap(err, "failed to create realtime api resource") } + + apiSpec := &spec.API{ + API: apiConfig, + ID: api.Annotations["cortex.dev/api-id"], + SpecID: api.Annotations["cortex.dev/spec-id"], + PodID: api.Annotations["cortex.dev/pod-id"], + DeploymentID: api.Annotations["cortex.dev/deployment-id"], + Key: spec.Key(apiConfig.Name, api.Annotations["cortex.dev/api-id"], config.ClusterConfig.ClusterUID), + InitialDeploymentTime: api.CreationTimestamp.Unix(), + LastUpdated: api.CreationTimestamp.Unix(), + MetadataRoot: spec.MetadataRoot(apiConfig.Name, config.ClusterConfig.ClusterUID), + } + + if err := config.AWS.UploadJSONToS3(apiSpec, config.ClusterConfig.Bucket, apiSpec.Key); err != nil { + return nil, "", errors.Wrap(err, "failed to upload api spec") + } + return apiSpec, fmt.Sprintf("creating %s", apiConfig.Resource.UserString()), nil } } @@ -71,13 +90,34 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) } desiredAPI := K8sResourceFromAPIConfig(*apiConfig) + + apiSpec := &spec.API{ + API: apiConfig, + ID: desiredAPI.Annotations["cortex.dev/api-id"], + SpecID: desiredAPI.Annotations["cortex.dev/spec-id"], + PodID: desiredAPI.Annotations["cortex.dev/pod-id"], + DeploymentID: desiredAPI.Annotations["cortex.dev/deployment-id"], + Key: spec.Key(apiConfig.Name, desiredAPI.Annotations["cortex.dev/api-id"], config.ClusterConfig.ClusterUID), + InitialDeploymentTime: api.CreationTimestamp.Unix(), + MetadataRoot: spec.MetadataRoot(apiConfig.Name, config.ClusterConfig.ClusterUID), + } + if !reflect.DeepEqual(api.Spec, desiredAPI.Spec) || force { api.Spec = desiredAPI.Spec - 
api.Annotations["cortex.dev/last-updated"] = s.Int64(time.Now().Unix()) + api.Annotations = maps.MergeStrMapsString(api.Annotations, desiredAPI.Annotations) + + lastUpdated := time.Now().Unix() + api.Annotations["cortex.dev/last-updated"] = s.Int64(lastUpdated) + apiSpec.LastUpdated = lastUpdated if err = config.K8s.Update(ctx, &api); err != nil { return nil, "", errors.Wrap(err, "failed to update realtime api resource") } + + if err := config.AWS.UploadJSONToS3(apiSpec, config.ClusterConfig.Bucket, apiSpec.Key); err != nil { + return nil, "", errors.Wrap(err, "failed to upload api spec") + } + return apiSpec, fmt.Sprintf("updating %s", apiConfig.Resource.UserString()), nil } @@ -109,21 +149,30 @@ func RefreshAPI(apiName string) (string, error) { return fmt.Sprintf("updating %s", apiResource.UserString()), nil } -func DeleteAPI(apiName string, _ bool) error { - ctx := context.Background() - api := serverless.RealtimeAPI{ - ObjectMeta: kmeta.ObjectMeta{ - Name: apiName, - Namespace: consts.DefaultNamespace, +func DeleteAPI(apiName string, keepCache bool) error { + return parallel.RunFirstErr( + func() error { + ctx := context.Background() + api := serverless.RealtimeAPI{ + ObjectMeta: kmeta.ObjectMeta{ + Name: apiName, + Namespace: consts.DefaultNamespace, + }, + } + if err := config.K8s.Delete(ctx, &api); err != nil { + return errors.Wrap(err, "failed to delete realtime api resource") + } + return nil }, - } - if err := config.K8s.Delete(ctx, &api); err != nil { - return errors.Wrap(err, "failed to delete realtime api resource") - } - - // TODO: delete bucket resources (?) 
- - return nil + func() error { + if keepCache { + return nil + } + // best effort deletion, swallow errors because there could be weird error messages + _ = deleteBucketResources(apiName) + return nil + }, + ) } func GetAllAPIs() ([]schema.APIResponse, error) { @@ -133,31 +182,25 @@ func GetAllAPIs() ([]schema.APIResponse, error) { return nil, errors.Wrap(err, "failed to list realtime api resources") } + apiNames := make([]string, len(apis.Items)) + apiIDs := make([]string, len(apis.Items)) + for i, api := range apis.Items { + apiNames[i] = api.Name + apiIDs[i] = api.Annotations["cortex.dev/api-id"] + } + + apiSpecs, err := operator.DownloadAPISpecs(apiNames, apiIDs) + if err != nil { + return nil, err + } + realtimeAPIs := make([]schema.APIResponse, len(apis.Items)) for i := range apis.Items { api := apis.Items[i] api.Status.ReplicaCounts.Requested = api.Spec.Pod.Replicas - lastUpdated := api.CreationTimestamp.Unix() - if api.Annotations["cortex.dev/last-updated"] != "" { - var ok bool - lastUpdated, ok = s.ParseInt64(api.Annotations["cortex.dev/last-updated"]) - if !ok { - return nil, errors.ErrorUnexpected("failed to parse 'cortex.dev/last-updated' annotation") - } - } - realtimeAPIs[i] = schema.APIResponse{ - Spec: spec.API{ - API: &userconfig.API{ - Resource: userconfig.Resource{ - Name: api.Name, - Kind: userconfig.RealtimeAPIKind, - }, - }, - LastUpdated: lastUpdated, - InitialDeploymentTime: api.CreationTimestamp.Unix(), - }, + Spec: apiSpecs[i], Status: &status.Status{ APIName: api.Name, APIID: api.Annotations["cortex.dev/api-id"], @@ -180,40 +223,17 @@ func GetAPIByName(apiName string) ([]schema.APIResponse, error) { return nil, errors.Wrap(err, "failed to get realtime api resource") } - // TODO: needs api id history - //api, err := operator.DownloadAPISpec(st.APIName, st.APIID) - //if err != nil { - // return nil, err - //} - - dashboardURL := pointer.String(getDashboardURL(api.Name)) - - lastUpdated := api.CreationTimestamp.Unix() - if 
api.Annotations["cortex.dev/last-updated"] != "" { - var ok bool - lastUpdated, ok = s.ParseInt64(api.Annotations["cortex.dev/last-updated"]) - if !ok { - return nil, errors.ErrorUnexpected("failed to parse 'cortex.dev/last-updated' annotation") - } + apiSpec, err := operator.DownloadAPISpec(api.Name, api.Annotations["cortex.dev/api-id"]) + if err != nil { + return nil, err } + dashboardURL := pointer.String(getDashboardURL(api.Name)) api.Status.ReplicaCounts.Requested = api.Spec.Pod.Replicas return []schema.APIResponse{ { - Spec: spec.API{ - API: &userconfig.API{ - Resource: userconfig.Resource{ - Name: api.Name, - Kind: userconfig.RealtimeAPIKind, - }, - }, - ID: api.Annotations["cortex.dev/api-id"], - SpecID: api.Annotations["cortex.dev/spec-id"], - DeploymentID: api.Annotations["cortex.dev/deployment-id"], - InitialDeploymentTime: api.CreationTimestamp.Unix(), - LastUpdated: lastUpdated, - }, + Spec: *apiSpec, Status: &status.Status{ APIName: api.Name, APIID: api.Annotations["cortex.dev/api-id"], @@ -244,9 +264,9 @@ func getDashboardURL(apiName string) string { func K8sResourceFromAPIConfig(apiConfig userconfig.API) serverless.RealtimeAPI { containers := make([]serverless.ContainerSpec, len(apiConfig.Pod.Containers)) for i := range apiConfig.Pod.Containers { - containerConfig := apiConfig.Pod.Containers[i] + container := apiConfig.Pod.Containers[i] var env []kcore.EnvVar - for k, v := range containerConfig.Env { + for k, v := range container.Env { env = append(env, kcore.EnvVar{ Name: k, Value: v, @@ -254,41 +274,39 @@ func K8sResourceFromAPIConfig(apiConfig userconfig.API) serverless.RealtimeAPI { } var compute *serverless.ComputeSpec - if containerConfig.Compute != nil { + if container.Compute != nil { var cpu *kresource.Quantity - if containerConfig.Compute.CPU != nil { - cpu = &containerConfig.Compute.CPU.Quantity + if container.Compute.CPU != nil { + cpu = &container.Compute.CPU.Quantity } var mem *kresource.Quantity - if containerConfig.Compute.Mem != nil 
{ - mem = &containerConfig.Compute.Mem.Quantity + if container.Compute.Mem != nil { + mem = &container.Compute.Mem.Quantity } var shm *kresource.Quantity - if containerConfig.Compute.Shm != nil { - shm = &containerConfig.Compute.Shm.Quantity + if container.Compute.Shm != nil { + shm = &container.Compute.Shm.Quantity } compute = &serverless.ComputeSpec{ CPU: cpu, - GPU: containerConfig.Compute.GPU, - Inf: containerConfig.Compute.Inf, + GPU: container.Compute.GPU, + Inf: container.Compute.Inf, Mem: mem, Shm: shm, } } - container := serverless.ContainerSpec{ - Name: containerConfig.Name, - Image: containerConfig.Image, - Command: containerConfig.Command, - Args: containerConfig.Args, + containers[i] = serverless.ContainerSpec{ + Name: container.Name, + Image: container.Image, + Command: container.Command, + Args: container.Args, Env: env, Compute: compute, - ReadinessProbe: workloads.GetProbeSpec(containerConfig.ReadinessProbe), - LivenessProbe: workloads.GetProbeSpec(containerConfig.LivenessProbe), + ReadinessProbe: workloads.GetProbeSpec(container.ReadinessProbe), + LivenessProbe: workloads.GetProbeSpec(container.LivenessProbe), } - - containers = append(containers, container) } api := serverless.RealtimeAPI{ @@ -326,5 +344,19 @@ func K8sResourceFromAPIConfig(apiConfig userconfig.API) serverless.RealtimeAPI { }, }, } + + deploymentID, podID, specID, apiID := api.GetOrCreateAPIIDs() + api.Annotations = map[string]string{ + "cortex.dev/deployment-id": deploymentID, + "cortex.dev/spec-id": specID, + "cortex.dev/pod-id": podID, + "cortex.dev/api-id": apiID, + } + return api } + +func deleteBucketResources(apiName string) error { + prefix := filepath.Join(config.ClusterConfig.ClusterUID, "apis", apiName) + return config.AWS.DeleteS3Dir(config.ClusterConfig.Bucket, prefix, true) +} diff --git a/pkg/operator/resources/realtimeapi/errors.go b/pkg/operator/resources/realtimeapi/errors.go deleted file mode 100644 index 58b60a7743..0000000000 --- 
a/pkg/operator/resources/realtimeapi/errors.go +++ /dev/null @@ -1,34 +0,0 @@ -/* -Copyright 2021 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package realtimeapi - -import ( - "fmt" - - "github.com/cortexlabs/cortex/pkg/lib/errors" -) - -const ( - ErrAPIUpdating = "realtimeapi.api_updating" -) - -func ErrorAPIUpdating(apiName string) error { - return errors.WithStack(&errors.Error{ - Kind: ErrAPIUpdating, - Message: fmt.Sprintf("%s is updating (override with --force)", apiName), - }) -} From 17427b3a73a1a8c7e0e6052697d37970913c7d1c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 29 Jul 2021 10:29:20 +0300 Subject: [PATCH 26/42] Update API status following the Realtime CRD addition (#2375) --- cli/cluster/delete.go | 4 +- cli/cluster/get.go | 14 + cli/cmd/describe.go | 113 +++++++ cli/cmd/get.go | 67 ++-- cli/cmd/lib_apis.go | 64 ++++ cli/cmd/lib_async_apis.go | 77 +++-- cli/cmd/lib_batch_apis.go | 59 ++-- cli/cmd/lib_realtime_apis.go | 74 +++-- cli/cmd/lib_task_apis.go | 59 ++-- cli/cmd/lib_traffic_splitters.go | 40 +-- cli/cmd/lib_watch.go | 4 +- cli/cmd/root.go | 2 + cmd/operator/main.go | 1 + dev/generate_cli_md.sh | 1 + docs/clients/cli.md | 14 + docs/workloads/async/statuses.md | 21 +- docs/workloads/batch/statuses.md | 2 +- docs/workloads/realtime/statuses.md | 26 +- docs/workloads/realtime/troubleshooting.md | 6 +- docs/workloads/task/statuses.md | 2 +- go.mod | 4 +- go.sum | 6 +- pkg/activator/activator.go | 2 +- 
pkg/activator/helpers.go | 21 +- pkg/consts/consts.go | 3 +- .../serverless/v1alpha1/realtimeapi_types.go | 22 +- .../v1alpha1/zz_generated.deepcopy.go | 1 - .../crd/bases/batch.cortex.dev_batchjobs.yaml | 19 +- .../serverless.cortex.dev_realtimeapis.yaml | 110 +------ .../batch/batchjob_controller_helpers.go | 92 ++++-- .../realtimeapi_controller_helpers.go | 97 +----- pkg/lib/k8s/pod.go | 142 ++++++--- pkg/operator/endpoints/describe.go | 36 +++ pkg/operator/endpoints/logs.go | 11 +- pkg/operator/operator/k8s.go | 18 ++ pkg/operator/resources/asyncapi/api.go | 164 ++++++++-- pkg/operator/resources/asyncapi/status.go | 293 +++--------------- pkg/operator/resources/job/batchapi/api.go | 37 +-- pkg/operator/resources/job/taskapi/api.go | 38 +-- pkg/operator/resources/job/worker_stats.go | 31 +- pkg/operator/resources/realtimeapi/api.go | 172 ++++++++-- pkg/operator/resources/resources.go | 56 ++-- pkg/operator/resources/trafficsplitter/api.go | 46 +-- pkg/operator/schema/schema.go | 41 +-- pkg/types/spec/api.go | 58 +++- pkg/types/spec/job.go | 40 +-- pkg/types/status/code.go | 101 ------ pkg/types/status/job_status.go | 14 +- pkg/types/status/status.go | 130 ++++++-- pkg/types/userconfig/api.go | 34 ++ pkg/types/userconfig/config_key.go | 1 + 51 files changed, 1457 insertions(+), 1033 deletions(-) create mode 100644 cli/cmd/describe.go create mode 100644 cli/cmd/lib_apis.go create mode 100644 pkg/operator/endpoints/describe.go delete mode 100644 pkg/types/status/code.go diff --git a/cli/cluster/delete.go b/cli/cluster/delete.go index e81624f98d..47618b304b 100644 --- a/cli/cluster/delete.go +++ b/cli/cluster/delete.go @@ -22,6 +22,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/json" + "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/prompt" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/operator/schema" @@ -70,8 +71,7 @@ func 
getReadyRealtimeAPIReplicasOrNil(operatorConfig OperatorConfig, apiName str return nil } - totalReady := apiRes.Status.Updated.Ready + apiRes.Status.Stale.Ready - return &totalReady + return pointer.Int32(apiRes.Status.Ready) } func StopJob(operatorConfig OperatorConfig, kind userconfig.Kind, apiName string, jobID string) (schema.DeleteResponse, error) { diff --git a/cli/cluster/get.go b/cli/cluster/get.go index 47a24aa0a3..6d88e707b8 100644 --- a/cli/cluster/get.go +++ b/cli/cluster/get.go @@ -51,6 +51,20 @@ func GetAPI(operatorConfig OperatorConfig, apiName string) ([]schema.APIResponse return apiRes, nil } +func DescribeAPI(operatorConfig OperatorConfig, apiName string) ([]schema.APIResponse, error) { + httpRes, err := HTTPGet(operatorConfig, "/describe/"+apiName) + if err != nil { + return nil, err + } + + var apiRes []schema.APIResponse + if err = json.Unmarshal(httpRes, &apiRes); err != nil { + return nil, errors.Wrap(err, "/describe/"+apiName, string(httpRes)) + } + + return apiRes, nil +} + func GetAPIByID(operatorConfig OperatorConfig, apiName string, apiID string) ([]schema.APIResponse, error) { httpRes, err := HTTPGet(operatorConfig, "/get/"+apiName+"/"+apiID) if err != nil { diff --git a/cli/cmd/describe.go b/cli/cmd/describe.go new file mode 100644 index 0000000000..767045c5a2 --- /dev/null +++ b/cli/cmd/describe.go @@ -0,0 +1,113 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cmd + +import ( + "fmt" + + "github.com/cortexlabs/cortex/cli/cluster" + "github.com/cortexlabs/cortex/cli/types/cliconfig" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/exit" + "github.com/cortexlabs/cortex/pkg/lib/telemetry" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/spf13/cobra" +) + +const ( + _titleReplicaStatus = "replica status" + _titleReplicaCount = "replica count" +) + +var ( + _flagDescribeEnv string + _flagDescribeWatch bool +) + +func describeInit() { + _describeCmd.Flags().SortFlags = false + _describeCmd.Flags().StringVarP(&_flagDescribeEnv, "env", "e", "", "environment to use") + _describeCmd.Flags().BoolVarP(&_flagDescribeWatch, "watch", "w", false, "re-run the command every 2 seconds") +} + +var _describeCmd = &cobra.Command{ + Use: "describe [API_NAME]", + Short: "describe an api", + Args: cobra.ExactArgs(1), + Run: func(cmd *cobra.Command, args []string) { + apiName := args[0] + + var envName string + if wasFlagProvided(cmd, "env") { + envName = _flagDescribeEnv + } else { + var err error + envName, err = getEnvFromFlag("") + if err != nil { + telemetry.Event("cli.describe") + exit.Error(err) + } + } + + env, err := ReadOrConfigureEnv(envName) + if err != nil { + telemetry.Event("cli.describe") + exit.Error(err) + } + telemetry.Event("cli.describe", map[string]interface{}{"env_name": env.Name}) + + rerun(_flagDescribeWatch, func() (string, error) { + env, err := ReadOrConfigureEnv(envName) + if err != nil { + exit.Error(err) + } + + out, err := envStringIfNotSpecified(envName, cmd) + if err != nil { + return "", err + } + apiTable, err := describeAPI(env, apiName) + if err != nil { + return "", err + } + + return out + apiTable, nil + }) + }, +} + +func describeAPI(env cliconfig.Environment, apiName string) (string, error) { + apisRes, err := cluster.DescribeAPI(MustGetOperatorConfig(env.Name), apiName) + if err != nil { + return "", err + } + + if len(apisRes) == 
0 { + exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find api %s", apiName))) + } + + apiRes := apisRes[0] + + switch apiRes.Metadata.Kind { + case userconfig.RealtimeAPIKind: + return realtimeDescribeAPITable(apiRes, env) + case userconfig.AsyncAPIKind: + return asyncDescribeAPITable(apiRes, env) + default: + return "", errors.ErrorUnexpected(fmt.Sprintf("encountered unexpected kind %s for api %s", apiRes.Spec.Kind, apiRes.Spec.Name)) + } +} diff --git a/cli/cmd/get.go b/cli/cmd/get.go index 1b11b984a0..c260d0c5e9 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -35,29 +35,28 @@ import ( libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/cortexlabs/yaml" "github.com/spf13/cobra" ) const ( _titleEnvironment = "env" _titleRealtimeAPI = "realtime api" - _titleStatus = "status" + _titleAsyncAPI = "async api" + _titleLive = "live" _titleUpToDate = "up-to-date" - _titleStale = "stale" - _titleRequested = "requested" - _titleFailed = "failed" - _titleLastupdated = "last update" + _titleLastUpdated = "last update" ) var ( - _flagGetEnv string - _flagWatch bool + _flagGetEnv string + _flagGetWatch bool ) func getInit() { _getCmd.Flags().SortFlags = false _getCmd.Flags().StringVarP(&_flagGetEnv, "env", "e", "", "environment to use") - _getCmd.Flags().BoolVarP(&_flagWatch, "watch", "w", false, "re-run the command every 2 seconds") + _getCmd.Flags().BoolVarP(&_flagGetWatch, "watch", "w", false, "re-run the command every 2 seconds") _getCmd.Flags().VarP(&_flagOutput, "output", "o", fmt.Sprintf("output format: one of %s", strings.Join(flags.OutputTypeStringsExcluding(flags.YAMLOutputType), "|"))) addVerboseFlag(_getCmd) } @@ -90,7 +89,7 @@ var _getCmd = &cobra.Command{ telemetry.Event("cli.get") } - rerun(func() (string, error) { + rerun(_flagGetWatch, func() (string, error) { if len(args) == 1 { env, err := ReadOrConfigureEnv(envName) if err != nil 
{ @@ -106,7 +105,7 @@ var _getCmd = &cobra.Command{ return "", err } - if _flagOutput == flags.JSONOutputType { + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return apiTable, nil } @@ -136,7 +135,7 @@ var _getCmd = &cobra.Command{ if err != nil { return "", err } - if _flagOutput == flags.JSONOutputType { + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return jobTable, nil } @@ -166,7 +165,7 @@ var _getCmd = &cobra.Command{ return "", err } - if _flagOutput == flags.JSONOutputType { + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return apiTable, nil } @@ -221,7 +220,7 @@ func getAPIsInAllEnvironments() (string, error) { if err == nil { for _, api := range apisRes { - switch api.Spec.Kind { + switch api.Metadata.Kind { case userconfig.BatchAPIKind: allBatchAPIEnvs = append(allBatchAPIEnvs, env.Name) allBatchAPIs = append(allBatchAPIs, api) @@ -247,12 +246,16 @@ func getAPIsInAllEnvironments() (string, error) { allAPIsOutput = append(allAPIsOutput, apisOutput) } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(allAPIsOutput) - if err != nil { - return "", err - } - + bytes, err = libjson.Marshal(allAPIsOutput) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(allAPIsOutput) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } @@ -337,11 +340,16 @@ func getAPIsByEnv(env cliconfig.Environment) (string, error) { return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(apisRes) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(apisRes) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(apisRes) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == 
flags.YAMLOutputType { return string(bytes), nil } @@ -457,16 +465,21 @@ func getAPI(env cliconfig.Environment, apiName string) (string, error) { return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(apisRes) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(apisRes) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(apisRes) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } if len(apisRes) == 0 { - exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find API %s", apiName))) + exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find api %s", apiName))) } apiRes := apisRes[0] diff --git a/cli/cmd/lib_apis.go b/cli/cmd/lib_apis.go new file mode 100644 index 0000000000..bce36d0ca2 --- /dev/null +++ b/cli/cmd/lib_apis.go @@ -0,0 +1,64 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cmd + +import ( + "github.com/cortexlabs/cortex/pkg/lib/table" + "github.com/cortexlabs/cortex/pkg/types/status" +) + +func replicaCountTable(counts *status.ReplicaCounts) table.Table { + var rows [][]interface{} + for _, replicaCountType := range status.ReplicaCountTypes { + // skip up-to-date count + if replicaCountType == status.ReplicaCountUpToDate { + continue + } + + count := counts.GetCountBy(replicaCountType) + canBeHiddenIfZero := false + switch replicaCountType { + case status.ReplicaCountFailed: + canBeHiddenIfZero = true + case status.ReplicaCountKilled: + canBeHiddenIfZero = true + case status.ReplicaCountKilledOOM: + canBeHiddenIfZero = true + case status.ReplicaCountErrImagePull: + canBeHiddenIfZero = true + case status.ReplicaCountUnknown: + canBeHiddenIfZero = true + case status.ReplicaCountStalled: + canBeHiddenIfZero = true + } + if count == 0 && canBeHiddenIfZero { + continue + } + rows = append(rows, []interface{}{ + replicaCountType, + count, + }) + } + + return table.Table{ + Headers: []table.Header{ + {Title: _titleReplicaStatus, MinWidth: 32, MaxWidth: 32}, + {Title: _titleReplicaCount}, + }, + Rows: rows, + } +} diff --git a/cli/cmd/lib_async_apis.go b/cli/cmd/lib_async_apis.go index 114c88bca8..e534a9923d 100644 --- a/cli/cmd/lib_async_apis.go +++ b/cli/cmd/lib_async_apis.go @@ -17,26 +17,22 @@ limitations under the License. 
package cmd import ( + "fmt" "strings" "time" "github.com/cortexlabs/cortex/cli/types/cliconfig" "github.com/cortexlabs/cortex/pkg/lib/console" + "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/table" libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" ) -const ( - _titleAsyncAPI = "async api" -) - func asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (string, error) { var out string t := asyncAPIsTable([]schema.APIResponse{asyncAPI}, []string{env.Name}) - t.FindHeaderByTitle(_titleEnvironment).Hidden = true - t.FindHeaderByTitle(_titleAsyncAPI).Hidden = true out += t.MustFormat() @@ -44,7 +40,9 @@ func asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (stri out += "\n" + console.Bold("metrics dashboard: ") + *asyncAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + asyncAPI.Endpoint + "\n" + if asyncAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *asyncAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(asyncAPI.APIVersions) @@ -57,39 +55,68 @@ func asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (stri return out, nil } +func asyncDescribeAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (string, error) { + if asyncAPI.Metadata == nil { + return "", errors.ErrorUnexpected("missing metadata from operator response") + } + + if asyncAPI.ReplicaCounts == nil { + return "", errors.ErrorUnexpected(fmt.Sprintf("missing replica counts for %s api", asyncAPI.Metadata.Name)) + } + + t := asyncAPIsTable([]schema.APIResponse{asyncAPI}, []string{env.Name}) + out := t.MustFormat() + + if asyncAPI.DashboardURL != nil && *asyncAPI.DashboardURL != "" { + out += "\n" + console.Bold("metrics dashboard: ") + *asyncAPI.DashboardURL + "\n" + } + + if asyncAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *asyncAPI.Endpoint + "\n" + } + + t = 
replicaCountTable(asyncAPI.ReplicaCounts) + out += "\n" + t.MustFormat() + + return out, nil +} + func asyncAPIsTable(asyncAPIs []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(asyncAPIs)) - var totalFailed int32 - var totalStale int32 - for i, asyncAPI := range asyncAPIs { - lastUpdated := time.Unix(asyncAPI.Spec.LastUpdated, 0) + if asyncAPI.Metadata == nil || (asyncAPI.Status == nil && asyncAPI.ReplicaCounts == nil) { + continue + } + + var ready, requested, upToDate int32 + if asyncAPI.Status != nil { + ready = asyncAPI.Status.Ready + requested = asyncAPI.Status.Requested + upToDate = asyncAPI.Status.UpToDate + } else { + ready = asyncAPI.ReplicaCounts.Ready + requested = asyncAPI.ReplicaCounts.Requested + upToDate = asyncAPI.ReplicaCounts.UpToDate + } + + lastUpdated := time.Unix(asyncAPI.Metadata.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], - asyncAPI.Spec.Name, - asyncAPI.Status.Message(), - asyncAPI.Status.Updated.Ready, - asyncAPI.Status.Stale.Ready, - asyncAPI.Status.Requested, - asyncAPI.Status.Updated.TotalFailed(), + asyncAPI.Metadata.Name, + fmt.Sprintf("%d/%d", ready, requested), + upToDate, libtime.SinceStr(&lastUpdated), }) - - totalFailed += asyncAPI.Status.Updated.TotalFailed() - totalStale += asyncAPI.Status.Stale.Ready } return table.Table{ Headers: []table.Header{ {Title: _titleEnvironment}, {Title: _titleAsyncAPI}, - {Title: _titleStatus}, + {Title: _titleLive}, {Title: _titleUpToDate}, - {Title: _titleStale, Hidden: totalStale == 0}, - {Title: _titleRequested}, - {Title: _titleFailed, Hidden: totalFailed == 0}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 272dbfa0fa..ebabc29243 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -31,6 +31,7 @@ import ( libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" 
"github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/yaml" ) const ( @@ -43,7 +44,10 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab rows := make([][]interface{}, 0, len(batchAPIs)) for i, batchAPI := range batchAPIs { - lastAPIUpdated := time.Unix(batchAPI.Spec.LastUpdated, 0) + if batchAPI.Metadata == nil { + continue + } + lastAPIUpdated := time.Unix(batchAPI.Metadata.LastUpdated, 0) latestStartTime := time.Time{} latestJobID := "-" runningJobs := 0 @@ -61,7 +65,7 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab rows = append(rows, []interface{}{ envNames[i], - batchAPI.Spec.Name, + batchAPI.Metadata.Name, runningJobs, latestJobID, libtime.SinceStr(&lastAPIUpdated), @@ -74,7 +78,7 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab {Title: _titleBatchAPI}, {Title: _titleJobCount}, {Title: _titleLatestJobID}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } @@ -123,7 +127,9 @@ func batchAPITable(batchAPI schema.APIResponse) string { out += "\n" + console.Bold("metrics dashboard: ") + *batchAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + batchAPI.Endpoint + "\n" + if batchAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *batchAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(batchAPI.APIVersions) @@ -142,11 +148,16 @@ func getBatchJob(env cliconfig.Environment, apiName string, jobID string) (strin return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(resp) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(resp) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(resp) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } @@ -216,22 +227,34 @@ func getBatchJob(env 
cliconfig.Environment, apiName string, jobID string) (strin if job.WorkerCounts != nil { t := table.Table{ Headers: []table.Header{ - {Title: "requested"}, - {Title: "pending", Hidden: job.WorkerCounts.Pending == 0}, - {Title: "initializing", Hidden: job.WorkerCounts.Initializing == 0}, - {Title: "stalled", Hidden: job.WorkerCounts.Stalled == 0}, - {Title: "running"}, - {Title: "failed", Hidden: job.WorkerCounts.Failed == 0}, - {Title: "succeeded"}, + {Title: "Requested"}, + {Title: "Pending"}, + {Title: "Creating"}, + {Title: "Ready"}, + {Title: "NotReady"}, + {Title: "ErrImagePull", Hidden: job.WorkerCounts.ErrImagePull == 0}, + {Title: "Terminating", Hidden: job.WorkerCounts.Terminating == 0}, + {Title: "Failed", Hidden: job.WorkerCounts.Failed == 0}, + {Title: "Killed", Hidden: job.WorkerCounts.Killed == 0}, + {Title: "KilledOOM", Hidden: job.WorkerCounts.KilledOOM == 0}, + {Title: "Stalled", Hidden: job.WorkerCounts.Stalled == 0}, + {Title: "Unknown", Hidden: job.WorkerCounts.Unknown == 0}, + {Title: "Succeeded"}, }, Rows: [][]interface{}{ { job.Workers, job.WorkerCounts.Pending, - job.WorkerCounts.Initializing, - job.WorkerCounts.Stalled, - job.WorkerCounts.Running, + job.WorkerCounts.Creating, + job.WorkerCounts.Ready, + job.WorkerCounts.NotReady, + job.WorkerCounts.ErrImagePull, + job.WorkerCounts.Terminating, job.WorkerCounts.Failed, + job.WorkerCounts.Killed, + job.WorkerCounts.KilledOOM, + job.WorkerCounts.Stalled, + job.WorkerCounts.Unknown, job.WorkerCounts.Succeeded, }, }, diff --git a/cli/cmd/lib_realtime_apis.go b/cli/cmd/lib_realtime_apis.go index be4316e0a8..92234a83f9 100644 --- a/cli/cmd/lib_realtime_apis.go +++ b/cli/cmd/lib_realtime_apis.go @@ -17,11 +17,13 @@ limitations under the License. 
package cmd import ( + "fmt" "strings" "time" "github.com/cortexlabs/cortex/cli/types/cliconfig" "github.com/cortexlabs/cortex/pkg/lib/console" + "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/table" libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" @@ -31,16 +33,15 @@ func realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) var out string t := realtimeAPIsTable([]schema.APIResponse{realtimeAPI}, []string{env.Name}) - t.FindHeaderByTitle(_titleEnvironment).Hidden = true - t.FindHeaderByTitle(_titleRealtimeAPI).Hidden = true - out += t.MustFormat() if realtimeAPI.DashboardURL != nil && *realtimeAPI.DashboardURL != "" { out += "\n" + console.Bold("metrics dashboard: ") + *realtimeAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + realtimeAPI.Endpoint + "\n" + if realtimeAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *realtimeAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(realtimeAPI.APIVersions) @@ -53,39 +54,68 @@ func realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) return out, nil } +func realtimeDescribeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) (string, error) { + if realtimeAPI.Metadata == nil { + return "", errors.ErrorUnexpected("missing metadata from operator response") + } + + if realtimeAPI.ReplicaCounts == nil { + return "", errors.ErrorUnexpected(fmt.Sprintf("missing replica counts for %s api", realtimeAPI.Metadata.Name)) + } + + t := realtimeAPIsTable([]schema.APIResponse{realtimeAPI}, []string{env.Name}) + out := t.MustFormat() + + if realtimeAPI.DashboardURL != nil && *realtimeAPI.DashboardURL != "" { + out += "\n" + console.Bold("metrics dashboard: ") + *realtimeAPI.DashboardURL + "\n" + } + + if realtimeAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *realtimeAPI.Endpoint + "\n" + } + + t = 
replicaCountTable(realtimeAPI.ReplicaCounts) + out += "\n" + t.MustFormat() + + return out, nil +} + func realtimeAPIsTable(realtimeAPIs []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(realtimeAPIs)) - var totalFailed int32 - var totalStale int32 - for i, realtimeAPI := range realtimeAPIs { - lastUpdated := time.Unix(realtimeAPI.Spec.LastUpdated, 0) + if realtimeAPI.Metadata == nil || (realtimeAPI.Status == nil && realtimeAPI.ReplicaCounts == nil) { + continue + } + + var ready, requested, upToDate int32 + if realtimeAPI.Status != nil { + ready = realtimeAPI.Status.Ready + requested = realtimeAPI.Status.Requested + upToDate = realtimeAPI.Status.UpToDate + } else { + ready = realtimeAPI.ReplicaCounts.Ready + requested = realtimeAPI.ReplicaCounts.Requested + upToDate = realtimeAPI.ReplicaCounts.UpToDate + } + + lastUpdated := time.Unix(realtimeAPI.Metadata.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], - realtimeAPI.Spec.Name, - realtimeAPI.Status.Message(), - realtimeAPI.Status.Updated.Ready, - realtimeAPI.Status.Stale.Ready, - realtimeAPI.Status.Requested, - realtimeAPI.Status.Updated.TotalFailed(), + realtimeAPI.Metadata.Name, + fmt.Sprintf("%d/%d", ready, requested), + upToDate, libtime.SinceStr(&lastUpdated), }) - - totalFailed += realtimeAPI.Status.Updated.TotalFailed() - totalStale += realtimeAPI.Status.Stale.Ready } return table.Table{ Headers: []table.Header{ {Title: _titleEnvironment}, {Title: _titleRealtimeAPI}, - {Title: _titleStatus}, + {Title: _titleLive}, {Title: _titleUpToDate}, - {Title: _titleStale, Hidden: totalStale == 0}, - {Title: _titleRequested}, - {Title: _titleFailed, Hidden: totalFailed == 0}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index 50575b8516..295e1af875 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -29,6 +29,7 @@ import ( 
"github.com/cortexlabs/cortex/pkg/lib/table" libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/yaml" ) const ( @@ -41,7 +42,10 @@ func taskAPIsTable(taskAPIs []schema.APIResponse, envNames []string) table.Table rows := make([][]interface{}, 0, len(taskAPIs)) for i, taskAPI := range taskAPIs { - lastAPIUpdated := time.Unix(taskAPI.Spec.LastUpdated, 0) + if taskAPI.Metadata == nil { + continue + } + lastAPIUpdated := time.Unix(taskAPI.Metadata.LastUpdated, 0) latestStartTime := time.Time{} latestJobID := "-" runningJobs := 0 @@ -59,7 +63,7 @@ func taskAPIsTable(taskAPIs []schema.APIResponse, envNames []string) table.Table rows = append(rows, []interface{}{ envNames[i], - taskAPI.Spec.Name, + taskAPI.Metadata.Name, runningJobs, latestJobID, libtime.SinceStr(&lastAPIUpdated), @@ -72,7 +76,7 @@ func taskAPIsTable(taskAPIs []schema.APIResponse, envNames []string) table.Table {Title: _titleTaskAPI}, {Title: _titleTaskJobCount}, {Title: _titleLatestTaskJobID}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } @@ -118,7 +122,9 @@ func taskAPITable(taskAPI schema.APIResponse) string { out += "\n" + console.Bold("metrics dashboard: ") + *taskAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + taskAPI.Endpoint + "\n" + if taskAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *taskAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(taskAPI.APIVersions) @@ -137,11 +143,16 @@ func getTaskJob(env cliconfig.Environment, apiName string, jobID string) (string return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(resp) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(resp) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(resp) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == 
flags.YAMLOutputType { return string(bytes), nil } @@ -176,22 +187,34 @@ func getTaskJob(env cliconfig.Environment, apiName string, jobID string) (string if job.WorkerCounts != nil { t := table.Table{ Headers: []table.Header{ - {Title: "requested"}, - {Title: "pending", Hidden: job.WorkerCounts.Pending == 0}, - {Title: "initializing", Hidden: job.WorkerCounts.Initializing == 0}, - {Title: "stalled", Hidden: job.WorkerCounts.Stalled == 0}, - {Title: "running"}, - {Title: "failed", Hidden: job.WorkerCounts.Failed == 0}, - {Title: "succeeded"}, + {Title: "Requested"}, + {Title: "Pending"}, + {Title: "Creating"}, + {Title: "Ready"}, + {Title: "NotReady"}, + {Title: "ErrImagePull", Hidden: job.WorkerCounts.ErrImagePull == 0}, + {Title: "Terminating", Hidden: job.WorkerCounts.Terminating == 0}, + {Title: "Failed", Hidden: job.WorkerCounts.Failed == 0}, + {Title: "Killed", Hidden: job.WorkerCounts.Killed == 0}, + {Title: "KilledOOM", Hidden: job.WorkerCounts.KilledOOM == 0}, + {Title: "Stalled", Hidden: job.WorkerCounts.Stalled == 0}, + {Title: "Unknown", Hidden: job.WorkerCounts.Unknown == 0}, + {Title: "Succeeded"}, }, Rows: [][]interface{}{ { job.Workers, job.WorkerCounts.Pending, - job.WorkerCounts.Initializing, - job.WorkerCounts.Stalled, - job.WorkerCounts.Running, + job.WorkerCounts.Creating, + job.WorkerCounts.Ready, + job.WorkerCounts.NotReady, + job.WorkerCounts.ErrImagePull, + job.WorkerCounts.Terminating, job.WorkerCounts.Failed, + job.WorkerCounts.Killed, + job.WorkerCounts.KilledOOM, + job.WorkerCounts.Stalled, + job.WorkerCounts.Unknown, job.WorkerCounts.Succeeded, }, }, diff --git a/cli/cmd/lib_traffic_splitters.go b/cli/cmd/lib_traffic_splitters.go index 39c344038a..af2b4e4aad 100644 --- a/cli/cmd/lib_traffic_splitters.go +++ b/cli/cmd/lib_traffic_splitters.go @@ -17,6 +17,7 @@ limitations under the License. 
package cmd import ( + "fmt" "strings" "time" @@ -44,12 +45,14 @@ func trafficSplitterTable(trafficSplitter schema.APIResponse, env cliconfig.Envi if err != nil { return "", err } - t.FindHeaderByTitle(_titleEnvironment).Hidden = true out += t.MustFormat() out += "\n" + console.Bold("last updated: ") + libtime.SinceStr(&lastUpdated) - out += "\n" + console.Bold("endpoint: ") + trafficSplitter.Endpoint + "\n" + + if trafficSplitter.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *trafficSplitter.Endpoint + "\n" + } out += "\n" + apiHistoryTable(trafficSplitter.APIVersions) @@ -72,7 +75,10 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ } apiRes := apisRes[0] - lastUpdated := time.Unix(apiRes.Spec.LastUpdated, 0) + if apiRes.Metadata == nil || apiRes.Status == nil { + continue + } + lastUpdated := time.Unix(apiRes.Metadata.LastUpdated, 0) apiName := apiRes.Spec.Name if api.Shadow { @@ -82,8 +88,8 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ env.Name, apiName, api.Weight, - apiRes.Status.Message(), - apiRes.Status.Requested, + fmt.Sprintf("%d/%d", apiRes.Status.Ready, apiRes.Status.Requested), + apiRes.Status.UpToDate, libtime.SinceStr(&lastUpdated), }) } @@ -93,9 +99,9 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ {Title: _titleEnvironment}, {Title: _titleAPIs}, {Title: _trafficSplitterWeights}, - {Title: _titleStatus}, - {Title: _titleRequested}, - {Title: _titleLastupdated}, + {Title: _titleLive}, + {Title: _titleUpToDate}, + {Title: _titleLastUpdated}, }, Rows: rows, }, nil @@ -104,20 +110,14 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ func trafficSplitterListTable(trafficSplitter []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(trafficSplitter)) for i, splitAPI := range trafficSplitter { - lastUpdated := time.Unix(splitAPI.Spec.LastUpdated, 0) - var apis 
[]string - for _, api := range splitAPI.Spec.APIs { - apiName := api.Name - if api.Shadow { - apiName += " (shadow)" - } - apis = append(apis, apiName+":"+s.Int32(api.Weight)) + if splitAPI.Metadata == nil || splitAPI.NumTrafficSplitterTargets == nil { + continue } - apisStr := s.TruncateEllipses(strings.Join(apis, " "), 50) + lastUpdated := time.Unix(splitAPI.Metadata.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], - splitAPI.Spec.Name, - apisStr, + splitAPI.Metadata.Name, + s.Int32(*splitAPI.NumTrafficSplitterTargets), libtime.SinceStr(&lastUpdated), }) } @@ -127,7 +127,7 @@ func trafficSplitterListTable(trafficSplitter []schema.APIResponse, envNames []s {Title: _titleEnvironment}, {Title: _titleTrafficSplitter}, {Title: _titleAPIs}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/cli/cmd/lib_watch.go b/cli/cmd/lib_watch.go index 06aebb26c2..a0f9043492 100644 --- a/cli/cmd/lib_watch.go +++ b/cli/cmd/lib_watch.go @@ -56,8 +56,8 @@ func watchHeader() string { return fmt.Sprintf("$ %s %s%s", _cmdStr, padding, libtime.LocalHourNow()) } -func rerun(f func() (string, error)) { - if _flagWatch { +func rerun(watchFlag bool, f func() (string, error)) { + if watchFlag { print("\033[H\033[2J") // clear the screen var prevStrSlice []string diff --git a/cli/cmd/root.go b/cli/cmd/root.go index 68649c0cc1..8aa7d1e0e0 100644 --- a/cli/cmd/root.go +++ b/cli/cmd/root.go @@ -112,6 +112,7 @@ func init() { clusterInit() completionInit() deleteInit() + describeInit() deployInit() envInit() getInit() @@ -154,6 +155,7 @@ func Execute() { _rootCmd.AddCommand(_deployCmd) _rootCmd.AddCommand(_getCmd) + _rootCmd.AddCommand(_describeCmd) _rootCmd.AddCommand(_logsCmd) _rootCmd.AddCommand(_refreshCmd) _rootCmd.AddCommand(_deleteCmd) diff --git a/cmd/operator/main.go b/cmd/operator/main.go index bf5a50d33b..ac38ee7130 100644 --- a/cmd/operator/main.go +++ b/cmd/operator/main.go @@ -105,6 +105,7 @@ func main() { 
routerWithAuth.HandleFunc("/get", endpoints.GetAPIs).Methods("GET") routerWithAuth.HandleFunc("/get/{apiName}", endpoints.GetAPI).Methods("GET") routerWithAuth.HandleFunc("/get/{apiName}/{apiID}", endpoints.GetAPIByID).Methods("GET") + routerWithAuth.HandleFunc("/describe/{apiName}", endpoints.DescribeAPI).Methods("GET") routerWithAuth.HandleFunc("/streamlogs/{apiName}", endpoints.ReadLogs) routerWithAuth.HandleFunc("/logs/{apiName}", endpoints.GetLogURL).Methods("GET") diff --git a/dev/generate_cli_md.sh b/dev/generate_cli_md.sh index 5715f6fdb8..fdf2566624 100755 --- a/dev/generate_cli_md.sh +++ b/dev/generate_cli_md.sh @@ -33,6 +33,7 @@ echo "# CLI commands" >> $out_file commands=( "deploy" "get" + "describe" "logs" "refresh" "delete" diff --git a/docs/clients/cli.md b/docs/clients/cli.md index be43886dba..b10957bfe4 100644 --- a/docs/clients/cli.md +++ b/docs/clients/cli.md @@ -32,6 +32,20 @@ Flags: -h, --help help for get ``` +## describe + +```text +describe an api + +Usage: + cortex describe [API_NAME] [flags] + +Flags: + -e, --env string environment to use + -w, --watch re-run the command every 2 seconds + -h, --help help for describe +``` + ## logs ```text diff --git a/docs/workloads/async/statuses.md b/docs/workloads/async/statuses.md index 3ecaeba865..9c4787f293 100644 --- a/docs/workloads/async/statuses.md +++ b/docs/workloads/async/statuses.md @@ -1,4 +1,4 @@ -# Statuses +# Request statuses | Status | Meaning | | :--- | :--- | @@ -6,3 +6,22 @@ | in_progress | Workload has been pulled by the API and is currently being processed | | completed | Workload has completed with success | | failed | Workload encountered an error during processing | + +# Replica states + +The replica states of an API can be inspected by running `cortex describe API_NAME`.
Here are the possible states for each replica in an API: + +| State | Meaning | +|:---|:---| +| Ready | Replica is running and it has passed the readiness checks | +| ReadyOutOfDate | Replica is running and it has passed the readiness checks (for an out-of-date replica) | +| NotReady | Replica is running but it's not passing the readiness checks; make sure the server is listening on the designated port of the API | +| Pending | Replica is in a pending state (waiting to get scheduled onto a node) | +| Creating | Replica is in the process of having its containers created | +| ErrImagePull | Replica was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | +| Failed | Replica couldn't start due to an error; run `cortex logs API_NAME` to view the logs | +| Killed | Replica has had one of its containers killed | +| KilledOOM | Replica was terminated due to excessive memory usage; try allocating more memory to the API and re-deploying | +| Stalled | Replica has been in a pending state for more than 15 minutes; see [troubleshooting](../realtime/troubleshooting.md) | +| Terminating | Replica is currently in the process of being terminated | +| Unknown | Replica is in an unknown state | diff --git a/docs/workloads/batch/statuses.md b/docs/workloads/batch/statuses.md index 1bcddcd6bd..019ca55789 100644 --- a/docs/workloads/batch/statuses.md +++ b/docs/workloads/batch/statuses.md @@ -1,4 +1,4 @@ -# Statuses +# Job statuses | Status | Meaning | | :--- | :--- | diff --git a/docs/workloads/realtime/statuses.md b/docs/workloads/realtime/statuses.md index 2ee32aca40..d4e201bfba 100644 --- a/docs/workloads/realtime/statuses.md +++ b/docs/workloads/realtime/statuses.md @@ -1,10 +1,18 @@ -# Statuses +# Replica states -| Status | Meaning | -| :--- | :--- | -| live | API is deployed and ready to serve requests (at least one replica is running) | -| updating | API is
updating | -| error | API was not created due to an error; run `cortex logs ` to view the logs | -| error (image pull) | API was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | -| error (out of memory) | API was terminated due to excessive memory usage; try allocating more memory to the API and re-deploying | -| compute unavailable | API could not start due to insufficient memory, CPU, GPU, or Inf in the cluster; some replicas may be ready | +The replica states of an API can be inspected by running `cortex describe API_NAME`. Here are the possible states for each replica in an API: + +| State | Meaning | +|:---|:---| +| Ready | Replica is running and it has passed the readiness checks | +| ReadyOutOfDate | Replica is running and it has passed the readiness checks (for an out-of-date replica) | +| NotReady | Replica is running but it's not passing the readiness checks; make sure the server is listening on the designated port of the API | +| Pending | Replica is in a pending state (waiting to get scheduled onto a node) | +| Creating | Replica is in the process of having its containers created | +| ErrImagePull | Replica was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | +| Failed | Replica couldn't start due to an error; run `cortex logs API_NAME` to view the logs | +| Killed | Replica has had one of its containers killed | +| KilledOOM | Replica was terminated due to excessive memory usage; try allocating more memory to the API and re-deploying | +| Stalled | Replica has been in a pending state for more than 15 minutes; see [troubleshooting](../realtime/troubleshooting.md) | +| Terminating | Replica is currently in the process of being terminated | +| Unknown | Replica is in an unknown state | diff --git
a/docs/workloads/realtime/troubleshooting.md b/docs/workloads/realtime/troubleshooting.md index 61de9dfe74..5254d25aaa 100644 --- a/docs/workloads/realtime/troubleshooting.md +++ b/docs/workloads/realtime/troubleshooting.md @@ -4,14 +4,14 @@ When making requests to your API, it's possible to get a `no healthy upstream` error message (with HTTP status code `503`). This means that there are currently no live replicas running for your API. This could happen for a few reasons: -1. It's possible that your API is simply not ready yet. You can check the status of your API with `cortex get API_NAME`, and inspect the logs in CloudWatch with the help of `cortex logs API_NAME`. -1. Your API may have errored during initialization or while responding to a previous request. `cortex get API_NAME` will show the status of your API, and you can view the logs for all replicas by visiting the CloudWatch Insights URL from `cortex logs API_NAME`. +1. It's possible that your API is simply not ready yet. You can check the number of ready replicas on your API with `cortex get API_NAME`, and inspect the logs in CloudWatch with the help of `cortex logs API_NAME`. +1. Your API may have errored during initialization or while responding to a previous request. `cortex describe API_NAME` will show the number of replicas that have failed to start on your API, and you can view the logs for all replicas by visiting the CloudWatch Insights URL from `cortex logs API_NAME`. If you are using API Gateway in front of your API endpoints, it is also possible to receive a `{"message":"Service Unavailable"}` error message (with HTTP status code `503`) after 29 seconds if your request exceeds API Gateway's 29 second timeout. If this is the case, you can either modify your code to take less time, run on faster hardware (e.g. GPUs), or don't use API Gateway (there is no timeout when using the API's endpoint directly). 
## API is stuck updating -If your API is stuck in the "updating" or "compute unavailable" state (which is displayed when running `cortex get`), there are a few possible causes. Here are some things to check: +If your API has pods stuck in the "pending" or "stalled" states (which are displayed when running `cortex describe API_NAME`), there are a few possible causes. Here are some things to check: ### Inspect API logs in CloudWatch diff --git a/docs/workloads/task/statuses.md b/docs/workloads/task/statuses.md index b51eaf010f..0631ab68f2 100644 --- a/docs/workloads/task/statuses.md +++ b/docs/workloads/task/statuses.md @@ -1,4 +1,4 @@ -# Statuses +# Job statuses | Status | Meaning | | :--- | :--- | diff --git a/go.mod b/go.mod index 6acb918587..4381c8a46b 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/aws/amazon-vpc-cni-k8s v1.8.0 github.com/aws/aws-sdk-go v1.38.70 github.com/cenkalti/backoff/v4 v4.1.1 // indirect - github.com/containerd/containerd v1.5.2 // indirect + github.com/containerd/containerd v1.5.4 // indirect github.com/cortexlabs/go-input v0.0.0-20200503032952-8b67a7a7b28d github.com/cortexlabs/yaml v0.0.0-20210628201654-31e52ba8433b github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964 // indirect @@ -67,7 +67,7 @@ require ( golang.org/x/time v0.0.0-20210611083556-38a9dc6acbc6 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect google.golang.org/genproto v0.0.0-20210701133433-6b8dcf568a95 // indirect - google.golang.org/grpc v1.39.0 // indirect + google.golang.org/grpc v1.39.0 gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect gopkg.in/karalabe/cookiejar.v2 v2.0.0-20150724131613-8dcd6a7f4951 gopkg.in/segmentio/analytics-go.v3 v3.1.0 diff --git a/go.sum b/go.sum index 3517621d1e..e2c0784e16 100644 --- a/go.sum +++ b/go.sum @@ -94,6 +94,7 @@ github.com/Microsoft/hcsshim v0.8.9/go.mod h1:5692vkUqntj1idxauYlpoINNKeqCiG6Sg3 github.com/Microsoft/hcsshim v0.8.14/go.mod
h1:NtVKoYxQuTLx6gEq0L96c9Ju4JbRJ4nY2ow3VK6a9Lg= github.com/Microsoft/hcsshim v0.8.15/go.mod h1:x38A4YbHbdxJtc0sF6oIz+RG0npwSCAvn69iY6URG00= github.com/Microsoft/hcsshim v0.8.16/go.mod h1:o5/SZqmR7x9JNKsW3pu+nqHm0MF8vbA+VxGOoXdC600= +github.com/Microsoft/hcsshim v0.8.18/go.mod h1:+w2gRZ5ReXQhFOrvSQeNfhrYB/dg3oDwTOcER2fw4I4= github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5hlzMzRKMLyo42nCZ9oml8AdTlq/0cvIaBv6tK1RehU= github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= @@ -211,8 +212,9 @@ github.com/containerd/containerd v1.5.0-beta.1/go.mod h1:5HfvG1V2FsKesEGQ17k5/T7 github.com/containerd/containerd v1.5.0-beta.3/go.mod h1:/wr9AVtEM7x9c+n0+stptlo/uBBoBORwEx6ardVcmKU= github.com/containerd/containerd v1.5.0-beta.4/go.mod h1:GmdgZd2zA2GYIBZ0w09ZvgqEq8EfBp/m3lcVZIvPHhI= github.com/containerd/containerd v1.5.0-rc.0/go.mod h1:V/IXoMqNGgBlabz3tHD2TWDoTJseu1FGOKuoA4nNb2s= -github.com/containerd/containerd v1.5.2 h1:MG/Bg1pbmMb61j3wHCFWPxESXHieiKr2xG64px/k8zQ= -github.com/containerd/containerd v1.5.2/go.mod h1:0DOxVqwDy2iZvrZp2JUx/E+hS0UNTVn7dJnIOwtYR4g= +github.com/containerd/containerd v1.5.1/go.mod h1:0DOxVqwDy2iZvrZp2JUx/E+hS0UNTVn7dJnIOwtYR4g= +github.com/containerd/containerd v1.5.4 h1:uPF0og3ByFzDnaStfiQj3fVGTEtaSNyU+bW7GR/nqGA= +github.com/containerd/containerd v1.5.4/go.mod h1:sx18RgvW6ABJ4iYUw7Q5x7bgFOAB9B6G7+yO0XBc4zw= github.com/containerd/continuity v0.0.0-20190426062206-aaeac12a7ffc/go.mod h1:GL3xCUCBDV3CZiTSEKksMWbLE66hEyuu9qyDOOqM47Y= github.com/containerd/continuity v0.0.0-20190815185530-f2a389ac0a02/go.mod h1:GL3xCUCBDV3CZiTSEKksMWbLE66hEyuu9qyDOOqM47Y= github.com/containerd/continuity v0.0.0-20190827140505-75bee3e2ccb6/go.mod h1:GL3xCUCBDV3CZiTSEKksMWbLE66hEyuu9qyDOOqM47Y= diff --git a/pkg/activator/activator.go 
b/pkg/activator/activator.go index b7c54adc3d..7b68736951 100644 --- a/pkg/activator/activator.go +++ b/pkg/activator/activator.go @@ -131,7 +131,7 @@ func (a *activator) getOrCreateAPIActivator(ctx context.Context, apiName string) return nil, errors.WithStack(err) } - maxQueueLength, maxConcurrency, err := concurrencyFromAnnotations(vs.Annotations) + maxQueueLength, maxConcurrency, err := userconfig.ConcurrencyFromAnnotations(vs) if err != nil { return nil, err } diff --git a/pkg/activator/helpers.go b/pkg/activator/helpers.go index f32c7e54f2..5bce2cb7bf 100644 --- a/pkg/activator/helpers.go +++ b/pkg/activator/helpers.go @@ -17,8 +17,6 @@ limitations under the License. package activator import ( - "strconv" - "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/types/userconfig" "k8s.io/apimachinery/pkg/api/meta" @@ -50,8 +48,7 @@ func getAPIMeta(obj interface{}) (apiMeta, error) { return apiMeta{}, errors.ErrorUnexpected("got a virtual service without apiName label") } - annotations := resource.GetAnnotations() - maxQueueLength, maxConcurrency, err := concurrencyFromAnnotations(annotations) + maxQueueLength, maxConcurrency, err := userconfig.ConcurrencyFromAnnotations(resource) if err != nil { return apiMeta{}, err } @@ -60,22 +57,8 @@ func getAPIMeta(obj interface{}) (apiMeta, error) { apiName: apiName, apiKind: userconfig.KindFromString(apiKind), labels: labels, - annotations: annotations, + annotations: resource.GetAnnotations(), maxConcurrency: maxConcurrency, maxQueueLength: maxQueueLength, }, nil } - -func concurrencyFromAnnotations(annotations map[string]string) (int, int, error) { - maxQueueLength, err := strconv.Atoi(annotations[userconfig.MaxQueueLengthAnnotationKey]) - if err != nil { - return 0, 0, errors.ErrorUnexpected("failed to parse annotation", userconfig.MaxQueueLengthAnnotationKey) - } - - maxConcurrency, err := strconv.Atoi(annotations[userconfig.MaxConcurrencyAnnotationKey]) - if err != nil { - return 0, 0, 
errors.ErrorUnexpected("failed to parse annotation", userconfig.MaxConcurrencyAnnotationKey) - } - - return maxQueueLength, maxConcurrency, err -} diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 7ea590fc45..3fe860d776 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -76,8 +76,7 @@ var ( CortexProbeHeader = "X-Cortex-Probe" CortexOriginHeader = "X-Cortex-Origin" - WaitForInitializingReplicasTimeout = 15 * time.Minute - WaitForReadyReplicasTimeout = 20 * time.Minute + WaitForReadyReplicasTimeout = 20 * time.Minute ) func DefaultRegistry() string { diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index d3247de9b8..693557818e 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -24,7 +24,6 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/k8s" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/types/spec" - "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" kcore "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -217,17 +216,25 @@ type NetworkingSpec struct { // RealtimeAPIStatus defines the observed state of RealtimeAPI type RealtimeAPIStatus struct { + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Type=integer + Ready int32 `json:"ready"` + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Type=integer + Requested int32 `json:"requested"` + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Type=integer + UpToDate int32 `json:"up_to_date"` + // +kubebuilder:validation:Optional // +kubebuilder:validation:Type=string - Status status.Code `json:"status"` - ReplicaCounts status.ReplicaCounts `json:"replica_counts"` - Endpoint string `json:"endpoint,omitempty"` + Endpoint string `json:"endpoint,omitempty"` } //+kubebuilder:object:root=true 
//+kubebuilder:subresource:status -//+kubebuilder:printcolumn:JSONPath=".spec.pod.replicas",name="Replicas",type="integer" -//+kubebuilder:printcolumn:JSONPath=".status.replica_counts.updated.ready",name="Ready",type="integer" -//+kubebuilder:printcolumn:JSONPath=".status.status",name="Status",type="string" +//+kubebuilder:printcolumn:JSONPath=".status.ready",name="Ready",type="integer" +//+kubebuilder:printcolumn:JSONPath=".status.requested",name="Requested",type="integer" +//+kubebuilder:printcolumn:JSONPath=".status.up_to_date",name="Up-To-Date",type="integer" //+kubebuilder:printcolumn:JSONPath=".status.endpoint",name="Endpoint",type="string" // RealtimeAPI is the Schema for the realtimeapis API @@ -266,7 +273,6 @@ func (api RealtimeAPI) GetOrCreateAPIIDs() (deploymentID, podID, specID, apiID s if apiID == "" || api.Annotations["cortex.dev/deployment-id"] != deploymentID || api.Annotations["cortex.dev/spec-id"] != specID { - apiID = fmt.Sprintf("%s-%s-%s", spec.MonotonicallyDecreasingID(), deploymentID, specID) } diff --git a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go index df2ba5ab3d..c81d4ac6f6 100644 --- a/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/crds/apis/serverless/v1alpha1/zz_generated.deepcopy.go @@ -243,7 +243,6 @@ func (in *RealtimeAPISpec) DeepCopy() *RealtimeAPISpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RealtimeAPIStatus) DeepCopyInto(out *RealtimeAPIStatus) { *out = *in - out.ReplicaCounts = in.ReplicaCounts } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RealtimeAPIStatus. 
diff --git a/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml b/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml index 63b1987bd9..a60ccbba4a 100644 --- a/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml +++ b/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml @@ -251,16 +251,28 @@ spec: worker_counts: description: Detailed worker counts with respective status properties: + creating: + format: int32 + type: integer + err_image_pull: + format: int32 + type: integer failed: format: int32 type: integer - initializing: + killed: + format: int32 + type: integer + killed_oom: + format: int32 + type: integer + not_ready: format: int32 type: integer pending: format: int32 type: integer - running: + ready: format: int32 type: integer stalled: @@ -269,6 +281,9 @@ spec: succeeded: format: int32 type: integer + terminating: + format: int32 + type: integer unknown: format: int32 type: integer diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 3e8b6d267b..58d12f66df 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -17,15 +17,15 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: - - jsonPath: .spec.pod.replicas - name: Replicas - type: integer - - jsonPath: .status.replica_counts.updated.ready + - jsonPath: .status.ready name: Ready type: integer - - jsonPath: .status.status - name: Status - type: string + - jsonPath: .status.requested + name: Requested + type: integer + - jsonPath: .status.up_to_date + name: Up-To-Date + type: integer - jsonPath: .status.endpoint name: Endpoint type: string @@ -604,93 +604,15 @@ spec: properties: endpoint: type: string - replica_counts: - properties: - requested: - format: int32 - type: integer - stale: - properties: - err_image_pull: - format: int32 - type: integer - failed: - format: int32 - 
type: integer - initializing: - format: int32 - type: integer - killed: - format: int32 - type: integer - killed_oom: - format: int32 - type: integer - not_ready: - format: int32 - type: integer - pending: - format: int32 - type: integer - ready: - format: int32 - type: integer - stalled: - format: int32 - type: integer - terminating: - format: int32 - type: integer - unknown: - format: int32 - type: integer - required: - - ready - type: object - updated: - properties: - err_image_pull: - format: int32 - type: integer - failed: - format: int32 - type: integer - initializing: - format: int32 - type: integer - killed: - format: int32 - type: integer - killed_oom: - format: int32 - type: integer - not_ready: - format: int32 - type: integer - pending: - format: int32 - type: integer - ready: - format: int32 - type: integer - stalled: - format: int32 - type: integer - terminating: - format: int32 - type: integer - unknown: - format: int32 - type: integer - required: - - ready - type: object - type: object - status: - type: string - required: - - replica_counts - - status + ready: + format: int32 + type: integer + requested: + format: int32 + type: integer + up_to_date: + format: int32 + type: integer type: object type: object served: true diff --git a/pkg/crds/controllers/batch/batchjob_controller_helpers.go b/pkg/crds/controllers/batch/batchjob_controller_helpers.go index dd052dfc58..0f11ba67ba 100644 --- a/pkg/crds/controllers/batch/batchjob_controller_helpers.go +++ b/pkg/crds/controllers/batch/batchjob_controller_helpers.go @@ -442,6 +442,22 @@ func (r *BatchJobReconciler) getWorkerJob(ctx context.Context, batchJob batch.Ba return &job, nil } +func (r *BatchJobReconciler) getWorkerJobPods(ctx context.Context, batchJob batch.BatchJob) ([]kcore.Pod, error) { + workerJobPods := kcore.PodList{} + if err := r.List(ctx, &workerJobPods, + client.InNamespace(consts.DefaultNamespace), + client.MatchingLabels{ + "jobID": batchJob.Name, + "apiName": batchJob.Spec.APIName, + 
"apiID": batchJob.Spec.APIID, + "cortex.dev/batch": "worker", + }, + ); err != nil { + return nil, err + } + return workerJobPods.Items, nil +} + func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.BatchJob, statusInfo batchJobStatusInfo) error { batchJob.Status.ID = batchJob.Name @@ -461,6 +477,11 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B batchJob.Status.TotalBatchCount = statusInfo.TotalBatchCount } + workerJobPods, err := r.getWorkerJobPods(ctx, *batchJob) + if err != nil { + return errors.Wrap(err, "failed to retrieve worker pods") + } + worker := statusInfo.WorkerJob if worker != nil { batchJob.Status.EndTime = worker.Status.CompletionTime // assign right away, because it's a pointer @@ -486,13 +507,11 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B } } - isWorkerOOM, err := r.checkWorkersOOM(ctx, batchJob) - if err != nil { - return err - } - - if isWorkerOOM { - batchJobStatus = status.JobWorkerOOM + for i := range workerJobPods { + if k8s.WasPodOOMKilled(&workerJobPods[i]) { + batchJobStatus = status.JobWorkerOOM + break + } } batchJob.Status.Status = batchJobStatus @@ -512,11 +531,8 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B batchJob.Status.Status = status.JobRunning } - batchJob.Status.WorkerCounts = &status.WorkerCounts{ - Running: worker.Status.Active, - Succeeded: worker.Status.Succeeded, - Failed: worker.Status.Failed, - } + workerCounts := getReplicaCounts(workerJobPods) + batchJob.Status.WorkerCounts = &workerCounts } if err := r.Status().Update(ctx, batchJob); err != nil { @@ -526,27 +542,6 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B return nil } -func (r *BatchJobReconciler) checkWorkersOOM(ctx context.Context, batchJob *batch.BatchJob) (bool, error) { - workerJobPods := kcore.PodList{} - if err := r.List(ctx, &workerJobPods, - 
client.InNamespace(consts.DefaultNamespace), - client.MatchingLabels{ - "jobID": batchJob.Name, - "apiName": batchJob.Spec.APIName, - "apiID": batchJob.Spec.APIID, - }, - ); err != nil { - return false, err - } - - for i := range workerJobPods.Items { - if k8s.WasPodOOMKilled(&workerJobPods.Items[i]) { - return true, nil - } - } - return false, nil -} - func (r *BatchJobReconciler) deleteSQSQueue(batchJob batch.BatchJob) error { queueURL := r.getQueueURL(batchJob) input := sqs.DeleteQueueInput{QueueUrl: aws.String(queueURL)} @@ -736,3 +731,34 @@ func saveJobStatus(r *BatchJobReconciler, batchJob batch.BatchJob) error { }, ) } + +func getReplicaCounts(workerJobPods []kcore.Pod) status.WorkerCounts { + workerCounts := status.WorkerCounts{} + for i := range workerJobPods { + switch k8s.GetPodStatus(&workerJobPods[i]) { + case k8s.PodStatusPending: + workerCounts.Pending++ + case k8s.PodStatusStalled: + workerCounts.Stalled++ + case k8s.PodStatusCreating: + workerCounts.Creating++ + case k8s.PodStatusNotReady: + workerCounts.NotReady++ + case k8s.PodStatusErrImagePull: + workerCounts.ErrImagePull++ + case k8s.PodStatusTerminating: + workerCounts.Terminating++ + case k8s.PodStatusFailed: + workerCounts.Failed++ + case k8s.PodStatusKilled: + workerCounts.Killed++ + case k8s.PodStatusKilledOOM: + workerCounts.KilledOOM++ + case k8s.PodStatusSucceeded: + workerCounts.Succeeded++ + case k8s.PodStatusUnknown: + workerCounts.Unknown++ + } + } + return workerCounts +} diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 047ef42a43..64ec0cd000 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -19,7 +19,6 @@ package serverlesscontroller import ( "context" "fmt" - "time" "github.com/cortexlabs/cortex/pkg/consts" serverless 
"github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" @@ -29,7 +28,6 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/pointer" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/lib/urls" - "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" istionetworking "istio.io/api/networking/v1beta1" @@ -64,103 +62,21 @@ func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *serverles return errors.Wrap(err, "failed to get api endpoint") } - apiStatus := status.Pending - api.Status.ReplicaCounts = status.ReplicaCounts{} if deployment != nil { - if deployment.Status.ReadyReplicas == api.Spec.Pod.Replicas { - apiStatus = status.Live - api.Status.ReplicaCounts.Updated.Ready = deployment.Status.ReadyReplicas - // TODO: handle out of date (?) - } else { - if err = r.getReplicaCounts(ctx, api); err != nil { - return err - } - apiStatus = r.getStatusCode(api) + api.Status.Ready = deployment.Status.ReadyReplicas + api.Status.UpToDate = deployment.Status.UpdatedReplicas + if deployment.Spec.Replicas != nil { + api.Status.Requested = *deployment.Spec.Replicas } - } - - api.Status.Status = apiStatus - if err = r.Status().Update(ctx, api); err != nil { - return err - } - - return nil -} -func (r *RealtimeAPIReconciler) getReplicaCounts(ctx context.Context, api *serverless.RealtimeAPI) error { - var podList kcore.PodList - if err := r.List(ctx, &podList, client.MatchingLabels{ - "apiName": api.Name, - "apiKind": userconfig.RealtimeAPIKind.String(), - "deploymentID": api.Annotations["cortex.dev/deployment-id"], - }); err != nil { - return err - } - for i := range podList.Items { - pod := &podList.Items[i] - if k8s.IsPodReady(pod) { - api.Status.ReplicaCounts.Updated.Ready++ - continue - } - - switch k8s.GetPodStatus(pod) { - case k8s.PodStatusPending: - if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout 
{ - api.Status.ReplicaCounts.Updated.Stalled++ - } else { - api.Status.ReplicaCounts.Updated.Pending++ - } - case k8s.PodStatusInitializing: - api.Status.ReplicaCounts.Updated.Initializing++ - case k8s.PodStatusRunning: - api.Status.ReplicaCounts.Updated.Initializing++ - case k8s.PodStatusErrImagePull: - api.Status.ReplicaCounts.Updated.ErrImagePull++ - case k8s.PodStatusTerminating: - api.Status.ReplicaCounts.Updated.Terminating++ - case k8s.PodStatusFailed: - api.Status.ReplicaCounts.Updated.Failed++ - case k8s.PodStatusKilled: - api.Status.ReplicaCounts.Updated.Killed++ - case k8s.PodStatusKilledOOM: - api.Status.ReplicaCounts.Updated.KilledOOM++ - default: - api.Status.ReplicaCounts.Updated.Unknown++ + if err = r.Status().Update(ctx, api); err != nil { + return err } } return nil } -func (r *RealtimeAPIReconciler) getStatusCode(api *serverless.RealtimeAPI) status.Code { - counts := api.Status.ReplicaCounts - if counts.Updated.Ready >= api.Spec.Pod.Replicas { - return status.Live - } - - if counts.Updated.ErrImagePull > 0 { - return status.ErrorImagePull - } - - if counts.Updated.Failed > 0 || counts.Updated.Killed > 0 { - return status.Error - } - - if counts.Updated.KilledOOM > 0 { - return status.OOM - } - - if counts.Updated.Stalled > 0 { - return status.Stalled - } - - if counts.Updated.Ready >= api.Spec.Autoscaling.MinReplicas { - return status.Live - } - - return status.Updating -} - func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api serverless.RealtimeAPI) (controllerutil.OperationResult, error) { deployment := kapps.Deployment{ ObjectMeta: kmeta.ObjectMeta{ @@ -281,6 +197,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), "deploymentID": api.Annotations["cortex.dev/deployment-id"], + "apiID": api.Annotations["cortex.dev/api-id"], "cortex.dev/api": "true", }, Annotations: map[string]string{ diff --git a/pkg/lib/k8s/pod.go 
b/pkg/lib/k8s/pod.go index e841a7b8a8..293e88a476 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -23,6 +23,7 @@ import ( "time" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" @@ -45,37 +46,48 @@ const ( ReasonCompleted = "Completed" ) +type PodSpec struct { + Name string + K8sPodSpec kcore.PodSpec + Labels map[string]string + Annotations map[string]string +} + type PodStatus string const ( - PodStatusUnknown PodStatus = "Unknown" PodStatusPending PodStatus = "Pending" - PodStatusInitializing PodStatus = "Initializing" - PodStatusRunning PodStatus = "Running" - PodStatusErrImagePull PodStatus = "Image pull error" + PodStatusCreating PodStatus = "Creating" + PodStatusNotReady PodStatus = "NotReady" + PodStatusReady PodStatus = "Ready" + PodStatusErrImagePull PodStatus = "ErrImagePull" PodStatusTerminating PodStatus = "Terminating" - PodStatusSucceeded PodStatus = "Succeeded" PodStatusFailed PodStatus = "Failed" PodStatusKilled PodStatus = "Killed" - PodStatusKilledOOM PodStatus = "Out of Memory" + PodStatusKilledOOM PodStatus = "KilledOOM" + PodStatusStalled PodStatus = "Stalled" + PodStatusSucceeded PodStatus = "Succeeded" + PodStatusUnknown PodStatus = "Unknown" ) -var _killStatuses = map[int32]bool{ - 137: true, // SIGKILL - 143: true, // SIGTERM - 130: true, // SIGINT - 129: true, // SIGHUP -} +var ( + _killStatuses = map[int32]bool{ + 137: true, // SIGKILL + 143: true, // SIGTERM + 130: true, // SIGINT + 129: true, // SIGHUP + } -// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/images/types.go#L27 -var _imagePullErrorStrings = strset.New("ErrImagePull", "ImagePullBackOff", "RegistryUnavailable") + _evictedMemoryMessageRegex = regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`) -type PodSpec struct { - Name string - K8sPodSpec kcore.PodSpec - Labels 
map[string]string - Annotations map[string]string -} + // https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/images/types.go#L27 + _imagePullErrorStrings = strset.New("ErrImagePull", "ImagePullBackOff", "RegistryUnavailable") + + // https://github.com/kubernetes/kubernetes/blob/9f47110aa29094ed2878cf1d85874cb59214664a/staging/src/k8s.io/api/core/v1/types.go#L76-L77 + _creatingReasons = strset.New("ContainerCreating", "PodInitializing") + + _waitForCreatingPodTimeout = time.Minute * 15 +) func Pod(spec *PodSpec) *kcore.Pod { pod := &kcore.Pod{ @@ -90,6 +102,28 @@ func Pod(spec *PodSpec) *kcore.Pod { return pod } +func GetPodConditionOf(pod *kcore.Pod, podType kcore.PodConditionType) (*bool, *kcore.PodCondition) { + if pod == nil { + return nil, nil + } + + var conditionState *bool + var condition *kcore.PodCondition + for i := range pod.Status.Conditions { + if pod.Status.Conditions[i].Type == podType { + if pod.Status.Conditions[i].Status == kcore.ConditionTrue { + conditionState = pointer.Bool(true) + } + if pod.Status.Conditions[i].Status == kcore.ConditionFalse { + conditionState = pointer.Bool(false) + } + condition = &pod.Status.Conditions[i] + break + } + } + return conditionState, condition +} + func (c *Client) CreatePod(pod *kcore.Pod) (*kcore.Pod, error) { pod.TypeMeta = _podTypeMeta pod, err := c.podClient.Create(context.Background(), pod, kmeta.CreateOptions{}) @@ -120,14 +154,26 @@ func (c *Client) ApplyPod(pod *kcore.Pod) (*kcore.Pod, error) { } func IsPodReady(pod *kcore.Pod) bool { - if GetPodStatus(pod) != PodStatusRunning { + if GetPodStatus(pod) != PodStatusReady { return false } - for _, condition := range pod.Status.Conditions { - if condition.Type == "Ready" && condition.Status == kcore.ConditionTrue { - return true - } + podConditionState, _ := GetPodConditionOf(pod, kcore.PodReady) + if podConditionState != nil && *podConditionState { + return true + } + + return false +} + +func IsPodStalled(pod *kcore.Pod) bool { + if 
GetPodStatus(pod) != PodStatusPending { + return false + } + + podConditionState, podCondition := GetPodConditionOf(pod, kcore.PodScheduled) + if podConditionState != nil && !*podConditionState && !podCondition.LastTransitionTime.Time.IsZero() && time.Since(podCondition.LastTransitionTime.Time) >= _waitForCreatingPodTimeout { + return true } return false @@ -137,7 +183,7 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time { for i := range pod.Status.Conditions { condition := pod.Status.Conditions[i] - if condition.Type == "Ready" && condition.Status == kcore.ConditionTrue { + if condition.Type == kcore.PodReady && condition.Status == kcore.ConditionTrue { if condition.LastTransitionTime.Time.IsZero() { return nil } @@ -148,8 +194,6 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time { return nil } -var _evictedMemoryMessageRegex = regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`) - func WasPodOOMKilled(pod *kcore.Pod) bool { if pod.Status.Reason == ReasonEvicted && _evictedMemoryMessageRegex.MatchString(pod.Status.Message) { return true @@ -176,15 +220,11 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { switch pod.Status.Phase { case kcore.PodPending: - initPodStatus := PodStatusFromContainerStatuses(pod.Status.InitContainerStatuses) - if initPodStatus == PodStatusRunning { - return PodStatusInitializing + podConditionState, podCondition := GetPodConditionOf(pod, kcore.PodScheduled) + if podConditionState != nil && !*podConditionState && !podCondition.LastTransitionTime.Time.IsZero() && time.Since(podCondition.LastTransitionTime.Time) >= _waitForCreatingPodTimeout { + return PodStatusStalled } - allPodStatus := PodStatusFromContainerStatuses(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)) - if allPodStatus == PodStatusErrImagePull { - return PodStatusErrImagePull - } - return PodStatusPending + return PodStatusFromContainerStatuses(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)) case kcore.PodSucceeded: 
return PodStatusSucceeded case kcore.PodFailed: @@ -215,7 +255,17 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { return PodStatusTerminating } - return PodStatusFromContainerStatuses(pod.Status.ContainerStatuses) + podConditionState, _ := GetPodConditionOf(pod, kcore.PodReady) + if podConditionState != nil && *podConditionState { + return PodStatusReady + } + + status := PodStatusFromContainerStatuses(pod.Status.ContainerStatuses) + if status == PodStatusReady { + return PodStatusNotReady + } + + return status default: return PodStatusUnknown } @@ -224,7 +274,9 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) PodStatus { numContainers := len(containerStatuses) numWaiting := 0 - numRunning := 0 + numCreating := 0 + numNotReady := 0 + numReady := 0 numSucceeded := 0 numFailed := 0 numKilled := 0 @@ -235,9 +287,9 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P } for _, containerStatus := range containerStatuses { if containerStatus.State.Running != nil && containerStatus.Ready { - numRunning++ - } else if containerStatus.State.Running != nil && containerStatus.RestartCount == 0 { - numRunning++ + numReady++ + } else if containerStatus.State.Running != nil && !containerStatus.Ready { + numNotReady++ } else if containerStatus.State.Terminated != nil { exitCode := containerStatus.State.Terminated.ExitCode reason := containerStatus.State.Terminated.Reason @@ -264,6 +316,8 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P } } else if containerStatus.State.Waiting != nil && _imagePullErrorStrings.Has(containerStatus.State.Waiting.Reason) { return PodStatusErrImagePull + } else if containerStatus.State.Waiting != nil && _creatingReasons.Has(containerStatus.State.Waiting.Reason) { + numCreating++ } else { // either containerStatus.State.Waiting != nil or all containerStatus.States are nil (which implies waiting) numWaiting++ @@ 
-279,8 +333,12 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P return PodStatusPending } else if numSucceeded == numContainers { return PodStatusSucceeded + } else if numCreating > 0 { + return PodStatusCreating + } else if numNotReady > 0 { + return PodStatusNotReady } else { - return PodStatusRunning + return PodStatusReady } } diff --git a/pkg/operator/endpoints/describe.go b/pkg/operator/endpoints/describe.go new file mode 100644 index 0000000000..b574d5eefc --- /dev/null +++ b/pkg/operator/endpoints/describe.go @@ -0,0 +1,36 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package endpoints + +import ( + "net/http" + + "github.com/cortexlabs/cortex/pkg/operator/resources" + "github.com/gorilla/mux" +) + +func DescribeAPI(w http.ResponseWriter, r *http.Request) { + apiName := mux.Vars(r)["apiName"] + + response, err := resources.DescribeAPI(apiName) + if err != nil { + respondError(w, r, err) + return + } + + respondJSON(w, r, response) +} diff --git a/pkg/operator/endpoints/logs.go b/pkg/operator/endpoints/logs.go index dbe10828b1..4daa1904c4 100644 --- a/pkg/operator/endpoints/logs.go +++ b/pkg/operator/endpoints/logs.go @@ -19,6 +19,7 @@ package endpoints import ( "net/http" + "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/resources" "github.com/cortexlabs/cortex/pkg/operator/resources/asyncapi" @@ -98,7 +99,10 @@ func GetLogURL(w http.ResponseWriter, r *http.Request) { respondError(w, r, err) return } - logURL, err := operator.APILogURL(apiResponse[0].Spec) + if apiResponse[0].Spec == nil { + respondError(w, r, errors.ErrorUnexpected("unable to get api spec", apiName)) + } + logURL, err := operator.APILogURL(*apiResponse[0].Spec) if err != nil { respondError(w, r, err) return @@ -112,7 +116,10 @@ func GetLogURL(w http.ResponseWriter, r *http.Request) { respondError(w, r, err) return } - logURL, err := operator.APILogURL(apiResponse[0].Spec) + if apiResponse[0].Spec == nil { + respondError(w, r, errors.ErrorUnexpected("unable to get api spec", apiName)) + } + logURL, err := operator.APILogURL(*apiResponse[0].Spec) if err != nil { respondError(w, r, err) return diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index f9536596ce..43e36168c9 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -22,6 +22,7 @@ import ( "github.com/cortexlabs/cortex/pkg/config" "github.com/cortexlabs/cortex/pkg/lib/urls" "github.com/cortexlabs/cortex/pkg/types/spec" + 
"github.com/cortexlabs/cortex/pkg/types/userconfig" ) // APILoadBalancerURL returns the http endpoint of the ingress load balancer for deployed APIs @@ -63,3 +64,20 @@ func APIEndpoint(api *spec.API) (string, error) { return urls.Join(baseAPIEndpoint, *api.Networking.Endpoint), nil } + +func APIEndpointFromResource(deployedResource *DeployedResource) (string, error) { + apiEndpoint, err := userconfig.EndpointFromAnnotation(deployedResource.VirtualService) + if err != nil { + return "", err + } + + baseAPIEndpoint := "" + + baseAPIEndpoint, err = APILoadBalancerURL() + if err != nil { + return "", err + } + baseAPIEndpoint = strings.Replace(baseAPIEndpoint, "https://", "http://", 1) + + return urls.Join(baseAPIEndpoint, apiEndpoint), nil +} diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 39cce27446..d662223f2b 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ b/pkg/operator/resources/asyncapi/api.go @@ -19,6 +19,7 @@ package asyncapi import ( "fmt" "path/filepath" + "sort" "time" "github.com/cortexlabs/cortex/pkg/config" @@ -31,6 +32,7 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" @@ -249,13 +251,71 @@ func DeleteAPI(apiName string, keepCache bool) error { return nil } +func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { + asyncAPIs := make([]schema.APIResponse, 0) + mappedAsyncAPIs := make(map[string]schema.APIResponse, 0) + apiNames := make([]string, 0) + + for i := range deployments { + if deployments[i].Labels["cortex.dev/async"] != "api" { + continue + } + apiName := deployments[i].Labels["apiName"] + apiNames = append(apiNames, apiName) + + 
metadata, err := spec.MetadataFromDeployment(&deployments[i]) + if err != nil { + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) + } + mappedAsyncAPIs[apiName] = schema.APIResponse{ + Status: status.FromDeployment(&deployments[i]), + Metadata: metadata, + } + } + + sort.Strings(apiNames) + for _, apiName := range apiNames { + asyncAPIs = append(asyncAPIs, mappedAsyncAPIs[apiName]) + } + + return asyncAPIs, nil +} + func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - status, err := GetStatus(deployedResource.Name) + var apiDeployment *kapps.Deployment + var gatewayDeployment *kapps.Deployment + + err := parallel.RunFirstErr( + func() error { + var err error + apiDeployment, err = config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) + return err + }, + func() error { + var err error + gatewayDeployment, err = config.K8s.GetDeployment(getGatewayK8sName(deployedResource.Name)) + return err + }, + ) if err != nil { return nil, err } - api, err := operator.DownloadAPISpec(status.APIName, status.APIID) + if apiDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find api deployment", deployedResource.Name) + } + + if gatewayDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find gateway deployment", deployedResource.Name) + } + + apiStatus := status.FromDeployment(apiDeployment) + apiMetadata, err := spec.MetadataFromDeployment(apiDeployment) + if err != nil { + return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) + } + + api, err := operator.DownloadAPISpec(apiMetadata.Name, apiMetadata.APIID) if err != nil { return nil, err } @@ -269,43 +329,72 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, - Status: status, - Endpoint: apiEndpoint, + Spec: api, + Metadata: apiMetadata, + Status: apiStatus, + Endpoint: &apiEndpoint, DashboardURL: dashboardURL, }, }, nil 
} -func GetAllAPIs(pods []kcore.Pod, deployments []kapps.Deployment) ([]schema.APIResponse, error) { - statuses, err := GetAllStatuses(deployments, pods) +func DescribeAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { + var apiDeployment *kapps.Deployment + var gatewayDeployment *kapps.Deployment + + err := parallel.RunFirstErr( + func() error { + var err error + apiDeployment, err = config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) + return err + }, + func() error { + var err error + gatewayDeployment, err = config.K8s.GetDeployment(getGatewayK8sName(deployedResource.Name)) + return err + }, + ) if err != nil { return nil, err } - apiNames, apiIDs := namesAndIDsFromStatuses(statuses) - apis, err := operator.DownloadAPISpecs(apiNames, apiIDs) - if err != nil { - return nil, err + if apiDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find api deployment", deployedResource.Name) } - asyncAPIs := make([]schema.APIResponse, len(apis)) + if gatewayDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find gateway deployment", deployedResource.Name) + } - for i := range apis { - api := apis[i] - endpoint, err := operator.APIEndpoint(&api) - if err != nil { - return nil, err - } + apiStatus := status.FromDeployment(apiDeployment) + apiMetadata, err := spec.MetadataFromDeployment(apiDeployment) + if err != nil { + return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) + } - asyncAPIs[i] = schema.APIResponse{ - Spec: api, - Status: &statuses[i], - Endpoint: endpoint, - } + apiPods, err := config.K8s.ListPodsByLabels(map[string]string{ + "apiName": apiDeployment.Labels["apiName"], + "cortex.dev/async": "api", + }) + if err != nil { + return nil, err } - return asyncAPIs, nil + apiEndpoint, err := operator.APIEndpointFromResource(deployedResource) + if err != nil { + return nil, err + } + + dashboardURL := 
pointer.String(getDashboardURL(deployedResource.Name)) + + return []schema.APIResponse{ + { + Metadata: apiMetadata, + ReplicaCounts: GetReplicaCounts(apiStatus, apiDeployment, apiPods), + Endpoint: &apiEndpoint, + DashboardURL: dashboardURL, + }, + }, nil } func UpdateAPIMetricsCron(apiDeployment *kapps.Deployment) error { @@ -545,6 +634,33 @@ func deleteK8sResources(apiName string) error { return err } +// returns true if min_replicas are not ready and no updated replicas have errored +func isAPIUpdating(deployment *kapps.Deployment) (bool, error) { + pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"]) + if err != nil { + return false, err + } + + replicaCounts := GetReplicaCounts(nil, deployment, pods) + + autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) + if err != nil { + return false, err + } + + if replicaCounts.Ready < autoscalingSpec.MinReplicas && replicaCounts.TotalFailed() == 0 { + return true, nil + } + + return false, nil +} + +func isPodSpecLatest(deployment *kapps.Deployment, pod *kcore.Pod) bool { + // Note: the gateway deployment/pods don't have "podID" or "deploymentID" labels, which is ok since it is always up-to-date + return deployment.Spec.Template.Labels["podID"] == pod.Labels["podID"] && + deployment.Spec.Template.Labels["deploymentID"] == pod.Labels["deploymentID"] +} + func getDashboardURL(apiName string) string { loadBalancerURL, err := operator.LoadBalancerURL() if err != nil { diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index 38e02329d0..41b0d11fab 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ b/pkg/operator/resources/asyncapi/status.go @@ -17,234 +17,18 @@ limitations under the License. 
package asyncapi import ( - "sort" - "time" - - "github.com/cortexlabs/cortex/pkg/config" - "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/types/status" - "github.com/cortexlabs/cortex/pkg/types/userconfig" - "github.com/cortexlabs/cortex/pkg/workloads" kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" ) -type asyncResourceGroup struct { - APIDeployment *kapps.Deployment - APIPods []kcore.Pod - GatewayDeployment *kapps.Deployment - GatewayPods []kcore.Pod -} - -func GetStatus(apiName string) (*status.Status, error) { - var apiDeployment *kapps.Deployment - var gatewayDeployment *kapps.Deployment - var gatewayPods []kcore.Pod - var apiPods []kcore.Pod - - err := parallel.RunFirstErr( - func() error { - var err error - apiDeployment, err = config.K8s.GetDeployment(workloads.K8sName(apiName)) - return err - }, - func() error { - var err error - gatewayDeployment, err = config.K8s.GetDeployment(getGatewayK8sName(apiName)) - return err - }, - func() error { - var err error - gatewayPods, err = config.K8s.ListPodsByLabels( - map[string]string{ - "apiName": apiName, - "cortex.dev/async": "gateway", - }, - ) - return err - }, - func() error { - var err error - apiPods, err = config.K8s.ListPodsByLabels( - map[string]string{ - "apiName": apiName, - "cortex.dev/async": "api", - }, - ) - return err - }, - ) - if err != nil { - return nil, err - } - - if apiDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find api deployment", apiName) - } - - if gatewayDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) - } - - return apiStatus(apiDeployment, apiPods, gatewayDeployment, gatewayPods) -} - -func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status.Status, error) { - resourcesByAPI := groupResourcesByAPI(deployments, 
pods) - statuses := make([]status.Status, len(resourcesByAPI)) - - var i int - for apiName, k8sResources := range resourcesByAPI { - if k8sResources.APIDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find api deployment", apiName) - } - - if k8sResources.GatewayDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) - } - - st, err := apiStatus(k8sResources.APIDeployment, k8sResources.APIPods, k8sResources.GatewayDeployment, k8sResources.GatewayPods) - if err != nil { - return nil, err - } - statuses[i] = *st - i++ - } - - sort.Slice(statuses, func(i, j int) bool { - return statuses[i].APIName < statuses[j].APIName - }) - - return statuses, nil -} - -func namesAndIDsFromStatuses(statuses []status.Status) ([]string, []string) { - apiNames := make([]string, len(statuses)) - apiIDs := make([]string, len(statuses)) - - for i, st := range statuses { - apiNames[i] = st.APIName - apiIDs[i] = st.APIID - } - - return apiNames, apiIDs -} - -// let's do CRDs instead, to avoid this -func groupResourcesByAPI(deployments []kapps.Deployment, pods []kcore.Pod) map[string]*asyncResourceGroup { - resourcesByAPI := map[string]*asyncResourceGroup{} - for i := range deployments { - deployment := deployments[i] - apiName := deployment.Labels["apiName"] - asyncType := deployment.Labels["cortex.dev/async"] - apiResources, exists := resourcesByAPI[apiName] - if exists { - if asyncType == "api" { - apiResources.APIDeployment = &deployment - } else { - apiResources.GatewayDeployment = &deployment - } - } else { - if asyncType == "api" { - resourcesByAPI[apiName] = &asyncResourceGroup{APIDeployment: &deployment} - } else { - resourcesByAPI[apiName] = &asyncResourceGroup{GatewayDeployment: &deployment} - } - } - } - - for _, pod := range pods { - apiName := pod.Labels["apiName"] - asyncType := pod.Labels["cortex.dev/async"] - apiResources, exists := resourcesByAPI[apiName] - if !exists { - // ignore pods that might still be 
waiting to be deleted while the deployment has already been deleted - continue - } - - if asyncType == "api" { - apiResources.APIPods = append(resourcesByAPI[apiName].APIPods, pod) - } else { - apiResources.GatewayPods = append(resourcesByAPI[apiName].GatewayPods, pod) - } - } - return resourcesByAPI -} - -func apiStatus(apiDeployment *kapps.Deployment, apiPods []kcore.Pod, gatewayDeployment *kapps.Deployment, gatewayPods []kcore.Pod) (*status.Status, error) { - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(apiDeployment) - if err != nil { - return nil, err - } - - apiReplicaCounts := getReplicaCounts(apiDeployment, apiPods) - gatewayReplicaCounts := getReplicaCounts(gatewayDeployment, gatewayPods) - - st := &status.Status{} - st.APIName = apiDeployment.Labels["apiName"] - st.APIID = apiDeployment.Labels["apiID"] - st.ReplicaCounts = apiReplicaCounts - st.Code = getStatusCode(apiReplicaCounts, gatewayReplicaCounts, autoscalingSpec.MinReplicas) - - return st, nil -} - -func getStatusCode(apiCounts status.ReplicaCounts, gatewayCounts status.ReplicaCounts, apiMinReplicas int32) status.Code { - if apiCounts.Updated.Ready >= apiCounts.Requested && gatewayCounts.Updated.Ready >= 1 { - return status.Live - } - - if apiCounts.Updated.ErrImagePull > 0 || gatewayCounts.Updated.ErrImagePull > 0 { - return status.ErrorImagePull - } - - if apiCounts.Updated.Failed > 0 || apiCounts.Updated.Killed > 0 || - gatewayCounts.Updated.Failed > 0 || gatewayCounts.Updated.Killed > 0 { - return status.Error - } - - if apiCounts.Updated.KilledOOM > 0 || gatewayCounts.Updated.KilledOOM > 0 { - return status.OOM - } - - if apiCounts.Updated.Stalled > 0 || gatewayCounts.Updated.Stalled > 0 { - return status.Stalled - } - - if apiCounts.Updated.Ready >= apiMinReplicas && gatewayCounts.Updated.Ready >= 1 { - return status.Live - } - - return status.Updating -} - -// returns true if min_replicas are not ready and no updated replicas have errored -func isAPIUpdating(deployment 
*kapps.Deployment) (bool, error) { - pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"]) - if err != nil { - return false, err - } - - replicaCounts := getReplicaCounts(deployment, pods) - - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) - if err != nil { - return false, err - } - - if replicaCounts.Updated.Ready < autoscalingSpec.MinReplicas && replicaCounts.Updated.TotalFailed() == 0 { - return true, nil - } - - return false, nil -} - -func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.ReplicaCounts { +func GetReplicaCounts(apiStatus *status.Status, deployment *kapps.Deployment, pods []kcore.Pod) *status.ReplicaCounts { counts := status.ReplicaCounts{} - counts.Requested = *deployment.Spec.Replicas + if apiStatus != nil { + counts.Requested = apiStatus.Requested + counts.UpToDate = apiStatus.UpToDate + } for i := range pods { pod := pods[i] @@ -255,50 +39,55 @@ func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.Rep addPodToReplicaCounts(&pod, deployment, &counts) } - return counts + return &counts } func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts *status.ReplicaCounts) { - var subCounts *status.SubReplicaCounts + latest := false if isPodSpecLatest(deployment, pod) { - subCounts = &counts.Updated - } else { - subCounts = &counts.Stale + latest = true } - if k8s.IsPodReady(pod) { - subCounts.Ready++ + isPodReady := k8s.IsPodReady(pod) + if latest && isPodReady { + counts.Ready++ + return + } else if !latest && isPodReady { + counts.ReadyOutOfDate++ return } - switch k8s.GetPodStatus(pod) { + podStatus := k8s.GetPodStatus(pod) + + if podStatus == k8s.PodStatusTerminating { + counts.Terminating++ + return + } + + if !latest { + return + } + + switch podStatus { case k8s.PodStatusPending: - if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout { - subCounts.Stalled++ - } else { - subCounts.Pending++ 
- } - case k8s.PodStatusInitializing: - subCounts.Initializing++ - case k8s.PodStatusRunning: - subCounts.Initializing++ + counts.Pending++ + case k8s.PodStatusStalled: + counts.Stalled++ + case k8s.PodStatusCreating: + counts.Creating++ + case k8s.PodStatusReady: + counts.Ready++ + case k8s.PodStatusNotReady: + counts.NotReady++ case k8s.PodStatusErrImagePull: - subCounts.ErrImagePull++ - case k8s.PodStatusTerminating: - subCounts.Terminating++ + counts.ErrImagePull++ case k8s.PodStatusFailed: - subCounts.Failed++ + counts.Failed++ case k8s.PodStatusKilled: - subCounts.Killed++ + counts.Killed++ case k8s.PodStatusKilledOOM: - subCounts.KilledOOM++ - default: - subCounts.Unknown++ + counts.KilledOOM++ + case k8s.PodStatusUnknown: + counts.Unknown++ } } - -func isPodSpecLatest(deployment *kapps.Deployment, pod *kcore.Pod) bool { - // Note: the gateway deployment/pods don't have "podID" or "deploymentID" labels, which is ok since it is always up-to-date - return deployment.Spec.Template.Labels["podID"] == pod.Labels["podID"] && - deployment.Spec.Template.Labels["deploymentID"] == pod.Labels["deploymentID"] -} diff --git a/pkg/operator/resources/job/batchapi/api.go b/pkg/operator/resources/job/batchapi/api.go index b85726a531..6ac1c87219 100644 --- a/pkg/operator/resources/job/batchapi/api.go +++ b/pkg/operator/resources/job/batchapi/api.go @@ -140,25 +140,18 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob apiNameToBatchJobsMap[batchJob.Spec.APIName] = append(apiNameToBatchJobsMap[batchJob.Spec.APIName], &batchJobList[i]) } - for _, virtualService := range virtualServices { - apiName := virtualService.Labels["apiName"] - apiID := virtualService.Labels["apiID"] - - api, err := operator.DownloadAPISpec(apiName, apiID) + for i := range virtualServices { + apiName := virtualServices[i].Labels["apiName"] + metadata, err := spec.MetadataFromVirtualService(&virtualServices[i]) if err != nil { - return nil, err - } - - endpoint, err := 
operator.APIEndpoint(api) - if err != nil { - return nil, err + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } var jobStatuses []status.BatchJobStatus - batchJobs := apiNameToBatchJobsMap[apiName] + batchJobs := apiNameToBatchJobsMap[metadata.Name] if len(batchJobs) == 0 { - jobStates, err := job.GetMostRecentlySubmittedJobStates(apiName, 1, userconfig.BatchAPIKind) + jobStates, err := job.GetMostRecentlySubmittedJobStates(metadata.Name, 1, userconfig.BatchAPIKind) if err != nil { return nil, err } @@ -183,9 +176,8 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob } } - batchAPIsMap[apiName] = &schema.APIResponse{ - Spec: *api, - Endpoint: endpoint, + batchAPIsMap[metadata.Name] = &schema.APIResponse{ + Metadata: metadata, BatchJobStatuses: jobStatuses, } } @@ -200,10 +192,12 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob } func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - virtualService := deployedResource.VirtualService + metadata, err := spec.MetadataFromVirtualService(deployedResource.VirtualService) + if err != nil { + return nil, err + } - apiID := virtualService.Labels["apiID"] - api, err := operator.DownloadAPISpec(deployedResource.Name, apiID) + api, err := operator.DownloadAPISpec(deployedResource.Name, metadata.APIID) if err != nil { return nil, err } @@ -263,9 +257,10 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, + Spec: api, + Metadata: metadata, BatchJobStatuses: jobStatuses, - Endpoint: endpoint, + Endpoint: &endpoint, DashboardURL: dashboardURL, }, }, nil diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go index 9261cc16a9..c5ca6e17fa 100644 --- a/pkg/operator/resources/job/taskapi/api.go +++ b/pkg/operator/resources/job/taskapi/api.go @@ -146,21 +146,15 @@ func GetAllAPIs(virtualServices 
[]istioclientnetworking.VirtualService, k8sJobs } } - for _, virtualService := range virtualServices { - apiName := virtualService.Labels["apiName"] - apiID := virtualService.Labels["apiID"] + for i := range virtualServices { + apiName := virtualServices[i].Labels["apiName"] - api, err := operator.DownloadAPISpec(apiName, apiID) + metadata, err := spec.MetadataFromVirtualService(&virtualServices[i]) if err != nil { - return nil, err - } - - endpoint, err := operator.APIEndpoint(api) - if err != nil { - return nil, err + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } - jobStates, err := job.GetMostRecentlySubmittedJobStates(apiName, 1, userconfig.TaskAPIKind) + jobStates, err := job.GetMostRecentlySubmittedJobStates(metadata.Name, 1, userconfig.TaskAPIKind) jobStatuses := []status.TaskJobStatus{} if len(jobStates) > 0 { @@ -172,9 +166,8 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs jobStatuses = append(jobStatuses, *jobStatus) } - taskAPIsMap[apiName] = &schema.APIResponse{ - Spec: *api, - Endpoint: endpoint, + taskAPIsMap[metadata.Name] = &schema.APIResponse{ + Metadata: metadata, TaskJobStatuses: jobStatuses, } } @@ -209,8 +202,8 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs taskAPIList := make([]schema.APIResponse, 0, len(taskAPIsMap)) - for _, batchAPI := range taskAPIsMap { - taskAPIList = append(taskAPIList, *batchAPI) + for _, taskAPI := range taskAPIsMap { + taskAPIList = append(taskAPIList, *taskAPI) } return taskAPIList, nil @@ -218,10 +211,12 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs // GetAPIByName returns a single task API and its most recently submitted job along with all running task jobs func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - virtualService := deployedResource.VirtualService + metadata, err := spec.MetadataFromVirtualService(deployedResource.VirtualService) + if 
err != nil { + return nil, err + } - apiID := virtualService.Labels["apiID"] - api, err := operator.DownloadAPISpec(deployedResource.Name, apiID) + api, err := operator.DownloadAPISpec(deployedResource.Name, metadata.APIID) if err != nil { return nil, err } @@ -295,9 +290,10 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, + Spec: api, + Metadata: metadata, TaskJobStatuses: jobStatuses, - Endpoint: endpoint, + Endpoint: &endpoint, DashboardURL: dashboardURL, }, }, nil diff --git a/pkg/operator/resources/job/worker_stats.go b/pkg/operator/resources/job/worker_stats.go index 07628995e4..797d65980e 100644 --- a/pkg/operator/resources/job/worker_stats.go +++ b/pkg/operator/resources/job/worker_stats.go @@ -17,9 +17,6 @@ limitations under the License. package job import ( - "time" - - "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/types/status" kbatch "k8s.io/api/batch/v1" @@ -43,34 +40,32 @@ func GetWorkerCountsForJob(k8sJob kbatch.Job, pods []kcore.Pod) status.WorkerCou func addPodToWorkerCounts(pod *kcore.Pod, workerCounts *status.WorkerCounts) { if k8s.IsPodReady(pod) { - workerCounts.Running++ + workerCounts.Ready++ return } switch k8s.GetPodStatus(pod) { case k8s.PodStatusPending: - if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout { - workerCounts.Stalled++ - } else { - workerCounts.Pending++ - } - case k8s.PodStatusInitializing: - workerCounts.Initializing++ - case k8s.PodStatusRunning: - workerCounts.Initializing++ + workerCounts.Pending++ + case k8s.PodStatusStalled: + workerCounts.Stalled++ + case k8s.PodStatusCreating: + workerCounts.Creating++ + case k8s.PodStatusNotReady: + workerCounts.NotReady++ case k8s.PodStatusErrImagePull: - workerCounts.Failed++ + workerCounts.ErrImagePull++ case k8s.PodStatusTerminating: - workerCounts.Failed++ + workerCounts.Terminating++ case 
k8s.PodStatusFailed: workerCounts.Failed++ case k8s.PodStatusKilled: - workerCounts.Failed++ + workerCounts.Killed++ case k8s.PodStatusKilledOOM: - workerCounts.Failed++ + workerCounts.KilledOOM++ case k8s.PodStatusSucceeded: workerCounts.Succeeded++ - default: + case k8s.PodStatusUnknown: workerCounts.Unknown++ } } diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index cd7284f276..096f83348c 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -21,6 +21,7 @@ import ( "fmt" "path/filepath" "reflect" + "sort" "time" "github.com/cortexlabs/cortex/pkg/config" @@ -183,34 +184,35 @@ func GetAllAPIs() ([]schema.APIResponse, error) { } apiNames := make([]string, len(apis.Items)) - apiIDs := make([]string, len(apis.Items)) for i, api := range apis.Items { apiNames[i] = api.Name - apiIDs[i] = api.Annotations["cortex.dev/api-id"] - } - - apiSpecs, err := operator.DownloadAPISpecs(apiNames, apiIDs) - if err != nil { - return nil, err } realtimeAPIs := make([]schema.APIResponse, len(apis.Items)) + mappedRealtimeAPIs := make(map[string]schema.APIResponse, 0) for i := range apis.Items { api := apis.Items[i] - api.Status.ReplicaCounts.Requested = api.Spec.Pod.Replicas - realtimeAPIs[i] = schema.APIResponse{ - Spec: apiSpecs[i], + metadata, err := metadataFromRealtimeAPI(&api) + if err != nil { + return nil, err + } + + mappedRealtimeAPIs[api.Name] = schema.APIResponse{ + Metadata: metadata, Status: &status.Status{ - APIName: api.Name, - APIID: api.Annotations["cortex.dev/api-id"], - Code: api.Status.Status, - ReplicaCounts: api.Status.ReplicaCounts, + Ready: api.Status.Ready, + Requested: api.Status.Requested, + UpToDate: api.Status.UpToDate, }, - Endpoint: api.Status.Endpoint, } } + sort.Strings(apiNames) + for i, apiName := range apiNames { + realtimeAPIs[i] = mappedRealtimeAPIs[apiName] + } + return realtimeAPIs, nil } @@ -223,29 +225,71 @@ func GetAPIByName(apiName string) 
([]schema.APIResponse, error) { return nil, errors.Wrap(err, "failed to get realtime api resource") } + metadata, err := metadataFromRealtimeAPI(&api) + if err != nil { + return nil, err + } + apiSpec, err := operator.DownloadAPISpec(api.Name, api.Annotations["cortex.dev/api-id"]) if err != nil { return nil, err } dashboardURL := pointer.String(getDashboardURL(api.Name)) - api.Status.ReplicaCounts.Requested = api.Spec.Pod.Replicas return []schema.APIResponse{ { - Spec: *apiSpec, + Spec: apiSpec, + Metadata: metadata, Status: &status.Status{ - APIName: api.Name, - APIID: api.Annotations["cortex.dev/api-id"], - Code: api.Status.Status, - ReplicaCounts: api.Status.ReplicaCounts, + Ready: api.Status.Ready, + Requested: api.Status.Requested, + UpToDate: api.Status.UpToDate, }, - Endpoint: api.Status.Endpoint, + Endpoint: &api.Status.Endpoint, DashboardURL: dashboardURL, }, }, nil } +func DescribeAPIByName(apiName string) ([]schema.APIResponse, error) { + ctx := context.Background() + + api := serverless.RealtimeAPI{} + key := client.ObjectKey{Namespace: consts.DefaultNamespace, Name: apiName} + if err := config.K8s.Get(ctx, key, &api); err != nil { + return nil, errors.Wrap(err, "failed to get realtime api resource") + } + + metadata, err := metadataFromRealtimeAPI(&api) + if err != nil { + return nil, err + } + + var podList kcore.PodList + if err := config.K8s.List(ctx, &podList, client.MatchingLabels{ + "apiName": metadata.Name, + "apiKind": userconfig.RealtimeAPIKind.String(), + }); err != nil { + return nil, err + } + + replicaCounts := getReplicaCounts(podList.Items, metadata) + replicaCounts.Requested = api.Status.Requested + replicaCounts.UpToDate = api.Status.UpToDate + + dashboardURL := pointer.String(getDashboardURL(api.Name)) + + return []schema.APIResponse{ + { + Metadata: metadata, + ReplicaCounts: &replicaCounts, + Endpoint: &api.Status.Endpoint, + DashboardURL: dashboardURL, + }, + }, nil +} + func getDashboardURL(apiName string) string { 
loadBalancerURL, err := operator.LoadBalancerURL() if err != nil { @@ -360,3 +404,87 @@ func deleteBucketResources(apiName string) error { prefix := filepath.Join(config.ClusterConfig.ClusterUID, "apis", apiName) return config.AWS.DeleteS3Dir(config.ClusterConfig.Bucket, prefix, true) } + +func metadataFromRealtimeAPI(sv *serverless.RealtimeAPI) (*spec.Metadata, error) { + lastUpdated, err := spec.TimeFromAPIID(sv.Annotations["cortex.dev/api-id"]) + if err != nil { + return nil, err + } + return &spec.Metadata{ + Resource: &userconfig.Resource{ + Name: sv.Name, + Kind: userconfig.RealtimeAPIKind, + }, + APIID: sv.Annotations["cortex.dev/api-id"], + DeploymentID: sv.Annotations["cortex.dev/deployment-id"], + LastUpdated: lastUpdated.Unix(), + }, nil +} + +func getReplicaCounts(pods []kcore.Pod, metadata *spec.Metadata) status.ReplicaCounts { + counts := status.ReplicaCounts{} + + for i := range pods { + pod := pods[i] + if pod.Labels["apiName"] != metadata.Name { + continue + } + addPodToReplicaCounts(&pods[i], metadata, &counts) + } + + return counts +} + +func addPodToReplicaCounts(pod *kcore.Pod, metadata *spec.Metadata, counts *status.ReplicaCounts) { + latest := false + if isPodSpecLatest(pod, metadata) { + latest = true + } + + isPodReady := k8s.IsPodReady(pod) + if latest && isPodReady { + counts.Ready++ + return + } else if !latest && isPodReady { + counts.ReadyOutOfDate++ + return + } + + podStatus := k8s.GetPodStatus(pod) + + if podStatus == k8s.PodStatusTerminating { + counts.Terminating++ + return + } + + if !latest { + return + } + + switch podStatus { + case k8s.PodStatusPending: + counts.Pending++ + case k8s.PodStatusStalled: + counts.Stalled++ + case k8s.PodStatusCreating: + counts.Creating++ + case k8s.PodStatusReady: + counts.Ready++ + case k8s.PodStatusNotReady: + counts.NotReady++ + case k8s.PodStatusErrImagePull: + counts.ErrImagePull++ + case k8s.PodStatusFailed: + counts.Failed++ + case k8s.PodStatusKilled: + counts.Killed++ + case 
k8s.PodStatusKilledOOM: + counts.KilledOOM++ + case k8s.PodStatusUnknown: + counts.Unknown++ + } +} + +func isPodSpecLatest(pod *kcore.Pod, metadata *spec.Metadata) bool { + return metadata.APIID == pod.Labels["apiID"] +} diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index 6eeb95b3b0..11cd50a5c8 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -158,8 +158,8 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*schema.APIResponse, stri apiEndpoint, _ := operator.APIEndpoint(api) return &schema.APIResponse{ - Spec: *api, - Endpoint: apiEndpoint, + Spec: api, + Endpoint: &apiEndpoint, }, msg, nil } @@ -256,7 +256,7 @@ func DeleteAPI(apiName string, keepCache bool) (*schema.DeleteResponse, error) { func GetAPIs() ([]schema.APIResponse, error) { var deployments []kapps.Deployment var k8sTaskJobs []kbatch.Job - var pods []kcore.Pod + var taskAPIPods []kcore.Pod var virtualServices []istioclientnetworking.VirtualService var batchJobList batch.BatchJobList @@ -268,7 +268,7 @@ func GetAPIs() ([]schema.APIResponse, error) { }, func() error { var err error - pods, err = config.K8s.ListPodsWithLabelKeys("apiName") + taskAPIPods, err = config.K8s.ListPodsByLabel("apiKind", userconfig.TaskAPIKind.String()) return err }, func() error { @@ -305,20 +305,6 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - var batchAPIPods []kcore.Pod - var taskAPIPods []kcore.Pod - var asyncAPIPods []kcore.Pod - for _, pod := range pods { - switch pod.Labels["apiKind"] { - case userconfig.BatchAPIKind.String(): - batchAPIPods = append(batchAPIPods, pod) - case userconfig.TaskAPIKind.String(): - taskAPIPods = append(taskAPIPods, pod) - case userconfig.AsyncAPIKind.String(): - asyncAPIPods = append(asyncAPIPods, pod) - } - } - var batchAPIVirtualServices []istioclientnetworking.VirtualService var taskAPIVirtualServices []istioclientnetworking.VirtualService var trafficSplitterVirtualServices 
[]istioclientnetworking.VirtualService @@ -350,7 +336,7 @@ func GetAPIs() ([]schema.APIResponse, error) { return nil, err } - asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIPods, asyncAPIDeployments) + asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIDeployments) if err != nil { return nil, err } @@ -443,7 +429,7 @@ func GetAPIByID(apiName string, apiID string) ([]schema.APIResponse, error) { return []schema.APIResponse{ { - Spec: *apiSpec, + Spec: apiSpec, }, }, nil } @@ -494,3 +480,33 @@ func checkIfUsedByTrafficSplitter(apiName string) error { } return nil } + +func DescribeAPI(apiName string) ([]schema.APIResponse, error) { + deployedResource, err := GetDeployedResourceByName(apiName) + if err != nil { + return nil, err + } + + var apiResponse []schema.APIResponse + + switch deployedResource.Kind { + case userconfig.RealtimeAPIKind: + apiResponse, err = realtimeapi.DescribeAPIByName(apiName) + if err != nil { + return nil, err + } + case userconfig.AsyncAPIKind: + apiResponse, err = asyncapi.DescribeAPIByName(deployedResource) + if err != nil { + return nil, err + } + default: + return nil, ErrorOperationIsOnlySupportedForKind( + *deployedResource, + userconfig.RealtimeAPIKind, + userconfig.AsyncAPIKind, + ) // unexpected + } + + return apiResponse, nil +} diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index 9d81a17faa..4881f724e3 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -26,6 +26,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/parallel" + "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/operator/lib/routines" "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/schema" @@ -132,34 +133,27 @@ func getTrafficSplitterDestinations(trafficSplitter *spec.API) 
[]k8s.Destination // GetAllAPIs returns a list of metadata, in the form of schema.APIResponse, about all the created traffic splitter APIs func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schema.APIResponse, error) { - var ( - apiNames []string - apiIDs []string - trafficSplitters []schema.APIResponse - ) + var trafficSplitters []schema.APIResponse + for i := range virtualServices { + apiName := virtualServices[i].Labels["apiName"] - for _, virtualService := range virtualServices { - if virtualService.Labels["apiKind"] == userconfig.TrafficSplitterKind.String() { - apiNames = append(apiNames, virtualService.Labels["apiName"]) - apiIDs = append(apiIDs, virtualService.Labels["apiID"]) + metadata, err := spec.MetadataFromVirtualService(&virtualServices[i]) + if err != nil { + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } - } - apis, err := operator.DownloadAPISpecs(apiNames, apiIDs) - if err != nil { - return nil, err - } + if metadata.Kind != userconfig.TrafficSplitterKind { + continue + } - for i := range apis { - trafficSplitter := apis[i] - endpoint, err := operator.APIEndpoint(&trafficSplitter) + targets, err := userconfig.TrafficSplitterTargetsFromAnnotations(&virtualServices[i]) if err != nil { - return nil, err + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } trafficSplitters = append(trafficSplitters, schema.APIResponse{ - Spec: trafficSplitter, - Endpoint: endpoint, + Metadata: metadata, + NumTrafficSplitterTargets: pointer.Int32(targets), }) } @@ -168,7 +162,12 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schem // GetAPIByName retrieves the metadata, in the form of schema.APIResponse, of a single traffic splitter API func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - api, err := operator.DownloadAPISpec(deployedResource.Name, deployedResource.VirtualService.Labels["apiID"]) + metadata, err := 
spec.MetadataFromVirtualService(deployedResource.VirtualService) + if err != nil { + return nil, err + } + + api, err := operator.DownloadAPISpec(deployedResource.Name, metadata.APIID) if err != nil { return nil, err } @@ -180,8 +179,9 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, - Endpoint: endpoint, + Spec: api, + Metadata: metadata, + Endpoint: &endpoint, }, }, nil } diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index eff68701ee..703b794483 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -50,19 +50,22 @@ type NodeInfo struct { } type DeployResult struct { - API *APIResponse `json:"api"` - Message string `json:"message"` - Error string `json:"error"` + API *APIResponse `json:"api" yaml:"api"` + Message string `json:"message" yaml:"message"` + Error string `json:"error" yaml:"error"` } type APIResponse struct { - Spec spec.API `json:"spec"` - Status *status.Status `json:"status,omitempty"` - Endpoint string `json:"endpoint"` - DashboardURL *string `json:"dashboard_url,omitempty"` - BatchJobStatuses []status.BatchJobStatus `json:"batch_job_statuses,omitempty"` - TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty"` - APIVersions []APIVersion `json:"api_versions,omitempty"` + Spec *spec.API `json:"spec,omitempty" yaml:"spec,omitempty"` + Metadata *spec.Metadata `json:"metadata,omitempty" yaml:"metadata,omitempty"` + Status *status.Status `json:"status,omitempty" yaml:"status,omitempty"` + ReplicaCounts *status.ReplicaCounts `json:"replica_counts,omitempty" yaml:"replica_counts,omitempty"` + NumTrafficSplitterTargets *int32 `json:"num_traffic_splitter_targets,omitempty" yaml:"num_traffic_splitter_targets,omitempty"` + Endpoint *string `json:"endpoint,omitempty" yaml:"endpoint,omitempty"` + DashboardURL *string `json:"dashboard_url,omitempty" yaml:"dashboard_url,omitempty"` + BatchJobStatuses 
[]status.BatchJobStatus `json:"batch_job_statuses,omitempty" yaml:"batch_job_statuses,omitempty"` + TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty" yaml:"task_job_statuses,omitempty"` + APIVersions []APIVersion `json:"api_versions,omitempty" yaml:"api_versions,omitempty"` } type LogResponse struct { @@ -70,16 +73,16 @@ type LogResponse struct { } type BatchJobResponse struct { - APISpec spec.API `json:"api_spec"` - JobStatus status.BatchJobStatus `json:"job_status"` - Metrics *metrics.BatchMetrics `json:"metrics,omitempty"` - Endpoint string `json:"endpoint"` + APISpec spec.API `json:"api_spec" yaml:"api_spec"` + JobStatus status.BatchJobStatus `json:"job_status" yaml:"job_status"` + Metrics *metrics.BatchMetrics `json:"metrics,omitempty" yaml:"metrics,omitempty"` + Endpoint string `json:"endpoint" yaml:"endpoint"` } type TaskJobResponse struct { - APISpec spec.API `json:"api_spec"` - JobStatus status.TaskJobStatus `json:"job_status"` - Endpoint string `json:"endpoint"` + APISpec spec.API `json:"api_spec" yaml:"api_spec"` + JobStatus status.TaskJobStatus `json:"job_status" yaml:"job_status"` + Endpoint string `json:"endpoint" yaml:"endpoint"` } type DeleteResponse struct { @@ -96,8 +99,8 @@ type ErrorResponse struct { } type APIVersion struct { - APIID string `json:"api_id"` - LastUpdated int64 `json:"last_updated"` + APIID string `json:"api_id" yaml:"api_id"` + LastUpdated int64 `json:"last_updated" yaml:"last_updated"` } type VerifyCortexResponse struct{} diff --git a/pkg/types/spec/api.go b/pkg/types/spec/api.go index e181a0ffab..3361114078 100644 --- a/pkg/types/spec/api.go +++ b/pkg/types/spec/api.go @@ -30,20 +30,62 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/hash" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/types/userconfig" + istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" + kapps "k8s.io/api/apps/v1" ) type API struct { *userconfig.API - ID string `json:"id"` - 
SpecID string `json:"spec_id"` - PodID string `json:"pod_id"` - DeploymentID string `json:"deployment_id"` + ID string `json:"id" yaml:"id"` + SpecID string `json:"spec_id" yaml:"spec_id"` + PodID string `json:"pod_id" yaml:"pod_id"` + DeploymentID string `json:"deployment_id" yaml:"deployment_id"` - Key string `json:"key"` + Key string `json:"key" yaml:"key"` - InitialDeploymentTime int64 `json:"initial_deployment_time"` - LastUpdated int64 `json:"last_updated"` - MetadataRoot string `json:"metadata_root"` + InitialDeploymentTime int64 `json:"initial_deployment_time" yaml:"initial_deployment_time"` + LastUpdated int64 `json:"last_updated" yaml:"last_updated"` + MetadataRoot string `json:"metadata_root" yaml:"metadata_root"` +} + +type Metadata struct { + *userconfig.Resource + APIID string `json:"id" yaml:"id"` + PodID string `json:"pod_id,omitempty" yaml:"pod_id,omitempty"` + DeploymentID string `json:"deployment_id,omitempty" yaml:"deployment_id,omitempty"` + LastUpdated int64 `json:"last_updated" yaml:"last_updated"` +} + +func MetadataFromDeployment(deployment *kapps.Deployment) (*Metadata, error) { + lastUpdated, err := TimeFromAPIID(deployment.Labels["apiID"]) + if err != nil { + return nil, err + } + return &Metadata{ + Resource: &userconfig.Resource{ + Name: deployment.Labels["apiName"], + Kind: userconfig.KindFromString(deployment.Labels["apiKind"]), + }, + APIID: deployment.Labels["apiID"], + DeploymentID: deployment.Labels["deploymentID"], + LastUpdated: lastUpdated.Unix(), + }, nil +} + +func MetadataFromVirtualService(vs *istioclientnetworking.VirtualService) (*Metadata, error) { + lastUpdated, err := TimeFromAPIID(vs.Labels["apiID"]) + if err != nil { + return nil, err + } + return &Metadata{ + Resource: &userconfig.Resource{ + Name: vs.Labels["apiName"], + Kind: userconfig.KindFromString(vs.Labels["apiKind"]), + }, + APIID: vs.Labels["apiID"], + DeploymentID: vs.Labels["deploymentID"], + LastUpdated: lastUpdated.Unix(), + }, nil } /* diff --git 
a/pkg/types/spec/job.go b/pkg/types/spec/job.go index 784fb4f199..d6c6cb354d 100644 --- a/pkg/types/spec/job.go +++ b/pkg/types/spec/job.go @@ -32,9 +32,9 @@ const ( ) type JobKey struct { - ID string `json:"job_id"` - APIName string `json:"api_name"` - Kind userconfig.Kind `json:"kind"` + ID string `json:"job_id" yaml:"job_id"` + APIName string `json:"api_name" yaml:"api_name"` + Kind userconfig.Kind `json:"kind" yaml:"kind"` } func (j JobKey) UserString() string { @@ -56,39 +56,39 @@ func (j JobKey) K8sName() string { } type SQSDeadLetterQueue struct { - ARN string `json:"arn"` - MaxReceiveCount int `json:"max_receive_count"` + ARN string `json:"arn" yaml:"arn"` + MaxReceiveCount int `json:"max_receive_count" yaml:"max_receive_count"` } type RuntimeBatchJobConfig struct { - Workers int `json:"workers"` - SQSDeadLetterQueue *SQSDeadLetterQueue `json:"sqs_dead_letter_queue"` - Config map[string]interface{} `json:"config"` - Timeout *int `json:"timeout"` + Workers int `json:"workers" yaml:"workers"` + SQSDeadLetterQueue *SQSDeadLetterQueue `json:"sqs_dead_letter_queue" yaml:"sqs_dead_letter_queue"` + Config map[string]interface{} `json:"config" yaml:"config"` + Timeout *int `json:"timeout" yaml:"timeout"` } type RuntimeTaskJobConfig struct { - Workers int `json:"workers"` - Config map[string]interface{} `json:"config"` - Timeout *int `json:"timeout"` + Workers int `json:"workers" yaml:"workers"` + Config map[string]interface{} `json:"config" yaml:"config"` + Timeout *int `json:"timeout" yaml:"timeout"` } type BatchJob struct { JobKey RuntimeBatchJobConfig - APIID string `json:"api_id"` - SQSUrl string `json:"sqs_url"` - TotalBatchCount int `json:"total_batch_count,omitempty"` - StartTime time.Time `json:"start_time,omitempty"` + APIID string `json:"api_id" yaml:"api_id"` + SQSUrl string `json:"sqs_url" yaml:"sqs_url"` + TotalBatchCount int `json:"total_batch_count,omitempty" yaml:"total_batch_count,omitempty"` + StartTime time.Time `json:"start_time,omitempty" 
yaml:"start_time,omitempty"` } type TaskJob struct { JobKey RuntimeTaskJobConfig - APIID string `json:"api_id"` - SpecID string `json:"spec_id"` - PodID string `json:"pod_id"` - StartTime time.Time `json:"start_time"` + APIID string `json:"api_id" yaml:"api_id"` + SpecID string `json:"spec_id" yaml:"spec_id"` + PodID string `json:"pod_id" yaml:"pod_id"` + StartTime time.Time `json:"start_time" yaml:"start_time"` } // e.g. //jobs/// diff --git a/pkg/types/status/code.go b/pkg/types/status/code.go deleted file mode 100644 index 11d9c002ea..0000000000 --- a/pkg/types/status/code.go +++ /dev/null @@ -1,101 +0,0 @@ -/* -Copyright 2021 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package status - -// +kubebuilder:validation:Type=string -type Code int - -const ( - Unknown Code = iota - Pending - Stalled - Error - ErrorImagePull - OOM - Live - Updating -) - -var _codes = []string{ - "unknown", - "pending", - "stalled", - "error", - "error_image_pull", - "oom", - "live", - "updating", -} - -var _ = [1]int{}[int(Updating)-(len(_codes)-1)] // Ensure list length matches - -var _codeMessages = []string{ - "unknown", // Unknown - "pending", // Pending - "compute unavailable", // Stalled - "error", // Error - "error (image pull)", // Live - "error (out of memory)", // OOM - "live", // Live - "updating", // Updating -} - -var _ = [1]int{}[int(Updating)-(len(_codeMessages)-1)] // Ensure list length matches - -func (code Code) String() string { - if int(code) < 0 || int(code) >= len(_codes) { - return _codes[Unknown] - } - return _codes[code] -} - -func (code Code) Message() string { - if int(code) < 0 || int(code) >= len(_codeMessages) { - return _codeMessages[Unknown] - } - return _codeMessages[code] -} - -// MarshalText satisfies TextMarshaler -func (code Code) MarshalText() ([]byte, error) { - return []byte(code.String()), nil -} - -// UnmarshalText satisfies TextUnmarshaler -func (code *Code) UnmarshalText(text []byte) error { - enum := string(text) - for i := 0; i < len(_codes); i++ { - if enum == _codes[i] { - *code = Code(i) - return nil - } - } - - *code = Unknown - return nil -} - -// UnmarshalBinary satisfies BinaryUnmarshaler -// Needed for msgpack -func (code *Code) UnmarshalBinary(data []byte) error { - return code.UnmarshalText(data) -} - -// MarshalBinary satisfies BinaryMarshaler -func (code Code) MarshalBinary() ([]byte, error) { - return []byte(code.String()), nil -} diff --git a/pkg/types/status/job_status.go b/pkg/types/status/job_status.go index eb299831ba..f106d051a7 100644 --- a/pkg/types/status/job_status.go +++ b/pkg/types/status/job_status.go @@ -24,15 +24,15 @@ import ( type BatchJobStatus struct { spec.BatchJob - 
Status JobCode `json:"status"` - EndTime *time.Time `json:"end_time,omitempty"` - BatchesInQueue int `json:"batches_in_queue"` - WorkerCounts *WorkerCounts `json:"worker_counts,omitempty"` + Status JobCode `json:"status" yaml:"status"` + EndTime *time.Time `json:"end_time,omitempty" yaml:"end_time,omitempty"` + BatchesInQueue int `json:"batches_in_queue" yaml:"batches_in_queue"` + WorkerCounts *WorkerCounts `json:"worker_counts,omitempty" yaml:"worker_counts,omitempty"` } type TaskJobStatus struct { spec.TaskJob - EndTime *time.Time `json:"end_time"` - Status JobCode `json:"status"` - WorkerCounts *WorkerCounts `json:"worker_counts"` + EndTime *time.Time `json:"end_time,omitempty" yaml:"end_time,omitempty"` + Status JobCode `json:"status" yaml:"status"` + WorkerCounts *WorkerCounts `json:"worker_counts,omitempty" yaml:"worker_counts,omitempty"` } diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index b1ef426504..6c61b3ac6b 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -16,48 +16,118 @@ limitations under the License. 
package status +import ( + kapps "k8s.io/api/apps/v1" +) + type Status struct { - APIName string `json:"api_name"` - APIID string `json:"api_id"` - Code Code `json:"status_code"` - ReplicaCounts `json:"replica_counts"` + Ready int32 `json:"ready" yaml:"ready"` // deployment-reported number of ready replicas (latest + out of date) + Requested int32 `json:"requested" yaml:"requested"` // deployment-reported number of requested replicas + UpToDate int32 `json:"up_to_date" yaml:"up_to_date"` // deployment-reported number of up-to-date replicas (in whichever phase they are found in) } type ReplicaCounts struct { - Updated SubReplicaCounts `json:"updated,omitempty"` - Stale SubReplicaCounts `json:"stale,omitempty"` - Requested int32 `json:"requested,omitempty"` + Status + Pending int32 `json:"pending" yaml:"pending"` + Creating int32 `json:"creating" yaml:"creating"` + NotReady int32 `json:"not_ready" yaml:"not_ready"` + ReadyOutOfDate int32 `json:"ready_out_of_date" yaml:"ready_out_of_date"` + ErrImagePull int32 `json:"err_image_pull" yaml:"err_image_pull"` + Terminating int32 `json:"terminating" yaml:"terminating"` // includes up-to-date and out-of-date pods + Failed int32 `json:"failed" yaml:"failed"` + Killed int32 `json:"killed" yaml:"killed"` + KilledOOM int32 `json:"killed_oom" yaml:"killed_oom"` + Stalled int32 `json:"stalled" yaml:"stalled"` // pending for a long time + Unknown int32 `json:"unknown" yaml:"unknown"` } -type SubReplicaCounts struct { - Pending int32 `json:"pending,omitempty"` - Initializing int32 `json:"initializing,omitempty"` - Ready int32 `json:"ready"` - NotReady int32 `json:"not_ready,omitempty"` - ErrImagePull int32 `json:"err_image_pull,omitempty"` - Terminating int32 `json:"terminating,omitempty"` - Failed int32 `json:"failed,omitempty"` - Killed int32 `json:"killed,omitempty"` - KilledOOM int32 `json:"killed_oom,omitempty"` - Stalled int32 `json:"stalled,omitempty"` // pending for a long time - Unknown int32 `json:"unknown,omitempty"` 
+type ReplicaCountType string + +const ( + ReplicaCountUpToDate ReplicaCountType = "UpToDate" // total up-to-date pods + ReplicaCountRequested ReplicaCountType = "Requested" // requested number of replicas (for up-to-date pods) + ReplicaCountPending ReplicaCountType = "Pending" // pods that are in the pending state (for up-to-date pods) + ReplicaCountCreating ReplicaCountType = "Creating" // pods that that have their init/non-init containers in the process of being created (for up-to-date pods) + ReplicaCountNotReady ReplicaCountType = "NotReady" // pods that are not passing the readiness checks (for up-to-date pods) + ReplicaCountReady ReplicaCountType = "Ready" // pods that are passing the readiness checks (for up-to-date pods) + ReplicaCountReadyOutOfDate ReplicaCountType = "ReadyOutOfDate" // pods that are passing the readiness checks (for out-of-date pods) + ReplicaCountErrImagePull ReplicaCountType = "ErrImagePull" // pods that couldn't pull the containers' images (for up-to-date pods) + ReplicaCountTerminating ReplicaCountType = "Terminating" // pods that are in a terminating state (for up-to-date pods) + ReplicaCountFailed ReplicaCountType = "Failed" // pods that have had their containers erroring (for up-to-date pods) + ReplicaCountKilled ReplicaCountType = "Killed" // pods that have had their container processes killed (for up-to-date pods) + ReplicaCountKilledOOM ReplicaCountType = "KilledOOM" // pods that have had their containers OOM (for up-to-date pods) + ReplicaCountStalled ReplicaCountType = "Stalled" // pods that have been in a pending state for more than 15 mins (for up-to-date pods) + ReplicaCountUnknown ReplicaCountType = "Unknown" // pods that are in an unknown state (for up-to-date pods) +) + +var ReplicaCountTypes []ReplicaCountType = []ReplicaCountType{ + ReplicaCountRequested, ReplicaCountPending, ReplicaCountCreating, + ReplicaCountNotReady, ReplicaCountReady, ReplicaCountReadyOutOfDate, + ReplicaCountErrImagePull, 
ReplicaCountTerminating, ReplicaCountFailed, + ReplicaCountKilled, ReplicaCountKilledOOM, ReplicaCountStalled, + ReplicaCountUnknown, ReplicaCountUpToDate, } // Worker counts don't have as many failure variations because Jobs clean up dead pods, so counting different failure scenarios isn't interesting type WorkerCounts struct { - Pending int32 `json:"pending,omitempty"` - Initializing int32 `json:"initializing,omitempty"` - Running int32 `json:"running,omitempty"` - Succeeded int32 `json:"succeeded,omitempty"` - Failed int32 `json:"failed,omitempty"` - Stalled int32 `json:"stalled,omitempty"` // pending for a long time - Unknown int32 `json:"unknown,omitempty"` + Pending int32 `json:"pending,omitempty" yaml:"pending,omitempty"` + Creating int32 `json:"creating,omitempty" yaml:"creating,omitempty"` + NotReady int32 `json:"not_ready,omitempty" yaml:"not_ready,omitempty"` + Ready int32 `json:"ready,omitempty" yaml:"ready,omitempty"` + Succeeded int32 `json:"succeeded,omitempty" yaml:"succeeded,omitempty"` + ErrImagePull int32 `json:"err_image_pull,omitempty" yaml:"err_image_pull,omitempty"` + Terminating int32 `json:"terminating,omitempty" yaml:"terminating,omitempty"` + Failed int32 `json:"failed,omitempty" yaml:"failed,omitempty"` + Killed int32 `json:"killed,omitempty" yaml:"killed,omitempty"` + KilledOOM int32 `json:"killed_oom,omitempty" yaml:"killed_oom,omitempty"` + Stalled int32 `json:"stalled,omitempty" yaml:"stalled,omitempty"` // pending for a long time + Unknown int32 `json:"unknown,omitempty" yaml:"unknown,omitempty"` +} + +func FromDeployment(deployment *kapps.Deployment) *Status { + var requested int32 + if deployment.Spec.Replicas != nil { + requested = *deployment.Spec.Replicas + } + return &Status{ + Ready: deployment.Status.ReadyReplicas, + Requested: requested, + UpToDate: deployment.Status.UpdatedReplicas, + } } -func (status *Status) Message() string { - return status.Code.Message() +func (counts *ReplicaCounts) GetCountBy(replicaType 
ReplicaCountType) int32 { + switch replicaType { + case ReplicaCountUpToDate: + return counts.UpToDate + case ReplicaCountRequested: + return counts.Requested + case ReplicaCountPending: + return counts.Pending + case ReplicaCountCreating: + return counts.Creating + case ReplicaCountNotReady: + return counts.NotReady + case ReplicaCountReady: + return counts.Ready + case ReplicaCountReadyOutOfDate: + return counts.ReadyOutOfDate + case ReplicaCountErrImagePull: + return counts.ErrImagePull + case ReplicaCountTerminating: + return counts.Terminating + case ReplicaCountFailed: + return counts.Failed + case ReplicaCountKilled: + return counts.Killed + case ReplicaCountKilledOOM: + return counts.KilledOOM + case ReplicaCountStalled: + return counts.Stalled + } + return counts.Unknown } -func (src *SubReplicaCounts) TotalFailed() int32 { - return src.Failed + src.ErrImagePull + src.Killed + src.KilledOOM + src.Stalled +func (counts *ReplicaCounts) TotalFailed() int32 { + return counts.ErrImagePull + counts.Failed + counts.Killed + counts.KilledOOM + counts.Unknown } diff --git a/pkg/types/userconfig/api.go b/pkg/types/userconfig/api.go index c2f8585941..c524c599e0 100644 --- a/pkg/types/userconfig/api.go +++ b/pkg/types/userconfig/api.go @@ -155,6 +155,10 @@ func IdentifyAPI(filePath string, name string, kind Kind, index int) string { func (api *API) ToK8sAnnotations() map[string]string { annotations := map[string]string{} + if len(api.APIs) > 0 { + annotations[NumTrafficSplitterTargetsAnnotationKey] = s.Int32(int32(len(api.APIs))) + } + if api.Pod != nil && api.Kind == RealtimeAPIKind { annotations[MaxConcurrencyAnnotationKey] = s.Int64(api.Pod.MaxConcurrency) annotations[MaxQueueLengthAnnotationKey] = s.Int64(api.Pod.MaxQueueLength) @@ -245,6 +249,36 @@ func AutoscalingFromAnnotations(k8sObj kmeta.Object) (*Autoscaling, error) { return &a, nil } +func TrafficSplitterTargetsFromAnnotations(k8sObj kmeta.Object) (int32, error) { + targets, err := 
k8s.ParseInt32Annotation(k8sObj, NumTrafficSplitterTargetsAnnotationKey) + if err != nil { + return 0, err + } + return targets, nil +} + +func EndpointFromAnnotation(k8sObj kmeta.Object) (string, error) { + endpoint, err := k8s.GetAnnotation(k8sObj, EndpointAnnotationKey) + if err != nil { + return "", err + } + return endpoint, nil +} + +func ConcurrencyFromAnnotations(k8sObj kmeta.Object) (int, int, error) { + maxQueueLength, err := k8s.ParseIntAnnotation(k8sObj, MaxQueueLengthAnnotationKey) + if err != nil { + return 0, 0, err + } + + maxConcurrency, err := k8s.ParseIntAnnotation(k8sObj, MaxConcurrencyAnnotationKey) + if err != nil { + return 0, 0, err + } + + return maxQueueLength, maxConcurrency, nil +} + func (api *API) UserStr() string { var sb strings.Builder sb.WriteString(fmt.Sprintf("%s: %s\n", NameKey, api.Name)) diff --git a/pkg/types/userconfig/config_key.go b/pkg/types/userconfig/config_key.go index 826e144b05..5cbe3b2dda 100644 --- a/pkg/types/userconfig/config_key.go +++ b/pkg/types/userconfig/config_key.go @@ -91,6 +91,7 @@ const ( EndpointAnnotationKey = "networking.cortex.dev/endpoint" MaxConcurrencyAnnotationKey = "pod.cortex.dev/max-concurrency" MaxQueueLengthAnnotationKey = "pod.cortex.dev/max-queue-length" + NumTrafficSplitterTargetsAnnotationKey = "apis.cortex.dev/traffic-splitter-targets" MinReplicasAnnotationKey = "autoscaling.cortex.dev/min-replicas" MaxReplicasAnnotationKey = "autoscaling.cortex.dev/max-replicas" TargetInFlightAnnotationKey = "autoscaling.cortex.dev/target-in-flight" From 69f0fe897383354c73d217d89603146985766483 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Thu, 29 Jul 2021 12:28:15 +0200 Subject: [PATCH 27/42] Remove unnecessary annotations and add descriptions to the api status struct --- .../apis/serverless/v1alpha1/realtimeapi_types.go | 11 +++++++---- .../crd/bases/serverless.cortex.dev_realtimeapis.yaml | 4 ++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git 
a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 693557818e..704f3b6f7f 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -217,16 +217,19 @@ type NetworkingSpec struct { // RealtimeAPIStatus defines the observed state of RealtimeAPI type RealtimeAPIStatus struct { // +kubebuilder:validation:Optional - // +kubebuilder:validation:Type=integer + // Number of ready pods Ready int32 `json:"ready"` + // +kubebuilder:validation:Optional - // +kubebuilder:validation:Type=integer + // Number of requested pods Requested int32 `json:"requested"` + // +kubebuilder:validation:Optional - // +kubebuilder:validation:Type=integer + // Number of pods with the last requested spec UpToDate int32 `json:"up_to_date"` + // +kubebuilder:validation:Optional - // +kubebuilder:validation:Type=string + // URL of the deployed API Endpoint string `json:"endpoint,omitempty"` } diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 58d12f66df..21dda24d33 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -603,14 +603,18 @@ spec: description: RealtimeAPIStatus defines the observed state of RealtimeAPI properties: endpoint: + description: URL of the deployed API type: string ready: + description: Number of ready pods format: int32 type: integer requested: + description: Number of requested pods format: int32 type: integer up_to_date: + description: Number of pods with the last requested spec format: int32 type: integer type: object From 128cf5b24d47f98fa98e13853ed49e4ee558b79d Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Thu, 29 Jul 2021 14:31:28 +0200 Subject: [PATCH 28/42] Fix deep equal comparison --- 
pkg/operator/resources/realtimeapi/api.go | 216 +--------------- pkg/operator/resources/realtimeapi/helpers.go | 241 ++++++++++++++++++ 2 files changed, 244 insertions(+), 213 deletions(-) create mode 100644 pkg/operator/resources/realtimeapi/helpers.go diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 096f83348c..2abe49403b 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -19,8 +19,6 @@ package realtimeapi import ( "context" "fmt" - "path/filepath" - "reflect" "sort" "time" @@ -28,7 +26,6 @@ import ( "github.com/cortexlabs/cortex/pkg/consts" serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/maps" "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/lib/pointer" @@ -38,22 +35,14 @@ import ( "github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" - "github.com/cortexlabs/cortex/pkg/workloads" + "github.com/google/go-cmp/cmp" kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" - kresource "k8s.io/apimachinery/pkg/api/resource" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" ktypes "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/intstr" "sigs.k8s.io/controller-runtime/pkg/client" ) -const _realtimeDashboardUID = "realtimeapi" - -func generateDeploymentID() string { - return k8s.RandomName()[:10] -} - func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) { ctx := context.Background() var api serverless.RealtimeAPI @@ -103,7 +92,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) MetadataRoot: spec.MetadataRoot(apiConfig.Name, config.ClusterConfig.ClusterUID), } - if !reflect.DeepEqual(api.Spec, 
desiredAPI.Spec) || force { + if !cmp.Equal(api.Spec, desiredAPI.Spec) || force { api.Spec = desiredAPI.Spec api.Annotations = maps.MergeStrMapsString(api.Annotations, desiredAPI.Annotations) @@ -189,7 +178,7 @@ func GetAllAPIs() ([]schema.APIResponse, error) { } realtimeAPIs := make([]schema.APIResponse, len(apis.Items)) - mappedRealtimeAPIs := make(map[string]schema.APIResponse, 0) + mappedRealtimeAPIs := map[string]schema.APIResponse{} for i := range apis.Items { api := apis.Items[i] @@ -289,202 +278,3 @@ func DescribeAPIByName(apiName string) ([]schema.APIResponse, error) { }, }, nil } - -func getDashboardURL(apiName string) string { - loadBalancerURL, err := operator.LoadBalancerURL() - if err != nil { - return "" - } - - dashboardURL := fmt.Sprintf( - "%s/dashboard/d/%s/realtimeapi?orgId=1&refresh=30s&var-api_name=%s", - loadBalancerURL, _realtimeDashboardUID, apiName, - ) - - return dashboardURL -} - -// K8sResourceFromAPIConfig converts a cortex API config into a realtime API CRD resource -func K8sResourceFromAPIConfig(apiConfig userconfig.API) serverless.RealtimeAPI { - containers := make([]serverless.ContainerSpec, len(apiConfig.Pod.Containers)) - for i := range apiConfig.Pod.Containers { - container := apiConfig.Pod.Containers[i] - var env []kcore.EnvVar - for k, v := range container.Env { - env = append(env, kcore.EnvVar{ - Name: k, - Value: v, - }) - } - - var compute *serverless.ComputeSpec - if container.Compute != nil { - var cpu *kresource.Quantity - if container.Compute.CPU != nil { - cpu = &container.Compute.CPU.Quantity - } - var mem *kresource.Quantity - if container.Compute.Mem != nil { - mem = &container.Compute.Mem.Quantity - } - var shm *kresource.Quantity - if container.Compute.Shm != nil { - shm = &container.Compute.Shm.Quantity - } - - compute = &serverless.ComputeSpec{ - CPU: cpu, - GPU: container.Compute.GPU, - Inf: container.Compute.Inf, - Mem: mem, - Shm: shm, - } - } - - containers[i] = serverless.ContainerSpec{ - Name: 
container.Name, - Image: container.Image, - Command: container.Command, - Args: container.Args, - Env: env, - Compute: compute, - ReadinessProbe: workloads.GetProbeSpec(container.ReadinessProbe), - LivenessProbe: workloads.GetProbeSpec(container.LivenessProbe), - } - } - - api := serverless.RealtimeAPI{ - ObjectMeta: kmeta.ObjectMeta{ - Name: apiConfig.Name, - Namespace: consts.DefaultNamespace, - }, - Spec: serverless.RealtimeAPISpec{ - Pod: serverless.PodSpec{ - Port: *apiConfig.Pod.Port, - MaxConcurrency: int32(apiConfig.Pod.MaxConcurrency), - MaxQueueLength: int32(apiConfig.Pod.MaxQueueLength), - Replicas: apiConfig.Autoscaling.InitReplicas, - Containers: containers, - }, - Autoscaling: serverless.AutoscalingSpec{ - MinReplicas: apiConfig.Autoscaling.MinReplicas, - MaxReplicas: apiConfig.Autoscaling.MaxReplicas, - TargetInFlight: fmt.Sprintf("%f", *apiConfig.Autoscaling.TargetInFlight), - Window: kmeta.Duration{Duration: apiConfig.Autoscaling.Window}, - DownscaleStabilizationPeriod: kmeta.Duration{Duration: apiConfig.Autoscaling.DownscaleStabilizationPeriod}, - UpscaleStabilizationPeriod: kmeta.Duration{Duration: apiConfig.Autoscaling.UpscaleStabilizationPeriod}, - MaxDownscaleFactor: fmt.Sprintf("%f", apiConfig.Autoscaling.MaxDownscaleFactor), - MaxUpscaleFactor: fmt.Sprintf("%f", apiConfig.Autoscaling.MaxUpscaleFactor), - DownscaleTolerance: fmt.Sprintf("%f", apiConfig.Autoscaling.DownscaleTolerance), - UpscaleTolerance: fmt.Sprintf("%f", apiConfig.Autoscaling.UpscaleTolerance), - }, - NodeGroups: apiConfig.NodeGroups, - UpdateStrategy: serverless.UpdateStrategySpec{ - MaxSurge: intstr.FromString(apiConfig.UpdateStrategy.MaxSurge), - MaxUnavailable: intstr.FromString(apiConfig.UpdateStrategy.MaxUnavailable), - }, - Networking: serverless.NetworkingSpec{ - Endpoint: *apiConfig.Networking.Endpoint, - }, - }, - } - - deploymentID, podID, specID, apiID := api.GetOrCreateAPIIDs() - api.Annotations = map[string]string{ - "cortex.dev/deployment-id": deploymentID, - 
"cortex.dev/spec-id": specID, - "cortex.dev/pod-id": podID, - "cortex.dev/api-id": apiID, - } - - return api -} - -func deleteBucketResources(apiName string) error { - prefix := filepath.Join(config.ClusterConfig.ClusterUID, "apis", apiName) - return config.AWS.DeleteS3Dir(config.ClusterConfig.Bucket, prefix, true) -} - -func metadataFromRealtimeAPI(sv *serverless.RealtimeAPI) (*spec.Metadata, error) { - lastUpdated, err := spec.TimeFromAPIID(sv.Annotations["cortex.dev/api-id"]) - if err != nil { - return nil, err - } - return &spec.Metadata{ - Resource: &userconfig.Resource{ - Name: sv.Name, - Kind: userconfig.RealtimeAPIKind, - }, - APIID: sv.Annotations["cortex.dev/api-id"], - DeploymentID: sv.Annotations["cortex.dev/deployment-id"], - LastUpdated: lastUpdated.Unix(), - }, nil -} - -func getReplicaCounts(pods []kcore.Pod, metadata *spec.Metadata) status.ReplicaCounts { - counts := status.ReplicaCounts{} - - for i := range pods { - pod := pods[i] - if pod.Labels["apiName"] != metadata.Name { - continue - } - addPodToReplicaCounts(&pods[i], metadata, &counts) - } - - return counts -} - -func addPodToReplicaCounts(pod *kcore.Pod, metadata *spec.Metadata, counts *status.ReplicaCounts) { - latest := false - if isPodSpecLatest(pod, metadata) { - latest = true - } - - isPodReady := k8s.IsPodReady(pod) - if latest && isPodReady { - counts.Ready++ - return - } else if !latest && isPodReady { - counts.ReadyOutOfDate++ - return - } - - podStatus := k8s.GetPodStatus(pod) - - if podStatus == k8s.PodStatusTerminating { - counts.Terminating++ - return - } - - if !latest { - return - } - - switch podStatus { - case k8s.PodStatusPending: - counts.Pending++ - case k8s.PodStatusStalled: - counts.Stalled++ - case k8s.PodStatusCreating: - counts.Creating++ - case k8s.PodStatusReady: - counts.Ready++ - case k8s.PodStatusNotReady: - counts.NotReady++ - case k8s.PodStatusErrImagePull: - counts.ErrImagePull++ - case k8s.PodStatusFailed: - counts.Failed++ - case k8s.PodStatusKilled: - 
counts.Killed++ - case k8s.PodStatusKilledOOM: - counts.KilledOOM++ - case k8s.PodStatusUnknown: - counts.Unknown++ - } -} - -func isPodSpecLatest(pod *kcore.Pod, metadata *spec.Metadata) bool { - return metadata.APIID == pod.Labels["apiID"] -} diff --git a/pkg/operator/resources/realtimeapi/helpers.go b/pkg/operator/resources/realtimeapi/helpers.go new file mode 100644 index 0000000000..deb503edb3 --- /dev/null +++ b/pkg/operator/resources/realtimeapi/helpers.go @@ -0,0 +1,241 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package realtimeapi + +import ( + "fmt" + "path/filepath" + + "github.com/cortexlabs/cortex/pkg/config" + "github.com/cortexlabs/cortex/pkg/consts" + "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" + "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/cortexlabs/cortex/pkg/workloads" + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + v12 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +const _realtimeDashboardUID = "realtimeapi" + +func generateDeploymentID() string { + return k8s.RandomName()[:10] +} + +func getDashboardURL(apiName string) string { + loadBalancerURL, err := operator.LoadBalancerURL() + if err != nil { + return "" + } + + dashboardURL := fmt.Sprintf( + "%s/dashboard/d/%s/realtimeapi?orgId=1&refresh=30s&var-api_name=%s", + loadBalancerURL, _realtimeDashboardUID, apiName, + ) + + return dashboardURL +} + +// K8sResourceFromAPIConfig converts a cortex API config into a realtime API CRD resource +func K8sResourceFromAPIConfig(apiConfig userconfig.API) v1alpha1.RealtimeAPI { + containers := make([]v1alpha1.ContainerSpec, len(apiConfig.Pod.Containers)) + for i := range apiConfig.Pod.Containers { + container := apiConfig.Pod.Containers[i] + var env []v1.EnvVar + for k, v := range container.Env { + env = append(env, v1.EnvVar{ + Name: k, + Value: v, + }) + } + + var compute *v1alpha1.ComputeSpec + if container.Compute != nil { + var cpu *resource.Quantity + if container.Compute.CPU != nil { + cpu = &container.Compute.CPU.Quantity + } + var mem *resource.Quantity + if container.Compute.Mem != nil { + mem = &container.Compute.Mem.Quantity + } + var shm *resource.Quantity + if container.Compute.Shm != nil { + shm = &container.Compute.Shm.Quantity + } + + compute = 
&v1alpha1.ComputeSpec{ + CPU: cpu, + GPU: container.Compute.GPU, + Inf: container.Compute.Inf, + Mem: mem, + Shm: shm, + } + } + + containers[i] = v1alpha1.ContainerSpec{ + Name: container.Name, + Image: container.Image, + Command: container.Command, + Args: container.Args, + Env: env, + Compute: compute, + ReadinessProbe: workloads.GetProbeSpec(container.ReadinessProbe), + LivenessProbe: workloads.GetProbeSpec(container.LivenessProbe), + } + } + + api := v1alpha1.RealtimeAPI{ + ObjectMeta: v12.ObjectMeta{ + Name: apiConfig.Name, + Namespace: consts.DefaultNamespace, + }, + Spec: v1alpha1.RealtimeAPISpec{ + Pod: v1alpha1.PodSpec{ + Port: *apiConfig.Pod.Port, + MaxConcurrency: int32(apiConfig.Pod.MaxConcurrency), + MaxQueueLength: int32(apiConfig.Pod.MaxQueueLength), + Replicas: apiConfig.Autoscaling.InitReplicas, + Containers: containers, + }, + Autoscaling: v1alpha1.AutoscalingSpec{ + MinReplicas: apiConfig.Autoscaling.MinReplicas, + MaxReplicas: apiConfig.Autoscaling.MaxReplicas, + TargetInFlight: fmt.Sprintf("%f", *apiConfig.Autoscaling.TargetInFlight), + Window: v12.Duration{Duration: apiConfig.Autoscaling.Window}, + DownscaleStabilizationPeriod: v12.Duration{Duration: apiConfig.Autoscaling.DownscaleStabilizationPeriod}, + UpscaleStabilizationPeriod: v12.Duration{Duration: apiConfig.Autoscaling.UpscaleStabilizationPeriod}, + MaxDownscaleFactor: fmt.Sprintf("%f", apiConfig.Autoscaling.MaxDownscaleFactor), + MaxUpscaleFactor: fmt.Sprintf("%f", apiConfig.Autoscaling.MaxUpscaleFactor), + DownscaleTolerance: fmt.Sprintf("%f", apiConfig.Autoscaling.DownscaleTolerance), + UpscaleTolerance: fmt.Sprintf("%f", apiConfig.Autoscaling.UpscaleTolerance), + }, + NodeGroups: apiConfig.NodeGroups, + UpdateStrategy: v1alpha1.UpdateStrategySpec{ + MaxSurge: intstr.FromString(apiConfig.UpdateStrategy.MaxSurge), + MaxUnavailable: intstr.FromString(apiConfig.UpdateStrategy.MaxUnavailable), + }, + Networking: v1alpha1.NetworkingSpec{ + Endpoint: *apiConfig.Networking.Endpoint, + }, + 
}, + } + + deploymentID, podID, specID, apiID := api.GetOrCreateAPIIDs() + api.Annotations = map[string]string{ + "cortex.dev/deployment-id": deploymentID, + "cortex.dev/spec-id": specID, + "cortex.dev/pod-id": podID, + "cortex.dev/api-id": apiID, + } + + return api +} + +func deleteBucketResources(apiName string) error { + prefix := filepath.Join(config.ClusterConfig.ClusterUID, "apis", apiName) + return config.AWS.DeleteS3Dir(config.ClusterConfig.Bucket, prefix, true) +} + +func metadataFromRealtimeAPI(sv *v1alpha1.RealtimeAPI) (*spec.Metadata, error) { + lastUpdated, err := spec.TimeFromAPIID(sv.Annotations["cortex.dev/api-id"]) + if err != nil { + return nil, err + } + return &spec.Metadata{ + Resource: &userconfig.Resource{ + Name: sv.Name, + Kind: userconfig.RealtimeAPIKind, + }, + APIID: sv.Annotations["cortex.dev/api-id"], + DeploymentID: sv.Annotations["cortex.dev/deployment-id"], + LastUpdated: lastUpdated.Unix(), + }, nil +} + +func getReplicaCounts(pods []v1.Pod, metadata *spec.Metadata) status.ReplicaCounts { + counts := status.ReplicaCounts{} + + for i := range pods { + pod := pods[i] + if pod.Labels["apiName"] != metadata.Name { + continue + } + addPodToReplicaCounts(&pods[i], metadata, &counts) + } + + return counts +} + +func addPodToReplicaCounts(pod *v1.Pod, metadata *spec.Metadata, counts *status.ReplicaCounts) { + latest := false + if isPodSpecLatest(pod, metadata) { + latest = true + } + + isPodReady := k8s.IsPodReady(pod) + if latest && isPodReady { + counts.Ready++ + return + } else if !latest && isPodReady { + counts.ReadyOutOfDate++ + return + } + + podStatus := k8s.GetPodStatus(pod) + + if podStatus == k8s.PodStatusTerminating { + counts.Terminating++ + return + } + + if !latest { + return + } + + switch podStatus { + case k8s.PodStatusPending: + counts.Pending++ + case k8s.PodStatusStalled: + counts.Stalled++ + case k8s.PodStatusCreating: + counts.Creating++ + case k8s.PodStatusReady: + counts.Ready++ + case k8s.PodStatusNotReady: + 
counts.NotReady++ + case k8s.PodStatusErrImagePull: + counts.ErrImagePull++ + case k8s.PodStatusFailed: + counts.Failed++ + case k8s.PodStatusKilled: + counts.Killed++ + case k8s.PodStatusKilledOOM: + counts.KilledOOM++ + case k8s.PodStatusUnknown: + counts.Unknown++ + } +} + +func isPodSpecLatest(pod *v1.Pod, metadata *spec.Metadata) bool { + return metadata.APIID == pod.Labels["apiID"] +} From c1df9b7ff12fd1866ec63130c87fb4439f661419 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Thu, 29 Jul 2021 16:40:48 +0200 Subject: [PATCH 29/42] Fix rolling update on autoscaling spec update --- .../serverless/v1alpha1/realtimeapi_types.go | 10 +++++----- .../serverless.cortex.dev_realtimeapis.yaml | 12 ++++++------ .../realtimeapi_controller_helpers.go | 17 +++++++++++------ pkg/operator/resources/realtimeapi/api.go | 6 +++--- pkg/operator/resources/realtimeapi/helpers.go | 15 ++++++++++++--- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 704f3b6f7f..e68b2c0629 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -33,6 +33,11 @@ import ( // RealtimeAPISpec defines the desired state of RealtimeAPI type RealtimeAPISpec struct { + // +kubebuilder:validation:Required + // +kubebuilder:default=1 + // Number of desired replicas + Replicas int32 `json:"replicas"` + // Pod configuration // +kubebuilder:validation:Required Pod PodSpec `json:"pod"` @@ -73,11 +78,6 @@ type PodSpec struct { // (beyond max_concurrency) before requests are rejected with error code 503 MaxQueueLength int32 `json:"max_queue_length"` - // +kubebuilder:validation:Required - // +kubebuilder:default=1 - // Number of desired replicas - Replicas int32 `json:"replicas"` - // +kubebuilder:validation:Required // Configurations for the containers to run Containers []ContainerSpec 
`json:"containers"` diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 21dda24d33..7346ec219d 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -556,18 +556,17 @@ spec: description: Port to which requests will be sent to format: int32 type: integer - replicas: - default: 1 - description: Number of desired replicas - format: int32 - type: integer required: - containers - max_concurrency - max_queue_length - port - - replicas type: object + replicas: + default: 1 + description: Number of desired replicas + format: int32 + type: integer update_strategy: default: max_surge: 25% @@ -598,6 +597,7 @@ spec: required: - networking - pod + - replicas type: object status: description: RealtimeAPIStatus defines the observed state of RealtimeAPI diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 64ec0cd000..67a3381ef3 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -92,7 +92,6 @@ func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, ap if err := ctrl.SetControllerReference(&api, &deployment, r.Scheme); err != nil { return err } - return nil }) if err != nil { @@ -177,7 +176,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka return *k8s.Deployment(&k8s.DeploymentSpec{ Name: workloads.K8sName(api.Name), - Replicas: api.Spec.Pod.Replicas, + Replicas: api.Spec.Replicas, MaxSurge: pointer.String(api.Spec.UpdateStrategy.MaxSurge.String()), MaxUnavailable: pointer.String(api.Spec.UpdateStrategy.MaxUnavailable.String()), Labels: map[string]string{ @@ -197,7 +196,6 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api 
serverless.RealtimeAPI) ka "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), "deploymentID": api.Annotations["cortex.dev/deployment-id"], - "apiID": api.Annotations["cortex.dev/api-id"], "cortex.dev/api": "true", }, Annotations: map[string]string{ @@ -248,7 +246,7 @@ func (r *RealtimeAPIReconciler) desiredService(api serverless.RealtimeAPI) kcore func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI) istioclientnetworking.VirtualService { var activatorWeight int32 - if api.Spec.Pod.Replicas == 0 { + if api.Spec.Replicas == 0 { activatorWeight = 100 } @@ -405,8 +403,15 @@ func (r *RealtimeAPIReconciler) proxyContainer(api serverless.RealtimeAPI) (kcor s.Int32(api.Spec.Pod.MaxQueueLength), }, Ports: []kcore.ContainerPort{ - {Name: consts.AdminPortName, ContainerPort: consts.AdminPortInt32}, - {ContainerPort: consts.ProxyPortInt32}, + { + Name: consts.AdminPortName, + ContainerPort: consts.AdminPortInt32, + Protocol: kcore.ProtocolTCP, + }, + { + ContainerPort: consts.ProxyPortInt32, + Protocol: kcore.ProtocolTCP, + }, }, Env: workloads.BaseEnvVars, EnvFrom: workloads.BaseClusterEnvVars(), diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 2abe49403b..6e78509274 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -52,7 +52,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) if err != nil { if kerrors.IsNotFound(err) { if kerrors.IsNotFound(err) { - api = K8sResourceFromAPIConfig(*apiConfig) + api = k8sResourceFromAPIConfig(*apiConfig, nil) if err = config.K8s.Create(ctx, &api); err != nil { return nil, "", errors.Wrap(err, "failed to create realtime api resource") } @@ -79,7 +79,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) return nil, "", errors.Wrap(err, "failed to get realtime api resource") } - desiredAPI := 
K8sResourceFromAPIConfig(*apiConfig) + desiredAPI := k8sResourceFromAPIConfig(*apiConfig, &api) apiSpec := &spec.API{ API: apiConfig, @@ -256,7 +256,7 @@ func DescribeAPIByName(apiName string) ([]schema.APIResponse, error) { } var podList kcore.PodList - if err := config.K8s.List(ctx, &podList, client.MatchingLabels{ + if err = config.K8s.List(ctx, &podList, client.MatchingLabels{ "apiName": metadata.Name, "apiKind": userconfig.RealtimeAPIKind.String(), }); err != nil { diff --git a/pkg/operator/resources/realtimeapi/helpers.go b/pkg/operator/resources/realtimeapi/helpers.go index deb503edb3..938fccc99a 100644 --- a/pkg/operator/resources/realtimeapi/helpers.go +++ b/pkg/operator/resources/realtimeapi/helpers.go @@ -23,6 +23,7 @@ import ( "github.com/cortexlabs/cortex/pkg/config" "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/types/spec" @@ -55,8 +56,8 @@ func getDashboardURL(apiName string) string { return dashboardURL } -// K8sResourceFromAPIConfig converts a cortex API config into a realtime API CRD resource -func K8sResourceFromAPIConfig(apiConfig userconfig.API) v1alpha1.RealtimeAPI { +// k8sResourceFromAPIConfig converts a cortex API config into a realtime API CRD resource +func k8sResourceFromAPIConfig(apiConfig userconfig.API, prevAPI *serverless.RealtimeAPI) v1alpha1.RealtimeAPI { containers := make([]v1alpha1.ContainerSpec, len(apiConfig.Pod.Containers)) for i := range apiConfig.Pod.Containers { container := apiConfig.Pod.Containers[i] @@ -110,11 +111,11 @@ func K8sResourceFromAPIConfig(apiConfig userconfig.API) v1alpha1.RealtimeAPI { Namespace: consts.DefaultNamespace, }, Spec: v1alpha1.RealtimeAPISpec{ + Replicas: apiConfig.Autoscaling.InitReplicas, Pod: v1alpha1.PodSpec{ Port: 
*apiConfig.Pod.Port, MaxConcurrency: int32(apiConfig.Pod.MaxConcurrency), MaxQueueLength: int32(apiConfig.Pod.MaxQueueLength), - Replicas: apiConfig.Autoscaling.InitReplicas, Containers: containers, }, Autoscaling: v1alpha1.AutoscalingSpec{ @@ -148,6 +149,14 @@ func K8sResourceFromAPIConfig(apiConfig userconfig.API) v1alpha1.RealtimeAPI { "cortex.dev/api-id": apiID, } + if prevAPI != nil { + // we should keep the existing number of replicas instead of init_replicas + api.Spec.Replicas = prevAPI.Spec.Replicas + if prevDeployID := prevAPI.Annotations["cortex.dev/deployment-id"]; prevDeployID != "" { + api.Annotations["cortex.dev/deployment-id"] = prevDeployID + } + } + return api } From b7cac93d0b043bad95e7b07c50d8e377301de7b1 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Fri, 30 Jul 2021 15:52:10 +0200 Subject: [PATCH 30/42] WIP: update realtime scaler to work with CRD --- pkg/autoscaler/autoscaler_test.go | 27 +-- pkg/autoscaler/realtime_scaler.go | 293 +++++++++++++++--------------- 2 files changed, 156 insertions(+), 164 deletions(-) diff --git a/pkg/autoscaler/autoscaler_test.go b/pkg/autoscaler/autoscaler_test.go index bd01d7e286..4dc461f9b7 100644 --- a/pkg/autoscaler/autoscaler_test.go +++ b/pkg/autoscaler/autoscaler_test.go @@ -297,12 +297,9 @@ func TestAutoscaler_Awake(t *testing.T) { ticker := time.NewTicker(250 * time.Millisecond) go func() { - for { - select { - case <-ticker.C: - err := autoscaleFn() - require.NoError(t, err) - } + for range ticker.C { + err := autoscaleFn() + require.NoError(t, err) } }() @@ -372,12 +369,9 @@ func TestAutoscaler_MinReplicas(t *testing.T) { ticker := time.NewTicker(250 * time.Millisecond) go func() { - for { - select { - case <-ticker.C: - err := autoscaleFn() - require.NoError(t, err) - } + for range ticker.C { + err := autoscaleFn() + require.NoError(t, err) } }() @@ -444,12 +438,9 @@ func TestAutoscaler_MaxReplicas(t *testing.T) { ticker := time.NewTicker(250 * time.Millisecond) go func() { - for { - 
select { - case <-ticker.C: - err := autoscaleFn() - require.NoError(t, err) - } + for range ticker.C { + err := autoscaleFn() + require.NoError(t, err) } }() diff --git a/pkg/autoscaler/realtime_scaler.go b/pkg/autoscaler/realtime_scaler.go index a0dbb60c28..4ce175d29a 100644 --- a/pkg/autoscaler/realtime_scaler.go +++ b/pkg/autoscaler/realtime_scaler.go @@ -21,18 +21,14 @@ import ( "fmt" "time" - "github.com/cortexlabs/cortex/pkg/consts" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/pointer" - "github.com/cortexlabs/cortex/pkg/lib/telemetry" + libstrings "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/types/userconfig" - "github.com/cortexlabs/cortex/pkg/workloads" promv1 "github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" "go.uber.org/zap" - kapps "k8s.io/api/apps/v1" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -53,49 +49,25 @@ func NewRealtimeScaler(k8sClient *k8s.Client, promClient promv1.API, logger *zap func (s *RealtimeScaler) Scale(apiName string, request int32) error { ctx := context.Background() - // we use the controller-runtime client to make use of the cache mechanism - var deployment kapps.Deployment - err := s.k8s.Get(ctx, ctrlclient.ObjectKey{ + var api serverless.RealtimeAPI + if err := s.k8s.Get(ctx, ctrlclient.ObjectKey{ Namespace: s.k8s.Namespace, - Name: workloads.K8sName(apiName), - }, &deployment) - if err != nil { - return errors.Wrap(err, "failed to get deployment") - } - - if deployment.Spec.Replicas == nil { - return errors.Wrap(err, "k8s deployment doesn't have the replicas field set") + Name: apiName}, + &api, + ); err != nil { + return err } - current := *deployment.Spec.Replicas + current := api.Spec.Replicas if current == request { return 
nil } - if request == 0 { - if err = s.routeToActivator(&deployment); err != nil { - return errors.Wrap(err, "failed to re-route traffic to activator") - } - } - - deployment.Spec.Replicas = pointer.Int32(request) - - if err = s.k8s.Update(ctx, &deployment); err != nil { + api.Spec.Replicas = request + if err := s.k8s.Update(ctx, &api); err != nil { return errors.Wrap(err, "failed to update deployment") } - if current == 0 && request > 0 { - go func() { - if err := s.routeToService(&deployment); err != nil { - s.logger.Errorw("failed to re-route traffic to API", - zap.Error(err), zap.String("apiName", apiName), - ) - telemetry.Error(err) - } - }() - - } - return nil } @@ -136,133 +108,162 @@ func (s *RealtimeScaler) GetInFlightRequests(apiName string, window time.Duratio } func (s *RealtimeScaler) GetAutoscalingSpec(apiName string) (*userconfig.Autoscaling, error) { - deployment, err := s.k8s.GetDeployment(workloads.K8sName(apiName)) - if err != nil { - return nil, errors.Wrap(err, "failed to get deployment") - } - - if deployment == nil { - return nil, errors.ErrorUnexpected("unable to find k8s deployment", apiName) - } - - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) - if err != nil { - return nil, err - } - - return autoscalingSpec, nil -} - -func (s *RealtimeScaler) CurrentRequestedReplicas(apiName string) (int32, error) { ctx := context.Background() - // we use the controller-runtime client to make use of the cache mechanism - var deployment kapps.Deployment - err := s.k8s.Get(ctx, ctrlclient.ObjectKey{ + var api serverless.RealtimeAPI + if err := s.k8s.Get(ctx, ctrlclient.ObjectKey{ Namespace: s.k8s.Namespace, - Name: workloads.K8sName(apiName), - }, &deployment) - if err != nil { - return 0, errors.Wrap(err, "failed to get deployment") - } - - if deployment.Spec.Replicas == nil { - return 0, errors.Wrap(err, "k8s deployment doesn't have the replicas field set") + Name: apiName}, + &api, + ); err != nil { + return nil, err } - 
return *deployment.Spec.Replicas, nil -} - -func (s *RealtimeScaler) routeToService(deployment *kapps.Deployment) error { - ctx := context.Background() - vs, err := s.k8s.GetVirtualService(deployment.Name) - if err != nil { - return errors.Wrap(err, "failed to get virtual service") + targetInFlight, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.TargetInFlight) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse target-in-flight requests from autoscaling spec") } - if len(vs.Spec.Http) < 1 { - return errors.ErrorUnexpected("virtual service does not have any http entries") + maxDownscaleFactor, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.MaxDownscaleFactor) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse max downscale factor from autoscaling spec") } - if err = s.waitForReadyReplicas(ctx, deployment); err != nil { - return errors.Wrap(err, "no ready replicas available") + maxUpscaleFactor, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.MaxUpscaleFactor) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse max upscale factor from autoscaling spec") } - for i := range vs.Spec.Http { - if len(vs.Spec.Http[i].Route) != 2 { - return errors.ErrorUnexpected("virtual service does not have the required number of 2 http routes") - } - - vs.Spec.Http[i].Route[0].Weight = 100 // service traffic - vs.Spec.Http[i].Route[1].Weight = 0 // activator traffic + downscaleTolerance, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.DownscaleTolerance) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse downscale tolerance from autoscaling spec") } - vsClient := s.k8s.IstioClientSet().NetworkingV1beta1().VirtualServices(s.k8s.Namespace) - if _, err = vsClient.Update(ctx, vs, kmeta.UpdateOptions{}); err != nil { - return errors.Wrap(err, "failed to update virtual service") + upscaleTolerance, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.UpscaleTolerance) + if !ok { + return nil, errors.ErrorUnexpected("failed to 
parse upscale tolerance from autoscaling spec") } - return nil + return &userconfig.Autoscaling{ + MinReplicas: api.Spec.Autoscaling.MinReplicas, + MaxReplicas: api.Spec.Autoscaling.MaxReplicas, + InitReplicas: api.Spec.Autoscaling.MinReplicas, // FIXME: either add init replicas to the CRD autoscaling spec or remove init_replicas (?) + TargetInFlight: &targetInFlight, + Window: api.Spec.Autoscaling.Window.Duration, + DownscaleStabilizationPeriod: api.Spec.Autoscaling.DownscaleStabilizationPeriod.Duration, + UpscaleStabilizationPeriod: api.Spec.Autoscaling.UpscaleStabilizationPeriod.Duration, + MaxDownscaleFactor: maxDownscaleFactor, + MaxUpscaleFactor: maxUpscaleFactor, + DownscaleTolerance: downscaleTolerance, + UpscaleTolerance: upscaleTolerance, + }, nil } -func (s *RealtimeScaler) routeToActivator(deployment *kapps.Deployment) error { +func (s *RealtimeScaler) CurrentRequestedReplicas(apiName string) (int32, error) { ctx := context.Background() - vs, err := s.k8s.GetVirtualService(deployment.Name) - if err != nil { - return errors.Wrap(err, "failed to get virtual service") - } - - if len(vs.Spec.Http) < 1 { - return errors.ErrorUnexpected("virtual service does not have any http entries") - } - - for i := range vs.Spec.Http { - if len(vs.Spec.Http[i].Route) != 2 { - return errors.ErrorUnexpected("virtual service does not have the required number of 2 http routes") - } - vs.Spec.Http[i].Route[0].Weight = 0 // service traffic - vs.Spec.Http[i].Route[1].Weight = 100 // activator traffic - } - - vsClient := s.k8s.IstioClientSet().NetworkingV1beta1().VirtualServices(s.k8s.Namespace) - if _, err = vsClient.Update(ctx, vs, kmeta.UpdateOptions{}); err != nil { - return errors.Wrap(err, "failed to update virtual service") + var api serverless.RealtimeAPI + if err := s.k8s.Get(ctx, ctrlclient.ObjectKey{ + Namespace: s.k8s.Namespace, + Name: apiName}, + &api, + ); err != nil { + return 0, err } - return nil + return api.Spec.Replicas, nil } -func (s *RealtimeScaler) 
waitForReadyReplicas(ctx context.Context, deployment *kapps.Deployment) error { - watcher, err := s.k8s.ClientSet().AppsV1().Deployments(s.k8s.Namespace).Watch( - ctx, - kmeta.ListOptions{ - FieldSelector: fmt.Sprintf("metadata.name=%s", deployment.Name), - Watch: true, - }, - ) - if err != nil { - return errors.Wrap(err, "could not create deployment watcher") - } - - defer watcher.Stop() - - ctx, cancel := context.WithTimeout(ctx, consts.WaitForReadyReplicasTimeout) - defer cancel() - - for { - select { - case event := <-watcher.ResultChan(): - deploy, ok := event.Object.(*kapps.Deployment) - if !ok { - continue - } - - if deploy.Status.ReadyReplicas > 0 { - return nil - } - case <-ctx.Done(): - return ctx.Err() - } - } -} +//func (s *RealtimeScaler) routeToService(deployment *kapps.Deployment) error { +// ctx := context.Background() +// vs, err := s.k8s.GetVirtualService(deployment.Name) +// if err != nil { +// return errors.Wrap(err, "failed to get virtual service") +// } +// +// if len(vs.Spec.Http) < 1 { +// return errors.ErrorUnexpected("virtual service does not have any http entries") +// } +// +// if err = s.waitForReadyReplicas(ctx, deployment); err != nil { +// return errors.Wrap(err, "no ready replicas available") +// } +// +// for i := range vs.Spec.Http { +// if len(vs.Spec.Http[i].Route) != 2 { +// return errors.ErrorUnexpected("virtual service does not have the required number of 2 http routes") +// } +// +// vs.Spec.Http[i].Route[0].Weight = 100 // service traffic +// vs.Spec.Http[i].Route[1].Weight = 0 // activator traffic +// } +// +// vsClient := s.k8s.IstioClientSet().NetworkingV1beta1().VirtualServices(s.k8s.Namespace) +// if _, err = vsClient.Update(ctx, vs, kmeta.UpdateOptions{}); err != nil { +// return errors.Wrap(err, "failed to update virtual service") +// } +// +// return nil +//} +// +//func (s *RealtimeScaler) routeToActivator(deployment *kapps.Deployment) error { +// ctx := context.Background() +// vs, err := 
s.k8s.GetVirtualService(deployment.Name) +// if err != nil { +// return errors.Wrap(err, "failed to get virtual service") +// } +// +// if len(vs.Spec.Http) < 1 { +// return errors.ErrorUnexpected("virtual service does not have any http entries") +// } +// +// for i := range vs.Spec.Http { +// if len(vs.Spec.Http[i].Route) != 2 { +// return errors.ErrorUnexpected("virtual service does not have the required number of 2 http routes") +// } +// +// vs.Spec.Http[i].Route[0].Weight = 0 // service traffic +// vs.Spec.Http[i].Route[1].Weight = 100 // activator traffic +// } +// +// vsClient := s.k8s.IstioClientSet().NetworkingV1beta1().VirtualServices(s.k8s.Namespace) +// if _, err = vsClient.Update(ctx, vs, kmeta.UpdateOptions{}); err != nil { +// return errors.Wrap(err, "failed to update virtual service") +// } +// +// return nil +//} +// +//func (s *RealtimeScaler) waitForReadyReplicas(ctx context.Context, deployment *kapps.Deployment) error { +// watcher, err := s.k8s.ClientSet().AppsV1().Deployments(s.k8s.Namespace).Watch( +// ctx, +// kmeta.ListOptions{ +// FieldSelector: fmt.Sprintf("metadata.name=%s", deployment.Name), +// Watch: true, +// }, +// ) +// if err != nil { +// return errors.Wrap(err, "could not create deployment watcher") +// } +// +// defer watcher.Stop() +// +// ctx, cancel := context.WithTimeout(ctx, consts.WaitForReadyReplicasTimeout) +// defer cancel() +// +// for { +// select { +// case event := <-watcher.ResultChan(): +// deploy, ok := event.Object.(*kapps.Deployment) +// if !ok { +// continue +// } +// +// if deploy.Status.ReadyReplicas > 0 { +// return nil +// } +// case <-ctx.Done(): +// return ctx.Err() +// } +// } +//} From e27f41458397803bbd781372403e65a236369b89 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 30 Jul 2021 17:50:27 +0300 Subject: [PATCH 31/42] Fix autoscaler for RealtimeAPI CRD --- go.mod | 2 +- pkg/autoscaler/realtime_scaler.go | 117 ++---------------- .../serverless/v1alpha1/realtimeapi_types.go | 20 +-- 
.../serverless/realtimeapi_controller.go | 12 +- .../realtimeapi_controller_helpers.go | 2 +- pkg/operator/resources/realtimeapi/helpers.go | 3 +- 6 files changed, 25 insertions(+), 131 deletions(-) diff --git a/go.mod b/go.mod index 4381c8a46b..490aac0a00 100644 --- a/go.mod +++ b/go.mod @@ -26,7 +26,7 @@ require ( github.com/go-ole/go-ole v1.2.5 // indirect github.com/gobwas/glob v0.2.3 github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/google/go-cmp v0.5.6 // indirect + github.com/google/go-cmp v0.5.6 github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.2.0 github.com/googleapis/gnostic v0.5.5 // indirect diff --git a/pkg/autoscaler/realtime_scaler.go b/pkg/autoscaler/realtime_scaler.go index 4ce175d29a..5d1df8ad27 100644 --- a/pkg/autoscaler/realtime_scaler.go +++ b/pkg/autoscaler/realtime_scaler.go @@ -49,23 +49,24 @@ func NewRealtimeScaler(k8sClient *k8s.Client, promClient promv1.API, logger *zap func (s *RealtimeScaler) Scale(apiName string, request int32) error { ctx := context.Background() - var api serverless.RealtimeAPI - if err := s.k8s.Get(ctx, ctrlclient.ObjectKey{ + // we use the controller-runtime client to make use of the cache mechanism + var realtimeAPI serverless.RealtimeAPI + err := s.k8s.Get(ctx, ctrlclient.ObjectKey{ Namespace: s.k8s.Namespace, - Name: apiName}, - &api, - ); err != nil { - return err + Name: apiName, + }, &realtimeAPI) + if err != nil { + return errors.Wrap(err, "failed to get realtimeapi") } - current := api.Spec.Replicas + current := realtimeAPI.Spec.Replicas if current == request { return nil } + realtimeAPI.Spec.Replicas = request - api.Spec.Replicas = request - if err := s.k8s.Update(ctx, &api); err != nil { - return errors.Wrap(err, "failed to update deployment") + if err = s.k8s.Update(ctx, &realtimeAPI); err != nil { + return errors.Wrap(err, "failed to update realtimeapi") } return nil @@ -147,7 +148,7 @@ func (s *RealtimeScaler) GetAutoscalingSpec(apiName 
string) (*userconfig.Autosca return &userconfig.Autoscaling{ MinReplicas: api.Spec.Autoscaling.MinReplicas, MaxReplicas: api.Spec.Autoscaling.MaxReplicas, - InitReplicas: api.Spec.Autoscaling.MinReplicas, // FIXME: either add init replicas to the CRD autoscaling spec or remove init_replicas (?) + InitReplicas: api.Spec.Autoscaling.InitReplicas, TargetInFlight: &targetInFlight, Window: api.Spec.Autoscaling.Window.Duration, DownscaleStabilizationPeriod: api.Spec.Autoscaling.DownscaleStabilizationPeriod.Duration, @@ -173,97 +174,3 @@ func (s *RealtimeScaler) CurrentRequestedReplicas(apiName string) (int32, error) return api.Spec.Replicas, nil } - -//func (s *RealtimeScaler) routeToService(deployment *kapps.Deployment) error { -// ctx := context.Background() -// vs, err := s.k8s.GetVirtualService(deployment.Name) -// if err != nil { -// return errors.Wrap(err, "failed to get virtual service") -// } -// -// if len(vs.Spec.Http) < 1 { -// return errors.ErrorUnexpected("virtual service does not have any http entries") -// } -// -// if err = s.waitForReadyReplicas(ctx, deployment); err != nil { -// return errors.Wrap(err, "no ready replicas available") -// } -// -// for i := range vs.Spec.Http { -// if len(vs.Spec.Http[i].Route) != 2 { -// return errors.ErrorUnexpected("virtual service does not have the required number of 2 http routes") -// } -// -// vs.Spec.Http[i].Route[0].Weight = 100 // service traffic -// vs.Spec.Http[i].Route[1].Weight = 0 // activator traffic -// } -// -// vsClient := s.k8s.IstioClientSet().NetworkingV1beta1().VirtualServices(s.k8s.Namespace) -// if _, err = vsClient.Update(ctx, vs, kmeta.UpdateOptions{}); err != nil { -// return errors.Wrap(err, "failed to update virtual service") -// } -// -// return nil -//} -// -//func (s *RealtimeScaler) routeToActivator(deployment *kapps.Deployment) error { -// ctx := context.Background() -// vs, err := s.k8s.GetVirtualService(deployment.Name) -// if err != nil { -// return errors.Wrap(err, "failed to get 
virtual service") -// } -// -// if len(vs.Spec.Http) < 1 { -// return errors.ErrorUnexpected("virtual service does not have any http entries") -// } -// -// for i := range vs.Spec.Http { -// if len(vs.Spec.Http[i].Route) != 2 { -// return errors.ErrorUnexpected("virtual service does not have the required number of 2 http routes") -// } -// -// vs.Spec.Http[i].Route[0].Weight = 0 // service traffic -// vs.Spec.Http[i].Route[1].Weight = 100 // activator traffic -// } -// -// vsClient := s.k8s.IstioClientSet().NetworkingV1beta1().VirtualServices(s.k8s.Namespace) -// if _, err = vsClient.Update(ctx, vs, kmeta.UpdateOptions{}); err != nil { -// return errors.Wrap(err, "failed to update virtual service") -// } -// -// return nil -//} -// -//func (s *RealtimeScaler) waitForReadyReplicas(ctx context.Context, deployment *kapps.Deployment) error { -// watcher, err := s.k8s.ClientSet().AppsV1().Deployments(s.k8s.Namespace).Watch( -// ctx, -// kmeta.ListOptions{ -// FieldSelector: fmt.Sprintf("metadata.name=%s", deployment.Name), -// Watch: true, -// }, -// ) -// if err != nil { -// return errors.Wrap(err, "could not create deployment watcher") -// } -// -// defer watcher.Stop() -// -// ctx, cancel := context.WithTimeout(ctx, consts.WaitForReadyReplicasTimeout) -// defer cancel() -// -// for { -// select { -// case event := <-watcher.ResultChan(): -// deploy, ok := event.Object.(*kapps.Deployment) -// if !ok { -// continue -// } -// -// if deploy.Status.ReadyReplicas > 0 { -// return nil -// } -// case <-ctx.Done(): -// return ctx.Err() -// } -// } -//} diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index e68b2c0629..9de96d79cc 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -34,7 +34,6 @@ import ( // RealtimeAPISpec defines the desired state of RealtimeAPI type RealtimeAPISpec struct { // 
+kubebuilder:validation:Required - // +kubebuilder:default=1 // Number of desired replicas Replicas int32 `json:"replicas"` @@ -43,7 +42,6 @@ type RealtimeAPISpec struct { Pod PodSpec `json:"pod"` // +kubebuilder:validation:Optional - // +kubebuilder:default={"min_replicas": 1} // Autoscaling configuration Autoscaling AutoscalingSpec `json:"autoscaling"` @@ -52,7 +50,6 @@ type RealtimeAPISpec struct { NodeGroups []string `json:"node_groups"` // +kubebuilder:validation:Optional - // +kubebuilder:default={"max_surge": "25%", "max_unavailable": "25%"} // Deployment strategy to use when replacing existing replicas with new ones UpdateStrategy UpdateStrategySpec `json:"update_strategy"` @@ -63,17 +60,14 @@ type RealtimeAPISpec struct { type PodSpec struct { // +kubebuilder:validation:Required - // +kubebuilder:default=8080 // Port to which requests will be sent to Port int32 `json:"port"` // +kubebuilder:validation:Required - // +kubebuilder:default=1 // Maximum number of requests that will be concurrently sent into the container MaxConcurrency int32 `json:"max_concurrency"` // +kubebuilder:validation:Required - // +kubebuilder:default=100 // Maximum number of requests per replica which will be queued // (beyond max_concurrency) before requests are rejected with error code 503 MaxQueueLength int32 `json:"max_queue_length"` @@ -143,11 +137,12 @@ type ComputeSpec struct { } type AutoscalingSpec struct { - // +kubebuilder:default=1 + // Init number of replicas + InitReplicas int32 `json:"init_replicas,omitempty"` + // Minimum number of replicas MinReplicas int32 `json:"min_replicas,omitempty"` - // +kubebuilder:default=100 // Maximum number of replicas MaxReplicas int32 `json:"max_replicas,omitempty"` @@ -157,52 +152,43 @@ type AutoscalingSpec struct { TargetInFlight string `json:"target_in_flight,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="60s" // Duration over which to average the API's in-flight requests per replica Window kmeta.Duration 
`json:"window,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="5m" // The API will not scale below the highest recommendation made during this period DownscaleStabilizationPeriod kmeta.Duration `json:"downscale_stabilization_period,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="1m" // The API will not scale above the lowest recommendation made during this period UpscaleStabilizationPeriod kmeta.Duration `json:"upscale_stabilization_period,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="0.75" // Maximum factor by which to scale down the API on a single scaling event MaxDownscaleFactor string `json:"max_downscale_factor,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="1.5" // Maximum factor by which to scale up the API on a single scaling event MaxUpscaleFactor string `json:"max_upscale_factor,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="0.5" // Any recommendation falling within this factor below the current number of replicas will not trigger a // scale down event DownscaleTolerance string `json:"downscale_tolerance,omitempty"` // +kubebuilder:validation:Optional - // +kubebuilder:default="0.5" // Any recommendation falling within this factor above the current number of replicas will not trigger a scale up event UpscaleTolerance string `json:"upscale_tolerance,omitempty"` } type UpdateStrategySpec struct { // +kubebuilder:validation:Optional - // +kubebuilder:default="25%" // Maximum number of replicas that can be scheduled above the desired number of replicas during an update; // can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 
10% (default: 25%) // (set to 0 to disable rolling updates) MaxSurge intstr.IntOrString `json:"max_surge"` // +kubebuilder:validation:Optional - // +kubebuilder:default="25%" // maximum number of replicas that can be unavailable during an update; can be an absolute number, // e.g. 5, or a percentage of desired replicas, e.g. 10% MaxUnavailable intstr.IntOrString `json:"max_unavailable"` diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go index dab7d0e7a4..384ab8b342 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -117,12 +117,6 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Step 4: Create or Update Resources - deployOp, err := r.createOrUpdateDeployment(ctx, api) - if err != nil { - return ctrl.Result{}, err - } - log.V(1).Info(fmt.Sprintf("deployment %s", deployOp)) - svcOp, err := r.createOrUpdateService(ctx, api) if err != nil { return ctrl.Result{}, err @@ -135,6 +129,12 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) } log.V(1).Info(fmt.Sprintf("virtual service %s", vsOp)) + deployOp, err := r.createOrUpdateDeployment(ctx, api) + if err != nil { + return ctrl.Result{}, err + } + log.V(1).Info(fmt.Sprintf("deployment %s", deployOp)) + return ctrl.Result{}, nil } diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 67a3381ef3..9c92495ecd 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -246,7 +246,7 @@ func (r *RealtimeAPIReconciler) desiredService(api serverless.RealtimeAPI) kcore func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI) istioclientnetworking.VirtualService { var 
activatorWeight int32 - if api.Spec.Replicas == 0 { + if api.Spec.Replicas == 0 || api.Status.Ready == 0 { activatorWeight = 100 } diff --git a/pkg/operator/resources/realtimeapi/helpers.go b/pkg/operator/resources/realtimeapi/helpers.go index 938fccc99a..6041907a78 100644 --- a/pkg/operator/resources/realtimeapi/helpers.go +++ b/pkg/operator/resources/realtimeapi/helpers.go @@ -30,7 +30,7 @@ import ( "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" v12 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" @@ -119,6 +119,7 @@ func k8sResourceFromAPIConfig(apiConfig userconfig.API, prevAPI *serverless.Real Containers: containers, }, Autoscaling: v1alpha1.AutoscalingSpec{ + InitReplicas: apiConfig.Autoscaling.InitReplicas, MinReplicas: apiConfig.Autoscaling.MinReplicas, MaxReplicas: apiConfig.Autoscaling.MaxReplicas, TargetInFlight: fmt.Sprintf("%f", *apiConfig.Autoscaling.TargetInFlight), From fb5c08527a4d491a67ae9342eb73b58f04338530 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 01:23:01 +0300 Subject: [PATCH 32/42] RealtimeAPI controller fixes --- pkg/activator/activator.go | 12 +++++------ pkg/activator/helpers.go | 16 ++++++++++----- .../serverless/v1alpha1/realtimeapi_types.go | 4 +++- .../serverless.cortex.dev_realtimeapis.yaml | 20 ------------------- .../serverless/realtimeapi_controller.go | 1 + .../realtimeapi_controller_helpers.go | 2 ++ 6 files changed, 23 insertions(+), 32 deletions(-) diff --git a/pkg/activator/activator.go b/pkg/activator/activator.go index 7b68736951..f5400e5fcb 100644 --- a/pkg/activator/activator.go +++ b/pkg/activator/activator.go @@ -158,7 +158,7 @@ func (a *activator) getOrCreateReadinessTracker(apiName string) *readinessTracke } func (a *activator) addAPI(obj interface{}) { - apiMetadata, err := 
getAPIMeta(obj) + apiMetadata, err := getAPIMeta(obj, true) if err != nil { a.logger.Errorw("error during virtual service informer add callback", zap.Error(err)) telemetry.Error(err) @@ -182,7 +182,7 @@ func (a *activator) addAPI(obj interface{}) { } func (a *activator) updateAPI(oldObj interface{}, newObj interface{}) { - apiMetadata, err := getAPIMeta(newObj) + apiMetadata, err := getAPIMeta(newObj, true) if err != nil { a.logger.Errorw("error during virtual service informer update callback", zap.Error(err)) telemetry.Error(err) @@ -195,7 +195,7 @@ func (a *activator) updateAPI(oldObj interface{}, newObj interface{}) { apiName := apiMetadata.apiName - oldAPIMetatada, err := getAPIMeta(oldObj) + oldAPIMetatada, err := getAPIMeta(oldObj, true) if err != nil { a.logger.Errorw("error during virtual service informer update callback", zap.Error(err)) telemetry.Error(err) @@ -212,7 +212,7 @@ func (a *activator) updateAPI(oldObj interface{}, newObj interface{}) { } func (a *activator) removeAPI(obj interface{}) { - apiMetadata, err := getAPIMeta(obj) + apiMetadata, err := getAPIMeta(obj, false) if err != nil { a.logger.Errorw("error during virtual service informer delete callback", zap.Error(err)) telemetry.Error(err) @@ -250,7 +250,7 @@ func (a *activator) updateReadinessTracker(obj interface{}) { return } - api, err := getAPIMeta(obj) + api, err := getAPIMeta(obj, false) if err != nil { a.logger.Errorw("error during deployment informer callback", zap.Error(err)) telemetry.Error(err) @@ -271,7 +271,7 @@ func (a *activator) updateReadinessTracker(obj interface{}) { } func (a *activator) removeReadinessTracker(obj interface{}) { - api, err := getAPIMeta(obj) + api, err := getAPIMeta(obj, false) if err != nil { a.logger.Errorw("error during deployment informer callback", zap.Error(err)) telemetry.Error(err) diff --git a/pkg/activator/helpers.go b/pkg/activator/helpers.go index 5bce2cb7bf..e3d20b6f0f 100644 --- a/pkg/activator/helpers.go +++ b/pkg/activator/helpers.go @@ 
-31,7 +31,7 @@ type apiMeta struct { maxQueueLength int } -func getAPIMeta(obj interface{}) (apiMeta, error) { +func getAPIMeta(obj interface{}, includeAnnotations bool) (apiMeta, error) { resource, err := meta.Accessor(obj) if err != nil { return apiMeta{}, err @@ -48,16 +48,22 @@ func getAPIMeta(obj interface{}) (apiMeta, error) { return apiMeta{}, errors.ErrorUnexpected("got a virtual service without apiName label") } - maxQueueLength, maxConcurrency, err := userconfig.ConcurrencyFromAnnotations(resource) - if err != nil { - return apiMeta{}, err + var maxQueueLength, maxConcurrency int + var annotations map[string]string + + if includeAnnotations { + maxQueueLength, maxConcurrency, err = userconfig.ConcurrencyFromAnnotations(resource) + if err != nil { + return apiMeta{}, err + } + annotations = resource.GetAnnotations() } return apiMeta{ apiName: apiName, apiKind: userconfig.KindFromString(apiKind), labels: labels, - annotations: resource.GetAnnotations(), + annotations: annotations, maxConcurrency: maxConcurrency, maxQueueLength: maxQueueLength, }, nil diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 9de96d79cc..6e25cec237 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -137,12 +137,15 @@ type ComputeSpec struct { } type AutoscalingSpec struct { + // +kubebuilder:validation:Optional // Init number of replicas InitReplicas int32 `json:"init_replicas,omitempty"` + // +kubebuilder:validation:Optional // Minimum number of replicas MinReplicas int32 `json:"min_replicas,omitempty"` + // +kubebuilder:validation:Optional // Maximum number of replicas MaxReplicas int32 `json:"max_replicas,omitempty"` @@ -244,7 +247,6 @@ func (api RealtimeAPI) GetOrCreateAPIIDs() (deploymentID, podID, specID, apiID s var buf bytes.Buffer - buf.WriteString(api.Name) buf.WriteString(api.Name) 
buf.WriteString(userconfig.RealtimeAPIKind.String()) buf.WriteString(s.Obj(api.Spec.Pod)) diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 7346ec219d..1fd70d2d93 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -50,38 +50,30 @@ spec: description: RealtimeAPISpec defines the desired state of RealtimeAPI properties: autoscaling: - default: - min_replicas: 1 description: Autoscaling configuration properties: downscale_stabilization_period: - default: 5m description: The API will not scale below the highest recommendation made during this period type: string downscale_tolerance: - default: "0.5" description: Any recommendation falling within this factor below the current number of replicas will not trigger a scale down event type: string max_downscale_factor: - default: "0.75" description: Maximum factor by which to scale down the API on a single scaling event type: string max_replicas: - default: 100 description: Maximum number of replicas format: int32 type: integer max_upscale_factor: - default: "1.5" description: Maximum factor by which to scale up the API on a single scaling event type: string min_replicas: - default: 1 description: Minimum number of replicas format: int32 type: integer @@ -91,17 +83,14 @@ spec: which the autoscaler tries to maintain type: string upscale_stabilization_period: - default: 1m description: The API will not scale above the lowest recommendation made during this period type: string upscale_tolerance: - default: "0.5" description: Any recommendation falling within this factor above the current number of replicas will not trigger a scale up event type: string window: - default: 60s description: Duration over which to average the API's in-flight requests per replica type: string @@ -539,20 +528,17 @@ spec: type: object type: array 
max_concurrency: - default: 1 description: Maximum number of requests that will be concurrently sent into the container format: int32 type: integer max_queue_length: - default: 100 description: Maximum number of requests per replica which will be queued (beyond max_concurrency) before requests are rejected with error code 503 format: int32 type: integer port: - default: 8080 description: Port to which requests will be sent to format: int32 type: integer @@ -563,14 +549,10 @@ spec: - port type: object replicas: - default: 1 description: Number of desired replicas format: int32 type: integer update_strategy: - default: - max_surge: 25% - max_unavailable: 25% description: Deployment strategy to use when replacing existing replicas with new ones properties: @@ -578,7 +560,6 @@ spec: anyOf: - type: integer - type: string - default: 25% description: 'Maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, @@ -588,7 +569,6 @@ spec: anyOf: - type: integer - type: string - default: 25% description: maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 
10% diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go index 384ab8b342..a2c788ef8d 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -86,6 +86,7 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Step 3: Get or create deployment and API ids deploymentID, podID, specID, apiID := api.GetOrCreateAPIIDs() + idsOutdated := api.Annotations["cortex.dev/deployment-id"] != deploymentID || api.Annotations["cortex.dev/spec-id"] != specID || api.Annotations["cortex.dev/api-id"] != apiID diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 9c92495ecd..9580bf8b7c 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -452,5 +452,7 @@ func (r *RealtimeAPIReconciler) generateAPIAnnotations(api serverless.RealtimeAP userconfig.MaxUpscaleFactorAnnotationKey: api.Spec.Autoscaling.MaxUpscaleFactor, userconfig.DownscaleToleranceAnnotationKey: api.Spec.Autoscaling.DownscaleTolerance, userconfig.UpscaleToleranceAnnotationKey: api.Spec.Autoscaling.UpscaleTolerance, + userconfig.MaxQueueLengthAnnotationKey: s.Int32(api.Spec.Pod.MaxQueueLength), + userconfig.MaxConcurrencyAnnotationKey: s.Int32(api.Spec.Pod.MaxConcurrency), } } From 835ff98b07d69004a6d3e412f11077f55b448c34 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 01:32:14 +0300 Subject: [PATCH 33/42] Add InitReplicas to RealtimeAPI spec --- pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go | 2 +- .../config/crd/bases/serverless.cortex.dev_realtimeapis.yaml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go 
b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 6e25cec237..439711ea5c 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -138,7 +138,7 @@ type ComputeSpec struct { type AutoscalingSpec struct { // +kubebuilder:validation:Optional - // Init number of replicas + // Initial number of replicas InitReplicas int32 `json:"init_replicas,omitempty"` // +kubebuilder:validation:Optional diff --git a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml index 1fd70d2d93..7a8c6a755b 100644 --- a/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml +++ b/pkg/crds/config/crd/bases/serverless.cortex.dev_realtimeapis.yaml @@ -61,6 +61,10 @@ spec: the current number of replicas will not trigger a scale down event type: string + init_replicas: + description: Initial number of replicas + format: int32 + type: integer max_downscale_factor: description: Maximum factor by which to scale down the API on a single scaling event From 7ae8949a84dfa276ffdb7f22220e18bdcad7fe6c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 01:49:27 +0300 Subject: [PATCH 34/42] Simplify GetAutoscalingSpec function --- pkg/autoscaler/helpers.go | 65 +++++++++++++++++++++++++++++++ pkg/autoscaler/realtime_scaler.go | 40 +------------------ 2 files changed, 66 insertions(+), 39 deletions(-) create mode 100644 pkg/autoscaler/helpers.go diff --git a/pkg/autoscaler/helpers.go b/pkg/autoscaler/helpers.go new file mode 100644 index 0000000000..9f9e4af039 --- /dev/null +++ b/pkg/autoscaler/helpers.go @@ -0,0 +1,65 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" + "github.com/cortexlabs/cortex/pkg/lib/errors" + libstrings "github.com/cortexlabs/cortex/pkg/lib/strings" + "github.com/cortexlabs/cortex/pkg/types/userconfig" +) + +func generateAutoscalingFromServerlessRealtimeAPI(realtimeAPI serverless.RealtimeAPI) (*userconfig.Autoscaling, error) { + targetInFlight, ok := libstrings.ParseFloat64(realtimeAPI.Spec.Autoscaling.TargetInFlight) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse target-in-flight requests from autoscaling spec") + } + + maxDownscaleFactor, ok := libstrings.ParseFloat64(realtimeAPI.Spec.Autoscaling.MaxDownscaleFactor) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse max downscale factor from autoscaling spec") + } + + maxUpscaleFactor, ok := libstrings.ParseFloat64(realtimeAPI.Spec.Autoscaling.MaxUpscaleFactor) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse max upscale factor from autoscaling spec") + } + + downscaleTolerance, ok := libstrings.ParseFloat64(realtimeAPI.Spec.Autoscaling.DownscaleTolerance) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse downscale tolerance from autoscaling spec") + } + + upscaleTolerance, ok := libstrings.ParseFloat64(realtimeAPI.Spec.Autoscaling.UpscaleTolerance) + if !ok { + return nil, errors.ErrorUnexpected("failed to parse upscale tolerance from autoscaling spec") + } + + return &userconfig.Autoscaling{ + MinReplicas: realtimeAPI.Spec.Autoscaling.MinReplicas, + MaxReplicas: 
realtimeAPI.Spec.Autoscaling.MaxReplicas, + InitReplicas: realtimeAPI.Spec.Autoscaling.InitReplicas, + TargetInFlight: &targetInFlight, + Window: realtimeAPI.Spec.Autoscaling.Window.Duration, + DownscaleStabilizationPeriod: realtimeAPI.Spec.Autoscaling.DownscaleStabilizationPeriod.Duration, + UpscaleStabilizationPeriod: realtimeAPI.Spec.Autoscaling.UpscaleStabilizationPeriod.Duration, + MaxDownscaleFactor: maxDownscaleFactor, + MaxUpscaleFactor: maxUpscaleFactor, + DownscaleTolerance: downscaleTolerance, + UpscaleTolerance: upscaleTolerance, + }, nil +} diff --git a/pkg/autoscaler/realtime_scaler.go b/pkg/autoscaler/realtime_scaler.go index 5d1df8ad27..9aa3e5c8ef 100644 --- a/pkg/autoscaler/realtime_scaler.go +++ b/pkg/autoscaler/realtime_scaler.go @@ -24,7 +24,6 @@ import ( serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - libstrings "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/types/userconfig" promv1 "github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" @@ -120,44 +119,7 @@ func (s *RealtimeScaler) GetAutoscalingSpec(apiName string) (*userconfig.Autosca return nil, err } - targetInFlight, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.TargetInFlight) - if !ok { - return nil, errors.ErrorUnexpected("failed to parse target-in-flight requests from autoscaling spec") - } - - maxDownscaleFactor, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.MaxDownscaleFactor) - if !ok { - return nil, errors.ErrorUnexpected("failed to parse max downscale factor from autoscaling spec") - } - - maxUpscaleFactor, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.MaxUpscaleFactor) - if !ok { - return nil, errors.ErrorUnexpected("failed to parse max upscale factor from autoscaling spec") - } - - downscaleTolerance, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.DownscaleTolerance) - if 
!ok { - return nil, errors.ErrorUnexpected("failed to parse downscale tolerance from autoscaling spec") - } - - upscaleTolerance, ok := libstrings.ParseFloat64(api.Spec.Autoscaling.UpscaleTolerance) - if !ok { - return nil, errors.ErrorUnexpected("failed to parse upscale tolerance from autoscaling spec") - } - - return &userconfig.Autoscaling{ - MinReplicas: api.Spec.Autoscaling.MinReplicas, - MaxReplicas: api.Spec.Autoscaling.MaxReplicas, - InitReplicas: api.Spec.Autoscaling.InitReplicas, - TargetInFlight: &targetInFlight, - Window: api.Spec.Autoscaling.Window.Duration, - DownscaleStabilizationPeriod: api.Spec.Autoscaling.DownscaleStabilizationPeriod.Duration, - UpscaleStabilizationPeriod: api.Spec.Autoscaling.UpscaleStabilizationPeriod.Duration, - MaxDownscaleFactor: maxDownscaleFactor, - MaxUpscaleFactor: maxUpscaleFactor, - DownscaleTolerance: downscaleTolerance, - UpscaleTolerance: upscaleTolerance, - }, nil + return generateAutoscalingFromServerlessRealtimeAPI(api) } func (s *RealtimeScaler) CurrentRequestedReplicas(apiName string) (int32, error) { From 5050519a32baf30cd34576a4f85a1d55aedb87d8 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 03:36:59 +0300 Subject: [PATCH 35/42] Add serverless to autoscaler's scheme --- cmd/autoscaler/main.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/autoscaler/main.go b/cmd/autoscaler/main.go index 71e8bd034e..24035c2c38 100644 --- a/cmd/autoscaler/main.go +++ b/cmd/autoscaler/main.go @@ -28,6 +28,7 @@ import ( "time" "github.com/cortexlabs/cortex/pkg/autoscaler" + serverless "github.com/cortexlabs/cortex/pkg/crds/apis/serverless/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" @@ -107,6 +108,9 @@ func main() { defer telemetry.Close() scheme := runtime.NewScheme() + if err := serverless.AddToScheme(scheme); err != nil { + exit(log, err, "failed to add k8s serverless to scheme") + } if err := 
clientgoscheme.AddToScheme(scheme); err != nil { exit(log, err, "failed to add k8s client-go-scheme to scheme") } From 934f1ff4f2a25b02ed95e3e235040de2d5254bdc Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 03:37:15 +0300 Subject: [PATCH 36/42] apiID is required for determining if a pod is up-to-date or not --- .../controllers/serverless/realtimeapi_controller_helpers.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 9580bf8b7c..e4ac81a1a2 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -195,6 +195,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), + "apiID": api.Annotations["cortex.dev/api-id"], "deploymentID": api.Annotations["cortex.dev/deployment-id"], "cortex.dev/api": "true", }, From d0a1b612ce3a1e4f144cd0996bc62c71d834b69e Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 03:37:37 +0300 Subject: [PATCH 37/42] Allow autoscaler to get/update realtimeapis resources --- manager/manifests/autoscaler.yaml.j2 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/manager/manifests/autoscaler.yaml.j2 b/manager/manifests/autoscaler.yaml.j2 index ce875b24c3..2f00afb164 100644 --- a/manager/manifests/autoscaler.yaml.j2 +++ b/manager/manifests/autoscaler.yaml.j2 @@ -42,6 +42,13 @@ rules: - get - update - watch +- apiGroups: + - "serverless.cortex.dev" + resources: + - realtimeapis + verbs: + - get + - update --- From cac1a11954e8b125f78a41eca44e754b63fdeaf7 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 08:15:47 +0300 Subject: [PATCH 38/42] Fix cortex logs/refresh cmds for realtime API --- 
.../serverless/v1alpha1/realtimeapi_types.go | 3 +- .../realtimeapi_controller_helpers.go | 4 +- pkg/operator/operator/logging.go | 34 ---------------- pkg/operator/resources/realtimeapi/api.go | 39 +++++++++++++------ pkg/operator/resources/realtimeapi/helpers.go | 22 ++++++----- 5 files changed, 44 insertions(+), 58 deletions(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 439711ea5c..31c722a4d8 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -266,8 +266,7 @@ func (api RealtimeAPI) GetOrCreateAPIIDs() (deploymentID, podID, specID, apiID s api.Annotations["cortex.dev/spec-id"] != specID { apiID = fmt.Sprintf("%s-%s-%s", spec.MonotonicallyDecreasingID(), deploymentID, specID) } - - return deploymentID, podID, specID, apiID + return } //+kubebuilder:object:root=true diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index e4ac81a1a2..9c672458a0 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -183,6 +183,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), "apiID": api.Annotations["cortex.dev/api-id"], + "podID": api.Annotations["cortex.dev/pod-id"], "deploymentID": api.Annotations["cortex.dev/deployment-id"], "cortex.dev/api": "true", }, @@ -195,7 +196,7 @@ func (r *RealtimeAPIReconciler) desiredDeployment(api serverless.RealtimeAPI) ka Labels: map[string]string{ "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), - "apiID": api.Annotations["cortex.dev/api-id"], + "podID": api.Annotations["cortex.dev/pod-id"], "deploymentID": api.Annotations["cortex.dev/deployment-id"], 
"cortex.dev/api": "true", }, @@ -298,6 +299,7 @@ func (r *RealtimeAPIReconciler) desiredVirtualService(api serverless.RealtimeAPI "apiName": api.Name, "apiKind": userconfig.RealtimeAPIKind.String(), "apiID": api.Annotations["cortex.dev/api-id"], + "podID": api.Annotations["cortex.dev/pod-id"], "deploymentID": api.Annotations["cortex.dev/deployment-id"], "cortex.dev/api": "true", }, diff --git a/pkg/operator/operator/logging.go b/pkg/operator/operator/logging.go index f49746f64b..51e6ece794 100644 --- a/pkg/operator/operator/logging.go +++ b/pkg/operator/operator/logging.go @@ -101,40 +101,6 @@ func initializeLogger(key string, level userconfig.LogLevel, fields map[string]i return sugarLogger, nil } -func GetRealtimeAPILogger(apiName string, apiID string) (*zap.SugaredLogger, error) { - loggerCacheKey := fmt.Sprintf("apiName=%s,apiID=%s", apiName, apiID) - logger := getFromCacheOrNil(loggerCacheKey) - - if logger != nil { - return logger, nil - } - - apiSpec, err := DownloadAPISpec(apiName, apiID) - if err != nil { - return nil, err - } - - return initializeLogger(loggerCacheKey, userconfig.InfoLogLevel, map[string]interface{}{ - "apiName": apiSpec.Name, - "apiKind": apiSpec.Kind.String(), - "apiID": apiSpec.ID, - }) -} - -func GetRealtimeAPILoggerFromSpec(apiSpec *spec.API) (*zap.SugaredLogger, error) { - loggerCacheKey := fmt.Sprintf("apiName=%s,apiID=%s", apiSpec.Name, apiSpec.ID) - logger := getFromCacheOrNil(loggerCacheKey) - if logger != nil { - return logger, nil - } - - return initializeLogger(loggerCacheKey, userconfig.InfoLogLevel, map[string]interface{}{ - "apiName": apiSpec.Name, - "apiKind": apiSpec.Kind.String(), - "apiID": apiSpec.ID, - }) -} - func GetJobLogger(jobKey spec.JobKey) (*zap.SugaredLogger, error) { loggerCacheKey := fmt.Sprintf("apiName=%s,jobID=%s", jobKey.APIName, jobKey.ID) logger := getFromCacheOrNil(loggerCacheKey) diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 
fe9203a4de..8c58564b14 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -39,7 +39,6 @@ import ( kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - ktypes "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -116,21 +115,37 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) func RefreshAPI(apiName string) (string, error) { ctx := context.Background() - api := serverless.RealtimeAPI{ - ObjectMeta: kmeta.ObjectMeta{ - Namespace: consts.DefaultNamespace, - Name: apiName, - }, - } + var api serverless.RealtimeAPI + key := client.ObjectKey{Namespace: consts.DefaultNamespace, Name: apiName} - // slashes are encoded as ~1 in the json patch - patch := []byte(fmt.Sprintf( - "[{\"op\": \"replace\", \"path\": \"/metadata/annotations/cortex.dev~1deployment-id\", \"value\": \"%s\" }]", - generateDeploymentID())) - if err := config.K8s.Patch(ctx, &api, client.RawPatch(ktypes.JSONPatchType, patch)); err != nil { + err := config.K8s.Get(ctx, key, &api) + if err != nil { return "", errors.Wrap(err, "failed to get realtime api resource") } + apiSpec, err := operator.DownloadAPISpec(api.Name, api.Annotations["cortex.dev/api-id"]) + if err != nil { + return "", err + } + + // generate a new api-id + // the deployment-id and spec-id components of the api-id remain unchanged + api.Annotations["cortex.dev/api-id"] = "" + _, _, _, apiID := api.GetOrCreateAPIIDs() + api.Annotations["cortex.dev/api-id"] = apiID + + err = config.K8s.Update(ctx, &api) + if err != nil { + return "", errors.Wrap(err, "failed to update realtime api resource") + } + + apiSpec.ID = apiID + apiSpec.Key = spec.Key(apiName, apiID, config.ClusterConfig.ClusterUID) + + if err := config.AWS.UploadJSONToS3(apiSpec, config.ClusterConfig.Bucket, apiSpec.Key); err != nil { + return "", errors.Wrap(err, "failed to upload api spec") + } + 
apiResource := userconfig.Resource{ Name: apiName, Kind: userconfig.RealtimeAPIKind, diff --git a/pkg/operator/resources/realtimeapi/helpers.go b/pkg/operator/resources/realtimeapi/helpers.go index 6041907a78..979ad90ef3 100644 --- a/pkg/operator/resources/realtimeapi/helpers.go +++ b/pkg/operator/resources/realtimeapi/helpers.go @@ -142,6 +142,16 @@ func k8sResourceFromAPIConfig(apiConfig userconfig.API, prevAPI *serverless.Real }, } + if prevAPI != nil { + // we should keep the existing number of replicas instead of init_replicas + api.Spec.Replicas = prevAPI.Spec.Replicas + if prevDeployID := prevAPI.Annotations["cortex.dev/deployment-id"]; prevDeployID != "" { + api.Annotations = map[string]string{ + "cortex.dev/deployment-id": prevDeployID, + } + } + } + deploymentID, podID, specID, apiID := api.GetOrCreateAPIIDs() api.Annotations = map[string]string{ "cortex.dev/deployment-id": deploymentID, @@ -150,14 +160,6 @@ func k8sResourceFromAPIConfig(apiConfig userconfig.API, prevAPI *serverless.Real "cortex.dev/api-id": apiID, } - if prevAPI != nil { - // we should keep the existing number of replicas instead of init_replicas - api.Spec.Replicas = prevAPI.Spec.Replicas - if prevDeployID := prevAPI.Annotations["cortex.dev/deployment-id"]; prevDeployID != "" { - api.Annotations["cortex.dev/deployment-id"] = prevDeployID - } - } - return api } @@ -178,6 +180,7 @@ func metadataFromRealtimeAPI(sv *v1alpha1.RealtimeAPI) (*spec.Metadata, error) { }, APIID: sv.Annotations["cortex.dev/api-id"], DeploymentID: sv.Annotations["cortex.dev/deployment-id"], + PodID: sv.Annotations["cortex.dev/pod-id"], LastUpdated: lastUpdated.Unix(), }, nil } @@ -247,5 +250,6 @@ func addPodToReplicaCounts(pod *v1.Pod, metadata *spec.Metadata, counts *status. 
} func isPodSpecLatest(pod *v1.Pod, metadata *spec.Metadata) bool { - return metadata.APIID == pod.Labels["apiID"] + return metadata.DeploymentID == pod.Labels["deploymentID"] && + metadata.PodID == pod.Labels["podID"] } From c31bf2cbdfaffc80f3d7373400864924c16b065b Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 08:22:46 +0300 Subject: [PATCH 39/42] Fix CORTEX_PORT not present on realtime api pods --- .../controllers/serverless/realtimeapi_controller_helpers.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 9c672458a0..4cf5637f93 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -363,6 +363,10 @@ func (r *RealtimeAPIReconciler) userContainers(api serverless.RealtimeAPI) ([]kc containerEnvVars := workloads.BaseEnvVars containerEnvVars = append(containerEnvVars, workloads.ClientConfigEnvVar()) + containerEnvVars = append(containerEnvVars, kcore.EnvVar{ + Name: "CORTEX_PORT", + Value: s.Int32(api.Spec.Pod.Port), + }) containerEnvVars = append(containerEnvVars, container.Env...) 
containers[i] = kcore.Container{ From 2c8c233282394b829fd884a653a063d0ad9d6d04 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 17:37:17 +0300 Subject: [PATCH 40/42] Use deployment-id instead of api-id for --- pkg/operator/resources/realtimeapi/api.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 8c58564b14..52b995bcb7 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -128,11 +128,10 @@ func RefreshAPI(apiName string) (string, error) { return "", err } - // generate a new api-id - // the deployment-id and spec-id components of the api-id remain unchanged - api.Annotations["cortex.dev/api-id"] = "" - _, _, _, apiID := api.GetOrCreateAPIIDs() - api.Annotations["cortex.dev/api-id"] = apiID + // create new deployment + api.Annotations["cortex.dev/deployment-id"] = "" + deploymentID, _, _, apiID := api.GetOrCreateAPIIDs() + api.Annotations["cortex.dev/deployment-id"] = deploymentID err = config.K8s.Update(ctx, &api) if err != nil { From 763acf0e29dd513c43dad3230e11a441a983d258 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 17:38:14 +0300 Subject: [PATCH 41/42] Ensure that the `min_replicas`/`max_replicas` range is ensured by the controller --- .../serverless/realtimeapi_controller.go | 3 ++- .../serverless/realtimeapi_controller_helpers.go | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller.go b/pkg/crds/controllers/serverless/realtimeapi_controller.go index a2c788ef8d..f422813acf 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller.go @@ -85,6 +85,7 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Step 3: Get or create deployment and API 
ids + desiredReplicasChanged := r.ensureDesiredReplicasRange(ctx, &api) deploymentID, podID, specID, apiID := api.GetOrCreateAPIIDs() idsOutdated := api.Annotations["cortex.dev/deployment-id"] != deploymentID || @@ -111,7 +112,7 @@ func (r *RealtimeAPIReconciler) Reconcile(ctx context.Context, req ctrl.Request) api.Annotations["cortex.dev/api-id"] = apiID } - if idsOutdated { + if idsOutdated || desiredReplicasChanged { if err = r.Update(ctx, &api); err != nil { return ctrl.Result{}, err } diff --git a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go index 4cf5637f93..909c446475 100644 --- a/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go +++ b/pkg/crds/controllers/serverless/realtimeapi_controller_helpers.go @@ -77,6 +77,22 @@ func (r *RealtimeAPIReconciler) updateStatus(ctx context.Context, api *serverles return nil } +func (r *RealtimeAPIReconciler) ensureDesiredReplicasRange(ctx context.Context, api *serverless.RealtimeAPI) bool { + replicasFieldChanged := false + desiredReplicas := api.Spec.Replicas + + if desiredReplicas < api.Spec.Autoscaling.MinReplicas { + desiredReplicas = api.Spec.Autoscaling.MinReplicas + replicasFieldChanged = true + } else if desiredReplicas > api.Spec.Autoscaling.MaxReplicas { + desiredReplicas = api.Spec.Autoscaling.MaxReplicas + replicasFieldChanged = true + } + + api.Spec.Replicas = desiredReplicas + return replicasFieldChanged +} + func (r *RealtimeAPIReconciler) createOrUpdateDeployment(ctx context.Context, api serverless.RealtimeAPI) (controllerutil.OperationResult, error) { deployment := kapps.Deployment{ ObjectMeta: kmeta.ObjectMeta{ From b4b87458784a1545238db4593f87886f2cb10886 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 31 Jul 2021 17:39:06 +0300 Subject: [PATCH 42/42] Revert explicit return expression --- pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go index 31c722a4d8..cf0708a735 100644 --- a/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go +++ b/pkg/crds/apis/serverless/v1alpha1/realtimeapi_types.go @@ -266,7 +266,7 @@ func (api RealtimeAPI) GetOrCreateAPIIDs() (deploymentID, podID, specID, apiID s api.Annotations["cortex.dev/spec-id"] != specID { apiID = fmt.Sprintf("%s-%s-%s", spec.MonotonicallyDecreasingID(), deploymentID, specID) } - return + return deploymentID, podID, specID, apiID } //+kubebuilder:object:root=true