diff --git a/charts/csi-hyperstack/files/dashboards/csi-hyperstack.json b/charts/csi-hyperstack/files/dashboards/csi-hyperstack.json new file mode 100644 index 0000000..15f8e4d --- /dev/null +++ b/charts/csi-hyperstack/files/dashboards/csi-hyperstack.json @@ -0,0 +1,271 @@ +{ + "title": "Hyperstack CSI Dashboard", + "uid": "csi-hyperstack", + "tags": ["storage", "hyperstack"], + "schemaVersion": 37, + "version": 1, + "refresh": "auto", + "time": { "from": "now-6h", "to": "now" }, + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "pluginId": "prometheus", + "query": "prometheus", + "label": "Datasource", + "hide": 0, + "refresh": 1, + "current": { + "selected": false, + "text": "default", + "value": "default" + } + }, + { + "name": "storageclass", + "type": "query", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "label": "Storage Class", + "query": "label_values(kube_storageclass_info{provisioner=\"hyperstack.csi.nexgencloud.com\"}, storageclass)", + "refresh": 2, + "sort": 1, + "hide": 0, + "multi": false, + "includeAll": true, + "allValue": ".*" + }, + { + "name": "csi_namespace", + "type": "query", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "label": "Driver Namespace", + "query": "label_values(csi_operations_total, namespace)", + "refresh": 2, + "sort": 1, + "hide": 0, + "multi": false, + "includeAll": true, + "allValue": ".*" + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Operation Rate", + "type": "timeseries", + "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { "lineWidth": 2 } + } + }, + "options": { "legend": { "displayMode": "table", "placement": "bottom" } }, + "targets": [ + { + "expr": "rate(csi_operations_total{namespace=~\"$csi_namespace\"}[5m])", + "legendFormat": "{{operation}}", + "refId": "A" + } + ] + }, + { + "id": 2, + "title": 
"Error Rate", + "type": "timeseries", + "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "color": { "fixedColor": "red", "mode": "fixed" }, + "custom": { "lineWidth": 2 } + } + }, + "options": { "legend": { "displayMode": "table", "placement": "bottom" } }, + "targets": [ + { + "expr": "rate(csi_operation_errors_total{namespace=~\"$csi_namespace\"}[5m])", + "legendFormat": "{{operation}}", + "refId": "A" + } + ] + }, + { + "id": 3, + "title": "Operation Latency (p50 / p95 / p99)", + "type": "timeseries", + "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { "unit": "s", "custom": { "lineWidth": 2 } } + }, + "options": { "legend": { "displayMode": "table", "placement": "bottom" } }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (operation, le) (rate(csi_operation_duration_seconds_bucket{namespace=~\"$csi_namespace\"}[5m])))", + "legendFormat": "p50 {{operation}}", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (operation, le) (rate(csi_operation_duration_seconds_bucket{namespace=~\"$csi_namespace\"}[5m])))", + "legendFormat": "p95 {{operation}}", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (operation, le) (rate(csi_operation_duration_seconds_bucket{namespace=~\"$csi_namespace\"}[5m])))", + "legendFormat": "p99 {{operation}}", + "refId": "C" + } + ] + }, + { + "id": 4, + "title": "Volume Attachments", + "type": "timeseries", + "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { "lineWidth": 2, "fillOpacity": 10 }, + "min": 0 + }, + "overrides": [] + }, + "options": { "legend": { "displayMode": "table", "placement": "bottom" } }, + "targets": [ + { + "expr": 
"csi_volume_attachments_total{namespace=~\"$csi_namespace\"}", + "legendFormat": "{{pod}}", + "refId": "A" + } + ] + }, + { + "id": 5, + "title": "PVC Usage", + "type": "table", + "gridPos": { "x": 0, "y": 16, "w": 24, "h": 10 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "decimals": 1, + "unit": "bytes", + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "#EAB839", "value": 75 }, + { "color": "red", "value": 90 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Usage" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "min", "value": 0 }, + { "id": "max", "value": 100 }, + { "id": "custom.cellOptions", "value": { "mode": "lcd", "type": "gauge" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Status" }, + "properties": [ + { "id": "unit", "value": "none" }, + { + "id": "mappings", + "value": [ + { "type": "value", "options": { "1": { "text": "Attached", "color": "green", "index": 0 }, "0": { "text": "Detached", "color": "orange", "index": 1 } } } + ] + }, + { "id": "custom.cellOptions", "value": { "type": "color-text" } }, + { "id": "custom.width", "value": 100 } + ] + }, + { "matcher": { "id": "byName", "options": "PVC" }, "properties": [{ "id": "custom.minWidth", "value": 220 }] }, + { "matcher": { "id": "byName", "options": "Volume" }, "properties": [{ "id": "custom.minWidth", "value": 280 }] } + ] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "sortBy": [{ "desc": true, "displayName": "Usage" }], + "footer": { "show": false } + }, + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": ["namespace", "persistentvolumeclaim", "storageclass", "volumename", "Value"] + } + } + }, + { "id": "merge", "options": {} }, + { + "id": "organize", + "options": { + "renameByName": { + "namespace": "Namespace", + 
"persistentvolumeclaim": "PVC", + "volumename": "Volume", + "storageclass": "Storage Class", + "Value #status": "Status", + "Value #capacity": "Capacity", + "Value #used": "Used", + "Value #pct": "Usage" + }, + "indexByName": { + "namespace": 0, + "persistentvolumeclaim": 1, + "volumename": 2, + "storageclass": 3, + "Value #status": 4, + "Value #capacity": 5, + "Value #used": 6, + "Value #pct": 7 + }, + "excludeByName": {} + } + } + ], + "targets": [ + { + "refId": "status", + "expr": "((kube_persistentvolumeclaim_info{storageclass=~\"$storageclass\"} * 0 + 1) * on(namespace,persistentvolumeclaim) group_left() clamp_max(count(kubelet_volume_stats_capacity_bytes) by (namespace,persistentvolumeclaim), 1)) or ((kube_persistentvolumeclaim_info{storageclass=~\"$storageclass\"} unless on(namespace,persistentvolumeclaim) kubelet_volume_stats_capacity_bytes) * 0)", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "refId": "capacity", + "expr": "kubelet_volume_stats_capacity_bytes * on(namespace,persistentvolumeclaim) group_left(volumename,storageclass) kube_persistentvolumeclaim_info{storageclass=~\"$storageclass\"}", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "refId": "used", + "expr": "kubelet_volume_stats_used_bytes * on(namespace,persistentvolumeclaim) group_left(volumename,storageclass) kube_persistentvolumeclaim_info{storageclass=~\"$storageclass\"}", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "refId": "pct", + "expr": "(kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on(namespace,persistentvolumeclaim) group_left(volumename,storageclass) kube_persistentvolumeclaim_info{storageclass=~\"$storageclass\"} * 100", + "instant": true, + "format": "table", + "legendFormat": "" + } + ] + } + ] +} diff --git a/charts/csi-hyperstack/templates/daemonset-node.yaml b/charts/csi-hyperstack/templates/daemonset-node.yaml index 8a7a400..d896601 100644 --- 
a/charts/csi-hyperstack/templates/daemonset-node.yaml +++ b/charts/csi-hyperstack/templates/daemonset-node.yaml @@ -13,6 +13,7 @@ spec: metadata: labels: {{- include "csi-hyperstack.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: node spec: {{- if .Values.imagePullSecrets }} imagePullSecrets: @@ -58,6 +59,10 @@ spec: # timeoutSeconds: 5 # failureThreshold: 3 # successThreshold: 1 + ports: + - containerPort: 8080 + name: metrics + protocol: TCP env: - name: DRIVER_NAME value: "hyperstack.csi.nexgencloud.com" diff --git a/charts/csi-hyperstack/templates/dashboard.yaml b/charts/csi-hyperstack/templates/dashboard.yaml new file mode 100644 index 0000000..1fea8c3 --- /dev/null +++ b/charts/csi-hyperstack/templates/dashboard.yaml @@ -0,0 +1,14 @@ +{{- if .Values.metrics.dashboards.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "csi-hyperstack.fullname" . }}-dashboard + namespace: {{ .Values.metrics.dashboards.namespace | default .Release.Namespace }} + labels: + {{- include "csi-hyperstack.labels" . | nindent 4 }} + {{- with .Values.metrics.dashboards.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +data: + csi-hyperstack.json: {{ .Files.Get "files/dashboards/csi-hyperstack.json" | toJson }} +{{- end }} diff --git a/charts/csi-hyperstack/templates/deployment-controller.yaml b/charts/csi-hyperstack/templates/deployment-controller.yaml index d3c161c..efd2bcd 100644 --- a/charts/csi-hyperstack/templates/deployment-controller.yaml +++ b/charts/csi-hyperstack/templates/deployment-controller.yaml @@ -14,6 +14,7 @@ spec: metadata: labels: {{- include "csi-hyperstack.selectorLabels" . 
| nindent 8 }} + app.kubernetes.io/component: controller spec: {{- if .Values.imagePullSecrets }} imagePullSecrets: @@ -72,7 +73,7 @@ spec: runAsGroup: 0 ports: - containerPort: 8080 - name: auth + name: metrics protocol: TCP volumeMounts: - mountPath: /csi diff --git a/charts/csi-hyperstack/templates/servicemonitor.yaml b/charts/csi-hyperstack/templates/servicemonitor.yaml new file mode 100644 index 0000000..3542bd0 --- /dev/null +++ b/charts/csi-hyperstack/templates/servicemonitor.yaml @@ -0,0 +1,66 @@ +{{- if .Values.metrics.serviceMonitor.enabled }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "csi-hyperstack.fullname" . }}-controller-metrics + namespace: {{ .Release.Namespace }} + labels: + {{- include "csi-hyperstack.labels" . | nindent 4 }} + app.kubernetes.io/component: controller +spec: + selector: + {{- include "csi-hyperstack.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: controller + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "csi-hyperstack.fullname" . }}-node-metrics + namespace: {{ .Release.Namespace }} + labels: + {{- include "csi-hyperstack.labels" . | nindent 4 }} + app.kubernetes.io/component: node +spec: + clusterIP: None + selector: + {{- include "csi-hyperstack.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: node + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "csi-hyperstack.fullname" . }} + namespace: {{ .Values.metrics.serviceMonitor.namespace | default .Release.Namespace }} + labels: + {{- include "csi-hyperstack.labels" . | nindent 4 }} + {{- with .Values.metrics.serviceMonitor.labels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + selector: + matchLabels: + {{- include "csi-hyperstack.selectorLabels" . | nindent 6 }} + endpoints: + - port: metrics + path: /metrics + {{- with .Values.metrics.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.metrics.serviceMonitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} +{{- end }} diff --git a/charts/csi-hyperstack/values.yaml b/charts/csi-hyperstack/values.yaml index e1834f1..6f1838e 100644 --- a/charts/csi-hyperstack/values.yaml +++ b/charts/csi-hyperstack/values.yaml @@ -24,3 +24,17 @@ storageClass: name: "csi-hyperstack" volumeBindingMode: "Immediate" reclaimPolicy: "Delete" + +metrics: + serviceMonitor: + enabled: false + namespace: "" + interval: "" + scrapeTimeout: "" + labels: {} + + dashboards: + enabled: false + namespace: "" + labels: + grafana_dashboard: "1" diff --git a/pkg/driver/controllerserver.go b/pkg/driver/controllerserver.go index e2eeea8..cb4e5c5 100644 --- a/pkg/driver/controllerserver.go +++ b/pkg/driver/controllerserver.go @@ -10,6 +10,7 @@ import ( "golang.org/x/net/context" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "k8s.io/csi-hyperstack/pkg/metrics" util "k8s.io/csi-hyperstack/pkg/utils" kubernetes "k8s.io/csi-hyperstack/pkg/utils/kubernetes" "k8s.io/csi-hyperstack/pkg/utils/metadata" @@ -167,7 +168,9 @@ func (cs *controllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol return &csi.DeleteVolumeResponse{}, nil } -func (cs *controllerServer) ControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (*csi.ControllerPublishVolumeResponse, error) { +func (cs *controllerServer) ControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (resp *csi.ControllerPublishVolumeResponse, err error) { + mc := metrics.NewMetricContext("volume", "attach") + defer func() { mc.ObserveCSIOperation(err) }() 
klog.Infof("\n==============ControllerPublishVolume: called================\n") klog.Infof("ControllerPublishVolume: called with args %+v", protosanitizer.StripSecrets(*req)) @@ -284,7 +287,9 @@ func (cs *controllerServer) ControllerPublishVolume(ctx context.Context, req *cs "Volume in unexpected state: %s (expected 'available' or 'in-use')", *getVolume.Status) } -func (cs *controllerServer) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (*csi.ControllerUnpublishVolumeResponse, error) { +func (cs *controllerServer) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (resp *csi.ControllerUnpublishVolumeResponse, err error) { + mc := metrics.NewMetricContext("volume", "detach") + defer func() { mc.ObserveCSIOperation(err) }() klog.Infof("ControllerUnpublishVolume: called with args %+v", protosanitizer.StripSecrets(*req)) virtualMachineId := req.NodeId vmId, err := strconv.Atoi(virtualMachineId) diff --git a/pkg/driver/nodeserver.go b/pkg/driver/nodeserver.go index 0494857..81a1c52 100644 --- a/pkg/driver/nodeserver.go +++ b/pkg/driver/nodeserver.go @@ -17,6 +17,7 @@ import ( "k8s.io/klog/v2" // cpoerrors "k8s.io/cloud-provider-openstack/pkg/util/errors" + "k8s.io/csi-hyperstack/pkg/metrics" kubernetes "k8s.io/csi-hyperstack/pkg/utils/kubernetes" "k8s.io/csi-hyperstack/pkg/utils/metadata" "k8s.io/csi-hyperstack/pkg/utils/mount" @@ -33,7 +34,9 @@ type nodeServer struct { csi.UnimplementedNodeServer } -func (ns *nodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error) { +func (ns *nodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (resp *csi.NodeStageVolumeResponse, err error) { + mc := metrics.NewMetricContext("volume", "node_stage") + defer func() { mc.ObserveCSIOperation(err) }() klog.Infof("\n==============NodeStageVolume: called================\n") klog.Infof("NodeStageVolume: called with args 
%+v", protosanitizer.StripSecrets(*req)) devicename := req.PublishContext[volNameKeyFromControllerPublishVolume] @@ -41,7 +44,7 @@ func (ns *nodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol return nil, status.Error(codes.InvalidArgument, "Device name not found in publish context. Please wait for volume to be attached.") } klog.Infof("NodeStageVolume: devicename from publish context: %s", devicename) - err := formateAndMakeFS(devicename, "ext4") + err = formateAndMakeFS(devicename, "ext4") if err != nil { return nil, err } @@ -165,7 +168,9 @@ func mountDevice(source string, target string, fsType string, options []string) return nil } -func (ns *nodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error) { +func (ns *nodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (resp *csi.NodeUnstageVolumeResponse, err error) { + mc := metrics.NewMetricContext("volume", "node_unstage") + defer func() { mc.ObserveCSIOperation(err) }() klog.Infof("==============NodeUnstageVolume: called================\n") klog.Infof("NodeUnstageVolume: called with args %+v", protosanitizer.StripSecrets(*req)) @@ -179,7 +184,7 @@ func (ns *nodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag return nil, status.Error(codes.InvalidArgument, "NodeUnstageVolume Staging Target Path must be provided") } - err := ns.mount.UnmountPath(stagingTargetPath) + err = ns.mount.UnmountPath(stagingTargetPath) if err != nil { return nil, status.Errorf(codes.Internal, "Unmount of targetPath %s failed with error %v", stagingTargetPath, err) } @@ -187,7 +192,9 @@ func (ns *nodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag return &csi.NodeUnstageVolumeResponse{}, nil } -func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) { +func (ns *nodeServer) NodePublishVolume(ctx 
context.Context, req *csi.NodePublishVolumeRequest) (resp *csi.NodePublishVolumeResponse, err error) { + mc := metrics.NewMetricContext("volume", "node_publish") + defer func() { mc.ObserveCSIOperation(err) }() klog.Infof("==============NodePublishVolume: called================\n") klog.Infof("NodePublishVolume: called with args %+v", protosanitizer.StripSecrets(*req)) @@ -204,15 +211,20 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis source := req.StagingTargetPath target := req.TargetPath - err := mountDevice(source, target, fsType, options) + err = mountDevice(source, target, fsType, options) if err != nil { return nil, status.Error(codes.Internal, fmt.Sprintf("Error %s, mounting the volume from staging dir to target dir", err.Error())) } + nodeID, _ := ns.metadata.GetHyperstackVMId() + metrics.VolumeAttachmentsGauge.WithLabelValues(nodeID).Inc() + return &csi.NodePublishVolumeResponse{}, nil } -func (ns *nodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) { +func (ns *nodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (resp *csi.NodeUnpublishVolumeResponse, err error) { + mc := metrics.NewMetricContext("volume", "node_unpublish") + defer func() { mc.ObserveCSIOperation(err) }() klog.Infof("NodeUnPublishVolume: called with args %+v", protosanitizer.StripSecrets(*req)) volumeID := req.GetVolumeId() @@ -224,10 +236,13 @@ func (ns *nodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu return nil, status.Error(codes.InvalidArgument, "[NodeUnpublishVolume] volumeID must be provided") } - if err := ns.mount.UnmountPath(targetPath); err != nil { + if err = ns.mount.UnmountPath(targetPath); err != nil { return nil, status.Errorf(codes.Internal, "Unmount of targetpath %s failed with error %v", targetPath, err) } + nodeID, _ := ns.metadata.GetHyperstackVMId() + 
metrics.VolumeAttachmentsGauge.WithLabelValues(nodeID).Dec() + return &csi.NodeUnpublishVolumeResponse{}, nil } @@ -259,7 +274,37 @@ func (ns *nodeServer) NodeGetCapabilities(ctx context.Context, req *csi.NodeGetC func (ns *nodeServer) NodeGetVolumeStats(_ context.Context, req *csi.NodeGetVolumeStatsRequest) (*csi.NodeGetVolumeStatsResponse, error) { klog.Infof("NodeGetVolumeStats: called with args %+v", protosanitizer.StripSecrets(*req)) - return &csi.NodeGetVolumeStatsResponse{}, nil + + if req.VolumePath == "" { + return nil, status.Error(codes.InvalidArgument, "NodeGetVolumeStats: volume path must be provided") + } + + stats, err := ns.mount.GetDeviceStats(req.VolumePath) + if err != nil { + if os.IsNotExist(err) { + return nil, status.Errorf(codes.NotFound, "NodeGetVolumeStats: volume path %s not found: %v", req.VolumePath, err) + } + return nil, status.Errorf(codes.Internal, "NodeGetVolumeStats: failed to get stats for %s: %v", req.VolumePath, err) + } + + usage := []*csi.VolumeUsage{ + { + Unit: csi.VolumeUsage_BYTES, + Available: stats.AvailableBytes, + Total: stats.TotalBytes, + Used: stats.UsedBytes, + }, + } + if !stats.Block { + usage = append(usage, &csi.VolumeUsage{ + Unit: csi.VolumeUsage_INODES, + Available: stats.AvailableInodes, + Total: stats.TotalInodes, + Used: stats.UsedInodes, + }) + } + + return &csi.NodeGetVolumeStatsResponse{Usage: usage}, nil } func (ns *nodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpandVolumeRequest) (*csi.NodeExpandVolumeResponse, error) { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 766503f..eec7079 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -45,6 +45,7 @@ func (mc *MetricContext) Observe(om *OpenstackMetrics, err error) error { func RegisterMetrics(component string) { doRegisterAPIMetrics() + doRegisterCSIMetrics() if component == "occm" { doRegisterOccmMetrics() } diff --git a/pkg/metrics/metrics_csi.go b/pkg/metrics/metrics_csi.go new file mode 100644 
index 0000000..50fcafa
--- /dev/null
+++ b/pkg/metrics/metrics_csi.go
@@ -0,0 +1,89 @@
+package metrics
+
+import (
+	"sync"
+
+	"k8s.io/component-base/metrics"
+	"k8s.io/component-base/metrics/legacyregistry"
+)
+
+// csiOperationLabel is the only label name carried by the per-operation
+// collectors in CSIOperationMetrics.
+const csiOperationLabel = "operation"
+
+// CSIOperationMetrics groups the collectors keyed by CSI operation: a
+// duration histogram, a counter of operations and a counter of errors.
+var CSIOperationMetrics = newCSIOperationMetrics()
+
+// VolumeAttachmentsGauge is a gauge, labelled by node_id, of the volumes
+// currently published by this node plugin.
+//
+// NOTE(review): the Prometheus naming guidance keeps the _total suffix for
+// counters; the name is left as-is because the bundled dashboard queries it.
+var VolumeAttachmentsGauge = metrics.NewGaugeVec(
+	&metrics.GaugeOpts{
+		Name: "csi_volume_attachments_total",
+		Help: "Number of volumes currently published (bind-mounted) to pods by this CSI node plugin",
+	},
+	[]string{"node_id"},
+)
+
+// registerCSIMetrics guards registration so the collectors are handed to the
+// legacy registry at most once per process.
+var registerCSIMetrics sync.Once
+
+// newCSIOperationMetrics constructs the three per-operation collectors and
+// wraps them in an OpenstackMetrics value.
+func newCSIOperationMetrics() *OpenstackMetrics {
+	latency := metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Name:    "csi_operation_duration_seconds",
+			Help:    "Latency of a CSI driver operation",
+			Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300, 600, 1200},
+		},
+		[]string{csiOperationLabel},
+	)
+	calls := metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Name: "csi_operations_total",
+			Help: "Total number of CSI driver operations",
+		},
+		[]string{csiOperationLabel},
+	)
+	failures := metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Name: "csi_operation_errors_total",
+			Help: "Total number of errors for a CSI driver operation",
+		},
+		[]string{csiOperationLabel},
+	)
+
+	return &OpenstackMetrics{
+		Duration: latency,
+		Total:    calls,
+		Errors:   failures,
+	}
+}
+
+// ObserveCSIOperation forwards err to Observe using CSIOperationMetrics as
+// the collector set and returns whatever Observe returns.
+func (mc *MetricContext) ObserveCSIOperation(err error) error {
+	return mc.Observe(CSIOperationMetrics, err)
+}
+
+// doRegisterCSIMetrics registers the CSI collectors with the legacy registry.
+// Repeated calls are no-ops thanks to registerCSIMetrics.
+func doRegisterCSIMetrics() {
+	registerCSIMetrics.Do(registerCSICollectors)
+}
+
+// registerCSICollectors performs the actual registration; MustRegister is
+// used, so a registration failure panics.
+func registerCSICollectors() {
+	legacyregistry.MustRegister(
+		CSIOperationMetrics.Duration,
+		CSIOperationMetrics.Total,
+		CSIOperationMetrics.Errors,
+		VolumeAttachmentsGauge,
+	)
+}