The Value and Modernization Challenges of HPA/VPA Elastic Scaling

Kubernetes' Horizontal Pod Autoscaler (HPA) and Vertical Pod Autoscaler (VPA) are core components of cloud-native elastic computing, and the quality of their configuration directly affects application availability and resource cost. Traditional autoscaling commonly suffers from delayed metric collection, lagging scaling decisions, low resource utilization, weak cost control, and inaccurate prediction. Modern autoscaling needs intelligence at the algorithm level, optimization at the cost level, and automation at the operations level, forming a standardized elastic-computing system. Enterprise-grade autoscaling must address multi-dimensional metric fusion, workload prediction, cost-benefit balancing, performance assurance, and exception handling. With intelligent metric collection, accurate prediction algorithms, flexible scaling policies, and a complete monitoring system, application resources can be optimized dynamically and costs controlled effectively, giving cloud-native applications a reliable elastic foundation.

Core Architecture Design and Metric Collection

Multi-Dimensional Metric Collection Architecture

Build a unified, multi-dimensional metric collection system that fuses business and resource metrics:

```yaml
# metrics-architecture.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: metrics-collector-config
  namespace: monitoring
data:
  config.yaml: |
    collectors:
      # Resource metrics collector
      - name: resource-metrics
        type: prometheus
        config:
          endpoint: http://prometheus:9090
          query_interval: 15s
        metrics:
          - name: cpu_utilization
            query: |
              sum(rate(container_cpu_usage_seconds_total{
                container!="",
                pod!="",
                namespace=~"{{ .Values.targetNamespaces }}"
              }[2m])) by (namespace, pod)
              /
              sum(kube_pod_container_resource_requests{
                resource="cpu",
                namespace=~"{{ .Values.targetNamespaces }}"
              }) by (namespace, pod) * 100
          - name: memory_utilization
            query: |
              sum(container_memory_usage_bytes{
                container!="",
                pod!="",
                namespace=~"{{ .Values.targetNamespaces }}"
              }) by (namespace, pod)
              /
              sum(kube_pod_container_resource_requests{
                resource="memory",
                namespace=~"{{ .Values.targetNamespaces }}"
              }) by (namespace, pod) * 100
          - name: network_io_rate
            query: |
              sum(rate(container_network_receive_bytes_total{
                namespace=~"{{ .Values.targetNamespaces }}"
              }[2m])) by (namespace, pod)
          - name: disk_io_rate
            query: |
              sum(rate(container_fs_reads_bytes_total{
                namespace=~"{{ .Values.targetNamespaces }}"
              }[2m]) + rate(container_fs_writes_bytes_total{
                namespace=~"{{ .Values.targetNamespaces }}"
              }[2m])) by (namespace, pod)
      # Business metrics collector
      - name: business-metrics
        type: custom
        config:
          endpoint: http://business-metrics-service:8080
          query_interval: 30s
        metrics:
          - name: request_rate
            query: "sum(rate(http_requests_total[2m])) by (namespace, service)"
          - name: error_rate
            query: "sum(rate(http_requests_total{status=~\"5..\"}[2m])) by (namespace, service)"
          - name: response_time_p95
            query: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[2m])) by (namespace, service)"
          - name: queue_depth
            query: "sum(queue_depth) by (namespace, service)"
          - name: active_connections
            query: "sum(active_connections) by (namespace, service)"
      # Predictive metrics collector
      - name: predictive-metrics
        type: ml
        config:
          model_endpoint: http://ml-prediction-service:8080
          query_interval: 60s
        metrics:
          - name: predicted_cpu_load
            horizon: 5m
            confidence: 0.85
          - name: predicted_memory_load
            horizon: 10m
            confidence: 0.80
          - name: predicted_request_rate
            horizon: 15m
            confidence: 0.90

    # Aggregation settings
    aggregation:
      enabled: true
      window: 2m
      functions:
        - avg
        - max
        - min
        - p95
        - p99

    # Cache settings
    cache:
      enabled: true
      ttl: 30s
      max_size: 1000

    # Exporter settings
    exporters:
      - name: prometheus-exporter
        type: prometheus
        endpoint: 0.0.0.0:8080
        path: /metrics
      - name: custom-metrics-api
        type: custom-metrics-api
        endpoint: 0.0.0.0:8081
        path: /apis/custom.metrics.k8s.io/v1beta1
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: enhanced-metrics-collector
  namespace: monitoring
spec:
  replicas: 2
  selector:
    matchLabels:
      app: enhanced-metrics-collector
  template:
    metadata:
      labels:
        app: enhanced-metrics-collector
    spec:
      serviceAccountName: metrics-collector
      containers:
        - name: collector
          image: metrics-collector:v2.1.0
          ports:
            - containerPort: 8080
              name: prometheus
            - containerPort: 8081
              name: custom-metrics
          env:
            - name: TARGET_NAMESPACES
              value: "default,production,staging"
            - name: SCRAPE_INTERVAL
              value: "15s"
            - name: CACHE_TTL
              value: "30s"
          volumeMounts:
            - name: config
              mountPath: /etc/metrics-collector
              readOnly: true
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 1Gi
      volumes:
        - name: config
          configMap:
            name: metrics-collector-config
```
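As a minimal illustration of how such a collector could pull one of the configured expressions, the sketch below queries the Prometheus HTTP API (`GET /api/v1/query`) for the CPU-utilization ratio and prints per-pod percentages. It assumes a reachable Prometheus at the endpoint from the ConfigMap and the `requests` package; the file name and function are illustrative, not part of the collector image.

```python
# cpu_utilization_probe.py - minimal sketch of one collector query (assumptions noted above)
import requests

PROMETHEUS = "http://prometheus:9090"  # endpoint from the ConfigMap above
QUERY = (
    'sum(rate(container_cpu_usage_seconds_total{container!="",pod!=""}[2m])) by (namespace, pod)'
    ' / sum(kube_pod_container_resource_requests{resource="cpu"}) by (namespace, pod) * 100'
)

def fetch_cpu_utilization():
    """Run one instant query and return {(namespace, pod): utilization_percent}."""
    resp = requests.get(f"{PROMETHEUS}/api/v1/query", params={"query": QUERY}, timeout=10)
    resp.raise_for_status()
    result = resp.json()["data"]["result"]
    return {
        (s["metric"].get("namespace", ""), s["metric"].get("pod", "")): float(s["value"][1])
        for s in result
    }

if __name__ == "__main__":
    for (ns, pod), pct in sorted(fetch_cpu_utilization().items()):
        print(f"{ns}/{pod}: {pct:.1f}% of requested CPU")
```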

Advanced HPA Configuration

Implement an intelligent HPA configuration driven by multiple metrics (the per-container CPU and memory targets use the `ContainerResource` metric type, which is what the HPA v2 API requires when a `container` field is specified):

```yaml
# advanced-hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: intelligent-app-hpa
  namespace: production
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: intelligent-app
  # Scaling range
  minReplicas: 2
  maxReplicas: 50
  # Scaling behavior
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 100
          periodSeconds: 15
        - type: Pods
          value: 4
          periodSeconds: 15
      selectPolicy: Max
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Min
  # Multiple metrics
  metrics:
    # CPU utilization
    - type: ContainerResource
      containerResource:
        name: cpu
        container: main-app
        target:
          type: Utilization
          averageUtilization: 70
    # Memory utilization
    - type: ContainerResource
      containerResource:
        name: memory
        container: main-app
        target:
          type: Utilization
          averageUtilization: 80
    # Custom business metric: request rate
    - type: Pods
      pods:
        metric:
          name: http_requests_per_second
        target:
          type: AverageValue
          averageValue: "1000"
    # Custom business metric: response time
    - type: Pods
      pods:
        metric:
          name: http_request_duration_p95
        target:
          type: AverageValue
          averageValue: "500m"   # 500 ms
    # Queue depth
    - type: Object
      object:
        metric:
          name: queue_depth
        describedObject:
          apiVersion: v1
          kind: Service
          name: message-queue-service
        target:
          type: Value
          value: "100"
    # Predictive metric
    - type: External
      external:
        metric:
          name: predicted_cpu_load_5m
          selector:
            matchLabels:
              app: intelligent-app
        target:
          type: Value
          value: "80"
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: cost-optimized-hpa
  namespace: production
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: cost-optimized-app
  minReplicas: 1
  maxReplicas: 20
  # Cost-aware scaling behavior
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 120
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
      selectPolicy: Max
    scaleDown:
      stabilizationWindowSeconds: 600   # 10-minute stabilization window to avoid scale-down thrashing
      policies:
        - type: Percent
          value: 5
          periodSeconds: 300
      selectPolicy: Min
  metrics:
    # Cost-weighted metric
    - type: External
      external:
        metric:
          name: cost_per_request
          selector:
            matchLabels:
              app: cost-optimized-app
        target:
          type: Value
          value: "0.001"   # at most 0.1 cents per request
    # Resource efficiency metric
    - type: External
      external:
        metric:
          name: resource_efficiency_score
        target:
          type: Value
          value: "0.75"   # target resource-efficiency score
```

VPA Recommendation Algorithm Configuration

Implement intelligent vertical-scaling recommendations:

```yaml
# vpa-recommender-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: vpa-recommender-config
  namespace: kube-system
data:
  recommender-config.yaml: |
    # Recommendation algorithm settings
    recommender:
      name: intelligent-recommender
      # Algorithm parameters
      algorithm:
        type: percentile
        cpu:
          percentile: 0.95   # use the 95th percentile
          margin: 0.1        # 10% safety margin
          min: 10m           # minimum CPU request
          max: 4000m         # maximum CPU request
        memory:
          percentile: 0.95
          margin: 0.15       # 15% safety margin
          min: 32Mi          # minimum memory request
          max: 32Gi          # maximum memory request
      # Historical analysis windows
      history:
        cpu:
          window: 24h        # CPU analysis window
          resolution: 5m     # data resolution
        memory:
          window: 7d         # memory analysis window (longer, since memory usage is more stable)
          resolution: 15m
      # Cost optimization parameters
      costOptimization:
        enabled: true
        weight: 0.3          # cost weight (0-1)
        # Resource pricing (adjust per cloud provider)
        pricing:
          cpuPricePerCore: 0.04     # USD per core-hour
          memoryPricePerGB: 0.005   # USD per GB-hour
        # Budget constraints
        budget:
          maxIncrease: 0.5   # maximum resource-increase ratio
          maxDecrease: 0.3   # maximum resource-decrease ratio
      # Prediction settings
      prediction:
        enabled: true
        horizon: 24h         # prediction horizon
        confidence: 0.85     # required confidence
        models:
          - type: linear
            weight: 0.4
          - type: seasonal
            weight: 0.4
            seasonality: daily
          - type: ml
            weight: 0.2
            model: prophet
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vpa-recommender
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vpa-recommender
  template:
    metadata:
      labels:
        app: vpa-recommender
    spec:
      serviceAccountName: vpa-recommender
      containers:
        - name: recommender
          image: registry.k8s.io/autoscaling/vpa-recommender:1.0.0
          args:
            - --recommender-name=intelligent-recommender
            - --v=4
            - --config=/etc/vpa/recommender-config.yaml
            - --checkpoints-timeout=10m
            - --metrics-address=:8942
            - --profile-address=:8943
          volumeMounts:
            - name: config
              mountPath: /etc/vpa
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 500Mi
            limits:
              cpu: 200m
              memory: 2Gi
          ports:
            - name: metrics
              containerPort: 8942
            - name: profile
              containerPort: 8943
      volumes:
        - name: config
          configMap:
            name: vpa-recommender-config
---
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: intelligent-app-vpa
  namespace: production
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: intelligent-app
  # Update policy
  updatePolicy:
    updateMode: "Auto"   # Auto, Recreate, Initial, Off
    minReplicas: 1
  # Resource policy
  resourcePolicy:
    containerPolicies:
      - containerName: main-app
        # Resource bounds
        minAllowed:
          cpu: 10m
          memory: 32Mi
        maxAllowed:
          cpu: 2000m
          memory: 4Gi
        # Controlled resources
        controlledResources:
          - cpu
          - memory
        # Apply recommendations to requests and limits
        controlledValues: RequestsAndLimits
        # Per-container scaling mode
        mode: Auto
  # Recommender to use
  recommenders:
    - name: intelligent-recommender
```
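The percentile-with-margin rule in the recommender config reduces to a few lines: take the 95th-percentile usage over the history window, add the safety margin, and clamp to the configured bounds. A minimal sketch using the CPU parameters above; the usage samples are synthetic.

```python
# vpa_recommendation.py - percentile + margin recommendation, mirroring the config above
import numpy as np

def recommend_cpu(samples_millicores, percentile=0.95, margin=0.10,
                  min_m=10, max_m=4000):
    """Return a CPU request recommendation in millicores."""
    base = np.percentile(samples_millicores, percentile * 100)  # p95 of observed usage
    padded = base * (1 + margin)                                # add the safety margin
    return int(np.clip(padded, min_m, max_m))                   # respect the configured bounds

# Synthetic 24h of 5-minute CPU samples (millicores), roughly 200m with noise
rng = np.random.default_rng(0)
samples = rng.normal(200, 40, 288).clip(20, None)
print(f"recommended CPU request: {recommend_cpu(samples)}m")
```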

Intelligent Prediction Algorithms and Policies

Machine Learning Prediction Model

Implement an ML-based load prediction algorithm:

```python
# ml_predictor.py
import warnings

import joblib
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')


class IntelligentLoadPredictor:

    def __init__(self, config):
        self.config = config
        self.models = {}
        self.scalers = {}
        self.feature_importance = {}
        self.feature_columns = []

        # Feature engineering configuration
        self.feature_config = {
            'temporal_features': [
                'hour_of_day', 'day_of_week', 'day_of_month',
                'month', 'is_weekend', 'is_business_hour'
            ],
            'lag_features': [1, 5, 15, 30, 60],  # minute-level lags
            'rolling_features': {
                'window': [5, 15, 30],
                'functions': ['mean', 'std', 'max', 'min']
            },
            'external_features': [
                'cpu_usage', 'memory_usage', 'network_io',
                'disk_io', 'active_connections', 'queue_depth'
            ]
        }

    def prepare_features(self, df):
        """Feature engineering."""
        df = df.copy()
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.sort_values('timestamp')

        # Temporal features
        df['hour_of_day'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['day_of_month'] = df['timestamp'].dt.day
        df['month'] = df['timestamp'].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        df['is_business_hour'] = ((df['hour_of_day'] >= 9) &
                                  (df['hour_of_day'] <= 18)).astype(int)

        # Lag features
        for lag in self.feature_config['lag_features']:
            df[f'cpu_lag_{lag}'] = df['cpu_usage'].shift(lag)
            df[f'memory_lag_{lag}'] = df['memory_usage'].shift(lag)
            df[f'requests_lag_{lag}'] = df['request_rate'].shift(lag)

        # Rolling-window features
        for window in self.feature_config['rolling_features']['window']:
            for func in self.feature_config['rolling_features']['functions']:
                df[f'cpu_{func}_{window}'] = df['cpu_usage'].rolling(
                    window=window, min_periods=1).agg(func)
                df[f'memory_{func}_{window}'] = df['memory_usage'].rolling(
                    window=window, min_periods=1).agg(func)

        # Fill missing values
        df = df.ffill().bfill()
        return df

    def train_models(self, training_data):
        """Train the ensemble of prediction models."""
        print("Training intelligent load prediction models...")

        # Data preparation
        df = self.prepare_features(training_data)

        # Feature selection
        feature_cols = [col for col in df.columns
                        if col not in ['timestamp', 'cpu_usage_future',
                                       'memory_usage_future', 'request_rate_future']]
        self.feature_columns = feature_cols  # remember the training feature layout

        print("Training the CPU utilization model...")
        self.train_cpu_model(df, feature_cols)

        print("Training the memory utilization model...")
        self.train_memory_model(df, feature_cols)

        print("Training the request-rate model...")
        self.train_requests_model(df, feature_cols)

        print("Training Prophet time-series models...")
        self.train_prophet_models(df)

        print("Model training finished.")

    def train_cpu_model(self, df, feature_cols):
        """Train the CPU utilization model."""
        X = df[feature_cols]
        y = df['cpu_usage_future']

        # Time-ordered train/test split
        train_size = int(0.8 * len(df))
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]

        # Feature scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        self.scalers['cpu'] = scaler

        # Train several candidate models and keep the best
        models = {
            'random_forest': RandomForestRegressor(
                n_estimators=100, max_depth=10, random_state=42),
            'gradient_boosting': GradientBoostingRegressor(
                n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
        }

        best_model = None
        best_score = float('inf')
        for name, model in models.items():
            model.fit(X_train_scaled, y_train)
            predictions = model.predict(X_test_scaled)
            mae = mean_absolute_error(y_test, predictions)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            print(f"{name} - CPU prediction - MAE: {mae:.4f}, RMSE: {rmse:.4f}")
            if mae < best_score:
                best_score = mae
                best_model = model
        self.models['cpu'] = best_model

        # Feature importance analysis
        if hasattr(best_model, 'feature_importances_'):
            importance = pd.DataFrame({
                'feature': feature_cols,
                'importance': best_model.feature_importances_
            }).sort_values('importance', ascending=False)
            self.feature_importance['cpu'] = importance
            print("CPU prediction feature importance:")
            print(importance.head(10))

    def train_memory_model(self, df, feature_cols):
        """Train the memory utilization model (same procedure as the CPU model)."""
        self._train_simple_model(df, feature_cols, 'memory_usage_future', 'memory')

    def train_requests_model(self, df, feature_cols):
        """Train the request-rate model (same procedure as the CPU model)."""
        self._train_simple_model(df, feature_cols, 'request_rate_future', 'requests')

    def _train_simple_model(self, df, feature_cols, target_col, key):
        """Shared helper: scale features and fit a gradient-boosted regressor."""
        train_size = int(0.8 * len(df))
        X, y = df[feature_cols], df[target_col]
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[:train_size])
        X_test = scaler.transform(X[train_size:])
        self.scalers[key] = scaler

        model = GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
        model.fit(X_train, y[:train_size])
        mae = mean_absolute_error(y[train_size:], model.predict(X_test))
        print(f"gradient_boosting - {key} prediction - MAE: {mae:.4f}")
        self.models[key] = model

    def train_prophet_models(self, df):
        """Train Prophet time-series models."""
        # CPU Prophet model
        cpu_prophet = Prophet(
            daily_seasonality=True,
            weekly_seasonality=True,
            yearly_seasonality=True,
            interval_width=0.85
        )
        cpu_df = df[['timestamp', 'cpu_usage']].rename(
            columns={'timestamp': 'ds', 'cpu_usage': 'y'})
        cpu_prophet.fit(cpu_df)
        self.models['cpu_prophet'] = cpu_prophet

        # Memory Prophet model
        memory_prophet = Prophet(
            daily_seasonality=True,
            weekly_seasonality=True,
            yearly_seasonality=True,
            interval_width=0.85
        )
        memory_df = df[['timestamp', 'memory_usage']].rename(
            columns={'timestamp': 'ds', 'memory_usage': 'y'})
        memory_prophet.fit(memory_df)
        self.models['memory_prophet'] = memory_prophet

    def predict(self, current_metrics, horizon_minutes=5):
        """Combined prediction entry point."""
        # Prepare the input row and align it with the training feature layout
        df = pd.DataFrame([current_metrics])
        df['timestamp'] = pd.Timestamp.now()
        df = self.prepare_features(df)
        features = df.reindex(columns=self.feature_columns, fill_value=0).fillna(0.0)

        predictions = {}

        # CPU prediction
        if 'cpu' in self.models:
            X_scaled = self.scalers['cpu'].transform(features)
            cpu_ml = self.models['cpu'].predict(X_scaled)[0]

            # Prophet prediction
            future = pd.DataFrame({
                'ds': [pd.Timestamp.now() + pd.Timedelta(minutes=horizon_minutes)]
            })
            if 'cpu_prophet' in self.models:
                prophet_forecast = self.models['cpu_prophet'].predict(future)
                cpu_prophet = prophet_forecast['yhat'].iloc[0]
                # Blend the two predictions (weighted average)
                predictions['cpu'] = 0.7 * cpu_ml + 0.3 * cpu_prophet
            else:
                predictions['cpu'] = cpu_ml

        # Memory prediction
        if 'memory' in self.models:
            X_scaled = self.scalers['memory'].transform(features)
            predictions['memory'] = self.models['memory'].predict(X_scaled)[0]

        # Request-rate prediction
        if 'requests' in self.models:
            X_scaled = self.scalers['requests'].transform(features)
            predictions['requests'] = self.models['requests'].predict(X_scaled)[0]

        # Confidence estimate
        confidence = self.calculate_confidence(predictions, features)

        return {
            'predictions': predictions,
            'confidence': confidence,
            'timestamp': pd.Timestamp.now(),
            'horizon_minutes': horizon_minutes
        }

    def calculate_confidence(self, predictions, input_data):
        """Estimate prediction confidence from input-data quality."""
        # Start from the historical base accuracy and adjust for data quality
        base_confidence = 0.85

        # Penalize very small inputs
        if len(input_data) < 10:
            base_confidence *= 0.8

        # Penalize missing features
        null_percentage = input_data.isnull().sum().sum() / input_data.size
        confidence = base_confidence * (1 - null_percentage)
        return min(confidence, 0.95)  # cap at 95% confidence

    def save_models(self, path):
        """Persist models to disk."""
        model_data = {
            'models': self.models,
            'scalers': self.scalers,
            'feature_importance': self.feature_importance,
            'feature_columns': self.feature_columns,
            'config': self.config
        }
        joblib.dump(model_data, path)
        print(f"Models saved to: {path}")

    def load_models(self, path):
        """Load models from disk."""
        model_data = joblib.load(path)
        self.models = model_data['models']
        self.scalers = model_data['scalers']
        self.feature_importance = model_data.get('feature_importance', {})
        self.feature_columns = model_data.get('feature_columns', [])
        self.config = model_data.get('config', {})
        print(f"Models loaded from {path}")


# Usage example
if __name__ == "__main__":
    config = {
        'prediction_horizon': 300,       # 5 minutes
        'confidence_threshold': 0.8,
        'model_update_interval': 3600,   # 1 hour
        'feature_engineering': True
    }

    predictor = IntelligentLoadPredictor(config)

    # Simulated training data
    np.random.seed(42)
    n_samples = 10000
    training_data = pd.DataFrame({
        'timestamp': pd.date_range(start='2024-01-01', periods=n_samples, freq='1min'),
        'cpu_usage': np.random.normal(50, 20, n_samples) +
                     20 * np.sin(2 * np.pi * np.arange(n_samples) / 1440) +   # daily cycle
                     10 * np.sin(2 * np.pi * np.arange(n_samples) / 10080),   # weekly cycle
        'memory_usage': np.random.normal(60, 15, n_samples),
        'request_rate': np.random.poisson(100, n_samples) +
                        50 * np.sin(2 * np.pi * np.arange(n_samples) / 1440),
        'active_connections': np.random.poisson(50, n_samples),
        'queue_depth': np.random.poisson(20, n_samples),
        'network_io': np.random.normal(1000, 300, n_samples),
        'disk_io': np.random.normal(500, 150, n_samples)
    })

    # Keep the data in sensible ranges
    training_data['cpu_usage'] = np.clip(training_data['cpu_usage'], 0, 100)
    training_data['memory_usage'] = np.clip(training_data['memory_usage'], 0, 100)

    # Add target columns (values 5 minutes ahead)
    training_data['cpu_usage_future'] = training_data['cpu_usage'].shift(-5).ffill()
    training_data['memory_usage_future'] = training_data['memory_usage'].shift(-5).ffill()
    training_data['request_rate_future'] = training_data['request_rate'].shift(-5).ffill()

    # Train the models
    predictor.train_models(training_data)

    # Prediction example
    current_metrics = {
        'cpu_usage': 65.0,
        'memory_usage': 70.0,
        'request_rate': 120.0,
        'active_connections': 45.0,
        'queue_depth': 15.0,
        'network_io': 950.0,
        'disk_io': 480.0
    }
    prediction = predictor.predict(current_metrics, horizon_minutes=5)
    print("Prediction results:")
    print(f"CPU utilization:    {prediction['predictions']['cpu']:.2f}%")
    print(f"Memory utilization: {prediction['predictions']['memory']:.2f}%")
    print(f"Request rate:       {prediction['predictions']['requests']:.2f}")
    print(f"Confidence:         {prediction['confidence']:.2f}")

    # Save the models
    predictor.save_models('/models/intelligent_load_predictor.pkl')
```

Intelligent Autoscaling Controller

Implement scaling decisions driven by the prediction results (the predictor client and caching/logging helpers referenced here are assumed to live elsewhere in the same package):

```go
// intelligent_autoscaler.go
package main

import (
	"context"
	"fmt"
	"math"
	"time"

	v2 "k8s.io/api/autoscaling/v2"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// LoadPredictor, LoadPrediction, Metrics, MetricsCache, DecisionLog and the
// queryPrometheus helper are assumed to be defined elsewhere in this package.

type IntelligentAutoscaler struct {
	client          kubernetes.Interface
	predictor       *LoadPredictor
	config          *AutoscalerConfig
	metricsCache    *MetricsCache
	decisionLog     *DecisionLog
	lastScalingTime time.Time
}

type AutoscalerConfig struct {
	PredictionHorizon      time.Duration
	SafetyMargin           float64
	MaxScaleUpRate         float64
	MaxScaleDownRate       float64
	CooldownPeriod         time.Duration
	MinReplicaChange       int32
	CostOptimizationWeight float64
	PerformanceTarget      PerformanceTarget
}

type PerformanceTarget struct {
	CPUUtilization    float64
	MemoryUtilization float64
	RequestRate       float64
	ResponseTime      time.Duration
}

type ScalingDecision struct {
	Action            string // scale_up, scale_down, maintain
	CurrentReplicas   int32
	TargetReplicas    int32
	Reason            string
	Confidence        float64
	CostImpact        float64
	PerformanceImpact float64
	Timestamp         time.Time
}

func NewIntelligentAutoscaler(cfg *AutoscalerConfig) (*IntelligentAutoscaler, error) {
	// Build the Kubernetes client
	restConfig, err := rest.InClusterConfig()
	if err != nil {
		return nil, fmt.Errorf("failed to create in-cluster config: %v", err)
	}
	clientset, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to create kubernetes client: %v", err)
	}

	// Build the load predictor
	predictor, err := NewLoadPredictor("/models/intelligent_load_predictor.pkl")
	if err != nil {
		return nil, fmt.Errorf("failed to create load predictor: %v", err)
	}

	return &IntelligentAutoscaler{
		client:       clientset,
		predictor:    predictor,
		config:       cfg,
		metricsCache: NewMetricsCache(5 * time.Minute),
		decisionLog:  NewDecisionLog(1000),
	}, nil
}

func (ia *IntelligentAutoscaler) MakeScalingDecision(ctx context.Context,
	namespace, deploymentName string) (*ScalingDecision, error) {

	// 1. Fetch the current state
	deployment, err := ia.client.AppsV1().Deployments(namespace).Get(
		ctx, deploymentName, metav1.GetOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to get deployment: %v", err)
	}
	currentReplicas := *deployment.Spec.Replicas

	// 2. Collect current metrics
	currentMetrics, err := ia.collectCurrentMetrics(ctx, namespace, deploymentName)
	if err != nil {
		return nil, fmt.Errorf("failed to collect metrics: %v", err)
	}

	// 3. Predict future load
	prediction, err := ia.predictor.Predict(currentMetrics, ia.config.PredictionHorizon)
	if err != nil {
		return nil, fmt.Errorf("failed to predict load: %v", err)
	}

	// 4. Compute the target replica count
	targetReplicas, err := ia.calculateTargetReplicas(currentReplicas, prediction, currentMetrics)
	if err != nil {
		return nil, fmt.Errorf("failed to calculate target replicas: %v", err)
	}

	// 5. Apply scaling constraints
	targetReplicas = ia.applyScalingConstraints(currentReplicas, targetReplicas)

	// 6. Evaluate cost and performance impact
	costImpact, performanceImpact := ia.evaluateImpact(currentReplicas, targetReplicas, prediction)

	// 7. Assemble the decision
	decision := &ScalingDecision{
		CurrentReplicas:   currentReplicas,
		TargetReplicas:    targetReplicas,
		Confidence:        prediction.Confidence,
		CostImpact:        costImpact,
		PerformanceImpact: performanceImpact,
		Timestamp:         time.Now(),
	}

	if targetReplicas > currentReplicas {
		decision.Action = "scale_up"
		decision.Reason = fmt.Sprintf("Predicted load increase: CPU %.1f%%, Memory %.1f%%",
			prediction.CPU, prediction.Memory)
	} else if targetReplicas < currentReplicas {
		decision.Action = "scale_down"
		decision.Reason = fmt.Sprintf("Predicted load decrease: CPU %.1f%%, Memory %.1f%%",
			prediction.CPU, prediction.Memory)
	} else {
		decision.Action = "maintain"
		decision.Reason = "Predicted load within target range"
	}

	// Record the decision
	ia.decisionLog.Record(decision)
	return decision, nil
}

func (ia *IntelligentAutoscaler) calculateTargetReplicas(current int32,
	prediction *LoadPrediction, currentMetrics *Metrics) (int32, error) {

	// Derive a target replica count from each metric family
	var cpuBasedReplicas, memoryBasedReplicas, requestBasedReplicas int32

	// CPU-driven estimate
	if prediction.CPU > 0 {
		cpuBasedReplicas = int32(math.Ceil(
			float64(current) * (prediction.CPU / ia.config.PerformanceTarget.CPUUtilization)))
	}

	// Memory-driven estimate
	if prediction.Memory > 0 {
		memoryBasedReplicas = int32(math.Ceil(
			float64(current) * (prediction.Memory / ia.config.PerformanceTarget.MemoryUtilization)))
	}

	// Request-rate-driven estimate
	if prediction.RequestRate > 0 && currentMetrics.RequestRate > 0 {
		requestBasedReplicas = int32(math.Ceil(
			float64(current) * (prediction.RequestRate / currentMetrics.RequestRate)))
	}

	// Weighted average of the estimates
	targetReplicas := ia.weightedAverageReplicas(
		[]int32{cpuBasedReplicas, memoryBasedReplicas, requestBasedReplicas},
		[]float64{0.4, 0.3, 0.3})

	// Apply the safety margin
	targetReplicas = int32(math.Ceil(float64(targetReplicas) * (1 + ia.config.SafetyMargin)))
	return targetReplicas, nil
}

func (ia *IntelligentAutoscaler) weightedAverageReplicas(replicas []int32, weights []float64) int32 {
	if len(replicas) != len(weights) {
		panic("replicas and weights must have same length")
	}
	var weightedSum, weightSum float64
	for i, replica := range replicas {
		if replica > 0 { // only count valid estimates
			weightedSum += float64(replica) * weights[i]
			weightSum += weights[i]
		}
	}
	if weightSum == 0 {
		return 0
	}
	return int32(math.Round(weightedSum / weightSum))
}

func (ia *IntelligentAutoscaler) applyScalingConstraints(current, target int32) int32 {
	// Cap the scale-up and scale-down rates
	maxScaleUp := int32(math.Ceil(float64(current) * ia.config.MaxScaleUpRate))
	maxScaleDown := int32(math.Floor(float64(current) * (1 - ia.config.MaxScaleDownRate)))
	if target > maxScaleUp {
		target = maxScaleUp
	}
	if target < maxScaleDown {
		target = maxScaleDown
	}

	// Ignore changes below the minimum step
	if abs(target-current) < ia.config.MinReplicaChange {
		target = current
	}
	return target
}

func abs(v int32) int32 {
	if v < 0 {
		return -v
	}
	return v
}

func (ia *IntelligentAutoscaler) isInCooldownPeriod() bool {
	return time.Since(ia.lastScalingTime) < ia.config.CooldownPeriod
}

func (ia *IntelligentAutoscaler) evaluateImpact(current, target int32,
	prediction *LoadPrediction) (costImpact, performanceImpact float64) {

	// Cost impact
	replicaChange := float64(target - current)
	baseCostPerReplica := 0.04 // CPU cost per replica-hour
	costImpact = replicaChange * baseCostPerReplica * ia.config.CostOptimizationWeight

	// Performance impact
	if target > current {
		// Scaling up: expected performance gain
		performanceImpact = math.Min(1.0, replicaChange/float64(current)) *
			(1.0 - ia.config.CostOptimizationWeight)
	} else if target < current {
		// Scaling down: assess the performance risk
		predictedLoad := math.Max(prediction.CPU, prediction.Memory)
		if predictedLoad > 80 {
			performanceImpact = -0.5 // high risk
		} else if predictedLoad > 60 {
			performanceImpact = -0.2 // medium risk
		} else {
			performanceImpact = 0.1 // low risk, likely cost savings
		}
	}
	return costImpact, performanceImpact
}

func (ia *IntelligentAutoscaler) ExecuteScaling(ctx context.Context,
	namespace, deploymentName string, decision *ScalingDecision) error {

	// Respect the cooldown period
	if ia.isInCooldownPeriod() {
		return fmt.Errorf("in cooldown period, scaling not allowed")
	}
	if decision.Action == "maintain" {
		return nil // nothing to do
	}

	// Temporarily disable the HPA's own scale-up so it does not fight this controller
	hpa, err := ia.client.AutoscalingV2().HorizontalPodAutoscalers(namespace).Get(
		ctx, deploymentName+"-hpa", metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("failed to get HPA: %v", err)
	}
	stabilization := int32(300)
	disabled := v2.DisabledPolicySelect
	hpa.Spec.Behavior = &v2.HorizontalPodAutoscalerBehavior{
		ScaleUp: &v2.HPAScalingRules{
			StabilizationWindowSeconds: &stabilization,
			SelectPolicy:               &disabled, // disable automatic scale-up
		},
	}
	_, err = ia.client.AutoscalingV2().HorizontalPodAutoscalers(namespace).Update(
		ctx, hpa, metav1.UpdateOptions{})
	if err != nil {
		return fmt.Errorf("failed to update HPA: %v", err)
	}

	// Update the Deployment replica count
	deployment, err := ia.client.AppsV1().Deployments(namespace).Get(
		ctx, deploymentName, metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("failed to get deployment: %v", err)
	}
	deployment.Spec.Replicas = &decision.TargetReplicas
	_, err = ia.client.AppsV1().Deployments(namespace).Update(
		ctx, deployment, metav1.UpdateOptions{})
	if err != nil {
		return fmt.Errorf("failed to update deployment: %v", err)
	}

	// Record when scaling last ran
	ia.lastScalingTime = time.Now()
	return nil
}

// Metric collection
func (ia *IntelligentAutoscaler) collectCurrentMetrics(ctx context.Context,
	namespace, deploymentName string) (*Metrics, error) {

	// Serve from the cache when fresh enough
	cached := ia.metricsCache.Get(namespace, deploymentName)
	if cached != nil && time.Since(cached.Timestamp) < 30*time.Second {
		return cached, nil
	}

	metrics := &Metrics{
		Timestamp: time.Now(),
	}

	// CPU usage
	cpuQuery := fmt.Sprintf(`
		sum(rate(container_cpu_usage_seconds_total{
			namespace="%s",
			pod=~"%s-.*",
			container!=""
		}[2m])) by (namespace)
	`, namespace, deploymentName)
	cpuResult, err := ia.queryPrometheus(cpuQuery)
	if err == nil && len(cpuResult) > 0 {
		metrics.CPUUsage = cpuResult[0].Value
	}

	// Memory usage
	memoryQuery := fmt.Sprintf(`
		sum(container_memory_usage_bytes{
			namespace="%s",
			pod=~"%s-.*",
			container!=""
		}) by (namespace)
	`, namespace, deploymentName)
	memoryResult, err := ia.queryPrometheus(memoryQuery)
	if err == nil && len(memoryResult) > 0 {
		metrics.MemoryUsage = memoryResult[0].Value
	}

	// Request rate
	requestQuery := fmt.Sprintf(`
		sum(rate(http_requests_total{
			namespace="%s",
			service=~"%s.*"
		}[2m])) by (namespace)
	`, namespace, deploymentName)
	requestResult, err := ia.queryPrometheus(requestQuery)
	if err == nil && len(requestResult) > 0 {
		metrics.RequestRate = requestResult[0].Value
	}

	// Cache the result
	ia.metricsCache.Set(namespace, deploymentName, metrics)
	return metrics, nil
}
```
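Distilled from the controller above, the replica calculation is a weighted blend of the CPU-, memory-, and request-driven proposals, padded by the safety margin and then clipped by the scale-rate constraints. The Python rendering below mirrors that decision path; the inputs, weights, and rate limits are illustrative values, not output from a real deployment.

```python
# decision_math.py - replica blending and constraints, mirroring the controller above (illustrative)
import math

def blended_target(current, cpu_pred, mem_pred, req_pred, req_now,
                   cpu_target=70.0, mem_target=80.0,
                   weights=(0.4, 0.3, 0.3), safety_margin=0.1):
    """Blend per-metric replica proposals into one target, then pad with the safety margin."""
    proposals = [
        math.ceil(current * cpu_pred / cpu_target),
        math.ceil(current * mem_pred / mem_target),
        math.ceil(current * req_pred / req_now) if req_now else 0,
    ]
    num = sum(p * w for p, w in zip(proposals, weights) if p > 0)
    den = sum(w for p, w in zip(proposals, weights) if p > 0)
    target = round(num / den) if den else current
    return math.ceil(target * (1 + safety_margin))

def apply_constraints(current, target, max_up_rate=1.5, max_down_rate=0.3, min_change=1):
    target = min(target, math.ceil(current * max_up_rate))           # grow to at most 1.5x per decision
    target = max(target, math.floor(current * (1 - max_down_rate)))  # shrink by at most 30%
    return current if abs(target - current) < min_change else target

cur = 10
t = blended_target(cur, cpu_pred=85, mem_pred=75, req_pred=1500, req_now=1200)
print(cur, "->", apply_constraints(cur, t))
```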

Cost Optimization and Performance Tuning

Cost-Aware Scaling Policy

Implement scaling decisions based on cost-benefit analysis:

```yaml
# cost-aware-autoscaling.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: cost-optimizer-config
  namespace: kube-system
data:
  cost-config.yaml: |
    # Cloud provider pricing
    pricing:
      aws:
        on_demand:
          m5.large: 0.096      # USD per hour
          m5.xlarge: 0.192
          m5.2xlarge: 0.384
          c5.large: 0.085
          c5.xlarge: 0.17
          r5.large: 0.126
          r5.xlarge: 0.252
        spot:
          discount: 0.7              # 70% discount
          interruption_rate: 0.05    # 5% interruption rate
        reserved:
          discount: 0.4              # 60% discount (1-year term)
          upfront: 0.3               # 30% upfront
      gcp:
        preemptible:
          discount: 0.8
        committed:
          discount: 0.57   # 1-year committed-use discount

    # Cost optimization strategies
    strategies:
      spot_instances:
        enabled: true
        max_percentage: 0.8    # at most 80% Spot instances
        fallback: on_demand    # fall back to on-demand when Spot is unavailable
        # Fault-tolerance settings
        interruption_handling:
          grace_period: 30s    # interruption-notice grace period
          drain_timeout: 120s  # pod eviction timeout
      rightsizing:
        enabled: true
        aggressiveness: medium   # conservative, medium, aggressive
        # Adjustment thresholds
        thresholds:
          cpu: 0.2        # consider downsizing below 20% CPU utilization
          memory: 0.3     # consider downsizing below 30% memory utilization
          duration: 24h   # how long the condition must persist
      binpacking:
        enabled: true
        strategy: dense   # dense, balanced, spread
        # Bin-packing weights
        weights:
          resource_utilization: 0.6
          cost_efficiency: 0.3
          availability: 0.1
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: cost-optimized-autoscaler
  namespace: production
  annotations:
    cost.optimizer/enabled: "true"
    cost.optimizer/spot-percentage: "80"
    cost.optimizer/max-cost-increase: "50"   # allow at most 50% cost growth
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: cost-optimized-app
  minReplicas: 2
  maxReplicas: 100
  # Cost-aware scaling behavior
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 30
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Min     # conservative scale-up, cost comes first
    scaleDown:
      stabilizationWindowSeconds: 900   # 15-minute stabilization window
      policies:
        - type: Percent
          value: 5
          periodSeconds: 300
      selectPolicy: Max     # scale down aggressively to save cost
  metrics:
    # Cost efficiency metric
    - type: External
      external:
        metric:
          name: cost_efficiency_score
          selector:
            matchLabels:
              app: cost-optimized-app
        target:
          type: Value
          value: "0.8"   # cost-efficiency score target
    # Budget utilization metric
    - type: External
      external:
        metric:
          name: budget_utilization
          selector:
            matchLabels:
              app: cost-optimized-app
        target:
          type: Value
          value: "0.9"   # keep budget utilization below 90%
    # Performance metric (protect service quality)
    - type: Pods
      pods:
        metric:
          name: service_level_indicator
        target:
          type: AverageValue
          averageValue: "0.95"   # SLI target of 95%
---
apiVersion: v1
kind: Service
metadata:
  name: cost-optimizer-service
  namespace: kube-system
  labels:
    app: cost-optimizer
spec:
  selector:
    app: cost-optimizer
  ports:
    - port: 8080
      targetPort: 8080
      name: http
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cost-optimizer
  namespace: kube-system
spec:
  replicas: 2
  selector:
    matchLabels:
      app: cost-optimizer
  template:
    metadata:
      labels:
        app: cost-optimizer
    spec:
      serviceAccountName: cost-optimizer
      containers:
        - name: optimizer
          image: cost-optimizer:v1.2.0
          ports:
            - containerPort: 8080
              name: http
          env:
            - name: CLOUD_PROVIDER
              value: "aws"
            - name: COST_OPTIMIZATION_ENABLED
              value: "true"
            - name: SPOT_INSTANCE_RATIO
              value: "0.8"
            - name: BUDGET_ALERT_THRESHOLD
              value: "0.9"
          volumeMounts:
            - name: config
              mountPath: /etc/cost-optimizer
              readOnly: true
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 1Gi
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: cost-optimizer-config
```
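To make the pricing section concrete, the sketch below blends on-demand and Spot capacity using the ratios and discounts from the config and derives an hourly cost and a cost-per-request figure. It assumes the 70% Spot discount means Spot is priced at roughly 30% of on-demand; the request rate is a made-up input, and real Spot prices vary by region and time.

```python
# cost_model.py - blended node cost and cost per request, using the pricing config above
ON_DEMAND_PRICE = {"m5.large": 0.096, "m5.xlarge": 0.192, "c5.large": 0.085}  # USD/hour

def blended_hourly_cost(instance: str, nodes: int, spot_ratio: float = 0.8,
                        spot_discount: float = 0.7) -> float:
    """Hourly cost for a node group that is spot_ratio Spot and the rest on-demand."""
    od = ON_DEMAND_PRICE[instance]
    spot = od * (1 - spot_discount)          # Spot assumed at ~30% of the on-demand price
    return nodes * (spot_ratio * spot + (1 - spot_ratio) * od)

def cost_per_request(hourly_cost: float, requests_per_second: float) -> float:
    return hourly_cost / (requests_per_second * 3600)

cost = blended_hourly_cost("m5.large", nodes=10)        # 80% Spot, 20% on-demand
cpr = cost_per_request(cost, requests_per_second=1200)  # hypothetical sustained load
print(f"hourly cost: ${cost:.3f}, cost per request: ${cpr:.8f}")
```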

Performance Tuning and Capacity Planning

Implement capacity planning and tuning based on performance baselines:

```yaml
# performance-baseline.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: performance-baseline-config
  namespace: monitoring
data:
  baseline-config.yaml: |
    # Performance baselines
    baselines:
      # Application type profiles
      application_types:
        web_frontend:
          characteristics:
            cpu_intensive: false
            memory_intensive: false
            io_intensive: false
            network_intensive: true
          # Baseline metrics
          baseline_metrics:
            cpu_utilization:
              target: 0.7
              max: 0.85
              min: 0.2
            memory_utilization:
              target: 0.75
              max: 0.9
              min: 0.3
            response_time:
              target: 200ms
              max: 500ms
              p95: 800ms
              p99: 1200ms
            throughput:
              target: 1000
              max: 2000
              per_core: 500
        api_backend:
          characteristics:
            cpu_intensive: true
            memory_intensive: false
            io_intensive: false
            network_intensive: true
          baseline_metrics:
            cpu_utilization:
              target: 0.8
              max: 0.95
              min: 0.3
            memory_utilization:
              target: 0.6
              max: 0.8
              min: 0.2
            response_time:
              target: 100ms
              max: 300ms
              p95: 500ms
              p99: 800ms
            throughput:
              target: 5000
              max: 10000
              per_core: 2500
        data_processing:
          characteristics:
            cpu_intensive: true
            memory_intensive: true
            io_intensive: true
            network_intensive: false
          baseline_metrics:
            cpu_utilization:
              target: 0.85
              max: 0.98
              min: 0.4
            memory_utilization:
              target: 0.8
              max: 0.95
              min: 0.3
            disk_io_rate:
              target: 100MB/s
              max: 500MB/s
            processing_rate:
              target: 1000
              max: 5000
              per_core: 500

    # Capacity planning parameters
    capacity_planning:
      # Growth forecasting
      growth_prediction:
        enabled: true
        horizon: 30d
        confidence: 0.9
        # Seasonality analysis
        seasonality:
          daily: true
          weekly: true
          monthly: true
          yearly: true
      # Buffering strategy
      buffering:
        strategy: proportional   # fixed, proportional, adaptive
        cpu_buffer: 0.2          # 20% CPU headroom
        memory_buffer: 0.25      # 25% memory headroom
        io_buffer: 0.3           # 30% IO headroom
      # Scaling thresholds
      scaling_thresholds:
        scale_up_threshold: 0.8    # start scaling up at 80%
        scale_down_threshold: 0.3  # start scaling down at 30%
        panic_threshold: 0.9       # emergency scale-up at 90%
      # Predictive scaling
      predictive_scaling:
        enabled: true
        lead_time: 5m      # scale up 5 minutes ahead of need
        confidence: 0.85   # required confidence
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: performance-monitor
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: performance-monitor
  endpoints:
    - port: metrics
      interval: 30s
      path: /metrics
      honorLabels: true
      metricRelabelings:
        - sourceLabels: [__name__]
          regex: 'performance_.*'
          targetLabel: performance_metric
          replacement: 'true'
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: performance-alerts
  namespace: monitoring
spec:
  groups:
    - name: performance.baseline
      interval: 30s
      rules:
        - alert: PerformanceBaselineDeviation
          expr: |
            abs(performance_current_value - performance_baseline_target) / performance_baseline_target > 0.2
          for: 10m
          labels:
            severity: warning
            team: performance
          annotations:
            summary: "Performance baseline deviation detected"
            description: "{{ $labels.metric }} deviation {{ $value }}% from baseline"
            runbook_url: "https://wiki.example.com/performance-baseline-deviation"
        - alert: HighResourceUtilization
          expr: |
            max_over_time(cpu_utilization[5m]) > 0.9 or
            max_over_time(memory_utilization[5m]) > 0.9
          for: 5m
          labels:
            severity: critical
            team: performance
          annotations:
            summary: "High resource utilization detected"
            description: "Resource utilization above 90% for more than 5 minutes"
        - alert: PredictiveCapacityExhaustion
          expr: |
            predictive_capacity_exhaustion_hours < 24
          for: 1m
          labels:
            severity: critical
            team: capacity
          annotations:
            summary: "Predictive capacity exhaustion within 24 hours"
            description: "Predicted capacity exhaustion in {{ $value }} hours"
```

Monitoring, Alerting, and Operations Automation

Unified Monitoring and Alerting System

Build unified monitoring and alerting that covers performance, cost, and capacity:

```yaml
# unified-monitoring.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: monitoring-dashboard-config
  namespace: monitoring
data:
  dashboard.json: |
    {
      "dashboard": {
        "title": "HPA/VPA Unified Monitoring",
        "tags": ["autoscaling", "performance", "cost"],
        "panels": [
          {
            "title": "Autoscaling Overview",
            "type": "stat",
            "targets": [
              {
                "expr": "sum(kube_deployment_status_replicas{namespace=~\"production\"})",
                "legendFormat": "Total Replicas"
              },
              {
                "expr": "sum(kube_deployment_status_replicas_available{namespace=~\"production\"})",
                "legendFormat": "Available Replicas"
              }
            ]
          },
          {
            "title": "Resource Utilization Heatmap",
            "type": "heatmap",
            "targets": [
              {
                "expr": "kube_pod_container_resource_requests_cpu_cores / kube_node_status_allocatable_cpu_cores",
                "legendFormat": "CPU Request Utilization"
              },
              {
                "expr": "kube_pod_container_resource_requests_memory_bytes / kube_node_status_allocatable_memory_bytes",
                "legendFormat": "Memory Request Utilization"
              }
            ]
          },
          {
            "title": "Scaling Events Timeline",
            "type": "table",
            "targets": [
              {
                "expr": "increase(hpa_scaling_events_total[1h])",
                "legendFormat": "HPA Scaling Events"
              },
              {
                "expr": "increase(vpa_scaling_events_total[1h])",
                "legendFormat": "VPA Scaling Events"
              }
            ]
          },
          {
            "title": "Cost Efficiency Trend",
            "type": "graph",
            "targets": [
              {
                "expr": "cost_per_request",
                "legendFormat": "Cost per Request"
              },
              {
                "expr": "resource_efficiency_score",
                "legendFormat": "Resource Efficiency"
              }
            ]
          },
          {
            "title": "Predictive Scaling Accuracy",
            "type": "graph",
            "targets": [
              {
                "expr": "prediction_accuracy_score",
                "legendFormat": "Prediction Accuracy"
              },
              {
                "expr": "prediction_confidence",
                "legendFormat": "Prediction Confidence"
              }
            ]
          }
        ]
      }
    }
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: autoscaling-health-alerts
  namespace: monitoring
spec:
  groups:
    - name: autoscaling.health
      interval: 30s
      rules:
        - alert: HPAScalingStuck
          expr: |
            (kube_deployment_status_replicas != kube_deployment_status_replicas_available) and
            (kube_deployment_status_replicas_unavailable > 0)
          for: 15m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "HPA scaling appears to be stuck"
            description: "Deployment {{ $labels.deployment }} has unavailable replicas for more than 15 minutes"
            runbook_url: "https://wiki.example.com/hpa-scaling-stuck"
        - alert: VPAThresholdExceeded
          expr: |
            (vpa_recommendation_cpu > vpa_upper_bound_cpu) or
            (vpa_recommendation_memory > vpa_upper_bound_memory)
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "VPA recommendation exceeds upper bound"
            description: "VPA recommendation for {{ $labels.container }} exceeds configured upper bound"
        - alert: PredictiveScalingAccuracyLow
          expr: |
            prediction_accuracy_score < 0.7
          for: 30m
          labels:
            severity: warning
            team: ml
          annotations:
            summary: "Predictive scaling accuracy is low"
            description: "Prediction accuracy {{ $value }} is below 70% for more than 30 minutes"
        - alert: CostOptimizationDegraded
          expr: |
            cost_efficiency_score < 0.6
          for: 1h
          labels:
            severity: warning
            team: finops
          annotations:
            summary: "Cost optimization efficiency is degraded"
            description: "Cost efficiency score {{ $value }} is below 60% for more than 1 hour"
        - alert: CapacityExhaustionImminent
          expr: |
            predictive_capacity_exhaustion_hours < 6
          for: 5m
          labels:
            severity: critical
            team: capacity
          annotations:
            summary: "Capacity exhaustion imminent"
            description: "Predicted capacity exhaustion in {{ $value }} hours"
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: autoscaling-health-check
  namespace: monitoring
spec:
  schedule: "*/5 * * * *"   # run every 5 minutes
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: monitoring
          containers:
            - name: health-check
              image: monitoring-tools:v1.0.0
              command:
                - /bin/bash
                - -c
                - |
                  #!/bin/bash
                  set -e
                  echo "Starting autoscaling health check..."

                  # Check HPA status
                  echo "Checking HPA status..."
                  kubectl get hpa --all-namespaces -o json | \
                    jq -r '.items[] | select(.status.currentReplicas != .status.desiredReplicas) | "WARNING: HPA \(.metadata.name) in namespace \(.metadata.namespace) has mismatch: current=\(.status.currentReplicas), desired=\(.status.desiredReplicas)"'

                  # Check VPA status
                  echo "Checking VPA status..."
                  kubectl get vpa --all-namespaces -o json | \
                    jq -r '.items[] | select(.status.recommendation == null) | "WARNING: VPA \(.metadata.name) in namespace \(.metadata.namespace) has no recommendation"'

                  # Check the prediction model
                  echo "Checking prediction model status..."
                  curl -s http://ml-prediction-service:8080/health | \
                    jq -r '.status' | grep -q "healthy" || echo "WARNING: Prediction model is unhealthy"

                  # Check the cost optimizer
                  echo "Checking cost optimizer status..."
                  curl -s http://cost-optimizer:8080/health | \
                    jq -r '.status' | grep -q "healthy" || echo "WARNING: Cost optimizer is unhealthy"

                  # Generate a health report
                  echo "Generating health report..."
                  cat > /tmp/autoscaling-health-report.json <<EOF
                  {
                    "timestamp": "$(date -Iseconds)",
                    "check_type": "autoscaling_health",
                    "status": "completed",
                    "duration_seconds": "$(($(date +%s) - $(date -d '5 minutes ago' +%s)))"
                  }
                  EOF

                  # Push the report to the monitoring system
                  curl -X POST \
                    -H "Content-Type: application/json" \
                    -d @/tmp/autoscaling-health-report.json \
                    http://monitoring-service:8080/api/health-reports

                  echo "Health check completed successfully"
          restartPolicy: OnFailure
```

Automated Operations Workflow

Implement an automated autoscaling operations workflow:

```yaml
# automation-workflow.yaml
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: autoscaling-optimization-workflow
  namespace: argo
spec:
  templates:
    - name: performance-analysis
      inputs:
        parameters:
          - name: namespace
          - name: deployment
      container:
        image: performance-analyzer:v1.0.0
        command: ["/bin/bash", "-c"]
        args:
          - |
            echo "Analyzing performance for {{inputs.parameters.deployment}} in {{inputs.parameters.namespace}}"

            # Collect performance metrics
            kubectl top pods -n {{inputs.parameters.namespace}} -l app={{inputs.parameters.deployment}}

            # Analyze response time
            curl -s http://prometheus:9090/api/v1/query?query=\
            "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"

            # Generate the performance report
            /usr/local/bin/performance-analysis \
              --namespace {{inputs.parameters.namespace}} \
              --deployment {{inputs.parameters.deployment}} \
              --output /tmp/performance-report.json

            # Upload the report
            curl -X POST -H "Content-Type: application/json" \
              -d @/tmp/performance-report.json \
              http://reporting-service:8080/api/performance-reports
    - name: cost-optimization-analysis
      inputs:
        parameters:
          - name: namespace
          - name: deployment
      container:
        image: cost-analyzer:v1.0.0
        command: ["/bin/bash", "-c"]
        args:
          - |
            echo "Analyzing cost optimization opportunities"

            # Fetch current cost data
            /usr/local/bin/cost-analysis \
              --namespace {{inputs.parameters.namespace}} \
              --deployment {{inputs.parameters.deployment}} \
              --cloud-provider aws \
              --output /tmp/cost-analysis.json

            # Generate optimization recommendations
            /usr/local/bin/cost-optimizer \
              --input /tmp/cost-analysis.json \
              --generate-recommendations \
              --output /tmp/cost-recommendations.json

            # Apply the recommendations if the estimated savings exceed 20%
            savings=$(cat /tmp/cost-recommendations.json | jq -r '.estimated_savings_percentage')
            if (( $(echo "$savings > 20" | bc -l) )); then
              echo "Applying cost optimization recommendations (savings: $savings%)"
              kubectl patch deployment {{inputs.parameters.deployment}} -n {{inputs.parameters.namespace}} \
                --patch-file /tmp/cost-recommendations.json
            fi
    - name: predictive
```
