用pod部署prometheus
node_exporter
节点数据收集器
daemonset ————> 保证每个节点都有一个收集器
prometheus————>监控主程序
grafana————>图形化
alertmanager————>告警模块
[root@master01 ~]# kubectl create ns monitor-sa namespace/monitor-sa created [root@master01 opt]# mkdir prometheus [root@master01 opt]# cd prometheus/ [root@master01 prometheus]# vim node_exporter.yaml apiVersion: apps/v1 kind: DaemonSet metadata:name: node-exporternamespace: monitor-salabels:name: node-exporter spec:selector:matchLabels:name: node-exportertemplate:metadata:labels:name: node-exporterspec:hostPID: truehostIPC: truehostNetwork: truecontainers:- name: node-exporterimage: prom/node-exporter:v1ports:- containerPort: 9100resources:limits:cpu: "0.5"securityContext:privileged: trueargs:- --path.procfs- /host/proc- --path.sysfs- /host/sys- --collector.filesystem.ignored-mount-points- '"^/(sys|proc|dev|host|etc)($|/)"'volumeMounts:- name: devmountPath: /host/dev- name: procmountPath: /host/proc- name: sysmountPath: /host/sys- name: rootfsmountPath: /rootfsvolumes:- name: prochostPath:path: /proc- name: devhostPath:path: /dev- name: syshostPath:path: /sys- name: rootfshostPath:path: / [root@master01 prometheus]# kubectl apply -f node_exporter.yaml daemonset.apps/node-exporter created [root@master01 prometheus]# kubectl get pod -n monitor-sa -o wide node-exporter-99vhd 1/1 Running 0 15s 192.168.60.120 node01 <none> <none> node-exporter-c6md9 1/1 Running 0 15s 192.168.60.130 node02 <none> <none> node-exporter-f29fh 1/1 Running 0 15s 192.168.60.110 master01 <none> <none>
#创建两个账号 [root@master01 prometheus]# kubectl create serviceaccount monitor -n monitor-sa [root@master01 prometheus]# kubectl create clusterrolebinding monitor-clusterrolebinding -n monitor-sa --clusterrole=cluster-admin --serviceaccount=monitor-sa:monitor [root@master01 prometheus]# kubectl create clusterrolebinding monitor-clusterrolebinding -n monitor-sa --clusterrole=cluster-admin --serviceaccount=monitor-sa:monitor [root@master01 prometheus]# rz -E rz waiting to receive. [root@master01 prometheus]# ls node_exporter.yaml prometheus-alertmanager-cfg.yaml [root@master01 prometheus]# vim prometheus-alertmanager-cfg.yaml- targets: ['192.168.60.110:10251']- job_name: 'kubernetes-controller-manager'scrape_interval: 5sstatic_configs:- targets: ['192.168.60.110:10252']- job_name: 'kubernetes-kube-proxy'scrape_interval: 5sstatic_configs:- targets: ['192.168.60.110:10249','192.168.60.120:10249','192.168.60.130:10249']- job_name: 'kubernetes-etcd'scheme: httpstls_config:ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.crtcert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.crtkey_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.keyscrape_interval: 5sstatic_configs:- targets: ['192.168.60.110:2379'] - alert: HighPodCpuUsage #告警邮件的标题expr: sum(rate(container_cpu_usage_seconds_total{namespace="default", pod=~".+"}[5m])) by (pod) > 0.9 #收集指标数据for: 5m #占用90%cpu的持续时间5M。告警labels:severity: warningannotations: #告警的内容 [root@master01 prometheus]# vim alter-mail.yaml kind: ConfigMap apiVersion: v1 metadata:name: alertmanagernamespace: monitor-sa data:alertmanager.yml: |-global:resolve_timeout: 1msmtp_smarthost: 'smtp.qq.com:25'smtp_from: '1647629457@qq.com'smtp_auth_username: '1647629457@qq.com'smtp_auth_password: 'mhmjbfjydbuhecea'smtp_require_tls: falseroute:group_by: [alertname]group_wait: 10sgroup_interval: 10srepeat_interval: 10m receiver: default-receiverreceivers:- name: 'default-receiver'email_configs:- to: '1647629457@qq.com'send_resolved: true
[root@master01 prometheus]# vim prometheus-svc.yaml apiVersion: v1 kind: Service metadata:name: prometheusnamespace: monitor-salabels:app: prometheus spec:type: NodePortports:- port: 9090targetPort: 9090protocol: TCPselector:app: prometheuscomponent: server[root@master01 prometheus]# vim prometheus-alter.yaml apiVersion: v1 kind: Service metadata:labels:name: prometheuskubernetes.io/cluster-service: 'true'name: alertmanagernamespace: monitor-sa spec:ports:- name: alertmanagernodePort: 30066port: 9093protocol: TCPtargetPort: 9093selector:app: prometheussessionAffinity: Nonetype: NodePort[root@master01 prometheus]# vim prometheus-deploy.yaml apiVersion: apps/v1 kind: Deployment metadata:name: prometheus-servernamespace: monitor-salabels:app: prometheus spec:replicas: 1selector:matchLabels:app: prometheuscomponent: servertemplate:metadata:labels:app: prometheuscomponent: serverannotations:prometheus.io/scrape: 'false'spec:serviceAccountName: monitorinitContainers:- name: init-chmodimage: busybox:latestcommand: ['sh','-c','chmod -R 777 /prometheus;chmod -R 777 /etc']volumeMounts:- mountPath: /prometheusname: prometheus-storage-volume- mountPath: /etc/localtimename: timezonecontainers:- name: prometheusimage: prom/prometheus:v2.45.0command:- prometheus- --config.file=/etc/prometheus/prometheus.yml- --storage.tsdb.path=/prometheus- --storage.tsdb.retention=720h- --web.enable-lifecycleports:- containerPort: 9090volumeMounts:- name: prometheus-configmountPath: /etc/prometheus/- mountPath: /prometheus/name: prometheus-storage-volume- name: timezonemountPath: /etc/localtime- name: k8s-certsmountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/- name: alertmanagerimage: prom/alertmanager:v0.20.0args:- "--config.file=/etc/alertmanager/alertmanager.yml"- "--log.level=debug"ports:- containerPort: 9093protocol: TCPname: alertmanagervolumeMounts:- name: alertmanager-configmountPath: /etc/alertmanager- name: alertmanager-storagemountPath: /alertmanager- name: localtimemountPath: 
/etc/localtimevolumes:- name: prometheus-configconfigMap:name: prometheus-configdefaultMode: 0777- name: prometheus-storage-volumehostPath:path: /datatype: DirectoryOrCreate- name: k8s-certssecret:secretName: etcd-certs- name: timezonehostPath:path: /usr/share/zoneinfo/Asia/Shanghai- name: alertmanager-configconfigMap:name: alertmanager- name: alertmanager-storagehostPath:path: /data/alertmanagertype: DirectoryOrCreate- name: localtimehostPath:path: /usr/share/zoneinfo/Asia/Shanghaikubectl apply -f prometheus-deploy.yaml kubectl apply -f prometheus-svc.yaml kubectl apply -f prometheus-alter.yaml #生成证书 kubectl -n monitor-sa create secret generic etcd-certs --from-file=/etc/kubernetes/pki/etcd/server.key --from-file=/etc/kubernetes/pki/etcd/server.crt --from-file=/etc/kubernetes/pki/etcd/ca.crt [root@master01 prometheus]# kubectl get pod -n monitor-sa -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES node-exporter-99vhd 1/1 Running 0 140m 192.168.60.120 node01 <none> <none> node-exporter-c6md9 1/1 Running 0 140m 192.168.60.130 node02 <none> <none> node-exporter-f29fh 1/1 Running 0 140m 192.168.60.110 master01 <none> <none> prometheus-server-55d866cb44-wrrbx 2/2 Running 0 5m29s 10.244.2.34 node02 <none> <none> [root@master01 prometheus]# kubectl get svc -n monitor-sa -o wide NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR alertmanager NodePort 10.96.153.49 <none> 9093:30066/TCP 6m app=prometheus prometheus NodePort 10.96.215.253 <none> 9090:31758/TCP 6m1s app=prometheus,component=server
[root@master01 prometheus]# vim pro-gra.yml apiVersion: v1 kind: PersistentVolumeClaim metadata:name: grafananamespace: kube-system spec:accessModes:- ReadWriteManystorageClassName: nfs-client-storageclassresources:requests:storage: 2Gi --- apiVersion: apps/v1 kind: Deployment metadata:name: monitoring-grafananamespace: kube-system spec:replicas: 1selector:matchLabels:task: monitoringk8s-app: grafanatemplate:metadata:labels:task: monitoringk8s-app: grafanaspec:containers:- name: grafanaimage: grafana/grafana:7.5.11securityContext:runAsUser: 104runAsGroup: 107ports:- containerPort: 3000protocol: TCPvolumeMounts:- mountPath: /etc/ssl/certsname: ca-certificatesreadOnly: false- mountPath: /varname: grafana-storage- mountPath: /var/lib/grafananame: graf-testenv:- name: INFLUXDB_HOSTvalue: monitoring-influxdb- name: GF_SERVER_HTTP_PORTvalue: "3000"- name: GF_AUTH_BASIC_ENABLEDvalue: "false"- name: GF_AUTH_ANONYMOUS_ENABLEDvalue: "true"- name: GF_AUTH_ANONYMOUS_ORG_ROLEvalue: Admin- name: GF_SERVER_ROOT_URLvalue: /volumes:- name: ca-certificateshostPath:path: /etc/ssl/certs- name: grafana-storageemptyDir: {}- name: graf-testpersistentVolumeClaim:claimName: grafana --- apiVersion: v1 kind: Service metadata:labels:name: monitoring-grafananamespace: kube-system spec:ports:- port: 80targetPort: 3000selector:k8s-app: grafanatype: NodePort[root@master01 prometheus]# kubectl apply -f pro-gra.yml [root@master01 prometheus]# kubectl get svc -n kube-system NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE kube-dns ClusterIP 10.96.0.10 <none> 53/UDP,53/TCP,9153/TCP 16d monitoring-grafana NodePort 10.96.220.147 <none> 80:31771/TCP 12s
//处理 kube-proxy 监控告警 kubectl edit configmap kube-proxy -n kube-system ...... metricsBindAddress: "0.0.0.0:10249" #因为 kube-proxy 默认端口10249是监听在 127.0.0.1 上的,需要改成监听到物理节点上 #重新启动 kube-proxy kubectl get pods -n kube-system | grep kube-proxy |awk '{print $1}' | xargs kubectl delete pods -n kube-system
测试:
[root@master01 prometheus]# vim ylce.yml apiVersion: apps/v1 kind: Deployment metadata:name: hpa-testlabels:hpa: test spec:replicas: 1selector:matchLabels:hpa: testtemplate:metadata:labels:hpa: testspec:containers:- name: centosimage: centos:7command: ["/bin/bash", "-c", "yum install -y stress --nogpgcheck && sleep 3600"]volumeMounts:- name: yummountPath: /etc/yum.repos.d/volumes:- name: yumhostPath:path: /etc/yum.repos.d/