Prometheus

https://github.com/m3db/m3 Distributed TSDB, Aggregator and Query Engine, Prometheus Sidecar, Graphite Compatible, Metrics Platform

https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster VictoriaMetrics - fast, cost-effective and scalable time series database, long-term remote storage for Prometheus

https://news.ycombinator.com/item?id=22758402

https://github.com/OpenObservability/OpenMetrics

Prometheus

Tips

# Install
brew install prometheus
# Build and install from source
go get -v -u github.com/prometheus/prometheus/cmd/...
# Start; the default web UI is at localhost:9090
prometheus --config.file ~/.config/prometheus.yml
# Start with Docker
docker run \
-p 9090:9090 \
-v /etc/prometheus:/etc/prometheus \
prom/prometheus
# Or start it as a service through brew
# Without --storage.tsdb.path, permission problems may occur
echo "--config.file $HOME/.config/prometheus.yml --storage.tsdb.path $HOME/.data/prometheus" > /usr/local/etc/prometheus.args
brew services start prometheus
# Error log
cat /usr/local/var/log/prometheus.err.log
# Regular log
cat /usr/local/var/log/prometheus.log
# Management endpoints
# --web.enable-admin-api enables the /api/*/admin/ endpoints
# --web.enable-lifecycle enables reload and quit
# See https://prometheus.io/docs/operating/security/
prometheus --config.file ~/.config/prometheus.yml --web.enable-admin-api --web.enable-lifecycle
# brew arguments (same flags as above, written to the args file used by the service)
echo "--config.file $HOME/.config/prometheus.yml --storage.tsdb.path $HOME/.data/prometheus --web.enable-admin-api --web.enable-lifecycle" > /usr/local/etc/prometheus.args
# Restart the service
brew services restart prometheus
# Reload the configuration
curl -X POST http://localhost:9090/-/reload
# Host node monitoring
brew install node_exporter
# Start through a service
echo --web.listen-address :9101 > /usr/local/etc/node_exporter.args
brew services start node_exporter
# Start directly
node_exporter --web.listen-address :9101
# Docker
# ===================
docker pull prom/prometheus
docker pull prom/alertmanager
# Needs /proc mounted
docker pull prom/node-exporter
docker pull prom/blackbox-exporter
docker pull prom/container-exporter
docker pull prom/mysqld-exporter
# node_exporter does not work very well when run inside a container
go get github.com/prometheus/node_exporter
# Default credentials are admin/admin
docker pull grafana/grafana
# Dedicated network so each container can be given a fixed IP
docker network create --subnet=172.18.0.0/16 mon-net
# Grafana is published on host port 12000, Prometheus on host port 12001
docker run --net mon-net --ip 172.18.0.10 -i -p 12000:3000 grafana/grafana
docker run --net mon-net --ip 172.18.0.20 -i -p 12001:9090 prom/prometheus
# http://docs.grafana.org/installation/docker/
# Detached Grafana with the host's localtime mounted read-only,
# and the root URL and admin password set through environment variables
docker run -d --restart always -v /etc/localtime:/etc/localtime:ro \
-p 12000:3000 \
-e "GF_SERVER_ROOT_URL=http://grafana.server.name" \
-e "GF_SECURITY_ADMIN_PASSWORD=secret" \
grafana/grafana

Config

# Global settings
global:
  # Scrape interval; default 1m
  scrape_interval: 15s
  # Scrape timeout; default 10s
  scrape_timeout: 10s
  # Rule evaluation interval; default 1m
  evaluation_interval: 15s
# Alerting settings
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # - alertmanager:9093
# Rule files that are evaluated periodically
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
# Scrape settings
scrape_configs:
  # Job name; a label job=$job_name is added
  - job_name: 'prometheus'
    # Metrics path; default '/metrics'
    metrics_path: '/metrics'
    # Request scheme; default 'http'
    scheme: 'http'
    # Static configuration
    static_configs:
      # Scrape targets
      - targets: ['localhost:9090']

PromQL

# Metric
node_cpu_seconds_total
# Label filtering; supported operators: = != match =~ no match !~
node_cpu_seconds_total{mode="user"}
# 5-minute average
rate(node_cpu_seconds_total{mode="user"}[5m])
# Aggregate the result
sum(rate(node_cpu_seconds_total{mode="user"}[5m]))
# Group by mode
sum(rate(node_cpu_seconds_total[5m])) by (mode)
# Exclude idle and nice
sum(rate(node_cpu_seconds_total{mode!~"idle|nice"}[5m])) by (mode)
# Only user and system
sum(rate(node_cpu_seconds_total{mode=~"user|system"}[5m])) by (mode)
# Return each one separately
# by (mode) is required: if the label is lost, both sides are treated as the same series and one is dropped
sum(rate(node_cpu_seconds_total{mode="user"}[5m])) by (mode) or sum(rate(node_cpu_seconds_total{mode="system"}[5m])) by (mode)
# Append another metric to the result
sum(rate(node_cpu_seconds_total{mode=~"user|system"}[5m])) by (mode) or node_load15

服务发现

  • 支持配置
    • azure
    • consul - 服务 catalog
    • digitalocean
    • dockerswarm
    • dns - SRV 记录
    • ec2
    • openstack
    • file - 检测文件变化
      • 格式与 static_config 相同
    • gce
    • kubernetes
      • node、service、pod、endpoints、ingress
    • marathon
    • nerve
    • serverset
    • triton
  • mDNS
# mDNS service types used for discovery:
# _prometheus-http._tcp
# _prometheus-https._tcp
go get github.com/msiebuhr/prometheus-mdns-sd
# Presumably writes the discovered targets to the given JSON file — confirm against the tool's docs
prometheus-mdns-sd -out /etc/prometheus/mdns-sd.json
# Avahi service file announcing port 9100 as _prometheus-http._tcp
cat <<XML > /etc/avahi/services/node-exporter.service
<service-group>
<name replace-wildcards="yes">%h</name>
<service>
<type>_prometheus-http._tcp</type>
<port>9100</port>
</service>
</service-group>
XML
# macOS: announce the same service type on port 9100 with dns-sd, passing path=/metrics
dns-sd -R "node_exporter metrics" _prometheus-http._tcp. . 9100 path=/metrics

Exporter

端口

| 服务 | 默认端口 | 说明 | 监控面板 |
| --- | --- | --- | --- |
| prometheus | 9090 | | |
| grafana | 3000 | | |
| blackbox_exporter | 9115 | 检测 HTTP, HTTPS, DNS, TCP, ICMP. | |
| mysqld-exporter | 9104 | | |
| redis-exporter | 9121 | | Prometheus Redis |
| node-exporter | 9100 | 节点状态信息 | Node Exporter Server Metrics, Node exporter single server |
| container-exporter | 9104 | | Docker Dashboard |
| nginx-lua-prometheus | n/a | | Nginx Overview |
# node_exporter: install with Homebrew
brew install node_exporter
# Build from source
go get -u -v github.com/prometheus/node_exporter
# NOTE(review): ~/gp looks like a custom GOPATH (or a typo for ~/go) — confirm before running
cd ~/gp/src/github.com/prometheus/node_exporter
make
./node_exporter
# redis_exporter
go get github.com/oliver006/redis_exporter
redis_exporter
# Prometheus Redis https://grafana.net/dashboards/763
# mysqld_exporter
go get github.com/prometheus/mysqld_exporter
# The connection string is read from DATA_SOURCE_NAME; replace login, password, hostname and port
export DATA_SOURCE_NAME='login:password@(hostname:port)/'
mysqld_exporter

blackbox-exporter

  • prometheus/blackbox_exporter
  • http://localhost:9115/probe?target=google.com&module=http_2xx , debug=true 会包含额外信息
    • probe_success
  • SIGHUP, POST /-/reload
  • ICMP 需要更高的权限

blackbox.yml

# Module definitions - referenced by name when probing
modules:
  http_2xx:
    # Underlying prober type
    # http, tcp, dns, icmp
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      # Steps run in order: send the two registration lines, answer the PING
      # with the captured token, then expect the 001 reply
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp

prometheus.yml

# Scrape job that probes the listed targets through the blackbox exporter
scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    static_configs:
      - targets:
          - http://prometheus.io  # Target to probe with http.
          - https://prometheus.io  # Target to probe with https.
          - http://example.com:8080  # Target to probe with http on port 8080.
    relabel_configs:
      # Pass the original target address as the `target` URL parameter
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed target as the instance label
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to the blackbox exporter
      - target_label: __address__
        replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.