从零实现一个全链路监控平台:Metrics与Alerting

从零实现一个全链路监控平台:Metrics与Alerting
/*
 * Preface
 * -------
 * When a system fails, how do you know which service and which endpoint is
 * responsible? CPU spikes, memory leaks, slow endpoints -- how can they be
 * noticed early? Metrics are one of the three pillars of observability
 * (Metrics, Logging, Tracing). This article builds, from scratch:
 *   - metric collection (Counter, Gauge, Histogram)
 *   - metric aggregation and storage
 *   - an alert rule engine
 *   - alert notification
 *   - a visual dashboard
 *
 * I. Core principles of the monitoring platform
 *
 * 1. Architecture
 *
 *   +-----------------------------------------------------------+
 *   | Application service                                       |
 *   |   [collect metrics] -> [aggregate metrics] -> [report]    |
 *   +-----------------------------------------------------------+
 *                               |
 *                               v
 *   +-----------------------------------------------------------+
 *   | Monitoring center                                         |
 *   |   [metric storage]   [rule engine]     [alert notifier]   |
 *   |   (time series)      (thresholds)      (DingTalk / mail)  |
 *   +-----------------------------------------------------------+
 *                               |
 *                               v
 *                        +-------------+
 *                        |  Dashboard  |
 *                        | (visualize) |
 *                        +-------------+
 *
 * 2. Core concepts
 *
 *   Concept    Description                      Example
 *   Counter    monotonically increasing count   total requests
 *   Gauge      value that can go up and down    CPU usage, memory usage
 *   Histogram  distribution statistics          request latency P50/P95/P99
 *   Label      dimension tag                    service, endpoint, status code
 *
 * II. Complete implementation
 *
 * 1. Basic data structures
 */

#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE /* keeps usleep() declared when built with a strict -std=c11 */
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <errno.h>
#include <math.h>

#define MAX_METRIC_NAME 128
#define MAX_LABELS 8
#define MAX_LABEL_KEY 32
#define MAX_LABEL_VALUE 64
#define MAX_HISTOGRAM_BUCKETS 20
#define MAX_ALERTS 100

/* Metric type */
typedef enum {
    METRIC_COUNTER = 0,
    METRIC_GAUGE,
    METRIC_HISTOGRAM,
    METRIC_SUMMARY
} metric_type_t;

/* Label (one key/value dimension) */
typedef struct label {
    char key[MAX_LABEL_KEY];
    char value[MAX_LABEL_VALUE];
} label_t;

/* One metric series: a (type, name, label set) triple and its current state */
typedef struct metric_value {
    metric_type_t type;
    char name[MAX_METRIC_NAME];
    label_t labels[MAX_LABELS];
    int label_count;
    double value;                               /* counter / gauge value */
    double sum;                                 /* histogram: sum of observations */
    double count;                               /* histogram: number of observations */
    double buckets[MAX_HISTOGRAM_BUCKETS];      /* histogram: cumulative counts per bucket */
    double bucket_upper[MAX_HISTOGRAM_BUCKETS]; /* histogram: inclusive upper bounds */
    int bucket_count;
    time_t timestamp;                           /* time of the last update */
    struct metric_value *next;
} metric_value_t;

/* Alert rule */
typedef struct alert_rule {
    char name[64];
    char metric_name[MAX_METRIC_NAME];
    char condition[16]; /* ">", "<", ">=", "<=", "==" */
    double threshold;
    int for_seconds;    /* duration the condition should hold */
    char severity[16];  /* critical, warning, info */
    char message[256];
    struct alert_rule *next;
} alert_rule_t;

/* Alert event */
typedef struct alert_event {
    char rule_name[64];
    char metric_name[MAX_METRIC_NAME];
    char severity[16];
    char message[256];
    double current_value;
    time_t start_time;
    time_t end_time;
    int active;
    struct alert_event *next;
} alert_event_t;

/* Monitoring platform: owns the three singly linked lists, guarded by mutex */
typedef struct monitor_platform {
    metric_value_t *metrics;
    alert_rule_t *alert_rules;
    alert_event_t *alert_events;
    pthread_mutex_t mutex;
    int retention_days;
    int running;
} monitor_platform_t;

/*
 * 2. Metric collection
 */

/* Histogram bounds given to every new histogram series. */
static const double default_histogram_buckets[] = {
    0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10
};

/* Copy at most dst_size - 1 bytes of src into dst and always NUL-terminate. */
static void copy_bounded(char *dst, size_t dst_size, const char *src)
{
    snprintf(dst, dst_size, "%.*s", (int)(dst_size - 1), src);
}

/* Number of labels that will actually be stored/compared: 0..MAX_LABELS. */
static int effective_label_count(const label_t *labels, int label_count)
{
    if (labels == NULL || label_count <= 0) {
        return 0;
    }
    return label_count > MAX_LABELS ? MAX_LABELS : label_count;
}

/*
 * Return 1 when the series carries exactly the given labels, in order.
 * Comparison is bounded to the stored buffer sizes so an over-long incoming
 * key or value matches the truncated copy made when the series was created.
 */
static int labels_match(const metric_value_t *m, const label_t *labels,
                        int label_count)
{
    if (m->label_count != label_count) {
        return 0;
    }
    for (int i = 0; i < label_count; i++) {
        if (strncmp(m->labels[i].key, labels[i].key, MAX_LABEL_KEY - 1) != 0 ||
            strncmp(m->labels[i].value, labels[i].value, MAX_LABEL_VALUE - 1) != 0) {
            return 0;
        }
    }
    return 1;
}

/*
 * Find the series identified by (type, name, labels); create it zero-filled
 * and push it at the head of mp->metrics when it does not exist yet.
 * The caller must hold mp->mutex. Returns NULL only when allocation fails.
 */
static metric_value_t *metric_find_or_create(monitor_platform_t *mp,
                                             metric_type_t type,
                                             const char *name,
                                             const label_t *labels,
                                             int label_count)
{
    int used_labels = effective_label_count(labels, label_count);

    for (metric_value_t *m = mp->metrics; m != NULL; m = m->next) {
        if (m->type == type &&
            strncmp(m->name, name, MAX_METRIC_NAME - 1) == 0 &&
            labels_match(m, labels, used_labels)) {
            return m;
        }
    }

    metric_value_t *m = calloc(1, sizeof *m);
    if (m == NULL) {
        return NULL;
    }
    m->type = type;
    copy_bounded(m->name, sizeof m->name, name);
    m->label_count = used_labels;
    for (int i = 0; i < used_labels; i++) {
        copy_bounded(m->labels[i].key, sizeof m->labels[i].key, labels[i].key);
        copy_bounded(m->labels[i].value, sizeof m->labels[i].value, labels[i].value);
    }
    if (type == METRIC_HISTOGRAM) {
        m->bucket_count = (int)(sizeof default_histogram_buckets /
                                sizeof default_histogram_buckets[0]);
        for (int i = 0; i < m->bucket_count; i++) {
            m->bucket_upper[i] = default_histogram_buckets[i];
        }
    }
    m->next = mp->metrics;
    mp->metrics = m;
    return m;
}

/*
 * Create a monitoring platform.
 *
 * Returns a heap-allocated, zero-initialized platform with retention_days = 7
 * and running = 1, or NULL when allocation or mutex initialization fails.
 * The caller owns the result.
 */
monitor_platform_t *monitor_create(void)
{
    monitor_platform_t *mp = calloc(1, sizeof *mp);
    if (mp == NULL) {
        return NULL;
    }
    mp->retention_days = 7;
    mp->running = 1;
    if (pthread_mutex_init(&mp->mutex, NULL) != 0) {
        free(mp);
        return NULL;
    }
    printf("监控平台启动\n");
    return mp;
}

/*
 * Add delta to the counter series (name, labels), creating it on first use.
 *
 * At most MAX_LABELS labels are used; names, keys and values longer than
 * their buffers are truncated. A counter only goes up, so a negative (or NaN)
 * delta is ignored. The call is a no-op when mp or name is NULL or when the
 * series cannot be allocated.
 */
void metric_counter_add(monitor_platform_t *mp, const char *name,
                        label_t *labels, int label_count, double delta)
{
    if (mp == NULL || name == NULL || !(delta >= 0.0)) {
        return;
    }
    pthread_mutex_lock(&mp->mutex);
    metric_value_t *m = metric_find_or_create(mp, METRIC_COUNTER, name,
                                              labels, label_count);
    if (m != NULL) {
        m->value += delta; /* a fresh series starts at 0 */
        m->timestamp = time(NULL);
    }
    pthread_mutex_unlock(&mp->mutex);
}

/*
 * Set the gauge series (name, labels) to value, creating it on first use.
 *
 * Label and length limits are the same as for metric_counter_add(). The call
 * is a no-op when mp or name is NULL or when the series cannot be allocated.
 */
void metric_gauge_set(monitor_platform_t *mp, const char *name,
                      label_t *labels, int label_count, double value)
{
    if (mp == NULL || name == NULL) {
        return;
    }
    pthread_mutex_lock(&mp->mutex);
    metric_value_t *m = metric_find_or_create(mp, METRIC_GAUGE, name,
                                              labels, label_count);
    if (m != NULL) {
        m->value = value;
        m->timestamp = time(NULL);
    }
    pthread_mutex_unlock(&mp->mutex);
}

/*
 * Record one observation in the histogram series (name, labels), creating it
 * with the default bucket bounds on first use.
 *
 * Buckets are cumulative: every bucket whose inclusive upper bound is >= value
 * is incremented, for the first observation and for all later ones alike.
 * Observations above the largest bound are reflected only in sum and count.
 */
void metric_histogram_observe(monitor_platform_t *mp, const char *name,
                              label_t *labels, int label_count, double value)
{
    if (mp == NULL || name == NULL) {
        return;
    }
    pthread_mutex_lock(&mp->mutex);
    metric_value_t *m = metric_find_or_create(mp, METRIC_HISTOGRAM, name,
                                              labels, label_count);
    if (m != NULL) {
        m->sum += value;
        m->count += 1;
        for (int i = 0; i < m->bucket_count; i++) {
            if (value <= m->bucket_upper[i]) {
                m->buckets[i] += 1;
            }
        }
        m->timestamp = time(NULL);
    }
    pthread_mutex_unlock(&mp->mutex);
}

告警规则引擎c// 添加告警规则void monitor_add_alert_rule(monitor_platform_t *mp, const char *name,const char *metric_name, const char *condition,double threshold, int for_seconds,const char *severity, const char *message) {pthread_mutex_lock(mp-mutex);alert_rule_t *rule malloc(sizeof(alert_rule_t));strcpy(rule-name, name);strcpy(rule-metric_name, metric_name);strcpy(rule-condition, condition);rule-threshold threshold;rule-for_seconds for_seconds;strcpy(rule-severity, severity);strcpy(rule-message, message);rule-next mp-alert_rules;mp-alert_rules rule;pthread_mutex_unlock(mp-mutex);printf([告警] 添加规则: %s (%s %s %.2f)\n,name, metric_name, condition, threshold);}// 检查告警条件int check_condition(double value, const char *condition, double threshold) {if (strcmp(condition, ) 0) return value threshold;if (strcmp(condition, ) 0) return value threshold;if (strcmp(condition, ) 0) return value threshold;if (strcmp(condition, ) 0) return value threshold;if (strcmp(condition, ) 0) return fabs(value - threshold) 0.0001;return 0;}// 评估告警规则void monitor_evaluate_alerts(monitor_platform_t *mp) {pthread_mutex_lock(mp-mutex);time_t now time(NULL);alert_rule_t *rule mp-alert_rules;while (rule) {// 查找对应的指标metric_value_t *m mp-metrics;while (m) {if (strcmp(m-name, rule-metric_name) 0) {int triggered check_condition(m-value, rule-condition, rule-threshold);if (triggered) {// 检查是否已存在告警alert_event_t *evt mp-alert_events;int found 0;while (evt) {if (strcmp(evt-rule_name, rule-name) 0 evt-active) {found 1;break;}evt evt-next;}if (!found) {// 创建新告警alert_event_t *new_evt malloc(sizeof(alert_event_t));strcpy(new_evt-rule_name, rule-name);strcpy(new_evt-metric_name, rule-metric_name);strcpy(new_evt-severity, rule-severity);snprintf(new_evt-message, sizeof(new_evt-message),%s (当前值: %.2f), rule-message, m-value);new_evt-current_value m-value;new_evt-start_time now;new_evt-end_time 0;new_evt-active 1;new_evt-next mp-alert_events;mp-alert_events new_evt;printf([告警] %s: %s (%.2f %s %.2f)\n,rule-severity, rule-name, 
m-value,rule-condition, rule-threshold);}} else {// 关闭告警alert_event_t *evt mp-alert_events;while (evt) {if (strcmp(evt-rule_name, rule-name) 0 evt-active) {evt-active 0;evt-end_time now;printf([告警] %s 已恢复\n, rule-name);}evt evt-next;}}}m m-next;}rule rule-next;}pthread_mutex_unlock(mp-mutex);}4. 监控线程c// 监控评估线程void *monitor_eval_thread(void *arg) {monitor_platform_t *mp (monitor_platform_t*)arg;while (mp-running) {sleep(10); // 每10秒评估一次monitor_evaluate_alerts(mp);}return NULL;}5. Dashboard生成c// 生成HTML Dashboardvoid monitor_generate_dashboard(monitor_platform_t *mp, const char *filename) {FILE *fp fopen(filename, w);if (!fp) return;fprintf(fp, !DOCTYPE html\n);fprintf(fp, htmlheadtitle监控Dashboard/title\n);fprintf(fp, style\n);fprintf(fp, body{font-family:monospace;padding:20px;background:#1a1a2e;color:#eee}\n);fprintf(fp, .metric{background:#16213e;padding:15px;margin:10px 0;border-radius:8px}\n);fprintf(fp, .metric-name{color:#e94560;font-weight:bold}\n);fprintf(fp, .metric-value{color:#0f3460;font-size:24px}\n);fprintf(fp, .label{color:#aaa;font-size:12px}\n);fprintf(fp, .critical{color:#ff6b6b}\n);fprintf(fp, .warning{color:#ffd93d}\n);fprintf(fp, .info{color:#6bcb77}\n);fprintf(fp, /style/headbody\n);fprintf(fp, h1 监控Dashboard/h1\n);// 显示指标pthread_mutex_lock(mp-mutex);metric_value_t *m mp-metrics;while (m) {fprintf(fp, div classmetric\n);fprintf(fp, span classmetric-name%s/span\n, m-name);// 显示标签if (m-label_count 0) {fprintf(fp, span classlabel);for (int i 0; i m-label_count; i) {fprintf(fp, %s%s , m-labels[i].key, m-labels[i].value);}fprintf(fp, /span\n);}// 显示值if (m-type METRIC_COUNTER) {fprintf(fp, div classmetric-value%.0f/div\n, m-value);} else if (m-type METRIC_GAUGE) {fprintf(fp, div classmetric-value%.2f/div\n, m-value);} else if (m-type METRIC_HISTOGRAM) {fprintf(fp, div classmetric-valuecount%.0f, sum%.2f/div\n,m-count, m-sum);}fprintf(fp, /div\n);m m-next;}// 显示告警fprintf(fp, h2 活跃告警/h2\n);alert_event_t *evt mp-alert_events;while (evt) {if (evt-active) 
{const char *cls (strcmp(evt-severity, critical) 0) ? critical :(strcmp(evt-severity, warning) 0) ? warning : info;fprintf(fp, div classmetric %s%s: %s/div\n,cls, evt-severity, evt-message);}evt evt-next;}pthread_mutex_unlock(mp-mutex);fprintf(fp, /body/html\n);fclose(fp);printf([Dashboard] 已生成: %s\n, filename);}6. 测试代码cvoid test_monitor() {printf( 全链路监控平台测试 \n\n);monitor_platform_t *mp monitor_create();// 添加告警规则monitor_add_alert_rule(mp, cpu_high, cpu_usage, , 80.0, 30,critical, CPU使用率过高);monitor_add_alert_rule(mp, error_rate_high, error_rate, , 5.0, 60,warning, 错误率过高);monitor_add_alert_rule(mp, memory_low, memory_free, , 1024.0, 60,critical, 可用内存不足);// 启动监控线程pthread_t eval_tid;pthread_create(eval_tid, NULL, monitor_eval_thread, mp);// 模拟指标采集printf([模拟] 开始采集指标...\n);for (int i 0; i 50; i) {label_t labels[2];strcpy(labels[0].key, service);strcpy(labels[0].value, order-service);strcpy(labels[1].key, env);strcpy(labels[1].value, prod);// 模拟Countermetric_counter_add(mp, http_requests_total, labels, 2, 10 rand() % 50);// 模拟Gaugedouble cpu 30 (rand() % 80);metric_gauge_set(mp, cpu_usage, labels, 2, cpu);// 模拟Histogramdouble latency (rand() % 1000) / 100.0;metric_histogram_observe(mp, request_duration_ms, labels, 2, latency);// 模拟错误率double error_rate (rand() % 10) / 100.0;metric_gauge_set(mp, error_rate, labels, 2, error_rate * 100);// 模拟内存double memory_free 500 (rand() % 4000);metric_gauge_set(mp, memory_free, labels, 2, memory_free);usleep(100000);if (i % 10 0) {printf([模拟] 采集 %d/50\n, i);}}sleep(2);// 生成Dashboardmonitor_generate_dashboard(mp, dashboard.html);mp-running 0;pthread_join(eval_tid, NULL);printf(\n✅ 测试完成打开 dashboard.html 查看监控面板\n);free(mp);}int main() {srand(time(NULL));test_monitor();return 0;}---三、编译和运行bashgcc -o monitor monitor.c -lpthread -lm./monitor---四、Prometheus vs 本实现特性 本实现 Prometheus指标采集 ✅ ✅多类型 Counter/Gauge/Histogram Counter/Gauge/Histogram/Summary标签支持 ✅ ✅告警规则 ✅ ✅ (Alertmanager)持久化 ❌ ✅ (TSDB)查询语言 ❌ ✅ (PromQL)可视化 ✅ (简单) ✅ (Grafana)服务发现 ❌ 
✅---五、总结通过这篇文章你学会了· 监控平台的核心原理指标采集、告警、可视化· 三种指标类型Counter、Gauge、Histogram· 告警规则引擎· 标签维度的使用· Dashboard生成全链路监控是可观测性的核心。掌握它你就拥有了提前发现系统问题的能力。下一篇预告《从零实现一个分布式任务调度平台XXL-JOB的核心设计》---评论区分享一下你用监控系统发现过什么问题