From 7016ef880ad6440b13049e52522112db7fe18543 Mon Sep 17 00:00:00 2001 From: Ingolf Wagner Date: Tue, 14 May 2024 22:36:08 +0200 Subject: [PATCH] opentelemetry all the way. --- nixos/machines/cherry/configuration.nix | 4 + .../cherry/telemetry/opentelemetry.nix | 82 ++++++++++++ .../machines/cherry/telemetry/prometheus.nix | 117 ++++++++++++++++++ nixos/machines/cherry/telemetry/telegraf.nix | 91 ++++++++++++++ .../chungus/telemetry/opentelemetry.nix | 49 +++++++- .../machines/chungus/telemetry/prometheus.nix | 81 ++++++------ nixos/machines/chungus/telemetry/telegraf.nix | 31 ++--- 7 files changed, 396 insertions(+), 59 deletions(-) create mode 100644 nixos/machines/cherry/telemetry/opentelemetry.nix create mode 100644 nixos/machines/cherry/telemetry/prometheus.nix create mode 100644 nixos/machines/cherry/telemetry/telegraf.nix diff --git a/nixos/machines/cherry/configuration.nix b/nixos/machines/cherry/configuration.nix index 8c4f22a..9a0cbc6 100644 --- a/nixos/machines/cherry/configuration.nix +++ b/nixos/machines/cherry/configuration.nix @@ -16,6 +16,10 @@ ./37c3.nix + ./telemetry/opentelemetry.nix + ./telemetry/prometheus.nix + ./telemetry/telegraf.nix + ]; diff --git a/nixos/machines/cherry/telemetry/opentelemetry.nix b/nixos/machines/cherry/telemetry/opentelemetry.nix new file mode 100644 index 0000000..af40aab --- /dev/null +++ b/nixos/machines/cherry/telemetry/opentelemetry.nix @@ -0,0 +1,82 @@ +{ pkgs, config, ... 
}: +{ + + services.opentelemetry-collector = { + enable = true; + package = pkgs.unstable.opentelemetry-collector-contrib; + settings = { + receivers = { + + # provide an influxdb sink (telegraf's outputs.influxdb_v2 writes to this endpoint) + influxdb = { + endpoint = "127.0.0.1:8088"; + }; + + # scrape netdata, the collector's own metrics (otelcol) and the node exporter + prometheus.config.scrape_configs = [ + { + job_name = "netdata"; + scrape_interval = "10s"; + metrics_path = "/api/v1/allmetrics"; + params.format = [ "prometheus" ]; + static_configs = [{ + targets = [ "127.0.0.1:19999" ]; + labels = { + service = "netdata"; + server = config.networking.hostName; + }; + }]; + } + { + job_name = "otelcol"; + scrape_interval = "10s"; + static_configs = [{ + targets = [ "127.0.0.1:8100" ]; + labels = { + service = "otelcol"; + server = config.networking.hostName; + }; + }]; + metric_relabel_configs = [ + { + source_labels = [ "__name__" ]; + regex = ".*grpc_io.*"; + action = "drop"; + } + ]; + } + { + job_name = "node"; + static_configs = [{ + targets = [ "127.0.0.1:${toString config.services.prometheus.exporters.node.port}" ]; + labels = { + service = "node-exporter"; + server = config.networking.hostName; + }; + }]; + } + ]; + }; + + exporters = { + # provide a prometheus sink under `/metrics` (scraped by the prometheus job "opentelemetry") + prometheus = { + endpoint = "127.0.0.1:8090"; + }; + otlp = { + endpoint = "10.100.0.2:4317"; # chungus + tls.insecure = true; + }; + }; + + service = { + pipelines.metrics = { + receivers = [ "influxdb" "prometheus" ]; + exporters = [ "prometheus" "otlp" ]; + }; + # the collector's own metrics, scraped by the "otelcol" job above; NOTE(review): listens on all interfaces although that job targets 127.0.0.1 — confirm + telemetry.metrics.address = "0.0.0.0:8100"; + }; + }; + }; +} diff --git a/nixos/machines/cherry/telemetry/prometheus.nix b/nixos/machines/cherry/telemetry/prometheus.nix new file mode 100644 index 0000000..73b43a3 --- /dev/null +++ b/nixos/machines/cherry/telemetry/prometheus.nix @@ -0,0 +1,117 @@ +{ config, pkgs, lib, ... 
}: { + + services.nginx = { + enable = true; + statusPage = true; + virtualHosts = { + "prometheus.${config.networking.hostName}.private" = { + extraConfig = '' + allow ${config.tinc.private.subnet}; + deny all; + ''; + locations."/" = { proxyPass = "http://localhost:${toString config.services.prometheus.port}"; }; + }; + }; + }; + + services.prometheus = { + checkConfig = "syntax-only"; + enable = true; + # keep data for 90 days + extraFlags = [ "--storage.tsdb.retention.time=90d" ]; + + ruleFiles = [ + (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON { + groups = [ + { + name = "core"; + rules = [ + { + alert = "InstanceDown"; + expr = "up == 0"; + for = "5m"; + labels.severity = "page"; + annotations = { + summary = "Instance {{ $labels.instance }} down"; + description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."; + }; + } + ]; + } + # todo : move this to open telemetry + { + name = "home-assistant"; + rules = [ + { + record = "home_open_window_sum"; + expr = ''sum( homeassistant_binary_sensor_state{entity=~"binary_sensor\\.window_02_contact|binary_sensor\\.window_03_contact|binary_sensor\\.window_04_contact|binary_sensor\\.window_05_contact|binary_sensor\\.window_06_contact|binary_sensor\\.window_07_contact"} )''; + } + ] ++ (map + (number: + { + record = "home_at_least_n_windows_open"; + expr = ''home_open_window_sum >= bool ${toString number}''; + labels.n = number; + }) [ 1 2 3 ]); + } + ]; + })) + ]; + + exporters = { + node = { + enable = true; + enabledCollectors = [ "systemd" ]; + port = 9002; + }; + }; + + scrapeConfigs = [ + { + job_name = "opentelemetry"; + metrics_path = "/metrics"; + scrape_interval = "10s"; + static_configs = [{ targets = [ "localhost:8090" ]; }]; + } + #{ + # job_name = "netdata"; + # metrics_path = "/api/v1/allmetrics"; + # params.format = [ "prometheus" ]; + # scrape_interval = "5s"; + # static_configs = [ + # { + # targets = [ "localhost:19999" ]; + # labels = { + # service = 
"netdata"; + # server = config.networking.hostName; + # }; + # } + # ]; + #} + #{ + # job_name = "node"; + # static_configs = [{ + # targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ]; + # labels = { + # service = "node-exporter"; + # server = config.networking.hostName; + # }; + # }]; + #} + #{ + # # see https://www.home-assistant.io/integrations/prometheus/ + # job_name = "home-assistant"; + # scrape_interval = "60s"; + # metrics_path = "/api/prometheus"; + # bearer_token_file = toString config.sops.secrets.hass_long_term_token.path; + # static_configs = [{ + # targets = [ "localhost:8123" ]; + # labels = { + # service = "hass"; + # server = config.networking.hostName; + # }; + # }]; + #} + ]; + }; +} diff --git a/nixos/machines/cherry/telemetry/telegraf.nix b/nixos/machines/cherry/telemetry/telegraf.nix new file mode 100644 index 0000000..de9cac8 --- /dev/null +++ b/nixos/machines/cherry/telemetry/telegraf.nix @@ -0,0 +1,91 @@ +{ config, pkgs, ... }: +let + urls = [ + { url = "https://bitwarden.ingolf-wagner.de"; path = ""; } + { url = "https://flix.ingolf-wagner.de"; path = "web/index.html"; } + { url = "https://git.ingolf-wagner.de"; path = ""; } + { url = "https://ingolf-wagner.de"; path = ""; } + { url = "https://nextcloud.ingolf-wagner.de"; path = "login"; } + { url = "https://tech.ingolf-wagner.de"; path = ""; } + { url = "https://matrix.ingolf-wagner.de"; path = ""; } + ]; + +in +{ + systemd.services.telegraf.path = [ pkgs.inetutils ]; + + services.telegraf = { + enable = true; + extraConfig = { + #outputs.prometheus_client = { + # listen = ":9273"; + # metric_version = 2; + #}; + outputs.influxdb_v2 = { + urls = [ "http://127.0.0.1:8088" ]; + }; + + global_tags = { + service = "telegraf"; + server = config.networking.hostName; + }; + + # https://github.com/influxdata/telegraf/tree/master/plugins/inputs < all them plugins + inputs = { + cpu = { }; + diskio = { }; + smart.attributes = true; + x509_cert = [{ + sources = 
(map (url: "${url.url}:443") urls); + interval = "30m"; # agent.interval = "10s" is default + }]; + http_response = + let fullUrls = map ({ url, path }: "${url}/${path}") urls; + in [{ urls = fullUrls; }]; + processes = { }; + system = { }; + systemd_units = { }; + internet_speed.interval = "10m"; + nginx.urls = [ "http://localhost/nginx_status" ]; + ping = [{ urls = [ "10.100.0.1" ]; }]; # actually important to make pepe visible over wireguard + }; + }; + }; + + # todo : do this prometheus + services.prometheus.ruleFiles = [ + (pkgs.writeText "telegraf.yml" (builtins.toJSON { + groups = [ + { + name = "telegraf"; + rules = [ + { + alert = "HttpResponseNotOk"; + expr = "0 * (http_response_http_response_code != 200) + 1"; + for = "5m"; + labels.severity = "page"; + annotations = { + summary = "{{ $labels.exported_server }} does not return Ok"; + description = "{{ $labels.exported_server }} does not return Ok for more than 5 minutes"; + }; + } + { + alert = "CertificatExpires"; + expr = ''x509_cert_expiry{issuer_common_name="R3"} < ${toString (60 * 60 * 24 * 5)}''; + for = "1d"; + labels.severity = "page"; + annotations = { + summary = "{{ $labels.san }} does Expire Soon"; + description = "{{ $labels.san }} does expire in less than 5 days"; + }; + } + ]; + } + ]; + })) + ]; + + + + +} diff --git a/nixos/machines/chungus/telemetry/opentelemetry.nix b/nixos/machines/chungus/telemetry/opentelemetry.nix index 85a0775..83953e1 100644 --- a/nixos/machines/chungus/telemetry/opentelemetry.nix +++ b/nixos/machines/chungus/telemetry/opentelemetry.nix @@ -1,5 +1,9 @@ -{ pkgs, ... }: +{ pkgs, config, ... 
}: { + + networking.firewall.interfaces.wg0.allowedTCPPorts = [ 4317 ]; + networking.firewall.interfaces.wg0.allowedUDPPorts = [ 4317 ]; + services.opentelemetry-collector = { enable = true; package = pkgs.unstable.opentelemetry-collector-contrib; @@ -21,12 +25,24 @@ scrape_interval = "10s"; metrics_path = "/api/v1/allmetrics"; params.format = [ "prometheus" ]; - static_configs = [{ targets = [ "127.0.0.1:19999" ]; }]; + static_configs = [{ + targets = [ "127.0.0.1:19999" ]; + labels = { + service = "netdata"; + server = config.networking.hostName; + }; + }]; } { job_name = "otelcol"; scrape_interval = "10s"; - static_configs = [{ targets = [ "127.0.0.1:8100" ]; }]; + static_configs = [{ + targets = [ "127.0.0.1:8100" ]; + labels = { + service = "otelcol"; + server = config.networking.hostName; + }; + }]; metric_relabel_configs = [ { source_labels = [ "__name__" ]; @@ -35,6 +51,30 @@ } ]; } + { + job_name = "node"; + static_configs = [{ + targets = [ "127.0.0.1:${toString config.services.prometheus.exporters.node.port}" ]; + labels = { + service = "node-exporter"; + server = config.networking.hostName; + }; + }]; + } + { + # see https://www.home-assistant.io/integrations/prometheus/ + job_name = "home-assistant"; + scrape_interval = "60s"; + metrics_path = "/api/prometheus"; + bearer_token_file = toString config.sops.secrets.hass_long_term_token.path; + static_configs = [{ + targets = [ "localhost:8123" ]; + labels = { + service = "hass"; + server = config.networking.hostName; + }; + }]; + } ]; }; @@ -47,7 +87,8 @@ service = { pipelines.metrics = { - receivers = [ "otlp" "influxdb" "prometheus" ]; + # NOTE(review): "prometheus" is no longer in this pipeline, so the scrape jobs above may not be collected — confirm; previously: receivers = [ "otlp" "influxdb" "prometheus" ]; + receivers = [ "otlp" "influxdb" ]; exporters = [ "prometheus" ]; }; # open telemetries own metrics? 
diff --git a/nixos/machines/chungus/telemetry/prometheus.nix b/nixos/machines/chungus/telemetry/prometheus.nix index 43a469e..c574852 100644 --- a/nixos/machines/chungus/telemetry/prometheus.nix +++ b/nixos/machines/chungus/telemetry/prometheus.nix @@ -40,6 +40,7 @@ } ]; } + # todo : move this to open telemetry { name = "home-assistant"; rules = [ @@ -69,44 +70,50 @@ scrapeConfigs = [ { - job_name = "netdata"; - metrics_path = "/api/v1/allmetrics"; - params.format = [ "prometheus" ]; - scrape_interval = "5s"; - static_configs = [ - { - targets = [ "localhost:19999" ]; - labels = { - service = "netdata"; - server = config.networking.hostName; - }; - } - ]; - } - { - job_name = "node"; - static_configs = [{ - targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ]; - labels = { - service = "node-exporter"; - server = config.networking.hostName; - }; - }]; - } - { - # see https://www.home-assistant.io/integrations/prometheus/ - job_name = "home-assistant"; - scrape_interval = "60s"; - metrics_path = "/api/prometheus"; - bearer_token_file = toString config.sops.secrets.hass_long_term_token.path; - static_configs = [{ - targets = [ "localhost:8123" ]; - labels = { - service = "hass"; - server = config.networking.hostName; - }; - }]; + job_name = "opentelemetry"; + metrics_path = "/metrics"; + scrape_interval = "10s"; + static_configs = [{ targets = [ "localhost:8090" ]; }]; } + #{ + # job_name = "netdata"; + # metrics_path = "/api/v1/allmetrics"; + # params.format = [ "prometheus" ]; + # scrape_interval = "5s"; + # static_configs = [ + # { + # targets = [ "localhost:19999" ]; + # labels = { + # service = "netdata"; + # server = config.networking.hostName; + # }; + # } + # ]; + #} + #{ + # job_name = "node"; + # static_configs = [{ + # targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ]; + # labels = { + # service = "node-exporter"; + # server = config.networking.hostName; + # }; + # }]; + #} + #{ + # # see 
https://www.home-assistant.io/integrations/prometheus/ + # job_name = "home-assistant"; + # scrape_interval = "60s"; + # metrics_path = "/api/prometheus"; + # bearer_token_file = toString config.sops.secrets.hass_long_term_token.path; + # static_configs = [{ + # targets = [ "localhost:8123" ]; + # labels = { + # service = "hass"; + # server = config.networking.hostName; + # }; + # }]; + #} ]; }; } diff --git a/nixos/machines/chungus/telemetry/telegraf.nix b/nixos/machines/chungus/telemetry/telegraf.nix index cebac07..de9cac8 100644 --- a/nixos/machines/chungus/telemetry/telegraf.nix +++ b/nixos/machines/chungus/telemetry/telegraf.nix @@ -17,10 +17,19 @@ in services.telegraf = { enable = true; extraConfig = { - outputs.prometheus_client = { - listen = ":9273"; - metric_version = 2; + #outputs.prometheus_client = { + # listen = ":9273"; + # metric_version = 2; + #}; + outputs.influxdb_v2 = { + urls = [ "http://127.0.0.1:8088" ]; }; + + global_tags = { + service = "telegraf"; + server = config.networking.hostName; + }; + # https://github.com/influxdata/telegraf/tree/master/plugins/inputs < all them plugins inputs = { cpu = { }; @@ -43,21 +52,7 @@ in }; }; - services.prometheus.scrapeConfigs = [ - { - # see https://www.home-assistant.io/integrations/prometheus/ - job_name = "telgraf"; - metrics_path = "/metrics"; - static_configs = [{ - targets = [ "localhost:9273" ]; - labels = { - service = "telegraf"; - server = config.networking.hostName; - }; - }]; - } - ]; - + # todo : do this prometheus services.prometheus.ruleFiles = [ (pkgs.writeText "telegraf.yml" (builtins.toJSON { groups = [