diff --git a/.dockerignore b/.dockerignore index 5eca8e1b80..c528ea1189 100644 --- a/.dockerignore +++ b/.dockerignore @@ -7,3 +7,4 @@ data/ !.build/linux-arm64/ !.build/linux-ppc64le/ !.build/linux-s390x/ +!.build/linux-riscv64/ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..432caee6f7 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +web/api/v1/testdata/openapi_golden.yaml linguist-generated diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ec4eef8dae..7873822f26 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -28,6 +28,7 @@ If no, just write "NONE" in the release-notes block below. Otherwise, please describe what should be mentioned in the CHANGELOG. Use the following prefixes: [FEATURE] [ENHANCEMENT] [PERF] [BUGFIX] [SECURITY] [CHANGE] Refer to the existing CHANGELOG for inspiration: https://github.com/prometheus/prometheus/blob/main/CHANGELOG.md +A concrete example may look as follows (be sure to leave out the surrounding quotes): "[FEATURE] API: Add /api/v1/features for clients to understand which features are supported". If you need help formulating your entries, consult the reviewer(s). --> ```release-notes diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8d25176252..87b6fb90a0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,6 +3,8 @@ name: CI on: pull_request: push: + branches: [main, 'release-*'] + tags: ['v*'] permissions: contents: read @@ -19,7 +21,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/setup_environment with: enable_npm: true @@ -37,7 +39,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/setup_environment - run: go test --tags=dedupelabels ./... - run: go test --tags=slicelabels -race ./cmd/prometheus ./model/textparse ./prompb/... 
@@ -81,7 +83,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/setup_environment with: enable_go: false @@ -100,7 +102,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # v6.1.0 + - uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0 with: go-version: 1.25.x - run: | @@ -146,7 +148,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/build with: promu_opts: "-p linux/amd64 -p windows/amd64 -p linux/arm64 -p darwin/amd64 -p darwin/arm64 -p linux/386" @@ -173,7 +175,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/build with: parallelism: 12 @@ -212,7 +214,7 @@ jobs: uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/setup_environment with: enable_npm: true @@ -227,7 +229,7 @@ jobs: with: persist-credentials: false - name: Install Go - uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # v6.1.0 + uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0 with: go-version: 1.25.x - name: Install snmp_exporter/generator dependencies @@ -270,7 +272,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/publish_main with: docker_hub_login: ${{ secrets.docker_hub_login }} @@ -289,7 +291,7 @@ jobs: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - uses: ./.github/promci/actions/publish_release with: docker_hub_login: ${{ secrets.docker_hub_login }} @@ -306,13 +308,13 @@ jobs: uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false - - uses: prometheus/promci@c0916f0a41f13444612a8f0f5e700ea34edd7c19 # v0.5.3 + - uses: prometheus/promci@fc721ff8497a70a93a881cd552b71af7fb3a9d53 # v0.5.4 - name: Install nodejs uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0 with: node-version-file: "web/ui/.nvmrc" registry-url: "https://registry.npmjs.org" - - uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1 + - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 
with: path: ~/.npm key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} diff --git a/.github/workflows/fuzzing.yml b/.github/workflows/fuzzing.yml index 776e0a67c5..0afcbe6f0c 100644 --- a/.github/workflows/fuzzing.yml +++ b/.github/workflows/fuzzing.yml @@ -13,11 +13,11 @@ jobs: fuzz_test: [FuzzParseMetricText, FuzzParseOpenMetric, FuzzParseMetricSelector, FuzzParseExpr] steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 with: persist-credentials: false - name: Install Go - uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0 + uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0 with: go-version: 1.25.x - name: Run Fuzzing @@ -29,7 +29,7 @@ jobs: if: failure() with: name: fuzz-artifacts-${{ matrix.fuzz_test }} - path: promql/testdata/fuzz/${{ matrix.fuzz_test }} + path: util/fuzzing/testdata/fuzz/${{ matrix.fuzz_test }} fuzzing_status: # This status check aggregates the individual matrix jobs of the fuzzing # step into a final status. Fails if a single matrix job fails, succeeds if diff --git a/.golangci.yml b/.golangci.yml index 0c866611e9..8cb3265f4f 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -102,6 +102,10 @@ linters: desc: "Use github.com/klauspost/compress instead of zlib" - pkg: "golang.org/x/exp/slices" desc: "Use 'slices' instead." + - pkg: "gopkg.in/yaml.v2" + desc: "Use go.yaml.in/yaml/v2 instead of gopkg.in/yaml.v2" + - pkg: "gopkg.in/yaml.v3" + desc: "Use go.yaml.in/yaml/v3 instead of gopkg.in/yaml.v3" errcheck: exclude-functions: # Don't flag lines such as "io.Copy(io.Discard, resp.Body)". @@ -124,6 +128,8 @@ linters: # Disable this check for now since it introduces too many changes in our existing codebase. # See https://pkg.go.dev/golang.org/x/tools/go/analysis/passes/modernize#hdr-Analyzer_omitzero for more details. - omitzero + # Disable waitgroup check until we really move to Go 1.25. + - waitgroup perfsprint: # Optimizes even if it requires an int or uint type cast. int-conversion: true diff --git a/.yamllint b/.yamllint index 8d09c375fd..b329f464fb 100644 --- a/.yamllint +++ b/.yamllint @@ -2,6 +2,7 @@ extends: default ignore: | **/node_modules + web/api/v1/testdata/openapi_*_golden.yaml rules: braces: diff --git a/CODEOWNERS b/CODEOWNERS index f28cdbf832..2c5dedbffa 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,25 +2,28 @@ # Please keep this file in sync with the MAINTAINERS.md file! # +# Prometheus team members are members of the "default maintainers" github team. +# They are code owners by default for the whole repo. +* @prometheus/default-maintainers + # Subsystems. 
-/Makefile @simonpasquier @SuperQ -/cmd/promtool @dgl -/documentation/prometheus-mixin @metalmatze -/model/histogram @beorn7 @krajorama -/web/ui @juliusv -/web/ui/module @juliusv @nexucis -/promql @roidelapluie -/storage/remote @cstyan @bwplotka @tomwilkie @npazosmendez @alexgreenbank -/storage/remote/otlptranslator @aknuds1 @jesusvazquez @ArthurSens -/tsdb @jesusvazquez @codesome @bwplotka @krajorama +/Makefile @prometheus/default-maintainers @simonpasquier @SuperQ +/cmd/promtool @prometheus/default-maintainers @dgl +/documentation/prometheus-mixin @prometheus/default-maintainers @metalmatze +/model/histogram @prometheus/default-maintainers @beorn7 @krajorama +/web/ui @prometheus/default-maintainers @juliusv +/web/ui/module @prometheus/default-maintainers @juliusv @nexucis +/promql @prometheus/default-maintainers @roidelapluie +/storage/remote @prometheus/default-maintainers @cstyan @bwplotka @tomwilkie @alexgreenbank +/storage/remote/otlptranslator @prometheus/default-maintainers @aknuds1 @jesusvazquez @ArthurSens +/tsdb @prometheus/default-maintainers @jesusvazquez @codesome @bwplotka @krajorama # Service discovery. -/discovery/kubernetes @brancz -/discovery/stackit @jkroepke +/discovery/kubernetes @prometheus/default-maintainers @brancz +/discovery/stackit @prometheus/default-maintainers @jkroepke +/discovery/aws/ @prometheus/default-maintainers @matt-gp @sysadmind # Pending -# https://github.com/prometheus/prometheus/pull/17105#issuecomment-3248209452 -# /discovery/aws/ @matt-gp @sysadmind # https://github.com/prometheus/prometheus/pull/15212#issuecomment-3575225179 -# /discovery/aliyun @KeyOfSpectator +# /discovery/aliyun @prometheus/default-maintainers @KeyOfSpectator # https://github.com/prometheus/prometheus/pull/14108#issuecomment-2639515421 -# /discovery/nomad @jaloren @jrasell +# /discovery/nomad @prometheus/default-maintainers @jaloren @jrasell diff --git a/Dockerfile b/Dockerfile index 071e7441e3..98712d8f9c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,8 @@ LABEL org.opencontainers.image.authors="The Prometheus Authors" \ org.opencontainers.image.source="https://github.com/prometheus/prometheus" \ org.opencontainers.image.url="https://github.com/prometheus/prometheus" \ org.opencontainers.image.documentation="https://prometheus.io/docs" \ - org.opencontainers.image.licenses="Apache License 2.0" + org.opencontainers.image.licenses="Apache License 2.0" \ + io.prometheus.image.variant="busybox" ARG ARCH="amd64" ARG OS="linux" diff --git a/Dockerfile.distroless b/Dockerfile.distroless new file mode 100644 index 0000000000..0ee184a91c --- /dev/null +++ b/Dockerfile.distroless @@ -0,0 +1,29 @@ +ARG DISTROLESS_ARCH="amd64" + +# Use DISTROLESS_ARCH for base image selection (handles armv7->arm mapping). +FROM gcr.io/distroless/static-debian13:nonroot-${DISTROLESS_ARCH} +# Base image sets USER to 65532:65532 (nonroot user). 
+ +ARG ARCH="amd64" +ARG OS="linux" + +LABEL org.opencontainers.image.authors="The Prometheus Authors" +LABEL org.opencontainers.image.vendor="Prometheus" +LABEL org.opencontainers.image.title="Prometheus" +LABEL org.opencontainers.image.description="The Prometheus monitoring system and time series database" +LABEL org.opencontainers.image.source="https://github.com/prometheus/prometheus" +LABEL org.opencontainers.image.url="https://github.com/prometheus/prometheus" +LABEL org.opencontainers.image.documentation="https://prometheus.io/docs" +LABEL org.opencontainers.image.licenses="Apache License 2.0" +LABEL io.prometheus.image.variant="distroless" + +COPY documentation/examples/prometheus.yml /etc/prometheus/prometheus.yml +COPY LICENSE NOTICE npm_licenses.tar.bz2 / +COPY .build/${OS}-${ARCH}/prometheus /bin/prometheus +COPY .build/${OS}-${ARCH}/promtool /bin/promtool + +WORKDIR /prometheus +EXPOSE 9090 +ENTRYPOINT [ "/bin/prometheus" ] +CMD [ "--config.file=/etc/prometheus/prometheus.yml", \ + "--storage.tsdb.path=/prometheus" ] diff --git a/MAINTAINERS.md b/MAINTAINERS.md index f23c7fbd63..ae61059af5 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -6,6 +6,7 @@ General maintainers: * Bryan Boreham (bjboreham@gmail.com / @bboreham) * Ayoub Mrini (ayoubmrini424@gmail.com / @machine424) * Julien Pivotto (roidelapluie@prometheus.io / @roidelapluie) +* György Krajcsovits ( / @krajorama) Maintainers for specific parts of the codebase: * `cmd` @@ -15,12 +16,10 @@ Maintainers for specific parts of the codebase: * `stackit`: Jan-Otto Kröpke ( / @jkroepke) * `documentation` * `prometheus-mixin`: Matthias Loibl ( / @metalmatze) -* `model/histogram` and other code related to native histograms: Björn Rabenstein ( / @beorn7), -George Krajcsovits ( / @krajorama) * `storage` - * `remote`: Callum Styan ( / @cstyan), Bartłomiej Płotka ( / @bwplotka), Tom Wilkie (tom.wilkie@gmail.com / @tomwilkie), Nicolás Pazos ( / @npazosmendez), Alex Greenbank ( / @alexgreenbank) + * `remote`: Callum Styan ( / @cstyan), Bartłomiej Płotka ( / @bwplotka), Tom Wilkie (tom.wilkie@gmail.com / @tomwilkie), Alex Greenbank ( / @alexgreenbank) * `otlptranslator`: Arthur Silva Sens ( / @ArthurSens), Arve Knudsen ( / @aknuds1), Jesús Vázquez ( / @jesusvazquez) -* `tsdb`: Ganesh Vernekar ( / @codesome), Bartłomiej Płotka ( / @bwplotka), Jesús Vázquez ( / @jesusvazquez), George Krajcsovits ( / @krajorama) +* `tsdb`: Ganesh Vernekar ( / @codesome), Bartłomiej Płotka ( / @bwplotka), Jesús Vázquez ( / @jesusvazquez) * `web` * `ui`: Julius Volz ( / @juliusv) * `module`: Augustin Husson ( / @nexucis) diff --git a/Makefile b/Makefile index 8bc4a3dcaa..ad4b90f020 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ # limitations under the License. # Needs to be defined before including Makefile.common to auto-generate targets -DOCKER_ARCHS ?= amd64 armv7 arm64 ppc64le s390x +DOCKER_ARCHS ?= amd64 armv7 arm64 ppc64le riscv64 s390x UI_PATH = web/ui UI_NODE_MODULES_PATH = $(UI_PATH)/node_modules diff --git a/Makefile.common b/Makefile.common index 7beae6e58f..b8c9b3844c 100644 --- a/Makefile.common +++ b/Makefile.common @@ -82,11 +82,32 @@ endif PREFIX ?= $(shell pwd) BIN_DIR ?= $(shell pwd) DOCKER_IMAGE_TAG ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD)) -DOCKERFILE_PATH ?= ./Dockerfile DOCKERBUILD_CONTEXT ?= ./ DOCKER_REPO ?= prom +# Check if deprecated DOCKERFILE_PATH is set +ifdef DOCKERFILE_PATH +$(error DOCKERFILE_PATH is deprecated. 
Use DOCKERFILE_VARIANTS ?= $(DOCKERFILE_PATH) in the Makefile) +endif + DOCKER_ARCHS ?= amd64 +DOCKERFILE_VARIANTS ?= Dockerfile $(wildcard Dockerfile.*) + +# Function to extract variant from Dockerfile label. +# Returns the variant name from io.prometheus.image.variant label, or "default" if not found. +define dockerfile_variant +$(strip $(or $(shell sed -n 's/.*io\.prometheus\.image\.variant="\([^"]*\)".*/\1/p' $(1)),default)) +endef + +# Check for duplicate variant names (including default for Dockerfiles without labels). +DOCKERFILE_VARIANT_NAMES := $(foreach df,$(DOCKERFILE_VARIANTS),$(call dockerfile_variant,$(df))) +DOCKERFILE_VARIANT_NAMES_SORTED := $(sort $(DOCKERFILE_VARIANT_NAMES)) +ifneq ($(words $(DOCKERFILE_VARIANT_NAMES)),$(words $(DOCKERFILE_VARIANT_NAMES_SORTED))) +$(error Duplicate variant names found. Each Dockerfile must have a unique io.prometheus.image.variant label, and only one can be without a label (default)) +endif + +# Build variant:dockerfile pairs for shell iteration. +DOCKERFILE_VARIANTS_WITH_NAMES := $(foreach df,$(DOCKERFILE_VARIANTS),$(call dockerfile_variant,$(df)):$(df)) BUILD_DOCKER_ARCHS = $(addprefix common-docker-,$(DOCKER_ARCHS)) PUBLISH_DOCKER_ARCHS = $(addprefix common-docker-publish-,$(DOCKER_ARCHS)) @@ -226,28 +247,110 @@ common-docker-repo-name: .PHONY: common-docker $(BUILD_DOCKER_ARCHS) common-docker: $(BUILD_DOCKER_ARCHS) $(BUILD_DOCKER_ARCHS): common-docker-%: - docker build -t "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" \ - -f $(DOCKERFILE_PATH) \ - --build-arg ARCH="$*" \ - --build-arg OS="linux" \ - $(DOCKERBUILD_CONTEXT) + @for variant in $(DOCKERFILE_VARIANTS_WITH_NAMES); do \ + dockerfile=$${variant#*:}; \ + variant_name=$${variant%%:*}; \ + distroless_arch="$*"; \ + if [ "$*" = "armv7" ]; then \ + distroless_arch="arm"; \ + fi; \ + if [ "$$dockerfile" = "Dockerfile" ]; then \ + echo "Building default variant ($$variant_name) for linux-$* using $$dockerfile"; \ + docker build -t "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" \ + -f $$dockerfile \ + --build-arg ARCH="$*" \ + --build-arg OS="linux" \ + --build-arg DISTROLESS_ARCH="$$distroless_arch" \ + $(DOCKERBUILD_CONTEXT); \ + if [ "$$variant_name" != "default" ]; then \ + echo "Tagging default variant with $$variant_name suffix"; \ + docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" \ + "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name"; \ + fi; \ + else \ + echo "Building $$variant_name variant for linux-$* using $$dockerfile"; \ + docker build -t "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name" \ + -f $$dockerfile \ + --build-arg ARCH="$*" \ + --build-arg OS="linux" \ + --build-arg DISTROLESS_ARCH="$$distroless_arch" \ + $(DOCKERBUILD_CONTEXT); \ + fi; \ + done .PHONY: common-docker-publish $(PUBLISH_DOCKER_ARCHS) common-docker-publish: $(PUBLISH_DOCKER_ARCHS) $(PUBLISH_DOCKER_ARCHS): common-docker-publish-%: - docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" + @for variant in $(DOCKERFILE_VARIANTS_WITH_NAMES); do \ + dockerfile=$${variant#*:}; \ + variant_name=$${variant%%:*}; \ + if [ "$$dockerfile" != "Dockerfile" ] || [ "$$variant_name" != "default" ]; then \ + echo "Pushing $$variant_name variant for linux-$*"; \ + docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name"; \ + fi; \ + if [ "$$dockerfile" = 
"Dockerfile" ]; then \ + echo "Pushing default variant ($$variant_name) for linux-$*"; \ + docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)"; \ + fi; \ + if [ "$(DOCKER_IMAGE_TAG)" = "latest" ]; then \ + if [ "$$dockerfile" != "Dockerfile" ] || [ "$$variant_name" != "default" ]; then \ + echo "Pushing $$variant_name variant version tags for linux-$*"; \ + docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:v$(DOCKER_MAJOR_VERSION_TAG)-$$variant_name"; \ + fi; \ + if [ "$$dockerfile" = "Dockerfile" ]; then \ + echo "Pushing default variant version tag for linux-$*"; \ + docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:v$(DOCKER_MAJOR_VERSION_TAG)"; \ + fi; \ + fi; \ + done DOCKER_MAJOR_VERSION_TAG = $(firstword $(subst ., ,$(shell cat VERSION))) .PHONY: common-docker-tag-latest $(TAG_DOCKER_ARCHS) common-docker-tag-latest: $(TAG_DOCKER_ARCHS) $(TAG_DOCKER_ARCHS): common-docker-tag-latest-%: - docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:latest" - docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:v$(DOCKER_MAJOR_VERSION_TAG)" + @for variant in $(DOCKERFILE_VARIANTS_WITH_NAMES); do \ + dockerfile=$${variant#*:}; \ + variant_name=$${variant%%:*}; \ + if [ "$$dockerfile" != "Dockerfile" ] || [ "$$variant_name" != "default" ]; then \ + echo "Tagging $$variant_name variant for linux-$* as latest"; \ + docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:latest-$$variant_name"; \ + docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:v$(DOCKER_MAJOR_VERSION_TAG)-$$variant_name"; \ + fi; \ + if [ "$$dockerfile" = "Dockerfile" ]; then \ + echo "Tagging default variant ($$variant_name) for linux-$* as latest"; \ + docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:latest"; \ + docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(SANITIZED_DOCKER_IMAGE_TAG)" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:v$(DOCKER_MAJOR_VERSION_TAG)"; \ + fi; \ + done .PHONY: common-docker-manifest common-docker-manifest: - DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create -a "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(SANITIZED_DOCKER_IMAGE_TAG)" $(foreach ARCH,$(DOCKER_ARCHS),$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$(ARCH):$(SANITIZED_DOCKER_IMAGE_TAG)) - DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(SANITIZED_DOCKER_IMAGE_TAG)" + @for variant in $(DOCKERFILE_VARIANTS_WITH_NAMES); do \ + dockerfile=$${variant#*:}; \ + variant_name=$${variant%%:*}; \ + if [ "$$dockerfile" != "Dockerfile" ] || [ "$$variant_name" != "default" ]; then \ + echo "Creating manifest for $$variant_name variant"; \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create -a "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name" $(foreach ARCH,$(DOCKER_ARCHS),$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$(ARCH):$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name); \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(SANITIZED_DOCKER_IMAGE_TAG)-$$variant_name"; \ + fi; \ + if [ "$$dockerfile" = "Dockerfile" ]; then \ + echo "Creating 
default variant ($$variant_name) manifest"; \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create -a "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(SANITIZED_DOCKER_IMAGE_TAG)" $(foreach ARCH,$(DOCKER_ARCHS),$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$(ARCH):$(SANITIZED_DOCKER_IMAGE_TAG)); \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(SANITIZED_DOCKER_IMAGE_TAG)"; \ + fi; \ + if [ "$(DOCKER_IMAGE_TAG)" = "latest" ]; then \ + if [ "$$dockerfile" != "Dockerfile" ] || [ "$$variant_name" != "default" ]; then \ + echo "Creating manifest for $$variant_name variant version tag"; \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create -a "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):v$(DOCKER_MAJOR_VERSION_TAG)-$$variant_name" $(foreach ARCH,$(DOCKER_ARCHS),$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$(ARCH):v$(DOCKER_MAJOR_VERSION_TAG)-$$variant_name); \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):v$(DOCKER_MAJOR_VERSION_TAG)-$$variant_name"; \ + fi; \ + if [ "$$dockerfile" = "Dockerfile" ]; then \ + echo "Creating default variant version tag manifest"; \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create -a "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):v$(DOCKER_MAJOR_VERSION_TAG)" $(foreach ARCH,$(DOCKER_ARCHS),$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$(ARCH):v$(DOCKER_MAJOR_VERSION_TAG)); \ + DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):v$(DOCKER_MAJOR_VERSION_TAG)"; \ + fi; \ + fi; \ + done .PHONY: promu promu: $(PROMU) diff --git a/README.md b/README.md index 7b04a51cee..030a827952 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,15 @@ produce a fully working image when run locally. ## Using Prometheus as a Go Library +Within the Prometheus project, repositories such as [prometheus/common](https://github.com/prometheus/common) and +[prometheus/client-golang](https://github.com/prometheus/client-golang) are designed as re-usable libraries. + +The [prometheus/prometheus](https://github.com/prometheus/prometheus) repository builds a stand-alone program and is not +designed for use as a library. We are aware that people do use parts as such, +and we do not put any deliberate inconvenience in the way, but we want you to be +aware that no care has been taken to make it work well as a library. For instance, +you may encounter errors that only surface when used as a library. + ### Remote Write We are publishing our Remote Write protobuf independently at diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index b06b6095b3..6bee6dd25d 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -219,6 +219,8 @@ type flagConfig struct { promqlEnableDelayedNameRemoval bool + parserOpts parser.Options + promslogConfig promslog.Config } @@ -256,23 +258,36 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { c.enableConcurrentRuleEval = true logger.Info("Experimental concurrent rule evaluation enabled.") case "promql-experimental-functions": - parser.EnableExperimentalFunctions = true + c.parserOpts.EnableExperimentalFunctions = true logger.Info("Experimental PromQL functions enabled.") case "promql-duration-expr": - parser.ExperimentalDurationExpr = true + c.parserOpts.ExperimentalDurationExpr = true logger.Info("Experimental duration expression parsing enabled.") case "native-histograms": logger.Warn("This option for --enable-feature is a no-op. 
To scrape native histograms, set the scrape_native_histograms scrape config setting to true.", "option", o) case "ooo-native-histograms": logger.Warn("This option for --enable-feature is now permanently enabled and therefore a no-op.", "option", o) case "created-timestamp-zero-ingestion": + // NOTE(bwplotka): Once AppendableV1 is removed, there will be only the TSDB and agent flags. c.scrape.EnableStartTimestampZeroIngestion = true c.web.STZeroIngestionEnabled = true + c.tsdb.EnableSTAsZeroSample = true c.agent.EnableSTAsZeroSample = true + // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers. + // This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental created timestamp zero ingestion enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp zero ingestion enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + case "st-storage": + // TODO(bwplotka): Implement ST Storage as per PROM-60 and document this hidden feature flag. + c.tsdb.EnableSTStorage = true + c.agent.EnableSTStorage = true + + // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers. This is to widen the ST support surface. + config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols + config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols + logger.Info("Experimental start timestamp storage enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "delayed-compaction": c.tsdb.EnableDelayedCompaction = true logger.Info("Experimental delayed compaction is enabled.") @@ -280,8 +295,11 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { c.promqlEnableDelayedNameRemoval = true logger.Info("Experimental PromQL delayed name removal enabled.") case "promql-extended-range-selectors": - parser.EnableExtendedRangeSelectors = true + c.parserOpts.EnableExtendedRangeSelectors = true logger.Info("Experimental PromQL extended range selectors enabled.") + case "promql-binop-fill-modifiers": + c.parserOpts.EnableBinopFillModifiers = true + logger.Info("Experimental PromQL binary operator fill modifiers enabled.") case "": continue case "old-ui": @@ -581,7 +599,7 @@ func main() { a.Flag("scrape.discovery-reload-interval", "Interval used by scrape manager to throttle target groups updates."). Hidden().Default("5s").SetValue(&cfg.scrape.DiscoveryReloadInterval) - a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). 
+ a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). Default("").StringsVar(&cfg.featureList) a.Flag("agent", "Run Prometheus in 'Agent mode'.").BoolVar(&agentMode) @@ -617,6 +635,8 @@ func main() { os.Exit(1) } + promqlParser := parser.NewParser(cfg.parserOpts) + if agentMode && len(serverOnlyFlags) > 0 { fmt.Fprintf(os.Stderr, "The following flag(s) can not be used in agent mode: %q", serverOnlyFlags) os.Exit(3) @@ -671,7 +691,7 @@ func main() { } // Parse rule files to verify they exist and contain valid rules. - if err := rules.ParseFiles(cfgFile.RuleFiles, cfgFile.GlobalConfig.MetricNameValidationScheme); err != nil { + if err := rules.ParseFiles(cfgFile.RuleFiles, cfgFile.GlobalConfig.MetricNameValidationScheme, promqlParser); err != nil { absPath, pathErr := filepath.Abs(cfg.configFile) if pathErr != nil { absPath = cfg.configFile @@ -692,6 +712,7 @@ func main() { } if cfgFile.StorageConfig.TSDBConfig != nil { cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow + cfg.tsdb.StaleSeriesCompactionThreshold = cfgFile.StorageConfig.TSDBConfig.StaleSeriesCompactionThreshold if cfgFile.StorageConfig.TSDBConfig.Retention != nil { if cfgFile.StorageConfig.TSDBConfig.Retention.Time > 0 { cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time @@ -881,7 +902,7 @@ func main() { &cfg.scrape, logger.With("component", "scrape manager"), logging.NewJSONFileLogger, - fanoutStorage, + nil, fanoutStorage, prometheus.DefaultRegisterer, ) if err != nil { @@ -913,6 +934,7 @@ func main() { EnableDelayedNameRemoval: cfg.promqlEnableDelayedNameRemoval, EnableTypeAndUnitLabels: cfg.scrape.EnableTypeAndUnitLabels, FeatureRegistry: features.DefaultRegistry, + Parser: promqlParser, } queryEngine = promql.NewEngine(opts) @@ -936,6 +958,7 @@ func main() { return time.Duration(cfgFile.GlobalConfig.RuleQueryOffset) }, FeatureRegistry: features.DefaultRegistry, + Parser: promqlParser, }) } @@ -955,6 +978,7 @@ func main() { cfg.web.LookbackDelta = time.Duration(cfg.lookbackDelta) cfg.web.IsAgent = agentMode cfg.web.AppName = modeAppName + cfg.web.Parser = promqlParser cfg.web.Version = &web.PrometheusVersion{ Version: version.Version, @@ -1373,6 +1397,8 @@ func main() { "WALSegmentSize", cfg.tsdb.WALSegmentSize, "WALCompressionType", cfg.tsdb.WALCompressionType, "BlockReloadInterval", cfg.tsdb.BlockReloadInterval, + "EnableSTAsZeroSample", cfg.tsdb.EnableSTAsZeroSample, + "EnableSTStorage", cfg.tsdb.EnableSTStorage, ) startTimeMargin := int64(2 * time.Duration(cfg.tsdb.MinBlockDuration).Seconds() * 1000) @@ -1430,6 +1456,7 @@ func main() { "MaxWALTime", cfg.agent.MaxWALTime, "OutOfOrderTimeWindow", cfg.agent.OutOfOrderTimeWindow, "EnableSTAsZeroSample", cfg.agent.EnableSTAsZeroSample, + "EnableSTStorage", cfg.tsdb.EnableSTStorage, ) localStorage.Set(db, 0) @@ -1581,7 +1608,7 @@ func reloadConfig(filename string, enableExemplarStorage bool, logger *slog.Logg logger.Error("Failed to apply configuration", "err", err) failed = true } - timingsLogger = timingsLogger.With((rl.name), 
time.Since(rstart)) + timingsLogger = timingsLogger.With(rl.name, time.Since(rstart)) } if failed { return fmt.Errorf("one or more errors occurred while applying the new configuration (--config.file=%q)", filename) @@ -1755,6 +1782,14 @@ func (s *readyStorage) Appender(ctx context.Context) storage.Appender { return notReadyAppender{} } +// AppenderV2 implements the Storage interface. +func (s *readyStorage) AppenderV2(ctx context.Context) storage.AppenderV2 { + if x := s.get(); x != nil { + return x.AppenderV2(ctx) + } + return notReadyAppenderV2{} +} + type notReadyAppender struct{} // SetOptions does nothing in this appender implementation. @@ -1788,6 +1823,15 @@ func (notReadyAppender) Commit() error { return tsdb.ErrNotReady } func (notReadyAppender) Rollback() error { return tsdb.ErrNotReady } +type notReadyAppenderV2 struct{} + +func (notReadyAppenderV2) Append(storage.SeriesRef, labels.Labels, int64, int64, float64, *histogram.Histogram, *histogram.FloatHistogram, storage.AOptions) (storage.SeriesRef, error) { + return 0, tsdb.ErrNotReady +} +func (notReadyAppenderV2) Commit() error { return tsdb.ErrNotReady } + +func (notReadyAppenderV2) Rollback() error { return tsdb.ErrNotReady } + // Close implements the Storage interface. func (s *readyStorage) Close() error { if x := s.get(); x != nil { @@ -1932,6 +1976,9 @@ type tsdbOptions struct { UseUncachedIO bool BlockCompactionExcludeFunc tsdb.BlockExcludeFilterFunc BlockReloadInterval model.Duration + EnableSTAsZeroSample bool + EnableSTStorage bool + StaleSeriesCompactionThreshold float64 } func (opts tsdbOptions) ToTSDBOptions() tsdb.Options { @@ -1958,6 +2005,9 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options { BlockCompactionExcludeFunc: opts.BlockCompactionExcludeFunc, BlockReloadInterval: time.Duration(opts.BlockReloadInterval), FeatureRegistry: features.DefaultRegistry, + EnableSTAsZeroSample: opts.EnableSTAsZeroSample, + EnableSTStorage: opts.EnableSTStorage, + StaleSeriesCompactionThreshold: opts.StaleSeriesCompactionThreshold, } } @@ -1972,6 +2022,7 @@ type agentOptions struct { NoLockfile bool OutOfOrderTimeWindow int64 // TODO(bwplotka): Unused option, fix it or remove. 
EnableSTAsZeroSample bool + EnableSTStorage bool } func (opts agentOptions) ToAgentOptions(outOfOrderTimeWindow int64) agent.Options { @@ -1988,6 +2039,7 @@ func (opts agentOptions) ToAgentOptions(outOfOrderTimeWindow int64) agent.Option NoLockfile: opts.NoLockfile, OutOfOrderTimeWindow: outOfOrderTimeWindow, EnableSTAsZeroSample: opts.EnableSTAsZeroSample, + EnableSTStorage: opts.EnableSTStorage, } } diff --git a/cmd/prometheus/main_test.go b/cmd/prometheus/main_test.go index 6765bae900..38dfd3f2da 100644 --- a/cmd/prometheus/main_test.go +++ b/cmd/prometheus/main_test.go @@ -395,6 +395,7 @@ func TestTimeMetrics(t *testing.T) { } func getCurrentGaugeValuesFor(t *testing.T, reg prometheus.Gatherer, metricNames ...string) map[string]float64 { + t.Helper() f, err := reg.Gather() require.NoError(t, err) @@ -426,7 +427,7 @@ func TestAgentSuccessfulStartup(t *testing.T) { go func() { done <- prom.Wait() }() select { case err := <-done: - t.Logf("prometheus agent should be still running: %v", err) + t.Logf("prometheus agent exited early: %v", err) actualExitStatus = prom.ProcessState.ExitCode() case <-time.After(startupTime): prom.Process.Kill() @@ -571,12 +572,7 @@ func TestDocumentation(t *testing.T) { var stdout bytes.Buffer cmd.Stdout = &stdout - if err := cmd.Run(); err != nil { - var exitError *exec.ExitError - if errors.As(err, &exitError) && exitError.ExitCode() != 0 { - fmt.Println("Command failed with non-zero exit code") - } - } + require.NoError(t, cmd.Run(), "failed to generate CLI documentation via --write-documentation") generatedContent := strings.ReplaceAll(stdout.String(), filepath.Base(promPath), strings.TrimSuffix(filepath.Base(promPath), ".test")) @@ -753,7 +749,7 @@ global: configFile := filepath.Join(tmpDir, "prometheus.yml") port := testutil.RandomUnprivilegedPort(t) - os.WriteFile(configFile, []byte(tc.config), 0o777) + require.NoError(t, os.WriteFile(configFile, []byte(tc.config), 0o777)) prom := prometheusCommandWithLogging( t, configFile, @@ -801,7 +797,7 @@ global: newConfig := ` runtime: gogc: 99` - os.WriteFile(configFile, []byte(newConfig), 0o777) + require.NoError(t, os.WriteFile(configFile, []byte(newConfig), 0o777)) reloadPrometheusConfig(t, reloadURL) ensureGOGCValue(99.0) }) @@ -834,7 +830,7 @@ scrape_configs: static_configs: - targets: ['localhost:%d'] `, port, port) - os.WriteFile(configFile, []byte(config), 0o777) + require.NoError(t, os.WriteFile(configFile, []byte(config), 0o777)) prom := prometheusCommandWithLogging( t, @@ -995,7 +991,7 @@ func TestRemoteWrite_ReshardingWithoutDeadlock(t *testing.T) { config := fmt.Sprintf(` global: # Using a smaller interval may cause the scrape to time out. - scrape_interval: 1s + scrape_interval: 1s scrape_configs: - job_name: 'self' static_configs: diff --git a/cmd/prometheus/query_log_test.go b/cmd/prometheus/query_log_test.go index 5e5a9ac3b7..e410f836a9 100644 --- a/cmd/prometheus/query_log_test.go +++ b/cmd/prometheus/query_log_test.go @@ -334,7 +334,8 @@ func (p *queryLogTest) run(t *testing.T) { p.query(t) - ql := readQueryLog(t, queryLogFile.Name()) + // Wait for query log entry to be written (avoid race with file I/O). + ql := waitForQueryLog(t, queryLogFile.Name(), 1) qc := len(ql) if p.exactQueryCount() { require.Equal(t, 1, qc) @@ -361,7 +362,8 @@ func (p *queryLogTest) run(t *testing.T) { p.query(t) qc++ - ql = readQueryLog(t, queryLogFile.Name()) + // Wait for query log entry to be written (avoid race with file I/O). 
+ ql = waitForQueryLog(t, queryLogFile.Name(), qc) if p.exactQueryCount() { require.Len(t, ql, qc) } else { @@ -392,7 +394,8 @@ func (p *queryLogTest) run(t *testing.T) { qc++ - ql = readQueryLog(t, newFile.Name()) + // Wait for query log entry to be written (avoid race with file I/O). + ql = waitForQueryLog(t, newFile.Name(), qc) if p.exactQueryCount() { require.Len(t, ql, qc) } else { @@ -404,7 +407,8 @@ func (p *queryLogTest) run(t *testing.T) { p.query(t) - ql = readQueryLog(t, queryLogFile.Name()) + // Wait for query log entry to be written (avoid race with file I/O). + ql = waitForQueryLog(t, queryLogFile.Name(), 1) qc = len(ql) if p.exactQueryCount() { require.Equal(t, 1, qc) @@ -446,6 +450,18 @@ func readQueryLog(t *testing.T, path string) []queryLogLine { return ql } +// waitForQueryLog waits for the query log to contain at least minEntries entries, +// polling at regular intervals until the timeout is reached. +func waitForQueryLog(t *testing.T, path string, minEntries int) []queryLogLine { + t.Helper() + var ql []queryLogLine + require.Eventually(t, func() bool { + ql = readQueryLog(t, path) + return len(ql) >= minEntries + }, 5*time.Second, 100*time.Millisecond, "timed out waiting for query log to have at least %d entries, got %d", minEntries, len(ql)) + return ql +} + func TestQueryLog(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") diff --git a/cmd/prometheus/testdata/features.json b/cmd/prometheus/testdata/features.json index 145bb04d77..c39f60ab33 100644 --- a/cmd/prometheus/testdata/features.json +++ b/cmd/prometheus/testdata/features.json @@ -4,6 +4,8 @@ "exclude_alerts": true, "label_values_match": true, "lifecycle": false, + "openapi_3.1": true, + "openapi_3.2": true, "otlp_write_receiver": false, "query_stats": true, "query_warnings": true, @@ -28,6 +30,9 @@ "by": true, "delayed_name_removal": false, "duration_expr": false, + "fill": false, + "fill_left": false, + "fill_right": false, "group_left": true, "group_right": true, "ignoring": true, @@ -191,6 +196,7 @@ "lightsail": true, "linode": true, "marathon": true, + "msk": true, "nerve": true, "nomad": true, "openstack": true, diff --git a/cmd/promtool/main.go b/cmd/promtool/main.go index 16cc40233a..183b918ba0 100644 --- a/cmd/promtool/main.go +++ b/cmd/promtool/main.go @@ -61,7 +61,10 @@ import ( "github.com/prometheus/prometheus/util/documentcli" ) -var promqlEnableDelayedNameRemoval = false +var ( + promqlEnableDelayedNameRemoval = false + promtoolParserOpts parser.Options +) func init() { // This can be removed when the legacy global mode is fully deprecated. @@ -314,7 +317,7 @@ func main() { promQLLabelsDeleteQuery := promQLLabelsDeleteCmd.Arg("query", "PromQL query.").Required().String() promQLLabelsDeleteName := promQLLabelsDeleteCmd.Arg("name", "Name of the label to delete.").Required().String() - featureList := app.Flag("enable-feature", "Comma separated feature names to enable. Valid options: promql-experimental-functions, promql-delayed-name-removal. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details").Default("").Strings() + featureList := app.Flag("enable-feature", "Comma separated feature names to enable. Valid options: promql-experimental-functions, promql-delayed-name-removal, promql-duration-expr, promql-extended-range-selectors. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details").Default("").Strings() documentationCmd := app.Command("write-documentation", "Generate command line documentation. 
Internal use.").Hidden() @@ -348,9 +351,13 @@ func main() { for o := range strings.SplitSeq(f, ",") { switch o { case "promql-experimental-functions": - parser.EnableExperimentalFunctions = true + promtoolParserOpts.EnableExperimentalFunctions = true case "promql-delayed-name-removal": promqlEnableDelayedNameRemoval = true + case "promql-duration-expr": + promtoolParserOpts.ExperimentalDurationExpr = true + case "promql-extended-range-selectors": + promtoolParserOpts.EnableExtendedRangeSelectors = true case "": continue default: @@ -358,13 +365,14 @@ func main() { } } } + promtoolParser := parser.NewParser(promtoolParserOpts) switch parsedCmd { case sdCheckCmd.FullCommand(): os.Exit(CheckSD(*sdConfigFile, *sdJobName, *sdTimeout, prometheus.DefaultRegisterer)) case checkConfigCmd.FullCommand(): - os.Exit(CheckConfig(*agentMode, *checkConfigSyntaxOnly, newConfigLintConfig(*checkConfigLint, *checkConfigLintFatal, *checkConfigIgnoreUnknownFields, model.UTF8Validation, model.Duration(*checkLookbackDelta)), *configFiles...)) + os.Exit(CheckConfig(*agentMode, *checkConfigSyntaxOnly, newConfigLintConfig(*checkConfigLint, *checkConfigLintFatal, *checkConfigIgnoreUnknownFields, model.UTF8Validation, model.Duration(*checkLookbackDelta)), promtoolParser, *configFiles...)) case checkServerHealthCmd.FullCommand(): os.Exit(checkErr(CheckServerStatus(serverURL, checkHealth, httpRoundTripper))) @@ -376,7 +384,7 @@ func main() { os.Exit(CheckWebConfig(*webConfigFiles...)) case checkRulesCmd.FullCommand(): - os.Exit(CheckRules(newRulesLintConfig(*checkRulesLint, *checkRulesLintFatal, *checkRulesIgnoreUnknownFields, model.UTF8Validation), *ruleFiles...)) + os.Exit(CheckRules(newRulesLintConfig(*checkRulesLint, *checkRulesLintFatal, *checkRulesIgnoreUnknownFields, model.UTF8Validation), promtoolParser, *ruleFiles...)) case checkMetricsCmd.FullCommand(): os.Exit(CheckMetrics(*checkMetricsExtended, *checkMetricsLint)) @@ -416,6 +424,7 @@ func main() { EnableNegativeOffset: true, EnableDelayedNameRemoval: promqlEnableDelayedNameRemoval, }, + promtoolParser, *testRulesRun, *testRulesDiff, *testRulesDebug, @@ -427,7 +436,7 @@ func main() { os.Exit(checkErr(benchmarkWrite(*benchWriteOutPath, *benchSamplesFile, *benchWriteNumMetrics, *benchWriteNumScrapes))) case tsdbAnalyzeCmd.FullCommand(): - os.Exit(checkErr(analyzeBlock(ctx, *analyzePath, *analyzeBlockID, *analyzeLimit, *analyzeRunExtended, *analyzeMatchers))) + os.Exit(checkErr(analyzeBlock(ctx, *analyzePath, *analyzeBlockID, *analyzeLimit, *analyzeRunExtended, *analyzeMatchers, promtoolParser))) case tsdbListCmd.FullCommand(): os.Exit(checkErr(listBlocks(*listPath, *listHumanReadable))) @@ -437,10 +446,10 @@ func main() { if *dumpFormat == "seriesjson" { format = formatSeriesSetLabelsToJSON } - os.Exit(checkErr(dumpTSDBData(ctx, *dumpPath, *dumpSandboxDirRoot, *dumpMinTime, *dumpMaxTime, *dumpMatch, format))) + os.Exit(checkErr(dumpTSDBData(ctx, *dumpPath, *dumpSandboxDirRoot, *dumpMinTime, *dumpMaxTime, *dumpMatch, format, promtoolParser))) case tsdbDumpOpenMetricsCmd.FullCommand(): - os.Exit(checkErr(dumpTSDBData(ctx, *dumpOpenMetricsPath, *dumpOpenMetricsSandboxDirRoot, *dumpOpenMetricsMinTime, *dumpOpenMetricsMaxTime, *dumpOpenMetricsMatch, formatSeriesSetOpenMetrics))) + os.Exit(checkErr(dumpTSDBData(ctx, *dumpOpenMetricsPath, *dumpOpenMetricsSandboxDirRoot, *dumpOpenMetricsMinTime, *dumpOpenMetricsMaxTime, *dumpOpenMetricsMatch, formatSeriesSetOpenMetrics, promtoolParser))) // TODO(aSquare14): Work on adding support for custom block size. 
case openMetricsImportCmd.FullCommand(): os.Exit(backfillOpenMetrics(*importFilePath, *importDBPath, *importHumanReadable, *importQuiet, *maxBlockDuration, *openMetricsLabels)) @@ -456,15 +465,15 @@ func main() { case promQLFormatCmd.FullCommand(): checkExperimental(*experimental) - os.Exit(checkErr(formatPromQL(*promQLFormatQuery))) + os.Exit(checkErr(formatPromQL(*promQLFormatQuery, promtoolParser))) case promQLLabelsSetCmd.FullCommand(): checkExperimental(*experimental) - os.Exit(checkErr(labelsSetPromQL(*promQLLabelsSetQuery, *promQLLabelsSetType, *promQLLabelsSetName, *promQLLabelsSetValue))) + os.Exit(checkErr(labelsSetPromQL(*promQLLabelsSetQuery, *promQLLabelsSetType, *promQLLabelsSetName, *promQLLabelsSetValue, promtoolParser))) case promQLLabelsDeleteCmd.FullCommand(): checkExperimental(*experimental) - os.Exit(checkErr(labelsDeletePromQL(*promQLLabelsDeleteQuery, *promQLLabelsDeleteName))) + os.Exit(checkErr(labelsDeletePromQL(*promQLLabelsDeleteQuery, *promQLLabelsDeleteName, promtoolParser))) } } @@ -589,7 +598,7 @@ func CheckServerStatus(serverURL *url.URL, checkEndpoint string, roundTripper ht } // CheckConfig validates configuration files. -func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings configLintConfig, files ...string) int { +func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings configLintConfig, p parser.Parser, files ...string) int { failed := false hasErrors := false @@ -610,7 +619,7 @@ func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings configLintConfig, if !checkSyntaxOnly { scrapeConfigsFailed := lintScrapeConfigs(scrapeConfigs, lintSettings) failed = failed || scrapeConfigsFailed - rulesFailed, rulesHaveErrors := checkRules(ruleFiles, lintSettings.rulesLintConfig) + rulesFailed, rulesHaveErrors := checkRules(ruleFiles, lintSettings.rulesLintConfig, p) failed = failed || rulesFailed hasErrors = hasErrors || rulesHaveErrors } @@ -837,13 +846,13 @@ func checkSDFile(filename string) ([]*targetgroup.Group, error) { } // CheckRules validates rule files. -func CheckRules(ls rulesLintConfig, files ...string) int { +func CheckRules(ls rulesLintConfig, p parser.Parser, files ...string) int { failed := false hasErrors := false if len(files) == 0 { - failed, hasErrors = checkRulesFromStdin(ls) + failed, hasErrors = checkRulesFromStdin(ls, p) } else { - failed, hasErrors = checkRules(files, ls) + failed, hasErrors = checkRules(files, ls, p) } if failed && hasErrors { @@ -857,7 +866,7 @@ func CheckRules(ls rulesLintConfig, files ...string) int { } // checkRulesFromStdin validates rule from stdin. -func checkRulesFromStdin(ls rulesLintConfig) (bool, bool) { +func checkRulesFromStdin(ls rulesLintConfig, p parser.Parser) (bool, bool) { failed := false hasErrors := false fmt.Println("Checking standard input") @@ -866,7 +875,7 @@ func checkRulesFromStdin(ls rulesLintConfig) (bool, bool) { fmt.Fprintln(os.Stderr, " FAILED:", err) return true, true } - rgs, errs := rulefmt.Parse(data, ls.ignoreUnknownFields, ls.nameValidationScheme) + rgs, errs := rulefmt.Parse(data, ls.ignoreUnknownFields, ls.nameValidationScheme, p) if errs != nil { failed = true fmt.Fprintln(os.Stderr, " FAILED:") @@ -895,12 +904,12 @@ func checkRulesFromStdin(ls rulesLintConfig) (bool, bool) { } // checkRules validates rule files. 
-func checkRules(files []string, ls rulesLintConfig) (bool, bool) { +func checkRules(files []string, ls rulesLintConfig, p parser.Parser) (bool, bool) { failed := false hasErrors := false for _, f := range files { fmt.Println("Checking", f) - rgs, errs := rulefmt.ParseFile(f, ls.ignoreUnknownFields, ls.nameValidationScheme) + rgs, errs := rulefmt.ParseFile(f, ls.ignoreUnknownFields, ls.nameValidationScheme, p) if errs != nil { failed = true fmt.Fprintln(os.Stderr, " FAILED:") @@ -1341,8 +1350,8 @@ func checkTargetGroupsForScrapeConfig(targetGroups []*targetgroup.Group, scfg *c return nil } -func formatPromQL(query string) error { - expr, err := parser.ParseExpr(query) +func formatPromQL(query string, p parser.Parser) error { + expr, err := p.ParseExpr(query) if err != nil { return err } @@ -1351,8 +1360,8 @@ func formatPromQL(query string) error { return nil } -func labelsSetPromQL(query, labelMatchType, name, value string) error { - expr, err := parser.ParseExpr(query) +func labelsSetPromQL(query, labelMatchType, name, value string, p parser.Parser) error { + expr, err := p.ParseExpr(query) if err != nil { return err } @@ -1396,8 +1405,8 @@ func labelsSetPromQL(query, labelMatchType, name, value string) error { return nil } -func labelsDeletePromQL(query, name string) error { - expr, err := parser.ParseExpr(query) +func labelsDeletePromQL(query, name string, p parser.Parser) error { + expr, err := p.ParseExpr(query) if err != nil { return err } diff --git a/cmd/promtool/main_test.go b/cmd/promtool/main_test.go index 4f4ca3de71..297dd35d70 100644 --- a/cmd/promtool/main_test.go +++ b/cmd/promtool/main_test.go @@ -37,6 +37,7 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/rulefmt" + "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/promql/promqltest" ) @@ -187,7 +188,7 @@ func TestCheckDuplicates(t *testing.T) { c := test t.Run(c.name, func(t *testing.T) { t.Parallel() - rgs, err := rulefmt.ParseFile(c.ruleFile, false, model.UTF8Validation) + rgs, err := rulefmt.ParseFile(c.ruleFile, false, model.UTF8Validation, parser.NewParser(parser.Options{})) require.Empty(t, err) dups := checkDuplicates(rgs.Groups) require.Equal(t, c.expectedDups, dups) @@ -196,7 +197,7 @@ func TestCheckDuplicates(t *testing.T) { } func BenchmarkCheckDuplicates(b *testing.B) { - rgs, err := rulefmt.ParseFile("./testdata/rules_large.yml", false, model.UTF8Validation) + rgs, err := rulefmt.ParseFile("./testdata/rules_large.yml", false, model.UTF8Validation, parser.NewParser(parser.Options{})) require.Empty(b, err) for b.Loop() { @@ -602,7 +603,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation), parser.NewParser(parser.Options{})) require.Equal(t, successExitCode, exitCode) }) @@ -624,7 +625,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation), parser.NewParser(parser.Options{})) require.Equal(t, failureExitCode, exitCode) }) @@ -646,7 +647,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v 
}(os.Stdin) os.Stdin = r - exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true, false, model.UTF8Validation)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true, false, model.UTF8Validation), parser.NewParser(parser.Options{})) require.Equal(t, lintErrExitCode, exitCode) }) } @@ -655,7 +656,7 @@ func TestCheckRulesWithFeatureFlag(t *testing.T) { // As opposed to TestCheckRules calling CheckRules directly we run promtool // so the feature flag parsing can be tested. - args := []string{"-test.main", "--enable-feature=promql-experimental-functions", "check", "rules", "testdata/features.yml"} + args := []string{"-test.main", "--enable-feature=promql-experimental-functions", "--enable-feature=promql-duration-expr", "--enable-feature=promql-extended-range-selectors", "check", "rules", "testdata/features.yml"} tool := exec.Command(promtoolPath, args...) err := tool.Run() require.NoError(t, err) @@ -664,19 +665,19 @@ func TestCheckRulesWithFeatureFlag(t *testing.T) { func TestCheckRulesWithRuleFiles(t *testing.T) { t.Run("rules-good", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation), "./testdata/rules.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation), parser.NewParser(parser.Options{}), "./testdata/rules.yml") require.Equal(t, successExitCode, exitCode) }) t.Run("rules-bad", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation), "./testdata/rules-bad.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false, model.UTF8Validation), parser.NewParser(parser.Options{}), "./testdata/rules-bad.yml") require.Equal(t, failureExitCode, exitCode) }) t.Run("rules-lint-fatal", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true, false, model.UTF8Validation), "./testdata/prometheus-rules.lint.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true, false, model.UTF8Validation), parser.NewParser(parser.Options{}), "./testdata/prometheus-rules.lint.yml") require.Equal(t, lintErrExitCode, exitCode) }) } @@ -705,20 +706,21 @@ func TestCheckScrapeConfigs(t *testing.T) { } { t.Run(tc.name, func(t *testing.T) { // Non-fatal linting. - code := CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, false, false, model.UTF8Validation, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + p := parser.NewParser(parser.Options{}) + code := CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, false, false, model.UTF8Validation, tc.lookbackDelta), p, "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") require.Equal(t, successExitCode, code, "Non-fatal linting should return success") // Fatal linting. 
- code = CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, false, model.UTF8Validation, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + code = CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, false, model.UTF8Validation, tc.lookbackDelta), p, "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") if tc.expectError { require.Equal(t, lintErrExitCode, code, "Fatal linting should return error") } else { require.Equal(t, successExitCode, code, "Fatal linting should return success when there are no problems") } // Check syntax only, no linting. - code = CheckConfig(false, true, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, false, model.UTF8Validation, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + code = CheckConfig(false, true, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, false, model.UTF8Validation, tc.lookbackDelta), p, "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") require.Equal(t, successExitCode, code, "Fatal linting should return success when checking syntax only") // Lint option "none" should disable linting. - code = CheckConfig(false, false, newConfigLintConfig(lintOptionNone+","+lintOptionTooLongScrapeInterval, true, false, model.UTF8Validation, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + code = CheckConfig(false, false, newConfigLintConfig(lintOptionNone+","+lintOptionTooLongScrapeInterval, true, false, model.UTF8Validation, tc.lookbackDelta), p, "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") require.Equal(t, successExitCode, code, `Fatal linting should return success when lint option "none" is specified`) }) } @@ -734,7 +736,6 @@ func TestTSDBDumpCommand(t *testing.T) { load 1m metric{foo="bar"} 1 2 3 `) - t.Cleanup(func() { storage.Close() }) for _, c := range []struct { name string diff --git a/cmd/promtool/testdata/features.yml b/cmd/promtool/testdata/features.yml index 769f8362bf..946e07d0d7 100644 --- a/cmd/promtool/testdata/features.yml +++ b/cmd/promtool/testdata/features.yml @@ -1,6 +1,10 @@ groups: - name: features rules: - - record: x - # We don't expect anything from this, just want to check the function parses. + # We don't expect anything from these, just want to check the syntax parses. 
+ - record: promql-experimental-functions expr: sort_by_label(up, "instance") + - record: promql-duration-expr + expr: rate(up[1m * 2]) + - record: promql-extended-range-selectors + expr: rate(up[1m] anchored) diff --git a/cmd/promtool/tsdb.go b/cmd/promtool/tsdb.go index d0016ec0aa..1aaf87bc42 100644 --- a/cmd/promtool/tsdb.go +++ b/cmd/promtool/tsdb.go @@ -408,13 +408,13 @@ func openBlock(path, blockID string) (*tsdb.DBReadOnly, tsdb.BlockReader, error) return db, b, nil } -func analyzeBlock(ctx context.Context, path, blockID string, limit int, runExtended bool, matchers string) error { +func analyzeBlock(ctx context.Context, path, blockID string, limit int, runExtended bool, matchers string, p parser.Parser) error { var ( selectors []*labels.Matcher err error ) if len(matchers) > 0 { - selectors, err = parser.ParseMetricSelector(matchers) + selectors, err = p.ParseMetricSelector(matchers) if err != nil { return err } @@ -478,24 +478,24 @@ func analyzeBlock(ctx context.Context, path, blockID string, limit int, runExten labelpairsCount := map[string]uint64{} entries := 0 var ( - p index.Postings - refs []storage.SeriesRef + postings index.Postings + refs []storage.SeriesRef ) if len(matchers) > 0 { - p, err = tsdb.PostingsForMatchers(ctx, ir, selectors...) + postings, err = tsdb.PostingsForMatchers(ctx, ir, selectors...) if err != nil { return err } // Expand refs first and cache in memory. // So later we don't have to expand again. - refs, err = index.ExpandPostings(p) + refs, err = index.ExpandPostings(postings) if err != nil { return err } fmt.Printf("Matched series: %d\n", len(refs)) - p = index.NewListPostings(refs) + postings = index.NewListPostings(refs) } else { - p, err = ir.Postings(ctx, "", "") // The special all key. + postings, err = ir.Postings(ctx, "", "") // The special all key. if err != nil { return err } @@ -503,8 +503,8 @@ func analyzeBlock(ctx context.Context, path, blockID string, limit int, runExten chks := []chunks.Meta{} builder := labels.ScratchBuilder{} - for p.Next() { - if err = ir.Series(p.At(), &builder, &chks); err != nil { + for postings.Next() { + if err = ir.Series(postings.At(), &builder, &chks); err != nil { return err } // Amount of the block time range not covered by this series. @@ -517,8 +517,8 @@ func analyzeBlock(ctx context.Context, path, blockID string, limit int, runExten entries++ }) } - if p.Err() != nil { - return p.Err() + if postings.Err() != nil { + return postings.Err() } fmt.Printf("Postings (unique label pairs): %d\n", len(labelpairsUncovered)) fmt.Printf("Postings entries (total label pairs): %d\n", entries) @@ -706,7 +706,7 @@ func analyzeCompaction(ctx context.Context, block tsdb.BlockReader, indexr tsdb. 
type SeriesSetFormatter func(series storage.SeriesSet) error -func dumpTSDBData(ctx context.Context, dbDir, sandboxDirRoot string, mint, maxt int64, match []string, formatter SeriesSetFormatter) (err error) { +func dumpTSDBData(ctx context.Context, dbDir, sandboxDirRoot string, mint, maxt int64, match []string, formatter SeriesSetFormatter, p parser.Parser) (err error) { db, err := tsdb.OpenDBReadOnly(dbDir, sandboxDirRoot, nil) if err != nil { return err @@ -720,7 +720,7 @@ func dumpTSDBData(ctx context.Context, dbDir, sandboxDirRoot string, mint, maxt } defer q.Close() - matcherSets, err := parser.ParseMetricSelectors(match) + matcherSets, err := p.ParseMetricSelectors(match) if err != nil { return err } diff --git a/cmd/promtool/tsdb_test.go b/cmd/promtool/tsdb_test.go index 3a2a5aff72..86d7c67d77 100644 --- a/cmd/promtool/tsdb_test.go +++ b/cmd/promtool/tsdb_test.go @@ -27,6 +27,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/promql/promqltest" "github.com/prometheus/prometheus/tsdb" ) @@ -71,6 +72,7 @@ func getDumpedSamples(t *testing.T, databasePath, sandboxDirRoot string, mint, m maxt, match, formatter, + parser.NewParser(parser.Options{}), ) require.NoError(t, err) @@ -97,7 +99,6 @@ func TestTSDBDump(t *testing.T) { heavy_metric{foo="bar"} 5 4 3 2 1 heavy_metric{foo="foo"} 5 4 3 2 1 `) - t.Cleanup(func() { storage.Close() }) tests := []struct { name string @@ -196,7 +197,6 @@ func TestTSDBDumpOpenMetrics(t *testing.T) { my_counter{foo="bar", baz="abc"} 1 2 3 4 5 my_gauge{bar="foo", abc="baz"} 9 8 0 4 7 `) - t.Cleanup(func() { storage.Close() }) tests := []struct { name string diff --git a/cmd/promtool/unittest.go b/cmd/promtool/unittest.go index 105e626eba..c9278d8a46 100644 --- a/cmd/promtool/unittest.go +++ b/cmd/promtool/unittest.go @@ -47,11 +47,11 @@ import ( // RulesUnitTest does unit testing of rules based on the unit testing files provided. // More info about the file format can be found in the docs. -func RulesUnitTest(queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag, debug, ignoreUnknownFields bool, files ...string) int { - return RulesUnitTestResult(io.Discard, queryOpts, runStrings, diffFlag, debug, ignoreUnknownFields, files...) +func RulesUnitTest(queryOpts promqltest.LazyLoaderOpts, p parser.Parser, runStrings []string, diffFlag, debug, ignoreUnknownFields bool, files ...string) int { + return RulesUnitTestResult(io.Discard, queryOpts, p, runStrings, diffFlag, debug, ignoreUnknownFields, files...) 
} -func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag, debug, ignoreUnknownFields bool, files ...string) int { +func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, p parser.Parser, runStrings []string, diffFlag, debug, ignoreUnknownFields bool, files ...string) int { failed := false junit := &junitxml.JUnitXML{} @@ -61,7 +61,7 @@ func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, } for _, f := range files { - if errs := ruleUnitTest(f, queryOpts, run, diffFlag, debug, ignoreUnknownFields, junit.Suite(f)); errs != nil { + if errs := ruleUnitTest(f, queryOpts, p, run, diffFlag, debug, ignoreUnknownFields, junit.Suite(f)); errs != nil { fmt.Fprintln(os.Stderr, " FAILED:") for _, e := range errs { fmt.Fprintln(os.Stderr, e.Error()) @@ -83,7 +83,7 @@ func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, return successExitCode } -func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *regexp.Regexp, diffFlag, debug, ignoreUnknownFields bool, ts *junitxml.TestSuite) []error { +func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, p parser.Parser, run *regexp.Regexp, diffFlag, debug, ignoreUnknownFields bool, ts *junitxml.TestSuite) []error { b, err := os.ReadFile(filename) if err != nil { ts.Abort(err) @@ -132,6 +132,7 @@ func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *reg if t.Interval == 0 { t.Interval = unitTestInp.EvaluationInterval } + t.parser = p ers := t.test(testname, evalInterval, groupOrderMap, queryOpts, diffFlag, debug, ignoreUnknownFields, unitTestInp.FuzzyCompare, unitTestInp.RuleFiles...) if ers != nil { for _, e := range ers { @@ -219,6 +220,8 @@ type testGroup struct { ExternalURL string `yaml:"external_url,omitempty"` TestGroupName string `yaml:"name,omitempty"` StartTimestamp testStartTimestamp `yaml:"start_timestamp,omitempty"` + + parser parser.Parser `yaml:"-"` } // test performs the unit tests. @@ -252,6 +255,7 @@ func (tg *testGroup) test(testname string, evalInterval time.Duration, groupOrde Context: context.Background(), NotifyFunc: func(context.Context, string, ...*rules.Alert) {}, Logger: promslog.NewNopLogger(), + Parser: tg.parser, } m := rules.NewManager(opts) groupsMap, ers := m.LoadGroups(time.Duration(tg.Interval), tg.ExternalLabels, tg.ExternalURL, nil, ignoreUnknownFields, ruleFiles...) 
@@ -482,10 +486,10 @@ Outer: var expSamples []parsedSample for _, s := range testCase.ExpSamples { - lb, err := parser.ParseMetric(s.Labels) + lb, err := tg.parser.ParseMetric(s.Labels) var hist *histogram.FloatHistogram if err == nil && s.Histogram != "" { - _, values, parseErr := parser.ParseSeriesDesc("{} " + s.Histogram) + _, values, parseErr := tg.parser.ParseSeriesDesc("{} " + s.Histogram) switch { case parseErr != nil: err = parseErr diff --git a/cmd/promtool/unittest_test.go b/cmd/promtool/unittest_test.go index 32886fc4df..ce317e5e41 100644 --- a/cmd/promtool/unittest_test.go +++ b/cmd/promtool/unittest_test.go @@ -21,6 +21,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/promql/promqltest" "github.com/prometheus/prometheus/util/junitxml" ) @@ -153,7 +154,7 @@ func TestRulesUnitTest(t *testing.T) { } t.Run(tt.name, func(t *testing.T) { t.Parallel() - if got := RulesUnitTest(tt.queryOpts, nil, false, false, false, tt.args.files...); got != tt.want { + if got := RulesUnitTest(tt.queryOpts, parser.NewParser(parser.Options{}), nil, false, false, false, tt.args.files...); got != tt.want { t.Errorf("RulesUnitTest() = %v, want %v", got, tt.want) } }) @@ -161,7 +162,7 @@ func TestRulesUnitTest(t *testing.T) { t.Run("Junit xml output ", func(t *testing.T) { t.Parallel() var buf bytes.Buffer - if got := RulesUnitTestResult(&buf, promqltest.LazyLoaderOpts{}, nil, false, false, false, reuseFiles...); got != 1 { + if got := RulesUnitTestResult(&buf, promqltest.LazyLoaderOpts{}, parser.NewParser(parser.Options{}), nil, false, false, false, reuseFiles...); got != 1 { t.Errorf("RulesUnitTestResults() = %v, want 1", got) } var test junitxml.JUnitXML @@ -277,7 +278,7 @@ func TestRulesUnitTestRun(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { t.Parallel() - got := RulesUnitTest(tt.queryOpts, tt.args.run, false, false, tt.ignoreUnknownFields, tt.args.files...) + got := RulesUnitTest(tt.queryOpts, parser.NewParser(parser.Options{}), tt.args.run, false, false, tt.ignoreUnknownFields, tt.args.files...) require.Equal(t, tt.want, got) }) } diff --git a/config/config.go b/config/config.go index 0b9b059ab2..d721d7fb86 100644 --- a/config/config.go +++ b/config/config.go @@ -1107,6 +1107,10 @@ type TSDBConfig struct { // This should not be used directly and must be converted into OutOfOrderTimeWindow. OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"` + // StaleSeriesCompactionThreshold is a number between 0.0-1.0 indicating the % of stale series in + // the in-memory Head block. If the % of stale series crosses this threshold, stale series compaction is run immediately. 
+ StaleSeriesCompactionThreshold float64 `yaml:"stale_series_compaction_threshold,omitempty"` + Retention *TSDBRetentionConfig `yaml:"retention,omitempty"` } diff --git a/config/config_test.go b/config/config_test.go index 08aa0b4f06..968b563e1e 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1733,8 +1733,9 @@ var expectedConf = &Config{ }, StorageConfig: StorageConfig{ TSDBConfig: &TSDBConfig{ - OutOfOrderTimeWindow: 30 * time.Minute.Milliseconds(), - OutOfOrderTimeWindowFlag: model.Duration(30 * time.Minute), + OutOfOrderTimeWindow: 30 * time.Minute.Milliseconds(), + OutOfOrderTimeWindowFlag: model.Duration(30 * time.Minute), + StaleSeriesCompactionThreshold: 0.5, Retention: &TSDBRetentionConfig{ Time: model.Duration(24 * time.Hour), Size: 1 * units.GiB, diff --git a/config/testdata/conf.good.yml b/config/testdata/conf.good.yml index 7aa53b3b74..96bf9e2b33 100644 --- a/config/testdata/conf.good.yml +++ b/config/testdata/conf.good.yml @@ -453,6 +453,7 @@ alerting: storage: tsdb: out_of_order_time_window: 30m + stale_series_compaction_threshold: 0.5 retention: time: 1d size: 1GB diff --git a/discovery/aws/aws.go b/discovery/aws/aws.go index 1ac97b3c9e..69b3b41c06 100644 --- a/discovery/aws/aws.go +++ b/discovery/aws/aws.go @@ -14,10 +14,13 @@ package aws import ( + "context" "errors" "fmt" "time" + awsConfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/config" "github.com/prometheus/common/model" @@ -43,6 +46,7 @@ const ( RoleEC2 Role = "ec2" RoleECS Role = "ecs" RoleLightsail Role = "lightsail" + RoleMSK Role = "msk" ) // UnmarshalYAML implements the yaml.Unmarshaler interface. @@ -51,7 +55,7 @@ func (c *Role) UnmarshalYAML(unmarshal func(any) error) error { return err } switch *c { - case RoleEC2, RoleECS, RoleLightsail: + case RoleEC2, RoleECS, RoleLightsail, RoleMSK: return nil default: return fmt.Errorf("unknown AWS SD role %q", *c) @@ -78,13 +82,14 @@ type SDConfig struct { // ec2 specific Filters []*EC2Filter `yaml:"filters,omitempty"` - // ecs specific + // ecs, msk specific Clusters []string `yaml:"clusters,omitempty"` // Embedded sub-configs (internal use only, not serialized) *EC2SDConfig `yaml:"-"` *ECSSDConfig `yaml:"-"` *LightsailSDConfig `yaml:"-"` + *MSKSDConfig `yaml:"-"` } // UnmarshalYAML implements the yaml.Unmarshaler interface for SDConfig. 
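For orientation, a minimal sketch of how the new msk role flows through SDConfig unmarshalling; the helper name exampleMSKConfig and the literal values are illustrative only, mirroring the YAML used by the tests further down (which import go.yaml.in/yaml/v3):

package aws

import "go.yaml.in/yaml/v3"

// exampleMSKConfig is a sketch only: it unmarshals a single msk SD config the
// same way the tests below do and returns the resulting config.
func exampleMSKConfig() (*SDConfig, error) {
	var cfgs []SDConfig
	data := []byte(`
- role: msk
  region: ap-south-1
  port: 6060
  clusters: ["cluster-1"]
`)
	if err := yaml.Unmarshal(data, &cfgs); err != nil {
		return nil, err
	}
	// UnmarshalYAML copies DefaultMSKSDConfig into a fresh MSKSDConfig and then
	// applies the fields above, so separate scrape jobs never share the same pointer.
	return &cfgs[0], nil
}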
@@ -98,15 +103,20 @@ func (c *SDConfig) UnmarshalYAML(unmarshal func(any) error) error { } *c = SDConfig(aux) + var err error + c.Region, err = loadRegion(context.Background(), c.Region) + if err != nil { + return fmt.Errorf("could not determine AWS region: %w", err) + } + switch c.Role { case RoleEC2: if c.EC2SDConfig == nil { - c.EC2SDConfig = &DefaultEC2SDConfig + ec2Config := DefaultEC2SDConfig + c.EC2SDConfig = &ec2Config } c.EC2SDConfig.HTTPClientConfig = c.HTTPClientConfig - if c.Region != "" { - c.EC2SDConfig.Region = c.Region - } + c.EC2SDConfig.Region = c.Region if c.Endpoint != "" { c.EC2SDConfig.Endpoint = c.Endpoint } @@ -133,12 +143,11 @@ func (c *SDConfig) UnmarshalYAML(unmarshal func(any) error) error { } case RoleECS: if c.ECSSDConfig == nil { - c.ECSSDConfig = &DefaultECSSDConfig + ecsConfig := DefaultECSSDConfig + c.ECSSDConfig = &ecsConfig } c.ECSSDConfig.HTTPClientConfig = c.HTTPClientConfig - if c.Region != "" { - c.ECSSDConfig.Region = c.Region - } + c.ECSSDConfig.Region = c.Region if c.Endpoint != "" { c.ECSSDConfig.Endpoint = c.Endpoint } @@ -165,12 +174,11 @@ func (c *SDConfig) UnmarshalYAML(unmarshal func(any) error) error { } case RoleLightsail: if c.LightsailSDConfig == nil { - c.LightsailSDConfig = &DefaultLightsailSDConfig + lightsailConfig := DefaultLightsailSDConfig + c.LightsailSDConfig = &lightsailConfig } c.LightsailSDConfig.HTTPClientConfig = c.HTTPClientConfig - if c.Region != "" { - c.LightsailSDConfig.Region = c.Region - } + c.LightsailSDConfig.Region = c.Region if c.Endpoint != "" { c.LightsailSDConfig.Endpoint = c.Endpoint } @@ -192,6 +200,37 @@ func (c *SDConfig) UnmarshalYAML(unmarshal func(any) error) error { if c.RefreshInterval != 0 { c.LightsailSDConfig.RefreshInterval = c.RefreshInterval } + case RoleMSK: + if c.MSKSDConfig == nil { + mskConfig := DefaultMSKSDConfig + c.MSKSDConfig = &mskConfig + } + c.MSKSDConfig.HTTPClientConfig = c.HTTPClientConfig + c.MSKSDConfig.Region = c.Region + if c.Endpoint != "" { + c.MSKSDConfig.Endpoint = c.Endpoint + } + if c.AccessKey != "" { + c.MSKSDConfig.AccessKey = c.AccessKey + } + if c.SecretKey != "" { + c.MSKSDConfig.SecretKey = c.SecretKey + } + if c.Profile != "" { + c.MSKSDConfig.Profile = c.Profile + } + if c.RoleARN != "" { + c.MSKSDConfig.RoleARN = c.RoleARN + } + if c.Port != 0 { + c.MSKSDConfig.Port = c.Port + } + if c.RefreshInterval != 0 { + c.MSKSDConfig.RefreshInterval = c.RefreshInterval + } + if c.Clusters != nil { + c.MSKSDConfig.Clusters = c.Clusters + } default: return fmt.Errorf("unknown AWS SD role %q", c.Role) } @@ -223,7 +262,39 @@ func (c *SDConfig) NewDiscoverer(opts discovery.DiscovererOptions) (discovery.Di case RoleLightsail: opts.Metrics = &lightsailMetrics{refreshMetrics: awsMetrics.refreshMetrics} return NewLightsailDiscovery(c.LightsailSDConfig, opts) + case RoleMSK: + opts.Metrics = &mskMetrics{refreshMetrics: awsMetrics.refreshMetrics} + return NewMSKDiscovery(c.MSKSDConfig, opts) default: return nil, fmt.Errorf("unknown AWS SD role %q", c.Role) } } + +// loadRegion finds the region in order: AWS config/env vars ->IMDS. 
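+// A region passed in explicitly (for example from the SD configuration) always
+// wins; the SDK's shared config and environment are consulted next, and IMDS is
+// only queried as a last resort, which may fail outside of AWS.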
+func loadRegion(ctx context.Context, specifiedRegion string) (string, error) { + if specifiedRegion != "" { + return specifiedRegion, nil + } + + cfg, err := awsConfig.LoadDefaultConfig(ctx) + if err != nil { + return "", fmt.Errorf("failed to load AWS config: %w", err) + } + + if cfg.Region != "" { + return cfg.Region, nil + } + + // Fallback (may fail in non-AWS environments) + imdsClient := imds.NewFromConfig(cfg) + region, err := imdsClient.GetRegion(ctx, &imds.GetRegionInput{}) + if err != nil { + return "", fmt.Errorf("failed to get region from IMDS: %w", err) + } + + if region.Region == "" { + return "", errors.New("region not found in AWS config or IMDS") + } + + return region.Region, nil +} diff --git a/discovery/aws/aws_test.go b/discovery/aws/aws_test.go index a2f03a8b99..d1ec7b2282 100644 --- a/discovery/aws/aws_test.go +++ b/discovery/aws/aws_test.go @@ -14,13 +14,19 @@ package aws import ( + "context" "errors" + "math/rand/v2" + "net/http" + "net/http/httptest" + "os" + "path/filepath" "testing" "time" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" - "gopkg.in/yaml.v3" + "go.yaml.in/yaml/v3" ) func TestRoleUnmarshalYAML(t *testing.T) { @@ -177,3 +183,307 @@ port: 9300`, }) } } + +// TestMultipleSDConfigsDoNotShareState verifies that multiple AWS SD configs +// don't share the same underlying configuration object. This was a bug where +// all configs pointed to the same global default, causing port and other +// settings from one job to overwrite settings in another job. +func TestMultipleSDConfigsDoNotShareState(t *testing.T) { + tests := []struct { + name string + yaml string + validateFunc func(t *testing.T, cfg1, cfg2 *SDConfig) + }{ + { + name: "EC2MultipleJobsDifferentPorts", + yaml: ` +- role: ec2 + region: us-west-2 + port: 9100 + filters: + - name: tag:Name + values: [host-1] +- role: ec2 + region: us-west-2 + port: 9101 + filters: + - name: tag:Name + values: [host-2]`, + validateFunc: func(t *testing.T, cfg1, cfg2 *SDConfig) { + require.Equal(t, RoleEC2, cfg1.Role) + require.Equal(t, RoleEC2, cfg2.Role) + require.NotNil(t, cfg1.EC2SDConfig) + require.NotNil(t, cfg2.EC2SDConfig) + + // Verify ports are different and not shared + require.Equal(t, 9100, cfg1.EC2SDConfig.Port) + require.Equal(t, 9101, cfg2.EC2SDConfig.Port) + + // Verify filters are different and not shared + require.Len(t, cfg1.EC2SDConfig.Filters, 1) + require.Len(t, cfg2.EC2SDConfig.Filters, 1) + require.Equal(t, []string{"host-1"}, cfg1.EC2SDConfig.Filters[0].Values) + require.Equal(t, []string{"host-2"}, cfg2.EC2SDConfig.Filters[0].Values) + + // Most importantly: verify they're not the same pointer + require.NotSame(t, cfg1.EC2SDConfig, cfg2.EC2SDConfig, + "EC2SDConfig objects should not share the same memory address") + }, + }, + { + name: "ECSMultipleJobsDifferentPorts", + yaml: ` +- role: ecs + region: us-east-1 + port: 8080 + clusters: [cluster-a] +- role: ecs + region: us-east-1 + port: 8081 + clusters: [cluster-b]`, + validateFunc: func(t *testing.T, cfg1, cfg2 *SDConfig) { + require.Equal(t, RoleECS, cfg1.Role) + require.Equal(t, RoleECS, cfg2.Role) + require.NotNil(t, cfg1.ECSSDConfig) + require.NotNil(t, cfg2.ECSSDConfig) + + require.Equal(t, 8080, cfg1.ECSSDConfig.Port) + require.Equal(t, 8081, cfg2.ECSSDConfig.Port) + require.Equal(t, []string{"cluster-a"}, cfg1.ECSSDConfig.Clusters) + require.Equal(t, []string{"cluster-b"}, cfg2.ECSSDConfig.Clusters) + + require.NotSame(t, cfg1.ECSSDConfig, cfg2.ECSSDConfig, + "ECSSDConfig objects should not share the same 
memory address") + }, + }, + { + name: "LightsailMultipleJobsDifferentPorts", + yaml: ` +- role: lightsail + region: eu-west-1 + port: 7070 +- role: lightsail + region: eu-west-1 + port: 7071`, + validateFunc: func(t *testing.T, cfg1, cfg2 *SDConfig) { + require.Equal(t, RoleLightsail, cfg1.Role) + require.Equal(t, RoleLightsail, cfg2.Role) + require.NotNil(t, cfg1.LightsailSDConfig) + require.NotNil(t, cfg2.LightsailSDConfig) + + require.Equal(t, 7070, cfg1.LightsailSDConfig.Port) + require.Equal(t, 7071, cfg2.LightsailSDConfig.Port) + + require.NotSame(t, cfg1.LightsailSDConfig, cfg2.LightsailSDConfig, + "LightsailSDConfig objects should not share the same memory address") + }, + }, + { + name: "MSKMultipleJobsDifferentPorts", + yaml: ` +- role: msk + region: ap-south-1 + port: 6060 + clusters: ["cluster-1"] +- role: msk + region: ap-south-1 + port: 6061 + clusters: ["cluster-2"]`, + validateFunc: func(t *testing.T, cfg1, cfg2 *SDConfig) { + require.Equal(t, RoleMSK, cfg1.Role) + require.Equal(t, RoleMSK, cfg2.Role) + require.NotNil(t, cfg1.MSKSDConfig) + require.NotNil(t, cfg2.MSKSDConfig) + + require.Equal(t, 6060, cfg1.MSKSDConfig.Port) + require.Equal(t, []string{"cluster-1"}, cfg1.MSKSDConfig.Clusters) + require.Equal(t, 6061, cfg2.MSKSDConfig.Port) + require.Equal(t, []string{"cluster-2"}, cfg2.MSKSDConfig.Clusters) + + require.NotSame(t, cfg1.MSKSDConfig, cfg2.MSKSDConfig, + "MSKSDConfig objects should not share the same memory address") + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var configs []SDConfig + require.NoError(t, yaml.Unmarshal([]byte(tt.yaml), &configs)) + require.Len(t, configs, 2) + tt.validateFunc(t, &configs[0], &configs[1]) + }) + } +} + +// getRandomRegion is a helper to return a pseudo-random AWS region for testing. 
+func getRandomRegion() string { + regions := []string{ + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", + "eu-west-1", + "eu-west-2", + "ap-southeast-1", + "ap-southeast-2", + "ap-northeast-1", + "ap-northeast-2", + } + + return regions[rand.IntN(len(regions))] +} + +func TestLoadRegion(t *testing.T) { + t.Run("with_env_region", func(t *testing.T) { + randomRegion := getRandomRegion() + t.Setenv("AWS_REGION", randomRegion) + t.Setenv("AWS_ACCESS_KEY_ID", "dummy") + t.Setenv("AWS_SECRET_ACCESS_KEY", "dummy") + t.Setenv("AWS_CONFIG_FILE", "") // Ensure no config file is used + t.Setenv("AWS_PROFILE", "") // Ensure no profile file is used + + region, err := loadRegion(context.Background(), "") + require.NoError(t, err) + require.Equal(t, randomRegion, region) + }) + + t.Run("with_config_file_default_profile", func(t *testing.T) { + randomRegion := getRandomRegion() + + // Create a temporary AWS config file + tmpDir := t.TempDir() + configFile := filepath.Join(tmpDir, "config") + + configContent := `[default] +region = ` + randomRegion + ` +` + + err := os.WriteFile(configFile, []byte(configContent), 0o644) + require.NoError(t, err) + defer os.Remove(configFile) + + // Set up environment to use the config file + t.Setenv("AWS_CONFIG_FILE", configFile) + t.Setenv("AWS_ACCESS_KEY_ID", "dummy") + t.Setenv("AWS_SECRET_ACCESS_KEY", "dummy") + // Clear any region environment variables to force config file usage + t.Setenv("AWS_REGION", "") + t.Setenv("AWS_PROFILE", "") // Ensure no profile file is used + t.Setenv("AWS_DEFAULT_REGION", "") + + region, err := loadRegion(context.Background(), "") + require.NoError(t, err) + require.Equal(t, randomRegion, region) + }) + + t.Run("with_config_file_named_profile", func(t *testing.T) { + randomRegion := getRandomRegion() + + // Create a temporary AWS config file + tmpDir := t.TempDir() + configFile := filepath.Join(tmpDir, "config") + + configContent := `[default] +region = ` + getRandomRegion() + ` + +[profile ` + randomRegion + `-profile] +region = ` + randomRegion + ` +` + + err := os.WriteFile(configFile, []byte(configContent), 0o644) + require.NoError(t, err) + defer os.Remove(configFile) + + // Set up environment to use the config file + t.Setenv("AWS_CONFIG_FILE", configFile) + t.Setenv("AWS_PROFILE", randomRegion+"-profile") + t.Setenv("AWS_ACCESS_KEY_ID", "dummy") + t.Setenv("AWS_SECRET_ACCESS_KEY", "dummy") + // Clear any region environment variables to force config file usage + t.Setenv("AWS_REGION", "") + t.Setenv("AWS_DEFAULT_REGION", "") + + region, err := loadRegion(context.Background(), "") + require.NoError(t, err) + require.Equal(t, randomRegion, region) + }) + + t.Run("with_specified_region", func(t *testing.T) { + specifiedRegion := getRandomRegion() + + // Even with environment region set differently, specified region should take precedence + t.Setenv("AWS_REGION", getRandomRegion()) + t.Setenv("AWS_ACCESS_KEY_ID", "dummy") + t.Setenv("AWS_SECRET_ACCESS_KEY", "dummy") + + region, err := loadRegion(context.Background(), specifiedRegion) + require.NoError(t, err) + require.Equal(t, specifiedRegion, region) + }) + + t.Run("imds_fallback", func(t *testing.T) { + randomRegion := getRandomRegion() + + // Mock IMDS server that returns a region + mockIMDS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Handle instance identity document (contains region info) + if r.URL.Path == "/latest/dynamic/instance-identity/document" { + imdsPayload := `{"region": "` + randomRegion + `"}` + 
w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte(imdsPayload)) + return + } + w.WriteHeader(http.StatusNotFound) + })) + defer mockIMDS.Close() + + // Set up environment with no region but valid credentials + // This will force fallback to IMDS + t.Setenv("AWS_ACCESS_KEY_ID", "dummy") + t.Setenv("AWS_SECRET_ACCESS_KEY", "dummy") + // Unset any existing region + t.Setenv("AWS_REGION", "") + t.Setenv("AWS_DEFAULT_REGION", "") + t.Setenv("AWS_CONFIG_FILE", "") // Ensure no config file is used + t.Setenv("AWS_PROFILE", "") // Ensure no profile file is used + // Point IMDS to our mock server + t.Setenv("AWS_EC2_METADATA_SERVICE_ENDPOINT", mockIMDS.URL) + + region, err := loadRegion(context.Background(), "") + require.NoError(t, err) + require.Equal(t, randomRegion, region) + }) + + t.Run("imds_empty_region", func(t *testing.T) { + // Mock IMDS server that returns empty region + mockIMDS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Handle instance identity document with empty region + if r.URL.Path == "/latest/dynamic/instance-identity/document" { + imdsPayload := `{"region": ""}` + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte(imdsPayload)) + return + } + w.WriteHeader(http.StatusNotFound) + })) + defer mockIMDS.Close() + + // Set up environment with no region but valid credentials + t.Setenv("AWS_ACCESS_KEY_ID", "dummy") + t.Setenv("AWS_SECRET_ACCESS_KEY", "dummy") + // Unset any existing region + t.Setenv("AWS_REGION", "") + t.Setenv("AWS_DEFAULT_REGION", "") + t.Setenv("AWS_CONFIG_FILE", "") // Ensure no config file is used + t.Setenv("AWS_PROFILE", "") // Ensure no profile file is used + // Point IMDS to our mock server + t.Setenv("AWS_EC2_METADATA_SERVICE_ENDPOINT", mockIMDS.URL) + + _, err := loadRegion(context.Background(), "") + require.Error(t, err) + require.Contains(t, err.Error(), "failed to get region from IMDS") + }) +} diff --git a/discovery/aws/ec2.go b/discovery/aws/ec2.go index 19ecebd491..4daff43ecc 100644 --- a/discovery/aws/ec2.go +++ b/discovery/aws/ec2.go @@ -27,7 +27,6 @@ import ( awsConfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/credentials" "github.com/aws/aws-sdk-go-v2/credentials/stscreds" - "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" "github.com/aws/aws-sdk-go-v2/service/ec2" ec2Types "github.com/aws/aws-sdk-go-v2/service/ec2/types" "github.com/aws/aws-sdk-go-v2/service/sts" @@ -125,31 +124,10 @@ func (c *EC2SDConfig) UnmarshalYAML(unmarshal func(any) error) error { return err } - if c.Region == "" { - cfg, err := awsConfig.LoadDefaultConfig(context.Background()) - if err != nil { - return err - } - - if cfg.Region != "" { - // If the region is already set in the config, use it. - // This can happen if the user has set the region in the AWS config file or environment variables. - c.Region = cfg.Region - } - - if c.Region == "" { - // Try to get the region from the instance metadata service (IMDS). - imdsClient := imds.NewFromConfig(cfg) - region, err := imdsClient.GetRegion(context.Background(), &imds.GetRegionInput{}) - if err != nil { - return err - } - c.Region = region.Region - } - } - - if c.Region == "" { - return errors.New("EC2 SD configuration requires a region") + // Check if the region is set, if not attempt to load it from the AWS SDK. 
+ c.Region, err = loadRegion(context.Background(), c.Region) + if err != nil { + return fmt.Errorf("could not determine AWS region: %w", err) } for _, f := range c.Filters { diff --git a/discovery/aws/ecs.go b/discovery/aws/ecs.go index 1d5ff366de..18d2746cb6 100644 --- a/discovery/aws/ecs.go +++ b/discovery/aws/ecs.go @@ -19,7 +19,9 @@ import ( "fmt" "log/slog" "net" + "slices" "strconv" + "strings" "sync" "time" @@ -27,7 +29,6 @@ import ( awsConfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/credentials" "github.com/aws/aws-sdk-go-v2/credentials/stscreds" - "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" "github.com/aws/aws-sdk-go-v2/service/ec2" "github.com/aws/aws-sdk-go-v2/service/ecs" "github.com/aws/aws-sdk-go-v2/service/ecs/types" @@ -137,17 +138,9 @@ func (c *ECSSDConfig) UnmarshalYAML(unmarshal func(any) error) error { return err } - if c.Region == "" { - cfg, err := awsConfig.LoadDefaultConfig(context.TODO()) - if err != nil { - return err - } - client := imds.NewFromConfig(cfg) - result, err := client.GetRegion(context.Background(), &imds.GetRegionInput{}) - if err != nil { - return fmt.Errorf("ECS SD configuration requires a region. Tried to fetch it from the instance metadata: %w", err) - } - c.Region = result.Region + c.Region, err = loadRegion(context.Background(), c.Region) + if err != nil { + return fmt.Errorf("could not determine AWS region: %w", err) } return c.HTTPClientConfig.Validate() @@ -273,7 +266,6 @@ func (d *ECSDiscovery) initEcsClient(ctx context.Context) error { // listClusterARNs returns a slice of cluster arns. // This method does not use concurrency as it's a simple paginated call. -// AWS ECS Cluster read actions have burst=50, sustained=20 req/sec limits. func (d *ECSDiscovery) listClusterARNs(ctx context.Context) ([]string, error) { var ( clusterARNs []string @@ -281,7 +273,8 @@ func (d *ECSDiscovery) listClusterARNs(ctx context.Context) ([]string, error) { ) for { resp, err := d.ecs.ListClusters(ctx, &ecs.ListClustersInput{ - NextToken: nextToken, + NextToken: nextToken, + MaxResults: aws.Int32(100), }) if err != nil { return nil, fmt.Errorf("could not list clusters: %w", err) @@ -299,56 +292,61 @@ func (d *ECSDiscovery) listClusterARNs(ctx context.Context) ([]string, error) { } // describeClusters returns a map of cluster ARN to a slice of clusters. -// This method processes clusters in batches without concurrency as it's typically -// a single call handling up to 100 clusters. AWS ECS Cluster read actions have -// burst=50, sustained=20 req/sec limits. +// Uses concurrent requests limited by RequestConcurrency to respect AWS API throttling. +// Clusters are described in batches of 100 to respect AWS API limits (DescribeClusters allows up to 100 clusters per call). 
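+// The same bounded fan-out, limited by RequestConcurrency via errgroup.SetLimit,
+// recurs in the list*/describe* helpers below. As a rough sketch (slices.Chunk
+// needs Go 1.23+), the batched variants look like:
+//
+//	errg, ectx := errgroup.WithContext(ctx)
+//	errg.SetLimit(d.cfg.RequestConcurrency)
+//	for batch := range slices.Chunk(items, batchSize) {
+//		errg.Go(func() error {
+//			// one AWS API call per batch of at most batchSize items,
+//			// merging results under a mutex
+//			return nil
+//		})
+//	}
+//	return merged, errg.Wait()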
func (d *ECSDiscovery) describeClusters(ctx context.Context, clusters []string) (map[string]types.Cluster, error) { + mu := sync.Mutex{} clusterMap := make(map[string]types.Cluster) - - // AWS DescribeClusters can handle up to 100 clusters per call - batchSize := 100 - for _, batch := range batchSlice(clusters, batchSize) { - resp, err := d.ecs.DescribeClusters(ctx, &ecs.DescribeClustersInput{ - Clusters: batch, - Include: []types.ClusterField{"TAGS"}, - }) - if err != nil { - d.logger.Error("Failed to describe clusters", "clusters", batch, "error", err) - return nil, fmt.Errorf("could not describe clusters %v: %w", batch, err) - } - - for _, c := range resp.Clusters { - if c.ClusterArn != nil { - clusterMap[*c.ClusterArn] = c + errg, ectx := errgroup.WithContext(ctx) + errg.SetLimit(d.cfg.RequestConcurrency) + for batch := range slices.Chunk(clusters, 100) { + errg.Go(func() error { + resp, err := d.ecs.DescribeClusters(ectx, &ecs.DescribeClustersInput{ + Clusters: batch, + Include: []types.ClusterField{"TAGS"}, + }) + if err != nil { + d.logger.Error("Failed to describe clusters", "clusters", batch, "error", err) + return fmt.Errorf("could not describe clusters %v: %w", batch, err) } - } + + for _, cluster := range resp.Clusters { + if cluster.ClusterArn != nil { + mu.Lock() + clusterMap[*cluster.ClusterArn] = cluster + mu.Unlock() + } + } + return nil + }) } - return clusterMap, nil + return clusterMap, errg.Wait() } // listServiceARNs returns a map of cluster ARN to a slice of service ARNs. // Uses concurrent requests limited by RequestConcurrency to respect AWS API throttling. -// AWS ECS Service read actions have burst=100, sustained=20 req/sec limits. +// Services are listed in batches of 100 to respect AWS API limits (ListServices allows up to 100 services per call). func (d *ECSDiscovery) listServiceARNs(ctx context.Context, clusters []string) (map[string][]string, error) { - serviceARNsMu := sync.Mutex{} - serviceARNs := make(map[string][]string) + mu := sync.Mutex{} + services := make(map[string][]string) errg, ectx := errgroup.WithContext(ctx) errg.SetLimit(d.cfg.RequestConcurrency) for _, clusterARN := range clusters { errg.Go(func() error { var nextToken *string - var clusterServiceARNs []string + var serviceARNs []string for { resp, err := d.ecs.ListServices(ectx, &ecs.ListServicesInput{ - Cluster: aws.String(clusterARN), - NextToken: nextToken, + Cluster: aws.String(clusterARN), + NextToken: nextToken, + MaxResults: aws.Int32(100), }) if err != nil { return fmt.Errorf("could not list services for cluster %q: %w", clusterARN, err) } - clusterServiceARNs = append(clusterServiceARNs, resp.ServiceArns...) + serviceARNs = append(serviceARNs, resp.ServiceArns...) if resp.NextToken == nil { break @@ -356,75 +354,76 @@ func (d *ECSDiscovery) listServiceARNs(ctx context.Context, clusters []string) ( nextToken = resp.NextToken } - serviceARNsMu.Lock() - serviceARNs[clusterARN] = clusterServiceARNs - serviceARNsMu.Unlock() + mu.Lock() + services[clusterARN] = serviceARNs + mu.Unlock() return nil }) } - return serviceARNs, errg.Wait() -} - -// describeServices returns a map of cluster ARN to services. -// Uses concurrent requests with batching (10 services per request) to respect AWS API limits. -// AWS ECS Service read actions have burst=100, sustained=20 req/sec limits. 
-func (d *ECSDiscovery) describeServices(ctx context.Context, clusterServiceARNsMap map[string][]string) (map[string][]types.Service, error) { - batchSize := 10 // AWS DescribeServices API limit is 10 services per request - serviceMu := sync.Mutex{} - services := make(map[string][]types.Service) - errg, ectx := errgroup.WithContext(ctx) - errg.SetLimit(d.cfg.RequestConcurrency) - for clusterARN, serviceARNs := range clusterServiceARNsMap { - for _, batch := range batchSlice(serviceARNs, batchSize) { - errg.Go(func() error { - resp, err := d.ecs.DescribeServices(ectx, &ecs.DescribeServicesInput{ - Services: batch, - Cluster: aws.String(clusterARN), - Include: []types.ServiceField{"TAGS"}, - }) - if err != nil { - d.logger.Error("Failed to describe services", "cluster", clusterARN, "batch", batch, "error", err) - return fmt.Errorf("could not describe services for cluster %q: %w", clusterARN, err) - } - - serviceMu.Lock() - services[clusterARN] = append(services[clusterARN], resp.Services...) - serviceMu.Unlock() - - return nil - }) - } - } - return services, errg.Wait() } -// listTaskARNs returns a map of service ARN to a slice of task ARNs. +// describeServices returns a map of service name to service. // Uses concurrent requests limited by RequestConcurrency to respect AWS API throttling. -// AWS ECS Cluster resource read actions have burst=100, sustained=20 req/sec limits. -func (d *ECSDiscovery) listTaskARNs(ctx context.Context, services []types.Service) (map[string][]string, error) { - taskARNsMu := sync.Mutex{} - taskARNs := make(map[string][]string) +// Services are described in batches of 10 to respect AWS API limits (DescribeServices allows up to 10 services per call). +func (d *ECSDiscovery) describeServices(ctx context.Context, clusterARN string, serviceARNS []string) (map[string]types.Service, error) { + mu := sync.Mutex{} + services := make(map[string]types.Service) errg, ectx := errgroup.WithContext(ctx) errg.SetLimit(d.cfg.RequestConcurrency) - for _, service := range services { + for batch := range slices.Chunk(serviceARNS, 10) { errg.Go(func() error { - serviceArn := aws.ToString(service.ServiceArn) + resp, err := d.ecs.DescribeServices(ectx, &ecs.DescribeServicesInput{ + Cluster: aws.String(clusterARN), + Services: batch, + Include: []types.ServiceField{"TAGS"}, + }) + if err != nil { + d.logger.Error("Failed to describe services", "cluster", clusterARN, "batch", batch, "error", err) + return fmt.Errorf("could not describe services for cluster %q: batch %v: %w", clusterARN, batch, err) + } - var nextToken *string - var serviceTaskARNs []string + for _, service := range resp.Services { + if service.ServiceArn != nil { + mu.Lock() + services[*service.ServiceName] = service + mu.Unlock() + } + } + return nil + }) + } + + return services, errg.Wait() +} + +// listTaskARNs returns a map of clustersARN to a slice of task ARNs. +// Uses concurrent requests limited by RequestConcurrency to respect AWS API throttling. +// Tasks are listed in batches of 100 to respect AWS API limits (ListTasks allows up to 100 tasks per call). +// This method also uses pagination to handle cases where there are more than 100 tasks in a cluster. 
+func (d *ECSDiscovery) listTaskARNs(ctx context.Context, clusterARNs []string) (map[string][]string, error) { + mu := sync.Mutex{} + tasks := make(map[string][]string) + errg, ectx := errgroup.WithContext(ctx) + errg.SetLimit(d.cfg.RequestConcurrency) + for _, clusterARN := range clusterARNs { + errg.Go(func() error { + var ( + nextToken *string + taskARNs []string + ) for { resp, err := d.ecs.ListTasks(ectx, &ecs.ListTasksInput{ - Cluster: aws.String(*service.ClusterArn), - ServiceName: aws.String(*service.ServiceName), - NextToken: nextToken, + Cluster: aws.String(clusterARN), + NextToken: nextToken, + MaxResults: aws.Int32(100), }) if err != nil { - return fmt.Errorf("could not list tasks for service %q: %w", serviceArn, err) + return fmt.Errorf("could not list tasks for cluster %q: %w", clusterARN, err) } - serviceTaskARNs = append(serviceTaskARNs, resp.TaskArns...) + taskARNs = append(taskARNs, resp.TaskArns...) if resp.NextToken == nil { break @@ -432,77 +431,87 @@ func (d *ECSDiscovery) listTaskARNs(ctx context.Context, services []types.Servic nextToken = resp.NextToken } - taskARNsMu.Lock() - taskARNs[serviceArn] = serviceTaskARNs - taskARNsMu.Unlock() + mu.Lock() + tasks[clusterARN] = taskARNs + mu.Unlock() return nil }) } - return taskARNs, errg.Wait() + return tasks, errg.Wait() } -// describeTasks returns a map of task arn to a slice task. -// Uses concurrent requests with batching (100 tasks per request) to respect AWS API limits. -// AWS ECS Cluster resource read actions have burst=100, sustained=20 req/sec limits. -func (d *ECSDiscovery) describeTasks(ctx context.Context, clusterARN string, taskARNsMap map[string][]string) (map[string][]types.Task, error) { - batchSize := 100 // AWS DescribeTasks API limit is 100 tasks per request - taskMu := sync.Mutex{} - tasks := make(map[string][]types.Task) +// describeTasks returns a slice of tasks. +// Uses concurrent requests limited by RequestConcurrency to respect AWS API throttling. +// Tasks are described in batches of 100 to respect AWS API limits (DescribeTasks allows up to 100 tasks per call). +func (d *ECSDiscovery) describeTasks(ctx context.Context, clusterARN string, taskARNs []string) ([]types.Task, error) { + mu := sync.Mutex{} + var tasks []types.Task errg, ectx := errgroup.WithContext(ctx) errg.SetLimit(d.cfg.RequestConcurrency) - for serviceARN, taskARNs := range taskARNsMap { - for _, batch := range batchSlice(taskARNs, batchSize) { - errg.Go(func() error { - resp, err := d.ecs.DescribeTasks(ectx, &ecs.DescribeTasksInput{ - Cluster: aws.String(clusterARN), - Tasks: batch, - Include: []types.TaskField{"TAGS"}, - }) - if err != nil { - d.logger.Error("Failed to describe tasks", "service", serviceARN, "cluster", clusterARN, "batch", batch, "error", err) - return fmt.Errorf("could not describe tasks for service %q in cluster %q: %w", serviceARN, clusterARN, err) - } - - taskMu.Lock() - tasks[serviceARN] = append(tasks[serviceARN], resp.Tasks...) - taskMu.Unlock() - - return nil + for batch := range slices.Chunk(taskARNs, 100) { + errg.Go(func() error { + resp, err := d.ecs.DescribeTasks(ectx, &ecs.DescribeTasksInput{ + Cluster: aws.String(clusterARN), + Tasks: batch, + Include: []types.TaskField{"TAGS"}, }) - } + if err != nil { + d.logger.Error("Failed to describe tasks", "cluster", clusterARN, "batch", batch, "error", err) + return fmt.Errorf("could not describe tasks in cluster %q: batch %v: %w", clusterARN, batch, err) + } + + mu.Lock() + tasks = append(tasks, resp.Tasks...) 
+ mu.Unlock() + return nil + }) } return tasks, errg.Wait() } // describeContainerInstances returns a map of container instance ARN to EC2 instance ID -// Uses batching to respect AWS API limits (100 container instances per request). -func (d *ECSDiscovery) describeContainerInstances(ctx context.Context, clusterARN string, containerInstanceARNs []string) (map[string]string, error) { +// Uses concurrent requests limited by RequestConcurrency to respect AWS API throttling. +// Container instances are described in batches of 100 to respect AWS API limits (DescribeContainerInstances allows up to 100 container instances per call). +func (d *ECSDiscovery) describeContainerInstances(ctx context.Context, clusterARN string, tasks []types.Task) (map[string]string, error) { + containerInstanceARNs := make([]string, 0, len(tasks)) + for _, task := range tasks { + if task.ContainerInstanceArn != nil { + containerInstanceARNs = append(containerInstanceARNs, *task.ContainerInstanceArn) + } + } + if len(containerInstanceARNs) == 0 { return make(map[string]string), nil } + mu := sync.Mutex{} containerInstToEC2 := make(map[string]string) - batchSize := 100 // AWS API limit - - for _, batch := range batchSlice(containerInstanceARNs, batchSize) { - resp, err := d.ecs.DescribeContainerInstances(ctx, &ecs.DescribeContainerInstancesInput{ - Cluster: aws.String(clusterARN), - ContainerInstances: batch, - }) - if err != nil { - return nil, fmt.Errorf("could not describe container instances: %w", err) - } - - for _, ci := range resp.ContainerInstances { - if ci.ContainerInstanceArn != nil && ci.Ec2InstanceId != nil { - containerInstToEC2[*ci.ContainerInstanceArn] = *ci.Ec2InstanceId + errg, ectx := errgroup.WithContext(ctx) + errg.SetLimit(d.cfg.RequestConcurrency) + for batch := range slices.Chunk(containerInstanceARNs, 100) { + errg.Go(func() error { + resp, err := d.ecs.DescribeContainerInstances(ectx, &ecs.DescribeContainerInstancesInput{ + Cluster: aws.String(clusterARN), + ContainerInstances: batch, + }) + if err != nil { + return fmt.Errorf("could not describe container instances: %w", err) } - } + + for _, ci := range resp.ContainerInstances { + if ci.ContainerInstanceArn != nil && ci.Ec2InstanceId != nil { + mu.Lock() + containerInstToEC2[*ci.ContainerInstanceArn] = *ci.Ec2InstanceId + mu.Unlock() + } + } + return nil + }) } - return containerInstToEC2, nil + return containerInstToEC2, errg.Wait() } // ec2InstanceInfo holds information retrieved from EC2 DescribeInstances. @@ -515,83 +524,112 @@ type ec2InstanceInfo struct { } // describeEC2Instances returns a map of EC2 instance ID to instance information. +// Uses concurrent requests limited by RequestConcurrency to respect AWS API throttling. +// This method does not use concurrency as it's a simple paginated call. 
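+// All requested instance IDs are passed in a single DescribeInstances call and any
+// additional result pages are fetched sequentially via NextToken.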
func (d *ECSDiscovery) describeEC2Instances(ctx context.Context, instanceIDs []string) (map[string]ec2InstanceInfo, error) { if len(instanceIDs) == 0 { return make(map[string]ec2InstanceInfo), nil } instanceInfo := make(map[string]ec2InstanceInfo) + var nextToken *string - resp, err := d.ec2.DescribeInstances(ctx, &ec2.DescribeInstancesInput{ - InstanceIds: instanceIDs, - }) - if err != nil { - return nil, fmt.Errorf("could not describe EC2 instances: %w", err) - } + for { + resp, err := d.ec2.DescribeInstances(ctx, &ec2.DescribeInstancesInput{ + InstanceIds: instanceIDs, + NextToken: nextToken, + }) + if err != nil { + return nil, fmt.Errorf("could not describe EC2 instances: %w", err) + } - for _, reservation := range resp.Reservations { - for _, instance := range reservation.Instances { - if instance.InstanceId != nil && instance.PrivateIpAddress != nil { - info := ec2InstanceInfo{ - privateIP: *instance.PrivateIpAddress, - tags: make(map[string]string), - } - if instance.PublicIpAddress != nil { - info.publicIP = *instance.PublicIpAddress - } - if instance.SubnetId != nil { - info.subnetID = *instance.SubnetId - } - if instance.InstanceType != "" { - info.instanceType = string(instance.InstanceType) - } - // Collect EC2 instance tags - for _, tag := range instance.Tags { - if tag.Key != nil && tag.Value != nil { - info.tags[*tag.Key] = *tag.Value + for _, reservation := range resp.Reservations { + for _, instance := range reservation.Instances { + if instance.InstanceId != nil && instance.PrivateIpAddress != nil { + info := ec2InstanceInfo{ + privateIP: *instance.PrivateIpAddress, + tags: make(map[string]string), } + if instance.PublicIpAddress != nil { + info.publicIP = *instance.PublicIpAddress + } + if instance.SubnetId != nil { + info.subnetID = *instance.SubnetId + } + if instance.InstanceType != "" { + info.instanceType = string(instance.InstanceType) + } + // Collect EC2 instance tags + for _, tag := range instance.Tags { + if tag.Key != nil && tag.Value != nil { + info.tags[*tag.Key] = *tag.Value + } + } + instanceInfo[*instance.InstanceId] = info } - instanceInfo[*instance.InstanceId] = info } } + + if resp.NextToken == nil { + break + } + nextToken = resp.NextToken } return instanceInfo, nil } // describeNetworkInterfaces returns a map of ENI ID to public IP address. -func (d *ECSDiscovery) describeNetworkInterfaces(ctx context.Context, eniIDs []string) (map[string]string, error) { +// This is needed to get the public IP for tasks using awsvpc network mode, as the ENI is what gets the public IP, not the EC2 instance. +// This method does not use concurrency as it's a simple paginated call. 
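+// ENI IDs are collected from each task's ElasticNetworkInterface attachment before
+// the DescribeNetworkInterfaces lookup; result pages are followed via NextToken.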
+func (d *ECSDiscovery) describeNetworkInterfaces(ctx context.Context, tasks []types.Task) (map[string]string, error) { + eniIDs := make([]string, 0, len(tasks)) + + for _, task := range tasks { + for _, attachment := range task.Attachments { + if attachment.Type != nil && *attachment.Type == "ElasticNetworkInterface" { + for _, detail := range attachment.Details { + if detail.Name != nil && *detail.Name == "networkInterfaceId" && detail.Value != nil { + eniIDs = append(eniIDs, *detail.Value) + break + } + } + break + } + } + } + if len(eniIDs) == 0 { return make(map[string]string), nil } eniToPublicIP := make(map[string]string) + var nextToken *string - resp, err := d.ec2.DescribeNetworkInterfaces(ctx, &ec2.DescribeNetworkInterfacesInput{ - NetworkInterfaceIds: eniIDs, - }) - if err != nil { - return nil, fmt.Errorf("could not describe network interfaces: %w", err) - } - - for _, eni := range resp.NetworkInterfaces { - if eni.NetworkInterfaceId != nil && eni.Association != nil && eni.Association.PublicIp != nil { - eniToPublicIP[*eni.NetworkInterfaceId] = *eni.Association.PublicIp + for { + resp, err := d.ec2.DescribeNetworkInterfaces(ctx, &ec2.DescribeNetworkInterfacesInput{ + NetworkInterfaceIds: eniIDs, + NextToken: nextToken, + }) + if err != nil { + return nil, fmt.Errorf("could not describe network interfaces: %w", err) } + + for _, eni := range resp.NetworkInterfaces { + if eni.NetworkInterfaceId != nil && eni.Association != nil && eni.Association.PublicIp != nil { + eniToPublicIP[*eni.NetworkInterfaceId] = *eni.Association.PublicIp + } + } + + if resp.NextToken == nil { + break + } + nextToken = resp.NextToken } return eniToPublicIP, nil } -func batchSlice[T any](a []T, size int) [][]T { - batches := make([][]T, 0, len(a)/size+1) - for i := 0; i < len(a); i += size { - end := min(i+size, len(a)) - batches = append(batches, a[i:end]) - } - return batches -} - func (d *ECSDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { err := d.initEcsClient(ctx) if err != nil { @@ -620,314 +658,338 @@ func (d *ECSDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, error Source: d.cfg.Region, } - clusterARNMap, err := d.describeClusters(ctx, clusters) - if err != nil { - return nil, err - } + // Fetch cluster details, service ARNs, and task ARNs in parallel + var ( + clusterMap map[string]types.Cluster + serviceMap map[string][]string + taskMap map[string][]string + ) - clusterServiceARNMap, err := d.listServiceARNs(ctx, clusters) - if err != nil { - return nil, err - } + clusterErrg, clusterCtx := errgroup.WithContext(ctx) + clusterErrg.Go(func() error { + var err error + clusterMap, err = d.describeClusters(clusterCtx, clusters) + return err + }) + clusterErrg.Go(func() error { + var err error + serviceMap, err = d.listServiceARNs(clusterCtx, clusters) + return err + }) + clusterErrg.Go(func() error { + var err error + taskMap, err = d.listTaskARNs(clusterCtx, clusters) + return err + }) - clusterServicesMap, err := d.describeServices(ctx, clusterServiceARNMap) - if err != nil { + if err := clusterErrg.Wait(); err != nil { return nil, err } // Use goroutines to process clusters in parallel var ( - targetsMu sync.Mutex - wg sync.WaitGroup + clusterWg sync.WaitGroup + clusterMu sync.Mutex + clusterTargets []model.LabelSet ) - for clusterArn, clusterServices := range clusterServicesMap { - if len(clusterServices) == 0 { + for clusterARN, taskARNs := range taskMap { + if len(taskARNs) == 0 { continue } - wg.Add(1) - go func(clusterArn string, clusterServices 
[]types.Service) { - defer wg.Done() + clusterWg.Add(1) - serviceTaskARNMap, err := d.listTaskARNs(ctx, clusterServices) - if err != nil { - d.logger.Error("Failed to list task ARNs for cluster", "cluster", clusterArn, "error", err) - return - } + go func(cluster types.Cluster, serviceARNs, taskARNs []string) { + defer clusterWg.Done() - serviceTaskMap, err := d.describeTasks(ctx, clusterArn, serviceTaskARNMap) - if err != nil { - d.logger.Error("Failed to describe tasks for cluster", "cluster", clusterArn, "error", err) - return - } - - // Process services within this cluster in parallel + // Fetch services and tasks in parallel (they're independent) var ( - serviceWg sync.WaitGroup - localTargets []model.LabelSet - localTargetsMu sync.Mutex + services map[string]types.Service + tasks []types.Task ) - for _, clusterService := range clusterServices { - serviceWg.Add(1) - go func(clusterService types.Service) { - defer serviceWg.Done() + resourceErrg, resourceCtx := errgroup.WithContext(ctx) + resourceErrg.Go(func() error { + var err error + services, err = d.describeServices(resourceCtx, *cluster.ClusterArn, serviceARNs) + if err != nil { + d.logger.Error("Failed to describe services for cluster", "cluster", *cluster.ClusterArn, "error", err) + } + return err + }) + resourceErrg.Go(func() error { + var err error + tasks, err = d.describeTasks(resourceCtx, *cluster.ClusterArn, taskARNs) + if err != nil { + d.logger.Error("Failed to describe tasks for cluster", "cluster", *cluster.ClusterArn, "error", err) + } + return err + }) - serviceArn := *clusterService.ServiceArn - - if tasks, exists := serviceTaskMap[serviceArn]; exists { - var serviceTargets []model.LabelSet - - // Collect container instance ARNs for all EC2 tasks to get instance type - var containerInstanceARNs []string - taskToContainerInstance := make(map[string]string) - // Collect ENI IDs for awsvpc tasks to get public IPs - var eniIDs []string - taskToENI := make(map[string]string) - - for _, task := range tasks { - // Collect container instance ARN for any task running on EC2 - if task.ContainerInstanceArn != nil { - containerInstanceARNs = append(containerInstanceARNs, *task.ContainerInstanceArn) - taskToContainerInstance[*task.TaskArn] = *task.ContainerInstanceArn - } - - // Collect ENI IDs from awsvpc tasks - for _, attachment := range task.Attachments { - if attachment.Type != nil && *attachment.Type == "ElasticNetworkInterface" { - for _, detail := range attachment.Details { - if detail.Name != nil && *detail.Name == "networkInterfaceId" && detail.Value != nil { - eniIDs = append(eniIDs, *detail.Value) - taskToENI[*task.TaskArn] = *detail.Value - break - } - } - break - } - } - } - - // Batch describe container instances and EC2 instances to get instance type and other metadata - var containerInstToEC2 map[string]string - var ec2InstInfo map[string]ec2InstanceInfo - if len(containerInstanceARNs) > 0 { - var err error - containerInstToEC2, err = d.describeContainerInstances(ctx, clusterArn, containerInstanceARNs) - if err != nil { - d.logger.Error("Failed to describe container instances", "cluster", clusterArn, "error", err) - // Continue processing tasks - } else { - // Collect unique EC2 instance IDs - ec2InstanceIDs := make([]string, 0, len(containerInstToEC2)) - for _, ec2ID := range containerInstToEC2 { - ec2InstanceIDs = append(ec2InstanceIDs, ec2ID) - } - - // Batch describe EC2 instances - ec2InstInfo, err = d.describeEC2Instances(ctx, ec2InstanceIDs) - if err != nil { - d.logger.Error("Failed to describe EC2 
instances", "cluster", clusterArn, "error", err) - } - } - } - - // Batch describe ENIs to get public IPs for awsvpc tasks - var eniToPublicIP map[string]string - if len(eniIDs) > 0 { - var err error - eniToPublicIP, err = d.describeNetworkInterfaces(ctx, eniIDs) - if err != nil { - d.logger.Error("Failed to describe network interfaces", "cluster", clusterArn, "error", err) - // Continue processing without ENI public IPs - } - } - - for _, task := range tasks { - var ipAddress, subnetID, publicIP string - var networkMode string - var ec2InstanceID, ec2InstanceType, ec2InstancePrivateIP, ec2InstancePublicIP string - - // Try to get IP from ENI attachment (awsvpc mode) - var eniAttachment *types.Attachment - for _, attachment := range task.Attachments { - if attachment.Type != nil && *attachment.Type == "ElasticNetworkInterface" { - eniAttachment = &attachment - break - } - } - - if eniAttachment != nil { - // awsvpc networking mode - get IP from ENI - networkMode = "awsvpc" - for _, detail := range eniAttachment.Details { - switch *detail.Name { - case "privateIPv4Address": - ipAddress = *detail.Value - case "subnetId": - subnetID = *detail.Value - } - } - // Get public IP from ENI if available - if eniID, ok := taskToENI[*task.TaskArn]; ok { - if eniPublicIP, ok := eniToPublicIP[eniID]; ok { - publicIP = eniPublicIP - } - } - } else if task.ContainerInstanceArn != nil { - // bridge/host networking mode - need to get EC2 instance IP and subnet - networkMode = "bridge" - containerInstARN, ok := taskToContainerInstance[*task.TaskArn] - if ok { - ec2InstanceID, ok = containerInstToEC2[containerInstARN] - if ok { - info, ok := ec2InstInfo[ec2InstanceID] - if ok { - ipAddress = info.privateIP - publicIP = info.publicIP - subnetID = info.subnetID - ec2InstanceType = info.instanceType - ec2InstancePrivateIP = info.privateIP - ec2InstancePublicIP = info.publicIP - } else { - d.logger.Debug("EC2 instance info not found", "instance", ec2InstanceID, "task", *task.TaskArn) - } - } else { - d.logger.Debug("Container instance not found in map", "arn", containerInstARN, "task", *task.TaskArn) - } - } - } - - // Get EC2 instance metadata for awsvpc tasks running on EC2 - // We want the instance type and the host IPs for advanced use cases - if networkMode == "awsvpc" && task.ContainerInstanceArn != nil { - containerInstARN, ok := taskToContainerInstance[*task.TaskArn] - if ok { - ec2InstanceID, ok = containerInstToEC2[containerInstARN] - if ok { - info, ok := ec2InstInfo[ec2InstanceID] - if ok { - ec2InstanceType = info.instanceType - ec2InstancePrivateIP = info.privateIP - ec2InstancePublicIP = info.publicIP - } - } - } - } - - if ipAddress == "" { - continue - } - - labels := model.LabelSet{ - ecsLabelClusterARN: model.LabelValue(*clusterService.ClusterArn), - ecsLabelService: model.LabelValue(*clusterService.ServiceName), - ecsLabelServiceARN: model.LabelValue(*clusterService.ServiceArn), - ecsLabelServiceStatus: model.LabelValue(*clusterService.Status), - ecsLabelTaskGroup: model.LabelValue(*task.Group), - ecsLabelTaskARN: model.LabelValue(*task.TaskArn), - ecsLabelTaskDefinition: model.LabelValue(*task.TaskDefinitionArn), - ecsLabelIPAddress: model.LabelValue(ipAddress), - ecsLabelRegion: model.LabelValue(d.cfg.Region), - ecsLabelLaunchType: model.LabelValue(task.LaunchType), - ecsLabelAvailabilityZone: model.LabelValue(*task.AvailabilityZone), - ecsLabelDesiredStatus: model.LabelValue(*task.DesiredStatus), - ecsLabelLastStatus: model.LabelValue(*task.LastStatus), - ecsLabelHealthStatus: 
model.LabelValue(task.HealthStatus), - ecsLabelNetworkMode: model.LabelValue(networkMode), - } - - // Add subnet ID when available (awsvpc mode from ENI, bridge/host from EC2 instance) - if subnetID != "" { - labels[ecsLabelSubnetID] = model.LabelValue(subnetID) - } - - // Add container instance and EC2 instance info for EC2 launch type - if task.ContainerInstanceArn != nil { - labels[ecsLabelContainerInstanceARN] = model.LabelValue(*task.ContainerInstanceArn) - } - if ec2InstanceID != "" { - labels[ecsLabelEC2InstanceID] = model.LabelValue(ec2InstanceID) - } - if ec2InstanceType != "" { - labels[ecsLabelEC2InstanceType] = model.LabelValue(ec2InstanceType) - } - if ec2InstancePrivateIP != "" { - labels[ecsLabelEC2InstancePrivateIP] = model.LabelValue(ec2InstancePrivateIP) - } - if ec2InstancePublicIP != "" { - labels[ecsLabelEC2InstancePublicIP] = model.LabelValue(ec2InstancePublicIP) - } - if publicIP != "" { - labels[ecsLabelPublicIP] = model.LabelValue(publicIP) - } - - if task.PlatformFamily != nil { - labels[ecsLabelPlatformFamily] = model.LabelValue(*task.PlatformFamily) - } - if task.PlatformVersion != nil { - labels[ecsLabelPlatformVersion] = model.LabelValue(*task.PlatformVersion) - } - - labels[model.AddressLabel] = model.LabelValue(net.JoinHostPort(ipAddress, strconv.Itoa(d.cfg.Port))) - - // Add cluster tags - if cluster, exists := clusterARNMap[*clusterService.ClusterArn]; exists { - if cluster.ClusterName != nil { - labels[ecsLabelCluster] = model.LabelValue(*cluster.ClusterName) - } - - for _, clusterTag := range cluster.Tags { - if clusterTag.Key != nil && clusterTag.Value != nil { - labels[model.LabelName(ecsLabelTagCluster+strutil.SanitizeLabelName(*clusterTag.Key))] = model.LabelValue(*clusterTag.Value) - } - } - } - - // Add service tags - for _, serviceTag := range clusterService.Tags { - if serviceTag.Key != nil && serviceTag.Value != nil { - labels[model.LabelName(ecsLabelTagService+strutil.SanitizeLabelName(*serviceTag.Key))] = model.LabelValue(*serviceTag.Value) - } - } - - // Add task tags - for _, taskTag := range task.Tags { - if taskTag.Key != nil && taskTag.Value != nil { - labels[model.LabelName(ecsLabelTagTask+strutil.SanitizeLabelName(*taskTag.Key))] = model.LabelValue(*taskTag.Value) - } - } - - // Add EC2 instance tags (if running on EC2) - if ec2InstanceID != "" { - if info, ok := ec2InstInfo[ec2InstanceID]; ok { - for tagKey, tagValue := range info.tags { - labels[model.LabelName(ecsLabelTagEC2+strutil.SanitizeLabelName(tagKey))] = model.LabelValue(tagValue) - } - } - } - - serviceTargets = append(serviceTargets, labels) - } - - // Add service targets to local targets with mutex protection - localTargetsMu.Lock() - localTargets = append(localTargets, serviceTargets...) - localTargetsMu.Unlock() - } - }(clusterService) + if err := resourceErrg.Wait(); err != nil { + return } - serviceWg.Wait() + // Fetch container instances and network interfaces in parallel (both depend on tasks) + var ( + containerInstances map[string]string + eniToPublicIP map[string]string + ) - // Add all local targets to main target group with mutex protection - targetsMu.Lock() - tg.Targets = append(tg.Targets, localTargets...) 
- targetsMu.Unlock() - }(clusterArn, clusterServices) + instanceErrg, instanceCtx := errgroup.WithContext(ctx) + instanceErrg.Go(func() error { + var err error + containerInstances, err = d.describeContainerInstances(instanceCtx, *cluster.ClusterArn, tasks) + if err != nil { + d.logger.Error("Failed to describe container instances for cluster", "cluster", *cluster.ClusterArn, "error", err) + } + return err + }) + instanceErrg.Go(func() error { + var err error + eniToPublicIP, err = d.describeNetworkInterfaces(instanceCtx, tasks) + if err != nil { + d.logger.Error("Failed to describe network interfaces for cluster", "cluster", *cluster.ClusterArn, "error", err) + } + return err + }) + + if err := instanceErrg.Wait(); err != nil { + return + } + + ec2Instances := make(map[string]ec2InstanceInfo) + if len(containerInstances) > 0 { + // Deduplicate EC2 instance IDs (multiple tasks can share the same instance) + ec2InstanceIDSet := make(map[string]struct{}) + for _, ec2ID := range containerInstances { + ec2InstanceIDSet[ec2ID] = struct{}{} + } + ec2InstanceIDs := make([]string, 0, len(ec2InstanceIDSet)) + for ec2ID := range ec2InstanceIDSet { + ec2InstanceIDs = append(ec2InstanceIDs, ec2ID) + } + ec2Instances, err = d.describeEC2Instances(ctx, ec2InstanceIDs) + if err != nil { + d.logger.Error("Failed to describe EC2 instances for cluster", "cluster", *cluster.ClusterArn, "error", err) + return + } + } + + var ( + taskWg sync.WaitGroup + taskMu sync.Mutex + taskTargets []model.LabelSet + ) + + for _, task := range tasks { + taskWg.Add(1) + + go func(cluster types.Cluster, services map[string]types.Service, task types.Task, containerInstances map[string]string, ec2Instances map[string]ec2InstanceInfo, eniToPublicIP map[string]string) { + defer taskWg.Done() + + var ( + ipAddress, subnetID, publicIP string + networkMode string + ec2InstanceID, ec2InstanceType, ec2InstancePrivateIP, ec2InstancePublicIP string + ) + + // Try to get IP from ENI attachment (awsvpc mode) + var eniAttachment *types.Attachment + for _, attachment := range task.Attachments { + if attachment.Type != nil && *attachment.Type == "ElasticNetworkInterface" { + eniAttachment = &attachment + break + } + } + + if eniAttachment != nil { + // awsvpc networking mode - get IP from ENI + networkMode = "awsvpc" + var eniID string + for _, detail := range eniAttachment.Details { + switch *detail.Name { + case "privateIPv4Address": + ipAddress = *detail.Value + case "subnetId": + subnetID = *detail.Value + case "networkInterfaceId": + eniID = *detail.Value + } + } + // Get public IP from ENI if available + if eniID != "" { + if pub, ok := eniToPublicIP[eniID]; ok { + publicIP = pub + } + } + } else if task.ContainerInstanceArn != nil { + // bridge/host networking mode - need to get EC2 instance IP and subnet + networkMode = "bridge" + var ok bool + ec2InstanceID, ok = containerInstances[*task.ContainerInstanceArn] + if ok { + info, ok := ec2Instances[ec2InstanceID] + if ok { + ipAddress = info.privateIP + publicIP = info.publicIP + subnetID = info.subnetID + ec2InstanceType = info.instanceType + ec2InstancePrivateIP = info.privateIP + ec2InstancePublicIP = info.publicIP + } else { + d.logger.Debug("EC2 instance info not found", "instance", ec2InstanceID, "task", *task.TaskArn) + } + } else { + d.logger.Debug("Container instance not found in map", "arn", *task.ContainerInstanceArn, "task", *task.TaskArn) + } + } + + // Get EC2 instance metadata for awsvpc tasks running on EC2 + // We want the instance type and the host IPs for advanced 
use cases + if networkMode == "awsvpc" && task.ContainerInstanceArn != nil { + var ok bool + ec2InstanceID, ok = containerInstances[*task.ContainerInstanceArn] + if ok { + info, ok := ec2Instances[ec2InstanceID] + if ok { + ec2InstanceType = info.instanceType + ec2InstancePrivateIP = info.privateIP + ec2InstancePublicIP = info.publicIP + } + } + } + + if ipAddress == "" { + return + } + + labels := model.LabelSet{ + ecsLabelClusterARN: model.LabelValue(*cluster.ClusterArn), + ecsLabelCluster: model.LabelValue(*cluster.ClusterName), + ecsLabelTaskGroup: model.LabelValue(*task.Group), + ecsLabelTaskARN: model.LabelValue(*task.TaskArn), + ecsLabelTaskDefinition: model.LabelValue(*task.TaskDefinitionArn), + ecsLabelIPAddress: model.LabelValue(ipAddress), + ecsLabelRegion: model.LabelValue(d.cfg.Region), + ecsLabelLaunchType: model.LabelValue(task.LaunchType), + ecsLabelAvailabilityZone: model.LabelValue(*task.AvailabilityZone), + ecsLabelDesiredStatus: model.LabelValue(*task.DesiredStatus), + ecsLabelLastStatus: model.LabelValue(*task.LastStatus), + ecsLabelHealthStatus: model.LabelValue(task.HealthStatus), + ecsLabelNetworkMode: model.LabelValue(networkMode), + } + + // Add subnet ID when available (awsvpc mode from ENI, bridge/host from EC2 instance) + if subnetID != "" { + labels[ecsLabelSubnetID] = model.LabelValue(subnetID) + } + + // Add container instance and EC2 instance info for EC2 launch type + if task.ContainerInstanceArn != nil { + labels[ecsLabelContainerInstanceARN] = model.LabelValue(*task.ContainerInstanceArn) + } + if ec2InstanceID != "" { + labels[ecsLabelEC2InstanceID] = model.LabelValue(ec2InstanceID) + } + if ec2InstanceType != "" { + labels[ecsLabelEC2InstanceType] = model.LabelValue(ec2InstanceType) + } + if ec2InstancePrivateIP != "" { + labels[ecsLabelEC2InstancePrivateIP] = model.LabelValue(ec2InstancePrivateIP) + } + if ec2InstancePublicIP != "" { + labels[ecsLabelEC2InstancePublicIP] = model.LabelValue(ec2InstancePublicIP) + } + if publicIP != "" { + labels[ecsLabelPublicIP] = model.LabelValue(publicIP) + } + + if task.PlatformFamily != nil { + labels[ecsLabelPlatformFamily] = model.LabelValue(*task.PlatformFamily) + } + if task.PlatformVersion != nil { + labels[ecsLabelPlatformVersion] = model.LabelValue(*task.PlatformVersion) + } + + labels[model.AddressLabel] = model.LabelValue(net.JoinHostPort(ipAddress, strconv.Itoa(d.cfg.Port))) + + // Add cluster tags + for _, clusterTag := range cluster.Tags { + if clusterTag.Key != nil && clusterTag.Value != nil { + labels[model.LabelName(ecsLabelTagCluster+strutil.SanitizeLabelName(*clusterTag.Key))] = model.LabelValue(*clusterTag.Value) + } + } + + // If this is not a standalone task, add service information and tags + if !isStandaloneTask(task) { + service, ok := services[getServiceNameFromTaskGroup(task)] + if !ok { + d.logger.Debug("Service not found for task", "task", *task.TaskArn, "service", getServiceNameFromTaskGroup(task)) + } + if service.ServiceName != nil { + labels[ecsLabelService] = model.LabelValue(*service.ServiceName) + } + if service.ServiceArn != nil { + labels[ecsLabelServiceARN] = model.LabelValue(*service.ServiceArn) + } + if service.Status != nil { + labels[ecsLabelServiceStatus] = model.LabelValue(*service.Status) + } + + // Add service tags + for _, serviceTag := range service.Tags { + if serviceTag.Key != nil && serviceTag.Value != nil { + labels[model.LabelName(ecsLabelTagService+strutil.SanitizeLabelName(*serviceTag.Key))] = model.LabelValue(*serviceTag.Value) + } + } + } + + // Add task tags 
+ for _, taskTag := range task.Tags { + if taskTag.Key != nil && taskTag.Value != nil { + labels[model.LabelName(ecsLabelTagTask+strutil.SanitizeLabelName(*taskTag.Key))] = model.LabelValue(*taskTag.Value) + } + } + + // Add EC2 instance tags (if running on EC2) + if ec2InstanceID != "" { + if info, ok := ec2Instances[ec2InstanceID]; ok { + for tagKey, tagValue := range info.tags { + labels[model.LabelName(ecsLabelTagEC2+strutil.SanitizeLabelName(tagKey))] = model.LabelValue(tagValue) + } + } + } + + taskMu.Lock() + taskTargets = append(taskTargets, labels) + taskMu.Unlock() + }(cluster, services, task, containerInstances, ec2Instances, eniToPublicIP) + } + + taskWg.Wait() + + // Add this cluster's task targets to the overall collection + clusterMu.Lock() + clusterTargets = append(clusterTargets, taskTargets...) + clusterMu.Unlock() + }(clusterMap[clusterARN], serviceMap[clusterARN], taskARNs) } - wg.Wait() + clusterWg.Wait() + + // Set all targets to the target group + tg.Targets = clusterTargets return []*targetgroup.Group{tg}, nil } + +func isStandaloneTask(task types.Task) bool { + // A standalone task will have a group of "family:task-def-name" + return task.Group != nil && strings.HasPrefix(*task.Group, "family:") +} + +func getServiceNameFromTaskGroup(task types.Task) string { + return strings.Split(*task.Group, ":")[1] +} diff --git a/discovery/aws/ecs_test.go b/discovery/aws/ecs_test.go index 1cb48b27fa..bb1f96a28e 100644 --- a/discovery/aws/ecs_test.go +++ b/discovery/aws/ecs_test.go @@ -214,7 +214,6 @@ func TestECSDiscoveryDescribeClusters(t *testing.T) { func TestECSDiscoveryListServiceARNs(t *testing.T) { ctx := context.Background() - // iterate through the test cases for _, tt := range []struct { name string ecsData *ecsDataStore @@ -225,33 +224,18 @@ func TestECSDiscoveryListServiceARNs(t *testing.T) { name: "SingleClusterWithServices", ecsData: &ecsDataStore{ region: "us-west-2", - clusters: []ecsTypes.Cluster{ - { - ClusterName: strptr("test-cluster"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("ACTIVE"), - }, - }, services: []ecsTypes.Service{ { ServiceName: strptr("web-service"), ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service"), ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), + Status: strptr("ACTIVE"), }, { ServiceName: strptr("api-service"), ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service"), ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - }, - { - // this is to test the old arn format without the cluster name in the service arn - // https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-arn-migration.html - ServiceName: strptr("old-api-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/old-api-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), + Status: strptr("ACTIVE"), }, }, }, @@ -260,70 +244,50 @@ func TestECSDiscoveryListServiceARNs(t *testing.T) { "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster": { "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service", "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service", - "arn:aws:ecs:us-west-2:123456789012:service/old-api-service", }, }, }, { - name: "MultipleClustesWithServices", + name: "MultipleClusters", ecsData: 
&ecsDataStore{ - region: "us-east-1", - clusters: []ecsTypes.Cluster{ - { - ClusterName: strptr("cluster-1"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1"), - Status: strptr("ACTIVE"), - }, - { - ClusterName: strptr("cluster-2"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2"), - Status: strptr("ACTIVE"), - }, - }, + region: "us-west-2", services: []ecsTypes.Service{ { - ServiceName: strptr("service-1"), - ServiceArn: strptr("arn:aws:ecs:us-east-1:123456789012:service/cluster-1/service-1"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1"), - Status: strptr("RUNNING"), + ServiceName: strptr("web-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/cluster-1/web-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/cluster-1"), + Status: strptr("ACTIVE"), }, { - ServiceName: strptr("service-2"), - ServiceArn: strptr("arn:aws:ecs:us-east-1:123456789012:service/cluster-2/service-2"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2"), - Status: strptr("RUNNING"), + ServiceName: strptr("api-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/cluster-2/api-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/cluster-2"), + Status: strptr("ACTIVE"), }, }, }, clusterARNs: []string{ - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1", - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2", + "arn:aws:ecs:us-west-2:123456789012:cluster/cluster-1", + "arn:aws:ecs:us-west-2:123456789012:cluster/cluster-2", }, expected: map[string][]string{ - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1": { - "arn:aws:ecs:us-east-1:123456789012:service/cluster-1/service-1", + "arn:aws:ecs:us-west-2:123456789012:cluster/cluster-1": { + "arn:aws:ecs:us-west-2:123456789012:service/cluster-1/web-service", }, - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2": { - "arn:aws:ecs:us-east-1:123456789012:service/cluster-2/service-2", + "arn:aws:ecs:us-west-2:123456789012:cluster/cluster-2": { + "arn:aws:ecs:us-west-2:123456789012:service/cluster-2/api-service", }, }, }, { - name: "ClusterWithNoServices", + name: "EmptyCluster", ecsData: &ecsDataStore{ - region: "us-west-2", - clusters: []ecsTypes.Cluster{ - { - ClusterName: strptr("empty-cluster"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/empty-cluster"), - Status: strptr("ACTIVE"), - }, - }, + region: "us-west-2", services: []ecsTypes.Service{}, }, - clusterARNs: []string{"arn:aws:ecs:us-west-2:123456789012:cluster/empty-cluster"}, + clusterARNs: []string{"arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"}, expected: map[string][]string{ - "arn:aws:ecs:us-west-2:123456789012:cluster/empty-cluster": nil, + "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster": nil, }, }, } { @@ -334,7 +298,7 @@ func TestECSDiscoveryListServiceARNs(t *testing.T) { ecs: client, cfg: &ECSSDConfig{ Region: tt.ecsData.region, - RequestConcurrency: 1, + RequestConcurrency: 2, }, } @@ -348,113 +312,178 @@ func TestECSDiscoveryListServiceARNs(t *testing.T) { func TestECSDiscoveryDescribeServices(t *testing.T) { ctx := context.Background() - // iterate through the test cases for _, tt := range []struct { - name string - ecsData *ecsDataStore - clusterServiceARNsMap map[string][]string - expected map[string][]ecsTypes.Service + name string + ecsData *ecsDataStore + clusterARN string + serviceARNs []string + expected map[string]ecsTypes.Service }{ 
{ - name: "SingleClusterServices", + name: "ServicesWithTags", ecsData: &ecsDataStore{ region: "us-west-2", services: []ecsTypes.Service{ { - ServiceName: strptr("web-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - TaskDefinition: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/web-task:1"), + ServiceName: strptr("web-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Status: strptr("ACTIVE"), Tags: []ecsTypes.Tag{ {Key: strptr("Environment"), Value: strptr("production")}, + {Key: strptr("Team"), Value: strptr("platform")}, }, }, { - ServiceName: strptr("api-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - TaskDefinition: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/api-task:2"), - }, - }, - }, - clusterServiceARNsMap: map[string][]string{ - "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster": { - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service", - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service", - }, - }, - expected: map[string][]ecsTypes.Service{ - "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster": { - { - ServiceName: strptr("web-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - TaskDefinition: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/web-task:1"), + ServiceName: strptr("api-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Status: strptr("ACTIVE"), Tags: []ecsTypes.Tag{ - {Key: strptr("Environment"), Value: strptr("production")}, + {Key: strptr("Environment"), Value: strptr("staging")}, }, }, - { - ServiceName: strptr("api-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - TaskDefinition: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/api-task:2"), + }, + }, + clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", + serviceARNs: []string{ + "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service", + "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service", + }, + expected: map[string]ecsTypes.Service{ + "web-service": { + ServiceName: strptr("web-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Status: strptr("ACTIVE"), + Tags: []ecsTypes.Tag{ + {Key: strptr("Environment"), Value: strptr("production")}, + {Key: strptr("Team"), Value: strptr("platform")}, + }, + }, + "api-service": { + ServiceName: strptr("api-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service"), + ClusterArn: 
strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Status: strptr("ACTIVE"), + Tags: []ecsTypes.Tag{ + {Key: strptr("Environment"), Value: strptr("staging")}, }, }, }, }, { - name: "MultipleClustersServices", + name: "EmptyServiceList", ecsData: &ecsDataStore{ - region: "us-east-1", - services: []ecsTypes.Service{ + region: "us-west-2", + services: []ecsTypes.Service{}, + }, + clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", + serviceARNs: []string{}, + expected: map[string]ecsTypes.Service{}, + }, + } { + t.Run(tt.name, func(t *testing.T) { + client := newMockECSClient(tt.ecsData) + + d := &ECSDiscovery{ + ecs: client, + cfg: &ECSSDConfig{ + Region: tt.ecsData.region, + RequestConcurrency: 2, + }, + } + + services, err := d.describeServices(ctx, tt.clusterARN, tt.serviceARNs) + require.NoError(t, err) + require.Equal(t, tt.expected, services) + }) + } +} + +func TestECSDiscoveryDescribeContainerInstances(t *testing.T) { + ctx := context.Background() + + for _, tt := range []struct { + name string + ecsData *ecsDataStore + clusterARN string + tasks []ecsTypes.Task + expected map[string]string + }{ + { + name: "EC2Tasks", + ecsData: &ecsDataStore{ + region: "us-west-2", + containerInstances: []ecsTypes.ContainerInstance{ { - ServiceName: strptr("service-1"), - ServiceArn: strptr("arn:aws:ecs:us-east-1:123456789012:service/cluster-1/service-1"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1"), - Status: strptr("RUNNING"), - TaskDefinition: strptr("arn:aws:ecs:us-east-1:123456789012:task-definition/task-1:1"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123"), + Ec2InstanceId: strptr("i-1234567890abcdef0"), }, { - ServiceName: strptr("service-2"), - ServiceArn: strptr("arn:aws:ecs:us-east-1:123456789012:service/cluster-2/service-2"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2"), - Status: strptr("DRAINING"), - TaskDefinition: strptr("arn:aws:ecs:us-east-1:123456789012:task-definition/task-2:1"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/xyz789"), + Ec2InstanceId: strptr("i-0987654321fedcba0"), }, }, }, - clusterServiceARNsMap: map[string][]string{ - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1": { - "arn:aws:ecs:us-east-1:123456789012:service/cluster-1/service-1", + clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123"), + LaunchType: ecsTypes.LaunchTypeEc2, }, - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2": { - "arn:aws:ecs:us-east-1:123456789012:service/cluster-2/service-2", + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-2"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/xyz789"), + LaunchType: ecsTypes.LaunchTypeEc2, }, }, - expected: map[string][]ecsTypes.Service{ - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1": { + expected: map[string]string{ + "arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123": "i-1234567890abcdef0", + "arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/xyz789": "i-0987654321fedcba0", + }, + }, + { + name: "FargateTasks", + ecsData: &ecsDataStore{ + region: "us-west-2", + 
containerInstances: []ecsTypes.ContainerInstance{}, + }, + clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1"), + LaunchType: ecsTypes.LaunchTypeFargate, + }, + }, + expected: map[string]string{}, + }, + { + name: "MixedTasks", + ecsData: &ecsDataStore{ + region: "us-west-2", + containerInstances: []ecsTypes.ContainerInstance{ { - ServiceName: strptr("service-1"), - ServiceArn: strptr("arn:aws:ecs:us-east-1:123456789012:service/cluster-1/service-1"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-1"), - Status: strptr("RUNNING"), - TaskDefinition: strptr("arn:aws:ecs:us-east-1:123456789012:task-definition/task-1:1"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123"), + Ec2InstanceId: strptr("i-1234567890abcdef0"), }, }, - "arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2": { - { - ServiceName: strptr("service-2"), - ServiceArn: strptr("arn:aws:ecs:us-east-1:123456789012:service/cluster-2/service-2"), - ClusterArn: strptr("arn:aws:ecs:us-east-1:123456789012:cluster/cluster-2"), - Status: strptr("DRAINING"), - TaskDefinition: strptr("arn:aws:ecs:us-east-1:123456789012:task-definition/task-2:1"), - }, + }, + clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-ec2"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123"), + LaunchType: ecsTypes.LaunchTypeEc2, }, + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-fargate"), + LaunchType: ecsTypes.LaunchTypeFargate, + }, + }, + expected: map[string]string{ + "arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123": "i-1234567890abcdef0", }, }, } { @@ -465,13 +494,267 @@ func TestECSDiscoveryDescribeServices(t *testing.T) { ecs: client, cfg: &ECSSDConfig{ Region: tt.ecsData.region, - RequestConcurrency: 1, + RequestConcurrency: 2, }, } - serviceMap, err := d.describeServices(ctx, tt.clusterServiceARNsMap) + containerInstances, err := d.describeContainerInstances(ctx, tt.clusterARN, tt.tasks) require.NoError(t, err) - require.Equal(t, tt.expected, serviceMap) + require.Equal(t, tt.expected, containerInstances) + }) + } +} + +func TestECSDiscoveryDescribeEC2Instances(t *testing.T) { + ctx := context.Background() + + for _, tt := range []struct { + name string + ecsData *ecsDataStore + instanceIDs []string + expected map[string]ec2InstanceInfo + }{ + { + name: "InstancesWithTags", + ecsData: &ecsDataStore{ + region: "us-west-2", + ec2Instances: map[string]ec2InstanceInfo{ + "i-1234567890abcdef0": { + privateIP: "10.0.1.50", + publicIP: "54.1.2.3", + subnetID: "subnet-12345", + instanceType: "t3.medium", + tags: map[string]string{ + "Name": "ecs-host-1", + "Environment": "production", + }, + }, + "i-0987654321fedcba0": { + privateIP: "10.0.1.75", + publicIP: "54.2.3.4", + subnetID: "subnet-67890", + instanceType: "t3.large", + tags: map[string]string{ + "Name": "ecs-host-2", + "Team": "platform", + }, + }, + }, + }, + instanceIDs: []string{"i-1234567890abcdef0", "i-0987654321fedcba0"}, + expected: map[string]ec2InstanceInfo{ + "i-1234567890abcdef0": { + privateIP: "10.0.1.50", + publicIP: "54.1.2.3", + subnetID: "subnet-12345", + instanceType: "t3.medium", + tags: map[string]string{ + "Name": "ecs-host-1", + 
"Environment": "production", + }, + }, + "i-0987654321fedcba0": { + privateIP: "10.0.1.75", + publicIP: "54.2.3.4", + subnetID: "subnet-67890", + instanceType: "t3.large", + tags: map[string]string{ + "Name": "ecs-host-2", + "Team": "platform", + }, + }, + }, + }, + { + name: "EmptyList", + ecsData: &ecsDataStore{ + region: "us-west-2", + ec2Instances: map[string]ec2InstanceInfo{}, + }, + instanceIDs: []string{}, + expected: map[string]ec2InstanceInfo{}, + }, + { + name: "InstanceWithoutPublicIP", + ecsData: &ecsDataStore{ + region: "us-west-2", + ec2Instances: map[string]ec2InstanceInfo{ + "i-privateonly": { + privateIP: "10.0.1.100", + publicIP: "", + subnetID: "subnet-private", + instanceType: "t3.micro", + tags: map[string]string{}, + }, + }, + }, + instanceIDs: []string{"i-privateonly"}, + expected: map[string]ec2InstanceInfo{ + "i-privateonly": { + privateIP: "10.0.1.100", + publicIP: "", + subnetID: "subnet-private", + instanceType: "t3.micro", + tags: map[string]string{}, + }, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + ec2Client := newMockECSEC2Client(tt.ecsData.ec2Instances, nil) + + d := &ECSDiscovery{ + ec2: ec2Client, + cfg: &ECSSDConfig{ + Region: tt.ecsData.region, + RequestConcurrency: 2, + }, + } + + instances, err := d.describeEC2Instances(ctx, tt.instanceIDs) + require.NoError(t, err) + require.Equal(t, tt.expected, instances) + }) + } +} + +func TestECSDiscoveryDescribeNetworkInterfaces(t *testing.T) { + ctx := context.Background() + + for _, tt := range []struct { + name string + ecsData *ecsDataStore + tasks []ecsTypes.Task + expected map[string]string + }{ + { + name: "AwsvpcTasksWithPublicIPs", + ecsData: &ecsDataStore{ + region: "us-west-2", + eniPublicIPs: map[string]string{ + "eni-12345": "52.1.2.3", + "eni-67890": "52.2.3.4", + }, + }, + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1"), + LaunchType: ecsTypes.LaunchTypeFargate, + Attachments: []ecsTypes.Attachment{ + { + Type: strptr("ElasticNetworkInterface"), + Details: []ecsTypes.KeyValuePair{ + {Name: strptr("networkInterfaceId"), Value: strptr("eni-12345")}, + {Name: strptr("privateIPv4Address"), Value: strptr("10.0.1.100")}, + }, + }, + }, + }, + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-2"), + LaunchType: ecsTypes.LaunchTypeFargate, + Attachments: []ecsTypes.Attachment{ + { + Type: strptr("ElasticNetworkInterface"), + Details: []ecsTypes.KeyValuePair{ + {Name: strptr("networkInterfaceId"), Value: strptr("eni-67890")}, + {Name: strptr("privateIPv4Address"), Value: strptr("10.0.1.200")}, + }, + }, + }, + }, + }, + expected: map[string]string{ + "eni-12345": "52.1.2.3", + "eni-67890": "52.2.3.4", + }, + }, + { + name: "AwsvpcTasksWithoutPublicIPs", + ecsData: &ecsDataStore{ + region: "us-west-2", + eniPublicIPs: map[string]string{}, + }, + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1"), + LaunchType: ecsTypes.LaunchTypeFargate, + Attachments: []ecsTypes.Attachment{ + { + Type: strptr("ElasticNetworkInterface"), + Details: []ecsTypes.KeyValuePair{ + {Name: strptr("networkInterfaceId"), Value: strptr("eni-private")}, + {Name: strptr("privateIPv4Address"), Value: strptr("10.0.1.100")}, + }, + }, + }, + }, + }, + expected: map[string]string{}, + }, + { + name: "BridgeTasksNoENI", + ecsData: &ecsDataStore{ + region: "us-west-2", + eniPublicIPs: map[string]string{}, + }, + tasks: []ecsTypes.Task{ + { + TaskArn: 
strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1"), + LaunchType: ecsTypes.LaunchTypeEc2, + // No ENI attachment for bridge networking + Attachments: []ecsTypes.Attachment{}, + }, + }, + expected: map[string]string{}, + }, + { + name: "MixedTasks", + ecsData: &ecsDataStore{ + region: "us-west-2", + eniPublicIPs: map[string]string{ + "eni-fargate": "52.1.2.3", + }, + }, + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-fargate"), + LaunchType: ecsTypes.LaunchTypeFargate, + Attachments: []ecsTypes.Attachment{ + { + Type: strptr("ElasticNetworkInterface"), + Details: []ecsTypes.KeyValuePair{ + {Name: strptr("networkInterfaceId"), Value: strptr("eni-fargate")}, + {Name: strptr("privateIPv4Address"), Value: strptr("10.0.1.100")}, + }, + }, + }, + }, + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-bridge"), + LaunchType: ecsTypes.LaunchTypeEc2, + Attachments: []ecsTypes.Attachment{}, + }, + }, + expected: map[string]string{ + "eni-fargate": "52.1.2.3", + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + ec2Client := newMockECSEC2Client(nil, tt.ecsData.eniPublicIPs) + + d := &ECSDiscovery{ + ec2: ec2Client, + cfg: &ECSSDConfig{ + Region: tt.ecsData.region, + RequestConcurrency: 2, + }, + } + + eniMap, err := d.describeNetworkInterfaces(ctx, tt.tasks) + require.NoError(t, err) + require.Equal(t, tt.expected, eniMap) }) } } @@ -481,13 +764,13 @@ func TestECSDiscoveryListTaskARNs(t *testing.T) { // iterate through the test cases for _, tt := range []struct { - name string - ecsData *ecsDataStore - services []ecsTypes.Service - expected map[string][]string + name string + ecsData *ecsDataStore + clusterARNs []string + expected map[string][]string }{ { - name: "ServicesWithTasks", + name: "TasksInCluster", ecsData: &ecsDataStore{ region: "us-west-2", tasks: []ecsTypes.Task{ @@ -511,46 +794,24 @@ func TestECSDiscoveryListTaskARNs(t *testing.T) { }, }, }, - services: []ecsTypes.Service{ - { - ServiceName: strptr("web-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - }, - { - ServiceName: strptr("api-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - }, - }, + clusterARNs: []string{"arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"}, expected: map[string][]string{ - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service": { + "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster": { "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1", "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-2", - }, - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service": { "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-3", }, }, }, { - name: "ServiceWithNoTasks", + name: "EmptyCluster", ecsData: &ecsDataStore{ region: "us-west-2", tasks: []ecsTypes.Task{}, }, - services: []ecsTypes.Service{ - { - ServiceName: strptr("empty-service"), - ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/empty-service"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Status: strptr("RUNNING"), - }, - }, + clusterARNs: 
[]string{"arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"}, expected: map[string][]string{ - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/empty-service": nil, + "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster": nil, }, }, } { @@ -565,7 +826,7 @@ func TestECSDiscoveryListTaskARNs(t *testing.T) { }, } - taskMap, err := d.listTaskARNs(ctx, tt.services) + taskMap, err := d.listTaskARNs(ctx, tt.clusterARNs) require.NoError(t, err) require.Equal(t, tt.expected, taskMap) }) @@ -577,11 +838,11 @@ func TestECSDiscoveryDescribeTasks(t *testing.T) { // iterate through the test cases for _, tt := range []struct { - name string - ecsData *ecsDataStore - clusterARN string - taskARNsMap map[string][]string - expected map[string][]ecsTypes.Task + name string + ecsData *ecsDataStore + clusterARN string + taskARNs []string + expected []ecsTypes.Task }{ { name: "TasksInCluster", @@ -608,47 +869,39 @@ func TestECSDiscoveryDescribeTasks(t *testing.T) { }, }, clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", - taskARNsMap: map[string][]string{ - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service": { - "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1", - }, - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service": { - "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-2", - }, + taskARNs: []string{ + "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1", + "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-2", }, - expected: map[string][]ecsTypes.Task{ - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/web-service": { - { - TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Group: strptr("service:web-service"), - TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/web-task:1"), - LastStatus: strptr("RUNNING"), - Tags: []ecsTypes.Tag{ - {Key: strptr("Environment"), Value: strptr("production")}, - }, + expected: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-1"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Group: strptr("service:web-service"), + TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/web-task:1"), + LastStatus: strptr("RUNNING"), + Tags: []ecsTypes.Tag{ + {Key: strptr("Environment"), Value: strptr("production")}, }, }, - "arn:aws:ecs:us-west-2:123456789012:service/test-cluster/api-service": { - { - TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-2"), - ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), - Group: strptr("service:api-service"), - TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/api-task:2"), - LastStatus: strptr("RUNNING"), - }, + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-2"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Group: strptr("service:api-service"), + TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/api-task:2"), + LastStatus: strptr("RUNNING"), }, }, }, { - name: "EmptyTaskARNsMap", + name: "EmptyTaskList", ecsData: &ecsDataStore{ region: "us-west-2", tasks: []ecsTypes.Task{}, }, - clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", - taskARNsMap: map[string][]string{}, - expected: 
map[string][]ecsTypes.Task{}, + clusterARN: "arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster", + taskARNs: []string{}, + expected: nil, }, } { t.Run(tt.name, func(t *testing.T) { @@ -662,9 +915,9 @@ func TestECSDiscoveryDescribeTasks(t *testing.T) { }, } - taskMap, err := d.describeTasks(ctx, tt.clusterARN, tt.taskARNsMap) + tasks, err := d.describeTasks(ctx, tt.clusterARN, tt.taskARNs) require.NoError(t, err) - require.Equal(t, tt.expected, taskMap) + require.Equal(t, tt.expected, tasks) }) } } @@ -836,6 +1089,75 @@ func TestECSDiscoveryRefresh(t *testing.T) { }, }, }, + { + name: "StandaloneTaskNoService", + ecsData: &ecsDataStore{ + region: "us-west-2", + clusters: []ecsTypes.Cluster{ + { + ClusterName: strptr("standalone-cluster"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/standalone-cluster"), + Status: strptr("ACTIVE"), + }, + }, + services: []ecsTypes.Service{}, + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/standalone-cluster/task-standalone"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/standalone-cluster"), + TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/standalone-task:1"), + Group: strptr("family:standalone-task"), + LaunchType: ecsTypes.LaunchTypeFargate, + LastStatus: strptr("RUNNING"), + DesiredStatus: strptr("RUNNING"), + HealthStatus: ecsTypes.HealthStatusHealthy, + AvailabilityZone: strptr("us-west-2a"), + Attachments: []ecsTypes.Attachment{ + { + Type: strptr("ElasticNetworkInterface"), + Details: []ecsTypes.KeyValuePair{ + {Name: strptr("subnetId"), Value: strptr("subnet-standalone-1")}, + {Name: strptr("privateIPv4Address"), Value: strptr("10.0.4.10")}, + {Name: strptr("networkInterfaceId"), Value: strptr("eni-standalone-123")}, + }, + }, + }, + Tags: []ecsTypes.Tag{ + {Key: strptr("Role"), Value: strptr("batch")}, + }, + }, + }, + eniPublicIPs: map[string]string{ + "eni-standalone-123": "52.4.5.6", + }, + }, + expected: []*targetgroup.Group{ + { + Source: "us-west-2", + Targets: []model.LabelSet{ + { + model.AddressLabel: model.LabelValue("10.0.4.10:80"), + "__meta_ecs_cluster": model.LabelValue("standalone-cluster"), + "__meta_ecs_cluster_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:cluster/standalone-cluster"), + "__meta_ecs_task_group": model.LabelValue("family:standalone-task"), + "__meta_ecs_task_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task/standalone-cluster/task-standalone"), + "__meta_ecs_task_definition": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task-definition/standalone-task:1"), + "__meta_ecs_region": model.LabelValue("us-west-2"), + "__meta_ecs_availability_zone": model.LabelValue("us-west-2a"), + "__meta_ecs_subnet_id": model.LabelValue("subnet-standalone-1"), + "__meta_ecs_ip_address": model.LabelValue("10.0.4.10"), + "__meta_ecs_launch_type": model.LabelValue("FARGATE"), + "__meta_ecs_desired_status": model.LabelValue("RUNNING"), + "__meta_ecs_last_status": model.LabelValue("RUNNING"), + "__meta_ecs_health_status": model.LabelValue("HEALTHY"), + "__meta_ecs_network_mode": model.LabelValue("awsvpc"), + "__meta_ecs_public_ip": model.LabelValue("52.4.5.6"), + "__meta_ecs_tag_task_Role": model.LabelValue("batch"), + }, + }, + }, + }, + }, { name: "TaskWithBridgeNetworking", ecsData: &ecsDataStore{ @@ -1184,7 +1506,14 @@ func TestECSDiscoveryRefresh(t *testing.T) { groups, err := d.refresh(ctx) require.NoError(t, err) - require.Equal(t, tt.expected, groups) + if tt.name == 
"MixedNetworkingModes" { + // Use ElementsMatch for tests with multiple tasks as goroutines can affect order + require.Len(t, groups, len(tt.expected)) + require.Equal(t, tt.expected[0].Source, groups[0].Source) + require.ElementsMatch(t, tt.expected[0].Targets, groups[0].Targets) + } else { + require.Equal(t, tt.expected, groups) + } }) } } @@ -1381,3 +1710,98 @@ func (m *mockECSEC2Client) DescribeNetworkInterfaces(_ context.Context, input *e NetworkInterfaces: networkInterfaces, }, nil } + +func TestIsStandaloneTask(t *testing.T) { + tests := []struct { + name string + task ecsTypes.Task + expected bool + }{ + { + name: "StandaloneTask", + task: ecsTypes.Task{ + Group: strptr("family:my-task-definition"), + }, + expected: true, + }, + { + name: "ServiceTask", + task: ecsTypes.Task{ + Group: strptr("service:my-service"), + }, + expected: false, + }, + { + name: "ServiceTaskWithColon", + task: ecsTypes.Task{ + Group: strptr("service:my:service:name"), + }, + expected: false, + }, + { + name: "NilGroup", + task: ecsTypes.Task{ + Group: nil, + }, + expected: false, + }, + { + name: "EmptyGroup", + task: ecsTypes.Task{ + Group: strptr(""), + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isStandaloneTask(tt.task) + require.Equal(t, tt.expected, result) + }) + } +} + +func TestGetServiceNameFromTaskGroup(t *testing.T) { + tests := []struct { + name string + task ecsTypes.Task + expected string + }{ + { + name: "SimpleServiceName", + task: ecsTypes.Task{ + Group: strptr("service:my-service"), + }, + expected: "my-service", + }, + { + name: "ServiceNameWithHyphens", + task: ecsTypes.Task{ + Group: strptr("service:web-api-service"), + }, + expected: "web-api-service", + }, + { + name: "ServiceNameWithColons", + task: ecsTypes.Task{ + Group: strptr("service:my:service:name"), + }, + expected: "my", + }, + { + name: "FamilyGroup", + task: ecsTypes.Task{ + Group: strptr("family:my-task-def"), + }, + expected: "my-task-def", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := getServiceNameFromTaskGroup(tt.task) + require.Equal(t, tt.expected, result) + }) + } +} diff --git a/discovery/aws/lightsail.go b/discovery/aws/lightsail.go index b13f26cc5f..69a5b6625f 100644 --- a/discovery/aws/lightsail.go +++ b/discovery/aws/lightsail.go @@ -26,7 +26,6 @@ import ( awsConfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/credentials" "github.com/aws/aws-sdk-go-v2/credentials/stscreds" - "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" "github.com/aws/aws-sdk-go-v2/service/lightsail" "github.com/aws/aws-sdk-go-v2/service/sts" "github.com/aws/smithy-go" @@ -106,30 +105,9 @@ func (c *LightsailSDConfig) UnmarshalYAML(unmarshal func(any) error) error { return err } - if c.Region == "" { - cfg, err := awsConfig.LoadDefaultConfig(context.Background()) - if err != nil { - return err - } - - if cfg.Region != "" { - // Use the region from the AWS config. It will load environment variables and shared config files. - c.Region = cfg.Region - } - - if c.Region == "" { - // Try to get the region from the instance metadata service (IMDS). 
- imdsClient := imds.NewFromConfig(cfg) - region, err := imdsClient.GetRegion(context.Background(), &imds.GetRegionInput{}) - if err != nil { - return err - } - c.Region = region.Region - } - } - - if c.Region == "" { - return errors.New("lightsail SD configuration requires a region") + c.Region, err = loadRegion(context.Background(), c.Region) + if err != nil { + return fmt.Errorf("could not determine AWS region: %w", err) } return c.HTTPClientConfig.Validate() diff --git a/discovery/aws/metrics_msk.go b/discovery/aws/metrics_msk.go new file mode 100644 index 0000000000..fc69f57aa1 --- /dev/null +++ b/discovery/aws/metrics_msk.go @@ -0,0 +1,32 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package aws + +import ( + "github.com/prometheus/prometheus/discovery" +) + +type mskMetrics struct { + refreshMetrics discovery.RefreshMetricsInstantiator +} + +var _ discovery.DiscovererMetrics = (*mskMetrics)(nil) + +// Register implements discovery.DiscovererMetrics. +func (*mskMetrics) Register() error { + return nil +} + +// Unregister implements discovery.DiscovererMetrics. +func (*mskMetrics) Unregister() {} diff --git a/discovery/aws/msk.go b/discovery/aws/msk.go new file mode 100644 index 0000000000..3ecc1e6235 --- /dev/null +++ b/discovery/aws/msk.go @@ -0,0 +1,451 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package aws + +import ( + "context" + "errors" + "fmt" + "log/slog" + "net" + "strconv" + "sync" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + awsConfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/credentials/stscreds" + "github.com/aws/aws-sdk-go-v2/service/kafka" + "github.com/aws/aws-sdk-go-v2/service/kafka/types" + "github.com/aws/aws-sdk-go-v2/service/sts" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/config" + "github.com/prometheus/common/model" + "github.com/prometheus/common/promslog" + "golang.org/x/sync/errgroup" + + "github.com/prometheus/prometheus/discovery" + "github.com/prometheus/prometheus/discovery/refresh" + "github.com/prometheus/prometheus/discovery/targetgroup" + "github.com/prometheus/prometheus/util/strutil" +) + +type NodeType string + +const ( + NodeTypeBroker NodeType = "BROKER" + NodeTypeController NodeType = "CONTROLLER" +) + +const ( + mskLabel = model.MetaLabelPrefix + "msk_" + + // Cluster labels. 
+	mskLabelCluster                      = mskLabel + "cluster_"
+	mskLabelClusterName                  = mskLabelCluster + "name"
+	mskLabelClusterARN                   = mskLabelCluster + "arn"
+	mskLabelClusterState                 = mskLabelCluster + "state"
+	mskLabelClusterType                  = mskLabelCluster + "type"
+	mskLabelClusterVersion               = mskLabelCluster + "version"
+	mskLabelClusterJmxExporterEnabled    = mskLabelCluster + "jmx_exporter_enabled"
+	mskLabelClusterConfigurationARN      = mskLabelCluster + "configuration_arn"
+	mskLabelClusterConfigurationRevision = mskLabelCluster + "configuration_revision"
+	mskLabelClusterKafkaVersion          = mskLabelCluster + "kafka_version"
+	mskLabelClusterTags                  = mskLabelCluster + "tag_"
+
+	// Node labels.
+	mskLabelNode             = mskLabel + "node_"
+	mskLabelNodeType         = mskLabelNode + "type"
+	mskLabelNodeARN          = mskLabelNode + "arn"
+	mskLabelNodeAddedTime    = mskLabelNode + "added_time"
+	mskLabelNodeInstanceType = mskLabelNode + "instance_type"
+	mskLabelNodeAttachedENI  = mskLabelNode + "attached_eni"
+
+	// Broker labels.
+	mskLabelBroker                    = mskLabel + "broker_"
+	mskLabelBrokerEndpointIndex       = mskLabelBroker + "endpoint_index"
+	mskLabelBrokerID                  = mskLabelBroker + "id"
+	mskLabelBrokerClientSubnet        = mskLabelBroker + "client_subnet"
+	mskLabelBrokerClientVPCIP         = mskLabelBroker + "client_vpc_ip"
+	mskLabelBrokerNodeExporterEnabled = mskLabelBroker + "node_exporter_enabled"
+
+	// Controller labels.
+	mskLabelController              = mskLabel + "controller_"
+	mskLabelControllerEndpointIndex = mskLabelController + "endpoint_index"
+)
+
+// DefaultMSKSDConfig is the default MSK SD configuration.
+var DefaultMSKSDConfig = MSKSDConfig{
+	Port:               80,
+	RefreshInterval:    model.Duration(60 * time.Second),
+	RequestConcurrency: 10,
+	HTTPClientConfig:   config.DefaultHTTPClientConfig,
+}
+
+func init() {
+	discovery.RegisterConfig(&MSKSDConfig{})
+}
+
+// MSKSDConfig is the configuration for MSK based service discovery.
+type MSKSDConfig struct {
+	Region          string          `yaml:"region"`
+	Endpoint        string          `yaml:"endpoint"`
+	AccessKey       string          `yaml:"access_key,omitempty"`
+	SecretKey       config.Secret   `yaml:"secret_key,omitempty"`
+	Profile         string          `yaml:"profile,omitempty"`
+	RoleARN         string          `yaml:"role_arn,omitempty"`
+	Clusters        []string        `yaml:"clusters,omitempty"`
+	Port            int             `yaml:"port"`
+	RefreshInterval model.Duration  `yaml:"refresh_interval,omitempty"`
+
+	RequestConcurrency int                     `yaml:"request_concurrency,omitempty"`
+	HTTPClientConfig   config.HTTPClientConfig `yaml:",inline"`
+}
+
+// NewDiscovererMetrics implements discovery.Config.
+func (*MSKSDConfig) NewDiscovererMetrics(_ prometheus.Registerer, rmi discovery.RefreshMetricsInstantiator) discovery.DiscovererMetrics {
+	return &mskMetrics{
+		refreshMetrics: rmi,
+	}
+}
+
+// Name returns the name of the MSK Config.
+func (*MSKSDConfig) Name() string { return "msk" }
+
+// NewDiscoverer returns a Discoverer for the MSK Config.
+func (c *MSKSDConfig) NewDiscoverer(opts discovery.DiscovererOptions) (discovery.Discoverer, error) {
+	return NewMSKDiscovery(c, opts)
+}
+
+// UnmarshalYAML implements the yaml.Unmarshaler interface for the MSK Config.
+func (c *MSKSDConfig) UnmarshalYAML(unmarshal func(any) error) error {
+	*c = DefaultMSKSDConfig
+	type plain MSKSDConfig
+	err := unmarshal((*plain)(c))
+	if err != nil {
+		return err
+	}
+
+	c.Region, err = loadRegion(context.Background(), c.Region)
+	if err != nil {
+		return fmt.Errorf("could not determine AWS region: %w", err)
+	}
+
+	return c.HTTPClientConfig.Validate()
+}
+
+type mskClient interface {
+	DescribeClusterV2(context.Context, *kafka.DescribeClusterV2Input, ...func(*kafka.Options)) (*kafka.DescribeClusterV2Output, error)
+	ListClustersV2(context.Context, *kafka.ListClustersV2Input, ...func(*kafka.Options)) (*kafka.ListClustersV2Output, error)
+	ListNodes(context.Context, *kafka.ListNodesInput, ...func(*kafka.Options)) (*kafka.ListNodesOutput, error)
+}
+
+// MSKDiscovery periodically performs MSK-SD requests. It implements
+// the Discoverer interface.
+type MSKDiscovery struct {
+	*refresh.Discovery
+	logger *slog.Logger
+	cfg    *MSKSDConfig
+	msk    mskClient
+}
+
+// NewMSKDiscovery returns a new MSKDiscovery which periodically refreshes its targets.
+func NewMSKDiscovery(conf *MSKSDConfig, opts discovery.DiscovererOptions) (*MSKDiscovery, error) {
+	m, ok := opts.Metrics.(*mskMetrics)
+	if !ok {
+		return nil, errors.New("invalid discovery metrics type")
+	}
+
+	if opts.Logger == nil {
+		opts.Logger = promslog.NewNopLogger()
+	}
+	d := &MSKDiscovery{
+		logger: opts.Logger,
+		cfg:    conf,
+	}
+	d.Discovery = refresh.NewDiscovery(
+		refresh.Options{
+			Logger:              opts.Logger,
+			Mech:                "msk",
+			Interval:            time.Duration(d.cfg.RefreshInterval),
+			RefreshF:            d.refresh,
+			MetricsInstantiator: m.refreshMetrics,
+		},
+	)
+	return d, nil
+}
+
+func (d *MSKDiscovery) initMskClient(ctx context.Context) error {
+	if d.msk != nil {
+		return nil
+	}
+
+	if d.cfg.Region == "" {
+		return errors.New("region must be set for MSK service discovery")
+	}
+
+	// Build the HTTP client from the provided HTTPClientConfig.
+	client, err := config.NewClientFromConfig(d.cfg.HTTPClientConfig, "msk_sd")
+	if err != nil {
+		return err
+	}
+
+	// Build the AWS config with the provided region.
+	var configOptions []func(*awsConfig.LoadOptions) error
+	configOptions = append(configOptions, awsConfig.WithRegion(d.cfg.Region))
+	configOptions = append(configOptions, awsConfig.WithHTTPClient(client))
+
+	// Only set static credentials if both access key and secret key are provided
+	// Otherwise, let AWS SDK use its default credential chain
+	if d.cfg.AccessKey != "" && d.cfg.SecretKey != "" {
+		credProvider := credentials.NewStaticCredentialsProvider(d.cfg.AccessKey, string(d.cfg.SecretKey), "")
+		configOptions = append(configOptions, awsConfig.WithCredentialsProvider(credProvider))
+	}
+
+	if d.cfg.Profile != "" {
+		configOptions = append(configOptions, awsConfig.WithSharedConfigProfile(d.cfg.Profile))
+	}
+
+	cfg, err := awsConfig.LoadDefaultConfig(ctx, configOptions...)
+	if err != nil {
+		d.logger.Error("Failed to create AWS config", "error", err)
+		return fmt.Errorf("could not create aws config: %w", err)
+	}
+
+	// If the role ARN is set, assume the role to get credentials and set the credentials provider in the config.
+ if d.cfg.RoleARN != "" { + assumeProvider := stscreds.NewAssumeRoleProvider(sts.NewFromConfig(cfg), d.cfg.RoleARN) + cfg.Credentials = aws.NewCredentialsCache(assumeProvider) + } + + d.msk = kafka.NewFromConfig(cfg, func(options *kafka.Options) { + if d.cfg.Endpoint != "" { + options.BaseEndpoint = &d.cfg.Endpoint + } + options.HTTPClient = client + }) + + // Test credentials by making a simple API call + testCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + _, err = d.msk.ListClustersV2(testCtx, &kafka.ListClustersV2Input{}) + if err != nil { + d.logger.Error("Failed to test MSK credentials", "error", err) + return fmt.Errorf("MSK credential test failed: %w", err) + } + + return nil +} + +// describeClusters describes the clusters with the given ARNs and returns their details. +func (d *MSKDiscovery) describeClusters(ctx context.Context, clusterARNs []string) ([]types.Cluster, error) { + var ( + clusters []types.Cluster + mu sync.Mutex + ) + errg, ectx := errgroup.WithContext(ctx) + errg.SetLimit(d.cfg.RequestConcurrency) + for _, clusterARN := range clusterARNs { + errg.Go(func() error { + cluster, err := d.msk.DescribeClusterV2(ectx, &kafka.DescribeClusterV2Input{ + ClusterArn: aws.String(clusterARN), + }) + if err != nil { + return fmt.Errorf("could not describe cluster %v: %w", clusterARN, err) + } + mu.Lock() + clusters = append(clusters, *cluster.ClusterInfo) + mu.Unlock() + return nil + }) + } + + return clusters, errg.Wait() +} + +// listClusters lists all MSK clusters in the configured region and returns their details. +func (d *MSKDiscovery) listClusters(ctx context.Context) ([]types.Cluster, error) { + var ( + clusters []types.Cluster + nextToken *string + ) + for { + listClustersInput := kafka.ListClustersV2Input{ + ClusterTypeFilter: aws.String("PROVISIONED"), + MaxResults: aws.Int32(100), + NextToken: nextToken, + } + + resp, err := d.msk.ListClustersV2(ctx, &listClustersInput) + if err != nil { + return nil, fmt.Errorf("could not list clusters: %w", err) + } + + clusters = append(clusters, resp.ClusterInfoList...) + if resp.NextToken == nil { + break + } + nextToken = resp.NextToken + } + + return clusters, nil +} + +// listNodes lists all nodes for the given clusters and returns a map of cluster ARN to its nodes. +func (d *MSKDiscovery) listNodes(ctx context.Context, clusters []types.Cluster) (map[string][]types.NodeInfo, error) { + clusterNodeMap := make(map[string][]types.NodeInfo) + mu := sync.Mutex{} + errg, ectx := errgroup.WithContext(ctx) + errg.SetLimit(d.cfg.RequestConcurrency) + for _, cluster := range clusters { + clusterARN := aws.ToString(cluster.ClusterArn) + errg.Go(func() error { + var clusterNodes []types.NodeInfo + var nextToken *string + for { + resp, err := d.msk.ListNodes(ectx, &kafka.ListNodesInput{ + ClusterArn: aws.String(clusterARN), + MaxResults: aws.Int32(100), + NextToken: nextToken, + }) + if err != nil { + return fmt.Errorf("could not list nodes for cluster %v: %w", clusterARN, err) + } + + clusterNodes = append(clusterNodes, resp.NodeInfoList...) 
+ if resp.NextToken == nil { + break + } + nextToken = resp.NextToken + } + + mu.Lock() + clusterNodeMap[clusterARN] = clusterNodes + mu.Unlock() + return nil + }) + } + + return clusterNodeMap, errg.Wait() +} + +func (d *MSKDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { + err := d.initMskClient(ctx) + if err != nil { + return nil, err + } + + tg := &targetgroup.Group{ + Source: d.cfg.Region, + } + + var clusters []types.Cluster + if len(d.cfg.Clusters) > 0 { + clusters, err = d.describeClusters(ctx, d.cfg.Clusters) + if err != nil { + return nil, err + } + } else { + clusters, err = d.listClusters(ctx) + if err != nil { + return nil, err + } + } + + clusterNodeMap, err := d.listNodes(ctx, clusters) + if err != nil { + return nil, err + } + + var ( + targetsMu sync.Mutex + wg sync.WaitGroup + ) + for _, cluster := range clusters { + wg.Add(1) + + go func(cluster types.Cluster, nodes []types.NodeInfo) { + defer wg.Done() + for _, node := range nodes { + labels := model.LabelSet{ + mskLabelClusterName: model.LabelValue(aws.ToString(cluster.ClusterName)), + mskLabelClusterARN: model.LabelValue(aws.ToString(cluster.ClusterArn)), + mskLabelClusterState: model.LabelValue(string(cluster.State)), + mskLabelClusterType: model.LabelValue(string(cluster.ClusterType)), + mskLabelClusterVersion: model.LabelValue(aws.ToString(cluster.CurrentVersion)), + mskLabelNodeARN: model.LabelValue(aws.ToString(node.NodeARN)), + mskLabelNodeAddedTime: model.LabelValue(aws.ToString(node.AddedToClusterTime)), + mskLabelNodeInstanceType: model.LabelValue(aws.ToString(node.InstanceType)), + mskLabelClusterJmxExporterEnabled: model.LabelValue(strconv.FormatBool(*cluster.Provisioned.OpenMonitoring.Prometheus.JmxExporter.EnabledInBroker)), + mskLabelClusterConfigurationARN: model.LabelValue(aws.ToString(cluster.Provisioned.CurrentBrokerSoftwareInfo.ConfigurationArn)), + mskLabelClusterConfigurationRevision: model.LabelValue(strconv.FormatInt(*cluster.Provisioned.CurrentBrokerSoftwareInfo.ConfigurationRevision, 10)), + mskLabelClusterKafkaVersion: model.LabelValue(aws.ToString(cluster.Provisioned.CurrentBrokerSoftwareInfo.KafkaVersion)), + } + + for key, value := range cluster.Tags { + labels[model.LabelName(mskLabelClusterTags+strutil.SanitizeLabelName(key))] = model.LabelValue(value) + } + + switch nodeType(node) { + case NodeTypeBroker: + labels[mskLabelNodeType] = model.LabelValue(NodeTypeBroker) + labels[mskLabelNodeAttachedENI] = model.LabelValue(aws.ToString(node.BrokerNodeInfo.AttachedENIId)) + labels[mskLabelBrokerID] = model.LabelValue(fmt.Sprintf("%.0f", aws.ToFloat64(node.BrokerNodeInfo.BrokerId))) + labels[mskLabelBrokerClientSubnet] = model.LabelValue(aws.ToString(node.BrokerNodeInfo.ClientSubnet)) + labels[mskLabelBrokerClientVPCIP] = model.LabelValue(aws.ToString(node.BrokerNodeInfo.ClientVpcIpAddress)) + labels[mskLabelBrokerNodeExporterEnabled] = model.LabelValue(strconv.FormatBool(*cluster.Provisioned.OpenMonitoring.Prometheus.NodeExporter.EnabledInBroker)) + + for idx, endpoint := range node.BrokerNodeInfo.Endpoints { + endpointLabels := labels.Clone() + endpointLabels[mskLabelBrokerEndpointIndex] = model.LabelValue(strconv.Itoa(idx)) + endpointLabels[model.AddressLabel] = model.LabelValue(net.JoinHostPort(endpoint, strconv.Itoa(d.cfg.Port))) + + targetsMu.Lock() + tg.Targets = append(tg.Targets, endpointLabels) + targetsMu.Unlock() + } + + case NodeTypeController: + labels[mskLabelNodeType] = model.LabelValue(NodeTypeController) + + for idx, endpoint := range 
node.ControllerNodeInfo.Endpoints { + endpointLabels := labels.Clone() + endpointLabels[mskLabelControllerEndpointIndex] = model.LabelValue(strconv.Itoa(idx)) + endpointLabels[model.AddressLabel] = model.LabelValue(net.JoinHostPort(endpoint, strconv.Itoa(d.cfg.Port))) + + targetsMu.Lock() + tg.Targets = append(tg.Targets, endpointLabels) + targetsMu.Unlock() + } + default: + continue + } + } + }(cluster, clusterNodeMap[aws.ToString(cluster.ClusterArn)]) + } + wg.Wait() + + return []*targetgroup.Group{tg}, nil +} + +func nodeType(node types.NodeInfo) NodeType { + if node.BrokerNodeInfo != nil { + return NodeTypeBroker + } else if node.ControllerNodeInfo != nil { + return NodeTypeController + } + return "" +} diff --git a/discovery/aws/msk_test.go b/discovery/aws/msk_test.go new file mode 100644 index 0000000000..b1d48a7ea6 --- /dev/null +++ b/discovery/aws/msk_test.go @@ -0,0 +1,1131 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package aws + +import ( + "context" + "fmt" + "sort" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/kafka" + "github.com/aws/aws-sdk-go-v2/service/kafka/types" + "github.com/prometheus/common/model" + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/discovery/targetgroup" +) + +// Struct for test data. 
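+// mskDataStore holds the fixture clusters and their nodes (keyed by cluster ARN)
+// that the mock MSK client serves to the discovery code under test.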
+type mskDataStore struct { + region string + clusters []types.Cluster + nodes map[string][]types.NodeInfo // keyed by cluster ARN +} + +func TestMSKDiscoveryListClusters(t *testing.T) { + ctx := context.Background() + + for _, tt := range []struct { + name string + mskData *mskDataStore + expected []types.Cluster + }{ + { + name: "MultipleClusters", + mskData: &mskDataStore{ + region: "us-west-2", + clusters: []types.Cluster{ + { + ClusterName: strptr("test-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + { + ClusterName: strptr("prod-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/prod-cluster/def-456"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + }, + }, + expected: []types.Cluster{ + { + ClusterName: strptr("test-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + { + ClusterName: strptr("prod-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/prod-cluster/def-456"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + }, + }, + { + name: "SingleCluster", + mskData: &mskDataStore{ + region: "us-east-1", + clusters: []types.Cluster{ + { + ClusterName: strptr("single-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-east-1:123456789012:cluster/single-cluster/xyz-789"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + }, + }, + expected: []types.Cluster{ + { + ClusterName: strptr("single-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-east-1:123456789012:cluster/single-cluster/xyz-789"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + }, + }, + { + name: "NoClusters", + mskData: &mskDataStore{ + region: "us-east-1", + clusters: []types.Cluster{}, + }, + expected: nil, + }, + } { + t.Run(tt.name, func(t *testing.T) { + client := newMockMSKClient(tt.mskData) + + d := &MSKDiscovery{ + msk: client, + cfg: &MSKSDConfig{ + Region: tt.mskData.region, + }, + } + + clusters, err := d.listClusters(ctx) + require.NoError(t, err) + require.Equal(t, tt.expected, clusters) + }) + } +} + +func TestMSKDiscoveryDescribeClusters(t *testing.T) { + ctx := context.Background() + + for _, tt := range []struct { + name string + mskData *mskDataStore + clusterARNs []string + expected []types.Cluster + }{ + { + name: "SingleCluster", + mskData: &mskDataStore{ + region: "us-west-2", + clusters: []types.Cluster{ + { + ClusterName: strptr("test-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + CurrentVersion: strptr("1.2.3"), + Tags: map[string]string{ + "Environment": "production", + "Team": "platform", + }, + }, + }, + }, + clusterARNs: []string{"arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"}, + expected: []types.Cluster{ + { + ClusterName: strptr("test-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + CurrentVersion: strptr("1.2.3"), + Tags: map[string]string{ + "Environment": "production", + "Team": "platform", + }, + }, + }, + }, + { + name: 
"MultipleClusters", + mskData: &mskDataStore{ + region: "us-east-1", + clusters: []types.Cluster{ + { + ClusterName: strptr("cluster-1"), + ClusterArn: strptr("arn:aws:kafka:us-east-1:123456789012:cluster/cluster-1/xyz-789"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + { + ClusterName: strptr("cluster-2"), + ClusterArn: strptr("arn:aws:kafka:us-east-1:123456789012:cluster/cluster-2/def-456"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + Tags: map[string]string{ + "Stage": "prod", + }, + }, + }, + }, + clusterARNs: []string{ + "arn:aws:kafka:us-east-1:123456789012:cluster/cluster-1/xyz-789", + "arn:aws:kafka:us-east-1:123456789012:cluster/cluster-2/def-456", + }, + expected: []types.Cluster{ + { + ClusterName: strptr("cluster-1"), + ClusterArn: strptr("arn:aws:kafka:us-east-1:123456789012:cluster/cluster-1/xyz-789"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + }, + { + ClusterName: strptr("cluster-2"), + ClusterArn: strptr("arn:aws:kafka:us-east-1:123456789012:cluster/cluster-2/def-456"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + Tags: map[string]string{ + "Stage": "prod", + }, + }, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + client := newMockMSKClient(tt.mskData) + + d := &MSKDiscovery{ + msk: client, + cfg: &MSKSDConfig{ + Region: tt.mskData.region, + RequestConcurrency: 10, + }, + } + + clusters, err := d.describeClusters(ctx, tt.clusterARNs) + require.NoError(t, err) + + // Sort clusters by ARN to handle non-deterministic ordering from goroutines + sort.Slice(clusters, func(i, j int) bool { + return aws.ToString(clusters[i].ClusterArn) < aws.ToString(clusters[j].ClusterArn) + }) + sort.Slice(tt.expected, func(i, j int) bool { + return aws.ToString(tt.expected[i].ClusterArn) < aws.ToString(tt.expected[j].ClusterArn) + }) + + require.Equal(t, tt.expected, clusters) + }) + } +} + +func TestMSKDiscoveryListNodes(t *testing.T) { + ctx := context.Background() + + for _, tt := range []struct { + name string + mskData *mskDataStore + clusters []types.Cluster + expected map[string][]types.NodeInfo + }{ + { + name: "ClusterWithBrokers", + mskData: &mskDataStore{ + region: "us-west-2", + nodes: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + AddedToClusterTime: strptr("2023-01-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(1), + ClientSubnet: strptr("subnet-12345"), + ClientVpcIpAddress: strptr("10.0.1.100"), + Endpoints: []string{"b-1.test-cluster.abc123.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-12345"), + }, + }, + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-2"), + AddedToClusterTime: strptr("2023-01-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(2), + ClientSubnet: strptr("subnet-67890"), + ClientVpcIpAddress: strptr("10.0.1.101"), + Endpoints: []string{"b-2.test-cluster.abc123.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-67890"), + }, + }, + }, + }, + }, + clusters: []types.Cluster{ + { + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"), + }, + }, + expected: map[string][]types.NodeInfo{ + 
"arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + AddedToClusterTime: strptr("2023-01-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(1), + ClientSubnet: strptr("subnet-12345"), + ClientVpcIpAddress: strptr("10.0.1.100"), + Endpoints: []string{"b-1.test-cluster.abc123.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-12345"), + }, + }, + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-2"), + AddedToClusterTime: strptr("2023-01-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(2), + ClientSubnet: strptr("subnet-67890"), + ClientVpcIpAddress: strptr("10.0.1.101"), + Endpoints: []string{"b-2.test-cluster.abc123.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-67890"), + }, + }, + }, + }, + }, + { + name: "ClusterWithNoNodes", + mskData: &mskDataStore{ + region: "us-west-2", + nodes: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/empty-cluster/xyz-789": {}, + }, + }, + clusters: []types.Cluster{ + { + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/empty-cluster/xyz-789"), + }, + }, + expected: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/empty-cluster/xyz-789": nil, + }, + }, + { + name: "MultipleClusters", + mskData: &mskDataStore{ + region: "us-west-2", + nodes: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/cluster-1/abc-123": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(1), + }, + }, + }, + "arn:aws:kafka:us-west-2:123456789012:cluster/cluster-2/def-456": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-2"), + InstanceType: strptr("kafka.m5.xlarge"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(2), + }, + }, + }, + }, + }, + clusters: []types.Cluster{ + { + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/cluster-1/abc-123"), + }, + { + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/cluster-2/def-456"), + }, + }, + expected: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/cluster-1/abc-123": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(1), + }, + }, + }, + "arn:aws:kafka:us-west-2:123456789012:cluster/cluster-2/def-456": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-2"), + InstanceType: strptr("kafka.m5.xlarge"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(2), + }, + }, + }, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + client := newMockMSKClient(tt.mskData) + + d := &MSKDiscovery{ + msk: client, + cfg: &MSKSDConfig{ + Region: tt.mskData.region, + RequestConcurrency: 10, + }, + } + + nodes, err := d.listNodes(ctx, tt.clusters) + require.NoError(t, err) + require.Equal(t, tt.expected, nodes) + }) + } +} + +func TestMSKDiscoveryRefresh(t *testing.T) { + ctx := context.Background() + + tests := []struct { + name string + mskData *mskDataStore + config *MSKSDConfig + expected []*targetgroup.Group + }{ + { + name: "ClusterWithBrokersUsingClustersConfig", + 
mskData: &mskDataStore{ + region: "us-west-2", + clusters: []types.Cluster{ + { + ClusterName: strptr("test-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + CurrentVersion: strptr("1.2.3"), + Tags: map[string]string{ + "Environment": "production", + "Team": "platform", + }, + Provisioned: &types.Provisioned{ + CurrentBrokerSoftwareInfo: &types.BrokerSoftwareInfo{ + ConfigurationArn: strptr("arn:aws:kafka:us-west-2:123456789012:configuration/my-config/abc-123"), + ConfigurationRevision: aws.Int64(1), + KafkaVersion: strptr("2.8.1"), + }, + OpenMonitoring: &types.OpenMonitoringInfo{ + Prometheus: &types.PrometheusInfo{ + JmxExporter: &types.JmxExporterInfo{ + EnabledInBroker: aws.Bool(true), + }, + NodeExporter: &types.NodeExporterInfo{ + EnabledInBroker: aws.Bool(true), + }, + }, + }, + }, + }, + }, + nodes: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + AddedToClusterTime: strptr("2023-01-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(1), + ClientSubnet: strptr("subnet-12345"), + ClientVpcIpAddress: strptr("10.0.1.100"), + Endpoints: []string{"b-1.test-cluster.abc123.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-12345"), + }, + }, + }, + }, + }, + config: &MSKSDConfig{ + Region: "us-west-2", + Port: 80, + RequestConcurrency: 10, + Clusters: []string{"arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"}, + }, + expected: []*targetgroup.Group{ + { + Source: "us-west-2", + Targets: []model.LabelSet{ + { + model.AddressLabel: model.LabelValue("b-1.test-cluster.abc123.kafka.us-west-2.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("test-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:cluster/test-cluster/abc-123"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("1.2.3"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:configuration/my-config/abc-123"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("2.8.1"), + "__meta_msk_cluster_tag_Environment": model.LabelValue("production"), + "__meta_msk_cluster_tag_Team": model.LabelValue("platform"), + "__meta_msk_node_type": model.LabelValue("BROKER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + "__meta_msk_node_added_time": model.LabelValue("2023-01-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_node_attached_eni": model.LabelValue("eni-12345"), + "__meta_msk_broker_id": model.LabelValue("1"), + "__meta_msk_broker_client_subnet": model.LabelValue("subnet-12345"), + "__meta_msk_broker_client_vpc_ip": model.LabelValue("10.0.1.100"), + "__meta_msk_broker_node_exporter_enabled": model.LabelValue("true"), + "__meta_msk_broker_endpoint_index": model.LabelValue("0"), + }, + }, + }, + }, + }, + { + name: "NoClustersWithEmptyClustersConfig", + mskData: &mskDataStore{ + region: "us-east-1", + clusters: 
[]types.Cluster{}, + }, + config: &MSKSDConfig{ + Region: "us-east-1", + Port: 80, + RequestConcurrency: 10, + Clusters: []string{}, // Empty clusters list uses listClusters + }, + expected: []*targetgroup.Group{ + { + Source: "us-east-1", + }, + }, + }, + { + name: "ClusterWithBrokersUsingListClusters", + mskData: &mskDataStore{ + region: "us-west-2", + clusters: []types.Cluster{ + { + ClusterName: strptr("auto-discovered-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/auto-discovered-cluster/xyz-123"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + CurrentVersion: strptr("1.0.0"), + Provisioned: &types.Provisioned{ + CurrentBrokerSoftwareInfo: &types.BrokerSoftwareInfo{ + ConfigurationArn: strptr("arn:aws:kafka:us-west-2:123456789012:configuration/config/xyz"), + ConfigurationRevision: aws.Int64(1), + KafkaVersion: strptr("3.3.1"), + }, + OpenMonitoring: &types.OpenMonitoringInfo{ + Prometheus: &types.PrometheusInfo{ + JmxExporter: &types.JmxExporterInfo{ + EnabledInBroker: aws.Bool(true), + }, + NodeExporter: &types.NodeExporterInfo{ + EnabledInBroker: aws.Bool(true), + }, + }, + }, + }, + }, + }, + nodes: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/auto-discovered-cluster/xyz-123": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-auto"), + AddedToClusterTime: strptr("2023-01-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(1), + ClientSubnet: strptr("subnet-auto"), + ClientVpcIpAddress: strptr("10.0.1.200"), + Endpoints: []string{"b-auto.cluster.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-auto"), + }, + }, + }, + }, + }, + config: &MSKSDConfig{ + Region: "us-west-2", + Port: 80, + RequestConcurrency: 10, + Clusters: nil, // nil clusters list uses listClusters (backward compatibility) + }, + expected: []*targetgroup.Group{ + { + Source: "us-west-2", + Targets: []model.LabelSet{ + { + model.AddressLabel: model.LabelValue("b-auto.cluster.kafka.us-west-2.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("auto-discovered-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:cluster/auto-discovered-cluster/xyz-123"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("1.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:configuration/config/xyz"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.3.1"), + "__meta_msk_node_type": model.LabelValue("BROKER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:node/broker-auto"), + "__meta_msk_node_added_time": model.LabelValue("2023-01-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_node_attached_eni": model.LabelValue("eni-auto"), + "__meta_msk_broker_id": model.LabelValue("1"), + "__meta_msk_broker_client_subnet": model.LabelValue("subnet-auto"), + "__meta_msk_broker_client_vpc_ip": model.LabelValue("10.0.1.200"), + "__meta_msk_broker_node_exporter_enabled": model.LabelValue("true"), + "__meta_msk_broker_endpoint_index": model.LabelValue("0"), + }, + }, + }, + }, + }, + { + name: 
"ClusterWithBrokersAndControllersUsingClustersConfig", + mskData: &mskDataStore{ + region: "us-west-2", + clusters: []types.Cluster{ + { + ClusterName: strptr("kraft-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-west-2:123456789012:cluster/kraft-cluster/xyz-789"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + CurrentVersion: strptr("1.0.0"), + Tags: map[string]string{ + "Type": "kraft", + }, + Provisioned: &types.Provisioned{ + CurrentBrokerSoftwareInfo: &types.BrokerSoftwareInfo{ + ConfigurationArn: strptr("arn:aws:kafka:us-west-2:123456789012:configuration/config/xyz"), + ConfigurationRevision: aws.Int64(2), + KafkaVersion: strptr("3.3.1"), + }, + OpenMonitoring: &types.OpenMonitoringInfo{ + Prometheus: &types.PrometheusInfo{ + JmxExporter: &types.JmxExporterInfo{ + EnabledInBroker: aws.Bool(true), + }, + NodeExporter: &types.NodeExporterInfo{ + EnabledInBroker: aws.Bool(false), + }, + }, + }, + }, + }, + }, + nodes: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-west-2:123456789012:cluster/kraft-cluster/xyz-789": { + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + AddedToClusterTime: strptr("2023-06-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(1), + ClientSubnet: strptr("subnet-abc123"), + ClientVpcIpAddress: strptr("10.0.2.100"), + Endpoints: []string{"b-1.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-broker-1"), + }, + }, + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/broker-2"), + AddedToClusterTime: strptr("2023-06-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(2), + ClientSubnet: strptr("subnet-abc124"), + ClientVpcIpAddress: strptr("10.0.2.101"), + Endpoints: []string{"b-2.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com"}, + AttachedENIId: strptr("eni-broker-2"), + }, + }, + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/controller-1"), + AddedToClusterTime: strptr("2023-06-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + ControllerNodeInfo: &types.ControllerNodeInfo{ + Endpoints: []string{"c-1.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com"}, + }, + }, + { + NodeARN: strptr("arn:aws:kafka:us-west-2:123456789012:node/controller-2"), + AddedToClusterTime: strptr("2023-06-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + ControllerNodeInfo: &types.ControllerNodeInfo{ + Endpoints: []string{"c-2.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com"}, + }, + }, + }, + }, + }, + config: &MSKSDConfig{ + Region: "us-west-2", + Port: 80, + RequestConcurrency: 10, + Clusters: []string{"arn:aws:kafka:us-west-2:123456789012:cluster/kraft-cluster/xyz-789"}, + }, + expected: []*targetgroup.Group{ + { + Source: "us-west-2", + Targets: []model.LabelSet{ + { + model.AddressLabel: model.LabelValue("b-1.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("kraft-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:cluster/kraft-cluster/xyz-789"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("1.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": 
model.LabelValue("arn:aws:kafka:us-west-2:123456789012:configuration/config/xyz"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("2"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.3.1"), + "__meta_msk_cluster_tag_Type": model.LabelValue("kraft"), + "__meta_msk_node_type": model.LabelValue("BROKER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:node/broker-1"), + "__meta_msk_node_added_time": model.LabelValue("2023-06-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_node_attached_eni": model.LabelValue("eni-broker-1"), + "__meta_msk_broker_id": model.LabelValue("1"), + "__meta_msk_broker_client_subnet": model.LabelValue("subnet-abc123"), + "__meta_msk_broker_client_vpc_ip": model.LabelValue("10.0.2.100"), + "__meta_msk_broker_node_exporter_enabled": model.LabelValue("false"), + "__meta_msk_broker_endpoint_index": model.LabelValue("0"), + }, + { + model.AddressLabel: model.LabelValue("b-2.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("kraft-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:cluster/kraft-cluster/xyz-789"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("1.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:configuration/config/xyz"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("2"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.3.1"), + "__meta_msk_cluster_tag_Type": model.LabelValue("kraft"), + "__meta_msk_node_type": model.LabelValue("BROKER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:node/broker-2"), + "__meta_msk_node_added_time": model.LabelValue("2023-06-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_node_attached_eni": model.LabelValue("eni-broker-2"), + "__meta_msk_broker_id": model.LabelValue("2"), + "__meta_msk_broker_client_subnet": model.LabelValue("subnet-abc124"), + "__meta_msk_broker_client_vpc_ip": model.LabelValue("10.0.2.101"), + "__meta_msk_broker_node_exporter_enabled": model.LabelValue("false"), + "__meta_msk_broker_endpoint_index": model.LabelValue("0"), + }, + { + model.AddressLabel: model.LabelValue("c-1.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("kraft-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:cluster/kraft-cluster/xyz-789"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("1.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:configuration/config/xyz"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("2"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.3.1"), + "__meta_msk_cluster_tag_Type": model.LabelValue("kraft"), + "__meta_msk_node_type": model.LabelValue("CONTROLLER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:node/controller-1"), + 
"__meta_msk_node_added_time": model.LabelValue("2023-06-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_controller_endpoint_index": model.LabelValue("0"), + }, + { + model.AddressLabel: model.LabelValue("c-2.kraft-cluster.xyz789.kafka.us-west-2.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("kraft-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:cluster/kraft-cluster/xyz-789"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("1.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:configuration/config/xyz"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("2"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.3.1"), + "__meta_msk_cluster_tag_Type": model.LabelValue("kraft"), + "__meta_msk_node_type": model.LabelValue("CONTROLLER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-west-2:123456789012:node/controller-2"), + "__meta_msk_node_added_time": model.LabelValue("2023-06-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_controller_endpoint_index": model.LabelValue("0"), + }, + }, + }, + }, + }, + { + name: "NodesWithMultipleEndpointsUsingClustersConfig", + mskData: &mskDataStore{ + region: "us-east-1", + clusters: []types.Cluster{ + { + ClusterName: strptr("multi-endpoint-cluster"), + ClusterArn: strptr("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + State: types.ClusterStateActive, + ClusterType: types.ClusterTypeProvisioned, + CurrentVersion: strptr("2.0.0"), + Provisioned: &types.Provisioned{ + CurrentBrokerSoftwareInfo: &types.BrokerSoftwareInfo{ + ConfigurationArn: strptr("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + ConfigurationRevision: aws.Int64(1), + KafkaVersion: strptr("3.4.0"), + }, + OpenMonitoring: &types.OpenMonitoringInfo{ + Prometheus: &types.PrometheusInfo{ + JmxExporter: &types.JmxExporterInfo{ + EnabledInBroker: aws.Bool(true), + }, + NodeExporter: &types.NodeExporterInfo{ + EnabledInBroker: aws.Bool(true), + }, + }, + }, + }, + }, + }, + nodes: map[string][]types.NodeInfo{ + "arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999": { + { + NodeARN: strptr("arn:aws:kafka:us-east-1:123456789012:node/broker-multi"), + AddedToClusterTime: strptr("2023-08-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.xlarge"), + BrokerNodeInfo: &types.BrokerNodeInfo{ + BrokerId: aws.Float64(3), + ClientSubnet: strptr("subnet-multi-1"), + ClientVpcIpAddress: strptr("10.0.3.50"), + // Multiple endpoints for this broker + Endpoints: []string{"b-3-1.cluster.kafka.us-east-1.amazonaws.com", "b-3-2.cluster.kafka.us-east-1.amazonaws.com", "b-3-3.cluster.kafka.us-east-1.amazonaws.com"}, + AttachedENIId: strptr("eni-multi-broker"), + }, + }, + { + NodeARN: strptr("arn:aws:kafka:us-east-1:123456789012:node/controller-multi"), + AddedToClusterTime: strptr("2023-08-01T00:00:00Z"), + InstanceType: strptr("kafka.m5.large"), + ControllerNodeInfo: &types.ControllerNodeInfo{ + // Multiple endpoints for this controller + Endpoints: []string{"c-1-1.cluster.kafka.us-east-1.amazonaws.com", "c-1-2.cluster.kafka.us-east-1.amazonaws.com", "c-1-3.cluster.kafka.us-east-1.amazonaws.com", 
"c-1-4.cluster.kafka.us-east-1.amazonaws.com"}, + }, + }, + }, + }, + }, + config: &MSKSDConfig{ + Region: "us-east-1", + Port: 80, + RequestConcurrency: 10, + Clusters: []string{"arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"}, + }, + expected: []*targetgroup.Group{ + { + Source: "us-east-1", + Targets: []model.LabelSet{ + // Broker with 3 endpoints - creates 3 targets with different endpoint indices + { + model.AddressLabel: model.LabelValue("b-3-1.cluster.kafka.us-east-1.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("multi-endpoint-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("2.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.4.0"), + "__meta_msk_node_type": model.LabelValue("BROKER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:node/broker-multi"), + "__meta_msk_node_added_time": model.LabelValue("2023-08-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.xlarge"), + "__meta_msk_node_attached_eni": model.LabelValue("eni-multi-broker"), + "__meta_msk_broker_id": model.LabelValue("3"), + "__meta_msk_broker_client_subnet": model.LabelValue("subnet-multi-1"), + "__meta_msk_broker_client_vpc_ip": model.LabelValue("10.0.3.50"), + "__meta_msk_broker_node_exporter_enabled": model.LabelValue("true"), + "__meta_msk_broker_endpoint_index": model.LabelValue("0"), + }, + { + model.AddressLabel: model.LabelValue("b-3-2.cluster.kafka.us-east-1.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("multi-endpoint-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("2.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.4.0"), + "__meta_msk_node_type": model.LabelValue("BROKER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:node/broker-multi"), + "__meta_msk_node_added_time": model.LabelValue("2023-08-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.xlarge"), + "__meta_msk_node_attached_eni": model.LabelValue("eni-multi-broker"), + "__meta_msk_broker_id": model.LabelValue("3"), + "__meta_msk_broker_client_subnet": model.LabelValue("subnet-multi-1"), + "__meta_msk_broker_client_vpc_ip": model.LabelValue("10.0.3.50"), + "__meta_msk_broker_node_exporter_enabled": model.LabelValue("true"), + "__meta_msk_broker_endpoint_index": model.LabelValue("1"), + }, + { + model.AddressLabel: model.LabelValue("b-3-3.cluster.kafka.us-east-1.amazonaws.com:80"), + 
"__meta_msk_cluster_name": model.LabelValue("multi-endpoint-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("2.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.4.0"), + "__meta_msk_node_type": model.LabelValue("BROKER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:node/broker-multi"), + "__meta_msk_node_added_time": model.LabelValue("2023-08-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.xlarge"), + "__meta_msk_node_attached_eni": model.LabelValue("eni-multi-broker"), + "__meta_msk_broker_id": model.LabelValue("3"), + "__meta_msk_broker_client_subnet": model.LabelValue("subnet-multi-1"), + "__meta_msk_broker_client_vpc_ip": model.LabelValue("10.0.3.50"), + "__meta_msk_broker_node_exporter_enabled": model.LabelValue("true"), + "__meta_msk_broker_endpoint_index": model.LabelValue("2"), + }, + // Controller with 4 endpoints - creates 4 targets with different endpoint indices + { + model.AddressLabel: model.LabelValue("c-1-1.cluster.kafka.us-east-1.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("multi-endpoint-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("2.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.4.0"), + "__meta_msk_node_type": model.LabelValue("CONTROLLER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:node/controller-multi"), + "__meta_msk_node_added_time": model.LabelValue("2023-08-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_controller_endpoint_index": model.LabelValue("0"), + }, + { + model.AddressLabel: model.LabelValue("c-1-2.cluster.kafka.us-east-1.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("multi-endpoint-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("2.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.4.0"), + "__meta_msk_node_type": model.LabelValue("CONTROLLER"), + 
"__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:node/controller-multi"), + "__meta_msk_node_added_time": model.LabelValue("2023-08-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_controller_endpoint_index": model.LabelValue("1"), + }, + { + model.AddressLabel: model.LabelValue("c-1-3.cluster.kafka.us-east-1.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("multi-endpoint-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("2.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.4.0"), + "__meta_msk_node_type": model.LabelValue("CONTROLLER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:node/controller-multi"), + "__meta_msk_node_added_time": model.LabelValue("2023-08-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_controller_endpoint_index": model.LabelValue("2"), + }, + { + model.AddressLabel: model.LabelValue("c-1-4.cluster.kafka.us-east-1.amazonaws.com:80"), + "__meta_msk_cluster_name": model.LabelValue("multi-endpoint-cluster"), + "__meta_msk_cluster_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:cluster/multi-endpoint-cluster/abc-999"), + "__meta_msk_cluster_state": model.LabelValue("ACTIVE"), + "__meta_msk_cluster_type": model.LabelValue("PROVISIONED"), + "__meta_msk_cluster_version": model.LabelValue("2.0.0"), + "__meta_msk_cluster_jmx_exporter_enabled": model.LabelValue("true"), + "__meta_msk_cluster_configuration_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:configuration/config/abc"), + "__meta_msk_cluster_configuration_revision": model.LabelValue("1"), + "__meta_msk_cluster_kafka_version": model.LabelValue("3.4.0"), + "__meta_msk_node_type": model.LabelValue("CONTROLLER"), + "__meta_msk_node_arn": model.LabelValue("arn:aws:kafka:us-east-1:123456789012:node/controller-multi"), + "__meta_msk_node_added_time": model.LabelValue("2023-08-01T00:00:00Z"), + "__meta_msk_node_instance_type": model.LabelValue("kafka.m5.large"), + "__meta_msk_controller_endpoint_index": model.LabelValue("3"), + }, + }, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + client := newMockMSKClient(tt.mskData) + + config := tt.config + if config == nil { + // Default config for backward compatibility + config = &MSKSDConfig{ + Region: tt.mskData.region, + Port: 80, + RequestConcurrency: 10, + } + } + + d := &MSKDiscovery{ + msk: client, + cfg: config, + } + + groups, err := d.refresh(ctx) + require.NoError(t, err) + + // Sort targets within each group by address to handle non-deterministic ordering from goroutines + for _, group := range groups { + if len(group.Targets) > 0 { + sort.Slice(group.Targets, func(i, j int) bool { + return string(group.Targets[i][model.AddressLabel]) < string(group.Targets[j][model.AddressLabel]) + }) + } + } + for _, group := range tt.expected { + if len(group.Targets) > 0 { + sort.Slice(group.Targets, func(i, 
j int) bool { + return string(group.Targets[i][model.AddressLabel]) < string(group.Targets[j][model.AddressLabel]) + }) + } + } + + require.Equal(t, tt.expected, groups) + }) + } +} + +func TestNodeType(t *testing.T) { + tests := []struct { + name string + node types.NodeInfo + expected NodeType + }{ + { + name: "BrokerNode", + node: types.NodeInfo{ + BrokerNodeInfo: &types.BrokerNodeInfo{}, + }, + expected: NodeTypeBroker, + }, + { + name: "ControllerNode", + node: types.NodeInfo{ + ControllerNodeInfo: &types.ControllerNodeInfo{}, + }, + expected: NodeTypeController, + }, + { + name: "UnknownNode", + node: types.NodeInfo{}, + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := nodeType(tt.node) + require.Equal(t, tt.expected, result) + }) + } +} + +// MSK client mock. +type mockMSKClient struct { + mskData mskDataStore +} + +func newMockMSKClient(mskData *mskDataStore) *mockMSKClient { + return &mockMSKClient{ + mskData: *mskData, + } +} + +func (m *mockMSKClient) DescribeClusterV2(_ context.Context, input *kafka.DescribeClusterV2Input, _ ...func(*kafka.Options)) (*kafka.DescribeClusterV2Output, error) { + inputARN := aws.ToString(input.ClusterArn) + for i := range m.mskData.clusters { + cluster := &m.mskData.clusters[i] + if aws.ToString(cluster.ClusterArn) == inputARN { + return &kafka.DescribeClusterV2Output{ + ClusterInfo: cluster, + }, nil + } + } + + return nil, fmt.Errorf("cluster not found: %s", inputARN) +} + +func (m *mockMSKClient) ListClustersV2(_ context.Context, input *kafka.ListClustersV2Input, _ ...func(*kafka.Options)) (*kafka.ListClustersV2Output, error) { + var clusters []types.Cluster + + for _, cluster := range m.mskData.clusters { + // Apply cluster name filter if specified + if input.ClusterNameFilter != nil && *input.ClusterNameFilter != "" { + if cluster.ClusterName != nil && *cluster.ClusterName != *input.ClusterNameFilter { + continue + } + } + + // Apply cluster type filter if specified + if input.ClusterTypeFilter != nil && *input.ClusterTypeFilter != "" { + if string(cluster.ClusterType) != *input.ClusterTypeFilter { + continue + } + } + + clusters = append(clusters, cluster) + } + + return &kafka.ListClustersV2Output{ + ClusterInfoList: clusters, + }, nil +} + +func (m *mockMSKClient) ListNodes(_ context.Context, input *kafka.ListNodesInput, _ ...func(*kafka.Options)) (*kafka.ListNodesOutput, error) { + clusterARN := aws.ToString(input.ClusterArn) + nodes, exists := m.mskData.nodes[clusterARN] + if !exists { + return &kafka.ListNodesOutput{ + NodeInfoList: nil, + }, nil + } + + return &kafka.ListNodesOutput{ + NodeInfoList: nodes, + }, nil +} diff --git a/discovery/kubernetes/kubernetes_test.go b/discovery/kubernetes/kubernetes_test.go index a68a7c9a43..b4bba381a4 100644 --- a/discovery/kubernetes/kubernetes_test.go +++ b/discovery/kubernetes/kubernetes_test.go @@ -17,6 +17,7 @@ import ( "context" "encoding/json" "errors" + "os" "testing" "time" @@ -42,6 +43,14 @@ import ( ) func TestMain(m *testing.M) { + // Disable the WatchListClient feature gate that is enabled by default in + // client-go v0.35.0+. The WatchList flow requires the server to support + // SendInitialEvents and to send a bookmark event with the + // "k8s.io/initial-events-end" annotation. The fake clientset used in tests + // does not support this protocol, causing informers to hang indefinitely + // waiting for the bookmark. Disabling this feature restores the traditional + // List+Watch flow which is compatible with the fake clientset. 
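+	// os.Setenv is used here because TestMain only receives a *testing.M, so the
+	// per-test t.Setenv helper is not available at this point.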
+ os.Setenv("KUBE_FEATURE_WatchListClient", "false") testutil.TolerantVerifyLeak(m) } @@ -52,7 +61,7 @@ func makeDiscovery(role Role, nsDiscovery NamespaceDiscovery, objects ...runtime // makeDiscoveryWithVersion creates a kubernetes.Discovery instance with the specified kubernetes version for testing. func makeDiscoveryWithVersion(role Role, nsDiscovery NamespaceDiscovery, k8sVer string, objects ...runtime.Object) (*Discovery, kubernetes.Interface) { - clientset := fake.NewSimpleClientset(objects...) + clientset := fake.NewClientset(objects...) fakeDiscovery, _ := clientset.Discovery().(*fakediscovery.FakeDiscovery) fakeDiscovery.FakedServerVersion = &version.Info{GitVersion: k8sVer} diff --git a/docs/command-line/prometheus.md b/docs/command-line/prometheus.md index d4a8cd4f20..251fdfd6a4 100644 --- a/docs/command-line/prometheus.md +++ b/docs/command-line/prometheus.md @@ -59,7 +59,7 @@ The Prometheus monitoring server | --query.timeout | Maximum time a query may take before being aborted. Use with server mode only. | `2m` | | --query.max-concurrency | Maximum number of queries executed concurrently. Use with server mode only. | `20` | | --query.max-samples | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` | -| --enable-feature ... | Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | +| --enable-feature ... | Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | | --agent | Run Prometheus in 'Agent mode'. | | | --log.level | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` | | --log.format | Output format of log messages. One of: [logfmt, json] | `logfmt` | diff --git a/docs/command-line/promtool.md b/docs/command-line/promtool.md index f6737bc37f..e8ffa75aaa 100644 --- a/docs/command-line/promtool.md +++ b/docs/command-line/promtool.md @@ -12,7 +12,7 @@ Tooling for the Prometheus monitoring system. | -h, --help | Show context-sensitive help (also try --help-long and --help-man). | | --version | Show application version. | | --experimental | Enable experimental commands. | -| --enable-feature ... | Comma separated feature names to enable. Valid options: promql-experimental-functions, promql-delayed-name-removal. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details | +| --enable-feature ... | Comma separated feature names to enable. 
Valid options: promql-experimental-functions, promql-delayed-name-removal, promql-duration-expr, promql-extended-range-selectors. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details | diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 4079daae02..49b7774b5f 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -984,11 +984,56 @@ The following meta labels are available on targets during [relabeling](#relabel_ * `__meta_ecs_tag_task_`: each task tag value, keyed by tag name * `__meta_ecs_tag_ec2_`: each EC2 instance tag value, keyed by tag name (EC2 launch type only) +#### `msk` + +The `msk` role discovers targets from AWS MSK (Managed Streaming for Apache Kafka) provisioned clusters. + +**Important**: This service discovery only works with **provisioned clusters**. Serverless clusters are not supported as they do not expose individual broker nodes. + +Discovery includes: +- **Broker nodes**: Kafka broker instances (supports both ZooKeeper-based and KRaft-based clusters) +- **KRaft Controller nodes**: Controller instances (KRaft-based clusters only) + +Note: ZooKeeper nodes are not discoverable via the MSK API. For monitoring, MSK provides: +- **JMX Exporter**: Available on both broker and KRaft controller nodes (when enabled) +- **Node Exporter**: Available on broker nodes only (when enabled) + +The IAM credentials used must have the following permissions to discover +scrape targets: + +- `kafka:DescribeClusterV2` +- `kafka:ListClustersV2` +- `kafka:ListNodes` + +The following meta labels are available on targets during [relabeling](#relabel_config): + +* `__meta_msk_cluster_name`: the name of the MSK cluster +* `__meta_msk_cluster_arn`: the ARN of the MSK cluster +* `__meta_msk_cluster_state`: the state of the MSK cluster (e.g., ACTIVE, CREATING, DELETING) +* `__meta_msk_cluster_type`: the type of the MSK cluster (e.g., PROVISIONED, SERVERLESS) +* `__meta_msk_cluster_version`: the current version of the MSK cluster +* `__meta_msk_cluster_kafka_version`: the Kafka version running on the cluster +* `__meta_msk_cluster_jmx_exporter_enabled`: whether JMX exporter is enabled on the cluster +* `__meta_msk_cluster_configuration_arn`: the ARN of the MSK configuration +* `__meta_msk_cluster_configuration_revision`: the revision of the MSK configuration +* `__meta_msk_cluster_tag_`: each cluster tag value, keyed by tag name +* `__meta_msk_node_type`: the type of the node (BROKER or CONTROLLER) +* `__meta_msk_node_arn`: the ARN of the node +* `__meta_msk_node_added_time`: the time the node was added to the cluster +* `__meta_msk_node_instance_type`: the instance type of the node +* `__meta_msk_node_attached_eni`: the ID of the attached ENI +* `__meta_msk_broker_id`: the broker ID (broker nodes only) +* `__meta_msk_broker_endpoint_index`: the index of the broker endpoint (broker nodes only) +* `__meta_msk_broker_client_subnet`: the client subnet of the broker (broker nodes only) +* `__meta_msk_broker_client_vpc_ip`: the VPC IP address of the broker (broker nodes only) +* `__meta_msk_broker_node_exporter_enabled`: whether node exporter is enabled on brokers (broker nodes only) +* `__meta_msk_controller_endpoint_index`: the index of the controller endpoint (controller nodes only) + See below for the configuration options for AWS discovery: ```yaml # The AWS role to use for service discovery. -# Must be one of: ec2, lightsail, or ecs. +# Must be one of: ec2, lightsail, ecs, or msk. 
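+# Note that the msk role only discovers provisioned MSK clusters; serverless
+# clusters do not expose individual broker nodes and are not supported.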
role: # The AWS region. If blank, the region from the instance metadata is used. @@ -1024,7 +1069,7 @@ filters: [ - name: values: , [...] ] -# List of ECS cluster ARNs to discover (ecs role only). If empty, all clusters in the region are discovered. +# List of ECS or MSK cluster ARNs (ecs and msk roles only) to discover. If empty, all clusters in the region are discovered. # This can significantly improve performance when you only need to monitor specific clusters. [ clusters: [, ...] ] @@ -2483,8 +2528,7 @@ in the configuration file), which can also be changed using relabeling. ### `` -Nerve SD configurations allow retrieving scrape targets from [AirBnB's Nerve] -(https://github.com/airbnb/nerve) which are stored in +Nerve SD configurations allow retrieving scrape targets from [AirBnB's Nerve](https://github.com/airbnb/nerve) which are stored in [Zookeeper](https://zookeeper.apache.org/). The following meta labels are available on targets during [relabeling](#relabel_config): @@ -2538,8 +2582,7 @@ The following meta labels are available on targets during [relabeling](#relabel_config): ### `` -Serverset SD configurations allow retrieving scrape targets from [Serversets] -(https://github.com/twitter/finagle/tree/develop/finagle-serversets) which are +Serverset SD configurations allow retrieving scrape targets from [Serversets](https://github.com/twitter/finagle/tree/develop/finagle-serversets) which are stored in [Zookeeper](https://zookeeper.apache.org/). Serversets are commonly used by [Finagle](https://twitter.github.io/finagle/) and [Aurora](https://aurora.apache.org/). @@ -2973,6 +3016,11 @@ labels: [ : ... ] ``` +The special labels mentioned in the [relabeling](#relabel_config) section can also be +used here to override the respective settings in the scrape configuration. This is +especially useful when combined with any of the service discovery mechanisms that do not +support these settings directly. + ### `` Relabeling is a powerful tool to dynamically rewrite the label set of a target before @@ -2982,6 +3030,11 @@ in the configuration file. Initially, aside from the configured per-target labels, a target's `job` label is set to the `job_name` value of the respective scrape configuration. + +You can also use special labels like `__address__`, `__scheme__`, `__metrics_path__`, +`__scrape_interval__`, `__scrape_timeout__` to customize the defined targets. These will +override the respective settings in the scrape configuration. + The `__address__` label is set to the `:` address of the target. After relabeling, the `instance` label is set to the value of `__address__` by default if it was not set during relabeling. @@ -3496,6 +3549,19 @@ with this feature. # to the timestamp of the last appended sample for the same series. [ out_of_order_time_window: | default = 0s ] +# Configures the trigger point for compacting stale series from memory into persistent blocks +# and removing those stale series from memory. +# +# The threshold is a number between 0.0 and 1.0. It represents the ratio of stale series in memory +# to the total series in memory. Stale series compaction is triggered when this ratio crosses +# the configured threshold, although it may be skipped if the usual head compaction +# is about to happen soon anyway. +# +# If set to 0, stale series compaction is disabled. +# +# This is an experimental feature; this behaviour could change or be removed in the future.
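+#
+# For example, with a threshold of 0.3, stale series compaction becomes eligible
+# once roughly 30% of the series currently held in memory are stale.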
+[ stale_series_compaction_threshold: | default = 0 ] + # Configures data retention settings for TSDB. # diff --git a/docs/feature_flags.md b/docs/feature_flags.md index af08eebb45..247941c5ce 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -67,12 +67,12 @@ Currently, Prometheus supports start timestamps on the * `PrometheusProto` * `OpenMetrics1.0.0` - + From the above, Prometheus recommends `PrometheusProto`. This is because OpenMetrics 1.0 Start Timestamp information is shared as a `_created` metric and parsing those are prone to errors and expensive (thus, adding an overhead). You also need to be careful to not pollute your Prometheus with extra `_created` metrics. - -Therefore, when `created-timestamp-zero-ingestion` is enabled Prometheus changes the global `scrape_protocols` default configuration option to + +Therefore, when `created-timestamp-zero-ingestion` is enabled Prometheus changes the global `scrape_protocols` default configuration option to `[ PrometheusProto, OpenMetricsText1.0.0, OpenMetricsText0.0.1, PrometheusText0.0.4 ]`, resulting in negotiating the Prometheus Protobuf protocol first (unless the `scrape_protocols` option is set to a different value explicitly). Besides enabling this feature in Prometheus, start timestamps need to be exposed by the application being scraped. @@ -288,8 +288,8 @@ when wrong types are used on wrong functions, automatic renames, delta types and ### Behavior with metadata records -When this feature is enabled and the metadata WAL records exists, in an unlikely situation when type or unit are different across those, -the Prometheus outputs intends to prefer the `__type__` and `__unit__` labels values. For example on Remote Write 2.0, +When this feature is enabled and the metadata WAL records exists, in an unlikely situation when type or unit are different across those, +the Prometheus outputs intends to prefer the `__type__` and `__unit__` labels values. For example on Remote Write 2.0, if the metadata record somehow (e.g. due to bug) says "counter", but `__type__="gauge"` the remote time series will be set to a gauge. ## Use Uncached IO @@ -338,9 +338,25 @@ Example query: > **Note for alerting and recording rules:** > The `smoothed` modifier requires samples after the evaluation interval, so using it directly in alerting or recording rules will typically *under-estimate* the result, as future samples are not available at evaluation time. -> To use `smoothed` safely in rules, you **must** apply a `query_offset` to the rule group (see [documentation](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group)) to ensure the calculation window is fully in the past and all needed samples are available. +> To use `smoothed` safely in rules, you **must** apply a `query_offset` to the rule group (see [documentation](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group)) to ensure the calculation window is fully in the past and all needed samples are available. > For critical alerting, set the offset to at least one scrape interval; for less critical or more resilient use cases, consider a larger offset (multiple scrape intervals) to tolerate missed scrapes. For more details, see the [design doc](https://github.com/prometheus/proposals/blob/main/proposals/2025-04-04_extended-range-selectors-semantics.md). **Note**: Extended Range Selectors are not supported for subqueries. 
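+As a rough illustration of the `query_offset` guidance above, a recording rule group could be structured as follows. This is a minimal sketch rather than an excerpt from the documentation: the group, rule, and metric names are made up, and the placement of the `smoothed` keyword inside the range selector is assumed from the feature description above, so verify it against the current syntax before use.
+
+```yaml
+groups:
+  - name: smoothed_rate_rules
+    interval: 1m
+    # Shift rule evaluation into the past so that every sample the smoothed
+    # calculation window needs is already available. Use at least one scrape
+    # interval; a larger offset tolerates missed scrapes.
+    query_offset: 1m
+    rules:
+      # Hypothetical recording rule; the metric and rule names are examples only.
+      - record: job:http_requests:rate5m_smoothed
+        expr: sum by (job) (rate(http_requests_total[5m] smoothed))
+```
+
+With the offset applied, the calculation window ends in the past, so the "future" samples that `smoothed` needs relative to that window already exist at evaluation time.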
+ +## Binary operator fill modifiers + +`--enable-feature=promql-binop-fill-modifiers` + +Enables experimental `fill()`, `fill_left()`, and `fill_right()` modifiers for PromQL binary operators. These modifiers allow filling in missing matches on either side of a binary operation with a provided default sample value. + +Example query: + +``` + rate(successful_requests[5m]) ++ fill(0) + rate(failed_requests[5m]) +``` + +See [the fill modifiers documentation](querying/operators.md#filling-in-missing-matches) for more details and examples. diff --git a/docs/prometheus_agent.md b/docs/prometheus_agent.md index 468b5565d1..0d8c3fa94a 100644 --- a/docs/prometheus_agent.md +++ b/docs/prometheus_agent.md @@ -20,8 +20,8 @@ In essence, it looks like this: ### Benefits of agent mode -- Improved efficency. The customized Agent TSDB WAL removes the data immediately after successful writes. If it cannot reach the remote endpoint, it persists the data temporarily on the disk until the remote endpoint is back online. This is currently limited to a two-hour buffer only, similar to non-agent Prometheus. This means that there is no need to build chunks of data in memory or maintain a full index for querying purposes. Essentially the Agent mode uses a fraction of the resources that a normal Prometheus server would use in a similar situation. -- Agent mode eables easier [horizontal scalability for ingestion](https://prometheus.io/blog/2021/11/16/agent/#the-dream-auto-scalable-metric-ingestion). +- Improved efficiency. The customized Agent TSDB WAL removes the data immediately after successful writes. If it cannot reach the remote endpoint, it persists the data temporarily on the disk until the remote endpoint is back online. This is currently limited to a two-hour buffer only, similar to non-agent Prometheus. This means that there is no need to build chunks of data in memory or maintain a full index for querying purposes. Essentially the Agent mode uses a fraction of the resources that a normal Prometheus server would use in a similar situation. +- Agent mode enables easier [horizontal scalability for ingestion](https://prometheus.io/blog/2021/11/16/agent/#the-dream-auto-scalable-metric-ingestion). ### Downsides of agent mode diff --git a/docs/querying/api.md b/docs/querying/api.md index 4891db8980..78574ec103 100644 --- a/docs/querying/api.md +++ b/docs/querying/api.md @@ -6,6 +6,22 @@ sort_rank: 7 The current stable HTTP API is reachable under `/api/v1` on a Prometheus server. Any non-breaking additions will be added under that endpoint. +## OpenAPI Specification + +An OpenAPI specification for the HTTP API is available at `/api/v1/openapi.yaml`. +By default, it returns OpenAPI 3.1 for broader compatibility. Use `?openapi_version=3.2` +for OpenAPI 3.2, which includes advanced features and endpoints like `/api/v1/notifications/live`. + +This machine-readable specification describes all available endpoints, request parameters, +response formats, and schemas. + +The OpenAPI specification can be used to: + +- Generate client libraries in various programming languages. +- Validate API requests and responses. +- Generate interactive API documentation. +- Test API endpoints. + ## Format overview The API response format is JSON. Every successful API request returns a `2xx` @@ -1013,6 +1029,7 @@ curl http://localhost:9090/api/v1/alerts ## Querying target metadata The following endpoint returns metadata about metrics currently scraped from targets. 
+The endpoint has the limitation that only metadata scraped directly from targets is returned; metadata sent over Remote-Write or OTLP to Prometheus is not included in this endpoint and will not show up on the UI in "Explore Metrics". This is **experimental** and might change in the future. ``` diff --git a/docs/querying/functions.md b/docs/querying/functions.md index 0cae149dd7..3a9b7025f8 100644 --- a/docs/querying/functions.md +++ b/docs/querying/functions.md @@ -568,6 +568,8 @@ While `info` normally automatically finds all matching info series, it's possible to restrict them by providing a `__name__` label matcher, e.g. `{__name__="target_info"}`. +Note that if there are any time series in `v` that match the `data-label-selector` (or the default `target_info` if that argument is not specified), they will be treated as info series and will be returned unchanged. + ### Limitations In its current iteration, `info` defaults to considering only info series with diff --git a/docs/querying/operators.md b/docs/querying/operators.md index b320d8e86e..b15c02aedc 100644 --- a/docs/querying/operators.md +++ b/docs/querying/operators.md @@ -47,9 +47,9 @@ special values like `NaN`, `+Inf`, and `-Inf`. scalar that is the result of the operator applied to both scalar operands. **Between an instant vector and a scalar**, the operator is applied to the -value of every data sample in the vector. +value of every data sample in the vector. -If the data sample is a float, the operation is performed between that float and the scalar. +If the data sample is a float, the operation is performed between that float and the scalar. For example, if an instant vector of float samples is multiplied by 2, the result is another vector of float samples in which every sample value of the original vector is multiplied by 2. @@ -81,8 +81,9 @@ following: **Between two instant vectors**, a binary arithmetic operator is applied to each entry in the LHS vector and its [matching element](#vector-matching) in the RHS vector. The result is propagated into the result vector with the -grouping labels becoming the output label set. Entries for which no matching -entry in the right-hand vector can be found are not part of the result. +grouping labels becoming the output label set. By default, series for which +no matching entry in the opposite vector can be found are not part of the +result. This behavior can be adjusted using [fill modifiers](#filling-in-missing-matches). If two float samples are matched, the arithmetic operator is applied to the two input values. @@ -97,7 +98,7 @@ If two histogram samples are matched, only `+` and `-` are valid operations, each adding or subtracting all matching bucket populations and the count and the sum of observations. All other operations result in the removal of the corresponding element from the output vector, flagged by an info-level -annotation. The `+` and -` operations should generally only be applied to gauge +annotation. The `+` and `-` operations should generally only be applied to gauge histograms, but PromQL allows them for counter histograms, too, to cover specific use cases, for which special attention is required to avoid problems with unaligned counter resets. (Certain incompatibilities of counter resets can @@ -106,7 +107,7 @@ two counter histograms results in a counter histogram. All other combination of operands and all subtractions result in a gauge histogram. **In any arithmetic binary operation involving vectors**, the metric name is
This occurs even if `__name__` is explicitly mentioned in `on` +dropped. This occurs even if `__name__` is explicitly mentioned in `on` (see https://github.com/prometheus/prometheus/issues/16631 for further discussion). **For any arithmetic binary operation that may result in a negative @@ -156,9 +157,9 @@ info-level annotation. applied to matching entries. Vector elements for which the expression is not true or which do not find a match on the other side of the expression get dropped from the result, while the others are propagated into a result vector -with the grouping labels becoming the output label set. +with the grouping labels becoming the output label set. -Matches between two float samples work as usual. +Matches between two float samples work as usual. Matches between a float sample and a histogram sample are invalid, and the corresponding element is removed from the result vector, flagged by an info-level @@ -171,8 +172,8 @@ comparison binary operations are again invalid. modifier changes the behavior in the following ways: * Vector elements which find a match on the other side of the expression but for - which the expression is false instead have the value `0` and vector elements - that do find a match and for which the expression is true have the value `1`. + which the expression is false instead have the value `0`, and vector elements + that do find a match and for which the expression is true have the value `1`. (Note that elements with no match or invalid operations involving histogram samples still return no result rather than the value `0`.) * The metric name is dropped. @@ -216,11 +217,10 @@ matching behavior: One-to-one and many-to-one/one-to-many. ### Vector matching keywords -These vector matching keywords allow for matching between series with different label sets -providing: +These vector matching keywords allow for matching between series with different label sets: -* `on` -* `ignoring` +* `on(