From 8edac0b300fa1bedca5b4aac85521e7c8bffc55c Mon Sep 17 00:00:00 2001 From: michaeljguarino Date: Mon, 3 Mar 2025 15:54:53 -0500 Subject: [PATCH] Troubleshoot vector indexing Need to figure out what's going on while e2e testing --- .github/workflows/ai-proxy-cd.yaml | 4 +- .github/workflows/ai-proxy-ci.yaml | 2 +- .github/workflows/assets.yaml | 6 +-- .github/workflows/codeql.yml | 2 +- .github/workflows/controller-cd.yaml | 4 +- .github/workflows/controller-ci.yaml | 2 +- .github/workflows/demo-cd.yaml | 40 ++++++------------- .github/workflows/oci-auth-cd.yaml | 4 +- .github/workflows/oci-auth-ci.yaml | 2 +- .github/workflows/pr-labels.yaml | 2 +- .github/workflows/publish.yaml | 10 ++--- .github/workflows/test.yaml | 6 +-- .../flaky-service/api/behavior_modifiers.go | 4 +- lib/console/ai/evidence/context.ex | 1 + lib/console/ai/evidence/vector.ex | 31 ++++++++++++-- lib/console/ai/provider.ex | 3 +- lib/console/ai/pubsub/vector/consumer.ex | 2 +- lib/console/ai/pubsub/vector/protocol.ex | 4 +- lib/console/ai/vector/content.ex | 8 ++-- lib/console/ai/vector/elastic.ex | 4 +- lib/console/ai/vector_store.ex | 5 ++- lib/console/deployments/pr/impl/github.ex | 2 +- ...service.yaml => flaky-service.yaml.liquid} | 2 +- test/console/ai/cron_test.exs | 1 + .../ai/pubsub/vector/consumer_test.exs | 4 +- 25 files changed, 84 insertions(+), 71 deletions(-) rename test-apps/flaky-service/{flaky-service.yaml => flaky-service.yaml.liquid} (94%) diff --git a/.github/workflows/ai-proxy-cd.yaml b/.github/workflows/ai-proxy-cd.yaml index 9bd010cee5..887968b3a4 100644 --- a/.github/workflows/ai-proxy-cd.yaml +++ b/.github/workflows/ai-proxy-cd.yaml @@ -27,7 +27,7 @@ env: jobs: test: name: Unit test - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash @@ -44,7 +44,7 @@ jobs: publish-docker: name: Build and push ai-proxy container - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash diff --git a/.github/workflows/ai-proxy-ci.yaml b/.github/workflows/ai-proxy-ci.yaml index 92e0e171e5..16a6347caa 100644 --- a/.github/workflows/ai-proxy-ci.yaml +++ b/.github/workflows/ai-proxy-ci.yaml @@ -34,7 +34,7 @@ jobs: check: name: Check - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash diff --git a/.github/workflows/assets.yaml b/.github/workflows/assets.yaml index c894bde764..0e4d5be130 100644 --- a/.github/workflows/assets.yaml +++ b/.github/workflows/assets.yaml @@ -16,7 +16,7 @@ on: jobs: # e2e: # name: End-to-end test - # runs-on: ubuntu-20.04 + # runs-on: ubuntu-latest # env: # CYPRESS_EMAIL: ${{ secrets.CYPRESS_EMAIL }} # CYPRESS_PASSWORD: ${{ secrets.CYPRESS_PASSWORD }} @@ -50,7 +50,7 @@ jobs: test: name: Unit test - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash @@ -64,7 +64,7 @@ jobs: - run: yarn test lint: name: Lint - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 32b7648bf4..e01355bc6d 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -17,7 +17,7 @@ jobs: contents: read security-events: write name: CodeQL - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: fail-fast: false matrix: diff --git a/.github/workflows/controller-cd.yaml b/.github/workflows/controller-cd.yaml index 2d3bf31c91..3dbd015676 100644 --- a/.github/workflows/controller-cd.yaml +++ b/.github/workflows/controller-cd.yaml @@ -21,7 +21,7 @@ env: jobs: test: name: Unit test - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash @@ -41,7 +41,7 @@ jobs: run: PATH=$PATH:$GOPATH/bin make test publish-docker: name: Build and push controller container - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash diff --git a/.github/workflows/controller-ci.yaml b/.github/workflows/controller-ci.yaml index 4d835e738c..0377d411c1 100644 --- a/.github/workflows/controller-ci.yaml +++ b/.github/workflows/controller-ci.yaml @@ -37,7 +37,7 @@ jobs: - run: PATH=$PATH:$GOPATH/bin make build unit-test: name: Unit tests - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash diff --git a/.github/workflows/demo-cd.yaml b/.github/workflows/demo-cd.yaml index 1f2fce037f..e2c7803805 100644 --- a/.github/workflows/demo-cd.yaml +++ b/.github/workflows/demo-cd.yaml @@ -7,9 +7,6 @@ on: paths: - ".github/workflows/flaky-service-cd.yaml" - "go/demo/**" - push: - tags: - - 'go/demo/v*.*.*' permissions: contents: read @@ -22,7 +19,7 @@ env: jobs: publish-flaky-service-docker: name: Build and push flaky-service container - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash @@ -35,18 +32,18 @@ jobs: - uses: actions/checkout@v4.2.2 with: fetch-depth: 0 - - id: meta-flaky-service - uses: docker/metadata-action@v5 + - name: Docker meta + id: meta + uses: docker/metadata-action@v4 with: + # list of Docker images to use as base name for tags images: | ghcr.io/pluralsh/flaky-service - tags: v1.0.0 - - id: meta-flaky-service-sidecar - uses: docker/metadata-action@v5 - with: - images: | - ghcr.io/pluralsh/flaky-service-sidecar - tags: v1.0.0 + # generate Docker tags based on the following events/attributes + tags: | + type=sha + type=ref,event=pr + type=ref,event=branch - uses: docker/login-action@v3 with: registry: ghcr.io @@ -59,21 +56,8 @@ jobs: context: "./go/demo/flaky-service" file: "./go/demo/flaky-service/Dockerfile" push: true - tags: ${{ steps.meta-flaky-service.outputs.tags }} - labels: ${{ steps.meta-flaky-service.outputs.labels }} - platforms: linux/amd64, linux/arm64 - cache-from: type=gha - cache-to: type=gha, mode=max - build-args: | - GIT_COMMIT=${{ github.sha }} - VERSION=${{ steps.meta.outputs.version }} - - uses: docker/build-push-action@v5 - with: - context: "./go/demo/flaky-service" - file: "./go/demo/flaky-service/Dockerfile.sidecar" - push: true - tags: ${{ steps.meta-flaky-service-sidecar.outputs.tags }} - labels: ${{ steps.meta-flaky-service-sidecar.outputs.labels }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} platforms: linux/amd64, linux/arm64 cache-from: type=gha cache-to: type=gha, mode=max diff --git a/.github/workflows/oci-auth-cd.yaml b/.github/workflows/oci-auth-cd.yaml index 6d0815149b..215fa17e5d 100644 --- a/.github/workflows/oci-auth-cd.yaml +++ b/.github/workflows/oci-auth-cd.yaml @@ -21,7 +21,7 @@ env: jobs: test: name: Unit test - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash @@ -37,7 +37,7 @@ jobs: - run: PATH=$PATH:$GOPATH/bin make test publish-docker: name: Build and push oci-auth container - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash diff --git a/.github/workflows/oci-auth-ci.yaml b/.github/workflows/oci-auth-ci.yaml index 4d7cca914e..3c9b4bf461 100644 --- a/.github/workflows/oci-auth-ci.yaml +++ b/.github/workflows/oci-auth-ci.yaml @@ -36,7 +36,7 @@ jobs: - run: PATH=$PATH:$GOPATH/bin make build unit-test: name: Unit tests - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest defaults: run: shell: bash diff --git a/.github/workflows/pr-labels.yaml b/.github/workflows/pr-labels.yaml index a45283a044..812c0178e2 100644 --- a/.github/workflows/pr-labels.yaml +++ b/.github/workflows/pr-labels.yaml @@ -6,7 +6,7 @@ on: jobs: label: name: Check that PR has required labels - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: mheap/github-action-required-labels@v2 with: diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index c56e61d7cc..47557cca16 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -6,7 +6,7 @@ on: - "v*.*.*" jobs: test: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: erlef/setup-beam@v1 @@ -45,7 +45,7 @@ jobs: if: always() publish: name: Build and push Console container - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest needs: test permissions: contents: "read" @@ -133,7 +133,7 @@ jobs: if: always() release: name: Create GitHub release - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest needs: publish permissions: contents: write @@ -148,7 +148,7 @@ jobs: draft: false bump: name: Bump Chart Version - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest needs: [release] permissions: contents: write @@ -207,7 +207,7 @@ jobs: base: master bump-rapid: name: Bump Rapid Chart Version - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest needs: [release] permissions: contents: write diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index debbdcae81..01bf9ef5f3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -8,7 +8,7 @@ on: jobs: build: name: Test Build Docker image - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 @@ -64,7 +64,7 @@ jobs: test: name: Test - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: erlef/setup-beam@v1 @@ -103,7 +103,7 @@ jobs: if: always() updateSchema: name: Check that Schema is up to date - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: erlef/setup-beam@v1 diff --git a/go/demo/flaky-service/api/behavior_modifiers.go b/go/demo/flaky-service/api/behavior_modifiers.go index 604e817f72..ea5476332f 100644 --- a/go/demo/flaky-service/api/behavior_modifiers.go +++ b/go/demo/flaky-service/api/behavior_modifiers.go @@ -3,6 +3,7 @@ package api import ( "log/slog" "net/http" + "runtime/debug" "time" "github.com/pluralsh/console/go/demo/flaky-service/metrics" @@ -26,13 +27,14 @@ func HandleRequestTimestampModulus(timestampModulus int64) http.HandlerFunc { if time_now%timestampModulus == 0 { slog.Error("found unknown error, returning status.InternalServerError", "time_now", time_now, "modulus", timestampModulus) + slog.Error("dumping stacktrace", "stacktrace", string(debug.Stack())) metrics.IncrementRequestCounter(http.StatusInternalServerError, r.Method) w.WriteHeader(http.StatusInternalServerError) w.Header().Set("Content-Type", "application/json") w.Write([]byte(`{"message": "req failed"}`)) } else { - slog.Info("Timestamp is not multiple of modulus, returning status.OK", "time_now", time_now, "modulus", timestampModulus) + slog.Info("Everything seems fine, returning status.OK", "time_now", time_now, "modulus", timestampModulus) metrics.IncrementRequestCounter(http.StatusOK, r.Method) w.WriteHeader(http.StatusOK) diff --git a/lib/console/ai/evidence/context.ex b/lib/console/ai/evidence/context.ex index 3e79d0a95c..273b4d4a29 100644 --- a/lib/console/ai/evidence/context.ex +++ b/lib/console/ai/evidence/context.ex @@ -15,6 +15,7 @@ defmodule Console.AI.Evidence.Context do def claims(%__MODULE__{} = ctx, %{evidence: [_ | _] = evidence}), do: evidence(ctx, evidence) def claims(%__MODULE__{} = ctx, _), do: ctx + def prompt(ctx, {_, nil}), do: ctx def prompt(%__MODULE__{history: hist} = ctx, msg), do: %{ctx | history: append(hist, msg)} def reduce(%__MODULE__{} = ctx, enum, fun) when is_function(fun, 2), do: Enum.reduce(enum, ctx, fun) diff --git a/lib/console/ai/evidence/vector.ex b/lib/console/ai/evidence/vector.ex index 33491a2462..63cdbeb0a1 100644 --- a/lib/console/ai/evidence/vector.ex +++ b/lib/console/ai/evidence/vector.ex @@ -22,7 +22,7 @@ defmodule Console.AI.Evidence.Vector do with true <- VectorStore.enabled?(), {:ok, %Vector{query: query}} <- use_vector(ctx.history), {:ok, [_ | _] = vdata} <- VectorStore.fetch(query) do - Context.prompt(ctx, {:user, "I've also found some relevent external data that could add additional context to what caused the issue:"}) + Context.prompt(ctx, {:user, "I've also found some relevent data that could add additional context to what caused the issue.:"}) |> Context.reduce(vdata, &Context.prompt(&2, {:user, vector_prompt(&1)})) |> Context.evidence(vector_evidence(vdata)) else @@ -32,9 +32,32 @@ defmodule Console.AI.Evidence.Vector do end end - defp vector_prompt(%VectorStore.Response{alert_resolution: alert_resolution}), - do: "A prior alert resolution with data like so: #{json!(alert_resolution)}" - defp vector_prompt(%VectorStore.Response{pr_file: pr_file}), do: "A file from a given pr with data like so: #{json!(pr_file)}" + defp vector_prompt(%VectorStore.Response{type: :alert, alert_resolution: alert_resolution}), + do: "A prior alert resolution with data like so that likely was caused by the same issue: #{json!(alert_resolution)}" + defp vector_prompt(%VectorStore.Response{type: :pr, pr_file: pr_file}) do + """ + A file from a given pull request with information like so, containing a possible code change that caused the issue, described below: + + Pull Request URL: #{pr_file.url} + Repo: #{pr_file.repo} + PR Title: #{pr_file.title} + Commit SHA: #{pr_file.sha} + Filename: #{pr_file.filename} + + The full contents of the file is: + + ``` + #{pr_file.contents} + ``` + + The git patch of the change is: + + ``` + #{pr_file.patch} + ``` + """ + end + defp vector_prompt(_), do: nil defp vector_evidence(vdata) do Enum.map(vdata, fn diff --git a/lib/console/ai/provider.ex b/lib/console/ai/provider.ex index e6b5dc108e..71e6d576ad 100644 --- a/lib/console/ai/provider.ex +++ b/lib/console/ai/provider.ex @@ -12,8 +12,7 @@ defmodule Console.AI.Provider do You're a seasoned devops engineer with experience in Kubernetes, GitOps and Infrastructure As Code, and need to give a concise but clear explanation of issues in your companies kubernetes infrastructure. The user is not necessarily an expert in the domain, so please provide as much documentation and evidence as is necessary to explain what issue they're - facing. Please provide a clear summary and any details to debug what's going on with the case provided. You should guide users - to implement GitOps best practices, so avoid telling them to manually modify resources via kubectl, helm or terraform commands directly. + facing. Please provide a clear summary and any details to debug what's going on with the case provided. """} @summary """ diff --git a/lib/console/ai/pubsub/vector/consumer.ex b/lib/console/ai/pubsub/vector/consumer.ex index c8bf2834fa..1b2ed0613a 100644 --- a/lib/console/ai/pubsub/vector/consumer.ex +++ b/lib/console/ai/pubsub/vector/consumer.ex @@ -12,7 +12,7 @@ defmodule Console.AI.PubSub.Vector.Consumer do end end - defp insert({:ok, [_ | _] = resources}), do: Enum.each(resources, &VectorStore.insert/1) + defp insert({:ok, resources}) when is_list(resources), do: Enum.each(resources, &VectorStore.insert/1) defp insert({:ok, res}), do: VectorStore.insert(res) defp insert(pass), do: pass end diff --git a/lib/console/ai/pubsub/vector/protocol.ex b/lib/console/ai/pubsub/vector/protocol.ex index 36b6372fd0..dc3da4e608 100644 --- a/lib/console/ai/pubsub/vector/protocol.ex +++ b/lib/console/ai/pubsub/vector/protocol.ex @@ -13,14 +13,16 @@ defimpl Console.AI.PubSub.Vectorizable, for: Console.PubSub.ScmWebhook do alias Console.AI.Tool alias Console.Deployments.Pr.Dispatcher alias Console.Schema.{ScmWebhook, ScmConnection} + require Logger def resource(%@for{ - item: %{"action" => "pull_request", "pull_request" => %{"merged" => true} = pr}, + item: %{"action" => "closed", "pull_request" => %{"merged" => true} = pr}, actor: %ScmWebhook{type: :github} }) do with %ScmConnection{} = conn <- Tool.scm_connection(), do: Dispatcher.files(conn, pr) end + def resource(_), do: :ok end diff --git a/lib/console/ai/vector/content.ex b/lib/console/ai/vector/content.ex index c736d0d8bd..8d7867aef3 100644 --- a/lib/console/ai/vector/content.ex +++ b/lib/console/ai/vector/content.ex @@ -6,11 +6,11 @@ defmodule Console.AI.Vector.Content do def content(data), do: {Storable.datatype(data), Storable.content(data)} - def decode("pr_file", data), do: %Response{pr_file: File.new(data)} - def decode(:pr_file, data), do: %Response{pr_file: File.new(data)} + def decode("pr_file", data), do: %Response{type: :pr, pr_file: File.new(data)} + def decode(:pr_file, data), do: %Response{type: :pr, pr_file: File.new(data)} - def decode("alert_resolution", data), do: %Response{alert_resolution: AlertResolution.Mini.new(data)} - def decode(:alert_resolution, data), do: %Response{alert_resolution: AlertResolution.Mini.new(data)} + def decode("alert_resolution", data), do: %Response{type: :alert,alert_resolution: AlertResolution.Mini.new(data)} + def decode(:alert_resolution, data), do: %Response{type: :alert, alert_resolution: AlertResolution.Mini.new(data)} def decode(_, _), do: nil end diff --git a/lib/console/ai/vector/elastic.ex b/lib/console/ai/vector/elastic.ex index 138da538ba..d784eeff51 100644 --- a/lib/console/ai/vector/elastic.ex +++ b/lib/console/ai/vector/elastic.ex @@ -78,9 +78,7 @@ defmodule Console.AI.Vector.Elastic do defp vector_query(embedding) do %{ size: 5, - query: %{ - knn: %{field: "passages.vector", query_vector: embedding, k: 5} - } + knn: %{field: "passages.vector", query_vector: embedding, k: 5} } end diff --git a/lib/console/ai/vector_store.ex b/lib/console/ai/vector_store.ex index 4c09cfa631..3cb11e9d0a 100644 --- a/lib/console/ai/vector_store.ex +++ b/lib/console/ai/vector_store.ex @@ -8,12 +8,15 @@ defmodule Console.AI.VectorStore do alias Console.Deployments.Settings defmodule Response do + @type type :: :alert | :pr + @type t :: %__MODULE__{ + type: type, pr_file: Console.Deployments.Pr.File.t, alert_resolution: Console.Schema.AlertResolution.Mini.t } - defstruct [:pr_file, :alert_resolution] + defstruct [:pr_file, :alert_resolution, :type] end @type store :: Console.AI.Vector.Elastic.t diff --git a/lib/console/deployments/pr/impl/github.ex b/lib/console/deployments/pr/impl/github.ex index 1ee484d8b6..bc9d21ced5 100644 --- a/lib/console/deployments/pr/impl/github.ex +++ b/lib/console/deployments/pr/impl/github.ex @@ -98,7 +98,7 @@ defmodule Console.Deployments.Pr.Impl.Github do end defp get_content(client, url) when is_binary(url) do - case HTTPoison.get(url, [{"authorization", "Token #{client.auth.access_token}"}]) do + case HTTPoison.get(url, [{"authorization", "Token #{client.auth.access_token}"}], follow_redirect: true) do {:ok, %HTTPoison.Response{status_code: code, body: content}} when code >= 200 and code < 300 -> content _ -> nil diff --git a/test-apps/flaky-service/flaky-service.yaml b/test-apps/flaky-service/flaky-service.yaml.liquid similarity index 94% rename from test-apps/flaky-service/flaky-service.yaml rename to test-apps/flaky-service/flaky-service.yaml.liquid index 10059e5ee1..083cb5b7a9 100644 --- a/test-apps/flaky-service/flaky-service.yaml +++ b/test-apps/flaky-service/flaky-service.yaml.liquid @@ -15,7 +15,7 @@ spec: spec: containers: - name: flaky-service - image: ghcr.io/pluralsh/flaky-service:v1.0.0 + image: ghcr.io/pluralsh/flaky-service:{{ configuration.tag | default: "v1.0.0" }} imagePullPolicy: Always ports: - containerPort: 8080 # API Server diff --git a/test/console/ai/cron_test.exs b/test/console/ai/cron_test.exs index 1727369e61..a33d737858 100644 --- a/test/console/ai/cron_test.exs +++ b/test/console/ai/cron_test.exs @@ -331,6 +331,7 @@ defmodule Console.AI.CronTest do end) expect(Console.AI.VectorStore, :fetch, fn "some query" -> {:ok, [ %Console.AI.VectorStore.Response{ + type: :pr, pr_file: %Console.Deployments.Pr.File{ url: "https://github.com/pr/url", repo: "some/repo", diff --git a/test/console/ai/pubsub/vector/consumer_test.exs b/test/console/ai/pubsub/vector/consumer_test.exs index dd0ba9aefe..2316d120de 100644 --- a/test/console/ai/pubsub/vector/consumer_test.exs +++ b/test/console/ai/pubsub/vector/consumer_test.exs @@ -31,11 +31,11 @@ defmodule Console.AI.PubSub.Vector.ConsumerTest do "patch" => "example diff", }], %HTTPoison.Response{status_code: 200}} end) - expect(HTTPoison, :get, fn "https://test.url", _ -> {:ok, %HTTPoison.Response{status_code: 200, body: "terraform"}} end) + expect(HTTPoison, :get, fn "https://test.url", _, [follow_redirect: true] -> {:ok, %HTTPoison.Response{status_code: 200, body: "terraform"}} end) event = %PubSub.ScmWebhook{ item: %{ - "action" => "pull_request", + "action" => "closed", "pull_request" => %{"merged" => true, "html_url" => "https://github.com/owner/repo/pull/1"}, }, actor: hook