-
Notifications
You must be signed in to change notification settings - Fork 0
140 lines (121 loc) · 4.41 KB
/
model-evaluation.yml
File metadata and controls
140 lines (121 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Weekly Model Evaluation
#
# Triggers:
#   - schedule:          every Monday at 06:00 UTC, using the default model list.
#   - workflow_dispatch: on demand, optionally with a custom comma-separated
#                        list of model IDs.
name: Weekly Model Evaluation

on:
  schedule:
    - cron: '0 6 * * 1'  # Every Monday at 6:00 UTC
  workflow_dispatch:
    inputs:
      models:
        description: 'Comma-separated list of model IDs to evaluate'
        required: false
        default: 'gpt-5-mini'

# Least-privilege default for the workflow token. A job that needs more
# (update-docs pushes a branch and opens a PR) declares its own `permissions`.
permissions:
  contents: read

# All runs share one concurrency group, so a newly triggered run cancels the
# run that is still in progress.
concurrency:
  group: model-evaluation
  cancel-in-progress: true
jobs:
  # ---------------------------------------------------------------------------
  # prepare: computes the values shared by the downstream jobs.
  #   outputs.matrix - JSON object {"model": [...]} consumed by the evaluate
  #                    job's strategy matrix.
  #   outputs.date   - run date (YYYY-MM-DD) used in the PR branch and title.
  # ---------------------------------------------------------------------------
  prepare:
    name: Prepare Model Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      date: ${{ steps.set-date.outputs.date }}
    steps:
      - name: Set date
        id: set-date
        run: echo "date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

      - name: Set matrix
        id: set-matrix
        env:
          # The user-supplied input is handed over as an environment variable
          # rather than interpolated into the script text, so it cannot inject
          # shell code. Scheduled runs have no input and use the default.
          MODELS: ${{ inputs.models || 'gpt-5-mini' }}
        run: |
          # Split on commas, trim surrounding whitespace and drop empty
          # entries so that "a, b," yields ["a", "b"].
          MATRIX=$(jq -cn --arg models "$MODELS" '
            {
              model: (
                $models
                | split(",")
                | map(gsub("^\\s+|\\s+$"; ""))
                | map(select(length > 0))
              )
            }
          ')
          # An empty matrix vector makes the dependent job fail with an
          # opaque error; fail here with a clear message instead.
          if [ "$(jq '.model | length' <<<"$MATRIX")" -eq 0 ]; then
            echo "::error::No model IDs were provided"
            exit 1
          fi
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  # ---------------------------------------------------------------------------
  # evaluate: runs the E2E test suite once per model in the matrix and uploads
  # the result file as an artifact named eval-results-<model>.
  # ---------------------------------------------------------------------------
  evaluate:
    name: Evaluate ${{ matrix.model }}
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
      # Let the remaining models finish even if one of them fails.
      fail-fast: false
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      - name: Setup proto files
        run: make proto-setup

      - name: Generate proto descriptors
        run: make proto-generate

      - name: Download WireMock
        run: make mock-download

      - name: Run E2E tests
        env:
          # The same secret is exported under both names.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL_NAME: "openai:${{ matrix.model }}"
        run: make e2e-test

      - name: Upload results
        # Attempt the upload even when the test step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ matrix.model }}
          # Presumably written by `make e2e-test` - verify against the Makefile.
          path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
          if-no-files-found: error

  # ---------------------------------------------------------------------------
  # update-docs: downloads every eval-results-* artifact, feeds each result
  # file to scripts/update-model-evaluation.sh and opens a PR when
  # docs/model-evaluation.md changed.
  #
  # NOTE(review): with the default job condition this job is skipped when any
  # evaluate leg fails, so the "No results found" warning below is only
  # reachable if that condition is relaxed - confirm this is intended.
  # ---------------------------------------------------------------------------
  update-docs:
    name: 'Update Documentation & Create PR'
    needs: [prepare, evaluate]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download all results
        uses: actions/download-artifact@v8
        with:
          pattern: eval-results-*
          path: eval-results

      - name: Update model evaluation docs
        env:
          # Iterate over exactly the model list the evaluate matrix used,
          # passed via the environment rather than interpolated into the script.
          MATRIX: ${{ needs.prepare.outputs.matrix }}
        run: |
          mapfile -t MODEL_IDS < <(jq -r '.model[]' <<<"$MATRIX")
          for MODEL in "${MODEL_IDS[@]}"; do
            # download-artifact@v8 creates per-artifact subdirectories only when
            # multiple artifacts are downloaded. With a single artifact it extracts
            # directly into the target path. Check both locations.
            RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
            # Only fall back to the flat layout when a single model was
            # evaluated; otherwise a flat file may belong to a different model.
            if [ ! -f "$RESULTS_FILE" ] && [ "${#MODEL_IDS[@]}" -eq 1 ]; then
              RESULTS_FILE="eval-results/mcpchecker-stackrox-mcp-e2e-out.json"
            fi
            if [ -f "$RESULTS_FILE" ]; then
              echo "Updating docs for model: ${MODEL}"
              ./scripts/update-model-evaluation.sh \
                --model-id "${MODEL}" \
                --results "${RESULTS_FILE}"
            else
              echo "::warning::No results found for model ${MODEL}"
            fi
          done

      - name: Clean up eval results
        # Remove the downloaded artifacts from the working tree before the
        # PR step runs.
        run: rm -rf eval-results

      - name: Check for changes
        id: check-changes
        run: |
          # `git status --porcelain` also reports the file when it is new and
          # untracked, which `git diff --quiet` does not.
          if [ -n "$(git status --porcelain -- docs/model-evaluation.md)" ]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          else
            echo "changed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Create Pull Request
        if: steps.check-changes.outputs.changed == 'true'
        uses: peter-evans/create-pull-request@v7
        with:
          branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
          commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
          title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
          body: |
            Automated weekly model evaluation update.
            **Models evaluated:** ${{ inputs.models || 'gpt-5-mini' }}
            **Date:** ${{ needs.prepare.outputs.date }}
            This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
          base: main