Skip to content

Commit 734196c

Browse files
m-messer and claude
committed
Implement I/O test pair evaluation with hidden test support
Replaces trivial equality check with an AAT runner that executes student Python code in a subprocess, feeds stdin per test case, and compares stdout against expected output. Supports hidden tests (results shown without revealing inputs/outputs) and handles runtime errors and timeouts. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 7016703 commit 734196c

2 files changed

Lines changed: 116 additions & 51 deletions

File tree

evaluation_function/evaluation.py

Lines changed: 72 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,75 @@
1+
import os
2+
import subprocess
3+
import tempfile
14
from typing import Any
25
from lf_toolkit.evaluation import Result, Params
36

4-
def evaluation_function(
5-
response: Any,
6-
answer: Any,
7-
params: Params,
8-
) -> Result:
9-
"""
10-
Function used to evaluate a student response.
11-
---
12-
The handler function passes three arguments to evaluation_function():
13-
14-
- `response` which are the answers provided by the student.
15-
- `answer` which are the correct answers to compare against.
16-
- `params` which are any extra parameters that may be useful,
17-
e.g., error tolerances.
18-
19-
The output of this function is what is returned as the API response
20-
and therefore must be JSON-encodable. It must also conform to the
21-
response schema.
22-
23-
Any standard python library may be used, as well as any package
24-
available on pip (provided it is added to requirements.txt).
25-
26-
The way you wish to structure your code (all in this function, or
27-
split into many) is entirely up to you. All that matters are the
28-
return types and that evaluation_function() is the main function used
29-
to output the evaluation response.
30-
"""
31-
32-
return Result(
33-
is_correct=response == answer
34-
)
7+
# Wall-clock limit, in seconds, applied to each individual run of student code.
_TIMEOUT = 5


def _run_code(code: str, stdin: str) -> tuple[str, str, bool]:
    """Run ``code`` as a Python script in a subprocess and capture its output.

    The source is written to a temporary ``.py`` file, executed with ``stdin``
    supplied on standard input, and the temporary file is removed afterwards
    whether or not the run succeeded.

    Args:
        code: Python source to execute.
        stdin: Text fed to the script's standard input.

    Returns:
        A ``(stdout, stderr, timed_out)`` tuple. When the run exceeds
        ``_TIMEOUT`` seconds, both strings are empty and ``timed_out`` is
        ``True``; otherwise ``timed_out`` is ``False``.
    """
    # Local import keeps this block self-contained; `sys` is only needed to
    # locate the interpreter that is running this module.
    import sys

    # Bare "python" may be missing from PATH or resolve to a different
    # interpreter; prefer the one running us and fall back only if unknown.
    interpreter = sys.executable or "python"

    # delete=False lets us close the file before the child process opens it;
    # the `finally` below is responsible for removing it.
    # Python reads source files as UTF-8 by default, so write it as UTF-8
    # instead of relying on the locale's preferred encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(code)
        tmpfile = f.name

    try:
        proc = subprocess.run(
            [interpreter, tmpfile],
            input=stdin,
            capture_output=True,
            text=True,
            timeout=_TIMEOUT,
        )
    except subprocess.TimeoutExpired:
        return "", "", True
    finally:
        os.unlink(tmpfile)

    return proc.stdout, proc.stderr, False
27+
28+
29+
def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
    """Grade student code by running it against stdin/stdout test pairs.

    Each entry of ``params["tests"]`` is a mapping with the keys ``"input"``
    (text fed to stdin), ``"expected_output"`` (text compared against stdout,
    ignoring trailing whitespace) and ``"hidden"`` (when true, feedback for
    that test omits the input, expected output, actual output and stderr).

    Args:
        response: The student's Python source; converted with ``str()``.
        answer: Unused; grading is driven entirely by ``params["tests"]``.
        params: Mapping that carries the ``"tests"`` list.

    Returns:
        A ``Result`` with one feedback entry per test plus a ``"summary"``
        entry. ``is_correct`` is true only when every test passed. When no
        tests are supplied the result is incorrect with a single ``"error"``
        feedback entry.
    """

    def _as_text(value: Any) -> str:
        # Params typically originate from JSON, so a field may arrive as
        # null or as a number; coerce instead of crashing on .rstrip() or
        # inside subprocess.run(text=True).
        return "" if value is None else str(value)

    tests = params.get("tests", [])

    if not tests:
        result = Result(is_correct=False)
        result.add_feedback("error", "No test cases provided.")
        return result

    code = str(response)  # Loop-invariant: convert once.
    passed = 0
    result = Result()

    for i, test in enumerate(tests, 1):
        stdin = _as_text(test.get("input", ""))
        # Captured stdout is read in text mode, where "\r\n" and "\r" are
        # already translated to "\n"; normalise the expectation the same way
        # so CRLF-authored test data can still match.
        expected = (
            _as_text(test.get("expected_output", ""))
            .replace("\r\n", "\n")
            .replace("\r", "\n")
            .rstrip()
        )
        hidden = test.get("hidden", False)
        label = f"Hidden test {i}" if hidden else f"Test {i}"
        fail_tag = "hidden_fail" if hidden else "fail"

        stdout, stderr, timed_out = _run_code(code, stdin)
        actual = stdout.rstrip()

        if timed_out:
            result.add_feedback(fail_tag, f"{label}: timed out after {_TIMEOUT}s.")
        elif stderr and not stdout:
            # Only stderr was produced: treat as a crash. The traceback is
            # withheld for hidden tests.
            msg = f"{label}: runtime error."
            if not hidden:
                msg = f"{msg}\n{stderr.strip()}"
            result.add_feedback(fail_tag, msg)
        elif actual == expected:
            passed += 1
            result.add_feedback("pass", f"{label}: passed.")
        elif hidden:
            result.add_feedback(fail_tag, f"{label}: failed.")
        else:
            result.add_feedback(fail_tag, (
                f"{label}: failed.\n"
                f" Input: {stdin.rstrip()}\n"
                f" Expected: {expected}\n"
                f" Got: {actual}"
            ))

    result.is_correct = passed == len(tests)
    result.add_feedback("summary", f"{passed}/{len(tests)} tests passed.")
    return result
Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,54 @@
11
import unittest
22

3-
from .evaluation import Params, evaluation_function
3+
from .evaluation import evaluation_function
4+
5+
# Student program that reads an integer from stdin and prints its square.
_SQUARE_CODE = "n = int(input())\nprint(n * n)"
6+
# Student program that raises an exception as soon as it runs.
_CRASH_CODE = "raise ValueError('oops')"
7+
# Student program that loops forever and never exits on its own.
_INFINITE_CODE = "while True: pass"
8+
9+
10+
def _params(*tests):
    """Bundle the given test-case dicts into a params mapping under ``"tests"``."""
    collected = [*tests]
    return dict(tests=collected)
12+
13+
14+
def _test(inp, expected, hidden=False):
    """Build a single test-case dict from stdin text, expected stdout and a hidden flag."""
    case = dict(input=inp, expected_output=expected)
    case["hidden"] = hidden
    return case
16+
417

518
class TestEvaluationFunction(unittest.TestCase):
    """Tests for the I/O test-pair runner behind ``evaluation_function``.

    Each test runs a small student program against one or more
    ``_test(...)`` cases and checks ``is_correct`` and the feedback text
    of the resulting dict.
    """

    def test_all_pass(self):
        params = _params(_test("5\n", "25\n"), _test("3\n", "9\n"))
        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()

        self.assertTrue(result["is_correct"])
        self.assertIn("2/2 tests passed", result["feedback"])

    def test_partial_fail(self):
        params = _params(_test("5\n", "25\n"), _test("3\n", "99\n"))
        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()

        self.assertFalse(result["is_correct"])
        self.assertIn("1/2 tests passed", result["feedback"])

    def test_visible_fail_shows_details(self):
        # A visible failing test reports what was expected and what was
        # actually printed.
        params = _params(_test("3\n", "99\n"))
        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()

        self.assertFalse(result["is_correct"])
        self.assertIn("Expected: 99", result["feedback"])
        self.assertIn("Got: 9", result["feedback"])

    def test_hidden_test_pass(self):
        params = _params(_test("5\n", "25\n", hidden=True))
        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()

        self.assertTrue(result["is_correct"])
        self.assertIn("Hidden test 1: passed", result["feedback"])

    def test_hidden_test_fail(self):
        params = _params(_test("5\n", "999\n", hidden=True))
        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()

        self.assertFalse(result["is_correct"])
        self.assertIn("Hidden test 1", result["feedback"])
        # Hidden tests must not leak the expected output or the input.
        self.assertNotIn("999", result["feedback"])
        self.assertNotIn("5", result["feedback"])

    def test_runtime_error(self):
        params = _params(_test("5\n", "25\n"))
        result = evaluation_function(_CRASH_CODE, None, params).to_dict()

        self.assertFalse(result["is_correct"])
        self.assertIn("runtime error", result["feedback"])

    def test_timeout(self):
        # NOTE: this test waits for the runner's full timeout to elapse, so
        # it is noticeably slower than the others.
        params = _params(_test("", ""))
        result = evaluation_function(_INFINITE_CODE, None, params).to_dict()

        self.assertFalse(result["is_correct"])
        self.assertIn("timed out", result["feedback"])

    def test_no_tests(self):
        result = evaluation_function(_SQUARE_CODE, None, {}).to_dict()

        self.assertFalse(result["is_correct"])
        self.assertIn("No test cases", result["feedback"])

0 commit comments

Comments
 (0)