Implement I/O test pair evaluation with hidden test support

m-messer · claude · m-messer · commit 734196c1358a · 2026-05-22T11:47:14.000+01:00
Replaces trivial equality check with an AAT runner that executes student
Python code in a subprocess, feeds stdin per test case, and compares stdout
against expected output. Supports hidden tests (results shown without
revealing inputs/outputs) and handles runtime errors and timeouts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/evaluation_function/evaluation.py b/evaluation_function/evaluation.py
@@ -1,34 +1,113 @@
+import os
+import subprocess
+import sys
+import tempfile
 from typing import Any
 from lf_toolkit.evaluation import Result, Params
 
-def evaluation_function(
-    response: Any,
-    answer: Any,
-    params: Params,
-) -> Result:
-    """
-    Function used to evaluate a student response.
-    ---
-    The handler function passes three arguments to evaluation_function():
-
-    - `response` which are the answers provided by the student.
-    - `answer` which are the correct answers to compare against.
-    - `params` which are any extra parameters that may be useful,
-        e.g., error tolerances.
-
-    The output of this function is what is returned as the API response
-    and therefore must be JSON-encodable. It must also conform to the
-    response schema.
-
-    Any standard python library may be used, as well as any package
-    available on pip (provided it is added to requirements.txt).
-
-    The way you wish to structure you code (all in this function, or
-    split into many) is entirely up to you. All that matters are the
-    return types and that evaluation_function() is the main function used
-    to output the evaluation response.
-    """
-
-    return Result(
-        is_correct=response == answer
-    )
+# Wall-clock limit, in seconds, for one run of the submitted program.
+_TIMEOUT = 5
+
+
+def _run_code(code: str, stdin: str) -> tuple[str, str, bool, bool]:
+    """Run ``code`` as a Python script and capture its output.
+
+    NOTE(security): the submission runs unsandboxed, with the privileges of
+    this process; the only limit applied here is ``_TIMEOUT``.
+
+    Args:
+        code: Source of the program to run.
+        stdin: Text fed to the program's standard input.
+
+    Returns:
+        ``(stdout, stderr, timed_out, crashed)``. ``crashed`` is True when
+        the program exited with a non-zero status. On timeout both output
+        strings are empty and ``crashed`` is False.
+    """
+    # Python reads source files as UTF-8 by default, so write the script as
+    # UTF-8 rather than in the locale's preferred encoding.
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".py", delete=False, encoding="utf-8"
+    ) as f:
+        f.write(code)
+        tmpfile = f.name
+    try:
+        proc = subprocess.run(
+            # Use the running interpreter: a bare "python" may be missing
+            # from PATH or resolve to a different installation.
+            [sys.executable, tmpfile],
+            input=stdin,
+            capture_output=True,
+            text=True,
+            timeout=_TIMEOUT,
+        )
+        return proc.stdout, proc.stderr, False, proc.returncode != 0
+    except subprocess.TimeoutExpired:
+        return "", "", True, False
+    finally:
+        os.unlink(tmpfile)
+
+
+def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
+    """Grade a submitted program against stdin/stdout test cases.
+
+    Args:
+        response: The submitted Python source; converted with ``str()``.
+        answer: Unused; expected outputs are taken from ``params``.
+        params: Mapping whose ``"tests"`` entry is a list of dicts with the
+            keys ``"input"`` (stdin text), ``"expected_output"`` and
+            ``"hidden"`` (default False).
+
+    Returns:
+        A ``Result`` with one feedback item per test plus a summary.
+        ``is_correct`` is True only when every test passed. Whitespace at
+        the end of the output is ignored when comparing, and feedback for
+        hidden tests omits their input, expected output and program output.
+    """
+    tests = params.get("tests", [])
+
+    if not tests:
+        result = Result(is_correct=False)
+        result.add_feedback("error", "No test cases provided.")
+        return result
+
+    code = str(response)
+    passed = 0
+    result = Result()
+
+    for i, test in enumerate(tests, 1):
+        stdin = test.get("input", "")
+        expected = test.get("expected_output", "").rstrip()
+        hidden = test.get("hidden", False)
+        label = f"Hidden test {i}" if hidden else f"Test {i}"
+        fail_tag = "hidden_fail" if hidden else "fail"
+
+        stdout, stderr, timed_out, crashed = _run_code(code, stdin)
+        actual = stdout.rstrip()
+
+        if timed_out:
+            result.add_feedback(fail_tag, f"{label}: timed out after {_TIMEOUT}s.")
+        elif crashed:
+            # Decide on the exit status, not on which streams are empty, so
+            # a crash after partial output still reports its traceback.
+            details = stderr.strip()
+            msg = f"{label}: runtime error."
+            if details and not hidden:
+                msg += f"\n{details}"
+            result.add_feedback(fail_tag, msg)
+        elif actual == expected:
+            passed += 1
+            result.add_feedback("pass", f"{label}: passed.")
+        elif hidden:
+            result.add_feedback(fail_tag, f"{label}: failed.")
+        else:
+            result.add_feedback(fail_tag, (
+                f"{label}: failed.\n"
+                f"  Input:    {stdin.rstrip()}\n"
+                f"  Expected: {expected}\n"
+                f"  Got:      {actual}"
+            ))
+
+    result.is_correct = passed == len(tests)
+    result.add_feedback("summary", f"{passed}/{len(tests)} tests passed.")
+    return result
diff --git a/evaluation_function/evaluation_test.py b/evaluation_function/evaluation_test.py
@@ -1,30 +1,73 @@
 import unittest
 
-from .evaluation import Params, evaluation_function
+from .evaluation import evaluation_function
+
+# Student programs used as fixtures.
+_SQUARE_CODE = "n = int(input())\nprint(n * n)"
+_CRASH_CODE = "raise ValueError('oops')"
+_INFINITE_CODE = "while True: pass"
+
+
+def _params(*tests):
+    """Wrap test-case dicts in the params mapping the evaluator reads."""
+    return {"tests": list(tests)}
+
+
+def _test(inp, expected, hidden=False):
+    """Build one test-case dict."""
+    return {"input": inp, "expected_output": expected, "hidden": hidden}
+
 
 class TestEvaluationFunction(unittest.TestCase):
-    """
-    TestCase Class used to test the algorithm.
-    ---
-    Tests are used here to check that the algorithm written
-    is working as it should.
+    """Exercise evaluation_function with small student programs."""
 
-    It's best practise to write these tests first to get a
-    kind of 'specification' for how your algorithm should
-    work, and you should run these tests before committing
-    your code to AWS.
+    def test_all_pass(self):
+        params = _params(_test("5\n", "25\n"), _test("3\n", "9\n"))
+        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()
+
+        self.assertTrue(result["is_correct"])
+        self.assertIn("2/2 tests passed", result["feedback"])
+
+    def test_partial_fail(self):
+        params = _params(_test("5\n", "25\n"), _test("3\n", "99\n"))
+        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()
+
+        self.assertFalse(result["is_correct"])
+        self.assertIn("1/2 tests passed", result["feedback"])
+
+    def test_hidden_test_fail(self):
+        params = _params(_test("5\n", "999\n", hidden=True))
+        result = evaluation_function(_SQUARE_CODE, None, params).to_dict()
 
-    Read the docs on how to use unittest here:
-    https://docs.python.org/3/library/unittest.html
+        self.assertFalse(result["is_correct"])
+        self.assertIn("Hidden test 1", result["feedback"])
+        self.assertNotIn("999", result["feedback"])
+        self.assertNotIn("5", result["feedback"])
 
-    Use evaluation_function() to check your algorithm works
-    as it should.
-    """
+    def test_runtime_error(self):
+        params = _params(_test("5\n", "25\n"))
+        result = evaluation_function(_CRASH_CODE, None, params).to_dict()
 
-    def test_evaluation(self):
-        response, answer, params = "Hello, World", "Hello, World", Params()
+        self.assertFalse(result["is_correct"])
+        self.assertIn("runtime error", result["feedback"])
 
-        result = evaluation_function(response, answer, params).to_dict()
+    def test_no_tests(self):
+        result = evaluation_function(_SQUARE_CODE, None, {}).to_dict()
 
-        self.assertEqual(result.get("is_correct"), True)
-        self.assertFalse(result.get("feedback", False))
+        self.assertFalse(result["is_correct"])
+        self.assertIn("No test cases", result["feedback"])
+
+    def test_hidden_runtime_error_hides_traceback(self):
+        params = _params(_test("5\n", "25\n", hidden=True))
+        result = evaluation_function(_CRASH_CODE, None, params).to_dict()
+
+        self.assertFalse(result["is_correct"])
+        self.assertIn("Hidden test 1: runtime error", result["feedback"])
+        self.assertNotIn("oops", result["feedback"])
+
+    def test_timeout(self):
+        params = _params(_test("", ""))
+        result = evaluation_function(_INFINITE_CODE, None, params).to_dict()
+
+        self.assertFalse(result["is_correct"])
+        self.assertIn("timed out", result["feedback"])