Skip to content

Commit 2ae0ba0

Browse files
m-messer and claude committed
Display student output in Markdown code blocks, Jupyter-style
Each visible test now shows the actual stdout in a fenced code block below the input, mirroring a Jupyter notebook cell. Failing tests also show the expected output in a separate block. When no test cases are provided, the code still runs and its output is displayed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 734196c commit 2ae0ba0

2 files changed

Lines changed: 41 additions & 17 deletions

File tree

evaluation_function/evaluation.py

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,26 @@ def _run_code(code: str, stdin: str) -> tuple[str, str, bool]:
2626
os.unlink(tmpfile)
2727

2828

29+
def _code_block(label: str, content: str) -> str:
30+
return f"{label}:\n```\n{content}\n```"
31+
32+
2933
def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
3034
tests = params.get("tests", [])
35+
result = Result()
3136

3237
if not tests:
33-
result = Result(is_correct=False)
34-
result.add_feedback("error", "No test cases provided.")
38+
stdout, stderr, timed_out = _run_code(str(response), "")
39+
if timed_out:
40+
result.add_feedback("error", f"Code timed out after {_TIMEOUT}s.")
41+
elif stderr and not stdout:
42+
result.add_feedback("error", _code_block("Error", stderr.strip()))
43+
else:
44+
output = stdout.rstrip() or "(no output)"
45+
result.add_feedback("output", _code_block("Output", output))
3546
return result
3647

3748
passed = 0
38-
result = Result()
3949

4050
for i, test in enumerate(tests, 1):
4151
stdin = test.get("input", "")
@@ -44,31 +54,42 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
4454

4555
stdout, stderr, timed_out = _run_code(str(response), stdin)
4656
actual = stdout.rstrip()
57+
label = f"Hidden test {i}" if hidden else f"Test {i}"
4758

4859
if timed_out:
4960
tag = "hidden_fail" if hidden else "fail"
50-
label = f"Hidden test {i}" if hidden else f"Test {i}"
5161
result.add_feedback(tag, f"{label}: timed out after {_TIMEOUT}s.")
5262
elif stderr and not stdout:
5363
tag = "hidden_fail" if hidden else "fail"
54-
label = f"Hidden test {i}" if hidden else f"Test {i}"
55-
msg = f"{label}: runtime error." if hidden else f"{label}: runtime error.\n{stderr.strip()}"
56-
result.add_feedback(tag, msg)
64+
if hidden:
65+
result.add_feedback(tag, f"{label}: runtime error.")
66+
else:
67+
parts = [f"{label}: runtime error."]
68+
if stdin.strip():
69+
parts.append(_code_block("Input", stdin.rstrip()))
70+
parts.append(_code_block("Error", stderr.strip()))
71+
result.add_feedback(tag, "\n\n".join(parts))
5772
elif actual == expected:
5873
passed += 1
59-
label = f"Hidden test {i}" if hidden else f"Test {i}"
60-
result.add_feedback("pass", f"{label}: passed.")
74+
if hidden:
75+
result.add_feedback("pass", f"{label}: passed.")
76+
else:
77+
parts = [f"{label}: passed."]
78+
if stdin.strip():
79+
parts.append(_code_block("Input", stdin.rstrip()))
80+
parts.append(_code_block("Output", actual or "(no output)"))
81+
result.add_feedback("pass", "\n\n".join(parts))
6182
else:
6283
tag = "hidden_fail" if hidden else "fail"
6384
if hidden:
64-
result.add_feedback(tag, f"Hidden test {i}: failed.")
85+
result.add_feedback(tag, f"{label}: failed.")
6586
else:
66-
result.add_feedback(tag, (
67-
f"Test {i}: failed.\n"
68-
f" Input: {stdin.rstrip()}\n"
69-
f" Expected: {expected}\n"
70-
f" Got: {actual}"
71-
))
87+
parts = [f"{label}: failed."]
88+
if stdin.strip():
89+
parts.append(_code_block("Input", stdin.rstrip()))
90+
parts.append(_code_block("Your output", actual or "(no output)"))
91+
parts.append(_code_block("Expected", expected))
92+
result.add_feedback(tag, "\n\n".join(parts))
7293

7394
result.is_correct = passed == len(tests)
7495
result.add_feedback("summary", f"{passed}/{len(tests)} tests passed.")

evaluation_function/evaluation_test.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,15 @@ def test_all_pass(self):
2323

2424
self.assertTrue(result["is_correct"])
2525
self.assertIn("2/2 tests passed", result["feedback"])
26+
self.assertIn("```", result["feedback"])
2627

2728
def test_partial_fail(self):
2829
params = _params(_test("5\n", "25\n"), _test("3\n", "99\n"))
2930
result = evaluation_function(_SQUARE_CODE, None, params).to_dict()
3031

3132
self.assertFalse(result["is_correct"])
3233
self.assertIn("1/2 tests passed", result["feedback"])
34+
self.assertIn("```", result["feedback"])
3335

3436
def test_hidden_test_fail(self):
3537
params = _params(_test("5\n", "999\n", hidden=True))
@@ -46,9 +48,10 @@ def test_runtime_error(self):
4648

4749
self.assertFalse(result["is_correct"])
4850
self.assertIn("runtime error", result["feedback"])
51+
self.assertIn("```", result["feedback"])
4952

5053
def test_no_tests(self):
5154
result = evaluation_function(_SQUARE_CODE, None, {}).to_dict()
5255

5356
self.assertFalse(result["is_correct"])
54-
self.assertIn("No test cases", result["feedback"])
57+
self.assertIn("```", result["feedback"])

0 commit comments

Comments
 (0)