inference.py
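
# inference.py: a small agent loop for the api_debug_env benchmark. An LLM (queried through
# an OpenAI-compatible endpoint) is shown a broken HTTP request and asked to return a fixed
# request as JSON until the mock API answers with HTTP 200 or the step budget runs out.
#
# Typical invocation (assuming the environment server is reachable at ENV_URL):
#   HF_TOKEN=<your token> python inference.py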
import os
import json
import asyncio
from typing import List, Optional
from openai import OpenAI
from client import APIDebugEnv
from models import APIAction
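
# Configuration is read from environment variables, with local defaults; HF_TOKEN has no
# default and must be set for authenticated requests.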
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.3")
HF_TOKEN = os.getenv("HF_TOKEN")
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")
BENCHMARK = "api_debug_env"
MAX_STEPS = 5
SUCCESS_SCORE_THRESHOLD = 0.8
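
# OpenAI-compatible client; API_BASE_URL defaults to the Hugging Face inference router.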
llm = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
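
# The system prompt pins the model to a strict JSON schema so the reply can be parsed
# and mapped onto an APIAction.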
SYSTEM_PROMPT = """You are an HTTP API debugger. You receive a broken request and must fix it to get HTTP 200.
Respond ONLY in valid JSON with exactly these fields:
{
  "method": "GET",
  "url": "/mock_api/...",
  "headers": {},
  "body": null,
  "query_params": {}
}
Rules:
- method must be uppercase
- url must start with /mock_api/
- body must be null for GET requests
- always include Content-Type: application/json when body is not null
- do not add any text outside the JSON"""
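

# Structured log lines ([START] / [STEP] / [END]) so an external harness can follow
# progress and collect scores from stdout.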
def log_start(task: str, env: str, model: str) -> None:
    print(f"[START] task={task} env={env} model={model}", flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    error_val = error if error else "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
def call_llm(task_description, broken_request, last_status, last_body, feedback):
    user_msg = (
        f"Task: {task_description}\n\n"
        f"Broken request:\n{json.dumps(broken_request, indent=2)}\n\n"
        f"Last response status: {last_status}\n"
        f"Last response body: {last_body[:300]}\n\n"
        f"Feedback: {feedback}\n\n"
        f"Return the fixed request as JSON:"
    )
    resp = llm.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ],
        max_tokens=300,
        temperature=0.1,
    )
    raw = resp.choices[0].message.content.strip()
    # Strip any markdown code fences the model wraps around the JSON.
    raw = raw.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # If the reply is not valid JSON, fall back to replaying the broken request.
        return broken_request
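

# Run one task episode: reset the environment, then loop for up to state.max_steps,
# asking the model to fix the request on each step until the environment reports done.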
async def run_task(env, task_id):
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
    try:
        result = await env.reset(task_id=task_id)
        obs = result.observation
        state = await env.state()
        for step in range(1, state.max_steps + 1):
            if result.done:
                break
            # Ask the model to repair the request given the latest observation and feedback.
            fixed = call_llm(
                obs.task_description,
                obs.broken_request,
                obs.last_status_code,
                obs.last_response_body,
                obs.step_feedback,
            )
            # Map the model output onto an APIAction, substituting safe defaults for missing fields.
            action = APIAction(
                method=str(fixed.get("method", "GET")),
                url=str(fixed.get("url", obs.broken_request.get("url", "/mock_api/users"))),
                headers=dict(fixed.get("headers") or {}),
                body=fixed.get("body") or {},
                query_params=dict(fixed.get("query_params") or {}),
            )
            action_str = f"{action.method}:{action.url}"
            result = await env.step(action)
            reward = result.reward or 0.0
            done = result.done
            obs = result.observation
            rewards.append(reward)
            steps_taken = step
            log_step(step=step, action=action_str, reward=reward, done=done, error=None)
            if done:
                break
        # Score the episode by its best single-step reward.
        score = max(rewards) if rewards else 0.0
        success = score >= SUCCESS_SCORE_THRESHOLD
    except Exception as e:
        print(f"[DEBUG] Task {task_id} error: {e}", flush=True)
    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
    return score
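

# Run the three benchmark tasks sequentially against the environment server at ENV_URL.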
async def main():
    async with APIDebugEnv(base_url=ENV_URL) as env:
        for task_id in ["easy", "medium", "hard"]:
            await run_task(env, task_id)


if __name__ == "__main__":
    asyncio.run(main())