inference.py
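
# inference.py: a small agent loop for the api_debug_env benchmark. An LLM (queried through
# an OpenAI-compatible endpoint) is shown a broken HTTP request and asked to return a fixed
# request as JSON until the mock API answers with HTTP 200 or the step budget runs out.
#
# Typical invocation (assuming the environment server is reachable at ENV_URL):
#   HF_TOKEN=<your token> python inference.py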
import os
import json
import asyncio
from typing import List, Optional
from openai import OpenAI
from client import APIDebugEnv
from models import APIAction
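
# Configuration is read from environment variables, with local defaults; HF_TOKEN has no
# default and must be set for authenticated requests.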
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.3")
HF_TOKEN = os.getenv("HF_TOKEN")
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")
BENCHMARK = "api_debug_env"
MAX_STEPS = 5
SUCCESS_SCORE_THRESHOLD = 0.8
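
# OpenAI-compatible client; API_BASE_URL defaults to the Hugging Face inference router.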
llm = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
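
# The system prompt pins the model to a strict JSON schema so the reply can be parsed
# and mapped onto an APIAction.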
SYSTEM_PROMPT = """You are an HTTP API debugger. You receive a broken request and must fix it to get HTTP 200.
Respond ONLY in valid JSON with exactly these fields:
{
  "method": "GET",
  "url": "/mock_api/...",
  "headers": {},
  "body": null,
  "query_params": {}
}
Rules:
- method must be uppercase
- url must start with /mock_api/
- body must be null for GET requests
- always include Content-Type: application/json when body is not null
- do not add any text outside the JSON"""
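

# Structured log lines ([START] / [STEP] / [END]) so an external harness can follow
# progress and collect scores from stdout.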
def log_start(task: str, env: str, model: str) -> None:
    print(f"[START] task={task} env={env} model={model}", flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    error_val = error if error else "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
def call_llm(task_description, broken_request, last_status, last_body, feedback):
    user_msg = (
        f"Task: {task_description}\n\n"
        f"Broken request:\n{json.dumps(broken_request, indent=2)}\n\n"
        f"Last response status: {last_status}\n"
        f"Last response body: {last_body[:300]}\n\n"
        f"Feedback: {feedback}\n\n"
        f"Return the fixed request as JSON:"
    )
    resp = llm.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ],
        max_tokens=300,
        temperature=0.1,
    )
    raw = resp.choices[0].message.content.strip()
    # Strip any markdown code fences the model wraps around the JSON.
    raw = raw.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # If the reply is not valid JSON, fall back to replaying the broken request.
        return broken_request
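

# Run one task episode: reset the environment, then loop for up to state.max_steps,
# asking the model to fix the request on each step until the environment reports done.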
async def run_task(env, task_id):
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
    try:
        result = await env.reset(task_id=task_id)
        obs = result.observation
        state = await env.state()
        for step in range(1, state.max_steps + 1):
            if result.done:
                break
            # Ask the model to repair the request given the latest observation and feedback.
            fixed = call_llm(
                obs.task_description,
                obs.broken_request,
                obs.last_status_code,
                obs.last_response_body,
                obs.step_feedback,
            )
            # Map the model output onto an APIAction, substituting safe defaults for missing fields.
            action = APIAction(
                method=str(fixed.get("method", "GET")),
                url=str(fixed.get("url", obs.broken_request.get("url", "/mock_api/users"))),
                headers=dict(fixed.get("headers") or {}),
                body=fixed.get("body") or {},
                query_params=dict(fixed.get("query_params") or {}),
            )
            action_str = f"{action.method}:{action.url}"
            result = await env.step(action)
            reward = result.reward or 0.0
            done = result.done
            obs = result.observation
            rewards.append(reward)
            steps_taken = step
            log_step(step=step, action=action_str, reward=reward, done=done, error=None)
            if done:
                break
        # Score the episode by its best single-step reward.
        score = max(rewards) if rewards else 0.0
        success = score >= SUCCESS_SCORE_THRESHOLD
    except Exception as e:
        print(f"[DEBUG] Task {task_id} error: {e}", flush=True)
    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
    return score
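

# Run the three benchmark tasks sequentially against the environment server at ENV_URL.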
async def main():
    async with APIDebugEnv(base_url=ENV_URL) as env:
        for task_id in ["easy", "medium", "hard"]:
            await run_task(env, task_id)


if __name__ == "__main__":
    asyncio.run(main())