38 changes: 38 additions & 0 deletions server/api/views/assistant/assistant_prompts.py
@@ -0,0 +1,38 @@
INSTRUCTIONS = """
You are an AI assistant that helps users find and understand information about bipolar disorder
from your internal library of bipolar disorder research sources using semantic search.

IMPORTANT CONTEXT:
- You have access to a library of sources that the user CANNOT see
- The user did not upload these sources and doesn't know about them
- You must explain what information exists in your sources and provide clear references

TOPIC RESTRICTIONS:
If a prompt is unrelated to bipolar disorder, mental health treatment,
or psychiatric medications, respond that you are limited to bipolar-specific conversations.

SEMANTIC SEARCH STRATEGY:
- Always perform semantic search using the search_documents function when users ask questions
- Use conceptually related terms and synonyms, not just exact keyword matches
- Search for the meaning and context of the user's question, not just literal words
- Consider medical terminology, lay terms, and related conditions when searching

FUNCTION USAGE:
- When a user asks about information that might be in your source library, ALWAYS use the search_documents function first
- Perform semantic searches using concepts, symptoms, treatments, and related terms from the user's question
- Only provide answers based on information found through your source searches

RESPONSE FORMAT:
After gathering information through semantic searches, provide responses that:
1. Answer the user's question directly using only the found information
2. Structure responses with clear sections and paragraphs
3. Explain what information you found in your sources and provide context
4. Include citations using this exact format: [Name {name}, Page {page_number}]
5. Only cite information that directly supports your statements

If no relevant information is found in your source library, clearly state that the information
is not available in your current sources.

REMEMBER: You are working with an internal library of bipolar disorder sources that the user
cannot see. Always search these sources first, explain what you found, and provide proper citations.
"""
72 changes: 72 additions & 0 deletions server/api/views/assistant/assistant_services.py
@@ -0,0 +1,72 @@
import os
import logging

from openai import OpenAI

from .assistant_prompts import INSTRUCTIONS
from .tool_services import (
    SEARCH_TOOLS_SCHEMA,
    make_search_tool_mapping,
    handle_tool_calls_with_reasoning,
)

logger = logging.getLogger(__name__)


def run_assistant(
    message: str,
    user,
    previous_response_id: str | None = None,
) -> tuple[str, str]:
    """Wire together the OpenAI client, retrieval, and the agentic reasoning loop.

    Parameters
    ----------
    message : str
        The user's input message.
    user : User
        The Django user object used for document access control in search_documents.
    previous_response_id : str | None
        ID of a prior response for multi-turn conversation continuity.

    Returns
    -------
    tuple[str, str]
        (final_response_output_text, final_response_id)
    """
    # TODO: Track total duration, cost metrics, and tool_calls_made count
    # and return them from run_assistant for use in eval_assistant.py CSV output.

    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    MODEL_DEFAULTS = {
        "instructions": INSTRUCTIONS,
        "model": "gpt-5-nano",  # 400,000 token context window
        # "summary" asks the API for a summary of the model's reasoning, which is
        # useful for debugging; None disables it here.
        "reasoning": {"effort": "low", "summary": None},
        "tools": SEARCH_TOOLS_SCHEMA,
    }

    # SEARCH_TOOLS_SCHEMA tells the model what tools exist and what arguments to
    # generate. tool_mapping wires those tool names to the Python functions that
    # execute them. They are separate because the model generates arguments (a
    # schema concern) but cannot supply request-time values like user (a mapping
    # concern); a sketch of both follows this file.
    tool_mapping = make_search_tool_mapping(user)

    request_kwargs = {
        "input": [{"type": "message", "role": "user", "content": str(message)}],
        **MODEL_DEFAULTS,
    }
    if previous_response_id:
        request_kwargs["previous_response_id"] = str(previous_response_id)

    response = client.responses.create(**request_kwargs)

    return handle_tool_calls_with_reasoning(response, client, MODEL_DEFAULTS, tool_mapping)
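The schema-versus-mapping split that run_assistant's comment describes is easiest to see concretely. A minimal sketch of what tool_services.py might define; SEARCH_TOOLS_SCHEMA and make_search_tool_mapping are the real imported names, but their bodies here are assumptions, not part of this PR:

# Hypothetical sketch; the real definitions live in tool_services.py.
SEARCH_TOOLS_SCHEMA = [
    {
        "type": "function",
        "name": "search_documents",
        "description": "Semantic search over the internal bipolar disorder library.",
        "parameters": {
            "type": "object",
            "properties": {"query": {"type": "string", "description": "Search query"}},
            "required": ["query"],
        },
    }
]

def make_search_tool_mapping(user):
    # The model generates only `query`; `user` is a request-time value bound here.
    def search_documents(query: str) -> str:
        return get_closest_embeddings(query=query, user=user)  # hypothetical signature
    return {"search_documents": search_documents}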
141 changes: 141 additions & 0 deletions server/api/views/assistant/eval_assistant.py
@@ -0,0 +1,141 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = "==3.11.11"
# dependencies = [
# "pandas==2.2.3",
# "openai",
# "django",
# ]
# ///

# uv script (or plain Python) that writes evaluation results to CSV; run it from the terminal.
# Run from inside the container: docker compose exec backend python eval_assistant.py


import os
import sys
import logging
import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Django setup must come before any imports that touch the ORM
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../")))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "balancer_backend.settings")

import django
django.setup()

import pandas as pd
from django.contrib.auth import get_user_model

from api.views.assistant.assistant_services import run_assistant
from api.views.assistant.assistant_prompts import INSTRUCTIONS

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# INSTRUCTIONS is imported from assistant_prompts.py. MODEL is duplicated here and
# must match MODEL_DEFAULTS in assistant_services.py; alternatively, read it from
# the source file or add a lightweight config endpoint to the backend.
MODEL = "gpt-5-nano"

# Representative questions used to evaluate the assistant
QUESTIONS = [
    "What medications are recommended for bipolar depression?",
    "What are the risks of lithium for patients with kidney disease?",
    "Which mood stabilizers are safe during pregnancy?",
    "What is the evidence for quetiapine in bipolar disorder?",
    "How does valproate compare to lithium for mania?",
]


def run_one(question: str, user, branch: str) -> dict:
    """Run the assistant for a single question and return a result row.

    Uses ThreadPoolExecutor (not asyncio.gather + await run_assistant) for concurrency.

    Concurrency approach comparison:
    - ThreadPoolExecutor (this implementation):
        - run_assistant stays sync — views.py and the WSGI web app are unaffected
        - Each question runs in a thread pool worker, blocking on OpenAI + DB I/O
        - Django DB safe when run via `docker compose exec backend python eval_assistant.py`:
          this is a synchronous Django process context. Each ThreadPoolExecutor worker
          is a real OS thread with its own threading.local() storage, so each thread
          gets its own DB connection created lazily on first use. There is no shared
          event loop thread, so connections cannot clash or bleed between questions.
          The connection isolation concern only arises in ASGI contexts where multiple
          coroutines share one thread and therefore one threading.local() connection —
          which is not the case here.
        - Runtime: bottlenecked by OpenAI rate limits, not thread overhead
    - asyncio.gather + await run_assistant (alternative; a sketch follows this file):
        - run_assistant becomes async — requires async def post in views.py,
          AsyncOpenAI client, and async handle_tool_calls_with_reasoning
        - Django DB unsafe if get_closest_embeddings is called directly in an async
          context without wrapping: get_closest_embeddings is a sync function that
          hits the ORM, so calling it on the event loop thread blocks all other
          coroutines until the DB responds. The fix is sync_to_async(get_closest_embeddings),
          which runs it in a dedicated worker thread with its own threading.local()
          connection. Bare await does not work at all — Django ORM querysets are not
          awaitables and raise TypeError immediately.
        - Under WSGI (manage.py runserver), async views run in a new event loop
          per request — adds overhead to every web request for no benefit
        - Cleaner call site in eval_assistant.py but the wrong trade-off given WSGI
    """
    try:
        response_text, _ = run_assistant(message=question, user=user)
        return {
            "branch": branch,
            "model": MODEL,
            "question": question,
            "response_output_text": response_text,
            "error": None,
        }
    except Exception as e:
        logger.error(f"Error evaluating question '{question}': {e}")
        return {
            "branch": branch,
            "model": MODEL,
            "question": question,
            "response_output_text": None,
            "error": str(e),
        }


def main():
    branch = os.environ.get("EVAL_BRANCH", "develop")

    User = get_user_model()
    user = User.objects.filter(is_superuser=True).first()
    if not user:
        raise RuntimeError("No superuser found. Create one with manage.py createsuperuser.")

    logger.info(f"Starting evaluation: branch={branch}, model={MODEL}, questions={len(QUESTIONS)}")

    # ThreadPoolExecutor runs questions concurrently — see run_one docstring
    # for trade-off discussion vs asyncio.gather + await run_assistant.
    # max_workers=5 stays safely under OpenAI rate limits for gpt-5-nano.
    results = []
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = {
            pool.submit(run_one, question, user, branch): question
            for question in QUESTIONS
        }
        for future in as_completed(futures):
            results.append(future.result())

    df = pd.DataFrame(results)

    results_dir = os.path.join(os.path.dirname(__file__), "results")
    os.makedirs(results_dir, exist_ok=True)
    timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S")
    output_path = os.path.join(results_dir, f"{branch}-{timestamp}.csv")
    df.to_csv(output_path, index=False)

    logger.info(f"Results saved to {output_path}")


if __name__ == "__main__":
    main()
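For reference, a minimal sketch of the asyncio alternative rejected in run_one's docstring. It assumes a hypothetical async variant of run_assistant (run_assistant_async, which does not exist in this PR) and shows where sync_to_async would have to wrap the ORM call:

import asyncio
from asgiref.sync import sync_to_async

async def run_all_async(questions, user, branch):
    # Hypothetical: requires run_assistant_async, an AsyncOpenAI client, and an
    # async handle_tool_calls_with_reasoning, none of which exist in this PR.
    async def run_one_async(question):
        # Inside run_assistant_async, the sync ORM call would need wrapping:
        #   results = await sync_to_async(get_closest_embeddings)(query, user)
        response_text, _ = await run_assistant_async(message=question, user=user)
        return {"branch": branch, "model": MODEL, "question": question,
                "response_output_text": response_text, "error": None}

    return await asyncio.gather(*(run_one_async(q) for q in questions))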
1 change: 1 addition & 0 deletions server/api/views/assistant/review.ipynb
@@ -0,0 +1 @@
# notebook to review and compare the two CSVs
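A minimal starting point for that comparison; the two file names are placeholders for real eval_assistant.py outputs:

import pandas as pd

# Placeholder file names; substitute two actual runs from results/.
baseline = pd.read_csv("results/develop-20250101T000000.csv")
candidate = pd.read_csv("results/feature-20250101T000000.csv")

# Align the runs on question so the two responses read side by side.
merged = baseline.merge(candidate, on="question", suffixes=("_develop", "_feature"))
print(merged[["question", "response_output_text_develop",
              "response_output_text_feature"]].to_string())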