import json
from openai import OpenAI
from phoenix.client.experiments import evaluate_experiment
from phoenix.experiments.evaluators import create_evaluator
from phoenix.experiments.types import EvaluationResult
# Shared OpenAI client used by the LLM-as-judge evaluator below.
# Reads credentials from the environment (OPENAI_API_KEY).
openai_client = OpenAI()

# System prompt for the judge model: it must classify each (question, SQL,
# results) triple as `correct` or `invalid`. These two labels are mirrored
# in the tool schema's enum inside qa_correctness — keep them in sync.
judge_instructions = """
You are a judge that determines if a given question can be answered with the provided SQL query and results.
Make sure to ensure that the SQL query maps to the question accurately.
Provide the label `correct` if the SQL query and results accurately answer the question.
Provide the label `invalid` if the SQL query does not map to the question or is not valid.
"""
@create_evaluator(name="qa_correctness", kind="llm")
def qa_correctness(input, output):
    """LLM-as-judge evaluator: did the generated SQL answer the question?

    Args:
        input: example input dict; the question is read from key "question".
        output: task output dict; reads keys "query" (generated SQL) and
            "results" (rows returned by executing it).

    Returns:
        EvaluationResult with score 1 for label "correct", 0 for "invalid",
        plus the judge model's explanation.

    Raises:
        ValueError: if the model returns no tool call, or the tool-call
            arguments are not valid JSON or lack the expected fields.
    """
    question = input.get("question")
    query = output.get("query")
    results = output.get("results")
    # tool_choice="required" forces the model to answer via the tool schema,
    # giving us structured {explanation, label} output instead of free text.
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": judge_instructions},
            {
                "role": "user",
                "content": f"Question: {question}\nSQL Query: {query}\nSQL Results: {results}",
            },
        ],
        tool_choice="required",
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "qa_correctness",
                    "description": "Determine if the SQL query and results accurately answer the question.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "explanation": {
                                "type": "string",
                                "description": "Explain why the label is correct or invalid.",
                            },
                            "label": {"type": "string", "enum": ["correct", "invalid"]},
                        },
                        # Without "required" the model may legally omit either
                        # field, which previously crashed with a KeyError below.
                        "required": ["explanation", "label"],
                    },
                },
            }
        ],
    )
    tool_calls = response.choices[0].message.tool_calls
    if tool_calls is None:
        raise ValueError("No tool call found in response")
    # Arguments arrive as a JSON-encoded string; surface malformed or
    # incomplete payloads as a clear error rather than a bare KeyError.
    try:
        args = json.loads(tool_calls[0].function.arguments)
        label = args["label"]
        explanation = args["explanation"]
    except (json.JSONDecodeError, KeyError) as err:
        raise ValueError(f"Malformed tool call arguments: {err}") from err
    score = 1 if label == "correct" else 0
    return EvaluationResult(score=score, label=label, explanation=explanation)
# Run the evaluator over every example in the experiment and record results
# in Phoenix. NOTE(review): `experiment` is not defined in this file — it is
# presumably a phoenix Experiment created earlier (e.g. via run_experiment);
# confirm it is in scope before running this script standalone.
evaluate_experiment(experiment, evaluators=[qa_correctness])