import pandas as pd

from phoenix.evals import bind_evaluator, evaluate_dataframe
from phoenix.evals.llm import LLM
from phoenix.evals.metrics import HallucinationEvaluator, exact_match
# Demo: run two evaluators over one DataFrame — a heuristic exact-match
# check plus an LLM-judged hallucination check whose required input
# fields are remapped onto this frame's column names.
eval_rows = {
    # columns consumed directly by exact_match
    "output": ["Yes", "Yes", "No"],
    "expected": ["Yes", "No", "No"],
    # columns for the hallucination evaluator (mapped via bind_evaluator below)
    "context": ["This is a test", "This is another test", "This is a third test"],
    "query": ["What is the name of this test?"] * 3,
    "response": ["First test", "Another test", "Third test"],
}
df = pd.DataFrame(eval_rows)

judge = LLM(provider="openai", model="gpt-4o")
# HallucinationEvaluator expects "input"/"output" fields; point them at
# this frame's "query"/"response" columns instead.
hallucination = bind_evaluator(
    HallucinationEvaluator(llm=judge), {"input": "query", "output": "response"}
)

result = evaluate_dataframe(df, [exact_match, hallucination])
result.head()