import os
from phoenix.evals import (
RAG_RELEVANCY_PROMPT_TEMPLATE,
RAG_RELEVANCY_PROMPT_RAILS_MAP,
OpenAIModel,
download_benchmark_dataset,
llm_classify,
)
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
os.environ["OPENAI_API_KEY"] = "<your-openai-key>"
# Download the benchmark golden dataset
df = download_benchmark_dataset(
task="binary-relevance-classification", dataset_name="wiki_qa-train"
)
# Sample and re-name the columns to match the template
df = df.sample(100)
df = df.rename(
columns={
"query_text": "input",
"document_text": "reference",
},
)
model = OpenAIModel(
model="gpt-4",
temperature=0.0,
)
rails =list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
df[["eval_relevance"]] = llm_classify(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE, rails)
#Golden dataset has True/False map to -> "irrelevant" / "relevant"
#we can then scikit compare to output of template - same format
y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
y_pred = df["eval_relevance"]
# Compute Per-Class Precision, Recall, F1 Score, Support
precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred)