이 문서는 대화형 노트북입니다. 로컬에서 실행하거나 아래 링크를 사용할 수 있습니다:
Leaderboard 퀵스타트
- 가상의 우편번호 데이터셋을 생성합니다.
- 여러 스코어링 함수를 정의하고 베이스라인 모델을 평가합니다.
- 이 기법들을 사용해 여러 모델과 평가 조합으로 이루어진 매트릭스를 평가합니다.
- Weave UI에서 Leaderboard를 확인합니다.
1단계: 가짜 우편번호 데이터셋 생성하기
generate_dataset_rows 함수를 만듭니다.
잘못된 코드 신고
복사
AI에게 묻기
import json
from openai import OpenAI
from pydantic import BaseModel
class Row(BaseModel):
    # One zip-code record. This model's JSON schema is sent to OpenAI as the
    # structured-output format by the gpt_4o_mini_* model functions, and the
    # same field names are consumed by the scorer functions in this file.
    # NOTE: documented with comments on purpose; a class docstring would be
    # emitted as a "description" entry in the generated JSON schema.
    zip_code: str
    city: str
    state: str
    avg_temp_f: float  # presumably degrees Fahrenheit (the prompts ask for Fahrenheit)
    population: int
    median_income: int  # currency/unit is not stated anywhere in this file
    known_for: str  # free text; graded by an LLM in check_subjective_fields
class Rows(BaseModel):
    # Wrapper object around a list of Row records. generate_dataset_rows sends
    # this model's JSON schema as the response format and then reads the "rows"
    # key of the parsed reply.
    rows: list[Row]
def generate_dataset_rows(
    location: str = "United States", count: int = 5, year: int = 2022
):
    """Ask gpt-4o-mini for synthetic zip-code rows and return them.

    Args:
        location: Region named in the prompt.
        count: Number of rows requested in the prompt.
        year: Year named in the prompt.

    Returns:
        The value stored under the ``"rows"`` key of the JSON reply, whose
        shape is requested via the ``Rows`` JSON schema.
    """
    openai_client = OpenAI()

    prompt = f"Please generate {count} rows of data for random zip codes in {location} for the year {year}."
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Request structured output that follows the Rows schema.
    structured_output = {
        "type": "json_schema",
        "json_schema": {
            "name": "response_format",
            "schema": Rows.model_json_schema(),
        },
    }

    reply = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        response_format=structured_output,
    )
    payload = json.loads(reply.choices[0].message.content)
    return payload["rows"]
python
import weave
# Initialise Weave with the project name "leaderboard-demo"; the ops,
# evaluations and leaderboard defined below are created after this call.
weave.init("leaderboard-demo")
2단계: 스코어링 함수 작성
- check_concrete_fields: 모델 출력의 도시와 주가 예상 값과 일치하는지 확인합니다.
- check_value_fields: 모델 출력의 평균 기온, 인구, 중위 소득이 예상 값과 얼마나 차이 나는지 상대 오차로 계산합니다.
- check_subjective_fields: LLM(대규모 언어 모델)을 사용하여 모델 출력이 예상 “known for” 필드와 일치하는지 확인합니다.
잘못된 코드 신고
복사
AI에게 묻기
@weave.op
def check_concrete_fields(city: str, state: str, output: dict):
    """Score whether the predicted city and state exactly equal the expected ones.

    Args:
        city: Expected city from the dataset row.
        state: Expected state from the dataset row.
        output: Model prediction; must contain ``"city"`` and ``"state"`` keys.

    Returns:
        A dict with boolean ``"city_match"`` and ``"state_match"`` entries.
    """
    expected_values = {"city": city, "state": state}
    return {
        f"{field}_match": expected == output[field]
        for field, expected in expected_values.items()
    }
@weave.op
def check_value_fields(
    avg_temp_f: float, population: int, median_income: int, output: dict
):
    """Compute the relative error of each numeric field in the model output.

    Args:
        avg_temp_f: Expected average temperature from the dataset row.
        population: Expected population from the dataset row.
        median_income: Expected median income from the dataset row.
        output: Model prediction; must contain ``"avg_temp_f"``,
            ``"population"`` and ``"median_income"`` keys.

    Returns:
        A dict with ``"avg_temp_f_err"``, ``"population_err"`` and
        ``"median_income_err"``. Each value is ``|expected - actual| /
        |expected|``; when the expected value is 0 the absolute deviation is
        returned instead, because a relative error is undefined there.
    """

    def relative_error(expected, actual):
        deviation = abs(expected - actual)
        if expected == 0:
            # Avoid ZeroDivisionError for a zero reference value.
            return float(deviation)
        # Divide by the magnitude so a negative reference value (for example
        # a sub-zero temperature) cannot yield a negative error.
        return deviation / abs(expected)

    return {
        "avg_temp_f_err": relative_error(avg_temp_f, output["avg_temp_f"]),
        "population_err": relative_error(population, output["population"]),
        "median_income_err": relative_error(
            median_income, output["median_income"]
        ),
    }
@weave.op
def check_subjective_fields(zip_code: str, known_for: str, output: dict):
    """Ask gpt-4o-mini whether the predicted ``known_for`` answer is correct.

    Args:
        zip_code: Zip code the question is about.
        known_for: Expected answer from the dataset row.
        output: Model prediction; must contain a ``"known_for"`` key.

    Returns:
        The parsed JSON reply; the requested schema has a single boolean
        field, ``correct_known_for``.
    """
    grader = OpenAI()

    class Response(BaseModel):
        # Verdict returned by the grading model.
        correct_known_for: bool

    question = f"My student was asked what the zip code {zip_code} is best known best for. The right answer is '{known_for}', and they said '{output['known_for']}'. Is their answer correct?"
    verdict_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "response_format",
            "schema": Response.model_json_schema(),
        },
    }

    reply = grader.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": question},
        ],
        response_format=verdict_format,
    )
    raw_verdict = reply.choices[0].message.content
    return json.loads(raw_verdict)
3단계: 간단한 Evaluation 만들기
잘못된 코드 신고
복사
AI에게 묻기
# Generate a dataset with the default arguments (location "United States",
# count 5, year 2022) and build an evaluation that applies all three scorers
# to it.
rows = generate_dataset_rows()
evaluation = weave.Evaluation(
    name="United States - 2022",
    dataset=rows,
    scorers=[
        check_concrete_fields,
        check_value_fields,
        check_subjective_fields,
    ],
)
4단계: 베이스라인 모델 평가하기
잘못된 코드 신고
복사
AI에게 묻기
@weave.op
def baseline_model(zip_code: str):
    """Return one fixed prediction regardless of the zip code.

    Args:
        zip_code: Zip code being asked about; not used by this baseline.

    Returns:
        A dict with the constant city, state, temperature, population,
        median income and ``known_for`` values.
    """
    constant_prediction = dict(
        city="New York",
        state="NY",
        avg_temp_f=50.0,
        population=1000000,
        median_income=100000,
        known_for="The Big Apple",
    )
    return constant_prediction
# Run the evaluation against the constant baseline. A top-level `await` only
# works where a running event loop is provided, such as a notebook cell.
await evaluation.evaluate(baseline_model)
5단계: 더 많은 모델 만들기
잘못된 코드 신고
복사
AI에게 묻기
@weave.op
def gpt_4o_mini_no_context(zip_code: str):
    """Query gpt-4o-mini with only the bare zip code as the prompt.

    Args:
        zip_code: Zip code to ask about.

    Returns:
        The parsed JSON reply, requested in the shape of the ``Row`` schema.
    """
    openai_client = OpenAI()
    row_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "response_format",
            "schema": Row.model_json_schema(),
        },
    }
    # The prompt gives no instructions beyond the zip code itself.
    bare_prompt = f"Zip code {zip_code}"

    reply = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": bare_prompt}],
        response_format=row_format,
    )
    raw_row = reply.choices[0].message.content
    return json.loads(raw_row)
# Score the no-context model with the same evaluation. A top-level `await`
# only works where a running event loop is provided, such as a notebook cell.
await evaluation.evaluate(gpt_4o_mini_no_context)
python
@weave.op
def gpt_4o_mini_with_context(zip_code: str):
    """Query gpt-4o-mini with explicit questions about the zip code.

    Args:
        zip_code: Zip code to ask about.

    Returns:
        The parsed JSON reply, requested in the shape of the ``Row`` schema.
    """
    openai_client = OpenAI()

    # One question per field of the Row schema.
    questions = (
        "1. What is the city?\n"
        "2. What is the state?\n"
        "3. What is the average temperature in Fahrenheit?\n"
        "4. What is the population?\n"
        "5. What is the median income?\n"
        "6. What is the most well known thing about this zip code?\n"
    )
    prompt = f"Please answer the following questions about the zip code {zip_code}:\n{questions}"
    row_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "response_format",
            "schema": Row.model_json_schema(),
        },
    }

    reply = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format=row_format,
    )
    raw_row = reply.choices[0].message.content
    return json.loads(raw_row)
# Score the with-context model with the same evaluation. A top-level `await`
# only works where a running event loop is provided, such as a notebook cell.
await evaluation.evaluate(gpt_4o_mini_with_context)
6단계: 더 많은 Evaluation 생성
잘못된 코드 신고
복사
AI에게 묻기
scorers = [
check_concrete_fields,
check_value_fields,
check_subjective_fields,
]
evaluations = [
weave.Evaluation(
name="United States - 2022",
dataset=weave.Dataset(
name="United States - 2022",
rows=generate_dataset_rows("United States", 5, 2022),
),
scorers=scorers,
),
weave.Evaluation(
name="California - 2022",
dataset=weave.Dataset(
name="California - 2022", rows=generate_dataset_rows("California", 5, 2022)
),
scorers=scorers,
),
weave.Evaluation(
name="United States - 2000",
dataset=weave.Dataset(
name="United States - 2000",
rows=generate_dataset_rows("United States", 5, 2000),
),
scorers=scorers,
),
]
models = [
baseline_model,
gpt_4o_mini_no_context,
gpt_4o_mini_with_context,
]
for evaluation in evaluations:
for model in models:
await evaluation.evaluate(
model, __weave={"display_name": evaluation.name + ":" + model.__name__}
)
7단계: Leaderboard 검토
잘못된 코드 신고
복사
AI에게 묻기
from weave.flow import leaderboard
from weave.trace.ref_util import get_ref
# Leaderboard with one column per (evaluation, scorer, summary metric) triple.
# The description is Markdown; it is kept flush-left so no stray indentation
# ends up inside the string.
spec = leaderboard.Leaderboard(
    name="Zip Code World Knowledge",
    # Column 2 is described as a *relative* error because the
    # `avg_temp_f_err` metric is |expected - actual| divided by the expected
    # value, not an absolute difference.
    description="""
This leaderboard compares the performance of models in terms of world knowledge about zip codes.
### Columns
1. **State Match against `United States - 2022`**: The fraction of zip codes that the model correctly identified the state for.
2. **Avg Temp F Error against `California - 2022`**: The mean relative error of the model's average temperature prediction.
3. **Correct Known For against `United States - 2000`**: The fraction of zip codes that the model correctly identified the most well known thing about the zip code.
""",
    columns=[
        # Fraction of rows whose predicted state matched, on evaluations[0].
        leaderboard.LeaderboardColumn(
            evaluation_object_ref=get_ref(evaluations[0]).uri(),
            scorer_name="check_concrete_fields",
            summary_metric_path="state_match.true_fraction",
        ),
        # Mean temperature error on evaluations[1]; lower is better.
        leaderboard.LeaderboardColumn(
            evaluation_object_ref=get_ref(evaluations[1]).uri(),
            scorer_name="check_value_fields",
            should_minimize=True,
            summary_metric_path="avg_temp_f_err.mean",
        ),
        # Fraction of rows the LLM grader judged correct, on evaluations[2].
        leaderboard.LeaderboardColumn(
            evaluation_object_ref=get_ref(evaluations[2]).uri(),
            scorer_name="check_subjective_fields",
            summary_metric_path="correct_known_for.true_fraction",
        ),
    ],
)
# Publish the leaderboard definition and keep the returned reference.
ref = weave.publish(spec)