```python
import warnings
import json
from multiprocessing import Pool
from statistics import mean
from typing import Any
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
from ipywidgets import interact

from juddges.utils.misc import parse_yaml
from juddges.metrics.info_extraction import evaluate_extraction

# Show floats with three decimal places and silence a known tensor-copy warning.
pd.options.display.float_format = '{:,.3f}'.format
warnings.filterwarnings('ignore', message="To copy construct from a tensor, it is recommended to use")
```
# SFT results inspection

## Compare metrics
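Each prediction run writes a `metrics_<model>.json` file containing a full-text chrF score and per-field chrF scores. The cell below collects all of these files into a single dataframe, one row per model.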
```python
results = []
for f in Path("../../data/experiments/predict/pl-court-instruct").glob("metrics_*.json"):
    model_name = f.stem.replace("metrics_", "")
    with f.open() as file:
        m_res = json.load(file)

    results.append(
        {"llm": model_name}
        | {"full_text_chrf": m_res["full_text_chrf"]}
        | m_res["field_chrf"]
    )

pd.DataFrame(results).sort_values("llm")
```
|   | llm | full_text_chrf | court_name | date | department_name | judges | legal_bases | recorder | signature |
|---|---|---|---|---|---|---|---|---|---|
| 2 | Meta-Llama-3-8B-Instruct | 0.247 | 0.862 | 0.971 | 0.833 | 0.882 | 0.287 | 0.805 | 0.778 |
| 0 | Mistral-7B-Instruct-v0.2 | 0.432 | 0.839 | 0.922 | 0.850 | 0.879 | 0.333 | 0.837 | 0.145 |
| 3 | Mistral-7B-Instruct-v0.2-fine-tuned | 0.772 | 0.987 | 0.990 | 0.965 | 0.952 | 0.600 | 0.979 | 0.972 |
| 4 | Unsloth-Llama-3-8B-Instruct-fine-tuned | 0.828 | 0.995 | 0.989 | 0.986 | 0.977 | 0.601 | 0.993 | 0.994 |
| 1 | Unsloth-Mistral-7B-Instruct-v0.3 | 0.477 | 0.830 | 0.987 | 0.900 | 0.870 | 0.419 | 0.943 | 0.567 |
| 5 | Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned | 0.798 | 0.995 | 0.988 | 0.986 | 0.967 | 0.608 | 0.987 | 0.976 |
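Fine-tuning clearly pays off: every fine-tuned variant beats its base model on all fields, while `legal_bases` remains the hardest field even after fine-tuning. To rank models by a single number, one could average the per-field columns; a minimal sketch (the `field_cols` selection and `mean_field_chrf` column are assumptions for illustration, not part of the pipeline):

```python
# Hypothetical ranking cell: average the per-field chrF columns for each model.
df = pd.DataFrame(results)
field_cols = [c for c in df.columns if c not in ("llm", "full_text_chrf")]
df.assign(mean_field_chrf=df[field_cols].mean(axis=1)).sort_values(
    "mean_field_chrf", ascending=False
)
```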
## Inspect results
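Aggregate metrics hide failure modes, so the next cells score each prediction of the best model individually and expose the worst examples in an interactive browser.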
= "../../data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json"
OUTPUTS_PATH
with open(OUTPUTS_PATH) as file:
= json.load(file) data
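Each record in the outputs file carries the gold annotation and the model's answer as YAML strings; `eval_item` below parses both into dictionaries and attaches per-item metrics.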
```python
def eval_item(item: dict[str, Any]) -> dict[str, Any]:
    item["metrics"] = evaluate_extraction([item])
    item["metrics"]["mean_field"] = mean(item["metrics"]["field_chrf"].values())
    item["gold"] = parse_yaml(item["gold"])
    try:
        item["answer"] = parse_yaml(item["answer"])
    except Exception:
        # The model occasionally emits output that is not valid YAML.
        item["answer"] = None
    return item
```
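One caveat: on a parse failure the raw model output is discarded, so it cannot be inspected afterwards. If that matters, a small wrapper can keep the original string (`eval_item_keep_raw` and the `raw_answer` key are hypothetical, shown only as a sketch):

```python
# Hypothetical wrapper that preserves unparseable raw answers for debugging.
def eval_item_keep_raw(item: dict[str, Any]) -> dict[str, Any]:
    raw_answer = item["answer"]  # capture before eval_item overwrites it
    item = eval_item(item)
    if item["answer"] is None:
        item["raw_answer"] = raw_answer  # keep the offending string around
    return item
```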
```python
num_invalid_answers = 0
results = []
with Pool(10) as pool:
    for item in tqdm(pool.imap(eval_item, data), total=len(data)):
        results.append(item)
        if item["answer"] is None:
            num_invalid_answers += 1

print(f"Number of invalid answers: {num_invalid_answers} / {len(data)}")
```
```
Number of invalid answers: 224 / 2000
```
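So roughly 11% of the generations are not valid YAML; these are excluded from the per-item analysis below.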
```python
# Drop unparseable answers and sort ascending by mean per-field chrF (worst first).
data_valid = [item for item in results if item["answer"] is not None]
data_valid = sorted(data_valid, key=lambda x: x["metrics"]["mean_field"])
```
```python
def item_to_df(idx: int) -> pd.DataFrame:
    item = data_valid[idx]
    return pd.DataFrame({
        "gold": item["gold"],
        "answer": item["answer"],
        "metrics": item["metrics"]["field_chrf"],
    })

interact(item_to_df, idx=range(len(data_valid)));
```
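`interact` builds a widget over the index range, so you can step through `data_valid` example by example; since the list is sorted by `mean_field`, low indices show the weakest extractions. The trailing semicolon merely suppresses the cell's return value.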