Dataset Card for JuDDGES/pl-court-raw

import warnings

import datasets
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import polars as pl
import seaborn as sns
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer

warnings.filterwarnings('ignore')
sns.set_theme("notebook")
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()
datasets.utils.disable_progress_bars()
# Lazily scan all raw parquet shards of the dataset
raw_ds = pl.scan_parquet(source="../../data/datasets/pl/raw/*")

Statistics

Missing values

We identified 5,725 judgments (approximately 1% of the dataset) with a missing content field. The root cause is unknown; we assume random errors, since only error codes (e.g., 404) were returned by the API during dataset curation. Because this is a very small fraction of the dataset, the missing values are not expected to affect overall data quality, so we removed these judgments. The table below shows the number and proportion of missing values across the remaining fields (computed after removing judgments with missing content).
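For reference, the filtering step can be expressed as below. This is a minimal sketch reusing the raw_ds lazy frame from the setup above; the actual curation pipeline may differ.

# Drop judgments whose `content` field is missing (illustrative sketch)
raw_ds = raw_ds.filter(pl.col("content").is_not_null())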

# Count nulls per field and express them as a fraction of all rows
null_count = raw_ds.null_count().collect().to_pandas().T.rename(columns={0: "Null count"})
null_count.index.name = "Field name"
null_count["Null fraction"] = (null_count["Null count"] / raw_ds.select(pl.len()).collect().item()).round(2)
# print(null_count.to_markdown())
| Field name       | Null count | Null fraction |
|------------------|-----------:|--------------:|
| _id              |          0 |             0 |
| signature        |          0 |             0 |
| date             |          0 |             0 |
| publicationDate  |          0 |             0 |
| lastUpdate       |          0 |             0 |
| courtId          |          0 |             0 |
| departmentId     |          0 |             0 |
| type             |          0 |             0 |
| excerpt          |          0 |             0 |
| content          |          0 |             0 |
| chairman         |      47283 |          0.12 |
| decision         |     408423 |             1 |
| judges           |      39772 |           0.1 |
| legalBases       |     113534 |          0.28 |
| publisher        |        609 |             0 |
| recorder         |     103675 |          0.25 |
| references       |      40737 |           0.1 |
| reviser          |        171 |             0 |
| themePhrases     |     117074 |          0.29 |
| num_pages        |          0 |             0 |
| text             |          0 |             0 |
| vol_number       |          0 |             0 |
| vol_type         |          0 |             0 |
| court_name       |        605 |             0 |
| department_name  |        605 |             0 |
| text_legal_bases |          0 |             0 |
| thesis           |     369092 |           0.9 |
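Note that decision and thesis are almost entirely empty, so downstream users may prefer to drop sparsely populated columns. A minimal sketch, reusing null_count from above; the 0.5 threshold is an illustrative assumption, not part of the dataset curation:

# Keep only fields whose null fraction is below an (illustrative) threshold
threshold = 0.5
dense_fields = null_count.index[null_count["Null fraction"] < threshold].tolist()
dense_ds = raw_ds.select(dense_fields)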

Analysis of selected fields

# Number of judgments per court, sorted descending
court_distribution = raw_ds.drop_nulls(subset="court_name").select("court_name").group_by("court_name").len().sort("len", descending=True).collect().to_pandas()
ax = sns.histplot(data=court_distribution, x="len", log_scale=True, kde=True)
ax.set(title="Distribution of judgments per court", xlabel="#Judgements in a single court", ylabel="Count")
plt.show()
# Keep the date part (before the first space), parse it, and count judgments per year
judgements_per_year = (
    raw_ds.select("date")
    .collect()["date"]
    .str.split(" ")
    .list.get(0)
    .str.to_date()
    .dt.year()
    .value_counts()
    .sort("date")
    .to_pandas()
)
# Exclude 2024 (partial year)
judgements_per_year = judgements_per_year[judgements_per_year["date"] < 2024]

_, ax = plt.subplots(1, 1, figsize=(10, 5))
ax = sns.pointplot(data=judgements_per_year, x="date", y="count", linestyles="--", ax=ax)
ax.set(xlabel="Year", ylabel="Number of Judgements", title="Yearly Number of Judgements", yscale="log")
plt.xticks(rotation=90)
plt.show()
# Count judgments per type, with nulls shown explicitly as "<null>"
types = raw_ds.fill_null(value="<null>").select("type").group_by("type").len().sort("len", descending=True).collect().to_pandas()

_, ax = plt.subplots(1, 1, figsize=(8, 8))
ax = sns.barplot(data=types, x="len", y="type", errorbar=None, ax=ax)
ax.set(xlabel="Count", ylabel="Type", title="Judgement types cardinality", xscale="log")
plt.show()
# Number of judges assigned to each judgment
num_judges = raw_ds.with_columns([pl.col("judges").list.len().alias("num_judges")]).select("num_judges").sort("num_judges").collect().to_pandas()
ax = sns.histplot(data=num_judges, x="num_judges", bins=num_judges["num_judges"].nunique())
ax.set(xlabel="#Judges per judgement", ylabel="Count", yscale="log", title="#Judges per single judgement")
plt.show()
# Number of legal bases cited in each judgment
num_lb = raw_ds.with_columns([pl.col("legalBases").list.len().alias("num_lb")]).select("num_lb").sort("num_lb").collect().to_pandas()
ax = sns.histplot(data=num_lb, x="num_lb", bins=num_lb["num_lb"].nunique())
ax.set(xlabel="#Legal bases", ylabel="Count", yscale="log", title="#Legal bases per judgement")
plt.show()
# Load only the columns needed for token counting and drop judgments without text
raw_text_ds = load_dataset("parquet", data_dir="../../data/datasets/pl/raw/", columns=["_id", "text"])
raw_text_ds = raw_text_ds.filter(lambda x: x["text"] is not None)
# Llama-3 tokenizer (gated model; requires accepting the license on the Hugging Face Hub)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

def tokenize(batch: dict[str, list]) -> dict[str, list[int]]:
    # Tokenize without special tokens; return_length=True yields the token count per document
    tokenized = tokenizer(batch["text"], add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False, return_length=True)
    return {"length": tokenized["length"]}

# Compute token counts in parallel, dropping the raw text afterwards
raw_text_ds = raw_text_ds.map(tokenize, batched=True, batch_size=16, remove_columns=["text"], num_proc=20)
judgement_len = raw_text_ds["train"].to_pandas()

ax = sns.histplot(data=judgement_len, x="length", bins=50)
ax.set(xlabel="#Tokens", ylabel="Count", title="#Tokens distribution in judgements (llama-3 tokenizer)", yscale="log")
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))
plt.show()
# Join token counts with judgment types to compare length distributions per type
per_type_tokens = raw_ds.fill_null(value="<null>").select(["_id", "type"]).collect().to_pandas().set_index("_id").join(judgement_len.set_index("_id"))

_, ax = plt.subplots(1, 1, figsize=(10, 10))
ax = sns.boxenplot(data=per_type_tokens, y="type", x="length")
ax.set(xscale="log", title="Judgement token count per type", xlabel="#Tokens", ylabel="Type")
plt.show()