Dataset Card for JuDDGES/pl-court-raw

import warnings

import datasets
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import polars as pl
import seaborn as sns
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer

warnings.filterwarnings('ignore')
sns.set_theme("notebook")
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()
datasets.utils.disable_progress_bars()
# Lazily scan all raw parquet shards of the dataset
raw_ds = pl.scan_parquet(source="../../data/datasets/pl/raw/*")

Statistics

Missing values

We identified 5,725 judgments (approximately 1% of the dataset) with a missing content field. The root cause is unknown; we assume random errors, since only error codes (e.g., 404) were returned by the API during dataset curation. Because this is a very small fraction of the dataset, the missing values are not expected to affect overall data quality, so we removed these judgments. The table below shows the number and proportion of missing values across the remaining fields (computed after removing judgments with missing content).
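For reference, the filtering step can be expressed as below. This is a minimal sketch reusing the raw_ds lazy frame from the setup above; the actual curation pipeline may differ.

# Drop judgments whose `content` field is missing (illustrative sketch)
raw_ds = raw_ds.filter(pl.col("content").is_not_null())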

# Count nulls per field and express them as a fraction of all rows
null_count = raw_ds.null_count().collect().to_pandas().T.rename(columns={0: "Null count"})
null_count.index.name = "Field name"
null_count["Null fraction"] = (null_count["Null count"] / raw_ds.select(pl.len()).collect().item()).round(2)
# print(null_count.to_markdown())
| Field name       | Null count | Null fraction |
|------------------|-----------:|--------------:|
| _id              |          0 |             0 |
| signature        |          0 |             0 |
| date             |          0 |             0 |
| publicationDate  |          0 |             0 |
| lastUpdate       |          0 |             0 |
| courtId          |          0 |             0 |
| departmentId     |          0 |             0 |
| type             |          0 |             0 |
| excerpt          |          0 |             0 |
| content          |          0 |             0 |
| chairman         |      47283 |          0.12 |
| decision         |     408423 |             1 |
| judges           |      39772 |           0.1 |
| legalBases       |     113534 |          0.28 |
| publisher        |        609 |             0 |
| recorder         |     103675 |          0.25 |
| references       |      40737 |           0.1 |
| reviser          |        171 |             0 |
| themePhrases     |     117074 |          0.29 |
| num_pages        |          0 |             0 |
| text             |          0 |             0 |
| vol_number       |          0 |             0 |
| vol_type         |          0 |             0 |
| court_name       |        605 |             0 |
| department_name  |        605 |             0 |
| text_legal_bases |          0 |             0 |
| thesis           |     369092 |           0.9 |
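Note that decision and thesis are almost entirely empty, so downstream users may prefer to drop sparsely populated columns. A minimal sketch, reusing null_count from above; the 0.5 threshold is an illustrative assumption, not part of the dataset curation:

# Keep only fields whose null fraction is below an (illustrative) threshold
threshold = 0.5
dense_fields = null_count.index[null_count["Null fraction"] < threshold].tolist()
dense_ds = raw_ds.select(dense_fields)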

Analysis of selected fields

# Number of judgments per court, sorted descending
court_distribution = raw_ds.drop_nulls(subset="court_name").select("court_name").group_by("court_name").len().sort("len", descending=True).collect().to_pandas()
ax = sns.histplot(data=court_distribution, x="len", log_scale=True, kde=True)
ax.set(title="Distribution of judgments per court", xlabel="#Judgements in a single court", ylabel="Count")
plt.show()
# Keep the date part (before the first space), parse it, and count judgments per year
judgements_per_year = (
    raw_ds.select("date")
    .collect()["date"]
    .str.split(" ")
    .list.get(0)
    .str.to_date()
    .dt.year()
    .value_counts()
    .sort("date")
    .to_pandas()
)
# Exclude 2024 (partial year)
judgements_per_year = judgements_per_year[judgements_per_year["date"] < 2024]

_, ax = plt.subplots(1, 1, figsize=(10, 5))
ax = sns.pointplot(data=judgements_per_year, x="date", y="count", linestyles="--", ax=ax)
ax.set(xlabel="Year", ylabel="Number of Judgements", title="Yearly Number of Judgements", yscale="log")
plt.xticks(rotation=90)
plt.show()
# Count judgments per type, with nulls shown explicitly as "<null>"
types = raw_ds.fill_null(value="<null>").select("type").group_by("type").len().sort("len", descending=True).collect().to_pandas()

_, ax = plt.subplots(1, 1, figsize=(8, 8))
ax = sns.barplot(data=types, x="len", y="type", errorbar=None, ax=ax)
ax.set(xlabel="Count", ylabel="Type", title="Judgement types cardinality", xscale="log")
plt.show()
# Number of judges assigned to each judgment
num_judges = raw_ds.with_columns([pl.col("judges").list.len().alias("num_judges")]).select("num_judges").sort("num_judges").collect().to_pandas()
ax = sns.histplot(data=num_judges, x="num_judges", bins=num_judges["num_judges"].nunique())
ax.set(xlabel="#Judges per judgement", ylabel="Count", yscale="log", title="#Judges per single judgement")
plt.show()
# Number of legal bases cited in each judgment
num_lb = raw_ds.with_columns([pl.col("legalBases").list.len().alias("num_lb")]).select("num_lb").sort("num_lb").collect().to_pandas()
ax = sns.histplot(data=num_lb, x="num_lb", bins=num_lb["num_lb"].nunique())
ax.set(xlabel="#Legal bases", ylabel="Count", yscale="log", title="#Legal bases per judgement")
plt.show()
# Load only the columns needed for token counting and drop judgments without text
raw_text_ds = load_dataset("parquet", data_dir="../../data/datasets/pl/raw/", columns=["_id", "text"])
raw_text_ds = raw_text_ds.filter(lambda x: x["text"] is not None)
# Llama-3 tokenizer (gated model; requires accepting the license on the Hugging Face Hub)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

def tokenize(batch: dict[str, list]) -> dict[str, list[int]]:
    # Tokenize without special tokens; return_length=True yields the token count per document
    tokenized = tokenizer(batch["text"], add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False, return_length=True)
    return {"length": tokenized["length"]}

# Compute token counts in parallel, dropping the raw text afterwards
raw_text_ds = raw_text_ds.map(tokenize, batched=True, batch_size=16, remove_columns=["text"], num_proc=20)
judgement_len = raw_text_ds["train"].to_pandas()

ax = sns.histplot(data=judgement_len, x="length", bins=50)
ax.set(xlabel="#Tokens", ylabel="Count", title="#Tokens distribution in judgements (llama-3 tokenizer)", yscale="log")
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))
plt.show()
# Join token counts with judgment types to compare length distributions per type
per_type_tokens = raw_ds.fill_null(value="<null>").select(["_id", "type"]).collect().to_pandas().set_index("_id").join(judgement_len.set_index("_id"))

_, ax = plt.subplots(1, 1, figsize=(10, 10))
ax = sns.boxenplot(data=per_type_tokens, y="type", x="length")
ax.set(xscale="log", title="Judgement token count per type", xlabel="#Tokens", ylabel="Type")
plt.show()