```python
import warnings

import datasets
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import polars as pl
import seaborn as sns
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer

# Silence warnings, verbose library logging, and progress bars for cleaner output
warnings.filterwarnings("ignore")
sns.set_theme("notebook")
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()
datasets.utils.disable_progress_bars()
```
# Dataset Card for JuDDGES/pl-court-raw
```python
# Lazily scan all raw parquet files of the dataset
raw_ds = pl.scan_parquet(source="../../data/datasets/pl/raw/*")
```
## Statistics
### Missing values
We identified 5,725 judgments (approximately 1%) with a missing `content` field. The root cause is unknown; we assume random errors, as the API returned only error codes (e.g., 404) for these documents during dataset curation. Since this is a very small fraction of the dataset, these missing values are not expected to affect overall data quality, so we removed the affected judgments. The table below displays the number and proportion of missing values across the other fields (computed after removing the judgments with missing `content`).
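For reference, the removal step amounts to a one-line polars filter; a minimal sketch, assuming it is applied to the `raw_ds` lazy frame defined above:

```python
# Keep only judgments whose content was successfully fetched
raw_ds = raw_ds.filter(pl.col("content").is_not_null())
```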
```python
null_count = raw_ds.null_count().collect().to_pandas().T.rename(columns={0: "Null count"})
null_count.index.name = "Field name"
null_count["Null fraction"] = (null_count["Null count"] / raw_ds.select(pl.len()).collect().item()).round(2)
# print(null_count.to_markdown())
```
| Field name | Null count | Null fraction |
|---|---:|---:|
| _id | 0 | 0 |
| signature | 0 | 0 |
| date | 0 | 0 |
| publicationDate | 0 | 0 |
| lastUpdate | 0 | 0 |
| courtId | 0 | 0 |
| departmentId | 0 | 0 |
| type | 0 | 0 |
| excerpt | 0 | 0 |
| content | 0 | 0 |
| chairman | 47283 | 0.12 |
| decision | 408423 | 1 |
| judges | 39772 | 0.1 |
| legalBases | 113534 | 0.28 |
| publisher | 609 | 0 |
| recorder | 103675 | 0.25 |
| references | 40737 | 0.1 |
| reviser | 171 | 0 |
| themePhrases | 117074 | 0.29 |
| num_pages | 0 | 0 |
| text | 0 | 0 |
| vol_number | 0 | 0 |
| vol_type | 0 | 0 |
| court_name | 605 | 0 |
| department_name | 605 | 0 |
| text_legal_bases | 0 | 0 |
| thesis | 369092 | 0.9 |
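Note that `decision` is missing from effectively all judgments and `thesis` from 90% of them, so downstream users should not rely on these fields being populated.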
### Analysis of selected fields
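First, we look at how judgments are distributed across individual courts.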
```python
court_distribution = raw_ds.drop_nulls(subset="court_name").select("court_name").group_by("court_name").len().sort("len", descending=True).collect().to_pandas()
ax = sns.histplot(data=court_distribution, x="len", log_scale=True, kde=True)
ax.set(title="Distribution of judgments per court", xlabel="#Judgements in single court", ylabel="Count")
plt.show()
```
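Next, we count judgments per year; judgments from 2024 are excluded from the plot.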
= raw_ds.select("date").collect()["date"].str.split(" ").list.get(0).str.to_date().dt.year().value_counts().sort("date").to_pandas()
judgements_per_year = judgements_per_year[judgements_per_year["date"] < 2024]
judgements_per_year
```python
_, ax = plt.subplots(1, 1, figsize=(10, 5))
ax = sns.pointplot(data=judgements_per_year, x="date", y="count", linestyles="--", ax=ax)
ax.set(xlabel="Year", ylabel="Number of Judgements", title="Yearly Number of Judgements", yscale="log")
plt.xticks(rotation=90)
plt.show()
```
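We then count judgments per type, mapping missing `type` values to `<null>`.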
```python
types = raw_ds.fill_null(value="<null>").select("type").group_by("type").len().sort("len", descending=True).collect().to_pandas()
```
```python
_, ax = plt.subplots(1, 1, figsize=(8, 8))
ax = sns.barplot(data=types, x="len", y="type", errorbar=None, ax=ax)
ax.set(xlabel="Count", ylabel="Type", title="Judgement types cardinality", xscale="log")
plt.show()
```
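We also check how many judges are listed on a single judgment.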
= raw_ds.with_columns([pl.col("judges").list.len().alias("num_judges")]).select("num_judges").sort("num_judges").collect().to_pandas()
num_judges = sns.histplot(data=num_judges, x="num_judges", bins=num_judges["num_judges"].nunique())
ax set(xlabel="#Judges per judgement", ylabel="Count", yscale="log", title="#Judges per single judgement")
ax. plt.show()
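Analogously, we count the number of legal bases cited per judgment.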
= raw_ds.with_columns([pl.col("legalBases").list.len().alias("num_lb")]).select("num_lb").sort("num_lb").collect().to_pandas()
num_lb = sns.histplot(data=num_lb, x="num_lb", bins=num_lb["num_lb"].nunique())
ax set(xlabel="#Legal bases", ylabel="Count", yscale="log", title="#Legal bases per judgement")
ax. plt.show()
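To analyse judgment lengths, we reload the raw text with `datasets`, drop records without text, and count tokens with the Llama-3 tokenizer. Note that `meta-llama/Meta-Llama-3-8B` is a gated model, so this step requires a Hugging Face token with granted access.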
```python
raw_text_ds = load_dataset("parquet", data_dir="../../data/datasets/pl/raw/", columns=["_id", "text"])
raw_text_ds = raw_text_ds.filter(lambda x: x["text"] is not None)
```
```python
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
```
```python
def tokenize(batch: dict[str, list]) -> dict[str, list[int]]:
    # Tokenize a batch of texts, keeping only the per-document token counts
    tokenized = tokenizer(batch["text"], add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False, return_length=True)
    return {"length": tokenized["length"]}


raw_text_ds = raw_text_ds.map(tokenize, batched=True, batch_size=16, remove_columns=["text"], num_proc=20)
```
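With `batched=True` the tokenizer receives a list of texts per call, `return_length=True` makes it report each encoding's token count directly, and `num_proc=20` shards the dataset across worker processes.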
= raw_text_ds["train"].to_pandas()
judgement_len
```python
ax = sns.histplot(data=judgement_len, x="length", bins=50)
ax.set(xlabel="#Tokens", ylabel="Count", title="#Tokens distribution in judgements (llama-3 tokenizer)", yscale="log")
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f"{int(x/1_000)}k"))
plt.show()
```
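Finally, we join the token counts with judgment types to compare lengths across types.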
= raw_ds.fill_null(value="<null>").select(["_id", "type"]).collect().to_pandas().set_index("_id").join(judgement_len.set_index("_id"))
per_type_tokens
```python
_, ax = plt.subplots(1, 1, figsize=(10, 10))
ax = sns.boxenplot(data=per_type_tokens, y="type", x="length")
ax.set(xscale="log", title="Judgement token count per type", xlabel="#Tokens", ylabel="Type")
plt.show()
```