Analyse the England and Wales Dataset

import polars as pl
from datasets import load_from_disk

from juddges.settings import DATA_PATH
# Directory holding the English ("en") datasets under the project data path.
path_ = DATA_PATH / "datasets" / "en"
# Location of the England and Wales judgements dataset saved on disk.
dataset_path = path_ / "en_judgements_dataset"
# Load the previously saved Hugging Face dataset from that directory.
ds = load_from_disk(dataset_path)
# Bare expression: in a notebook this displays the dataset summary (features, num_rows).
ds
Dataset({
    features: ['_id', 'citation', 'signature', 'date', 'publicationDate', 'type', 'excerpt', 'content', 'judges', 'caseNumbers', 'citation_references', 'legislation', 'file_name', 'appeal_type', 'appeal_outcome', 'xml_uri', 'uri'],
    num_rows: 6154
})
# Bring the Hugging Face dataset into polars by way of a pandas frame.
df = ds.to_pandas()

# Timestamp layout used to parse the date columns once they are rendered as text.
dt_fmt = "%Y-%m-%d %H:%M:%S%.f %Z"

# Columns grouped by the dtype they should end up with.
date_columns = ["date", "publicationDate"]
categorical_columns = ["type", "appeal_type", "appeal_outcome"]

# Normalise dtypes in a single pass:
#   * date columns are cast to strings and then parsed as datetimes,
#   * low-cardinality label columns become categoricals.
# NOTE(review): the describe() output recorded in this notebook shows `date`
# as entirely null after this step - confirm whether the source data carries it.
pl_df = pl.DataFrame(df).with_columns([
    pl.col(date_columns).cast(pl.Utf8).str.strptime(pl.Datetime, format=dt_fmt),
    pl.col(categorical_columns).cast(pl.Categorical),
])

# Preview the first rows of the converted frame.
print(pl_df.head())
shape: (5, 17)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ _id       ┆ citation  ┆ signature ┆ date      ┆ … ┆ appeal_ty ┆ appeal_ou ┆ xml_uri   ┆ uri      │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ pe        ┆ tcome     ┆ ---       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ datetime[ ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆ ns]       ┆   ┆ cat       ┆ cat       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ ab0224364 ┆ [2008]    ┆ EWCA_Crim ┆ null      ┆ … ┆ null      ┆ null      ┆ https://c ┆ https:// │
│ e4cf6562c ┆ EWCA Crim ┆ _2952     ┆           ┆   ┆           ┆           ┆ aselaw.na ┆ caselaw. │
│ 82f8861d5 ┆ 2952      ┆           ┆           ┆   ┆           ┆           ┆ tionalarc ┆ national │
│ 268d4…    ┆           ┆           ┆           ┆   ┆           ┆           ┆ hives…    ┆ archives │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ …        │
│ d4630d932 ┆ [2006]    ┆ EWCA_Crim ┆ null      ┆ … ┆ convictio ┆ null      ┆ https://c ┆ https:// │
│ 58ea51ecf ┆ EWCA Crim ┆ _3187     ┆           ┆   ┆ n         ┆           ┆ aselaw.na ┆ caselaw. │
│ f4bc40154 ┆ 3187      ┆           ┆           ┆   ┆           ┆           ┆ tionalarc ┆ national │
│ 43b4e…    ┆           ┆           ┆           ┆   ┆           ┆           ┆ hives…    ┆ archives │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ …        │
│ 37183a714 ┆ [2012]    ┆ EWCA_Crim ┆ null      ┆ … ┆ null      ┆ null      ┆ https://c ┆ https:// │
│ b626cfe98 ┆ EWCA Crim ┆ _1840     ┆           ┆   ┆           ┆           ┆ aselaw.na ┆ caselaw. │
│ 081ac0250 ┆ 1840      ┆           ┆           ┆   ┆           ┆           ┆ tionalarc ┆ national │
│ c804f…    ┆           ┆           ┆           ┆   ┆           ┆           ┆ hives…    ┆ archives │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ …        │
│ b41933b19 ┆ [2014]    ┆ EWCA_Crim ┆ null      ┆ … ┆ null      ┆ null      ┆ https://c ┆ https:// │
│ 505ab8767 ┆ EWCA Crim ┆ _1730     ┆           ┆   ┆           ┆           ┆ aselaw.na ┆ caselaw. │
│ ce30faf8d ┆ 1730      ┆           ┆           ┆   ┆           ┆           ┆ tionalarc ┆ national │
│ b9524…    ┆           ┆           ┆           ┆   ┆           ┆           ┆ hives…    ┆ archives │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ …        │
│ 418382a2a ┆ [2018]    ┆ EWCA_Crim ┆ null      ┆ … ┆ null      ┆ allowed   ┆ https://c ┆ https:// │
│ 6c0c32d3d ┆ EWCA Crim ┆ _2189     ┆           ┆   ┆           ┆           ┆ aselaw.na ┆ caselaw. │
│ 2bd4cb7b3 ┆ 2189      ┆           ┆           ┆   ┆           ┆           ┆ tionalarc ┆ national │
│ 9e1ba…    ┆           ┆           ┆           ┆   ┆           ┆           ┆ hives…    ┆ archives │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ …        │
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max).
pl_df.describe()
shape: (9, 18)
statistic _id citation signature date publicationDate type excerpt content judges caseNumbers citation_references legislation file_name appeal_type appeal_outcome xml_uri uri
str str str str str str str str str f64 f64 f64 f64 str str str str str
"count" "6154" "6154" "6154" "0" "6154" "6154" "6058" "6154" 6115.0 4934.0 1392.0 1826.0 "6154" "834" "1368" "6154" "6154"
"null_count" "0" "0" "0" "6154" "0" "0" "96" "0" 39.0 1220.0 4762.0 4328.0 "0" "5320" "4786" "0" "0"
"mean" null null null null "2013-10-13 09:… null null null null null null null null null null null null
"std" null null null null null null null null null null null null null null null null null
"min" "001d3b389f60bf… "[2003] EWCA Cr… "EWCA_(Crim)_14… null "2003-01-04 00:… null "********REPORT… " 2020] EWCA Cr… null null null null "2003_01_04-1.x… null null "https://casela… "https://casela…
"25%" null null null null "2008-06-11 00:… null null null null null null null null null null null null
"50%" null null null null "2012-11-29 00:… null null null null null null null null null null null null
"75%" null null null null "2019-06-07 00:… null null null null null null null null null null null null
"max" "ffffb6552ad898… "[2024] EWCA Cr… "Ewca_Crim_664" null "2024-05-22 00:… null "…WARNING: repo… "…WARNING: repo… null null null null "2024_05_22-615… null null "https://casela… "https://casela…
# Number of judgments per value of the `type` column.
pl_df.get_column("type").value_counts()
shape: (7, 2)
type count
cat u32
"crown_court" 5472
"supreme_court" 660
"martial_court" 11
"high_court_adm… 2
"high_court_div… 7
"civil_criminal… 1
"division_court… 1
# Number of judgments per appeal type (nulls are counted as their own group).
pl_df.get_column("appeal_type").value_counts()
shape: (3, 2)
appeal_type count
cat u32
"conviction" 496
null 5320
"sentence" 338
# Number of judgments per appeal outcome (nulls are counted as their own group).
pl_df.get_column("appeal_outcome").value_counts()
shape: (5, 2)
appeal_outcome count
cat u32
null 4786
"granted" 20
"dismissed" 586
"refused" 65
"allowed" 697
# Fraction of rows whose text columns are null.
total_rows = len(pl_df)
missing_content = pl_df["content"].null_count() / total_rows
missing_excerpt = pl_df["excerpt"].null_count() / total_rows
print(f"Missing content: {missing_content}")
print(f"Missing excerpt: {missing_excerpt}")
Missing content: 0.0
Missing excerpt: 0.015599610009749756
# Character length of each excerpt after stripping leading/trailing whitespace.
excerpt_lengths = pl_df["excerpt"].str.strip_chars().str.len_chars()
# Histogram of those lengths via pandas plotting, with log scaling enabled.
excerpt_lengths.to_pandas().plot.hist(
    title="Excerpt #chars distribution",
    log=True,
    bins=50,
)

# Bare expression: in a notebook this displays a preview of the `excerpt` column.
pl_df["excerpt"]
shape: (6_154,)
excerpt
str
"No. 2008/03296…
"Neutral Citati…
"Neutral Citati…
"Neutral Citati…
"No: 201802356 …
"Neutral Citati…
"Case No: 2002/…
"Neutral Citati…
"Case No: 20030…
"2017/05382/B1 …
# Per-row character count of the whitespace-stripped excerpt, as a pandas Series.
(
    pl_df.get_column("excerpt")
    .str.strip_chars()
    .str.len_chars()
    .to_pandas()
)
0       500.0
1       500.0
2       499.0
3       500.0
4       499.0
        ...  
6149    499.0
6150    500.0
6151    500.0
6152    499.0
6153    499.0
Name: excerpt, Length: 6154, dtype: float64